Merge pull request #3 from seermedical/lazy-fetch-urls
Lazy fetch urls
robrkerr authored Jun 15, 2018
2 parents 00ab5f2 + e1e495d commit 0a9c1d7
Showing 4 changed files with 153 additions and 52 deletions.
19 changes: 7 additions & 12 deletions Examples/ContestDataDownloader.py
@@ -55,19 +55,14 @@
print(' Retrieving metadata...')

allData = None
allData, channelGroups, segments, channels, dataChunks, labelGroups, labels = client.createMetaData(study)
allData = client.createMetaData(study)

#return values in uV
allData['channelGroups.exponent'] = 0

if dtMin is not None and dtMax is not None:
allData = allData[(allData.loc[:,'segments.startTime']>=dtMin) & (allData.loc[:,'segments.startTime']<=dtMax)]

# startTimes = allData.loc[:,'segments.startTime']
# startTimes = pd.to_datetime(pd.Series(startTimes), unit='ms')
# startTimes = startTimes.dt.minute
# startTimesCompare = startTimes.unique()
#
# allData['Preictal'] = np.where(startTimes==30, 1, 0)

time.sleep(2)


numFiles = len(allData['segments.startTime'].unique())
print(' Downloading %d file(s)...' % numFiles)
@@ -78,7 +73,7 @@
startTime = datetime.fromtimestamp(chunk/1000, tz=timezone.utc)
hour = (startTime - baseTime).total_seconds()/3600
minute = startTime.minute
if minute == 30:
if minute >= 30:
preictal = 1
else:
preictal = 0
@@ -107,7 +102,7 @@
savemat(
filename + '.mat',
{ 'data': np.asarray(data.iloc[:,-16:], dtype=np.float32) },
appendmat = False, do_compression = True
appendmat = False, do_compression = False
)
counter += 1

33 changes: 15 additions & 18 deletions README.md
@@ -1,34 +1,31 @@
# seer-py
Python wrapper for seer-api, with the purpose of authenticating a user, downloading filtered data, and uploading labels.
Python SDK for the Seer data platform, which handles authenticating a user, downloading channel data, and uploading labels/annotations.

## Install
To install, simply clone or download this repository, then type `pip install .` which will install all dependencies, and the Seer python API.
To install, simply clone or download this repository, then type `pip install .`, which will install all the dependencies.

### Epilepsy Ecosystem Data
For users attempting to download data for the [Epilepsy Ecosystem](https://www.epilepsyecosystem.org/howitworks/), please download the [latest release](https://github.com/seermedical/seer-py/releases/latest) instead of cloning the repository or downloading the master branch. The file ContestDataDownloader.py in Examples will guide you through the download process.
For users attempting to download data for the [Epilepsy Ecosystem](https://www.epilepsyecosystem.org/howitworks/), please download the [latest release](https://github.com/seermedical/seer-py/releases/latest) instead of cloning the repository or downloading the master branch. Then open the script `ContestDataDownloader.py` in `Examples`, which will guide you through the download process (you will need to change a few things in this script, including the path to which the data will be downloaded).
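A sketch of the kind of edit the script expects — the variable name below is hypothetical, so check the script itself for the actual names:

```python
# Hypothetical variable name; ContestDataDownloader.py may call it something else.
path = 'D:/EpilepsyEcosystemData/'  # directory where the downloaded .mat files will be written
```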

## Requirements
This library currently requires python 3. If you don't currently have a python 3 installation, we recommend you use the Anaconda distribution for its simplicity, support, stability and extensibility. It can be downloaded here: https://www.anaconda.com/download
This library currently requires Python 3. If you don't currently have a Python 3 installation, we recommend you use the Anaconda distribution for its simplicity, support, stability and extensibility. It can be downloaded here: https://www.anaconda.com/download

The install instructions above will install all the required dependencies; however, if you wish to install them yourself, here's what you'll need:
- [`gql`](https://github.com/graphql-python/gql): a GraphQL Python library, required to query the Seer platform. To install, simply run: `pip install gql`
- Pandas, numpy, and matplotlib are also required. Some of these installs can be tricky, so Anaconda is recommended; however, they can be installed separately. Please see these guides for more detailed information:
- https://scipy.org/install.html
- https://matplotlib.org/users/installing.html
- https://pandas.pydata.org/pandas-docs/stable/install.html

GraphQL python library is required to query the Seer database (in addition to Anaconda). To install, simply run:
`pip install gql`

Pandas, numpy, and matplotlib are also required. Some of these installs can be tricky, so Anaconda is recommended, however, they can be installed separately. Please see these guides for more detailed information:
https://scipy.org/install.html
https://matplotlib.org/users/installing.html
https://pandas.pydata.org/pandas-docs/stable/install.html

To run the jupyter notebook example (optional, included in Anaconda):
`pip install notebook`
To run the Jupyter notebook example (optional, included in Anaconda): `pip install notebook`

## Getting Started
Check out the [Example](Examples/Example.ipynb) for a step-by-step example of how to use the SDK to access data on the Seer platform.

Check out the [Example](Examples/Example.ipynb) for a step-by-step example of how to use the API
To start a Jupyter notebook, run `jupyter notebook` in a command/bash window. Further instructions on Jupyter can be found here: https://github.com/jupyter/notebook

To start jupyter notebooks, run `jupyter notebook` in a command/bash window. Further instructions on Jupyter can be found here: https://github.com/jupyter/notebook
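A minimal sketch of the basic workflow (the import path, client construction and study name below are assumptions; see the notebook for the authoritative steps):

```python
from seerpy import SeerConnect  # assumed import path for the client class

client = SeerConnect()                       # create a client (credential handling not shown)
allData = client.createMetaData('My Study')  # metadata DataFrame for a placeholder study name
data = client.getLinks(allData)              # download the data chunks and stitch them into one DataFrame
```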
## Troubleshooting

### Downloading hangs on Windows
There is a known issue with using Python's multiprocessing module on Windows with Spyder. The function `getLinks` uses `multiprocessing.Pool` to run multiple downloads simultaneously, which can cause the process to run indefinitely. The workaround for this is to ensure that the current working directory is set to the directory containing your script. Running the script from a command window will also solve this problem. Alternatively, setting `threads=1` in the `getLinks` function will stop it from using `multiprocessing` altogether.

## Multiprocessing module
Using multiprocessing to download links in parallel can speed things up, but Windows can sometimes make this difficult. The `SeerConnect.getLinks` function has a `threads` argument, which defaults to 5 on Linux/macOS and 1 on Windows. Setting `threads` to 1 means that no new processes will be spawned, which can avoid errors on Windows. Using multiple threads on Windows is recommended for advanced users only.
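For example, a single-process download on Windows might look like the following sketch (reusing the `client` and `allData` names from the Getting Started sketch above):

```python
# threads=1 avoids multiprocessing.Pool entirely, which sidesteps the Windows hang
data = client.getLinks(allData, threads=1)
```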
73 changes: 66 additions & 7 deletions seerpy/graphql.py
@@ -26,11 +26,6 @@ def studyWithDataQueryString(studyId):
id
startTime
duration
dataChunks {
time
length
url
}
}
channels {
id
@@ -41,21 +36,85 @@
}
}
}
labelGroups {
}
}
''' % (studyId)


def dataChunksQueryString(studyId, channelGroupId, fromTime, toTime):
return '''
query {
study (id: "%s") {
id
name
channelGroup (channelGroupId: "%s") {
id
name
segments (fromTime: %f, toTime: %f) {
id
startTime
duration
dataChunks {
time
length
url
}
}
}
}
}
''' % (studyId, channelGroupId, fromTime, toTime)

def getLabesQueryString(studyId, labelGroupId, fromTime, toTime, limit, offset):
return '''
query {
study (id: "%s") {
id
name
labelGroup (labelGroupId: "%s") {
id
name
labelType
description
labels {
labels (limit: %.0f, offset: %.0f) {
id
note
startTime
duration
timezone
tags {
id
tagType {
id
category {
id
name
description
}
value
}
}
}
}
}
}
''' % (studyId, labelGroupId, limit, offset)


def labelGroupsQueryString(studyId):
return '''
query {
study (id: "%s") {
id
name
labelGroups {
id
name
labelType
description
}
}
}
''' % (studyId)

def channelGroupsQueryString(studyId):
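The new builders above return plain GraphQL query strings that the client can execute on demand. A small sketch of inspecting one of them, assuming the module is importable as `seerpy.graphql` (the IDs are placeholders):

```python
from seerpy import graphql  # assumed import path

# Build the query for the data chunks of one channel group within a study and time range.
query = graphql.dataChunksQueryString('some-study-id', 'some-channel-group-id', 0, 9e12)
print(query)  # GraphQL requesting each segment's dataChunks (time, length, url)
```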
80 changes: 65 additions & 15 deletions seerpy/seerpy.py
@@ -10,7 +10,7 @@
import os

from multiprocessing import Pool
import time
#import time
from time import gmtime, strftime


@@ -173,7 +173,58 @@ def getStudy(self, studyID):
def getChannelGroups(self, studyID):
queryString = graphql.channelGroupsQueryString(studyID)
response = self.graphqlClient.execute(gql(queryString))
return response['channelGroups']
return response['study']['channelGroups']

def getDataChunks(self, studyId, channelGroupId, fromTime=0, toTime=9e12):
queryString = graphql.dataChunksQueryString(studyId, channelGroupId, fromTime, toTime)
response = self.graphqlClient.execute(gql(queryString))['study']['channelGroup']
response = json_normalize(response['segments'])
dataChunks = self.pandasFlatten(response, '', 'dataChunks')
return dataChunks

def getLabels(self, studyId, labelGroupId, fromTime=0, toTime=9e12,
limit=200, offset=0):

labelResults = None

while True:
queryString = graphql.getLabesQueryString(studyId, labelGroupId, fromTime,
toTime, limit, offset)
response = self.graphqlClient.execute(gql(queryString))['study']
labelGroup = json_normalize(response)
labels = self.pandasFlatten(labelGroup, 'labelGroup.', 'labels')
if len(labels) == 0:
break
tags = self.pandasFlatten(labels, 'labels.', 'tags')
# tagType = self.pandasFlatten(tags, 'tags.', 'tagType')
# category = self.pandasFlatten(tagType, 'tagType.', 'category')

if 'labelGroup.labels' in labelGroup.columns: del labelGroup['labelGroup.labels']
if 'labels.tags' in labels.columns: del labels['labels.tags']
# if 'tags.tagType' in tags.columns: del tags['tags.tagType']
# if 'tagType.category' in tagType.columns: del tagType['tagType.category']

try:
labelGroup = labelGroup.merge(labels, how='left', on='labelGroup.id', suffixes=('', '_y'))
labelGroup = labelGroup.merge(tags, how='left', on='labels.id', suffixes=('', '_y'))
# labelGroup = labelGroup.merge(tagType, how='left', on='tags.id', suffixes=('', '_y'))
# labelGroup = labelGroup.merge(category, how='left', on='tagType.id', suffixes=('', '_y'))
except Exception as e:
# print(e)
pass

offset += limit

if labelResults is None:
labelResults = labelGroup.copy()
else:
labelResults = labelResults.append(labelGroup, ignore_index=True, verify_integrity=False)
return labelResults

def getLabelGroups(self, studyID):
queryString = graphql.labelGroupsQueryString(studyID)
response = self.graphqlClient.execute(gql(queryString))
return response['study']['labelGroups']

def getAllMetaData(self, study=None):
"""Get all the data available to user in the form of
@@ -215,7 +266,7 @@
result = []

for sdy in studiesToGet:
t = time.time()
# t = time.time()
queryString = graphql.studyWithDataQueryString(sdy)
result.append(self.graphqlClient.execute(gql(queryString))['study'])
# print('study query time: ', round(time.time()-t,2))
@@ -251,28 +302,18 @@ def createMetaData(self, study=None):
channelGroups = self.pandasFlatten(allData, '', 'channelGroups')
channels = self.pandasFlatten(channelGroups, 'channelGroups.', 'channels')
segments = self.pandasFlatten(channelGroups, 'channelGroups.', 'segments')
dataChunks = self.pandasFlatten(segments, 'segments.', 'dataChunks')
labelGroups = self.pandasFlatten(allData, '', 'labelGroups')
labels = self.pandasFlatten(labelGroups, 'labelGroups.', 'labels')

if 'labelGroups.labels' in labelGroups.columns: del labelGroups['labelGroups.labels']
if 'segments.dataChunks' in segments.columns: del segments['segments.dataChunks']
if 'channelGroups.segments' in channelGroups.columns: del channelGroups['channelGroups.segments']
if 'channelGroups.channels' in channelGroups.columns: del channelGroups['channelGroups.channels']
if 'channelGroups' in allData.columns: del allData['channelGroups']
if 'labelGroups' in allData.columns: del allData['labelGroups']

# print('dataframes created')

# labelGroupsM = labelGroups.merge(labels, how='left', on='labelGroups.id', suffixes=('', '_y'))
segmentsM = segments.merge(dataChunks, how='left', on='segments.id', suffixes=('', '_y'))
channelGroupsM = channelGroups.merge(segmentsM, how='left', on='channelGroups.id', suffixes=('', '_y'))
channelGroupsM = channelGroups.merge(segments, how='left', on='channelGroups.id', suffixes=('', '_y'))
channelGroupsM = channelGroupsM.merge(channels, how='left', on='channelGroups.id', suffixes=('', '_y'))
allData = allData.merge(channelGroupsM, how='left', on='id', suffixes=('', '_y'))
# allData = allData.merge(labelGroupsM, how='left', on='id', suffixes=('', '_y'))
# print('dataframes merged')

return [allData, channelGroups, segments, channels, dataChunks, labelGroups, labels]
return allData

def getLinks(self, allData, threads=None):
"""Download data chunks and stich them together in one dataframe
@@ -312,9 +353,18 @@ def getLinks(self, allData, threads=None):
for channelGroupsID in allData['channelGroups.id'].copy().drop_duplicates().tolist():
for segmentsID in allData['segments.id'].copy().drop_duplicates().tolist():
metaData = allData[(allData['id']==studyID) & (allData['channelGroups.id']==channelGroupsID) & (allData['segments.id']==segmentsID)].copy()

numChannels = len(metaData['channels.id'].copy().drop_duplicates().tolist())
channelNames = metaData['channels.name'].copy().drop_duplicates().tolist()
actualChannelNames = channelNames if len(channelNames) == numChannels else ['Channel %s' % (i) for i in range(0, numChannels)]

metaData = metaData.drop_duplicates('segments.id')

fromTime = metaData['segments.startTime'].min()
toTime = fromTime + metaData['segments.duration'].sum()
dataChunks = self.getDataChunks(studyID, channelGroupsID, fromTime, toTime)
metaData = metaData.merge(dataChunks, how='left', left_on='segments.id', right_on='id', suffixes=('', '_y'))

metaData = metaData[['dataChunks.url', 'dataChunks.time', 'channelGroups.sampleEncoding', 'channelGroups.sampleRate', 'channelGroups.samplesPerRecord',
'channelGroups.recordsPerChunk', 'channelGroups.compression', 'channelGroups.signalMin', 'channelGroups.signalMax', 'channelGroups.exponent']]
metaData = metaData.drop_duplicates()
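Together, these methods let callers fetch chunk URLs and labels lazily rather than as part of one large metadata query. A usage sketch with placeholder IDs, under the same `SeerConnect` assumptions as the README example:

```python
from seerpy import SeerConnect  # assumed import path

client = SeerConnect()

# Label groups for a study, then the labels of the first group (paged internally via limit/offset).
labelGroups = client.getLabelGroups('some-study-id')
labels = client.getLabels('some-study-id', labelGroups[0]['id'])

# Data chunk metadata (time, length, url) for one channel group, fetched only when needed.
chunks = client.getDataChunks('some-study-id', 'some-channel-group-id', fromTime=0, toTime=9e12)
```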
