diff --git a/Examples/ContestDataDownloader.py b/Examples/ContestDataDownloader.py
index 5567e2b..e2e4780 100644
--- a/Examples/ContestDataDownloader.py
+++ b/Examples/ContestDataDownloader.py
@@ -55,19 +55,14 @@
     print('  Retrieving metadata...')
     allData = None
-    allData, channelGroups, segments, channels, dataChunks, labelGroups, labels = client.createMetaData(study)
+    allData = client.createMetaData(study)
+
+    # return values in uV
+    allData['channelGroups.exponent'] = 0
 
     if dtMin is not None and dtMax is not None:
         allData = allData[(allData.loc[:,'segments.startTime']>=dtMin) & (allData.loc[:,'segments.startTime']<=dtMax)]
-
-#    startTimes = allData.loc[:,'segments.startTime']
-#    startTimes = pd.to_datetime(pd.Series(startTimes), unit='ms')
-#    startTimes = startTimes.dt.minute
-#    startTimesCompare = startTimes.unique()
-#
-#    allData['Preictal'] = np.where(startTimes==30, 1, 0)
-
-    time.sleep(2)
+
     numFiles = len(allData['segments.startTime'].unique())
     print('  Downloading %d file(s)...' % numFiles)
@@ -78,7 +73,7 @@
         startTime = datetime.fromtimestamp(chunk/1000, tz=timezone.utc)
         hour = (startTime - baseTime).total_seconds()/3600
         minute = startTime.minute
-        if minute == 30:
+        if minute >= 30:
             preictal = 1
         else:
             preictal = 0
@@ -107,7 +102,7 @@
             savemat(
                 filename + '.mat',
                 { 'data': np.asarray(data.iloc[:,-16:], dtype=np.float32) },
-                appendmat = False, do_compression = True
+                appendmat = False, do_compression = False
             )
             counter += 1
diff --git a/README.md b/README.md
index 2a6b53e..19bd745 100644
--- a/README.md
+++ b/README.md
@@ -1,34 +1,31 @@
 # seer-py
-Python wrapper for seer-api, with the purpose of authenticating a user, downloading filtered data, and uploading labels.
+Python SDK for the Seer data platform, which handles authenticating a user, downloading channel data, and uploading labels/annotations.
 
 ## Install
-To install, simply clone or download this repository, then type `pip install .` which will install all dependencies, and the Seer python API.
+To install, simply clone or download this repository, then type `pip install .`, which will install all the dependencies.
 
 ### Epilepsy Ecosystem Data
-For users attempting to download data for the [Epilepsy Ecosystem](https://www.epilepsyecosystem.org/howitworks/), please download the [latest release](https://github.com/seermedical/seer-py/releases/latest) instead of cloning the repository or downloading the master branch. The file ContestDataDownloader.py in Examples will guide you through the download process.
+For users attempting to download data for the [Epilepsy Ecosystem](https://www.epilepsyecosystem.org/howitworks/), please download the [latest release](https://github.com/seermedical/seer-py/releases/latest) instead of cloning the repository or downloading the master branch. Then open the script `ContestDataDownloader.py` in `Examples`; it will guide you through the download process (you will need to change a few things in this script, including the path to download the data to).
 
 ## Requirements
-This library currently requires python 3, and it if you don't currently have a python 3 installation, we recommend you use the Anaconda distribution for its simplicity, support, stability and extensibility. It can be downloaded here: https://www.anaconda.com/download
+This library currently requires Python 3. If you don't have a Python 3 installation, we recommend the Anaconda distribution for its simplicity, support, stability and extensibility.
+It can be downloaded here: https://www.anaconda.com/download
 
 The install instructions above will install all the required dependencies, however, if you wish to install them yourself, here's what you'll need:
+- [`gql`](https://github.com/graphql-python/gql): a GraphQL Python library used to query the Seer platform. To install, simply run `pip install gql`
+- Pandas, numpy, and matplotlib are also required. Some of these installs can be tricky, so Anaconda is recommended; however, they can be installed separately. Please see these guides for more detailed information:
+  - https://scipy.org/install.html
+  - https://matplotlib.org/users/installing.html
+  - https://pandas.pydata.org/pandas-docs/stable/install.html
-
-GraphQL python library is required to query the Seer database (in addition to Anaconda). To install, simply run:
-`pip install gql`
-
-Pandas, numpy, and matplotlib are also required. Some of these installs can be tricky, so Anaconda is recommended, however, they can be installed separately. Please see these guides for more detailed information:
-https://scipy.org/install.html
-https://matplotlib.org/users/installing.html
-https://pandas.pydata.org/pandas-docs/stable/install.html
-
-To run the jupyter notebook example (optional, included in Anaconda):
-`pip install notebook`
+To run the Jupyter notebook example (optional, included in Anaconda): `pip install notebook`
 
 ## Getting Started
+Check out the [Example](Examples/Example.ipynb) for a step-by-step example of how to use the SDK to access data on the Seer platform.
 
-Check out the [Example](Examples/Example.ipynb) for a step-by-step example of how to use the API
+To start a Jupyter notebook, run `jupyter notebook` in a command/bash window. Further instructions on Jupyter can be found here: https://github.com/jupyter/notebook
 
-To start jupyter notebooks, run `jupyter notebook` in a command/bash window. Further instructions on Jupyter can be found here: https://github.com/jupyter/notebook
+## Troubleshooting
+### Downloading hangs on Windows
+There is a known issue with using Python's multiprocessing module on Windows with Spyder. The function `getLinks` uses `multiprocessing.Pool` to run multiple downloads simultaneously, which can cause the process to run indefinitely. The workaround is to ensure that the current working directory is set to the directory containing your script. Running the script from a command window will also solve this problem. Alternatively, setting `threads=1` in the `getLinks` function will stop it from using `multiprocessing` altogether.
 
-## Multiprocessing module
-Using multiprocessing to download links in parallel can speed things up, but Windows can sometimes make this difficult. The SeerConenct.getLinks function has the 'threads' argument, which will default to 5 on linux/MacOS, and 1 on Windows. Setting 'threads' to 1 means that no new processes will be spawned, which can avoid errors on Windows. Using multiple threads in Windows is recommended for advanced users only.
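The updated README describes the download workflow but does not show it end to end. The sketch below is a rough illustration only; it assumes the client class is `seerpy.SeerConnect` (not shown in this diff) and uses a placeholder study name.

```python
# Rough usage sketch only: SeerConnect and the study name 'Pat1Train' are assumptions,
# not defined anywhere in this patch.
from seerpy import SeerConnect

client = SeerConnect()  # authenticate with the Seer platform

# createMetaData() now returns a single flattened DataFrame (see the seerpy.py changes below)
allData = client.createMetaData('Pat1Train')

# return values in uV, mirroring the change in ContestDataDownloader.py
allData['channelGroups.exponent'] = 0

# threads=1 avoids multiprocessing.Pool, the workaround described under Troubleshooting
data = client.getLinks(allData, threads=1)
```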
diff --git a/seerpy/graphql.py b/seerpy/graphql.py
index fe6edec..a8153ab 100644
--- a/seerpy/graphql.py
+++ b/seerpy/graphql.py
@@ -26,11 +26,6 @@ def studyWithDataQueryString(studyId):
                     id
                     startTime
                     duration
-                    dataChunks {
-                        time
-                        length
-                        url
-                    }
                 }
                 channels {
                     id
@@ -41,21 +36,85 @@ def studyWithDataQueryString(studyId):
                 }
             }
         }
-        labelGroups {
+    }
+  }
+  ''' % (studyId)
+
+
+def dataChunksQueryString(studyId, channelGroupId, fromTime, toTime):
+    return '''
+    query {
+      study (id: "%s") {
+        id
+        name
+        channelGroup (channelGroupId: "%s") {
+          id
+          name
+          segments (fromTime: %f, toTime: %f) {
+            id
+            startTime
+            duration
+            dataChunks {
+              time
+              length
+              url
+            }
+          }
+        }
+      }
+    }
+    ''' % (studyId, channelGroupId, fromTime, toTime)
+
+def getLabesQueryString(studyId, labelGroupId, fromTime, toTime, limit, offset):
+    return '''
+    query {
+      study (id: "%s") {
+        id
+        name
+        labelGroup (labelGroupId: "%s") {
           id
           name
          labelType
          description
-          labels {
+          labels (limit: %.0f, offset: %.0f) {
            id
            note
            startTime
            duration
            timezone
+            tags {
+              id
+              tagType {
+                id
+                category {
+                  id
+                  name
+                  description
+                }
+                value
+              }
+            }
          }
        }
      }
    }
+    ''' % (studyId, labelGroupId, limit, offset)
+
+
+def labelGroupsQueryString(studyId):
+    return '''
+    query {
+      study (id: "%s") {
+        id
+        name
+        labelGroups {
+          id
+          name
+          labelType
+          description
+        }
+      }
+    }
    ''' % (studyId)
 
 
 def channelGroupsQueryString(studyId):
diff --git a/seerpy/seerpy.py b/seerpy/seerpy.py
index 4adac94..f4cf817 100644
--- a/seerpy/seerpy.py
+++ b/seerpy/seerpy.py
@@ -10,7 +10,7 @@
 import os
 from multiprocessing import Pool
 
-import time
+#import time
 from time import gmtime, strftime
 
@@ -173,7 +173,58 @@ def getStudy(self, studyID):
     def getChannelGroups(self, studyID):
         queryString = graphql.channelGroupsQueryString(studyID)
         response = self.graphqlClient.execute(gql(queryString))
-        return response['channelGroups']
+        return response['study']['channelGroups']
+
+    def getDataChunks(self, studyId, channelGroupId, fromTime=0, toTime=9e12):
+        queryString = graphql.dataChunksQueryString(studyId, channelGroupId, fromTime, toTime)
+        response = self.graphqlClient.execute(gql(queryString))['study']['channelGroup']
+        response = json_normalize(response['segments'])
+        dataChunks = self.pandasFlatten(response, '', 'dataChunks')
+        return dataChunks
+
+    def getLabels(self, studyId, labelGroupId, fromTime=0, toTime=9e12,
+                  limit=200, offset=0):
+
+        labelResults = None
+
+        while True:
+            queryString = graphql.getLabesQueryString(studyId, labelGroupId, fromTime,
+                                                      toTime, limit, offset)
+            response = self.graphqlClient.execute(gql(queryString))['study']
+            labelGroup = json_normalize(response)
+            labels = self.pandasFlatten(labelGroup, 'labelGroup.', 'labels')
+            if len(labels) == 0:
+                break
+            tags = self.pandasFlatten(labels, 'labels.', 'tags')
+#            tagType = self.pandasFlatten(tags, 'tags.', 'tagType')
+#            category = self.pandasFlatten(tagType, 'tagType.', 'category')
+
+            if 'labelGroup.labels' in labelGroup.columns: del labelGroup['labelGroup.labels']
+            if 'labels.tags' in labels.columns: del labels['labels.tags']
+#            if 'tags.tagType' in tags.columns: del tags['tags.tagType']
+#            if 'tagType.category' in tagType.columns: del tagType['tagType.category']
+
+            try:
+                labelGroup = labelGroup.merge(labels, how='left', on='labelGroup.id', suffixes=('', '_y'))
+                labelGroup = labelGroup.merge(tags, how='left', on='labels.id', suffixes=('', '_y'))
+#                labelGroup = labelGroup.merge(tagType, how='left', on='tags.id', suffixes=('', '_y'))
+#                labelGroup = labelGroup.merge(category, how='left', on='tagType.id', suffixes=('', '_y'))
+            except Exception as e:
+                # print(e)
+                pass
+
+            offset += limit
+
+            if labelResults is None:
+                labelResults = labelGroup.copy()
+            else:
+                labelResults = labelResults.append(labelGroup, ignore_index=True, verify_integrity=False)
+        return labelResults
+
+    def getLabelGroups(self, studyID):
+        queryString = graphql.labelGroupsQueryString(studyID)
+        response = self.graphqlClient.execute(gql(queryString))
+        return response['study']['labelGroups']
 
     def getAllMetaData(self, study=None):
         """Get all the data available to user in the form of
@@ -215,7 +266,7 @@
 
         result = []
         for sdy in studiesToGet:
-            t = time.time()
+#            t = time.time()
             queryString = graphql.studyWithDataQueryString(sdy)
             result.append(self.graphqlClient.execute(gql(queryString))['study'])
 #            print('study query time: ', round(time.time()-t,2))
@@ -251,28 +302,18 @@ def createMetaData(self, study=None):
         channelGroups = self.pandasFlatten(allData, '', 'channelGroups')
         channels = self.pandasFlatten(channelGroups, 'channelGroups.', 'channels')
         segments = self.pandasFlatten(channelGroups, 'channelGroups.', 'segments')
-        dataChunks = self.pandasFlatten(segments, 'segments.', 'dataChunks')
-        labelGroups = self.pandasFlatten(allData, '', 'labelGroups')
-        labels = self.pandasFlatten(labelGroups, 'labelGroups.', 'labels')
 
-        if 'labelGroups.labels' in labelGroups.columns: del labelGroups['labelGroups.labels']
         if 'segments.dataChunks' in segments.columns: del segments['segments.dataChunks']
         if 'channelGroups.segments' in channelGroups.columns: del channelGroups['channelGroups.segments']
         if 'channelGroups.channels' in channelGroups.columns: del channelGroups['channelGroups.channels']
         if 'channelGroups' in allData.columns: del allData['channelGroups']
         if 'labelGroups' in allData.columns: del allData['labelGroups']
-#        print('dataframes created')
-
-#        labelGroupsM = labelGroups.merge(labels, how='left', on='labelGroups.id', suffixes=('', '_y'))
-        segmentsM = segments.merge(dataChunks, how='left', on='segments.id', suffixes=('', '_y'))
-        channelGroupsM = channelGroups.merge(segmentsM, how='left', on='channelGroups.id', suffixes=('', '_y'))
+        channelGroupsM = channelGroups.merge(segments, how='left', on='channelGroups.id', suffixes=('', '_y'))
         channelGroupsM = channelGroupsM.merge(channels, how='left', on='channelGroups.id', suffixes=('', '_y'))
         allData = allData.merge(channelGroupsM, how='left', on='id', suffixes=('', '_y'))
-#        allData = allData.merge(labelGroupsM, how='left', on='id', suffixes=('', '_y'))
-#        print('dataframes merged')
-
-        return [allData, channelGroups, segments, channels, dataChunks, labelGroups, labels]
+        return allData
 
     def getLinks(self, allData, threads=None):
         """Download data chunks and stich them together in one dataframe
@@ -312,9 +353,18 @@
         for channelGroupsID in allData['channelGroups.id'].copy().drop_duplicates().tolist():
             for segmentsID in allData['segments.id'].copy().drop_duplicates().tolist():
                 metaData = allData[(allData['id']==studyID) & (allData['channelGroups.id']==channelGroupsID) & (allData['segments.id']==segmentsID)].copy()
+
                 numChannels = len(metaData['channels.id'].copy().drop_duplicates().tolist())
                 channelNames = metaData['channels.name'].copy().drop_duplicates().tolist()
                 actualChannelNames = channelNames if len(channelNames) == numChannels else ['Channel %s' % (i) for i in range(0, numChannels)]
+
+                metaData = metaData.drop_duplicates('segments.id')
+
+                fromTime = metaData['segments.startTime'].min()
+                toTime = fromTime + metaData['segments.duration'].sum()
+                dataChunks = self.getDataChunks(studyID, channelGroupsID, fromTime, toTime)
+                metaData = metaData.merge(dataChunks, how='left', left_on='segments.id', right_on='id', suffixes=('', '_y'))
+
                 metaData = metaData[['dataChunks.url', 'dataChunks.time', 'channelGroups.sampleEncoding', 'channelGroups.sampleRate', 'channelGroups.samplesPerRecord', 'channelGroups.recordsPerChunk', 'channelGroups.compression', 'channelGroups.signalMin', 'channelGroups.signalMax', 'channelGroups.exponent']]
                 metaData = metaData.drop_duplicates()
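The new `getLabelGroups`, `getLabels` and `getDataChunks` client methods are only exercised indirectly by `getLinks` above. A hedged usage sketch follows; the client class name, the IDs, and the interpretation of the time window as epoch milliseconds are assumptions inferred from the surrounding code rather than anything stated in this patch.

```python
# Rough usage sketch only: the IDs below are placeholders and SeerConnect is assumed
# to be the client class exposing the methods added in this patch.
from seerpy import SeerConnect

client = SeerConnect()

study_id = '<study uuid>'          # placeholder
channel_group_id = '<group uuid>'  # placeholder

# list the label groups for a study (raw GraphQL response, a list of dicts)
label_groups = client.getLabelGroups(study_id)

# page through a label group's labels 200 at a time (limit/offset) and
# get back a single DataFrame, including the flattened tags columns
labels = client.getLabels(study_id, label_groups[0]['id'])

# fetch the data-chunk URLs for a channel group over a time window; getLinks() now
# calls this per segment instead of relying on chunks embedded in the study metadata
data_chunks = client.getDataChunks(study_id, channel_group_id, fromTime=0, toTime=9e12)
```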