Skip to content

Commit

Permalink
feat: add pmc as a source for fetching articles data (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
EverVino authored Feb 9, 2024
1 parent 7d8cd97 commit 3bb3536
Show file tree
Hide file tree
Showing 10 changed files with 459 additions and 37 deletions.
4 changes: 2 additions & 2 deletions src/pymedx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""PyMedX package."""
from importlib import metadata as importlib_metadata

from .api import PubMed
from .api import PubMed, PubMedCentral


def get_version():
Expand All @@ -18,4 +18,4 @@ def get_version():
__version__ = version


__all__ = ["PubMed", "__version__"]
__all__ = ["PubMed", "PubMedCentral", "__version__"]
110 changes: 84 additions & 26 deletions src/pymedx/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from lxml import etree as xml

from .article import PubMedArticle
from .article import PubMedArticle, PubMedCentralArticle
from .book import PubMedBookArticle
from .helpers import batches

Expand Down Expand Up @@ -62,10 +62,10 @@ def __init__(
def query(
self,
query: str,
min_date: str,
max_date: str,
max_results: int = 100,
) -> Iterable[Union[PubMedArticle, PubMedBookArticle]]:
) -> Iterable[
Union[PubMedArticle, PubMedBookArticle, PubMedCentralArticle]
]:
"""
Execute a query agains the GraphQL schema.
Expand All @@ -84,8 +84,6 @@ def query(
# Retrieve the article IDs for the query
article_ids = self._getArticleIds(
query=query,
min_date=min_date,
max_date=max_date,
max_results=max_results,
)

Expand Down Expand Up @@ -188,8 +186,7 @@ def _get(

# Set the response mode

if parameters:
parameters["retmode"] = output
parameters["retmode"] = output

# Make the request to PubMed
response = requests.get(f"{BASE_URL}{url}", params=parameters)
Expand All @@ -207,7 +204,9 @@ def _get(

def _getArticles(
self, article_ids: List[str]
) -> Iterable[Union[PubMedArticle, PubMedBookArticle]]:
) -> Iterable[
Union[PubMedArticle, PubMedBookArticle, PubMedCentralArticle]
]:
"""Batch a list of article IDs and retrieves the content.
Parameters
Expand Down Expand Up @@ -241,8 +240,6 @@ def _getArticles(
def _getArticleIds(
self,
query: str,
min_date: str,
max_date: str,
max_results: int,
) -> List[str]:
"""Retrieve the article IDs for a query.
Expand All @@ -267,29 +264,19 @@ def _getArticleIds(

# Add specific query parameters
parameters["term"] = query
parameters["retmax"] = 50000
parameters["retmax"] = 500000
parameters["datetype"] = "edat"
parameters["mindate"] = min_date
parameters["maxdate"] = max_date

retmax: int = cast(int, parameters["retmax"])

# Calculate a cut off point based on the max_results parameter
if max_results < retmax:
parameters["retmax"] = max_results

new_url = (
"/entrez/eutils/esearch.fcgi?"
f"db={parameters['db']}&"
f"term={parameters['term']}&"
f"retmax={parameters['retmax']}&"
f"datetype={parameters['datetype']}&"
f"mindate={parameters['mindate']}&"
f"maxdate={parameters['maxdate']}&"
f"retmode=json"
)

# Make the first request to PubMed
response: requests.models.Response = self._get(url=new_url)
response: requests.models.Response = self._get(
url="/entrez/eutils/esearch.fcgi", parameters=parameters
)

# Add the retrieved IDs to the list
article_ids += response.get("esearchresult", {}).get("idlist", [])
Expand Down Expand Up @@ -335,3 +322,74 @@ def _getArticleIds(

# Return the response
return article_ids


class PubMedCentral(PubMed):
    """Wrap around the PubMedCentral API.

    Subclass of ``PubMed`` that points the request parameters at the ``pmc``
    database and builds ``PubMedCentralArticle`` objects from the fetched XML.
    """

    def __init__(
        self,
        tool: str = "my_tool",
        email: str = "my_email@example.com",
        api_key: str = "",
    ) -> None:
        """
        Initialize the PubMedCentral object.

        Parameters
        ----------
        tool: str
            Name of the tool that is executing the query.
            This parameter is not required but kindly requested by
            PMC (PubMed Central).
        email: str
            Email of the user of the tool. This parameter
            is not required but kindly requested by PMC (PubMed Central).
        api_key: str
            The NCBI API key.

        Returns
        -------
        None
        """
        # Let the PubMed base class set up the shared request parameters.
        super().__init__(tool, email, api_key)
        # Switch the database source to pmc (PubMed Central).
        self.parameters["db"] = "pmc"

    def _getArticles(
        self, article_ids: List[str]
    ) -> Iterable[
        Union[PubMedArticle, PubMedBookArticle, PubMedCentralArticle]
    ]:
        """Fetch a list of article IDs and yield the parsed articles.

        Parameters
        ----------
        article_ids: List[str]
            Article IDs to fetch.

        Returns
        -------
        Iterable
            Generator yielding one ``PubMedCentralArticle`` per ``article``
            element found in the response.
        """
        # Nothing to fetch: avoid sending a request without any ID.
        if not article_ids:
            return

        # Start from a copy so the instance-wide defaults stay untouched.
        parameters = self.parameters.copy()
        parameters["id"] = article_ids

        # Make the request
        response = self._get(
            url="/entrez/eutils/efetch.fcgi",
            parameters=parameters,
            output="xml",
        )

        # Parse as XML
        root = xml.fromstring(response)

        # Loop over the articles and construct article objects
        for article in root.iter("article"):
            yield PubMedCentralArticle(xml_element=article)

        # TODO: Adapt to PubMed Central API
        # for book in root.iter("PubmedBookArticle"):
        #     yield PubMedBookArticle(xml_element=book)
195 changes: 193 additions & 2 deletions src/pymedx/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from lxml.etree import _Element

from .helpers import getAllContent, getContent, getContentUnique
from .helpers import getAbstract, getAllContent, getContent, getContentUnique


class PubMedArticle:
Expand Down Expand Up @@ -69,7 +69,7 @@ def _extractJournal(self, xml_element: _Element) -> Union[str, None, int]:

def _extractAbstract(self, xml_element: _Element) -> Union[str, None, int]:
path = ".//AbstractText"
return getAllContent(element=xml_element, path=path)
return getAbstract(element=xml_element, path=path)

def _extractConclusions(
self, xml_element: _Element
Expand Down Expand Up @@ -171,3 +171,194 @@ def toJSON(self) -> str:
sort_keys=True,
indent=4,
)


class PubMedCentralArticle:
    """Data class that contains a PubMedCentral article.

    An instance is populated either from an XML ``article`` element or from
    keyword arguments named after the entries of ``__slots__``.
    """

    # Fields not implemented yet for PubMed Central (TODO):
    # "keywords", "journal", "methods", "conclusions", "results",
    # "copyrights", "xml".

    # Slots which have been implemented.
    __slots__ = (
        "pmc_id",
        "title",
        "abstract",
        "publication_date",
        "authors",
        "doi",
    )

    def __init__(
        self,
        xml_element: Optional[_Element] = None,
        *args: List[Any],
        **kwargs: Dict[Any, Any],
    ) -> None:
        """Initialize the object from XML or from parameters.

        Parameters
        ----------
        xml_element: Optional[_Element]
            XML element to parse. When given, ``kwargs`` are ignored.
        *args:
            Accepted and ignored.
        **kwargs:
            Field values keyed by slot name; missing fields are set to None.
        """
        # Positional extras are unused; discard them explicitly.
        del args

        # If an XML element is provided, use it for initialization
        if xml_element is not None:
            self._initializeFromXML(xml_element=xml_element)
            return

        # Otherwise populate every slot from the keyword arguments
        for field in self.__slots__:
            setattr(self, field, kwargs.get(field))

    def _extractPMCId(self, xml_element: _Element) -> Union[str, None, int]:
        """Extract the article id tagged with ``pub-id-type='pmc'``."""
        path = ".//article-meta/article-id[@pub-id-type='pmc']"
        return getContentUnique(element=xml_element, path=path)

    def _extractTitle(self, xml_element: _Element) -> Union[str, None, int]:
        """Extract the content found under ``title-group``."""
        path = ".//title-group"
        return getAllContent(element=xml_element, path=path)

    def _extractAbstract(self, xml_element: _Element) -> Union[str, None, int]:
        """Extract the content found under ``abstract``."""
        path = ".//abstract"
        return getAllContent(element=xml_element, path=path)

    # TODO: adapt the remaining PubMed extractors for PubMed Central:
    # keywords, journal, conclusions, methods, results and copyrights.

    def _extractDoi(self, xml_element: _Element) -> Union[str, None, int]:
        """Extract the article id tagged with ``pub-id-type='doi'``."""
        path = ".//article-meta/article-id[@pub-id-type='doi']"
        return getContentUnique(element=xml_element, path=path)

    def _extractPublicationDate(
        self, xml_element: _Element
    ) -> Optional[datetime.date]:
        """Extract the publication date.

        The ``pub-date`` element with ``pub-type='epub'`` is preferred; the
        first ``pub-date`` element is used as a fallback. Month and day
        default to ``"1"`` when absent.

        Returns
        -------
        Optional[datetime.date]
            A ``datetime.datetime`` built from year/month/day, or None when
            no date element exists or the date cannot be parsed.
        """
        publication_date = xml_element.find(".//pub-date[@pub-type='epub']")

        # Test explicitly instead of relying on element truthiness, which
        # lxml deprecates and which reflects the number of children.
        if publication_date is None or len(publication_date) == 0:
            publication_date = xml_element.find(".//pub-date")

        if publication_date is None:
            return None

        publication_year = getContent(publication_date, ".//year", None)
        publication_month = getContent(publication_date, ".//month", "1")
        publication_day = getContent(publication_date, ".//day", "1")

        # Without a year there is nothing meaningful to parse.
        if publication_year is None:
            return None

        # Construct a datetime object from the info
        date_str: str = (
            f"{publication_year}/{publication_month}/{publication_day}"
        )

        try:
            return datetime.datetime.strptime(date_str, "%Y/%m/%d")
        except ValueError:
            # Unable to parse the datetime (e.g. non-numeric month)
            return None

    def _extractAuthors(
        self, xml_element: _Element
    ) -> List[dict[str, Union[str, None, int]]]:
        """Extract the authors listed in the first ``contrib-group``.

        Returns
        -------
        List[dict]
            One dict per ``contrib`` element with ``contrib-type='author'``,
            holding ``lastname`` and ``firstname``; empty when the article
            has no ``contrib-group``.
        """
        contrib_groups = xml_element.findall(".//contrib-group")
        if not contrib_groups:
            return []

        # Only the first group is read.
        authors = contrib_groups[0].findall(
            ".//contrib[@contrib-type='author']"
        )
        # TODO: adapt "initials" and "affiliation" for PubMed Central
        return [
            {
                "lastname": getContent(author, ".//surname", None),
                "firstname": getContent(author, ".//given-names", None),
            }
            for author in authors
        ]

    def _initializeFromXML(self, xml_element: _Element) -> None:
        """Parse an XML element into an article object."""
        # Parse the different fields of the article
        self.pmc_id = self._extractPMCId(xml_element)
        self.title = self._extractTitle(xml_element)
        self.abstract = self._extractAbstract(xml_element)
        self.doi = self._extractDoi(xml_element)
        self.publication_date = self._extractPublicationDate(xml_element)
        self.authors = self._extractAuthors(xml_element)
        # TODO: populate xml, keywords, journal, conclusions, methods,
        # results and copyrights once adapted for PubMed Central.

    def toDict(self) -> Dict[Any, Any]:
        """Convert the parsed information to a Python dict."""
        return {key: getattr(self, key) for key in self.__slots__}

    def toJSON(self) -> str:
        """Dump the object as JSON string.

        Dates and XML elements are converted with ``str`` because the
        ``json`` module cannot serialize them directly.
        """
        return json.dumps(
            {
                key: (
                    value
                    if not isinstance(value, (datetime.date, _Element))
                    else str(value)
                )
                for key, value in self.toDict().items()
            },
            sort_keys=True,
            indent=4,
        )
Loading

0 comments on commit 3bb3536

Please sign in to comment.