Skip to content

Commit

Permalink
Updating JATS output files with allowed HTML tags
Browse files: browse the repository at this point in the history
  • Loading branch information
Mugdha Polimera committed Apr 15, 2024
1 parent 2560371 commit 8a3be4a
Show file tree
Hide file tree
Showing 9 changed files with 8,461 additions and 7,128 deletions.
2 changes: 1 addition & 1 deletion adsingestp/parsers/base.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -496,7 +496,7 @@ class BaseBeautifulSoupParser(IngestBase):
HTML_TAGSET = {
"title": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a"],
"abstract": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br"],
"comments": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br"],
"comments": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br", "p"],
"affiliations": ["email", "orcid"],
"keywords": HTML_TAGS_HTML,
}
Expand Down
15 changes: 9 additions & 6 deletions adsingestp/parsers/elsevier.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -131,26 +131,26 @@ def _parse_title_abstract(self):
if self.record_meta.find("ce:title"):
self.base_metadata["title"] = self._clean_output(
self._detag(
self.record_meta.find("ce:title"), self.HTML_TAGSET["abstract"]
self.record_meta.find("ce:title"), self.HTML_TAGSET["title"]
).strip()
)
elif self.record_header.find("dct:title"):
self.base_metadata["title"] = self._clean_output(
self._detag(
self.record_header.find("dct:title"), self.HTML_TAGSET["abstract"]
self.record_header.find("dct:title"), self.HTML_TAGSET["title"]
).strip()
)
elif self.record_meta.find("cd:textfn"):
self.base_metadata["title"] = self._clean_output(
self._detag(
self.record_meta.find("ce:textfn"), self.HTML_TAGSET["abstract"]
self.record_meta.find("ce:textfn"), self.HTML_TAGSET["title"]
).strip()
)

if self.record_meta.find("ce:subtitle"):
self.base_metadata["subtitle"] = self._clean_output(
self._detag(
self.record_meta.find("ce:subtitle"), self.HTML_TAGSET["abstract"]
self.record_meta.find("ce:subtitle"), self.HTML_TAGSET["title"]
).strip()
)

Expand All @@ -172,7 +172,7 @@ def _parse_title_abstract(self):
self.base_metadata["abstract"] = abstract
break
elif abs.find("ce:section-title").get_text().lower() == "highlights":
abs_text_all = abs.find_all("ce:para")
abs_text_all = abs.find_all("p")
for abs_text in abs_text_all:
abstract = (
abstract
Expand Down Expand Up @@ -379,6 +379,9 @@ def _remove_namespaces(self, text):
"bold": "b",
"sup": "sup",
"inf": "sub",
"list": "ul",
"list-item": "li",
"para": "p",
}

root = etree.fromstring(text)
Expand Down Expand Up @@ -410,7 +413,7 @@ def parse(self, text):
except Exception as err:
raise XmlLoadException(err)

self.record_header = d.find("Description")
self.record_header = d.find("rdf:Description")

article_type, document_enum = self._find_article_type(d)
self.base_metadata["doctype"] = document_enum
Expand Down
1,141 changes: 1,141 additions & 0 deletions tests/stubdata/input/els_list.xml

Large diffs are not rendered by default.

186 changes: 186 additions & 0 deletions tests/stubdata/output/els_list.json

Large diffs are not rendered by default.

Original file line number | Diff line number | Diff line change
Expand Up @@ -234,4 +234,4 @@
"title": {
"textEnglish": "Author Correction: Super-resolved trajectory-derived nanoclustering analysis using spatiotemporal indexing"
}
}
}
Loading

0 comments on commit 8a3be4a

Please sign in to comment.