Update JATS output files with allowed HTML tags

adsabs · Apr 15, 2024 · 8a3be4a
1 parent 2560371
commit 8a3be4a
Show file tree

Hide file tree

Showing 9 changed files with 8,461 additions and 7,128 deletions.
diff --git a/adsingestp/parsers/base.py b/adsingestp/parsers/base.py
@@ -496,7 +496,7 @@ class BaseBeautifulSoupParser(IngestBase):
     HTML_TAGSET = {
         "title": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a"],
         "abstract": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br"],
-        "comments": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br"],
+        "comments": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br", "p"],
         "affiliations": ["email", "orcid"],
         "keywords": HTML_TAGS_HTML,
     }

diff --git a/adsingestp/parsers/elsevier.py b/adsingestp/parsers/elsevier.py
@@ -131,26 +131,26 @@ def _parse_title_abstract(self):
         if self.record_meta.find("ce:title"):
             self.base_metadata["title"] = self._clean_output(
                 self._detag(
-                    self.record_meta.find("ce:title"), self.HTML_TAGSET["abstract"]
+                    self.record_meta.find("ce:title"), self.HTML_TAGSET["title"]
                 ).strip()
             )
         elif self.record_header.find("dct:title"):
             self.base_metadata["title"] = self._clean_output(
                 self._detag(
-                    self.record_header.find("dct:title"), self.HTML_TAGSET["abstract"]
+                    self.record_header.find("dct:title"), self.HTML_TAGSET["title"]
                 ).strip()
             )
         elif self.record_meta.find("cd:textfn"):
             self.base_metadata["title"] = self._clean_output(
                 self._detag(
-                    self.record_meta.find("ce:textfn"), self.HTML_TAGSET["abstract"]
+                    self.record_meta.find("ce:textfn"), self.HTML_TAGSET["title"]
                 ).strip()
             )
 
         if self.record_meta.find("ce:subtitle"):
             self.base_metadata["subtitle"] = self._clean_output(
                 self._detag(
-                    self.record_meta.find("ce:subtitle"), self.HTML_TAGSET["abstract"]
+                    self.record_meta.find("ce:subtitle"), self.HTML_TAGSET["title"]
                 ).strip()
             )
 
@@ -172,7 +172,7 @@ def _parse_title_abstract(self):
                             self.base_metadata["abstract"] = abstract
                             break
                         elif abs.find("ce:section-title").get_text().lower() == "highlights":
-                            abs_text_all = abs.find_all("ce:para")
+                            abs_text_all = abs.find_all("p")
                             for abs_text in abs_text_all:
                                 abstract = (
                                     abstract
@@ -379,6 +379,9 @@ def _remove_namespaces(self, text):
             "bold": "b",
             "sup": "sup",
             "inf": "sub",
+            "list": "ul",
+            "list-item": "li",
+            "para": "p",
         }
 
         root = etree.fromstring(text)
@@ -410,7 +413,7 @@ def parse(self, text):
         except Exception as err:
             raise XmlLoadException(err)
 
-        self.record_header = d.find("Description")
+        self.record_header = d.find("rdf:Description")
 
         article_type, document_enum = self._find_article_type(d)
         self.base_metadata["doctype"] = document_enum

diff --git a/tests/stubdata/input/els_list.xml b/tests/stubdata/input/els_list.xml
diff --git a/tests/stubdata/output/els_list.json b/tests/stubdata/output/els_list.json
diff --git a/tests/stubdata/output/jats_nature_41467_2023_Article_40261_nlm.json b/tests/stubdata/output/jats_nature_41467_2023_Article_40261_nlm.json
@@ -234,4 +234,4 @@
   "title": {
     "textEnglish": "Author Correction: Super-resolved trajectory-derived nanoclustering analysis using spatiotemporal indexing"
   }
-}
+}