Remove latex markup from tex-math tags (#123)
* new test files with tex markup

* added function to remove latex markup from tex-math tags

* Delete tests/stubdata/output/jats_edp_jnwpu_40_96_tmp.json

* lint fix

* fixing test output

---------

Co-authored-by: Mugdha Polimera <[email protected]>
3 people authored Aug 20, 2024
1 parent 8142f83 commit 4874484
Showing 6 changed files with 11,482 additions and 72 deletions.
18 changes: 18 additions & 0 deletions adsingestp/parsers/base.py
@@ -516,6 +516,24 @@ def bsstrtodict(self, input_xml, parser="lxml-xml"):

        return bs4.BeautifulSoup(input_xml, parser)

    def _remove_latex(self, r):
        """
        Removes LaTeX markup inside <tex-math> tags from input BeautifulSoup object
        :param r: BeautifulSoup object
        :return: r: BeautifulSoup object with the LaTeX document wrapper removed from <tex-math> tags
        """
        math_elements = r.find_all("tex-math")
        for e in math_elements:
            text = e.get_text()
            begin = text.find("\\begin{document}")
            end = text.find("\\end{document}")
            begin_len = len("\\begin{document}")
            if begin == -1 or end == -1:
                continue
            newtext = text[begin + begin_len : end]
            e.string = newtext
        return r

    def _detag(self, r, tags_keep):
        """
        Removes tags from input BeautifulSoup object
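For illustration, here is a minimal, self-contained sketch (not part of the commit) of what the new _remove_latex helper does to a <tex-math> element: everything outside the \begin{document} ... \end{document} body, such as a \documentclass preamble, is dropped. The sample XML fragment is invented for this example, and bs4 plus lxml (for the "lxml-xml" parser) are assumed to be installed.

# Illustrative sketch only; mirrors the logic of the new _remove_latex helper.
import bs4

xml = (
    "<title>Pulsations of <inline-formula><tex-math>"
    r"\documentclass{aa}\begin{document}$\delta$ Scuti\end{document}"
    "</tex-math></inline-formula> stars</title>"
)
soup = bs4.BeautifulSoup(xml, "lxml-xml")

for e in soup.find_all("tex-math"):
    text = e.get_text()
    begin = text.find("\\begin{document}")
    end = text.find("\\end{document}")
    if begin == -1 or end == -1:
        continue  # no LaTeX document wrapper; leave the element alone
    # keep only the body between \begin{document} and \end{document}
    e.string = text[begin + len("\\begin{document}") : end]

print(soup.find("tex-math").get_text())  # -> $\delta$ Scuti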
28 changes: 23 additions & 5 deletions adsingestp/parsers/jats.py
@@ -603,6 +603,7 @@ def _parse_title_abstract(self):
# all title footnotes:
for df in title_group.find_all("fn"):
key = df.get("id", None)
df = self._remove_latex(df)
note = self._detag(df, self.HTML_TAGSET["abstract"]).strip()
if key and note:
title_fn_dict[key] = note
@@ -613,7 +614,10 @@
if title_fn_dict.get(key, None):
title_fn_list.append(title_fn_dict.get(key, None))
dx.decompose()
# strip latex out of title
title = self._remove_latex(title)
art_title = self._detag(title, self.HTML_TAGSET["title"]).strip()

title_notes = []
if title_fn_list:
title_notes.extend(title_fn_list)
@@ -626,6 +630,7 @@
if title_fn_dict.get(key, None):
subtitle_fn_list.append(title_fn_dict.get(key, None))
dx.decompose()
subtitle = self._remove_latex(subtitle)
sub_title = self._detag(subtitle, self.HTML_TAGSET["title"]).strip()
subtitle_notes = []
if subtitle_fn_list:
@@ -644,13 +649,15 @@
abstract_all = self.article_meta.find("abstract").find_all("p")
abstract_paragraph_list = list()
for paragraph in abstract_all:
paragraph = self._remove_latex(paragraph)
para = self._detag(paragraph, self.HTML_TAGSET["abstract"])
abstract_paragraph_list.append(para)
self.base_metadata["abstract"] = "\n".join(abstract_paragraph_list)
if title_fn_list:
self.base_metadata["abstract"] += " " + " ".join(title_fn_list)
else:
abs_raw = self.article_meta.find("abstract")
abs_raw = self._remove_latex(abs_raw)
abs_txt = self._detag(abs_raw, self.HTML_TAGSET["abstract"])
self.base_metadata["abstract"] = abs_txt

@@ -720,8 +727,10 @@ def _parse_keywords(self):
for kk in keys_uat_test:
# Check for UAT first:
if kk["content-type"] == "uat-code":
kk = self._remove_latex(kk)
keyid = self._detag(kk, self.HTML_TAGSET["keywords"])
if kk["content-type"] == "term":
kk = self._remove_latex(kk)
keystring = self._detag(kk, self.HTML_TAGSET["keywords"])

if keyid or keystring:
@@ -730,18 +739,21 @@
if not keys_uat:
keys_misc_test = kg.find_all("kwd")
for kk in keys_misc_test:
kk = self._remove_latex(kk)
keys_misc.append(self._detag(kk, self.HTML_TAGSET["keywords"]))

# Then check for AAS:
if kg.get("kwd-group-type", "") == "AAS":
keys_aas_test = kg.find_all("kwd")
for kk in keys_aas_test:
kk = self._remove_latex(kk)
keys_aas.append(self._detag(kk, self.HTML_TAGSET["keywords"]))

# If all else fails, just search for 'kwd'
if (not keys_uat) and (not keys_aas):
keys_misc_test = kg.find_all("kwd")
for kk in keys_misc_test:
kk = self._remove_latex(kk)
keys_misc.append(self._detag(kk, self.HTML_TAGSET["keywords"]))

if keys_uat:
@@ -772,6 +784,7 @@ def _parse_keywords(self):
subjects = sg.find_all("subject")

for k in subjects:
k = self._remove_latex(k)
keys_out.append(
{
"system": "subject",
@@ -789,13 +802,16 @@ def _parse_conference(self):
event_meta = self.article_meta.find("conference")

if event_meta.find("conf-name"):
self.base_metadata["conf_name"] = self._detag(event_meta.find("conf-name"), [])
conf_name = self._remove_latex(event_meta.find("conf-name", ""))
self.base_metadata["conf_name"] = self._detag(conf_name, [])

if event_meta.find("conf-loc"):
self.base_metadata["conf_location"] = self._detag(event_meta.find("conf-loc"), [])
conf_loc = self._remove_latex(event_meta.find("conf-loc", ""))
self.base_metadata["conf_location"] = self._detag(conf_loc, [])

if event_meta.find("conf-date"):
self.base_metadata["conf_date"] = self._detag(event_meta.find("conf-date"), [])
conf_date = self._remove_latex(event_meta.find("conf-date", ""))
self.base_metadata["conf_date"] = self._detag(conf_date, [])

def _parse_pub(self):
journal = None
@@ -807,14 +823,16 @@ def _parse_pub(self):
journal = self.journal_meta.find("journal-title")

if journal:
journal = self._remove_latex(journal)
self.base_metadata["publication"] = self._detag(journal, [])

if self.journal_meta.find("publisher") and self.journal_meta.find("publisher").find(
"publisher-name"
):
self.base_metadata["publisher"] = self._detag(
self.journal_meta.find("publisher").find("publisher-name"), []
publisher_name = self._remove_latex(
self.journal_meta.find("publisher").find("publisher-name")
)
self.base_metadata["publisher"] = self._detag(publisher_name, [])

issn_all = self.journal_meta.find_all("issn")
issns = []
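All of the jats.py changes follow the same pattern: each BeautifulSoup fragment (title footnote, title, subtitle, abstract paragraph, keyword, subject, conference field, journal and publisher name) is passed through _remove_latex before _detag, so the LaTeX document wrapper never reaches the detagged metadata. Below is a minimal, hedged sketch of that ordering; ToyParser, its simplified _detag, and the sample abstract are invented stand-ins for illustration, not the library's actual code.

# Illustrative stand-in only; shows the remove-latex-then-detag ordering
# added throughout jats.py, not the real adsingestp classes.
import bs4


class ToyParser:
    def _remove_latex(self, r):
        # same idea as the new base.py helper: keep only the text between
        # \begin{document} and \end{document} inside <tex-math> tags
        for e in r.find_all("tex-math"):
            text = e.get_text()
            begin = text.find("\\begin{document}")
            end = text.find("\\end{document}")
            if begin != -1 and end != -1:
                e.string = text[begin + len("\\begin{document}") : end]
        return r

    def _detag(self, r, tags_keep):
        # toy version: the real _detag keeps a configurable tag set
        return r.get_text()

    def parse_abstract(self, abstract):
        paragraphs = []
        for p in abstract.find_all("p"):
            p = self._remove_latex(p)  # new step: strip the LaTeX wrapper first
            paragraphs.append(self._detag(p, []).strip())
        return "\n".join(paragraphs)


xml = (
    "<abstract><p>We derive <tex-math>"
    r"\documentclass{aa}\begin{document}$T_{\rm eff}$\end{document}"
    "</tex-math> for our sample.</p></abstract>"
)
soup = bs4.BeautifulSoup(xml, "lxml-xml")
print(ToyParser().parse_abstract(soup))  # -> We derive $T_{\rm eff}$ for our sample.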
2 changes: 1 addition & 1 deletion setup.cfg
@@ -9,4 +9,4 @@ exclude = docs
# E401 multiple imports on one line
# E501 line too long
# The rest are the default ignored warnings.
extend-ignore = E265,E266,E123,E133,E226,E241,E242,E301,E401,E501
extend-ignore = E265,E266,E123,E133,E226,E241,E242,E301,E401,E501,E203
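A likely motivation for the new E203 entry (an inference; the commit message only says "lint fix"): pycodestyle's E203 flags whitespace before ':', which the slice spacing in the new base.py helper would trigger.

# From the new _remove_latex helper; flake8/pycodestyle E203 ("whitespace
# before ':'") would flag the space before the slice colon, so the warning
# is presumably ignored project-wide instead.
newtext = text[begin + begin_len : end]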