add mypy typing #203

Merged: 11 commits, Dec 19, 2022
Changes from 8 commits
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.6', '3.7', '3.8', '3.9']
python-version: ['3.7', '3.8', '3.9']

steps:
- uses: actions/checkout@v2
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
@@ -9,3 +9,8 @@ repos:
language_version: python3
repo: https://github.com/PyCQA/isort
rev: 5.10.1
- hooks:
- id: mypy
additional_dependencies: [types-requests, types-mock, types-six, lxml-stubs]
repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.982
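
Note on the hook above: mirrors-mypy runs mypy inside pre-commit, and the additional_dependencies entry installs stub packages (types-requests, types-mock, types-six, lxml-stubs) so third-party imports are actually checked rather than silently treated as Any. A minimal sketch of the effect; the example code is illustrative, not from this repo:

```python
# Sketch: without the types-requests stub, mypy treats `requests` as
# untyped and this assignment goes unchecked; with the stub installed
# it is verified against the declared return types.
import requests

resp = requests.get("https://example.com", timeout=30)
status: int = resp.status_code  # checked only when types-requests is present
```

The hook can be exercised locally with `pre-commit run mypy --all-files`.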
45 changes: 24 additions & 21 deletions extruct/_extruct.py
@@ -1,5 +1,8 @@
from __future__ import annotations

import logging
import warnings
from typing import Any, Callable

from extruct.dublincore import DublinCoreExtractor
from extruct.jsonld import JsonLdExtractor
@@ -15,17 +18,17 @@


def extract(
htmlstring,
base_url=None,
encoding="UTF-8",
syntaxes=SYNTAXES,
errors="strict",
uniform=False,
return_html_node=False,
schema_context="http://schema.org",
with_og_array=False,
**kwargs
):
htmlstring: str | bytes,
base_url: str | None = None,
encoding: str = "UTF-8",
syntaxes: list[str] = SYNTAXES,
errors: str = "strict",
uniform: bool = False,
return_html_node: bool = False,
schema_context: str = "http://schema.org",
with_og_array: bool = False,
url: str | None = None, # deprecated
) -> dict[str, list[dict[str, Any]]]:
"""
htmlstring: string with valid html document;
base_url: base url of the html document
@@ -44,15 +47,13 @@ def extract(
The feature is supported only by microdata syntax.
Each node is of `lxml.etree.Element` type.
schema_context: schema's context for current page"""
if base_url is None and "url" in kwargs:
if base_url is None and url is not None:
warnings.warn(
'"url" argument is deprecated, please use "base_url"',
DeprecationWarning,
stacklevel=2,
)
base_url = kwargs.pop("url")
if kwargs:
raise TypeError("Unexpected keyword arguments")
base_url = url
if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
raise ValueError(
"syntaxes must be a list with any or all (default) of"
@@ -112,7 +113,7 @@ def extract(
tree,
)
)
output = {}
output: dict[str, list[dict[str, Any]]] = {}
for syntax, extract, document in processors:
try:
output[syntax] = list(extract(document, base_url=base_url))
@@ -124,7 +125,9 @@ def extract(
if errors == "strict":
raise
if uniform:
uniform_processors = []
uniform_processors: list[
tuple[str, Callable[..., Any], list[Any], str | None]
] = []
if "microdata" in syntaxes:
uniform_processors.append(
(
@@ -162,14 +165,14 @@
)
)

for syntax, uniform, raw, schema_context in uniform_processors:
for syntax, uniform_fn, raw, schema_ctx in uniform_processors:
try:
if syntax == "opengraph":
output[syntax] = uniform(raw, with_og_array=with_og_array)
output[syntax] = uniform_fn(raw, with_og_array=with_og_array)
elif syntax == "dublincore":
output[syntax] = uniform(raw)
output[syntax] = uniform_fn(raw)
else:
output[syntax] = uniform(raw, schema_context)
output[syntax] = uniform_fn(raw, schema_ctx)
except Exception as e:
if errors == "ignore":
output[syntax] = []
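
A usage sketch for the newly typed `extract()` above; the HTML and URL are illustrative:

```python
# Sketch: calling the annotated extract(). The deprecated url= keyword
# still works, but now emits a DeprecationWarning and is copied into
# base_url instead of going through **kwargs.
from extruct import extract

html = "<html><head><title>t</title></head><body></body></html>"
data = extract(html, base_url="https://example.com", syntaxes=["microdata"])
# data is a dict[str, list[dict[str, Any]]], e.g. {"microdata": []}
```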
3 changes: 2 additions & 1 deletion extruct/dublincore.py
@@ -1,3 +1,4 @@
# mypy: disallow_untyped_defs=False
import re

from w3lib.html import strip_html5_whitespace
@@ -110,7 +111,7 @@ def get_lower_attrib(name):
return re.sub(r".*\.", "", name).lower()


class DublinCoreExtractor(object):
Contributor Author: got rid of some python2 style stuff

class DublinCoreExtractor:
"""DublinCore extractor following extruct API."""

def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
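
The `# mypy: disallow_untyped_defs=False` comment added at the top of this module (and several below) is a per-file configuration directive: it relaxes the stricter project-wide setting so legacy modules can be typed incrementally. A generic sketch of the pattern:

```python
# mypy: disallow_untyped_defs=False
# Sketch: this inline directive overrides the project-wide mypy option
# for this module only, so untyped legacy functions do not fail the hook.


def legacy_helper(value):  # no annotations required in this module
    return str(value)
```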
5 changes: 3 additions & 2 deletions extruct/jsonld.py
@@ -1,3 +1,4 @@
# mypy: disallow_untyped_defs=False
# -*- coding: utf-8 -*-
"""
JSON-LD extractor
@@ -14,7 +15,7 @@
HTML_OR_JS_COMMENTLINE = re.compile(r"^\s*(//.*|<!--.*-->)")


class JsonLdExtractor(object):
class JsonLdExtractor:
_xp_jsonld = lxml.etree.XPath(
'descendant-or-self::script[@type="application/ld+json"]'
)
@@ -26,7 +27,7 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
def extract_items(self, document, base_url=None):
return [
item
for items in map(self._extract_items, self._xp_jsonld(document))
for items in map(self._extract_items, self._xp_jsonld(document)) # type: ignore[arg-type]
if items
for item in items
if item
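
The `# type: ignore[arg-type]` above is needed because lxml-stubs appear to type the result of calling a compiled XPath as a broad union (roughly bool | float | str | list), which mypy cannot prove is an iterable of elements. A sketch of the issue and one narrowing alternative, under that assumption:

```python
# Sketch: the result of calling a compiled XPath is a union type, so
# iterating it needs either a runtime narrowing or a targeted ignore.
import lxml.etree

xp = lxml.etree.XPath("descendant-or-self::script")
doc = lxml.etree.fromstring("<html><body><script/></body></html>")
result = xp(doc)
assert isinstance(result, list)  # narrows the union for mypy
for node in result:
    print(node.tag)
```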
3 changes: 2 additions & 1 deletion extruct/microformat.py
@@ -1,7 +1,8 @@
# mypy: disallow_untyped_defs=False
import mf2py


class MicroformatExtractor(object):
class MicroformatExtractor:
def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
return list(self.extract_items(htmlstring, base_url=base_url))

3 changes: 2 additions & 1 deletion extruct/opengraph.py
@@ -1,3 +1,4 @@
# mypy: disallow_untyped_defs=False
import re

from extruct.utils import parse_html
@@ -15,7 +16,7 @@
}


class OpenGraphExtractor(object):
class OpenGraphExtractor:
"""OpenGraph extractor following extruct API."""

def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
5 changes: 3 additions & 2 deletions extruct/rdfa.py
@@ -1,3 +1,4 @@
# mypy: disallow_untyped_defs=False
# -*- coding: utf-8 -*-
"""
RDFa extractor
@@ -16,7 +17,7 @@
from pyRdfa import pyRdfa as PyRdfa
from pyRdfa.initialcontext import initial_context
from rdflib import Graph
from rdflib import logger as rdflib_logger
from rdflib import logger as rdflib_logger # type: ignore[no-redef]

from extruct.utils import parse_xmldom_html

@@ -37,7 +38,7 @@
)


class RDFaExtractor(object):
class RDFaExtractor:
def _replaceNS(self, prop, html_element, head_element):
"""Expand namespace to match with returned json (e.g.: og -> 'http://ogp.me/ns#')"""

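
The `# type: ignore[no-redef]` above relates to mypy's rule that a name may be bound only once per scope; when an imported name is rebound (for example in a try/except fallback), the second binding is flagged. A generic sketch of the pattern, not the exact context in rdfa.py:

```python
# Sketch: a fallback that rebinds an imported name trips [no-redef].
import logging

try:
    from rdflib import logger as rdflib_logger
except ImportError:
    rdflib_logger = logging.getLogger("rdflib")  # type: ignore[no-redef]
```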
24 changes: 15 additions & 9 deletions extruct/tool.py
@@ -1,5 +1,8 @@
from __future__ import annotations

import argparse
import json
from typing import Any

import requests

@@ -8,14 +11,17 @@


def metadata_from_url(
url,
syntaxes=SYNTAXES,
uniform=False,
schema_context="http://schema.org",
errors="strict",
):
url: str,
syntaxes: list[str] = SYNTAXES,
uniform: bool = False,
schema_context: str = "http://schema.org",
errors: str = "strict",
) -> dict[str, Any]:
resp = requests.get(url, timeout=30)
result = {"url": url, "status": "{} {}".format(resp.status_code, resp.reason)}
result: dict[str, Any] = {
"url": url,
"status": "{} {}".format(resp.status_code, resp.reason),
}
try:
resp.raise_for_status()
except requests.exceptions.HTTPError:
@@ -33,7 +39,7 @@ def metadata_from_url(
return result


def main(args=None):
def main(args: Any | None = None) -> Any:
parser = argparse.ArgumentParser(prog="extruct", description=__doc__)
arg = parser.add_argument
arg("url", help="The target URL")
@@ -51,7 +57,7 @@ def main(args=None):
default=False,
help="""If True uniform output format of all syntaxes to a list of dicts.
Returned dicts structure:
{'@context': 'http://example.com',
{'@context': 'http://example.com',
'@type': 'example_type',
/* All other the properties in keys here */
}""",
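
A usage sketch for the typed `metadata_from_url()` above; it performs a real HTTP request, and the URL is illustrative:

```python
# Sketch: the helper returns a dict with the fetch status plus the
# extracted metadata for each requested syntax.
from extruct.tool import metadata_from_url

result = metadata_from_url("https://example.com", syntaxes=["opengraph"])
print(result["status"])         # e.g. "200 OK"
print(result.get("opengraph"))  # extracted metadata, if the fetch succeeded
```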
4 changes: 3 additions & 1 deletion extruct/uniform.py
@@ -1,4 +1,6 @@
# mypy: disallow_untyped_defs=False
import copy
from typing import Any

from six.moves.urllib.parse import urljoin, urlparse

@@ -10,7 +12,7 @@ def _uopengraph(extracted, with_og_array=False):
for obj in extracted:
# In order of appearance in the page
properties = list(obj["properties"])
flattened = {}
flattened: dict[Any, Any] = {}

for k, v in properties:
if k not in flattened.keys():
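
The `flattened: dict[Any, Any]` annotation (like `output: dict[str, list[dict[str, Any]]]` in _extruct.py above) is there because mypy cannot infer a useful type for a bare `{}` that is filled in conditionally. A minimal sketch, with hypothetical names:

```python
# Sketch: under strict settings mypy asks for an annotation on a bare {}
# when the key/value types cannot be inferred from the surrounding code.
from __future__ import annotations

from typing import Any


def flatten_example(properties: list[tuple[Any, Any]]) -> dict[Any, Any]:
    flattened: dict[Any, Any] = {}  # silences "need type annotation"
    for k, v in properties:
        flattened.setdefault(k, v)  # keep the first value seen, mixed types
    return flattened
```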
1 change: 1 addition & 0 deletions extruct/utils.py
@@ -1,3 +1,4 @@
# mypy: disallow_untyped_defs=False
# -*- coding: utf-8 -*-
import lxml.html

14 changes: 9 additions & 5 deletions extruct/w3cmicrodata.py
@@ -1,3 +1,4 @@
# mypy: disallow_untyped_defs=False
"""
HTML Microdata parser

@@ -9,8 +10,11 @@

"""

from __future__ import annotations

import collections
from functools import partial
from typing import Any, Set

try:
from urlparse import urljoin
@@ -43,7 +47,7 @@
)


class LxmlMicrodataExtractor(object):
class LxmlMicrodataExtractor:
# iterate in document order (used below for fast get_docid)
_xp_item = lxml.etree.XPath("descendant-or-self::*[@itemscope]")
_xp_prop = lxml.etree.XPath(
@@ -70,14 +74,14 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):

def extract_items(self, document, base_url):
itemids = self._build_itemids(document)
items_seen = set()
items_seen: Set[Any] = set()
return [
item
for item in (
self._extract_item(
it, items_seen=items_seen, base_url=base_url, itemids=itemids
)
for it in self._xp_item(document)
for it in self._xp_item(document) # type: ignore[union-attr]
)
if item
]
@@ -88,7 +92,7 @@ def get_docid(self, node, itemids):
def _build_itemids(self, document):
"""Build itemids for a fast get_docid implementation. Use document order."""
root = document.getroottree().getroot()
return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))}
return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))} # type: ignore[arg-type]

def _extract_item(self, node, items_seen, base_url, itemids):
itemid = self.get_docid(node, itemids)
@@ -160,7 +164,7 @@ def _extract_item(self, node, items_seen, base_url, itemids):
return item

def _extract_properties(self, node, items_seen, base_url, itemids):
for prop in self._xp_prop(node):
for prop in self._xp_prop(node): # type: ignore[union-attr]
for p, v in self._extract_property(
prop, items_seen=items_seen, base_url=base_url, itemids=itemids
):
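
A small observation on the hunk above: since this file now has `from __future__ import annotations`, annotations are not evaluated at runtime, so the builtin `set[Any]` spelling would also have worked on Python 3.7+; `typing.Set` is simply the more conservative, equivalent choice:

```python
# Sketch: both spellings are equivalent under the future import.
from __future__ import annotations

from typing import Any, Set

items_seen_a: Set[Any] = set()
items_seen_b: set[Any] = set()
```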