diff --git a/setup.py b/setup.py index 6b342aad..f661be6a 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ 'url-matcher', 'multidict', 'w3lib >= 1.22.0', + 'yarl', ], classifiers=[ 'Development Status :: 2 - Pre-Alpha', diff --git a/tests/test_mixins.py b/tests/test_mixins.py index 73199601..8176bf87 100644 --- a/tests/test_mixins.py +++ b/tests/test_mixins.py @@ -16,7 +16,7 @@ def my_page(book_list_html_response): def test_url(my_page): - assert my_page.url == 'http://books.toscrape.com/index.html' + assert str(my_page.url) == 'http://books.toscrape.com/index.html' def test_html(my_page, book_list_html): @@ -56,7 +56,7 @@ def test_custom_baseurl(): ) page = MyPage(response=response) - assert page.url == 'http://www.example.com/path' + assert str(page.url) == 'http://www.example.com/path' assert page.base_url == 'http://example.com/foo/' assert page.urljoin("bar") == 'http://example.com/foo/bar' assert page.urljoin("http://example.com/1") == "http://example.com/1" diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index 65934a10..ee3f122e 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -4,8 +4,11 @@ import pytest import requests +import yarl import parsel from web_poet.page_inputs import ( + RequestUrl, + ResponseUrl, HttpRequest, HttpResponse, HttpRequestBody, @@ -16,6 +19,70 @@ ) +@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) +def test_url(cls): + url_value = "https://example.com/category/product?query=123&id=xyz#frag1" + + url = cls(url_value) + + assert str(url) == url_value + assert url.scheme == "https" + assert url.host == "example.com" + assert url.path == "/category/product" + assert url.query_string == "query=123&id=xyz" + assert url.fragment == "frag1" + + new_url = cls(url) + assert url == new_url + assert str(url) == str(new_url) + + +@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) +def test_url_init(cls): + # via string + url_value = "https://example.com" + url = cls(url_value) + 
+ # via yarl + assert cls(yarl.URL(url_value)) == url + + # via _Url subclasses + assert cls(cls(url_value)) == url + + +@pytest.mark.parametrize("compare_cls", [True, False]) +@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) +def test_url_equality(compare_cls, cls): + # Trailing / in the base URL + no_trail = cls("https://example.com") + with_trail = "https://example.com/" + if compare_cls: + with_trail = cls(with_trail) + assert no_trail == with_trail + else: + assert no_trail != with_trail + assert str(no_trail) != str(with_trail) + + # Trailing / in the path URL + no_trail = cls("https://example.com/foo") + with_trail = "https://example.com/foo/" + if compare_cls: + with_trail = cls(with_trail) + assert no_trail != with_trail # Should not be equal + assert str(no_trail) != str(with_trail) + + +@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl]) +def test_url_encoding(cls): + url_value = "http://εμπορικόσήμα.eu/путь/這裡" + + url = cls(url_value) + str(url) == url_value + + url = cls(url_value, encoded=False) + str(url) == "http://xn--jxagkqfkduily1i.eu/%D0%BF%D1%83%D1%82%D1%8C/%E9%80%99%E8%A3%A1" + + @pytest.mark.parametrize("body_cls", [HttpRequestBody, HttpResponseBody]) def test_http_body_hashable(body_cls): http_body = body_cls(b"content") @@ -62,17 +129,18 @@ def test_http_response_body_json(): @pytest.mark.parametrize( - ["cls", "body_cls"], + ["cls", "body_cls", "url_cls"], [ - (HttpRequest, HttpRequestBody), - (HttpResponse, HttpResponseBody), + (HttpRequest, HttpRequestBody, RequestUrl), + (HttpResponse, HttpResponseBody, ResponseUrl), ] ) -def test_http_defaults(cls, body_cls): +def test_http_defaults(cls, body_cls, url_cls): http_body = body_cls(b"content") obj = cls("url", body=http_body) - assert obj.url == "url" + assert isinstance(obj.url, url_cls) + assert str(obj.url) == "url" assert obj.body == b"content" assert not obj.headers assert obj.headers.get("user-agent") is None @@ -164,7 +232,8 @@ def test_http_headers_init_dict(cls, 
headers_cls): def test_http_request_init_minimal(): req = HttpRequest("url") - assert req.url == "url" + assert isinstance(req.url, RequestUrl) + assert str(req.url) == "url" assert req.method == "GET" assert isinstance(req.method, str) assert not req.headers diff --git a/tests/test_pages.py b/tests/test_pages.py index da4a55fc..878ba96a 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -34,7 +34,7 @@ class MyWebPage(ItemWebPage): def to_item(self) -> dict: return { - 'url': self.url, + 'url': str(self.url), 'title': self.css('title::text').get().strip(), } diff --git a/tests/test_requests.py b/tests/test_requests.py index 9e6fef57..4e7dd5e7 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -3,6 +3,7 @@ import pytest from web_poet.exceptions import RequestBackendError, HttpResponseError from web_poet.page_inputs import ( + ResponseUrl, HttpClient, HttpRequest, HttpResponse, @@ -37,7 +38,8 @@ async def test_perform_request_from_httpclient(async_mock): response = await client.get(url) # The async downloader implementation should return the HttpResponse - assert response.url == url + assert isinstance(response.url, ResponseUrl) + assert str(response.url) == url assert isinstance(response, HttpResponse) @@ -161,8 +163,9 @@ async def test_http_client_execute(async_mock): request = HttpRequest("url-1") response = await client.execute(request) + assert isinstance(response.url, ResponseUrl) assert isinstance(response, HttpResponse) - assert response.url == "url-1" + assert str(response.url) == "url-1" @pytest.mark.asyncio diff --git a/web_poet/__init__.py b/web_poet/__init__.py index f0f35d10..3c7cf3a2 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -10,8 +10,8 @@ HttpRequestBody, HttpResponseBody, Meta, - RequestURL, - ResponseURL, + RequestUrl, + ResponseUrl, ) from .overrides import PageObjectRegistry, consume_modules, OverrideRule diff --git a/web_poet/mixins.py b/web_poet/mixins.py index faf6c0f6..78aae99d 100644 --- 
a/web_poet/mixins.py +++ b/web_poet/mixins.py @@ -67,7 +67,7 @@ def base_url(self) -> str: # FIXME: move it to HttpResponse if self._cached_base_url is None: text = self.html[:4096] - self._cached_base_url = get_base_url(text, self.url) + self._cached_base_url = get_base_url(text, str(self.url)) return self._cached_base_url def urljoin(self, url: str) -> str: diff --git a/web_poet/page_inputs/__init__.py b/web_poet/page_inputs/__init__.py index ddb3c65b..5bbe356f 100644 --- a/web_poet/page_inputs/__init__.py +++ b/web_poet/page_inputs/__init__.py @@ -7,7 +7,7 @@ HttpResponseHeaders, HttpRequestBody, HttpResponseBody, - RequestURL, - ResponseURL + RequestUrl, + ResponseUrl ) from .browser import BrowserHtml diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index a3df744d..071dc51f 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -9,6 +9,7 @@ http_content_type_encoding ) +import yarl from web_poet._base import _HttpHeaders from web_poet.utils import memoizemethod_noargs from web_poet.mixins import SelectableMixin @@ -18,13 +19,64 @@ _AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]] -class ResponseURL(str): - """ URL of the response """ +class _Url: + def __init__(self, url: Union[str, yarl.URL, '_Url'], encoded=True): + self._url = yarl.URL(str(url), encoded=encoded) + + def __str__(self) -> str: + return str(self._url) + + def __repr__(self) -> str: + return f'{type(self).__name__}({str(self._url)!r})' + + def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return False + if self._url.path == "/": + if self._url.path == other.path: + return True + return str(self._url) == str(other) + + @property + def scheme(self) -> str: + return self._url.scheme + + @property + def host(self) -> Optional[str]: + return self._url.host + + @property + def path(self) -> str: + return self._url.path + + @property + def query_string(self) -> str: + return self._url.query_string + + 
@property + def fragment(self) -> str: + return self._url.fragment + + +class ResponseUrl(_Url): + """ URL of the response + + :param url: a string representation of a URL. + :param encoded: If set to False, the given ``url`` would be auto-encoded. + However, there's no guarantee that correct encoding is used. Thus, + it's recommended to keep this at the *default* ``True`` value. + """ pass -class RequestURL(str): - """ URL of the request """ +class RequestUrl(_Url): + """ URL of the request + + :param url: a string representation of a URL. + :param encoded: If set to False, the given ``url`` would be auto-encoded. + However, there's no guarantee that correct encoding is used. Thus, + it's recommended to keep this at the *default* ``True`` value. + """ pass @@ -162,7 +214,7 @@ class HttpRequest: **web-poet** like :class:`~.HttpClient`. """ - url: RequestURL = attrs.field(converter=RequestURL) + url: RequestUrl = attrs.field(converter=RequestUrl) method: str = attrs.field(default="GET", kw_only=True) headers: HttpRequestHeaders = attrs.field( factory=HttpRequestHeaders, converter=HttpRequestHeaders, kw_only=True @@ -195,7 +247,7 @@ class HttpResponse(SelectableMixin): is auto-detected from headers and body content. """ - url: ResponseURL = attrs.field(converter=ResponseURL) + url: ResponseUrl = attrs.field(converter=ResponseUrl) body: HttpResponseBody = attrs.field(converter=HttpResponseBody) status: Optional[int] = attrs.field(default=None, kw_only=True) headers: HttpResponseHeaders = attrs.field(factory=HttpResponseHeaders,