scrapinghub · BurnzZ · Jun 1, 2022 · Jun 1, 2022 · Jun 1, 2022 · Jun 1, 2022
diff --git a/setup.py b/setup.py
@@ -25,6 +25,7 @@
         'url-matcher',
         'multidict',
         'w3lib >= 1.22.0',
+        'yarl',
     ],
     classifiers=[
         'Development Status :: 2 - Pre-Alpha',

diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py
@@ -6,6 +6,8 @@
 
 import parsel
 from web_poet.page_inputs import (
+    RequestUrl,
+    ResponseUrl,
     HttpRequest,
     HttpResponse,
     HttpRequestBody,
@@ -16,6 +18,33 @@
 )
 
 
+@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
+def test_url(cls):
+    url_value = "https://example.com/category/product?query=123&id=xyz#frag1"
+    url = cls(url_value)
+
+    assert str(url) == url_value
+    assert url.scheme == "https"
+    assert url.host == "example.com"
+    assert url.path == "/category/product"
+    assert url.query_string == "query=123&id=xyz"
+    assert url.fragment == "frag1"
+
+    # Re-wrapping an existing instance must preserve the URL.
+    assert str(cls(url)) == url_value
+
+
+@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
+def test_url_encoding(cls):
+    url_value = "http://εμπορικόσήμα.eu/путь/這裡"
+    # Default (encoded=True): the URL text is expected back exactly as given.
+    url = cls(url_value)
+    assert str(url) == url_value
+    # encoded=False: expect an IDNA host and a percent-encoded path.
+    url = cls(url_value, encoded=False)
+    assert str(url) == "http://xn--jxagkqfkduily1i.eu/%D0%BF%D1%83%D1%82%D1%8C/%E9%80%99%E8%A3%A1"
+
+
 @pytest.mark.parametrize("body_cls", [HttpRequestBody, HttpResponseBody])
 def test_http_body_hashable(body_cls):
     http_body = body_cls(b"content")

diff --git a/web_poet/__init__.py b/web_poet/__init__.py
@@ -10,8 +10,8 @@
     HttpRequestBody,
     HttpResponseBody,
     Meta,
-    RequestURL,
-    ResponseURL,
+    RequestUrl,
+    ResponseUrl,
 )
 from .overrides import PageObjectRegistry, consume_modules, OverrideRule
 

diff --git a/web_poet/mixins.py b/web_poet/mixins.py
@@ -67,7 +67,7 @@ def base_url(self) -> str:
         # FIXME: move it to HttpResponse
         if self._cached_base_url is None:
             text = self.html[:4096]
-            self._cached_base_url = get_base_url(text, self.url)
+            self._cached_base_url = get_base_url(text, str(self.url))
         return self._cached_base_url
 
     def urljoin(self, url: str) -> str:

diff --git a/web_poet/page_inputs/__init__.py b/web_poet/page_inputs/__init__.py
@@ -7,7 +7,7 @@
     HttpResponseHeaders,
     HttpRequestBody,
     HttpResponseBody,
-    RequestURL,
-    ResponseURL
+    RequestUrl,
+    ResponseUrl
 )
 from .browser import BrowserHtml
diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py
@@ -9,6 +9,7 @@
     http_content_type_encoding
 )
 
+import yarl
 from web_poet._base import _HttpHeaders
 from web_poet.utils import memoizemethod_noargs
 from web_poet.mixins import SelectableMixin
@@ -18,13 +19,71 @@
 _AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]]
 
 
-class ResponseURL(str):
-    """ URL of the response """
+class _Url:
+    """Base class of the URL page inputs; wraps a :class:`yarl.URL`.
+
+    :param url: the URL; it is converted with ``str()`` before parsing.
+    :param encoded: forwarded to :class:`yarl.URL` unchanged.
+    """
+
+    def __init__(self, url: Union[str, yarl.URL], encoded: bool = True):
+        self.__url = yarl.URL(str(url), encoded=encoded)
+
+    def __str__(self) -> str:
+        return str(self.__url)
+
+    def __repr__(self) -> str:
+        return f'{type(self).__name__}({str(self.__url)!r})'
+
+    def __eq__(self, other) -> bool:
+        # Compared by string form, so an equal ``str`` also matches.
+        return str(self.__url) == str(other)
+
+    def __hash__(self) -> int:
+        # A class defining __eq__ without __hash__ is unhashable. Hash the
+        # string that __eq__ compares so equal values hash alike.
+        return hash(str(self.__url))
+
+    @property
+    def scheme(self) -> str:
+        return self.__url.scheme
+
+    @property
+    def host(self) -> Optional[str]:
+        return self.__url.host
+
+    @property
+    def path(self) -> str:
+        return self.__url.path
+
+    @property
+    def query_string(self) -> str:
+        return self.__url.query_string
+
+    @property
+    def fragment(self) -> str:
+        return self.__url.fragment
+
+
+class ResponseUrl(_Url):
+    """ URL of the response
+
+    :param url: a string representation of a URL.
+    :param encoded: If set to False, the given ``url`` would be auto-encoded.
+        However, there's no guarantee that correct encoding is used. Thus,
+        it's recommended to keep this at the *default* ``True`` value.
+    """
     pass
 
 
-class RequestURL(str):
-    """ URL of the request """
+class RequestUrl(_Url):
+    """ URL of the request
+
+    :param url: a string representation of a URL.
+    :param encoded: If set to False, the given ``url`` would be auto-encoded.
+        However, there's no guarantee that correct encoding is used. Thus,
+        it's recommended to keep this at the *default* ``True`` value.
+    """
     pass
 
 
@@ -162,7 +209,7 @@ class HttpRequest:
     **web-poet** like :class:`~.HttpClient`.
     """
 
-    url: RequestURL = attrs.field(converter=RequestURL)
+    url: RequestUrl = attrs.field(converter=RequestUrl)
     method: str = attrs.field(default="GET", kw_only=True)
     headers: HttpRequestHeaders = attrs.field(
         factory=HttpRequestHeaders, converter=HttpRequestHeaders, kw_only=True
@@ -195,7 +242,7 @@ class HttpResponse(SelectableMixin):
     is auto-detected from headers and body content.
     """
 
-    url: ResponseURL = attrs.field(converter=ResponseURL)
+    url: ResponseUrl = attrs.field(converter=ResponseUrl)
     body: HttpResponseBody = attrs.field(converter=HttpResponseBody)
     status: Optional[int] = attrs.field(default=None, kw_only=True)
     headers: HttpResponseHeaders = attrs.field(factory=HttpResponseHeaders,