
Commit: Retry page.content if necessary

elacuesta committed Aug 3, 2023
1 parent 8a73cf3 commit bec45bb
Showing 4 changed files with 131 additions and 41 deletions.
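
In short, the handler no longer calls page.content() directly: the call goes through a helper that retries once when Playwright reports that the page is still navigating. Below is a condensed sketch of the pattern, with illustrative names only; the actual implementation is _get_page_content() in scrapy_playwright/_utils.py, shown in full in the diff that follows.

# Sketch of the retry pattern introduced by this commit (illustrative names;
# the real helper is _get_page_content in scrapy_playwright/_utils.py below).
from playwright.async_api import Error, Page

# Exact message Playwright raises when content() races an in-flight navigation.
NAVIGATION_ERROR_MSG = (
    "Unable to retrieve content because the page is navigating and changing the content."
)


async def get_content_with_retry(page: Page) -> str:
    try:
        return await page.content()
    except Error as err:
        if err.message == NAVIGATION_ERROR_MSG:
            # The navigation (e.g. a meta refresh) has usually settled by now,
            # so the call is retried once before giving up.
            return await page.content()
        raise
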
81 changes: 81 additions & 0 deletions scrapy_playwright/_utils.py
@@ -0,0 +1,81 @@
import logging
from typing import Awaitable, Iterator, Tuple

from playwright.async_api import Error, Page
from scrapy import Spider
from scrapy.http.headers import Headers
from scrapy.utils.python import to_unicode
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding


logger = logging.getLogger("scrapy-playwright")


async def _maybe_await(obj):
    if isinstance(obj, Awaitable):
        return await obj
    return obj


def _possible_encodings(headers: Headers, text: str) -> Iterator[str]:
    if headers.get("content-type"):
        content_type = to_unicode(headers["content-type"])
        yield http_content_type_encoding(content_type)
    yield html_body_declared_encoding(text)


def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
    for encoding in filter(None, _possible_encodings(headers, text)):
        try:
            body = text.encode(encoding)
        except UnicodeEncodeError:
            pass
        else:
            return body, encoding
    return text.encode("utf-8"), "utf-8"  # fallback


def _is_safe_close_error(error: Error) -> bool:
    """
    Taken almost verbatim from
    https://github.com/microsoft/playwright-python/blob/v1.20.0/playwright/_impl/_helper.py#L234-L238
    """
    message = str(error)
    return message.endswith("Browser has been closed") or message.endswith(
        "Target page, context or browser has been closed"
    )


_NAVIGATION_ERROR_MSG = (
    "Unable to retrieve content because the page is navigating and changing the content."
)


async def _get_page_content(
    page: Page,
    spider: Spider,
    context_name: str,
    scrapy_request_url: str,
    scrapy_request_method: str,
) -> str:
    """Wrapper around Page.content to retry if necessary.
    Arguments other than the page are only for logging.
    """
    try:
        return await page.content()
    except Error as err:
        if err.message == _NAVIGATION_ERROR_MSG:
            logger.debug(
                "Retrying to get content from page '%s', error: '%s'",
                page.url,
                _NAVIGATION_ERROR_MSG,
                extra={
                    "spider": spider,
                    "context_name": context_name,
                    "scrapy_request_url": scrapy_request_url,
                    "scrapy_request_method": scrapy_request_method,
                    "playwright_page_url": page.url,
                },
            )
            return await page.content()
        raise
56 changes: 16 additions & 40 deletions scrapy_playwright/handler.py
@@ -4,12 +4,13 @@
from dataclasses import dataclass
from ipaddress import ip_address
from time import time
from typing import Awaitable, Callable, Dict, Generator, Optional, Tuple, Type, TypeVar, Union
from typing import Awaitable, Callable, Dict, Optional, Type, TypeVar, Union

from playwright.async_api import (
    Browser,
    BrowserContext,
    BrowserType,
    Error as PlaywrightError,
    Page,
    PlaywrightContextManager,
    Request as PlaywrightRequest,
@@ -24,13 +25,17 @@
from scrapy.responsetypes import responsetypes
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.misc import load_object
from scrapy.utils.python import to_unicode
from scrapy.utils.reactor import verify_installed_reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding

from scrapy_playwright.headers import use_scrapy_headers
from scrapy_playwright.page import PageMethod
from scrapy_playwright._utils import (
    _encode_body,
    _get_page_content,
    _is_safe_close_error,
    _maybe_await,
)


__all__ = ["ScrapyPlaywrightDownloadHandler"]
@@ -351,7 +356,13 @@ async def _download_request_with_page(
        headers = Headers(await response.all_headers())
        headers.pop("Content-Encoding", None)
        await self._apply_page_methods(page, request, spider)
        body_str = await page.content()
        body_str = await _get_page_content(
            page=page,
            spider=spider,
            context_name=context_name,
            scrapy_request_url=request.url,
            scrapy_request_method=request.method,
        )
        request.meta["download_latency"] = time() - start_time

        if not request.meta.get("playwright_include_page"):
@@ -532,7 +543,7 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
                        "playwright_request_method_new": overrides["method"],
                    },
                )
            except Exception as ex:
            except PlaywrightError as ex:
                if _is_safe_close_error(ex):
                    logger.warning(
                        "Failed processing Playwright request: <%s %s> exc_type=%s exc_msg=%s",
@@ -556,12 +567,6 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)
        return _request_handler


async def _maybe_await(obj):
    if isinstance(obj, Awaitable):
        return await obj
    return obj


def _attach_page_event_handlers(
    page: Page, request: Request, spider: Spider, context_name: str
) -> None:
@@ -651,32 +656,3 @@ async def _log_response(response: PlaywrightResponse) -> None:
        )

    return _log_response


def _possible_encodings(headers: Headers, text: str) -> Generator[str, None, None]:
    if headers.get("content-type"):
        content_type = to_unicode(headers["content-type"])
        yield http_content_type_encoding(content_type)
    yield html_body_declared_encoding(text)


def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
    for encoding in filter(None, _possible_encodings(headers, text)):
        try:
            body = text.encode(encoding)
        except UnicodeEncodeError:
            pass
        else:
            return body, encoding
    return text.encode("utf-8"), "utf-8"  # fallback


def _is_safe_close_error(error: Exception) -> bool:
    """
    Taken verbatim from
    https://github.com/microsoft/playwright-python/blob/v1.20.0/playwright/_impl/_helper.py#L234-L238
    """
    message = str(error)
    return message.endswith("Browser has been closed") or message.endswith(
        "Target page, context or browser has been closed"
    )
13 changes: 13 additions & 0 deletions tests/site/redirect.html
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>Page should redirect</title>
<link rel="canonical" href="index.html">
<meta name="robots" content="noindex">
<meta charset="utf-8">
<meta http-equiv="refresh" content="0; url=index.html">
</head>
<body>
<p>You should not see this because you are immediately redirected.</p>
</body>
</html>
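
The fixture above redirects immediately via a meta refresh, so retrieving the content races an in-flight navigation. For context, a standalone reproduction sketch (not part of this commit) using plain Playwright, assuming the test site is served at a hypothetical http://localhost:8000:

# Standalone sketch: calling page.content() right after goto() on a page with an
# immediate meta refresh can raise the "page is navigating" error that the new
# _get_page_content() helper retries on. The URL below is a hypothetical local server.
import asyncio

from playwright.async_api import Error, async_playwright


async def main() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()
        await page.goto("http://localhost:8000/redirect.html")
        try:
            print(len(await page.content()))
        except Error as err:
            # This message is what _get_page_content() checks before retrying.
            print(f"Playwright error: {err.message}")
        finally:
            await browser.close()


asyncio.run(main())
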
22 changes: 21 additions & 1 deletion tests/test_playwright_requests.py
@@ -8,6 +8,7 @@
import pytest
from playwright.async_api import (
    Dialog,
    Error as PlaywrightError,
    Page as PlaywrightPage,
    TimeoutError as PlaywrightTimeoutError,
)
@@ -122,6 +123,25 @@ async def test_timeout_error(self, caplog):
                f" exc_type={type(excinfo.value)} exc_msg={str(excinfo.value)}",
            ) in caplog.record_tuples

    @pytest.mark.asyncio
    async def test_retry_page_content_still_navigating(self, caplog):
        caplog.set_level(logging.DEBUG)
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                req = Request(server.urljoin("/redirect.html"), meta={"playwright": True})
                resp = await handler._download_request(req, Spider("foo"))

            assert resp.request is req
            assert resp.url == server.urljoin("/index.html")  # redirected
            assert resp.status == 200
            assert "playwright" in resp.flags
            assert (
                "scrapy-playwright",
                logging.DEBUG,
                f"Retrying to get content from page '{req.url}', error: 'Unable to retrieve"
                " content because the page is navigating and changing the content.'",
            ) in caplog.record_tuples

    @pytest.mark.skipif(sys.version_info < (3, 8), reason="AsyncMock was added on Python 3.8")
    @patch("scrapy_playwright.handler.logger")
    @pytest.mark.asyncio
@@ -147,7 +167,7 @@ async def test_route_continue_exception(self, logger):
        playwright_request.all_headers.return_value = {}

        # safe error, only warn
        ex = Exception("Target page, context or browser has been closed")
        ex = PlaywrightError("Target page, context or browser has been closed")
        route.continue_.side_effect = ex
        await req_handler(route, playwright_request)
        logger.warning.assert_called_with(
