-
Notifications
You must be signed in to change notification settings - Fork 113
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
131 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import logging | ||
from typing import Awaitable, Iterator, Tuple | ||
|
||
from playwright.async_api import Error, Page | ||
from scrapy import Spider | ||
from scrapy.http.headers import Headers | ||
from scrapy.utils.python import to_unicode | ||
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding | ||
|
||
|
||
# Module-level logger, registered under the fixed name "scrapy-playwright"
# (not this module's ``__name__``); used below for debug output.
logger = logging.getLogger("scrapy-playwright")
|
||
|
||
async def _maybe_await(obj):
    """Return the result of awaiting *obj* if it is awaitable, else *obj* itself.

    Lets callers treat plain values and awaitables uniformly.
    """
    return (await obj) if isinstance(obj, Awaitable) else obj
|
||
|
||
def _possible_encodings(headers: Headers, text: str) -> Iterator[str]:
    """Lazily yield candidate encodings for a response body.

    The encoding derived from the ``content-type`` header comes first (only
    when that header is present and non-empty), followed by the encoding
    declared inside *text* itself. The yielded values are passed through
    unmodified; callers are expected to skip falsy ones, as ``_encode_body``
    does.
    """
    header_present = bool(headers.get("content-type"))
    if header_present:
        yield http_content_type_encoding(to_unicode(headers["content-type"]))
    yield html_body_declared_encoding(text)
|
||
|
||
def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
    """Encode *text* to bytes using the first candidate encoding that works.

    Candidates come from ``_possible_encodings``; falsy candidates are
    skipped, and a candidate that cannot represent the text
    (``UnicodeEncodeError``) is passed over in favour of the next one.
    If none succeeds, UTF-8 is used.

    Returns a ``(body, encoding)`` tuple.
    """
    candidates = (enc for enc in _possible_encodings(headers, text) if enc)
    for candidate in candidates:
        try:
            return text.encode(candidate), candidate
        except UnicodeEncodeError:
            # This encoding cannot represent the text; try the next one.
            continue
return text.encode("utf-8"), "utf-8" # fallback | ||
|
||
|
||
def _is_safe_close_error(error: Error) -> bool:
    """Tell whether *error* only reports an already-closed browser/page/context.

    The check is made on the tail of ``str(error)``. Taken almost verbatim from
    https://github.com/microsoft/playwright-python/blob/v1.20.0/playwright/_impl/_helper.py#L234-L238
    """
    closed_suffixes = (
        "Browser has been closed",
        "Target page, context or browser has been closed",
    )
    return str(error).endswith(closed_suffixes)
|
||
|
||
# Exact error message that `_get_page_content` compares against
# `Error.message` to decide whether reading the page content should be
# retried. The comparison is an equality check, so keep the text verbatim.
_NAVIGATION_ERROR_MSG = (
    "Unable to retrieve content because the page is navigating and changing the content."
)
|
||
|
||
async def _get_page_content(
    page: Page,
    spider: Spider,
    context_name: str,
    scrapy_request_url: str,
    scrapy_request_method: str,
) -> str:
    """Return ``page.content()``, retrying once on the "page is navigating" error.

    Only an ``Error`` whose message equals ``_NAVIGATION_ERROR_MSG`` triggers
    the single retry; any other ``Error`` is re-raised unchanged, and a
    failure of the retry itself propagates to the caller.

    All arguments except *page* are used solely to enrich the debug log record.
    """
    try:
        return await page.content()
    except Error as err:
        if err.message != _NAVIGATION_ERROR_MSG:
            raise
        log_context = {
            "spider": spider,
            "context_name": context_name,
            "scrapy_request_url": scrapy_request_url,
            "scrapy_request_method": scrapy_request_method,
            "playwright_page_url": page.url,
        }
        logger.debug(
            "Retrying to get content from page '%s', error: '%s'",
            page.url,
            _NAVIGATION_ERROR_MSG,
            extra=log_context,
        )
        # Second and final attempt, made while still handling the first error.
        return await page.content()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<title>Page should redirect</title> | ||
<link rel="canonical" href="index.html"> | ||
<meta name="robots" content="noindex"> | ||
<meta charset="utf-8"> | ||
<meta http-equiv="refresh" content="0; url=index.html"> | ||
</head> | ||
<body> | ||
<p>You should not see this because you are immediately redirected.</p> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters