diff --git a/src/apify/_actor.py b/src/apify/_actor.py index f60a99df..4f3f032f 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -8,7 +8,6 @@ from lazy_object_proxy import Proxy from pydantic import AliasChoices -from typing_extensions import Self from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars @@ -31,6 +30,8 @@ import logging from types import TracebackType + from typing_extensions import Self + from crawlee.proxy_configuration import _NewUrlFunction from apify._models import Webhook diff --git a/src/apify/storages/_actor_inputs.py b/src/apify/storages/_actor_inputs.py index 9437a578..524ac70b 100644 --- a/src/apify/storages/_actor_inputs.py +++ b/src/apify/storages/_actor_inputs.py @@ -23,12 +23,12 @@ ) - class _RequestDetails(BaseModel): method: HttpMethod payload: str = '' headers: dict[str, str] = Field(default_factory=dict) - user_data: dict[str, str]= Field(default_factory=dict, alias=ActorInputKeys.startUrls.userData) + user_data: dict[str, str] = Field(default_factory=dict, alias=ActorInputKeys.startUrls.userData) + class _RequestsFromUrlInput(_RequestDetails): requests_from_url: str = Field(alias=ActorInputKeys.startUrls.requestsFromUrl) @@ -37,6 +37,7 @@ class _RequestsFromUrlInput(_RequestDetails): class _SimpleUrlInput(_RequestDetails): url: str + class Input(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) start_urls: RequestList @@ -45,11 +46,13 @@ class Input(BaseModel): async def read(cls, raw_input: dict[str, Any], http_client: BaseHttpClient | None = None) -> Self: if ActorInputKeys.startUrls in raw_input: request_list = await _create_request_list( - actor_start_urls_input=raw_input[ActorInputKeys.startUrls], http_client=http_client) + actor_start_urls_input=raw_input[ActorInputKeys.startUrls], http_client=http_client + ) else: request_list = RequestList() return cls(start_urls=request_list) + async def _create_request_list( *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None ) -> RequestList: @@ -69,10 +72,13 @@ async def _create_request_list( if not http_client: http_client = HttpxHttpClient() simple_url_requests_inputs = [ - _SimpleUrlInput(**request_input) for request_input in actor_start_urls_input - if ActorInputKeys.startUrls.url in request_input] + _SimpleUrlInput(**request_input) + for request_input in actor_start_urls_input + if ActorInputKeys.startUrls.url in request_input + ] remote_url_requests_inputs = [ - _RequestsFromUrlInput(**request_input) for request_input in actor_start_urls_input + _RequestsFromUrlInput(**request_input) + for request_input in actor_start_urls_input if ActorInputKeys.startUrls.requestsFromUrl in request_input ] @@ -109,12 +115,18 @@ async def _create_requests_from_url( def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: """Callback to scrape response body with regexp and create Requests from matches.""" matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) - created_requests.extend([Request.from_url( - match.group(0), - method=request_input.method, - payload=request_input.payload.encode('utf-8'), - headers=request_input.headers, - user_data=request_input.user_data) for match in matches]) + created_requests.extend( + [ + Request.from_url( + match.group(0), + method=request_input.method, + payload=request_input.payload.encode('utf-8'), + headers=request_input.headers, + user_data=request_input.user_data, + ) + for match in matches + ] + ) remote_url_requests = [] for remote_url_requests_input in remote_url_requests_inputs: diff --git a/src/apify/storages/_known_actor_input_keys.py b/src/apify/storages/_known_actor_input_keys.py index 49347393..2283a056 100644 --- a/src/apify/storages/_known_actor_input_keys.py +++ b/src/apify/storages/_known_actor_input_keys.py @@ -1,23 +1,23 @@ - - class _KnownInputKey(str): __slots__ = ('_name',) + def __init__(self, name: str) -> None: self._name = name def __str__(self) -> str: return self._name - def __repr__(self) ->str: + def __repr__(self) -> str: return str(self) + class _StartUrls(_KnownInputKey): - url='url' + url = 'url' requestsFromUrl = 'requestsFromUrl' # noqa: N815 # Intentional to respect actual naming of input keys. - method='method' - payload= 'payload' - userData='userData' # noqa: N815 # Intentional to respect actual naming of input keys. - headers='headers' + method = 'method' + payload = 'payload' + userData = 'userData' # noqa: N815 # Intentional to respect actual naming of input keys. + headers = 'headers' class _ActorInputKeys: @@ -25,4 +25,5 @@ class _ActorInputKeys: startUrls: _StartUrls = _StartUrls('startUrls') # noqa: N815 # Intentional to respect actual naming of input keys. # More inputs should be gradually added + ActorInputKeys = _ActorInputKeys() diff --git a/tests/unit/actor/test_actor_inputs.py b/tests/unit/actor/test_actor_inputs.py index 08a3f155..736a1137 100644 --- a/tests/unit/actor/test_actor_inputs.py +++ b/tests/unit/actor/test_actor_inputs.py @@ -19,8 +19,11 @@ 'optional_input', [ {}, - {ActorInputKeys.startUrls.payload: 'some payload', ActorInputKeys.startUrls.userData: - {'some key': 'some value'}, ActorInputKeys.startUrls.headers: {'h1': 'v1', 'h2': 'v2'}}, + { + ActorInputKeys.startUrls.payload: 'some payload', + ActorInputKeys.startUrls.userData: {'some key': 'some value'}, + ActorInputKeys.startUrls.headers: {'h1': 'v1', 'h2': 'v2'}, + }, ], ids=['minimal', 'all_options'], ) @@ -28,8 +31,10 @@ async def test_actor_create_request_list_request_types( request_method: HttpMethod, optional_input: dict[str, Any] ) -> None: """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" - minimal_request_dict_input = {ActorInputKeys.startUrls.url: 'https://www.abc.com', - ActorInputKeys.startUrls.method: request_method} + minimal_request_dict_input = { + ActorInputKeys.startUrls.url: 'https://www.abc.com', + ActorInputKeys.startUrls.method: request_method, + } request_dict_input = {**minimal_request_dict_input, **optional_input} example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [request_dict_input]} @@ -75,17 +80,25 @@ def read(self) -> bytes: async def test_actor_create_request_list_from_url_correctly_send_requests() -> None: """Test that injected HttpClient's method send_request is called with properly passed arguments.""" - example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [ - {ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', ActorInputKeys.startUrls.method: 'GET'}, - {ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', ActorInputKeys.startUrls.method: 'PUT'}, - { - ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som', - ActorInputKeys.startUrls.method: 'POST', - ActorInputKeys.startUrls.headers: {'key': 'value'}, - ActorInputKeys.startUrls.payload: 'some_payload', - ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, - }, - ]} + example_actor_input: dict[str, Any] = { + ActorInputKeys.startUrls: [ + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', + ActorInputKeys.startUrls.method: 'GET', + }, + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', + ActorInputKeys.startUrls.method: 'PUT', + }, + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som', + ActorInputKeys.startUrls.method: 'POST', + ActorInputKeys.startUrls.headers: {'key': 'value'}, + ActorInputKeys.startUrls.payload: 'some_payload', + ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, + }, + ] + } mocked_read_outputs = ('' for url in example_actor_input[ActorInputKeys.startUrls]) http_client = HttpxHttpClient() @@ -117,11 +130,19 @@ async def test_actor_create_request_list_from_url() -> None: ) ) - example_actor_input:dict[str, Any] = {ActorInputKeys.startUrls:[ - {ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', ActorInputKeys.startUrls.method: 'GET'}, - {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'}, - {ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', ActorInputKeys.startUrls.method: 'GET'}, - ]} + example_actor_input: dict[str, Any] = { + ActorInputKeys.startUrls: [ + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', + ActorInputKeys.startUrls.method: 'GET', + }, + {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'}, + { + ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', + ActorInputKeys.startUrls.method: 'GET', + }, + ] + } http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): @@ -133,7 +154,8 @@ async def test_actor_create_request_list_from_url() -> None: # Check correctly created requests' urls in request list assert {generated_request.url for generated_request in generated_requests} == expected_urls -async def test_actor_create_request_list_from_url_additional_inputs() -> None: + +async def test_actor_create_request_list_from_url_additional_inputs() -> None: """Test that all generated request properties are correctly populated from input values.""" expected_simple_url = 'https://www.someurl.com' example_start_url_input = { @@ -141,8 +163,9 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None: ActorInputKeys.startUrls.method: 'POST', ActorInputKeys.startUrls.headers: {'key': 'value'}, ActorInputKeys.startUrls.payload: 'some_payload', - ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}} - example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls:[example_start_url_input]} + ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}, + } + example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [example_start_url_input]} response_bodies = iter((expected_simple_url,)) http_client = HttpxHttpClient() with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)): @@ -162,43 +185,50 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None: assert request.user_data == expected_user_data -@pytest.mark.parametrize('true_positive', [ - 'http://www.something.com', - 'https://www.something.net', - 'http://nowww.cz', - 'https://with-hypen.com', - 'http://number1.com', - 'http://www.number.123.abc', - 'http://many.dots.com', - 'http://a.com', - 'http://www.something.com/somethignelse' - 'http://www.something.com/somethignelse.txt', - 'http://non-english-chars-áíéåü.com', - 'http://www.port.com:1234', - 'http://username:password@something.else.com' -]) +@pytest.mark.parametrize( + 'true_positive', + [ + 'http://www.something.com', + 'https://www.something.net', + 'http://nowww.cz', + 'https://with-hypen.com', + 'http://number1.com', + 'http://www.number.123.abc', + 'http://many.dots.com', + 'http://a.com', + 'http://www.something.com/somethignelse' 'http://www.something.com/somethignelse.txt', + 'http://non-english-chars-áíéåü.com', + 'http://www.port.com:1234', + 'http://username:password@something.else.com', + ], +) def test_url_no_commas_regex_true_positives(true_positive: str) -> None: - example_string= f'Some text {true_positive} some more text' + example_string = f'Some text {true_positive} some more text' matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 1 assert matches[0].group(0) == true_positive -@pytest.mark.parametrize('false_positive',[ - 'http://www.a', - 'http://a', - 'http://a.a', - 'http://123.456', - 'www.something.com', - 'http:www.something.com', -]) + +@pytest.mark.parametrize( + 'false_positive', + [ + 'http://www.a', + 'http://a', + 'http://a.a', + 'http://123.456', + 'www.something.com', + 'http:www.something.com', + ], +) def test_url_no_commas_regex_false_positives(false_positive: str) -> None: - example_string= f'Some text {false_positive} some more text' + example_string = f'Some text {false_positive} some more text' matches = list(re.findall(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 0 + def test_url_no_commas_regex_multi_line() -> None: true_positives = ('http://www.something.com', 'http://www.else.com') - example_string= 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives) + example_string = 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives) matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string)) assert len(matches) == 2 assert {match.group(0) for match in matches} == set(true_positives)