Make ruff happy
Pijukatel committed Nov 18, 2024
1 parent 8412a10 commit 382b650
Showing 4 changed files with 114 additions and 70 deletions.
3 changes: 2 additions & 1 deletion src/apify/_actor.py
@@ -8,7 +8,6 @@
 
 from lazy_object_proxy import Proxy
 from pydantic import AliasChoices
-from typing_extensions import Self
 
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
@@ -31,6 +30,8 @@
     import logging
     from types import TracebackType
 
+    from typing_extensions import Self
+
     from crawlee.proxy_configuration import _NewUrlFunction
 
     from apify._models import Webhook
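The change above simply moves the Self import into the if TYPE_CHECKING: block, because it is only used in type annotations. A minimal sketch of the pattern (the class below is illustrative, not the actual contents of _actor.py):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by static type checkers only; never executed at runtime,
    # so these imports add no import-time cost.
    from types import TracebackType

    from typing_extensions import Self


class ExampleActor:
    async def __aenter__(self) -> Self:
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        pass

With from __future__ import annotations, annotations stay unevaluated strings at runtime, which is what makes the guarded imports safe.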
36 changes: 24 additions & 12 deletions src/apify/storages/_actor_inputs.py
@@ -23,12 +23,12 @@
 )
 
 
-
 class _RequestDetails(BaseModel):
     method: HttpMethod
     payload: str = ''
     headers: dict[str, str] = Field(default_factory=dict)
-    user_data: dict[str, str]= Field(default_factory=dict, alias=ActorInputKeys.startUrls.userData)
+    user_data: dict[str, str] = Field(default_factory=dict, alias=ActorInputKeys.startUrls.userData)
 
 
 class _RequestsFromUrlInput(_RequestDetails):
     requests_from_url: str = Field(alias=ActorInputKeys.startUrls.requestsFromUrl)
@@ -37,6 +37,7 @@ class _RequestsFromUrlInput(_RequestDetails):
 class _SimpleUrlInput(_RequestDetails):
     url: str
 
+
 class Input(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     start_urls: RequestList
@@ -45,11 +46,13 @@ class Input(BaseModel):
     async def read(cls, raw_input: dict[str, Any], http_client: BaseHttpClient | None = None) -> Self:
         if ActorInputKeys.startUrls in raw_input:
             request_list = await _create_request_list(
-                actor_start_urls_input=raw_input[ActorInputKeys.startUrls], http_client=http_client)
+                actor_start_urls_input=raw_input[ActorInputKeys.startUrls], http_client=http_client
+            )
         else:
             request_list = RequestList()
         return cls(start_urls=request_list)
 
+
 async def _create_request_list(
     *, actor_start_urls_input: list[dict[str, Any]], http_client: BaseHttpClient | None = None
 ) -> RequestList:
@@ -69,10 +72,13 @@ async def _create_request_list(
     if not http_client:
         http_client = HttpxHttpClient()
     simple_url_requests_inputs = [
-        _SimpleUrlInput(**request_input) for request_input in actor_start_urls_input
-        if ActorInputKeys.startUrls.url in request_input]
+        _SimpleUrlInput(**request_input)
+        for request_input in actor_start_urls_input
+        if ActorInputKeys.startUrls.url in request_input
+    ]
     remote_url_requests_inputs = [
-        _RequestsFromUrlInput(**request_input) for request_input in actor_start_urls_input
+        _RequestsFromUrlInput(**request_input)
+        for request_input in actor_start_urls_input
         if ActorInputKeys.startUrls.requestsFromUrl in request_input
     ]
 
@@ -109,12 +115,18 @@ async def _create_requests_from_url(
     def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
         """Callback to scrape response body with regexp and create Requests from matches."""
         matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
-        created_requests.extend([Request.from_url(
-            match.group(0),
-            method=request_input.method,
-            payload=request_input.payload.encode('utf-8'),
-            headers=request_input.headers,
-            user_data=request_input.user_data) for match in matches])
+        created_requests.extend(
+            [
+                Request.from_url(
+                    match.group(0),
+                    method=request_input.method,
+                    payload=request_input.payload.encode('utf-8'),
+                    headers=request_input.headers,
+                    user_data=request_input.user_data,
+                )
+                for match in matches
+            ]
+        )
 
     remote_url_requests = []
     for remote_url_requests_input in remote_url_requests_inputs:
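For orientation, a sketch of how this input model might be consumed. The Input.read signature, the key names, and the module path are taken from the diff above; the shape of the raw input dict mirrors the test fixtures further down, and read is assumed to be a classmethod, as its cls parameter suggests:

import asyncio
from typing import Any

from apify.storages._actor_inputs import ActorInputKeys, Input


async def main() -> None:
    # Hypothetical raw actor input, shaped like the test fixtures below.
    raw_input: dict[str, Any] = {
        ActorInputKeys.startUrls: [
            {ActorInputKeys.startUrls.url: 'https://example.com', ActorInputKeys.startUrls.method: 'GET'},
        ]
    }
    # http_client is optional; it is only exercised when a requestsFromUrl
    # entry has to be downloaded and scraped for URLs.
    actor_input = await Input.read(raw_input)
    print(actor_input.start_urls)  # A crawlee RequestList built from the input.


asyncio.run(main())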
17 changes: 9 additions & 8 deletions src/apify/storages/_known_actor_input_keys.py
@@ -1,28 +1,29 @@
 
 
 class _KnownInputKey(str):
     __slots__ = ('_name',)
 
     def __init__(self, name: str) -> None:
         self._name = name
 
     def __str__(self) -> str:
         return self._name
 
-    def __repr__(self) ->str:
+    def __repr__(self) -> str:
         return str(self)
 
 
 class _StartUrls(_KnownInputKey):
-    url='url'
+    url = 'url'
     requestsFromUrl = 'requestsFromUrl'  # noqa: N815 # Intentional to respect actual naming of input keys.
-    method='method'
-    payload= 'payload'
-    userData='userData' # noqa: N815 # Intentional to respect actual naming of input keys.
-    headers='headers'
+    method = 'method'
+    payload = 'payload'
+    userData = 'userData'  # noqa: N815 # Intentional to respect actual naming of input keys.
+    headers = 'headers'
 
 
 class _ActorInputKeys:
     # Helper class to have actor input strings all in one place and easy to use with code completion.
     startUrls: _StartUrls = _StartUrls('startUrls')  # noqa: N815 # Intentional to respect actual naming of input keys.
     # More inputs should be gradually added
 
+
 ActorInputKeys = _ActorInputKeys()
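The point of _KnownInputKey subclassing str is that each key works directly as a dictionary key while its attributes carry the nested key names. One caveat: CPython raises TypeError for a nonempty __slots__ on subclasses of variable-length built-ins such as str, so the __slots__ line and the str base shown above cannot both be literally what runs; the self-contained sketch below (StartUrlsKey is an illustrative stand-in, not the real class) simply drops __slots__:

class StartUrlsKey(str):
    # Illustrative stand-in for _StartUrls; __slots__ is omitted, see the caveat above.
    url = 'url'


start_urls_key = StartUrlsKey('startUrls')
raw_input = {'startUrls': [{'url': 'https://example.com'}]}

# The key is a str with the value 'startUrls', so it works directly as a dict key...
assert start_urls_key in raw_input

# ...while its attributes carry the nested input key names.
assert raw_input[start_urls_key][0][start_urls_key.url] == 'https://example.com'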
128 changes: 79 additions & 49 deletions tests/unit/actor/test_actor_inputs.py
@@ -19,17 +19,22 @@
     'optional_input',
     [
         {},
-        {ActorInputKeys.startUrls.payload: 'some payload', ActorInputKeys.startUrls.userData:
-            {'some key': 'some value'}, ActorInputKeys.startUrls.headers: {'h1': 'v1', 'h2': 'v2'}},
+        {
+            ActorInputKeys.startUrls.payload: 'some payload',
+            ActorInputKeys.startUrls.userData: {'some key': 'some value'},
+            ActorInputKeys.startUrls.headers: {'h1': 'v1', 'h2': 'v2'},
+        },
     ],
     ids=['minimal', 'all_options'],
 )
 async def test_actor_create_request_list_request_types(
     request_method: HttpMethod, optional_input: dict[str, Any]
 ) -> None:
     """Test proper request list generation from both minimal and full inputs for all method types for simple input."""
-    minimal_request_dict_input = {ActorInputKeys.startUrls.url: 'https://www.abc.com',
-                                  ActorInputKeys.startUrls.method: request_method}
+    minimal_request_dict_input = {
+        ActorInputKeys.startUrls.url: 'https://www.abc.com',
+        ActorInputKeys.startUrls.method: request_method,
+    }
     request_dict_input = {**minimal_request_dict_input, **optional_input}
     example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [request_dict_input]}
 
@@ -75,17 +80,25 @@ def read(self) -> bytes:
 
 async def test_actor_create_request_list_from_url_correctly_send_requests() -> None:
     """Test that injected HttpClient's method send_request is called with properly passed arguments."""
-    example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [
-        {ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', ActorInputKeys.startUrls.method: 'GET'},
-        {ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', ActorInputKeys.startUrls.method: 'PUT'},
-        {
-            ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som',
-            ActorInputKeys.startUrls.method: 'POST',
-            ActorInputKeys.startUrls.headers: {'key': 'value'},
-            ActorInputKeys.startUrls.payload: 'some_payload',
-            ActorInputKeys.startUrls.userData: {'another_key': 'another_value'},
-        },
-    ]}
+    example_actor_input: dict[str, Any] = {
+        ActorInputKeys.startUrls: [
+            {
+                ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt',
+                ActorInputKeys.startUrls.method: 'GET',
+            },
+            {
+                ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2',
+                ActorInputKeys.startUrls.method: 'PUT',
+            },
+            {
+                ActorInputKeys.startUrls.requestsFromUrl: 'https://www.something.som',
+                ActorInputKeys.startUrls.method: 'POST',
+                ActorInputKeys.startUrls.headers: {'key': 'value'},
+                ActorInputKeys.startUrls.payload: 'some_payload',
+                ActorInputKeys.startUrls.userData: {'another_key': 'another_value'},
+            },
+        ]
+    }
 
     mocked_read_outputs = ('' for url in example_actor_input[ActorInputKeys.startUrls])
     http_client = HttpxHttpClient()
@@ -117,11 +130,19 @@ async def test_actor_create_request_list_from_url() -> None:
         )
     )
 
-    example_actor_input:dict[str, Any] = {ActorInputKeys.startUrls:[
-        {ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt', ActorInputKeys.startUrls.method: 'GET'},
-        {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'},
-        {ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2', ActorInputKeys.startUrls.method: 'GET'},
-    ]}
+    example_actor_input: dict[str, Any] = {
+        ActorInputKeys.startUrls: [
+            {
+                ActorInputKeys.startUrls.requestsFromUrl: 'https://abc.dev/file.txt',
+                ActorInputKeys.startUrls.method: 'GET',
+            },
+            {ActorInputKeys.startUrls.url: expected_simple_url, ActorInputKeys.startUrls.method: 'GET'},
+            {
+                ActorInputKeys.startUrls.requestsFromUrl: 'https://www.abc.dev/file2',
+                ActorInputKeys.startUrls.method: 'GET',
+            },
+        ]
+    }
 
     http_client = HttpxHttpClient()
     with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
@@ -133,16 +154,18 @@ async def test_actor_create_request_list_from_url() -> None:
     # Check correctly created requests' urls in request list
     assert {generated_request.url for generated_request in generated_requests} == expected_urls
 
-async def test_actor_create_request_list_from_url_additional_inputs() -> None:
+
+async def test_actor_create_request_list_from_url_additional_inputs() -> None:
     """Test that all generated request properties are correctly populated from input values."""
     expected_simple_url = 'https://www.someurl.com'
     example_start_url_input = {
         ActorInputKeys.startUrls.requestsFromUrl: 'https://crawlee.dev/file.txt',
         ActorInputKeys.startUrls.method: 'POST',
         ActorInputKeys.startUrls.headers: {'key': 'value'},
         ActorInputKeys.startUrls.payload: 'some_payload',
-        ActorInputKeys.startUrls.userData: {'another_key': 'another_value'}}
-    example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls:[example_start_url_input]}
+        ActorInputKeys.startUrls.userData: {'another_key': 'another_value'},
+    }
+    example_actor_input: dict[str, Any] = {ActorInputKeys.startUrls: [example_start_url_input]}
     response_bodies = iter((expected_simple_url,))
     http_client = HttpxHttpClient()
     with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
@@ -162,43 +185,50 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None:
     assert request.user_data == expected_user_data
 
 
-@pytest.mark.parametrize('true_positive', [
-    'http://www.something.com',
-    'https://www.something.net',
-    'http://nowww.cz',
-    'https://with-hypen.com',
-    'http://number1.com',
-    'http://www.number.123.abc',
-    'http://many.dots.com',
-    'http://a.com',
-    'http://www.something.com/somethignelse'
-    'http://www.something.com/somethignelse.txt',
-    'http://non-english-chars-áíéåü.com',
-    'http://www.port.com:1234',
-    'http://username:[email protected]'
-])
+@pytest.mark.parametrize(
+    'true_positive',
+    [
+        'http://www.something.com',
+        'https://www.something.net',
+        'http://nowww.cz',
+        'https://with-hypen.com',
+        'http://number1.com',
+        'http://www.number.123.abc',
+        'http://many.dots.com',
+        'http://a.com',
+        'http://www.something.com/somethignelse' 'http://www.something.com/somethignelse.txt',
+        'http://non-english-chars-áíéåü.com',
+        'http://www.port.com:1234',
+        'http://username:[email protected]',
+    ],
+)
 def test_url_no_commas_regex_true_positives(true_positive: str) -> None:
-    example_string= f'Some text {true_positive} some more text'
+    example_string = f'Some text {true_positive} some more text'
     matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string))
     assert len(matches) == 1
     assert matches[0].group(0) == true_positive
 
-@pytest.mark.parametrize('false_positive',[
-    'http://www.a',
-    'http://a',
-    'http://a.a',
-    'http://123.456',
-    'www.something.com',
-    'http:www.something.com',
-])
+
+@pytest.mark.parametrize(
+    'false_positive',
+    [
+        'http://www.a',
+        'http://a',
+        'http://a.a',
+        'http://123.456',
+        'www.something.com',
+        'http:www.something.com',
+    ],
+)
 def test_url_no_commas_regex_false_positives(false_positive: str) -> None:
-    example_string= f'Some text {false_positive} some more text'
+    example_string = f'Some text {false_positive} some more text'
     matches = list(re.findall(URL_NO_COMMAS_REGEX, example_string))
     assert len(matches) == 0
 
 
 def test_url_no_commas_regex_multi_line() -> None:
     true_positives = ('http://www.something.com', 'http://www.else.com')
-    example_string= 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives)
+    example_string = 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives)
     matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string))
     assert len(matches) == 2
     assert {match.group(0) for match in matches} == set(true_positives)
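One thing the reformatting makes visible: in the true-positives list above, 'http://www.something.com/somethignelse' 'http://www.something.com/somethignelse.txt' is an implicit concatenation of two adjacent string literals, almost certainly a missing comma rather than an intended single parameter. A minimal demonstration of the pitfall:

# Adjacent string literals are implicitly concatenated, so a missing comma
# silently merges two intended list items into one.
urls = [
    'http://a.com'  # No comma here.
    'http://b.com',
]
assert urls == ['http://a.comhttp://b.com']

Ruff's implicit-string-concatenation (ISC) rules can flag this kind of accidental merge.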
