Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 19 additions & 16 deletions src/crawlee/_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast

from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter
from pydantic.alias_generators import to_camel
from yarl import URL

from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
Expand Down Expand Up @@ -34,31 +35,33 @@ class RequestState(IntEnum):
class CrawleeRequestData(BaseModel):
    """Crawlee-specific configuration stored in the `user_data`."""

    # Aliases are generated from the snake_case attribute names by `to_camel`; input is accepted under either the
    # attribute name or the generated alias.
    # NOTE(review): `session_id` previously had no explicit alias, so its alias becomes `sessionId` with the
    # generator - confirm that consumers of the serialized form expect the camelCase key.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

    max_retries: Annotated[int | None, Field(frozen=True)] = None
    """Maximum number of retries for this request. Allows overriding the global `max_request_retries` option of
    `BasicCrawler`."""

    enqueue_strategy: EnqueueStrategy | None = None
    """The strategy that was used for enqueuing the request."""

    state: RequestState = RequestState.UNPROCESSED
    """Describes the request's current lifecycle state."""

    session_rotation_count: int | None = None
    """The number of finished session rotations for this request."""

    skip_navigation: bool = False

    last_proxy_tier: int | None = None
    """The last proxy tier used to process the request."""

    forefront: bool = False
    """Indicate whether the request should be enqueued at the front of the queue."""

    crawl_depth: int = 0
    """The depth of the request in the crawl tree."""

    session_id: str | None = None
    """ID of a session to which the request is bound."""


Expand Down Expand Up @@ -166,9 +169,9 @@ class Request(BaseModel):
```
"""

model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
unique_key: Annotated[str, Field(frozen=True)]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

Expand Down Expand Up @@ -212,7 +215,7 @@ class Request(BaseModel):
# Internally, the model contains `UserData`, this is just for convenience
user_data: Annotated[
MutableMapping[str, JsonSerializable],
Field(alias='userData', default_factory=UserData),
Field(default_factory=UserData),
PlainValidator(user_data_adapter.validate_python),
PlainSerializer(
lambda instance: user_data_adapter.dump_python(
Expand All @@ -228,16 +231,16 @@ class Request(BaseModel):
request's scope, keeping them accessible on retries, failures etc.
"""

retry_count: Annotated[int, Field(alias='retryCount')] = 0
retry_count: int = 0
"""Number of times the request has been retried."""

no_retry: Annotated[bool, Field(alias='noRetry')] = False
no_retry: bool = False
"""If set to `True`, the request will not be retried in case of failure."""

loaded_url: Annotated[str | None, BeforeValidator(validate_http_url), Field(alias='loadedUrl')] = None
loaded_url: Annotated[str | None, BeforeValidator(validate_http_url)] = None
"""URL of the web page that was loaded. This can differ from the original URL in case of redirects."""

handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None
handled_at: datetime | None = None
"""Timestamp when the request was handled."""

@classmethod
Expand Down Expand Up @@ -434,5 +437,5 @@ def was_already_handled(self) -> bool:
class RequestWithLock(Request):
    """A crawling request with information about locks."""

    # No explicit alias: `lockExpiresAt` is produced by the `to_camel` alias generator configured on `Request`.
    lock_expires_at: datetime
    """The timestamp when the lock expires."""
31 changes: 8 additions & 23 deletions src/crawlee/_utils/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import psutil
from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
from pydantic.alias_generators import to_camel

from crawlee._utils.byte_size import ByteSize

Expand Down Expand Up @@ -36,36 +37,29 @@ def _get_used_memory(process: psutil.Process) -> int:
class CpuInfo(BaseModel):
    """Information about the CPU usage."""

    # Aliases (`usedRatio`, `createdAt`) are generated from the attribute names by `to_camel`.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

    used_ratio: float
    """The ratio of CPU currently in use, represented as a float between 0 and 1."""

    # Workaround for Pydantic and type checkers when using Annotated with default_factory
    if TYPE_CHECKING:
        created_at: datetime = datetime.now(timezone.utc)
        """The time at which the measurement was taken."""
    else:
        # The factory runs per instance, so each measurement gets its own timestamp.
        created_at: Annotated[datetime, Field(default_factory=lambda: datetime.now(timezone.utc))]
        """The time at which the measurement was taken."""


class MemoryUsageInfo(BaseModel):
"""Information about the memory usage."""

model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

current_size: Annotated[
ByteSize,
PlainValidator(ByteSize.validate),
PlainSerializer(lambda size: size.bytes),
Field(alias='currentSize'),
]
"""Memory usage of the current Python process and its children."""

Expand All @@ -74,31 +68,22 @@ class MemoryUsageInfo(BaseModel):
created_at: datetime = datetime.now(timezone.utc)
"""The time at which the measurement was taken."""
else:
created_at: Annotated[
datetime,
Field(
alias='createdAt',
default_factory=lambda: datetime.now(timezone.utc),
),
]
created_at: Annotated[datetime, Field(default_factory=lambda: datetime.now(timezone.utc))]
"""The time at which the measurement was taken."""


class MemoryInfo(MemoryUsageInfo):
"""Information about system memory."""

model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

total_size: Annotated[
ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')
]
total_size: Annotated[ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes)]
"""Total memory available in the system."""

system_wide_used_size: Annotated[
ByteSize,
PlainValidator(ByteSize.validate),
PlainSerializer(lambda size: size.bytes),
Field(alias='systemWideUsedSize'),
]
"""Total memory used by all processes system-wide (including non-crawlee processes)."""

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from __future__ import annotations

from typing import Annotated

from pydantic import ConfigDict, Field
from pydantic import ConfigDict
from pydantic.alias_generators import to_camel

from crawlee._utils.docs import docs_group
from crawlee.statistics import StatisticsState
Expand All @@ -12,13 +11,15 @@
class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
    """Statistic data about a crawler run with additional information related to adaptive crawling."""

    # NOTE(review): the three counters below used to carry explicit snake_case aliases identical to their attribute
    # names. With `alias_generator=to_camel` their aliases become camelCase (e.g. `httpOnlyRequestHandlerRuns`).
    # `validate_by_name=True` keeps snake_case input valid, but alias-based serialization now emits camelCase keys -
    # confirm this is acceptable for previously persisted statistics.
    model_config = ConfigDict(
        validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, ser_json_inf_nan='constants'
    )

    http_only_request_handler_runs: int = 0
    """Number representing how many times static http based crawling was used."""

    browser_request_handler_runs: int = 0
    """Number representing how many times browser based crawling was used."""

    rendering_type_mispredictions: int = 0
    """Number representing how many times the predictor gave incorrect prediction."""
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from jaro import jaro_winkler_metric
from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
from pydantic.alias_generators import to_camel
from sklearn.linear_model import LogisticRegression
from typing_extensions import override

Expand All @@ -32,7 +33,7 @@


class RenderingTypePredictorState(BaseModel):
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

model: Annotated[
LogisticRegression,
Expand All @@ -41,7 +42,7 @@ class RenderingTypePredictorState(BaseModel):
PlainSerializer(sklearn_model_serializer),
]

labels_coefficients: Annotated[defaultdict[str, float], Field(alias='labelsCoefficients')]
labels_coefficients: defaultdict[str, float]


@docs_group('Other')
Expand Down
22 changes: 10 additions & 12 deletions src/crawlee/events/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Annotated, Any, TypeVar

from pydantic import BaseModel, ConfigDict, Field
from pydantic.alias_generators import to_camel

from crawlee._utils.docs import docs_group
from crawlee._utils.models import timedelta_secs
Expand Down Expand Up @@ -40,29 +41,26 @@ class Event(str, Enum):
class EventPersistStateData(BaseModel):
    """Data for the persist state event."""

    # The `isMigrating` alias is generated from the attribute name by `to_camel`.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

    is_migrating: bool


@docs_group('Event data')
class EventSystemInfoData(BaseModel):
    """Data for the system info event."""

    # The `cpuInfo` / `memoryInfo` aliases are generated from the attribute names by `to_camel`.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

    cpu_info: CpuInfo
    memory_info: MemoryUsageInfo


@docs_group('Event data')
class EventMigratingData(BaseModel):
"""Data for the migrating event."""

model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

# The remaining time in seconds before the migration is forced and the process is killed
# Optional because it's not present when the event handler is called manually
Expand All @@ -73,21 +71,21 @@ class EventMigratingData(BaseModel):
class EventAbortingData(BaseModel):
    """Data for the aborting event."""

    # No fields are declared; the config matches the other event data models.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)


@docs_group('Event data')
class EventExitData(BaseModel):
    """Data for the exit event."""

    # No fields are declared; the config matches the other event data models.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)


@docs_group('Event data')
class EventCrawlerStatusData(BaseModel):
"""Data for the crawler status event."""

model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

message: str
"""A message describing the current status of the crawler."""
Expand Down
21 changes: 11 additions & 10 deletions src/crawlee/fingerprint_suite/_types.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from __future__ import annotations

from typing import Annotated, Literal
from typing import Literal

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel

SupportedOperatingSystems = Literal['windows', 'macos', 'linux', 'android', 'ios']
SupportedDevices = Literal['desktop', 'mobile']
Expand All @@ -11,32 +12,32 @@


class ScreenOptions(BaseModel):
    """Defines the screen constraints for the fingerprint generator."""

    # `extra='forbid'` rejects unknown keys; aliases (`minWidth`, ...) are generated from the attribute names
    # by `to_camel`, and input is accepted under either the attribute name or the alias.
    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

    min_width: float | None = None
    """Minimal screen width constraint for the fingerprint generator."""

    max_width: float | None = None
    """Maximal screen width constraint for the fingerprint generator."""

    min_height: float | None = None
    """Minimal screen height constraint for the fingerprint generator."""

    max_height: float | None = None
    """Maximal screen height constraint for the fingerprint generator."""


class HeaderGeneratorOptions(BaseModel):
"""Collection of header related attributes that can be used by the fingerprint generator."""

model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)
model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

browsers: list[SupportedBrowserType] | None = None
"""List of BrowserSpecifications to generate the headers for."""

operating_systems: Annotated[list[SupportedOperatingSystems] | None, Field(alias='operatingSystems')] = None
operating_systems: list[SupportedOperatingSystems] | None = None
"""List of operating systems to generate the headers for."""

devices: list[SupportedDevices] | None = None
Expand All @@ -47,7 +48,7 @@ class HeaderGeneratorOptions(BaseModel):
(https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header
in the language format accepted by that header, for example `en`, `en-US` or `de`."""

http_version: Annotated[SupportedHttpVersion | None, Field(alias='httpVersion')] = None
http_version: SupportedHttpVersion | None = None
"""HTTP version to be used for header generation (the headers differ depending on the version)."""

strict: bool | None = None
Expand Down
9 changes: 5 additions & 4 deletions src/crawlee/request_loaders/_request_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field, ValidationError
from pydantic.alias_generators import to_camel
from typing_extensions import override

from crawlee._request import Request
Expand All @@ -17,11 +18,11 @@


class RequestListState(BaseModel):
    # Aliases (`nextIndex`, `nextUniqueKey`, `inProgress`) are generated from the attribute names by `to_camel`.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel)

    next_index: int = 0
    next_unique_key: str | None = None
    # Pydantic copies non-hashable defaults for each instance, so the literal `set()` is not shared between models.
    in_progress: set[str] = set()


class RequestListData(BaseModel):
Expand Down
Loading
Loading