From efaec3ee21190ac22699f46d3703c79fcec306bd Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jun 2026 14:26:45 +0200 Subject: [PATCH 1/2] refactor: derive Pydantic camelCase aliases via to_camel generator Replace manual per-field `Field(alias=...)` with `alias_generator=to_camel` in each model's `ConfigDict`, keeping explicit aliases only where `to_camel` would produce a different key. The serialized wire format is unchanged. --- src/crawlee/_request.py | 36 ++++---- src/crawlee/_utils/system.py | 31 ++----- ..._adaptive_playwright_crawler_statistics.py | 7 +- .../_rendering_type_predictor.py | 5 +- src/crawlee/events/_types.py | 22 ++--- src/crawlee/fingerprint_suite/_types.py | 21 ++--- src/crawlee/request_loaders/_request_list.py | 9 +- .../_sitemap_request_loader.py | 20 +++-- src/crawlee/sessions/_models.py | 37 ++++---- src/crawlee/statistics/_models.py | 37 ++++---- src/crawlee/storage_clients/models.py | 85 ++++++++++++------- 11 files changed, 163 insertions(+), 147 deletions(-) diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index 0d9598db8b..05667af46b 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter +from pydantic.alias_generators import to_camel from yarl import URL from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable @@ -34,31 +35,34 @@ class RequestState(IntEnum): class CrawleeRequestData(BaseModel): """Crawlee-specific configuration stored in the `user_data`.""" - max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) + + max_retries: Annotated[int | None, Field(frozen=True)] = None """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of `BasicCrawler`.""" - enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None + enqueue_strategy: EnqueueStrategy | None = None """The strategy that was used for enqueuing the request.""" state: RequestState = RequestState.UNPROCESSED """Describes the request's current lifecycle state.""" - session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None + session_rotation_count: int | None = None """The number of finished session rotations for this request.""" - skip_navigation: Annotated[bool, Field(alias='skipNavigation')] = False + skip_navigation: bool = False - last_proxy_tier: Annotated[int | None, Field(alias='lastProxyTier')] = None + last_proxy_tier: int | None = None """The last proxy tier used to process the request.""" - forefront: Annotated[bool, Field()] = False + forefront: bool = False """Indicate whether the request should be enqueued at the front of the queue.""" - crawl_depth: Annotated[int, Field(alias='crawlDepth')] = 0 + crawl_depth: int = 0 """The depth of the request in the crawl tree.""" - session_id: Annotated[str | None, Field()] = None + # Serialized with a snake_case key, so it keeps an explicit alias that overrides the camelCase generator. + session_id: Annotated[str | None, Field(alias='session_id')] = None """ID of a session to which the request is bound.""" @@ -166,9 +170,9 @@ class Request(BaseModel): ``` """ - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) - unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)] + unique_key: Annotated[str, Field(frozen=True)] """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing to the same URL. @@ -212,7 +216,7 @@ class Request(BaseModel): # Internally, the model contains `UserData`, this is just for convenience user_data: Annotated[ MutableMapping[str, JsonSerializable], - Field(alias='userData', default_factory=UserData), + Field(default_factory=UserData), PlainValidator(user_data_adapter.validate_python), PlainSerializer( lambda instance: user_data_adapter.dump_python( @@ -228,16 +232,16 @@ class Request(BaseModel): request's scope, keeping them accessible on retries, failures etc. """ - retry_count: Annotated[int, Field(alias='retryCount')] = 0 + retry_count: int = 0 """Number of times the request has been retried.""" - no_retry: Annotated[bool, Field(alias='noRetry')] = False + no_retry: bool = False """If set to `True`, the request will not be retried in case of failure.""" - loaded_url: Annotated[str | None, BeforeValidator(validate_http_url), Field(alias='loadedUrl')] = None + loaded_url: Annotated[str | None, BeforeValidator(validate_http_url)] = None """URL of the web page that was loaded. This can differ from the original URL in case of redirects.""" - handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None + handled_at: datetime | None = None """Timestamp when the request was handled.""" @classmethod @@ -434,5 +438,5 @@ def was_already_handled(self) -> bool: class RequestWithLock(Request): """A crawling request with information about locks.""" - lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')] + lock_expires_at: datetime """The timestamp when the lock expires.""" diff --git a/src/crawlee/_utils/system.py b/src/crawlee/_utils/system.py index 56eeaadf24..f6030c6de9 100644 --- a/src/crawlee/_utils/system.py +++ b/src/crawlee/_utils/system.py @@ -9,6 +9,7 @@ import psutil from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator +from pydantic.alias_generators import to_camel from crawlee._utils.byte_size import ByteSize @@ -36,9 +37,9 @@ def _get_used_memory(process: psutil.Process) -> int: class CpuInfo(BaseModel): """Information about the CPU usage.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) - used_ratio: Annotated[float, Field(alias='usedRatio')] + used_ratio: float """The ratio of CPU currently in use, represented as a float between 0 and 1.""" # Workaround for Pydantic and type checkers when using Annotated with default_factory @@ -46,26 +47,19 @@ class CpuInfo(BaseModel): created_at: datetime = datetime.now(timezone.utc) """The time at which the measurement was taken.""" else: - created_at: Annotated[ - datetime, - Field( - alias='createdAt', - default_factory=lambda: datetime.now(timezone.utc), - ), - ] + created_at: Annotated[datetime, Field(default_factory=lambda: datetime.now(timezone.utc))] """The time at which the measurement was taken.""" class MemoryUsageInfo(BaseModel): """Information about the memory usage.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) current_size: Annotated[ ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), - Field(alias='currentSize'), ] """Memory usage of the current Python process and its children.""" @@ -74,31 +68,22 @@ class MemoryUsageInfo(BaseModel): created_at: datetime = datetime.now(timezone.utc) """The time at which the measurement was taken.""" else: - created_at: Annotated[ - datetime, - Field( - alias='createdAt', - default_factory=lambda: datetime.now(timezone.utc), - ), - ] + created_at: Annotated[datetime, Field(default_factory=lambda: datetime.now(timezone.utc))] """The time at which the measurement was taken.""" class MemoryInfo(MemoryUsageInfo): """Information about system memory.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) - total_size: Annotated[ - ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize') - ] + total_size: Annotated[ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes)] """Total memory available in the system.""" system_wide_used_size: Annotated[ ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), - Field(alias='systemWideUsedSize'), ] """Total memory used by all processes system-wide (including non-crawlee processes).""" diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 150dfab14b..00a23db51e 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -3,6 +3,7 @@ from typing import Annotated from pydantic import ConfigDict, Field +from pydantic.alias_generators import to_camel from crawlee._utils.docs import docs_group from crawlee.statistics import StatisticsState @@ -12,8 +13,12 @@ class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): """Statistic data about a crawler run with additional information related to adaptive crawling.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants') + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, ser_json_inf_nan='constants' + ) + # These fields are serialized with snake_case keys, so they keep explicit aliases that override the + # camelCase alias generator. http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0 """Number representing how many times static http based crawling was used.""" diff --git a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py index e03572515b..fac7a67ecd 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py @@ -11,6 +11,7 @@ from jaro import jaro_winkler_metric from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator +from pydantic.alias_generators import to_camel from sklearn.linear_model import LogisticRegression from typing_extensions import override @@ -32,7 +33,7 @@ class RenderingTypePredictorState(BaseModel): - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) model: Annotated[ LogisticRegression, @@ -41,7 +42,7 @@ class RenderingTypePredictorState(BaseModel): PlainSerializer(sklearn_model_serializer), ] - labels_coefficients: Annotated[defaultdict[str, float], Field(alias='labelsCoefficients')] + labels_coefficients: defaultdict[str, float] @docs_group('Other') diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index bd2dfc260b..768f3449a5 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -5,6 +5,7 @@ from typing import Annotated, Any, TypeVar from pydantic import BaseModel, ConfigDict, Field +from pydantic.alias_generators import to_camel from crawlee._utils.docs import docs_group from crawlee._utils.models import timedelta_secs @@ -40,29 +41,26 @@ class Event(str, Enum): class EventPersistStateData(BaseModel): """Data for the persist state event.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) - is_migrating: Annotated[bool, Field(alias='isMigrating')] + is_migrating: bool @docs_group('Event data') class EventSystemInfoData(BaseModel): """Data for the system info event.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) - cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')] - memory_info: Annotated[ - MemoryUsageInfo, - Field(alias='memoryInfo'), - ] + cpu_info: CpuInfo + memory_info: MemoryUsageInfo @docs_group('Event data') class EventMigratingData(BaseModel): """Data for the migrating event.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) # The remaining time in seconds before the migration is forced and the process is killed # Optional because it's not present when the event handler is called manually @@ -73,20 +71,22 @@ class EventMigratingData(BaseModel): class EventAbortingData(BaseModel): """Data for the aborting event.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) @docs_group('Event data') class EventExitData(BaseModel): """Data for the exit event.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) @docs_group('Event data') class EventCrawlerStatusData(BaseModel): """Data for the crawler status event.""" + # This event is only emitted and consumed in-process, and its fields keep their snake_case keys, so the + # camelCase alias generator is intentionally not applied here. model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) message: str diff --git a/src/crawlee/fingerprint_suite/_types.py b/src/crawlee/fingerprint_suite/_types.py index 2e09cf8a55..f5cb79e334 100644 --- a/src/crawlee/fingerprint_suite/_types.py +++ b/src/crawlee/fingerprint_suite/_types.py @@ -1,8 +1,9 @@ from __future__ import annotations -from typing import Annotated, Literal +from typing import Literal -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict +from pydantic.alias_generators import to_camel SupportedOperatingSystems = Literal['windows', 'macos', 'linux', 'android', 'ios'] SupportedDevices = Literal['desktop', 'mobile'] @@ -11,32 +12,32 @@ class ScreenOptions(BaseModel): - model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) """Defines the screen constrains for the fingerprint generator.""" - min_width: Annotated[float | None, Field(alias='minWidth')] = None + min_width: float | None = None """Minimal screen width constraint for the fingerprint generator.""" - max_width: Annotated[float | None, Field(alias='maxWidth')] = None + max_width: float | None = None """Maximal screen width constraint for the fingerprint generator.""" - min_height: Annotated[float | None, Field(alias='minHeight')] = None + min_height: float | None = None """Minimal screen height constraint for the fingerprint generator.""" - max_height: Annotated[float | None, Field(alias='maxHeight')] = None + max_height: float | None = None """Maximal screen height constraint for the fingerprint generator.""" class HeaderGeneratorOptions(BaseModel): """Collection of header related attributes that can be used by the fingerprint generator.""" - model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) browsers: list[SupportedBrowserType] | None = None """List of BrowserSpecifications to generate the headers for.""" - operating_systems: Annotated[list[SupportedOperatingSystems] | None, Field(alias='operatingSystems')] = None + operating_systems: list[SupportedOperatingSystems] | None = None """List of operating systems to generate the headers for.""" devices: list[SupportedDevices] | None = None @@ -47,7 +48,7 @@ class HeaderGeneratorOptions(BaseModel): (https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header in the language format accepted by that header, for example `en`, `en-US` or `de`.""" - http_version: Annotated[SupportedHttpVersion | None, Field(alias='httpVersion')] = None + http_version: SupportedHttpVersion | None = None """HTTP version to be used for header generation (the headers differ depending on the version).""" strict: bool | None = None diff --git a/src/crawlee/request_loaders/_request_list.py b/src/crawlee/request_loaders/_request_list.py index c4523ba2c8..00f9002a68 100644 --- a/src/crawlee/request_loaders/_request_list.py +++ b/src/crawlee/request_loaders/_request_list.py @@ -7,6 +7,7 @@ from typing import Annotated from pydantic import BaseModel, ConfigDict, Field, ValidationError +from pydantic.alias_generators import to_camel from typing_extensions import override from crawlee._request import Request @@ -17,11 +18,11 @@ class RequestListState(BaseModel): - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) - next_index: Annotated[int, Field(alias='nextIndex')] = 0 - next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None - in_progress: Annotated[set[str], Field(alias='inProgress')] = set() + next_index: int = 0 + next_unique_key: str | None = None + in_progress: set[str] = set() class RequestListData(BaseModel): diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py index 664686b23f..ba4380a586 100644 --- a/src/crawlee/request_loaders/_sitemap_request_loader.py +++ b/src/crawlee/request_loaders/_sitemap_request_loader.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Annotated, Any from pydantic import BaseModel, ConfigDict, Field +from pydantic.alias_generators import to_camel from typing_extensions import override from yarl import URL @@ -60,33 +61,34 @@ class SitemapRequestLoaderState(BaseModel): `in_progress` is cleared. """ - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) - url_queue: Annotated[deque[str], Field(alias='urlQueue')] + url_queue: deque[str] """Queue of URLs extracted from sitemaps and ready for processing.""" - in_progress: Annotated[set[str], Field(alias='inProgress')] = set() + in_progress: set[str] = set() """Set of request URLs currently being processed.""" - pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')] + pending_sitemap_urls: deque[str] """Queue of sitemap URLs that need to be fetched and processed.""" - in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None + in_progress_sitemap_url: str | None = None """The sitemap URL currently being processed.""" - current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set() + current_sitemap_processed_urls: set[str] = set() """URLs from the current sitemap that have been added to the queue.""" - processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set() + processed_sitemap_urls: set[str] = set() """Set of processed sitemap URLs.""" + # Serialized as `sitemapCompleted`, which `to_camel('completed')` would not produce, so keep an explicit alias. completed: Annotated[bool, Field(alias='sitemapCompleted')] = False """Whether all sitemaps have been fully processed.""" - total_count: Annotated[int, Field(alias='totalCount')] = 0 + total_count: int = 0 """Total number of URLs found and added to the queue from all processed sitemaps.""" - handled_count: Annotated[int, Field(alias='handledCount')] = 0 + handled_count: int = 0 """Number of URLs that have been successfully handled.""" diff --git a/src/crawlee/sessions/_models.py b/src/crawlee/sessions/_models.py index fa0b64042c..5e64ce4457 100644 --- a/src/crawlee/sessions/_models.py +++ b/src/crawlee/sessions/_models.py @@ -8,11 +8,11 @@ BaseModel, BeforeValidator, ConfigDict, - Field, GetPydanticSchema, PlainSerializer, computed_field, ) +from pydantic.alias_generators import to_camel from crawlee._types import JsonSerializable @@ -23,27 +23,27 @@ class SessionModel(BaseModel): """Model for a Session object.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) - id: Annotated[str, Field(alias='id')] - max_age: Annotated[timedelta, Field(alias='maxAge')] - user_data: Annotated[MutableMapping[str, JsonSerializable], Field(alias='userData')] - max_error_score: Annotated[float, Field(alias='maxErrorScore')] - error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')] - created_at: Annotated[datetime, Field(alias='createdAt')] - usage_count: Annotated[int, Field(alias='usageCount')] - max_usage_count: Annotated[int, Field(alias='maxUsageCount')] - error_score: Annotated[float, Field(alias='errorScore')] - cookies: Annotated[list[CookieParam], Field(alias='cookies')] - blocked_status_codes: Annotated[list[int], Field(alias='blockedStatusCodes')] + id: str + max_age: timedelta + user_data: MutableMapping[str, JsonSerializable] + max_error_score: float + error_score_decrement: float + created_at: datetime + usage_count: int + max_usage_count: int + error_score: float + cookies: list[CookieParam] + blocked_status_codes: list[int] class SessionPoolModel(BaseModel): """Model for a SessionPool object.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) - max_pool_size: Annotated[int, Field(alias='maxPoolSize')] + max_pool_size: int sessions: Annotated[ dict[ @@ -52,7 +52,6 @@ class SessionPoolModel(BaseModel): Session, GetPydanticSchema(lambda _, handler: handler(Any)) ], # handler(Any) is fine - we validate manually in the BeforeValidator ], - Field(alias='sessions'), PlainSerializer( lambda value: [session.get_state().model_dump(by_alias=True) for session in value.values()], return_type=list, @@ -66,19 +65,19 @@ class SessionPoolModel(BaseModel): ), ] - @computed_field(alias='sessionCount') + @computed_field @property def session_count(self) -> int: """Get the total number of sessions currently maintained in the pool.""" return len(self.sessions) - @computed_field(alias='usableSessionCount') + @computed_field @property def usable_session_count(self) -> int: """Get the number of sessions that are currently usable.""" return len([session for _, session in self.sessions.items() if session.is_usable]) - @computed_field(alias='retiredSessionCount') + @computed_field @property def retired_session_count(self) -> int: """Get the number of sessions that are no longer usable.""" diff --git a/src/crawlee/statistics/_models.py b/src/crawlee/statistics/_models.py index b17c618540..b555ae8c34 100644 --- a/src/crawlee/statistics/_models.py +++ b/src/crawlee/statistics/_models.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Annotated, Any from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field +from pydantic.alias_generators import to_camel from typing_extensions import override from crawlee._utils.console import make_table @@ -58,14 +59,16 @@ def __str__(self) -> str: class StatisticsState(BaseModel): """Statistic data about a crawler run.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants') - stats_id: Annotated[int | None, Field(alias='statsId')] = None + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, ser_json_inf_nan='constants' + ) + stats_id: int | None = None - requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0 - requests_failed: Annotated[int, Field(alias='requestsFailed')] = 0 - requests_retries: Annotated[int, Field(alias='requestsRetries')] = 0 - requests_failed_per_minute: Annotated[float, Field(alias='requestsFailedPerMinute')] = 0 - requests_finished_per_minute: Annotated[float, Field(alias='requestsFinishedPerMinute')] = 0 + requests_finished: int = 0 + requests_failed: int = 0 + requests_retries: int = 0 + requests_failed_per_minute: float = 0 + requests_finished_per_minute: float = 0 request_min_duration: Annotated[timedelta_ms | None, Field(alias='requestMinDurationMillis')] = None request_max_duration: Annotated[timedelta_ms | None, Field(alias='requestMaxDurationMillis')] = None request_total_failed_duration: Annotated[timedelta_ms, Field(alias='requestTotalFailedDurationMillis')] = ( @@ -74,9 +77,9 @@ class StatisticsState(BaseModel): request_total_finished_duration: Annotated[timedelta_ms, Field(alias='requestTotalFinishedDurationMillis')] = ( timedelta() ) - crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None + crawler_started_at: datetime | None = None crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None - crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None + crawler_finished_at: datetime | None = None # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: @@ -85,18 +88,12 @@ class StatisticsState(BaseModel): requests_with_status_code: dict[str, int] = {} else: errors: Annotated[dict[str, Any], Field(default_factory=dict)] - retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)] - requests_with_status_code: Annotated[ - dict[str, int], - Field(alias='requestsWithStatusCode', default_factory=dict), - ] - - stats_persisted_at: Annotated[ - datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc)) - ] = None + retry_errors: Annotated[dict[str, Any], Field(default_factory=dict)] + requests_with_status_code: Annotated[dict[str, int], Field(default_factory=dict)] + + stats_persisted_at: Annotated[datetime | None, PlainSerializer(lambda _: datetime.now(timezone.utc))] = None request_retry_histogram: Annotated[ dict[int, int], - Field(alias='requestRetryHistogram'), PlainValidator(lambda value: dict(enumerate(value)), json_schema_input_type=list[int]), PlainSerializer( lambda value: [value.get(i, 0) for i in range(max(value.keys(), default=0) + 1)], @@ -150,7 +147,7 @@ def request_avg_failed_duration(self) -> timedelta | None: def request_avg_finished_duration(self) -> timedelta | None: return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None - @computed_field(alias='requestsTotal') + @computed_field @property def requests_total(self) -> int: return self.requests_failed + self.requests_finished diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py index 1725340a53..33b94ef506 100644 --- a/src/crawlee/storage_clients/models.py +++ b/src/crawlee/storage_clients/models.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Annotated, Any, Generic from pydantic import BaseModel, BeforeValidator, ConfigDict, Field +from pydantic.alias_generators import to_camel from typing_extensions import TypeVar from crawlee._types import HttpMethod, JsonSerializable @@ -23,21 +24,23 @@ class StorageMetadata(BaseModel): It contains common fields shared across all specific storage types. """ - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow', from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, extra='allow', from_attributes=True + ) - id: Annotated[str, Field(alias='id')] + id: str """The unique identifier of the storage.""" - name: Annotated[str | None, Field(alias='name', default=None)] + name: str | None = None """The name of the storage.""" - accessed_at: Annotated[datetime, Field(alias='accessedAt')] + accessed_at: datetime """The timestamp when the storage was last accessed.""" - created_at: Annotated[datetime, Field(alias='createdAt')] + created_at: datetime """The timestamp when the storage was created.""" - modified_at: Annotated[datetime, Field(alias='modifiedAt')] + modified_at: datetime """The timestamp when the storage was last modified.""" @@ -45,9 +48,11 @@ class StorageMetadata(BaseModel): class DatasetMetadata(StorageMetadata): """Model for a dataset metadata.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, from_attributes=True + ) - item_count: Annotated[int, Field(alias='itemCount')] + item_count: int """The number of items in the dataset.""" @@ -55,25 +60,29 @@ class DatasetMetadata(StorageMetadata): class KeyValueStoreMetadata(StorageMetadata): """Model for a key-value store metadata.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, from_attributes=True + ) @docs_group('Storage data') class RequestQueueMetadata(StorageMetadata): """Model for a request queue metadata.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, from_attributes=True + ) - had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] + had_multiple_clients: bool """Indicates whether the queue has been accessed by multiple clients (consumers).""" - handled_request_count: Annotated[int, Field(alias='handledRequestCount')] + handled_request_count: int """The number of requests that have been handled from the queue.""" - pending_request_count: Annotated[int, Field(alias='pendingRequestCount')] + pending_request_count: int """The number of requests that are still pending in the queue.""" - total_request_count: Annotated[int, Field(alias='totalRequestCount')] + total_request_count: int """The total number of requests that have been added to the queue.""" @@ -81,21 +90,23 @@ class RequestQueueMetadata(StorageMetadata): class KeyValueStoreRecordMetadata(BaseModel): """Model for a key-value store record metadata.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, from_attributes=True + ) - key: Annotated[str, Field(alias='key')] + key: str """The key of the record. A unique identifier for the record in the key-value store. """ - content_type: Annotated[str, Field(alias='contentType')] + content_type: str """The MIME type of the record. Describe the format and type of data stored in the record, following the MIME specification. """ - size: Annotated[int | None, Field(alias='size', default=None)] = None + size: int | None = None """The size of the record in bytes.""" @@ -103,9 +114,11 @@ class KeyValueStoreRecordMetadata(BaseModel): class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]): """Model for a key-value store record.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, from_attributes=True + ) - value: Annotated[KvsValueType, Field(alias='value')] + value: KvsValueType """The value of the record.""" @@ -113,7 +126,9 @@ class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]): class DatasetItemsListPage(BaseModel): """Model for a single page of dataset items returned from a collection list method.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, from_attributes=True + ) count: Annotated[int, Field(default=0)] """The number of objects returned on this page.""" @@ -143,25 +158,29 @@ class DatasetItemsListPage(BaseModel): class ProcessedRequest(BaseModel): """Represents a processed request.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, from_attributes=True + ) id: Annotated[str | None, Field(alias='requestId', default=None)] = None """Internal representation of the request by the storage client. Only some clients use id.""" - unique_key: Annotated[str, Field(alias='uniqueKey')] - was_already_present: Annotated[bool, Field(alias='wasAlreadyPresent')] - was_already_handled: Annotated[bool, Field(alias='wasAlreadyHandled')] + unique_key: str + was_already_present: bool + was_already_handled: bool @docs_group('Storage data') class UnprocessedRequest(BaseModel): """Represents an unprocessed request.""" - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, from_attributes=True + ) - unique_key: Annotated[str, Field(alias='uniqueKey')] - url: Annotated[str, BeforeValidator(validate_http_url), Field()] - method: Annotated[HttpMethod | None, Field()] = None + unique_key: str + url: Annotated[str, BeforeValidator(validate_http_url)] + method: HttpMethod | None = None @docs_group('Storage data') @@ -173,11 +192,13 @@ class AddRequestsResponse(BaseModel): encountered issues during processing. """ - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True) + model_config = ConfigDict( + validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, from_attributes=True + ) - processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')] + processed_requests: list[ProcessedRequest] """Successfully processed requests, including information about whether they were already present in the queue and whether they had been handled previously.""" - unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')] + unprocessed_requests: list[UnprocessedRequest] """Requests that could not be processed, typically due to validation errors or other issues.""" From b72f93e21ec88439243e3e6e6b2540df50abcaac Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jun 2026 15:20:21 +0200 Subject: [PATCH 2/2] fix: use camelCase serialization keys for adaptive stats, crawler status, and session_id --- src/crawlee/_request.py | 3 +-- .../_adaptive_playwright_crawler_statistics.py | 12 ++++-------- src/crawlee/events/_types.py | 4 +--- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index 05667af46b..ef20dd3736 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -61,8 +61,7 @@ class CrawleeRequestData(BaseModel): crawl_depth: int = 0 """The depth of the request in the crawl tree.""" - # Serialized with a snake_case key, so it keeps an explicit alias that overrides the camelCase generator. - session_id: Annotated[str | None, Field(alias='session_id')] = None + session_id: str | None = None """ID of a session to which the request is bound.""" diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py index 00a23db51e..6dc0761eb7 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py @@ -1,8 +1,6 @@ from __future__ import annotations -from typing import Annotated - -from pydantic import ConfigDict, Field +from pydantic import ConfigDict from pydantic.alias_generators import to_camel from crawlee._utils.docs import docs_group @@ -17,13 +15,11 @@ class AdaptivePlaywrightCrawlerStatisticState(StatisticsState): validate_by_name=True, validate_by_alias=True, alias_generator=to_camel, ser_json_inf_nan='constants' ) - # These fields are serialized with snake_case keys, so they keep explicit aliases that override the - # camelCase alias generator. - http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0 + http_only_request_handler_runs: int = 0 """Number representing how many times static http based crawling was used.""" - browser_request_handler_runs: Annotated[int, Field(alias='browser_request_handler_runs')] = 0 + browser_request_handler_runs: int = 0 """Number representing how many times browser based crawling was used.""" - rendering_type_mispredictions: Annotated[int, Field(alias='rendering_type_mispredictions')] = 0 + rendering_type_mispredictions: int = 0 """Number representing how many times the predictor gave incorrect prediction.""" diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index 768f3449a5..5895a4f6c0 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -85,9 +85,7 @@ class EventExitData(BaseModel): class EventCrawlerStatusData(BaseModel): """Data for the crawler status event.""" - # This event is only emitted and consumed in-process, and its fields keep their snake_case keys, so the - # camelCase alias generator is intentionally not applied here. - model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) + model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, alias_generator=to_camel) message: str """A message describing the current status of the crawler."""