From dad6ecf8857c748a640d1f1b951dbf973d75d4ed Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Wed, 7 May 2025 09:21:24 +0200
Subject: [PATCH 1/2] Use `JsonSerializable` for `user_data`

---
 src/crawlee/_request.py                         |  2 +-
 src/crawlee/_types.py                           |  6 +++---
 .../_abstract_http/_abstract_http_crawler.py    | 16 +++++++++++-----
 .../crawlers/_playwright/_playwright_crawler.py | 14 ++++++++++----
 4 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py
index adb43949ea..93a255641a 100644
--- a/src/crawlee/_request.py
+++ b/src/crawlee/_request.py
@@ -129,7 +129,7 @@ class RequestOptions(TypedDict):
     keep_url_fragment: NotRequired[bool]
     use_extended_unique_key: NotRequired[bool]
     always_enqueue: NotRequired[bool]
-    user_data: NotRequired[dict[str, JsonSerializable]]
+    user_data: NotRequired[JsonSerializable]
     no_retry: NotRequired[bool]
diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py
index 1764f658a3..d05c282737 100644
--- a/src/crawlee/_types.py
+++ b/src/crawlee/_types.py
@@ -346,7 +346,7 @@ def __call__(
         *,
         selector: str | None = None,
         label: str | None = None,
-        user_data: dict[str, Any] | None = None,
+        user_data: JsonSerializable = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
@@ -361,7 +361,7 @@ def __call__(
         *,
         selector: str | None = None,
         label: str | None = None,
-        user_data: dict[str, Any] | None = None,
+        user_data: JsonSerializable = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         requests: Sequence[str | Request] | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
@@ -398,7 +398,7 @@ def __call__(
         *,
         selector: str = 'a',
         label: str | None = None,
-        user_data: dict[str, Any] | None = None,
+        user_data: JsonSerializable = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, list[Request]]:
diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
index 5d05098886..be5b93858b 100644
--- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
+++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -3,7 +3,7 @@
 import asyncio
 import logging
 from abc import ABC
-from typing import TYPE_CHECKING, Any, Callable, Generic, Union
+from typing import TYPE_CHECKING, Callable, Generic, Union
 
 from pydantic import ValidationError
 from typing_extensions import TypeVar
@@ -23,7 +23,13 @@
     from typing_extensions import Unpack
 
     from crawlee import RequestTransformAction
-    from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs, ExtractLinksFunction
+    from crawlee._types import (
+        BasicCrawlingContext,
+        EnqueueLinksFunction,
+        EnqueueLinksKwargs,
+        ExtractLinksFunction,
+        JsonSerializable,
+    )
 
     from ._abstract_http_parser import AbstractHttpParser
@@ -150,7 +156,7 @@ async def extract_links(
             *,
             selector: str = 'a',
             label: str | None = None,
-            user_data: dict[str, Any] | None = None,
+            user_data: JsonSerializable = None,
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             **kwargs: Unpack[EnqueueLinksKwargs],
@@ -173,7 +179,7 @@ async def extract_links(
                     skipped.append(url)
                     continue
 
-                request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+                request_options = RequestOptions(url=url, user_data=base_user_data, label=label)
 
                 if transform_request_function:
                     transform_request_options = transform_request_function(request_options)
@@ -220,7 +226,7 @@ async def enqueue_links(
             *,
             selector: str | None = None,
             label: str | None = None,
-            user_data: dict[str, Any] | None = None,
+            user_data: JsonSerializable = None,
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             requests: Sequence[str | Request] | None = None,
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index 946645b585..0eb1681599 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -37,7 +37,13 @@
     from typing_extensions import Unpack
 
     from crawlee import RequestTransformAction
-    from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs, ExtractLinksFunction
+    from crawlee._types import (
+        BasicCrawlingContext,
+        EnqueueLinksFunction,
+        EnqueueLinksKwargs,
+        ExtractLinksFunction,
+        JsonSerializable,
+    )
 
     from crawlee.browsers._types import BrowserType
@@ -281,7 +287,7 @@ async def extract_links(
             *,
             selector: str = 'a',
             label: str | None = None,
-            user_data: dict | None = None,
+            user_data: JsonSerializable = None,
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             **kwargs: Unpack[EnqueueLinksKwargs],
@@ -314,7 +320,7 @@ async def extract_links(
                     skipped.append(url)
                     continue
 
-                request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
+                request_option = RequestOptions({'url': url, 'user_data': base_user_data, 'label': label})
 
                 if transform_request_function:
                     transform_request_option = transform_request_function(request_option)
@@ -352,7 +358,7 @@ async def enqueue_links(
             *,
             selector: str | None = None,
             label: str | None = None,
-            user_data: dict | None = None,
+            user_data: JsonSerializable = None,
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             requests: Sequence[str | Request] | None = None,

From 5eab9d8668042ed865e75f9f108e14f2676cb46e Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Wed, 7 May 2025 10:36:06 +0200
Subject: [PATCH 2/2] Keep the top dict

---
 src/crawlee/_request.py                                       | 2 +-
 src/crawlee/_types.py                                         | 6 +++---
 src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py | 4 ++--
 src/crawlee/crawlers/_playwright/_playwright_crawler.py       | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py
index 93a255641a..adb43949ea 100644
--- a/src/crawlee/_request.py
+++ b/src/crawlee/_request.py
@@ -129,7 +129,7 @@ class RequestOptions(TypedDict):
     keep_url_fragment: NotRequired[bool]
     use_extended_unique_key: NotRequired[bool]
     always_enqueue: NotRequired[bool]
-    user_data: NotRequired[JsonSerializable]
+    user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py
index d05c282737..f777295c47 100644
--- a/src/crawlee/_types.py
+++ b/src/crawlee/_types.py
@@ -346,7 +346,7 @@ def __call__(
         *,
         selector: str | None = None,
         label: str | None = None,
-        user_data: JsonSerializable = None,
+        user_data: dict[str, JsonSerializable] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
@@ -361,7 +361,7 @@ def __call__(
         *,
         selector: str | None = None,
         label: str | None = None,
-        user_data: JsonSerializable = None,
+        user_data: dict[str, JsonSerializable] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         requests: Sequence[str | Request] | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
@@ -398,7 +398,7 @@ def __call__(
         *,
         selector: str = 'a',
         label: str | None = None,
-        user_data: JsonSerializable = None,
+        user_data: dict[str, JsonSerializable] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, list[Request]]:
diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
index be5b93858b..6d656a4887 100644
--- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
+++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -156,7 +156,7 @@ async def extract_links(
             *,
             selector: str = 'a',
             label: str | None = None,
-            user_data: JsonSerializable = None,
+            user_data: dict[str, JsonSerializable] | None = None,
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             **kwargs: Unpack[EnqueueLinksKwargs],
@@ -226,7 +226,7 @@ async def enqueue_links(
             *,
             selector: str | None = None,
             label: str | None = None,
-            user_data: JsonSerializable = None,
+            user_data: dict[str, JsonSerializable] | None = None,
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             requests: Sequence[str | Request] | None = None,
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index 0eb1681599..ca5fcf617d 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -287,7 +287,7 @@ async def extract_links(
             *,
             selector: str = 'a',
             label: str | None = None,
-            user_data: JsonSerializable = None,
+            user_data: dict[str, JsonSerializable] | None = None,
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             **kwargs: Unpack[EnqueueLinksKwargs],
@@ -358,7 +358,7 @@ async def enqueue_links(
             *,
             selector: str | None = None,
             label: str | None = None,
-            user_data: JsonSerializable = None,
+            user_data: dict[str, JsonSerializable] | None = None,
             transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
             | None = None,
             requests: Sequence[str | Request] | None = None,
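
Usage note (a sketch, not part of either patch): with PATCH 2/2 applied,
`user_data` keeps its top-level dict, `dict[str, JsonSerializable]`, so only
the values may be arbitrary JSON-serializable objects. Roughly what a caller
can now pass to `enqueue_links` -- the crawler class, handler, and URL below
are illustrative assumptions, not taken from the patches:

    import asyncio

    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


    async def main() -> None:
        crawler = ParselCrawler()

        @crawler.router.default_handler
        async def handler(context: ParselCrawlingContext) -> None:
            # The top level must still be a dict keyed by str; each value may
            # be any JsonSerializable, including nested lists and dicts.
            await context.enqueue_links(
                selector='a',
                label='DETAIL',
                user_data={'depth': 1, 'tags': ['docs'], 'meta': {'source': 'listing'}},
            )

        await crawler.run(['https://crawlee.dev'])


    if __name__ == '__main__':
        asyncio.run(main())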