From 0426a58f9e0641434ce17978e65bdb8d98a6d768 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Jun 2026 19:56:30 +0200 Subject: [PATCH 01/24] docs: Add uv guide Add a new guide on managing Actor projects with the uv package manager, covering project setup, local development with the Apify CLI, the uv-based Dockerfile, deployment, and dependency management. --- docs/01_introduction/quick-start.mdx | 1 + docs/03_guides/08_uv.mdx | 180 ++++++++++++++++++ docs/03_guides/code/uv_project/Dockerfile | 38 ++++ .../code/uv_project/my_actor/__init__.py | 0 .../code/uv_project/my_actor/__main__.py | 6 + .../code/uv_project/my_actor/main.py | 8 + docs/03_guides/code/uv_project/pyproject.toml | 13 ++ 7 files changed, 246 insertions(+) create mode 100644 docs/03_guides/08_uv.mdx create mode 100644 docs/03_guides/code/uv_project/Dockerfile create mode 100644 docs/03_guides/code/uv_project/my_actor/__init__.py create mode 100644 docs/03_guides/code/uv_project/my_actor/__main__.py create mode 100644 docs/03_guides/code/uv_project/my_actor/main.py create mode 100644 docs/03_guides/code/uv_project/pyproject.toml diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index da166da9..4b99c491 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -106,3 +106,4 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Crawlee](../guides/crawlee) - [Scrapy](../guides/scrapy) - [Running webserver](../guides/running-webserver) +- [uv](../guides/uv) diff --git a/docs/03_guides/08_uv.mdx b/docs/03_guides/08_uv.mdx new file mode 100644 index 00000000..199bd42a --- /dev/null +++ b/docs/03_guides/08_uv.mdx @@ -0,0 +1,180 @@ +--- +id: uv +title: Use uv +description: Manage your Actor's Python version, dependencies, and virtual environment with the uv package and project manager. 
+--- + +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import PyprojectExample from '!!raw-loader!./code/uv_project/pyproject.toml'; +import MainExample from '!!raw-loader!./code/uv_project/my_actor/main.py'; +import UnderscoreMainExample from '!!raw-loader!./code/uv_project/my_actor/__main__.py'; +import DockerfileExample from '!!raw-loader!./code/uv_project/Dockerfile'; + +In this guide, you'll learn how to use [uv](https://docs.astral.sh/uv/) to manage your Apify Actor projects - from creating a new project, through running it locally, to building and deploying it on the Apify platform. + +## Introduction + +[uv](https://docs.astral.sh/uv/) is an extremely fast Python package and project manager. It replaces the combination of pip, virtualenv, and similar tools with a single binary that manages your project's Python version, virtual environment, and dependencies. It records the project metadata in the standard [`pyproject.toml`](https://packaging.python.org/en/latest/guides/writing-pyproject-toml/) file and the exact resolved versions of all dependencies in a [`uv.lock`](https://docs.astral.sh/uv/concepts/projects/sync/) lockfile. + +The [Python Actor templates](https://apify.com/templates/categories/python) declare their dependencies in a `requirements.txt` file, which is the default approach for Actors. Using uv instead brings a few advantages: + +- The lockfile guarantees that the dependencies installed in the Actor's Docker image are exactly the ones you developed and tested against locally. +- Dependency installation during the Docker build is significantly faster than with pip, especially with a warm cache. +- A single tool manages your Python interpreter, virtual environment, and dependencies, so the project works the same on every machine. 
+ +To follow along, install [uv](https://docs.astral.sh/uv/getting-started/installation/) and the [Apify CLI](https://docs.apify.com/cli/docs/installation) first. If you prefer to start from a ready-made project instead of setting one up step by step, use the [uv Actor template](https://apify.com/templates/python-uv). + +## Create a new project + +Create a new uv project and add the Apify SDK to its dependencies: + +```bash +uv init my-actor --bare +cd my-actor +uv python pin 3.14 +uv add apify +``` + +The [`uv init`](https://docs.astral.sh/uv/reference/cli/#uv-init) command with the `--bare` option creates just the `pyproject.toml` project manifest. The `uv python pin` command writes the project's Python version to the `.python-version` file - uv automatically downloads that Python version if it's not installed on your machine. Finally, [`uv add`](https://docs.astral.sh/uv/reference/cli/#uv-add) records the dependency in `pyproject.toml`, resolves the exact versions of the whole dependency tree into `uv.lock`, and installs everything into the project's virtual environment in `.venv`. + +The `uv add` command constrains the dependency to the latest version it resolved. You can edit the constraint as you see fit - this guide's example Actor allows any version of the SDK within the current major one: + + + {PyprojectExample} + + +The `package = false` setting in the `[tool.uv]` section tells uv that the project is not a Python package that needs to be built and installed - the Actor just runs as a module straight from the source tree, and uv only manages its dependencies. + +## Add the Actor scaffolding + +For the project to be runnable as an Actor, it needs two more pieces: the source code as a runnable Python package, and the `.actor/` directory with the [Actor configuration](https://docs.apify.com/platform/actors/development/actor-definition/actor-json). 
+ +Create a `my_actor` package with the Actor's source code: + + + + + {MainExample} + + + + + {UnderscoreMainExample} + + + + +Don't forget to add an empty `my_actor/__init__.py` file, so that the directory is a regular Python package executable with `python -m my_actor`. + +Then add the Actor definition to `.actor/actor.json`: + +```json title=".actor/actor.json" +{ + "$schema": "https://apify.com/schemas/v1/actor.ide.json", + "actorSpecification": 1, + "name": "my-actor", + "title": "My uv Actor", + "description": "An Apify Actor with dependencies managed by uv.", + "version": "0.1", + "buildTag": "latest", + "dockerfile": "../Dockerfile" +} +``` + +The final project structure looks like this: + +```text +my-actor/ +├── .actor/ +│ └── actor.json +├── my_actor/ +│ ├── __init__.py +│ ├── __main__.py +│ └── main.py +├── .python-version +├── Dockerfile +├── pyproject.toml +└── uv.lock +``` + +Make sure to commit `uv.lock` and `.python-version` to version control, so that every machine - and the Actor's Docker build - works with identical dependencies and Python version. + +## Run the Actor locally + +If you've just cloned the project (or skipped `uv add` above), install the dependencies first: + +```bash +uv sync +``` + +The [`uv sync`](https://docs.astral.sh/uv/reference/cli/#uv-sync) command creates the `.venv` virtual environment (if it doesn't exist yet) and installs the locked dependencies into it. Then run the Actor with the Apify CLI: + +```bash +apify run --purge +``` + +The [`apify run`](https://docs.apify.com/cli/docs/reference#apify-run) command automatically detects the virtual environment in `.venv` and uses it to run the Actor as a module (`python -m my_actor`), with the environment set up to emulate the Apify platform locally - for example, the Actor input is read from `storage/key_value_stores/default/INPUT.json`. 
+ +## Use uv in the Dockerfile + +On the Apify platform, the Actor runs as a Docker container built from the Dockerfile referenced in `.actor/actor.json`. The following Dockerfile installs the locked dependencies with uv on top of the [Apify Python base image](https://hub.docker.com/r/apify/actor-python): + + + {DockerfileExample} + + +A few details worth understanding: + +- The uv binary is copied from its [official Docker image](https://docs.astral.sh/uv/guides/integration/docker/), pinned to a minor version line, so builds are reproducible and there is no need to install uv with pip. +- `uv sync --locked --no-dev` installs the dependencies exactly as recorded in `uv.lock` and skips development dependencies. If the lockfile is missing or out of sync with `pyproject.toml`, the build fails instead of silently resolving different versions. +- The dependencies are installed in a separate layer before the source code is copied, so editing your code doesn't invalidate the dependency layer, and rebuilds are fast. +- Putting `.venv/bin` first on `PATH` makes `python` resolve to the project's virtual environment, both during the build and when the Actor runs. + +Also create a `.dockerignore` file and exclude at least `.venv`, `.git`, and `storage` from the Docker build context - the local virtual environment must never be copied into the image, since it's recreated by `uv sync` during the build. + +## Deploy to the Apify platform + +Once the Actor works locally, log in and push it to the Apify platform: + +```bash +apify login +apify push +``` + +The [`apify push`](https://docs.apify.com/cli/docs/reference#apify-push) command uploads the project to the platform and builds the Docker image from the Dockerfile above. Thanks to the committed lockfile, the platform build installs exactly the dependency versions you ran locally. 
+ +## Manage dependencies + +Day-to-day dependency management goes through uv as well: + +```bash +# Add a dependency (records it in pyproject.toml and updates uv.lock). +uv add httpx + +# Add a development-only dependency (skipped in the Docker build by --no-dev). +uv add --dev ruff + +# Remove a dependency. +uv remove httpx + +# Upgrade all dependencies to the latest versions allowed by pyproject.toml. +uv lock --upgrade +uv sync +``` + +Whenever the dependencies change, commit the updated `uv.lock` together with `pyproject.toml`. + +## Conclusion + +In this guide, you learned how to use uv to manage Apify Actor projects. You can now create a uv project with the Apify SDK, run it locally with the Apify CLI, install the locked dependencies with uv in the Actor's Docker image, and deploy the whole project to the Apify platform with reproducible builds. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy coding! + +## Additional resources + +- [uv: Official documentation](https://docs.astral.sh/uv/) +- [uv: Working on projects](https://docs.astral.sh/uv/guides/projects/) +- [uv: Using uv in Docker](https://docs.astral.sh/uv/guides/integration/docker/) +- [Apify: Actor Dockerfile documentation](https://docs.apify.com/platform/actors/development/actor-definition/dockerfile) +- [Apify templates: Python](https://apify.com/templates/categories/python) diff --git a/docs/03_guides/code/uv_project/Dockerfile b/docs/03_guides/code/uv_project/Dockerfile new file mode 100644 index 00000000..24e7a44b --- /dev/null +++ b/docs/03_guides/code/uv_project/Dockerfile @@ -0,0 +1,38 @@ +# syntax=docker/dockerfile:1 +# First, specify the base Docker image. +# You can see the Docker images from Apify at https://hub.docker.com/r/apify/. +# You can also use any other image from Docker Hub. 
+FROM apify/actor-python:3.14 + +# Add the uv binary from its official distroless image (pinned to the 0.11.x line). +COPY --from=ghcr.io/astral-sh/uv:0.11 /uv /uvx /bin/ + +# Configure uv for container builds: +# - compile installed packages to bytecode, so the Actor starts faster, +# - copy packages instead of hardlinking, which avoids warnings with the cache mount, +# - never download a managed Python, always reuse the base image's interpreter, +# - put the project virtual environment first on PATH, so `python` resolves to it. +ENV UV_COMPILE_BYTECODE=1 \ + UV_LINK_MODE=copy \ + UV_PYTHON_DOWNLOADS=0 \ + PATH="/usr/src/app/.venv/bin:$PATH" + +# Install dependencies into the project virtual environment (.venv) as a separate +# layer. The cache mount speeds up repeated builds, and the bind mounts make the +# project metadata available without copying it into the image. This layer is +# rebuilt only when uv.lock or pyproject.toml change - not on source code edits. +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=uv.lock,target=uv.lock \ + --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ + uv sync --locked --no-dev + +# Next, copy the remaining files and directories with the source code. +# Since we do this after installing the dependencies, quick rebuilds will be +# really fast for most source file changes. +COPY . ./ + +# Use compileall to ensure the runnability of the Actor Python code. +RUN python -m compileall -q my_actor/ + +# Specify how to launch the source code of your Actor. 
+CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/uv_project/my_actor/__init__.py b/docs/03_guides/code/uv_project/my_actor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/03_guides/code/uv_project/my_actor/__main__.py b/docs/03_guides/code/uv_project/my_actor/__main__.py new file mode 100644 index 00000000..8c4ab0b8 --- /dev/null +++ b/docs/03_guides/code/uv_project/my_actor/__main__.py @@ -0,0 +1,6 @@ +import asyncio + +from .main import main + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/uv_project/my_actor/main.py b/docs/03_guides/code/uv_project/my_actor/main.py new file mode 100644 index 00000000..10e88e19 --- /dev/null +++ b/docs/03_guides/code/uv_project/my_actor/main.py @@ -0,0 +1,8 @@ +from apify import Actor + + +async def main() -> None: + async with Actor: + actor_input = await Actor.get_input() or {} + Actor.log.info('Actor input: %s', actor_input) + await Actor.set_value('OUTPUT', 'Hello from a uv-managed Actor!') diff --git a/docs/03_guides/code/uv_project/pyproject.toml b/docs/03_guides/code/uv_project/pyproject.toml new file mode 100644 index 00000000..1e695559 --- /dev/null +++ b/docs/03_guides/code/uv_project/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "my-actor" +version = "0.1.0" +description = "An Apify Actor with dependencies managed by uv." +requires-python = ">=3.14" +dependencies = [ + "apify>=3.0.0,<4.0.0", +] + +[tool.uv] +# The Actor runs straight from the source tree as a module. uv only manages +# its dependencies, the project itself is not built and installed as a package. +package = false From 7efcf3e4157978acb335ca2a568f0e91c9fdfeaf Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Jun 2026 20:19:33 +0200 Subject: [PATCH 02/24] docs: Clarify Dockerfile forward reference and simplify run command in uv guide The scaffolding section referenced a Dockerfile that is only created later in the guide, which was confusing without a pointer. 
The local run example also used the optional --purge flag, which is not needed for the tutorial flow. --- docs/03_guides/08_uv.mdx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/03_guides/08_uv.mdx b/docs/03_guides/08_uv.mdx index 199bd42a..31b6cda3 100644 --- a/docs/03_guides/08_uv.mdx +++ b/docs/03_guides/08_uv.mdx @@ -84,6 +84,8 @@ Then add the Actor definition to `.actor/actor.json`: } ``` +The `dockerfile` field points to the project's `Dockerfile`, which doesn't exist yet - you'll create it in the [Use uv in the Dockerfile](#use-uv-in-the-dockerfile) section below. + The final project structure looks like this: ```text @@ -113,7 +115,7 @@ uv sync The [`uv sync`](https://docs.astral.sh/uv/reference/cli/#uv-sync) command creates the `.venv` virtual environment (if it doesn't exist yet) and installs the locked dependencies into it. Then run the Actor with the Apify CLI: ```bash -apify run --purge +apify run ``` The [`apify run`](https://docs.apify.com/cli/docs/reference#apify-run) command automatically detects the virtual environment in `.venv` and uses it to run the Actor as a module (`python -m my_actor`), with the environment set up to emulate the Apify platform locally - for example, the Actor input is read from `storage/key_value_stores/default/INPUT.json`. From 4421d77d98e5b8018c9d6fd3ac515816e24373f7 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Jun 2026 20:23:17 +0200 Subject: [PATCH 03/24] docs: Retitle uv guide to describe what it is for "Use uv" was too terse - unlike the scraping-library guides, the tool name alone does not convey the guide's purpose. The new title mirrors the guide's intro sentence and the verb-first sidebar convention. 
--- docs/03_guides/08_uv.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/03_guides/08_uv.mdx b/docs/03_guides/08_uv.mdx index 31b6cda3..4fcb826d 100644 --- a/docs/03_guides/08_uv.mdx +++ b/docs/03_guides/08_uv.mdx @@ -1,6 +1,6 @@ --- id: uv -title: Use uv +title: Manage your project with uv description: Manage your Actor's Python version, dependencies, and virtual environment with the uv package and project manager. --- From d5ab7310d33c25c1bbbcc524194b8aa8b99e890a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Jun 2026 20:42:57 +0200 Subject: [PATCH 04/24] docs: Note in the uv guide that Actor templates don't support uv yet The guide pointed readers to a python-uv template that isn't published. An info banner now explains templates are pip-only for now and links the tracking issue apify/actor-templates#350. --- docs/03_guides/08_uv.mdx | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/03_guides/08_uv.mdx b/docs/03_guides/08_uv.mdx index 4fcb826d..7fd67882 100644 --- a/docs/03_guides/08_uv.mdx +++ b/docs/03_guides/08_uv.mdx @@ -25,7 +25,13 @@ The [Python Actor templates](https://apify.com/templates/categories/python) decl - Dependency installation during the Docker build is significantly faster than with pip, especially with a warm cache. - A single tool manages your Python interpreter, virtual environment, and dependencies, so the project works the same on every machine. -To follow along, install [uv](https://docs.astral.sh/uv/getting-started/installation/) and the [Apify CLI](https://docs.apify.com/cli/docs/installation) first. If you prefer to start from a ready-made project instead of setting one up step by step, use the [uv Actor template](https://apify.com/templates/python-uv). +:::info Actor templates don't support uv yet + +The [Apify Actor templates](https://apify.com/templates) currently support only pip with `requirements.txt`. 
Adding uv-based templates is planned - follow [apify/actor-templates#350](https://github.com/apify/actor-templates/issues/350) for updates. + +::: + +To follow along, install [uv](https://docs.astral.sh/uv/getting-started/installation/) and the [Apify CLI](https://docs.apify.com/cli/docs/installation) first. ## Create a new project From 309b4fca47cc27a6d6aa7e7a3f475e05c95f377e Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 4 Jun 2026 09:58:40 +0200 Subject: [PATCH 05/24] docs: Address review feedback on the uv guide --- docs/03_guides/08_uv.mdx | 6 +++--- docs/03_guides/code/uv_project/pyproject.toml | 5 ----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/docs/03_guides/08_uv.mdx b/docs/03_guides/08_uv.mdx index 7fd67882..6d480c81 100644 --- a/docs/03_guides/08_uv.mdx +++ b/docs/03_guides/08_uv.mdx @@ -23,7 +23,7 @@ The [Python Actor templates](https://apify.com/templates/categories/python) decl - The lockfile guarantees that the dependencies installed in the Actor's Docker image are exactly the ones you developed and tested against locally. - Dependency installation during the Docker build is significantly faster than with pip, especially with a warm cache. -- A single tool manages your Python interpreter, virtual environment, and dependencies, so the project works the same on every machine. +- During local development, a single tool manages your Python version, virtual environment, and dependencies, so the project behaves the same on every developer's machine. :::info Actor templates don't support uv yet @@ -52,7 +52,7 @@ The `uv add` command constrains the dependency to the latest version it resolved {PyprojectExample} -The `package = false` setting in the `[tool.uv]` section tells uv that the project is not a Python package that needs to be built and installed - the Actor just runs as a module straight from the source tree, and uv only manages its dependencies. +Note that the example has no `[build-system]` section. 
Without one, uv treats the project as a non-package ("virtual") project: it doesn't try to build and install the project itself, it only manages its dependencies. That's exactly what we want here - the Actor runs as a module straight from the source tree. ## Add the Actor scaffolding @@ -108,7 +108,7 @@ my-actor/ └── uv.lock ``` -Make sure to commit `uv.lock` and `.python-version` to version control, so that every machine - and the Actor's Docker build - works with identical dependencies and Python version. +Make sure to commit `uv.lock` and `.python-version` to version control, so that every developer's machine works with identical dependencies and the same Python version. The Actor's Docker build gets its Python interpreter from the base image instead, so keep the base image tag (`apify/actor-python:3.14`) in sync with `.python-version`. ## Run the Actor locally diff --git a/docs/03_guides/code/uv_project/pyproject.toml b/docs/03_guides/code/uv_project/pyproject.toml index 1e695559..7500a440 100644 --- a/docs/03_guides/code/uv_project/pyproject.toml +++ b/docs/03_guides/code/uv_project/pyproject.toml @@ -6,8 +6,3 @@ requires-python = ">=3.14" dependencies = [ "apify>=3.0.0,<4.0.0", ] - -[tool.uv] -# The Actor runs straight from the source tree as a module. uv only manages -# its dependencies, the project itself is not built and installed as a package. 
-package = false From 54e153d74860c271eb31142644ad088a0b4f6569 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 10:40:25 +0200 Subject: [PATCH 06/24] docs: Add Scrapling guide --- docs/01_introduction/quick-start.mdx | 1 + docs/03_guides/09_scrapling.mdx | 123 +++++++++++++++++++++++++++ docs/03_guides/code/09_scrapling.py | 95 +++++++++++++++++++++ 3 files changed, 219 insertions(+) create mode 100644 docs/03_guides/09_scrapling.mdx create mode 100644 docs/03_guides/code/09_scrapling.py diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index da166da9..c0f8bec3 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -105,4 +105,5 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Selenium](../guides/selenium) - [Crawlee](../guides/crawlee) - [Scrapy](../guides/scrapy) +- [Scrapling](../guides/scrapling) - [Running webserver](../guides/running-webserver) diff --git a/docs/03_guides/09_scrapling.mdx b/docs/03_guides/09_scrapling.mdx new file mode 100644 index 00000000..459e5a25 --- /dev/null +++ b/docs/03_guides/09_scrapling.mdx @@ -0,0 +1,123 @@ +--- +id: scrapling +title: Use Scrapling +description: Build an Apify Actor that scrapes web pages using the Scrapling adaptive web scraping library. +--- + +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ScraplingExample from '!!raw-loader!roa-loader!./code/09_scrapling.py'; + +In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. + +## Introduction + +[Scrapling](https://scrapling.readthedocs.io/) is an adaptive web scraping library for Python that combines fetching and parsing behind a single, high-level API. 
It can fetch a page with fast HTTP requests or with a real browser, parse the result with familiar CSS selectors and XPath, and even relocate your selectors automatically when a website's structure changes. + +Some of the features that make Scrapling a good fit for Apify Actors: + +- **Multiple fetchers** - A single API exposes a fast HTTP client with browser TLS-fingerprint impersonation, as well as full browser automation for JavaScript-heavy or protected pages. +- **Adaptive selectors** - Scrapling can remember the elements you scraped and find them again after a website redesign, so your scrapers keep working with fewer manual fixes. +- **Anti-bot evasion** - Built-in stealth features (browser impersonation, realistic headers, and automatic Cloudflare Turnstile solving with the browser fetchers) help you avoid being blocked. +- **Familiar parsing API** - Elements are selected with CSS selectors (including the `::text` and `::attr()` pseudo-elements) or XPath, with a Scrapy/Parsel-like `.get()` and `.getall()` interface. +- **First-class async support** - Every fetcher has an asynchronous variant, which integrates naturally with the asyncio-based Apify SDK. + +Scrapling's parser works on its own, while the fetchers are an optional extra. Install Scrapling with the `fetchers` extra to get the HTTP and browser fetchers: + +```bash +pip install "scrapling[fetchers]" +``` + +## Choosing a fetcher + +All of Scrapling's fetchers are importable from `scrapling.fetchers`. Pick the one that matches the website you're scraping: + +- **`Fetcher` / `AsyncFetcher`** - Plain HTTP requests via `.get()`, `.post()`, `.put()`, and `.delete()`. Fast and lightweight, with optional browser TLS-fingerprint impersonation (`impersonate`) and realistic headers (`stealthy_headers`). This is the best choice for static pages and APIs, and it needs no browser binaries. 
+- **`DynamicFetcher` / `DynamicSession`** - Full browser automation based on [Playwright](https://playwright.dev/), for pages that require JavaScript rendering or interaction. Fetch a page with `.fetch()` or its async variant `.async_fetch()`. +- **`StealthyFetcher` / `StealthySession`** - A stealth-hardened browser fetcher that can automatically solve Cloudflare Turnstile challenges (`solve_cloudflare=True`). Use it for the most heavily protected websites. + +The returned `Response` object is also a Scrapling selector, so you can call `.css()`, `.xpath()`, `.find_all()`, and the other parsing methods on it directly. + +The HTTP fetchers work with just the `scrapling[fetchers]` extra. The browser-based fetchers (`DynamicFetcher` and `StealthyFetcher`) additionally need browser binaries, which you download with the `scrapling install` command - see [Running browser-based fetchers](#running-browser-based-fetchers) below. + +The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simplest to deploy and pairs well with Apify Proxy. + +## Example Actor + +The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. + + + {ScraplingExample} + + +A few things worth pointing out: + +- The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. +- `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. 
+- The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. + +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example above creates a proxy configuration and passes a fresh proxy URL to every request: + +```python +proxy_configuration = await Actor.create_proxy_configuration() +... +proxy_url = None +if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + +response = await AsyncFetcher.get(url, proxy=proxy_url) +``` + +Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. + +## Running browser-based fetchers + +`DynamicFetcher` and `StealthyFetcher` drive a real browser, so they need the browser binaries installed with the `scrapling install` command. Locally, run it once after installing the `scrapling[fetchers]` extra: + +```bash +scrapling install +``` + +On the Apify platform, the Actor runs in a Docker container, so the browsers have to be installed during the image build. 
Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and then download the browser binaries that Scrapling expects: + + +{`FROM apify/actor-python-playwright:3.14-1.60.0 + +COPY --chown=myuser:myuser requirements.txt ./ +RUN pip install -r requirements.txt + +# Download the browser binaries Scrapling needs. The base image already provides +# their system-level dependencies, so run this step as root. +USER root +RUN scrapling install +USER myuser + +COPY --chown=myuser:myuser . ./ +RUN python -m compileall -q my_actor/ + +CMD ["python", "-m", "my_actor"]`} + + +Fetching a page then only differs in which fetcher you call - the parsing API is identical: + +```python +from scrapling.fetchers import DynamicFetcher + +response = await DynamicFetcher.async_fetch(url, headless=True, network_idle=True) +quotes = response.css('.quote .text::text').getall() +``` + +## Conclusion + +In this guide, you learned how to use Scrapling in your Apify Actors. You can now fetch pages with Scrapling's HTTP or browser-based fetchers, extract data with its CSS and XPath selectors, route requests through Apify Proxy, and run the whole thing on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
+ +## Additional resources + +- [Scrapling: Official documentation](https://scrapling.readthedocs.io/) +- [Scrapling: Fetchers](https://scrapling.readthedocs.io/en/latest/fetching/choosing/) +- [Scrapling: Parsing and selecting elements](https://scrapling.readthedocs.io/en/latest/parsing/selection/) +- [Scrapling: GitHub repository](https://github.com/D4Vinci/Scrapling) +- [Apify: Proxy management](https://docs.apify.com/platform/proxy) diff --git a/docs/03_guides/code/09_scrapling.py b/docs/03_guides/code/09_scrapling.py new file mode 100644 index 00000000..fed1b5ae --- /dev/null +++ b/docs/03_guides/code/09_scrapling.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from scrapling.fetchers import AsyncFetcher + +from apify import Actor, Request + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + new_request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(new_request) + + # Process the URLs from the request queue. 
+ while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an unexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy is set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Fetch the page with Scrapling's asynchronous HTTP fetcher. The + # `impersonate` and `stealthy_headers` options make the request look + # like it comes from a real Chrome browser, reducing the chance of + # being blocked. The returned response is also a Scrapling selector. + response = await AsyncFetcher.get( + url, + proxy=proxy_url, + impersonate='chrome', + stealthy_headers=True, + timeout=60, + ) + + # If the current depth is less than max_depth, find nested links + # and enqueue them. The `::attr(href)` pseudo-selector reads the + # attribute, and `response.urljoin` resolves it against the page URL. + if depth < max_depth: + for link_href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(link_href) + + if link_url.startswith(('http://', 'https://')): + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + # Extract the desired data using Scrapling's CSS selectors. The + # `::text` pseudo-element returns the text content of the elements. + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Store the extracted data to the default dataset. 
+ await Actor.push_data(data) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled to ensure it is not processed again. + await request_queue.mark_request_as_handled(request) From 29c4c8a8a35a83410f998ad8f6d6efea0f9decbf Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 11:24:10 +0200 Subject: [PATCH 07/24] docs: Split Scrapling guide example into modules and use code tabs --- docs/03_guides/09_scrapling.mdx | 93 +++++++++--------- docs/03_guides/code/09_scrapling.py | 95 ------------------- .../code/scrapling_browser_project/Dockerfile | 21 ++++ .../my_actor/scraper.py | 45 +++++++++ .../scrapling_project/my_actor/__init__.py | 0 .../scrapling_project/my_actor/__main__.py | 8 ++ .../code/scrapling_project/my_actor/main.py | 71 ++++++++++++++ .../scrapling_project/my_actor/scraper.py | 47 +++++++++ pyproject.toml | 4 + 9 files changed, 245 insertions(+), 139 deletions(-) delete mode 100644 docs/03_guides/code/09_scrapling.py create mode 100644 docs/03_guides/code/scrapling_browser_project/Dockerfile create mode 100644 docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/__init__.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/__main__.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/main.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/scraper.py diff --git a/docs/03_guides/09_scrapling.mdx b/docs/03_guides/09_scrapling.mdx index 459e5a25..3e76ebca 100644 --- a/docs/03_guides/09_scrapling.mdx +++ b/docs/03_guides/09_scrapling.mdx @@ -5,9 +5,14 @@ description: Build an Apify Actor that scrapes web pages using the Scrapling ada --- import CodeBlock from '@theme/CodeBlock'; -import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; -import ScraplingExample from 
'!!raw-loader!roa-loader!./code/09_scrapling.py'; +import ScraplingMain from '!!raw-loader!./code/scrapling_project/my_actor/main.py'; +import ScraplingScraper from '!!raw-loader!./code/scrapling_project/my_actor/scraper.py'; +import ScraplingEntrypoint from '!!raw-loader!./code/scrapling_project/my_actor/__main__.py'; +import ScraplingBrowserScraper from '!!raw-loader!./code/scrapling_browser_project/my_actor/scraper.py'; +import ScraplingBrowserDockerfile from '!!raw-loader!./code/scrapling_browser_project/Dockerfile'; In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. @@ -47,29 +52,40 @@ The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simpl The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. - - {ScraplingExample} - +The code is split into three small modules, following the structure of the Apify Python Actor templates: + +- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl. +- `my_actor/scraper.py` - The Scrapling-specific logic. A single `scrape_page` function fetches a page and returns the extracted data together with the links found on it. +- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. 
+ + + + + {ScraplingMain} + + + + + {ScraplingScraper} + + + + + {ScraplingEntrypoint} + + + A few things worth pointing out: +- Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. - The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. - `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. - The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. ## Using Apify Proxy -Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example above creates a proxy configuration and passes a fresh proxy URL to every request: - -```python -proxy_configuration = await Actor.create_proxy_configuration() -... -proxy_url = None -if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - -response = await AsyncFetcher.get(url, proxy=proxy_url) -``` +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. 
Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. @@ -81,34 +97,23 @@ Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy scrapling install ``` -On the Apify platform, the Actor runs in a Docker container, so the browsers have to be installed during the image build. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and then download the browser binaries that Scrapling expects: - - -{`FROM apify/actor-python-playwright:3.14-1.60.0 - -COPY --chown=myuser:myuser requirements.txt ./ -RUN pip install -r requirements.txt - -# Download the browser binaries Scrapling needs. The base image already provides -# their system-level dependencies, so run this step as root. -USER root -RUN scrapling install -USER myuser - -COPY --chown=myuser:myuser . ./ -RUN python -m compileall -q my_actor/ - -CMD ["python", "-m", "my_actor"]`} - - -Fetching a page then only differs in which fetcher you call - the parsing API is identical: - -```python -from scrapling.fetchers import DynamicFetcher - -response = await DynamicFetcher.async_fetch(url, headless=True, network_idle=True) -quotes = response.css('.quote .text::text').getall() -``` +Switching the example Actor from HTTP to a real browser only takes two changes - the rest of the project, including `my_actor/main.py`, stays exactly the same: + +1. Swap the fetcher call in `my_actor/scraper.py` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the data extraction is unchanged. +2. 
Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the build to download the browser binaries that Scrapling expects. + + + + + {ScraplingBrowserScraper} + + + + + {ScraplingBrowserDockerfile} + + + ## Conclusion diff --git a/docs/03_guides/code/09_scrapling.py b/docs/03_guides/code/09_scrapling.py deleted file mode 100644 index fed1b5ae..00000000 --- a/docs/03_guides/code/09_scrapling.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import annotations - -from scrapling.fetchers import AsyncFetcher - -from apify import Actor, Request - - -async def main() -> None: - # Enter the context of the Actor. - async with Actor: - # Retrieve the Actor input, and use default values if not provided. - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) - max_depth = actor_input.get('max_depth', 1) - - # Exit if no start URLs are provided. - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Create a proxy configuration that routes requests through Apify Proxy. - proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. - request_queue = await Actor.open_request_queue() - - # Enqueue the start URLs with an initial crawl depth of 0. - for start_url in start_urls: - url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) - - # Process the URLs from the request queue. 
- while request := await request_queue.fetch_next_request(): - url = request.url - - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Get a fresh proxy URL for each request (None if no proxy is set up). - proxy_url = None - if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - - # Fetch the page with Scrapling's asynchronous HTTP fetcher. The - # `impersonate` and `stealthy_headers` options make the request look - # like it comes from a real Chrome browser, reducing the chance of - # being blocked. The returned response is also a Scrapling selector. - response = await AsyncFetcher.get( - url, - proxy=proxy_url, - impersonate='chrome', - stealthy_headers=True, - timeout=60, - ) - - # If the current depth is less than max_depth, find nested links - # and enqueue them. The `::attr(href)` pseudo-selector reads the - # attribute, and `response.urljoin` resolves it against the page URL. - if depth < max_depth: - for link_href in response.css('a::attr(href)').getall(): - link_url = response.urljoin(link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data using Scrapling's CSS selectors. The - # `::text` pseudo-element returns the text content of the elements. - data = { - 'url': url, - 'title': response.css('title::text').get(), - 'h1s': response.css('h1::text').getall(), - 'h2s': response.css('h2::text').getall(), - 'h3s': response.css('h3::text').getall(), - } - - # Store the extracted data to the default dataset. 
- await Actor.push_data(data) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled to ensure it is not processed again. - await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/scrapling_browser_project/Dockerfile b/docs/03_guides/code/scrapling_browser_project/Dockerfile new file mode 100644 index 00000000..38b30c60 --- /dev/null +++ b/docs/03_guides/code/scrapling_browser_project/Dockerfile @@ -0,0 +1,21 @@ +# Use the Apify Playwright base image, which already ships a browser together +# with all of its system-level dependencies. +FROM apify/actor-python-playwright:3.14-1.60.0 + +# Copy just requirements.txt first to leverage the Docker build cache. +COPY --chown=myuser:myuser requirements.txt ./ +RUN pip install -r requirements.txt + +# Download the browser binaries that Scrapling expects. The base image already +# provides their system-level dependencies, so run this step as root and then +# switch back to the unprivileged user. +USER root +RUN scrapling install +USER myuser + +# Copy the rest of the source code and verify that it compiles. +COPY --chown=myuser:myuser . ./ +RUN python -m compileall -q my_actor/ + +# Specify how to launch the Actor. +CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py b/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py new file mode 100644 index 00000000..fb7d4579 --- /dev/null +++ b/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import Any + +from scrapling.fetchers import DynamicFetcher + + +async def scrape_page( + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page in a real browser and extract its data and links. 
+ + `DynamicFetcher` drives a real browser via Playwright, so it can render + JavaScript-heavy pages. `network_idle` waits until the page stops making + network requests before the HTML is captured. Apart from the fetcher call, + everything else - including the parsing - is identical to the HTTP version. + """ + response = await DynamicFetcher.async_fetch( + url, + proxy=proxy_url, + headless=True, + network_idle=True, + ) + + # Extract the desired data using CSS selectors. The `::text` pseudo-element + # returns the text content of the matched elements. + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Collect absolute links from the page. The `::attr(href)` pseudo-selector + # reads the attribute and `response.urljoin` resolves it against the page URL. + links: list[str] = [] + for href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links diff --git a/docs/03_guides/code/scrapling_project/my_actor/__init__.py b/docs/03_guides/code/scrapling_project/my_actor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/03_guides/code/scrapling_project/my_actor/__main__.py b/docs/03_guides/code/scrapling_project/my_actor/__main__.py new file mode 100644 index 00000000..6aeaf3d5 --- /dev/null +++ b/docs/03_guides/code/scrapling_project/my_actor/__main__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +import asyncio + +from .main import main + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/scrapling_project/my_actor/main.py b/docs/03_guides/code/scrapling_project/my_actor/main.py new file mode 100644 index 00000000..d2cd36e7 --- /dev/null +++ b/docs/03_guides/code/scrapling_project/my_actor/main.py @@ -0,0 +1,71 @@ +from 
__future__ import annotations + +from apify import Actor, Request + +from .scraper import scrape_page + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(request) + + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an unexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(url, proxy_url=proxy_url) + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + # If we are not too deep yet, enqueue the links we found. 
+ if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled so it is not processed again. + await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/scrapling_project/my_actor/scraper.py b/docs/03_guides/code/scrapling_project/my_actor/scraper.py new file mode 100644 index 00000000..b840db82 --- /dev/null +++ b/docs/03_guides/code/scrapling_project/my_actor/scraper.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import Any + +from scrapling.fetchers import AsyncFetcher + + +async def scrape_page( + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page with Scrapling and extract its data and links. + + The page is fetched with Scrapling's asynchronous HTTP fetcher. The + `impersonate` and `stealthy_headers` options make the request look like it + comes from a real Chrome browser, which reduces the chance of being blocked. + The returned response is also a Scrapling selector, so it can be queried with + CSS selectors directly. + """ + response = await AsyncFetcher.get( + url, + proxy=proxy_url, + impersonate='chrome', + stealthy_headers=True, + timeout=60, + ) + + # Extract the desired data using CSS selectors. The `::text` pseudo-element + # returns the text content of the matched elements. + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Collect absolute links from the page. The `::attr(href)` pseudo-selector + # reads the attribute and `response.urljoin` resolves it against the page URL. 
+ links: list[str] = [] + for href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links diff --git a/pyproject.toml b/pyproject.toml index d17bdc01..d8697219 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,6 +181,10 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] +"**/docs/**/scrapling_project/**" = [ + # Local imports are mixed up with the Apify SDK. + "I001", # Import block is un-sorted or un-formatted +] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" From 2a41a3f3e19b1e664adcbe35a39bfdacc58e816d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 12:00:53 +0200 Subject: [PATCH 08/24] docs: use Request.crawl_depth for depth tracking in Scrapling example --- .../code/scrapling_project/my_actor/main.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/docs/03_guides/code/scrapling_project/my_actor/main.py b/docs/03_guides/code/scrapling_project/my_actor/main.py index d2cd36e7..52e9ef4c 100644 --- a/docs/03_guides/code/scrapling_project/my_actor/main.py +++ b/docs/03_guides/code/scrapling_project/my_actor/main.py @@ -24,21 +24,18 @@ async def main() -> None: # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(request) + await request_queue.add_request(Request.from_url(url)) # Process the URLs from the request queue. 
while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: @@ -53,14 +50,13 @@ async def main() -> None: # Store the extracted data to the default dataset. await Actor.push_data(data) - # If we are not too deep yet, enqueue the links we found. + # If we are not too deep yet, enqueue the links we found one + # level deeper than the current page. if depth < max_depth: for link_url in links: Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 await request_queue.add_request(new_request) except Exception: From 65f8e0d1bdce589c6ba3249290e4257eb1da7473 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 13:24:12 +0200 Subject: [PATCH 09/24] docs: Flatten scraper examples and fix guide inaccuracies --- docs/03_guides/06_scrapy.mdx | 6 +- docs/03_guides/07_running_webserver.mdx | 12 ++- docs/03_guides/code/01_beautifulsoup_httpx.py | 89 ++++++++++-------- docs/03_guides/code/02_parsel_impit.py | 93 ++++++++++--------- docs/03_guides/code/03_playwright.py | 86 ++++++++++------- docs/03_guides/code/04_selenium.py | 77 ++++++++------- 6 files changed, 209 insertions(+), 154 deletions(-) diff --git a/docs/03_guides/06_scrapy.mdx b/docs/03_guides/06_scrapy.mdx index 12525609..81409ab2 100644 --- a/docs/03_guides/06_scrapy.mdx +++ b/docs/03_guides/06_scrapy.mdx @@ -23,9 +23,9 @@ In this guide, you'll learn how to use the [Scrapy](https://scrapy.org/) framewo ## Integrating Scrapy with the Apify platform -The Apify SDK provides an Apify-Scrapy integration. 
The main challenge of this is to combine two asynchronous frameworks that use different event loop implementations. Scrapy uses [Twisted](https://twisted.org/) for asynchronous execution, while the Apify SDK is based on [asyncio](https://docs.python.org/3/library/asyncio.html). The key thing is to install the Twisted's `asyncioreactor` to run Twisted's asyncio compatible event loop. The `apify.scrapy.run_scrapy_actor` function handles this reactor installation automatically. This allows both Twisted and asyncio to run on a single event loop, enabling a Scrapy spider to run as an Apify Actor with minimal modifications. +The Apify SDK provides an Apify-Scrapy integration. The main challenge of this is to combine two asynchronous frameworks that use different event loop implementations. Scrapy uses [Twisted](https://twisted.org/) for asynchronous execution, while the Apify SDK is based on [asyncio](https://docs.python.org/3/library/asyncio.html). The key thing is to install Twisted's `asyncioreactor` to run Twisted's asyncio compatible event loop. The `apify.scrapy.run_scrapy_actor` function handles this reactor installation automatically. This allows both Twisted and asyncio to run on a single event loop, enabling a Scrapy spider to run as an Apify Actor with minimal modifications. - + {UnderscoreMainExample} @@ -74,7 +74,7 @@ For further details, see the [Scrapy migration guide](https://docs.apify.com/cli The following example shows a Scrapy Actor that scrapes page titles and enqueues links found on each page. This example aligns with the structure provided in the Apify Actor templates. 
- + {UnderscoreMainExample} diff --git a/docs/03_guides/07_running_webserver.mdx b/docs/03_guides/07_running_webserver.mdx index c17c313b..9b63976a 100644 --- a/docs/03_guides/07_running_webserver.mdx +++ b/docs/03_guides/07_running_webserver.mdx @@ -18,9 +18,9 @@ The URL is available in the following places: - In Apify Console, on the Actor run details page as the **Container URL** field. - In the API as the `container_url` property of the [Run object](https://docs.apify.com/api/v2#/reference/actors/run-object/get-run). -- In the Actor as the `Actor.configuration.container_url` property. +- In the Actor as the `Actor.configuration.web_server_url` property. -The web server running inside the container must listen at the port defined by the `Actor.configuration.container_port` property. When running Actors locally, the port defaults to `4321`, so the web server will be accessible at `http://localhost:4321`. +The web server running inside the container must listen at the port defined by the `Actor.configuration.web_server_port` property. When running Actors locally, the port defaults to `4321`, so the web server will be accessible at `http://localhost:4321`. ## Example Actor @@ -30,6 +30,14 @@ The following example shows how to start a simple web server in your Actor, whic {WebserverExample} +## Actor Standby + +The example above runs a web server for the duration of a single Actor run. With [Actor Standby](https://docs.apify.com/platform/actors/development/programming-interface/standby), you can instead expose your Actor as an always-ready HTTP API: the platform keeps the Actor running in the background and routes incoming HTTP requests to the web server inside it, spinning up additional instances as the load grows. + +From the SDK's perspective, a Standby Actor is built the same way as the web server above — start an HTTP server listening on the port from `Actor.configuration.web_server_port`. 
The difference is operational: instead of doing its work once and exiting, a Standby Actor stays up and serves requests. This makes it a good fit for low-latency, on-demand use cases, such as serving scraped data or acting as a microservice. + +To get started quickly, use the [Standby Python template](https://apify.com/templates/python-standby). For details on enabling Standby, request routing, and readiness probes, see the [Actor Standby documentation](https://docs.apify.com/platform/actors/development/programming-interface/standby). + ## Conclusion In this guide, you learned how to run a web server inside your Apify Actor. By leveraging the container URL and port provided by the platform, you can expose HTTP endpoints for monitoring, reporting, or serving content during Actor execution. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index 5dbfab2a..86e83868 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -1,4 +1,5 @@ import asyncio +from typing import Any from urllib.parse import urljoin import httpx @@ -7,6 +8,40 @@ from apify import Actor, Request +async def scrape_page( + client: httpx.AsyncClient, url: str +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page with HTTPX and extract its data and links. + + Keeping the fetching and parsing in this helper keeps the Actor's main loop + shallow. It returns the extracted data together with the links found on the + page, so `main` only has to decide what to store and what to enqueue. + """ + # Fetch the HTTP response from the specified URL using HTTPX. + response = await client.get(url, follow_redirects=True) + + # Parse the HTML content using Beautiful Soup. 
+ soup = BeautifulSoup(response.content, 'html.parser') + + # Extract the desired data. + data = { + 'url': url, + 'title': soup.title.string if soup.title else None, + 'h1s': [h1.text for h1 in soup.find_all('h1')], + 'h2s': [h2.text for h2 in soup.find_all('h2')], + 'h3s': [h3.text for h3 in soup.find_all('h3')], + } + + # Collect absolute links found on the page so the caller can enqueue them. + links: list[str] = [] + for link in soup.find_all('a'): + link_url = urljoin(url, link.get('href')) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -23,12 +58,11 @@ async def main() -> None: # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) + await request_queue.add_request(Request.from_url(url)) # Create an HTTPX client to fetch the HTML content of the URLs. async with httpx.AsyncClient() as client: @@ -36,52 +70,31 @@ async def main() -> None: while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Fetch the HTTP response from the specified URL using HTTPX. - response = await client.get(url, follow_redirects=True) - - # Parse the HTML content using Beautiful Soup. 
- soup = BeautifulSoup(response.content, 'html.parser') - - # If the current depth is less than max_depth, find nested links - # and enqueue them. - if depth < max_depth: - for link in soup.find_all('a'): - link_href = link.get('href') - link_url = urljoin(url, link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data. - data = { - 'url': url, - 'title': soup.title.string if soup.title else None, - 'h1s': [h1.text for h1 in soup.find_all('h1')], - 'h2s': [h2.text for h2 in soup.find_all('h2')], - 'h3s': [h3.text for h3 in soup.find_all('h3')], - } + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(client, url) # Store the extracted data to the default dataset. await Actor.push_data(data) + # If we are not too deep yet, enqueue the links we found. + if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled to ensure it is not processed again. - await request_queue.mark_request_as_handled(new_request) + # Mark the request as handled so it is not processed again. 
+ await request_queue.mark_request_as_handled(request) if __name__ == '__main__': diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py index 21b5e74f..1a0c4f77 100644 --- a/docs/03_guides/code/02_parsel_impit.py +++ b/docs/03_guides/code/02_parsel_impit.py @@ -1,4 +1,5 @@ import asyncio +from typing import Any from urllib.parse import urljoin import impit @@ -7,6 +8,40 @@ from apify import Actor, Request +async def scrape_page( + client: impit.AsyncClient, url: str +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page with Impit and extract its data and links. + + Keeping the fetching and parsing in this helper keeps the Actor's main loop + shallow. It returns the extracted data together with the links found on the + page, so `main` only has to decide what to store and what to enqueue. + """ + # Fetch the HTTP response from the specified URL using Impit. + response = await client.get(url) + + # Parse the HTML content using a Parsel selector. + selector = parsel.Selector(text=response.text) + + # Extract the desired data using Parsel selectors. + data = { + 'url': url, + 'title': selector.css('title::text').get(), + 'h1s': selector.css('h1::text').getall(), + 'h2s': selector.css('h2::text').getall(), + 'h3s': selector.css('h3::text').getall(), + } + + # Collect absolute links found on the page so the caller can enqueue them. + links: list[str] = [] + for link_href in selector.css('a::attr(href)').getall(): + link_url = urljoin(url, link_href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -23,12 +58,11 @@ async def main() -> None: # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. 
for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) + await request_queue.add_request(Request.from_url(url)) # Create an Impit client to fetch the HTML content of the URLs. async with impit.AsyncClient() as client: @@ -36,57 +70,30 @@ async def main() -> None: while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Fetch the HTTP response from the specified URL using Impit. - response = await client.get(url) - - # Parse the HTML content using Parsel Selector. - selector = parsel.Selector(text=response.text) - - # If the current depth is less than max_depth, find nested links - # and enqueue them. - if depth < max_depth: - # Extract all links using CSS selector - links = selector.css('a::attr(href)').getall() - for link_href in links: - link_url = urljoin(url, link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data using Parsel selectors. - title = selector.css('title::text').get() - h1s = selector.css('h1::text').getall() - h2s = selector.css('h2::text').getall() - h3s = selector.css('h3::text').getall() - - data = { - 'url': url, - 'title': title, - 'h1s': h1s, - 'h2s': h2s, - 'h3s': h3s, - } + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(client, url) # Store the extracted data to the default dataset. 
await Actor.push_data(data) + # If we are not too deep yet, enqueue the links we found. + if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled to ensure it is not processed again. + # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index 3eecb4ac..1f2fc1d7 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -1,7 +1,8 @@ import asyncio +from typing import Any from urllib.parse import urljoin -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext, async_playwright from apify import Actor, Request @@ -11,6 +12,39 @@ # in the Actor's Docker image. +async def scrape_page( + context: BrowserContext, url: str +) -> tuple[dict[str, Any], list[str]]: + """Open a page in the browser, extract its data, and collect its links. + + Keeping the page handling in this helper keeps the Actor's main loop shallow. + It returns the extracted data together with the links found on the page, so + `main` only has to decide what to store and what to enqueue. + """ + page = await context.new_page() + try: + await page.goto(url) + + # Extract the desired data. + data = { + 'url': url, + 'title': await page.title(), + } + + # Collect absolute links found on the page so the caller can enqueue them. 
+ links: list[str] = [] + for link in await page.locator('a').all(): + link_href = await link.get_attribute('href') + link_url = urljoin(url, link_href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links + + finally: + await page.close() + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -21,18 +55,17 @@ async def main() -> None: # Exit if no start URLs are provided. if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) + await request_queue.add_request(Request.from_url(url)) Actor.log.info('Launching Playwright...') @@ -49,47 +82,30 @@ async def main() -> None: while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Open a new page in the browser context and navigate to the URL. - page = await context.new_page() - await page.goto(url) - - # If the current depth is less than max_depth, find nested links - # and enqueue them. 
- if depth < max_depth: - for link in await page.locator('a').all(): - link_href = await link.get_attribute('href') - link_url = urljoin(url, link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data. - data = { - 'url': url, - 'title': await page.title(), - } + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(context, url) # Store the extracted data to the default dataset. await Actor.push_data(data) + # If we are not too deep yet, enqueue the links we found. + if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - await page.close() - # Mark the request as handled to ensure it is not processed again. + # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index 4b427a7a..42dc3509 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -1,4 +1,5 @@ import asyncio +from typing import Any from urllib.parse import urljoin from selenium import webdriver @@ -14,6 +15,32 @@ # in the Actor's Docker image. +def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], list[str]]: + """Navigate to a page with Selenium, extract its data, and collect its links. + + These are blocking WebDriver calls, so the Actor's main loop runs this helper + in a worker thread via `asyncio.to_thread`. 
It returns the extracted data + together with the links found on the page, so `main` only has to decide what + to store and what to enqueue. + """ + driver.get(url) + + # Extract the desired data. + data = { + 'url': url, + 'title': driver.title, + } + + # Collect absolute links found on the page so the caller can enqueue them. + links: list[str] = [] + for link in driver.find_elements(By.TAG_NAME, 'a'): + link_url = urljoin(url, link.get_attribute('href')) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -24,18 +51,17 @@ async def main() -> None: # Exit if no start URLs are provided. if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) + await request_queue.add_request(Request.from_url(url)) # Launch a new Selenium Chrome WebDriver and configure it. Actor.log.info('Launching Chrome WebDriver...') @@ -57,46 +83,31 @@ async def main() -> None: while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Navigate to the URL using Selenium WebDriver. 
Use asyncio.to_thread - # for non-blocking execution. - await asyncio.to_thread(driver.get, url) - - # If the current depth is less than max_depth, find nested links - # and enqueue them. - if depth < max_depth: - for link in driver.find_elements(By.TAG_NAME, 'a'): - link_href = link.get_attribute('href') - link_url = urljoin(url, link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data. - data = { - 'url': url, - 'title': driver.title, - } + # Fetch the page and extract its data and nested links. The blocking + # WebDriver calls run in a worker thread to keep the loop responsive. + data, links = await asyncio.to_thread(scrape_page, driver, url) # Store the extracted data to the default dataset. await Actor.push_data(data) + # If we are not too deep yet, enqueue the links we found. + if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled to ensure it is not processed again. + # Mark the request as handled so it is not processed again. 
await request_queue.mark_request_as_handled(request) driver.quit() From 7faa27a78cf6ca50b1a13c4abac8481e832c01a1 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 16:45:55 +0200 Subject: [PATCH 10/24] docs: add guide on validating Actor input with Pydantic --- docs/01_introduction/quick-start.mdx | 1 + docs/02_concepts/02_actor_input.mdx | 4 + docs/03_guides/10_pydantic.mdx | 119 +++++++++++++++++++++++++++ docs/03_guides/code/10_pydantic.py | 72 ++++++++++++++++ 4 files changed, 196 insertions(+) create mode 100644 docs/03_guides/10_pydantic.mdx create mode 100644 docs/03_guides/code/10_pydantic.py diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index da166da9..e0d2e641 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -106,3 +106,4 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Crawlee](../guides/crawlee) - [Scrapy](../guides/scrapy) - [Running webserver](../guides/running-webserver) +- [Validate Actor input with Pydantic](../guides/input-validation) diff --git a/docs/02_concepts/02_actor_input.mdx b/docs/02_concepts/02_actor_input.mdx index 15807c05..f975e6ae 100644 --- a/docs/02_concepts/02_actor_input.mdx +++ b/docs/02_concepts/02_actor_input.mdx @@ -20,6 +20,10 @@ For example, if an Actor received a JSON input with two fields, `{ "firstNumber" {InputExample} +## Validating input + +Reading values straight out of the raw input dictionary works for simple cases, but it gives you no type guarantees, no constraint checks, and no clear error when the input is malformed. For anything beyond a couple of fields, validate the input with [Pydantic](https://docs.pydantic.dev/) so your code works with a typed, guaranteed-valid object instead. See the [Validate Actor input with Pydantic](../guides/input-validation) guide for the recommended approach. 
+ ## Loading URLs from Actor input Actors commonly receive a list of URLs to process via their input. The `ApifyRequestList` class (from `apify.request_loaders`) can parse the standard Apify input format for URL sources. It supports both direct URL objects (`{"url": "https://example.com"}`) and remote URL lists (`{"requestsFromUrl": "https://example.com/urls.txt"}`), where the remote file contains one URL per line. diff --git a/docs/03_guides/10_pydantic.mdx b/docs/03_guides/10_pydantic.mdx new file mode 100644 index 00000000..19a20c85 --- /dev/null +++ b/docs/03_guides/10_pydantic.mdx @@ -0,0 +1,119 @@ +--- +id: input-validation +title: Validate Actor input with Pydantic +description: Parse, validate, and type your Actor's input with Pydantic models instead of reaching into a raw dictionary. +--- + +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import ApiLink from '@theme/ApiLink'; + +import PydanticExample from '!!raw-loader!roa-loader!./code/10_pydantic.py'; + +In this guide, you'll learn how to validate your Apify Actor's input with [Pydantic](https://docs.pydantic.dev/), so that your code works with a typed, guaranteed-valid object instead of a raw dictionary. + +## Introduction + +An Actor reads its input with `Actor.get_input`, which returns the input record as a plain `dict` (or `None` when there's no input). Working with that dictionary directly is fragile: + +```python +actor_input = await Actor.get_input() or {} +search_terms = actor_input.get('searchTerms', []) +max_results = actor_input.get('maxResults', 10) +``` + +- There are no type guarantees - `max_results` could just as easily arrive as the string `"10"` or `None`, and you'd only find out when something blows up later. +- There's no validation - nothing stops `max_results` from being `0` or `-5`, or `search_terms` from being empty. +- A typo in a key (`maxResult` instead of `maxResults`) silently falls back to the default instead of failing. 
+- Defaults are scattered across the codebase, and your editor can't autocomplete the fields or catch mistakes. + +[Pydantic](https://docs.pydantic.dev/) solves all of this. You declare the shape of your input once as a model, and Pydantic parses the raw dictionary into a typed object, applying defaults, enforcing constraints, and producing clear error messages when the input doesn't match. Pydantic is already a dependency of the Apify SDK, so there's nothing extra to install. + +## Example Actor + +The following Actor declares its input as a Pydantic `BaseModel`, validates the raw input against it, and then works with a fully typed object. On invalid input it fails fast with a readable error; on valid input it logs the normalized values and stores them as the Actor's output. + + + {PydanticExample} + + +A few things worth pointing out about the **model**: + +- **Aliases bridge the naming conventions.** Apify input fields are conventionally `camelCase` (`maxResults`), while Python attributes are `snake_case` (`max_results`). `Field(alias='maxResults')` maps one to the other, and `populate_by_name=True` lets the model accept either spelling - handy in tests. +- **Defaults and `required` fields are explicit.** A field without a default (`search_terms`) is required; one with a default (`max_results`) is optional. There's a single, obvious place where every default lives. +- **Constraints are declarative.** `ge=1, le=100` enforces a numeric range, `min_length=1` rejects an empty list, and `Literal['json', 'csv']` restricts a field to a fixed set of choices - mirroring an `enum` in the input schema. +- **Custom validators handle the rest.** The `field_validator` normalizes the search terms (trimming whitespace, dropping empties) and rejects input that has nothing left, so the rest of your code never has to repeat those checks. 
+- **Unknown fields are ignored.** `extra='ignore'` means adding a new field to your input schema won't break an older Actor build that doesn't know about it yet. Use `extra='forbid'` instead if you'd rather reject anything unexpected. + +And about the **validation** itself: + +- `model_validate` parses the raw dictionary into a typed `ActorInput` instance, filling in defaults and guaranteeing every field is valid - or raising a `ValidationError` describing every problem at once. +- Catching that error, logging a readable summary, and re-raising makes the Actor **fail fast** with a clear explanation right at the start, rather than crashing with an obscure error somewhere deep in the run. Because the body runs inside `async with Actor:`, the re-raised exception automatically marks the run as `FAILED`. +- The error messages refer to the fields by their input-schema aliases. For invalid input like `{"searchTerms": [], "maxResults": 999, "outputFormat": "xml"}`, the log shows exactly what's wrong: + + ```text + The Actor input is invalid: + 3 validation errors for ActorInput + searchTerms + List should have at least 1 item after validation, not 0 ... + maxResults + Input should be less than or equal to 100 ... + outputFormat + Input should be 'json' or 'csv' ... + ``` + +Once validation passes, the rest of `main` works with `actor_input.search_terms`, `actor_input.max_results`, and `actor_input.output_format` - all correctly typed, with editor autocompletion and static type checking. + +## Relationship to the input schema + +Pydantic validation **complements** the Actor's [input schema](https://docs.apify.com/platform/actors/development/input-schema) (`.actor/input_schema.json`) - it doesn't replace it. The two serve different layers: + +- The **input schema** drives the Apify Console form, documents the fields for your users, and lets the platform validate input before the run even starts. Keep declaring your fields there. 
+- The **Pydantic model** validates the input again *inside your Python code*, where it gives you a typed object, IDE support, and richer rules (normalization, cross-field checks, custom formats) that the input schema can't express. It's also your safety net for runs started programmatically by [another Actor](../concepts/interacting-with-other-actors) or executed [locally](https://docs.apify.com/cli/docs/reference#apify-run), and for keeping the two definitions honest with each other. + +Keep the model's aliases in sync with the field keys in `input_schema.json`, and the two definitions describe the same input from both sides. + +## Useful validation features + +Pydantic offers much more than the example uses. A few features that come up often when validating Actor input: + +- **Format-validated types** for common string formats, for example `HttpUrl` for URLs or `EmailStr` for e-mail addresses (the latter needs the `pydantic[email]` extra): + + ```python + from pydantic import BaseModel, HttpUrl + + class ActorInput(BaseModel): + target_url: HttpUrl + ``` + +- **Cross-field validation** with `model_validator`, when one field's validity depends on another: + + ```python + from pydantic import BaseModel, model_validator + from typing_extensions import Self + + class ActorInput(BaseModel): + min_price: int = 0 + max_price: int = 100 + + @model_validator(mode='after') + def _check_range(self) -> Self: + if self.min_price > self.max_price: + raise ValueError('min_price must not exceed max_price') + return self + ``` + +- **Secret input fields.** The platform decrypts [secret input fields](https://docs.apify.com/platform/actors/development/secret-input) for you before `Actor.get_input` returns, so you receive plaintext. Wrap such fields in Pydantic's `SecretStr` to keep them from leaking into logs or `model_dump()` output. + +For the full set of types, constraints, and validators, see the [Pydantic documentation](https://docs.pydantic.dev/latest/concepts/models/). 
+ +## Conclusion + +In this guide, you learned how to validate Actor input with Pydantic: declaring the input as a model with aliases, defaults, and constraints; parsing the raw input with `model_validate`; failing fast with a readable error when the input is invalid; and working with a typed object for the rest of the run. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own Actors. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy validating! + +## Additional resources + +- [Pydantic: Official documentation](https://docs.pydantic.dev/) +- [Pydantic: Models](https://docs.pydantic.dev/latest/concepts/models/) +- [Pydantic: Validators](https://docs.pydantic.dev/latest/concepts/validators/) +- [Apify: Actor input](https://docs.apify.com/platform/actors/running/input) +- [Apify: Input schema specification](https://docs.apify.com/platform/actors/development/input-schema) diff --git a/docs/03_guides/code/10_pydantic.py b/docs/03_guides/code/10_pydantic.py new file mode 100644 index 00000000..e836fea9 --- /dev/null +++ b/docs/03_guides/code/10_pydantic.py @@ -0,0 +1,72 @@ +import asyncio +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_validator + +from apify import Actor + + +class ActorInput(BaseModel): + """Typed and validated representation of the Actor input. + + The field names follow Python's `snake_case`, while the aliases match the + `camelCase` keys produced by the Apify input schema editor. With + `populate_by_name`, the model accepts either form, and unknown fields are + ignored (`extra='ignore'`) so that adding a field to the input schema never + breaks an older Actor build. + """ + + model_config = ConfigDict(populate_by_name=True, extra='ignore') + + # Required: a non-empty list of search terms. 
The validator below trims + # each entry and drops the empty ones. + search_terms: list[str] = Field(alias='searchTerms', min_length=1) + + # Optional: defaults to 10 and must fall within the inclusive 1-100 range. + max_results: int = Field(alias='maxResults', default=10, ge=1, le=100) + + # Optional: restricted to a fixed set of choices, like an input schema enum. + output_format: Literal['json', 'csv'] = Field(alias='outputFormat', default='json') + + @field_validator('search_terms') + @classmethod + def _normalize_terms(cls, value: list[str]) -> list[str]: + # Trim whitespace and drop empty terms, then ensure something is left. + cleaned = [term.strip() for term in value if term.strip()] + if not cleaned: + raise ValueError('searchTerms must contain at least one non-empty term') + return cleaned + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Read the raw input record from the default key-value store. It's a + # plain dict (or None) - no validation has happened yet. + raw_input = await Actor.get_input() or {} + + # Validate the raw input against the model. On success, `actor_input` is + # a fully typed `ActorInput` with defaults filled in and every field + # guaranteed to be valid. + try: + actor_input = ActorInput.model_validate(raw_input) + except ValidationError as exc: + # Log a readable, per-field summary, then re-raise so the context + # manager marks the run as FAILED. Failing fast here beats crashing + # later with an obscure error deep in the code. + Actor.log.error('The Actor input is invalid:\n%s', exc) + raise + + # From here on, work with typed attributes instead of dict lookups. + Actor.log.info('Input passed validation: %s', actor_input.model_dump()) + + max_results = actor_input.max_results + for term in actor_input.search_terms: + Actor.log.info('Processing %r (max %d results)', term, max_results) + + # Store the normalized input as the Actor's output. 
+ await Actor.set_value('OUTPUT', actor_input.model_dump()) + + +if __name__ == '__main__': + asyncio.run(main()) From a62a06b521835f15efa24c27d28cdd26eac3c87d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 16:50:16 +0200 Subject: [PATCH 11/24] docs: add Crawl4AI guide --- docs/01_introduction/quick-start.mdx | 1 + docs/03_guides/10_crawl4ai.mdx | 111 ++++++++++++++++++ .../code/crawl4ai_project/Dockerfile | 19 +++ .../crawl4ai_project/my_actor/__init__.py | 0 .../crawl4ai_project/my_actor/__main__.py | 8 ++ .../code/crawl4ai_project/my_actor/main.py | 73 ++++++++++++ .../code/crawl4ai_project/my_actor/scraper.py | 46 ++++++++ pyproject.toml | 4 + 8 files changed, 262 insertions(+) create mode 100644 docs/03_guides/10_crawl4ai.mdx create mode 100644 docs/03_guides/code/crawl4ai_project/Dockerfile create mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/__init__.py create mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/__main__.py create mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/main.py create mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/scraper.py diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index da166da9..6bd65f39 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -105,4 +105,5 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Selenium](../guides/selenium) - [Crawlee](../guides/crawlee) - [Scrapy](../guides/scrapy) +- [Crawl4AI](../guides/crawl4ai) - [Running webserver](../guides/running-webserver) diff --git a/docs/03_guides/10_crawl4ai.mdx b/docs/03_guides/10_crawl4ai.mdx new file mode 100644 index 00000000..7dc996d8 --- /dev/null +++ b/docs/03_guides/10_crawl4ai.mdx @@ -0,0 +1,111 @@ +--- +id: crawl4ai +title: Use Crawl4AI +description: Build an Apify Actor that scrapes web pages into LLM-ready markdown using the Crawl4AI library. 
+--- + +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import Crawl4aiMain from '!!raw-loader!./code/crawl4ai_project/my_actor/main.py'; +import Crawl4aiScraper from '!!raw-loader!./code/crawl4ai_project/my_actor/scraper.py'; +import Crawl4aiEntrypoint from '!!raw-loader!./code/crawl4ai_project/my_actor/__main__.py'; +import Crawl4aiDockerfile from '!!raw-loader!./code/crawl4ai_project/Dockerfile'; + +In this guide, you'll learn how to use the [Crawl4AI](https://crawl4ai.com/) library in your Apify Actors. + +## Introduction + +[Crawl4AI](https://crawl4ai.com/) is an open-source, asynchronous web crawler built for LLM and AI workflows. It renders a page in a real browser and turns the result into clean, structured markdown that's ready to feed into a language model or a retrieval-augmented generation (RAG) pipeline, while still giving you the raw HTML, extracted links, and media when you need them. + +Some of the features that make Crawl4AI a good fit for Apify Actors: + +- **LLM-ready markdown** - Crawl4AI converts each page into clean markdown, stripping boilerplate and optionally filtering content, so the output can be fed straight into a language model. +- **Real browser rendering** - Pages are loaded in a [Playwright](https://playwright.dev/)-driven browser, so JavaScript-heavy and dynamically rendered websites work out of the box. +- **Built-in link and media extraction** - Every crawl returns the page's links already split into `internal` and `external` groups, together with the media it found, which makes recursive crawling straightforward. +- **Flexible extraction strategies** - Beyond markdown, Crawl4AI can extract structured data with CSS/XPath schemas or with an LLM, all configured per request. +- **First-class async support** - The `AsyncWebCrawler` is built on `asyncio`, which integrates naturally with the asyncio-based Apify SDK. 
+- **Per-request proxy** - Each request can be routed through its own proxy, which pairs well with Apify Proxy and its rotating IP addresses. + +Crawl4AI drives a real browser through Playwright, so after installing the library you need to download the browser binaries once with the `crawl4ai-setup` command: + +```bash +pip install crawl4ai +crawl4ai-setup +``` + +## Example Actor + +The following Actor recursively crawls pages, starting from the URLs in the Actor input and following links up to a user-defined maximum depth. It uses Crawl4AI's `AsyncWebCrawler` to render each page through [Apify Proxy](https://docs.apify.com/platform/proxy), stores the page's markdown in the dataset, and follows the internal links that Crawl4AI discovers. + +The code is split into three small modules, following the structure of the Apify Python Actor templates: + +- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), opens a single browser-backed crawler, and drives the crawl. +- `my_actor/scraper.py` - The Crawl4AI-specific logic. A single `scrape_page` function crawls a page and returns the extracted data together with the links found on it. +- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. + + + + + {Crawl4aiMain} + + + + + {Crawl4aiScraper} + + + + + {Crawl4aiEntrypoint} + + + + +A few things worth pointing out: + +- A single `AsyncWebCrawler` is opened once and reused for every request. The crawler manages one browser instance, so reusing it across the whole crawl is far cheaper than launching a new browser per page. +- Keeping the crawling and parsing in `scrape_page` separates the Crawl4AI-specific code from the Actor's orchestration logic. 
The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. +- `result.markdown` is the rendered page as clean markdown, and `result.metadata` carries page-level fields such as the title - exactly the kind of output you want when preparing data for an LLM. +- `result.links` already separates `internal` (same-site) links from `external` ones, so the example follows only the internal links to keep the crawl on the same website. +- `CacheMode.BYPASS` tells Crawl4AI to always fetch a fresh copy of the page instead of serving it from its local cache. + +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Crawl4AI's per-request `CrawlerRunConfig`. + +`ProxyConfig.from_string` parses the proxy URL returned by `ProxyConfiguration.new_url` (for example `http://groups-RESIDENTIAL:<password>@proxy.apify.com:8000`) into the server, username, and password that the browser needs - the browser cannot take the credentials embedded directly in the URL. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + +## Running on the Apify platform + +Because Crawl4AI renders pages in a real browser, the Actor image needs a browser and its system-level dependencies. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser - Crawl4AI reuses those binaries, so no separate browser-install step is required in the Dockerfile.
+ +Add `apify` and `crawl4ai` to your `requirements.txt`: + +```text +apify +crawl4ai +``` + + + + + {Crawl4aiDockerfile} + + + + +The example pins the Python 3.13 base image because some of Crawl4AI's dependencies do not yet publish wheels for the newest Python versions, which would otherwise force a slow source build during the image build. + +## Conclusion + +In this guide, you learned how to use Crawl4AI in your Apify Actors. You can now render pages in a real browser, turn them into LLM-ready markdown, follow the links Crawl4AI discovers, route requests through Apify Proxy, and run the whole thing on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! + +## Additional resources + +- [Crawl4AI: Official documentation](https://docs.crawl4ai.com/) +- [Crawl4AI: AsyncWebCrawler and configuration](https://docs.crawl4ai.com/api/async-webcrawler/) +- [Crawl4AI: Proxy and security](https://docs.crawl4ai.com/advanced/proxy-security/) +- [Crawl4AI: GitHub repository](https://github.com/unclecode/crawl4ai) +- [Apify: Proxy management](https://docs.apify.com/platform/proxy) diff --git a/docs/03_guides/code/crawl4ai_project/Dockerfile b/docs/03_guides/code/crawl4ai_project/Dockerfile new file mode 100644 index 00000000..348f6ff2 --- /dev/null +++ b/docs/03_guides/code/crawl4ai_project/Dockerfile @@ -0,0 +1,19 @@ +# Use the Apify Playwright base image, which already ships a browser together +# with all of its system-level dependencies. Crawl4AI drives this browser +# through Playwright and reuses the binaries the image provides, so no separate +# browser-install step is needed. 
+# +# The Python 3.13 image is used because some of Crawl4AI's dependencies do not +# yet publish wheels for newer Python versions. +FROM apify/actor-python-playwright:3.13-1.60.0 + +# Copy just requirements.txt first to leverage the Docker build cache. +COPY --chown=myuser:myuser requirements.txt ./ +RUN pip install -r requirements.txt + +# Copy the rest of the source code and verify that it compiles. +COPY --chown=myuser:myuser . ./ +RUN python -m compileall -q my_actor/ + +# Specify how to launch the Actor. +CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/__init__.py b/docs/03_guides/code/crawl4ai_project/my_actor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py b/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py new file mode 100644 index 00000000..6aeaf3d5 --- /dev/null +++ b/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +import asyncio + +from .main import main + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/main.py b/docs/03_guides/code/crawl4ai_project/my_actor/main.py new file mode 100644 index 00000000..4e6befe6 --- /dev/null +++ b/docs/03_guides/code/crawl4ai_project/my_actor/main.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from crawl4ai import AsyncWebCrawler, BrowserConfig + +from apify import Actor, Request + +from .scraper import scrape_page + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. 
+ if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs. Their crawl depth defaults to 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + await request_queue.add_request(Request.from_url(url)) + + # Configure the headless browser that Crawl4AI drives. + browser_config = BrowserConfig(headless=True) + + # Open a single browser-backed crawler and reuse it for every request. + async with AsyncWebCrawler(config=browser_config) as crawler: + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Crawl the page and extract its markdown and nested links. + data, links = await scrape_page(crawler, url, proxy_url=proxy_url) + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + # If we are not too deep yet, enqueue the links we found one + # level deeper than the current page. + if depth < max_depth: + for link_url in links: + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled so it is not processed again. 
+ await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py b/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py new file mode 100644 index 00000000..f96f76e3 --- /dev/null +++ b/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from crawl4ai import CacheMode, CrawlerRunConfig, ProxyConfig + +if TYPE_CHECKING: + from crawl4ai import AsyncWebCrawler + + +async def scrape_page( + crawler: AsyncWebCrawler, + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Crawl a single page with Crawl4AI and extract its markdown and links. + + The page is rendered in the browser managed by `crawler`, and Crawl4AI turns + the result into clean, LLM-ready markdown. Setting `proxy_config` on the + per-request `CrawlerRunConfig` routes this request through Apify Proxy, so + every page can use a fresh IP address. + """ + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_config=ProxyConfig.from_string(proxy_url) if proxy_url else None, + ) + + result = await crawler.arun(url, config=run_config) + if not result.success: + raise RuntimeError(result.error_message or f'Failed to crawl {url}') + + # `result.markdown` is the rendered page as clean markdown, and + # `result.metadata` carries page-level fields such as the title. + data = { + 'url': result.url, + 'title': (result.metadata or {}).get('title'), + 'markdown': str(result.markdown), + } + + # Crawl4AI already splits links into `internal` (same site) and `external`. + # We follow only the internal ones to keep the crawl on the same website. 
+ internal_links = result.links.get('internal', []) + links = [link['href'] for link in internal_links if link.get('href')] + + return data, links diff --git a/pyproject.toml b/pyproject.toml index d17bdc01..38846e70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,6 +181,10 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] +"**/docs/**/crawl4ai_project/**" = [ + # Local imports are mixed up with the Apify SDK. + "I001", # Import block is un-sorted or un-formatted +] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" From a27cf5a167cc0a413d032deaa1a2718f02c4348e Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 17:02:54 +0200 Subject: [PATCH 12/24] docs: move guide code examples into dedicated files --- docs/03_guides/10_pydantic.mdx | 44 +++++++++-------------- docs/03_guides/code/10_http_url.py | 5 +++ docs/03_guides/code/10_model_validator.py | 14 ++++++++ docs/03_guides/code/10_raw_input.py | 18 ++++++++++ 4 files changed, 53 insertions(+), 28 deletions(-) create mode 100644 docs/03_guides/code/10_http_url.py create mode 100644 docs/03_guides/code/10_model_validator.py create mode 100644 docs/03_guides/code/10_raw_input.py diff --git a/docs/03_guides/10_pydantic.mdx b/docs/03_guides/10_pydantic.mdx index 19a20c85..f633d85b 100644 --- a/docs/03_guides/10_pydantic.mdx +++ b/docs/03_guides/10_pydantic.mdx @@ -4,10 +4,14 @@ title: Validate Actor input with Pydantic description: Parse, validate, and type your Actor's input with Pydantic models instead of reaching into a raw dictionary. 
--- +import CodeBlock from '@theme/CodeBlock'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ApiLink from '@theme/ApiLink'; +import RawInputExample from '!!raw-loader!roa-loader!./code/10_raw_input.py'; import PydanticExample from '!!raw-loader!roa-loader!./code/10_pydantic.py'; +import HttpUrlExample from '!!raw-loader!./code/10_http_url.py'; +import ModelValidatorExample from '!!raw-loader!./code/10_model_validator.py'; In this guide, you'll learn how to validate your Apify Actor's input with [Pydantic](https://docs.pydantic.dev/), so that your code works with a typed, guaranteed-valid object instead of a raw dictionary. @@ -15,11 +19,9 @@ In this guide, you'll learn how to validate your Apify Actor's input with [Pydan An Actor reads its input with `Actor.get_input`, which returns the input record as a plain `dict` (or `None` when there's no input). Working with that dictionary directly is fragile: -```python -actor_input = await Actor.get_input() or {} -search_terms = actor_input.get('searchTerms', []) -max_results = actor_input.get('maxResults', 10) -``` + + {RawInputExample} + - There are no type guarantees - `max_results` could just as easily arrive as the string `"10"` or `None`, and you'd only find out when something blows up later. - There's no validation - nothing stops `max_results` from being `0` or `-5`, or `search_terms` from being empty. @@ -76,33 +78,19 @@ Keep the model's aliases in sync with the field keys in `input_schema.json`, and Pydantic offers much more than the example uses. 
A few features that come up often when validating Actor input: -- **Format-validated types** for common string formats, for example `HttpUrl` for URLs or `EmailStr` for e-mail addresses (the latter needs the `pydantic[email]` extra): +**Format-validated types** for common string formats, for example `HttpUrl` for URLs or `EmailStr` for e-mail addresses (the latter needs the `pydantic[email]` extra): - ```python - from pydantic import BaseModel, HttpUrl - - class ActorInput(BaseModel): - target_url: HttpUrl - ``` + + {HttpUrlExample} + -- **Cross-field validation** with `model_validator`, when one field's validity depends on another: +**Cross-field validation** with `model_validator`, when one field's validity depends on another: - ```python - from pydantic import BaseModel, model_validator - from typing_extensions import Self - - class ActorInput(BaseModel): - min_price: int = 0 - max_price: int = 100 - - @model_validator(mode='after') - def _check_range(self) -> Self: - if self.min_price > self.max_price: - raise ValueError('min_price must not exceed max_price') - return self - ``` + + {ModelValidatorExample} + -- **Secret input fields.** The platform decrypts [secret input fields](https://docs.apify.com/platform/actors/development/secret-input) for you before `Actor.get_input` returns, so you receive plaintext. Wrap such fields in Pydantic's `SecretStr` to keep them from leaking into logs or `model_dump()` output. +**Secret input fields.** The platform decrypts [secret input fields](https://docs.apify.com/platform/actors/development/secret-input) for you before `Actor.get_input` returns, so you receive plaintext. Wrap such fields in Pydantic's `SecretStr` to keep them from leaking into logs or `model_dump()` output. For the full set of types, constraints, and validators, see the [Pydantic documentation](https://docs.pydantic.dev/latest/concepts/models/). 
diff --git a/docs/03_guides/code/10_http_url.py b/docs/03_guides/code/10_http_url.py new file mode 100644 index 00000000..80bf1f19 --- /dev/null +++ b/docs/03_guides/code/10_http_url.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel, HttpUrl + + +class ActorInput(BaseModel): + target_url: HttpUrl diff --git a/docs/03_guides/code/10_model_validator.py b/docs/03_guides/code/10_model_validator.py new file mode 100644 index 00000000..29c4c98e --- /dev/null +++ b/docs/03_guides/code/10_model_validator.py @@ -0,0 +1,14 @@ +from typing import Self + +from pydantic import BaseModel, model_validator + + +class ActorInput(BaseModel): + min_price: int = 0 + max_price: int = 100 + + @model_validator(mode='after') + def _check_range(self) -> Self: + if self.min_price > self.max_price: + raise ValueError('min_price must not exceed max_price') + return self diff --git a/docs/03_guides/code/10_raw_input.py b/docs/03_guides/code/10_raw_input.py new file mode 100644 index 00000000..7bfbeede --- /dev/null +++ b/docs/03_guides/code/10_raw_input.py @@ -0,0 +1,18 @@ +import asyncio + +from apify import Actor + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Read the input and reach into the raw dictionary for each value. + actor_input = await Actor.get_input() or {} + search_terms = actor_input.get('searchTerms', []) + max_results = actor_input.get('maxResults', 10) + + Actor.log.info('search_terms=%s, max_results=%s', search_terms, max_results) + + +if __name__ == '__main__': + asyncio.run(main()) From 694bd1b491e480621cd6e0e59559457fc4f7d912 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 17:07:48 +0200 Subject: [PATCH 13/24] docs: add Browser Use guide Add a guide and runnable example Actor for the Browser Use LLM browser-automation library, following the existing scraping-library guides. 
--- docs/01_introduction/quick-start.mdx | 1 + docs/03_guides/10_browser_use.mdx | 119 ++++++++++++++++++ .../code/browser_use_project/Dockerfile | 21 ++++ .../browser_use_project/my_actor/__init__.py | 0 .../browser_use_project/my_actor/__main__.py | 8 ++ .../browser_use_project/my_actor/agent.py | 69 ++++++++++ .../code/browser_use_project/my_actor/main.py | 53 ++++++++ pyproject.toml | 4 + 8 files changed, 275 insertions(+) create mode 100644 docs/03_guides/10_browser_use.mdx create mode 100644 docs/03_guides/code/browser_use_project/Dockerfile create mode 100644 docs/03_guides/code/browser_use_project/my_actor/__init__.py create mode 100644 docs/03_guides/code/browser_use_project/my_actor/__main__.py create mode 100644 docs/03_guides/code/browser_use_project/my_actor/agent.py create mode 100644 docs/03_guides/code/browser_use_project/my_actor/main.py diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index da166da9..9eed691f 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -105,4 +105,5 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Selenium](../guides/selenium) - [Crawlee](../guides/crawlee) - [Scrapy](../guides/scrapy) +- [Browser Use](../guides/browser-use) - [Running webserver](../guides/running-webserver) diff --git a/docs/03_guides/10_browser_use.mdx b/docs/03_guides/10_browser_use.mdx new file mode 100644 index 00000000..6c5ff5b5 --- /dev/null +++ b/docs/03_guides/10_browser_use.mdx @@ -0,0 +1,119 @@ +--- +id: browser-use +title: Use Browser Use +description: Build an Apify Actor that automates a browser with an LLM agent using the Browser Use library. 
+--- + +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import BrowserUseMain from '!!raw-loader!./code/browser_use_project/my_actor/main.py'; +import BrowserUseAgent from '!!raw-loader!./code/browser_use_project/my_actor/agent.py'; +import BrowserUseEntrypoint from '!!raw-loader!./code/browser_use_project/my_actor/__main__.py'; +import BrowserUseDockerfile from '!!raw-loader!./code/browser_use_project/Dockerfile'; + +In this guide, you'll learn how to use the [Browser Use](https://browser-use.com/) library in your Apify Actors. + +## Introduction + +[Browser Use](https://browser-use.com/) is a Python library that lets an LLM control a real web browser. Instead of writing selectors and navigation steps by hand, you give an agent a natural-language task - such as "find the top post on Hacker News and return its title and URL" - and the agent decides which pages to open, what to click, and what to read until the task is done. + +Some of the features that make Browser Use a good fit for Apify Actors: + +- **Natural-language tasks** - Describe what you want in plain English; the agent figures out the steps. This is well suited to pages whose structure changes often or is hard to target with fixed selectors. +- **Model-agnostic** - Browser Use ships wrappers for many providers (`ChatOpenAI`, `ChatAnthropic`, `ChatGoogle`, and more), so you can pick the model that fits your task and budget. +- **Structured output** - Pass a [Pydantic](https://docs.pydantic.dev/) model as the output schema and the agent returns a validated object instead of free-form text, which maps cleanly onto an Apify dataset. +- **Real browser via CDP** - The agent drives a real Chromium over the Chrome DevTools Protocol, so JavaScript-heavy pages render just like they would for a human. +- **First-class async support** - The agent's `run` method is asynchronous, which integrates naturally with the asyncio-based Apify SDK. 
+ +Browser Use needs only the `browser-use` package - install it with: + +```bash +pip install browser-use +``` + +## Configuring the LLM + +Browser Use needs an LLM to drive the agent. You choose a provider wrapper, give it a model name, and supply the provider's API key: + +- **`ChatOpenAI`** - OpenAI models such as `gpt-4.1-mini` or `gpt-5-mini`. Reads the key from `OPENAI_API_KEY`, or accepts it via the `api_key` argument. +- **`ChatAnthropic`** - Anthropic Claude models such as `claude-sonnet-4-5` or `claude-haiku-4-5`. Reads the key from `ANTHROPIC_API_KEY`. +- **`ChatGoogle`** - Google Gemini models such as `gemini-2.5-flash`. Reads the key from `GOOGLE_API_KEY`. + +The example Actor in this guide uses `ChatOpenAI`, but switching providers is a one-line change in `my_actor/agent.py`. More capable models generally complete tasks in fewer steps and more reliably, while smaller models are cheaper per step. + +Keep the API key out of the Actor input and source code. The example reads it from an environment variable, which on the Apify platform you set as a [secret environment variable](https://docs.apify.com/platform/actors/development/programming-interface/environment-variables) (for example `OPENAI_API_KEY`), and locally you export in your shell. + +## Example Actor + +The following Actor runs a Browser Use agent for a single task and stores its structured result in the default dataset. By default it opens [Hacker News](https://news.ycombinator.com) and returns the title and URL of the top five posts, but the task, model, and step limit are all configurable through the Actor input. + +The code is split into three small modules, following the structure of the Apify Python Actor templates: + +- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy), runs the agent, and stores the result. 
+- `my_actor/agent.py` - The Browser Use-specific logic. It defines the output schema and a single `run_agent_task` function that builds the LLM, browser, and agent, then returns the agent's structured output. +- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. + + + + + {BrowserUseMain} + + + + + {BrowserUseAgent} + + + + + {BrowserUseEntrypoint} + + + + +A few things worth pointing out: + +- Keeping the agent setup in `run_agent_task` separates the Browser Use-specific code from the Actor's orchestration logic. `my_actor/main.py` only decides what to read from the input and what to store. +- Passing `output_model_schema=Posts` makes the agent return a validated `Posts` instance via `history.structured_output`, so `my_actor/main.py` can push each item straight to the dataset. Adapt the task and the `Post`/`Posts` models together to fit your own use case. +- `enable_signal_handler=False` leaves signal handling to the Actor, which manages the run's lifecycle. Without it, Browser Use would install its own handlers and interfere with a clean shutdown. +- `headless=Actor.configuration.headless` runs the browser without a visible window, which is what you want on the platform. + +## Using Apify Proxy + +Running on the Apify platform gives your agent access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `run_agent_task`. + +Browser Use expects the proxy as a `ProxySettings` object with separate `server`, `username`, and `password` fields, whereas `ProxyConfiguration.new_url` returns a single URL string (for example `http://user:pass@proxy.apify.com:8000`). The `_proxy_settings` helper in `my_actor/agent.py` splits that URL into the fields Browser Use expects. 
To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + +## Running on the Apify platform + +Browser Use drives a real Chromium over CDP, so the Actor needs a browser binary available at runtime. The simplest way to provide one is to build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies. Browser Use discovers that browser automatically, so no extra install step is needed in the image. + + + + + {BrowserUseDockerfile} + + + + +When running the Actor locally, install the browser once with the `browser-use install` command, which downloads a Chromium build together with its dependencies: + +```bash +browser-use install +``` + +Remember to provide the LLM API key in both environments - as a secret environment variable on the platform, and exported in your shell when running locally. + +## Conclusion + +In this guide, you learned how to use Browser Use in your Apify Actors. You can now drive a real browser with an LLM agent, return its results as a validated Pydantic model, route the browser through Apify Proxy, and run the whole thing on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own automation tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy automating! 
+ +## Additional resources + +- [Browser Use: Official documentation](https://docs.browser-use.com/) +- [Browser Use: Supported models](https://docs.browser-use.com/customize/supported-models) +- [Browser Use: Structured output](https://docs.browser-use.com/customize/agent/output-format) +- [Browser Use: GitHub repository](https://github.com/browser-use/browser-use) +- [Apify: Proxy management](https://docs.apify.com/platform/proxy) diff --git a/docs/03_guides/code/browser_use_project/Dockerfile b/docs/03_guides/code/browser_use_project/Dockerfile new file mode 100644 index 00000000..c35bbfc9 --- /dev/null +++ b/docs/03_guides/code/browser_use_project/Dockerfile @@ -0,0 +1,21 @@ +# Use the Apify Playwright base image, which already ships a Chromium browser together +# with all of its system-level dependencies. Browser Use launches that browser via CDP, +# so no extra browser install step is needed. +FROM apify/actor-python-playwright:3.14-1.60.0 + +USER myuser + +# Copy just requirements.txt first to leverage the Docker build cache. +COPY --chown=myuser:myuser requirements.txt ./ +RUN pip install -r requirements.txt + +# Copy the rest of the source code and verify that it compiles. +COPY --chown=myuser:myuser . ./ +RUN python -m compileall -q my_actor/ + +# Disable Browser Use telemetry and cloud sync inside the Actor. +ENV ANONYMIZED_TELEMETRY=false +ENV BROWSER_USE_CLOUD_SYNC=false + +# Specify how to launch the Actor. 
+CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/browser_use_project/my_actor/__init__.py b/docs/03_guides/code/browser_use_project/my_actor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/03_guides/code/browser_use_project/my_actor/__main__.py b/docs/03_guides/code/browser_use_project/my_actor/__main__.py new file mode 100644 index 00000000..6aeaf3d5 --- /dev/null +++ b/docs/03_guides/code/browser_use_project/my_actor/__main__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +import asyncio + +from .main import main + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/browser_use_project/my_actor/agent.py b/docs/03_guides/code/browser_use_project/my_actor/agent.py new file mode 100644 index 00000000..708265fb --- /dev/null +++ b/docs/03_guides/code/browser_use_project/my_actor/agent.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from urllib.parse import urlsplit + +from browser_use import Agent, Browser, ChatOpenAI +from browser_use.browser import ProxySettings +from pydantic import BaseModel + + +class Post(BaseModel): + """A single item the agent is asked to extract.""" + + title: str + url: str + + +class Posts(BaseModel): + """The structured result returned by the agent.""" + + posts: list[Post] + + +async def run_agent_task( + task: str, + *, + model: str, + llm_api_key: str, + max_steps: int, + headless: bool = True, + proxy_url: str | None = None, +) -> Posts | None: + """Run a Browser Use agent for a single task and return its structured output. + + The agent is driven by an OpenAI model and a real Chromium browser. Passing + `output_model_schema` makes the agent return a validated `Posts` instance instead + of free-form text, and `enable_signal_handler=False` leaves signal handling to the + Actor. + """ + # Configure the LLM that drives the agent. Swap `ChatOpenAI` for `ChatAnthropic`, + # `ChatGoogle`, or another provider to use a different model. 
+ llm = ChatOpenAI(model=model, api_key=llm_api_key) + + # Configure the browser. When a proxy URL is provided, route the browser through it. + browser = Browser( + headless=headless, + proxy=_proxy_settings(proxy_url) if proxy_url else None, + ) + + # Create the agent and run it for at most `max_steps` steps. + agent = Agent( + task=task, + llm=llm, + browser=browser, + output_model_schema=Posts, + enable_signal_handler=False, + ) + + history = await agent.run(max_steps=max_steps) + return history.structured_output + + +def _proxy_settings(proxy_url: str) -> ProxySettings: + """Convert an Apify Proxy URL into Browser Use `ProxySettings`.""" + parts = urlsplit(proxy_url) + return ProxySettings( + server=f'{parts.scheme}://{parts.hostname}:{parts.port}', + username=parts.username, + password=parts.password, + ) diff --git a/docs/03_guides/code/browser_use_project/my_actor/main.py b/docs/03_guides/code/browser_use_project/my_actor/main.py new file mode 100644 index 00000000..d045759e --- /dev/null +++ b/docs/03_guides/code/browser_use_project/my_actor/main.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +import os + +from apify import Actor + +from .agent import run_agent_task + +# The default task is aligned with the `Posts` output schema defined in `agent.py`. +DEFAULT_TASK = ( + 'Open https://news.ycombinator.com and return the title and URL ' + 'of the top 5 posts on the front page.' +) + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + task = actor_input.get('task', DEFAULT_TASK) + model = actor_input.get('model', 'gpt-4.1-mini') + max_steps = actor_input.get('max_steps', 25) + + # Read the LLM API key from the environment so it is never stored in the Actor + # input. On the Apify platform, set it as a secret environment variable. 
+ llm_api_key = os.environ.get('OPENAI_API_KEY') + if not llm_api_key: + raise RuntimeError('The OPENAI_API_KEY environment variable is not set.') + + # Create a proxy configuration that routes the browser through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + proxy_url = await proxy_configuration.new_url() if proxy_configuration else None + + Actor.log.info(f'Running the agent (model={model}) for task: {task}') + + # Run the Browser Use agent and collect its structured output. + result = await run_agent_task( + task, + model=model, + llm_api_key=llm_api_key, + max_steps=max_steps, + headless=Actor.configuration.headless, + proxy_url=proxy_url, + ) + + if result is None: + Actor.log.warning('The agent did not return any structured output.') + return + + # Store every extracted item as a separate row in the default dataset. + for post in result.posts: + await Actor.push_data(post.model_dump()) diff --git a/pyproject.toml b/pyproject.toml index d17bdc01..a84ef15e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,6 +181,10 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] +"**/docs/**/browser_use_project/**" = [ + # Local imports are mixed up with the Apify SDK. 
+ "I001", # Import block is un-sorted or un-formatted +] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" From 11c78d2d69cc4f724ac7de82dfb7621594ff1058 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 17:47:51 +0200 Subject: [PATCH 14/24] docs: renumber guides into a logical order and resolve numbering collision --- docs/01_introduction/quick-start.mdx | 6 +++--- docs/03_guides/{09_scrapling.mdx => 07_scrapling.mdx} | 0 docs/03_guides/{10_crawl4ai.mdx => 08_crawl4ai.mdx} | 0 docs/03_guides/{10_browser_use.mdx => 09_browser_use.mdx} | 0 docs/03_guides/{08_uv.mdx => 10_uv.mdx} | 0 docs/03_guides/{10_pydantic.mdx => 11_pydantic.mdx} | 8 ++++---- ...{07_running_webserver.mdx => 12_running_webserver.mdx} | 2 +- docs/03_guides/code/{10_http_url.py => 11_http_url.py} | 0 .../code/{10_model_validator.py => 11_model_validator.py} | 0 docs/03_guides/code/{10_pydantic.py => 11_pydantic.py} | 0 docs/03_guides/code/{10_raw_input.py => 11_raw_input.py} | 0 docs/03_guides/code/{07_webserver.py => 12_webserver.py} | 0 12 files changed, 8 insertions(+), 8 deletions(-) rename docs/03_guides/{09_scrapling.mdx => 07_scrapling.mdx} (100%) rename docs/03_guides/{10_crawl4ai.mdx => 08_crawl4ai.mdx} (100%) rename docs/03_guides/{10_browser_use.mdx => 09_browser_use.mdx} (100%) rename docs/03_guides/{08_uv.mdx => 10_uv.mdx} (100%) rename docs/03_guides/{10_pydantic.mdx => 11_pydantic.mdx} (96%) rename docs/03_guides/{07_running_webserver.mdx => 12_running_webserver.mdx} (98%) rename docs/03_guides/code/{10_http_url.py => 11_http_url.py} (100%) rename docs/03_guides/code/{10_model_validator.py => 11_model_validator.py} (100%) rename docs/03_guides/code/{10_pydantic.py => 11_pydantic.py} (100%) rename docs/03_guides/code/{10_raw_input.py => 11_raw_input.py} (100%) rename docs/03_guides/code/{07_webserver.py => 12_webserver.py} (100%) diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index b22875d6..c0aaa0c9 100644 --- 
a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -106,8 +106,8 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Crawlee](../guides/crawlee) - [Scrapy](../guides/scrapy) - [Scrapling](../guides/scrapling) -- [Browser Use](../guides/browser-use) - [Crawl4AI](../guides/crawl4ai) -- [Running webserver](../guides/running-webserver) -- [Validate Actor input with Pydantic](../guides/input-validation) +- [Browser Use](../guides/browser-use) - [uv](../guides/uv) +- [Validate Actor input with Pydantic](../guides/input-validation) +- [Running webserver](../guides/running-webserver) diff --git a/docs/03_guides/09_scrapling.mdx b/docs/03_guides/07_scrapling.mdx similarity index 100% rename from docs/03_guides/09_scrapling.mdx rename to docs/03_guides/07_scrapling.mdx diff --git a/docs/03_guides/10_crawl4ai.mdx b/docs/03_guides/08_crawl4ai.mdx similarity index 100% rename from docs/03_guides/10_crawl4ai.mdx rename to docs/03_guides/08_crawl4ai.mdx diff --git a/docs/03_guides/10_browser_use.mdx b/docs/03_guides/09_browser_use.mdx similarity index 100% rename from docs/03_guides/10_browser_use.mdx rename to docs/03_guides/09_browser_use.mdx diff --git a/docs/03_guides/08_uv.mdx b/docs/03_guides/10_uv.mdx similarity index 100% rename from docs/03_guides/08_uv.mdx rename to docs/03_guides/10_uv.mdx diff --git a/docs/03_guides/10_pydantic.mdx b/docs/03_guides/11_pydantic.mdx similarity index 96% rename from docs/03_guides/10_pydantic.mdx rename to docs/03_guides/11_pydantic.mdx index f633d85b..c6245193 100644 --- a/docs/03_guides/10_pydantic.mdx +++ b/docs/03_guides/11_pydantic.mdx @@ -8,10 +8,10 @@ import CodeBlock from '@theme/CodeBlock'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ApiLink from '@theme/ApiLink'; -import RawInputExample from '!!raw-loader!roa-loader!./code/10_raw_input.py'; -import PydanticExample from '!!raw-loader!roa-loader!./code/10_pydantic.py'; -import 
HttpUrlExample from '!!raw-loader!./code/10_http_url.py'; -import ModelValidatorExample from '!!raw-loader!./code/10_model_validator.py'; +import RawInputExample from '!!raw-loader!roa-loader!./code/11_raw_input.py'; +import PydanticExample from '!!raw-loader!roa-loader!./code/11_pydantic.py'; +import HttpUrlExample from '!!raw-loader!./code/11_http_url.py'; +import ModelValidatorExample from '!!raw-loader!./code/11_model_validator.py'; In this guide, you'll learn how to validate your Apify Actor's input with [Pydantic](https://docs.pydantic.dev/), so that your code works with a typed, guaranteed-valid object instead of a raw dictionary. diff --git a/docs/03_guides/07_running_webserver.mdx b/docs/03_guides/12_running_webserver.mdx similarity index 98% rename from docs/03_guides/07_running_webserver.mdx rename to docs/03_guides/12_running_webserver.mdx index 9b63976a..0050189d 100644 --- a/docs/03_guides/07_running_webserver.mdx +++ b/docs/03_guides/12_running_webserver.mdx @@ -6,7 +6,7 @@ description: Run an HTTP server inside your Actor for monitoring or serving cont import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import WebserverExample from '!!raw-loader!roa-loader!./code/07_webserver.py'; +import WebserverExample from '!!raw-loader!roa-loader!./code/12_webserver.py'; In this guide, you'll learn how to run a web server inside your Apify Actor. This is useful for monitoring Actor progress, creating custom APIs, or serving content during the Actor run. 
diff --git a/docs/03_guides/code/10_http_url.py b/docs/03_guides/code/11_http_url.py similarity index 100% rename from docs/03_guides/code/10_http_url.py rename to docs/03_guides/code/11_http_url.py diff --git a/docs/03_guides/code/10_model_validator.py b/docs/03_guides/code/11_model_validator.py similarity index 100% rename from docs/03_guides/code/10_model_validator.py rename to docs/03_guides/code/11_model_validator.py diff --git a/docs/03_guides/code/10_pydantic.py b/docs/03_guides/code/11_pydantic.py similarity index 100% rename from docs/03_guides/code/10_pydantic.py rename to docs/03_guides/code/11_pydantic.py diff --git a/docs/03_guides/code/10_raw_input.py b/docs/03_guides/code/11_raw_input.py similarity index 100% rename from docs/03_guides/code/10_raw_input.py rename to docs/03_guides/code/11_raw_input.py diff --git a/docs/03_guides/code/07_webserver.py b/docs/03_guides/code/12_webserver.py similarity index 100% rename from docs/03_guides/code/07_webserver.py rename to docs/03_guides/code/12_webserver.py From 8fc517f33b98a66d1ba644922bb58736f87f624d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 18:09:40 +0200 Subject: [PATCH 15/24] docs: split the quick-start guides list into scraping and other topics --- docs/01_introduction/quick-start.mdx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index c0aaa0c9..59cdc929 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -97,7 +97,7 @@ To learn more about the features of the Apify SDK and how to use them, check out ### Guides -To see how you can integrate the Apify SDK with popular web scraping libraries, check out our guides: +To see how you can integrate the Apify SDK with popular web scraping libraries, check out these guides: - [BeautifulSoup with HTTPX](../guides/beautifulsoup-httpx) - [Parsel with Impit](../guides/parsel-impit) @@ -108,6 
+108,9 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Scrapling](../guides/scrapling) - [Crawl4AI](../guides/crawl4ai) - [Browser Use](../guides/browser-use) -- [uv](../guides/uv) + +For other aspects of Actor development, explore these guides: + +- [Manage your project with uv](../guides/uv) - [Validate Actor input with Pydantic](../guides/input-validation) -- [Running webserver](../guides/running-webserver) +- [Run a web server](../guides/running-webserver) From 64c1a6b66338d2bc99514df046f101b9dbb1e826 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 18:09:41 +0200 Subject: [PATCH 16/24] docs: convert LLM-scraping guide examples to single runnable files --- docs/03_guides/07_scrapling.mdx | 66 +++++------------ docs/03_guides/08_crawl4ai.mdx | 53 +++----------- docs/03_guides/09_browser_use.mdx | 55 ++++---------- .../my_actor/main.py => 07_scrapling.py} | 51 ++++++++++++- .../scraper.py => 07_scrapling_browser.py} | 2 - .../my_actor/main.py => 08_crawl4ai.py} | 53 +++++++++++++- .../my_actor/agent.py => 09_browser_use.py} | 72 ++++++++++++++++--- .../code/browser_use_project/Dockerfile | 21 ------ .../browser_use_project/my_actor/__init__.py | 0 .../browser_use_project/my_actor/__main__.py | 8 --- .../code/browser_use_project/my_actor/main.py | 53 -------------- .../code/crawl4ai_project/Dockerfile | 19 ----- .../crawl4ai_project/my_actor/__init__.py | 0 .../crawl4ai_project/my_actor/__main__.py | 8 --- .../code/crawl4ai_project/my_actor/scraper.py | 46 ------------ .../code/scrapling_browser_project/Dockerfile | 21 ------ .../scrapling_project/my_actor/__init__.py | 0 .../scrapling_project/my_actor/__main__.py | 8 --- .../scrapling_project/my_actor/scraper.py | 47 ------------ pyproject.toml | 12 ---- 20 files changed, 202 insertions(+), 393 deletions(-) rename docs/03_guides/code/{scrapling_project/my_actor/main.py => 07_scrapling.py} (62%) rename 
docs/03_guides/code/{scrapling_browser_project/my_actor/scraper.py => 07_scrapling_browser.py} (97%) rename docs/03_guides/code/{crawl4ai_project/my_actor/main.py => 08_crawl4ai.py} (64%) rename docs/03_guides/code/{browser_use_project/my_actor/agent.py => 09_browser_use.py} (50%) delete mode 100644 docs/03_guides/code/browser_use_project/Dockerfile delete mode 100644 docs/03_guides/code/browser_use_project/my_actor/__init__.py delete mode 100644 docs/03_guides/code/browser_use_project/my_actor/__main__.py delete mode 100644 docs/03_guides/code/browser_use_project/my_actor/main.py delete mode 100644 docs/03_guides/code/crawl4ai_project/Dockerfile delete mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/__init__.py delete mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/__main__.py delete mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/scraper.py delete mode 100644 docs/03_guides/code/scrapling_browser_project/Dockerfile delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/__init__.py delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/__main__.py delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/scraper.py diff --git a/docs/03_guides/07_scrapling.mdx b/docs/03_guides/07_scrapling.mdx index 3e76ebca..3495a824 100644 --- a/docs/03_guides/07_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -5,14 +5,10 @@ description: Build an Apify Actor that scrapes web pages using the Scrapling ada --- import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import ScraplingMain from '!!raw-loader!./code/scrapling_project/my_actor/main.py'; -import ScraplingScraper from '!!raw-loader!./code/scrapling_project/my_actor/scraper.py'; -import ScraplingEntrypoint from '!!raw-loader!./code/scrapling_project/my_actor/__main__.py'; -import ScraplingBrowserScraper from 
'!!raw-loader!./code/scrapling_browser_project/my_actor/scraper.py'; -import ScraplingBrowserDockerfile from '!!raw-loader!./code/scrapling_browser_project/Dockerfile'; +import ScraplingExample from '!!raw-loader!roa-loader!./code/07_scrapling.py'; +import ScraplingBrowserScraper from '!!raw-loader!./code/07_scrapling_browser.py'; In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. @@ -52,40 +48,22 @@ The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simpl The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. -The code is split into three small modules, following the structure of the Apify Python Actor templates: - -- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl. -- `my_actor/scraper.py` - The Scrapling-specific logic. A single `scrape_page` function fetches a page and returns the extracted data together with the links found on it. -- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. - - - - - {ScraplingMain} - - - - - {ScraplingScraper} - - - - - {ScraplingEntrypoint} - - - +The whole Actor fits in a single file. 
A `scrape_page` helper holds the Scrapling-specific fetching and parsing, while the `main` coroutine handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl: + + + {ScraplingExample} + A few things worth pointing out: -- Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. +- Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `main` decides what to store and what to enqueue. - The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. - `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. - The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. ## Using Apify Proxy -Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. 
+Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `main` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. @@ -97,23 +75,13 @@ Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy scrapling install ``` -Switching the example Actor from HTTP to a real browser only takes two changes - the rest of the project, including `my_actor/main.py`, stays exactly the same: - -1. Swap the fetcher call in `my_actor/scraper.py` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the data extraction is unchanged. -2. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the build to download the browser binaries that Scrapling expects. - - - - - {ScraplingBrowserScraper} - - - - - {ScraplingBrowserDockerfile} - - - +Switching the example Actor from HTTP to a real browser takes only one code change - swap the `AsyncFetcher.get` call in `scrape_page` for `DynamicFetcher.async_fetch`. 
The parsing API is identical, so the rest of the Actor stays exactly the same: + + + {ScraplingBrowserScraper} + + +To run this on the Apify platform, build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the Docker build to download the browser binaries that Scrapling expects. ## Conclusion diff --git a/docs/03_guides/08_crawl4ai.mdx b/docs/03_guides/08_crawl4ai.mdx index 7dc996d8..01df4dec 100644 --- a/docs/03_guides/08_crawl4ai.mdx +++ b/docs/03_guides/08_crawl4ai.mdx @@ -4,14 +4,9 @@ title: Use Crawl4AI description: Build an Apify Actor that scrapes web pages into LLM-ready markdown using the Crawl4AI library. --- -import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import Crawl4aiMain from '!!raw-loader!./code/crawl4ai_project/my_actor/main.py'; -import Crawl4aiScraper from '!!raw-loader!./code/crawl4ai_project/my_actor/scraper.py'; -import Crawl4aiEntrypoint from '!!raw-loader!./code/crawl4ai_project/my_actor/__main__.py'; -import Crawl4aiDockerfile from '!!raw-loader!./code/crawl4ai_project/Dockerfile'; +import Crawl4aiExample from '!!raw-loader!roa-loader!./code/08_crawl4ai.py'; In this guide, you'll learn how to use the [Crawl4AI](https://crawl4ai.com/) library in your Apify Actors. @@ -39,41 +34,23 @@ crawl4ai-setup The following Actor recursively crawls pages, starting from the URLs in the Actor input and following links up to a user-defined maximum depth. It uses Crawl4AI's `AsyncWebCrawler` to render each page through [Apify Proxy](https://docs.apify.com/platform/proxy), stores the page's markdown in the dataset, and follows the internal links that Crawl4AI discovers. 
-The code is split into three small modules, following the structure of the Apify Python Actor templates: - -- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), opens a single browser-backed crawler, and drives the crawl. -- `my_actor/scraper.py` - The Crawl4AI-specific logic. A single `scrape_page` function crawls a page and returns the extracted data together with the links found on it. -- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. - - - - - {Crawl4aiMain} - - - - - {Crawl4aiScraper} - - - - - {Crawl4aiEntrypoint} - - - +The whole Actor fits in a single file. A `scrape_page` helper holds the Crawl4AI-specific crawling and parsing, while the `main` coroutine handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), opens a single browser-backed crawler, and drives the crawl: + + + {Crawl4aiExample} + A few things worth pointing out: - A single `AsyncWebCrawler` is opened once and reused for every request. The crawler manages one browser instance, so reusing it across the whole crawl is far cheaper than launching a new browser per page. -- Keeping the crawling and parsing in `scrape_page` separates the Crawl4AI-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. +- Keeping the crawling and parsing in `scrape_page` separates the Crawl4AI-specific code from the Actor's orchestration logic. 
The function returns the extracted data together with the discovered links, so `main` decides what to store and what to enqueue. - `result.markdown` is the rendered page as clean markdown, and `result.metadata` carries page-level fields such as the title - exactly the kind of output you want when preparing data for an LLM. - `result.links` already separates `internal` (same-site) links from `external` ones, so the example follows only the internal links to keep the crawl on the same website. - `CacheMode.BYPASS` tells Crawl4AI to always fetch a fresh copy of the page instead of serving it from its local cache. ## Using Apify Proxy -Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Crawl4AI's per-request `CrawlerRunConfig`. +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `main` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Crawl4AI's per-request `CrawlerRunConfig`. `ProxyConfig.from_string` parses the proxy URL returned by `ProxyConfiguration.new_url` (for example `http://groups-RESIDENTIAL:@proxy.apify.com:8000`) into the server, username, and password that the browser needs - the browser cannot take the credentials embedded directly in the URL. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. 
@@ -81,6 +58,8 @@ Running on the Apify platform gives your scraper access to [Apify Proxy](https:/ Because Crawl4AI renders pages in a real browser, the Actor image needs a browser and its system-level dependencies. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser - Crawl4AI reuses those binaries, so no separate browser-install step is required in the Dockerfile. +Pin the Python 3.13 variant of that image (for example `apify/actor-python-playwright:3.13-1.60.0`), because some of Crawl4AI's dependencies do not yet publish wheels for the newest Python versions, which would otherwise force a slow source build during the image build. + Add `apify` and `crawl4ai` to your `requirements.txt`: ```text @@ -88,16 +67,6 @@ apify crawl4ai ``` - - - - {Crawl4aiDockerfile} - - - - -The example pins the Python 3.13 base image because some of Crawl4AI's dependencies do not yet publish wheels for the newest Python versions, which would otherwise force a slow source build during the image build. - ## Conclusion In this guide, you learned how to use Crawl4AI in your Apify Actors. You can now render pages in a real browser, turn them into LLM-ready markdown, follow the links Crawl4AI discovers, route requests through Apify Proxy, and run the whole thing on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/docs/03_guides/09_browser_use.mdx b/docs/03_guides/09_browser_use.mdx index 6c5ff5b5..1760505d 100644 --- a/docs/03_guides/09_browser_use.mdx +++ b/docs/03_guides/09_browser_use.mdx @@ -4,14 +4,9 @@ title: Use Browser Use description: Build an Apify Actor that automates a browser with an LLM agent using the Browser Use library. --- -import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import BrowserUseMain from '!!raw-loader!./code/browser_use_project/my_actor/main.py'; -import BrowserUseAgent from '!!raw-loader!./code/browser_use_project/my_actor/agent.py'; -import BrowserUseEntrypoint from '!!raw-loader!./code/browser_use_project/my_actor/__main__.py'; -import BrowserUseDockerfile from '!!raw-loader!./code/browser_use_project/Dockerfile'; +import BrowserUseExample from '!!raw-loader!roa-loader!./code/09_browser_use.py'; In this guide, you'll learn how to use the [Browser Use](https://browser-use.com/) library in your Apify Actors. @@ -41,7 +36,7 @@ Browser Use needs an LLM to drive the agent. You choose a provider wrapper, give - **`ChatAnthropic`** - Anthropic Claude models such as `claude-sonnet-4-5` or `claude-haiku-4-5`. Reads the key from `ANTHROPIC_API_KEY`. - **`ChatGoogle`** - Google Gemini models such as `gemini-2.5-flash`. Reads the key from `GOOGLE_API_KEY`. -The example Actor in this guide uses `ChatOpenAI`, but switching providers is a one-line change in `my_actor/agent.py`. More capable models generally complete tasks in fewer steps and more reliably, while smaller models are cheaper per step. +The example Actor in this guide uses `ChatOpenAI`, but switching providers is a one-line change in `run_agent_task`. More capable models generally complete tasks in fewer steps and more reliably, while smaller models are cheaper per step. Keep the API key out of the Actor input and source code. 
The example reads it from an environment variable, which on the Apify platform you set as a [secret environment variable](https://docs.apify.com/platform/actors/development/programming-interface/environment-variables) (for example `OPENAI_API_KEY`), and locally you export in your shell. @@ -49,54 +44,30 @@ Keep the API key out of the Actor input and source code. The example reads it fr The following Actor runs a Browser Use agent for a single task and stores its structured result in the default dataset. By default it opens [Hacker News](https://news.ycombinator.com) and returns the title and URL of the top five posts, but the task, model, and step limit are all configurable through the Actor input. -The code is split into three small modules, following the structure of the Apify Python Actor templates: - -- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy), runs the agent, and stores the result. -- `my_actor/agent.py` - The Browser Use-specific logic. It defines the output schema and a single `run_agent_task` function that builds the LLM, browser, and agent, then returns the agent's structured output. -- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. - - - - - {BrowserUseMain} - - - - - {BrowserUseAgent} - - - - - {BrowserUseEntrypoint} - - - +The whole Actor fits in a single file. 
A `run_agent_task` helper holds the Browser Use-specific logic - it defines the output schema and builds the LLM, browser, and agent - while the `main` coroutine handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy), runs the agent, and stores the result: + + + {BrowserUseExample} + A few things worth pointing out: -- Keeping the agent setup in `run_agent_task` separates the Browser Use-specific code from the Actor's orchestration logic. `my_actor/main.py` only decides what to read from the input and what to store. -- Passing `output_model_schema=Posts` makes the agent return a validated `Posts` instance via `history.structured_output`, so `my_actor/main.py` can push each item straight to the dataset. Adapt the task and the `Post`/`Posts` models together to fit your own use case. +- Keeping the agent setup in `run_agent_task` separates the Browser Use-specific code from the Actor's orchestration logic. `main` only decides what to read from the input and what to store. +- Passing `output_model_schema=Posts` makes the agent return a validated `Posts` instance via `history.structured_output`, so `main` can push each item straight to the dataset. Adapt the task and the `Post`/`Posts` models together to fit your own use case. - `enable_signal_handler=False` leaves signal handling to the Actor, which manages the run's lifecycle. Without it, Browser Use would install its own handlers and interfere with a clean shutdown. - `headless=Actor.configuration.headless` runs the browser without a visible window, which is what you want on the platform. ## Using Apify Proxy -Running on the Apify platform gives your agent access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. 
In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `run_agent_task`. +Running on the Apify platform gives your agent access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `main` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `run_agent_task`. -Browser Use expects the proxy as a `ProxySettings` object with separate `server`, `username`, and `password` fields, whereas `ProxyConfiguration.new_url` returns a single URL string (for example `http://user:pass@proxy.apify.com:8000`). The `_proxy_settings` helper in `my_actor/agent.py` splits that URL into the fields Browser Use expects. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. +Browser Use expects the proxy as a `ProxySettings` object with separate `server`, `username`, and `password` fields, whereas `ProxyConfiguration.new_url` returns a single URL string (for example `http://user:pass@proxy.apify.com:8000`). The `_proxy_settings` helper splits that URL into the fields Browser Use expects. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. ## Running on the Apify platform Browser Use drives a real Chromium over CDP, so the Actor needs a browser binary available at runtime. The simplest way to provide one is to build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies. 
Browser Use discovers that browser automatically, so no extra install step is needed in the image. - - - - {BrowserUseDockerfile} - - - +Disable Browser Use's telemetry and cloud sync inside the Actor by setting the `ANONYMIZED_TELEMETRY=false` and `BROWSER_USE_CLOUD_SYNC=false` environment variables in your Dockerfile. When running the Actor locally, install the browser once with the `browser-use install` command, which downloads a Chromium build together with its dependencies: diff --git a/docs/03_guides/code/scrapling_project/my_actor/main.py b/docs/03_guides/code/07_scrapling.py similarity index 62% rename from docs/03_guides/code/scrapling_project/my_actor/main.py rename to docs/03_guides/code/07_scrapling.py index 52e9ef4c..a817c09b 100644 --- a/docs/03_guides/code/scrapling_project/my_actor/main.py +++ b/docs/03_guides/code/07_scrapling.py @@ -1,8 +1,51 @@ -from __future__ import annotations +import asyncio +from typing import Any + +from scrapling.fetchers import AsyncFetcher from apify import Actor, Request -from .scraper import scrape_page + +async def scrape_page( + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page with Scrapling and extract its data and links. + + The page is fetched with Scrapling's asynchronous HTTP fetcher. The + `impersonate` and `stealthy_headers` options make the request look like it + comes from a real Chrome browser, which reduces the chance of being blocked. + The returned response is also a Scrapling selector, so it can be queried with + CSS selectors directly. + """ + response = await AsyncFetcher.get( + url, + proxy=proxy_url, + impersonate='chrome', + stealthy_headers=True, + timeout=60, + ) + + # Extract the desired data using CSS selectors. The `::text` pseudo-element + # returns the text content of the matched elements. 
+ data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Collect absolute links from the page. The `::attr(href)` pseudo-selector + # reads the attribute and `response.urljoin` resolves it against the page URL. + links: list[str] = [] + for href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links async def main() -> None: @@ -65,3 +108,7 @@ async def main() -> None: finally: # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py b/docs/03_guides/code/07_scrapling_browser.py similarity index 97% rename from docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py rename to docs/03_guides/code/07_scrapling_browser.py index fb7d4579..d96f2d19 100644 --- a/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py +++ b/docs/03_guides/code/07_scrapling_browser.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import Any from scrapling.fetchers import DynamicFetcher diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/main.py b/docs/03_guides/code/08_crawl4ai.py similarity index 64% rename from docs/03_guides/code/crawl4ai_project/my_actor/main.py rename to docs/03_guides/code/08_crawl4ai.py index 4e6befe6..c6813f6b 100644 --- a/docs/03_guides/code/crawl4ai_project/my_actor/main.py +++ b/docs/03_guides/code/08_crawl4ai.py @@ -1,10 +1,53 @@ -from __future__ import annotations +import asyncio +from typing import Any -from crawl4ai import AsyncWebCrawler, BrowserConfig +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CacheMode, + CrawlerRunConfig, + 
ProxyConfig, +) from apify import Actor, Request -from .scraper import scrape_page + +async def scrape_page( + crawler: AsyncWebCrawler, + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Crawl a single page with Crawl4AI and extract its markdown and links. + + The page is rendered in the browser managed by `crawler`, and Crawl4AI turns + the result into clean, LLM-ready markdown. Setting `proxy_config` on the + per-request `CrawlerRunConfig` routes this request through Apify Proxy, so + every page can use a fresh IP address. + """ + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_config=ProxyConfig.from_string(proxy_url) if proxy_url else None, + ) + + result = await crawler.arun(url, config=run_config) + if not result.success: + raise RuntimeError(result.error_message or f'Failed to crawl {url}') + + # `result.markdown` is the rendered page as clean markdown, and + # `result.metadata` carries page-level fields such as the title. + data = { + 'url': result.url, + 'title': (result.metadata or {}).get('title'), + 'markdown': str(result.markdown), + } + + # Crawl4AI already splits links into `internal` (same site) and `external`. + # We follow only the internal ones to keep the crawl on the same website. + internal_links = result.links.get('internal', []) + links = [link['href'] for link in internal_links if link.get('href')] + + return data, links async def main() -> None: @@ -71,3 +114,7 @@ async def main() -> None: finally: # Mark the request as handled so it is not processed again. 
await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/browser_use_project/my_actor/agent.py b/docs/03_guides/code/09_browser_use.py similarity index 50% rename from docs/03_guides/code/browser_use_project/my_actor/agent.py rename to docs/03_guides/code/09_browser_use.py index 708265fb..2a4e338a 100644 --- a/docs/03_guides/code/browser_use_project/my_actor/agent.py +++ b/docs/03_guides/code/09_browser_use.py @@ -1,11 +1,19 @@ -from __future__ import annotations - +import asyncio +import os from urllib.parse import urlsplit from browser_use import Agent, Browser, ChatOpenAI from browser_use.browser import ProxySettings from pydantic import BaseModel +from apify import Actor + +# The default task is aligned with the `Posts` output schema defined below. +DEFAULT_TASK = ( + 'Open https://news.ycombinator.com and return the title and URL ' + 'of the top 5 posts on the front page.' +) + class Post(BaseModel): """A single item the agent is asked to extract.""" @@ -20,6 +28,16 @@ class Posts(BaseModel): posts: list[Post] +def _proxy_settings(proxy_url: str) -> ProxySettings: + """Convert an Apify Proxy URL into Browser Use `ProxySettings`.""" + parts = urlsplit(proxy_url) + return ProxySettings( + server=f'{parts.scheme}://{parts.hostname}:{parts.port}', + username=parts.username, + password=parts.password, + ) + + async def run_agent_task( task: str, *, @@ -59,11 +77,45 @@ async def run_agent_task( return history.structured_output -def _proxy_settings(proxy_url: str) -> ProxySettings: - """Convert an Apify Proxy URL into Browser Use `ProxySettings`.""" - parts = urlsplit(proxy_url) - return ProxySettings( - server=f'{parts.scheme}://{parts.hostname}:{parts.port}', - username=parts.username, - password=parts.password, - ) +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. 
+ actor_input = await Actor.get_input() or {} + task = actor_input.get('task', DEFAULT_TASK) + model = actor_input.get('model', 'gpt-4.1-mini') + max_steps = actor_input.get('max_steps', 25) + + # Read the LLM API key from the environment so it is never stored in the Actor + # input. On the Apify platform, set it as a secret environment variable. + llm_api_key = os.environ.get('OPENAI_API_KEY') + if not llm_api_key: + raise RuntimeError('The OPENAI_API_KEY environment variable is not set.') + + # Create a proxy configuration that routes the browser through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + proxy_url = await proxy_configuration.new_url() if proxy_configuration else None + + Actor.log.info(f'Running the agent (model={model}) for task: {task}') + + # Run the Browser Use agent and collect its structured output. + result = await run_agent_task( + task, + model=model, + llm_api_key=llm_api_key, + max_steps=max_steps, + headless=Actor.configuration.headless, + proxy_url=proxy_url, + ) + + if result is None: + Actor.log.warning('The agent did not return any structured output.') + return + + # Store every extracted item as a separate row in the default dataset. + for post in result.posts: + await Actor.push_data(post.model_dump()) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/browser_use_project/Dockerfile b/docs/03_guides/code/browser_use_project/Dockerfile deleted file mode 100644 index c35bbfc9..00000000 --- a/docs/03_guides/code/browser_use_project/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -# Use the Apify Playwright base image, which already ships a Chromium browser together -# with all of its system-level dependencies. Browser Use launches that browser via CDP, -# so no extra browser install step is needed. -FROM apify/actor-python-playwright:3.14-1.60.0 - -USER myuser - -# Copy just requirements.txt first to leverage the Docker build cache. 
-COPY --chown=myuser:myuser requirements.txt ./ -RUN pip install -r requirements.txt - -# Copy the rest of the source code and verify that it compiles. -COPY --chown=myuser:myuser . ./ -RUN python -m compileall -q my_actor/ - -# Disable Browser Use telemetry and cloud sync inside the Actor. -ENV ANONYMIZED_TELEMETRY=false -ENV BROWSER_USE_CLOUD_SYNC=false - -# Specify how to launch the Actor. -CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/browser_use_project/my_actor/__init__.py b/docs/03_guides/code/browser_use_project/my_actor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/03_guides/code/browser_use_project/my_actor/__main__.py b/docs/03_guides/code/browser_use_project/my_actor/__main__.py deleted file mode 100644 index 6aeaf3d5..00000000 --- a/docs/03_guides/code/browser_use_project/my_actor/__main__.py +++ /dev/null @@ -1,8 +0,0 @@ -from __future__ import annotations - -import asyncio - -from .main import main - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/03_guides/code/browser_use_project/my_actor/main.py b/docs/03_guides/code/browser_use_project/my_actor/main.py deleted file mode 100644 index d045759e..00000000 --- a/docs/03_guides/code/browser_use_project/my_actor/main.py +++ /dev/null @@ -1,53 +0,0 @@ -from __future__ import annotations - -import os - -from apify import Actor - -from .agent import run_agent_task - -# The default task is aligned with the `Posts` output schema defined in `agent.py`. -DEFAULT_TASK = ( - 'Open https://news.ycombinator.com and return the title and URL ' - 'of the top 5 posts on the front page.' -) - - -async def main() -> None: - # Enter the context of the Actor. - async with Actor: - # Retrieve the Actor input, and use default values if not provided. 
- actor_input = await Actor.get_input() or {} - task = actor_input.get('task', DEFAULT_TASK) - model = actor_input.get('model', 'gpt-4.1-mini') - max_steps = actor_input.get('max_steps', 25) - - # Read the LLM API key from the environment so it is never stored in the Actor - # input. On the Apify platform, set it as a secret environment variable. - llm_api_key = os.environ.get('OPENAI_API_KEY') - if not llm_api_key: - raise RuntimeError('The OPENAI_API_KEY environment variable is not set.') - - # Create a proxy configuration that routes the browser through Apify Proxy. - proxy_configuration = await Actor.create_proxy_configuration() - proxy_url = await proxy_configuration.new_url() if proxy_configuration else None - - Actor.log.info(f'Running the agent (model={model}) for task: {task}') - - # Run the Browser Use agent and collect its structured output. - result = await run_agent_task( - task, - model=model, - llm_api_key=llm_api_key, - max_steps=max_steps, - headless=Actor.configuration.headless, - proxy_url=proxy_url, - ) - - if result is None: - Actor.log.warning('The agent did not return any structured output.') - return - - # Store every extracted item as a separate row in the default dataset. - for post in result.posts: - await Actor.push_data(post.model_dump()) diff --git a/docs/03_guides/code/crawl4ai_project/Dockerfile b/docs/03_guides/code/crawl4ai_project/Dockerfile deleted file mode 100644 index 348f6ff2..00000000 --- a/docs/03_guides/code/crawl4ai_project/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -# Use the Apify Playwright base image, which already ships a browser together -# with all of its system-level dependencies. Crawl4AI drives this browser -# through Playwright and reuses the binaries the image provides, so no separate -# browser-install step is needed. -# -# The Python 3.13 image is used because some of Crawl4AI's dependencies do not -# yet publish wheels for newer Python versions. 
-FROM apify/actor-python-playwright:3.13-1.60.0 - -# Copy just requirements.txt first to leverage the Docker build cache. -COPY --chown=myuser:myuser requirements.txt ./ -RUN pip install -r requirements.txt - -# Copy the rest of the source code and verify that it compiles. -COPY --chown=myuser:myuser . ./ -RUN python -m compileall -q my_actor/ - -# Specify how to launch the Actor. -CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/__init__.py b/docs/03_guides/code/crawl4ai_project/my_actor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py b/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py deleted file mode 100644 index 6aeaf3d5..00000000 --- a/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py +++ /dev/null @@ -1,8 +0,0 @@ -from __future__ import annotations - -import asyncio - -from .main import main - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py b/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py deleted file mode 100644 index f96f76e3..00000000 --- a/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from crawl4ai import CacheMode, CrawlerRunConfig, ProxyConfig - -if TYPE_CHECKING: - from crawl4ai import AsyncWebCrawler - - -async def scrape_page( - crawler: AsyncWebCrawler, - url: str, - *, - proxy_url: str | None = None, -) -> tuple[dict[str, Any], list[str]]: - """Crawl a single page with Crawl4AI and extract its markdown and links. - - The page is rendered in the browser managed by `crawler`, and Crawl4AI turns - the result into clean, LLM-ready markdown. Setting `proxy_config` on the - per-request `CrawlerRunConfig` routes this request through Apify Proxy, so - every page can use a fresh IP address. 
- """ - run_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - proxy_config=ProxyConfig.from_string(proxy_url) if proxy_url else None, - ) - - result = await crawler.arun(url, config=run_config) - if not result.success: - raise RuntimeError(result.error_message or f'Failed to crawl {url}') - - # `result.markdown` is the rendered page as clean markdown, and - # `result.metadata` carries page-level fields such as the title. - data = { - 'url': result.url, - 'title': (result.metadata or {}).get('title'), - 'markdown': str(result.markdown), - } - - # Crawl4AI already splits links into `internal` (same site) and `external`. - # We follow only the internal ones to keep the crawl on the same website. - internal_links = result.links.get('internal', []) - links = [link['href'] for link in internal_links if link.get('href')] - - return data, links diff --git a/docs/03_guides/code/scrapling_browser_project/Dockerfile b/docs/03_guides/code/scrapling_browser_project/Dockerfile deleted file mode 100644 index 38b30c60..00000000 --- a/docs/03_guides/code/scrapling_browser_project/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -# Use the Apify Playwright base image, which already ships a browser together -# with all of its system-level dependencies. -FROM apify/actor-python-playwright:3.14-1.60.0 - -# Copy just requirements.txt first to leverage the Docker build cache. -COPY --chown=myuser:myuser requirements.txt ./ -RUN pip install -r requirements.txt - -# Download the browser binaries that Scrapling expects. The base image already -# provides their system-level dependencies, so run this step as root and then -# switch back to the unprivileged user. -USER root -RUN scrapling install -USER myuser - -# Copy the rest of the source code and verify that it compiles. -COPY --chown=myuser:myuser . ./ -RUN python -m compileall -q my_actor/ - -# Specify how to launch the Actor. 
-CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/scrapling_project/my_actor/__init__.py b/docs/03_guides/code/scrapling_project/my_actor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/03_guides/code/scrapling_project/my_actor/__main__.py b/docs/03_guides/code/scrapling_project/my_actor/__main__.py deleted file mode 100644 index 6aeaf3d5..00000000 --- a/docs/03_guides/code/scrapling_project/my_actor/__main__.py +++ /dev/null @@ -1,8 +0,0 @@ -from __future__ import annotations - -import asyncio - -from .main import main - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/03_guides/code/scrapling_project/my_actor/scraper.py b/docs/03_guides/code/scrapling_project/my_actor/scraper.py deleted file mode 100644 index b840db82..00000000 --- a/docs/03_guides/code/scrapling_project/my_actor/scraper.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -from typing import Any - -from scrapling.fetchers import AsyncFetcher - - -async def scrape_page( - url: str, - *, - proxy_url: str | None = None, -) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page with Scrapling and extract its data and links. - - The page is fetched with Scrapling's asynchronous HTTP fetcher. The - `impersonate` and `stealthy_headers` options make the request look like it - comes from a real Chrome browser, which reduces the chance of being blocked. - The returned response is also a Scrapling selector, so it can be queried with - CSS selectors directly. - """ - response = await AsyncFetcher.get( - url, - proxy=proxy_url, - impersonate='chrome', - stealthy_headers=True, - timeout=60, - ) - - # Extract the desired data using CSS selectors. The `::text` pseudo-element - # returns the text content of the matched elements. 
- data = { - 'url': url, - 'title': response.css('title::text').get(), - 'h1s': response.css('h1::text').getall(), - 'h2s': response.css('h2::text').getall(), - 'h3s': response.css('h3::text').getall(), - } - - # Collect absolute links from the page. The `::attr(href)` pseudo-selector - # reads the attribute and `response.urljoin` resolves it against the page URL. - links: list[str] = [] - for href in response.css('a::attr(href)').getall(): - link_url = response.urljoin(href) - if link_url.startswith(('http://', 'https://')): - links.append(link_url) - - return data, links diff --git a/pyproject.toml b/pyproject.toml index ac7a9b18..d17bdc01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,18 +181,6 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] -"**/docs/**/scrapling_project/**" = [ - # Local imports are mixed up with the Apify SDK. - "I001", # Import block is un-sorted or un-formatted -] -"**/docs/**/browser_use_project/**" = [ - # Local imports are mixed up with the Apify SDK. - "I001", # Import block is un-sorted or un-formatted -] -"**/docs/**/crawl4ai_project/**" = [ - # Local imports are mixed up with the Apify SDK. 
- "I001", # Import block is un-sorted or un-formatted -] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" From e2034bfe584070f46c52779159ea886c4e21f460 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 18:56:56 +0200 Subject: [PATCH 17/24] docs: unify scraping guide examples (proxy, camelCase input, logging) and add FastAPI webserver example --- docs/01_introduction/quick-start.mdx | 2 +- docs/03_guides/01_beautifulsoup_httpx.mdx | 6 +- docs/03_guides/02_parsel_impit.mdx | 6 +- docs/03_guides/03_playwright.mdx | 6 +- docs/03_guides/04_selenium.mdx | 8 +- docs/03_guides/05_crawlee.mdx | 4 + docs/03_guides/12_running_webserver.mdx | 24 +++++ docs/03_guides/code/01_beautifulsoup_httpx.py | 95 +++++++++++-------- docs/03_guides/code/02_parsel_impit.py | 95 +++++++++++-------- docs/03_guides/code/03_playwright.py | 37 +++++++- docs/03_guides/code/04_selenium.py | 89 +++++++++++++++-- .../code/05_crawlee_beautifulsoup.py | 31 ++++-- docs/03_guides/code/05_crawlee_parsel.py | 31 ++++-- docs/03_guides/code/05_crawlee_playwright.py | 37 +++++--- docs/03_guides/code/07_scrapling.py | 10 +- docs/03_guides/code/08_crawl4ai.py | 11 ++- docs/03_guides/code/09_browser_use.py | 4 +- docs/03_guides/code/12_webserver.py | 2 +- docs/03_guides/code/12_webserver_fastapi.py | 49 ++++++++++ 19 files changed, 417 insertions(+), 130 deletions(-) create mode 100644 docs/03_guides/code/12_webserver_fastapi.py diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index 59cdc929..e487f201 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -67,7 +67,7 @@ The Actor's source code is in the `src` folder. 
This folder contains two importa {MainExample} - + {UnderscoreMainExample} diff --git a/docs/03_guides/01_beautifulsoup_httpx.mdx b/docs/03_guides/01_beautifulsoup_httpx.mdx index ba15df03..f7b4f797 100644 --- a/docs/03_guides/01_beautifulsoup_httpx.mdx +++ b/docs/03_guides/01_beautifulsoup_httpx.mdx @@ -20,12 +20,16 @@ To create an Actor which uses those libraries, start from the [BeautifulSoup & P ## Example Actor -Below is a simple Actor that recursively scrapes titles from all linked websites, up to a specified maximum depth, starting from URLs provided in the Actor input. It uses [HTTPX](https://www.python-httpx.org/) for fetching pages and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing their content to extract titles and links to other pages. +Below is a simple Actor that recursively scrapes titles from all linked websites, up to a specified maximum depth, starting from URLs provided in the Actor input. It uses [HTTPX](https://www.python-httpx.org/) for fetching pages through [Apify Proxy](https://docs.apify.com/platform/proxy) and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing their content to extract the title, headings, and links to other pages. {BeautifulSoupHttpxExample} +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example creates a proxy configuration with `Actor.create_proxy_configuration` and fetches a fresh proxy URL for every request, so each page goes through a different IP. A new HTTPX client is created per request to apply that URL. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. 
+ ## Conclusion In this guide, you learned how to use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) with the [HTTPX](https://www.python-httpx.org/) in your Apify Actors. By combining these libraries, you can efficiently extract data from HTML or XML files, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/03_guides/02_parsel_impit.mdx b/docs/03_guides/02_parsel_impit.mdx index da5a2866..3511292f 100644 --- a/docs/03_guides/02_parsel_impit.mdx +++ b/docs/03_guides/02_parsel_impit.mdx @@ -18,12 +18,16 @@ In this guide, you'll learn how to combine the [Parsel](https://github.com/scrap ## Example Actor -The following example shows a simple Actor that recursively scrapes titles from linked pages, up to a user-defined maximum depth. It uses [Impit](https://github.com/apify/impit) to fetch pages and [Parsel](https://github.com/scrapy/parsel) to extract titles and discover new links. +The following example shows a simple Actor that recursively scrapes titles from linked pages, up to a user-defined maximum depth. It uses [Impit](https://github.com/apify/impit) to fetch pages through [Apify Proxy](https://docs.apify.com/platform/proxy) and [Parsel](https://github.com/scrapy/parsel) to extract the title, headings, and links. {ParselImpitExample} +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. 
The example creates a proxy configuration with `Actor.create_proxy_configuration` and fetches a fresh proxy URL for every request, so each page goes through a different IP. A new Impit client is created per request to apply that URL. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + ## Conclusion In this guide, you learned how to use [Parsel](https://github.com/scrapy/parsel) with [Impit](https://github.com/apify/impit) in your Apify Actors. By combining these libraries, you get a powerful and efficient solution for web scraping: [Parsel](https://github.com/scrapy/parsel) provides excellent CSS selector and XPath support for data extraction, while [Impit](https://github.com/apify/impit) offers a fast and simple HTTP client built by Apify. This combination makes it easy to build scalable web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/03_guides/03_playwright.mdx b/docs/03_guides/03_playwright.mdx index 0e20b9e4..a497bdd3 100644 --- a/docs/03_guides/03_playwright.mdx +++ b/docs/03_guides/03_playwright.mdx @@ -50,12 +50,16 @@ playwright install --with-deps` This is a simple Actor that recursively scrapes titles from all linked websites, up to a maximum depth, starting from URLs in the Actor input. -It uses Playwright to open the pages in an automated Chrome browser, and to extract the title and anchor elements after the pages load. +It uses Playwright to open the pages in an automated Chrome browser, and to extract the title, headings, and links after the pages load. 
{PlaywrightExample} +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example creates a proxy configuration with `Actor.create_proxy_configuration` and launches the browser through it. Playwright applies the proxy at the browser level, so the whole run shares a single proxy URL rather than rotating per request; the `to_playwright_proxy` helper splits that URL into the `server`, `username`, and `password` fields Playwright expects. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + ## Conclusion In this guide you learned how to create Actors that use Playwright to scrape websites. Playwright is a powerful tool that can be used to manage browser instances and scrape websites that require JavaScript execution. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/03_guides/04_selenium.mdx b/docs/03_guides/04_selenium.mdx index e878c3a6..ea1076ea 100644 --- a/docs/03_guides/04_selenium.mdx +++ b/docs/03_guides/04_selenium.mdx @@ -34,12 +34,18 @@ Refer to the [Selenium documentation](https://www.selenium.dev/documentation/web This is a simple Actor that recursively scrapes titles from all linked websites, up to a maximum depth, starting from URLs in the Actor input. -It uses Selenium ChromeDriver to open the pages in an automated Chrome browser, and to extract the title and anchor elements after the pages load. 
+It uses Selenium ChromeDriver to open the pages in an automated Chrome browser, and to extract the title, headings, and links after the pages load. {SeleniumExample} +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example creates a proxy configuration with `Actor.create_proxy_configuration` and routes the browser through it for the whole run. + +Chrome ignores the credentials passed in the `--proxy-server` flag, so an authenticated proxy such as Apify Proxy has to be configured from inside a small extension. The `proxy_auth_extension` helper builds one at runtime: its service worker sets the proxy server and answers the browser's authentication challenge with the username and password. Note that the new headless mode (`--headless=new`) is required for Chrome to load the extension. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + ## Conclusion In this guide you learned how to use Selenium for web scraping in Apify Actors. You can now create your own Actors that use Selenium to scrape dynamic websites and interact with web pages just like a human would. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/docs/03_guides/05_crawlee.mdx b/docs/03_guides/05_crawlee.mdx index 34bb0f46..b6dc2f74 100644 --- a/docs/03_guides/05_crawlee.mdx +++ b/docs/03_guides/05_crawlee.mdx @@ -42,6 +42,10 @@ The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler {CrawleePlaywrightExample} +## Using Apify Proxy + +All three crawlers above route their requests through [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. `Actor.create_proxy_configuration` returns a Crawlee-compatible proxy configuration, which is passed to the crawler as `proxy_configuration`; Crawlee then rotates the proxy IP for every request on its own. Because the configuration is only available inside the running Actor, the crawler is created in `main` and the request handler is registered on a standalone [`Router`](https://crawlee.dev/python/api/class/Router) up front. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + ## Conclusion In this guide, you learned how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. By using the [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler), [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) crawlers, you can efficiently scrape static or dynamic web pages, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/docs/03_guides/12_running_webserver.mdx b/docs/03_guides/12_running_webserver.mdx index 0050189d..7ff54a47 100644 --- a/docs/03_guides/12_running_webserver.mdx +++ b/docs/03_guides/12_running_webserver.mdx @@ -7,6 +7,7 @@ description: Run an HTTP server inside your Actor for monitoring or serving cont import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import WebserverExample from '!!raw-loader!roa-loader!./code/12_webserver.py'; +import WebserverFastApiExample from '!!raw-loader!roa-loader!./code/12_webserver_fastapi.py'; In this guide, you'll learn how to run a web server inside your Apify Actor. This is useful for monitoring Actor progress, creating custom APIs, or serving content during the Actor run. @@ -30,6 +31,29 @@ The following example shows how to start a simple web server in your Actor, whic {WebserverExample} +## Using FastAPI + +The example above relies only on Python's standard library, which keeps it dependency-free but leaves you handling requests by hand. For anything beyond a single endpoint, a web framework such as [FastAPI](https://fastapi.tiangolo.com/) is a better fit - it gives you routing, request parsing, and automatic JSON responses, and is served by an ASGI server like [uvicorn](https://www.uvicorn.org/). + +Install both, for example by adding them to your `requirements.txt`: + +```text +fastapi +uvicorn[standard] +``` + +The following Actor serves the same processed-items counter as before, but through a FastAPI endpoint. The key difference is that uvicorn runs inside the Actor's event loop as a background task, bound to `Actor.configuration.web_server_port` so the platform routes the container URL to it: + + + {WebserverFastApiExample} + + +A few things worth pointing out: + +- `uvicorn.Server(...).serve()` is a coroutine, so it runs as an `asyncio` task alongside the Actor's own work instead of blocking it. Setting `server.should_exit = True` triggers a graceful shutdown once the work is done. 
+- The server binds to `0.0.0.0` (all interfaces) rather than `localhost`, so it's reachable through the container URL, not only from inside the container. +- The same pattern powers an [Actor Standby](#actor-standby) service - swap the one-off work loop for an Actor that just keeps serving requests. + ## Actor Standby The example above runs a web server for the duration of a single Actor run. With [Actor Standby](https://docs.apify.com/platform/actors/development/programming-interface/standby), you can instead expose your Actor as an always-ready HTTP API: the platform keeps the Actor running in the background and routes incoming HTTP requests to the web server inside it, spinning up additional instances as the load grows. diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index 86e83868..45633da4 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -9,16 +9,21 @@ async def scrape_page( - client: httpx.AsyncClient, url: str + url: str, + *, + proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: """Fetch a single page with HTTPX and extract its data and links. - Keeping the fetching and parsing in this helper keeps the Actor's main loop - shallow. It returns the extracted data together with the links found on the - page, so `main` only has to decide what to store and what to enqueue. + A fresh client is created per call, so each request can go through a new + proxy URL and a different IP address. The helper returns the extracted data + together with the links found on the page, so `main` only has to decide what + to store and what to enqueue. """ - # Fetch the HTTP response from the specified URL using HTTPX. - response = await client.get(url, follow_redirects=True) + # Fetch the HTTP response from the specified URL using HTTPX, optionally + # routing the request through the provided Apify Proxy URL. 
+ async with httpx.AsyncClient(proxy=proxy_url) as client: + response = await client.get(url, follow_redirects=True) # Parse the HTML content using Beautiful Soup. soup = BeautifulSoup(response.content, 'html.parser') @@ -47,54 +52,64 @@ async def main() -> None: async with Actor: # Retrieve the Actor input, and use default values if not provided. actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Create an HTTPX client to fetch the HTML content of the URLs. - async with httpx.AsyncClient() as client: - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): - url = request.url - - # Read the crawl depth tracked by the request itself. - depth = request.crawl_depth - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Fetch the page and extract its data and nested links. - data, links = await scrape_page(client, url) - - # Store the extracted data to the default dataset. - await Actor.push_data(data) - - # If we are not too deep yet, enqueue the links we found. 
- if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled so it is not processed again. - await request_queue.mark_request_as_handled(request) + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(url, proxy_url=proxy_url) + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) + + # If we are not too deep yet, enqueue the links we found. + if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled so it is not processed again. 
+ await request_queue.mark_request_as_handled(request) if __name__ == '__main__': diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py index 1a0c4f77..30ea6428 100644 --- a/docs/03_guides/code/02_parsel_impit.py +++ b/docs/03_guides/code/02_parsel_impit.py @@ -9,16 +9,21 @@ async def scrape_page( - client: impit.AsyncClient, url: str + url: str, + *, + proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: """Fetch a single page with Impit and extract its data and links. - Keeping the fetching and parsing in this helper keeps the Actor's main loop - shallow. It returns the extracted data together with the links found on the - page, so `main` only has to decide what to store and what to enqueue. + A fresh client is created per call, so each request can go through a new + proxy URL and a different IP address. The helper returns the extracted data + together with the links found on the page, so `main` only has to decide what + to store and what to enqueue. """ - # Fetch the HTTP response from the specified URL using Impit. - response = await client.get(url) + # Fetch the HTTP response from the specified URL using Impit, optionally + # routing the request through the provided Apify Proxy URL. + async with impit.AsyncClient(proxy=proxy_url) as client: + response = await client.get(url) # Parse the HTML content using a Parsel selector. selector = parsel.Selector(text=response.text) @@ -47,54 +52,64 @@ async def main() -> None: async with Actor: # Retrieve the Actor input, and use default values if not provided. actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) # Exit if no start URLs are provided. 
if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Create an Impit client to fetch the HTML content of the URLs. - async with impit.AsyncClient() as client: - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): - url = request.url - - # Read the crawl depth tracked by the request itself. - depth = request.crawl_depth - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Fetch the page and extract its data and nested links. - data, links = await scrape_page(client, url) - - # Store the extracted data to the default dataset. - await Actor.push_data(data) - - # If we are not too deep yet, enqueue the links we found. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled so it is not processed again. - await request_queue.mark_request_as_handled(request) + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + # Read the crawl depth tracked by the request itself. 
+ depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(url, proxy_url=proxy_url) + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) + + # If we are not too deep yet, enqueue the links we found. + if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled so it is not processed again. + await request_queue.mark_request_as_handled(request) if __name__ == '__main__': diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index 1f2fc1d7..4175cbc2 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -1,6 +1,6 @@ import asyncio from typing import Any -from urllib.parse import urljoin +from urllib.parse import urljoin, urlsplit from playwright.async_api import BrowserContext, async_playwright @@ -12,6 +12,21 @@ # in the Actor's Docker image. +def to_playwright_proxy(proxy_url: str) -> dict[str, str]: + """Convert an Apify Proxy URL into Playwright proxy settings. + + Playwright wants the proxy as a `server` URL with the credentials in separate + `username` and `password` fields, so the single URL returned by + `ProxyConfiguration.new_url` has to be split into its parts. 
+ """ + parts = urlsplit(proxy_url) + return { + 'server': f'{parts.scheme}://{parts.hostname}:{parts.port}', + 'username': parts.username or '', + 'password': parts.password or '', + } + + async def scrape_page( context: BrowserContext, url: str ) -> tuple[dict[str, Any], list[str]]: @@ -29,6 +44,9 @@ async def scrape_page( data = { 'url': url, 'title': await page.title(), + 'h1s': [await h1.text_content() for h1 in await page.locator('h1').all()], + 'h2s': [await h2.text_content() for h2 in await page.locator('h2').all()], + 'h3s': [await h3.text_content() for h3 in await page.locator('h3').all()], } # Collect absolute links found on the page so the caller can enqueue them. @@ -50,21 +68,27 @@ async def main() -> None: async with Actor: # Retrieve the Actor input, and use default values if not provided. actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() + # Create a proxy configuration that routes the browser through Apify Proxy. + # Playwright applies the proxy at the browser level, so the whole run shares + # a single proxy URL rather than rotating it per request. + proxy_configuration = await Actor.create_proxy_configuration() + proxy_url = await proxy_configuration.new_url() if proxy_configuration else None + # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() # Enqueue the start URLs. Their crawl depth defaults to 0. 
for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) Actor.log.info('Launching Playwright...') @@ -74,6 +98,7 @@ async def main() -> None: # Configure the browser to launch in headless mode as per Actor configuration. browser = await playwright.chromium.launch( headless=Actor.configuration.headless, + proxy=to_playwright_proxy(proxy_url) if proxy_url else None, args=['--disable-gpu'], ) context = await browser.new_context() @@ -92,6 +117,10 @@ async def main() -> None: # Store the extracted data to the default dataset. await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) # If we are not too deep yet, enqueue the links we found. if depth < max_depth: diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index 42dc3509..3f28db24 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -1,6 +1,10 @@ import asyncio +import json +from pathlib import Path +from tempfile import mkdtemp from typing import Any -from urllib.parse import urljoin +from urllib.parse import urljoin, urlsplit +from zipfile import ZipFile from selenium import webdriver from selenium.webdriver.chrome.options import Options as ChromeOptions @@ -15,6 +19,61 @@ # in the Actor's Docker image. +def proxy_auth_extension(proxy_url: str) -> str: + """Build a temporary Chrome extension that routes Chrome through a proxy. + + Chrome ignores credentials passed in the `--proxy-server` flag, so an + authenticated proxy such as Apify Proxy has to be configured from inside an + extension: its service worker sets the proxy server and answers the browser's + authentication challenge with the username and password. The function returns + the path to a packed extension ready to be loaded with `add_extension`. 
+ """ + parts = urlsplit(proxy_url) + + manifest = { + 'name': 'Apify Proxy', + 'version': '1.0.0', + 'manifest_version': 3, + 'permissions': ['proxy', 'webRequest', 'webRequestAuthProvider'], + 'host_permissions': ['<all_urls>'], + 'background': {'service_worker': 'background.js'}, + 'minimum_chrome_version': '108', + } + + # The service worker sets the proxy server and supplies the credentials when + # Chrome is challenged for authentication. `json.dumps` handles the escaping. + proxy_config = json.dumps( + { + 'mode': 'fixed_servers', + 'rules': { + 'singleProxy': { + 'scheme': parts.scheme, + 'host': parts.hostname, + 'port': parts.port, + }, + }, + } + ) + credentials = json.dumps( + {'username': parts.username or '', 'password': parts.password or ''} + ) + background = ( + 'chrome.proxy.settings.set(' + '{value: ' + proxy_config + ', scope: "regular"});\n' + 'chrome.webRequest.onAuthRequired.addListener(\n' + ' () => ({authCredentials: ' + credentials + '}),\n' + ' {urls: ["<all_urls>"]},\n' + ' ["blocking"],\n' + ');\n' + ) + + extension_path = Path(mkdtemp()) / 'apify_proxy.zip' + with ZipFile(extension_path, 'w') as archive: + archive.writestr('manifest.json', json.dumps(manifest)) + archive.writestr('background.js', background) + return str(extension_path) + + def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], list[str]]: """Navigate to a page with Selenium, extract its data, and collect its links. @@ -29,6 +88,9 @@ def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], lis data = { 'url': url, 'title': driver.title, + 'h1s': [el.text for el in driver.find_elements(By.TAG_NAME, 'h1')], + 'h2s': [el.text for el in driver.find_elements(By.TAG_NAME, 'h2')], + 'h3s': [el.text for el in driver.find_elements(By.TAG_NAME, 'h3')], } # Collect absolute links found on the page so the caller can enqueue them. @@ -46,8 +108,8 @@ async def main() -> None: async with Actor: # Retrieve the Actor input, and use default values if not provided.
actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) # Exit if no start URLs are provided. if not start_urls: @@ -60,7 +122,7 @@ async def main() -> None: # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) # Launch a new Selenium Chrome WebDriver and configure it. @@ -68,14 +130,25 @@ async def main() -> None: chrome_options = ChromeOptions() if Actor.configuration.headless: - chrome_options.add_argument('--headless') + # The new headless mode is required for the proxy extension to load. + chrome_options.add_argument('--headless=new') chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') + + # Route the browser through Apify Proxy. Selenium applies the proxy at the + # browser level, so the whole run shares a single proxy URL. + proxy_configuration = await Actor.create_proxy_configuration() + if proxy_configuration and (proxy_url := await proxy_configuration.new_url()): + chrome_options.add_extension(proxy_auth_extension(proxy_url)) + chrome_options.add_argument( + '--disable-features=DisableLoadExtensionCommandLineSwitch' + ) + driver = webdriver.Chrome(options=chrome_options) # Test WebDriver setup by navigating to an example page. - driver.get('http://www.example.com') + driver.get('https://example.com') if driver.title != 'Example Domain': raise ValueError('Failed to open example page.') @@ -94,6 +167,10 @@ async def main() -> None: # Store the extracted data to the default dataset. 
await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) # If we are not too deep yet, enqueue the links we found. if depth < max_depth: diff --git a/docs/03_guides/code/05_crawlee_beautifulsoup.py b/docs/03_guides/code/05_crawlee_beautifulsoup.py index 4d3a81d7..0e6dea2f 100644 --- a/docs/03_guides/code/05_crawlee_beautifulsoup.py +++ b/docs/03_guides/code/05_crawlee_beautifulsoup.py @@ -1,20 +1,19 @@ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.router import Router from apify import Actor -# Create a crawler. -crawler = BeautifulSoupCrawler( - # Limit the crawl to max requests. Remove or increase it for crawling all links. - max_requests_per_crawl=50, -) +# Define the request router up front, so the crawler itself can be created later +# inside `main`, once the Apify Proxy configuration is available. +router = Router[BeautifulSoupCrawlingContext]() # Define a request handler, which will be called for every request. -@crawler.router.default_handler +@router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: - Actor.log.info(f'Scraping {context.request.url}...') + Actor.log.info(f'Scraping {context.request.url} ...') # Extract the desired data. data = { @@ -27,6 +26,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: # Store the extracted data to the default dataset. await context.push_data(data) + Actor.log.info(f'Stored data from {context.request.url} (title={data["title"]!r}).') # Enqueue additional links found on the current page. 
await context.enqueue_links(strategy='same-domain') @@ -39,7 +39,7 @@ async def main() -> None: actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') - for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + for url in actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) ] # Exit if no start URLs are provided. @@ -47,6 +47,21 @@ async def main() -> None: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() + # Create a proxy configuration that routes requests through Apify Proxy. + # Crawlee rotates the proxy URL for every request on its own. + proxy_configuration = await Actor.create_proxy_configuration() + if proxy_configuration is None: + raise RuntimeError('Failed to create the proxy configuration.') + + # Create a crawler that uses the router above and routes requests through + # Apify Proxy. + crawler = BeautifulSoupCrawler( + proxy_configuration=proxy_configuration, + request_handler=router, + # Limit the crawl; remove or increase to follow all links. + max_requests_per_crawl=50, + ) + # Run the crawler with the starting requests. await crawler.run(start_urls) diff --git a/docs/03_guides/code/05_crawlee_parsel.py b/docs/03_guides/code/05_crawlee_parsel.py index 31f39d8b..659f3b4d 100644 --- a/docs/03_guides/code/05_crawlee_parsel.py +++ b/docs/03_guides/code/05_crawlee_parsel.py @@ -1,20 +1,19 @@ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.router import Router from apify import Actor -# Create a crawler. -crawler = ParselCrawler( - # Limit the crawl to max requests. Remove or increase it for crawling all links. - max_requests_per_crawl=50, -) +# Define the request router up front, so the crawler itself can be created later +# inside `main`, once the Apify Proxy configuration is available. +router = Router[ParselCrawlingContext]() # Define a request handler, which will be called for every request. 
-@crawler.router.default_handler +@router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: - Actor.log.info(f'Scraping {context.request.url}...') + Actor.log.info(f'Scraping {context.request.url} ...') # Extract the desired data. data = { @@ -27,6 +26,7 @@ async def request_handler(context: ParselCrawlingContext) -> None: # Store the extracted data to the default dataset. await context.push_data(data) + Actor.log.info(f'Stored data from {context.request.url} (title={data["title"]!r}).') # Enqueue additional links found on the current page. await context.enqueue_links(strategy='same-domain') @@ -39,7 +39,7 @@ async def main() -> None: actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') - for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + for url in actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) ] # Exit if no start URLs are provided. @@ -47,6 +47,21 @@ async def main() -> None: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() + # Create a proxy configuration that routes requests through Apify Proxy. + # Crawlee rotates the proxy URL for every request on its own. + proxy_configuration = await Actor.create_proxy_configuration() + if proxy_configuration is None: + raise RuntimeError('Failed to create the proxy configuration.') + + # Create a crawler that uses the router above and routes requests through + # Apify Proxy. + crawler = ParselCrawler( + proxy_configuration=proxy_configuration, + request_handler=router, + # Limit the crawl; remove or increase to follow all links. + max_requests_per_crawl=50, + ) + # Run the crawler with the starting requests. 
await crawler.run(start_urls) diff --git a/docs/03_guides/code/05_crawlee_playwright.py b/docs/03_guides/code/05_crawlee_playwright.py index be4ea29e..63482f35 100644 --- a/docs/03_guides/code/05_crawlee_playwright.py +++ b/docs/03_guides/code/05_crawlee_playwright.py @@ -1,23 +1,19 @@ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.router import Router from apify import Actor -# Create a crawler. -crawler = PlaywrightCrawler( - # Limit the crawl to max requests. Remove or increase it for crawling all links. - max_requests_per_crawl=50, - # Run the browser in a headless mode. - headless=True, - browser_launch_options={'args': ['--disable-gpu']}, -) +# Define the request router up front, so the crawler itself can be created later +# inside `main`, once the Apify Proxy configuration is available. +router = Router[PlaywrightCrawlingContext]() # Define a request handler, which will be called for every request. -@crawler.router.default_handler +@router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: - Actor.log.info(f'Scraping {context.request.url}...') + Actor.log.info(f'Scraping {context.request.url} ...') # Extract the desired data. data = { @@ -30,6 +26,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: # Store the extracted data to the default dataset. await context.push_data(data) + Actor.log.info(f'Stored data from {context.request.url} (title={data["title"]!r}).') # Enqueue additional links found on the current page. await context.enqueue_links(strategy='same-domain') @@ -42,7 +39,7 @@ async def main() -> None: actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') - for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + for url in actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) ] # Exit if no start URLs are provided. 
@@ -50,6 +47,24 @@ async def main() -> None: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() + # Create a proxy configuration that routes requests through Apify Proxy. + # Crawlee rotates the proxy URL for every request on its own. + proxy_configuration = await Actor.create_proxy_configuration() + if proxy_configuration is None: + raise RuntimeError('Failed to create the proxy configuration.') + + # Create a crawler that uses the router above and routes requests through + # Apify Proxy. + crawler = PlaywrightCrawler( + proxy_configuration=proxy_configuration, + request_handler=router, + # Limit the crawl; remove or increase to follow all links. + max_requests_per_crawl=50, + # Run the browser in a headless mode. + headless=True, + browser_launch_options={'args': ['--disable-gpu']}, + ) + # Run the crawler with the starting requests. await crawler.run(start_urls) diff --git a/docs/03_guides/code/07_scrapling.py b/docs/03_guides/code/07_scrapling.py index a817c09b..b266a426 100644 --- a/docs/03_guides/code/07_scrapling.py +++ b/docs/03_guides/code/07_scrapling.py @@ -53,8 +53,8 @@ async def main() -> None: async with Actor: # Retrieve the Actor input, and use default values if not provided. actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) # Exit if no start URLs are provided. if not start_urls: @@ -70,7 +70,7 @@ async def main() -> None: # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) # Process the URLs from the request queue. 
@@ -92,6 +92,10 @@ async def main() -> None: # Store the extracted data to the default dataset. await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) # If we are not too deep yet, enqueue the links we found one # level deeper than the current page. diff --git a/docs/03_guides/code/08_crawl4ai.py b/docs/03_guides/code/08_crawl4ai.py index c6813f6b..71d4318c 100644 --- a/docs/03_guides/code/08_crawl4ai.py +++ b/docs/03_guides/code/08_crawl4ai.py @@ -55,8 +55,8 @@ async def main() -> None: async with Actor: # Retrieve the Actor input, and use default values if not provided. actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) # Exit if no start URLs are provided. if not start_urls: @@ -72,7 +72,7 @@ async def main() -> None: # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) # Configure the headless browser that Crawl4AI drives. @@ -99,11 +99,16 @@ async def main() -> None: # Store the extracted data to the default dataset. await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) # If we are not too deep yet, enqueue the links we found one # level deeper than the current page. 
if depth < max_depth: for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') new_request = Request.from_url(link_url) new_request.crawl_depth = depth + 1 await request_queue.add_request(new_request) diff --git a/docs/03_guides/code/09_browser_use.py b/docs/03_guides/code/09_browser_use.py index 2a4e338a..29dc98eb 100644 --- a/docs/03_guides/code/09_browser_use.py +++ b/docs/03_guides/code/09_browser_use.py @@ -84,7 +84,7 @@ async def main() -> None: actor_input = await Actor.get_input() or {} task = actor_input.get('task', DEFAULT_TASK) model = actor_input.get('model', 'gpt-4.1-mini') - max_steps = actor_input.get('max_steps', 25) + max_steps = actor_input.get('maxSteps', 25) # Read the LLM API key from the environment so it is never stored in the Actor # input. On the Apify platform, set it as a secret environment variable. @@ -113,7 +113,9 @@ async def main() -> None: return # Store every extracted item as a separate row in the default dataset. + Actor.log.info(f'The agent returned {len(result.posts)} post(s); storing them.') for post in result.posts: + Actor.log.info(f'Storing post: {post.title!r} ({post.url})') await Actor.push_data(post.model_dump()) diff --git a/docs/03_guides/code/12_webserver.py b/docs/03_guides/code/12_webserver.py index 66ecfe3c..aef8e869 100644 --- a/docs/03_guides/code/12_webserver.py +++ b/docs/03_guides/code/12_webserver.py @@ -10,7 +10,7 @@ class RequestHandler(BaseHTTPRequestHandler): """A handler that prints the number of processed items on every GET request.""" - def do_get(self) -> None: + def do_GET(self) -> None: self.log_request() self.send_response(200) self.end_headers() diff --git a/docs/03_guides/code/12_webserver_fastapi.py b/docs/03_guides/code/12_webserver_fastapi.py new file mode 100644 index 00000000..1b1e6240 --- /dev/null +++ b/docs/03_guides/code/12_webserver_fastapi.py @@ -0,0 +1,49 @@ +import asyncio + +import uvicorn +from fastapi import FastAPI + +from apify import Actor + +# A module-level 
counter that the web server reports and the Actor keeps updating. +processed_items = 0 + +# The FastAPI application with a single endpoint. +app = FastAPI() + + +@app.get('/') +async def index() -> dict[str, int]: + """Respond to every GET request with the number of processed items.""" + return {'processed_items': processed_items} + + +async def main() -> None: + global processed_items + async with Actor: + # Serve the FastAPI app with uvicorn on the platform's web server port. + # Binding to 0.0.0.0 makes it reachable through the Actor's container URL. + config = uvicorn.Config( + app, + host='0.0.0.0', # noqa: S104 + port=Actor.configuration.web_server_port, + ) + server = uvicorn.Server(config) + + # Run the server in the background while the Actor does its work. + server_task = asyncio.create_task(server.serve()) + Actor.log.info(f'Server running at {Actor.configuration.web_server_url}') + + # Simulate doing some work, updating the counter the endpoint reports. + for _ in range(100): + await asyncio.sleep(1) + processed_items += 1 + Actor.log.info(f'Processed items: {processed_items}') + + # Signal the server to shut down, and wait for it to finish. 
+ server.should_exit = True + await server_task + + +if __name__ == '__main__': + asyncio.run(main()) From a03d26f5e832732d7de38a527db4b9c7bdc9ff4d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 19:36:54 +0200 Subject: [PATCH 18/24] docs: keep scraping examples on-site, cap crawls, and align browser flags --- docs/03_guides/01_beautifulsoup_httpx.mdx | 2 +- docs/03_guides/02_parsel_impit.mdx | 2 +- docs/03_guides/03_playwright.mdx | 2 +- docs/03_guides/04_selenium.mdx | 2 +- docs/03_guides/07_scrapling.mdx | 2 +- docs/03_guides/code/01_beautifulsoup_httpx.py | 20 ++++++++--- docs/03_guides/code/02_parsel_impit.py | 20 ++++++++--- docs/03_guides/code/03_playwright.py | 20 ++++++++--- docs/03_guides/code/04_selenium.py | 33 ++++++++++++------- docs/03_guides/code/05_crawlee_playwright.py | 5 ++- docs/03_guides/code/07_scrapling.py | 22 ++++++++++--- docs/03_guides/code/08_crawl4ai.py | 11 +++++-- docs/03_guides/code/09_browser_use.py | 4 +-- 13 files changed, 103 insertions(+), 42 deletions(-) diff --git a/docs/03_guides/01_beautifulsoup_httpx.mdx b/docs/03_guides/01_beautifulsoup_httpx.mdx index f7b4f797..ed44d5f8 100644 --- a/docs/03_guides/01_beautifulsoup_httpx.mdx +++ b/docs/03_guides/01_beautifulsoup_httpx.mdx @@ -20,7 +20,7 @@ To create an Actor which uses those libraries, start from the [BeautifulSoup & P ## Example Actor -Below is a simple Actor that recursively scrapes titles from all linked websites, up to a specified maximum depth, starting from URLs provided in the Actor input. It uses [HTTPX](https://www.python-httpx.org/) for fetching pages through [Apify Proxy](https://docs.apify.com/platform/proxy) and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing their content to extract the title, headings, and links to other pages. +Below is a simple Actor that recursively scrapes data from linked pages on the same site, up to a specified maximum depth, starting from URLs provided in the Actor input. 
It uses [HTTPX](https://www.python-httpx.org/) for fetching pages through [Apify Proxy](https://docs.apify.com/platform/proxy) and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing their content to extract the title, headings, and links to other pages. {BeautifulSoupHttpxExample} diff --git a/docs/03_guides/02_parsel_impit.mdx b/docs/03_guides/02_parsel_impit.mdx index 3511292f..da427a04 100644 --- a/docs/03_guides/02_parsel_impit.mdx +++ b/docs/03_guides/02_parsel_impit.mdx @@ -18,7 +18,7 @@ In this guide, you'll learn how to combine the [Parsel](https://github.com/scrap ## Example Actor -The following example shows a simple Actor that recursively scrapes titles from linked pages, up to a user-defined maximum depth. It uses [Impit](https://github.com/apify/impit) to fetch pages through [Apify Proxy](https://docs.apify.com/platform/proxy) and [Parsel](https://github.com/scrapy/parsel) to extract the title, headings, and links. +The following example shows a simple Actor that recursively scrapes data from linked pages on the same site, up to a user-defined maximum depth. It uses [Impit](https://github.com/apify/impit) to fetch pages through [Apify Proxy](https://docs.apify.com/platform/proxy) and [Parsel](https://github.com/scrapy/parsel) to extract the title, headings, and links. {ParselImpitExample} diff --git a/docs/03_guides/03_playwright.mdx b/docs/03_guides/03_playwright.mdx index a497bdd3..3d42ec63 100644 --- a/docs/03_guides/03_playwright.mdx +++ b/docs/03_guides/03_playwright.mdx @@ -48,7 +48,7 @@ playwright install --with-deps` ## Example Actor -This is a simple Actor that recursively scrapes titles from all linked websites, up to a maximum depth, starting from URLs in the Actor input. +This is a simple Actor that recursively scrapes data from linked pages on the same site, up to a maximum depth, starting from URLs in the Actor input. 
It uses Playwright to open the pages in an automated Chrome browser, and to extract the title, headings, and links after the pages load. diff --git a/docs/03_guides/04_selenium.mdx b/docs/03_guides/04_selenium.mdx index ea1076ea..faed6296 100644 --- a/docs/03_guides/04_selenium.mdx +++ b/docs/03_guides/04_selenium.mdx @@ -32,7 +32,7 @@ Refer to the [Selenium documentation](https://www.selenium.dev/documentation/web ## Example Actor -This is a simple Actor that recursively scrapes titles from all linked websites, up to a maximum depth, starting from URLs in the Actor input. +This is a simple Actor that recursively scrapes data from linked pages on the same site, up to a maximum depth, starting from URLs in the Actor input. It uses Selenium ChromeDriver to open the pages in an automated Chrome browser, and to extract the title, headings, and links after the pages load. diff --git a/docs/03_guides/07_scrapling.mdx b/docs/03_guides/07_scrapling.mdx index 3495a824..e8da41e1 100644 --- a/docs/03_guides/07_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -46,7 +46,7 @@ The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simpl ## Example Actor -The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. +The following Actor recursively scrapes data from linked pages on the same site, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. The whole Actor fits in a single file. 
A `scrape_page` helper holds the Scrapling-specific fetching and parsing, while the `main` coroutine handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl: diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index 45633da4..b683e771 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -1,6 +1,6 @@ import asyncio from typing import Any -from urllib.parse import urljoin +from urllib.parse import urljoin, urlsplit import httpx from bs4 import BeautifulSoup @@ -37,11 +37,14 @@ async def scrape_page( 'h3s': [h3.text for h3 in soup.find_all('h3')], } - # Collect absolute links found on the page so the caller can enqueue them. + # Collect absolute links on the same host so the crawl stays on this site. links: list[str] = [] + host = urlsplit(url).netloc for link in soup.find_all('a'): link_url = urljoin(url, link.get('href')) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links @@ -72,8 +75,15 @@ async def main() -> None: Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): + # Limit the crawl; raise or remove the cap to follow more pages. + max_requests = 50 + handled_requests = 0 + + # Process the URLs from the request queue, up to the request limit. + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 url = request.url # Read the crawl depth tracked by the request itself. 
diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py index 30ea6428..5348344d 100644 --- a/docs/03_guides/code/02_parsel_impit.py +++ b/docs/03_guides/code/02_parsel_impit.py @@ -1,6 +1,6 @@ import asyncio from typing import Any -from urllib.parse import urljoin +from urllib.parse import urljoin, urlsplit import impit import parsel @@ -37,11 +37,14 @@ async def scrape_page( 'h3s': selector.css('h3::text').getall(), } - # Collect absolute links found on the page so the caller can enqueue them. + # Collect absolute links on the same host so the crawl stays on this site. links: list[str] = [] + host = urlsplit(url).netloc for link_href in selector.css('a::attr(href)').getall(): link_url = urljoin(url, link_href) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links @@ -72,8 +75,15 @@ async def main() -> None: Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): + # Limit the crawl; raise or remove the cap to follow more pages. + max_requests = 50 + handled_requests = 0 + + # Process the URLs from the request queue, up to the request limit. + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 url = request.url # Read the crawl depth tracked by the request itself. diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index 4175cbc2..c03ee91a 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -49,12 +49,15 @@ async def scrape_page( 'h3s': [await h3.text_content() for h3 in await page.locator('h3').all()], } - # Collect absolute links found on the page so the caller can enqueue them. 
+ # Collect absolute links on the same host so the crawl stays on this site. links: list[str] = [] + host = urlsplit(url).netloc for link in await page.locator('a').all(): link_href = await link.get_attribute('href') link_url = urljoin(url, link_href) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links @@ -91,6 +94,10 @@ async def main() -> None: Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) + # Limit the crawl; raise or remove the cap to follow more pages. + max_requests = 50 + handled_requests = 0 + Actor.log.info('Launching Playwright...') # Launch Playwright and open a new browser context. @@ -99,12 +106,15 @@ async def main() -> None: browser = await playwright.chromium.launch( headless=Actor.configuration.headless, proxy=to_playwright_proxy(proxy_url) if proxy_url else None, - args=['--disable-gpu'], + args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'], ) context = await browser.new_context() - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): + # Process the URLs from the request queue, up to the request limit. + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 url = request.url # Read the crawl depth tracked by the request itself. diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index 3f28db24..1f212503 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -93,11 +93,14 @@ def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], lis 'h3s': [el.text for el in driver.find_elements(By.TAG_NAME, 'h3')], } - # Collect absolute links found on the page so the caller can enqueue them. 
+ # Collect absolute links on the same host so the crawl stays on this site. links: list[str] = [] + host = urlsplit(url).netloc for link in driver.find_elements(By.TAG_NAME, 'a'): link_url = urljoin(url, link.get_attribute('href')) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links @@ -116,6 +119,11 @@ async def main() -> None: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() + # Create a proxy configuration that routes the browser through Apify Proxy. + # Selenium applies the proxy at the browser level, so the whole run shares + # a single proxy URL. + proxy_configuration = await Actor.create_proxy_configuration() + # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() @@ -125,6 +133,10 @@ async def main() -> None: Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) + # Limit the crawl; raise or remove the cap to follow more pages. + max_requests = 50 + handled_requests = 0 + # Launch a new Selenium Chrome WebDriver and configure it. Actor.log.info('Launching Chrome WebDriver...') chrome_options = ChromeOptions() @@ -135,10 +147,9 @@ async def main() -> None: chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-gpu') - # Route the browser through Apify Proxy. Selenium applies the proxy at the - # browser level, so the whole run shares a single proxy URL. - proxy_configuration = await Actor.create_proxy_configuration() + # Route the browser through Apify Proxy via an authentication extension. 
if proxy_configuration and (proxy_url := await proxy_configuration.new_url()): chrome_options.add_extension(proxy_auth_extension(proxy_url)) chrome_options.add_argument( @@ -147,13 +158,11 @@ async def main() -> None: driver = webdriver.Chrome(options=chrome_options) - # Test WebDriver setup by navigating to an example page. - driver.get('https://example.com') - if driver.title != 'Example Domain': - raise ValueError('Failed to open example page.') - - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): + # Process the URLs from the request queue, up to the request limit. + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 url = request.url # Read the crawl depth tracked by the request itself. diff --git a/docs/03_guides/code/05_crawlee_playwright.py b/docs/03_guides/code/05_crawlee_playwright.py index 63482f35..93e9162a 100644 --- a/docs/03_guides/code/05_crawlee_playwright.py +++ b/docs/03_guides/code/05_crawlee_playwright.py @@ -53,6 +53,9 @@ async def main() -> None: if proxy_configuration is None: raise RuntimeError('Failed to create the proxy configuration.') + # Common Chrome flags for running the browser in a container. + browser_args = ['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'] + # Create a crawler that uses the router above and routes requests through # Apify Proxy. crawler = PlaywrightCrawler( @@ -62,7 +65,7 @@ async def main() -> None: max_requests_per_crawl=50, # Run the browser in a headless mode. headless=True, - browser_launch_options={'args': ['--disable-gpu']}, + browser_launch_options={'args': browser_args}, ) # Run the crawler with the starting requests. 
diff --git a/docs/03_guides/code/07_scrapling.py b/docs/03_guides/code/07_scrapling.py index b266a426..8771d61b 100644 --- a/docs/03_guides/code/07_scrapling.py +++ b/docs/03_guides/code/07_scrapling.py @@ -1,5 +1,6 @@ import asyncio from typing import Any +from urllib.parse import urlsplit from scrapling.fetchers import AsyncFetcher @@ -37,12 +38,16 @@ async def scrape_page( 'h3s': response.css('h3::text').getall(), } - # Collect absolute links from the page. The `::attr(href)` pseudo-selector - # reads the attribute and `response.urljoin` resolves it against the page URL. + # Collect absolute links on the same host so the crawl stays on this site. + # The `::attr(href)` selector reads the attribute and `response.urljoin` + # resolves it against the page URL. links: list[str] = [] + host = urlsplit(url).netloc for href in response.css('a::attr(href)').getall(): link_url = response.urljoin(href) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links @@ -73,8 +78,15 @@ async def main() -> None: Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): + # Limit the crawl; raise or remove the cap to follow more pages. + max_requests = 50 + handled_requests = 0 + + # Process the URLs from the request queue, up to the request limit. + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 url = request.url # Read the crawl depth tracked by the request itself. 
diff --git a/docs/03_guides/code/08_crawl4ai.py b/docs/03_guides/code/08_crawl4ai.py index 71d4318c..88491a46 100644 --- a/docs/03_guides/code/08_crawl4ai.py +++ b/docs/03_guides/code/08_crawl4ai.py @@ -75,13 +75,20 @@ async def main() -> None: Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) + # Limit the crawl; raise or remove the cap to follow more pages. + max_requests = 50 + handled_requests = 0 + # Configure the headless browser that Crawl4AI drives. browser_config = BrowserConfig(headless=True) # Open a single browser-backed crawler and reuse it for every request. async with AsyncWebCrawler(config=browser_config) as crawler: - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): + # Process the URLs from the request queue, up to the request limit. + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 url = request.url # Read the crawl depth tracked by the request itself. diff --git a/docs/03_guides/code/09_browser_use.py b/docs/03_guides/code/09_browser_use.py index 29dc98eb..673eca07 100644 --- a/docs/03_guides/code/09_browser_use.py +++ b/docs/03_guides/code/09_browser_use.py @@ -28,7 +28,7 @@ class Posts(BaseModel): posts: list[Post] -def _proxy_settings(proxy_url: str) -> ProxySettings: +def to_browser_use_proxy(proxy_url: str) -> ProxySettings: """Convert an Apify Proxy URL into Browser Use `ProxySettings`.""" parts = urlsplit(proxy_url) return ProxySettings( @@ -61,7 +61,7 @@ async def run_agent_task( # Configure the browser. When a proxy URL is provided, route the browser through it. browser = Browser( headless=headless, - proxy=_proxy_settings(proxy_url) if proxy_url else None, + proxy=to_browser_use_proxy(proxy_url) if proxy_url else None, ) # Create the agent and run it for at most `max_steps` steps. 
From f5a80b9390d0ae49f5b3c161946fea0068727a38 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 19:51:12 +0200 Subject: [PATCH 19/24] docs: extract enqueue_links and Chrome driver helpers to flatten scraping examples --- docs/03_guides/code/01_beautifulsoup_httpx.py | 34 ++++++-- docs/03_guides/code/02_parsel_impit.py | 34 ++++++-- docs/03_guides/code/03_playwright.py | 34 ++++++-- docs/03_guides/code/04_selenium.py | 87 +++++++++++++------ docs/03_guides/code/07_scrapling.py | 35 ++++++-- docs/03_guides/code/08_crawl4ai.py | 35 ++++++-- 6 files changed, 196 insertions(+), 63 deletions(-) diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index b683e771..cfe7ee75 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -6,6 +6,7 @@ from bs4 import BeautifulSoup from apify import Actor, Request +from apify.storages import RequestQueue async def scrape_page( @@ -50,6 +51,28 @@ async def scrape_page( return data, links +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the given links one level deeper than the current page. + + Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl + bounded to the requested depth. + """ + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -106,13 +129,10 @@ async def main() -> None: f'(title={data["title"]!r}, {len(links)} links found).' ) - # If we are not too deep yet, enqueue the links we found. 
- if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) + # Enqueue the links found on the page, one level deeper. + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) except Exception: Actor.log.exception(f'Cannot extract data from {url}.') diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py index 5348344d..f905c659 100644 --- a/docs/03_guides/code/02_parsel_impit.py +++ b/docs/03_guides/code/02_parsel_impit.py @@ -6,6 +6,7 @@ import parsel from apify import Actor, Request +from apify.storages import RequestQueue async def scrape_page( @@ -50,6 +51,28 @@ async def scrape_page( return data, links +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the given links one level deeper than the current page. + + Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl + bounded to the requested depth. + """ + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -106,13 +129,10 @@ async def main() -> None: f'(title={data["title"]!r}, {len(links)} links found).' ) - # If we are not too deep yet, enqueue the links we found. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) + # Enqueue the links found on the page, one level deeper. 
+ await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) except Exception: Actor.log.exception(f'Cannot extract data from {url}.') diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index c03ee91a..7ac536ec 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -5,6 +5,7 @@ from playwright.async_api import BrowserContext, async_playwright from apify import Actor, Request +from apify.storages import RequestQueue # Note: To run this Actor locally, ensure that Playwright browsers are installed. # Run `playwright install --with-deps` in the Actor's virtual environment to install them. @@ -66,6 +67,28 @@ async def scrape_page( await page.close() +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the given links one level deeper than the current page. + + Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl + bounded to the requested depth. + """ + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -132,13 +155,10 @@ async def main() -> None: f'(title={data["title"]!r}, {len(links)} links found).' ) - # If we are not too deep yet, enqueue the links we found. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) + # Enqueue the links found on the page, one level deeper. 
+ await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) except Exception: Actor.log.exception(f'Cannot extract data from {url}.') diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index 1f212503..37b8303c 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -11,6 +11,7 @@ from selenium.webdriver.common.by import By from apify import Actor, Request +from apify.storages import RequestQueue # To run this Actor locally, you need to have the Selenium Chromedriver installed. # Follow the installation guide at: @@ -74,6 +75,33 @@ def proxy_auth_extension(proxy_url: str) -> str: return str(extension_path) +def build_chrome_driver(proxy_url: str | None = None) -> webdriver.Chrome: + """Create a headless Chrome WebDriver, optionally routed through a proxy. + + When a proxy URL is given, the browser is configured with a small + authentication extension (see `proxy_auth_extension`), because Chrome ignores + the credentials passed via the `--proxy-server` flag. + """ + chrome_options = ChromeOptions() + + if Actor.configuration.headless: + # The new headless mode is required for the proxy extension to load. + chrome_options.add_argument('--headless=new') + + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-gpu') + + # Route the browser through Apify Proxy via an authentication extension. + if proxy_url: + chrome_options.add_extension(proxy_auth_extension(proxy_url)) + chrome_options.add_argument( + '--disable-features=DisableLoadExtensionCommandLineSwitch' + ) + + return webdriver.Chrome(options=chrome_options) + + def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], list[str]]: """Navigate to a page with Selenium, extract its data, and collect its links. 
@@ -106,6 +134,28 @@ def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], lis return data, links +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the given links one level deeper than the current page. + + Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl + bounded to the requested depth. + """ + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -137,26 +187,14 @@ async def main() -> None: max_requests = 50 handled_requests = 0 - # Launch a new Selenium Chrome WebDriver and configure it. - Actor.log.info('Launching Chrome WebDriver...') - chrome_options = ChromeOptions() - - if Actor.configuration.headless: - # The new headless mode is required for the proxy extension to load. - chrome_options.add_argument('--headless=new') + # Get a proxy URL to route the browser through (None if no proxy set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() - chrome_options.add_argument('--no-sandbox') - chrome_options.add_argument('--disable-dev-shm-usage') - chrome_options.add_argument('--disable-gpu') - - # Route the browser through Apify Proxy via an authentication extension. - if proxy_configuration and (proxy_url := await proxy_configuration.new_url()): - chrome_options.add_extension(proxy_auth_extension(proxy_url)) - chrome_options.add_argument( - '--disable-features=DisableLoadExtensionCommandLineSwitch' - ) - - driver = webdriver.Chrome(options=chrome_options) + # Launch and configure a Selenium Chrome WebDriver. 
+ Actor.log.info('Launching Chrome WebDriver...') + driver = build_chrome_driver(proxy_url) # Process the URLs from the request queue, up to the request limit. while handled_requests < max_requests and ( @@ -181,13 +219,10 @@ async def main() -> None: f'(title={data["title"]!r}, {len(links)} links found).' ) - # If we are not too deep yet, enqueue the links we found. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) + # Enqueue the links found on the page, one level deeper. + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) except Exception: Actor.log.exception(f'Cannot extract data from {url}.') diff --git a/docs/03_guides/code/07_scrapling.py b/docs/03_guides/code/07_scrapling.py index 8771d61b..1faa96ea 100644 --- a/docs/03_guides/code/07_scrapling.py +++ b/docs/03_guides/code/07_scrapling.py @@ -5,6 +5,7 @@ from scrapling.fetchers import AsyncFetcher from apify import Actor, Request +from apify.storages import RequestQueue async def scrape_page( @@ -53,6 +54,28 @@ async def scrape_page( return data, links +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the given links one level deeper than the current page. + + Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl + bounded to the requested depth. + """ + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -109,14 +132,10 @@ async def main() -> None: f'(title={data["title"]!r}, {len(links)} links found).' 
) - # If we are not too deep yet, enqueue the links we found one - # level deeper than the current page. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) + # Enqueue the links found on the page, one level deeper. + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) except Exception: Actor.log.exception(f'Cannot extract data from {url}.') diff --git a/docs/03_guides/code/08_crawl4ai.py b/docs/03_guides/code/08_crawl4ai.py index 88491a46..c43ad4f0 100644 --- a/docs/03_guides/code/08_crawl4ai.py +++ b/docs/03_guides/code/08_crawl4ai.py @@ -10,6 +10,7 @@ ) from apify import Actor, Request +from apify.storages import RequestQueue async def scrape_page( @@ -50,6 +51,28 @@ async def scrape_page( return data, links +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the given links one level deeper than the current page. + + Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl + bounded to the requested depth. + """ + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -111,14 +134,10 @@ async def main() -> None: f'(title={data["title"]!r}, {len(links)} links found).' ) - # If we are not too deep yet, enqueue the links we found one - # level deeper than the current page. 
- if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) + # Enqueue the links found on the page, one level deeper. + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) except Exception: Actor.log.exception(f'Cannot extract data from {url}.') From 52cad7808934c9ede7d3e4bb7fbde8c9ca0e4e65 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 20:07:01 +0200 Subject: [PATCH 20/24] docs: trim comments and shorten docstrings in guide examples --- docs/03_guides/code/01_beautifulsoup_httpx.py | 44 +++-------- docs/03_guides/code/02_parsel_impit.py | 44 +++-------- docs/03_guides/code/03_playwright.py | 54 +++----------- docs/03_guides/code/04_selenium.py | 73 ++++--------------- .../code/05_crawlee_beautifulsoup.py | 21 ++---- docs/03_guides/code/05_crawlee_parsel.py | 21 ++---- docs/03_guides/code/05_crawlee_playwright.py | 22 ++---- docs/03_guides/code/07_scrapling.py | 45 +++--------- docs/03_guides/code/07_scrapling_browser.py | 14 +--- docs/03_guides/code/08_crawl4ai.py | 45 +++--------- docs/03_guides/code/09_browser_use.py | 28 +++---- docs/03_guides/code/11_pydantic.py | 35 +++------ docs/03_guides/code/11_raw_input.py | 2 +- docs/03_guides/code/12_webserver.py | 4 +- docs/03_guides/code/12_webserver_fastapi.py | 13 ++-- .../code/scrapy_project/src/__main__.py | 2 +- .../03_guides/code/scrapy_project/src/main.py | 6 +- .../code/scrapy_project/src/settings.py | 2 +- .../code/scrapy_project/src/spiders/title.py | 29 ++------ 19 files changed, 121 insertions(+), 383 deletions(-) diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index cfe7ee75..adc03361 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -14,22 +14,13 @@ async def scrape_page( *, 
proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page with HTTPX and extract its data and links. - - A fresh client is created per call, so each request can go through a new - proxy URL and a different IP address. The helper returns the extracted data - together with the links found on the page, so `main` only has to decide what - to store and what to enqueue. - """ - # Fetch the HTTP response from the specified URL using HTTPX, optionally - # routing the request through the provided Apify Proxy URL. + """Fetch a page with HTTPX and return its data and same-site links.""" + # A fresh client per call lets each request use a new proxy URL. async with httpx.AsyncClient(proxy=proxy_url) as client: response = await client.get(url, follow_redirects=True) - # Parse the HTML content using Beautiful Soup. soup = BeautifulSoup(response.content, 'html.parser') - # Extract the desired data. data = { 'url': url, 'title': soup.title.string if soup.title else None, @@ -38,7 +29,7 @@ async def scrape_page( 'h3s': [h3.text for h3 in soup.find_all('h3')], } - # Collect absolute links on the same host so the crawl stays on this site. + # Keep only absolute links on the same host. links: list[str] = [] host = urlsplit(url).netloc for link in soup.find_all('a'): @@ -58,11 +49,7 @@ async def enqueue_links( depth: int, max_depth: int, ) -> None: - """Enqueue the given links one level deeper than the current page. - - Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl - bounded to the requested depth. - """ + """Enqueue the links one level deeper, unless max_depth was reached.""" if depth >= max_depth: return @@ -74,62 +61,50 @@ async def enqueue_links( async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. 
actor_input = await Actor.get_input() or {} start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a proxy configuration that routes requests through Apify Proxy. + # Set up Apify Proxy and the request queue. proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs. Their crawl depth defaults to 0. + # Enqueue the start URLs (crawl depth defaults to 0). for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Limit the crawl; raise or remove the cap to follow more pages. + # Cap the crawl; raise or remove to follow more pages. max_requests = 50 handled_requests = 0 - # Process the URLs from the request queue, up to the request limit. while handled_requests < max_requests and ( request := await request_queue.fetch_next_request() ): handled_requests += 1 url = request.url - - # Read the crawl depth tracked by the request itself. depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Get a fresh proxy URL for each request (None if no proxy set up). + # Fresh proxy URL per request (None if no proxy). proxy_url = None if proxy_configuration: proxy_url = await proxy_configuration.new_url() - # Fetch the page and extract its data and nested links. data, links = await scrape_page(url, proxy_url=proxy_url) - - # Store the extracted data to the default dataset. await Actor.push_data(data) Actor.log.info( f'Stored data from {url} ' f'(title={data["title"]!r}, {len(links)} links found).' ) - - # Enqueue the links found on the page, one level deeper. 
await enqueue_links( request_queue, links, depth=depth, max_depth=max_depth ) @@ -138,7 +113,6 @@ async def main() -> None: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py index f905c659..c937f48e 100644 --- a/docs/03_guides/code/02_parsel_impit.py +++ b/docs/03_guides/code/02_parsel_impit.py @@ -14,22 +14,13 @@ async def scrape_page( *, proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page with Impit and extract its data and links. - - A fresh client is created per call, so each request can go through a new - proxy URL and a different IP address. The helper returns the extracted data - together with the links found on the page, so `main` only has to decide what - to store and what to enqueue. - """ - # Fetch the HTTP response from the specified URL using Impit, optionally - # routing the request through the provided Apify Proxy URL. + """Fetch a page with Impit and return its data and same-site links.""" + # A fresh client per call lets each request use a new proxy URL. async with impit.AsyncClient(proxy=proxy_url) as client: response = await client.get(url) - # Parse the HTML content using a Parsel selector. selector = parsel.Selector(text=response.text) - # Extract the desired data using Parsel selectors. data = { 'url': url, 'title': selector.css('title::text').get(), @@ -38,7 +29,7 @@ async def scrape_page( 'h3s': selector.css('h3::text').getall(), } - # Collect absolute links on the same host so the crawl stays on this site. + # Keep only absolute links on the same host. 
links: list[str] = [] host = urlsplit(url).netloc for link_href in selector.css('a::attr(href)').getall(): @@ -58,11 +49,7 @@ async def enqueue_links( depth: int, max_depth: int, ) -> None: - """Enqueue the given links one level deeper than the current page. - - Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl - bounded to the requested depth. - """ + """Enqueue the links one level deeper, unless max_depth was reached.""" if depth >= max_depth: return @@ -74,62 +61,50 @@ async def enqueue_links( async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a proxy configuration that routes requests through Apify Proxy. + # Set up Apify Proxy and the request queue. proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs. Their crawl depth defaults to 0. + # Enqueue the start URLs (crawl depth defaults to 0). for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Limit the crawl; raise or remove the cap to follow more pages. + # Cap the crawl; raise or remove to follow more pages. max_requests = 50 handled_requests = 0 - # Process the URLs from the request queue, up to the request limit. 
while handled_requests < max_requests and ( request := await request_queue.fetch_next_request() ): handled_requests += 1 url = request.url - - # Read the crawl depth tracked by the request itself. depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Get a fresh proxy URL for each request (None if no proxy set up). + # Fresh proxy URL per request (None if no proxy). proxy_url = None if proxy_configuration: proxy_url = await proxy_configuration.new_url() - # Fetch the page and extract its data and nested links. data, links = await scrape_page(url, proxy_url=proxy_url) - - # Store the extracted data to the default dataset. await Actor.push_data(data) Actor.log.info( f'Stored data from {url} ' f'(title={data["title"]!r}, {len(links)} links found).' ) - - # Enqueue the links found on the page, one level deeper. await enqueue_links( request_queue, links, depth=depth, max_depth=max_depth ) @@ -138,7 +113,6 @@ async def main() -> None: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index 7ac536ec..46c89867 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -7,19 +7,12 @@ from apify import Actor, Request from apify.storages import RequestQueue -# Note: To run this Actor locally, ensure that Playwright browsers are installed. -# Run `playwright install --with-deps` in the Actor's virtual environment to install them. -# When running on the Apify platform, these dependencies are already included -# in the Actor's Docker image. +# To run locally, install the browsers first: `playwright install --with-deps`. +# On the Apify platform they are already in the Actor's Docker image. 
def to_playwright_proxy(proxy_url: str) -> dict[str, str]: - """Convert an Apify Proxy URL into Playwright proxy settings. - - Playwright wants the proxy as a `server` URL with the credentials in separate - `username` and `password` fields, so the single URL returned by - `ProxyConfiguration.new_url` has to be split into its parts. - """ + """Split an Apify Proxy URL into Playwright's server/username/password.""" parts = urlsplit(proxy_url) return { 'server': f'{parts.scheme}://{parts.hostname}:{parts.port}', @@ -31,17 +24,11 @@ def to_playwright_proxy(proxy_url: str) -> dict[str, str]: async def scrape_page( context: BrowserContext, url: str ) -> tuple[dict[str, Any], list[str]]: - """Open a page in the browser, extract its data, and collect its links. - - Keeping the page handling in this helper keeps the Actor's main loop shallow. - It returns the extracted data together with the links found on the page, so - `main` only has to decide what to store and what to enqueue. - """ + """Open the URL in a new page and return its data and same-site links.""" page = await context.new_page() try: await page.goto(url) - # Extract the desired data. data = { 'url': url, 'title': await page.title(), @@ -50,7 +37,7 @@ async def scrape_page( 'h3s': [await h3.text_content() for h3 in await page.locator('h3').all()], } - # Collect absolute links on the same host so the crawl stays on this site. + # Keep only absolute links on the same host. links: list[str] = [] host = urlsplit(url).netloc for link in await page.locator('a').all(): @@ -74,11 +61,7 @@ async def enqueue_links( depth: int, max_depth: int, ) -> None: - """Enqueue the given links one level deeper than the current page. - - Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl - bounded to the requested depth. 
- """ + """Enqueue the links one level deeper, unless max_depth was reached.""" if depth >= max_depth: return @@ -90,42 +73,34 @@ async def enqueue_links( async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a proxy configuration that routes the browser through Apify Proxy. - # Playwright applies the proxy at the browser level, so the whole run shares - # a single proxy URL rather than rotating it per request. + # Playwright proxies at the browser level, so one URL is shared per run. proxy_configuration = await Actor.create_proxy_configuration() proxy_url = await proxy_configuration.new_url() if proxy_configuration else None - # Open the default request queue for handling URLs to be processed. + # Open the request queue and enqueue the start URLs (crawl depth 0). request_queue = await Actor.open_request_queue() - - # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Limit the crawl; raise or remove the cap to follow more pages. + # Cap the crawl; raise or remove to follow more pages. max_requests = 50 handled_requests = 0 Actor.log.info('Launching Playwright...') - # Launch Playwright and open a new browser context. async with async_playwright() as playwright: - # Configure the browser to launch in headless mode as per Actor configuration. 
browser = await playwright.chromium.launch( headless=Actor.configuration.headless, proxy=to_playwright_proxy(proxy_url) if proxy_url else None, @@ -133,29 +108,21 @@ async def main() -> None: ) context = await browser.new_context() - # Process the URLs from the request queue, up to the request limit. while handled_requests < max_requests and ( request := await request_queue.fetch_next_request() ): handled_requests += 1 url = request.url - - # Read the crawl depth tracked by the request itself. depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Fetch the page and extract its data and nested links. data, links = await scrape_page(context, url) - - # Store the extracted data to the default dataset. await Actor.push_data(data) Actor.log.info( f'Stored data from {url} ' f'(title={data["title"]!r}, {len(links)} links found).' ) - - # Enqueue the links found on the page, one level deeper. await enqueue_links( request_queue, links, depth=depth, max_depth=max_depth ) @@ -164,7 +131,6 @@ async def main() -> None: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index 37b8303c..8bf08817 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -13,22 +13,13 @@ from apify import Actor, Request from apify.storages import RequestQueue -# To run this Actor locally, you need to have the Selenium Chromedriver installed. -# Follow the installation guide at: +# To run locally, install the Selenium Chromedriver: # https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/ -# When running on the Apify platform, the Chromedriver is already included -# in the Actor's Docker image. +# On the Apify platform it is already in the Actor's Docker image. 
def proxy_auth_extension(proxy_url: str) -> str: - """Build a temporary Chrome extension that routes Chrome through a proxy. - - Chrome ignores credentials passed in the `--proxy-server` flag, so an - authenticated proxy such as Apify Proxy has to be configured from inside an - extension: its service worker sets the proxy server and answers the browser's - authentication challenge with the username and password. The function returns - the path to a packed extension ready to be loaded with `add_extension`. - """ + """Build a Chrome extension that routes Chrome through an authenticated proxy.""" parts = urlsplit(proxy_url) manifest = { @@ -41,8 +32,7 @@ def proxy_auth_extension(proxy_url: str) -> str: 'minimum_chrome_version': '108', } - # The service worker sets the proxy server and supplies the credentials when - # Chrome is challenged for authentication. `json.dumps` handles the escaping. + # The service worker sets the proxy and answers the auth challenge. proxy_config = json.dumps( { 'mode': 'fixed_servers', @@ -76,23 +66,17 @@ def proxy_auth_extension(proxy_url: str) -> str: def build_chrome_driver(proxy_url: str | None = None) -> webdriver.Chrome: - """Create a headless Chrome WebDriver, optionally routed through a proxy. - - When a proxy URL is given, the browser is configured with a small - authentication extension (see `proxy_auth_extension`), because Chrome ignores - the credentials passed via the `--proxy-server` flag. - """ + """Create a headless Chrome WebDriver, optionally routed through a proxy.""" chrome_options = ChromeOptions() if Actor.configuration.headless: - # The new headless mode is required for the proxy extension to load. + # The new headless mode is required to load the proxy extension. 
chrome_options.add_argument('--headless=new') chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--disable-gpu') - # Route the browser through Apify Proxy via an authentication extension. if proxy_url: chrome_options.add_extension(proxy_auth_extension(proxy_url)) chrome_options.add_argument( @@ -103,16 +87,9 @@ def build_chrome_driver(proxy_url: str | None = None) -> webdriver.Chrome: def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], list[str]]: - """Navigate to a page with Selenium, extract its data, and collect its links. - - These are blocking WebDriver calls, so the Actor's main loop runs this helper - in a worker thread via `asyncio.to_thread`. It returns the extracted data - together with the links found on the page, so `main` only has to decide what - to store and what to enqueue. - """ + """Navigate to the URL with Selenium and return its data and same-site links.""" driver.get(url) - # Extract the desired data. data = { 'url': url, 'title': driver.title, @@ -121,7 +98,7 @@ def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], lis 'h3s': [el.text for el in driver.find_elements(By.TAG_NAME, 'h3')], } - # Collect absolute links on the same host so the crawl stays on this site. + # Keep only absolute links on the same host. links: list[str] = [] host = urlsplit(url).netloc for link in driver.find_elements(By.TAG_NAME, 'a'): @@ -141,11 +118,7 @@ async def enqueue_links( depth: int, max_depth: int, ) -> None: - """Enqueue the given links one level deeper than the current page. - - Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl - bounded to the requested depth. - """ + """Enqueue the links one level deeper, unless max_depth was reached.""" if depth >= max_depth: return @@ -157,69 +130,54 @@ async def enqueue_links( async def main() -> None: - # Enter the context of the Actor. 
async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a proxy configuration that routes the browser through Apify Proxy. - # Selenium applies the proxy at the browser level, so the whole run shares - # a single proxy URL. + # Selenium proxies at the browser level, so one URL is shared per run. proxy_configuration = await Actor.create_proxy_configuration() - # Open the default request queue for handling URLs to be processed. + # Open the request queue and enqueue the start URLs (crawl depth 0). request_queue = await Actor.open_request_queue() - - # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Limit the crawl; raise or remove the cap to follow more pages. + # Cap the crawl; raise or remove to follow more pages. max_requests = 50 handled_requests = 0 - # Get a proxy URL to route the browser through (None if no proxy set up). + # Fresh proxy URL for the run (None if no proxy). proxy_url = None if proxy_configuration: proxy_url = await proxy_configuration.new_url() - # Launch and configure a Selenium Chrome WebDriver. Actor.log.info('Launching Chrome WebDriver...') driver = build_chrome_driver(proxy_url) - # Process the URLs from the request queue, up to the request limit. while handled_requests < max_requests and ( request := await request_queue.fetch_next_request() ): handled_requests += 1 url = request.url - - # Read the crawl depth tracked by the request itself. 
depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Fetch the page and extract its data and nested links. The blocking - # WebDriver calls run in a worker thread to keep the loop responsive. + # Blocking WebDriver calls run in a worker thread. data, links = await asyncio.to_thread(scrape_page, driver, url) - - # Store the extracted data to the default dataset. await Actor.push_data(data) Actor.log.info( f'Stored data from {url} ' f'(title={data["title"]!r}, {len(links)} links found).' ) - - # Enqueue the links found on the page, one level deeper. await enqueue_links( request_queue, links, depth=depth, max_depth=max_depth ) @@ -228,7 +186,6 @@ async def main() -> None: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) driver.quit() diff --git a/docs/03_guides/code/05_crawlee_beautifulsoup.py b/docs/03_guides/code/05_crawlee_beautifulsoup.py index 0e6dea2f..d3767109 100644 --- a/docs/03_guides/code/05_crawlee_beautifulsoup.py +++ b/docs/03_guides/code/05_crawlee_beautifulsoup.py @@ -5,17 +5,15 @@ from apify import Actor -# Define the request router up front, so the crawler itself can be created later -# inside `main`, once the Apify Proxy configuration is available. +# Define the router up front; the crawler is created later in `main`. router = Router[BeautifulSoupCrawlingContext]() -# Define a request handler, which will be called for every request. +# Handler called for every request. @router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: Actor.log.info(f'Scraping {context.request.url} ...') - # Extract the desired data. 
data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, @@ -24,45 +22,38 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 'h3s': [h3.text for h3 in context.soup.find_all('h3')], } - # Store the extracted data to the default dataset. await context.push_data(data) Actor.log.info(f'Stored data from {context.request.url} (title={data["title"]!r}).') - # Enqueue additional links found on the current page. + # Enqueue links found on the page. await context.enqueue_links(strategy='same-domain') async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') for url in actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) ] - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a proxy configuration that routes requests through Apify Proxy. - # Crawlee rotates the proxy URL for every request on its own. + # Crawlee rotates the proxy URL per request on its own. proxy_configuration = await Actor.create_proxy_configuration() if proxy_configuration is None: raise RuntimeError('Failed to create the proxy configuration.') - # Create a crawler that uses the router above and routes requests through - # Apify Proxy. crawler = BeautifulSoupCrawler( proxy_configuration=proxy_configuration, request_handler=router, - # Limit the crawl; remove or increase to follow all links. + # Cap the crawl; remove or increase to follow all links. max_requests_per_crawl=50, ) - # Run the crawler with the starting requests. 
await crawler.run(start_urls) diff --git a/docs/03_guides/code/05_crawlee_parsel.py b/docs/03_guides/code/05_crawlee_parsel.py index 659f3b4d..32723b00 100644 --- a/docs/03_guides/code/05_crawlee_parsel.py +++ b/docs/03_guides/code/05_crawlee_parsel.py @@ -5,17 +5,15 @@ from apify import Actor -# Define the request router up front, so the crawler itself can be created later -# inside `main`, once the Apify Proxy configuration is available. +# Define the router up front; the crawler is created later in `main`. router = Router[ParselCrawlingContext]() -# Define a request handler, which will be called for every request. +# Handler called for every request. @router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: Actor.log.info(f'Scraping {context.request.url} ...') - # Extract the desired data. data = { 'url': context.request.url, 'title': context.selector.xpath('//title/text()').get(), @@ -24,45 +22,38 @@ async def request_handler(context: ParselCrawlingContext) -> None: 'h3s': context.selector.xpath('//h3/text()').getall(), } - # Store the extracted data to the default dataset. await context.push_data(data) Actor.log.info(f'Stored data from {context.request.url} (title={data["title"]!r}).') - # Enqueue additional links found on the current page. + # Enqueue links found on the page. await context.enqueue_links(strategy='same-domain') async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') for url in actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) ] - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a proxy configuration that routes requests through Apify Proxy. - # Crawlee rotates the proxy URL for every request on its own. 
+ # Crawlee rotates the proxy URL per request on its own. proxy_configuration = await Actor.create_proxy_configuration() if proxy_configuration is None: raise RuntimeError('Failed to create the proxy configuration.') - # Create a crawler that uses the router above and routes requests through - # Apify Proxy. crawler = ParselCrawler( proxy_configuration=proxy_configuration, request_handler=router, - # Limit the crawl; remove or increase to follow all links. + # Cap the crawl; remove or increase to follow all links. max_requests_per_crawl=50, ) - # Run the crawler with the starting requests. await crawler.run(start_urls) diff --git a/docs/03_guides/code/05_crawlee_playwright.py b/docs/03_guides/code/05_crawlee_playwright.py index 93e9162a..56337a31 100644 --- a/docs/03_guides/code/05_crawlee_playwright.py +++ b/docs/03_guides/code/05_crawlee_playwright.py @@ -5,17 +5,15 @@ from apify import Actor -# Define the request router up front, so the crawler itself can be created later -# inside `main`, once the Apify Proxy configuration is available. +# Define the router up front; the crawler is created later in `main`. router = Router[PlaywrightCrawlingContext]() -# Define a request handler, which will be called for every request. +# Handler called for every request. @router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: Actor.log.info(f'Scraping {context.request.url} ...') - # Extract the desired data. data = { 'url': context.request.url, 'title': await context.page.title(), @@ -24,31 +22,27 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()], } - # Store the extracted data to the default dataset. await context.push_data(data) Actor.log.info(f'Stored data from {context.request.url} (title={data["title"]!r}).') - # Enqueue additional links found on the current page. + # Enqueue links found on the page. 
await context.enqueue_links(strategy='same-domain') async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') for url in actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) ] - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a proxy configuration that routes requests through Apify Proxy. - # Crawlee rotates the proxy URL for every request on its own. + # Crawlee rotates the proxy URL per request on its own. proxy_configuration = await Actor.create_proxy_configuration() if proxy_configuration is None: raise RuntimeError('Failed to create the proxy configuration.') @@ -56,19 +50,15 @@ async def main() -> None: # Common Chrome flags for running the browser in a container. browser_args = ['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'] - # Create a crawler that uses the router above and routes requests through - # Apify Proxy. crawler = PlaywrightCrawler( proxy_configuration=proxy_configuration, request_handler=router, - # Limit the crawl; remove or increase to follow all links. + # Cap the crawl; remove or increase to follow all links. max_requests_per_crawl=50, - # Run the browser in a headless mode. headless=True, browser_launch_options={'args': browser_args}, ) - # Run the crawler with the starting requests. await crawler.run(start_urls) diff --git a/docs/03_guides/code/07_scrapling.py b/docs/03_guides/code/07_scrapling.py index 1faa96ea..49aab31b 100644 --- a/docs/03_guides/code/07_scrapling.py +++ b/docs/03_guides/code/07_scrapling.py @@ -13,14 +13,8 @@ async def scrape_page( *, proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page with Scrapling and extract its data and links. 
- - The page is fetched with Scrapling's asynchronous HTTP fetcher. The - `impersonate` and `stealthy_headers` options make the request look like it - comes from a real Chrome browser, which reduces the chance of being blocked. - The returned response is also a Scrapling selector, so it can be queried with - CSS selectors directly. - """ + """Fetch a page with Scrapling's HTTP fetcher and return data and links.""" + # `impersonate` and `stealthy_headers` make the request look like Chrome. response = await AsyncFetcher.get( url, proxy=proxy_url, @@ -29,8 +23,6 @@ async def scrape_page( timeout=60, ) - # Extract the desired data using CSS selectors. The `::text` pseudo-element - # returns the text content of the matched elements. data = { 'url': url, 'title': response.css('title::text').get(), @@ -39,9 +31,7 @@ async def scrape_page( 'h3s': response.css('h3::text').getall(), } - # Collect absolute links on the same host so the crawl stays on this site. - # The `::attr(href)` selector reads the attribute and `response.urljoin` - # resolves it against the page URL. + # Keep only absolute links on the same host. links: list[str] = [] host = urlsplit(url).netloc for href in response.css('a::attr(href)').getall(): @@ -61,11 +51,7 @@ async def enqueue_links( depth: int, max_depth: int, ) -> None: - """Enqueue the given links one level deeper than the current page. - - Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl - bounded to the requested depth. - """ + """Enqueue the links one level deeper, unless max_depth was reached.""" if depth >= max_depth: return @@ -77,62 +63,50 @@ async def enqueue_links( async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. 
actor_input = await Actor.get_input() or {} start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a proxy configuration that routes requests through Apify Proxy. + # Set up Apify Proxy and the request queue. proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs. Their crawl depth defaults to 0. + # Enqueue the start URLs (crawl depth defaults to 0). for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Limit the crawl; raise or remove the cap to follow more pages. + # Cap the crawl; raise or remove to follow more pages. max_requests = 50 handled_requests = 0 - # Process the URLs from the request queue, up to the request limit. while handled_requests < max_requests and ( request := await request_queue.fetch_next_request() ): handled_requests += 1 url = request.url - - # Read the crawl depth tracked by the request itself. depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Get a fresh proxy URL for each request (None if no proxy set up). + # Fresh proxy URL per request (None if no proxy). proxy_url = None if proxy_configuration: proxy_url = await proxy_configuration.new_url() - # Fetch the page and extract its data and nested links. data, links = await scrape_page(url, proxy_url=proxy_url) - - # Store the extracted data to the default dataset. await Actor.push_data(data) Actor.log.info( f'Stored data from {url} ' f'(title={data["title"]!r}, {len(links)} links found).' ) - - # Enqueue the links found on the page, one level deeper. 
await enqueue_links( request_queue, links, depth=depth, max_depth=max_depth ) @@ -141,7 +115,6 @@ async def main() -> None: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/07_scrapling_browser.py b/docs/03_guides/code/07_scrapling_browser.py index d96f2d19..3eb50e24 100644 --- a/docs/03_guides/code/07_scrapling_browser.py +++ b/docs/03_guides/code/07_scrapling_browser.py @@ -8,13 +8,8 @@ async def scrape_page( *, proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page in a real browser and extract its data and links. - - `DynamicFetcher` drives a real browser via Playwright, so it can render - JavaScript-heavy pages. `network_idle` waits until the page stops making - network requests before the HTML is captured. Apart from the fetcher call, - everything else - including the parsing - is identical to the HTTP version. - """ + """Fetch a page in a real browser with Scrapling and return data and links.""" + # `network_idle` waits until the page stops making network requests. response = await DynamicFetcher.async_fetch( url, proxy=proxy_url, @@ -22,8 +17,6 @@ async def scrape_page( network_idle=True, ) - # Extract the desired data using CSS selectors. The `::text` pseudo-element - # returns the text content of the matched elements. data = { 'url': url, 'title': response.css('title::text').get(), @@ -32,8 +25,7 @@ async def scrape_page( 'h3s': response.css('h3::text').getall(), } - # Collect absolute links from the page. The `::attr(href)` pseudo-selector - # reads the attribute and `response.urljoin` resolves it against the page URL. + # Collect absolute links from the page. 
links: list[str] = [] for href in response.css('a::attr(href)').getall(): link_url = response.urljoin(href) diff --git a/docs/03_guides/code/08_crawl4ai.py b/docs/03_guides/code/08_crawl4ai.py index c43ad4f0..1c7884c1 100644 --- a/docs/03_guides/code/08_crawl4ai.py +++ b/docs/03_guides/code/08_crawl4ai.py @@ -19,13 +19,7 @@ async def scrape_page( *, proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Crawl a single page with Crawl4AI and extract its markdown and links. - - The page is rendered in the browser managed by `crawler`, and Crawl4AI turns - the result into clean, LLM-ready markdown. Setting `proxy_config` on the - per-request `CrawlerRunConfig` routes this request through Apify Proxy, so - every page can use a fresh IP address. - """ + """Crawl a page with Crawl4AI and return its markdown and same-site links.""" run_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, proxy_config=ProxyConfig.from_string(proxy_url) if proxy_url else None, @@ -35,16 +29,13 @@ async def scrape_page( if not result.success: raise RuntimeError(result.error_message or f'Failed to crawl {url}') - # `result.markdown` is the rendered page as clean markdown, and - # `result.metadata` carries page-level fields such as the title. data = { 'url': result.url, 'title': (result.metadata or {}).get('title'), 'markdown': str(result.markdown), } - # Crawl4AI already splits links into `internal` (same site) and `external`. - # We follow only the internal ones to keep the crawl on the same website. + # Crawl4AI already classifies links; follow only the internal ones. internal_links = result.links.get('internal', []) links = [link['href'] for link in internal_links if link.get('href')] @@ -58,11 +49,7 @@ async def enqueue_links( depth: int, max_depth: int, ) -> None: - """Enqueue the given links one level deeper than the current page. - - Nothing is enqueued once `depth` reaches `max_depth`, which keeps the crawl - bounded to the requested depth. 
- """ + """Enqueue the links one level deeper, unless max_depth was reached.""" if depth >= max_depth: return @@ -74,67 +61,54 @@ async def enqueue_links( async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Create a proxy configuration that routes requests through Apify Proxy. + # Set up Apify Proxy and the request queue. proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs. Their crawl depth defaults to 0. + # Enqueue the start URLs (crawl depth defaults to 0). for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Limit the crawl; raise or remove the cap to follow more pages. + # Cap the crawl; raise or remove to follow more pages. max_requests = 50 handled_requests = 0 - # Configure the headless browser that Crawl4AI drives. + # Reuse one headless browser-backed crawler for every request. browser_config = BrowserConfig(headless=True) - # Open a single browser-backed crawler and reuse it for every request. async with AsyncWebCrawler(config=browser_config) as crawler: - # Process the URLs from the request queue, up to the request limit. while handled_requests < max_requests and ( request := await request_queue.fetch_next_request() ): handled_requests += 1 url = request.url - - # Read the crawl depth tracked by the request itself. 
depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Get a fresh proxy URL for each request (None if no proxy set up). + # Fresh proxy URL per request (None if no proxy). proxy_url = None if proxy_configuration: proxy_url = await proxy_configuration.new_url() - # Crawl the page and extract its markdown and nested links. data, links = await scrape_page(crawler, url, proxy_url=proxy_url) - - # Store the extracted data to the default dataset. await Actor.push_data(data) Actor.log.info( f'Stored data from {url} ' f'(title={data["title"]!r}, {len(links)} links found).' ) - - # Enqueue the links found on the page, one level deeper. await enqueue_links( request_queue, links, depth=depth, max_depth=max_depth ) @@ -143,7 +117,6 @@ async def main() -> None: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/09_browser_use.py b/docs/03_guides/code/09_browser_use.py index 673eca07..cd16773f 100644 --- a/docs/03_guides/code/09_browser_use.py +++ b/docs/03_guides/code/09_browser_use.py @@ -8,7 +8,7 @@ from apify import Actor -# The default task is aligned with the `Posts` output schema defined below. +# Default task, aligned with the `Posts` schema below. DEFAULT_TASK = ( 'Open https://news.ycombinator.com and return the title and URL ' 'of the top 5 posts on the front page.' @@ -47,24 +47,17 @@ async def run_agent_task( headless: bool = True, proxy_url: str | None = None, ) -> Posts | None: - """Run a Browser Use agent for a single task and return its structured output. - - The agent is driven by an OpenAI model and a real Chromium browser. Passing - `output_model_schema` makes the agent return a validated `Posts` instance instead - of free-form text, and `enable_signal_handler=False` leaves signal handling to the - Actor. - """ - # Configure the LLM that drives the agent. 
Swap `ChatOpenAI` for `ChatAnthropic`, - # `ChatGoogle`, or another provider to use a different model. + """Run a Browser Use agent for one task and return its structured output.""" + # Configure the LLM. Swap `ChatOpenAI` for another provider if needed. llm = ChatOpenAI(model=model, api_key=llm_api_key) - # Configure the browser. When a proxy URL is provided, route the browser through it. + # Configure the browser, optionally routed through a proxy. browser = Browser( headless=headless, proxy=to_browser_use_proxy(proxy_url) if proxy_url else None, ) - # Create the agent and run it for at most `max_steps` steps. + # `output_model_schema` returns a validated `Posts`; signals stay with the Actor. agent = Agent( task=task, llm=llm, @@ -78,27 +71,24 @@ async def run_agent_task( async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} task = actor_input.get('task', DEFAULT_TASK) model = actor_input.get('model', 'gpt-4.1-mini') max_steps = actor_input.get('maxSteps', 25) - # Read the LLM API key from the environment so it is never stored in the Actor - # input. On the Apify platform, set it as a secret environment variable. + # Read the LLM API key from the environment (set it as a secret on Apify). llm_api_key = os.environ.get('OPENAI_API_KEY') if not llm_api_key: raise RuntimeError('The OPENAI_API_KEY environment variable is not set.') - # Create a proxy configuration that routes the browser through Apify Proxy. + # Route the browser through Apify Proxy. proxy_configuration = await Actor.create_proxy_configuration() proxy_url = await proxy_configuration.new_url() if proxy_configuration else None Actor.log.info(f'Running the agent (model={model}) for task: {task}') - # Run the Browser Use agent and collect its structured output. 
result = await run_agent_task( task, model=model, @@ -112,7 +102,7 @@ async def main() -> None: Actor.log.warning('The agent did not return any structured output.') return - # Store every extracted item as a separate row in the default dataset. + # Store each extracted item as a dataset row. Actor.log.info(f'The agent returned {len(result.posts)} post(s); storing them.') for post in result.posts: Actor.log.info(f'Storing post: {post.title!r} ({post.url})') diff --git a/docs/03_guides/code/11_pydantic.py b/docs/03_guides/code/11_pydantic.py index e836fea9..7ce35f88 100644 --- a/docs/03_guides/code/11_pydantic.py +++ b/docs/03_guides/code/11_pydantic.py @@ -7,31 +7,24 @@ class ActorInput(BaseModel): - """Typed and validated representation of the Actor input. - - The field names follow Python's `snake_case`, while the aliases match the - `camelCase` keys produced by the Apify input schema editor. With - `populate_by_name`, the model accepts either form, and unknown fields are - ignored (`extra='ignore'`) so that adding a field to the input schema never - breaks an older Actor build. - """ + """Typed and validated representation of the Actor input.""" + # Accept both snake_case and the input schema's camelCase; ignore extras. model_config = ConfigDict(populate_by_name=True, extra='ignore') - # Required: a non-empty list of search terms. The validator below trims - # each entry and drops the empty ones. + # Required: non-empty list of search terms (normalized below). search_terms: list[str] = Field(alias='searchTerms', min_length=1) - # Optional: defaults to 10 and must fall within the inclusive 1-100 range. + # Optional: 1-100, defaults to 10. max_results: int = Field(alias='maxResults', default=10, ge=1, le=100) - # Optional: restricted to a fixed set of choices, like an input schema enum. + # Optional: restricted to a fixed set of choices. 
output_format: Literal['json', 'csv'] = Field(alias='outputFormat', default='json') @field_validator('search_terms') @classmethod def _normalize_terms(cls, value: list[str]) -> list[str]: - # Trim whitespace and drop empty terms, then ensure something is left. + # Trim whitespace and drop empty terms. cleaned = [term.strip() for term in value if term.strip()] if not cleaned: raise ValueError('searchTerms must contain at least one non-empty term') @@ -39,32 +32,26 @@ def _normalize_terms(cls, value: list[str]) -> list[str]: async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Read the raw input record from the default key-value store. It's a - # plain dict (or None) - no validation has happened yet. + # Read the raw input (a plain dict, not yet validated). raw_input = await Actor.get_input() or {} - # Validate the raw input against the model. On success, `actor_input` is - # a fully typed `ActorInput` with defaults filled in and every field - # guaranteed to be valid. + # Validate the raw input against the model. try: actor_input = ActorInput.model_validate(raw_input) except ValidationError as exc: - # Log a readable, per-field summary, then re-raise so the context - # manager marks the run as FAILED. Failing fast here beats crashing - # later with an obscure error deep in the code. + # Log a per-field summary, then re-raise to fail the run. Actor.log.error('The Actor input is invalid:\n%s', exc) raise - # From here on, work with typed attributes instead of dict lookups. + # Work with typed attributes from here on. Actor.log.info('Input passed validation: %s', actor_input.model_dump()) max_results = actor_input.max_results for term in actor_input.search_terms: Actor.log.info('Processing %r (max %d results)', term, max_results) - # Store the normalized input as the Actor's output. + # Store the normalized input as output. 
await Actor.set_value('OUTPUT', actor_input.model_dump()) diff --git a/docs/03_guides/code/11_raw_input.py b/docs/03_guides/code/11_raw_input.py index 7bfbeede..29c313e5 100644 --- a/docs/03_guides/code/11_raw_input.py +++ b/docs/03_guides/code/11_raw_input.py @@ -6,7 +6,7 @@ async def main() -> None: # Enter the context of the Actor. async with Actor: - # Read the input and reach into the raw dictionary for each value. + # Read the input and reach into the raw dict. actor_input = await Actor.get_input() or {} search_terms = actor_input.get('searchTerms', []) max_results = actor_input.get('maxResults', 10) diff --git a/docs/03_guides/code/12_webserver.py b/docs/03_guides/code/12_webserver.py index aef8e869..1cb23c1f 100644 --- a/docs/03_guides/code/12_webserver.py +++ b/docs/03_guides/code/12_webserver.py @@ -18,7 +18,7 @@ def do_GET(self) -> None: def run_server() -> None: - """Start the HTTP server on the provided port, and save a reference to the server.""" + """Start the HTTP server and keep a reference to it.""" global http_server with ThreadingHTTPServer( ('', Actor.configuration.web_server_port), RequestHandler @@ -43,7 +43,7 @@ async def main() -> None: if http_server is None: raise RuntimeError('HTTP server not started') - # Signal the HTTP server to shut down, and wait for it to finish. + # Signal the server to shut down and wait. http_server.shutdown() await run_server_task diff --git a/docs/03_guides/code/12_webserver_fastapi.py b/docs/03_guides/code/12_webserver_fastapi.py index 1b1e6240..08768eb0 100644 --- a/docs/03_guides/code/12_webserver_fastapi.py +++ b/docs/03_guides/code/12_webserver_fastapi.py @@ -5,10 +5,10 @@ from apify import Actor -# A module-level counter that the web server reports and the Actor keeps updating. +# Counter the server reports and the Actor updates. processed_items = 0 -# The FastAPI application with a single endpoint. +# FastAPI app with a single endpoint. 
app = FastAPI() @@ -21,8 +21,7 @@ async def index() -> dict[str, int]: async def main() -> None: global processed_items async with Actor: - # Serve the FastAPI app with uvicorn on the platform's web server port. - # Binding to 0.0.0.0 makes it reachable through the Actor's container URL. + # Serve the app on the platform's web server port; 0.0.0.0 exposes it. config = uvicorn.Config( app, host='0.0.0.0', # noqa: S104 @@ -30,17 +29,17 @@ async def main() -> None: ) server = uvicorn.Server(config) - # Run the server in the background while the Actor does its work. + # Run the server in the background. server_task = asyncio.create_task(server.serve()) Actor.log.info(f'Server running at {Actor.configuration.web_server_url}') - # Simulate doing some work, updating the counter the endpoint reports. + # Simulate work, updating the reported counter. for _ in range(100): await asyncio.sleep(1) processed_items += 1 Actor.log.info(f'Processed items: {processed_items}') - # Signal the server to shut down, and wait for it to finish. + # Signal the server to shut down and wait. server.should_exit = True await server_task diff --git a/docs/03_guides/code/scrapy_project/src/__main__.py b/docs/03_guides/code/scrapy_project/src/__main__.py index 807447c9..f9b27ed5 100644 --- a/docs/03_guides/code/scrapy_project/src/__main__.py +++ b/docs/03_guides/code/scrapy_project/src/__main__.py @@ -7,7 +7,7 @@ # Import your main Actor coroutine here. from .main import main -# Ensure the location to the Scrapy settings module is defined. +# Point Scrapy at the settings module. 
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings' diff --git a/docs/03_guides/code/scrapy_project/src/main.py b/docs/03_guides/code/scrapy_project/src/main.py index d8b67984..b234b171 100644 --- a/docs/03_guides/code/scrapy_project/src/main.py +++ b/docs/03_guides/code/scrapy_project/src/main.py @@ -14,16 +14,16 @@ async def main() -> None: """Apify Actor main coroutine for executing the Scrapy spider.""" async with Actor: - # Retrieve and process Actor input. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = [url['url'] for url in actor_input.get('startUrls', [])] allowed_domains = actor_input.get('allowedDomains') proxy_config = actor_input.get('proxyConfiguration') - # Apply Apify settings, which will override the Scrapy project settings. + # Apply Apify settings (override the Scrapy project settings). settings = apply_apify_settings(proxy_config=proxy_config) - # Create AsyncCrawlerRunner and execute the Scrapy spider. + # Run the Scrapy spider. crawler_runner = AsyncCrawlerRunner(settings) await crawler_runner.crawl( Spider, diff --git a/docs/03_guides/code/scrapy_project/src/settings.py b/docs/03_guides/code/scrapy_project/src/settings.py index 5c0e56e3..67ae1a03 100644 --- a/docs/03_guides/code/scrapy_project/src/settings.py +++ b/docs/03_guides/code/scrapy_project/src/settings.py @@ -5,7 +5,7 @@ ROBOTSTXT_OBEY = True SPIDER_MODULES = ['src.spiders'] TELNETCONSOLE_ENABLED = False -# Do not change the Twisted reactor unless you really know what you are doing. +# Don't change the Twisted reactor unless you know what you're doing. 
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' HTTPCACHE_ENABLED = True HTTPCACHE_EXPIRATION_SECS = 7200 diff --git a/docs/03_guides/code/scrapy_project/src/spiders/title.py b/docs/03_guides/code/scrapy_project/src/spiders/title.py index 7223a53d..8111ee31 100644 --- a/docs/03_guides/code/scrapy_project/src/spiders/title.py +++ b/docs/03_guides/code/scrapy_project/src/spiders/title.py @@ -14,11 +14,7 @@ class TitleSpider(Spider): - """A spider that scrapes web pages to extract titles and discover new links. - - This spider retrieves the content of the <title> element from each page and queues - any valid hyperlinks for further crawling. - """ + """A spider that extracts page titles and queues links for further crawling.""" name = 'title_spider' @@ -32,36 +28,21 @@ def __init__( *args: Any, **kwargs: Any, ) -> None: - """A default constructor. - - Args: - start_urls: URLs to start the scraping from. - allowed_domains: Domains that the scraper is allowed to crawl. - *args: Additional positional arguments. - **kwargs: Additional keyword arguments. - """ + """Store the start URLs and allowed domains.""" super().__init__(*args, **kwargs) self.start_urls = start_urls self.allowed_domains = allowed_domains def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]: - """Parse the web page response. - - Args: - response: The web page response. - - Yields: - Yields scraped `TitleItem` and new `Request` objects for links. - """ + """Yield a `TitleItem` and a `Request` for each link on the page.""" self.logger.info('TitleSpider is parsing %s...', response) - # Extract and yield the TitleItem + # Yield the title item. url = response.url title = response.css('title::text').extract_first() yield TitleItem(url=url, title=title) - # Extract all links from the page, create `Request` objects out of them, - # and yield them. + # Yield a request for each link. 
for link_href in response.css('a::attr("href")'): link_url = urljoin(response.url, link_href.get()) if link_url.startswith(('http://', 'https://')): From 88b1a63fd201c5135d46202903f1e3f7f4195e9c Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Fri, 5 Jun 2026 20:15:01 +0200 Subject: [PATCH 21/24] docs: rename guide titles to describe the task, not just the library --- docs/01_introduction/quick-start.mdx | 24 +++++++++++------------ docs/03_guides/01_beautifulsoup_httpx.mdx | 2 +- docs/03_guides/02_parsel_impit.mdx | 2 +- docs/03_guides/03_playwright.mdx | 2 +- docs/03_guides/04_selenium.mdx | 2 +- docs/03_guides/05_crawlee.mdx | 2 +- docs/03_guides/06_scrapy.mdx | 2 +- docs/03_guides/07_scrapling.mdx | 2 +- docs/03_guides/08_crawl4ai.mdx | 2 +- docs/03_guides/09_browser_use.mdx | 2 +- docs/03_guides/10_uv.mdx | 2 +- docs/03_guides/11_pydantic.mdx | 2 +- docs/03_guides/12_running_webserver.mdx | 2 +- 13 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index e487f201..42eec353 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -99,18 +99,18 @@ To learn more about the features of the Apify SDK and how to use them, check out To see how you can integrate the Apify SDK with popular web scraping libraries, check out these guides: -- [BeautifulSoup with HTTPX](../guides/beautifulsoup-httpx) -- [Parsel with Impit](../guides/parsel-impit) -- [Playwright](../guides/playwright) -- [Selenium](../guides/selenium) -- [Crawlee](../guides/crawlee) -- [Scrapy](../guides/scrapy) -- [Scrapling](../guides/scrapling) -- [Crawl4AI](../guides/crawl4ai) -- [Browser Use](../guides/browser-use) +- [HTTP scraping with BeautifulSoup and HTTPX](../guides/beautifulsoup-httpx) +- [HTTP scraping with Parsel and Impit](../guides/parsel-impit) +- [Browser automation with Playwright](../guides/playwright) +- [Browser automation with 
Selenium](../guides/selenium) +- [Web scraping with Crawlee](../guides/crawlee) +- [Web scraping with Scrapy](../guides/scrapy) +- [Adaptive scraping with Scrapling](../guides/scrapling) +- [LLM-ready scraping with Crawl4AI](../guides/crawl4ai) +- [Browser AI agents with Browser Use](../guides/browser-use) For other aspects of Actor development, explore these guides: -- [Manage your project with uv](../guides/uv) -- [Validate Actor input with Pydantic](../guides/input-validation) -- [Run a web server](../guides/running-webserver) +- [Project management with uv](../guides/uv) +- [Input validation with Pydantic](../guides/input-validation) +- [Running a web server](../guides/running-webserver) diff --git a/docs/03_guides/01_beautifulsoup_httpx.mdx b/docs/03_guides/01_beautifulsoup_httpx.mdx index ed44d5f8..5e79a509 100644 --- a/docs/03_guides/01_beautifulsoup_httpx.mdx +++ b/docs/03_guides/01_beautifulsoup_httpx.mdx @@ -1,6 +1,6 @@ --- id: beautifulsoup-httpx -title: Use BeautifulSoup with HTTPX +title: HTTP scraping with BeautifulSoup and HTTPX description: Build an Apify Actor that scrapes web pages using BeautifulSoup and HTTPX. --- diff --git a/docs/03_guides/02_parsel_impit.mdx b/docs/03_guides/02_parsel_impit.mdx index da427a04..3686b3c1 100644 --- a/docs/03_guides/02_parsel_impit.mdx +++ b/docs/03_guides/02_parsel_impit.mdx @@ -1,6 +1,6 @@ --- id: parsel-impit -title: Use Parsel with Impit +title: HTTP scraping with Parsel and Impit description: Build an Apify Actor that scrapes web pages using Parsel selectors and the Impit HTTP client. --- diff --git a/docs/03_guides/03_playwright.mdx b/docs/03_guides/03_playwright.mdx index 3d42ec63..dff1696e 100644 --- a/docs/03_guides/03_playwright.mdx +++ b/docs/03_guides/03_playwright.mdx @@ -1,6 +1,6 @@ --- id: playwright -title: Use Playwright +title: Browser automation with Playwright description: Build an Apify Actor that scrapes dynamic web pages using Playwright browser automation. 
--- diff --git a/docs/03_guides/04_selenium.mdx b/docs/03_guides/04_selenium.mdx index faed6296..e04c9ce0 100644 --- a/docs/03_guides/04_selenium.mdx +++ b/docs/03_guides/04_selenium.mdx @@ -1,6 +1,6 @@ --- id: selenium -title: Use Selenium +title: Browser automation with Selenium description: Build an Apify Actor that scrapes dynamic web pages using Selenium WebDriver. --- diff --git a/docs/03_guides/05_crawlee.mdx b/docs/03_guides/05_crawlee.mdx index b6dc2f74..cb3a72ce 100644 --- a/docs/03_guides/05_crawlee.mdx +++ b/docs/03_guides/05_crawlee.mdx @@ -1,6 +1,6 @@ --- id: crawlee -title: Use Crawlee +title: Web scraping with Crawlee description: Build Apify Actors using Crawlee's BeautifulSoupCrawler, ParselCrawler, or PlaywrightCrawler. --- diff --git a/docs/03_guides/06_scrapy.mdx b/docs/03_guides/06_scrapy.mdx index 81409ab2..7d4086f1 100644 --- a/docs/03_guides/06_scrapy.mdx +++ b/docs/03_guides/06_scrapy.mdx @@ -1,6 +1,6 @@ --- id: scrapy -title: Use Scrapy +title: Web scraping with Scrapy description: Convert Scrapy spiders into Apify Actors with platform storage and proxy integration. --- diff --git a/docs/03_guides/07_scrapling.mdx b/docs/03_guides/07_scrapling.mdx index e8da41e1..132a716c 100644 --- a/docs/03_guides/07_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -1,6 +1,6 @@ --- id: scrapling -title: Use Scrapling +title: Adaptive scraping with Scrapling description: Build an Apify Actor that scrapes web pages using the Scrapling adaptive web scraping library. --- diff --git a/docs/03_guides/08_crawl4ai.mdx b/docs/03_guides/08_crawl4ai.mdx index 01df4dec..e548ee7e 100644 --- a/docs/03_guides/08_crawl4ai.mdx +++ b/docs/03_guides/08_crawl4ai.mdx @@ -1,6 +1,6 @@ --- id: crawl4ai -title: Use Crawl4AI +title: LLM-ready scraping with Crawl4AI description: Build an Apify Actor that scrapes web pages into LLM-ready markdown using the Crawl4AI library. 
--- diff --git a/docs/03_guides/09_browser_use.mdx b/docs/03_guides/09_browser_use.mdx index 1760505d..f7317c04 100644 --- a/docs/03_guides/09_browser_use.mdx +++ b/docs/03_guides/09_browser_use.mdx @@ -1,6 +1,6 @@ --- id: browser-use -title: Use Browser Use +title: Browser AI agents with Browser Use description: Build an Apify Actor that automates a browser with an LLM agent using the Browser Use library. --- diff --git a/docs/03_guides/10_uv.mdx b/docs/03_guides/10_uv.mdx index 6d480c81..3e037d5b 100644 --- a/docs/03_guides/10_uv.mdx +++ b/docs/03_guides/10_uv.mdx @@ -1,6 +1,6 @@ --- id: uv -title: Manage your project with uv +title: Project management with uv description: Manage your Actor's Python version, dependencies, and virtual environment with the uv package and project manager. --- diff --git a/docs/03_guides/11_pydantic.mdx b/docs/03_guides/11_pydantic.mdx index c6245193..dd8bbde6 100644 --- a/docs/03_guides/11_pydantic.mdx +++ b/docs/03_guides/11_pydantic.mdx @@ -1,6 +1,6 @@ --- id: input-validation -title: Validate Actor input with Pydantic +title: Input validation with Pydantic description: Parse, validate, and type your Actor's input with Pydantic models instead of reaching into a raw dictionary. --- diff --git a/docs/03_guides/12_running_webserver.mdx b/docs/03_guides/12_running_webserver.mdx index 7ff54a47..7b946e86 100644 --- a/docs/03_guides/12_running_webserver.mdx +++ b/docs/03_guides/12_running_webserver.mdx @@ -1,6 +1,6 @@ --- id: running-webserver -title: Run a web server +title: Running a web server description: Run an HTTP server inside your Actor for monitoring or serving content during execution. 
--- From 503d3632023676262356b8bd721f8689dc040559 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Fri, 5 Jun 2026 20:21:47 +0200 Subject: [PATCH 22/24] docs: shorten scraping guide titles so they fit the sidebar --- docs/01_introduction/quick-start.mdx | 8 ++++---- docs/03_guides/01_beautifulsoup_httpx.mdx | 2 +- docs/03_guides/02_parsel_impit.mdx | 2 +- docs/03_guides/05_crawlee.mdx | 2 +- docs/03_guides/06_scrapy.mdx | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index 42eec353..63791e51 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -99,12 +99,12 @@ To learn more about the features of the Apify SDK and how to use them, check out To see how you can integrate the Apify SDK with popular web scraping libraries, check out these guides: -- [HTTP scraping with BeautifulSoup and HTTPX](../guides/beautifulsoup-httpx) -- [HTTP scraping with Parsel and Impit](../guides/parsel-impit) +- [Scraping with BeautifulSoup and HTTPX](../guides/beautifulsoup-httpx) +- [Scraping with Parsel and Impit](../guides/parsel-impit) - [Browser automation with Playwright](../guides/playwright) - [Browser automation with Selenium](../guides/selenium) -- [Web scraping with Crawlee](../guides/crawlee) -- [Web scraping with Scrapy](../guides/scrapy) +- [Scraping with Crawlee](../guides/crawlee) +- [Scraping with Scrapy](../guides/scrapy) - [Adaptive scraping with Scrapling](../guides/scrapling) - [LLM-ready scraping with Crawl4AI](../guides/crawl4ai) - [Browser AI agents with Browser Use](../guides/browser-use) diff --git a/docs/03_guides/01_beautifulsoup_httpx.mdx b/docs/03_guides/01_beautifulsoup_httpx.mdx index 5e79a509..3829a2f5 100644 --- a/docs/03_guides/01_beautifulsoup_httpx.mdx +++ b/docs/03_guides/01_beautifulsoup_httpx.mdx @@ -1,6 +1,6 @@ --- id: beautifulsoup-httpx -title: HTTP scraping with BeautifulSoup and HTTPX 
+title: Scraping with BeautifulSoup and HTTPX description: Build an Apify Actor that scrapes web pages using BeautifulSoup and HTTPX. --- diff --git a/docs/03_guides/02_parsel_impit.mdx b/docs/03_guides/02_parsel_impit.mdx index 3686b3c1..58d4b283 100644 --- a/docs/03_guides/02_parsel_impit.mdx +++ b/docs/03_guides/02_parsel_impit.mdx @@ -1,6 +1,6 @@ --- id: parsel-impit -title: HTTP scraping with Parsel and Impit +title: Scraping with Parsel and Impit description: Build an Apify Actor that scrapes web pages using Parsel selectors and the Impit HTTP client. --- diff --git a/docs/03_guides/05_crawlee.mdx b/docs/03_guides/05_crawlee.mdx index cb3a72ce..385bc5f0 100644 --- a/docs/03_guides/05_crawlee.mdx +++ b/docs/03_guides/05_crawlee.mdx @@ -1,6 +1,6 @@ --- id: crawlee -title: Web scraping with Crawlee +title: Scraping with Crawlee description: Build Apify Actors using Crawlee's BeautifulSoupCrawler, ParselCrawler, or PlaywrightCrawler. --- diff --git a/docs/03_guides/06_scrapy.mdx b/docs/03_guides/06_scrapy.mdx index 7d4086f1..dcbe204f 100644 --- a/docs/03_guides/06_scrapy.mdx +++ b/docs/03_guides/06_scrapy.mdx @@ -1,6 +1,6 @@ --- id: scrapy -title: Web scraping with Scrapy +title: Scraping with Scrapy description: Convert Scrapy spiders into Apify Actors with platform storage and proxy integration. 
--- From cef88b3dd910fbc057f2c0ffe4158d41bcd6f858 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Fri, 5 Jun 2026 20:27:33 +0200 Subject: [PATCH 23/24] docs: retitle Crawlee and Scrapy guides as crawler frameworks --- docs/01_introduction/quick-start.mdx | 4 ++-- docs/03_guides/05_crawlee.mdx | 2 +- docs/03_guides/06_scrapy.mdx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index 63791e51..a3fb05ba 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -103,8 +103,8 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Scraping with Parsel and Impit](../guides/parsel-impit) - [Browser automation with Playwright](../guides/playwright) - [Browser automation with Selenium](../guides/selenium) -- [Scraping with Crawlee](../guides/crawlee) -- [Scraping with Scrapy](../guides/scrapy) +- [Building crawlers with Crawlee](../guides/crawlee) +- [Building crawlers with Scrapy](../guides/scrapy) - [Adaptive scraping with Scrapling](../guides/scrapling) - [LLM-ready scraping with Crawl4AI](../guides/crawl4ai) - [Browser AI agents with Browser Use](../guides/browser-use) diff --git a/docs/03_guides/05_crawlee.mdx b/docs/03_guides/05_crawlee.mdx index 385bc5f0..6f20a9da 100644 --- a/docs/03_guides/05_crawlee.mdx +++ b/docs/03_guides/05_crawlee.mdx @@ -1,6 +1,6 @@ --- id: crawlee -title: Scraping with Crawlee +title: Building crawlers with Crawlee description: Build Apify Actors using Crawlee's BeautifulSoupCrawler, ParselCrawler, or PlaywrightCrawler. 
--- diff --git a/docs/03_guides/06_scrapy.mdx b/docs/03_guides/06_scrapy.mdx index dcbe204f..f038959a 100644 --- a/docs/03_guides/06_scrapy.mdx +++ b/docs/03_guides/06_scrapy.mdx @@ -1,6 +1,6 @@ --- id: scrapy -title: Scraping with Scrapy +title: Building crawlers with Scrapy description: Convert Scrapy spiders into Apify Actors with platform storage and proxy integration. --- From ca896a546b312fe83e0c81db2e8592558ba7e4c8 Mon Sep 17 00:00:00 2001 From: Vlada Dusek <v.dusek96@gmail.com> Date: Fri, 5 Jun 2026 20:31:59 +0200 Subject: [PATCH 24/24] docs: align guide intros with the new thematic titles --- docs/01_introduction/quick-start.mdx | 2 +- docs/03_guides/01_beautifulsoup_httpx.mdx | 2 +- docs/03_guides/02_parsel_impit.mdx | 2 +- docs/03_guides/03_playwright.mdx | 2 +- docs/03_guides/04_selenium.mdx | 2 +- docs/03_guides/05_crawlee.mdx | 2 +- docs/03_guides/06_scrapy.mdx | 2 +- docs/03_guides/07_scrapling.mdx | 2 +- docs/03_guides/08_crawl4ai.mdx | 2 +- docs/03_guides/09_browser_use.mdx | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index a3fb05ba..c74bd848 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -97,7 +97,7 @@ To learn more about the features of the Apify SDK and how to use them, check out ### Guides -To see how you can integrate the Apify SDK with popular web scraping libraries, check out these guides: +To see how you can integrate the Apify SDK with popular scraping libraries and frameworks, check out these guides: - [Scraping with BeautifulSoup and HTTPX](../guides/beautifulsoup-httpx) - [Scraping with Parsel and Impit](../guides/parsel-impit) diff --git a/docs/03_guides/01_beautifulsoup_httpx.mdx b/docs/03_guides/01_beautifulsoup_httpx.mdx index 3829a2f5..2ae47ded 100644 --- a/docs/03_guides/01_beautifulsoup_httpx.mdx +++ b/docs/03_guides/01_beautifulsoup_httpx.mdx @@ -8,7 +8,7 @@ import 
RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulSoupHttpxExample from '!!raw-loader!roa-loader!./code/01_beautifulsoup_httpx.py'; -In this guide, you'll learn how to use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library with the [HTTPX](https://www.python-httpx.org/) library in your Apify Actors. +In this guide, you'll learn how to scrape web pages with the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) and [HTTPX](https://www.python-httpx.org/) libraries in your Apify Actors. ## Introduction diff --git a/docs/03_guides/02_parsel_impit.mdx b/docs/03_guides/02_parsel_impit.mdx index 58d4b283..d91ebea2 100644 --- a/docs/03_guides/02_parsel_impit.mdx +++ b/docs/03_guides/02_parsel_impit.mdx @@ -8,7 +8,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ParselImpitExample from '!!raw-loader!roa-loader!./code/02_parsel_impit.py'; -In this guide, you'll learn how to combine the [Parsel](https://github.com/scrapy/parsel) and [Impit](https://github.com/apify/impit) libraries when building Apify Actors. +In this guide, you'll learn how to scrape web pages with the [Parsel](https://github.com/scrapy/parsel) and [Impit](https://github.com/apify/impit) libraries in your Apify Actors. ## Introduction diff --git a/docs/03_guides/03_playwright.mdx b/docs/03_guides/03_playwright.mdx index dff1696e..11b57b7e 100644 --- a/docs/03_guides/03_playwright.mdx +++ b/docs/03_guides/03_playwright.mdx @@ -11,7 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightExample from '!!raw-loader!roa-loader!./code/03_playwright.py'; -In this guide, you'll learn how to use [Playwright](https://playwright.dev) for web scraping in your Apify Actors. +In this guide, you'll learn how to use [Playwright](https://playwright.dev) for browser automation and web scraping in your Apify Actors. 
## Introduction diff --git a/docs/03_guides/04_selenium.mdx b/docs/03_guides/04_selenium.mdx index e04c9ce0..ae4ccbef 100644 --- a/docs/03_guides/04_selenium.mdx +++ b/docs/03_guides/04_selenium.mdx @@ -8,7 +8,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import SeleniumExample from '!!raw-loader!roa-loader!./code/04_selenium.py'; -In this guide, you'll learn how to use [Selenium](https://www.selenium.dev/) for web scraping in your Apify Actors. +In this guide, you'll learn how to use [Selenium](https://www.selenium.dev/) for browser automation and web scraping in your Apify Actors. ## Introduction diff --git a/docs/03_guides/05_crawlee.mdx b/docs/03_guides/05_crawlee.mdx index 6f20a9da..f0aa67f6 100644 --- a/docs/03_guides/05_crawlee.mdx +++ b/docs/03_guides/05_crawlee.mdx @@ -10,7 +10,7 @@ import CrawleeBeautifulSoupExample from '!!raw-loader!roa-loader!./code/05_crawl import CrawleeParselExample from '!!raw-loader!roa-loader!./code/05_crawlee_parsel.py'; import CrawleePlaywrightExample from '!!raw-loader!roa-loader!./code/05_crawlee_playwright.py'; -In this guide, you'll learn how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. +In this guide, you'll learn how to build web crawlers with the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. ## Introduction diff --git a/docs/03_guides/06_scrapy.mdx b/docs/03_guides/06_scrapy.mdx index f038959a..4af24354 100644 --- a/docs/03_guides/06_scrapy.mdx +++ b/docs/03_guides/06_scrapy.mdx @@ -15,7 +15,7 @@ import ItemsExample from '!!raw-loader!./code/scrapy_project/src/items.py'; import SpidersExample from '!!raw-loader!./code/scrapy_project/src/spiders/title.py'; import SettingsExample from '!!raw-loader!./code/scrapy_project/src/settings.py'; -In this guide, you'll learn how to use the [Scrapy](https://scrapy.org/) framework in your Apify Actors. 
+In this guide, you'll learn how to build web crawlers with the [Scrapy](https://scrapy.org/) framework in your Apify Actors. ## Introduction diff --git a/docs/03_guides/07_scrapling.mdx b/docs/03_guides/07_scrapling.mdx index 132a716c..63e948e5 100644 --- a/docs/03_guides/07_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -10,7 +10,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ScraplingExample from '!!raw-loader!roa-loader!./code/07_scrapling.py'; import ScraplingBrowserScraper from '!!raw-loader!./code/07_scrapling_browser.py'; -In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. +In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library for adaptive web scraping in your Apify Actors. ## Introduction diff --git a/docs/03_guides/08_crawl4ai.mdx b/docs/03_guides/08_crawl4ai.mdx index e548ee7e..0802c002 100644 --- a/docs/03_guides/08_crawl4ai.mdx +++ b/docs/03_guides/08_crawl4ai.mdx @@ -8,7 +8,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import Crawl4aiExample from '!!raw-loader!roa-loader!./code/08_crawl4ai.py'; -In this guide, you'll learn how to use the [Crawl4AI](https://crawl4ai.com/) library in your Apify Actors. +In this guide, you'll learn how to use the [Crawl4AI](https://crawl4ai.com/) library for LLM-ready web scraping in your Apify Actors. ## Introduction diff --git a/docs/03_guides/09_browser_use.mdx b/docs/03_guides/09_browser_use.mdx index f7317c04..77529963 100644 --- a/docs/03_guides/09_browser_use.mdx +++ b/docs/03_guides/09_browser_use.mdx @@ -8,7 +8,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BrowserUseExample from '!!raw-loader!roa-loader!./code/09_browser_use.py'; -In this guide, you'll learn how to use the [Browser Use](https://browser-use.com/) library in your Apify Actors. 
+In this guide, you'll learn how to use the [Browser Use](https://browser-use.com/) library to drive a browser with an LLM agent in your Apify Actors. ## Introduction