crawl4ai/docs/examples/shadow_dom_crawling.py at main · unclecode/crawl4ai

77 lines (59 loc) · 2.63 KB

"""
Shadow DOM Crawling Example
============================

Demonstrates how to use `flatten_shadow_dom=True` to extract content
hidden inside Shadow DOM trees on sites built with Web Components
(Stencil, Lit, Shoelace, Angular Elements, etc.).

Shadow DOM creates encapsulated sub-trees that are invisible to the
normal page serialization (page.content() / outerHTML). The
`flatten_shadow_dom` option walks these trees and produces a single
flat HTML document that includes all shadow content.

This example crawls a Bosch Rexroth product page where the product
description, technical specs, and downloads are rendered entirely
inside Shadow DOM by Stencil.js web components.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
# Product page fetched by both crawl runs in main().
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
async def _crawl_markdown(browser_config, run_config) -> str:
    """Crawl ``URL`` once and return its raw markdown.

    A fresh ``AsyncWebCrawler`` is opened for the crawl and closed again
    before returning. Returns ``""`` when the result carries no markdown.
    """
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(URL, config=run_config)
    return result.markdown.raw_markdown if result.markdown else ""


def _print_banner(title: str) -> None:
    """Print ``title`` framed by two 60-character ``=`` rules."""
    print("=" * 60)
    print(title)
    print("=" * 60)


def _print_content_checks(md: str) -> None:
    """Print the markdown length and whether each expected marker is present."""
    print(f"Markdown length: {len(md)} chars")
    # The description check is case-insensitive; the other two are exact.
    print(f"Has product description: {'mill type design' in md.lower()}")
    print(f"Has technical specs:     {'CDH1' in md}")
    print(f"Has downloads section:   {'Downloads' in md}")
    print()


async def main():
    """Crawl ``URL`` with and without ``flatten_shadow_dom`` and compare.

    Each run prints the markdown length and three content-presence checks.
    After the flattened run, up to 1200 characters of markdown starting at
    "Product Description" are printed if that heading is found.
    """
    browser_config = BrowserConfig(headless=True)

    # ── 1. Baseline: without shadow DOM flattening ──────────────────
    _print_banner("Without flatten_shadow_dom (baseline)")
    baseline_config = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
    )
    md = await _crawl_markdown(browser_config, baseline_config)
    _print_content_checks(md)

    # ── 2. With shadow DOM flattening ───────────────────────────────
    _print_banner("With flatten_shadow_dom=True")
    flatten_config = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
        flatten_shadow_dom=True,
    )
    md = await _crawl_markdown(browser_config, flatten_config)
    _print_content_checks(md)

    # Show the product content section
    idx = md.find("Product Description")
    if idx >= 0:
        print("── Extracted product content ──")
        print(md[idx:idx + 1200])
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

FilesExpand file tree

shadow_dom_crawling.py

Latest commit

History

shadow_dom_crawling.py

File metadata and controls