-
-
Notifications
You must be signed in to change notification settings - Fork 6.6k
Expand file tree
/
Copy pathshadow_dom_crawling.py
More file actions
77 lines (59 loc) · 2.63 KB
/
shadow_dom_crawling.py
File metadata and controls
77 lines (59 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
Shadow DOM Crawling Example
============================
Demonstrates how to use `flatten_shadow_dom=True` to extract content
hidden inside Shadow DOM trees on sites built with Web Components
(Stencil, Lit, Shoelace, Angular Elements, etc.).
Shadow DOM creates encapsulated sub-trees that are invisible to the
normal page serialization (page.content() / outerHTML). The
`flatten_shadow_dom` option walks these trees and produces a single
flat HTML document that includes all shadow content.
This example crawls a Bosch Rexroth product page where the product
description, technical specs, and downloads are rendered entirely
inside Shadow DOM by Stencil.js web components.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
# Product page crawled by this example; per the module docstring, its
# description, specs, and downloads are rendered inside Shadow DOM.
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
async def _crawl_and_report(
    browser_config: BrowserConfig,
    run_config: CrawlerRunConfig,
    title: str,
) -> str:
    """Crawl ``URL`` once and print a short report about the markdown.

    Args:
        browser_config: Browser settings handed to ``AsyncWebCrawler``.
        run_config: Per-run settings handed to ``crawler.arun``.
        title: Heading printed between two 60-character ``=`` rules.

    Returns:
        The raw markdown of the crawl result, or ``""`` when the result
        carries no markdown.
    """
    rule = "=" * 60
    print(rule)
    print(title)
    print(rule)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(URL, config=run_config)

    # Surface a failed crawl explicitly; otherwise the report below would
    # only show "0 chars" / False with no hint about the cause.
    if not result.success:
        print(f"Crawl failed: {result.error_message}")

    md = result.markdown.raw_markdown if result.markdown else ""
    print(f"Markdown length: {len(md)} chars")
    print(f"Has product description: {'mill type design' in md.lower()}")
    print(f"Has technical specs: {'CDH1' in md}")
    print(f"Has downloads section: {'Downloads' in md}")
    print()
    return md


async def main():
    """Crawl the product page twice and compare the extracted markdown.

    The first run uses the default serialization (baseline); the second
    enables ``flatten_shadow_dom``. Both runs print the same report so the
    two can be compared side by side. If the flattened markdown contains a
    "Product Description" heading, up to 1200 characters starting at that
    heading are printed as well.
    """
    browser_config = BrowserConfig(headless=True)

    # Options shared by both runs, so `flatten_shadow_dom` is the only
    # difference between the two crawls.
    shared_options = {
        "wait_until": "load",
        "delay_before_return_html": 3.0,
    }

    # ── 1. Baseline: without shadow DOM flattening ──────────────────
    await _crawl_and_report(
        browser_config,
        CrawlerRunConfig(**shared_options),
        "Without flatten_shadow_dom (baseline)",
    )

    # ── 2. With shadow DOM flattening ───────────────────────────────
    md = await _crawl_and_report(
        browser_config,
        CrawlerRunConfig(**shared_options, flatten_shadow_dom=True),
        "With flatten_shadow_dom=True",
    )

    # Show the product content section
    idx = md.find("Product Description")
    if idx >= 0:
        print("── Extracted product content ──")
        print(md[idx:idx + 1200])


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())