
Commit 872447b

Mantisus and Copilot authored
feat: add discover_valid_sitemaps utility (#1777)
### Description

- Add `discover_valid_sitemaps` utility to search for sitemaps of websites for the provided URLs.

### Issues

- Closes: #1740

### Testing

- Add new unit tests

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
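A minimal usage sketch of the new utility (illustrative only; the `HttpxHttpClient` choice and the URLs are assumptions, any `HttpClient` implementation should work):

```python
import asyncio

from crawlee._utils.sitemap import discover_valid_sitemaps
from crawlee.http_clients import HttpxHttpClient  # assumed built-in client; any HttpClient works


async def main() -> None:
    # Sitemap URLs are yielded as soon as any hostname produces one.
    async for sitemap_url in discover_valid_sitemaps(
        ['https://example.com/some-page', 'https://example.org/'],
        http_client=HttpxHttpClient(),
    ):
        print(sitemap_url)


asyncio.run(main())
```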
1 parent cc2c45c · commit 872447b

3 files changed

Lines changed: 256 additions & 5 deletions


src/crawlee/_utils/sitemap.py

Lines changed: 145 additions & 1 deletion
```diff
@@ -1,8 +1,10 @@
 from __future__ import annotations

 import asyncio
+import re
 import zlib
 from codecs import getincrementaldecoder
+from collections import defaultdict
 from contextlib import suppress
 from dataclasses import dataclass
 from datetime import datetime, timedelta
@@ -16,6 +18,9 @@
 from typing_extensions import NotRequired, override
 from yarl import URL

+from crawlee._utils.web import is_status_code_successful
+from crawlee.errors import ProxyError
+
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
     from xml.sax.xmlreader import AttributesImpl
@@ -27,6 +32,8 @@

 VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'}
 SITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'}
+SITEMAP_URL_PATTERN = re.compile(r'\/sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)
+COMMON_SITEMAP_PATHS = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml']


 @dataclass()
@@ -384,7 +391,7 @@ def urls(self) -> list[str]:
     @classmethod
     async def try_common_names(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Sitemap:
         base_url = URL(url)
-        sitemap_urls = [str(base_url.with_path('/sitemap.xml')), str(base_url.with_path('/sitemap.txt'))]
+        sitemap_urls = [str(base_url.with_path(path)) for path in COMMON_SITEMAP_PATHS]
         return await cls.load(sitemap_urls, http_client, proxy_info)

     @classmethod
@@ -484,3 +491,140 @@ async def parse_sitemap(
                 yield result
         else:
             logger.warning(f'Invalid source configuration: {source}')
+
+
+async def _merge_async_generators(*generators: AsyncGenerator) -> AsyncGenerator:
+    queue: asyncio.Queue = asyncio.Queue()
+
+    end_feed = object()
+
+    async def feed(gen: AsyncGenerator) -> None:
+        try:
+            async for item in gen:
+                await queue.put(item)
+        except Exception:
+            logger.warning(f'Error in generator: {gen}', exc_info=True)
+        finally:
+            await queue.put(end_feed)
+
+    tasks = [asyncio.create_task(feed(gen)) for gen in generators]
+    remaining_tasks = len(tasks)
+
+    try:
+        while remaining_tasks > 0:
+            item = await queue.get()
+            if item is end_feed:
+                remaining_tasks -= 1
+            else:
+                yield item
+    finally:
+        for task in tasks:
+            task.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+
+async def _discover_for_hostname(
+    hostname: str,
+    hostname_urls: list[str],
+    *,
+    http_client: HttpClient,
+    proxy_info: ProxyInfo | None = None,
+    request_timeout: timedelta,
+    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',
+) -> AsyncGenerator[str, None]:
+    # Import here to avoid circular imports.
+    from crawlee._utils.robots import RobotsTxtFile  # noqa: PLC0415
+
+    domain_seen: set[str] = set()
+    hostname_urls = list(set(hostname_urls))  # Remove duplicates
+
+    def _check_and_add(url: str) -> bool:
+        if url in domain_seen:
+            return False
+        domain_seen.add(url)
+        return True
+
+    # Try getting sitemaps from robots.txt first
+    robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info)
+    for sitemap_url in robots.get_sitemaps():
+        if _check_and_add(sitemap_url):
+            yield sitemap_url
+
+    # Check whether the provided URLs already contain sitemap URLs
+    matching_sitemap_urls = [url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)]
+
+    if matching_sitemap_urls:
+        for sitemap_url in matching_sitemap_urls:
+            if _check_and_add(sitemap_url):
+                yield sitemap_url
+    else:
+        # Check common sitemap locations
+        base_url = URL(hostname_urls[0])
+        for path in COMMON_SITEMAP_PATHS:
+            candidate = str(base_url.with_path(path))
+            if candidate in domain_seen:
+                continue
+            try:
+                response = await http_client.send_request(
+                    candidate, method=method_for_checking, proxy_info=proxy_info, timeout=request_timeout
+                )
+                if is_status_code_successful(response.status_code) and _check_and_add(candidate):
+                    yield candidate
+            except ProxyError:
+                logger.warning(f'Proxy error when checking {candidate} with sitemap discovery for {hostname}')
+            except asyncio.TimeoutError:
+                logger.warning(f'Timeout when checking {candidate} with sitemap discovery for {hostname}')
+            except Exception:
+                logger.warning(f'Error when checking {candidate} with sitemap discovery for {hostname}', exc_info=True)
+
+
+async def discover_valid_sitemaps(
+    urls: list[str],
+    *,
+    http_client: HttpClient,
+    proxy_info: ProxyInfo | None = None,
+    request_timeout: timedelta = timedelta(seconds=20),
+    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',
+) -> AsyncGenerator[str, None]:
+    """Discover related sitemaps for the given URLs.
+
+    Args:
+        urls: List of URLs to discover sitemaps for.
+        http_client: `HttpClient` to use for making requests.
+        proxy_info: Proxy configuration to use for requests.
+        request_timeout: Timeout for each request when checking for sitemaps.
+        method_for_checking: HTTP method to use when checking for sitemap existence (HEAD or GET).
+    """
+    # Use a set to track seen sitemap URLs and avoid duplicates
+    seen = set()

+    grouped_urls = defaultdict(list)
+    for url in urls:
+        try:
+            hostname = URL(url).host
+        except ValueError:
+            logger.warning(f'Invalid URL {url} skipped')
+            continue
+
+        if not hostname:
+            logger.warning(f'URL {url} without host skipped')
+            continue
+
+        grouped_urls[hostname].append(url)
+
+    generators = [
+        _discover_for_hostname(
+            hostname,
+            hostname_urls,
+            http_client=http_client,
+            proxy_info=proxy_info,
+            request_timeout=request_timeout,
+            method_for_checking=method_for_checking,
+        )
+        for hostname, hostname_urls in grouped_urls.items()
+    ]
+
+    async for sitemap_url in _merge_async_generators(*generators):
+        if sitemap_url not in seen:
+            seen.add(sitemap_url)
+            yield sitemap_url
```
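Discovered sitemap URLs can then be fed into the existing `Sitemap.load` helper to fetch and parse them; a rough sketch (the `HttpxHttpClient` choice and URLs are illustrative assumptions):

```python
import asyncio

from crawlee._utils.sitemap import Sitemap, discover_valid_sitemaps
from crawlee.http_clients import HttpxHttpClient  # assumed client; any HttpClient implementation works


async def collect_page_urls(start_urls: list[str]) -> list[str]:
    http_client = HttpxHttpClient()
    # Collect every sitemap URL discovered for the given start URLs.
    sitemap_urls = [url async for url in discover_valid_sitemaps(start_urls, http_client=http_client)]
    # Sitemap.load fetches and parses all of them into a single Sitemap object.
    sitemap = await Sitemap.load(sitemap_urls, http_client)
    return sitemap.urls


print(asyncio.run(collect_page_urls(['https://example.com/'])))
```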

src/crawlee/_utils/web.py

Lines changed: 9 additions & 2 deletions
```diff
@@ -1,11 +1,18 @@
 from __future__ import annotations

+from http import HTTPStatus
+

 def is_status_code_client_error(value: int) -> bool:
     """Return `True` for 4xx status codes, `False` otherwise."""
-    return 400 <= value <= 499  # noqa: PLR2004
+    return HTTPStatus.BAD_REQUEST <= value < HTTPStatus.INTERNAL_SERVER_ERROR


 def is_status_code_server_error(value: int) -> bool:
     """Return `True` for 5xx status codes, `False` otherwise."""
-    return value >= 500  # noqa: PLR2004
+    return value >= HTTPStatus.INTERNAL_SERVER_ERROR
+
+
+def is_status_code_successful(value: int) -> bool:
+    """Return `True` for 2xx and 3xx status codes, `False` otherwise."""
+    return HTTPStatus.OK <= value < HTTPStatus.BAD_REQUEST
```
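Note that the new `is_status_code_successful` helper counts redirects (3xx) as success, unlike a strict 2xx check; a few illustrative values:

```python
from crawlee._utils.web import is_status_code_successful

assert is_status_code_successful(200)      # 2xx -> True
assert is_status_code_successful(301)      # 3xx (redirect) -> True
assert not is_status_code_successful(404)  # 4xx -> False
assert not is_status_code_successful(503)  # 5xx -> False
```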

tests/unit/_utils/test_sitemap.py

Lines changed: 102 additions & 2 deletions
```diff
@@ -1,11 +1,13 @@
 import base64
 import gzip
 from datetime import datetime
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock

 from yarl import URL

-from crawlee._utils.sitemap import Sitemap, SitemapUrl, parse_sitemap
-from crawlee.http_clients._base import HttpClient
+from crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
+from crawlee.http_clients._base import HttpClient, HttpResponse

 BASIC_SITEMAP = """
 <?xml version="1.0" encoding="UTF-8"?>
@@ -46,6 +48,23 @@
 }


+def _make_mock_client(url_map: dict[str, tuple[int, bytes]]) -> AsyncMock:
+    async def send_request(url: str, **_kwargs: Any) -> HttpResponse:
+        status, body = 404, b''
+        for pattern, (s, b) in url_map.items():
+            if pattern in url:
+                status, body = s, b
+                break
+        response = MagicMock(spec=HttpResponse)
+        response.status_code = status
+        response.read = AsyncMock(return_value=body)
+        return response
+
+    client = AsyncMock(spec=HttpClient)
+    client.send_request.side_effect = send_request
+    return client
+
+
 def compress_gzip(data: str) -> bytes:
     """Compress a string using gzip."""
     return gzip.compress(data.encode())
@@ -246,3 +265,84 @@ async def test_sitemap_from_string() -> None:

     assert len(sitemap.urls) == 5
     assert set(sitemap.urls) == BASIC_RESULTS
+
+
+async def test_discover_sitemap_from_robots_txt() -> None:
+    """Sitemap URL found in robots.txt is yielded."""
+    robots_content = b'User-agent: *\nSitemap: http://example.com/custom-sitemap.xml'
+    http_client = _make_mock_client({'robots.txt': (200, robots_content)})
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]
+
+    assert urls == ['http://example.com/custom-sitemap.xml']
+
+
+async def test_discover_sitemap_from_common_paths() -> None:
+    """Sitemap is found at common paths when robots.txt has none."""
+    http_client = _make_mock_client(
+        {'/sitemap.xml': (200, b''), '/sitemap.txt': (200, b''), '/sitemap_index.xml': (200, b'')}
+    )
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]
+
+    assert urls == [
+        'http://example.com/sitemap.xml',
+        'http://example.com/sitemap.txt',
+        'http://example.com/sitemap_index.xml',
+    ]
+
+
+async def test_discover_sitemap_from_input_url() -> None:
+    """Input URL that is already a sitemap is yielded directly without checking common paths."""
+    http_client = _make_mock_client({'/sitemap.txt': (200, b'')})
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/sitemap.xml'], http_client=http_client)]
+
+    assert urls == ['http://example.com/sitemap.xml']
+
+
+async def test_discover_sitemap_deduplication() -> None:
+    """Sitemap URL found in robots.txt is not yielded again from common paths check."""
+    robots_content = b'User-agent: *\nSitemap: http://example.com/sitemap.xml'
+    http_client = _make_mock_client(
+        {
+            'robots.txt': (200, robots_content),
+            '/sitemap.xml': (200, b''),
+        }
+    )
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]
+
+    assert urls == ['http://example.com/sitemap.xml']
+
+
+async def test_discover_sitemaps_multiple_domains() -> None:
+    """Sitemaps from multiple domains are all discovered."""
+    http_client = _make_mock_client(
+        {
+            'domain-a.com/sitemap.xml': (200, b''),
+            'domain-b.com/sitemap.xml': (200, b''),
+        }
+    )
+
+    urls = [
+        url
+        async for url in discover_valid_sitemaps(
+            ['http://domain-a.com/page', 'http://domain-b.com/page'],
+            http_client=http_client,
+        )
+    ]
+
+    assert set(urls) == {
+        'http://domain-a.com/sitemap.xml',
+        'http://domain-b.com/sitemap.xml',
+    }
+
+
+async def test_discover_sitemap_url_without_host_skipped() -> None:
+    """URLs without a host are skipped."""
+    http_client = _make_mock_client({})
+
+    urls = [url async for url in discover_valid_sitemaps(['not-a-valid-url'], http_client=http_client)]
+
+    assert urls == []
```
