|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
3 | 3 | import asyncio |
| 4 | +import re |
4 | 5 | import zlib |
5 | 6 | from codecs import getincrementaldecoder |
| 7 | +from collections import defaultdict |
6 | 8 | from contextlib import suppress |
7 | 9 | from dataclasses import dataclass |
8 | 10 | from datetime import datetime, timedelta |
|
16 | 18 | from typing_extensions import NotRequired, override |
17 | 19 | from yarl import URL |
18 | 20 |
|
| 21 | +from crawlee._utils.web import is_status_code_successful |
| 22 | +from crawlee.errors import ProxyError |
| 23 | + |
19 | 24 | if TYPE_CHECKING: |
20 | 25 | from collections.abc import AsyncGenerator |
21 | 26 | from xml.sax.xmlreader import AttributesImpl |
|
27 | 32 |
|
28 | 33 | VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'} |
29 | 34 | SITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'} |
| 35 | +SITEMAP_URL_PATTERN = re.compile(r'\/sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE) |
| 36 | +COMMON_SITEMAP_PATHS = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml'] |
30 | 37 |
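As a quick illustration of the new constants above (placeholder URLs, not part of the change): `SITEMAP_URL_PATTERN` matches paths that end in `/sitemap.xml` or `/sitemap.txt`, optionally gzipped and case-insensitively, while `/sitemap_index.xml` is only probed via `COMMON_SITEMAP_PATHS`.

```python
# Illustrative only; example.com is a placeholder domain.
assert SITEMAP_URL_PATTERN.search('https://example.com/sitemap.xml')
assert SITEMAP_URL_PATTERN.search('https://example.com/news/SITEMAP.TXT')  # case-insensitive
assert SITEMAP_URL_PATTERN.search('https://example.com/sitemap.xml.gz')  # gzipped variant
assert not SITEMAP_URL_PATTERN.search('https://example.com/sitemap_index.xml')  # probed via COMMON_SITEMAP_PATHS instead
assert not SITEMAP_URL_PATTERN.search('https://example.com/sitemap.xml?page=2')  # query string defeats the $ anchor
```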
|
31 | 38 |
|
32 | 39 | @dataclass() |
@@ -384,7 +391,7 @@ def urls(self) -> list[str]: |
384 | 391 | @classmethod |
385 | 392 | async def try_common_names(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Sitemap: |
386 | 393 | base_url = URL(url) |
387 | | - sitemap_urls = [str(base_url.with_path('/sitemap.xml')), str(base_url.with_path('/sitemap.txt'))] |
| 394 | + sitemap_urls = [str(base_url.with_path(path)) for path in COMMON_SITEMAP_PATHS] |
388 | 395 | return await cls.load(sitemap_urls, http_client, proxy_info) |
389 | 396 |
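A minimal usage sketch for the updated `try_common_names` (hypothetical page URL; assumes crawlee's `HttpxHttpClient`, but any configured `HttpClient` works):

```python
from crawlee.http_clients import HttpxHttpClient


async def check_common_sitemaps() -> None:
    # Probes /sitemap.xml, /sitemap.txt and /sitemap_index.xml on the page's host.
    sitemap = await Sitemap.try_common_names('https://example.com/some/page', HttpxHttpClient())
    print(sitemap.urls)
```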
|
390 | 397 | @classmethod |
@@ -484,3 +491,140 @@ async def parse_sitemap( |
484 | 491 | yield result |
485 | 492 | else: |
486 | 493 | logger.warning(f'Invalid source configuration: {source}') |
| 494 | + |
| 495 | + |
| 496 | +async def _merge_async_generators(*generators: AsyncGenerator) -> AsyncGenerator: |
| 497 | + queue: asyncio.Queue = asyncio.Queue() |
| 498 | + |
| 499 | + end_feed = object() |
| 500 | + |
| 501 | + async def feed(gen: AsyncGenerator) -> None: |
| 502 | + try: |
| 503 | + async for item in gen: |
| 504 | + await queue.put(item) |
| 505 | + except Exception: |
| 506 | + logger.warning(f'Error in generator: {gen}', exc_info=True) |
| 507 | + finally: |
| 508 | + await queue.put(end_feed) |
| 509 | + |
| 510 | + tasks = [asyncio.create_task(feed(gen)) for gen in generators] |
| 511 | + remaining_tasks = len(tasks) |
| 512 | + |
| 513 | + try: |
| 514 | + while remaining_tasks > 0: |
| 515 | + item = await queue.get() |
| 516 | + if item is end_feed: |
| 517 | + remaining_tasks -= 1 |
| 518 | + else: |
| 519 | + yield item |
| 520 | + finally: |
| 521 | + for task in tasks: |
| 522 | + task.cancel() |
| 523 | + await asyncio.gather(*tasks, return_exceptions=True) |
| 524 | + |
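A small sketch of the fan-in behaviour this helper provides (toy generators, illustration only): items are yielded in the order they become available, and a failing source is logged without stopping the others.

```python
import asyncio
from collections.abc import AsyncGenerator


async def _ticker(name: str, delay: float) -> AsyncGenerator[str, None]:
    for i in range(2):
        await asyncio.sleep(delay)
        yield f'{name}-{i}'


async def _demo() -> None:
    async for item in _merge_async_generators(_ticker('fast', 0.01), _ticker('slow', 0.1)):
        print(item)  # fast-0, fast-1, slow-0, slow-1 (approximate order)


asyncio.run(_demo())
```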
| 525 | + |
| 526 | +async def _discover_for_hostname( |
| 527 | + hostname: str, |
| 528 | + hostname_urls: list[str], |
| 529 | + *, |
| 530 | + http_client: HttpClient, |
| 531 | + proxy_info: ProxyInfo | None = None, |
| 532 | + request_timeout: timedelta, |
| 533 | + method_for_checking: Literal['HEAD', 'GET'] = 'HEAD', |
| 534 | +) -> AsyncGenerator[str, None]: |
| 535 | + # Import here to avoid circular imports. |
| 536 | + from crawlee._utils.robots import RobotsTxtFile # noqa: PLC0415 |
| 537 | + |
| 538 | + domain_seen: set[str] = set() |
| 539 | + hostname_urls = list(set(hostname_urls)) # Remove duplicates |
| 540 | + |
| 541 | + def _check_and_add(url: str) -> bool: |
| 542 | + if url in domain_seen: |
| 543 | + return False |
| 544 | + domain_seen.add(url) |
| 545 | + return True |
| 546 | + |
| 547 | + # Try getting sitemaps from robots.txt first |
| 548 | + robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info) |
| 549 | + for sitemap_url in robots.get_sitemaps(): |
| 550 | + if _check_and_add(sitemap_url): |
| 551 | + yield sitemap_url |
| 552 | + |
| 553 | +    # Check whether any of the provided URLs already point to a sitemap |
| 554 | + matching_sitemap_urls = [url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)] |
| 555 | + |
| 556 | + if matching_sitemap_urls: |
| 557 | + for sitemap_url in matching_sitemap_urls: |
| 558 | + if _check_and_add(sitemap_url): |
| 559 | + yield sitemap_url |
| 560 | + else: |
| 561 | + # Check common sitemap locations |
| 562 | + base_url = URL(hostname_urls[0]) |
| 563 | + for path in COMMON_SITEMAP_PATHS: |
| 564 | + candidate = str(base_url.with_path(path)) |
| 565 | + if candidate in domain_seen: |
| 566 | + continue |
| 567 | + try: |
| 568 | + response = await http_client.send_request( |
| 569 | + candidate, method=method_for_checking, proxy_info=proxy_info, timeout=request_timeout |
| 570 | + ) |
| 571 | + if is_status_code_successful(response.status_code) and _check_and_add(candidate): |
| 572 | + yield candidate |
| 573 | + except ProxyError: |
| 574 | +            logger.warning(f'Proxy error when checking {candidate} during sitemap discovery for {hostname}') |
| 575 | +        except asyncio.TimeoutError: |
| 576 | +            logger.warning(f'Timeout when checking {candidate} during sitemap discovery for {hostname}') |
| 577 | +        except Exception: |
| 578 | +            logger.warning(f'Error when checking {candidate} during sitemap discovery for {hostname}', exc_info=True) |
| 579 | + |
| 580 | + |
| 581 | +async def discover_valid_sitemaps( |
| 582 | + urls: list[str], |
| 583 | + *, |
| 584 | + http_client: HttpClient, |
| 585 | + proxy_info: ProxyInfo | None = None, |
| 586 | + request_timeout: timedelta = timedelta(seconds=20), |
| 587 | + method_for_checking: Literal['HEAD', 'GET'] = 'HEAD', |
| 588 | +) -> AsyncGenerator[str, None]: |
| 589 | +    """Discover sitemaps related to the given URLs, yielding each unique sitemap URL found. |
| 590 | +
|
| 591 | + Args: |
| 592 | + urls: List of URLs to discover sitemaps for. |
| 593 | + http_client: `HttpClient` to use for making requests. |
| 594 | + proxy_info: Proxy configuration to use for requests. |
| 595 | + request_timeout: Timeout for each request when checking for sitemaps. |
| 596 | + method_for_checking: HTTP method to use when checking for sitemap existence (HEAD or GET). |
| 597 | + """ |
| 598 | + # Use a set to track seen sitemap URLs and avoid duplicates |
| 599 | + seen = set() |
| 600 | + |
| 601 | + grouped_urls = defaultdict(list) |
| 602 | + for url in urls: |
| 603 | + try: |
| 604 | + hostname = URL(url).host |
| 605 | + except ValueError: |
| 606 | + logger.warning(f'Invalid URL {url} skipped') |
| 607 | + continue |
| 608 | + |
| 609 | + if not hostname: |
| 610 | +            logger.warning(f'URL {url} without a hostname skipped') |
| 611 | + continue |
| 612 | + |
| 613 | + grouped_urls[hostname].append(url) |
| 614 | + |
| 615 | + generators = [ |
| 616 | + _discover_for_hostname( |
| 617 | + hostname, |
| 618 | + hostname_urls, |
| 619 | + http_client=http_client, |
| 620 | + proxy_info=proxy_info, |
| 621 | + request_timeout=request_timeout, |
| 622 | + method_for_checking=method_for_checking, |
| 623 | + ) |
| 624 | + for hostname, hostname_urls in grouped_urls.items() |
| 625 | + ] |
| 626 | + |
| 627 | + async for sitemap_url in _merge_async_generators(*generators): |
| 628 | + if sitemap_url not in seen: |
| 629 | + seen.add(sitemap_url) |
| 630 | + yield sitemap_url |
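
Putting it together, a minimal end-to-end sketch of the new `discover_valid_sitemaps` helper (placeholder URLs; assumes crawlee's `HttpxHttpClient`, and the timeout/method arguments are optional):

```python
import asyncio
from datetime import timedelta

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    async for sitemap_url in discover_valid_sitemaps(
        ['https://example.com/blog', 'https://example.com/docs/intro'],
        http_client=HttpxHttpClient(),
        request_timeout=timedelta(seconds=10),
        method_for_checking='GET',  # fall back to GET if the server rejects HEAD requests
    ):
        print(sitemap_url)


asyncio.run(main())
```

The yielded sitemap URLs can then be fed to `Sitemap.load` (or `parse_sitemap`) to extract the actual page URLs.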