From 48d33a69a8574b6f4f209b523ce88dcd7af8f7da Mon Sep 17 00:00:00 2001 From: tonghuaroot Date: Wed, 27 May 2026 13:03:38 +0800 Subject: [PATCH 1/2] fix(sitemap): apply enqueue strategy at the _process_sitemap_item producer The 1.7.0 fix for GHSA-3r75-xc34-5f44 wired the same-hostname enqueue strategy into SitemapRequestLoader._passes_filters, but the lower-level parse_sitemap / Sitemap.load / Sitemap.try_common_names API still accepted every nested-sitemap reference and every URL entry regardless of host. A sitemap on attacker.example could push http://127.0.0.1:... or http://169.254.169.254/... into the queue, and _fetch_and_process_sitemap would dispatch the request through the configured HTTP client. Apply the filter_url check used by SitemapRequestLoader._passes_filters inside _process_sitemap_item as well, so the same policy applies to both pipelines. ParseSitemapOptions gains an enqueue_strategy field (default 'same-hostname', matching the loader default added in PR #1864). The strategy is threaded through _process_raw_source and _fetch_and_process_sitemap so producer-side filtering runs whether the sitemap content arrived as a raw blob or via the HTTP client. SitemapRequestLoader now stamps its configured enqueue_strategy into ParseSitemapOptions, so its existing _passes_filters call remains defence-in-depth rather than the sole gate. Callers that legitimately need cross-host sitemap discovery opt in with ParseSitemapOptions(enqueue_strategy='same-domain') / 'all'. Note: this closes the URL-injection (read-back) path. A blind GET against the redirect target can still occur because the HTTP-client stream() follows 3xx with follow_redirects=True; closing that fully needs a hook on stream() to re-run filter_url after redirect. Out of scope for the minimal producer-side fix; tracked as a follow-up. 
Signed-off-by: tonghuaroot --- src/crawlee/_utils/sitemap.py | 50 +++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index 05eaa6e726..dbf949524f 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -18,6 +18,7 @@ from typing_extensions import NotRequired, override from yarl import URL +from crawlee._utils.urls import filter_url from crawlee._utils.web import is_status_code_successful from crawlee.errors import ProxyError @@ -25,6 +26,7 @@ from collections.abc import AsyncGenerator from xml.sax.xmlreader import AttributesImpl + from crawlee import EnqueueStrategy from crawlee.http_clients import HttpClient from crawlee.proxy_configuration import ProxyInfo @@ -55,6 +57,7 @@ class ParseSitemapOptions(TypedDict, total=False): emit_nested_sitemaps: bool max_depth: int sitemap_retries: int + enqueue_strategy: EnqueueStrategy timeout: timedelta | None @@ -230,6 +233,7 @@ async def _process_sitemap_item( sources: list[SitemapSource], *, emit_nested_sitemaps: bool, + enqueue_strategy: EnqueueStrategy, ) -> AsyncGenerator[SitemapUrl | NestedSitemap | None, None]: """Process a sitemap item and yield appropriate results.""" item_copy = item.copy() # Work with a copy to avoid modifying the original @@ -242,7 +246,12 @@ async def _process_sitemap_item( # Handle sitemap URL references (nested sitemaps) if item_type == 'sitemap_url' and 'url' in item_copy: sitemap_url = item_copy['url'] - if sitemap_url and sitemap_url not in visited_sitemap_urls: + parent_url = source.get('url') + if sitemap_url and sitemap_url not in visited_sitemap_urls and parent_url: + ok, reason = filter_url(target=sitemap_url, strategy=enqueue_strategy, origin=parent_url) + if not ok: + logger.warning(f'Skipping nested sitemap {sitemap_url!r} (parent {parent_url!r}): {reason}.') + return # Add to processing queue sources.append(SitemapSource(type='url', url=sitemap_url, 
depth=depth + 1)) @@ -255,9 +264,17 @@ async def _process_sitemap_item( # Determine the origin sitemap URL for tracking purposes origin_url = _get_origin_url(source) + loc = item_copy['loc'] + parent_url = source.get('url') + if parent_url and loc: + ok, reason = filter_url(target=loc, strategy=enqueue_strategy, origin=parent_url) + if not ok: + logger.warning(f'Skipping sitemap URL {loc!r} (parent {parent_url!r}): {reason}.') + return + # Create and yield the sitemap URL object yield SitemapUrl( - loc=item_copy['loc'], + loc=loc, lastmod=item_copy.get('lastmod'), changefreq=item_copy.get('changefreq'), priority=item_copy.get('priority'), @@ -272,6 +289,7 @@ async def _process_raw_source( sources: list[SitemapSource], *, emit_nested_sitemaps: bool, + enqueue_strategy: EnqueueStrategy, ) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]: """Process a raw content sitemap source.""" if 'content' not in source: @@ -285,7 +303,13 @@ async def _process_raw_source( # Process the content async for item in parser.process_chunk(content): async for result in _process_sitemap_item( - item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps + item, + source, + depth, + visited_sitemap_urls, + sources, + emit_nested_sitemaps=emit_nested_sitemaps, + enqueue_strategy=enqueue_strategy, ): if result: yield result @@ -293,7 +317,13 @@ async def _process_raw_source( # Process any remaining content async for item in parser.flush(): async for result in _process_sitemap_item( - item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps + item, + source, + depth, + visited_sitemap_urls, + sources, + emit_nested_sitemaps=emit_nested_sitemaps, + enqueue_strategy=enqueue_strategy, ): if result: yield result @@ -314,6 +344,7 @@ async def _fetch_and_process_sitemap( proxy_info: ProxyInfo | None = None, timeout: timedelta | None = None, emit_nested_sitemaps: bool, + enqueue_strategy: EnqueueStrategy, ) -> 
AsyncGenerator[SitemapUrl | NestedSitemap, None]: """Fetch a sitemap from a URL and process its content.""" if 'url' not in source: @@ -354,6 +385,7 @@ async def _fetch_and_process_sitemap( visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps, + enqueue_strategy=enqueue_strategy, ): if result: yield result @@ -367,6 +399,7 @@ async def _fetch_and_process_sitemap( visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps, + enqueue_strategy=enqueue_strategy, ): if result: yield result @@ -445,6 +478,7 @@ async def parse_sitemap( max_depth = options.get('max_depth', float('inf')) sitemap_retries = options.get('sitemap_retries', 3) timeout = options.get('timeout', timedelta(seconds=30)) + enqueue_strategy = options.get('enqueue_strategy', 'same-hostname') # Setup working state sources = list(initial_sources) @@ -463,7 +497,12 @@ async def parse_sitemap( # Process based on source type if source['type'] == 'raw': async for result in _process_raw_source( - source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps + source, + depth, + visited_sitemap_urls, + sources, + emit_nested_sitemaps=emit_nested_sitemaps, + enqueue_strategy=enqueue_strategy, ): yield result @@ -482,6 +521,7 @@ async def parse_sitemap( sources, sitemap_retries, emit_nested_sitemaps=emit_nested_sitemaps, + enqueue_strategy=enqueue_strategy, proxy_info=proxy_info, timeout=timeout, ): From e6b5671ad41a995b1c489918955ee60050c569c3 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 8 Jun 2026 15:10:45 +0200 Subject: [PATCH 2/2] Add tests for sitemap loading for different urls --- src/crawlee/_utils/sitemap.py | 1 + .../_sitemap_request_loader.py | 7 +- tests/unit/_utils/test_sitemap.py | 142 ++++++++---------- .../test_sitemap_request_loader.py | 80 ++++------ tests/unit/utils.py | 47 ++++++ 5 files changed, 151 insertions(+), 126 deletions(-) diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index 
dbf949524f..036e36a4cf 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -252,6 +252,7 @@ async def _process_sitemap_item( if not ok: logger.warning(f'Skipping nested sitemap {sitemap_url!r} (parent {parent_url!r}): {reason}.') return + # Add to processing queue sources.append(SitemapSource(type='url', url=sitemap_url, depth=depth + 1)) diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py index 664686b23f..84051d2164 100644 --- a/src/crawlee/request_loaders/_sitemap_request_loader.py +++ b/src/crawlee/request_loaders/_sitemap_request_loader.py @@ -352,7 +352,12 @@ async def _load_sitemaps(self) -> None: continue state.in_progress_sitemap_url = sitemap_url - parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3) + parse_options = ParseSitemapOptions( + max_depth=0, + emit_nested_sitemaps=True, + sitemap_retries=3, + enqueue_strategy=self._enqueue_strategy, + ) parsed_sitemap_url = URL(sitemap_url) async for item in parse_sitemap( diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py index f56094460a..ef14178bc0 100644 --- a/tests/unit/_utils/test_sitemap.py +++ b/tests/unit/_utils/test_sitemap.py @@ -18,48 +18,11 @@ parse_sitemap, ) from crawlee.http_clients._base import HttpClient, HttpResponse +from tests.unit.utils import DEFAULT_URL, get_basic_results, get_basic_sitemap if TYPE_CHECKING: from collections.abc import AsyncIterator -BASIC_SITEMAP = """ - - - -http://not-exists.com/ -2005-02-03 -monthly -0.8 - - -http://not-exists.com/catalog?item=12&desc=vacation_hawaii -weekly - - -http://not-exists.com/catalog?item=73&desc=vacation_new_zealand -2004-12-23 -weekly - - -http://not-exists.com/catalog?item=74&desc=vacation_newfoundland -2004-12-23T18:00:15+00:00 -0.3 - - -http://not-exists.com/catalog?item=83&desc=vacation_usa -2004-11-23 - - -""".strip() - -BASIC_RESULTS = { - 
'http://not-exists.com/', - 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii', - 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand', - 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland', - 'http://not-exists.com/catalog?item=83&desc=vacation_usa', -} - def _make_mock_client(url_map: dict[str, tuple[int, bytes]]) -> AsyncMock: async def send_request(url: str, **_kwargs: Any) -> HttpResponse: @@ -115,24 +78,49 @@ def encode_base64(data: bytes) -> str: async def test_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a basic sitemap.""" sitemap_url = (server_url / 'sitemap.xml').with_query( - base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' + base64=encode_base64(get_basic_sitemap(url=server_url).encode()), c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 - assert set(sitemap.urls) == BASIC_RESULTS + assert set(sitemap.urls) == get_basic_results(server_url) + + +async def test_sitemap_different_url(server_url: URL, http_client: HttpClient) -> None: + """Test loading a basic sitemap when sitemap contains links to different url. 
Those should be ignored.""" + different_url = 'https://other.com' + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=different_url).encode()), c_type='application/xml; charset=utf-8' + ) + sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) + + assert len(sitemap.urls) == 0 + + +async def test_sitemap_different_url_allowed(server_url: URL, http_client: HttpClient) -> None: + """Test loading a basic sitemap when sitemap contains links to different url, and it is explicitly allowed.""" + different_url = 'https://other.com' + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=different_url).encode()), c_type='application/xml; charset=utf-8' + ) + sitemap = await Sitemap.load( + str(sitemap_url), http_client=http_client, parse_sitemap_options={'enqueue_strategy': 'all'} + ) + + assert len(sitemap.urls) == 5 + assert set(sitemap.urls) == get_basic_results(different_url) async def test_extract_metadata_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test extracting item metadata from a sitemap.""" sitemap_url = (server_url / 'sitemap.xml').with_query( - base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8' + base64=encode_base64(get_basic_sitemap(url=server_url).encode()), c_type='application/xml; charset=utf-8' ) items = [item async for item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}], http_client=http_client)] assert len(items) == 5 assert items[0] == SitemapUrl( - loc='http://not-exists.com/', + loc=str(server_url), priority=0.8, changefreq='monthly', lastmod=datetime.fromisoformat('2005-02-03'), @@ -142,16 +130,16 @@ async def test_extract_metadata_sitemap(server_url: URL, http_client: HttpClient async def test_gzipped_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a gzipped sitemap with correct type and .xml.gz url.""" - gzipped_data = 
encode_base64(compress_gzip(BASIC_SITEMAP)) + gzipped_data = encode_base64(compress_gzip(get_basic_sitemap(url=server_url))) sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=gzipped_data, c_type='application/gzip') sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 - assert set(sitemap.urls) == BASIC_RESULTS + assert set(sitemap.urls) == get_basic_results(server_url) async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: HttpClient) -> None: """Test loading a invalid gzipped sitemap with correct type and .xml.gz url.""" - compress_data = compress_gzip(BASIC_SITEMAP) + compress_data = compress_gzip(get_basic_sitemap(url=server_url)) invalid_gzipped_data = encode_base64(compress_data[:30]) sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=invalid_gzipped_data, c_type='application/gzip') sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) @@ -163,34 +151,34 @@ async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: H async def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client: HttpClient) -> None: """Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data.""" sitemap_url = (server_url / 'sitemap.xml.gz').with_query( - base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/gzip' + base64=encode_base64(get_basic_sitemap(url=server_url).encode()), c_type='application/gzip' ) sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 - assert set(sitemap.urls) == BASIC_RESULTS + assert set(sitemap.urls) == get_basic_results(server_url) async def test_gzipped_sitemap_with_bad_type(server_url: URL, http_client: HttpClient) -> None: """Test loading a gzipped sitemap with bad type and .xml.gz url.""" - gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) + gzipped_data = encode_base64(compress_gzip(get_basic_sitemap(url=server_url))) 
sitemap_url = (server_url / 'sitemap.xml.gz').with_query( base64=gzipped_data, c_type='application/xml; charset=utf-8' ) sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 - assert set(sitemap.urls) == BASIC_RESULTS + assert set(sitemap.urls) == get_basic_results(server_url) async def test_xml_sitemap_with_gzipped_data(server_url: URL, http_client: HttpClient) -> None: """Test loading a gzipped sitemap with correct type and .xml url.""" - gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP)) + gzipped_data = encode_base64(compress_gzip(get_basic_sitemap(url=server_url))) sitemap_url = (server_url / 'sitemap.xml').with_query(base64=gzipped_data, c_type='application/gzip') sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 5 - assert set(sitemap.urls) == BASIC_RESULTS + assert set(sitemap.urls) == get_basic_results(server_url) async def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None: @@ -208,8 +196,12 @@ async def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None: """.strip() - child_sitemap = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) - child_sitemap_2 = (server_url / 'sitemap.xml.gz').with_query(base64=encode_base64(compress_gzip(BASIC_SITEMAP))) + child_sitemap = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) + child_sitemap_2 = (server_url / 'sitemap.xml.gz').with_query( + base64=encode_base64(compress_gzip(get_basic_sitemap(url=server_url))) + ) parent_sitemap_content = parent_sitemap.format(child_sitemap=child_sitemap, child_sitemap_2=child_sitemap_2) encoded_parent_sitemap_content = encode_base64(parent_sitemap_content.encode()) parent_sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_parent_sitemap_content) @@ -217,7 +209,7 @@ async def test_parent_sitemap(server_url: URL, http_client: 
HttpClient) -> None: sitemap = await Sitemap.load(str(parent_sitemap_url), http_client=http_client) assert len(sitemap.urls) == 10 - assert set(sitemap.urls) == BASIC_RESULTS + assert set(sitemap.urls) == get_basic_results(server_url) async def test_non_sitemap_url(server_url: URL, http_client: HttpClient) -> None: @@ -230,11 +222,11 @@ async def test_non_sitemap_url(server_url: URL, http_client: HttpClient) -> None async def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a sitemap with CDATA sections.""" - cdata_sitemap = """ + cdata_sitemap = f""" - + """.strip() @@ -244,14 +236,14 @@ async def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -> None: sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 1 - assert sitemap.urls == ['http://not-exists.com/catalog'] + assert sitemap.urls == [f'{server_url}catalog'] async def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> None: """Test loading a plain text sitemap.""" urls = [ - 'http://not-exists.com/catalog?item=78&desc=vacation_crete', - 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', + f'{server_url}catalog?item=78&desc=vacation_crete', + f'{server_url}catalog?item=79&desc=vacation_somalia', ] txt_sitemap_content = '\n'.join(urls) @@ -260,19 +252,19 @@ async def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> None: assert len(sitemap.urls) == 2 assert set(sitemap.urls) == { - 'http://not-exists.com/catalog?item=78&desc=vacation_crete', - 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', + f'{server_url}catalog?item=78&desc=vacation_crete', + f'{server_url}catalog?item=79&desc=vacation_somalia', } async def test_sitemap_pretty(server_url: URL, http_client: HttpClient) -> None: """Test loading a pretty-printed sitemap.""" - pretty_sitemap = """ + pretty_sitemap = f""" - http://not-exists.com/catalog?item=80&desc=vacation_turkey + 
{server_url}catalog?item=80&desc=vacation_turkey 2005-02-03 @@ -293,37 +285,33 @@ async def test_sitemap_pretty(server_url: URL, http_client: HttpClient) -> None: sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client) assert len(sitemap.urls) == 1 - assert sitemap.urls == ['http://not-exists.com/catalog?item=80&desc=vacation_turkey'] + assert sitemap.urls == [f'{server_url}catalog?item=80&desc=vacation_turkey'] async def test_sitemap_from_string() -> None: """Test creating a Sitemap instance from an XML string.""" - sitemap = await Sitemap.from_xml_string(BASIC_SITEMAP) + sitemap = await Sitemap.from_xml_string(get_basic_sitemap()) assert len(sitemap.urls) == 5 - assert set(sitemap.urls) == BASIC_RESULTS + assert set(sitemap.urls) == get_basic_results() async def test_sitemap_fetch_retries_on_transient_error() -> None: """Transient fetch errors are retried up to `sitemap_retries` times before giving up.""" - client, attempts = _make_flaky_stream_client(BASIC_SITEMAP.encode(), fail_times=2) + client, attempts = _make_flaky_stream_client(get_basic_sitemap().encode(), fail_times=2) - items = [ - item async for item in parse_sitemap([{'type': 'url', 'url': 'http://not-exists.com/sitemap.xml'}], client) - ] + items = [item async for item in parse_sitemap([{'type': 'url', 'url': f'{DEFAULT_URL}sitemap.xml'}], client)] assert len(attempts) == 3 - assert {item.loc for item in items} == BASIC_RESULTS + assert {item.loc for item in items} == get_basic_results() async def test_sitemap_fetch_raises_after_retries_exhausted() -> None: """A persistent fetch error is raised to the caller once all retries are exhausted.""" - client, attempts = _make_flaky_stream_client(BASIC_SITEMAP.encode(), fail_times=10) + client, attempts = _make_flaky_stream_client(get_basic_sitemap().encode(), fail_times=10) with pytest.raises(ConnectionError): - _ = [ - item async for item in parse_sitemap([{'type': 'url', 'url': 'http://not-exists.com/sitemap.xml'}], client) - ] + _ = [item 
async for item in parse_sitemap([{'type': 'url', 'url': f'{DEFAULT_URL}sitemap.xml'}], client)] assert len(attempts) == 3 @@ -331,9 +319,9 @@ async def test_sitemap_fetch_raises_after_retries_exhausted() -> None: async def test_parse_sitemap_with_partial_options() -> None: """Test that missing keys in partial `ParseSitemapOptions` fall back to defaults.""" options = ParseSitemapOptions(timeout=timedelta(seconds=10)) - items = [item async for item in parse_sitemap([{'type': 'raw', 'content': BASIC_SITEMAP}], options=options)] + items = [item async for item in parse_sitemap([{'type': 'raw', 'content': get_basic_sitemap()}], options=options)] - assert {item.loc for item in items} == BASIC_RESULTS + assert {item.loc for item in items} == get_basic_results() async def test_discover_sitemap_from_robots_txt() -> None: diff --git a/tests/unit/request_loaders/test_sitemap_request_loader.py b/tests/unit/request_loaders/test_sitemap_request_loader.py index 0fd77cae59..abfd83cd62 100644 --- a/tests/unit/request_loaders/test_sitemap_request_loader.py +++ b/tests/unit/request_loaders/test_sitemap_request_loader.py @@ -9,41 +9,11 @@ from crawlee.http_clients._base import HttpClient from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader from crawlee.storages import KeyValueStore -from tests.unit.utils import poll_until_condition +from tests.unit.utils import get_basic_results, get_basic_sitemap, poll_until_condition if TYPE_CHECKING: from crawlee._types import JsonSerializable -BASIC_SITEMAP = """ - - - -http://not-exists.com/ -2005-02-03 -monthly -0.8 - - -http://not-exists.com/catalog?item=12&desc=vacation_hawaii -weekly - - -http://not-exists.com/catalog?item=73&desc=vacation_new_zealand -2004-12-23 -weekly - - -http://not-exists.com/catalog?item=74&desc=vacation_newfoundland -2004-12-23T18:00:15+00:00 -0.3 - - -http://not-exists.com/catalog?item=83&desc=vacation_usa -2004-11-23 - - -""".strip() - def compress_gzip(data: str) -> bytes: """Compress a 
string using gzip.""" @@ -56,7 +26,9 @@ def encode_base64(data: bytes) -> str: async def test_sitemap_traversal(server_url: URL, http_client: HttpClient) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, enqueue_strategy='all') while not await sitemap_loader.is_finished(): @@ -72,7 +44,9 @@ async def test_sitemap_traversal(server_url: URL, http_client: HttpClient) -> No async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL, http_client: HttpClient) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, enqueue_strategy='all') items = [] @@ -95,7 +69,9 @@ async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL, h async def test_abort_sitemap_loading(server_url: URL, http_client: HttpClient) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) sitemap_loader = SitemapRequestLoader( [str(sitemap_url)], max_buffer_size=2, http_client=http_client, enqueue_strategy='all' ) @@ -119,7 +95,9 @@ async def test_abort_sitemap_loading(server_url: URL, http_client: HttpClient) - async def test_create_persist_state_for_sitemap_loading( server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore ) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + 
sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) persist_key = 'create_persist_state' sitemap_loader = SitemapRequestLoader( [str(sitemap_url)], http_client=http_client, persist_state_key=persist_key, enqueue_strategy='all' @@ -137,7 +115,9 @@ async def test_create_persist_state_for_sitemap_loading( async def test_data_persistence_for_sitemap_loading( server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore ) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) persist_key = 'data_persist_state' sitemap_loader = SitemapRequestLoader( [str(sitemap_url)], http_client=http_client, persist_state_key=persist_key, enqueue_strategy='all' @@ -159,7 +139,9 @@ async def test_data_persistence_for_sitemap_loading( async def test_recovery_data_persistence_for_sitemap_loading( server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore ) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) persist_key = 'recovery_persist_state' sitemap_loader = SitemapRequestLoader( [str(sitemap_url)], http_client=http_client, persist_state_key=persist_key, enqueue_strategy='all' @@ -188,7 +170,9 @@ async def test_recovery_data_persistence_for_sitemap_loading( async def test_transform_request_function(server_url: URL, http_client: HttpClient) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) def transform_request(request_options: 
RequestOptions) -> RequestOptions | RequestTransformAction: user_data: dict[str, JsonSerializable] = {'transformed': True} @@ -215,17 +199,13 @@ def transform_request(request_options: RequestOptions) -> RequestOptions | Reque await sitemap_loader.mark_request_as_handled(request) assert len(extracted_urls) == 5 - assert extracted_urls == { - 'http://not-exists.com/', - 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii', - 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand', - 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland', - 'http://not-exists.com/catalog?item=83&desc=vacation_usa', - } + assert extracted_urls == get_basic_results(server_url) async def test_transform_request_function_with_skip(server_url: URL, http_client: HttpClient) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) def transform_request(_request_options: RequestOptions) -> RequestOptions | RequestTransformAction: return 'skip' @@ -255,7 +235,9 @@ async def test_sitemap_loader_to_tandem( server_url: URL, http_client: HttpClient, ) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query( + base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, enqueue_strategy='all') request_manager = await sitemap_loader.to_tandem() @@ -277,7 +259,9 @@ async def test_sitemap_loader_to_tandem_with_request_dropped( server_url: URL, http_client: HttpClient, ) -> None: - sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode())) + sitemap_url = (server_url / 'sitemap.xml').with_query( + 
base64=encode_base64(get_basic_sitemap(url=server_url).encode()) + ) sitemap_loader = SitemapRequestLoader( [str(sitemap_url)], diff --git a/tests/unit/utils.py b/tests/unit/utils.py index a965d3cc7b..02f3ece24b 100644 --- a/tests/unit/utils.py +++ b/tests/unit/utils.py @@ -11,6 +11,8 @@ if TYPE_CHECKING: from collections.abc import Awaitable, Callable + from yarl import URL + T = TypeVar('T') run_alone_on_mac = pytest.mark.run_alone if sys.platform == 'darwin' else lambda x: x @@ -73,3 +75,48 @@ async def poll_until_condition( delay *= backoff_factor result = await maybe_await(fn()) return result + + +DEFAULT_URL = 'http://not-exists.com/' + + +def get_basic_sitemap(url: str | URL = DEFAULT_URL) -> str: + return """ + + + + {url} + 2005-02-03 + monthly + 0.8 + + + {url}catalog?item=12&desc=vacation_hawaii + weekly + + + {url}catalog?item=73&desc=vacation_new_zealand + 2004-12-23 + weekly + + + {url}catalog?item=74&desc=vacation_newfoundland + 2004-12-23T18:00:15+00:00 + 0.3 + + + {url}catalog?item=83&desc=vacation_usa + 2004-11-23 + + + """.strip().format(url=url) + + +def get_basic_results(server_url: str | URL = DEFAULT_URL) -> set[str]: + return { + str(server_url), + f'{server_url}catalog?item=12&desc=vacation_hawaii', + f'{server_url}catalog?item=73&desc=vacation_new_zealand', + f'{server_url}catalog?item=74&desc=vacation_newfoundland', + f'{server_url}catalog?item=83&desc=vacation_usa', + }