From 1577ff243b6749b035ba712ae658da6828e3508f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 3 Jun 2026 14:59:17 +0200 Subject: [PATCH] fix: Avoid duplicate processed requests in memory request queue client The 'already present but not yet handled' branch in MemoryRequestQueueClient.add_batch_of_requests appended a ProcessedRequest and then fell through to the unconditional append, producing two identical entries for one input request. Remove the inner append so each input yields exactly one entry, matching the file system client. --- .../_memory/_request_queue_client.py | 8 -------- .../_memory/test_memory_rq_client.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index a14b78934e..90b47c63d8 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -190,14 +190,6 @@ async def add_batch_of_requests( # Add updated request back to queue. self._pending_requests.appendleft(request) - processed_requests.append( - ProcessedRequest( - unique_key=request.unique_key, - was_already_present=True, - was_already_handled=False, - ) - ) - # Add the new request to the queue. else: if forefront: diff --git a/tests/unit/storage_clients/_memory/test_memory_rq_client.py b/tests/unit/storage_clients/_memory/test_memory_rq_client.py index 1846712084..bc04b80926 100644 --- a/tests/unit/storage_clients/_memory/test_memory_rq_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_rq_client.py @@ -43,6 +43,23 @@ async def test_memory_specific_purge_behavior() -> None: assert await rq_client2.is_empty() is True +async def test_add_existing_pending_request_returns_single_processed_request( + rq_client: MemoryRequestQueueClient, +) -> None: + """Test that re-adding a pending (not handled) request yields exactly one `ProcessedRequest` entry.""" + request = Request.from_url('https://example.com') + await rq_client.add_batch_of_requests([request]) + + # Re-add the same request while it is still pending (not handled, not in progress). + response = await rq_client.add_batch_of_requests([request]) + + assert len(response.processed_requests) == 1 + processed_request = response.processed_requests[0] + assert processed_request.unique_key == request.unique_key + assert processed_request.was_already_present is True + assert processed_request.was_already_handled is False + + async def test_memory_metadata_updates(rq_client: MemoryRequestQueueClient) -> None: """Test that metadata timestamps are updated correctly in memory storage.""" # Record initial timestamps