-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Expand file tree
/
Copy pathtest_fetch_node_timeout.py
More file actions
241 lines (199 loc) · 8.42 KB
/
test_fetch_node_timeout.py
File metadata and controls
241 lines (199 loc) · 8.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
"""
Unit tests for FetchNode timeout functionality.
These tests verify that:
1. The timeout configuration is properly read and stored
2. HTTP requests use the configured timeout
3. PDF parsing respects the timeout
4. Timeout is propagated to ChromiumLoader via loader_kwargs
"""
import sys
import time
import unittest
from unittest.mock import Mock, patch, MagicMock
from pathlib import Path
# Add the project root to path to import modules
sys.path.insert(0, str(Path(__file__).parent.parent))
class TestFetchNodeTimeout(unittest.TestCase):
"""Test suite for FetchNode timeout configuration and usage."""
def setUp(self):
"""Set up test fixtures."""
# Mock all the heavy external dependencies at import time
self.mock_modules = {}
for module in ['langchain_core', 'langchain_core.documents',
'langchain_community', 'langchain_community.document_loaders',
'langchain_openai', 'minify_html', 'pydantic',
'langchain', 'langchain.prompts']:
if module not in sys.modules:
sys.modules[module] = MagicMock()
# Create mock Document class
class MockDocument:
def __init__(self, page_content, metadata=None):
self.page_content = page_content
self.metadata = metadata or {}
sys.modules['langchain_core.documents'].Document = MockDocument
# Create mock PyPDFLoader
class MockPyPDFLoader:
def __init__(self, source):
self.source = source
def load(self):
time.sleep(0.1) # Simulate some work
return [MockDocument(page_content=f"PDF content from {self.source}")]
sys.modules['langchain_community.document_loaders'].PyPDFLoader = MockPyPDFLoader
# Now import FetchNode
from scrapegraphai.nodes.fetch_node import FetchNode
self.FetchNode = FetchNode
def tearDown(self):
"""Clean up after tests."""
# Remove mocked modules
for module in list(sys.modules.keys()):
if 'langchain' in module or module in ['minify_html', 'pydantic']:
if module in self.mock_modules or module.startswith('langchain'):
sys.modules.pop(module, None)
def test_timeout_default_value(self):
"""Test that default timeout is set to 30 seconds."""
node = self.FetchNode(
input="url",
output=["doc"],
node_config={}
)
self.assertEqual(node.timeout, 30)
def test_timeout_custom_value(self):
"""Test that custom timeout value is properly stored."""
node = self.FetchNode(
input="url",
output=["doc"],
node_config={"timeout": 10}
)
self.assertEqual(node.timeout, 10)
def test_timeout_none_value(self):
"""Test that timeout can be disabled by setting to None."""
node = self.FetchNode(
input="url",
output=["doc"],
node_config={"timeout": None}
)
self.assertIsNone(node.timeout)
def test_timeout_no_config(self):
"""Test that timeout defaults to 30 when no node_config provided."""
node = self.FetchNode(
input="url",
output=["doc"],
node_config=None
)
self.assertEqual(node.timeout, 30)
@patch('scrapegraphai.nodes.fetch_node.requests')
def test_requests_get_with_timeout(self, mock_requests):
"""Test that requests.get is called with timeout when use_soup=True."""
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = "<html><body>Test content</body></html>"
mock_requests.get.return_value = mock_response
node = self.FetchNode(
input="url",
output=["doc"],
node_config={"use_soup": True, "timeout": 15}
)
# Execute with a URL
state = {"url": "https://example.com"}
node.execute(state)
# Verify requests.get was called with timeout
mock_requests.get.assert_called_once()
call_args = mock_requests.get.call_args
self.assertEqual(call_args[1].get('timeout'), 15)
@patch('scrapegraphai.nodes.fetch_node.requests')
def test_requests_get_without_timeout_when_none(self, mock_requests):
"""Test that requests.get is called without timeout argument when timeout=None."""
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = "<html><body>Test content</body></html>"
mock_requests.get.return_value = mock_response
node = self.FetchNode(
input="url",
output=["doc"],
node_config={"use_soup": True, "timeout": None}
)
# Execute with a URL
state = {"url": "https://example.com"}
node.execute(state)
# Verify requests.get was called without timeout
mock_requests.get.assert_called_once()
call_args = mock_requests.get.call_args
self.assertNotIn('timeout', call_args[1])
def test_pdf_parsing_with_timeout(self):
"""Test that PDF parsing completes within timeout."""
node = self.FetchNode(
input="pdf",
output=["doc"],
node_config={"timeout": 5}
)
# Execute with a PDF file
state = {"pdf": "test.pdf"}
result = node.execute(state)
# Should complete successfully
self.assertIn("doc", result)
self.assertIsNotNone(result["doc"])
def test_pdf_parsing_timeout_exceeded(self):
"""Test that PDF parsing raises TimeoutError when timeout is exceeded."""
# Create a mock loader that takes longer than timeout
class SlowPyPDFLoader:
def __init__(self, source):
self.source = source
def load(self):
time.sleep(2) # Sleep longer than timeout
return []
with patch('scrapegraphai.nodes.fetch_node.PyPDFLoader', SlowPyPDFLoader):
node = self.FetchNode(
input="pdf",
output=["doc"],
node_config={"timeout": 0.5} # Very short timeout
)
# Execute should raise TimeoutError
state = {"pdf": "slow.pdf"}
with self.assertRaises(TimeoutError) as context:
node.execute(state)
self.assertIn("PDF parsing exceeded timeout", str(context.exception))
@patch('scrapegraphai.nodes.fetch_node.ChromiumLoader')
def test_timeout_propagated_to_chromium_loader(self, mock_loader_class):
"""Test that timeout is propagated to ChromiumLoader via loader_kwargs."""
mock_loader = Mock()
mock_doc = Mock()
mock_doc.page_content = "<html>Test</html>"
mock_loader.load.return_value = [mock_doc]
mock_loader_class.return_value = mock_loader
node = self.FetchNode(
input="url",
output=["doc"],
node_config={"timeout": 20, "headless": True}
)
# Execute with a URL (not using soup, so ChromiumLoader is used)
state = {"url": "https://example.com"}
node.execute(state)
# Verify ChromiumLoader was instantiated with timeout in kwargs
mock_loader_class.assert_called_once()
call_kwargs = mock_loader_class.call_args[1]
self.assertEqual(call_kwargs.get('timeout'), 20)
@patch('scrapegraphai.nodes.fetch_node.ChromiumLoader')
def test_timeout_not_overridden_in_loader_kwargs(self, mock_loader_class):
"""Test that existing timeout in loader_kwargs is not overridden."""
mock_loader = Mock()
mock_doc = Mock()
mock_doc.page_content = "<html>Test</html>"
mock_loader.load.return_value = [mock_doc]
mock_loader_class.return_value = mock_loader
node = self.FetchNode(
input="url",
output=["doc"],
node_config={
"timeout": 20,
"loader_kwargs": {"timeout": 50} # Explicit loader timeout
}
)
# Execute with a URL
state = {"url": "https://example.com"}
node.execute(state)
# Verify ChromiumLoader got the loader_kwargs timeout, not node timeout
mock_loader_class.assert_called_once()
call_kwargs = mock_loader_class.call_args[1]
self.assertEqual(call_kwargs.get('timeout'), 50)
if __name__ == '__main__':
unittest.main()