add support for **kwargs in HttpReader (meta-pytorch#392)

msaroufim · facebook-github-bot · commit a7f89365d380 · 2022-05-11T12:26:57.000-07:00
Summary: Please read through our [contribution guide](https://github.com/pytorch/data/blob/main/CONTRIBUTING.md) prior to creating your pull request. - Note that there is a section on requirements related to adding a new DataPipe. Fixes meta-pytorch#391 ### Changes - Added support for `**kwarg` for `request` parameters in `HttpReader` Although it doesn't look like the HTTP dataset pipe is tested anywhere so I'll take a closer look and add one Pull Request resolved: meta-pytorch#392 Reviewed By: NivekT, ejguan Differential Revision: D36287860 Pulled By: msaroufim fbshipit-source-id: daac6c6e3a57608928c8e262f7cdc4d5a349ee03
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -52,7 +52,7 @@ conda install pytorch -c pytorch-nightly
 git clone https://github.com/pytorch/data.git
 cd data
 python setup.py develop
-pip install flake8 typing mypy pytest
+pip install flake8 typing mypy pytest expecttest
 ```
 
 ## Pull Requests
diff --git a/test/test_remote_io.py b/test/test_remote_io.py
@@ -41,7 +41,9 @@ def test_http_reader_iterdatapipe(self):
         file_url = "https://raw.githubusercontent.com/pytorch/data/main/LICENSE"
         expected_file_name = "LICENSE"
         expected_MD5_hash = "bb9675028dd39d2dd2bf71002b93e66c"
-        http_reader_dp = HttpReader(IterableWrapper([file_url]))
+        query_params = {"auth": ("fake_username", "fake_password"), "allow_redirects": True}
+        timeout = 120
+        http_reader_dp = HttpReader(IterableWrapper([file_url]), timeout=timeout, **query_params)
 
         # Functional Test: test if the Http Reader can download and read properly
         reader_dp = http_reader_dp.readlines()
diff --git a/torchdata/datapipes/iter/load/online.py b/torchdata/datapipes/iter/load/online.py
@@ -7,7 +7,7 @@
 import re
 import urllib
 
-from typing import Dict, Iterator, Optional, Tuple
+from typing import Any, Dict, Iterator, Optional, Tuple
 
 import requests
 
@@ -33,14 +33,16 @@ def _get_proxies() -> Optional[Dict[str, str]]:
     return None
 
 
-def _get_response_from_http(url: str, *, timeout: Optional[float]) -> Tuple[str, StreamWrapper]:
+def _get_response_from_http(
+    url: str, *, timeout: Optional[float], **query_params: Optional[Dict[str, Any]]
+) -> Tuple[str, StreamWrapper]:
     try:
         with requests.Session() as session:
             proxies = _get_proxies()
             if timeout is None:
-                r = session.get(url, stream=True, proxies=proxies)
+                r = session.get(url, stream=True, proxies=proxies, **query_params)
             else:
-                r = session.get(url, timeout=timeout, stream=True, proxies=proxies)
+                r = session.get(url, timeout=timeout, stream=True, proxies=proxies, **query_params)
         return url, StreamWrapper(r.raw)
     except HTTPError as e:
         raise Exception(f"Could not get the file. [HTTP Error] {e.response}.")
@@ -59,11 +61,14 @@ class HTTPReaderIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
     Args:
         source_datapipe: a DataPipe that contains URLs
         timeout: timeout in seconds for HTTP request
+        **kwargs: a Dictionary to pass optional arguments that requests takes. For the full list check out https://docs.python-requests.org/en/master/api/
 
     Example:
         >>> from torchdata.datapipes.iter import IterableWrapper, HttpReader
         >>> file_url = "https://raw.githubusercontent.com/pytorch/data/main/LICENSE"
-        >>> http_reader_dp = HttpReader(IterableWrapper([file_url]))
+        >>> query_params = {"auth" : ("fake_username", "fake_password"), "allow_redirects" : True}
+        >>> timeout = 120
+        >>> http_reader_dp = HttpReader(IterableWrapper([file_url]), timeout=timeout, query_params)
         >>> reader_dp = http_reader_dp.readlines()
         >>> it = iter(reader_dp)
         >>> path, line = next(it)
@@ -73,13 +78,19 @@ class HTTPReaderIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
         b'BSD 3-Clause License'
     """
 
-    def __init__(self, source_datapipe: IterDataPipe[str], timeout: Optional[float] = None) -> None:
+    def __init__(
+        self, source_datapipe: IterDataPipe[str], timeout: Optional[float] = None, **kwargs: Optional[Dict[str, Any]]
+    ) -> None:
         self.source_datapipe: IterDataPipe[str] = source_datapipe
         self.timeout = timeout
+        self.query_params = kwargs
 
     def __iter__(self) -> Iterator[Tuple[str, StreamWrapper]]:
         for url in self.source_datapipe:
-            yield _get_response_from_http(url, timeout=self.timeout)
+            if self.query_params:
+                yield _get_response_from_http(url, timeout=self.timeout, **self.query_params)
+            else:
+                yield _get_response_from_http(url, timeout=self.timeout)
 
     def __len__(self) -> int:
         return len(self.source_datapipe)