Made ResultsParser handle sequences of XML fragments.

Frederick Ross · Frederick Ross · commit 9f86ab2cba4f · 2012-05-21T11:34:24.000-07:00
Added an export endpoint method to Jobs.
diff --git a/splunklib/binding.py b/splunklib/binding.py
@@ -12,7 +12,9 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-"""This module contains a low-level *binding* interface to the `Splunk REST API
+"""A low-level binding to Splunk's REST API.
+
+This module contains a low-level *binding* interface to the `Splunk REST API
 <http://docs.splunk.com/Documentation/Splunk/latest/RESTAPI/RESTcontents>`_.
 
 This module is designed to enable client-side interaction with the Splunk
@@ -218,17 +220,17 @@ def namespace(**kwargs):
     if sharing in ["system"]:
         return record({
             'sharing': sharing, 
-            'owner': "nobody", 
+                'owner': "nobody", 
             'app': "system" })
     if sharing in ["global", "app"]:
         return record({ 
             'sharing': sharing, 
-            'owner': "nobody", 
+                'owner': "nobody", 
             'app': kwargs.get('app', None)})
     if sharing in ["user", None]:
         return record({
             'sharing': sharing, 
-            'owner': kwargs.get('owner', None),
+                'owner': kwargs.get('owner', None),
             'app': kwargs.get('app', None)})
     raise ValueError("Invalid value for argument: 'sharing'")
 
diff --git a/splunklib/client.py b/splunklib/client.py
@@ -1420,6 +1420,46 @@ def create(self, query, **kwargs):
         sid = _load_sid(response)
         return Job(self.service, PATH_JOBS + sid)
 
+    def export(self, query, **params):
+        """Run a search and immediately start streaming preview events.
+
+        Returns an InputStream over the events. The InputStream
+        streams XML fragments from the server. The SDK provides
+        ``results.ResultsReader`` to lazily parse this stream into
+        usable Python objects. For example::
+
+            import splunklib.client as client
+            import splunklib.results as results
+            s = client.connect(...)
+            r = results.ResultsReader(s.jobs.export("search * | head 5"))
+            assert r.is_preview == False # The job is finished when we get here
+            for kind, event in r:
+                assert kind == 'RESULT'
+                # events are returned as dicts with strings as values.
+                print event 
+
+        ``export`` makes a single roundtrip to the server (as opposed
+        to two for create followed by preview), plus at most two more
+        if autologin is turned on.
+
+        :raises SyntaxError: on invalid queries.
+
+        :param query: Splunk search language query to run
+        :type query: ``str``
+        :param params: Additional arguments to export (see the `REST API docs <http://docs.splunk.com/Documentation/Splunk/4.3.2/RESTAPI/RESTsearch#search.2Fjobs.2Fexport>`_).
+        :returns: InputStream over raw XML returned from the server.
+        """
+        if "exec_mode" in params:
+            raise TypeError("Cannot specify an exec_mode to export.")
+        try:
+            return self.post(path_segment="export", search=query, **params).body
+        except HTTPError as he:
+            if he.status == 400 and 'Search operation' in str(he):
+                raise SyntaxError(str(he))
+            else:
+                raise
+
+
     def oneshot(self, query, **params):
         """Run a search and directly return an InputStream IO handle over the results.
 
diff --git a/splunklib/data.py b/splunklib/data.py
@@ -12,10 +12,15 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-"""This module provides an Atom Feed response loader.
+"""Code to read responses from splunkd in Atom Feed format.
 
-A simple :func:`load` utility reads Atom Feed XML data (the format returned by
-the Splunk REST API), and converts it to a native Python dictionary or list.
+The Splunk REST API largely returns Atom (except in a few places where
+there are historical inconsistencies that will be ironed out in future
+versions).
+
+This module provides one function, :func:`load`, which reads a string
+containing the XML of an Atom Feed and returns a dictionary or list
+containing the corresponding data.
 """
 
 from xml.etree.ElementTree import XML
@@ -56,14 +61,16 @@ def localname(xname):
     return xname if rcurly == -1 else xname[rcurly+1:]
 
 def load(text, match=None):
-    """Loads XML text into a native Python structure (*dict* or *list*). If you
-    provide an optional **match** string (a tag name or path), only the matching
-    sub-elements are loaded. 
-
-    :param `text`: The XML text to load.
-    :type `text`: string
-    :param `match`: A tag name or path to match (optional).
-    :type `match`: string
+    """Extract Python data structures from Atom XML in the string *text*.
+
+    Loads XML text into a native Python structure (`dict` or `list`).
+    If you provide an optional *match* string (a tag name or path),
+    only the matching sub-elements are loaded.
+
+    :param text: The XML text to load.
+    :type text: string
+    :param match: A tag name or path to match (optional).
+    :type match: string
     """
     if text is None: return None
     text = text.strip()
diff --git a/splunklib/results.py b/splunklib/results.py
@@ -36,18 +36,101 @@
     import xml.etree.ElementTree as et
 
 from collections import OrderedDict
+try:
+    from cStringIO import StringIO
+except:
+    from StringIO import StringIO
 
 __all__ = [
     "ResultsReader",
     "Message"
 ]
 
 class Message(object):
+    """Messages returned in splunkd's XML.
+
+    **Example**::
+
+        m = Message("DEBUG", "There's something in that variable...")
+    """
     def __init__(self, type_, message):
         self.type = type_
         self.message = message
     def __repr__(self):
-        print "%s: %s" % (type, message)
+        return "%s: %s" % (self.type, self.message)
+
+class ConcatenatedStream(object):
+    """Lazily concatenate zero or more streams into a stream.
+
+    As you read from the concatenated stream, you get characters from
+    each stream passed to ``ConcatenatedStream``, in order.
+
+    **Example**::
+
+        from StringIO import StringIO
+        s = ConcatenatedStream(StringIO("abc"), StringIO("def"))
+        assert s.read() == "abcdef"
+    """
+    def __init__(self, *streams):
+        self.streams = list(streams)
+
+    def read(self, n=None):
+        """Read at most *n* characters from this stream.
+
+        If *n* is ``None``, return all available characters.
+        """
+        response = ""
+        while len(self.streams) > 0 and (n is None or n > 0):
+            txt = self.streams[0].read(n)
+            response += txt
+            if n is not None:
+                n -= len(txt)
+            if n > 0 or n is None:
+                del self.streams[0]
+        return response
+
+class XMLDTDFilter(object):
+    """Lazily remove all XML declarations (``<?...>``) from a stream.
+
+    Every substring that starts with ``<?`` and runs through the next
+    ``>`` is removed in its entirety from the stream. No regular
+    expressions are used, however, so everything still streams properly.
+
+    **Example**::
+
+        from StringIO import StringIO
+        s = XMLDTDFilter(StringIO("<?xml abcd><element><?xml ...></element>"))
+        assert s.read() == "<element></element>"
+    """
+    def __init__(self, stream):
+        self.stream = stream
+
+    def read(self, n=None):
+        """Read at most *n* characters from this stream.
+
+        If *n* is ``None``, return all available characters.
+        """
+        response = ""
+        while n is None or n > 0:
+            c = self.stream.read(1)
+            if c == "":
+                break
+            elif c == "<":
+                c += self.stream.read(1)
+                if c == "<?":
+                    while True:
+                        q = self.stream.read(1)
+                        if q == ">":
+                            break
+                else:
+                    response += c
+                    if n is not None:
+                        n -= len(c)
+            else:
+                response += c
+                if n is not None:
+                    n -= 1
+        return response
 
 class ResultsReader(object):
     """Lazily yield dicts from a streaming XML results stream.
@@ -84,15 +167,26 @@ class ResultsReader(object):
     # function creating that generator. Thus it's all wrapped up for
     # the sake of one field.
     def __init__(self, stream):
-            self._gen = self.parse_results(stream)
-            # splunkd 4.3 returns an empty response body instead of a
-            # results element with no result elements inside. There is
-            # no good way to handle it other than failing out and
-            # trying to get to a sane state.
-            try:
-                self.is_preview = self._gen.next()
-            except StopIteration:
-                self.is_preview = None
+        # The search/jobs/export endpoint, when run with
+        # earliest_time=rt and latest_time=rt, streams a sequence of
+        # XML documents, each containing a result, as opposed to one
+        # results element containing lots of results. Python's XML
+        # parsers are broken, and instead of reading one full document
+        # and returning the stream that follows untouched, they
+        # destroy the stream and throw an error. To get around this,
+        # we remove all the DTD definitions inline, then wrap the
+        # fragments in a fictional <doc> element to make the parser happy.
+        stream = XMLDTDFilter(stream)
+        stream = ConcatenatedStream(StringIO("<doc>"), stream, StringIO("</doc>"))
+        self._gen = self.parse_results(stream)
+        # splunkd 4.3 returns an empty response body instead of a
+        # results element with no result elements inside. There is
+        # no good way to handle it other than failing out and
+        # trying to get to a sane state.
+        try:
+            self.is_preview = self._gen.next()
+        except StopIteration:
+            self.is_preview = None
 
     def __iter__(self):
         return self
@@ -146,7 +240,7 @@ def parse_results(self, stream):
     
                 elif elem.tag == 'msg':
                     if event == 'start':
-                        msg_type = elem.attribs['type']
+                        msg_type = elem.attrib['type']
                     elif event == 'end':
                         yield Message(msg_type, elem.text.encode('utf8'))
                         elem.clear()
@@ -160,3 +254,4 @@ def parse_results(self, stream):
 
 
 
+
diff --git a/tests/test_job.py b/tests/test_job.py
@@ -90,8 +90,17 @@ def test_crud(self):
         self.assertTrue(isinstance(result.next(), dict))
         self.assertTrue(len(list(result)) <= 3)
         
+        result = results.ResultsReader(jobs.export("search index=_internal earliest=-1m | head 3"))
+        self.assertEqual(result.is_preview, False)
+        d = result.next()
+        print d
+        self.assertTrue(isinstance(d, dict) or isinstance(d, results.Message))
+        self.assertTrue(len(list(d for d in result if isinstance(d, dict))) <= 3)
+
         self.assertRaises(SyntaxError, jobs.oneshot, "asdaf;lkj2r23=")
 
+        self.assertRaises(SyntaxError, jobs.export, "asdaf;lkj2r23=")
+
         # Make sure we can create a job
         job = jobs.create("search index=sdk-tests earliest=-1m | head 1")
         self.assertTrue(jobs.contains(job.sid))
@@ -176,5 +185,30 @@ def test_results(self):
         self.assertTrue(isinstance(result, dict))
         self.assertEqual(int(result["count"]), 1)
 
+    def test_results_reader(self):
+        # Run jobs.export("search index=_internal | stats count",
+        # earliest_time="rt", latest_time="rt") and you get a
+        # streaming sequence of XML fragments containing results.
+        with open('streaming_results.xml') as input:
+            reader = results.ResultsReader(input)
+            print reader.next()
+            self.assertTrue(isinstance(reader.next(), dict))
+
+    def test_xmldtd_filter(self):
+        from StringIO import StringIO
+        s = results.XMLDTDFilter(StringIO("<?xml asdf awe awdf=""><boris>Other stuf</boris><?xml dafawe \n asdfaw > ab"))
+        self.assertEqual(s.read(3), "<bo")
+        self.assertEqual(s.read(), "ris>Other stuf</boris> ab")
+
+
+    def test_concatenated_stream(self):
+        from StringIO import StringIO
+        s = results.ConcatenatedStream(StringIO("This is a test "), 
+                                       StringIO("of the emergency broadcast system."))
+        self.assertEqual(s.read(3), "Thi")
+        self.assertEqual(s.read(20), 's is a test of the e')
+        self.assertEqual(s.read(), 'mergency broadcast system.')
+            
+
 if __name__ == "__main__":
     testlib.main()