From f626fff92a4b6f1d79d8f959d8c94d258886862b Mon Sep 17 00:00:00 2001 From: Pierre Quentel Date: Tue, 4 Apr 2017 15:55:29 +0200 Subject: [PATCH 1/5] Rewrite parse_multipart to make it consistent with FieldStorage --- Doc/library/cgi.rst | 27 +++++------ Lib/cgi.py | 107 +++++++------------------------------------ Lib/test/test_cgi.py | 4 +- 3 files changed, 33 insertions(+), 105 deletions(-) diff --git a/Doc/library/cgi.rst b/Doc/library/cgi.rst index 41219eeaabae08..93363d353922ae 100644 --- a/Doc/library/cgi.rst +++ b/Doc/library/cgi.rst @@ -294,21 +294,22 @@ algorithms implemented in this module in other circumstances. This function is deprecated in this module. Use :func:`urllib.parse.parse_qsl` instead. It is maintained here only for backward compatibility. -.. function:: parse_multipart(fp, pdict) +.. function:: parse_multipart(fp, pdict, encoding="utf-8") Parse input of type :mimetype:`multipart/form-data` (for file uploads). - Arguments are *fp* for the input file and *pdict* for a dictionary containing - other parameters in the :mailheader:`Content-Type` header. - - Returns a dictionary just like :func:`urllib.parse.parse_qs` keys are the field names, each - value is a list of values for that field. This is easy to use but not much good - if you are expecting megabytes to be uploaded --- in that case, use the - :class:`FieldStorage` class instead which is much more flexible. - - Note that this does not parse nested multipart parts --- use - :class:`FieldStorage` for that. - - + Arguments are *fp* for the input file, *pdict* for a dictionary containing + other parameters in the :mailheader:`Content-Type` header, and *encoding*, + the request encoding. + + Returns a dictionary just like :func:`urllib.parse.parse_qs`: keys are the + field names, each value is a list of values for that field. For non-file + fields, the value is a list of strings. + + This is easy to use but not much good if you are expecting megabytes to be + uploaded --- in that case, use the :class:`FieldStorage` class instead + which is much more flexible. + + .. function:: parse_header(string) Parse a MIME header (such as :mailheader:`Content-Type`) into a main value and a diff --git a/Lib/cgi.py b/Lib/cgi.py index 233a496e8170ab..0655336c861d99 100755 --- a/Lib/cgi.py +++ b/Lib/cgi.py @@ -198,105 +198,32 @@ def parse_qsl(qs, keep_blank_values=0, strict_parsing=0): DeprecationWarning, 2) return urllib.parse.parse_qsl(qs, keep_blank_values, strict_parsing) -def parse_multipart(fp, pdict): +def parse_multipart(fp, pdict, encoding="utf-8"): """Parse multipart input. Arguments: fp : input file pdict: dictionary containing other parameters of content-type header + encoding: request encoding Returns a dictionary just like parse_qs(): keys are the field names, each - value is a list of values for that field. This is easy to use but not - much good if you are expecting megabytes to be uploaded -- in that case, - use the FieldStorage class instead which is much more flexible. Note - that content-type is the raw, unparsed contents of the content-type - header. - - XXX This does not parse nested multipart parts -- use FieldStorage for - that. - - XXX This should really be subsumed by FieldStorage altogether -- no - point in having two implementations of the same parsing algorithm. - Also, FieldStorage protects itself better against certain DoS attacks - by limiting the size of the data read in one chunk. The API here - does not support that kind of protection. This also affects parse() - since it can call parse_multipart(). + value is a list of values for that field. For non-file fields, the value + is a list of strings. + + This is easy to use but not much good if you are expecting megabytes to be + uploaded -- in that case, use the FieldStorage class instead which is much + more flexible. Note that content-type is the raw, unparsed contents of + the content-type header. """ - import http.client - - boundary = b"" - if 'boundary' in pdict: - boundary = pdict['boundary'] - if not valid_boundary(boundary): - raise ValueError('Invalid boundary in multipart form: %r' - % (boundary,)) - - nextpart = b"--" + boundary - lastpart = b"--" + boundary + b"--" - partdict = {} - terminator = b"" - - while terminator != lastpart: - bytes = -1 - data = None - if terminator: - # At start of next part. Read headers first. - headers = http.client.parse_headers(fp) - clength = headers.get('content-length') - if clength: - try: - bytes = int(clength) - except ValueError: - pass - if bytes > 0: - if maxlen and bytes > maxlen: - raise ValueError('Maximum content length exceeded') - data = fp.read(bytes) - else: - data = b"" - # Read lines until end of part. - lines = [] - while 1: - line = fp.readline() - if not line: - terminator = lastpart # End outer loop - break - if line.startswith(b"--"): - terminator = line.rstrip() - if terminator in (nextpart, lastpart): - break - lines.append(line) - # Done with part. - if data is None: - continue - if bytes < 0: - if lines: - # Strip final line terminator - line = lines[-1] - if line[-2:] == b"\r\n": - line = line[:-2] - elif line[-1:] == b"\n": - line = line[:-1] - lines[-1] = line - data = b"".join(lines) - line = headers['content-disposition'] - if not line: - continue - key, params = parse_header(line) - if key != 'form-data': - continue - if 'name' in params: - name = params['name'] - else: - continue - if name in partdict: - partdict[name].append(data) - else: - partdict[name] = [data] - - return partdict - + boundary = pdict['boundary'].decode('ascii') # cf. RFC 2046 + ctype = "multipart/form-data; boundary={}".format(boundary) + headers = Message() + headers.set_type(ctype) + headers['Content-Length'] = pdict['CONTENT-LENGTH'] + fs = FieldStorage(fp, headers=headers, encoding=encoding, + environ={'REQUEST_METHOD': 'POST'}) + return {k: fs.getlist(k) for k in fs} def _parseparam(s): while s[:1] == ';': diff --git a/Lib/test/test_cgi.py b/Lib/test/test_cgi.py index 637322137d639f..903d0731f977fd 100644 --- a/Lib/test/test_cgi.py +++ b/Lib/test/test_cgi.py @@ -126,8 +126,8 @@ def test_parse_multipart(self): env = {'boundary': BOUNDARY.encode('latin1'), 'CONTENT-LENGTH': '558'} result = cgi.parse_multipart(fp, env) - expected = {'submit': [b' Add '], 'id': [b'1234'], - 'file': [b'Testing 123.\n'], 'title': [b'']} + expected = {'submit': [' Add '], 'id': ['1234'], + 'file': [b'Testing 123.\n'], 'title': ['']} self.assertEqual(result, expected) def test_fieldstorage_properties(self): From 77585f7b488d23aeb17f2ce65ee8278540ae9ff5 Mon Sep 17 00:00:00 2001 From: Pierre Quentel Date: Tue, 4 Apr 2017 16:09:53 +0200 Subject: [PATCH 2/5] Update whatsnew and NEWS --- Doc/whatsnew/3.7.rst | 8 ++++++++ Misc/NEWS | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 12f65ff9a59423..1219567ae81f05 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -95,6 +95,14 @@ New Modules Improved Modules ================ +cgi +--- + +:func:`~cgi.parse_multipart` returns the same results as +:class:`~FieldStorage` : for non-file fields, the value associated to a key +is a list of strings, not bytes. +(Contributed by Pierre Quentel in :issue:`29979`.) + http.server ----------- diff --git a/Misc/NEWS b/Misc/NEWS index 95092afa0b38f2..310786cb5ef784 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -303,6 +303,10 @@ Extension Modules Library ------- +- bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and + making its results consistent with those of FieldStorage for + multipart/form-data requests. Patch by Pierre Quentel. + - bpo-29649: Improve struct.pack_into() exception messages for problems with the buffer size and offset. Patch by Andrew Nester. From 7b398f1c679c0de91839d9c3a905a8a70bd2333a Mon Sep 17 00:00:00 2001 From: Pierre Quentel Date: Fri, 14 Apr 2017 13:52:37 +0200 Subject: [PATCH 3/5] Remove mention of FieldStorage in docstring + more explicit comment on boundary encoding --- Lib/cgi.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/Lib/cgi.py b/Lib/cgi.py index 0655336c861d99..d3334a433bc735 100755 --- a/Lib/cgi.py +++ b/Lib/cgi.py @@ -209,14 +209,10 @@ def parse_multipart(fp, pdict, encoding="utf-8"): Returns a dictionary just like parse_qs(): keys are the field names, each value is a list of values for that field. For non-file fields, the value is a list of strings. - - This is easy to use but not much good if you are expecting megabytes to be - uploaded -- in that case, use the FieldStorage class instead which is much - more flexible. Note that content-type is the raw, unparsed contents of - the content-type header. - """ - boundary = pdict['boundary'].decode('ascii') # cf. RFC 2046 + # RFC 2026, Section 5.1 : The "multipart" boundary delimiters are always + # represented as 7bit US-ASCII. + boundary = pdict['boundary'].decode('ascii') ctype = "multipart/form-data; boundary={}".format(boundary) headers = Message() headers.set_type(ctype) From e9d4f3e6695101ffc810386519d23adce2c1291f Mon Sep 17 00:00:00 2001 From: Pierre Quentel Date: Wed, 3 May 2017 15:58:49 +0200 Subject: [PATCH 4/5] Update cgi.rst Remove trailing whitespaces --- Doc/library/cgi.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/cgi.rst b/Doc/library/cgi.rst index 93363d353922ae..b60e1cc41e4be0 100644 --- a/Doc/library/cgi.rst +++ b/Doc/library/cgi.rst @@ -304,12 +304,12 @@ algorithms implemented in this module in other circumstances. Returns a dictionary just like :func:`urllib.parse.parse_qs`: keys are the field names, each value is a list of values for that field. For non-file fields, the value is a list of strings. - + This is easy to use but not much good if you are expecting megabytes to be uploaded --- in that case, use the :class:`FieldStorage` class instead which is much more flexible. - + .. function:: parse_header(string) Parse a MIME header (such as :mailheader:`Content-Type`) into a main value and a From c6818e52d95ce0a2b089a6e2bd3f22b97c7b266a Mon Sep 17 00:00:00 2001 From: Pierre Quentel Date: Wed, 3 May 2017 15:59:44 +0200 Subject: [PATCH 5/5] Update 3.7.rst Remove trailing whitespace --- Doc/whatsnew/3.7.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 2e7304a4250e3a..b2dc995dced122 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -98,10 +98,11 @@ Improved Modules cgi --- -:func:`~cgi.parse_multipart` returns the same results as +:func:`~cgi.parse_multipart` returns the same results as :class:`~FieldStorage` : for non-file fields, the value associated to a key is a list of strings, not bytes. (Contributed by Pierre Quentel in :issue:`29979`.) + binascii --------