From d16c288ef1e0bf0deece631ad76ddc85c8d22e4b Mon Sep 17 00:00:00 2001 From: Windel Bouwman Date: Wed, 10 Jul 2019 11:19:01 +0200 Subject: [PATCH] Improve re module. Favor string.py over string.rs --- Lib/string.py | 305 ++++++++++++++++++++++++++++++++++++++ tests/snippets/test_re.py | 18 +++ vm/src/stdlib/mod.rs | 2 +- vm/src/stdlib/re.rs | 206 ++++++++++++++++++++----- vm/src/stdlib/string.rs | 26 +--- 5 files changed, 493 insertions(+), 64 deletions(-) create mode 100644 Lib/string.py diff --git a/Lib/string.py b/Lib/string.py new file mode 100644 index 00000000000..a7261a91502 --- /dev/null +++ b/Lib/string.py @@ -0,0 +1,305 @@ +"""A collection of string constants. + +Public module variables: + +whitespace -- a string containing all ASCII whitespace +ascii_lowercase -- a string containing all ASCII lowercase letters +ascii_uppercase -- a string containing all ASCII uppercase letters +ascii_letters -- a string containing all ASCII letters +digits -- a string containing all ASCII decimal digits +hexdigits -- a string containing all ASCII hexadecimal digits +octdigits -- a string containing all ASCII octal digits +punctuation -- a string containing all ASCII punctuation characters +printable -- a string containing all ASCII characters considered printable + +""" + +__all__ = ["ascii_letters", "ascii_lowercase", "ascii_uppercase", "capwords", + "digits", "hexdigits", "octdigits", "printable", "punctuation", + "whitespace", "Formatter", "Template"] + +import _string + +# Some strings for ctype-style character classification +whitespace = ' \t\n\r\v\f' +ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz' +ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +ascii_letters = ascii_lowercase + ascii_uppercase +digits = '0123456789' +hexdigits = digits + 'abcdef' + 'ABCDEF' +octdigits = '01234567' +punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""" +printable = digits + ascii_letters + punctuation + whitespace + +# Functions which aren't available as string methods. + +# Capitalize the words in a string, e.g. " aBc dEf " -> "Abc Def". +def capwords(s, sep=None): + """capwords(s [,sep]) -> string + + Split the argument into words using split, capitalize each + word using capitalize, and join the capitalized words using + join. If the optional second argument sep is absent or None, + runs of whitespace characters are replaced by a single space + and leading and trailing whitespace are removed, otherwise + sep is used to split and join the words. + + """ + return (sep or ' ').join(x.capitalize() for x in s.split(sep)) + + +#################################################################### +import re as _re +from collections import ChainMap as _ChainMap + +class _TemplateMetaclass(type): + pattern = r""" + %(delim)s(?: + (?P%(delim)s) | # Escape sequence of two delimiters + (?P%(id)s) | # delimiter and a Python identifier + \{(?P%(bid)s)\} #| # delimiter and a braced identifier + # (?P) # Other ill-formed delimiter exprs + ) + """ + + def __init__(cls, name, bases, dct): + super(_TemplateMetaclass, cls).__init__(name, bases, dct) + if 'pattern' in dct: + pattern = cls.pattern + else: + pattern = _TemplateMetaclass.pattern % { + 'delim' : _re.escape(cls.delimiter), + 'id' : cls.idpattern, + 'bid' : cls.braceidpattern or cls.idpattern, + } + cls.pattern = _re.compile(pattern, cls.flags | _re.VERBOSE) + + +class Template(metaclass=_TemplateMetaclass): + """A string class for supporting $-substitutions.""" + + delimiter = '$' + # r'[a-z]' matches to non-ASCII letters when used with IGNORECASE, but + # without the ASCII flag. We can't add re.ASCII to flags because of + # backward compatibility. So we use the ?a local flag and [a-z] pattern. + # See https://bugs.python.org/issue31672 + idpattern = r'([_a-z][_a-z0-9]*)' + braceidpattern = None + flags = _re.IGNORECASE + + def __init__(self, template): + self.template = template + + # Search for $$, $identifier, ${identifier}, and any bare $'s + + def _invalid(self, mo): + i = mo.start('invalid') + lines = self.template[:i].splitlines(keepends=True) + if not lines: + colno = 1 + lineno = 1 + else: + colno = i - len(''.join(lines[:-1])) + lineno = len(lines) + raise ValueError('Invalid placeholder in string: line %d, col %d' % + (lineno, colno)) + + def substitute(*args, **kws): + if not args: + raise TypeError("descriptor 'substitute' of 'Template' object " + "needs an argument") + self, *args = args # allow the "self" keyword be passed + if len(args) > 1: + raise TypeError('Too many positional arguments') + if not args: + mapping = kws + elif kws: + mapping = _ChainMap(kws, args[0]) + else: + mapping = args[0] + # Helper function for .sub() + def convert(mo): + # Check the most common path first. + named = mo.group('named') or mo.group('braced') + if named is not None: + return str(mapping[named]) + if mo.group('escaped') is not None: + return self.delimiter + if mo.group('invalid') is not None: + self._invalid(mo) + raise ValueError('Unrecognized named group in pattern', + self.pattern) + return self.pattern.sub(convert, self.template) + + def safe_substitute(*args, **kws): + if not args: + raise TypeError("descriptor 'safe_substitute' of 'Template' object " + "needs an argument") + self, *args = args # allow the "self" keyword be passed + if len(args) > 1: + raise TypeError('Too many positional arguments') + if not args: + mapping = kws + elif kws: + mapping = _ChainMap(kws, args[0]) + else: + mapping = args[0] + # Helper function for .sub() + def convert(mo): + named = mo.group('named') or mo.group('braced') + if named is not None: + try: + return str(mapping[named]) + except KeyError: + return mo.group() + if mo.group('escaped') is not None: + return self.delimiter + if mo.group('invalid') is not None: + return mo.group() + raise ValueError('Unrecognized named group in pattern', + self.pattern) + return self.pattern.sub(convert, self.template) + + + +######################################################################## +# the Formatter class +# see PEP 3101 for details and purpose of this class + +# The hard parts are reused from the C implementation. They're exposed as "_" +# prefixed methods of str. + +# The overall parser is implemented in _string.formatter_parser. +# The field name parser is implemented in _string.formatter_field_name_split + +class Formatter: + def format(*args, **kwargs): + if not args: + raise TypeError("descriptor 'format' of 'Formatter' object " + "needs an argument") + self, *args = args # allow the "self" keyword be passed + try: + format_string, *args = args # allow the "format_string" keyword be passed + except ValueError: + raise TypeError("format() missing 1 required positional " + "argument: 'format_string'") from None + return self.vformat(format_string, args, kwargs) + + def vformat(self, format_string, args, kwargs): + used_args = set() + result, _ = self._vformat(format_string, args, kwargs, used_args, 2) + self.check_unused_args(used_args, args, kwargs) + return result + + def _vformat(self, format_string, args, kwargs, used_args, recursion_depth, + auto_arg_index=0): + if recursion_depth < 0: + raise ValueError('Max string recursion exceeded') + result = [] + for literal_text, field_name, format_spec, conversion in \ + self.parse(format_string): + + # output the literal text + if literal_text: + result.append(literal_text) + + # if there's a field, output it + if field_name is not None: + # this is some markup, find the object and do + # the formatting + + # handle arg indexing when empty field_names are given. + if field_name == '': + if auto_arg_index is False: + raise ValueError('cannot switch from manual field ' + 'specification to automatic field ' + 'numbering') + field_name = str(auto_arg_index) + auto_arg_index += 1 + elif field_name.isdigit(): + if auto_arg_index: + raise ValueError('cannot switch from manual field ' + 'specification to automatic field ' + 'numbering') + # disable auto arg incrementing, if it gets + # used later on, then an exception will be raised + auto_arg_index = False + + # given the field_name, find the object it references + # and the argument it came from + obj, arg_used = self.get_field(field_name, args, kwargs) + used_args.add(arg_used) + + # do any conversion on the resulting object + obj = self.convert_field(obj, conversion) + + # expand the format spec, if needed + format_spec, auto_arg_index = self._vformat( + format_spec, args, kwargs, + used_args, recursion_depth-1, + auto_arg_index=auto_arg_index) + + # format the object and append to the result + result.append(self.format_field(obj, format_spec)) + + return ''.join(result), auto_arg_index + + + def get_value(self, key, args, kwargs): + if isinstance(key, int): + return args[key] + else: + return kwargs[key] + + + def check_unused_args(self, used_args, args, kwargs): + pass + + + def format_field(self, value, format_spec): + return format(value, format_spec) + + + def convert_field(self, value, conversion): + # do any conversion on the resulting object + if conversion is None: + return value + elif conversion == 's': + return str(value) + elif conversion == 'r': + return repr(value) + elif conversion == 'a': + return ascii(value) + raise ValueError("Unknown conversion specifier {0!s}".format(conversion)) + + + # returns an iterable that contains tuples of the form: + # (literal_text, field_name, format_spec, conversion) + # literal_text can be zero length + # field_name can be None, in which case there's no + # object to format and output + # if field_name is not None, it is looked up, formatted + # with format_spec and conversion and then used + def parse(self, format_string): + return _string.formatter_parser(format_string) + + + # given a field_name, find the object it references. + # field_name: the field being looked up, e.g. "0.name" + # or "lookup[3]" + # used_args: a set of which args have been used + # args, kwargs: as passed in to vformat + def get_field(self, field_name, args, kwargs): + first, rest = _string.formatter_field_name_split(field_name) + + obj = self.get_value(first, args, kwargs) + + # loop through the rest of the field_name, doing + # getattr or getitem as needed + for is_attr, i in rest: + if is_attr: + obj = getattr(obj, i) + else: + obj = obj[i] + + return obj, first diff --git a/tests/snippets/test_re.py b/tests/snippets/test_re.py index 2bc7308e6a2..16cb685a1f0 100644 --- a/tests/snippets/test_re.py +++ b/tests/snippets/test_re.py @@ -13,3 +13,21 @@ assert mo.end() == 5 assert re.escape('python.exe') == 'python\\.exe' + +p = re.compile('ab') +s = p.sub('x', 'abcabca') +print(s) +assert s == 'xcxca' + +idpattern = r'([_a-z][_a-z0-9]*)' + +mo = re.search(idpattern, '7382 _boe0+2') +print(mo) +# TODO: +# assert mo.group(0) == '_boe0' + +from string import Template +s = Template('$who likes $what') +# TODO: +# r = s.substitute(who='tim', what='kung pow') +# print(r) diff --git a/vm/src/stdlib/mod.rs b/vm/src/stdlib/mod.rs index 09249a37572..a50bb5c2f72 100644 --- a/vm/src/stdlib/mod.rs +++ b/vm/src/stdlib/mod.rs @@ -53,7 +53,7 @@ pub fn get_module_inits() -> HashMap { "platform".to_string() => Box::new(platform::make_module), "re".to_string() => Box::new(re::make_module), "random".to_string() => Box::new(random::make_module), - "string".to_string() => Box::new(string::make_module), + "_string".to_string() => Box::new(string::make_module), "struct".to_string() => Box::new(pystruct::make_module), "_thread".to_string() => Box::new(thread::make_module), "time".to_string() => Box::new(time_module::make_module), diff --git a/vm/src/stdlib/re.rs b/vm/src/stdlib/re.rs index f427389802a..8ea2ad4dd12 100644 --- a/vm/src/stdlib/re.rs +++ b/vm/src/stdlib/re.rs @@ -4,24 +4,74 @@ * This module fits the python re interface onto the rust regular expression * system. */ -use regex::{Match, Regex}; +use regex::{Match, Regex, RegexBuilder}; +use std::fmt; + +use crate::function::{Args, OptionalArg}; +use crate::obj::objint::PyIntRef; use crate::obj::objstr::PyStringRef; use crate::obj::objtype::PyClassRef; -use crate::pyobject::{PyObjectRef, PyRef, PyResult, PyValue}; +use crate::pyobject::{PyClassImpl, PyObjectRef, PyResult, PyValue}; use crate::vm::VirtualMachine; +use num_traits::ToPrimitive; + +// #[derive(Debug)] +#[pyclass(name = "Pattern")] +struct PyPattern { + regex: Regex, + pattern: String, +} -impl PyValue for Regex { +impl fmt::Debug for PyPattern { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Pattern()") + } +} + +const IGNORECASE: usize = 2; +const MULTILINE: usize = 8; +const DOTALL: usize = 16; +const VERBOSE: usize = 64; + +#[derive(Default)] +struct PyRegexFlags { + ignorecase: bool, + multiline: bool, + dotall: bool, + verbose: bool, +} + +impl PyRegexFlags { + fn from_int(bits: usize) -> Self { + // TODO: detect unknown flag bits. + PyRegexFlags { + ignorecase: (bits & IGNORECASE) != 0, + multiline: (bits & MULTILINE) != 0, + dotall: (bits & DOTALL) != 0, + verbose: (bits & VERBOSE) != 0, + } + } +} + +impl PyValue for PyPattern { fn class(vm: &VirtualMachine) -> PyClassRef { vm.class("re", "Pattern") } } /// Inner data for a match object. -#[derive(Debug)] +#[pyclass(name = "Match")] struct PyMatch { start: usize, end: usize, + // m: Match<'t>, +} + +impl fmt::Debug for PyMatch { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Match()") + } } impl PyValue for PyMatch { @@ -30,36 +80,55 @@ impl PyValue for PyMatch { } } -type PyRegexRef = PyRef; -type PyMatchRef = PyRef; - -fn re_match(pattern: PyStringRef, string: PyStringRef, vm: &VirtualMachine) -> PyResult { - let regex = make_regex(vm, &pattern.value)?; +// type PyPatternRef = PyRef; +// type PyMatchRef = PyRef; + +fn re_match( + pattern: PyStringRef, + string: PyStringRef, + flags: OptionalArg, + vm: &VirtualMachine, +) -> PyResult { + let flags = extract_flags(flags); + let regex = make_regex(vm, &pattern.value, flags)?; do_match(vm, ®ex, &string.value) } -fn re_search(pattern: PyStringRef, string: PyStringRef, vm: &VirtualMachine) -> PyResult { - let regex = make_regex(vm, &pattern.value)?; +fn re_search( + pattern: PyStringRef, + string: PyStringRef, + flags: OptionalArg, + vm: &VirtualMachine, +) -> PyResult { + let flags = extract_flags(flags); + let regex = make_regex(vm, &pattern.value, flags)?; do_search(vm, ®ex, &string.value) } -fn do_match(vm: &VirtualMachine, regex: &Regex, search_text: &str) -> PyResult { +fn do_match(vm: &VirtualMachine, regex: &PyPattern, search_text: &str) -> PyResult { // TODO: implement match! do_search(vm, regex, search_text) } -fn do_search(vm: &VirtualMachine, regex: &Regex, search_text: &str) -> PyResult { - match regex.find(search_text) { +fn do_search(vm: &VirtualMachine, regex: &PyPattern, search_text: &str) -> PyResult { + match regex.regex.find(search_text) { None => Ok(vm.get_none()), Some(result) => create_match(vm, &result), } } -fn make_regex(vm: &VirtualMachine, pattern: &str) -> PyResult { - match Regex::new(pattern) { - Ok(regex) => Ok(regex), - Err(err) => Err(vm.new_value_error(format!("Error in regex: {:?}", err))), - } +fn make_regex(vm: &VirtualMachine, pattern: &str, flags: PyRegexFlags) -> PyResult { + let r = RegexBuilder::new(pattern) + .case_insensitive(flags.ignorecase) + .multi_line(flags.multiline) + .dot_matches_new_line(flags.dotall) + .ignore_whitespace(flags.verbose) + .build() + .map_err(|err| vm.new_value_error(format!("Error in regex: {:?}", err)))?; + Ok(PyPattern { + regex: r, + pattern: pattern.to_string(), + }) } /// Take a found regular expression and convert it to proper match object. @@ -75,52 +144,111 @@ fn create_match(vm: &VirtualMachine, match_value: &Match) -> PyResult { .into_object()) } -fn re_compile(pattern: PyStringRef, vm: &VirtualMachine) -> PyResult { - make_regex(vm, &pattern.value) +fn extract_flags(flags: OptionalArg) -> PyRegexFlags { + match flags { + OptionalArg::Present(flags) => { + PyRegexFlags::from_int(flags.as_bigint().to_usize().unwrap()) + } + OptionalArg::Missing => Default::default(), + } +} + +fn re_compile( + pattern: PyStringRef, + flags: OptionalArg, + vm: &VirtualMachine, +) -> PyResult { + let flags = extract_flags(flags); + make_regex(vm, &pattern.value, flags) } fn re_escape(pattern: PyStringRef, _vm: &VirtualMachine) -> String { regex::escape(&pattern.value) } -impl PyRegexRef { - fn match_(self, text: PyStringRef, vm: &VirtualMachine) -> PyResult { - do_match(vm, &self, &text.value) +fn re_purge(_vm: &VirtualMachine) {} + +#[pyimpl] +impl PyPattern { + #[pymethod(name = "match")] + fn match_(&self, text: PyStringRef, vm: &VirtualMachine) -> PyResult { + do_match(vm, self, &text.value) } - fn search(self, text: PyStringRef, vm: &VirtualMachine) -> PyResult { - do_search(vm, &self, &text.value) + + #[pymethod(name = "search")] + fn search(&self, text: PyStringRef, vm: &VirtualMachine) -> PyResult { + do_search(vm, self, &text.value) + } + + #[pymethod(name = "sub")] + fn sub(&self, repl: PyStringRef, text: PyStringRef, vm: &VirtualMachine) -> PyResult { + // let replacer: &Replacer = ; + + let replaced_text: String = self + .regex + .replace_all(&text.value, { repl.value.as_str() }) + .into_owned(); + Ok(vm.ctx.new_str(replaced_text)) + } + + #[pymethod(name = "subn")] + fn subn(&self, repl: PyStringRef, text: PyStringRef, vm: &VirtualMachine) -> PyResult { + self.sub(repl, text, vm) + } + + #[pyproperty(name = "pattern")] + fn pattern(&self, vm: &VirtualMachine) -> PyResult { + Ok(vm.ctx.new_str(self.pattern.clone())) } } -impl PyMatchRef { - fn start(self, _vm: &VirtualMachine) -> usize { +#[pyimpl] +impl PyMatch { + #[pymethod(name = "start")] + fn start(&self, _group: OptionalArg, _vm: &VirtualMachine) -> usize { self.start } - fn end(self, _vm: &VirtualMachine) -> usize { + + #[pymethod(name = "end")] + fn end(&self, _group: OptionalArg, _vm: &VirtualMachine) -> usize { self.end } + + #[pymethod(name = "group")] + fn group(&self, _groups: Args, _vm: &VirtualMachine) -> usize { + /* + let groups = groups.into_iter().collect(); + if groups.len() == 1 { + } else { + } + */ + // println!("{:?}", groups); + self.start + } } /// Create the python `re` module with all its members. pub fn make_module(vm: &VirtualMachine) -> PyObjectRef { let ctx = &vm.ctx; - let match_type = py_class!(ctx, "Match", ctx.object(), { - "start" => ctx.new_rustfunc(PyMatchRef::start), - "end" => ctx.new_rustfunc(PyMatchRef::end) - }); - - let pattern_type = py_class!(ctx, "Pattern", ctx.object(), { - "match" => ctx.new_rustfunc(PyRegexRef::match_), - "search" => ctx.new_rustfunc(PyRegexRef::search) - }); + let match_type = PyMatch::make_class(ctx); + let pattern_type = PyPattern::make_class(ctx); py_module!(vm, "re", { "compile" => ctx.new_rustfunc(re_compile), "escape" => ctx.new_rustfunc(re_escape), + "purge" => ctx.new_rustfunc(re_purge), "Match" => match_type, "match" => ctx.new_rustfunc(re_match), "Pattern" => pattern_type, - "search" => ctx.new_rustfunc(re_search) + "search" => ctx.new_rustfunc(re_search), + "IGNORECASE" => ctx.new_int(IGNORECASE), + "I" => ctx.new_int(IGNORECASE), + "MULTILINE" => ctx.new_int(MULTILINE), + "M" => ctx.new_int(MULTILINE), + "VERBOSE" => ctx.new_int(VERBOSE), + "X" => ctx.new_int(VERBOSE), + "DOTALL" => ctx.new_int(DOTALL), + "S" => ctx.new_int(DOTALL), }) } diff --git a/vm/src/stdlib/string.rs b/vm/src/stdlib/string.rs index c255ffc9356..89104a1c560 100644 --- a/vm/src/stdlib/string.rs +++ b/vm/src/stdlib/string.rs @@ -7,30 +7,8 @@ use crate::pyobject::PyObjectRef; use crate::vm::VirtualMachine; pub fn make_module(vm: &VirtualMachine) -> PyObjectRef { - let ctx = &vm.ctx; - - let ascii_lowercase = "abcdefghijklmnopqrstuvwxyz".to_string(); - let ascii_uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".to_string(); - let ascii_letters = format!("{}{}", ascii_lowercase, ascii_uppercase); - let digits = "0123456789".to_string(); - let hexdigits = "0123456789abcdefABCDEF".to_string(); - let octdigits = "01234567".to_string(); - let punctuation = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".to_string(); - /* FIXME - let whitespace = " \t\n\r\x0b\x0c".to_string(); - let printable = format!("{}{}{}{}", digits, ascii_letters, punctuation, whitespace); - */ + // let ctx = &vm.ctx; // Constants: - py_module!(vm, "string", { - "ascii_letters" => ctx.new_str(ascii_letters), - "ascii_lowercase" => ctx.new_str(ascii_lowercase), - "ascii_uppercase" => ctx.new_str(ascii_uppercase), - "digits" => ctx.new_str(digits), - "hexdigits" => ctx.new_str(hexdigits), - "octdigits" => ctx.new_str(octdigits), - // "printable", ctx.new_str(printable) - "punctuation" => ctx.new_str(punctuation) - // "whitespace", ctx.new_str(whitespace) - }) + py_module!(vm, "_string", {}) }