@@ -114,6 +114,7 @@ def __init__(self, pattern, data=None):
114114 data = []
115115 self .data = data
116116 self .width = None
117+
117118 def dump (self , level = 0 ):
118119 nl = True
119120 seqtypes = (tuple , list )
@@ -404,6 +405,15 @@ def _escape(source, escape, state):
404405 pass
405406 raise source .error ("bad escape %s" % escape , len (escape ))
406407
408+ def _uniq (items ):
409+ if len (set (items )) == len (items ):
410+ return items
411+ newitems = []
412+ for item in items :
413+ if item not in newitems :
414+ newitems .append (item )
415+ return newitems
416+
407417def _parse_sub (source , state , verbose , nested = True ):
408418 # parse an alternation: a|b|c
409419
@@ -420,7 +430,6 @@ def _parse_sub(source, state, verbose, nested=True):
420430 return items [0 ]
421431
422432 subpattern = SubPattern (state )
423- subpatternappend = subpattern .append
424433
425434 # check if all items share a common prefix
426435 while True :
@@ -437,35 +446,31 @@ def _parse_sub(source, state, verbose, nested=True):
437446 # move it out of the branch
438447 for item in items :
439448 del item [0 ]
440- subpatternappend (prefix )
449+ subpattern . append (prefix )
441450 continue # check next one
442451 break
443452
444453 # check if the branch can be replaced by a character set
454+ set = []
445455 for item in items :
446- if len (item ) != 1 or item [0 ][0 ] is not LITERAL :
456+ if len (item ) != 1 :
457+ break
458+ op , av = item [0 ]
459+ if op is LITERAL :
460+ set .append ((op , av ))
461+ elif op is IN and av [0 ][0 ] is not NEGATE :
462+ set .extend (av )
463+ else :
447464 break
448465 else :
449466 # we can store this as a character set instead of a
450467 # branch (the compiler may optimize this even more)
451- subpatternappend ((IN , [ item [ 0 ] for item in items ] ))
468+ subpattern . append ((IN , _uniq ( set ) ))
452469 return subpattern
453470
454471 subpattern .append ((BRANCH , (None , items )))
455472 return subpattern
456473
def _parse_sub_cond(source, state, condgroup, verbose):
    """Parse the branches of a conditional backreference group.

    Parses one pattern for the "yes" branch and, if a "|" follows, a
    second pattern for the "no" branch; without a "|" the "no" branch is
    None.  A further "|" after the second branch raises a source error.

    Returns a new SubPattern holding a single GROUPREF_EXISTS item whose
    argument is the tuple (condgroup, yes-branch, no-branch).
    """
    yes_branch = _parse(source, state, verbose)
    no_branch = None
    if source.match("|"):
        no_branch = _parse(source, state, verbose)
        # only "yes|no" is allowed; a third alternative is an error
        if source.next == "|":
            raise source.error("conditional backref with more than two branches")
    result = SubPattern(state)
    result.append((GROUPREF_EXISTS, (condgroup, yes_branch, no_branch)))
    return result
468-
469474def _parse (source , state , verbose , first = False ):
470475 # parse a simple pattern
471476 subpattern = SubPattern (state )
@@ -511,16 +516,14 @@ def _parse(source, state, verbose, first=False):
511516 setappend = set .append
512517## if sourcematch(":"):
513518## pass # handle character classes
514- if sourcematch ("^" ):
515- setappend ((NEGATE , None ))
519+ negate = sourcematch ("^" )
516520 # check remaining characters
517- start = set [:]
518521 while True :
519522 this = sourceget ()
520523 if this is None :
521524 raise source .error ("unterminated character set" ,
522525 source .tell () - here )
523- if this == "]" and set != start :
526+ if this == "]" and set :
524527 break
525528 elif this [0 ] == "\\ " :
526529 code1 = _class_escape (source , this )
@@ -556,13 +559,19 @@ def _parse(source, state, verbose, first=False):
556559 code1 = code1 [1 ][0 ]
557560 setappend (code1 )
558561
562+ set = _uniq (set )
559563 # XXX: <fl> should move set optimization to compiler!
560- if _len (set )== 1 and set [0 ][0 ] is LITERAL :
561- subpatternappend (set [0 ]) # optimization
562- elif _len (set )== 2 and set [0 ][0 ] is NEGATE and set [1 ][0 ] is LITERAL :
563- subpatternappend ((NOT_LITERAL , set [1 ][1 ])) # optimization
564+ if _len (set ) == 1 and set [0 ][0 ] is LITERAL :
565+ # optimization
566+ if negate :
567+ subpatternappend ((NOT_LITERAL , set [0 ][1 ]))
568+ else :
569+ subpatternappend (set [0 ])
564570 else :
565- # XXX: <fl> should add charmap optimization here
571+ if negate :
572+ set .insert (0 , (NEGATE , None ))
573+ # charmap optimization can't be added here because
574+ # global flags still are not known
566575 subpatternappend ((IN , set ))
567576
568577 elif this in REPEAT_CHARS :
@@ -579,6 +588,7 @@ def _parse(source, state, verbose, first=False):
579588 if source .next == "}" :
580589 subpatternappend ((LITERAL , _ord (this )))
581590 continue
591+
582592 min , max = 0 , MAXREPEAT
583593 lo = hi = ""
584594 while source .next in DIGITS :
@@ -592,6 +602,7 @@ def _parse(source, state, verbose, first=False):
592602 subpatternappend ((LITERAL , _ord (this )))
593603 source .seek (here )
594604 continue
605+
595606 if lo :
596607 min = int (lo )
597608 if min >= MAXREPEAT :
@@ -610,12 +621,16 @@ def _parse(source, state, verbose, first=False):
610621 item = subpattern [- 1 :]
611622 else :
612623 item = None
613- if not item or ( _len ( item ) == 1 and item [0 ][0 ] is AT ) :
624+ if not item or item [0 ][0 ] is AT :
614625 raise source .error ("nothing to repeat" ,
615626 source .tell () - here + len (this ))
616627 if item [0 ][0 ] in _REPEATCODES :
617628 raise source .error ("multiple repeat" ,
618629 source .tell () - here + len (this ))
630+ if item [0 ][0 ] is SUBPATTERN :
631+ group , add_flags , del_flags , p = item [0 ][1 ]
632+ if group is None and not add_flags and not del_flags :
633+ item = p
619634 if sourcematch ("?" ):
620635 subpattern [- 1 ] = (MIN_REPEAT , (min , max , item ))
621636 else :
@@ -628,7 +643,6 @@ def _parse(source, state, verbose, first=False):
628643 start = source .tell () - 1
629644 group = True
630645 name = None
631- condgroup = None
632646 add_flags = 0
633647 del_flags = 0
634648 if sourcematch ("?" ):
@@ -660,6 +674,7 @@ def _parse(source, state, verbose, first=False):
660674 state .checklookbehindgroup (gid , source )
661675 subpatternappend ((GROUPREF , gid ))
662676 continue
677+
663678 else :
664679 char = sourceget ()
665680 if char is None :
@@ -678,6 +693,7 @@ def _parse(source, state, verbose, first=False):
678693 if sourceget () == ")" :
679694 break
680695 continue
696+
681697 elif char in "=!<" :
682698 # lookahead assertions
683699 dir = 1
@@ -704,10 +720,10 @@ def _parse(source, state, verbose, first=False):
704720 else :
705721 subpatternappend ((ASSERT_NOT , (dir , p )))
706722 continue
723+
707724 elif char == "(" :
708725 # conditional backreference group
709726 condname = source .getuntil (")" )
710- group = None
711727 if condname .isidentifier ():
712728 condgroup = state .groupdict .get (condname )
713729 if condgroup is None :
@@ -728,6 +744,19 @@ def _parse(source, state, verbose, first=False):
728744 msg = "invalid group reference %d" % condgroup
729745 raise source .error (msg , len (condname ) + 1 )
730746 state .checklookbehindgroup (condgroup , source )
747+ item_yes = _parse (source , state , verbose )
748+ if source .match ("|" ):
749+ item_no = _parse (source , state , verbose )
750+ if source .next == "|" :
751+ raise source .error ("conditional backref with more than two branches" )
752+ else :
753+ item_no = None
754+ if not source .match (")" ):
755+ raise source .error ("missing ), unterminated subpattern" ,
756+ source .tell () - start )
757+ subpatternappend ((GROUPREF_EXISTS , (condgroup , item_yes , item_no )))
758+ continue
759+
731760 elif char in FLAGS or char == "-" :
732761 # flags
733762 flags = _parse_flags (source , state , char )
@@ -744,6 +773,7 @@ def _parse(source, state, verbose, first=False):
744773 if (state .flags & SRE_FLAG_VERBOSE ) and not verbose :
745774 raise Verbose
746775 continue
776+
747777 add_flags , del_flags = flags
748778 group = None
749779 else :
@@ -756,12 +786,9 @@ def _parse(source, state, verbose, first=False):
756786 group = state .opengroup (name )
757787 except error as err :
758788 raise source .error (err .msg , len (name ) + 1 ) from None
759- if condgroup :
760- p = _parse_sub_cond (source , state , condgroup , verbose )
761- else :
762- sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE )) and
763- not (del_flags & SRE_FLAG_VERBOSE ))
764- p = _parse_sub (source , state , sub_verbose )
789+ sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE )) and
790+ not (del_flags & SRE_FLAG_VERBOSE ))
791+ p = _parse_sub (source , state , sub_verbose )
765792 if not source .match (")" ):
766793 raise source .error ("missing ), unterminated subpattern" ,
767794 source .tell () - start )
@@ -773,11 +800,19 @@ def _parse(source, state, verbose, first=False):
773800 subpatternappend ((AT , AT_BEGINNING ))
774801
775802 elif this == "$" :
776- subpattern . append ((AT , AT_END ))
803+ subpatternappend ((AT , AT_END ))
777804
778805 else :
779806 raise AssertionError ("unsupported special character %r" % (char ,))
780807
808+ # unpack non-capturing groups
809+ for i in range (len (subpattern ))[::- 1 ]:
810+ op , av = subpattern [i ]
811+ if op is SUBPATTERN :
812+ group , add_flags , del_flags , p = av
813+ if group is None and not add_flags and not del_flags :
814+ subpattern [i : i + 1 ] = p
815+
781816 return subpattern
782817
783818def _parse_flags (source , state , char ):
0 commit comments