@@ -21,10 +21,10 @@ class BufferedIOBase(object):
2121 pass
2222
2323#Non-unicode versions of constants for use in the pre-parser
# Every element is a length-1 bytes object, so it can be compared directly
# with the one-byte slices the pre-parser takes from its input buffer.
spaceCharactersBytes = frozenset(
    item.encode("ascii") for item in spaceCharacters)
asciiLettersBytes = frozenset(
    item.encode("ascii") for item in asciiLetters)
asciiUppercaseBytes = frozenset(
    item.encode("ascii") for item in asciiUppercase)
# Bytes at which skipUntil() stops when scanning past a tag name.
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
2828
# Matches one code point from the set this module treats as invalid:
# U+0001-U+0008, U+000B, U+000E-U+001F, U+007F-U+009F, the surrogate range
# U+D800-U+DFFF, U+FDD0-U+FDEF, and the last two code points of every plane.
# The character class must contain no whitespace: a stray space would both
# add U+0020 to the set and turn "\u0001 -\u0008" into an invalid range.
invalid_unicode_re = re.compile(
    "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF"
    "\uFDD0-\uFDEF\uFFFE\uFFFF"
    "\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF"
    "\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF"
    "\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF"
    "\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF"
    "\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF"
    "\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF"
    "\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF"
    "\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
)
3030
@@ -391,12 +391,14 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
391391 parseMeta - Look for a <meta> element containing encoding information
392392
393393 """
394- self .charEncoding = (codecName (encoding ), "certain" )
395-
396394 # Raw Stream - for unicode objects this will encode to utf-8 and set
397395 # self.charEncoding as appropriate
398396 self .rawStream = self .openStream (source )
399397
398+ HTMLUnicodeInputStream .__init__ (self , self .rawStream )
399+
400+ self .charEncoding = (codecName (encoding ), "certain" )
401+
400402 # Encoding Information
401403 #Number of bytes to use when looking for a meta element with
402404 #encoding information
@@ -411,7 +413,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
411413 self .charEncoding = self .detectEncoding (parseMeta , chardet )
412414
413415 #Call superclass
414- HTMLUnicodeInputStream . __init__ ( self , self . rawStream )
416+ self . reset ( )
415417
416418 def reset (self ):
417419 self .dataStream = codecs .getreader (self .charEncoding [0 ])(self .rawStream ,
@@ -538,12 +540,13 @@ def detectEncodingMeta(self):
538540
539541 return encoding
540542
541- class EncodingBytes (str ):
543+ class EncodingBytes (bytes ):
542544 """String-like object with an associated position and various extra methods
543545 If the position is ever greater than the string length then an exception is
544546 raised"""
545547 def __new__ (self , value ):
546- return str .__new__ (self , value .lower ())
548+ assert isinstance (value , bytes )
549+ return bytes .__new__ (self , value .lower ())
547550
548551 def __init__ (self , value ):
549552 self ._position = - 1
@@ -557,7 +560,7 @@ def __next__(self):
557560 raise StopIteration
558561 elif p < 0 :
559562 raise TypeError
560- return self [p ]
563+ return self [p : p + 1 ]
561564
562565 def previous (self ):
563566 p = self ._position
@@ -566,7 +569,7 @@ def previous(self):
566569 elif p < 0 :
567570 raise TypeError
568571 self ._position = p = p - 1
569- return self [p ]
572+ return self [p : p + 1 ]
570573
571574 def setPosition (self , position ):
572575 if self ._position >= len (self ):
@@ -584,15 +587,15 @@ def getPosition(self):
584587 position = property (getPosition , setPosition )
585588
def getCurrentByte(self):
    """Return the byte at the current position as a length-1 bytes object.

    A one-element slice is used instead of indexing because indexing a
    bytes object yields an int, which would never compare equal to the
    single-byte constants (b"<", b"=", ...) used by the pre-parser.
    """
    # Read the property once so its position checks run a single time.
    pos = self.position
    return self[pos:pos + 1]
588591
589592 currentByte = property (getCurrentByte )
590593
591594 def skip (self , chars = spaceCharactersBytes ):
592595 """Skip past a list of characters"""
593596 p = self .position # use property for the error-checking
594597 while p < len (self ):
595- c = self [p ]
598+ c = self [p : p + 1 ]
596599 if c not in chars :
597600 self ._position = p
598601 return c
@@ -603,7 +606,7 @@ def skip(self, chars=spaceCharactersBytes):
603606 def skipUntil (self , chars ):
604607 p = self .position
605608 while p < len (self ):
606- c = self [p ]
609+ c = self [p : p + 1 ]
607610 if c in chars :
608611 self ._position = p
609612 return c
@@ -645,12 +648,12 @@ def __init__(self, data):
645648
646649 def getEncoding (self ):
647650 methodDispatch = (
648- ("<!--" ,self .handleComment ),
649- ("<meta" ,self .handleMeta ),
650- ("</" ,self .handlePossibleEndTag ),
651- ("<!" ,self .handleOther ),
652- ("<?" ,self .handleOther ),
653- ("<" ,self .handlePossibleStartTag ))
651+ (b "<!--" ,self .handleComment ),
652+ (b "<meta" ,self .handleMeta ),
653+ (b "</" ,self .handlePossibleEndTag ),
654+ (b "<!" ,self .handleOther ),
655+ (b "<?" ,self .handleOther ),
656+ (b "<" ,self .handlePossibleStartTag ))
654657 for byte in self .data :
655658 keepParsing = True
656659 for key , method in methodDispatch :
@@ -663,37 +666,48 @@ def getEncoding(self):
663666 break
664667 if not keepParsing :
665668 break
666-
669+
667670 return self .encoding
668671
def handleComment(self):
    """Skip over a comment by jumping to its closing ``-->``.

    Returns whatever ``self.data.jumpTo`` returns for the terminator.
    The terminator is a bytes literal because the data being scanned is
    a bytes object.
    """
    return self.data.jumpTo(b"-->")
672675
def handleMeta(self):
    """Scan the attributes of a ``<meta`` tag for an encoding declaration.

    Returns True when no encoding was settled on (either ``<meta`` was not
    followed by a space character, or the attributes ran out), and False
    once ``self.encoding`` has been set.

    A ``charset`` attribute with a recognised codec is accepted at once.
    An encoding found in a ``content`` attribute is only accepted when the
    same tag also carries ``http-equiv="content-type"``; because the two
    attributes may appear in either order, a ``content`` encoding seen
    first is held in ``pendingEncoding`` until the pragma shows up.
    """
    if self.data.currentByte not in spaceCharactersBytes:
        # "<meta" not followed by a space: not a meta element, keep going.
        return True

    hasPragma = False
    pendingEncoding = None
    while True:
        # Try to find the next attribute after the current position.
        attr = self.getAttribute()
        if attr is None:
            return True
        name, value = attr

        if name == b"http-equiv":
            hasPragma = value == b"content-type"
            if hasPragma and pendingEncoding is not None:
                self.encoding = pendingEncoding
                return False
        elif name == b"charset":
            codec = codecName(value)
            if codec is not None:
                self.encoding = codec
                return False
        elif name == b"content":
            contentParser = ContentAttrParser(EncodingBytes(value))
            tentativeEncoding = contentParser.parse()
            if tentativeEncoding is None:
                continue
            codec = codecName(tentativeEncoding)
            if codec is None:
                continue
            if hasPragma:
                self.encoding = codec
                return False
            # No pragma seen yet: remember the codec in case one follows.
            pendingEncoding = codec
697711
698712 def handlePossibleStartTag (self ):
699713 return self .handlePossibleTag (False )
@@ -714,7 +728,7 @@ def handlePossibleTag(self, endTag):
714728 return True
715729
716730 c = data .skipUntil (spacesAngleBrackets )
717- if c == "<" :
731+ if c == b "<" :
718732 #return to the first step in the overall "two step" algorithm
719733 #reprocessing the < byte
720734 data .previous ()
@@ -726,31 +740,31 @@ def handlePossibleTag(self, endTag):
726740 return True
727741
def handleOther(self):
    """Skip the rest of a ``<!...`` or ``<?...`` construct.

    Jumps to the next ``>`` byte and returns whatever
    ``self.data.jumpTo`` returns.
    """
    return self.data.jumpTo(b">")
730744
731745 def getAttribute (self ):
732746 """Return a name,value pair for the next attribute in the stream,
733747 if one is found, or None"""
734748 data = self .data
735749 # Step 1 (skip chars)
736- c = data .skip (spaceCharactersBytes | frozenset ("/" ))
750+ c = data .skip (spaceCharactersBytes | frozenset ([b"/" ]))
751+ assert c is None or len (c ) == 1
737752 # Step 2
738- if c in (">" , None ):
753+ if c in (b ">" , None ):
739754 return None
740755 # Step 3
741756 attrName = []
742757 attrValue = []
743758 #Step 4 attribute name
744759 while True :
745- if c == "=" and attrName :
760+ if c == b "=" and attrName :
746761 break
747762 elif c in spaceCharactersBytes :
748763 #Step 6!
749764 c = data .skip ()
750- c = next (data )
751765 break
752- elif c in ("/" , ">" ):
753- return "" .join (attrName ), ""
766+ elif c in (b "/" , b ">" ):
767+ return b "" .join (attrName ), b ""
754768 elif c in asciiUppercaseBytes :
755769 attrName .append (c .lower ())
756770 elif c == None :
@@ -760,15 +774,15 @@ def getAttribute(self):
760774 #Step 5
761775 c = next (data )
762776 #Step 7
763- if c != "=" :
777+ if c != b "=" :
764778 data .previous ()
765- return "" .join (attrName ), ""
779+ return b "" .join (attrName ), b ""
766780 #Step 8
767781 next (data )
768782 #Step 9
769783 c = data .skip ()
770784 #Step 10
771- if c in ("'" , '"' ):
785+ if c in (b "'" , b '"' ):
772786 #10.1
773787 quoteChar = c
774788 while True :
@@ -777,15 +791,15 @@ def getAttribute(self):
777791 #10.3
778792 if c == quoteChar :
779793 next (data )
780- return "" .join (attrName ), "" .join (attrValue )
794+ return b "" .join (attrName ), b "" .join (attrValue )
781795 #10.4
782796 elif c in asciiUppercaseBytes :
783797 attrValue .append (c .lower ())
784798 #10.5
785799 else :
786800 attrValue .append (c )
787- elif c == ">" :
788- return "" .join (attrName ), ""
801+ elif c == b ">" :
802+ return b "" .join (attrName ), b ""
789803 elif c in asciiUppercaseBytes :
790804 attrValue .append (c .lower ())
791805 elif c is None :
@@ -796,7 +810,7 @@ def getAttribute(self):
796810 while True :
797811 c = next (data )
798812 if c in spacesAngleBrackets :
799- return "" .join (attrName ), "" .join (attrValue )
813+ return b "" .join (attrName ), b "" .join (attrValue )
800814 elif c in asciiUppercaseBytes :
801815 attrValue .append (c .lower ())
802816 elif c is None :
@@ -807,21 +821,22 @@ def getAttribute(self):
807821
808822class ContentAttrParser (object ):
809823 def __init__ (self , data ):
824+ assert isinstance (data , bytes )
810825 self .data = data
811826 def parse (self ):
812827 try :
813828 #Check if the attr name is charset
814829 #otherwise return
815- self .data .jumpTo ("charset" )
830+ self .data .jumpTo (b "charset" )
816831 self .data .position += 1
817832 self .data .skip ()
818- if not self .data .currentByte == "=" :
833+ if not self .data .currentByte == b "=" :
819834 #If there is no = sign keep looking for attrs
820835 return None
821836 self .data .position += 1
822837 self .data .skip ()
823838 #Look for an encoding between matching quote marks
824- if self .data .currentByte in ('"' , "'" ):
839+ if self .data .currentByte in (b '"' , b "'" ):
825840 quoteMark = self .data .currentByte
826841 self .data .position += 1
827842 oldPosition = self .data .position
@@ -845,6 +860,11 @@ def parse(self):
845860def codecName (encoding ):
846861 """Return the python codec name corresponding to an encoding or None if the
847862 string doesn't correspond to a valid encoding."""
863+ if isinstance (encoding , bytes ):
864+ try :
865+ encoding = encoding .decode ("ascii" )
866+ except UnicodeDecodeError :
867+ return None
848868 if encoding :
849869 canonicalName = ascii_punctuation_re .sub ("" , encoding ).lower ()
850870 return encodings .get (canonicalName , None )
0 commit comments