@@ -14,8 +14,12 @@ def getNodeDetails(self, node):
1414 return (_base .DOCUMENT ,)
1515
1616 elif isinstance (node , Declaration ): # DocumentType
17- #Slice needed to remove markup added during unicode conversion
18- m = self .doctype_regexp .match (unicode (node .string )[2 :- 1 ])
17+ string = unicode (node .string )
18+ #Slice needed to remove markup added during unicode conversion,
19+ #but only in some versions of BeautifulSoup/Python
20+ if string .startswith ('<!' ) and string .endswith ('>' ):
21+ string = string [2 :- 1 ]
22+ m = self .doctype_regexp .match (string )
1923 #This regexp approach seems wrong and fragile
2024 #but beautiful soup stores the doctype as a single thing and we want the separate bits
2125 #It should work as long as the tree is created by html5lib itself but may be wrong if it's
@@ -31,7 +35,10 @@ def getNodeDetails(self, node):
3135 return _base .DOCTYPE , name , publicId or "" , systemId or ""
3236
3337 elif isinstance (node , Comment ):
34- return _base .COMMENT , unicode (node .string )[4 :- 3 ]
38+ string = unicode (node .string )
39+ if string .startswith ('<!--' ) and string .endswith ('-->' ):
40+ string = string [4 :- 3 ]
41+ return _base .COMMENT , string
3542
3643 elif isinstance (node , unicode ): # TextNode
3744 return _base .TEXT , node
0 commit comments