@@ -88,14 +88,16 @@ class HTMLSerializer(object):
8888 # miscellaneous options
8989 emit_doctype = 'preserve'
9090 inject_meta_charset = True
91+ lang_attr = 'preserve'
9192 strip_whitespace = False
9293 sanitize = False
9394
9495 options = ("quote_attr_values" , "quote_char" , "use_best_quote_char" ,
9596 "minimize_boolean_attributes" , "use_trailing_solidus" ,
9697 "space_before_trailing_solidus" , "omit_optional_tags" ,
9798 "strip_whitespace" , "inject_meta_charset" , "escape_lt_in_attrs" ,
98- "escape_rcdata" , "resolve_entities" , "emit_doctype" , "sanitize" )
99+ "escape_rcdata" , "resolve_entities" , "emit_doctype" , "lang_attr" ,
100+ "sanitize" )
99101
100102 def __init__ (self , ** kwargs ):
101103 """Initialize HTMLSerializer.
@@ -114,6 +116,11 @@ def __init__(self, **kwargs):
114116 * emit_doctype='preserve' preserves the doctype, if any, unchanged
115117 inject_meta_charset=True|False
116118 ..?
119+ lang_attr='preserve'|'xml'|'html'
120+ How to translate language attributes ('lang' / 'xml:lang').
121+ * lang_attr='preserve' does no translation
122+ * lang_attr='xml' translates 'lang' to 'xml:lang'
123+ * lang_attr='html' translates 'xml:lang' to 'lang'
117124 quote_attr_values=True|False
118125 Whether to quote attribute values that don't require quoting
119126 per HTML5 parsing rules.
@@ -288,6 +295,18 @@ def serialize(self, treewalker, encoding=None):
288295 attrs = attrs .items ()
289296 attributes = []
290297 for k ,v in attrs :
298+
299+ # normalize the namespace-qualified xml:lang key to its prefixed name
300+ if k == '{http://www.w3.org/XML/1998/namespace}lang' :
301+ k = 'xml:lang'
302+ if self .lang_attr == 'xml' :
303+ if k == 'lang' and not ('xml:lang' in attrs or
304+ '{http://www.w3.org/XML/1998/namespace}lang' in attrs ):
305+ k = 'xml:lang'
306+ elif self .lang_attr == 'html' :
307+ if k == 'xml:lang' and not ('lang' in attrs ):
308+ k = 'lang'
309+
291310 if encoding :
292311 k = k .encode (encoding , "strict" )
293312 attributes .append (' ' )
0 commit comments