Skip to content

Commit a8b37ad

Browse files
committed
Script by Marc-Andre Lemburg to generate htmlentitydefs.py.
1 parent f8e390b commit a8b37ad

1 file changed

Lines changed: 65 additions & 0 deletions

File tree

Tools/scripts/parseentities.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/usr/local/bin/python
2+
""" Utility for parsing HTML entity definitions available from:
3+
4+
http://www.w3.org/ as e.g.
5+
http://www.w3.org/TR/REC-html40/HTMLlat1.ent
6+
7+
Input is read from stdin, output is written to stdout in form of a
8+
Python snippet defining a dictionary "entitydefs" mapping literal
9+
entity name to character or numeric entity.
10+
11+
Marc-Andre Lemburg, mal@lemburg.com, 1999.
12+
Use as you like. NO WARRANTIES.
13+
14+
"""
15+
import re,sys
16+
import TextTools
17+
18+
# Matches one SGML entity declaration of the form
#   <!ENTITY name CDATA "value" -- comment -->
# capturing (name, value, comment).  The comment may span several lines.
# A raw string is used so that "\w" reaches the regex engine unchanged.
entityRE = re.compile(
    r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text, pos=0, endpos=None):
    """Collect all entity declarations found in text[pos:endpos].

    text   -- string holding the entity definitions
    pos    -- index at which scanning starts (default: 0)
    endpos -- index at which scanning stops; None means len(text)

    Returns a dictionary mapping each entity name to a tuple
    (charcode, comment), both taken verbatim from the declaration.
    If a name is declared more than once, the last declaration wins.
    """
    if endpos is None:
        endpos = len(text)
    d = {}
    # finditer yields successive non-overlapping matches, starting at
    # the caller-supplied pos (which must not be reset here).
    for m in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = m.groups()
        d[name] = charcode, comment
    return d
34+
35+
def writefile(f, defs):
    """Write defs to f as a Python snippet defining "entitydefs".

    f    -- object with a write() method accepting strings
    defs -- dictionary mapping entity name to (charcode, comment),
            as returned by parse()

    Entries are written sorted by entity name, one per line, with the
    comment appended as a trailing "#" comment.  A numeric character
    reference ("&#NNN;") below 256 is written as an octal-escaped
    one-character string literal; every other value is written as the
    repr() of the charcode string.
    """
    f.write("entitydefs = {\n")
    # sorted() works on both lists and dictionary views.
    for name, (charcode, comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Strip the leading "&#" and the trailing ";".
            code = int(charcode[2:-1])
            if code < 256:
                # Raw string: the backslash must end up in the output.
                charcode = r"'\%o'" % code
            else:
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        # Fold newlines and whitespace runs into single spaces so the
        # comment fits on one line (stdlib stand-in for
        # TextTools.collapse).
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name, charcode, comment))
    f.write('\n}\n')
52+
53+
if __name__ == '__main__':
    # Usage: parseentities.py [infile [outfile]]
    # Without arguments, input is read from stdin and output is
    # written to stdout.
    if len(sys.argv) > 1:
        infile = open(sys.argv[1])
    else:
        infile = sys.stdin
    try:
        text = infile.read()
    finally:
        # Only close what was opened here; leave stdin alone.
        if infile is not sys.stdin:
            infile.close()
    defs = parse(text)
    # The output file is opened only after the input has been read, so
    # an output path equal to the input path is not truncated too early.
    if len(sys.argv) > 2:
        outfile = open(sys.argv[2],'w')
    else:
        outfile = sys.stdout
    try:
        writefile(outfile,defs)
    finally:
        if outfile is not sys.stdout:
            outfile.close()
65+

0 commit comments

Comments
 (0)