1+ /*
2+ * HTML Parser By John Resig (ejohn.org)
3+ * Original code by Erik Arvidsson, Mozilla Public License
4+ * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
5+ *
6+ * // Use like so:
7+ * htmlParser(htmlString, {
8+ * start: function(tag, attrs, unary) {},
9+ * end: function(tag) {},
10+ * chars: function(text) {},
11+ * comment: function(text) {}
12+ * });
13+ *
14+ * // or to get an XML string:
15+ * HTMLtoXML(htmlString);
16+ *
17+ * // or to get an XML DOM Document
18+ * HTMLtoDOM(htmlString);
19+ *
20+ * // or to inject into an existing document/DOM node
21+ * HTMLtoDOM(htmlString, document);
22+ * HTMLtoDOM(htmlString, document.body);
23+ *
24+ */
25+
26+ ( function ( ) {
27+
// Regular expressions for parsing tags and attributes.
//
// startTag captures: 1 = tag name, 2 = raw attribute text, 3 = "/" when the
// tag is written as self-closing.
// endTag captures:   1 = tag name.
// attr captures:     1 = attribute name, 2 = double-quoted value,
//                    3 = single-quoted value, 4 = unquoted value.
// attr carries the /g flag because it is applied with String#replace to walk
// every attribute inside the text captured by startTag.
var startTag = /^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
    endTag = /^<\/(\w+)[^>]*>/,
    attr = /(\w+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

// The tables below are built with makeMap(), a function declaration found
// further down in this file (hoisted, so it is callable here). Each table is
// used purely as a "is this name a member?" lookup.

// Empty Elements - HTML 4.01 (never pushed on the open-element stack)
var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

// Block Elements - HTML 4.01 (opening one closes any open inline elements)
var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

// Inline Elements - HTML 4.01
var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

// Elements that you can, intentionally, leave open
// (and which close themselves when the same element is opened again).
// The element name is "option"; "options" is not an HTML element and would
// leave unclosed <option> tags nested inside each other.
var closeSelf = makeMap("colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr");

// Attributes that have their values filled in, e.g. disabled="disabled"
var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

// Special Elements (can contain anything; their content is not parsed as markup)
var special = makeMap("script,style");
51+
/**
 * Streaming, callback-driven HTML tokenizer.
 *
 * Consumes `html` from the front, one token per loop iteration, and reports
 * what it finds through the optional callbacks on `handler`:
 *
 *   handler.start(tagName, attrs, unary)  - an opening tag; `attrs` is an array
 *                                           of { name, value, escaped } and
 *                                           `unary` is truthy when no matching
 *                                           end() call will follow
 *   handler.end(tagName)                  - an element was closed (explicitly,
 *                                           implicitly, or at end of input)
 *   handler.chars(text)                   - a run of character data
 *   handler.comment(text)                 - the text between <!-- and -->
 *
 * Throws the string "Parse Error: " + remainingInput when a loop iteration
 * fails to consume anything (for example on malformed markup).
 */
var htmlParser = this.htmlParser = function (html, handler) {
    var index, chars, match, stack = [], last = html;

    // Convenience accessor for the innermost open element (undefined if none).
    stack.last = function () {
        return this[this.length - 1];
    };

    while (html) {
        // Assume the next token is character data until a tag or comment
        // is successfully recognised.
        chars = true;

        // Make sure we're not in a script or style element
        if (!stack.last() || !special[stack.last()]) {

            // Comment
            if (html.indexOf("<!--") === 0) {
                index = html.indexOf("-->");

                if (index >= 0) {
                    if (handler.comment) {
                        handler.comment(html.substring(4, index));
                    }
                    html = html.substring(index + 3);
                    chars = false;
                }

            // end tag
            } else if (html.indexOf("</") === 0) {
                match = html.match(endTag);

                if (match) {
                    html = html.substring(match[0].length);
                    // replace() is used only to hand the capture groups
                    // to parseEndTag; its return value is discarded.
                    match[0].replace(endTag, parseEndTag);
                    chars = false;
                }

            // start tag
            } else if (html.indexOf("<") === 0) {
                match = html.match(startTag);

                if (match) {
                    html = html.substring(match[0].length);
                    match[0].replace(startTag, parseStartTag);
                    chars = false;
                }
            }

            if (chars) {
                // Everything up to the next "<" (or the end of input) is text.
                index = html.indexOf("<");

                var text = index < 0 ? html : html.substring(0, index);
                html = index < 0 ? "" : html.substring(index);

                if (handler.chars) {
                    handler.chars(text);
                }
            }

        } else {
            // Inside <script>/<style>: take everything up to the FIRST matching
            // close tag as raw text. [\s\S] is used instead of "." so content
            // spanning several lines is consumed as one unit, the quantifier is
            // lazy so a later element of the same type is left alone, and the
            // pattern is anchored because the content always starts at the
            // front of the remaining input.
            html = html.replace(new RegExp("^([\\s\\S]*?)</" + stack.last() + "[^>]*>"), function (all, text) {
                // Unwrap comment / CDATA guards commonly placed around inline
                // script and style content, keeping what they enclose.
                text = text.replace(/<!--([\s\S]*?)-->/g, "$1")
                    .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, "$1");

                if (handler.chars) {
                    handler.chars(text);
                }

                return "";
            });

            parseEndTag("", stack.last());
        }

        // No progress was made in this iteration: bail out instead of
        // looping forever.
        if (html === last) {
            throw "Parse Error: " + html;
        }
        last = html;
    }

    // Clean up any remaining tags
    parseEndTag();

    /**
     * Handles an opening tag. Called as a String#replace callback for
     * startTag, so the arguments are the whole match followed by its
     * capture groups: tag name, raw attribute text and the optional "/".
     */
    function parseStartTag(tag, tagName, rest, unary) {
        // A block element implicitly closes any inline elements still open.
        if (block[tagName]) {
            while (stack.last() && inline[stack.last()]) {
                parseEndTag("", stack.last());
            }
        }

        // e.g. <li>one<li>two : the second <li> closes the first.
        if (closeSelf[tagName] && stack.last() === tagName) {
            parseEndTag("", tagName);
        }

        unary = empty[tagName] || !!unary;

        if (!unary) {
            stack.push(tagName);
        }

        if (handler.start) {
            var attrs = [];

            rest.replace(attr, function (match, name, doubleQuoted, singleQuoted, unquoted) {
                // First non-empty capture wins; a value-less boolean attribute
                // listed in fillAttrs gets its own name as value.
                var value = doubleQuoted ? doubleQuoted :
                    singleQuoted ? singleQuoted :
                    unquoted ? unquoted :
                    fillAttrs[name] ? name : "";

                attrs.push({
                    name: name,
                    value: value,
                    // Backslash-escape double quotes that are not already escaped.
                    escaped: value.replace(/(^|[^\\])"/g, '$1\\\"')
                });
            });

            handler.start(tagName, attrs, unary);
        }
    }

    /**
     * Closes `tagName` together with every element opened after it.
     * With no tag name, closes everything that is still open. An end tag
     * that matches no open element is ignored. The first parameter only
     * exists so the function can be used as a String#replace callback.
     */
    function parseEndTag(tag, tagName) {
        var pos;

        if (!tagName) {
            // If no tag name is provided, clean shop
            pos = 0;
        } else {
            // Find the closest opened tag of the same type
            for (pos = stack.length - 1; pos >= 0; pos--) {
                if (stack[pos] === tagName) {
                    break;
                }
            }
        }

        if (pos >= 0) {
            // Close all the open elements, up the stack
            for (var i = stack.length - 1; i >= pos; i--) {
                if (handler.end) {
                    handler.end(stack[i]);
                }
            }

            // Remove the open elements from the stack
            stack.length = pos;
        }
    }
};
187+
/**
 * Re-serialises an HTML string as a tag-balanced string.
 *
 * Runs the input through htmlParser and writes every reported event back out:
 * attributes are always emitted as name="value" (using the parser's `escaped`
 * form of the value), unary elements are written with a trailing "/", and each
 * element the parser reports as closed gets an explicit end tag. Text and
 * comment contents are copied through unchanged.
 */
this.HTMLtoXML = function (html) {
    // Output fragments, in document order; joined once at the end.
    var pieces = [];

    var serializer = {
        start: function (tag, attrs, unary) {
            var opening = "<" + tag;
            var count = attrs.length;

            for (var n = 0; n < count; n++) {
                var attribute = attrs[n];
                opening += " " + attribute.name + '="' + attribute.escaped + '"';
            }

            if (unary) {
                opening += "/";
            }

            pieces.push(opening + ">");
        },

        end: function (tag) {
            pieces.push("</" + tag + ">");
        },

        chars: function (text) {
            pieces.push(text);
        },

        comment: function (text) {
            pieces.push("<!--" + text + "-->");
        }
    };

    htmlParser(html, serializer);

    return pieces.join("");
};
213+
/**
 * Parses an HTML string into DOM nodes.
 *
 * @param html  The markup to parse.
 * @param doc   Optional. A document, or a node whose owner document should be
 *              used. When omitted, a new empty document is created through
 *              whichever factory the environment offers (DOMDocument,
 *              document.implementation.createDocument, or MSXML via
 *              ActiveXObject).
 * @returns     The document the nodes were added to. Parsed content is
 *              appended under that document's <body>, except for elements
 *              named in `structure`, which are routed to <head>.
 */
this.HTMLtoDOM = function (html, doc) {
    // There can be only one of these elements
    var one = makeMap("html,head,body,title");

    // Enforce a structure for the document
    var structure = {
        link: "head",
        base: "head"
    };

    if (!doc) {
        if (typeof DOMDocument != "undefined") {
            doc = new DOMDocument();
        } else if (typeof document != "undefined" && document.implementation && document.implementation.createDocument) {
            doc = document.implementation.createDocument("", "", null);
        } else if (typeof ActiveXObject != "undefined") {
            // The feature test must name the constructor that is actually
            // invoked; otherwise this fallback can never be reached.
            doc = new ActiveXObject("Msxml.DOMDocument");
        }
    } else {
        // A node was passed in: work on the document that owns it.
        doc = doc.ownerDocument ||
            (doc.getOwnerDocument && doc.getOwnerDocument()) ||
            doc;
    }

    // Stack of currently open parent nodes; the last entry is the node that
    // receives new children.
    var elems = [],
        documentElement = doc.documentElement ||
            (doc.getDocumentElement && doc.getDocumentElement());

    // If we're dealing with an empty document then we
    // need to pre-populate it with the HTML document structure
    if (!documentElement && doc.createElement) {
        (function () {
            var htmlElem = doc.createElement("html");
            var headElem = doc.createElement("head");
            headElem.appendChild(doc.createElement("title"));
            htmlElem.appendChild(headElem);
            htmlElem.appendChild(doc.createElement("body"));
            doc.appendChild(htmlElem);
        })();
    }

    // Find all the unique elements: replace each `true` marker with the
    // document's existing element of that name (undefined when there is none).
    if (doc.getElementsByTagName) {
        for (var key in one) {
            if (Object.prototype.hasOwnProperty.call(one, key)) {
                one[key] = doc.getElementsByTagName(key)[0];
            }
        }
    }

    // If we're working with a document, inject contents into
    // the body element
    var curParentNode = one.body;

    htmlParser(html, {
        start: function (tagName, attrs, unary) {
            // If it's a pre-built element, then we can ignore
            // its construction
            if (one[tagName]) {
                // Track it on the stack like any other open element so that
                // the end() call for it pops the right entry.
                if (!unary) {
                    elems.push(one[tagName]);
                }
                curParentNode = one[tagName];
                return;
            }

            var elem = doc.createElement(tagName);

            // attrs is an array: walk it by index so that only its elements
            // are visited.
            for (var j = 0; j < attrs.length; j++) {
                elem.setAttribute(attrs[j].name, attrs[j].value);
            }

            // Elements listed in `structure` go to their designated container
            // when that container actually exists in this document.
            var forcedParent = structure[tagName] && one[structure[tagName]];

            if (forcedParent && forcedParent.appendChild) {
                forcedParent.appendChild(elem);
            } else if (curParentNode && curParentNode.appendChild) {
                curParentNode.appendChild(elem);
            }

            if (!unary) {
                elems.push(elem);
                curParentNode = elem;
            }
        },
        end: function (tag) {
            if (elems.length > 0) {
                elems.length -= 1;
            }

            // Init the new parentNode: the enclosing open element, or the
            // body again once everything has been closed.
            curParentNode = elems.length > 0 ? elems[elems.length - 1] : one.body;
        },
        chars: function (text) {
            // Same guard as in start(): without a usable parent there is
            // nowhere to attach the text.
            if (curParentNode && curParentNode.appendChild) {
                curParentNode.appendChild(doc.createTextNode(text));
            }
        },
        comment: function (text) {
            // Not implemented: comments are not added to the document.
        }
    });

    return doc;
};
302+
/**
 * Builds a membership lookup from a comma-separated list of names.
 *
 * @param str  Comma-separated names, e.g. "script,style".
 * @returns    An object on which every listed name is an own, enumerable
 *             property with the value `true`.
 *
 * The object has no prototype, so a lookup such as map["constructor"] or
 * map["toString"] yields undefined unless that name was explicitly listed.
 */
function makeMap(str) {
    var obj = Object.create(null), items = str.split(",");
    for (var i = 0; i < items.length; i++) {
        obj[items[i]] = true;
    }
    return obj;
}
309+ } ) ( ) ;
0 commit comments