@@ -21,7 +21,9 @@ var START_TAG_REGEXP = /^<\s*([\w:]+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']
2121 BEGIN_TAG_REGEXP = / ^ < / ,
2222 BEGING_END_TAGE_REGEXP = / ^ < \s * \/ / ,
2323 COMMENT_REGEXP = / < ! - - ( .* ?) - - > / g,
24- CDATA_REGEXP = / < ! \[ C D A T A \[ ( .* ?) ] ] > / g;
24+ CDATA_REGEXP = / < ! \[ C D A T A \[ ( .* ?) ] ] > / g,
25+ URI_REGEXP = / ^ ( ( f t p | h t t p s ? ) : \/ \/ | m a i l t o : | # ) / ,
26+ NON_ALPHANUMERIC_REGEXP = / ( [ ^ \# - ~ | | ! ] ) / g; // Match everything outside of normal chars and " (quote character)
2527
2628// Empty Elements - HTML 4.01
2729var emptyElements = makeMap ( "area,base,basefont,br,col,hr,img,input,isindex,link,param" ) ;
@@ -33,24 +35,24 @@ var blockElements = makeMap("address,blockquote,button,center,dd,del,dir,div,dl,
3335// Inline Elements - HTML 4.01
3436var inlineElements = makeMap ( "a,abbr,acronym,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,img," +
3537 "input,ins,kbd,label,map,q,s,samp,select,small,span,strike,strong,sub,sup,textarea,tt,u,var" ) ;
36-
3738// Elements that you can, intentionally, leave open
3839// (and which close themselves)
3940var closeSelfElements = makeMap ( "colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr" ) ;
40-
41- // Attributes that have their values filled in disabled="disabled"
42- var fillAttrs = makeMap ( "checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected" ) ;
43-
4441// Special Elements (can contain anything)
4542var specialElements = makeMap ( "script,style" ) ;
46-
4743var validElements = extend ( { } , emptyElements , blockElements , inlineElements , closeSelfElements ) ;
48- var validAttrs = extend ( { } , fillAttrs , makeMap (
49- 'abbr,align,alink,alt,archive,axis,background,bgcolor,border,cellpadding,cellspacing,cite,class,classid,clear,code,codebase,' +
50- 'codetype,color,cols,colspan,content,coords,data,dir,face,for,headers,height,href,hreflang,hspace,id,label,lang,language,' +
51- 'link,longdesc,marginheight,marginwidth,maxlength,media,method,name,nowrap,profile,prompt,rel,rev,rows,rowspan,rules,scheme,' +
52- 'scope,scrolling,shape,size,span,src,standby,start,summary,tabindex,target,text,title,type,usemap,valign,value,valuetype,' +
53- 'vlink,vspace,width' ) ) ;
44+
45+ //see: http://www.w3.org/TR/html4/index/attributes.html
46+ //Attributes that have their values filled in disabled="disabled"
47+ var fillAttrs = makeMap ( "checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected" ) ;
48+ //Attributes that have href and hence need to be sanitized
49+ var uriAttrs = makeMap ( "background,href,longdesc,src,usemap" ) ;
50+ var validAttrs = extend ( { } , fillAttrs , uriAttrs , makeMap (
51+ 'abbr,align,alt,axis,bgcolor,border,cellpadding,cellspacing,class,clear,' +
52+ 'color,cols,colspan,coords,dir,face,for,headers,height,hreflang,hspace,id,' +
53+ 'label,lang,language,maxlength,method,name,prompt,rel,rev,rows,rowspan,rules,' +
54+ 'scope,scrolling,shape,size,span,start,summary,tabindex,target,title,type,' +
55+ 'valign,value,vspace,width' ) ) ;
5456
5557/**
5658 * @example
@@ -64,7 +66,7 @@ var validAttrs = extend({}, fillAttrs, makeMap(
6466 * @param {string } html string
6567 * @param {object } handler
6668 */
67- var htmlParser = function ( html , handler ) {
69+ function htmlParser ( html , handler ) {
6870 var index , chars , match , stack = [ ] , last = html ;
6971 stack . last = function ( ) { return stack [ stack . length - 1 ] ; } ;
7072
@@ -112,8 +114,7 @@ var htmlParser = function( html, handler ) {
112114 var text = index < 0 ? html : html . substring ( 0 , index ) ;
113115 html = index < 0 ? "" : html . substring ( index ) ;
114116
115- if ( handler . chars )
116- handler . chars ( text ) ;
117+ handler . chars ( decodeEntities ( text ) ) ;
117118 }
118119
119120 } else {
@@ -122,8 +123,7 @@ var htmlParser = function( html, handler ) {
122123 replace ( COMMENT_REGEXP , "$1" ) .
123124 replace ( CDATA_REGEXP , "$1" ) ;
124125
125- if ( handler . chars )
126- handler . chars ( text ) ;
126+ handler . chars ( decodeEntities ( text ) ) ;
127127
128128 return "" ;
129129 } ) ;
@@ -157,21 +157,18 @@ var htmlParser = function( html, handler ) {
157157 if ( ! unary )
158158 stack . push ( tagName ) ;
159159
160- if ( handler . start ) {
161- var attrs = { } ;
160+ var attrs = { } ;
162161
163- rest . replace ( ATTR_REGEXP , function ( match , name ) {
164- var value = arguments [ 2 ] ? arguments [ 2 ] :
165- arguments [ 3 ] ? arguments [ 3 ] :
166- arguments [ 4 ] ? arguments [ 4 ] :
167- fillAttrs [ name ] ? name : "" ;
162+ rest . replace ( ATTR_REGEXP , function ( match , name ) {
163+ var value = arguments [ 2 ] ? arguments [ 2 ] :
164+ arguments [ 3 ] ? arguments [ 3 ] :
165+ arguments [ 4 ] ? arguments [ 4 ] :
166+ fillAttrs [ name ] ? name : "" ;
168167
169- attrs [ name ] = value ; //value.replace(/(^|[^\\])"/g, '$1\\\"') //"
170- } ) ;
168+ attrs [ name ] = decodeEntities ( value ) ; //value.replace(/(^|[^\\])"/g, '$1\\\"') //"
169+ } ) ;
171170
172- if ( handler . start )
173- handler . start ( tagName , attrs , unary ) ;
174- }
171+ handler . start ( tagName , attrs , unary ) ;
175172 }
176173
177174 function parseEndTag ( tag , tagName ) {
@@ -186,8 +183,7 @@ var htmlParser = function( html, handler ) {
186183 if ( pos >= 0 ) {
187184 // Close all the open elements, up the stack
188185 for ( i = stack . length - 1 ; i >= pos ; i -- )
189- if ( handler . end )
190- handler . end ( stack [ i ] ) ;
186+ handler . end ( stack [ i ] ) ;
191187
192188 // Remove the open elements from the stack
193189 stack . length = pos ;
@@ -206,28 +202,32 @@ function makeMap(str){
206202 return obj ;
207203}
208204
209- /*
210- * For attack vectors see: http://ha.ckers.org/xss.html
205+ /**
206+ * decodes all entities into regular string
207+ * @param value
208+ * @returns
211209 */
212- var JAVASCRIPT_URL = / ^ j a v a s c r i p t : / i,
213- NBSP_REGEXP = / & n b s p ; / gim,
214- HEX_ENTITY_REGEXP = / & # x ( [ \d a - f ] * ) ; ? / igm,
215- DEC_ENTITY_REGEXP = / & # ( \d + ) ; ? / igm,
216- CHAR_REGEXP = / [ \w : ] / gm,
217- HEX_DECODE = function ( match , code ) { return fromCharCode ( parseInt ( code , 16 ) ) ; } ,
218- DEC_DECODE = function ( match , code ) { return fromCharCode ( code ) ; } ;
210+ var hiddenPre = document . createElement ( "pre" ) ;
211+ function decodeEntities ( value ) {
212+ hiddenPre . innerHTML = value . replace ( / < / g, "<" ) ;
213+ return hiddenPre . innerText || hiddenPre . textContent ;
214+ }
215+
219216/**
220- * @param {string } url
221- * @returns true if url decodes to something which starts with 'javascript:' hence unsafe
217+ * Escapes all potentially dangerous characters, so that the
218+ * resulting string can be safely inserted into attribute or
219+ * element text.
220+ * @param value
221+ * @returns escaped text
222222 */
223- function isJavaScriptUrl ( url ) {
224- var chars = [ ] ;
225- url . replace ( NBSP_REGEXP , '' ) .
226- replace ( HEX_ENTITY_REGEXP , HEX_DECODE ) .
227- replace ( DEC_ENTITY_REGEXP , DEC_DECODE ) .
228- // Remove all non \w: characters, unfurtunetly value.replace(/[\w:]/,'') can be defeated using \u0000
229- replace ( CHAR_REGEXP , function ( ch ) { chars . push ( ch ) ; } ) ;
230- return JAVASCRIPT_URL . test ( lowercase ( chars . join ( '' ) ) ) ;
223+ function encodeEntities ( value ) {
224+ return value .
225+ replace ( / & / g , '& ' ) .
226+ replace ( NON_ALPHANUMERIC_REGEXP , function ( value ) {
227+ return '&#' + value . charCodeAt ( 0 ) + ';' ;
228+ } ) .
229+ replace ( / < / g , '<' ) .
230+ replace ( / > / g , '>' ) ;
231231}
232232
233233/**
@@ -253,14 +253,12 @@ function htmlSanitizeWriter(buf){
253253 out ( '<' ) ;
254254 out ( tag ) ;
255255 foreach ( attrs , function ( value , key ) {
256- if ( validAttrs [ lowercase ( key ) ] && ! isJavaScriptUrl ( value ) ) {
256+ var lkey = lowercase ( key ) ;
257+ if ( validAttrs [ lkey ] && ( uriAttrs [ lkey ] !== true || value . match ( URI_REGEXP ) ) ) {
257258 out ( ' ' ) ;
258259 out ( key ) ;
259260 out ( '="' ) ;
260- out ( value .
261- replace ( / < / g, '<' ) .
262- replace ( / > / g, '>' ) .
263- replace ( / \" / g, '"' ) ) ;
261+ out ( encodeEntities ( value ) ) ;
264262 out ( '"' ) ;
265263 }
266264 } ) ;
@@ -280,10 +278,7 @@ function htmlSanitizeWriter(buf){
280278 } ,
281279 chars : function ( chars ) {
282280 if ( ! ignore ) {
283- out ( chars .
284- replace ( / & ( \w + [ & ; \W ] ) ? / g, function ( match , entity ) { return entity ?match :'&' ; } ) .
285- replace ( / < / g, '<' ) .
286- replace ( / > / g, '>' ) ) ;
281+ out ( encodeEntities ( chars ) ) ;
287282 }
288283 }
289284 } ;
0 commit comments