Skip to content

Commit 41d5938

Browse files
committed
Fixed sanitization
* explicitly require full URLs (ftp|https?://...) * list the URI attributes * remove a lot of unneeded attributes
1 parent 5480d2a commit 41d5938

2 files changed

Lines changed: 90 additions & 74 deletions

File tree

src/sanitizer.js

Lines changed: 55 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ var START_TAG_REGEXP = /^<\s*([\w:]+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']
2121
BEGIN_TAG_REGEXP = /^</,
2222
BEGING_END_TAGE_REGEXP = /^<\s*\//,
2323
COMMENT_REGEXP = /<!--(.*?)-->/g,
24-
CDATA_REGEXP = /<!\[CDATA\[(.*?)]]>/g;
24+
CDATA_REGEXP = /<!\[CDATA\[(.*?)]]>/g,
25+
URI_REGEXP = /^((ftp|https?):\/\/|mailto:|#)/,
26+
NON_ALPHANUMERIC_REGEXP = /([^\#-~| |!])/g; // Match everything outside of normal chars and " (quote character)
2527

2628
// Empty Elements - HTML 4.01
2729
var emptyElements = makeMap("area,base,basefont,br,col,hr,img,input,isindex,link,param");
@@ -33,24 +35,24 @@ var blockElements = makeMap("address,blockquote,button,center,dd,del,dir,div,dl,
3335
// Inline Elements - HTML 4.01
3436
var inlineElements = makeMap("a,abbr,acronym,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,img,"+
3537
"input,ins,kbd,label,map,q,s,samp,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");
36-
3738
// Elements that you can, intentionally, leave open
3839
// (and which close themselves)
3940
var closeSelfElements = makeMap("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");
40-
41-
// Attributes that have their values filled in disabled="disabled"
42-
var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");
43-
4441
// Special Elements (can contain anything)
4542
var specialElements = makeMap("script,style");
46-
4743
var validElements = extend({}, emptyElements, blockElements, inlineElements, closeSelfElements);
48-
var validAttrs = extend({}, fillAttrs, makeMap(
49-
'abbr,align,alink,alt,archive,axis,background,bgcolor,border,cellpadding,cellspacing,cite,class,classid,clear,code,codebase,'+
50-
'codetype,color,cols,colspan,content,coords,data,dir,face,for,headers,height,href,hreflang,hspace,id,label,lang,language,'+
51-
'link,longdesc,marginheight,marginwidth,maxlength,media,method,name,nowrap,profile,prompt,rel,rev,rows,rowspan,rules,scheme,'+
52-
'scope,scrolling,shape,size,span,src,standby,start,summary,tabindex,target,text,title,type,usemap,valign,value,valuetype,'+
53-
'vlink,vspace,width'));
44+
45+
//see: http://www.w3.org/TR/html4/index/attributes.html
46+
//Attributes that have their values filled in disabled="disabled"
47+
var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");
48+
//Attributes that have href and hence need to be sanitized
49+
var uriAttrs = makeMap("background,href,longdesc,src,usemap");
50+
var validAttrs = extend({}, fillAttrs, uriAttrs, makeMap(
51+
'abbr,align,alt,axis,bgcolor,border,cellpadding,cellspacing,class,clear,'+
52+
'color,cols,colspan,coords,dir,face,for,headers,height,hreflang,hspace,id,'+
53+
'label,lang,language,maxlength,method,name,prompt,rel,rev,rows,rowspan,rules,'+
54+
'scope,scrolling,shape,size,span,start,summary,tabindex,target,title,type,'+
55+
'valign,value,vspace,width'));
5456

5557
/**
5658
* @example
@@ -64,7 +66,7 @@ var validAttrs = extend({}, fillAttrs, makeMap(
6466
* @param {string} html string
6567
* @param {object} handler
6668
*/
67-
var htmlParser = function( html, handler ) {
69+
function htmlParser( html, handler ) {
6870
var index, chars, match, stack = [], last = html;
6971
stack.last = function(){ return stack[ stack.length - 1 ]; };
7072

@@ -112,8 +114,7 @@ var htmlParser = function( html, handler ) {
112114
var text = index < 0 ? html : html.substring( 0, index );
113115
html = index < 0 ? "" : html.substring( index );
114116

115-
if ( handler.chars )
116-
handler.chars( text );
117+
handler.chars( decodeEntities(text) );
117118
}
118119

119120
} else {
@@ -122,8 +123,7 @@ var htmlParser = function( html, handler ) {
122123
replace(COMMENT_REGEXP, "$1").
123124
replace(CDATA_REGEXP, "$1");
124125

125-
if ( handler.chars )
126-
handler.chars( text );
126+
handler.chars( decodeEntities(text) );
127127

128128
return "";
129129
});
@@ -157,21 +157,18 @@ var htmlParser = function( html, handler ) {
157157
if ( !unary )
158158
stack.push( tagName );
159159

160-
if ( handler.start ) {
161-
var attrs = {};
160+
var attrs = {};
162161

163-
rest.replace(ATTR_REGEXP, function(match, name) {
164-
var value = arguments[2] ? arguments[2] :
165-
arguments[3] ? arguments[3] :
166-
arguments[4] ? arguments[4] :
167-
fillAttrs[name] ? name : "";
162+
rest.replace(ATTR_REGEXP, function(match, name) {
163+
var value = arguments[2] ? arguments[2] :
164+
arguments[3] ? arguments[3] :
165+
arguments[4] ? arguments[4] :
166+
fillAttrs[name] ? name : "";
168167

169-
attrs[name] = value; //value.replace(/(^|[^\\])"/g, '$1\\\"') //"
170-
});
168+
attrs[name] = decodeEntities(value); //value.replace(/(^|[^\\])"/g, '$1\\\"') //"
169+
});
171170

172-
if ( handler.start )
173-
handler.start( tagName, attrs, unary );
174-
}
171+
handler.start( tagName, attrs, unary );
175172
}
176173

177174
function parseEndTag( tag, tagName ) {
@@ -186,8 +183,7 @@ var htmlParser = function( html, handler ) {
186183
if ( pos >= 0 ) {
187184
// Close all the open elements, up the stack
188185
for ( i = stack.length - 1; i >= pos; i-- )
189-
if ( handler.end )
190-
handler.end( stack[ i ] );
186+
handler.end( stack[ i ] );
191187

192188
// Remove the open elements from the stack
193189
stack.length = pos;
@@ -206,28 +202,32 @@ function makeMap(str){
206202
return obj;
207203
}
208204

209-
/*
210-
* For attack vectors see: http://ha.ckers.org/xss.html
205+
/**
206+
* decodes all entities into regular string
207+
* @param value
208+
* @returns
211209
*/
212-
var JAVASCRIPT_URL = /^javascript:/i,
213-
NBSP_REGEXP = /&nbsp;/gim,
214-
HEX_ENTITY_REGEXP = /&#x([\da-f]*);?/igm,
215-
DEC_ENTITY_REGEXP = /&#(\d+);?/igm,
216-
CHAR_REGEXP = /[\w:]/gm,
217-
HEX_DECODE = function(match, code){return fromCharCode(parseInt(code,16));},
218-
DEC_DECODE = function(match, code){return fromCharCode(code);};
210+
var hiddenPre=document.createElement("pre");
211+
function decodeEntities(value) {
212+
hiddenPre.innerHTML=value.replace(/</g,"&lt;");
213+
return hiddenPre.innerText || hiddenPre.textContent;
214+
}
215+
219216
/**
220-
* @param {string} url
221-
* @returns true if url decodes to something which starts with 'javascript:' hence unsafe
217+
* Escapes all potentially dangerous characters, so that the
218+
* resulting string can be safely inserted into attribute or
219+
* element text.
220+
* @param value
221+
* @returns escaped text
222222
*/
223-
function isJavaScriptUrl(url) {
224-
var chars = [];
225-
url.replace(NBSP_REGEXP, '').
226-
replace(HEX_ENTITY_REGEXP, HEX_DECODE).
227-
replace(DEC_ENTITY_REGEXP, DEC_DECODE).
228-
// Remove all non \w: characters, unfurtunetly value.replace(/[\w:]/,'') can be defeated using \u0000
229-
replace(CHAR_REGEXP, function(ch){chars.push(ch);});
230-
return JAVASCRIPT_URL.test(lowercase(chars.join('')));
223+
function encodeEntities(value) {
224+
return value.
225+
replace(/&/g, '&amp;').
226+
replace(NON_ALPHANUMERIC_REGEXP, function(value){
227+
return '&#' + value.charCodeAt(0) + ';';
228+
}).
229+
replace(/</g, '&lt;').
230+
replace(/>/g, '&gt;');
231231
}
232232

233233
/**
@@ -253,14 +253,12 @@ function htmlSanitizeWriter(buf){
253253
out('<');
254254
out(tag);
255255
foreach(attrs, function(value, key){
256-
if (validAttrs[lowercase(key)] && !isJavaScriptUrl(value)) {
256+
var lkey=lowercase(key);
257+
if (validAttrs[lkey] && (uriAttrs[lkey]!==true || value.match(URI_REGEXP))) {
257258
out(' ');
258259
out(key);
259260
out('="');
260-
out(value.
261-
replace(/</g, '&lt;').
262-
replace(/>/g, '&gt;').
263-
replace(/\"/g,'&quot;'));
261+
out(encodeEntities(value));
264262
out('"');
265263
}
266264
});
@@ -280,10 +278,7 @@ function htmlSanitizeWriter(buf){
280278
},
281279
chars: function(chars){
282280
if (!ignore) {
283-
out(chars.
284-
replace(/&(\w+[&;\W])?/g, function(match, entity){return entity?match:'&amp;';}).
285-
replace(/</g, '&lt;').
286-
replace(/>/g, '&gt;'));
281+
out(encodeEntities(chars));
287282
}
288283
}
289284
};

test/sanitizerSpec.js

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ describe('HTML', function(){
66

77
it('should echo html', function(){
88
expectHTML('hello<b class="1\'23" align=\'""\'>world</b>.').
9-
toEqual('hello<b class="1\'23" align="&quot;&quot;">world</b>.');
9+
toEqual('hello<b class="1\'23" align="&#34;&#34;">world</b>.');
1010
});
1111

1212
it('should remove script', function(){
@@ -49,9 +49,15 @@ describe('HTML', function(){
4949
expectHTML('a<my:hr/><my:div>b</my:div>c').toEqual('abc');
5050
});
5151

52+
it('should handle entities', function(){
53+
var everything = '<div id="!@#$%^&amp;*()_+-={}[]:&#34;;\'&lt;&gt;?,./`~ &#295;">' +
54+
'!@#$%^&amp;*()_+-={}[]:&#34;;\'&lt;&gt;?,./`~ &#295;</div>';
55+
expectHTML(everything).toEqual(everything);
56+
});
57+
5258
it('should handle improper html', function(){
53-
expectHTML('< div id="</div>" alt=abc href=\'"\' >text< /div>').
54-
toEqual('<div id="&lt;/div&gt;" alt="abc" href="&quot;">text</div>');
59+
expectHTML('< div id="</div>" alt=abc dir=\'"\' >text< /div>').
60+
toEqual('<div id="&lt;/div&gt;" alt="abc" dir="&#34;">text</div>');
5561
});
5662

5763
it('should handle improper html2', function(){
@@ -81,19 +87,14 @@ describe('HTML', function(){
8187
expect(html).toEqual('a&lt;div&gt;&amp;&lt;/div&gt;c');
8288
});
8389

84-
it('should not double escape entities', function(){
85-
writer.chars('&nbsp;&gt;&lt;');
86-
expect(html).toEqual('&nbsp;&gt;&lt;');
87-
});
88-
8990
it('should escape IE script', function(){
90-
writer.chars('&{}');
91-
expect(html).toEqual('&amp;{}');
91+
writer.chars('&<>{}');
92+
expect(html).toEqual('&amp;&lt;&gt;{}');
9293
});
9394

9495
it('should escape attributes', function(){
95-
writer.start('div', {id:'\"\'<>'});
96-
expect(html).toEqual('<div id="&quot;\'&lt;&gt;">');
96+
writer.start('div', {id:'!@#$%^&*()_+-={}[]:";\'<>?,./`~ \n\0\r\u0127'});
97+
expect(html).toEqual('<div id="!@#$%^&amp;*()_+-={}[]:&#34;;\'&lt;&gt;?,./`~ &#10;&#0;&#13;&#295;">');
9798
});
9899

99100
it('should ignore missformed elements', function(){
@@ -105,20 +106,40 @@ describe('HTML', function(){
105106
writer.start('div', {unknown:""});
106107
expect(html).toEqual('<div>');
107108
});
109+
110+
describe('isUri', function(){
111+
112+
function isUri(value) {
113+
return value.match(URI_REGEXP);
114+
}
115+
116+
it('should be URI', function(){
117+
expect(isUri('http://abc')).toBeTruthy();
118+
expect(isUri('https://abc')).toBeTruthy();
119+
expect(isUri('ftp://abc')).toBeTruthy();
120+
expect(isUri('mailto:me@example.com')).toBeTruthy();
121+
expect(isUri('#anchor')).toBeTruthy();
122+
});
123+
124+
it('should not be UIR', function(){
125+
expect(isUri('')).toBeFalsy();
126+
expect(isUri('javascript:alert')).toBeFalsy();
127+
});
128+
});
108129

109130
describe('javascript URL attribute', function(){
110131
beforeEach(function(){
111132
this.addMatchers({
112133
toBeValidUrl: function(){
113-
return !isJavaScriptUrl(this.actual);
134+
return URI_REGEXP.exec(this.actual);
114135
}
115136
});
116137
});
117138

118139
it('should ignore javascript:', function(){
119140
expect('JavaScript:abc').not.toBeValidUrl();
120141
expect(' \n Java\n Script:abc').not.toBeValidUrl();
121-
expect('JavaScript/my.js').toBeValidUrl();
142+
expect('http://JavaScript/my.js').toBeValidUrl();
122143
});
123144

124145
it('should ignore dec encoded javascript:', function(){

0 commit comments

Comments
 (0)