@@ -391,6 +391,21 @@ def _splitnetloc(url, start=0):
391391 delim = min (delim , wdelim ) # use earliest delim position
392392 return url [start :delim ], url [delim :] # return (domain, rest)
393393
394+ def _checknetloc (netloc ):
395+ if not netloc or netloc .isascii ():
396+ return
397+ # looking for characters like \u2100 that expand to 'a/c'
398+ # IDNA uses NFKC equivalence, so normalize for this check
399+ import unicodedata
400+ netloc2 = unicodedata .normalize ('NFKC' , netloc )
401+ if netloc == netloc2 :
402+ return
403+ _ , _ , netloc = netloc .rpartition ('@' ) # anything to the left of '@' is okay
404+ for c in '/?#@:' :
405+ if c in netloc2 :
406+ raise ValueError ("netloc '" + netloc2 + "' contains invalid " +
407+ "characters under NFKC normalization" )
408+
394409def urlsplit (url , scheme = '' , allow_fragments = True ):
395410 """Parse a URL into 5 components:
396411 <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -419,6 +434,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
419434 url , fragment = url .split ('#' , 1 )
420435 if '?' in url :
421436 url , query = url .split ('?' , 1 )
437+ _checknetloc (netloc )
422438 v = SplitResult ('http' , netloc , url , query , fragment )
423439 _parse_cache [key ] = v
424440 return _coerce_result (v )
@@ -442,6 +458,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
442458 url , fragment = url .split ('#' , 1 )
443459 if '?' in url :
444460 url , query = url .split ('?' , 1 )
461+ _checknetloc (netloc )
445462 v = SplitResult (scheme , netloc , url , query , fragment )
446463 _parse_cache [key ] = v
447464 return _coerce_result (v )
0 commit comments