Skip to content

Commit 978ace6

Browse files
committed
Introduce validate_filename()
1 parent 7625af3 commit 978ace6

2 files changed

Lines changed: 60 additions & 1 deletion

File tree

test/test_util.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,32 @@ def test_id_to_datetime_legacy(self):
111111
datetime(2020, 1, 2, 3, 4, 5))
112112
self.assertIsNone(util.id_to_datetime_legacy('20200102'), None)
113113

114+
def test_validate_filename(self):
    """Check util.validate_filename() against a table of inputs.

    Each entry is (positional args, keyword args, expected result) and
    covers: an empty name, a leading dot, a trailing dot, surrounding
    spaces, every code point below 0x80, and non-ASCII text, the last two
    both with and without force_ascii.
    """
    low_chars = ''.join(chr(i) for i in range(0x80))
    non_ascii = '\u0080中文𠀀'

    cases = [
        (('',), {}, '_'),
        (('.wsb',), {}, '_.wsb'),
        (('foo.',), {}, 'foo'),
        ((' wsb ',), {}, 'wsb'),
        (
            (low_chars,), {},
            "!_#$%&'()_+,-._0123456789_;(=)_@ABCDEFGHIJKLMNOPQRSTUVWXYZ[_]^_`abcdefghijklmnopqrstuvwxyz{_}-",
        ),
        ((non_ascii,), {}, '\u0080中文𠀀'),
        (
            (low_chars,), {'force_ascii': True},
            "!_#$%25&'()_+,-._0123456789_;(=)_@ABCDEFGHIJKLMNOPQRSTUVWXYZ[_]^_`abcdefghijklmnopqrstuvwxyz{_}-",
        ),
        (
            (non_ascii,), {'force_ascii': True},
            '%C2%80%E4%B8%AD%E6%96%87%F0%A0%80%80',
        ),
    ]

    # Plain loop (no subTest) so the first mismatch aborts the test.
    for args, kwargs, expected in cases:
        self.assertEqual(util.validate_filename(*args, **kwargs), expected)

139+
114140
def test_crop(self):
115141
self.assertEqual(util.crop('dummy text', 10), 'dummy text')
116142
self.assertEqual(util.crop('dummy text', 9), 'dummy ...')

webscrapbook/util.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import binascii
1818
import codecs
1919
from base64 import b64decode
20-
from urllib.parse import unquote_to_bytes
20+
from urllib.parse import quote, unquote_to_bytes
2121
from urllib.request import pathname2url
2222
from ipaddress import IPv6Address, AddressValueError
2323
from datetime import datetime, timezone
@@ -159,6 +159,39 @@ def id_to_datetime_legacy(id):
159159
return None
160160

161161

162+
def validate_filename(filename, force_ascii=False):
    """Transliterate the given string to be a safe filename.

    The following steps are applied in order:

    1. ASCII control characters (0x00-0x1F and 0x7F) are removed.
    2. Leading spaces are stripped, a leading dot is prefixed with "_",
       and trailing dots/spaces are stripped.
    3. Each of ``: " ? * \\ / |`` is replaced with "_".
    4. "<" and ">" are replaced with "(" and ")".
    5. "~" is replaced with "-".
    6. If ``force_ascii`` is true, the result is percent-encoded with
       ``urllib.parse.quote`` (non-ASCII characters become UTF-8
       percent escapes; "%" and spaces are escaped as well).
    7. An empty result is replaced with "_".

    See also: scrapbook.validateFilename of WebScrapBook.

    Args:
        filename: the string to transliterate.
        force_ascii: whether to percent-encode the result so that it
            contains only ASCII characters.

    Returns:
        The transliterated, non-empty filename string.
    """
    fn = filename

    # control chars are bad for filename
    fn = re.sub(r'[\x00-\x1F\x7F]+', '', fn)

    # leading/trailing spaces and dots are not allowed on Windows
    # Strip leading spaces *before* guarding a leading dot; otherwise an
    # input such as " .foo" would come out as ".foo" with its dot
    # unguarded, and re-validating the result would give a different name.
    fn = re.sub(r'^ +', '', fn)
    fn = re.sub(r'^\.', '_.', fn)
    fn = re.sub(r'[. ]+$', '', fn)

    # bad chars on most OS
    fn = re.sub(r'[:"?*\\/|]', '_', fn)

    # bad chars on Windows, replace with adequate direction
    fn = fn.replace('<', '(').replace('>', ')')

    # "~" is not allowed by browser.downloads
    fn = fn.replace('~', '-')

    if force_ascii:
        # "%" is deliberately absent from the safe set so that a literal
        # percent sign is escaped as "%25".
        fn = quote(fn, safe="""!_#$&'()*+,-./:;<=>?@[\\]^_`{|}~""")

    # prevent empty filename
    fn = fn or "_"

    return fn
193+
194+
162195
#########################################################################
163196
# String handling
164197
#########################################################################

0 commit comments

Comments
 (0)