Skip to content

Commit f97c1b2

Browse files
authored
Fix text_file assignment - SentencePiece (#3016)
1 parent ba4f674 commit f97c1b2

2 files changed

Lines changed: 14 additions & 3 deletions

File tree

speechbrain/tokenizers/SentencePiece.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,9 @@ class SentencePiece:
6565
bos_id : int
6666
If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
6767
eos_id : int
68-
If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
68+
If -1 the eos_id = unk_id = 0. otherwise, eos_id = int. (default: -1)
6969
pad_id : int
70-
If -1 the pad_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
70+
If -1 the pad_id = unk_id = 0. otherwise, pad_id = int. (default: -1)
7171
unk_id : int
7272
The token corresponding to an unknown symbol (not in token set).
7373
split_by_whitespace : bool
@@ -163,7 +163,7 @@ def __init__(
163163
ext, ".txt"
164164
),
165165
)
166-
self.text_file = text_file
166+
self.text_file = str(text_file)
167167

168168
self.prefix_model_file = os.path.join(
169169
model_dir, str(vocab_size) + "_" + model_type

tests/unittests/test_tokenizer.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,14 @@ def test_tokenizer():
143143
]
144144
words_seq = spm(hyps_list, task="decode_from_list")
145145
assert words_seq == gt, "output not the same"
146+
147+
148+
def test_tokenizer_textfile(tmpdir):
149+
"""Test that the tokenizer can be initialized from a raw text file."""
150+
from speechbrain.tokenizers.SentencePiece import SentencePiece
151+
152+
textpath = tmpdir / "test.txt"
153+
textpath.write("Custom text file.\nTwo lines long.\n")
154+
155+
spm = SentencePiece(tmpdir, 19, text_file=textpath)
156+
assert spm.sp.vocab_size() == 19

0 commit comments

Comments
 (0)