File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -65,9 +65,9 @@ class SentencePiece:
6565 bos_id : int
6666 If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
6767 eos_id : int
68- If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
68+ If -1 the eos_id = unk_id = 0. otherwise, eos_id = int. (default: -1)
6969 pad_id : int
70- If -1 the pad_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
70+ If -1 the pad_id = unk_id = 0. otherwise, pad_id = int. (default: -1)
7171 unk_id : int
7272 The token corresponding to an unknown symbol (not in token set).
7373 split_by_whitespace : bool
@@ -163,7 +163,7 @@ def __init__(
163163 ext , ".txt"
164164 ),
165165 )
166- self .text_file = text_file
166+ self .text_file = str ( text_file )
167167
168168 self .prefix_model_file = os .path .join (
169169 model_dir , str (vocab_size ) + "_" + model_type
Original file line number Diff line number Diff line change @@ -143,3 +143,14 @@ def test_tokenizer():
143143 ]
144144 words_seq = spm (hyps_list , task = "decode_from_list" )
145145 assert words_seq == gt , "output not the same"
146+
147+
def test_tokenizer_textfile(tmpdir):
    """Check that SentencePiece can be initialized from a raw text file.

    A small text file is written inside ``tmpdir`` and handed to the
    tokenizer through the ``text_file`` keyword as a path object (not a
    plain string). The resulting model must report the requested
    vocabulary size.
    """
    from speechbrain.tokenizers.SentencePiece import SentencePiece

    expected_vocab = 19

    # Write the text file next to where the model files will be placed.
    corpus_path = tmpdir / "test.txt"
    corpus_path.write("Custom text file.\n Two lines long.\n ")

    tokenizer = SentencePiece(tmpdir, expected_vocab, text_file=corpus_path)
    assert tokenizer.sp.vocab_size() == expected_vocab
You can’t perform that action at this time.
0 commit comments