Fix text_file assignment - SentencePiece (#3016)

pplantinga · web-flow · commit f97c1b2f1b60 · 2026-01-29T15:55:26.000Z
diff --git a/speechbrain/tokenizers/SentencePiece.py b/speechbrain/tokenizers/SentencePiece.py
@@ -65,9 +65,9 @@ class SentencePiece:
     bos_id : int
         If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
     eos_id : int
-        If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
+        If -1 the eos_id = unk_id = 0. otherwise, eos_id = int. (default: -1)
     pad_id : int
-        If -1 the pad_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
+        If -1 the pad_id = unk_id = 0. otherwise, pad_id = int. (default: -1)
     unk_id : int
         The token corresponding to an unknown symbol (not in token set).
     split_by_whitespace : bool
@@ -163,7 +163,7 @@ def __init__(
                         ext, ".txt"
                     ),
                 )
-            self.text_file = text_file
+        self.text_file = str(text_file)
 
         self.prefix_model_file = os.path.join(
             model_dir, str(vocab_size) + "_" + model_type
diff --git a/tests/unittests/test_tokenizer.py b/tests/unittests/test_tokenizer.py
@@ -143,3 +143,14 @@ def test_tokenizer():
     ]
     words_seq = spm(hyps_list, task="decode_from_list")
     assert words_seq == gt, "output not the same"
+
+
+def test_tokenizer_textfile(tmpdir):
+    """Test that the tokenizer can be initialized from a raw text file."""
+    from speechbrain.tokenizers.SentencePiece import SentencePiece
+
+    textpath = tmpdir / "test.txt"
+    textpath.write("Custom text file.\nTwo lines long.\n")
+
+    spm = SentencePiece(tmpdir, 19, text_file=textpath)
+    assert spm.sp.vocab_size() == 19