-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathexample.py
More file actions
executable file
·56 lines (30 loc) · 1.44 KB
/
example.py
File metadata and controls
executable file
·56 lines (30 loc) · 1.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env python3
"""Demonstration of the python-ucto tokeniser binding."""
import ucto

# Sample input: several sentences spread over multiple lines and two paragraphs.
text = """To be or not to be, that's the question. This is a test to tokenise. We can span
multiple lines!!! The number 6 is Mr Li's favourite. We can't stop yet.
This is the next paragraph. And so it ends"""

# Configuration file holding the tokeniser rules; this one targets English.
# Configurations for other languages are available as well.
settingsfile = "tokconfig-eng"

# Build the tokeniser. Options are passed as keyword arguments; defaults are:
#   lowercase=False, uppercase=False, sentenceperlineinput=False,
#   sentenceperlineoutput=False, sentencedetection=True,
#   paragraphdetection=True, quotedetection=False, debug=False
tokenizer = ucto.Tokenizer(settingsfile)

# Feed text into the tokeniser (process() may be called any number of times).
tokenizer.process(text)

# Walk over the tokenised output. Each item is a ucto.Token instance;
# str() serialises it back to its surface form.
for tok in tokenizer:
    print(f"[{tok}]", end="")
    # A token knows whether it ends a sentence and whether a space follows it.
    if tok.isendofsentence():
        print()
    elif not tok.nospace():
        print(" ", end="")

# The rule that produced a token is exposed as its .type attribute.
# More text can be appended at any point:
tokenizer.process("This was not enough. We want more text. More sentences are better!!!")

# High-level interface: iterate over sentences as strings, with all
# tokens separated by spaces.
for sentence in tokenizer.sentences():
    print(sentence)