forked from UWPCE-PythonCert/IntroToPython-2014
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrigram_solution.py
More file actions
158 lines (112 loc) · 4.29 KB
/
Copy pathtrigram_solution.py
File metadata and controls
158 lines (112 loc) · 4.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python
"""
Trigram.py
A solution to the trigram coding Kata:
http://codekata.com/kata/kata14-tom-swift-under-the-milkwood/
Chris Barker's Solution
This one is pretty straightforward -- really a quickie script
There is lots of room to make it fancier if you want
"""
# Names of sample input files; the first one is left commented out
# (presumably a smaller sample for quick runs -- TODO confirm).
# infilename = "sherlock_small.txt"
# NOTE(review): nothing visible in this file reads this module-level name --
# the script section takes the filename from sys.argv, and read_in_data()
# has its own parameter that happens to be called infilename.
infilename = "sherlock.txt"
import sys
import string
import random
def make_words(text):
    """
    Make a list of words from a large bunch of text.

    The text is lower-cased, every punctuation character except the
    apostrophe and the hyphen is replaced by a space, and "--" is treated
    as a word separator.  A single quote standing by itself is dropped
    and the stand-alone word "i" is capitalized to "I".

    :param text: the raw text, as one string
    :returns: a list of the cleaned-up words, in their original order
    """
    # keep apostrophes and hyphens so that contractions and hyphenated
    # words stay in one piece
    punctuation = string.punctuation.replace("'", "").replace("-", "")

    # build a translation table that replaces punctuation with spaces
    # -- split() will remove the extra spaces afterwards.
    # maketrans is a static method of str on Python 3, but only exists
    # in the string module on Python 2, so look in both places.
    try:
        maketrans = str.maketrans
    except AttributeError:
        maketrans = string.maketrans
    table = maketrans(punctuation, " " * len(punctuation))

    # lower-case everything to remove that complication,
    # then remove the punctuation
    text = text.lower().translate(table)

    # remove "--" -- can't do multiple characters with translate
    text = text.replace("--", " ")

    # " ' " is both a quote and an apostrophe: drop it when it stands
    # alone, and capitalize "i" when it is a word by itself
    return [
        "I" if word == "i" else word
        for word in text.split()
        if word != "'"
    ]
def read_in_data(infilename, header_lines=61):
    """
    Read the body of a Project Gutenberg text file into one big string.

    The first ``header_lines`` lines (header, table of contents, etc.) are
    skipped, and reading stops at the first line that starts with
    "End of the Project Gutenberg EBook".

    :param infilename: path of the text file to read
    :param header_lines: number of leading lines to skip; defaults to 61
    :returns: the remaining lines, each still carrying its line ending,
              joined together with single spaces
    """
    full_text = []
    # the with block makes sure the file is closed again, even on errors
    with open(infilename, 'r') as infile:  # text mode is default
        # strip out the header, table of contents, etc.
        for _ in range(header_lines):
            infile.readline()
        # read the rest of the file line by line
        for line in infile:
            if line.startswith("End of the Project Gutenberg EBook"):
                break
            full_text.append(line)
    # put all the lines together into one big string:
    return " ".join(full_text)
def build_trigram(words):
    """
    Build a trigram dict from the passed-in list of words.

    :param words: a sequence of words, in the order they appear in the text
    :returns: a dict whose keys are tuples of two consecutive words and
              whose values are lists of every word that followed that
              pair, in order of appearance and with repeats kept.
              Fewer than three words gives an empty dict.
    """
    word_pairs = {}
    # walk three staggered views of the words in lock step, so that each
    # step sees one pair together with the word that follows it
    for first, second, follower in zip(words, words[1:], words[2:]):
        # setdefault() returns the list already stored for the pair, or
        # stores and returns a new empty list the first time it is seen
        word_pairs.setdefault((first, second), []).append(follower)
    return word_pairs
def build_text(word_pairs, num_sentences=30):
    """
    Build some new text from the word_pair dict supplied.

    A bit of fancy stuff makes the output look like sentences: each one
    starts from a randomly chosen word pair, has its first word
    capitalized and ends with a period.

    :param word_pairs: dict mapping a tuple of two words to the list of
                       words that may follow that pair
    :param num_sentences: how many sentences to generate; defaults to 30
    :returns: the generated sentences joined by single spaces, or an
              empty string when word_pairs is empty
    """
    # random.choice() needs an indexable sequence, which a dict's keys are
    # not on Python 3 -- so make a list, once, outside the loop
    starting_pairs = list(word_pairs)
    if not starting_pairs:
        return ""

    new_text = []
    for _ in range(num_sentences):
        # pick a word pair to start the sentence
        sentence = list(random.choice(starting_pairs))
        # now add a random number of additional words to the sentence
        for _ in range(random.randint(2, 10)):
            followers = word_pairs.get(tuple(sentence[-2:]))
            if not followers:
                # dead end: nothing is known to follow this pair (e.g. the
                # last two words of the source text), so stop here
                break
            sentence.append(random.choice(followers))
        # capitalize the first word:
        sentence[0] = sentence[0].capitalize()
        # add the period
        sentence[-1] += "."
        new_text.extend(sentence)
    return " ".join(new_text)
if __name__ == "__main__":
    # Script entry point: read the text file named on the command line,
    # build the trigram dict from it and print the newly generated text.

    # get the filename from the command line
    try:
        filename = sys.argv[1]
    except IndexError:
        # print() with one parenthesized argument gives the same output
        # on Python 2 and Python 3
        print("You must pass in a filename")
        sys.exit(1)

    in_data = read_in_data(filename)
    words = make_words(in_data)
    word_pairs = build_trigram(words)
    new_text = build_text(word_pairs)
    print(new_text)