Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions examples/syllable_chain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from __future__ import print_function

try:
from urllib.request import urlopen # Py3k
except ImportError:
from urllib2 import urlopen # Py2

import numpy as np

from sklearn.feature_extraction import FeatureHasher
from sklearn.cross_validation import train_test_split

from pystruct.models import ChainCRF
from pystruct.learners import SubgradientSSVM


# URL of the NETtalk data file on the UCI archive server; downloaded when
# the example is run as a script.
NETTALK = (
    "http://archive.ics.uci.edu/ml/machine-learning-databases/"
    "undocumented/connectionist-bench/nettalk/nettalk.data"
)


def features(word):
    """Return string features for every position between two letters.

    One feature tuple is produced for each ``pos`` in ``1 .. len(word) - 1``,
    i.e. for each gap between ``word[pos - 1]`` and ``word[pos]``.  Words
    shorter than two letters therefore yield an empty list.

    Parameters
    ----------
    word : str
        The word to extract features from.

    Returns
    -------
    list of tuple of str
        The first and the last gap only get the two neighbouring-letter
        features; every other gap additionally gets the four wider-context
        features.
    """
    def end_features(curr_position):
        # The letter immediately left and immediately right of the gap.
        return ("c[-1]={}".format(word[curr_position - 1]),
                "c[+1]={}".format(word[curr_position]))

    def middle_features(curr_position):
        # Only called for gaps that have at least two letters on each side
        # being indexable, so the plain indexing below cannot go out of range.
        # NOTE(review): "c[-2:-1]" is the one-letter slice
        # word[pos - 1:pos] (same letter as "c[-1]") and "c[1:2]" starts at
        # word[pos + 1]; the names suggest bigrams next to the gap -- confirm
        # whether these slice bounds are intended.  Kept as-is on purpose.
        return ("c[-2]={}".format(word[curr_position - 2]),
                "c[+2]={}".format(word[curr_position + 1]),
                "c[-2:-1]={}".format(word[curr_position - 1:curr_position]),
                "c[1:2]={}".format(word[curr_position + 1:curr_position + 3]))

    last_position = len(word) - 1
    # ``range`` instead of ``xrange`` so the example also runs on Python 3.
    return [end_features(pos) if pos == 1 or pos == last_position
            else end_features(pos) + middle_features(pos)
            for pos in range(1, len(word))]


def nettalk_syl_to_split(syl):
    """Convert a NETtalk structure string into split and stress indicators.

    Parameters
    ----------
    syl : str
        One structure character per letter of the word.

    Returns
    -------
    syllables : list of bool
        One entry per pair of adjacent characters (``len(syl) - 1`` entries,
        none for an empty string).  An entry is True when the right
        character is one of ``'>'``, ``'0'``, ``'1'``, ``'2'`` and the left
        character is not ``'>'``.
        NOTE(review): presumably this marks a syllable boundary between the
        two letters -- confirm against the NETtalk format description.
    stress : list of bool
        One entry per character, True where the character is ``'1'``.
    """
    split_marks = ('>', '0', '1', '2')
    # Walk over adjacent character pairs; avoids ``xrange``, which does not
    # exist on Python 3.
    syllables = [left != '>' and right in split_marks
                 for left, right in zip(syl, syl[1:])]
    stress = [mark == '1' for mark in syl]
    return syllables, stress


def nettalk_line(line):
    """Parse one line of the NETtalk data file.

    Parameters
    ----------
    line : str or bytes
        A raw line with four tab-separated fields (word, phonemes,
        structure string, class).

    Returns
    -------
    tuple
        ``(features(word), syllable, stress)``.  A line that does not split
        into exactly four fields is treated as an empty word, giving
        ``([], [], [])``.
    """
    # On Python 3 iterating over the ``urlopen`` response yields bytes, and
    # ``bytes.split('\t')`` raises TypeError rather than the ValueError
    # handled below.  On Python 2 the line is already ``str`` and is left
    # untouched.
    # NOTE(review): the data file is assumed to be ASCII-compatible.
    if not isinstance(line, str):
        line = line.decode('utf-8')

    try:
        word, _phon, syl, _cls = line.strip().split('\t')
        syllable, stress = nettalk_syl_to_split(syl)
    except ValueError:
        # Wrong number of fields: fall back to an empty instance.
        word, syllable, stress = "", [], []
    return features(word), syllable, stress


def numbered_nb(y):
    """Relabel a boolean split sequence as distance since the last split.

    Parameters
    ----------
    y : sequence of bool
        Truthy entries mark positions where a split occurs.

    Returns
    -------
    numpy.ndarray of int
        Same length as ``y``.  Split positions get 0; every other position
        gets the number of steps since the most recent split, counting as
        if a split had occurred just before the first position.
    """
    # Builtin ``int`` is the dtype ``np.int`` used to alias; the alias was
    # removed from recent NumPy releases.
    new_y = np.empty(len(y), dtype=int)
    last_split = -1
    for k, is_split in enumerate(y):
        if is_split:
            last_split = k
        new_y[k] = k - last_split
    return new_y


if __name__ == '__main__':
    # Download the data set and parse every line after the header.
    url = urlopen(NETTALK)
    try:
        for _ in range(10):  # skip header
            url.readline()
        lines = [nettalk_line(line) for line in url]
    finally:
        # Release the connection even if reading or parsing fails.
        url.close()

    # Keep only instances with at least one feature tuple and pair each
    # with its split sequence; the stress sequence is not used here.
    X, y = zip(*((word, tag) for (word, tag, _) in lines if len(word)))
    hasher = FeatureHasher(input_type='string', n_features=2**10,
                           non_negative=True)
    X = np.array([hasher.transform(instance) for instance in X])
    y = np.array([numbered_nb(this_y) for this_y in y])

    # The random state ensures that all labels are in the train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2,
                                                        random_state=1)

    # train linear chain CRF
    model = ChainCRF(inference_method=('ad3', dict(branch_and_bound=True)))
    ssvm = SubgradientSSVM(model=model, verbose=1, C=100, max_iter=5)
    ssvm.fit(X_train, y_train)

    # Fraction of test words whose predicted split positions (label 0) all
    # agree with the ground truth.
    y_pred = ssvm.predict(X_test)
    score = np.mean([np.all((y_t == 0) == (y_p == 0))
                     for (y_t, y_p) in zip(y_test, y_pred)])
    print("Test score: {:2.2f}".format(ssvm.score(X_test, y_test)))
    # This value was computed but never reported before.
    print("Word-level split accuracy: {:2.2f}".format(score))
7 changes: 5 additions & 2 deletions pystruct/models/graph_crf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np

from sklearn.utils.extmath import safe_sparse_dot

from .crf import CRF
from ..utils import expand_sym, compress_sym

Expand Down Expand Up @@ -117,7 +119,7 @@ def _get_unary_potentials(self, x, w):
unary_params = w[:self.n_states * self.n_features].reshape(
self.n_states, self.n_features)

return np.dot(features, unary_params.T)
return safe_sparse_dot(features, unary_params.T, dense_output=True)

def psi(self, x, y):
"""Feature vector associated with instance (x, y).
Expand Down Expand Up @@ -165,7 +167,8 @@ def psi(self, x, y):
pw = np.dot(unary_marginals[edges[:, 0]].T,
unary_marginals[edges[:, 1]])

unaries_acc = np.dot(unary_marginals.T, features)
unaries_acc = safe_sparse_dot(unary_marginals.T, features,
dense_output=True)
if self.directed:
pw = pw.ravel()
else:
Expand Down