diff --git a/examples/plot_letters.py b/examples/plot_letters.py new file mode 100644 index 00000000..12f03b94 --- /dev/null +++ b/examples/plot_letters.py @@ -0,0 +1,91 @@ +""" +=============================== +OCR Letter sequence recognition +=============================== +This example illustrates the use of a chain CRF for optical character +recognition. The example is taken from Taskar et al "Max-margin markov random +fields". + +Each example consists of a handwritten word, that was presegmented into +characters. Each character is represented as a 16x8 binary image. The task is +to classify the image into one of the 26 characters a-z. The first letter of +every word was omitted as it was capitalized and the task only considers +lowercase letters. + +We compare classification using a standard linear SVM that classifies +each letter individually with a chain CRF that can exploit correlations +between neighboring letters (the correlation is particularly strong +as the same words are used during training and testing). + +The first figure shows the segmented letters of four words from the test set. +Inset are the ground truth (green), the prediction using SVM (blue) and the +prediction using a chain CRF (red). + +The second figure shows the pairwise potentials learned by the chain CRF. +The strongest patterns are "y after l" and "n after i". + +There are obvious extensions that both methods could benefit from, such as +window features or non-linear kernels. This example is more meant to give a +demonstration of the CRF than to show its superiority. 
+""" +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.svm import LinearSVC +#from sklearn.metrics import confusion_matrix + +from pystruct.datasets import load_letters +from pystruct.models import ChainCRF +from pystruct.learners import OneSlackSSVM + +abc = "abcdefghijklmnopqrstuvwxyz" + +letters = load_letters() +X, y, folds = letters['data'], letters['labels'], letters['folds'] +# we convert the lists to object arrays, as that makes slicing much more +# convenient +X, y = np.array(X), np.array(y) +X_train, X_test = X[folds == 1], X[folds != 1] +y_train, y_test = y[folds == 1], y[folds != 1] + +# Train linear SVM +svm = LinearSVC(dual=False, C=.1) +# flatten input +svm.fit(np.vstack(X_train), np.hstack(y_train)) + +# Train linear chain CRF +model = ChainCRF(inference_method='qpbo') +ssvm = OneSlackSSVM(model=model, C=.1, inference_cache=50, tol=0.1) +ssvm.fit(X_train, y_train) + +print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test)) + +print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), + np.hstack(y_test))) + +# plot some word sequenced +n_words = 4 +rnd = np.random.RandomState(1) +selected = rnd.randint(len(y_test), size=n_words) +max_word_len = max([len(y) for y in y_test[selected]]) +fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10)) +fig.subplots_adjust(wspace=0) +for ind, axes_row in zip(selected, axes): + y_pred_svm = svm.predict(X_test[ind]) + y_pred_chain = ssvm.predict([X_test[ind]])[0] + for i, (a, image, y_true, y_svm, y_chain) in enumerate( + zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain)): + a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys) + a.text(0, 3, abc[y_true], color="#00AA00", size=25) + a.text(0, 14, abc[y_svm], color="#5555FF", size=25) + a.text(5, 14, abc[y_chain], color="#FF5555", size=25) + a.set_xticks(()) + a.set_yticks(()) + for ii in xrange(i + 1, max_word_len): + axes_row[ii].set_visible(False) + +plt.matshow(ssvm.w[26 * 8 * 16:].reshape(26, 26)) 
+plt.title("Transition parameters of the chain CRF.") +plt.xticks(np.arange(26), abc) +plt.yticks(np.arange(26), abc) +plt.show() diff --git a/pystruct/datasets/__init__.py b/pystruct/datasets/__init__.py index 5ad5e760..aed31974 100644 --- a/pystruct/datasets/__init__.py +++ b/pystruct/datasets/__init__.py @@ -5,9 +5,10 @@ generate_checker_multinomial, generate_big_checker) from .scene import load_scene +from .letters import load_letters __all__ = ['generate_blocks', 'generate_blocks_multinomial', 'generate_bars', 'generate_crosses_explicit', 'binary', 'multinomial', 'load_scene', 'make_simple_2x2', 'generate_easy', 'generate_crosses', 'generate_checker', 'generate_checker_multinomial', - 'generate_big_checker'] + 'generate_big_checker', 'load_letters'] diff --git a/pystruct/datasets/letters.pickle b/pystruct/datasets/letters.pickle new file mode 100644 index 00000000..ba236f81 Binary files /dev/null and b/pystruct/datasets/letters.pickle differ diff --git a/pystruct/datasets/letters.py b/pystruct/datasets/letters.py new file mode 100644 index 00000000..3724133f --- /dev/null +++ b/pystruct/datasets/letters.py @@ -0,0 +1,22 @@ +import cPickle +from os.path import dirname +from os.path import join + +import numpy as np + + +def load_letters(): + """Load the OCR letters dataset. + + This is a chain classification task. + Each example consists of a word, segmented into letters. + The first letter of each word is omitted from the data, + as it was a capital letter (in contrast to all other letters). 
+ """ + module_path = dirname(__file__) + data_file = open(join(module_path, 'letters.pickle'), 'rb') + data = cPickle.load(data_file) + # we add an easy to use image representation: + data['images'] = [np.hstack([l.reshape(16, 8) for l in word]) + for word in data['data']] + return data diff --git a/pystruct/learners/n_slack_ssvm.py b/pystruct/learners/n_slack_ssvm.py index 6a4cea73..705c8665 100644 --- a/pystruct/learners/n_slack_ssvm.py +++ b/pystruct/learners/n_slack_ssvm.py @@ -122,7 +122,7 @@ def _solve_n_slack_qp(self, constraints, n_samples): psis = [c[1] for sample in constraints for c in sample] losses = [c[2] for sample in constraints for c in sample] - psi_matrix = np.vstack(psis) + psi_matrix = np.vstack(psis).astype(np.float) n_constraints = len(psis) P = cvxopt.matrix(np.dot(psi_matrix, psi_matrix.T)) # q contains loss from margin-rescaling diff --git a/pystruct/models/__init__.py b/pystruct/models/__init__.py index 32dfc2de..d1632d4f 100644 --- a/pystruct/models/__init__.py +++ b/pystruct/models/__init__.py @@ -2,6 +2,7 @@ from .crf import CRF from .grid_crf import GridCRF, DirectionalGridCRF from .graph_crf import GraphCRF +from .chain_crf import ChainCRF from .latent_grid_crf import LatentGridCRF, LatentDirectionalGridCRF from .latent_graph_crf import LatentGraphCRF from .latent_node_crf import LatentNodeCRF, EdgeFeatureLatentNodeCRF @@ -12,5 +13,5 @@ __all__ = ["StructuredModel", "CRF", "GridCRF", "GraphCRF", "DirectionalGridCRF", "BinaryClf", "LatentGridCRF", "LatentDirectionalGridCRF", "MultiClassClf", "LatentGraphCRF", - "MultiLabelClf", "LatentNodeCRF", "EdgeFeatureGraphCRF", + "MultiLabelClf", "ChainCRF", "LatentNodeCRF", "EdgeFeatureGraphCRF", "EdgeFeatureLatentNodeCRF"] diff --git a/pystruct/models/chain_crf.py b/pystruct/models/chain_crf.py new file mode 100644 index 00000000..d98bb516 --- /dev/null +++ b/pystruct/models/chain_crf.py @@ -0,0 +1,80 @@ +import numpy as np + +from .graph_crf import GraphCRF + + +def make_chain_edges(x): + # this 
can be optimized sooooo much! + inds = np.arange(x.shape[0]) + edges = np.concatenate([inds[:-1, np.newaxis], inds[1:, np.newaxis]], + axis=1) + return edges + + +class ChainCRF(GraphCRF): + """Linear-chain CRF. + + Pairwise potentials are the same for all edges. + This leads to ``n_classes`` parameters for unary potentials. + If ``directed=True``, there are ``n_classes * n_classes`` parameters + for pairwise potentials, if ``directed=False``, there are only + ``n_classes * (n_classes + 1) / 2`` (for a symmetric matrix). + + Unary evidence ``x`` is given as array of shape (n_nodes, n_features), and + labels ``y`` are given as array of shape (n_nodes,). Chain lengths do not + need to be constant over the dataset. + + Parameters + ---------- + n_states : int, default=None + Number of states for all variables. Inferred from data if None. + + inference_method : string or None, default=None + Function to call to do inference and loss-augmented inference. + Possible values are: + + - 'qpbo' for QPBO + alpha expansion. + - 'dai' for LibDAI bindings (which has another parameter). + - 'lp' for Linear Programming relaxation using GLPK. + - 'ad3' for AD3 dual decomposition. + + If None, ad3 is used if installed, otherwise lp. + + class_weight : None, or array-like + Class weights. If an array-like is passed, it must have length + n_classes. None means equal class weights. + + directed : boolean, default=True + Whether to model directed or undirected connections. + In undirected models, interaction terms are symmetric, + so an edge ``a -> b`` has the same energy as ``b -> a``. 
+ """ + def __init__(self, n_states=None, n_features=None, inference_method=None, + class_weight=None, directed=True): + GraphCRF.__init__(self, n_states=n_states, n_features=n_features, + inference_method=inference_method, + class_weight=class_weight, directed=directed) + + def _get_edges(self, x): + return make_chain_edges(x) + + def _get_features(self, x): + return x + + def initialize(self, X, Y): + n_features = X[0].shape[1] + if self.n_features is None: + self.n_features = n_features + elif self.n_features != n_features: + raise ValueError("Expected %d features, got %d" + % (self.n_features, n_features)) + + n_states = len(np.unique(np.hstack([y for y in Y]))) + if self.n_states is None: + self.n_states = n_states + elif self.n_states != n_states: + raise ValueError("Expected %d states, got %d" + % (self.n_states, n_states)) + + self._set_size_psi() + self._set_class_weight() diff --git a/pystruct/models/crf.py b/pystruct/models/crf.py index 2d21c35b..4526fbfd 100644 --- a/pystruct/models/crf.py +++ b/pystruct/models/crf.py @@ -2,6 +2,7 @@ from .base import StructuredModel from ..inference import inference_dispatch, get_installed +#from .utils import loss_augment_unaries class CRF(StructuredModel): @@ -20,6 +21,8 @@ def __init__(self, n_states=None, n_features=None, inference_method=None, self._set_class_weight() def initialize(self, X, Y): + # Works for both GridCRF and GraphCRF, but not ChainCRF. + # funny that ^^ n_features = X[0][0].shape[1] if self.n_features is None: self.n_features = n_features diff --git a/pystruct/models/graph_crf.py b/pystruct/models/graph_crf.py index a3a8ba75..00e4ca0a 100644 --- a/pystruct/models/graph_crf.py +++ b/pystruct/models/graph_crf.py @@ -8,8 +8,10 @@ class GraphCRF(CRF): """Pairwise CRF on a general graph. Pairwise potentials are symmetric and the same for all edges. - This leads to n_classes parameters for unary potentials and - n_classes * (n_classes + 1) / 2 parameters for edge potentials. 
+ This leads to n_classes parameters for unary potentials. + If ``directed=True``, there are ``n_classes * n_classes`` parameters + for pairwise potentials, if ``directed=False``, there are only + ``n_classes * (n_classes + 1) / 2`` (for a symmetric matrix). Examples, i.e. X, are given as an iterable of n_examples. An example, x, is represented as a tuple (features, edges) where @@ -27,7 +29,7 @@ class GraphCRF(CRF): n_features : int, default=None Number of features per node. None means n_states. - inference_method : string, default="ad3" + inference_method : string or None, default=None Function to call do do inference and loss-augmented inference. Possible values are: @@ -36,6 +38,8 @@ class GraphCRF(CRF): - 'lp' for Linear Programming relaxation using GLPK. - 'ad3' for AD3 dual decomposition. + If None, ad3 is used if installed, otherwise lp. + class_weight : None, or array-like Class weights. If an array-like is passed, it must have length n_classes. None means equal class weights. diff --git a/src/utils.pyx b/src/utils.pyx index 8db0934f..b40c7388 100644 --- a/src/utils.pyx +++ b/src/utils.pyx @@ -7,3 +7,13 @@ def crammer_singer_psi(double[:,:] X, long[:] Y, double[:, :] out): y = Y[i] for j in xrange(X.shape[1]): out[y, j] += X[i, j] + +# untested! 
+#def loss_augment_unaries(double[:,:] unary_potentials, long[:] y, double[:] class_weight): +# cdef int i +# cdef int n_states = unary_potentials.shape[1] +# for i in range(unary_potentials.shape[0]): +# for s in range(n_states): +# if s == y[i]: +# continue +# unary_potentials[i, s] += class_weight[s] diff --git a/tests/test_learners/test_edge_feature_graph_learning.py b/tests/test_learners/test_edge_feature_graph_learning.py index c4c1658d..23b988be 100644 --- a/tests/test_learners/test_edge_feature_graph_learning.py +++ b/tests/test_learners/test_edge_feature_graph_learning.py @@ -25,8 +25,7 @@ def test_multinomial_blocks_directional_simple(): X = zip([x.reshape(-1, 3) for x in X_], edges, edge_features) Y = [y.ravel() for y in Y_] - crf = EdgeFeatureGraphCRF(n_states=3, - n_edge_features=2) + crf = EdgeFeatureGraphCRF(n_states=3, n_edge_features=2) clf = NSlackSSVM(model=crf, max_iter=10, C=1, check_constraints=False) clf.fit(X, Y) Y_pred = clf.predict(X) diff --git a/tests/test_models/test_chain_crf.py b/tests/test_models/test_chain_crf.py new file mode 100644 index 00000000..145c638e --- /dev/null +++ b/tests/test_models/test_chain_crf.py @@ -0,0 +1,42 @@ +import numpy as np +from numpy.testing import assert_array_equal, assert_equal + +from nose.tools import assert_raises + +from pystruct.models import ChainCRF + + +def test_initialize(): + rnd = np.random.RandomState(0) + x = rnd.normal(size=(13, 5)) + y = rnd.randint(3, size=13) + crf = ChainCRF(n_states=3, n_features=5) + # no-op + crf.initialize([x], [y]) + + #test initialization works + crf = ChainCRF() + crf.initialize([x], [y]) + assert_equal(crf.n_states, 3) + assert_equal(crf.n_features, 5) + + crf = ChainCRF(n_states=2) + assert_raises(ValueError, crf.initialize, X=[x], Y=[y]) + pass + + +def test_directed_chain(): + # check that a directed model actually works differntly in the two + # directions. 
chain of length three, three states 0, 1, 2 which want to be + # in this order, evidence only in the middle + x = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]]) + + w = np.array([1, 0, 0, # unary + 0, 1, 0, + 0, 0, 1, + 0, 1, 0, # pairwise + 0, 0, 1, + 0, 0, 0]) + crf = ChainCRF(n_states=3, n_features=3) + y = crf.inference(x, w) + assert_array_equal([0, 1, 2], y)