diff --git a/examples/plot_letters.py b/examples/plot_letters.py new file mode 100644 index 00000000..12f03b94 --- /dev/null +++ b/examples/plot_letters.py @@ -0,0 +1,91 @@ +""" +=============================== +OCR Letter sequence recognition +=============================== +This example illustrates the use of a chain CRF for optical character +recognition. The example is taken from Taskar et al "Max-margin markov random +fields". + +Each example consists of a handwritten word, that was presegmented into +characters. Each character is represented as a 16x8 binary image. The task is +to classify the image into one of the 26 characters a-z. The first letter of +every word was omitted as it was capitalized and the task only considers +lowercase letters. + +We compare classification using a standard linear SVM that classifies +each letter individually with a chain CRF that can exploit correlations +between neighboring letters (the correlation is particularly strong +as the same words are used during training and testing). + +The first figure shows the segmented letters of four words from the test set. +Inset are the ground truth (green), the prediction using SVM (blue) and the +prediction using a chain CRF (red). + +The second figure shows the pairwise potentials learned by the chain CRF. +The strongest patterns are "y after l" and "n after i". + +There are obvious extensions that both methods could benefit from, such as +window features or non-linear kernels. This example is more meant to give a +demonstration of the CRF than to show its superiority. 
+""" +import numpy as np +import matplotlib.pyplot as plt + +from sklearn.svm import LinearSVC +#from sklearn.metrics import confusion_matrix + +from pystruct.datasets import load_letters +from pystruct.models import ChainCRF +from pystruct.learners import OneSlackSSVM + +abc = "abcdefghijklmnopqrstuvwxyz" + +letters = load_letters() +X, y, folds = letters['data'], letters['labels'], letters['folds'] +# we convert the lists to object arrays, as that makes slicing much more +# convenient +X, y = np.array(X), np.array(y) +X_train, X_test = X[folds == 1], X[folds != 1] +y_train, y_test = y[folds == 1], y[folds != 1] + +# Train linear SVM +svm = LinearSVC(dual=False, C=.1) +# flatten input +svm.fit(np.vstack(X_train), np.hstack(y_train)) + +# Train linear chain CRF +model = ChainCRF(inference_method='qpbo') +ssvm = OneSlackSSVM(model=model, C=.1, inference_cache=50, tol=0.1) +ssvm.fit(X_train, y_train) + +print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test)) + +print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), + np.hstack(y_test))) + +# plot some word sequenced +n_words = 4 +rnd = np.random.RandomState(1) +selected = rnd.randint(len(y_test), size=n_words) +max_word_len = max([len(y) for y in y_test[selected]]) +fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10)) +fig.subplots_adjust(wspace=0) +for ind, axes_row in zip(selected, axes): + y_pred_svm = svm.predict(X_test[ind]) + y_pred_chain = ssvm.predict([X_test[ind]])[0] + for i, (a, image, y_true, y_svm, y_chain) in enumerate( + zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain)): + a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys) + a.text(0, 3, abc[y_true], color="#00AA00", size=25) + a.text(0, 14, abc[y_svm], color="#5555FF", size=25) + a.text(5, 14, abc[y_chain], color="#FF5555", size=25) + a.set_xticks(()) + a.set_yticks(()) + for ii in xrange(i + 1, max_word_len): + axes_row[ii].set_visible(False) + +plt.matshow(ssvm.w[26 * 8 * 16:].reshape(26, 26)) 
+plt.title("Transition parameters of the chain CRF.") +plt.xticks(np.arange(26), abc) +plt.yticks(np.arange(26), abc) +plt.show() diff --git a/pystruct/datasets/__init__.py b/pystruct/datasets/__init__.py index 5ad5e760..aed31974 100644 --- a/pystruct/datasets/__init__.py +++ b/pystruct/datasets/__init__.py @@ -5,9 +5,10 @@ generate_checker_multinomial, generate_big_checker) from .scene import load_scene +from .letters import load_letters __all__ = ['generate_blocks', 'generate_blocks_multinomial', 'generate_bars', 'generate_crosses_explicit', 'binary', 'multinomial', 'load_scene', 'make_simple_2x2', 'generate_easy', 'generate_crosses', 'generate_checker', 'generate_checker_multinomial', - 'generate_big_checker'] + 'generate_big_checker', 'load_letters'] diff --git a/pystruct/datasets/letters.pickle b/pystruct/datasets/letters.pickle new file mode 100644 index 00000000..ba236f81 Binary files /dev/null and b/pystruct/datasets/letters.pickle differ diff --git a/pystruct/datasets/letters.py b/pystruct/datasets/letters.py new file mode 100644 index 00000000..3724133f --- /dev/null +++ b/pystruct/datasets/letters.py @@ -0,0 +1,22 @@ +import cPickle +from os.path import dirname +from os.path import join + +import numpy as np + + +def load_letters(): + """Load the OCR letters dataset. + + This is a chain classification task. + Each example consists of a word, segmented into letters. + The first letter of each word is omitted from the data, + as it was a capital letter (in contrast to all other letters). 
+ """ + module_path = dirname(__file__) + data_file = open(join(module_path, 'letters.pickle'), 'rb') + data = cPickle.load(data_file) + # we add an easy to use image representation: + data['images'] = [np.hstack([l.reshape(16, 8) for l in word]) + for word in data['data']] + return data diff --git a/pystruct/learners/n_slack_ssvm.py b/pystruct/learners/n_slack_ssvm.py index 6a4cea73..705c8665 100644 --- a/pystruct/learners/n_slack_ssvm.py +++ b/pystruct/learners/n_slack_ssvm.py @@ -122,7 +122,7 @@ def _solve_n_slack_qp(self, constraints, n_samples): psis = [c[1] for sample in constraints for c in sample] losses = [c[2] for sample in constraints for c in sample] - psi_matrix = np.vstack(psis) + psi_matrix = np.vstack(psis).astype(np.float) n_constraints = len(psis) P = cvxopt.matrix(np.dot(psi_matrix, psi_matrix.T)) # q contains loss from margin-rescaling diff --git a/pystruct/models/__init__.py b/pystruct/models/__init__.py index 32dfc2de..d1632d4f 100644 --- a/pystruct/models/__init__.py +++ b/pystruct/models/__init__.py @@ -2,6 +2,7 @@ from .crf import CRF from .grid_crf import GridCRF, DirectionalGridCRF from .graph_crf import GraphCRF +from .chain_crf import ChainCRF from .latent_grid_crf import LatentGridCRF, LatentDirectionalGridCRF from .latent_graph_crf import LatentGraphCRF from .latent_node_crf import LatentNodeCRF, EdgeFeatureLatentNodeCRF @@ -12,5 +13,5 @@ __all__ = ["StructuredModel", "CRF", "GridCRF", "GraphCRF", "DirectionalGridCRF", "BinaryClf", "LatentGridCRF", "LatentDirectionalGridCRF", "MultiClassClf", "LatentGraphCRF", - "MultiLabelClf", "LatentNodeCRF", "EdgeFeatureGraphCRF", + "MultiLabelClf", "ChainCRF", "LatentNodeCRF", "EdgeFeatureGraphCRF", "EdgeFeatureLatentNodeCRF"] diff --git a/pystruct/models/chain_crf.py b/pystruct/models/chain_crf.py new file mode 100644 index 00000000..d98bb516 --- /dev/null +++ b/pystruct/models/chain_crf.py @@ -0,0 +1,80 @@ +import numpy as np + +from .graph_crf import GraphCRF + + +def make_chain_edges(x): + # this 
can be optimized sooooo much! + inds = np.arange(x.shape[0]) + edges = np.concatenate([inds[:-1, np.newaxis], inds[1:, np.newaxis]], + axis=1) + return edges + + +class ChainCRF(GraphCRF): + """Linear-chain CRF. + + Pairwise potentials are the same for all edges. + This leads to ``n_classes`` parameters for unary potentials. + If ``directed=True``, there are ``n_classes * n_classes`` parameters + for pairwise potentials, if ``directed=False``, there are only + ``n_classes * (n_classes + 1) / 2`` (for a symmetric matrix). + + Unary evidence ``x`` is given as array of shape (n_nodes, n_features), and + labels ``y`` are given as array of shape (n_nodes,). Chain lengths do not + need to be constant over the dataset. + + Parameters + ---------- + n_states : int, default=None + Number of states for all variables. Inferred from data if None. + + inference_method : string or None, default=None + Function to call to do inference and loss-augmented inference. + Possible values are: + + - 'qpbo' for QPBO + alpha expansion. + - 'dai' for LibDAI bindings (which has another parameter). + - 'lp' for Linear Programming relaxation using GLPK. + - 'ad3' for AD3 dual decomposition. + + If None, ad3 is used if installed, otherwise lp. + + class_weight : None, or array-like + Class weights. If an array-like is passed, it must have length + n_classes. None means equal class weights. + + directed : boolean, default=True + Whether to model directed or undirected connections. + In undirected models, interaction terms are symmetric, + so an edge ``a -> b`` has the same energy as ``b -> a``. 
+ """ + def __init__(self, n_states=None, n_features=None, inference_method=None, + class_weight=None, directed=True): + GraphCRF.__init__(self, n_states=n_states, n_features=n_features, + inference_method=inference_method, + class_weight=class_weight, directed=directed) + + def _get_edges(self, x): + return make_chain_edges(x) + + def _get_features(self, x): + return x + + def initialize(self, X, Y): + n_features = X[0].shape[1] + if self.n_features is None: + self.n_features = n_features + elif self.n_features != n_features: + raise ValueError("Expected %d features, got %d" + % (self.n_features, n_features)) + + n_states = len(np.unique(np.hstack([y for y in Y]))) + if self.n_states is None: + self.n_states = n_states + elif self.n_states != n_states: + raise ValueError("Expected %d states, got %d" + % (self.n_states, n_states)) + + self._set_size_psi() + self._set_class_weight() diff --git a/pystruct/models/crf.py b/pystruct/models/crf.py index 2d21c35b..4526fbfd 100644 --- a/pystruct/models/crf.py +++ b/pystruct/models/crf.py @@ -2,6 +2,7 @@ from .base import StructuredModel from ..inference import inference_dispatch, get_installed +#from .utils import loss_augment_unaries class CRF(StructuredModel): @@ -20,6 +21,8 @@ def __init__(self, n_states=None, n_features=None, inference_method=None, self._set_class_weight() def initialize(self, X, Y): + # Works for both GridCRF and GraphCRF, but not ChainCRF. + # funny that ^^ n_features = X[0][0].shape[1] if self.n_features is None: self.n_features = n_features diff --git a/pystruct/models/graph_crf.py b/pystruct/models/graph_crf.py index a3a8ba75..00e4ca0a 100644 --- a/pystruct/models/graph_crf.py +++ b/pystruct/models/graph_crf.py @@ -8,8 +8,10 @@ class GraphCRF(CRF): """Pairwise CRF on a general graph. Pairwise potentials are symmetric and the same for all edges. - This leads to n_classes parameters for unary potentials and - n_classes * (n_classes + 1) / 2 parameters for edge potentials. 
+ This leads to n_classes parameters for unary potentials. + If ``directed=True``, there are ``n_classes * n_classes`` parameters + for pairwise potentials, if ``directed=False``, there are only + ``n_classes * (n_classes + 1) / 2`` (for a symmetric matrix). Examples, i.e. X, are given as an iterable of n_examples. An example, x, is represented as a tuple (features, edges) where @@ -27,7 +29,7 @@ class GraphCRF(CRF): n_features : int, default=None Number of features per node. None means n_states. - inference_method : string, default="ad3" + inference_method : string or None, default=None Function to call do do inference and loss-augmented inference. Possible values are: @@ -36,6 +38,8 @@ class GraphCRF(CRF): - 'lp' for Linear Programming relaxation using GLPK. - 'ad3' for AD3 dual decomposition. + If None, ad3 is used if installed, otherwise lp. + class_weight : None, or array-like Class weights. If an array-like is passed, it must have length n_classes. None means equal class weights. diff --git a/src/utils.pyx b/src/utils.pyx index 8db0934f..b40c7388 100644 --- a/src/utils.pyx +++ b/src/utils.pyx @@ -7,3 +7,13 @@ def crammer_singer_psi(double[:,:] X, long[:] Y, double[:, :] out): y = Y[i] for j in xrange(X.shape[1]): out[y, j] += X[i, j] + +# untested! 
+#def loss_augment_unaries(double[:,:] unary_potentials, long[:] y, double[:] class_weight): +# cdef int i +# cdef int n_states = unary_potentials.shape[1] +# for i in range(unary_potentials.shape[0]): +# for s in range(n_states): +# if s == y[i]: +# continue +# unary_potentials[i, s] += class_weight[s] diff --git a/tests/test_learners/test_edge_feature_graph_learning.py b/tests/test_learners/test_edge_feature_graph_learning.py index c4c1658d..23b988be 100644 --- a/tests/test_learners/test_edge_feature_graph_learning.py +++ b/tests/test_learners/test_edge_feature_graph_learning.py @@ -25,8 +25,7 @@ def test_multinomial_blocks_directional_simple(): X = zip([x.reshape(-1, 3) for x in X_], edges, edge_features) Y = [y.ravel() for y in Y_] - crf = EdgeFeatureGraphCRF(n_states=3, - n_edge_features=2) + crf = EdgeFeatureGraphCRF(n_states=3, n_edge_features=2) clf = NSlackSSVM(model=crf, max_iter=10, C=1, check_constraints=False) clf.fit(X, Y) Y_pred = clf.predict(X) diff --git a/tests/test_models/test_chain_crf.py b/tests/test_models/test_chain_crf.py new file mode 100644 index 00000000..145c638e --- /dev/null +++ b/tests/test_models/test_chain_crf.py @@ -0,0 +1,42 @@ +import numpy as np +from numpy.testing import assert_array_equal, assert_equal + +from nose.tools import assert_raises + +from pystruct.models import ChainCRF + + +def test_initialize(): + rnd = np.random.RandomState(0) + x = rnd.normal(size=(13, 5)) + y = rnd.randint(3, size=13) + crf = ChainCRF(n_states=3, n_features=5) + # no-op + crf.initialize([x], [y]) + + #test initialization works + crf = ChainCRF() + crf.initialize([x], [y]) + assert_equal(crf.n_states, 3) + assert_equal(crf.n_features, 5) + + crf = ChainCRF(n_states=2) + assert_raises(ValueError, crf.initialize, X=[x], Y=[y]) + pass + + +def test_directed_chain(): + # check that a directed model actually works differntly in the two + # directions. 
chain of length three, three states 0, 1, 2 which want to be + # in this order, evidence only in the middle + x = np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]]) + + w = np.array([1, 0, 0, # unary + 0, 1, 0, + 0, 0, 1, + 0, 1, 0, # pairwise + 0, 0, 1, + 0, 0, 0]) + crf = ChainCRF(n_states=3, n_features=3) + y = crf.inference(x, w) + assert_array_equal([0, 1, 2], y)