diff --git a/data_structure/disjoint_sparse_table.cc b/data_structure/disjoint_sparse_table.cc
new file mode 100644
index 0000000..87dced4
--- /dev/null
+++ b/data_structure/disjoint_sparse_table.cc
@@ -0,0 +1,78 @@
+// 
+// Disjoint Sparse Table
+//
+// Description:
+//
+//   Let `otimes` be a binary associative operator.
+//   The disjoint sparse table is a data structure for a 
+//   sequence xs that admits a query
+//     prod(i,j) = xs[i] `otimes` ... `otimes` xs[j-1]
+//   in time O(1).
+//
+//   The structure is a segment tree whose node maintains 
+//   prod(i,m) and prod(m,j) for all i, j in the segment.
+//   Then prod(i,j) is evaluated by finding the node that 
+//   splits [i,j) and returning prod(i,m)*prod(m,j).
+//
+// Complexity:
+//
+//   preprocessing O(n log n)
+//   query O(1)
+//
+#include <bits/stdc++.h>
+
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+#define TEST(s) if (!(s)) { cout << __LINE__ << " " << #s << endl; exit(-1); }
+
+template <class T, class Op>
+struct DisjointSparseTable {
+  vector<vector<T>> ys;
+  Op otimes;
+  DisjointSparseTable(vector<T> xs, Op otimes_) : otimes(otimes_) {
+    int n = 1;
+    while (n <= xs.size()) n *= 2;
+    xs.resize(n);
+    ys.push_back(xs);
+    for (int h = 1; ; ++h) {
+      int range = (2 << h), half = (range /= 2);
+      if (range > n) break;
+      ys.push_back(xs);
+      for (int i = half; i < n; i += range) {
+        for (int j = i-2; j >= i-half; --j) 
+          ys[h][j] = otimes(ys[h][j], ys[h][j+1]);
+        for (int j = i+1; j < min(n, i+half); ++j) 
+          ys[h][j] = otimes(ys[h][j-1], ys[h][j]);
+      }
+    }
+  }
+  T prod(int i, int j) { // [i, j) query
+    --j;
+    int h = sizeof(int)*__CHAR_BIT__-1-__builtin_clz(i ^ j);
+    return otimes(ys[h][i], ys[h][j]);
+  }
+};
+template <class T, class Op>
+auto makeDisjointSparseTable(vector<T> xs, Op op) {
+  return DisjointSparseTable<T, Op>(xs, op);
+}
+
+int main() {
+  vector<int> xs = {3,1,4,1,5,1};
+  int n = xs.size();
+  auto otimes = [](int a, int b) { return max(a, b); };
+  auto dst = makeDisjointSparseTable(xs, otimes);
+
+  for (int i = 0; i < n; ++i) {
+    for (int j = i+1; j <= n; ++j) {
+      cout << i << " " << j << " " << dst.prod(i, j) << " ";
+      int a = xs[i];
+      for (int k = i+1; k < j; ++k)
+        a = otimes(a, xs[k]);
+      cout << a << endl;
+    }
+  }
+}
diff --git a/data_structure/leftist_heap.cc b/data_structure/leftist_heap.cc
new file mode 100644
index 0000000..62e3609
--- /dev/null
+++ b/data_structure/leftist_heap.cc
@@ -0,0 +1,84 @@
+//
+// Leftist Heap
+//
+// Description:
+//
+//   Leftist heap is a heap data structure that allows
+//   the meld (merge) operation in O(log n) time.
+//   Use this for persistent heaps.
+//
+// Complexity:
+//
+//   O(1) for top, O(log n) for push/pop/meld
+//
+// g++ -std=c++17 -O3 -fmax-errors=1 -fsanitize=undefined
+#include <bits/stdc++.h>
+
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+#define TEST(s) if (!(s)) { cout << __LINE__ << " " << #s << endl; exit(-1); }
+
+template <class T>
+struct LeftistHeap {
+  struct Node {
+    T key;
+    Node *left = 0, *right = 0;
+    int dist = 0;
+  } *root = 0;
+  static Node *merge(Node *x, Node *y) {
+    if (!x) return y;
+    if (!y) return x;
+    if (x->key > y->key) swap(x, y);
+    x->right = merge(x->right, y);
+    if (!x->left || x->left->dist < x->dist) swap(x->left, x->right);
+    x->dist = (x->right ? x->right->dist : 0) + 1;
+    return x;
+  }
+  void push(T key) { root = merge(root, new Node({key})); }
+  void pop() { root = merge(root->left, root->right); }
+  T top() { return root->key; }
+};
+
+//
+// Persistent Implementaiton. (allow copy)
+//
+template <class T>
+struct PersistentLeftistHeap {
+  struct Node {
+    T key;
+    Node *left = 0, *right = 0;
+    int dist = 0;
+  } *root = 0;
+  static Node *merge(Node *x, Node *y) {
+    if (!x) return y;
+    if (!y) return x;
+    if (x->key > y->key) swap(x, y);
+    x = new Node(*x);
+    x->right = merge(x->right, y);
+    if (!x->left || x->left->dist < x->dist) swap(x->left, x->right);
+    x->dist = (x->right ? x->right->dist : 0) + 1;
+    return x;
+  }
+  void push(T key) { root = merge(root, new Node({key})); }
+  void pop() { root = merge(root->left, root->right); }
+  T top() { return root->key; }
+};
+
+int main() {
+  PersistentLeftistHeap<int> heap;
+  heap.push(3);
+  heap.push(1);
+  heap.push(4);
+  heap.push(1);
+  heap.push(5);
+  cout << heap.top() << endl; heap.pop();
+  cout << heap.top() << endl; heap.pop();
+  auto temp = heap;
+  cout << heap.top() << endl; heap.pop();
+  cout << heap.top() << endl; heap.pop();
+  cout << temp.top() << endl; temp.pop();
+  cout << temp.top() << endl; temp.pop();
+}
diff --git a/data_structure/parallel_binary_search.cc b/data_structure/parallel_binary_search.cc
new file mode 100644
index 0000000..1323f80
--- /dev/null
+++ b/data_structure/parallel_binary_search.cc
@@ -0,0 +1,149 @@
+// 
+// Parallel Binary Search
+//
+// Description:
+//
+//   Let W(t) be a data structure depending on time t.
+//   Suppose that there are n agents. Each agent j wants to
+//   find the smallest time t(j) such that cond(j, W(t(j))) == true
+//   where cond(j, W(t)) is monotone in t.
+//
+//   If we perform n binary searches independently, we will construct 
+//   W(t) multiple times. Thus, we avoid the redundant construction
+//   by performing n binary searches in parallel. Imagine a binary 
+//   search tree on t. For each node, we first construct W(t). 
+//   Then we process multiple agents in parallel. Then, the total 
+//   number of constructions is O(log T), which is independent to 
+//   the number of agents.
+//
+// Complexity:
+//
+//   Suppose that W(t) is constructed from W(t') in time M(|t-t'|),
+//   and the condition cond(j,W(t)) is evaluated in time Q.
+//   Then, it runs in O(M(T log T) + n Q log T) time.
+//
+//   Even if W does not have decremental operations, i.e., W(t) 
+//   cannot be constructed from W(t') with t' > t, we can still use
+//   the parallel binary search that runs in O(M(T) log T + n Q log T);
+//   time. The constant factor is twice worse than the above.
+//   Similar result holds if W does not have incremental operations.
+//
+#include <bits/stdc++.h>
+
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+#define TEST(s) if (!(s)) { cout << __LINE__ << " " << #s << endl; exit(-1); }
+
+using Int = __int128_t;
+
+// Point Query, Range Update 
+template <class T>
+struct FenwickTree {
+  vector<T> x;
+  FenwickTree(int n) : x(n) { }
+  void add(int k, T a) { // aux
+    for (; k < x.size(); k |= k+1) x[k] += a;
+  }
+  void add(int i, int j, T a) { // add x[k] += a for all k in [i,j]
+    add(i, a);
+    if (j+1 < x.size()) add(j+1, -a);
+  }
+  T get(int k) { // return x[k]
+    T sum = 0;
+    for (; k >= 0; k = (k&(k+1))-1) sum += x[k];
+    return sum;
+  }
+};
+
+template <class Update, class Cond>
+vector<int> parallelBinarySearch(
+  int n, int lo, int hi, Update update, Cond cond) {
+  using It = vector<int>::iterator;
+  vector<int> agents(n), solution(n, lo);
+  iota(all(agents), 0);
+
+  It begin = agents.begin(), end = agents.end();
+  deque<tuple<int,int,It,It>> stack = {make_tuple(lo, hi, begin, end)};
+  while (!stack.empty()) {
+    // invariant: elems in [begin, end) satisfy "!cond(lo) and cond(hi)"
+    tie(lo, hi, begin, end) = stack.back();
+    stack.pop_back();
+
+    if (begin == end) continue;
+    if (lo+1 == hi) {
+      for_each(begin, end, [&](int k) { solution[k] = hi; });
+      continue;
+    }
+    int mi = (lo + hi) / 2;
+    update(mi);
+    It mid = partition(begin, end, [&](int k) { return cond(k); });
+    stack.push_back(make_tuple(mi, hi, mid, end));
+    stack.push_back(make_tuple(lo, mi, begin, mid));
+  }
+  return solution;
+}
+
+void SPOJ_METEORS() {
+  int n, m, k;
+  scanf("%d %d", &n, &m);
+  vector<vector<int>> S(n); 
+  vector<Int> p(n);
+  for (int i = 0; i < m; ++i) {
+    int j; scanf("%d", &j);
+    S[j-1].push_back(i);
+  }
+  for (int i = 0; i < n; ++i)
+    scanf("%lld", &p[i]);
+  scanf("%d", &k);
+  vector<int> l(k), r(k);
+  vector<Int> a(k);
+  for (int i = 0; i < k; ++i) {
+    scanf("%d %d %lld", &l[i], &r[i], &a[i]);
+    --l[i]; --r[i];
+  }
+
+  FenwickTree<Int> FT(m);
+  int curr = -1; 
+  auto update = [&](int time) {
+    while (curr < time) {
+      ++curr;
+      if (l[curr] <= r[curr]) {
+        FT.add(l[curr], r[curr], a[curr]);
+      } else {
+        FT.add(l[curr], m-1, a[curr]);
+        FT.add(0, r[curr], a[curr]);
+      }
+    }
+    while (curr > time) {
+      if (l[curr] <= r[curr]) {
+        FT.add(l[curr], r[curr], -a[curr]);
+      } else {
+        FT.add(l[curr], m-1, -a[curr]);
+        FT.add(0, r[curr], -a[curr]);
+      }
+      --curr;
+    }
+  };
+  // minimum time such that cond = true.
+  auto cond = [&](int j) {
+    Int total = 0;
+    for (int i: S[j]) {
+      total += FT.get(i);
+    }
+    return total >= p[j];
+  };
+
+  auto solution = parallelBinarySearch(n, -1, k, update, cond);
+
+  for (int i = 0; i < n; ++i) {
+    if (solution[i] >= k) cout << "NIE" << endl;
+    else cout << 1+solution[i] << endl;
+  }
+}
+
+int main() {
+  SPOJ_METEORS();
+}
diff --git a/data_structure/segment_recognizer.cc b/data_structure/segment_recognizer.cc
new file mode 100644
index 0000000..6e87b4b
--- /dev/null
+++ b/data_structure/segment_recognizer.cc
@@ -0,0 +1,149 @@
+//
+// Segment Recognizer 
+//
+// Description:
+//   Let M be an automaton and x be a sequence of alphabets.
+//   The segment recognizer computes the transitioned state 
+//   starting from s and reading x[i,j) in O(|M|) time.
+//   The preprocessing requires O(|M| |x|) time and space.
+//
+//   The same method is implemented by the segment tree, 
+//   where the time complexity is O(log n) and the space 
+//   complexity is O(n log n). Thus, the segment recognizer 
+//   is efficient if |M| is small.
+//
+// Algorithm:
+//   Basically, it stores all the runs from all initial 
+//   position i and initial state s. To reduce the space,
+//   it merges two runs if they yields the same state.
+//
+// Reference
+//   Mikola Bojanczyk (2009): "Factorization forests", 
+//   International Conference on Developments in Language Theory,
+//   pp. 1--17.
+//
+#include <bits/stdc++.h>
+
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+#define TEST(s) if (!(s)) { cout << __LINE__ << " " << #s << endl; exit(-1); }
+
+
+// === tick a time ===
+#include <ctime>
+double tick() {
+  static clock_t oldtick;
+  clock_t newtick = clock();
+  double diff = 1.0*(newtick - oldtick) / CLOCKS_PER_SEC;
+  oldtick = newtick;
+  return diff;
+}
+
+template <int MOD>
+struct ModuloAutomaton {
+  const int init = 0;
+  int size() const { return MOD; }
+  int next(int s, int d) const { return (s+d)%MOD; }
+  int accept(int s) const { return s==0; }
+};
+
+// 0: free
+// 1: selected
+// 2: bottom
+struct IndependenceAutomaton {
+  const int init = 0;
+  int size() const { return 3; }
+  int next(int s, int d) const { 
+    if (s == 0) return d;
+    if (s == 1) return 2*d;
+    if (s == 2) return s;
+  }
+  int accept(int s) const { return s!=2; }
+};
+
+template <class Automaton>
+struct SegmentRecognizer {
+  Automaton M;
+  vector<int> x;
+
+  struct Tape {
+    int begin;
+    vector<int> sequence;
+  };
+  vector<vector<int>> index;
+  vector<Tape> tapes;
+
+  SegmentRecognizer(Automaton M, vector<int> x) : M(M), x(x) { 
+    index.assign(x.size()+1, vector<int>(M.size()));
+    vector<int> stripe;
+    for (int r = 0; r < M.size(); ++r) {
+      stripe.push_back(r);
+      index[0][r] = stripe[r];
+      tapes.push_back({0, {r}});
+    }
+    for (int i = 0; i < x.size(); ++i) {
+      unordered_set<int> available;
+      for (int s = 0; s < M.size(); ++s)
+        available.insert(s);
+      vector<int> reallocate;
+      for (int r = 0; r < M.size(); ++r) {
+        int next = M.next(tapes[stripe[r]].sequence.back(), x[i]);
+        if (available.count(next)) {
+          available.erase(next);
+          index[i+1][next] = stripe[r];
+          tapes[stripe[r]].sequence.push_back(next);
+        } else {
+          reallocate.push_back(r);
+        }
+      }
+      for (int r: reallocate) {
+        int s = *available.begin();
+        stripe[r] = tapes.size();
+        index[i+1][s] = stripe[r];
+        tapes.push_back({i+1, {s}});
+        available.erase(s);
+      }
+    }
+  }
+
+  int getState(int i, int s, int j) {
+    while (1) {
+      auto &tape = tapes[index[i][s]];
+      if (j - tape.begin < tape.sequence.size()) {
+        return tape.sequence[j - tape.begin];
+      } else {
+        i = tape.begin + tape.sequence.size();
+        s = M.next(tape.sequence.back(), x[i-1]);
+      }
+    }
+  }
+};
+template <class Automaton>
+SegmentRecognizer<Automaton> makeSegmentRecognizer(Automaton M, vector<int> s) {
+  return SegmentRecognizer<Automaton>(M, s);
+}
+
+int main() {
+  IndependenceAutomaton M;
+
+  for (int n = 2; n < (1<<24); n*=2) {
+    vector<int> x(n);
+    for (int i = 0; i < n; ++i) {
+      x[i] = (rand() % 10 == 0);
+    }
+    auto recognizer = makeSegmentRecognizer(M, x);
+
+    tick();
+    int count = 0;
+    for (int iter = 0; iter < n; ++iter) {
+      int v = (rand() % n) + 1;
+      int u = rand() % v;
+      count += recognizer.getState(u, 0, v);
+    }
+    double t = tick();
+    cout << n << " " << t / n << endl;
+  }
+}
diff --git a/data_structure/union_find_undo.cc b/data_structure/union_find_undo.cc
index 4aff857..208ff82 100644
--- a/data_structure/union_find_undo.cc
+++ b/data_structure/union_find_undo.cc
@@ -27,25 +27,29 @@ struct UndoableUnionFind {
   UndoableUnionFind(int n) : parent(n, -1) { };
   bool unite(int u, int v) { 
     u = root(u); v = root(v);
-    if (u == v) return false;
-    if (parent[u] > parent[v]) swap(u, v);
-    history.push_back(make_tuple(u, v, parent[v]));
-    parent[u] += parent[v]; parent[v] = u;
-    return true;
+    if (u == v) {
+      history.push_back(make_tuple(-1,-1,-1));
+      return false;
+    } else {
+      if (parent[u] > parent[v]) swap(u, v);
+      history.push_back(make_tuple(u, v, parent[v]));
+      parent[u] += parent[v]; parent[v] = u;
+      return true;
+    }
   }
   void undo() {
     int u, v, w; 
     tie(u, v, w) = history.back();
     history.pop_back();
+    if (u == -1) return;
     parent[v] = w;
     parent[u] -= parent[v];
   }
   bool find(int u, int v) { return root(u) == root(v); }
-  int root(int u) { return parent[u] < 0 ? u : parent[u] = root(parent[u]); }
+  int root(int u) { while (parent[u] >= 0) u = parent[u]; return u; }
   int size(int u) { return -parent[root(u)]; }
 };
 
-
 struct OfflineDynamicConnectivity {
   int n;
   UndoableUnionFind uf;
diff --git a/graph/articulation_points.cc b/graph/articulation_points.cc
index 07c5dd0..62b6638 100644
--- a/graph/articulation_points.cc
+++ b/graph/articulation_points.cc
@@ -1,5 +1,5 @@
 // 
-// Articulation points / Biconnected components
+// Block-Cut Tree (Articulation points / Biconnected components)
 //
 // Description:
 //   Let G = (V, E). If G-v is disconnected, v in V is said to
@@ -7,25 +7,19 @@
 //   it is said to be biconnected.
 //
 //   A biconnected component is a maximal biconnected subgraph.
-//   The algorithm finds all articulation points and biconnected 
-//   components.
+//   The algorithm finds all articulation points and biconnected
+//   components. It can be obtained by the Hopcroft-Tarjan DFS.
 //
-//   The most important fact is that by contracting biconnected
-//   components we obtain a tree, which is called the block tree.
-//
-//
-// Algorithm:
-//   Hopcroft-Tarjan's DFS based algorithm.
-//
-//   Single DFS finds a block tree rooted from the component
-//   that contains the specified root.
+//   We maintain the biconnected component decomposition by the
+//   block tree whose vertices are the blocks and the articulation
+//   points. By contracting the graph by the articulation points,
+//   we obtain the intersection graph of the blocks.
 //
 // Complexity:
 //   O(n + m).
 //
 // Verified:
-//   SPOJ 14956: Submerging Island (articulation point)
-//   POJ 2942: Knights of the Round Table (biconnected components)
+//   AOJ_GRL_3_A (articulation point)
 //
 // References:
 //   J. Hopcroft and R. E. Tarjan (1973):
@@ -33,68 +27,107 @@
 //   Communications of the ACM, vol.16, no.6, pp.372-378.
 //
 
-#include <iostream>
-#include <vector>
-#include <cstdio>
-#include <algorithm>
-#include <functional>
-#include <unordered_set>
+// g++ -std=c++17 -O3 -fmax-errors=1 -fsanitize=undefined
+#include <bits/stdc++.h>
 
 using namespace std;
 
 #define fst first
 #define snd second
 #define all(c) ((c).begin()), ((c).end())
+#define TEST(s) if (!(s)) { cout << __LINE__ << " " << #s << endl; exit(-1); }
 
-struct graph {
+struct Graph {
   int n;
   vector<vector<int>> adj;
-  graph(int n) : n(n), adj(n) { }
-  void add_edge(int src, int dst) {
-    adj[src].push_back(dst);
-    adj[dst].push_back(src);
+  Graph(int n) : n(n), adj(n) { }
+  void addEdge(int u, int v) {
+    adj[u].push_back(v);
+    adj[v].push_back(u);
   }
+};
 
-  void biconnected_components() {
-    vector<int> num(n), low(n), S;
-    unordered_set<int> arts;
+//
+// bcc.n-bcc.block.size() is the number of articulation points
+// index[u] is the corresponding node in the block-cut tree.
+// Here, if u in block[k] then
+//   if u is an articulation point, k in adj[index[u]] 
+//   otherwise, index[u] = k
+//
+struct BiconnectedComponents : Graph {
+  vector<int> is_articulation, index;
+  vector<vector<int>> block;
 
-    function<void(int,int,int&)> dfs = [&](int p, int u, int &t) { 
-      num[u] = low[u] = ++t; 
-      S.push_back(u);
-      for (int v: adj[u]) {
-        if (v == p) continue;
-        if (num[v] == 0) {
-          dfs(u, v, t);
+  BiconnectedComponents(Graph g) : Graph(0) {
+    vector<int> low(g.n), num(g.n), cur(g.n), par(g.n, -1), path;
+    is_articulation.resize(g.n);
+    for (int s = 0; s < g.n; ++s) {
+      if (num[s]) continue;
+      int time = 0;
+      vector<int> stack = {s};
+      while (!stack.empty()) {
+        int u = stack.back();
+        if (cur[u] == 0) {
+          low[u] = num[u] = ++time;
+          path.push_back(u);
+        }
+        if (cur[u] == g.adj[u].size()) {
+          stack.pop_back();
+        } else if (cur[u] >= 0) {
+          int v = g.adj[u][cur[u]++];
+          if (num[v] == 0) {
+            cur[u] = ~cur[u];
+            stack.push_back(v);
+          } else if (v != par[u]) {
+            low[u] = min(low[u], num[v]);
+          }
+        } else {
+          cur[u] = ~cur[u];
+          int v = g.adj[u][cur[u]-1];
           low[u] = min(low[u], low[v]);
           if (num[u] <= low[v]) {
-            if (num[u] != 1 || num[v] > 2) {
-              // here, u is an articulation point if
-              //   (a). u is non-root
-              //   (b). u is root with two more children 
-            }
-            vector<int> C = {u}; // biconnected component
-            while (C.back() != v) {
-              C.push_back(S.back());
-              S.pop_back();
+            is_articulation[u] = (num[u] > 1 || num[v] > 2);
+            block.push_back({u});
+            while (block.back().back() != v) {
+              block.back().push_back(path.back());
+              path.pop_back();
             }
           }
-        } else low[u] = min(low[u], num[v]);
+        }
       }
-    };
-    for (int u = 0, t; u < n; ++u) 
-      if (!num[u]) dfs(-1, u, t = 0);
-    cout << arts.size() << endl;
+    }
+    index.resize(g.n); // make a block tree
+    n = block.size();
+    for (int u = 0; u < g.n; ++u) 
+      if (is_articulation[u]) index[u] = n++;
+    adj.resize(n);
+    for (int k = 0; k < block.size(); ++k) {
+      for (int u: block[k]) {
+        if (!is_articulation[u]) index[u] = k;
+        else addEdge(k, index[u]);
+      }
+    }
   }
 };
 
-int main() {
-  for (int n, m; ~scanf("%d %d", &n, &m) && n; ) {
-    graph g(n);
-    for (int i = 0; i < m; ++i) {
-      int u, v; scanf("%d %d", &u, &v);
-      g.add_edge(u-1, v-1);
-    }
-    g.biconnected_components();
+void AOJ_GRL_3_A() {
+  int n, m; 
+  scanf("%d %d", &n, &m);
+  Graph g(n);
+  for (int i = 0; i < m; ++i) {
+    int u, v; scanf("%d %d", &u, &v);
+    g.addEdge(u, v);
   }
+  BiconnectedComponents bcc(g);
+  for (int u = 0; u < g.n; ++u) 
+    if (bcc.is_articulation[u]) cout << u << endl;
 }
+
+int main() {
+  AOJ_GRL_3_A();
+  //SPOJ_SUBMERGE();
+  //test();
+  /*
+  */
+}
+
diff --git a/graph/bridge.cc b/graph/bridge.cc
new file mode 100644
index 0000000..de6cdba
--- /dev/null
+++ b/graph/bridge.cc
@@ -0,0 +1,149 @@
+// 
+// Bridge-Block Tree (Bridge / Two-edge connected component)
+//
+// Description:
+//   Let G = (V, E). e in E is said to be a cut edge if G-e is 
+//   disconnected If G has no cut edges, it is said to be two-edge 
+//   connected.
+//   A two-edge connected component is a maximal two-edge connected
+//   subgraph. The algorithm finds all bridges with the two-edge
+//   connected components.
+//
+//   We maintain the two-edge connected component decomposition by
+//   a bridge-block tree whose nodes are the two-edge connected 
+//   components.
+//
+//
+// Complexity:
+//   O(n + m).
+//
+// Verified:
+//
+// References:
+//
+
+#include <iostream>
+#include <vector>
+#include <cstdio>
+#include <algorithm>
+#include <functional>
+#include <unordered_set>
+
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+
+struct Graph {
+  int n;
+  vector<vector<int>> adj;
+  Graph(int n) : n(n), adj(n) { }
+  void addEdge(int src, int dst) {
+    adj[src].push_back(dst);
+    adj[dst].push_back(src);
+  }
+};
+
+struct BridgeBlockTree : Graph {
+  vector<int> index;         // index[u] is the block index containing u
+  vector<vector<int>> block; // u in block[k] <=> index[u] == k
+
+  BridgeBlockTree(Graph g) : Graph(0) {
+    index.assign(g.n, -1);
+    vector<int> num(g.n), par(g.n,-1), cur(g.n);
+
+    for (int s = 0; s < g.n; ++s) {
+      if (num[s]) continue;
+      int time = 0;
+      vector<int> snum, path, stack = {s};
+      while (!stack.empty()) {
+        int u = stack.back();
+        if (cur[u] == 0) {
+          num[u] = ++time;
+          path.push_back(u); 
+          snum.push_back(num[u]);
+        }
+        if (cur[u] == g.adj[u].size()) {
+          if (num[u] == snum.back()) {
+            snum.pop_back();
+            block.push_back({});
+            while (1) {
+              int w = path.back(); path.pop_back(); 
+              block.back().push_back(w);
+              index[w] = block.size()-1;
+              if (u == w) break;
+            }
+          }
+          stack.pop_back();
+        } else {
+          int v = g.adj[u][cur[u]++];
+          if (!num[v]) {
+            par[v] = u;
+            stack.push_back(v);
+          } else if (v != par[u] && index[v] < 0) {
+            while (snum.back() > num[v]) snum.pop_back();
+          }
+        }
+      }
+    }
+    n = block.size();
+    adj.resize(n);
+    for (int u = 0; u < g.n; ++u)
+      if (par[u] >= 0 && index[u] != index[par[u]])
+        addEdge(index[u], index[par[u]]);
+  }
+};
+
+
+// === tick a time ===
+#include <ctime>
+double tick() {
+  static clock_t oldtick;
+  clock_t newtick = clock();
+  double diff = 1.0*(newtick - oldtick) / CLOCKS_PER_SEC;
+  oldtick = newtick;
+  return diff;
+}
+
+int main() {
+  Graph g(6);
+  g.addEdge(0, 1);
+  g.addEdge(1, 2);
+  g.addEdge(2, 0);
+  g.addEdge(2, 3);
+  g.addEdge(3, 4);
+  g.addEdge(4, 5);
+  g.addEdge(5, 3);
+  BridgeBlockTree t(g);
+
+  cout << t.n << endl;
+  for (int u = 0; u < t.n; ++u) {
+    for (int v: t.adj[u]) {
+      cout << u << " " << v << endl;
+    }
+  }
+
+  for (auto B: t.block) {
+    for (int u: B) {
+      cout << u << " ";
+    }
+    cout << endl;
+  }
+  for (int u = 0; u < 6; ++u) {
+    cout << t.index[u] << " ";
+  }
+  cout << endl;
+
+  //g.bridgeless_component();
+  /*
+  for (int n, m; ~scanf("%d %d", &n, &m) && n; ) {
+    graph g(n);
+    for (int i = 0; i < m; ++i) {
+      int u, v; scanf("%d %d", &u, &v);
+      g.add_edge(u-1, v-1);
+    }
+    g.biconnected_components();
+  }
+  */
+}
diff --git a/graph/chromatic_number.cc b/graph/chromatic_number.cc
new file mode 100644
index 0000000..aef4837
--- /dev/null
+++ b/graph/chromatic_number.cc
@@ -0,0 +1,96 @@
+// 
+// Exact Algorithm for Chromatic Number
+//
+// Description:
+//
+//   A vertex coloring is an assignment of colors to the vertices
+//   such that no adjacent vertices have a same color. The smallest
+//   number of colors for a vertex coloring is called the chromatic
+//   number. Computing the chromatic number is NP-hard. 
+//
+//   We can compute the chromatic number by the inclusion-exlusion
+//   principle. The complexity is O(poly(n) 2^n). The following
+//   implementation runs in O(n 2^n) but is a Monte-Carlo algorithm 
+//   since it takes modulos to avoid multiprecision numbers.
+//
+// Complexity:
+//
+//   O(n 2^n) 
+//
+// References:
+//
+// Andreas Bjorklund and Thore Husfeldt (2006):
+// "Inclusion--Exclusion Algorithms for Counting Set Partitions."
+// in Proceedings of the 47th Annual Symposium on Foundations of 
+// Computer Science, pp. 575--582.
+//
+#include <bits/stdc++.h>
+
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+#define TEST(s) if (!(s)) { cout << __LINE__ << " " << #s << endl; exit(-1); }
+
+struct Graph {
+  int n;
+  vector<vector<int>> adj;
+  Graph(int n) : n(n), adj(n) { }
+  void addEdge(int u, int v) {
+    adj[u].push_back(v);
+    adj[v].push_back(u);
+  }
+};
+
+int chromaticNumber(Graph g) {
+  const int N = 1 << g.n;
+  vector<int> nbh(g.n);
+  for (int u = 0; u < g.n; ++u) 
+    for (int v: g.adj[u]) 
+      nbh[u] |= (1 << v);
+
+  int ans = g.n;
+  for (int d: {7}) { // ,11,21,33,87,93}) { 
+    long long mod = 1e9 + d;
+    vector<long long> ind(N), aux(N, 1); 
+    ind[0] = 1;
+    for (int S = 1; S < N; ++S) {
+      int u = __builtin_ctz(S);
+      ind[S] = ind[S^(1<<u)] + ind[(S^(1<<u))&~nbh[u]];
+    }
+    for (int k = 1; k < ans; ++k) {
+      long long chi = 0; 
+      for (int i = 0; i < N; ++i) {
+        int S = i ^ (i >> 1); // gray-code
+        aux[S] = (aux[S] * ind[S]) % mod;
+        chi += (i & 1) ? aux[S] : -aux[S];
+      }
+      if (chi % mod) ans = k; 
+    }
+  }
+  return ans;
+}
+
+int main() {
+  int n = 6;
+  Graph g(n);
+  g.addEdge(0,1);
+  g.addEdge(1,2);
+  g.addEdge(2,3);
+  g.addEdge(0,2);
+  g.addEdge(3,4);
+  g.addEdge(4,5);
+  g.addEdge(5,0);
+  //    0
+  // 1     5
+  //
+  // 2     4
+  //    3
+  /*
+  for (int i = 0; i < n; ++i)
+    for (int j = 0; j < i; ++j)
+      g.addEdge(i, j);
+      */
+  cout << chromaticNumber(g) << endl;
+}
diff --git a/graph/gabow_edmonds.cc b/graph/gabow_edmonds.cc
index 2c245b2..455b9a4 100644
--- a/graph/gabow_edmonds.cc
+++ b/graph/gabow_edmonds.cc
@@ -2,124 +2,196 @@
 // General Graph Matching (Gabow-Edmonds)
 //
 // Description:
-//   It computes a maximum matching in a general graph.
+//   
+//   For a graph G = (V, E), a matching M is a set of edges
+//   such that any vertex is contained in M at most once.
+//   The matching with maximum cardinality is computed by
+//   the Edmonds blossom algorithm.
+//
+//   This implementation is the Gabow's simplified version
+//   with the lazy update technique to improve the complexity
+//   in sparse graphs.
+//
+//
+// Complexity:
+//
+//   O(n m log n)
 //
-// Algorithm:
-//   Gabow's simplified version of Edmonds' blossom algorithm.
-// 
-// Comlexity:
-//   O(n^3)
 //
 // Verified:
-//   LA3820, LA4130
+//
+//   SPOJ ADABLOOM
+//
 //
 // References:
 //   H.Gabow (1976):
 //   An efficient implementation of Edmonds' algorithm for maximum matching on graphs.
 //   Journal of the ACM, vol.23, no.2, pp.221-234.
 //
+//   http://min-25.hatenablog.com/entry/2016/11/21/222625
+//
+//
 
-#include <cstring>
-#include <vector>
-#include <queue>
-#include <iostream>
-#include <cstdlib>
-#include <cstdio>
-#include <algorithm>
-#include <set>
-
+// g++ -std=c++17 -O3 -fmax-errors=1 -fsanitize=undefined
+#include <bits/stdc++.h>
 using namespace std;
 
-struct graph {
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+
+struct Graph {
   int n;
-  vector<vector<int>> adj;
-  graph(int n) : n(n), adj(n) { };
-  void add_edge(int x, int y) {
-    adj[x].push_back(y);
-    adj[y].push_back(x);
+  vector< vector<int> > adj;
+  Graph(int n) : n(n), adj(n) { };
+  void addEdge(int u, int v) {
+    adj[u].push_back(v);
+    adj[v].push_back(u);
   }
-  queue<int> Q;
-  vector<int> label, mate, cycle;
-  void rematch(int x, int y) {
-    int m = mate[x]; mate[x] = y;
-    if (mate[m] == x) {
-      if (label[x] < n) {
-        rematch(mate[m] = label[x], m);
+
+  vector<int> mate;
+  int maximumMatching() {
+    mate.assign(n+1, n);
+    vector<int> first(n+1, n), que(n);
+    vector<pair<int,int>> label(n+1, make_pair(-1,-1));
+    int head = 0, tail = 0;
+
+    function<void(int,int)> rematch = [&](int v, int w) {
+      int t = mate[v]; mate[v] = w;
+      if (mate[t] != v) return;
+      if (label[v].snd == -1) {
+        mate[t] = label[v].fst;
+        rematch(mate[t], t);
       } else {
-        int s = (label[x]-n)/n, t = (label[x]-n)%n;
-        rematch(s, t); rematch(t, s);
+        int x, y; tie(x, y) = label[v];
+        rematch(x, y); rematch(y, x);
       }
-    }
-  }
-  void traverse(int x) {
-    vector<int> save = mate;
-    rematch(x, x);
-    for (int u = 0; u < n; ++u) 
-      if (mate[u] != save[u]) cycle[u] ^= 1;
-    save.swap(mate);
-  }
-  void relabel(int x, int y) {
-    cycle = vector<int>(n, 0); 
-    traverse(x);
-    traverse(y);
-    for (int u = 0; u < n; ++u) {
-      if (!cycle[u] || label[u] >= 0) continue;
-      label[u] = n+x+y*n;
-      Q.push(u);
-    }
-  }
-  int augment(int r) {
-    label.assign(n, -2);
-    label[r] = -1;
-    Q = queue<int>(); Q.push(r);
-    while (!Q.empty()) {
-      int x = Q.front(); Q.pop();
-      for (int y: adj[x]) {
-        if (mate[y] < 0 && r != y) {
-          rematch(mate[y] = x, y); return 1;
-        } else if (label[y] >= -1) {
-          relabel(x, y);
-        } else if (label[mate[y]] < -1) {
-          label[mate[y]] = x;
-          Q.push(mate[y]);
+    };
+    auto relabel = [&](int x, int y) {
+      function<int(int)> findFirst = [&](int u) {
+        return label[first[u]].fst < 0 ? first[u] :
+               first[u] = findFirst(first[u]);
+      };
+      int r = findFirst(x), s = findFirst(y);
+      if (r == s) return; 
+      auto h = make_pair(~x, y);
+      label[r] = label[s] = h;
+      int join;
+      while (1) {
+        if (s != n) swap(r, s);
+        r = findFirst(label[mate[r]].fst);
+        if (label[r] == h) {
+          join = r;
+          break;
+        } else {
+          label[r] = h;
         }
       }
-    }
-    return 0;
-  }
-  int maximum_matching() {
-    mate.assign(n, -2);
+      for (int v: {first[x], first[y]}) {
+        for (; v != join; v = first[label[mate[v]].fst]) {
+          label[v] = make_pair(x, y);
+          first[v] = join;
+          que[tail++] = v;
+        }
+      }
+    };
+    auto augment = [&](int u) {
+      label[u] = make_pair(n, -1);
+      first[u] = n;
+      head = tail = 0;
+      for (que[tail++] = u; head < tail;) {
+        int x = que[head++];
+        for (int y: adj[x]) {
+          if (mate[y] == n && y != u) {
+            mate[y] = x;
+            rematch(x, y);
+            return true; 
+          } else if (label[y].fst >= 0) {
+            relabel(x, y);
+          } else if (label[mate[y]].fst == -1) {
+            label[mate[y]].fst = x;
+            first[mate[y]] = y;
+            que[tail++] = mate[y];
+          } 
+        }
+      }
+      return false;
+    };
     int matching = 0;
-    for (int u = 0; u < n; ++u) 
-      if (mate[u] < 0) matching += augment(u);
+    for (int u = 0; u < n; ++u) {
+      if (mate[u] < n || !augment(u)) continue;
+      ++matching;
+      for (int i = 0; i < tail; ++i) 
+        label[que[i]] = label[mate[que[i]]] = make_pair(-1,-1);
+      label[n] = make_pair(-1, -1);
+    }
     return matching;
   }
 };
 
-int doit() {
-  int n, m; 
-  scanf("%d %d", &n, &m);
 
-  vector<int> v(n);
-  for (int i = 0; i < n; ++i) {
-    scanf("%d", &v[i]);
+void LA3820() {
+  int ncase; scanf("%d", &ncase);
+  for (int icase = 0; icase < ncase; ++icase) {
+    int n, m; 
+    scanf("%d %d", &n, &m);
+    vector<int> v(n);
+    for (int i = 0; i < n; ++i) {
+      scanf("%d", &v[i]);
+    }
+    set<int> S;
+    for (int i = 0; i < m; ++i) {
+      int x;
+      scanf("%d", &x);
+      S.insert(x);
+    }
+    Graph G(n);
+    for (int i = 0; i < n; ++i) {
+      for (int j = i+1; j < n; ++j) {
+        if (S.count(v[i] + v[j])) 
+          G.addEdge(i, j);
+      }
+    }
+    printf("%d\n", G.maximumMatching());
   }
-  set<int> S;
+}
+
+void UOJ79() {
+  int n, m; cin >> n >> m;
+  Graph g(n);
   for (int i = 0; i < m; ++i) {
-    int x;
-    scanf("%d", &x);
-    S.insert(x);
+    int u, v;
+    cin >> u >> v;
+    g.addEdge(u-1, v-1);
   }
-
-  graph g(n);
-  for (int i = 0; i < n; ++i) {
-    for (int j = i+1; j < n; ++j) {
-      if (S.count(v[i] + v[j])) 
-        g.add_edge(i, j);
+  cout << g.maximumMatching() << endl;
+  for (int u = 0; u < n; ++u) {
+    if (u > 0) cout << " ";
+    if (g.mate[u] >= n) cout << 0;
+    else                cout << g.mate[u]+1;
+  }
+  cout << endl;
+}
+void SPOJ_ADABLOOM() {
+  int ncase; scanf("%d", &ncase);
+  for (int icase; icase < ncase; ++icase) {
+    int n; scanf("%d", &n);
+    vector<long long> a(n);
+    for (int i = 0; i < n; ++i) 
+      scanf("%lld", &a[i]);
+    random_shuffle(all(a));
+    Graph g(n);
+    for (int i = 0; i < n; ++i) { 
+      for (int j = 0; j < n; ++j) {
+        if (a[i] < (a[i] ^ a[j]) && (a[i] ^ a[j]) < a[j]) g.addEdge(i, j);
+      }
     }
+    cout << g.maximumMatching() << endl;
   }
-  return g.maximum_matching();
 }
+
 int main() {
-  doit();
+  SPOJ_ADABLOOM();
+  //UOJ79();
+  //LA3820();
 }
diff --git a/graph/k_shortest_walks.cc b/graph/k_shortest_walks.cc
new file mode 100644
index 0000000..d37c038
--- /dev/null
+++ b/graph/k_shortest_walks.cc
@@ -0,0 +1,181 @@
+//
+// K Shortest Walks (Simplified Eppstein)
+//
+// Description
+//
+//   We are given a weighted graph. The k-shortest walks problem
+//   seeks k different s-t walks (paths allowing repeated vertices)
+//   in the increasing order of the lengths.
+//
+//   If we maintain each walks explicitly, it must costs O(k^2 m) time.
+//   To avoid this complexity, we maintain the walks in a compact format.
+//   Let us fix a reverse shortest path tree from t. A deviation is an 
+//   edge that is not on the tree. Any walk is represented by a concatenation
+//   of deviations and paths on the tree. We enumerate all possible 
+//   deviations and use the best-first search to find the k-th solution.
+//
+//   The Eppstein's algorithm maintains the set of deviations by 
+//   the augmented persistent heaps and emurates the best-first search. 
+//   Here, we implemented a simplified version of the Eppstein's algorithm,
+//   which uses the simple persistent heaps instead of the augmented ones.
+//   It increases the space from O(m + n log n) to O(m log n).
+//   
+// Complexity:
+//
+//   O(m log m) construction
+//   O(k log k) for k-th search
+//
+// Verified:
+//
+//   UTPC2013_10 J K-th Cycle
+//
+// References:
+//
+//   David Eppstein (1998): 
+//   "Finding the k shortest paths",
+//   SIAM Journal on computing, vol.28, no.2, pp.652--673.
+//
+#include <bits/stdc++.h>
+
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+#define TEST(s) if (!(s)) { cout << __LINE__ << " " << #s << endl; exit(-1); }
+
+struct Graph {
+  int n, m = 0;
+  vector<int> head;                 // Vertex
+  vector<int> src, dst, next, prev; // Edge
+
+  using Weight = long long;
+  vector<Weight> weight;
+  Graph(int n) : n(n), head(n, -1) { }
+  int addEdge(int u, int v, Weight w) {
+    next.push_back(head[u]);
+    src.push_back(u);
+    dst.push_back(v);
+    weight.push_back(w);
+    return head[u] = m++;
+  }
+};
+constexpr Graph::Weight INF = 1e15;
+
+struct KShortestWalks {
+  Graph g;
+  vector<Graph::Weight> dist;
+  vector<int> tree, order;
+  void reverseDijkstra(int t) {
+    vector<vector<int>> adj(g.n);
+    for (int u = 0; u < g.n; ++u) 
+      for (int e = g.head[u]; e >= 0; e = g.next[e]) 
+        adj[g.dst[e]].push_back(e);
+    dist.assign(g.n, INF);
+    tree.assign(g.n, ~g.m);
+    using Node = tuple<Graph::Weight,int>;
+    priority_queue<Node, vector<Node>, greater<Node>> que;
+    que.push(make_tuple(0, t));
+    dist[t] = 0;
+    while (!que.empty()) {
+      int u = get<1>(que.top()); que.pop();
+      if (tree[u] >= 0) continue;
+      tree[u] = ~tree[u];
+      order.push_back(u);
+      for (int e: adj[u]) {
+        int v = g.src[e];
+        if (dist[v] > dist[u] + g.weight[e]) {
+          tree[v] = ~e;
+          dist[v] = dist[u] + g.weight[e];
+          que.push(Node(dist[v], v));
+        }
+      }
+    }
+  }
+  struct Node { // Persistent Heap (Leftist Heap)
+    int e;
+    Graph::Weight delta;
+    Node *left = 0, *right = 0;
+    int rnk = 0;
+  } *root = 0;
+  static Node *merge(Node *x, Node *y) {
+    if (!x) return y;
+    if (!y) return x;
+    if (x->delta > y->delta) swap(x, y);
+    x = new Node(*x);
+    x->right = merge(x->right, y);
+    if (!x->left || x->left->rnk < x->rnk) swap(x->left, x->right);
+    x->rnk = (x->right ? x->right->rnk : 0) + 1;
+    return x;
+  }
+  vector<Node*> deviation;
+  void buildHeap() {
+    deviation.resize(g.n);
+    for (int u: order) {
+      int v = -1;
+      for (int e = g.head[u]; e >= 0; e = g.next[e]) {
+        if (tree[u] == e) v = g.dst[e]; 
+        else if (dist[g.dst[e]] < INF) {
+          auto delta = g.weight[e] - dist[g.src[e]] + dist[g.dst[e]];
+          deviation[u] = merge(deviation[u], new Node({e, delta}));
+        }
+      }
+      if (v >= 0) deviation[u] = merge(deviation[u], deviation[v]);
+    }
+  }
+  KShortestPaths(Graph g_, int t) : g(g_) {
+    reverseDijkstra(t);
+    buildHeap();
+  }
+  void enumerate(int s, int kth) {
+    int k = 0;
+    Node *x = deviation[s];
+    Graph::Weight len = dist[s];
+    ++k;
+    using SearchNode = tuple<Node*, Graph::Weight>;
+    auto comp = [](SearchNode x, SearchNode y) { return get<1>(x) > get<1>(y); };
+    priority_queue<SearchNode, vector<SearchNode>, decltype(comp)> que(comp);
+    if (x) que.push(SearchNode(x, len + x->delta));
+    while (!que.empty() && k < kth) {
+      tie(x, len) = que.top(); que.pop();
+      int e = x->e, u = g.src[e], v = g.dst[e];
+      cout << len << endl; ++k;
+      if (deviation[v]) que.push(SearchNode(deviation[v], len+deviation[v]->delta));
+      for (Node *y: {x->left, x->right}) 
+        if (y) que.push(SearchNode(y, len + y->delta-x->delta));
+    }
+    while (k < kth) { cout << -1 << endl; ++k; }
+  }
+};
+
+void KSH_test() {
+  int n = 4;
+  Graph g(n);
+  g.addEdge(0, 1, 2);
+  g.addEdge(0, 2, 2);
+  g.addEdge(1, 3, 4);
+  g.addEdge(2, 3, 2);
+  g.addEdge(1, 2, 1);
+  g.addEdge(2, 1, 1);
+  KShortestPaths ksh(g, 3);
+  ksh.enumerate(0, 10);
+}
+
+void UTPC2013_10() {
+  int n, m, k;
+  scanf("%d %d %d", &n, &m, &k);
+  Graph g(n);
+  for (int i = 0; i < m; ++i) {
+    int u, v;
+    long long w;
+    scanf("%d %d %lld", &u, &v, &w);
+    g.addEdge(u, v, w);
+  }
+  KShortestPaths ksh(g, 0);
+  ksh.enumerate(0, k+1);
+}
+
+int main() {
+  UTPC2013_10();
+  //KSH_test();
+}
diff --git a/graph/plane_perfect_matchings.cc b/graph/plane_perfect_matchings.cc
new file mode 100644
index 0000000..095d48a
--- /dev/null
+++ b/graph/plane_perfect_matchings.cc
@@ -0,0 +1,196 @@
+//
+// Counting Perfect Matchings in Plane Graph 
+// (Fisher, Kasteleyn, and Temperley)
+//
+// Description:
+//
+//   Pfaffian Orientation; see https://en.wikipedia.org/wiki/FKT_algorithm
+//
+// Complexity:
+//
+//   O(n^3).
+//
+// g++ -std=c++17 -O3 -fmax-errors=1 -fsanitize=undefined
+#include <bits/stdc++.h>
+
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+#define TEST(s) if (!(s)) { cout << __LINE__ << " " << #s << endl; exit(-1); }
+
+using Real = long double;
+struct Point { 
+  Real x, y;
+};
+
+struct PlaneGraph {
+  vector<int> incident_edge; // vertex record
+  vector<int> origin, twin, prev, next, incident_face; // edge record
+  vector<int> component; // face record
+  int edges()    const { return origin.size(); }
+  int vertices() const { return incident_edge.size(); }
+  int faces()    const { return component.size(); }
+
+  vector<Point> point;
+  int newVertex(Point p, int e = -1) {
+    point.push_back(p);
+    incident_edge.push_back(e);
+    return vertices()-1;
+  }
+  int newEdge(int o = -1) {
+    origin.push_back(o);
+    twin.push_back(-1);
+    prev.push_back(-1);
+    next.push_back(-1);
+    incident_face.push_back(-1);
+    return edges()-1;
+  }
+  int newFace(int e = -1) {
+    component.push_back(e);
+    return component.size()-1;
+  }
+  void completeFaces() {
+    component.clear();
+    fill(all(incident_face), -1);
+    for (int e = 0; e < edges(); ++e) {
+      if (incident_face[e] >= 0) continue;
+      int f = newFace(e), x = e;
+      do {
+        incident_face[x] = f;
+        x = next[x];
+      } while (x != e);
+    }
+  }
+
+  // assume connected
+  vector<int> pfaffianOrientation() {
+    // take any spanning tree T
+    vector<int> dir(edges(), -2), seen(vertices());
+    function<void(int)> dfs1 = [&](int u) {
+      seen[u] = 1;
+      int e = incident_edge[u];
+      do {
+        int v = origin[twin[e]];
+        if (!seen[v]) {
+          dir[e] = 1;
+          dir[twin[e]] = -dir[e];
+          dfs1(v);
+        }
+        e = next[twin[e]];
+      } while (e != incident_edge[u]);
+    };
+    for (int u = 0; u < vertices(); ++u)
+      if (!seen[u]) dfs1(u);
+
+    // take any dual spanning tree that does not cross T
+    seen = vector<int>(faces());
+    vector<int> come(faces(), -1);
+    function<void(int,int)> dfs2 = [&](int f, int p) {
+      int parity = 0;
+      int free_edge = -1;
+      seen[f] = 1;
+      int e = component[f];
+      do {
+        int g = incident_face[twin[e]];
+        if (dir[e] == -2 && !seen[g]) dfs2(g, twin[e]);
+        if (dir[e] == -2) { assert(free_edge == -1); free_edge = e; }
+        else if (dir[e] == 1) ++parity;
+        e = next[e];
+      } while (e != component[f]);
+      if (free_edge != -1) {
+        dir[free_edge] = -(parity % 2 == 0 ? -1 : +1);
+        dir[twin[free_edge]] = -dir[free_edge];
+      }
+    };
+    dfs2(0, -1);
+    return dir;
+  }
+  using Int = __int128_t;
+  Int countPerfectMatching() {
+    vector<int> dir = pfaffianOrientation();
+    int n = vertices();
+    vector<vector<Int>> A(n, vector<Int>(n));
+    for (int e = 0; e < edges(); ++e) 
+      A[origin[e]][origin[twin[e]]] = dir[e];
+
+    // compute determinant
+    Int det = 1;
+    for (int j = 0; j < n; ++j) {
+      for (int i = j+1; i < n; ++i) {
+        while (A[i][j]) { 
+          det = -det;
+          Int t = A[j][j] / A[i][j];
+          for (int k = j; k < n; ++k) 
+            swap(A[i][k], A[j][k] -= t * A[i][k]);
+        }
+      }
+      det *= A[j][j]; // % mod
+    }
+    return sqrt(det + 0.1);
+  }
+};
+
+using Int = long long;
+Int dominoCount(vector<vector<char>> table) {
+  int m = table.size(), n = table[0].size();
+  vector<vector<int>> index(m, vector<int>(n, -1));
+  PlaneGraph g;
+  unordered_map<int,unordered_map<int,int>> adj;
+  for (int i = 0; i < m; ++i) {
+    for (int j = 0; j < n; ++j) {
+      if (table[i][j] == '.') {
+        index[i][j] = g.newVertex(Point({i,j}));
+      }
+    }
+  }
+  unordered_map<int,int> next_inc, prev_inc;
+  for (int i = 0; i < m; ++i) {
+    for (int j = 0; j < n; ++j) {
+      vector<int> inc;
+      int x = index[i][j];
+      int dx[] = {1,0,-1,0}, dy[] = {0,1,0,-1};
+      for (int p = 0; p < 4; ++p) {
+        int k = i+dx[p], l = j+dy[p];
+        if (k < 0 || l < 0) continue;
+        if (k >= table.size() || l >= table[k].size()) continue;
+        if (table[i][j] != '.' || table[k][l] != '.') continue;
+        int y = index[k][l];
+        if (!adj[x].count(y)) adj[x][y] = g.newEdge(x);
+        if (!adj[y].count(x)) adj[y][x] = g.newEdge(y);
+        g.twin[adj[x][y]] = adj[y][x];
+        g.twin[adj[y][x]] = adj[x][y];
+        g.incident_edge[x] = adj[x][y];
+        g.incident_edge[y] = adj[y][x];
+        inc.push_back(adj[x][y]);
+      }
+      for (int i = 0; i < inc.size(); ++i) {
+        int j = (i == inc.size()-1 ? 0 : i+1);
+        next_inc[inc[i]] = inc[j];
+        prev_inc[inc[j]] = inc[i];
+      }
+    }
+  }
+  for (int e = 0; e < g.edges(); ++e) {
+    g.next[e] = prev_inc[g.twin[e]];
+    g.prev[e] = g.twin[next_inc[e]];
+  }
+  g.completeFaces();
+  return g.countPerfectMatching();
+}
+
+void SPOJ_GNY07H() {
+  int ncase;
+  cin >> ncase;
+  for (int icase = 0; icase < ncase; ++icase) {
+    int w;
+    cin >> w;
+    vector<vector<char>> table(4, vector<char>(w, '.')) ;
+    cout << icase+1 << " " << dominoCount(table) << endl;
+  }
+}
+
+int main() {
+  SPOJ_GNY07H();
+}
diff --git a/graph/topological_sort.cc b/graph/topological_sort.cc
new file mode 100644
index 0000000..66cfd16
--- /dev/null
+++ b/graph/topological_sort.cc
@@ -0,0 +1,81 @@
+//
+// Topological Sort
+//
+//
+// Description:
+//
+//   Let G = (V, E) be a graph. An ordering ord: [n] -> V is a topological 
+//   ordering if i > j then there is no edge from ord[i] to ord[j].
+//   G has a topological ordering if and only if G is DAG.
+//
+//   A topological order can be obtained in O(n + m) time by using
+//   an iterative method (Kahn's algorithm) or a recursive method 
+//   (by Tarjan's algorithm). The following implementation is a 
+//   Kahn's algorithm.
+//
+//   Note that if you want to find the all topological orders,
+//
+//
+// Complexity:
+//
+//   O(n + m)
+//
+//
+// Verified:
+//
+//   AOJ GPL_4_B 
+//
+// References:
+//
+//   Arthur B. Kahn (1962):
+//   "Topological sorting of large networks".
+//   Communications of the ACM, 5 (11): 558--562.
+//
+#include <bits/stdc++.h>
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+
+struct Graph {
+  int n;
+  vector<vector<int>> adj;
+  Graph(int n) : n(n), adj(n) { }
+  void addEdge(int u, int v) {
+    adj[u].push_back(v);
+  }
+};
+
+// return empty list if g has no topological order
+vector<int> topologicalSort(Graph g) {
+  vector<int> deg(g.n);
+  for (int u = 0; u < g.n; ++u)
+    for (int v: g.adj[u]) ++deg[v];
+  vector<int> stack; 
+  for (int u = 0; u < g.n; ++u)
+    if (!deg[u]) stack.push_back(u);
+
+  vector<int> order;
+  while (!stack.empty()) {
+    int u = stack.back(); stack.pop_back();
+    order.push_back(u);
+    for (int v: g.adj[u]) 
+      if (!--deg[v]) stack.push_back(v);
+  }
+  return order.size() == g.n ? order : vector<int>();
+}
+
+int main() {
+  int n, m; cin >> n >> m;
+  Graph g(n);
+  for (int i = 0; i < m; ++i) {
+    int u, v; cin >> u >> v;
+    g.addEdge(u, v);
+  }
+  auto ord = topologicalSort(g);
+  for (int i = 0; i < ord.size(); ++i) {
+    if (i > 0) cout << " ";
+    cout << ord[i];
+  }
+}
diff --git a/machine_learning/roc-auc.cc b/machine_learning/roc-auc.cc
new file mode 100644
index 0000000..5af7947
--- /dev/null
+++ b/machine_learning/roc-auc.cc
@@ -0,0 +1,40 @@
+#include <bits/stdc++.h>
+
+using namespace std;
+
+double trapezoid(double x1, double x2, double y1, double y2) {
+	return (y2+y1)/2 * abs(x2-x1);
+}
+
+double auc(vector<int> test, vector<double> pred) {
+	int n = test.size();
+	assert(n == pred.size());
+
+	vector<int> idx(n);
+	for (int i = 0; i < n; ++i) idx[i] = i;
+	sort(idx.begin(), idx.end(), [&](int i, int j) { return pred[i] > pred[j]; });
+
+	double a = 0.0;
+	double fp = 0, tp = 0, fp_prev = 0, tp_prev = 0;
+	double prev_score = -1.0/0.0;
+	for (int i: idx) {
+		if (pred[i] != prev_score) {
+			a += trapezoid(fp, fp_prev, tp, tp_prev);
+			prev_score = pred[i];
+			fp_prev = fp;
+			tp_prev = tp;
+		}
+		if (test[i] == 1) {
+			tp += 1;
+		} else {
+			fp += 1;
+		}
+	}
+	a += trapezoid(fp, fp_prev, tp, tp_prev);
+	return a / (tp * fp);
+}
+int main() {
+	vector<int> test = {0, 1, 0, 1, 1};
+	vector<double> pred = {0.2, 0.3, 0.4, 0.5, 0.6};
+	cout << auc(test, pred) << endl;
+}
diff --git a/math/lattice_below_line.cc b/math/lattice_below_line.cc
new file mode 100644
index 0000000..e9ee769
--- /dev/null
+++ b/math/lattice_below_line.cc
@@ -0,0 +1,74 @@
+// 
+// Number of lattice points below a line 
+//
+// Description:
+//
+//   Let a, b, n, m be nonnegative integers. The task is to compute 
+//     sum_{i in [0,n)} floor((a + ib)/m).
+//
+//   We compute this quantity in two directions alternately.
+//   First, let 
+//     a = (a/m) m + (a%m), 
+//     b = (b/m) m + (b%m).
+//   Then the quantity is
+//     sum [(a/m)+i*(b/m)] + floor(((a%m) + i(b%m))/m)
+//   Here, the first term is analytically evaluated.
+//   The second term is zero if b%m == 0. Otherwise, the task is 
+//   reduced to compute
+//     sum_{i in [0,n)} floor((a + ib)/m)
+//   where a < m, b < m. By changing the axes, this quantity is 
+//     sum_{i in [0,n')} floor((a' + ib')/m')
+//   where
+//     n' = (a + b n) / m,
+//     a' = (a + b n) % m,
+//     b' = m,
+//     m' = b.
+//
+//   We evaluate the number of iterations. Since the computation 
+//   between b and m is the same as the one of the Euclidean 
+//   algorithm. Thus it terminates in O(log m) time.
+//
+// Complexity:
+//
+//   O(log m).
+//
+// Verified:
+// 
+//   Somewhere
+
+#include <iostream>
+#include <vector>
+#include <cstdio>
+#include <algorithm>
+#include <functional>
+
+using namespace std;
+
+#define fst first
+#define snd second
+#define all(c) ((c).begin()), ((c).end())
+
+//
+// sum_{0<=i<n} floor(a + b i)/m; assume n, m >= 0, a >= 0, b >= 0
+//
+// 
+using Int = long long;
+Int latticeBelowLine(Int n, Int a, Int b, Int m) {
+  Int ans = 0;
+  while (m) {
+    ans += (n-1)*n/2*(b/m) + n*(a/m); 
+    a %= m;
+    b %= m;
+    auto z = (a+b*n);
+    a = z%m;
+    n = z/m;
+    swap(b, m);
+  }
+  return ans;
+}
+
+int main() {
+  srand(time(0));
+  Int a = rand(), b = rand(), n = rand(), m = rand();
+  cout << latticeBelowLine(n, a, b, m) << endl;
+}
diff --git a/number_theory/modular_arithmetics.cc b/number_theory/modular_arithmetics.cc
index 32e0ce8..5110efe 100644
--- a/number_theory/modular_arithmetics.cc
+++ b/number_theory/modular_arithmetics.cc
@@ -1,179 +1,442 @@
 //
-// Modular arithmetics (long long)
+// Modular Arithmetics
 //
-// Note:
-//   int       < 2^31 < 10^9
-//   long long < 2^63 < 10^18
+//  long long: 10^18 < 2^63-1 < 10^19 (strict inequality)
+// __int128_t: 10^38 < 2^127-1 < 10^39
 //
-// feasible for M < 2^62 (10^18 < 2^62 < 10^19)
-//
-//
-// Verified:
-//   SPOJ 11409: Fibonacci With a Twist
-//   SPOJ 9832: Matrix Inverse
-//
-//
-
-#include <cassert>
-#include <cstring>
-#include <iostream>
-#include <cstdio>
-#include <cstdlib>
-#include <vector>
-#include <ctime>
-#include <functional>
-#include <algorithm>
+// g++ -std=c++17 -O3 -fmax-errors=1 -fsanitize=undefined
+#include <bits/stdc++.h>
+#pragma GCC optimize ("O3")
 
 using namespace std;
-
-
-#define All(c) c.begin(), c.end()
-#define FOR(i,c) for(typeof(c.begin())i=c.begin();i!=c.end();++i)
-#define REP(i,n) for(int i=0;i<n;++i)
 #define fst first
 #define snd second
+#define all(c) begin(c), end(c)
 
-typedef long long ll;
-typedef vector<ll> vec;
-typedef vector<vec> mat;
 
-ll add(ll a, ll b, ll M) {
-  a += b;
-  if (a >= M) a -= M;
-  return a;
-}
-ll sub(ll a, ll b, ll M) { 
-  if (a < b) a += M;
-  return a - b;
+// === tick a time ===
+#include <ctime>
+double tick() {
+  static clock_t oldtick;
+  clock_t newtick = clock();
+  double diff = 1.0*(newtick - oldtick) / CLOCKS_PER_SEC;
+  oldtick = newtick;
+  return diff;
 }
 
-// Correctness of mul
-//   ab = floor(ab / M) * M + (ab % M)
-//   -> (ab % M) = ab - floor(ab / M) * M
-ll mul(ll a, ll b, ll M) {
-  ll r = a*b - floor(1.0*a*b/M)*M;
-  return r < 0 ? r + M : r >= M ? r - M : r;
+
+template <class T>
+ostream &operator<<(ostream &os, const vector<T> &v) {
+  os << "[";
+  for (int i = 0; i < v.size(); os << v[i++]) 
+    if (i > 0) os << " ";
+  os << "]";
+  return os;
+}
+template <class T>
+ostream &operator<<(ostream &os, const vector<vector<T>> &v) {
+  os << "[";
+  for (int i = 0; i < v.size(); os << v[i++]) 
+    if (i > 0) os << endl << " ";
+  os << "]";
+  return os;
 }
-ll pow(ll a, ll b, ll M) {
-  ll x = 1;
-  for (; b > 0; b >>= 1) {
-    if (b & 1) x = mul(x, a, M);
-    a = mul(a, a, M);
+using Int = long long;
+struct ModInt {
+  static Int mod; // Set this value
+  Int val;
+  ModInt(Int v=0) : val(v%mod) { }
+  ModInt operator-() const { return ModInt(val?mod-val:val); }
+  ModInt &operator+=(ModInt a) { 
+    if ((val += a.val) >= mod) val -= mod;
+    return *this;
+  }
+  ModInt &operator-=(ModInt a) { 
+    if ((val -= a.val) < 0) val += mod;
+    return *this;
+  }
+  ModInt &operator*=(ModInt a) { 
+    val = (__uint128_t(val) * a.val) % mod;
+    return *this;
+  }
+  ModInt &operator/=(ModInt a) { 
+    Int u = 1, v = a.val, s = 0, t = mod;
+    while (v) {
+      Int q = t / v;
+      swap(s -= u * q, u);
+      swap(t -= v * q, v);
+    }
+    a.val = (s < 0 ? s + mod : s);
+    val /= t;
+    return (*this) *= a;
+  }
+  ModInt inv() const { return ModInt(1) /= (*this); }
+  bool operator<(ModInt x) const { return val < x.val; }
+};
+Int ModInt::mod = 1e9+7;
+
+ostream &operator<<(ostream &os, ModInt a) { os << a.val; return os; }
+ModInt operator+(ModInt a, ModInt b) { return a += b; }
+ModInt operator-(ModInt a, ModInt b) { return a -= b; }
+ModInt operator*(ModInt a, ModInt b) { return a *= b; }
+ModInt operator/(ModInt a, ModInt b) { return a /= b; }
+ModInt pow(ModInt a, Int e) { 
+  ModInt x(1);
+  for (; e > 0; e /= 2) {
+    if (e % 2 == 1) x *= a;
+    a *= a;
   }
   return x;
 }
-ll div(ll a, ll b, ll M) {
-  ll u = 1, x = 0, s = b, t = M; 
-  while (s) {
-    ll q = t / s;
-    swap(x -= u * q, u);
-    swap(t -= s * q, s);
-  }
-  if (a % t) return -1; // infeasible
-  return mul(x < 0 ? x + M : x, a / t, M);
+ModInt stringToModInt(string s) {
+  Int val = 0;
+  for (int i = 0; i < s.size(); ++i) 
+    val = (val*10 + (s[i]-'0')) % ModInt::mod;
+  return ModInt(val);
+}
+
+// compute inv[1], inv[2], ..., inv[mod-1] in O(n) time
+vector<ModInt> inverse() {
+  Int mod = ModInt::mod;
+  vector<ModInt> inv(mod);
+  inv[1].val = 1;
+  for (Int a = 2; a < mod; ++a) 
+    inv[a] = inv[mod % a] * ModInt(mod - mod/a);
+  return inv;
 }
 
 
-// Modular Matrix
-mat eye(int n) {
-  mat I(n, vec(n));
-  REP(i, n) I[i][i] = 1;
-  return I;
+//
+// Solve x^2 = n; mod should be a prime
+// 
+// Verified: Code Forces Quadratic Equations
+//
+bool isQuadraticResidue(ModInt n) {
+  return n.val == 0 || n.mod == 2 || pow(n, (n.mod-1)/2).val == 1;
 }
-mat zeros(int n) {
-  return mat(n, vec(n));
+ModInt sqrt(ModInt n) { 
+  const Int mod = ModInt::mod;
+  if (n.val == 0 || mod == 2) return n;
+  int M = __builtin_ctz(mod-1), Q = (mod-1)>>M;
+  ModInt z(2);
+  while (isQuadraticResidue(z)) ++z.val;
+  ModInt c = pow(z, Q);
+  ModInt t = pow(n, Q);
+  ModInt R = pow(n, (Q+1)/2);
+  while (t.val != 1) {
+    int i = 0;
+    for (ModInt s = t; s.val != 1; s *= s) ++i;
+    if (M == i) exit(0);
+    ModInt b = pow(c, 1<<(M-i-1));
+    M = i;
+    c = b*b;
+    t *= c;
+    R *= b;
+  }
+  return R;
 }
-mat mul(mat A, mat B, ll M) {
-  int l = A.size(), m = B.size(),  n = B[0].size();
-  mat C(l, vec(n));
-  REP(i,l) REP(k,m) REP(j,n) 
-    C[i][j] = add(C[i][j], mul(A[i][k], B[k][j], M), M);
-  return C;
+vector<ModInt> quadraticEquation(ModInt a, ModInt b, ModInt c) {
+  if (ModInt::mod == 2) {
+    vector<ModInt> ans;
+    if (c.val == 0) ans.push_back(c);
+    if ((a + b + c).val == 0) ans.push_back(ModInt(1));
+    return ans;
+  } else {
+    b /= (a+a); c /= a; 
+    ModInt D = b*b - c;
+    if (!isQuadraticResidue(D)) return {};
+    ModInt s = sqrt(D), x = -b+s, y = -b-s;
+    return (x.val < y.val) ? vector<ModInt>({x, y}) : 
+           (x.val > y.val) ? vector<ModInt>({y, x}) : 
+                             vector<ModInt>({x});
+  }
 }
-mat pow(mat A, ll b, ll M) {
-  mat X = eye(A.size());
-  for (; b > 0; b >>= 1) {
-    if (b & 1) X = mul(X, A, M);
-    A = mul(A, A, M);
+
+// Discrete Logarithm by Shanks' Baby-Step Giant-Step
+// Find k such that a^k == b 
+Int log(ModInt a, ModInt b) {
+  Int h = ceil(sqrt(ModInt::mod+1e-9));
+  unordered_map<Int,Int> hash;
+  ModInt x(1);
+  for (Int i = 0; i < h; ++i) {
+    if (!hash.count(x.val)) hash[x.val] = i;
+    x *= a;
   }
-  return X;
+  x = x.inv(); 
+  ModInt y = b;
+  for (int i = 0; i < h; ++i) {
+    if (hash.count(y.val)) return i*h+hash[y.val];
+    y *= x;
+  }
+  return -1;
 }
-// assume: M is prime (singular ==> 
-// verify: SPOJ9832
-mat inv(mat A, ll M) {
-  int n = A.size();
-  mat B(n, vec(n));
-  for (int i = 0; i < n; ++i) 
-    B[i][i] = 1;
-
-  for (int i = 0; i < n; ++i) {
-    int j = i;
-    while (j < n && A[j][i] == 0) ++j;
-    if (j == n) return {};
-    swap(A[i], A[j]); 
-    swap(B[i], B[j]);
-
-    ll inv = div(1, A[i][i], M);
-    for (int k = i; k < n; ++k)
-      A[i][k] = mul(A[i][k], inv, M);
-    for (int k = 0; k < n; ++k)
-      B[i][k] = mul(B[i][k], inv, M);
+
+struct ModMatrix {
+  int m, n; // m times n matrix
+  vector<vector<ModInt>> val;
+  ModInt &operator()(int i, int j) { return val[i][j]; }
+  ModMatrix(int m, int n) : 
+    m(m), n(n), val(m, vector<ModInt>(n)) { }
+  ModMatrix operator-() const {
+    ModMatrix A(m, n);
+    for (int i = 0; i < m; ++i) 
+      for (int j = 0; j < n; ++j) 
+        A.val[i][j] = -val[i][j];
+    return A;
+  }
+  ModMatrix &operator+=(ModMatrix A) {
+    for (int i = 0; i < m; ++i) 
+      for (int j = 0; j < n; ++j) 
+        val[i][j] += A.val[i][j];
+    return *this;
+  }
+  ModMatrix &operator-=(ModMatrix A) {
+    for (int i = 0; i < m; ++i) 
+      for (int j = 0; j < n; ++j) 
+        val[i][j] -= A.val[i][j];
+    return *this;
+  }
+  ModMatrix &operator*=(ModMatrix A) {
+    for (int i = 0; i < m; ++i) {
+      vector<ModInt> row(A.n);
+      for (int j = 0; j < A.n; ++j) {
+        for (int k = 0; k < A.m; ++k) 
+          row[j] += val[i][k] * A.val[k][j];
+      }
+      val[i] = row;
+    }
+    return *this;
+  }
+  static ModMatrix eye(int n) {
+    ModMatrix I(n, n);
+    for (int i = 0; i < n; ++i) I.val[i][i].val = 1;
+    return I;
+  }
+  static ModMatrix zero(int n) {
+    return ModMatrix(n, n);
+  }
+  // mod should be prime
+  ModMatrix inv() const { 
+    ModMatrix B = eye(n);
+    vector<vector<ModInt>>  a = val;
+    vector<vector<ModInt>> &b = B.val;
+    for (int i = 0, j, k; i < n; ++i) {
+      for (j = i; j < n && a[j][i].val == 0; ++j);
+      if (j == n) return ModMatrix(0,0); // regularity is checked by m = 0
+      swap(a[i], a[j]); 
+      swap(b[i], b[j]);
+      ModInt inv = a[i][i].inv();
+      for (k = i; k < n; ++k) a[i][k] *= inv;
+      for (k = 0; k < n; ++k) b[i][k] *= inv;
+      for (j = 0; j < n; ++j) {
+        if (i == j || a[j][i].val == 0) continue;
+        ModInt c = a[j][i];
+        for (k = i; k < n; ++k) a[j][k] -= c * a[i][k];
+        for (k = 0; k < n; ++k) b[j][k] -= c * b[i][k];
+      }
+    }
+    return B;
+  }
+  // It can be used for any composite modulo.
+  ModInt det() const {
+    vector<vector<ModInt>> a = val; 
+    ModInt D(1);
     for (int j = 0; j < n; ++j) {
-      if (i == j || A[j][i] == 0) continue;
-      ll cor = A[j][i];
-      for (int k = i; k < n; ++k)
-        A[j][k] = sub(A[j][k], mul(cor, A[i][k], M), M);
-      for (int k = 0; k < n; ++k)
-        B[j][k] = sub(B[j][k], mul(cor, B[i][k], M), M);
+      for (int i = j+1; i < n; ++i) {
+        while (a[i][j].val) { 
+          D = -D;
+          ModInt t(a[j][j].val/a[i][j].val);
+          for (int k = j; k < n; ++k) 
+            swap(a[i][k], a[j][k] -= t * a[i][k]);
+        }
+      }
+      D *= a[j][j];
     }
+    return D;
+  }
+};
+ModMatrix operator+(ModMatrix A, ModMatrix B) { return A += B; }
+ModMatrix operator-(ModMatrix A, ModMatrix B) { return A -= B; }
+ModMatrix operator*(ModMatrix A, ModMatrix B) { return A *= B; }
+ModMatrix pow(ModMatrix A, int k) {
+  ModMatrix X = ModMatrix::eye(A.n);
+  for (; k > 0; k /= 2) {
+    if (k % 2 == 1) X *= A;
+    A *= A;
   }
-  return B;
+  return X;
+}
+ModInt dot(ModMatrix A, ModMatrix B) {
+  ModInt val;
+  for (int i = 0; i < A.m; ++i)
+    for (int j = 0; j < A.n; ++j)
+      val += A.val[i][j] * B.val[i][j];
+  return val;
+}
+using ModVector = vector<ModInt>;
+ModVector operator*(ModMatrix A, ModVector x) {
+  vector<ModInt> y(A.m);
+  for (int i = 0; i < A.m; ++i) 
+    for (int j = 0; j < A.n; ++j)
+      y[i] += A.val[i][j] * x[j];
+  return y;
+}
+ModInt dot(ModVector a, ModVector b) {
+  ModInt val;
+  for (int i = 0; i < a.size(); ++i)
+    val += a[i] * b[i];
+  return val;
 }
 
+//
+// Only available for prime modulos.
+// If you want to compute multiple inverses, 
+// use LU decomposition instead of computing the inverse.
+//
+struct LUDecomposition {
+  int n;
+  vector<int> pi;
+  vector<vector<ModInt>> val;
+  LUDecomposition(ModMatrix A) : n(A.n), val(A.val) {
+    pi.resize(n+1);
+    iota(all(pi), 0);
+    for (int i = 0, j, k; i < n; ++i) {
+      for (k = i; k < n; ++k) 
+        if (val[k][i].val) break; 
+      if (k == n) { pi[n] = -1; return; } // NG
+      if (k != i) {
+        swap(pi[i], pi[k]);
+        swap(val[i], val[k]);
+        ++pi[n];
+      }
+      for (j = i+1; j < n; ++j) {
+        if (val[j][i].val == 0) continue;
+        val[j][i]/= val[i][i];
+        for (k = i+1; k < n; ++k) 
+          val[j][k] -= val[j][i] * val[i][k];
+      }
+    }
+  }
+  bool isRegular() const { return pi[n] >= 0; }
+  ModVector solve(ModVector b) {
+    vector<ModInt> x(b.size());
+    for (int i = 0; i < n; ++i) {
+      x[i] = b[pi[i]];
+      for (int k = 0; k < i; ++k) 
+        x[i] -= val[i][k] * x[k];
+    }
+    for (int i = n-1; i >= 0; --i) {
+      for (int k = i+1; k < n; ++k)
+        x[i] -= val[i][k] * x[k];
+      x[i] /= val[i][i];
+    }
+    return x;
+  }
+  ModMatrix inverse() { // do not compute the inverse
+    ModMatrix B(n, n);
+    for (int j = 0; j < n; ++j) {
+      for (int i = 0; i < n; ++i) {
+        if (pi[i] == j) B.val[i][j].val = 1;
+        for (int k = 0; k < i; k++)
+          B.val[i][j] -= val[i][k] * B.val[k][j];
+      }
+      for (int i = n-1; i >= 0; --i) {
+        for (int k = i+1; k < n; ++k) 
+          B.val[i][j] -= val[i][k] * B.val[k][j];
+        B.val[i][j] /= val[i][i];
+      }
+    }
+    return B;
+  }
+  ModInt det() {
+    ModInt D = val[0][0];
+    for (int i = 1; i < n; i++)
+      D *= val[i][i];
+    return ((pi[n] - n) % 2 != 0) ? -D : D;
+  }
+};
 
+void mulTest() {
+  ModInt::mod = 1e9+7;
+  int m = 4, n = 4;
+  ModMatrix A(m,n);
+  ModMatrix B(m,n);
+  vector<ModInt> b(n);
+  for (int i = 0; i < m; ++i) {
+    for (int j = 0; j < n; ++j) {
+      A(i,j).val = rand() % ModInt::mod;
+      B(i,j).val = rand() % ModInt::mod;
+    }
+    b[i].val = rand() % ModInt::mod;
+  }
+  ModMatrix C = A * B;
 
-void disp(mat A) {
-  cout << "[";
-  REP(i, A.size()) {
-    if (i != 0) cout << " ";
-    REP(j, A[i].size()) {
-      cout << A[i][j];
-      if (j != A[i].size()-1) cout << ", ";
-      else                    cout << "; ";
+  for (int i = 0; i < C.m; ++i) {
+    for (int j = 0; j < C.n; ++j) {
+      cout << C(i,j) << " ";
     }
     cout << endl;
   }
-  cout << endl;
-}
+  cout << C.det() << endl;
 
+  LUDecomposition LU(C);
+  cout << LU.det() << endl;
 
-ll binomial(ll n, ll k, ll M) {
-  ll num = 1, den = 1;
-  while (n > 0 || k > 0) {
-    ll m = n % M, l = k % M;
-    if (m < l) return 0;
-    if (l > m - l) l = m - l;
-    while (l > 0) {
-      num = mul(num, m--, M);
-      den = mul(den, l--, M);
-    }
-    n /= M; k /= M;
-  }
-  return div(num, den, M);
+  // A^{-1} b = x
+  auto x = LU.solve(b);
+  cout << b << " " << (C * x) << endl;
+  cout << "end" << endl;
 }
 
+void CF_QUADRATIC_EQUATIONS() {
+  int ncase; cin >> ncase;
+  for (int icase = 0; icase < ncase; ++icase) {
+    Int a, b, c, p;
+    cin >> a >> b >> c >> p;
+    ModInt::mod = p;
+    vector<ModInt> ans = quadraticEquation(
+      ModInt(a), ModInt(b), ModInt(c));
+    cout << ans.size();
+    for (int i = 0; i < ans.size(); ++i) 
+      cout << " " << ans[i].val;
+    cout << endl;
+  }
+}
 
-// tick a time 
-double tick() {
-  static clock_t oldtick;
-  clock_t newtick = clock();
-  double diff = 1.0*(newtick - oldtick) / CLOCKS_PER_SEC;
-  oldtick = newtick;
-  return diff;
+void CF_DISCLOG() {
+  ModInt::mod = 999998999999;
+  ModInt a(21309), b(696969);
+  cout << log(a, b) << endl;
 }
 
+int SPOJ_MIFF() {
+  for (int icase = 0; ; ++icase) {
+    int n, p; scanf("%d %d", &n, &p);
+    if (n == 0 && p == 0) break;
+    if (icase > 0) printf("\n");
+    ModInt::mod = p;
+    ModMatrix A(n, n);
+    for (int i = 0; i < n; ++i)
+      for (int j = 0; j < n; ++j)
+        scanf("%d", &A(i,j));
+
+    ModMatrix B = A.inv();
+    if (B.m > 0) {
+      for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j) {
+          printf("%d ", B(i,j));
+        }
+        printf("\n");
+      }
+    } else {
+      printf("singular\n");
+    }
+  }
+}
 
-const int N = 10000000;
-ll x[N];
 int main() {
+  SPOJ_MIFF();
+  //CF_DISCLOG();
+  //CF_QUADRATIC_EQUATIONS();
+  //mulTest();
 }