forked from nisarg0/Algorithm-Implementation
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPatternMatching.cpp
More file actions
119 lines (97 loc) · 3.26 KB
/
PatternMatching.cpp
File metadata and controls
119 lines (97 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// Given a large text and a pattern where text >> pattern.
// Find indices for the matching pattern in the text.
// Rolling hash method: O(n) time, O(1) space
/**
* @brief Rolling hash method
* hash(pattern) =
* hash(pattern[0..m-1]) = (pattern[0]*d^(m-1) + pattern[1]*d^(m-2) + ... + pattern[m-1]) mod q
* Also, First character of the pattern is the most significant character hence it's mutiplied by d^(m-1)
*
*
* Formula for Rehashing text in O(1) operation given hash of previous window:
* hash( txt[s+1 .. s+m] ) = ( d ( hash( txt[s .. s+m-1]) – txt[s]*h ) + txt[s + m] ) mod q
*
* NOTE:
* d: Number of characters in the alphabet
* q: A prime number
* h: d^(m-1) mod q
*/
#include <iostream>
#include <vector>
#include <string>
#include <cmath>
using namespace std;
int findModuloPower(int a, int b, int q) {
int result = 1;
while( b > 0) {
if (b & 1) {
result = (result * a) % q;
}
b = b >> 1;
a = (a * a) % q;
}
return result%q;
}
// Find hash of any string
int findStringHash(int d, int h,int q, string str) {
int strHash = 0;
for (int i = 0; i < str.size(); i++) {
strHash = (d * strHash + str[i]) % q;
}
return strHash;
}
vector<int> findMatches(string text, string pattern) {
int d = 256; // Number of characters in the alphabet
int q = 10007; // A prime number
int n = text.size();
int m = pattern.size();
int h = findModuloPower(d, m-1, q); // d^(m-1) mod q
int patternHash = findStringHash(d, h, q, pattern);
int textHash;
vector<int> startIndicesOfMatches;
for (int i = 0; i <= n - m; i++) {
// Don't need to rehash the pattern
if(i == 0)
textHash = findStringHash(d, h, q, text.substr(0, m));
else // Rehash the text
textHash = (d * (textHash - text[i - 1] * h) + text[i + m - 1]) % q;
// In case of negative hash, make it positive
if (textHash < 0)
textHash = textHash + q;
if (patternHash == textHash) {
bool isMatch = true;
// In case of collision, check if the strings are actually equal
for (int j = 0; j < m; j++) {
if (text[i + j] != pattern[j]) {
isMatch = false;
break;
}
}
if (isMatch) {
startIndicesOfMatches.push_back(i);
}
}
}
}
int main() {
string text = "AABAACAADAABAABA";
string pattern = "AABA";
auto startIndicesOfMatches = findMatches(text, pattern);
cout << "Indices of matches: ";
for (auto index : startIndicesOfMatches) {
cout << index << " ";
}
}
/**
* @brief Addition to this problem:
* Find number of unique substring in a string
* - Get hash for all substrings of length m (O(n*n))
* - Store them in a set and get the size of set
*
* This method will mostly work as probablity of collision is approx: 1/q ie. 1/10007 = 0.0001%
* if we increase q to 10^9 then probablity of collision will be 0.000000001%
*
* Read more : https://cp-algorithms.com/string/string-hashing.html#determine-the-number-of-different-substrings-in-a-string
*
* It can be improved by using larger q (10^18) and using a good hash function
*/