Skip to content

Commit fcf9a94

Browse files
committed
Analysis of hashing algorithms.
1 parent c078b7c commit fcf9a94

File tree

2 files changed

+16268
-0
lines changed

2 files changed

+16268
-0
lines changed

hashes/hash-test.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
import string
2+
import random
3+
import scipy
4+
import matplotlib.pyplot as plt
5+
6+
# This script will test multiple hashes on strings to see if we get a normal distribution
7+
8+
9+
def load_words():
10+
words = {}
11+
exclude = set(string.punctuation + "\n")
12+
word_count = 0
13+
14+
with open("tale-of-two-cities.txt", "r") as content:
15+
for line in content:
16+
line = line.replace("--", ' ')
17+
line_words = line.split(" ")
18+
for word in line_words:
19+
word = ''.join(ch for ch in word if ch not in exclude)
20+
21+
# add to dict and keep track of times it occurs
22+
if word:
23+
word = word.lower()
24+
word_count += 1
25+
if word in words:
26+
words[word] += 1
27+
else:
28+
words[word] = 1
29+
30+
return words
31+
32+
33+
def is_prime(n):
34+
"""
35+
Function which returns True if the integer n is prime. Tests integers
36+
d from two up to Dmax = scipy.sqrt(n), stopping if any are divisors of n
37+
(or, test if n is even and then test odd divisors). This is most naturally
38+
done using the "while" command,
39+
while n%d != 0 and d <= Dmax:
40+
d+=1 [or 2]
41+
What condition will d satisfy after the while loop if n is prime?
42+
"""
43+
Dmax = scipy.sqrt(n)
44+
if n == 2:
45+
return True
46+
if n %2 == 0:
47+
return False
48+
d = 3
49+
while n % d != 0 and d <= Dmax:
50+
d += 2
51+
return d > Dmax
52+
53+
54+
def first_prime_greater_than(min):
55+
"""
56+
Returns a list of all prime numbers less than nMax.
57+
You can use isPrime to generate a list of primes using the nice
58+
Python feature of "List comprehensions". For example, the squares of the
59+
even numbers between seven and nineteen can be generated by
60+
[n**2 for n in scipy.arange(7,19) if isEven(n)]
61+
List comprehensions return a list using the elements generated by the
62+
"for" loop that satisfy the (optional) if expression.
63+
"""
64+
65+
for n in scipy.arange(min + 1, min * 2):
66+
if is_prime(n):
67+
return n
68+
69+
return None
70+
71+
72+
def randomhash(m):
73+
random.seed()
74+
return random.randint(0, m - 1)
75+
76+
77+
def polyhash_prime(word, a, p, m):
78+
hash = 0
79+
for c in word:
80+
hash = (hash * a + ord(c)) % p
81+
82+
return hash % m
83+
84+
85+
def polyhash_noprime(word, a, m):
86+
hash = 0
87+
for c in word:
88+
hash = (hash * a + ord(c))
89+
90+
return hash % m
91+
92+
93+
94+
def show_distribution(buckets, title):
95+
96+
counts = {}
97+
for v in buckets:
98+
if v in counts.keys():
99+
counts[v] += 1
100+
else:
101+
counts[v] = 1
102+
103+
plt.bar(counts.keys(), counts.values())
104+
plt.title(title)
105+
plt.xlabel("Bucket size")
106+
plt.ylabel("Buckets")
107+
plt.show()
108+
109+
110+
def main():
111+
words = load_words()
112+
word_count = len(words)
113+
114+
m = int(word_count / 2) # hash table will be at load = 0.5
115+
116+
# random
117+
118+
buckets = [0] * m
119+
for w in words:
120+
hash = randomhash(m)
121+
buckets[hash] += 1
122+
123+
show_distribution(buckets, "Bucket size distribution - Random insert")
124+
125+
# polyhash
126+
127+
# we want a prime to use in hash calculation
128+
# 10145 distinct words in the book
129+
prime = first_prime_greater_than(word_count)
130+
131+
buckets = [0] * m
132+
for w in words:
133+
hash = polyhash_prime(w, 31, prime, m)
134+
buckets[hash] += 1
135+
136+
show_distribution(buckets, "Bucket size distribution - PolyHash with prime")
137+
138+
# polyhash, without prime
139+
140+
buckets = [0] * m
141+
for w in words:
142+
hash = polyhash_noprime(w, 31, m)
143+
buckets[hash] += 1
144+
145+
show_distribution(buckets, "Bucket size distribution - PolyHash, no prime")
146+
147+
if __name__ == "__main__":
148+
main()

0 commit comments

Comments
 (0)