Skip to content

Commit 72123c6

Browse files
committed
Use siphash24 + PYTHONHASHSEED
1 parent bb6dc78 commit 72123c6

File tree

12 files changed

+117
-50
lines changed

12 files changed

+117
-50
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/test/test_sys.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -491,9 +491,7 @@ def test_attributes(self):
491491
else:
492492
self.assertIn(sys.hash_info.algorithm, {"fnv", "siphash24"})
493493
else:
494-
# PY_HASH_EXTERNAL
495-
# TODO: RUSTPYTHON; use siphash24
496-
# self.assertEqual(algo, 0)
494+
self.assertEqual(algo, 0)
497495
pass
498496
self.assertGreaterEqual(sys.hash_info.cutoff, 0)
499497
self.assertLess(sys.hash_info.cutoff, 8)

common/Cargo.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,5 @@ lexical-core = "0.7"
1616
hexf-parse = "0.1.0"
1717
cfg-if = "0.1"
1818
once_cell = "1.4.1"
19-
20-
[dev-dependencies]
19+
siphasher = "0.3"
2120
rand = "0.7.3"

common/src/hash.rs

Lines changed: 79 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
use num_bigint::BigInt;
22
use num_complex::Complex64;
33
use num_traits::ToPrimitive;
4-
use std::collections::hash_map::DefaultHasher;
5-
use std::hash::{Hash, Hasher};
4+
use siphasher::sip::SipHasher24;
5+
use std::convert::TryInto;
6+
use std::hash::{BuildHasher, Hash, Hasher};
67
use std::num::Wrapping;
78

89
pub type PyHash = i64;
@@ -16,24 +17,75 @@ pub const MODULUS: PyUHash = (1 << BITS) - 1;
1617
pub const INF: PyHash = 314_159;
1718
pub const NAN: PyHash = 0;
1819
pub const IMAG: PyHash = MULTIPLIER;
19-
pub const ALGO: &str = "siphasher13";
20+
pub const ALGO: &str = "siphash24";
2021
pub const HASH_BITS: usize = std::mem::size_of::<PyHash>() * 8;
21-
// internally DefaultHasher uses 2 u64s as the seed, but
22-
// that's not guaranteed to be consistent across Rust releases
23-
// TODO: use something like the siphasher crate as our hash algorithm
24-
pub const SEED_BITS: usize = std::mem::size_of::<PyHash>() * 2 * 8;
22+
// SipHasher24 takes 2 u64s as a seed
23+
pub const SEED_BITS: usize = std::mem::size_of::<u64>() * 2 * 8;
2524

2625
// pub const CUTOFF: usize = 7;
2726

28-
#[inline]
29-
pub fn mod_int(value: i64) -> PyHash {
30-
value % MODULUS as i64
27+
/// The two 64-bit keys that seed every `SipHasher24` built by this type.
pub struct HashSecret {
28+
// First key handed to `SipHasher24::new_with_keys`.
k0: u64,
29+
// Second key handed to `SipHasher24::new_with_keys`.
k1: u64,
30+
}
31+
32+
impl BuildHasher for HashSecret {
33+
type Hasher = SipHasher24;
34+
fn build_hasher(&self) -> Self::Hasher {
35+
SipHasher24::new_with_keys(self.k0, self.k1)
36+
}
37+
}
38+
39+
impl rand::distributions::Distribution<HashSecret> for rand::distributions::Standard {
40+
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> HashSecret {
41+
HashSecret {
42+
k0: rng.gen(),
43+
k1: rng.gen(),
44+
}
45+
}
3146
}
3247

33-
pub fn hash_value<T: Hash + ?Sized>(data: &T) -> PyHash {
34-
let mut hasher = DefaultHasher::new();
35-
data.hash(&mut hasher);
36-
mod_int(hasher.finish() as PyHash)
48+
impl HashSecret {
49+
pub fn new(seed: u32) -> Self {
50+
let mut buf = [0u8; 16];
51+
lcg_urandom(seed, &mut buf);
52+
let k0 = u64::from_le_bytes(buf[..8].try_into().unwrap());
53+
let k1 = u64::from_le_bytes(buf[8..].try_into().unwrap());
54+
Self { k0, k1 }
55+
}
56+
}
57+
58+
impl HashSecret {
59+
pub fn hash_value<T: Hash + ?Sized>(&self, data: &T) -> PyHash {
60+
let mut hasher = self.build_hasher();
61+
data.hash(&mut hasher);
62+
mod_int(hasher.finish() as PyHash)
63+
}
64+
65+
pub fn hash_iter<'a, T: 'a, I, F, E>(&self, iter: I, hashf: F) -> Result<PyHash, E>
66+
where
67+
I: IntoIterator<Item = &'a T>,
68+
F: Fn(&'a T) -> Result<PyHash, E>,
69+
{
70+
let mut hasher = self.build_hasher();
71+
for element in iter {
72+
let item_hash = hashf(element)?;
73+
item_hash.hash(&mut hasher);
74+
}
75+
Ok(mod_int(hasher.finish() as PyHash))
76+
}
77+
78+
pub fn hash_bytes(&self, value: &[u8]) -> PyHash {
79+
if value.is_empty() {
80+
0
81+
} else {
82+
self.hash_value(value)
83+
}
84+
}
85+
86+
pub fn hash_str(&self, value: &str) -> PyHash {
87+
self.hash_bytes(value.as_bytes())
88+
}
3789
}
3890

3991
pub fn hash_float(value: f64) -> PyHash {
@@ -84,21 +136,8 @@ pub fn hash_float(value: f64) -> PyHash {
84136
pub fn hash_complex(value: &Complex64) -> PyHash {
85137
let re_hash = hash_float(value.re);
86138
let im_hash = hash_float(value.im);
87-
let ret = Wrapping(re_hash) + Wrapping(im_hash) * Wrapping(IMAG);
88-
ret.0
89-
}
90-
91-
pub fn hash_iter<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result<PyHash, E>
92-
where
93-
I: IntoIterator<Item = &'a T>,
94-
F: Fn(&'a T) -> Result<PyHash, E>,
95-
{
96-
let mut hasher = DefaultHasher::new();
97-
for element in iter {
98-
let item_hash = hashf(element)?;
99-
item_hash.hash(&mut hasher);
100-
}
101-
Ok(mod_int(hasher.finish() as PyHash))
139+
let Wrapping(ret) = Wrapping(re_hash) + Wrapping(im_hash) * Wrapping(IMAG);
140+
ret
102141
}
103142

104143
pub fn hash_iter_unordered<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result<PyHash, E>
@@ -126,6 +165,15 @@ pub fn hash_bigint(value: &BigInt) -> PyHash {
126165
)
127166
}
128167

129-
pub fn hash_str(value: &str) -> PyHash {
130-
hash_value(value.as_bytes())
168+
#[inline]
169+
pub fn mod_int(value: i64) -> PyHash {
170+
value % MODULUS as i64
171+
}
172+
173+
/// Fills `buf` with pseudo-random bytes from a linear congruential generator.
///
/// Starting from the seed `x`, each step computes
/// `x = x * 214013 + 2531011 (mod 2^32)` and writes bits 16..24 of the new
/// state to the next byte of `buf`. The output is fully determined by the
/// seed, and an empty `buf` is left untouched.
pub fn lcg_urandom(mut x: u32, buf: &mut [u8]) {
    for b in buf {
        // The recurrence is defined modulo 2^32. Plain `*=` / `+=` would panic
        // on overflow in builds with overflow checks enabled, so wrap explicitly.
        x = x.wrapping_mul(214013).wrapping_add(2531011);
        *b = ((x >> 16) & 0xff) as u8;
    }
}

src/main.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ fn parse_arguments<'a>(app: App<'a, '_>) -> ArgMatches<'a> {
198198
fn create_settings(matches: &ArgMatches) -> PySettings {
199199
let ignore_environment =
200200
matches.is_present("ignore-environment") || matches.is_present("isolate");
201-
let mut settings: PySettings = Default::default();
201+
let mut settings = PySettings::default();
202202
settings.ignore_environment = ignore_environment;
203203

204204
// add the current directory to sys.path
@@ -280,6 +280,16 @@ fn create_settings(matches: &ArgMatches) -> PySettings {
280280
vec!["".to_owned()]
281281
};
282282

283+
let hash_seed = match env::var("PYTHONHASHSEED") {
284+
Ok(s) if s == "random" => Some(None),
285+
Ok(s) => s.parse::<u32>().ok().map(Some),
286+
Err(_) => Some(None),
287+
};
288+
settings.hash_seed = hash_seed.unwrap_or_else(|| {
289+
error!("Fatal Python init error: PYTHONHASHSEED must be \"random\" or an integer in range [0; 4294967295]");
290+
process::exit(1)
291+
});
292+
283293
settings.argv = argv;
284294

285295
settings

vm/src/bytesinner.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,8 +302,8 @@ impl PyBytesInner {
302302
self.cmp(other, |a, b| a < b, vm)
303303
}
304304

305-
pub fn hash(&self) -> hash::PyHash {
306-
hash::hash_value(&self.elements)
305+
pub fn hash(&self, vm: &VirtualMachine) -> hash::PyHash {
306+
vm.state.hash_secret.hash_bytes(&self.elements)
307307
}
308308

309309
pub fn add(&self, other: PyBytesInner) -> Vec<u8> {

vm/src/dictdatatype.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -456,8 +456,8 @@ impl DictKey for PyObjectRef {
456456
}
457457

458458
impl DictKey for PyStringRef {
459-
fn key_hash(&self, _vm: &VirtualMachine) -> PyResult<HashValue> {
460-
Ok(self.hash())
459+
fn key_hash(&self, vm: &VirtualMachine) -> PyResult<HashValue> {
460+
Ok(self.hash(vm))
461461
}
462462

463463
fn key_is(&self, other: &PyObjectRef) -> bool {
@@ -480,9 +480,9 @@ impl DictKey for PyStringRef {
480480
/// Implement trait for the str type, so that we can use strings
481481
/// to index dictionaries.
482482
impl DictKey for &str {
483-
fn key_hash(&self, _vm: &VirtualMachine) -> PyResult<HashValue> {
483+
fn key_hash(&self, vm: &VirtualMachine) -> PyResult<HashValue> {
484484
// follow a similar route as the hashing of PyStringRef
485-
Ok(hash::hash_str(*self))
485+
Ok(vm.state.hash_secret.hash_str(*self))
486486
}
487487

488488
fn key_is(&self, _other: &PyObjectRef) -> bool {

vm/src/obj/objbytes.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,8 @@ impl PyBytes {
131131
}
132132

133133
#[pymethod(name = "__hash__")]
134-
fn hash(&self) -> PyHash {
135-
self.inner.hash()
134+
fn hash(&self, vm: &VirtualMachine) -> PyHash {
135+
self.inner.hash(vm)
136136
}
137137

138138
#[pymethod(name = "__iter__")]

vm/src/obj/objstr.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,9 +290,9 @@ impl PyString {
290290
}
291291

292292
#[pymethod(name = "__hash__")]
293-
pub(crate) fn hash(&self) -> hash::PyHash {
293+
pub(crate) fn hash(&self, vm: &VirtualMachine) -> hash::PyHash {
294294
self.hash.load().unwrap_or_else(|| {
295-
let hash = hash::hash_str(&self.value);
295+
let hash = vm.state.hash_secret.hash_str(&self.value);
296296
self.hash.store(Some(hash));
297297
hash
298298
})

vm/src/obj/objtype.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -805,7 +805,7 @@ mod tests {
805805
#[test]
806806
fn test_linearise() {
807807
let context = PyContext::new();
808-
let object: PyClassRef = context.object();
808+
let object = &context.types.object_type;
809809
let type_type = &context.types.type_type;
810810

811811
let a = new(

0 commit comments

Comments
 (0)