diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..74ade77 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,65 @@ +name: CI +on: + push: + branches: ["main"] + pull_request: + merge_group: + types: [checks_requested] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +env: + RUST_BACKTRACE: 1 + SHELL: /bin/bash + +jobs: + ci: + name: Build and Test + runs-on: ubuntu-latest + + strategy: + matrix: + rust: [1.70.0, nightly, beta, stable] + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 2 + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + default: true + override: true + - name: Build + run: | + cargo build --no-default-features + cargo build + cargo build --features malloc_size_of + - uses: actions-rs/cargo@v1 + with: + command: test + args: --all + - name: Build codegen + run: | + cd string-cache-codegen && cargo build && cd .. + + if [ ${{ matrix.rust }} = nightly ]; then + cd integration-tests && cargo test --features unstable && cd ..; + fi + + + build_result: + name: Result + runs-on: ubuntu-latest + needs: + - "ci" + + steps: + - name: Mark the job as successful + run: exit 0 + if: success() + - name: Mark the job as unsuccessful + run: exit 1 + if: "!success()" diff --git a/.gitignore b/.gitignore index 5df5866..c17061b 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ /doc +Cargo.lock +target +.cargo/config diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e73215e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "string_cache" +version = "0.9.0" # Also update README.md when making a semver-breaking change +authors = ["The Servo Project Developers"] +description = "A string interning library for Rust, developed as part of the Servo project." 
+license = "MIT OR Apache-2.0" +repository = "https://github.com/servo/string-cache" +documentation = "https://docs.rs/string_cache" +edition = "2018" +rust-version = "1.70.0" + +# Do not `exclude` ./string-cache-codegen because we want to include +# ./string-cache-codegen/shared.rs, and `include` is a pain to use +# (It has to be exhaustive.) +# This means that packages for this crate include some unused files, +# but they’re not too big so that shouldn’t be a problem. + +[lib] +name = "string_cache" + +[features] +serde_support = ["serde"] +default = ["serde_support"] + +[dependencies] +precomputed-hash = "0.1" +serde = { version = "1", optional = true } +malloc_size_of = { version = "0.1", default-features = false, optional = true } +phf_shared = "0.13" +new_debug_unreachable = "1.0.2" +parking_lot = "0.12" + +[[test]] +name = "small-stack" +harness = false + +[workspace] +members = [ + "string-cache-codegen", + "integration-tests", +] diff --git a/Makefile.in b/Makefile.in deleted file mode 100644 index 6e5f2ee..0000000 --- a/Makefile.in +++ /dev/null @@ -1,34 +0,0 @@ -VPATH=%VPATH% - -RUSTC ?= rustc -RUSTFLAGS += -L ../../phf/rust-phf -EXT_DEPS ?= -RUSTDOC ?= rustdoc -RUSTDOC_FLAGS ?= -RUSTDOC_TARGET ?= doc - -RUST_SRC=$(shell find $(VPATH)/. -type f -name '*.rs') - -.PHONY: all -all: libstring-cache.dummy - -libstring-cache.dummy: lib.rs $(RUST_SRC) $(EXT_DEPS) - $(RUSTC) $(RUSTFLAGS) $< --out-dir . 
- touch $@ - -string-cache-test: lib.rs $(RUST_SRC) - $(RUSTC) $(RUSTFLAGS) $< -o $@ --test - -.PHONY: check -check: string-cache-test - ./string-cache-test $(TEST) - -.PHONY: doc -doc: $(RUSTDOC_TARGET)/string_cache/index.html - -$(RUSTDOC_TARGET)/string_cache/index.html: lib.rs $(RUST_SRC) $(EXT_DEPS) - $(RUSTDOC) $(RUSTDOC_FLAGS) $< -o $(RUSTDOC_TARGET) - -.PHONY: clean -clean: - rm -f *.o *.a *.so *.dylib *.rlib *.dll *.dummy *-test diff --git a/README.md b/README.md index cba6fc1..429d1ec 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,78 @@ -string-cache -============ +# string-cache + +[![Build Status](https://github.com/servo/string-cache/actions/workflows/ci.yml/badge.svg)](https://github.com/servo/string-cache/actions) + +[Documentation](https://docs.rs/string_cache/) + +A string interning library for Rust, developed as part of the [Servo](https://github.com/servo/servo) project. + +## Simple usage + +In `Cargo.toml`: + +```toml +[dependencies] +string_cache = "0.9" +``` + +In `lib.rs`: + +```rust +extern crate string_cache; +use string_cache::DefaultAtom as Atom; +``` + +## With static atoms + +In `Cargo.toml`: + +```toml +[package] +build = "build.rs" + +[dependencies] +string_cache = "0.9" + +[build-dependencies] +string_cache_codegen = "0.6" +``` + +In `build.rs`: + +```rust +extern crate string_cache_codegen; + +use std::env; +use std::path::Path; + +fn main() { + string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!") + .atoms(&["foo", "bar"]) + .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs")) + .unwrap() +} +``` + +In `lib.rs`: + +```rust +extern crate string_cache; + +mod foo { + include!(concat!(env!("OUT_DIR"), "/foo_atom.rs")); +} +``` + +The generated code will define a `FooAtom` type and a `foo_atom!` macro. +The macro can be used in expressions or patterns, with strings listed in `build.rs`.
+For example: + +```rust +fn compute_something(input: &foo::FooAtom) -> u32 { + match *input { + foo_atom!("foo") => 1, + foo_atom!("bar") => 2, + _ => 3, + } +} +``` diff --git a/atom.rs b/atom.rs deleted file mode 100644 index 036ae16..0000000 --- a/atom.rs +++ /dev/null @@ -1,493 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use static_atoms::atom::StaticAtom; -use std::fmt; -use std::hash::{Hash, Hasher, sip}; -use std::mem; -use std::ptr; -use std::slice; -use std::slice::bytes; -use std::str; -use std::sync::atomics::{AtomicInt, SeqCst}; -use sync::Mutex; -use sync::one::{Once, ONCE_INIT}; -use std::rt::heap; - - -// Inline atoms are probably buggy on big-endian architectures. -#[allow(dead_code)] -#[static_assert] -static IS_LITTLE_ENDIAN: bool = cfg!(target_endian = "little"); - - -static mut global_string_cache_ptr: *mut Mutex = 0 as *mut _; - -static STATIC_SHIFT_BITS: uint = 32; -static ENTRY_ALIGNMENT: uint = 16; - -// NOTE: Deriving Eq here implies that a given string must always -// be interned the same way. 
-#[repr(u8)] -#[deriving(Eq, PartialEq)] -enum AtomType { - Dynamic = 0, - Inline = 1, - Static = 2, -} - -struct StringCache { - hasher: sip::SipHasher, - buckets: [*mut StringCacheEntry, ..4096], -} - -struct StringCacheEntry { - next_in_bucket: *mut StringCacheEntry, - hash: u64, - ref_count: AtomicInt, - string: String, -} - -impl StringCacheEntry { - fn new(next: *mut StringCacheEntry, hash: u64, string_to_add: &str) -> StringCacheEntry { - StringCacheEntry { - next_in_bucket: next, - hash: hash, - ref_count: AtomicInt::new(1), - string: string_to_add.to_string(), - } - } -} - -impl StringCache { - fn new() -> StringCache { - StringCache { - hasher: sip::SipHasher::new(), - buckets: unsafe { mem::zeroed() }, - } - } - - fn add(&mut self, string_to_add: &str) -> u64 { - let hash = self.hasher.hash(&string_to_add); - let bucket_index = (hash & (self.buckets.len()-1) as u64) as uint; - let mut ptr = self.buckets[bucket_index]; - - while ptr != ptr::mut_null() { - let value = unsafe { &*ptr }; - if value.hash == hash && value.string.as_slice() == string_to_add { - break; - } - ptr = value.next_in_bucket; - } - - if ptr == ptr::mut_null() { - unsafe { - ptr = heap::allocate(mem::size_of::(), ENTRY_ALIGNMENT) - as *mut StringCacheEntry; - ptr::write(ptr, - StringCacheEntry::new(self.buckets[bucket_index], hash, string_to_add)); - } - self.buckets[bucket_index] = ptr; - } else { - unsafe { - (*ptr).ref_count.fetch_add(1, SeqCst); - } - } - - assert!(ptr != ptr::mut_null()); - ptr as u64 - } - - fn remove(&mut self, key: u64) { - let ptr = key as *mut StringCacheEntry; - let value: &mut StringCacheEntry = unsafe { mem::transmute(ptr) }; - - if value.ref_count.fetch_sub(1, SeqCst) == 1 { - let bucket_index = (value.hash & (self.buckets.len()-1) as u64) as uint; - - let mut current = self.buckets[bucket_index]; - let mut prev: *mut StringCacheEntry = ptr::mut_null(); - - while current != ptr::mut_null() { - if current == ptr { - if prev != ptr::mut_null() { - unsafe { 
(*prev).next_in_bucket = (*current).next_in_bucket }; - } else { - unsafe { self.buckets[bucket_index] = (*current).next_in_bucket }; - } - break; - } - prev = current; - unsafe { current = (*current).next_in_bucket }; - } - assert!(current != ptr::mut_null()); - - unsafe { - ptr::read(ptr as *const StringCacheEntry); - heap::deallocate(ptr as *mut u8, - mem::size_of::(), ENTRY_ALIGNMENT); - } - } - } -} - -#[deriving(Eq, Hash, PartialEq)] -pub struct Atom { - data: u64 -} - -impl Atom { - pub fn from_static(atom_id: StaticAtom) -> Atom { - Atom { - data: (atom_id as u64 << STATIC_SHIFT_BITS) | (Static as u64) - } - } - - pub fn from_slice(string_to_add: &str) -> Atom { - match from_str::(string_to_add) { - Some(atom_id) => { - Atom::from_static(atom_id) - }, - None => { - if string_to_add.len() < 8 { - Atom::from_inline(string_to_add) - } else { - Atom::from_dynamic(string_to_add) - } - } - } - } - - pub fn as_slice<'t>(&'t self) -> &'t str { - let (atom_type, string_len) = self.get_type_and_inline_len(); - let ptr = self as *const Atom as *const u8; - match atom_type { - Inline => { - unsafe { - let data = ptr.offset(1) as *const [u8, ..7]; - str::raw::from_utf8((*data).slice_to(string_len)) - } - }, - Static => { - let key: StaticAtom = unsafe { mem::transmute((self.data >> STATIC_SHIFT_BITS) as u32) }; - key.as_slice() - }, - Dynamic => { - let hash_value = unsafe { &*(self.data as *const StringCacheEntry) }; - hash_value.string.as_slice() - } - } - } - - #[inline] - fn from_inline(string: &str) -> Atom { - assert!(string.len() < 8); - let mut string_data: u64 = 0; - unsafe { slice::raw::mut_buf_as_slice(&mut string_data as *mut u64 as *mut u8, 7, - |b| bytes::copy_memory(b, string.as_bytes())) }; - Atom { - data: (Inline as u64) | (string.len() as u64 << 4) | (string_data << 8), - } - } - - #[inline] - fn from_dynamic(string: &str) -> Atom { - static mut START: Once = ONCE_INIT; - - unsafe { - START.doit(|| { - let cache = box Mutex::new(StringCache::new()); - 
global_string_cache_ptr = mem::transmute(cache); - }); - } - - let mut string_cache = unsafe { &*global_string_cache_ptr }.lock(); - let hash_value_address = string_cache.add(string); - Atom { - data: hash_value_address | Dynamic as u64 - } - } - - #[inline] - fn get_type(&self) -> AtomType { - unsafe { mem::transmute((self.data & 0xf) as u8) } - } - - #[inline] - fn get_type_and_inline_len(&self) -> (AtomType, uint) { - let atom_type = self.get_type(); - let len = match atom_type { - Static | Dynamic => 0, - Inline => ((self.data & 0xf0) >> 4) as uint - }; - (atom_type, len) - } -} - -impl Clone for Atom { - fn clone(&self) -> Atom { - let atom_type = self.get_type(); - match atom_type { - Dynamic => { - let hash_value = unsafe { &mut *(self.data as *mut StringCacheEntry) }; - hash_value.ref_count.fetch_add(1, SeqCst); - } - _ => {} - } - Atom { - data: self.data - } - } -} - -impl Equiv for Atom { - fn equiv(&self, atom_id: &StaticAtom) -> bool { - self.get_type() == Static && self.data >> STATIC_SHIFT_BITS == *atom_id as u64 - } -} - -impl Drop for Atom { - fn drop(&mut self) { - match self.get_type() { - Dynamic => { - let mut string_cache = unsafe { &*global_string_cache_ptr }.lock(); - string_cache.remove(self.data); - }, - _ => {} - } - } -} - -impl fmt::Show for Atom { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Atom('{:s}' type={:?})", self.as_slice(), self.get_type()) - } -} - -impl PartialOrd for Atom { - fn partial_cmp(&self, other: &Atom) -> Option { - self.data.partial_cmp(&other.data) - } - - fn lt(&self, other: &Atom) -> bool { - if self.data == other.data { - return false; - } - self.as_slice() < other.as_slice() - } -} - -impl Ord for Atom { - fn cmp(&self, other: &Atom) -> Ordering { - if self.data == other.data { - return Equal; - } - self.as_slice().cmp(&other.as_slice()) - } -} - -#[cfg(test)] -mod tests { - use std::task::spawn; - use super::{Atom, Static, Inline, Dynamic}; - use static_atoms::atom; - use 
test::Bencher; - - #[test] - fn test_as_slice() { - let s0 = Atom::from_slice(""); - assert!(s0.as_slice() == ""); - - let s1 = Atom::from_slice("class"); - assert!(s1.as_slice() == "class"); - - let i0 = Atom::from_slice("blah"); - assert!(i0.as_slice() == "blah"); - - let s0 = Atom::from_slice("BLAH"); - assert!(s0.as_slice() == "BLAH"); - - let d0 = Atom::from_slice("zzzzzzzzzz"); - assert!(d0.as_slice() == "zzzzzzzzzz"); - - let d1 = Atom::from_slice("ZZZZZZZZZZ"); - assert!(d1.as_slice() == "ZZZZZZZZZZ"); - } - - #[test] - fn test_types() { - let s0 = Atom::from_slice(""); - assert!(s0.get_type_and_inline_len() == (Static, 0)); - - let s1 = Atom::from_slice("id"); - assert!(s1.get_type_and_inline_len() == (Static, 0)); - - let i0 = Atom::from_slice("z"); - assert!(i0.get_type_and_inline_len() == (Inline, 1)); - - let i1 = Atom::from_slice("zz"); - assert!(i1.get_type_and_inline_len() == (Inline, 2)); - - let i2 = Atom::from_slice("zzz"); - assert!(i2.get_type_and_inline_len() == (Inline, 3)); - - let i3 = Atom::from_slice("zzzz"); - assert!(i3.get_type_and_inline_len() == (Inline, 4)); - - let i4 = Atom::from_slice("zzzzz"); - assert!(i4.get_type_and_inline_len() == (Inline, 5)); - - let i5 = Atom::from_slice("zzzzzz"); - assert!(i5.get_type_and_inline_len() == (Inline, 6)); - - let i6 = Atom::from_slice("zzzzzzz"); - assert!(i6.get_type_and_inline_len() == (Inline, 7)); - - let d0 = Atom::from_slice("zzzzzzzz"); - assert!(d0.get_type_and_inline_len() == (Dynamic, 0)); - - let d1 = Atom::from_slice("zzzzzzzzzzzzz"); - assert!(d1.get_type_and_inline_len() == (Dynamic, 0)); - } - - #[test] - fn test_equality() { - let s0 = Atom::from_slice("fn"); - let s1 = Atom::from_slice("fn"); - let s2 = Atom::from_slice("loop"); - - let i0 = Atom::from_slice("blah"); - let i1 = Atom::from_slice("blah"); - let i2 = Atom::from_slice("blah2"); - - let d0 = Atom::from_slice("zzzzzzzz"); - let d1 = Atom::from_slice("zzzzzzzz"); - let d2 = Atom::from_slice("zzzzzzzzz"); - - 
assert!(s0 == s1); - assert!(s0 != s2); - - assert!(i0 == i1); - assert!(i0 != i2); - - assert!(d0 == d1); - assert!(d0 != d2); - - assert!(s0 != i0); - assert!(s0 != d0); - assert!(i0 != d0); - } - - #[test] - fn ord() { - fn check(x: &str, y: &str) { - assert_eq!(x < y, Atom::from_slice(x) < Atom::from_slice(y)); - assert_eq!(x.cmp(&y), Atom::from_slice(x).cmp(&Atom::from_slice(y))); - } - - check("a", "body"); - check("asdf", "body"); - check("zasdf", "body"); - check("z", "body"); - - check("a", "bbbbb"); - check("asdf", "bbbbb"); - check("zasdf", "bbbbb"); - check("z", "bbbbb"); - } - - #[test] - fn clone() { - let s0 = Atom::from_slice("fn"); - let s1 = s0.clone(); - let s2 = Atom::from_slice("loop"); - - let i0 = Atom::from_slice("blah"); - let i1 = i0.clone(); - let i2 = Atom::from_slice("blah2"); - - let d0 = Atom::from_slice("zzzzzzzz"); - let d1 = d0.clone(); - let d2 = Atom::from_slice("zzzzzzzzz"); - - assert!(s0 == s1); - assert!(s0 != s2); - - assert!(i0 == i1); - assert!(i0 != i2); - - assert!(d0 == d1); - assert!(d0 != d2); - - assert!(s0 != i0); - assert!(s0 != d0); - assert!(i0 != d0); - } - - #[test] - fn test_equiv() { - let s0 = Atom::from_slice("div"); - assert!(s0.equiv(&atom::Div)); - - let s1 = Atom::from_slice("Div"); - assert!(!s1.equiv(&atom::Div)); - } - - #[test] - fn test_threads() { - for _ in range(0u32, 100u32) { - spawn(proc() { - let _ = Atom::from_slice("a dynamic string"); - let _ = Atom::from_slice("another string"); - }); - } - } - - #[bench] - fn bench_strings(b: &mut Bencher) { - let mut strings0 = vec!(); - let mut strings1 = vec!(); - - for _ in range(0u32, 1000u32) { - strings0.push("a"); - strings1.push("b"); - } - - let mut eq_count = 0u32; - - b.iter(|| { - for (s0, s1) in strings0.iter().zip(strings1.iter()) { - if s0 == s1 { - eq_count += 1; - } - } - }); - } - - #[bench] - fn bench_atoms(b: &mut Bencher) { - let mut atoms0 = vec!(); - let mut atoms1 = vec!(); - - for _ in range(0u32, 1000u32) { - 
atoms0.push(Atom::from_slice("a")); - atoms1.push(Atom::from_slice("b")); - } - - let mut eq_count = 0u32; - - b.iter(|| { - for (a0, a1) in atoms0.iter().zip(atoms1.iter()) { - if a0 == a1 { - eq_count += 1; - } - } - }); - } -} diff --git a/configure b/configure deleted file mode 100755 index 62a0f4c..0000000 --- a/configure +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -SRCDIR="$(cd $(dirname $0) && pwd)" -sed "s#%VPATH%#${SRCDIR}#" ${SRCDIR}/Makefile.in > Makefile diff --git a/examples/simple.rs b/examples/simple.rs new file mode 100644 index 0000000..f063b06 --- /dev/null +++ b/examples/simple.rs @@ -0,0 +1,26 @@ + + +use string_cache::DefaultAtom; + +fn main() { + let mut interned_stuff = Vec::new(); + let text = "here is a sentence of text that will be tokenised and interned and some repeated \ + tokens is of text and"; + for word in text.split_whitespace() { + let seen_before = interned_stuff + .iter() + // We can use impl PartialEq<T> where T is anything string-like to compare + // interned strings to either other interned strings or actual strings. Comparing two + // interned strings is very fast (normally a single cpu operation).
+ .filter(|interned_word| interned_word == &word) + .count(); + if seen_before > 0 { + println!(r#"Seen the word "{}" {} times"#, word, seen_before); + } else { + println!(r#"Not seen the word "{}" before"#, word); + } + // We use the impl From<T> for Atom (T being Cow<'a, str>, &'a str, or String) to intern a + // new string + interned_stuff.push(DefaultAtom::from(word)); + } +} diff --git a/integration-tests/Cargo.toml b/integration-tests/Cargo.toml new file mode 100644 index 0000000..4562747 --- /dev/null +++ b/integration-tests/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "integration_tests" +version = "0.0.1" +authors = [ "The Servo Project Developers" ] +build = "build.rs" +publish = false +edition = "2018" + +[lib] +doctest = false +test = true + +[features] + +# Use unstable features to optimize space and time (memory and CPU usage). +unstable = [] + +[dependencies] +string_cache = { version = "0.9", path = ".." } + +[dev-dependencies] +rand = { version = "0.8", features = ["small_rng"] } +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } + +[build-dependencies] +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } diff --git a/integration-tests/build.rs b/integration-tests/build.rs new file mode 100644 index 0000000..6293e4c --- /dev/null +++ b/integration-tests/build.rs @@ -0,0 +1,26 @@ +use string_cache_codegen; + +use std::env; +use std::path::Path; + +fn main() { + string_cache_codegen::AtomType::new("TestAtom", "test_atom!") + .atoms(&[ + "a", + "b", + "address", + "defaults", + "area", + "body", + "font-weight", + "br", + "html", + "head", + "id", + "❤", + "❤💯", + "❤💯❤💯", + ]) + .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("test_atom.rs")) + .unwrap() +} diff --git a/integration-tests/src/bench.rs b/integration-tests/src/bench.rs new file mode 100644 index 0000000..45e7199 --- /dev/null +++ b/integration-tests/src/bench.rs @@ -0,0 +1,212 @@ +// Copyright 2014 The Servo Project Developers.
See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/* + +A cautionary note about these benchmarks: + +Many of the operations we're attempting to measure take less than one +nanosecond. That's why we run them thousands of times in a loop just to get a +single iteration that Rust's statistical benchmarking can work with. At that +scale, any change anywhere in the library can produce durable performance +regressions on the order of half a nanosecond, i.e. "500 ns" in the output for +a test like eq_x_1000. + +We can't get anything done if we ratchet on these numbers! They are more useful +for selecting between alternatives, and for noticing large regressions or +inconsistencies. + +Furthermore, a large part of the point of interning is to make strings small +and cheap to move around, which isn't reflected in these tests. + +*/ +use crate::TestAtom; + +use test::{black_box, Bencher}; + +// Just shorthand +fn mk(x: &str) -> TestAtom { + TestAtom::from(x) +} + +macro_rules! check_type (($name:ident, $x:expr) => ( + // NB: "cargo bench" does not run these! + #[test] + fn $name() { + assert!($x, "atom has wrong type"); + } +)); + +macro_rules! bench_tiny_op (($name:ident, $op:ident, $ctor_x:expr, $ctor_y:expr) => ( + #[bench] + fn $name(b: &mut Bencher) { + const n: usize = 1000; + let xs: Vec<_> = repeat($ctor_x).take(n).collect(); + let ys: Vec<_> = repeat($ctor_y).take(n).collect(); + + b.iter(|| { + for (x, y) in xs.iter().zip(ys.iter()) { + black_box(x.$op(y)); + } + }); + } +)); + +macro_rules!
bench_one ( + (x_static $x:expr, $y:expr) => (check_type!(check_type_x, $x.is_static());); + (x_inline $x:expr, $y:expr) => (check_type!(check_type_x, $x.is_inline());); + (x_dynamic $x:expr, $y:expr) => (check_type!(check_type_x, $x.is_dynamic());); + (y_static $x:expr, $y:expr) => (check_type!(check_type_y, $y.is_static());); + (y_inline $x:expr, $y:expr) => (check_type!(check_type_y, $y.is_inline());); + (y_dynamic $x:expr, $y:expr) => (check_type!(check_type_y, $y.is_dynamic());); + (is_static $x:expr, $y:expr) => (bench_one!(x_static $x, $y); bench_one!(y_static $x, $y);); + (is_inline $x:expr, $y:expr) => (bench_one!(x_inline $x, $y); bench_one!(y_inline $x, $y);); + (is_dynamic $x:expr, $y:expr) => (bench_one!(x_dynamic $x, $y); bench_one!(y_dynamic $x, $y);); + + (eq $x:expr, $_y:expr) => (bench_tiny_op!(eq_x_1000, eq, $x, $x);); + (ne $x:expr, $y:expr) => (bench_tiny_op!(ne_x_1000, ne, $x, $y);); + (lt $x:expr, $y:expr) => (bench_tiny_op!(lt_x_1000, lt, $x, $y);); + + (intern $x:expr, $_y:expr) => ( + #[bench] + fn intern(b: &mut Bencher) { + let x = $x.to_string(); + b.iter(|| { + black_box(TestAtom::from(&*x)); + }); + } + ); + + (as_ref $x:expr, $_y:expr) => ( + #[bench] + fn as_ref_x_1000(b: &mut Bencher) { + let x = $x; + b.iter(|| { + for _ in 0..1000 { + black_box(x.as_ref()); + } + }); + } + ); + + (clone $x:expr, $_y:expr) => ( + #[bench] + fn clone_x_1000(b: &mut Bencher) { + let x = $x; + b.iter(|| { + for _ in 0..1000 { + black_box(x.clone()); + } + }); + } + ); + + (clone_string $x:expr, $_y:expr) => ( + #[bench] + fn clone_x_1000(b: &mut Bencher) { + let x = $x.to_string(); + b.iter(|| { + for _ in 0..1000 { + black_box(x.clone()); + } + }); + } + ); +); + +macro_rules! bench_all ( + ([ $($which:ident)+ ] for $name:ident = $x:expr, $y:expr) => ( + // FIXME: This module works around rust-lang/rust#12249 so we don't + // have to repeat the names for eq and neq. 
+ mod $name { + #![allow(unused_imports)] + + use test::{Bencher, black_box}; + use std::string::ToString; + use std::iter::repeat; + + use crate::TestAtom; + + use super::mk; + + $( + bench_one!($which $x, $y); + )+ + } + ); +); + +pub const longer_dynamic_a: &'static str = + "Thee Silver Mt. Zion Memorial Orchestra & Tra-La-La Band"; +pub const longer_dynamic_b: &'static str = + "Thee Silver Mt. Zion Memorial Orchestra & Tra-La-La Ban!"; + +bench_all!([eq ne lt clone_string] for short_string = "e", "f"); +bench_all!([eq ne lt clone_string] for medium_string = "xyzzy01", "xyzzy02"); +bench_all!([eq ne lt clone_string] + for longer_string = super::longer_dynamic_a, super::longer_dynamic_b); + +bench_all!([eq ne intern as_ref clone is_static lt] + for static_atom = test_atom!("defaults"), test_atom!("font-weight")); + +bench_all!([intern as_ref clone is_inline] + for short_inline_atom = mk("e"), mk("f")); + +bench_all!([eq ne intern as_ref clone is_inline lt] + for medium_inline_atom = mk("xyzzy01"), mk("xyzzy02")); + +bench_all!([intern as_ref clone is_dynamic] + for min_dynamic_atom = mk("xyzzy001"), mk("xyzzy002")); + +bench_all!([eq ne intern as_ref clone is_dynamic lt] + for longer_dynamic_atom = mk(super::longer_dynamic_a), mk(super::longer_dynamic_b)); + +bench_all!([intern as_ref clone is_static] + for static_at_runtime = mk("defaults"), mk("font-weight")); + +bench_all!([ne lt x_static y_inline] + for static_vs_inline = test_atom!("defaults"), mk("f")); + +bench_all!([ne lt x_static y_dynamic] + for static_vs_dynamic = test_atom!("defaults"), mk(super::longer_dynamic_b)); + +bench_all!([ne lt x_inline y_dynamic] + for inline_vs_dynamic = mk("e"), mk(super::longer_dynamic_b)); + +macro_rules! 
bench_rand ( ($name:ident, $len:expr) => ( + #[bench] + fn $name(b: &mut Bencher) { + use std::str; + use rand; + use rand::{RngCore, SeedableRng}; + + let mut gen = rand::rngs::SmallRng::from_entropy(); + b.iter(|| { + // We have to generate new atoms on every iter, because + // the dynamic atom table isn't reset. + // + // I measured the overhead of random string generation + // as about 3-12% at one point. + + let mut buf: [u8; $len] = [0; $len]; + gen.fill_bytes(&mut buf); + for n in buf.iter_mut() { + // shift into printable ASCII + *n = (*n % 0x40) + 0x20; + } + let s = str::from_utf8(&buf[..]).unwrap(); + black_box(TestAtom::from(s)); + }); + } +)); + +bench_rand!(intern_rand_008, 8); +bench_rand!(intern_rand_032, 32); +bench_rand!(intern_rand_128, 128); +bench_rand!(intern_rand_512, 512); diff --git a/integration-tests/src/common-usage.rs b/integration-tests/src/common-usage.rs new file mode 100644 index 0000000..7b7380a --- /dev/null +++ b/integration-tests/src/common-usage.rs @@ -0,0 +1,19 @@ +/// Test common usage by popular dependents (html5ever, lalrpop, browserlists-rs), to ensure no API-surface breaking changes +/// Created after https://github.com/servo/string-cache/issues/271 +use std::collections::HashMap; + +use crate::Atom; +use crate::TestAtom; + +#[test] +fn usage_with_hashmap() { + let mut map: HashMap = HashMap::new(); + + map.insert(test_atom!("area"), 1); + map.insert("str_into".into(), 2); + map.insert("atom_from".into(), 3); + + assert_eq!(map.get(&"area".into()).unwrap(), &1); + assert_eq!(map.get(&"str_into".into()).unwrap(), &2); + assert_eq!(map.get(&Atom::from("atom_from")).unwrap(), &3); +} diff --git a/integration-tests/src/lib.rs b/integration-tests/src/lib.rs new file mode 100644 index 0000000..a788d93 --- /dev/null +++ b/integration-tests/src/lib.rs @@ -0,0 +1,316 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg(test)] +#![deny(warnings)] +#![allow(non_upper_case_globals)] +#![cfg_attr(feature = "unstable", feature(test))] + +#[cfg(feature = "unstable")] +extern crate test; + +use std::thread; +use string_cache::StaticAtomSet; + +include!(concat!(env!("OUT_DIR"), "/test_atom.rs")); +pub type Atom = TestAtom; + +#[test] +fn test_as_slice() { + let s0 = Atom::from(""); + assert!(s0.as_ref() == ""); + + let s1 = Atom::from("class"); + assert!(s1.as_ref() == "class"); + + let i0 = Atom::from("blah"); + assert!(i0.as_ref() == "blah"); + + let s0 = Atom::from("BLAH"); + assert!(s0.as_ref() == "BLAH"); + + let d0 = Atom::from("zzzzzzzzzz"); + assert!(d0.as_ref() == "zzzzzzzzzz"); + + let d1 = Atom::from("ZZZZZZZZZZ"); + assert!(d1.as_ref() == "ZZZZZZZZZZ"); +} + +#[test] +fn test_types() { + assert!(Atom::from("").is_static()); + assert!(Atom::from("defaults").is_static()); + assert!(Atom::from("font-weight").is_static()); + assert!(Atom::from("id").is_inline()); + assert!(Atom::from("body").is_inline()); + assert!(Atom::from("a").is_inline()); + assert!(Atom::from("address").is_inline()); + assert!(Atom::from("c").is_inline()); + assert!(Atom::from("zz").is_inline()); + assert!(Atom::from("zzz").is_inline()); + assert!(Atom::from("zzzz").is_inline()); + assert!(Atom::from("zzzzz").is_inline()); + assert!(Atom::from("zzzzzz").is_inline()); + assert!(Atom::from("zzzzzzz").is_inline()); + assert!(Atom::from("zzzzzzzz").is_dynamic()); + assert!(Atom::from("zzzzzzzzzzzzz").is_dynamic()); +} + +#[test] +fn test_equality() { + let s0 = Atom::from("fn"); + let s1 = Atom::from("fn"); + let s2 = Atom::from("loop"); + + let i0 = Atom::from("blah"); + let i1 = Atom::from("blah"); + let i2 = Atom::from("blah2"); + + let d0 = Atom::from("zzzzzzzz"); + let d1 = Atom::from("zzzzzzzz"); + let d2 = 
Atom::from("zzzzzzzzz"); + + assert!(s0 == s1); + assert!(s0 != s2); + + assert!(i0 == i1); + assert!(i0 != i2); + + assert!(d0 == d1); + assert!(d0 != d2); + + assert!(s0 != i0); + assert!(s0 != d0); + assert!(i0 != d0); +} + +#[test] +fn default() { + assert_eq!(TestAtom::default(), test_atom!("")); + assert_eq!(&*TestAtom::default(), ""); +} + +#[test] +fn ord() { + fn check(x: &str, y: &str) { + assert_eq!(x < y, Atom::from(x) < Atom::from(y)); + assert_eq!(x.cmp(y), Atom::from(x).cmp(&Atom::from(y))); + assert_eq!(x.partial_cmp(y), Atom::from(x).partial_cmp(&Atom::from(y))); + } + + check("a", "body"); + check("asdf", "body"); + check("zasdf", "body"); + check("z", "body"); + + check("a", "bbbbb"); + check("asdf", "bbbbb"); + check("zasdf", "bbbbb"); + check("z", "bbbbb"); +} + +#[test] +fn clone() { + let s0 = Atom::from("fn"); + let s1 = s0.clone(); + let s2 = Atom::from("loop"); + + let i0 = Atom::from("blah"); + let i1 = i0.clone(); + let i2 = Atom::from("blah2"); + + let d0 = Atom::from("zzzzzzzz"); + let d1 = d0.clone(); + let d2 = Atom::from("zzzzzzzzz"); + + assert!(s0 == s1); + assert!(s0 != s2); + + assert!(i0 == i1); + assert!(i0 != i2); + + assert!(d0 == d1); + assert!(d0 != d2); + + assert!(s0 != i0); + assert!(s0 != d0); + assert!(i0 != d0); +} + +macro_rules! assert_eq_fmt (($fmt:expr, $x:expr, $y:expr) => ({ + let x = $x; + let y = $y; + if x != y { + panic!("assertion failed: {} != {}", + format_args!($fmt, x), + format_args!($fmt, y)); + } +})); + +#[test] +fn repr() { + fn check(s: &str, data: u64) { + assert_eq_fmt!("0x{:016X}", Atom::from(s).unsafe_data(), data); + } + + fn check_static(s: &str, x: Atom) { + assert_eq_fmt!("0x{:016X}", x.unsafe_data(), Atom::from(s).unsafe_data()); + assert_eq!(0x2, x.unsafe_data() & 0xFFFF_FFFF); + // The index is unspecified by phf. 
+ assert!((x.unsafe_data() >> 32) <= TestAtomStaticSet::get().atoms.len() as u64); + } + + // This test is here to make sure we don't change atom representation + // by accident. It may need adjusting if there are changes to the + // static atom table, the tag values, etc. + + // Static atoms + check_static("defaults", test_atom!("defaults")); + check_static("font-weight", test_atom!("font-weight")); + + // Inline atoms + check("a", 0x0000_0000_0000_6111); + check("address", 0x7373_6572_6464_6171); + check("area", 0x0000_0061_6572_6141); + check("e", 0x0000_0000_0000_6511); + check("xyzzy", 0x0000_797A_7A79_7851); + check("xyzzy01", 0x3130_797A_7A79_7871); + + // Dynamic atoms. This is a pointer so we can't verify every bit. + assert_eq!(0x00, Atom::from("a dynamic string").unsafe_data() & 0xf); +} + +#[test] +fn test_threads() { + for _ in 0_u32..100 { + thread::spawn(move || { + let _ = Atom::from("a dynamic string"); + let _ = Atom::from("another string"); + }); + } +} + +#[test] +fn atom_macro() { + assert_eq!(test_atom!("a"), Atom::from("a")); + assert_eq!(test_atom!("body"), Atom::from("body")); + assert_eq!(test_atom!("address"), Atom::from("address")); + assert_eq!(test_atom!("❤"), Atom::from("❤")); + assert_eq!(test_atom!("❤💯"), Atom::from("❤💯")); + assert_eq!(test_atom!("font-weight"), Atom::from("font-weight")); + assert_eq!(test_atom!("❤💯❤💯"), Atom::from("❤💯❤💯")); +} + +#[test] +fn match_atom() { + assert_eq!( + 2, + match Atom::from("head") { + test_atom!("br") => 1, + test_atom!("html") | test_atom!("head") => 2, + _ => 3, + } + ); + + assert_eq!( + 3, + match Atom::from("body") { + test_atom!("br") => 1, + test_atom!("html") | test_atom!("head") => 2, + _ => 3, + } + ); + + assert_eq!( + 3, + match Atom::from("zzzzzz") { + test_atom!("br") => 1, + test_atom!("html") | test_atom!("head") => 2, + _ => 3, + } + ); +} + +#[test] +fn ensure_deref() { + // Ensure we can Deref to a &str + let atom = Atom::from("foobar"); + let _: &str = &atom; +} + +#[test] 
+fn ensure_as_ref() { + // Ensure we can as_ref to a &str + let atom = Atom::from("foobar"); + let _: &str = atom.as_ref(); +} + +#[test] +fn test_ascii_lowercase() { + assert_eq!(Atom::from("").to_ascii_lowercase(), Atom::from("")); + assert_eq!(Atom::from("aZ9").to_ascii_lowercase(), Atom::from("az9")); + assert_eq!( + Atom::from("The Quick Brown Fox!").to_ascii_lowercase(), + Atom::from("the quick brown fox!") + ); + assert_eq!( + Atom::from("JE VAIS À PARIS").to_ascii_lowercase(), + Atom::from("je vais À paris") + ); +} + +#[test] +fn test_ascii_uppercase() { + assert_eq!(Atom::from("").to_ascii_uppercase(), Atom::from("")); + assert_eq!(Atom::from("aZ9").to_ascii_uppercase(), Atom::from("AZ9")); + assert_eq!( + Atom::from("The Quick Brown Fox!").to_ascii_uppercase(), + Atom::from("THE QUICK BROWN FOX!") + ); + assert_eq!( + Atom::from("Je vais à Paris").to_ascii_uppercase(), + Atom::from("JE VAIS à PARIS") + ); +} + +#[test] +fn test_eq_ignore_ascii_case() { + assert!(Atom::from("").eq_ignore_ascii_case(&Atom::from(""))); + assert!(Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("aZ9"))); + assert!(Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("Az9"))); + assert!(Atom::from("The Quick Brown Fox!") + .eq_ignore_ascii_case(&Atom::from("THE quick BROWN fox!"))); + assert!(Atom::from("Je vais à Paris").eq_ignore_ascii_case(&Atom::from("je VAIS à PARIS"))); + assert!(!Atom::from("").eq_ignore_ascii_case(&Atom::from("az9"))); + assert!(!Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from(""))); + assert!(!Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("9Za"))); + assert!(!Atom::from("The Quick Brown Fox!") + .eq_ignore_ascii_case(&Atom::from("THE quick BROWN fox!!"))); + assert!(!Atom::from("Je vais à Paris").eq_ignore_ascii_case(&Atom::from("JE vais À paris"))); +} + +#[test] +fn test_from_string() { + assert!(Atom::from("camembert".to_owned()) == Atom::from("camembert")); +} + +#[test] +fn test_try_static() { + 
assert!(Atom::try_static("defaults").is_some()); + assert!(Atom::try_static("head").is_none()); + assert!(Atom::try_static("not in the static table").is_none()); +} + +#[cfg(test)] +#[path = "common-usage.rs"] +mod common_usage; + +#[cfg(all(test, feature = "unstable"))] +#[path = "bench.rs"] +mod bench; diff --git a/lib.rs b/lib.rs deleted file mode 100644 index 0f958fc..0000000 --- a/lib.rs +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -#![crate_name = "string_cache"] -#![crate_type = "rlib"] - -#![feature(phase, macro_rules)] - -extern crate sync; -extern crate debug; - -#[cfg(test)] -extern crate test; - -#[phase(plugin)] -extern crate phf_mac; -extern crate phf; - -pub mod atom; -pub mod static_atoms; diff --git a/src/atom.rs b/src/atom.rs new file mode 100644 index 0000000..5a8aa7f --- /dev/null +++ b/src/atom.rs @@ -0,0 +1,415 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use crate::dynamic_set::{dynamic_set, Entry}; +use crate::static_sets::StaticAtomSet; +use debug_unreachable::debug_unreachable; + +use std::borrow::Cow; +use std::cmp::Ordering::{self, Equal}; +use std::fmt; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; +use std::mem; +use std::num::NonZeroU64; +use std::ops; +use std::slice; +use std::str; +use std::sync::atomic::Ordering::SeqCst; + +const DYNAMIC_TAG: u8 = 0b_00; +const INLINE_TAG: u8 = 0b_01; // len in upper nybble +const STATIC_TAG: u8 = 0b_10; +const TAG_MASK: u64 = 0b_11; +const LEN_OFFSET: u64 = 4; +const LEN_MASK: u64 = 0xF0; + +const MAX_INLINE_LEN: usize = 7; +const STATIC_SHIFT_BITS: usize = 32; + +/// Represents a string that has been interned. +/// +/// While the type definition for `Atom` indicates that it is generic on a particular +/// implementation of an atom set, you don't need to worry about this. Atoms can be static +/// and come from a `StaticAtomSet` generated by the `string_cache_codegen` crate, or they +/// can be dynamic and created by you on an `EmptyStaticAtomSet`. +/// +/// `Atom` implements `Clone` but not `Copy`, since internally atoms are reference-counted; +/// this means that you may need to `.clone()` an atom to keep copies of it in different +/// places, or when passing it to a function that takes an `Atom` rather than an `&Atom`. +/// +/// ## Creating an atom at runtime +/// +/// If you use `string_cache_codegen` to generate a precomputed list of atoms, your code +/// may then do something like read data from somewhere and extract tokens that need to be +/// compared to the atoms. In this case, you can use `Atom::from(&str)` or +/// `Atom::from(String)`. These create a reference-counted atom which will be +/// automatically freed when all references to it are dropped.
+/// +/// This means that your application can safely have a loop which tokenizes data, creates +/// atoms from the tokens, and compares the atoms to a predefined set of keywords, without +/// running the risk of arbitrary memory consumption from creating large numbers of atoms — +/// as long as your application does not store clones of the atoms it creates along the +/// way. +/// +/// For example, the following is safe and will not consume arbitrary amounts of memory: +/// +/// ```ignore +/// let untrusted_data = "large amounts of text ..."; +/// +/// for token in untrusted_data.split_whitespace() { +/// let atom = Atom::from(token); // interns the string +/// +/// if atom == Atom::from("keyword") { +/// // handle that keyword +/// } else if atom == Atom::from("another_keyword") { +/// // handle that keyword +/// } else { +/// println!("unknown keyword"); +/// } +/// } // atom is dropped here, so it is not kept around in memory +/// ``` +#[derive(PartialEq, Eq)] +// NOTE: Deriving PartialEq requires that a given string must always be interned the same way. +pub struct Atom { + unsafe_data: NonZeroU64, + phantom: PhantomData, +} + +// This isn't really correct as the Atoms can technically take up space. 
But I guess it's ok +// as it is possible to measure the size of the atom set separately. +#[cfg(feature = "malloc_size_of")] +impl malloc_size_of::MallocSizeOf for Atom { + fn size_of(&self, _ops: &mut malloc_size_of::MallocSizeOfOps) -> usize { + 0 + } +} + +// FIXME: bound removed from the struct definition because of this error for pack_static: +// "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable" +// https://github.com/rust-lang/rust/issues/57563 +impl Atom { + /// For the atom!() macros + #[inline(always)] + #[doc(hidden)] + pub const fn pack_static(n: u32) -> Self { + Self { + unsafe_data: unsafe { + // STATIC_TAG ensures this is non-zero + NonZeroU64::new_unchecked((STATIC_TAG as u64) | ((n as u64) << STATIC_SHIFT_BITS)) + }, + phantom: PhantomData, + } + } + + /// For the atom!() macros + #[inline(always)] + #[doc(hidden)] + pub const fn pack_inline(mut n: u64, len: u8) -> Self { + if cfg!(target_endian = "big") { + // Reverse order of top 7 bytes. + // Bottom 8 bits of `n` are zero, and we need that to remain so. + // String data is stored in top 7 bytes, tag and length in bottom byte. + n = n.to_le() << 8; + } + + let data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET) | n; + Self { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } + + fn tag(&self) -> u8 { + (self.unsafe_data.get() & TAG_MASK) as u8 + } +} + +impl Atom { + /// Return the internal representation. For testing. + #[doc(hidden)] + pub fn unsafe_data(&self) -> u64 { + self.unsafe_data.get() + } + + /// Return true if this is a static Atom. For testing. + #[doc(hidden)] + pub fn is_static(&self) -> bool { + self.tag() == STATIC_TAG + } + + /// Return true if this is a dynamic Atom. For testing. + #[doc(hidden)] + pub fn is_dynamic(&self) -> bool { + self.tag() == DYNAMIC_TAG + } + + /// Return true if this is an inline Atom. For testing.
+ #[doc(hidden)] + pub fn is_inline(&self) -> bool { + self.tag() == INLINE_TAG + } + + fn static_index(&self) -> u64 { + self.unsafe_data.get() >> STATIC_SHIFT_BITS + } + + /// Get the hash of the string as it is stored in the set. + pub fn get_hash(&self) -> u32 { + match self.tag() { + DYNAMIC_TAG => { + let entry = self.unsafe_data.get() as *const Entry; + unsafe { (*entry).hash } + } + STATIC_TAG => Static::get().hashes[self.static_index() as usize], + INLINE_TAG => { + let data = self.unsafe_data.get(); + // This may or may not be great... + ((data >> 32) ^ data) as u32 + } + _ => unsafe { debug_unreachable!() }, + } + } + + pub fn try_static(string_to_add: &str) -> Option { + Self::try_static_internal(string_to_add).ok() + } + + fn try_static_internal(string_to_add: &str) -> Result { + let static_set = Static::get(); + let hash = phf_shared::hash(&*string_to_add, &static_set.key); + let index = phf_shared::get_index(&hash, static_set.disps, static_set.atoms.len()); + + if static_set.atoms[index as usize] == string_to_add { + Ok(Self::pack_static(index)) + } else { + Err(hash) + } + } +} + +impl Default for Atom { + #[inline] + fn default() -> Self { + Atom::pack_static(Static::empty_string_index()) + } +} + +impl Hash for Atom { + #[inline] + fn hash(&self, state: &mut H) + where + H: Hasher, + { + state.write_u32(self.get_hash()) + } +} + +impl<'a, Static: StaticAtomSet> From> for Atom { + fn from(string_to_add: Cow<'a, str>) -> Self { + let len = string_to_add.len(); + if len == 0 { + Self::pack_static(Static::empty_string_index()) + } else if len <= MAX_INLINE_LEN { + let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); + { + let dest = inline_atom_slice_mut(&mut data); + dest[..len].copy_from_slice(string_to_add.as_bytes()); + } + Atom { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } else { + 
Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { + let ptr: std::ptr::NonNull = dynamic_set().insert(string_to_add, hash.g); + let data = ptr.as_ptr() as u64; + debug_assert!(0 == data & TAG_MASK); + Atom { + // The address of a ptr::NonNull is non-zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + }) + } + } +} + +impl Clone for Atom { + #[inline(always)] + fn clone(&self) -> Self { + if self.tag() == DYNAMIC_TAG { + let entry = self.unsafe_data.get() as *const Entry; + unsafe { &*entry }.ref_count.fetch_add(1, SeqCst); + } + Atom { ..*self } + } +} + +impl Drop for Atom { + #[inline] + fn drop(&mut self) { + if self.tag() == DYNAMIC_TAG { + let entry = self.unsafe_data.get() as *const Entry; + if unsafe { &*entry }.ref_count.fetch_sub(1, SeqCst) == 1 { + drop_slow(self) + } + } + + // Out of line to guide inlining. + fn drop_slow(this: &mut Atom) { + dynamic_set().remove(this.unsafe_data.get() as *mut Entry); + } + } +} + +impl ops::Deref for Atom { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + unsafe { + match self.tag() { + DYNAMIC_TAG => { + let entry = self.unsafe_data.get() as *const Entry; + &(*entry).string + } + INLINE_TAG => { + let len = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET; + debug_assert!(len as usize <= MAX_INLINE_LEN); + let src = inline_atom_slice(&self.unsafe_data); + str::from_utf8_unchecked(src.get_unchecked(..(len as usize))) + } + STATIC_TAG => Static::get().atoms[self.static_index() as usize], + _ => debug_unreachable!(), + } + } + } +} + +impl fmt::Debug for Atom { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let ty_str = unsafe { + match self.tag() { + DYNAMIC_TAG => "dynamic", + INLINE_TAG => "inline", + STATIC_TAG => "static", + _ => debug_unreachable!(), + } + }; + + write!(f, "Atom('{}' type={})", &*self, ty_str) + } +} + +impl PartialOrd for Atom { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + if 
self.unsafe_data == other.unsafe_data { + return Some(Equal); + } + self.as_ref().partial_cmp(other.as_ref()) + } +} + +impl Ord for Atom { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + if self.unsafe_data == other.unsafe_data { + return Equal; + } + self.as_ref().cmp(other.as_ref()) + } +} + +// AsciiExt requires mutating methods, so we just implement the non-mutating ones. +// We don't need to implement is_ascii because there's no performance improvement +// over the one from &str. +impl Atom { + fn from_mutated_str(s: &str, f: F) -> Self { + let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit(); + let buffer = unsafe { &mut *buffer.as_mut_ptr() }; + + if let Some(buffer_prefix) = buffer.get_mut(..s.len()) { + buffer_prefix.copy_from_slice(s.as_bytes()); + let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) }; + f(as_str); + Atom::from(&*as_str) + } else { + let mut string = s.to_owned(); + f(&mut string); + Atom::from(string) + } + } + + /// Like [`to_ascii_uppercase`]. + /// + /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase + pub fn to_ascii_uppercase(&self) -> Self { + for (i, b) in self.bytes().enumerate() { + if let b'a'..=b'z' = b { + return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase()); + } + } + self.clone() + } + + /// Like [`to_ascii_lowercase`]. + /// + /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase + pub fn to_ascii_lowercase(&self) -> Self { + for (i, b) in self.bytes().enumerate() { + if let b'A'..=b'Z' = b { + return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase()); + } + } + self.clone() + } + + /// Like [`eq_ignore_ascii_case`]. 
+ /// + /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case + pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { + (self == other) || self.eq_str_ignore_ascii_case(&**other) + } + + /// Like [`eq_ignore_ascii_case`], but takes an unhashed string as `other`. + /// + /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case + pub fn eq_str_ignore_ascii_case(&self, other: &str) -> bool { + (&**self).eq_ignore_ascii_case(other) + } +} + +#[inline(always)] +fn inline_atom_slice(x: &NonZeroU64) -> &[u8] { + let x: *const NonZeroU64 = x; + let mut data = x as *const u8; + // All except the lowest byte, which is first in little-endian, last in big-endian. + if cfg!(target_endian = "little") { + data = unsafe { data.offset(1) }; + } + let len = 7; + unsafe { slice::from_raw_parts(data, len) } +} + +#[inline(always)] +fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] { + let x: *mut u64 = x; + let mut data = x as *mut u8; + // All except the lowest byte, which is first in little-endian, last in big-endian. + if cfg!(target_endian = "little") { + data = unsafe { data.offset(1) }; + } + let len = 7; + unsafe { slice::from_raw_parts_mut(data, len) } +} diff --git a/src/dynamic_set.rs b/src/dynamic_set.rs new file mode 100644 index 0000000..4442b4d --- /dev/null +++ b/src/dynamic_set.rs @@ -0,0 +1,112 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use parking_lot::Mutex; +use std::borrow::Cow; +use std::mem; +use std::ptr::NonNull; +use std::sync::atomic::AtomicIsize; +use std::sync::atomic::Ordering::SeqCst; +use std::sync::OnceLock; + +const NB_BUCKETS: usize = 1 << 12; // 4096 +const BUCKET_MASK: u32 = (1 << 12) - 1; + +pub(crate) struct Set { + buckets: Box<[Mutex>>]>, +} + +pub(crate) struct Entry { + pub(crate) string: Box, + pub(crate) hash: u32, + pub(crate) ref_count: AtomicIsize, + next_in_bucket: Option>, +} + +// Addresses are multiples of this, +// and therefore have TAG_MASK bits unset, available for tagging. +pub(crate) const ENTRY_ALIGNMENT: usize = 4; + +#[test] +fn entry_alignment_is_sufficient() { + assert!(mem::align_of::() >= ENTRY_ALIGNMENT); +} + +pub(crate) fn dynamic_set() -> &'static Set { + // NOTE: Using const initialization for buckets breaks the small-stack test. + // ``` + // // buckets: [Mutex>>; NB_BUCKETS], + // const MUTEX: Mutex>> = Mutex::new(None); + // let buckets = Box::new([MUTEX; NB_BUCKETS]); + // ``` + static DYNAMIC_SET: OnceLock = OnceLock::new(); + + DYNAMIC_SET.get_or_init(|| { + let buckets = (0..NB_BUCKETS).map(|_| Mutex::new(None)).collect(); + Set { buckets } + }) +} + +impl Set { + pub(crate) fn insert(&self, string: Cow, hash: u32) -> NonNull { + let bucket_index = (hash & BUCKET_MASK) as usize; + let mut linked_list = self.buckets[bucket_index].lock(); + + { + let mut ptr: Option<&mut Box> = linked_list.as_mut(); + + while let Some(entry) = ptr.take() { + if entry.hash == hash && *entry.string == *string { + if entry.ref_count.fetch_add(1, SeqCst) > 0 { + return NonNull::from(&mut **entry); + } + // Uh-oh. The pointer's reference count was zero, which means someone may try + // to free it. (Naive attempts to defend against this, for example having the + // destructor check to see whether the reference count is indeed zero, don't + // work due to ABA.) Thus we need to temporarily add a duplicate string to the + // list.
+ entry.ref_count.fetch_sub(1, SeqCst); + break; + } + ptr = entry.next_in_bucket.as_mut(); + } + } + debug_assert!(mem::align_of::() >= ENTRY_ALIGNMENT); + let string = string.into_owned(); + let mut entry = Box::new(Entry { + next_in_bucket: linked_list.take(), + hash, + ref_count: AtomicIsize::new(1), + string: string.into_boxed_str(), + }); + let ptr = NonNull::from(&mut *entry); + *linked_list = Some(entry); + ptr + } + + pub(crate) fn remove(&self, ptr: *mut Entry) { + let value: &Entry = unsafe { &*ptr }; + let bucket_index = (value.hash & BUCKET_MASK) as usize; + + let mut linked_list = self.buckets[bucket_index].lock(); + debug_assert!(value.ref_count.load(SeqCst) == 0); + let mut current: &mut Option> = &mut linked_list; + + while let Some(entry_ptr) = current.as_mut() { + let entry_ptr: *mut Entry = &mut **entry_ptr; + if entry_ptr == ptr { + mem::drop(mem::replace(current, unsafe { + (*entry_ptr).next_in_bucket.take() + })); + break; + } + current = unsafe { &mut (*entry_ptr).next_in_bucket }; + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..3cc29b1 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,139 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! +//! A library for interning things that are `AsRef`. +//! +//! Some strings may be interned at compile time using the `string-cache-codegen` crate, or the +//! `EmptyStaticAtomSet` may be used that has no compile-time interned strings. An `Atom` is an +//! interned string for a given set (either `EmptyStaticAtomSet` or a generated `StaticAtomSet`). +//! +//! Generated `Atom`s will have associated macros to intern static strings at compile-time. +//! +//! # Examples +//! +//!
Here are two examples, one with compile-time `Atom`s, and one without. +//! +//! ## With compile-time atoms +//! +//! In `Cargo.toml`: +//! ```toml +//! [dependencies] +//! string_cache = "0.9" +//! +//! [build-dependencies] +//! string_cache_codegen = "0.6" +//! ``` +//! +//! In `build.rs`: +//! +//! ```ignore +//! extern crate string_cache_codegen; +//! +//! use std::env; +//! use std::path::Path; +//! +//! fn main() { +//! string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!") +//! .atoms(&["foo", "bar"]) +//! .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs")) +//! .unwrap() +//! } +//! ``` +//! +//! In `lib.rs`: +//! +//! ```ignore +//! extern crate string_cache; +//! +//! mod foo { +//! include!(concat!(env!("OUT_DIR"), "/foo_atom.rs")); +//! } +//! +//! fn use_the_atom(t: &str) { +//! match *t { +//! foo_atom!("foo") => println!("Found foo!"), +//! foo_atom!("bar") => println!("Found bar!"), +//! // foo_atom!("baz") => println!("Found baz!"), - would be a compile time error +//! _ => { +//! println!("String not interned"); +//! // We can intern strings at runtime as well +//! foo::FooAtom::from(t) +//! } +//! } +//! } +//! ``` +//! +//! ## No compile-time atoms +//! +//! ``` +//! # extern crate string_cache; +//! use string_cache::DefaultAtom; +//! +//! # fn main() { +//! let mut interned_stuff = Vec::new(); +//! let text = "here is a sentence of text that will be tokenised and +//! interned and some repeated tokens is of text and"; +//! for word in text.split_whitespace() { +//! let seen_before = interned_stuff.iter() +//! // We can use impl PartialEq where T is anything string-like +//! // to compare interned strings to either other interned strings +//! // or actual strings. Comparing two interned strings is very fast +//! // (normally a single cpu operation). +//! .filter(|interned_word| interned_word == &word) +//! .count(); +//! if seen_before > 0 { +//!
println!(r#"Seen the word "{}" {} times"#, word, seen_before); +//! } else { +//! println!(r#"Not seen the word "{}" before"#, word); +//! } +//! // We use the impl From<(Cow<'a, str>, or &'a str, or String)> for +//! // Atom to intern a new string. +//! interned_stuff.push(DefaultAtom::from(word)); +//! } +//! # } +//! ``` +//! + +#![cfg_attr(test, deny(warnings))] + +// Types, such as Atom, that impl Hash must follow the hash invariant: if two objects match +// with PartialEq, they must also have the same Hash. Clippy warns on types that derive one while +// manually impl-ing the other, because it seems easy for the two to drift apart, causing the +// invariant to be violated. +// +// But Atom is a newtype over NonZeroU64, and probably always will be, since cheap comparisons and +// copying are this library's purpose. So we know what the PartialEq comparison is going to do. +// +// The `get_hash` function, seen in `atom.rs`, consults that number, plus the global string interner +// tables. The only way for the resulting hash for two Atoms with the same inner 64-bit number to +// differ would be if the table entry changed between invocations, and that would be really bad. +#![allow(clippy::derive_hash_xor_eq)] + +mod atom; +mod dynamic_set; +mod static_sets; +mod trivial_impls; + +pub use atom::Atom; +pub use static_sets::{EmptyStaticAtomSet, PhfStrSet, StaticAtomSet}; + +/// Use this if you don’t care about static atoms. +pub type DefaultAtom = Atom; + +// Some minor tests of internal layout here. +// See ../integration-tests for much more. + +/// Guard against accidental changes to the sizes of things. +#[test] +fn assert_sizes() { + use std::mem::size_of; + assert_eq!(size_of::(), 8); + assert_eq!(size_of::>(), size_of::(),); +} diff --git a/src/static_sets.rs b/src/static_sets.rs new file mode 100644 index 0000000..f7f1799 --- /dev/null +++ b/src/static_sets.rs @@ -0,0 +1,64 @@ +// Copyright 2014 The Servo Project Developers. 
See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/// A static `PhfStrSet` +/// +/// This trait is implemented by static sets of interned strings generated using +/// `string_cache_codegen`, and `EmptyStaticAtomSet` for when strings will be added dynamically. +/// +/// It is used by the methods of [`Atom`] to check if a string is present in the static set. +/// +/// [`Atom`]: struct.Atom.html +pub trait StaticAtomSet: Ord { + /// Get the location of the static string set in the binary. + fn get() -> &'static PhfStrSet; + /// Get the index of the empty string, which is in every set and is used for `Atom::default`. + fn empty_string_index() -> u32; +} + +/// A string set created using a [perfect hash function], specifically +/// [Hash, Displace and Compress]. +/// +/// See the CHD document for the meaning of the struct fields. +/// +/// [perfect hash function]: https://en.wikipedia.org/wiki/Perfect_hash_function +/// [Hash, Displace and Compress]: http://cmph.sourceforge.net/papers/esa09.pdf +pub struct PhfStrSet { + #[doc(hidden)] + pub key: u64, + #[doc(hidden)] + pub disps: &'static [(u32, u32)], + #[doc(hidden)] + pub atoms: &'static [&'static str], + #[doc(hidden)] + pub hashes: &'static [u32], +} + +/// An empty static atom set for when only dynamic strings will be added +#[derive(PartialEq, Eq, PartialOrd, Ord)] +pub struct EmptyStaticAtomSet; + +impl StaticAtomSet for EmptyStaticAtomSet { + fn get() -> &'static PhfStrSet { + // The name is a lie: this set is not empty (it contains the empty string) + // but that’s only to avoid divisions by zero in rust-phf. + static SET: PhfStrSet = PhfStrSet { + key: 0, + disps: &[(0, 0)], + atoms: &[""], + // "" SipHash'd, and xored with u64_hash_to_u32. 
+ hashes: &[0x3ddddef3], + }; + &SET + } + + fn empty_string_index() -> u32 { + 0 + } +} diff --git a/src/trivial_impls.rs b/src/trivial_impls.rs new file mode 100644 index 0000000..960dde0 --- /dev/null +++ b/src/trivial_impls.rs @@ -0,0 +1,119 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use crate::{Atom, StaticAtomSet}; +#[cfg(feature = "serde_support")] +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use std::borrow::Cow; +use std::fmt; + +impl ::precomputed_hash::PrecomputedHash for Atom { + fn precomputed_hash(&self) -> u32 { + self.get_hash() + } +} + +impl<'a, Static: StaticAtomSet> From<&'a Atom> for Atom { + fn from(atom: &'a Self) -> Self { + atom.clone() + } +} + +impl PartialEq for Atom { + fn eq(&self, other: &str) -> bool { + &self[..] == other + } +} + +impl PartialEq> for str { + fn eq(&self, other: &Atom) -> bool { + self == &other[..] + } +} + +impl PartialEq for Atom { + fn eq(&self, other: &String) -> bool { + self[..] == other[..] 
+ } +} + +impl<'a, Static: StaticAtomSet> From<&'a str> for Atom { + #[inline] + fn from(string_to_add: &str) -> Self { + Atom::from(Cow::Borrowed(string_to_add)) + } +} + +impl From for Atom { + #[inline] + fn from(string_to_add: String) -> Self { + Atom::from(Cow::Owned(string_to_add)) + } +} + +impl fmt::Display for Atom { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + ::fmt(self, f) + } +} + +impl AsRef for Atom { + fn as_ref(&self) -> &str { + self + } +} + +#[cfg(feature = "serde_support")] +impl Serialize for Atom { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let string: &str = self.as_ref(); + string.serialize(serializer) + } +} + +#[cfg(feature = "serde_support")] +impl<'a, Static: StaticAtomSet> Deserialize<'a> for Atom { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'a>, + { + use serde::de; + use std::marker::PhantomData; + + struct AtomVisitor(PhantomData); + + impl<'de, Static: StaticAtomSet> de::Visitor<'de> for AtomVisitor { + type Value = Atom; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "an Atom") + } + + fn visit_str(self, v: &str) -> Result + where + E: de::Error, + { + Ok(Atom::from(v)) + } + + fn visit_string(self, v: String) -> Result + where + E: de::Error, + { + Ok(Atom::from(v)) + } + } + + deserializer.deserialize_str(AtomVisitor(PhantomData)) + } +} diff --git a/static_atoms.rs b/static_atoms.rs deleted file mode 100644 index 6fc5e41..0000000 --- a/static_atoms.rs +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! A list of static atoms that are pre-hashed at compile time. 
- -pub mod atom { - use phf::PhfOrderedMap; - use std::from_str::FromStr; - - #[repr(u32)] - #[deriving(Eq, PartialEq)] - pub enum StaticAtom { - EmptyString, - Id, - Class, - Href, - Style, - Span, - Width, - Height, - Type, - Data, - New, - Name, - Src, - Rel, - Div, - } - - static STATIC_ATOMS: PhfOrderedMap<&'static str, StaticAtom> = phf_ordered_map!( - "" => EmptyString, - "id" => Id, - "class" => Class, - "href" => Href, - "style" => Style, - "span" => Span, - "width" => Width, - "height" => Height, - "type" => Type, - "data" => Data, - "new" => New, - "name" => Name, - "src" => Src, - "rel" => Rel, - "div" => Div, - ); - - impl FromStr for StaticAtom { - #[inline] - fn from_str(string: &str) -> Option { - match STATIC_ATOMS.find_equiv(&string) { - None => None, - Some(&k) => Some(k) - } - } - } - - impl StaticAtom { - pub fn as_slice(&self) -> &'static str { - let &(string, _) = STATIC_ATOMS.entries().idx(*self as uint).unwrap(); - string - } - } -} diff --git a/string-cache-codegen/Cargo.toml b/string-cache-codegen/Cargo.toml new file mode 100644 index 0000000..20eced9 --- /dev/null +++ b/string-cache-codegen/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "string_cache_codegen" +version = "0.6.1" # Also update ../README.md when making a semver-breaking change +authors = [ "The Servo Project Developers" ] +description = "A codegen library for string-cache, developed as part of the Servo project." 
+license = "MIT OR Apache-2.0" +repository = "https://github.com/servo/string-cache" +documentation = "https://docs.rs/string_cache_codegen/" +edition = "2018" + +[lib] +name = "string_cache_codegen" +path = "lib.rs" + +[dependencies] +phf_generator = "0.13" +phf_shared = "0.13" +proc-macro2 = "1" +quote = "1" diff --git a/string-cache-codegen/LICENSE-APACHE b/string-cache-codegen/LICENSE-APACHE new file mode 100644 index 0000000..16fe87b --- /dev/null +++ b/string-cache-codegen/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/string-cache-codegen/LICENSE-MIT b/string-cache-codegen/LICENSE-MIT new file mode 100644 index 0000000..807526f --- /dev/null +++ b/string-cache-codegen/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2012-2013 Mozilla Foundation + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/string-cache-codegen/lib.rs b/string-cache-codegen/lib.rs new file mode 100644 index 0000000..525ef3a --- /dev/null +++ b/string-cache-codegen/lib.rs @@ -0,0 +1,393 @@ +// Copyright 2016 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +//! A crate to create static string caches at compile time. +//! +//! # Examples +//! +//! With static atoms: +//! +//! In `Cargo.toml`: +//! +//! ```toml +//! [package] +//! build = "build.rs" +//! +//! [dependencies] +//! string_cache = "0.9" +//! +//! [build-dependencies] +//! string_cache_codegen = "0.6" +//! ``` +//! +//! In `build.rs`: +//! +//! ```no_run +//! extern crate string_cache_codegen; +//! +//! use std::env; +//! use std::path::Path; +//! +//! fn main() { +//! string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!") +//! .atoms(&["foo", "bar"]) +//! .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs")) +//! .unwrap() +//! } +//! ``` +//! +//! In `lib.rs`: +//! +//! ```ignore +//! extern crate string_cache; +//! +//! mod foo { +//! include!(concat!(env!("OUT_DIR"), "/foo_atom.rs")); +//! } +//! ``` +//! +//! The generated code will define a `FooAtom` type and a `foo_atom!` macro. +//! The macro can be used in expressions or patterns, with strings listed in `build.rs`. +//! For example: +//! +//! ```ignore +//! fn compute_something(input: &foo::FooAtom) -> u32 { +//! match *input { +//! foo_atom!("foo") => 1, +//! foo_atom!("bar") => 2, +//! _ => 3, +//! } +//! } +//! ``` +//!
+ +#![recursion_limit = "128"] + +use proc_macro2::Ident; +use quote::quote; +use std::collections::BTreeSet; +use std::fs::File; +use std::io::{self, BufWriter, Write}; +use std::path::Path; + +/// A builder for a static atom set and relevant macros +pub struct AtomType { + path: String, + atom_doc: Option, + static_set_doc: Option, + macro_name: String, + macro_doc: Option, + atoms: BTreeSet, +} + +impl AtomType { + /// Constructs a new static atom set builder + /// + /// `path` is a path within a crate of the atom type that will be created. + /// e.g. `"FooAtom"` at the crate root or `"foo::Atom"` if the generated code + /// is included in a `foo` module. + /// + /// `macro_name` must end with `!`. + /// + /// For example, `AtomType::new("foo::FooAtom", "foo_atom!")` will generate: + /// + /// ```ignore + /// pub type FooAtom = ::string_cache::Atom; + /// pub struct FooAtomStaticSet; + /// impl ::string_cache::StaticAtomSet for FooAtomStaticSet { + /// // ... + /// } + /// #[macro_export] + /// macro_rules foo_atom { + /// // Expands to: $crate::foo::FooAtom { … } + /// } + /// ``` + pub fn new(path: &str, macro_name: &str) -> Self { + assert!(macro_name.ends_with("!"), "`macro_name` must end with '!'"); + AtomType { + path: path.to_owned(), + macro_name: macro_name[..macro_name.len() - "!".len()].to_owned(), + atom_doc: None, + static_set_doc: None, + macro_doc: None, + atoms: BTreeSet::new(), + } + } + + /// Add some documentation to the generated Atom type alias. + /// + /// This can help the user know that the type uses interned strings. + /// + /// Note that `docs` should not contain the `///` at the front of normal docs. + pub fn with_atom_doc(&mut self, docs: &str) -> &mut Self { + self.atom_doc = Some(docs.to_owned()); + self + } + + /// Add some documentation to the generated static set. 
+ /// + /// This can help the user know that this type is zero-sized and just references a static + /// lookup table, or point them to the `Atom` type alias for more info. + /// + /// Note that `docs` should not contain the `///` at the front of normal docs. + pub fn with_static_set_doc(&mut self, docs: &str) -> &mut Self { + self.static_set_doc = Some(docs.to_owned()); + self + } + + /// Add some documentation to the generated macro. + /// + /// Note that `docs` should not contain the `///` at the front of normal docs. + pub fn with_macro_doc(&mut self, docs: &str) -> &mut Self { + self.macro_doc = Some(docs.to_owned()); + self + } + + /// Adds an atom to the builder + pub fn atom(&mut self, s: &str) -> &mut Self { + self.atoms.insert(s.to_owned()); + self + } + + /// Adds multiple atoms to the builder + pub fn atoms(&mut self, iter: I) -> &mut Self + where + I: IntoIterator, + I::Item: AsRef, + { + self.atoms + .extend(iter.into_iter().map(|s| s.as_ref().to_owned())); + self + } + + /// Write generated code to `destination`. + pub fn write_to(&mut self, mut destination: W) -> io::Result<()> + where + W: Write, + { + destination.write_all( + self.to_tokens() + .to_string() + // Insert some newlines to make the generated code slightly easier to read. + .replace(" [ \"", "[\n\"") + .replace("\" , ", "\",\n") + .replace(" ( \"", "\n( \"") + .replace("; ", ";\n") + .as_bytes(), + ) + } + + #[cfg(test)] + /// Write generated code to destination [`Vec`] and return it as [`String`] + /// + /// Used mostly for testing or displaying a value. + pub fn write_to_string(&mut self, mut destination: Vec) -> io::Result { + destination.write_all( + self.to_tokens() + .to_string() + // Insert some newlines to make the generated code slightly easier to read. 
+ .replace(" [ \"", "[\n\"") + .replace("\" , ", "\",\n") + .replace(" ( \"", "\n( \"") + .replace("; ", ";\n") + .as_bytes(), + )?; + let str = String::from_utf8(destination).unwrap(); + Ok(str) + } + + fn to_tokens(&mut self) -> proc_macro2::TokenStream { + // `impl Default for Atom` requires the empty string to be in the static set. + // This also makes sure the set in non-empty, + // which would cause divisions by zero in rust-phf. + self.atoms.insert(String::new()); + + // Strings over 7 bytes + empty string added to static set. + // Otherwise stored inline. + let (static_strs, inline_strs): (Vec<_>, Vec<_>) = self + .atoms + .iter() + .map(String::as_str) + .partition(|s| s.len() > 7 || s.is_empty()); + + // Static strings + let hash_state = phf_generator::generate_hash(&static_strs); + let phf_generator::HashState { key, disps, map } = hash_state; + let (disps0, disps1): (Vec<_>, Vec<_>) = disps.into_iter().unzip(); + let atoms: Vec<&str> = map.iter().map(|&idx| static_strs[idx]).collect(); + let empty_string_index = atoms.iter().position(|s| s.is_empty()).unwrap() as u32; + let indices = 0..atoms.len() as u32; + + fn is_valid_ident(name: &str) -> bool { + let begins_with_letter_or_underscore = name + .chars() + .next() + .is_some_and(|c| c.is_alphabetic() || c == '_'); + let is_alphanumeric = name.chars().all(|c| c.is_alphanumeric() || c == '_'); + + begins_with_letter_or_underscore && is_alphanumeric + } + + let atoms_for_idents: Vec<&str> = atoms + .iter() + .copied() + .filter(|x| is_valid_ident(x)) + .collect(); + let atom_idents: Vec = atoms_for_idents.iter().map(|atom| new_term(atom)).collect(); + + let istrs_for_idents: Vec<&str> = inline_strs + .iter() + .copied() + .filter(|x| is_valid_ident(x)) + .collect(); + let istr_idents: Vec = istrs_for_idents.iter().map(|atom| new_term(atom)).collect(); + + let hashes: Vec = atoms + .iter() + .map(|string| { + let hash = phf_shared::hash(string, &key); + (hash.g ^ hash.f1) as u32 + }) + .collect(); + + let 
mut path_parts = self.path.rsplitn(2, "::"); + let type_name = path_parts.next().unwrap(); + let module = match path_parts.next() { + Some(m) => format!("$crate::{}", m), + None => format!("$crate"), + }; + let atom_doc = match self.atom_doc { + Some(ref doc) => quote!(#[doc = #doc]), + None => quote!(), + }; + let static_set_doc = match self.static_set_doc { + Some(ref doc) => quote!(#[doc = #doc]), + None => quote!(), + }; + let macro_doc = match self.macro_doc { + Some(ref doc) => quote!(#[doc = #doc]), + None => quote!(), + }; + fn new_term(string: &str) -> Ident { + Ident::new(string, proc_macro2::Span::call_site()) + } + let static_set_name = new_term(&format!("{}StaticSet", type_name)); + let type_name = new_term(type_name); + let macro_name = new_term(&*self.macro_name); + let module = module.parse::().unwrap(); + let atom_prefix = format!("ATOM_{}_", type_name.to_string().to_uppercase()); + let new_const_name = |atom: &str| { + let mut name = atom_prefix.clone(); + for c in atom.chars() { + name.push_str(&format!("_{:02X}", c as u32)) + } + new_term(&name) + }; + let const_names: Vec<_> = atoms.iter().copied().map(new_const_name).collect(); + let ident_const_names: Vec<_> = atoms_for_idents + .iter() + .copied() + .map(new_const_name) + .collect(); + let ident_inline_const_names: Vec<_> = istrs_for_idents + .iter() + .copied() + .map(new_const_name) + .collect(); + + // Inline strings + let (inline_const_names, inline_values_and_lengths): (Vec<_>, Vec<_>) = inline_strs + .iter() + .map(|s| { + let const_name = new_const_name(s); + + let mut value = 0u64; + for (index, c) in s.bytes().enumerate() { + value = value | ((c as u64) << (index * 8 + 8)); + } + + let len = s.len() as u8; + + (const_name, (value, len)) + }) + .unzip(); + let (inline_values, inline_lengths): (Vec<_>, Vec<_>) = + inline_values_and_lengths.into_iter().unzip(); + + quote! 
{ + #atom_doc + pub type #type_name = ::string_cache::Atom<#static_set_name>; + + #static_set_doc + #[derive(PartialEq, Eq, PartialOrd, Ord)] + pub struct #static_set_name; + + impl ::string_cache::StaticAtomSet for #static_set_name { + fn get() -> &'static ::string_cache::PhfStrSet { + static SET: ::string_cache::PhfStrSet = ::string_cache::PhfStrSet { + key: #key, + disps: &[#((#disps0, #disps1)),*], + atoms: &[#(#atoms),*], + hashes: &[#(#hashes),*] + }; + &SET + } + fn empty_string_index() -> u32 { + #empty_string_index + } + } + + #( + pub const #const_names: #type_name = #type_name::pack_static(#indices); + )* + #( + pub const #inline_const_names: #type_name = #type_name::pack_inline(#inline_values, #inline_lengths); + )* + + #macro_doc + #[macro_export] + macro_rules! #macro_name { + #( + (#atoms) => { #module::#const_names }; + )* + #( + (#inline_strs) => { #module::#inline_const_names }; + )* + #( + (#atom_idents) => { #module::#ident_const_names }; + )* + #( + (#istr_idents) => { #module::#ident_inline_const_names }; + )* + } + } + } + + /// Create a new file at `path` and write generated code there. 
+ /// + /// Typical usage: + /// `.write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs"))` + pub fn write_to_file(&mut self, path: &Path) -> io::Result<()> { + self.write_to(BufWriter::new(File::create(path)?)) + } +} + +#[test] +fn test_iteration_order() { + let x1 = crate::AtomType::new("foo::Atom", "foo_atom!") + .atoms(&["x", "xlink", "svg", "test"]) + .write_to_string(Vec::new()) + .expect("write to string cache x1"); + + let x2 = crate::AtomType::new("foo::Atom", "foo_atom!") + .atoms(&["x", "xlink", "svg", "test"]) + .write_to_string(Vec::new()) + .expect("write to string cache x2"); + + assert_eq!(x1, x2); +} diff --git a/tests/small-stack.rs b/tests/small-stack.rs new file mode 100644 index 0000000..bb607af --- /dev/null +++ b/tests/small-stack.rs @@ -0,0 +1,17 @@ +// Regression test for https://github.com/servo/html5ever/issues/393 +// +// Create a dynamic atom − causing initialization of the global hash map − +// in a thread that has a small stack. +// +// This is a separate test program rather than a `#[test] fn` among others +// to make sure that nothing else has already initialized the map in this process. +fn main() { + std::thread::Builder::new() + .stack_size(50_000) + .spawn(|| { + let _atom = string_cache::DefaultAtom::from("12345678"); + }) + .unwrap() + .join() + .unwrap() +}