diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..74ade77 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,65 @@ +name: CI +on: + push: + branches: ["main"] + pull_request: + merge_group: + types: [checks_requested] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +env: + RUST_BACKTRACE: 1 + SHELL: /bin/bash + +jobs: + ci: + name: Build and Test + runs-on: ubuntu-latest + + strategy: + matrix: + rust: [1.70.0, nightly, beta, stable] + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 2 + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + default: true + override: true + - name: Build + run: | + cargo build --no-default-features + cargo build + cargo build --features malloc_size_of + - uses: actions-rs/cargo@v1 + with: + command: test + args: --all + - name: Build codegen + run: | + cd string-cache-codegen && cargo build && cd .. + + if [ ${{ matrix.rust }} = nightly ]; then + cd integration-tests && cargo test --features unstable && cd ..; + fi + + + build_result: + name: Result + runs-on: ubuntu-latest + needs: + - "ci" + + steps: + - name: Mark the job as successful + run: exit 0 + if: success() + - name: Mark the job as unsuccessful + run: exit 1 + if: "!success()" diff --git a/.gitignore b/.gitignore index fafa631..c17061b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ /doc -/Makefile -/Cargo.lock -/target +Cargo.lock +target +.cargo/config diff --git a/Cargo.toml b/Cargo.toml index 00e057b..e73215e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,41 @@ [package] +name = "string_cache" +version = "0.9.0" # Also update README.md when making a semver-breaking change +authors = ["The Servo Project Developers"] +description = "A string interning library for Rust, developed as part of the Servo project." 
+license = "MIT OR Apache-2.0" +repository = "https://github.com/servo/string-cache" +documentation = "https://docs.rs/string_cache" +edition = "2018" +rust-version = "1.70.0" + +# Do not `exclude` ./string-cache-codegen because we want to include +# ./string-cache-codegen/shared.rs, and `include` is a pain to use +# (It has to be exhaustive.) +# This means that packages for this crate include some unused files, +# but they’re not too big so that shouldn’t be a problem. +[lib] name = "string_cache" -version = "0.0.0" -authors = [ "The Servo Project Developers" ] -[dependencies.phf] -git = "https://github.com/sfackler/rust-phf" -[dependencies.phf_mac] -git = "https://github.com/sfackler/rust-phf" +[features] +serde_support = ["serde"] +default = ["serde_support"] + +[dependencies] +precomputed-hash = "0.1" +serde = { version = "1", optional = true } +malloc_size_of = { version = "0.1", default-features = false, optional = true } +phf_shared = "0.13" +new_debug_unreachable = "1.0.2" +parking_lot = "0.12" + +[[test]] +name = "small-stack" +harness = false -[dependencies.string_cache_macros] -path = "macros" +[workspace] +members = [ + "string-cache-codegen", + "integration-tests", +] diff --git a/Makefile.in b/Makefile.in deleted file mode 100644 index 033472a..0000000 --- a/Makefile.in +++ /dev/null @@ -1,35 +0,0 @@ -VPATH=%VPATH% - -RUSTC ?= rustc -RUSTFLAGS += -L ../../phf/rust-phf -EXT_DEPS ?= -RUSTDOC ?= rustdoc -RUSTDOC_FLAGS ?= -RUSTDOC_TARGET ?= doc - -RUST_SRC=$(shell find $(VPATH)/src $(VPATH)/macros $(VPATH)/shared -type f -name '*.rs') - -.PHONY: all -all: libstring-cache.dummy - -libstring-cache.dummy: $(RUST_SRC) $(EXT_DEPS) - $(RUSTC) $(RUSTFLAGS) $(VPATH)/macros/src/lib.rs --out-dir . - $(RUSTC) $(RUSTFLAGS) -L . $(VPATH)/src/lib.rs --out-dir . - touch $@ - -string-cache-test: $(RUST_SRC) - $(RUSTC) $(RUSTFLAGS) -L . 
$< -o $@ --test - -.PHONY: check -check: string-cache-test - ./string-cache-test $(TEST) - -.PHONY: doc -doc: $(RUSTDOC_TARGET)/string_cache/index.html - -$(RUSTDOC_TARGET)/string_cache/index.html: $(RUST_SRC) $(EXT_DEPS) - $(RUSTDOC) $(RUSTDOC_FLAGS) $< -o $(RUSTDOC_TARGET) - -.PHONY: clean -clean: - rm -f *.o *.a *.so *.dylib *.rlib *.dll *.dummy *-test diff --git a/README.md b/README.md index cba6fc1..429d1ec 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,78 @@ -string-cache -============ +# string-cache + +[![Build Status](https://github.com/servo/string-cache/actions/workflows/ci.yml/badge.svg)](https://github.com/servo/string-cache/actions) + +[Documentation](https://docs.rs/string_cache/) + +A string interning library for Rust, developed as part of the [Servo](https://github.com/servo/servo) project. + +## Simple usage + +In `Cargo.toml`: + +```toml +[dependencies] +string_cache = "0.9" +``` + +In `lib.rs`: + +```rust +extern crate string_cache; +use string_cache::DefaultAtom as Atom; +``` + +## With static atoms + +In `Cargo.toml`: + +```toml +[package] +build = "build.rs" + +[dependencies] +string_cache = "0.9" + +[build-dependencies] +string_cache_codegen = "0.6" +``` + +In `build.rs`: + +```rust +extern crate string_cache_codegen; + +use std::env; +use std::path::Path; + +fn main() { + string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!") + .atoms(&["foo", "bar"]) + .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs")) + .unwrap() +} +``` + +In `lib.rs`: + +```rust +extern crate string_cache; + +mod foo { + include!(concat!(env!("OUT_DIR"), "/foo_atom.rs")); +} +``` + +The generated code will define a `FooAtom` type and a `foo_atom!` macro. +The macro can be used in expression or patterns, with strings listed in `build.rs`. 
+For example: + +```rust +fn compute_something(input: &foo::FooAtom) -> u32 { + match *input { + foo_atom!("foo") => 1, + foo_atom!("bar") => 2, + _ => 3, + } +} +``` diff --git a/configure b/configure deleted file mode 100755 index 62a0f4c..0000000 --- a/configure +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -SRCDIR="$(cd $(dirname $0) && pwd)" -sed "s#%VPATH%#${SRCDIR}#" ${SRCDIR}/Makefile.in > Makefile diff --git a/examples/simple.rs b/examples/simple.rs new file mode 100644 index 0000000..f063b06 --- /dev/null +++ b/examples/simple.rs @@ -0,0 +1,26 @@ + + +use string_cache::DefaultAtom; + +fn main() { + let mut interned_stuff = Vec::new(); + let text = "here is a sentence of text that will be tokenised and interned and some repeated \ + tokens is of text and"; + for word in text.split_whitespace() { + let seen_before = interned_stuff + .iter() + // We can use `impl PartialEq<T>`, where T is anything string-like, to compare + // interned strings to either other interned strings or actual strings. Comparing two + // interned strings is very fast (normally a single cpu operation). + .filter(|interned_word| interned_word == &word) + .count(); + if seen_before > 0 { + println!(r#"Seen the word "{}" {} times"#, word, seen_before); + } else { + println!(r#"Not seen the word "{}" before"#, word); + } + // We use the `From` impls for Atom (from Cow<'a, str>, &'a str, or String) to intern a + // new string. + interned_stuff.push(DefaultAtom::from(word)); + } +} diff --git a/integration-tests/Cargo.toml b/integration-tests/Cargo.toml new file mode 100644 index 0000000..4562747 --- /dev/null +++ b/integration-tests/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "integration_tests" +version = "0.0.1" +authors = [ "The Servo Project Developers" ] +build = "build.rs" +publish = false +edition = "2018" + +[lib] +doctest = false +test = true + +[features] + +# Use unstable features to optimize space and time (memory and CPU usage). 
+unstable = [] + +[dependencies] +string_cache = { version = "0.9", path = ".." } + +[dev-dependencies] +rand = { version = "0.8", features = ["small_rng"] } +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } + +[build-dependencies] +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } diff --git a/integration-tests/build.rs b/integration-tests/build.rs new file mode 100644 index 0000000..6293e4c --- /dev/null +++ b/integration-tests/build.rs @@ -0,0 +1,26 @@ +use string_cache_codegen; + +use std::env; +use std::path::Path; + +fn main() { + string_cache_codegen::AtomType::new("TestAtom", "test_atom!") + .atoms(&[ + "a", + "b", + "address", + "defaults", + "area", + "body", + "font-weight", + "br", + "html", + "head", + "id", + "❤", + "❤💯", + "❤💯❤💯", + ]) + .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("test_atom.rs")) + .unwrap() +} diff --git a/integration-tests/src/bench.rs b/integration-tests/src/bench.rs new file mode 100644 index 0000000..45e7199 --- /dev/null +++ b/integration-tests/src/bench.rs @@ -0,0 +1,212 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/* + +A cautionary note about these benchmarks: + +Many of the operations we're attempting to measure take less than one +nanosecond. That's why we run them thousands of times in a loop just to get a +single iteration that Rust's statistical benchmarking can work with. At that +scale, any change anywhere in the library can produce durable performance +regressions on the order of half a nanosecond, i.e. "500 ns" in the output for +a test like eq_x_1000. + +We can't get anything done if we ratchet on these numbers! 
They are more useful +for selecting between alternatives, and for noticing large regressions or +inconsistencies. + +Furthermore, a large part of the point of interning is to make strings small +and cheap to move around, which isn't reflected in these tests. + +*/ +use crate::TestAtom; + +use test::{black_box, Bencher}; + +// Just shorthand +fn mk(x: &str) -> TestAtom { + TestAtom::from(x) +} + +macro_rules! check_type (($name:ident, $x:expr) => ( + // NB: "cargo bench" does not run these! + #[test] + fn $name() { + assert!($x, "atom has wrong type"); + } +)); + +macro_rules! bench_tiny_op (($name:ident, $op:ident, $ctor_x:expr, $ctor_y:expr) => ( + #[bench] + fn $name(b: &mut Bencher) { + const n: usize = 1000; + let xs: Vec<_> = repeat($ctor_x).take(n).collect(); + let ys: Vec<_> = repeat($ctor_y).take(n).collect(); + + b.iter(|| { + for (x, y) in xs.iter().zip(ys.iter()) { + black_box(x.$op(y)); + } + }); + } +)); + +macro_rules! bench_one ( + (x_static $x:expr, $y:expr) => (check_type!(check_type_x, $x.is_static());); + (x_inline $x:expr, $y:expr) => (check_type!(check_type_x, $x.is_inline());); + (x_dynamic $x:expr, $y:expr) => (check_type!(check_type_x, $x.is_dynamic());); + (y_static $x:expr, $y:expr) => (check_type!(check_type_y, $y.is_static());); + (y_inline $x:expr, $y:expr) => (check_type!(check_type_y, $y.is_inline());); + (y_dynamic $x:expr, $y:expr) => (check_type!(check_type_y, $y.is_dynamic());); + (is_static $x:expr, $y:expr) => (bench_one!(x_static $x, $y); bench_one!(y_static $x, $y);); + (is_inline $x:expr, $y:expr) => (bench_one!(x_inline $x, $y); bench_one!(y_inline $x, $y);); + (is_dynamic $x:expr, $y:expr) => (bench_one!(x_dynamic $x, $y); bench_one!(y_dynamic $x, $y);); + + (eq $x:expr, $_y:expr) => (bench_tiny_op!(eq_x_1000, eq, $x, $x);); + (ne $x:expr, $y:expr) => (bench_tiny_op!(ne_x_1000, ne, $x, $y);); + (lt $x:expr, $y:expr) => (bench_tiny_op!(lt_x_1000, lt, $x, $y);); + + (intern $x:expr, $_y:expr) => ( + #[bench] + fn intern(b: 
&mut Bencher) { + let x = $x.to_string(); + b.iter(|| { + black_box(TestAtom::from(&*x)); + }); + } + ); + + (as_ref $x:expr, $_y:expr) => ( + #[bench] + fn as_ref_x_1000(b: &mut Bencher) { + let x = $x; + b.iter(|| { + for _ in 0..1000 { + black_box(x.as_ref()); + } + }); + } + ); + + (clone $x:expr, $_y:expr) => ( + #[bench] + fn clone_x_1000(b: &mut Bencher) { + let x = $x; + b.iter(|| { + for _ in 0..1000 { + black_box(x.clone()); + } + }); + } + ); + + (clone_string $x:expr, $_y:expr) => ( + #[bench] + fn clone_x_1000(b: &mut Bencher) { + let x = $x.to_string(); + b.iter(|| { + for _ in 0..1000 { + black_box(x.clone()); + } + }); + } + ); +); + +macro_rules! bench_all ( + ([ $($which:ident)+ ] for $name:ident = $x:expr, $y:expr) => ( + // FIXME: This module works around rust-lang/rust#12249 so we don't + // have to repeat the names for eq and neq. + mod $name { + #![allow(unused_imports)] + + use test::{Bencher, black_box}; + use std::string::ToString; + use std::iter::repeat; + + use crate::TestAtom; + + use super::mk; + + $( + bench_one!($which $x, $y); + )+ + } + ); +); + +pub const longer_dynamic_a: &'static str = + "Thee Silver Mt. Zion Memorial Orchestra & Tra-La-La Band"; +pub const longer_dynamic_b: &'static str = + "Thee Silver Mt. 
Zion Memorial Orchestra & Tra-La-La Ban!"; + +bench_all!([eq ne lt clone_string] for short_string = "e", "f"); +bench_all!([eq ne lt clone_string] for medium_string = "xyzzy01", "xyzzy02"); +bench_all!([eq ne lt clone_string] + for longer_string = super::longer_dynamic_a, super::longer_dynamic_b); + +bench_all!([eq ne intern as_ref clone is_static lt] + for static_atom = test_atom!("defaults"), test_atom!("font-weight")); + +bench_all!([intern as_ref clone is_inline] + for short_inline_atom = mk("e"), mk("f")); + +bench_all!([eq ne intern as_ref clone is_inline lt] + for medium_inline_atom = mk("xyzzy01"), mk("xyzzy02")); + +bench_all!([intern as_ref clone is_dynamic] + for min_dynamic_atom = mk("xyzzy001"), mk("xyzzy002")); + +bench_all!([eq ne intern as_ref clone is_dynamic lt] + for longer_dynamic_atom = mk(super::longer_dynamic_a), mk(super::longer_dynamic_b)); + +bench_all!([intern as_ref clone is_static] + for static_at_runtime = mk("defaults"), mk("font-weight")); + +bench_all!([ne lt x_static y_inline] + for static_vs_inline = test_atom!("defaults"), mk("f")); + +bench_all!([ne lt x_static y_dynamic] + for static_vs_dynamic = test_atom!("defaults"), mk(super::longer_dynamic_b)); + +bench_all!([ne lt x_inline y_dynamic] + for inline_vs_dynamic = mk("e"), mk(super::longer_dynamic_b)); + +macro_rules! bench_rand ( ($name:ident, $len:expr) => ( + #[bench] + fn $name(b: &mut Bencher) { + use std::str; + use rand; + use rand::{RngCore, SeedableRng}; + + let mut gen = rand::rngs::SmallRng::from_entropy(); + b.iter(|| { + // We have to generate new atoms on every iter, because + // the dynamic atom table isn't reset. + // + // I measured the overhead of random string generation + // as about 3-12% at one point. 
+ + let mut buf: [u8; $len] = [0; $len]; + gen.fill_bytes(&mut buf); + for n in buf.iter_mut() { + // shift into printable ASCII + *n = (*n % 0x40) + 0x20; + } + let s = str::from_utf8(&buf[..]).unwrap(); + black_box(TestAtom::from(s)); + }); + } +)); + +bench_rand!(intern_rand_008, 8); +bench_rand!(intern_rand_032, 32); +bench_rand!(intern_rand_128, 128); +bench_rand!(intern_rand_512, 512); diff --git a/integration-tests/src/common-usage.rs b/integration-tests/src/common-usage.rs new file mode 100644 index 0000000..7b7380a --- /dev/null +++ b/integration-tests/src/common-usage.rs @@ -0,0 +1,19 @@ +/// Test common usage by popular dependents (html5ever, lalrpop, browserlists-rs), to ensure no API-surface breaking changes +/// Created after https://github.com/servo/string-cache/issues/271 +use std::collections::HashMap; + +use crate::Atom; +use crate::TestAtom; + +#[test] +fn usage_with_hashmap() { + let mut map: HashMap = HashMap::new(); + + map.insert(test_atom!("area"), 1); + map.insert("str_into".into(), 2); + map.insert("atom_from".into(), 3); + + assert_eq!(map.get(&"area".into()).unwrap(), &1); + assert_eq!(map.get(&"str_into".into()).unwrap(), &2); + assert_eq!(map.get(&Atom::from("atom_from")).unwrap(), &3); +} diff --git a/integration-tests/src/lib.rs b/integration-tests/src/lib.rs new file mode 100644 index 0000000..a788d93 --- /dev/null +++ b/integration-tests/src/lib.rs @@ -0,0 +1,316 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +#![cfg(test)] +#![deny(warnings)] +#![allow(non_upper_case_globals)] +#![cfg_attr(feature = "unstable", feature(test))] + +#[cfg(feature = "unstable")] +extern crate test; + +use std::thread; +use string_cache::StaticAtomSet; + +include!(concat!(env!("OUT_DIR"), "/test_atom.rs")); +pub type Atom = TestAtom; + +#[test] +fn test_as_slice() { + let s0 = Atom::from(""); + assert!(s0.as_ref() == ""); + + let s1 = Atom::from("class"); + assert!(s1.as_ref() == "class"); + + let i0 = Atom::from("blah"); + assert!(i0.as_ref() == "blah"); + + let s0 = Atom::from("BLAH"); + assert!(s0.as_ref() == "BLAH"); + + let d0 = Atom::from("zzzzzzzzzz"); + assert!(d0.as_ref() == "zzzzzzzzzz"); + + let d1 = Atom::from("ZZZZZZZZZZ"); + assert!(d1.as_ref() == "ZZZZZZZZZZ"); +} + +#[test] +fn test_types() { + assert!(Atom::from("").is_static()); + assert!(Atom::from("defaults").is_static()); + assert!(Atom::from("font-weight").is_static()); + assert!(Atom::from("id").is_inline()); + assert!(Atom::from("body").is_inline()); + assert!(Atom::from("a").is_inline()); + assert!(Atom::from("address").is_inline()); + assert!(Atom::from("c").is_inline()); + assert!(Atom::from("zz").is_inline()); + assert!(Atom::from("zzz").is_inline()); + assert!(Atom::from("zzzz").is_inline()); + assert!(Atom::from("zzzzz").is_inline()); + assert!(Atom::from("zzzzzz").is_inline()); + assert!(Atom::from("zzzzzzz").is_inline()); + assert!(Atom::from("zzzzzzzz").is_dynamic()); + assert!(Atom::from("zzzzzzzzzzzzz").is_dynamic()); +} + +#[test] +fn test_equality() { + let s0 = Atom::from("fn"); + let s1 = Atom::from("fn"); + let s2 = Atom::from("loop"); + + let i0 = Atom::from("blah"); + let i1 = Atom::from("blah"); + let i2 = Atom::from("blah2"); + + let d0 = Atom::from("zzzzzzzz"); + let d1 = Atom::from("zzzzzzzz"); + let d2 = Atom::from("zzzzzzzzz"); + + assert!(s0 == s1); + assert!(s0 != s2); + + assert!(i0 == i1); + assert!(i0 != i2); + + assert!(d0 == d1); + assert!(d0 != d2); + + assert!(s0 != i0); + 
assert!(s0 != d0); + assert!(i0 != d0); +} + +#[test] +fn default() { + assert_eq!(TestAtom::default(), test_atom!("")); + assert_eq!(&*TestAtom::default(), ""); +} + +#[test] +fn ord() { + fn check(x: &str, y: &str) { + assert_eq!(x < y, Atom::from(x) < Atom::from(y)); + assert_eq!(x.cmp(y), Atom::from(x).cmp(&Atom::from(y))); + assert_eq!(x.partial_cmp(y), Atom::from(x).partial_cmp(&Atom::from(y))); + } + + check("a", "body"); + check("asdf", "body"); + check("zasdf", "body"); + check("z", "body"); + + check("a", "bbbbb"); + check("asdf", "bbbbb"); + check("zasdf", "bbbbb"); + check("z", "bbbbb"); +} + +#[test] +fn clone() { + let s0 = Atom::from("fn"); + let s1 = s0.clone(); + let s2 = Atom::from("loop"); + + let i0 = Atom::from("blah"); + let i1 = i0.clone(); + let i2 = Atom::from("blah2"); + + let d0 = Atom::from("zzzzzzzz"); + let d1 = d0.clone(); + let d2 = Atom::from("zzzzzzzzz"); + + assert!(s0 == s1); + assert!(s0 != s2); + + assert!(i0 == i1); + assert!(i0 != i2); + + assert!(d0 == d1); + assert!(d0 != d2); + + assert!(s0 != i0); + assert!(s0 != d0); + assert!(i0 != d0); +} + +macro_rules! assert_eq_fmt (($fmt:expr, $x:expr, $y:expr) => ({ + let x = $x; + let y = $y; + if x != y { + panic!("assertion failed: {} != {}", + format_args!($fmt, x), + format_args!($fmt, y)); + } +})); + +#[test] +fn repr() { + fn check(s: &str, data: u64) { + assert_eq_fmt!("0x{:016X}", Atom::from(s).unsafe_data(), data); + } + + fn check_static(s: &str, x: Atom) { + assert_eq_fmt!("0x{:016X}", x.unsafe_data(), Atom::from(s).unsafe_data()); + assert_eq!(0x2, x.unsafe_data() & 0xFFFF_FFFF); + // The index is unspecified by phf. + assert!((x.unsafe_data() >> 32) <= TestAtomStaticSet::get().atoms.len() as u64); + } + + // This test is here to make sure we don't change atom representation + // by accident. It may need adjusting if there are changes to the + // static atom table, the tag values, etc. 
+ + // Static atoms + check_static("defaults", test_atom!("defaults")); + check_static("font-weight", test_atom!("font-weight")); + + // Inline atoms + check("a", 0x0000_0000_0000_6111); + check("address", 0x7373_6572_6464_6171); + check("area", 0x0000_0061_6572_6141); + check("e", 0x0000_0000_0000_6511); + check("xyzzy", 0x0000_797A_7A79_7851); + check("xyzzy01", 0x3130_797A_7A79_7871); + + // Dynamic atoms. This is a pointer so we can't verify every bit. + assert_eq!(0x00, Atom::from("a dynamic string").unsafe_data() & 0xf); +} + +#[test] +fn test_threads() { + for _ in 0_u32..100 { + thread::spawn(move || { + let _ = Atom::from("a dynamic string"); + let _ = Atom::from("another string"); + }); + } +} + +#[test] +fn atom_macro() { + assert_eq!(test_atom!("a"), Atom::from("a")); + assert_eq!(test_atom!("body"), Atom::from("body")); + assert_eq!(test_atom!("address"), Atom::from("address")); + assert_eq!(test_atom!("❤"), Atom::from("❤")); + assert_eq!(test_atom!("❤💯"), Atom::from("❤💯")); + assert_eq!(test_atom!("font-weight"), Atom::from("font-weight")); + assert_eq!(test_atom!("❤💯❤💯"), Atom::from("❤💯❤💯")); +} + +#[test] +fn match_atom() { + assert_eq!( + 2, + match Atom::from("head") { + test_atom!("br") => 1, + test_atom!("html") | test_atom!("head") => 2, + _ => 3, + } + ); + + assert_eq!( + 3, + match Atom::from("body") { + test_atom!("br") => 1, + test_atom!("html") | test_atom!("head") => 2, + _ => 3, + } + ); + + assert_eq!( + 3, + match Atom::from("zzzzzz") { + test_atom!("br") => 1, + test_atom!("html") | test_atom!("head") => 2, + _ => 3, + } + ); +} + +#[test] +fn ensure_deref() { + // Ensure we can Deref to a &str + let atom = Atom::from("foobar"); + let _: &str = &atom; +} + +#[test] +fn ensure_as_ref() { + // Ensure we can as_ref to a &str + let atom = Atom::from("foobar"); + let _: &str = atom.as_ref(); +} + +#[test] +fn test_ascii_lowercase() { + assert_eq!(Atom::from("").to_ascii_lowercase(), Atom::from("")); + 
assert_eq!(Atom::from("aZ9").to_ascii_lowercase(), Atom::from("az9")); + assert_eq!( + Atom::from("The Quick Brown Fox!").to_ascii_lowercase(), + Atom::from("the quick brown fox!") + ); + assert_eq!( + Atom::from("JE VAIS À PARIS").to_ascii_lowercase(), + Atom::from("je vais À paris") + ); +} + +#[test] +fn test_ascii_uppercase() { + assert_eq!(Atom::from("").to_ascii_uppercase(), Atom::from("")); + assert_eq!(Atom::from("aZ9").to_ascii_uppercase(), Atom::from("AZ9")); + assert_eq!( + Atom::from("The Quick Brown Fox!").to_ascii_uppercase(), + Atom::from("THE QUICK BROWN FOX!") + ); + assert_eq!( + Atom::from("Je vais à Paris").to_ascii_uppercase(), + Atom::from("JE VAIS à PARIS") + ); +} + +#[test] +fn test_eq_ignore_ascii_case() { + assert!(Atom::from("").eq_ignore_ascii_case(&Atom::from(""))); + assert!(Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("aZ9"))); + assert!(Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("Az9"))); + assert!(Atom::from("The Quick Brown Fox!") + .eq_ignore_ascii_case(&Atom::from("THE quick BROWN fox!"))); + assert!(Atom::from("Je vais à Paris").eq_ignore_ascii_case(&Atom::from("je VAIS à PARIS"))); + assert!(!Atom::from("").eq_ignore_ascii_case(&Atom::from("az9"))); + assert!(!Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from(""))); + assert!(!Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("9Za"))); + assert!(!Atom::from("The Quick Brown Fox!") + .eq_ignore_ascii_case(&Atom::from("THE quick BROWN fox!!"))); + assert!(!Atom::from("Je vais à Paris").eq_ignore_ascii_case(&Atom::from("JE vais À paris"))); +} + +#[test] +fn test_from_string() { + assert!(Atom::from("camembert".to_owned()) == Atom::from("camembert")); +} + +#[test] +fn test_try_static() { + assert!(Atom::try_static("defaults").is_some()); + assert!(Atom::try_static("head").is_none()); + assert!(Atom::try_static("not in the static table").is_none()); +} + +#[cfg(test)] +#[path = "common-usage.rs"] +mod common_usage; + +#[cfg(all(test, feature = "unstable"))] 
+#[path = "bench.rs"] +mod bench; diff --git a/macros/Cargo.toml b/macros/Cargo.toml deleted file mode 100644 index 7fb85ae..0000000 --- a/macros/Cargo.toml +++ /dev/null @@ -1,10 +0,0 @@ -[package] - -name = "string_cache_macros" -version = "0.0.0" -authors = [ "The Servo Project Developers" ] - -[lib] - -name = "string_cache_macros" -plugin = true diff --git a/macros/src/data.rs b/macros/src/data.rs deleted file mode 100644 index ec4b975..0000000 --- a/macros/src/data.rs +++ /dev/null @@ -1,1072 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// The first 64 atoms are special: we can quickly check membership -// in sets of these, using a bitmask. This includes every tag that -// appears in more than one set in the tree builder spec, plus a -// few others (arbitrarily chosen). -// -// FIXME(kmc): check if this is really true with the packed tag bits -// -// This list must remain sorted. -pub static fast_set_atoms: [&'static str, ..64] = [ - "a", - "address", - "applet", - "area", - "article", - "aside", - "b", - "base", - "basefont", - "bgsound", - "big", - "blockquote", - "body", - "br", - "button", - "caption", - "col", - "colgroup", - "dd", - "dt", - "embed", - "form", - "frame", - "frameset", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "head", - "html", - "input", - "li", - "link", - "marquee", - "meta", - "noframes", - "noscript", - "object", - "optgroup", - "option", - "param", - "plaintext", - "pre", - "rp", - "rt", - "script", - "select", - "source", - "style", - "svg", - "table", - "tbody", - "td", - "template", - "textarea", - "tfoot", - "th", - "thead", - "title", - "tr", - "track", - "xmp", -]; - -// The rest. -// -// This list must remain sorted. 
-pub static other_atoms: &'static [&'static str] = &[ - "", - "abbr", - "abs", - "accent", - "accent-height", - "accentunder", - "accept", - "accept-charset", - "accesskey", - "accumulate", - "acronym", - "action", - "actiontype", - "active", - "actuate", - "additive", - "align", - "alignment-baseline", - "alignmentscope", - "alink", - "alphabetic", - "alt", - "altGlyph", - "altGlyphDef", - "altGlyphItem", - "altglyph", - "altglyphdef", - "altglyphitem", - "altimg", - "alttext", - "amplitude", - "and", - "animate", - "animateColor", - "animateMotion", - "animateTransform", - "animatecolor", - "animatemotion", - "animatetransform", - "animation", - "annotation", - "annotation-xml", - "apply", - "approx", - "arabic-form", - "arccos", - "arccosh", - "arccot", - "arccoth", - "arccsc", - "arccsch", - "archive", - "arcrole", - "arcsec", - "arcsech", - "arcsin", - "arcsinh", - "arctan", - "arctanh", - "arg", - "aria-activedescendant", - "aria-atomic", - "aria-autocomplete", - "aria-busy", - "aria-channel", - "aria-checked", - "aria-controls", - "aria-datatype", - "aria-describedby", - "aria-disabled", - "aria-dropeffect", - "aria-expanded", - "aria-flowto", - "aria-grab", - "aria-haspopup", - "aria-hidden", - "aria-invalid", - "aria-labelledby", - "aria-level", - "aria-live", - "aria-multiline", - "aria-multiselectable", - "aria-owns", - "aria-posinset", - "aria-pressed", - "aria-readonly", - "aria-relevant", - "aria-required", - "aria-secret", - "aria-selected", - "aria-setsize", - "aria-sort", - "aria-templateid", - "aria-valuemax", - "aria-valuemin", - "aria-valuenow", - "ascent", - "async", - "attributeName", - "attributeType", - "attributename", - "attributetype", - "audio", - "autocomplete", - "autofocus", - "autoplay", - "autosubmit", - "axis", - "azimuth", - "background", - "baseFrequency", - "baseProfile", - "basefrequency", - "baseline", - "baseline-shift", - "baseprofile", - "bbox", - "bdo", - "begin", - "bevelled", - "bgcolor", - "bias", - "border", - "bvar", 
- "by", - "calcMode", - "calcmode", - "canvas", - "cap-height", - "card", - "cartesianproduct", - "ceiling", - "cellpadding", - "cellspacing", - "center", - "char", - "charoff", - "charset", - "checked", - "ci", - "circle", - "cite", - "class", - "classid", - "clear", - "clip", - "clip-path", - "clip-rule", - "clipPath", - "clipPathUnits", - "clippath", - "clippathunits", - "close", - "closure", - "cn", - "code", - "codebase", - "codetype", - "codomain", - "color", - "color-interpolation", - "color-interpolation-filters", - "color-profile", - "color-rendering", - "cols", - "colspan", - "columnalign", - "columnlines", - "columnspacing", - "columnspan", - "columnwidth", - "compact", - "complexes", - "compose", - "condition", - "conjugate", - "content", - "contentScriptType", - "contentStyleType", - "contenteditable", - "contentscripttype", - "contentstyletype", - "contextmenu", - "controls", - "coords", - "cos", - "cosh", - "cot", - "coth", - "crossorigin", - "csc", - "csch", - "csymbol", - "curl", - "cursor", - "cx", - "cy", - "d", - "data", - "datafld", - "dataformatas", - "datasrc", - "datatemplate", - "datetime", - "declare", - "default", - "defer", - "definition-src", - "definitionURL", - "definitionurl", - "defs", - "degree", - "del", - "depth", - "desc", - "descent", - "details", - "determinant", - "dfn", - "dialog", - "diff", - "diffuseConstant", - "diffuseconstant", - "dir", - "direction", - "disabled", - "discard", - "display", - "displaystyle", - "div", - "divergence", - "divide", - "divisor", - "dl", - "domain", - "domainofapplication", - "dominant-baseline", - "draggable", - "dur", - "dx", - "dy", - "edge", - "edgeMode", - "edgemode", - "elevation", - "ellipse", - "em", - "emptyset", - "enable-background", - "encoding", - "enctype", - "end", - "eq", - "equalcolumns", - "equalrows", - "equivalent", - "eulergamma", - "exists", - "exp", - "exponent", - "exponentiale", - "externalResourcesRequired", - "externalresourcesrequired", - "face", - "factorial", - 
"factorof", - "false", - "feBlend", - "feColorMatrix", - "feComponentTransfer", - "feComposite", - "feConvolveMatrix", - "feDiffuseLighting", - "feDisplacementMap", - "feDistantLight", - "feFlood", - "feFuncA", - "feFuncB", - "feFuncG", - "feFuncR", - "feGaussianBlur", - "feImage", - "feMerge", - "feMergeNode", - "feMorphology", - "feOffset", - "fePointLight", - "feSpecularLighting", - "feSpotLight", - "feTile", - "feTurbulence", - "feblend", - "fecolormatrix", - "fecomponenttransfer", - "fecomposite", - "feconvolvematrix", - "fediffuselighting", - "fedisplacementmap", - "fedistantlight", - "feflood", - "fefunca", - "fefuncb", - "fefuncg", - "fefuncr", - "fegaussianblur", - "feimage", - "femerge", - "femergenode", - "femorphology", - "fence", - "feoffset", - "fepointlight", - "fespecularlighting", - "fespotlight", - "fetile", - "feturbulence", - "fieldset", - "figcaption", - "figure", - "fill", - "fill-opacity", - "fill-rule", - "filter", - "filterRes", - "filterUnits", - "filterres", - "filterunits", - "flood-color", - "flood-opacity", - "floor", - "fn", - "font", - "font-face", - "font-face-format", - "font-face-name", - "font-face-src", - "font-face-uri", - "font-family", - "font-size", - "font-size-adjust", - "font-stretch", - "font-style", - "font-variant", - "font-weight", - "fontfamily", - "fontsize", - "fontstyle", - "fontweight", - "footer", - "for", - "forall", - "foreignObject", - "foreignobject", - "format", - "frameborder", - "framespacing", - "from", - "fx", - "fy", - "g", - "g1", - "g2", - "gcd", - "geq", - "glyph", - "glyph-name", - "glyph-orientation-horizontal", - "glyph-orientation-vertical", - "glyphRef", - "glyphref", - "grad", - "gradientTransform", - "gradientUnits", - "gradienttransform", - "gradientunits", - "groupalign", - "gt", - "handler", - "hanging", - "header", - "headers", - "height", - "hgroup", - "hidden", - "hidefocus", - "high", - "hkern", - "horiz-adv-x", - "horiz-origin-x", - "horiz-origin-y", - "hr", - "href", - "hreflang", - 
"hspace", - "http-equiv", - "i", - "icon", - "id", - "ident", - "ideographic", - "iframe", - "image", - "image-rendering", - "imaginary", - "imaginaryi", - "img", - "implies", - "in", - "in2", - "index", - "infinity", - "inputmode", - "ins", - "int", - "integers", - "intercept", - "intersect", - "interval", - "inverse", - "irrelevant", - "isindex", - "ismap", - "k", - "k1", - "k2", - "k3", - "k4", - "kbd", - "kernelMatrix", - "kernelUnitLength", - "kernelmatrix", - "kernelunitlength", - "kerning", - "keyPoints", - "keySplines", - "keyTimes", - "keygen", - "keypoints", - "keysplines", - "keytimes", - "label", - "lambda", - "lang", - "language", - "laplacian", - "largeop", - "lcm", - "legend", - "lengthAdjust", - "lengthadjust", - "leq", - "letter-spacing", - "lighting-color", - "limit", - "limitingConeAngle", - "limitingconeangle", - "line", - "linearGradient", - "lineargradient", - "linebreak", - "linethickness", - "list", - "listener", - "listing", - "ln", - "local", - "log", - "logbase", - "longdesc", - "loop", - "low", - "lowlimit", - "lowsrc", - "lquote", - "lspace", - "lt", - "macros", - "maction", - "main", - "maligngroup", - "malignmark", - "manifest", - "map", - "marginheight", - "marginwidth", - "mark", - "marker", - "marker-end", - "marker-mid", - "marker-start", - "markerHeight", - "markerUnits", - "markerWidth", - "markerheight", - "markerunits", - "markerwidth", - "mask", - "maskContentUnits", - "maskUnits", - "maskcontentunits", - "maskunits", - "math", - "mathbackground", - "mathcolor", - "mathematical", - "mathsize", - "mathvariant", - "matrix", - "matrixrow", - "max", - "maxlength", - "maxsize", - "mean", - "media", - "median", - "mediummathspace", - "menclose", - "menu", - "menuitem", - "merror", - "metadata", - "meter", - "method", - "mfenced", - "mfrac", - "mglyph", - "mi", - "min", - "minsize", - "minus", - "missing-glyph", - "mlabeledtr", - "mmultiscripts", - "mn", - "mo", - "mode", - "moment", - "momentabout", - "movablelimits", - "mover", - 
"mpadded", - "mpath", - "mphantom", - "mprescripts", - "mroot", - "mrow", - "ms", - "mspace", - "msqrt", - "mstyle", - "msub", - "msubsup", - "msup", - "mtable", - "mtd", - "mtext", - "mtr", - "multiple", - "munder", - "munderover", - "name", - "nargs", - "naturalnumbers", - "nav", - "neq", - "nest", - "nobr", - "noembed", - "nohref", - "none", - "noresize", - "noshade", - "not", - "notanumber", - "notation", - "notin", - "notprsubset", - "notsubset", - "nowrap", - "numOctaves", - "numoctaves", - "occurrence", - "offset", - "ol", - "onabort", - "onactivate", - "onafterprint", - "onafterupdate", - "onbefordeactivate", - "onbeforeactivate", - "onbeforecopy", - "onbeforecut", - "onbeforeeditfocus", - "onbeforepaste", - "onbeforeprint", - "onbeforeunload", - "onbeforeupdate", - "onbegin", - "onblur", - "onbounce", - "oncellchange", - "onchange", - "onclick", - "oncontextmenu", - "oncontrolselect", - "oncopy", - "oncut", - "ondataavailable", - "ondatasetchanged", - "ondatasetcomplete", - "ondblclick", - "ondeactivate", - "ondrag", - "ondragdrop", - "ondragend", - "ondragenter", - "ondragleave", - "ondragover", - "ondragstart", - "ondrop", - "onend", - "onerror", - "onerrorupdate", - "onfilterchange", - "onfinish", - "onfocus", - "onfocusin", - "onfocusout", - "onformchange", - "onforminput", - "onhelp", - "oninput", - "oninvalid", - "onkeydown", - "onkeypress", - "onkeyup", - "onload", - "onlosecapture", - "onmessage", - "onmousedown", - "onmouseenter", - "onmouseleave", - "onmousemove", - "onmouseout", - "onmouseover", - "onmouseup", - "onmousewheel", - "onmove", - "onmoveend", - "onmovestart", - "onpaste", - "onpropertychange", - "onreadystatechange", - "onrepeat", - "onreset", - "onresize", - "onrowenter", - "onrowexit", - "onrowsdelete", - "onrowsinserted", - "onscroll", - "onselect", - "onselectstart", - "onstart", - "onstop", - "onsubmit", - "onunload", - "onzoom", - "opacity", - "open", - "operator", - "optimum", - "or", - "order", - "orient", - "orientation", - 
"origin", - "other", - "otherwise", - "outerproduct", - "output", - "overflow", - "overline-position", - "overline-thickness", - "p", - "panose-1", - "partialdiff", - "path", - "pathLength", - "pathlength", - "pattern", - "patternContentUnits", - "patternTransform", - "patternUnits", - "patterncontentunits", - "patterntransform", - "patternunits", - "pi", - "piece", - "piecewise", - "ping", - "plus", - "pointer-events", - "points", - "pointsAtX", - "pointsAtY", - "pointsAtZ", - "pointsatx", - "pointsaty", - "pointsatz", - "polygon", - "polyline", - "poster", - "power", - "prefetch", - "preserveAlpha", - "preserveAspectRatio", - "preservealpha", - "preserveaspectratio", - "primes", - "primitiveUnits", - "primitiveunits", - "product", - "profile", - "progress", - "prompt", - "prsubset", - "q", - "quotient", - "r", - "radialGradient", - "radialgradient", - "radiogroup", - "radius", - "rationals", - "readonly", - "real", - "reals", - "rect", - "refX", - "refY", - "refx", - "refy", - "rel", - "reln", - "rem", - "rendering-intent", - "repeat", - "repeat-max", - "repeat-min", - "repeat-start", - "repeat-template", - "repeatCount", - "repeatDur", - "repeatcount", - "repeatdur", - "replace", - "required", - "requiredExtensions", - "requiredFeatures", - "requiredextensions", - "requiredfeatures", - "restart", - "result", - "rev", - "role", - "root", - "rotate", - "rowalign", - "rowlines", - "rows", - "rowspacing", - "rowspan", - "rquote", - "rspace", - "ruby", - "rule", - "rules", - "rx", - "ry", - "s", - "samp", - "sandbox", - "scalarproduct", - "scale", - "scheme", - "scope", - "scoped", - "scriptlevel", - "scriptminsize", - "scriptsizemultiplier", - "scrolldelay", - "scrolling", - "sdev", - "seamless", - "sec", - "sech", - "section", - "seed", - "selected", - "selection", - "selector", - "semantics", - "sep", - "separator", - "separators", - "set", - "setdiff", - "shape", - "shape-rendering", - "show", - "sin", - "sinh", - "size", - "slope", - "small", - "solidcolor", - 
"space", - "spacing", - "span", - "specification", - "specularConstant", - "specularExponent", - "specularconstant", - "specularexponent", - "speed", - "spreadMethod", - "spreadmethod", - "src", - "srcdoc", - "standby", - "start", - "startOffset", - "startoffset", - "stdDeviation", - "stddeviation", - "stemh", - "stemv", - "step", - "stitchTiles", - "stitchtiles", - "stop", - "stop-color", - "stop-opacity", - "stretchy", - "strike", - "strikethrough-position", - "strikethrough-thickness", - "string", - "stroke", - "stroke-dasharray", - "stroke-dashoffset", - "stroke-linecap", - "stroke-linejoin", - "stroke-miterlimit", - "stroke-opacity", - "stroke-width", - "strong", - "sub", - "subscriptshift", - "subset", - "sum", - "summary", - "sup", - "superscriptshift", - "surfaceScale", - "surfacescale", - "switch", - "symbol", - "symmetric", - "systemLanguage", - "systemlanguage", - "tabindex", - "tableValues", - "tablevalues", - "tan", - "tanh", - "target", - "targetX", - "targetY", - "targetx", - "targety", - "tbreak", - "tendsto", - "text", - "text-anchor", - "text-decoration", - "text-rendering", - "textLength", - "textPath", - "textlength", - "textpath", - "thickmathspace", - "thinmathspace", - "time", - "times", - "to", - "transform", - "transpose", - "tref", - "true", - "tspan", - "tt", - "type", - "u", - "u1", - "u2", - "ul", - "underline-position", - "underline-thickness", - "unicode", - "unicode-bidi", - "unicode-range", - "union", - "units-per-em", - "unselectable", - "uplimit", - "use", - "usemap", - "v-alphabetic", - "v-hanging", - "v-ideographic", - "v-mathematical", - "valign", - "value", - "values", - "valuetype", - "var", - "variance", - "vector", - "vectorproduct", - "version", - "vert-adv-y", - "vert-origin-x", - "vert-origin-y", - "verythickmathspace", - "verythinmathspace", - "veryverythickmathspace", - "veryverythinmathspace", - "video", - "view", - "viewBox", - "viewTarget", - "viewbox", - "viewtarget", - "visibility", - "vkern", - "vlink", - 
"vspace", - "wbr", - "when", - "width", - "widths", - "word-spacing", - "wrap", - "writing-mode", - "x", - "x-height", - "x1", - "x2", - "xChannelSelector", - "xchannelselector", - "xlink:actuate", - "xlink:arcrole", - "xlink:href", - "xlink:role", - "xlink:show", - "xlink:type", - "xml:base", - "xml:lang", - "xml:space", - "xmlns", - "xmlns:xlink", - "xor", - "xref", - "y", - "y1", - "y2", - "yChannelSelector", - "ychannelselector", - "z", - "zoomAndPan", - "zoomandpan", -]; diff --git a/macros/src/lib.rs b/macros/src/lib.rs deleted file mode 100644 index 24e160e..0000000 --- a/macros/src/lib.rs +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -#![crate_name="string_cache_macros"] -#![crate_type="dylib"] - -#![feature(macro_rules, plugin_registrar, quote, managed_boxes)] -#![allow(unused_imports)] // for quotes - -extern crate syntax; -extern crate rustc; - -use rustc::plugin::Registry; -use syntax::codemap::Span; -use syntax::ast::{TokenTree, TTTok}; -use syntax::ast; -use syntax::ext::base::{ExtCtxt, MacResult, MacExpr}; -use syntax::parse::token::{get_ident, InternedString, LIT_STR, IDENT}; - -use std::iter::Chain; -use std::slice::Items; -use std::gc::Gc; - -mod data; - -#[path="../../shared/static_atom.rs"] -mod static_atom; - -macro_rules! bail ( ($cx:expr, $sp:expr, $msg:expr) => ({ - $cx.span_err($sp, $msg); - return ::syntax::ext::base::DummyResult::any($sp); -})) - -macro_rules! bail_if ( ($e:expr, $cx:expr, $sp:expr, $msg:expr) => ( - if $e { bail!($cx, $sp, $msg) } -)) - -macro_rules! 
expect ( ($cx:expr, $sp:expr, $e:expr, $msg:expr) => ( - match $e { - Some(x) => x, - None => bail!($cx, $sp, $msg), - } -)) - -fn all_atoms<'a>() -> Chain, Items<'a, &'static str>> { - data::fast_set_atoms.iter().chain(data::other_atoms.iter()) -} - -// Build a PhfMap yielding static atom IDs. -// Takes no arguments. -fn expand_static_atom_map(cx: &mut ExtCtxt, sp: Span, tt: &[TokenTree]) -> Box { - bail_if!(tt.len() != 0, cx, sp, "Usage: static_atom_map!()"); - let tts: Vec = all_atoms().enumerate().flat_map(|(i, k)| { - let i = i as u32; - (quote_tokens!(&mut *cx, $k => $i,)).move_iter() - }).collect(); - MacExpr::new(quote_expr!(&mut *cx, phf_map!($tts))) -} - -// Build the array to convert IDs back to strings. -// FIXME: share storage with the PhfMap keys. -fn expand_static_atom_array(cx: &mut ExtCtxt, sp: Span, tt: &[TokenTree]) -> Box { - bail_if!(tt.len() != 0, cx, sp, "Usage: static_atom_array!()"); - let tts: Vec = all_atoms().flat_map(|k| - quote_tokens!(&mut *cx, $k,).move_iter() - ).collect(); - MacExpr::new(quote_expr!(&mut *cx, &[$tts])) -} - -fn atom_tok_to_str(t: &TokenTree) -> Option { - Some(get_ident(match *t { - TTTok(_, IDENT(s, _)) => s, - TTTok(_, LIT_STR(s)) => s.ident(), - _ => return None, - })) -} - -fn find_atom(name: InternedString) -> Option { - // Use bsearch instead of bsearch_elem because of type mismatch - // between &'t str and &'static str. - data::fast_set_atoms.bsearch(|&x| x.cmp(&name.get())).or_else(|| - data::other_atoms.bsearch(|&x| x.cmp(&name.get())).map(|i| i+64)) - -} - -struct AtomResult { - expr: Gc, - pat: Gc, -} - -impl MacResult for AtomResult { - fn make_expr(&self) -> Option> { - Some(self.expr) - } - - fn make_pat(&self) -> Option> { - Some(self.pat) - } -} - -// Translate `atom!(title)` or `atom!("font-weight")` into an `Atom` constant or pattern. 
-fn expand_atom(cx: &mut ExtCtxt, sp: Span, tt: &[TokenTree]) -> Box { - let usage = "Usage: atom!(html) or atom!(\"font-weight\")"; - let name = match tt { - [ref t] => expect!(cx, sp, atom_tok_to_str(t), usage), - _ => bail!(cx, sp, usage), - }; - - let i = expect!(cx, sp, find_atom(name.clone()), - format!("Unknown static atom {:s}", name.get()).as_slice()); - - let data = static_atom::add_tag(i as u32); - - box AtomResult { - expr: quote_expr!(&mut *cx, ::string_cache::atom::Atom { data: $data }), - pat: quote_pat!(&mut *cx, ::string_cache::atom::Atom { data: $data }), - } as Box -} - -// NB: This needs to be public or we get a linker error. -#[plugin_registrar] -pub fn plugin_registrar(reg: &mut Registry) { - reg.register_macro("static_atom_map", expand_static_atom_map); - reg.register_macro("static_atom_array", expand_static_atom_array); - reg.register_macro("atom", expand_atom); -} diff --git a/shared/static_atom.rs b/shared/static_atom.rs deleted file mode 100644 index e50e669..0000000 --- a/shared/static_atom.rs +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! This code is compiled into both the macros crate and the run-time -//! library, in order to guarantee consistency. - -#![allow(dead_code)] - -pub static STATIC_TAG: u8 = 2; - -static STATIC_SHIFT_BITS: uint = 32; - -#[inline(always)] -pub fn add_tag(atom_id: u32) -> u64 { - (atom_id as u64 << STATIC_SHIFT_BITS) | (STATIC_TAG as u64) -} - -/// Undefined to call this on a non-static atom! 
-#[inline(always)] -pub fn remove_tag(atom_data: u64) -> u32 { - (atom_data >> STATIC_SHIFT_BITS) as u32 -} diff --git a/src/atom.rs b/src/atom.rs index 07365e1..5a8aa7f 100644 --- a/src/atom.rs +++ b/src/atom.rs @@ -7,511 +7,409 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use phf::PhfMap; +use crate::dynamic_set::{dynamic_set, Entry}; +use crate::static_sets::StaticAtomSet; +use debug_unreachable::debug_unreachable; +use std::borrow::Cow; +use std::cmp::Ordering::{self, Equal}; use std::fmt; -use std::hash::{Hash, Hasher, sip}; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; use std::mem; -use std::ptr; +use std::num::NonZeroU64; +use std::ops; use std::slice; -use std::slice::bytes; use std::str; -use std::sync::atomics::{AtomicInt, SeqCst}; -use sync::Mutex; -use sync::one::{Once, ONCE_INIT}; -use std::rt::heap; - -#[path="../shared/static_atom.rs"] -mod static_atom; - -// Inline atoms are probably buggy on big-endian architectures. -#[allow(dead_code)] -#[static_assert] -static IS_LITTLE_ENDIAN: bool = cfg!(target_endian = "little"); - - -static mut global_string_cache_ptr: *mut Mutex = 0 as *mut _; - -static ENTRY_ALIGNMENT: uint = 16; - -// Macro-generated tables for static atoms. -static static_atom_map: PhfMap<&'static str, u32> = static_atom_map!(); -static static_atom_array: &'static [&'static str] = static_atom_array!(); - -// NOTE: Deriving Eq here implies that a given string must always -// be interned the same way. 
-#[repr(u8)] -#[deriving(Eq, PartialEq)] -enum AtomType { - Dynamic = 0, - Inline = 1, - Static = static_atom::STATIC_TAG, -} - -struct StringCache { - hasher: sip::SipHasher, - buckets: [*mut StringCacheEntry, ..4096], +use std::sync::atomic::Ordering::SeqCst; + +const DYNAMIC_TAG: u8 = 0b_00; +const INLINE_TAG: u8 = 0b_01; // len in upper nybble +const STATIC_TAG: u8 = 0b_10; +const TAG_MASK: u64 = 0b_11; +const LEN_OFFSET: u64 = 4; +const LEN_MASK: u64 = 0xF0; + +const MAX_INLINE_LEN: usize = 7; +const STATIC_SHIFT_BITS: usize = 32; + +/// Represents a string that has been interned. +/// +/// While the type definition for `Atom` indicates that it generic on a particular +/// implementation of an atom set, you don't need to worry about this. Atoms can be static +/// and come from a `StaticAtomSet` generated by the `string_cache_codegen` crate, or they +/// can be dynamic and created by you on an `EmptyStaticAtomSet`. +/// +/// `Atom` implements `Clone` but not `Copy`, since internally atoms are reference-counted; +/// this means that you may need to `.clone()` an atom to keep copies to it in different +/// places, or when passing it to a function that takes an `Atom` rather than an `&Atom`. +/// +/// ## Creating an atom at runtime +/// +/// If you use `string_cache_codegen` to generate a precomputed list of atoms, your code +/// may then do something like read data from somewhere and extract tokens that need to be +/// compared to the atoms. In this case, you can use `Atom::from(&str)` or +/// `Atom::from(String)`. These create a reference-counted atom which will be +/// automatically freed when all references to it are dropped. 
+/// +/// This means that your application can safely have a loop which tokenizes data, creates +/// atoms from the tokens, and compares the atoms to a predefined set of keywords, without +/// running the risk of arbitrary memory consumption from creating large numbers of atoms — +/// as long as your application does not store clones of the atoms it creates along the +/// way. +/// +/// For example, the following is safe and will not consume arbitrary amounts of memory: +/// +/// ```ignore +/// let untrusted_data = "large amounts of text ..."; +/// +/// for token in untrusted_data.split_whitespace() { +/// let atom = Atom::from(token); // interns the string +/// +/// if atom == Atom::from("keyword") { +/// // handle that keyword +/// } else if atom == Atom::from("another_keyword") { +/// // handle that keyword +/// } else { +/// println!("unknown keyword"); +/// } +/// } // atom is dropped here, so it is not kept around in memory +/// ``` +#[derive(PartialEq, Eq)] +// NOTE: Deriving PartialEq requires that a given string must always be interned the same way. +pub struct Atom { + unsafe_data: NonZeroU64, + phantom: PhantomData, } -struct StringCacheEntry { - next_in_bucket: *mut StringCacheEntry, - hash: u64, - ref_count: AtomicInt, - string: String, -} - -impl StringCacheEntry { - fn new(next: *mut StringCacheEntry, hash: u64, string_to_add: &str) -> StringCacheEntry { - StringCacheEntry { - next_in_bucket: next, - hash: hash, - ref_count: AtomicInt::new(1), - string: string_to_add.to_string(), - } +// This isn't really correct as the Atoms can technically take up space. 
But I guess it's ok +// as it is possible to measure the size of the atom set separately/ +#[cfg(feature = "malloc_size_of")] +impl malloc_size_of::MallocSizeOf for Atom { + fn size_of(&self, _ops: &mut malloc_size_of::MallocSizeOfOps) -> usize { + 0 } } -impl StringCache { - fn new() -> StringCache { - StringCache { - hasher: sip::SipHasher::new(), - buckets: unsafe { mem::zeroed() }, +// FIXME: bound removed from the struct definition before of this error for pack_static: +// "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable" +// https://github.com/rust-lang/rust/issues/57563 +impl Atom { + /// For the atom!() macros + #[inline(always)] + #[doc(hidden)] + pub const fn pack_static(n: u32) -> Self { + Self { + unsafe_data: unsafe { + // STATIC_TAG ensures this is non-zero + NonZeroU64::new_unchecked((STATIC_TAG as u64) | ((n as u64) << STATIC_SHIFT_BITS)) + }, + phantom: PhantomData, } } - fn add(&mut self, string_to_add: &str) -> u64 { - let hash = self.hasher.hash(&string_to_add); - let bucket_index = (hash & (self.buckets.len()-1) as u64) as uint; - let mut ptr = self.buckets[bucket_index]; - - while ptr != ptr::mut_null() { - let value = unsafe { &*ptr }; - if value.hash == hash && value.string.as_slice() == string_to_add { - break; - } - ptr = value.next_in_bucket; + /// For the atom!() macros + #[inline(always)] + #[doc(hidden)] + pub const fn pack_inline(mut n: u64, len: u8) -> Self { + if cfg!(target_endian = "big") { + // Reverse order of top 7 bytes. + // Bottom 8 bits of `n` are zero, and we need that to remain so. + // String data is stored in top 7 bytes, tag and length in bottom byte. 
+ n = n.to_le() << 8; } - if ptr == ptr::mut_null() { - unsafe { - ptr = heap::allocate(mem::size_of::(), ENTRY_ALIGNMENT) - as *mut StringCacheEntry; - ptr::write(ptr, - StringCacheEntry::new(self.buckets[bucket_index], hash, string_to_add)); - } - self.buckets[bucket_index] = ptr; - } else { - unsafe { - (*ptr).ref_count.fetch_add(1, SeqCst); - } + let data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET) | n; + Self { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, } - - assert!(ptr != ptr::mut_null()); - ptr as u64 } - fn remove(&mut self, key: u64) { - let ptr = key as *mut StringCacheEntry; - let value: &mut StringCacheEntry = unsafe { mem::transmute(ptr) }; - - if value.ref_count.fetch_sub(1, SeqCst) == 1 { - let bucket_index = (value.hash & (self.buckets.len()-1) as u64) as uint; - - let mut current = self.buckets[bucket_index]; - let mut prev: *mut StringCacheEntry = ptr::mut_null(); + fn tag(&self) -> u8 { + (self.unsafe_data.get() & TAG_MASK) as u8 + } +} - while current != ptr::mut_null() { - if current == ptr { - if prev != ptr::mut_null() { - unsafe { (*prev).next_in_bucket = (*current).next_in_bucket }; - } else { - unsafe { self.buckets[bucket_index] = (*current).next_in_bucket }; - } - break; - } - prev = current; - unsafe { current = (*current).next_in_bucket }; - } - assert!(current != ptr::mut_null()); +impl Atom { + /// Return the internal representation. For testing. + #[doc(hidden)] + pub fn unsafe_data(&self) -> u64 { + self.unsafe_data.get() + } - unsafe { - ptr::read(ptr as *const StringCacheEntry); - heap::deallocate(ptr as *mut u8, - mem::size_of::(), ENTRY_ALIGNMENT); - } - } + /// Return true if this is a static Atom. For testing. + #[doc(hidden)] + pub fn is_static(&self) -> bool { + self.tag() == STATIC_TAG } -} -#[deriving(Eq, Hash, PartialEq)] -pub struct Atom { - /// This field is public so that the `atom!()` macro can use it. 
- /// You should not otherwise access this field. - pub data: u64, -} + /// Return true if this is a dynamic Atom. For testing. + #[doc(hidden)] + pub fn is_dynamic(&self) -> bool { + self.tag() == DYNAMIC_TAG + } -impl Atom { - pub fn from_static(atom_id: u32) -> Atom { - Atom { - data: static_atom::add_tag(atom_id), - } + /// Return true if this is an inline Atom. For testing. + #[doc(hidden)] + pub fn is_inline(&self) -> bool { + self.tag() == INLINE_TAG } - pub fn from_slice(string_to_add: &str) -> Atom { - match static_atom_map.find_equiv(&string_to_add) { - Some(&atom_id) => { - Atom::from_static(atom_id) - }, - None => { - if string_to_add.len() < 8 { - Atom::from_inline(string_to_add) - } else { - Atom::from_dynamic(string_to_add) - } - } - } + fn static_index(&self) -> u64 { + self.unsafe_data.get() >> STATIC_SHIFT_BITS } - pub fn as_slice<'t>(&'t self) -> &'t str { - let (atom_type, string_len) = self.get_type_and_inline_len(); - let ptr = self as *const Atom as *const u8; - match atom_type { - Inline => { - unsafe { - let data = ptr.offset(1) as *const [u8, ..7]; - str::raw::from_utf8((*data).slice_to(string_len)) - } - }, - Static => { - *static_atom_array.get(static_atom::remove_tag(self.data) as uint) - .expect("bad static atom") - }, - Dynamic => { - let hash_value = unsafe { &*(self.data as *const StringCacheEntry) }; - hash_value.string.as_slice() + /// Get the hash of the string as it is stored in the set. + pub fn get_hash(&self) -> u32 { + match self.tag() { + DYNAMIC_TAG => { + let entry = self.unsafe_data.get() as *const Entry; + unsafe { (*entry).hash } } + STATIC_TAG => Static::get().hashes[self.static_index() as usize], + INLINE_TAG => { + let data = self.unsafe_data.get(); + // This may or may not be great... 
+ ((data >> 32) ^ data) as u32 + } + _ => unsafe { debug_unreachable!() }, } } - #[inline] - fn from_inline(string: &str) -> Atom { - assert!(string.len() < 8); - let mut string_data: u64 = 0; - unsafe { slice::raw::mut_buf_as_slice(&mut string_data as *mut u64 as *mut u8, 7, - |b| bytes::copy_memory(b, string.as_bytes())) }; - Atom { - data: (Inline as u64) | (string.len() as u64 << 4) | (string_data << 8), - } + pub fn try_static(string_to_add: &str) -> Option { + Self::try_static_internal(string_to_add).ok() } - #[inline] - fn from_dynamic(string: &str) -> Atom { - static mut START: Once = ONCE_INIT; + fn try_static_internal(string_to_add: &str) -> Result { + let static_set = Static::get(); + let hash = phf_shared::hash(&*string_to_add, &static_set.key); + let index = phf_shared::get_index(&hash, static_set.disps, static_set.atoms.len()); - unsafe { - START.doit(|| { - let cache = box Mutex::new(StringCache::new()); - global_string_cache_ptr = mem::transmute(cache); - }); - } - - let mut string_cache = unsafe { &*global_string_cache_ptr }.lock(); - let hash_value_address = string_cache.add(string); - Atom { - data: hash_value_address | Dynamic as u64 + if static_set.atoms[index as usize] == string_to_add { + Ok(Self::pack_static(index)) + } else { + Err(hash) } } +} +impl Default for Atom { #[inline] - fn get_type(&self) -> AtomType { - unsafe { mem::transmute((self.data & 0xf) as u8) } + fn default() -> Self { + Atom::pack_static(Static::empty_string_index()) } +} +impl Hash for Atom { #[inline] - fn get_type_and_inline_len(&self) -> (AtomType, uint) { - let atom_type = self.get_type(); - let len = match atom_type { - Static | Dynamic => 0, - Inline => ((self.data & 0xf0) >> 4) as uint - }; - (atom_type, len) + fn hash(&self, state: &mut H) + where + H: Hasher, + { + state.write_u32(self.get_hash()) } } -impl Clone for Atom { - fn clone(&self) -> Atom { - let atom_type = self.get_type(); - match atom_type { - Dynamic => { - let hash_value = unsafe { &mut 
*(self.data as *mut StringCacheEntry) }; - hash_value.ref_count.fetch_add(1, SeqCst); +impl<'a, Static: StaticAtomSet> From> for Atom { + fn from(string_to_add: Cow<'a, str>) -> Self { + let len = string_to_add.len(); + if len == 0 { + Self::pack_static(Static::empty_string_index()) + } else if len <= MAX_INLINE_LEN { + let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); + { + let dest = inline_atom_slice_mut(&mut data); + dest[..len].copy_from_slice(string_to_add.as_bytes()); + } + Atom { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, } - _ => {} + } else { + Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { + let ptr: std::ptr::NonNull = dynamic_set().insert(string_to_add, hash.g); + let data = ptr.as_ptr() as u64; + debug_assert!(0 == data & TAG_MASK); + Atom { + // The address of a ptr::NonNull is non-zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + }) } - Atom { - data: self.data + } +} + +impl Clone for Atom { + #[inline(always)] + fn clone(&self) -> Self { + if self.tag() == DYNAMIC_TAG { + let entry = self.unsafe_data.get() as *const Entry; + unsafe { &*entry }.ref_count.fetch_add(1, SeqCst); } + Atom { ..*self } } } -impl Drop for Atom { +impl Drop for Atom { + #[inline] fn drop(&mut self) { - match self.get_type() { - Dynamic => { - let mut string_cache = unsafe { &*global_string_cache_ptr }.lock(); - string_cache.remove(self.data); - }, - _ => {} + if self.tag() == DYNAMIC_TAG { + let entry = self.unsafe_data.get() as *const Entry; + if unsafe { &*entry }.ref_count.fetch_sub(1, SeqCst) == 1 { + drop_slow(self) + } + } + + // Out of line to guide inlining. 
+ fn drop_slow(this: &mut Atom) { + dynamic_set().remove(this.unsafe_data.get() as *mut Entry); } } } -impl fmt::Show for Atom { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Atom('{:s}' type={:?})", self.as_slice(), self.get_type()) +impl ops::Deref for Atom { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + unsafe { + match self.tag() { + DYNAMIC_TAG => { + let entry = self.unsafe_data.get() as *const Entry; + &(*entry).string + } + INLINE_TAG => { + let len = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET; + debug_assert!(len as usize <= MAX_INLINE_LEN); + let src = inline_atom_slice(&self.unsafe_data); + str::from_utf8_unchecked(src.get_unchecked(..(len as usize))) + } + STATIC_TAG => Static::get().atoms[self.static_index() as usize], + _ => debug_unreachable!(), + } + } } } -impl PartialOrd for Atom { - fn partial_cmp(&self, other: &Atom) -> Option { - self.data.partial_cmp(&other.data) +impl fmt::Debug for Atom { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let ty_str = unsafe { + match self.tag() { + DYNAMIC_TAG => "dynamic", + INLINE_TAG => "inline", + STATIC_TAG => "static", + _ => debug_unreachable!(), + } + }; + + write!(f, "Atom('{}' type={})", &*self, ty_str) } +} - fn lt(&self, other: &Atom) -> bool { - if self.data == other.data { - return false; +impl PartialOrd for Atom { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + if self.unsafe_data == other.unsafe_data { + return Some(Equal); } - self.as_slice() < other.as_slice() + self.as_ref().partial_cmp(other.as_ref()) } } -impl Ord for Atom { - fn cmp(&self, other: &Atom) -> Ordering { - if self.data == other.data { +impl Ord for Atom { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + if self.unsafe_data == other.unsafe_data { return Equal; } - self.as_slice().cmp(&other.as_slice()) + self.as_ref().cmp(other.as_ref()) } } -#[cfg(test)] -mod tests { - use std::task::spawn; - use super::{Atom, Static, Inline, 
Dynamic}; - use test::Bencher; - - #[test] - fn test_as_slice() { - let s0 = Atom::from_slice(""); - assert!(s0.as_slice() == ""); - - let s1 = Atom::from_slice("class"); - assert!(s1.as_slice() == "class"); - - let i0 = Atom::from_slice("blah"); - assert!(i0.as_slice() == "blah"); - - let s0 = Atom::from_slice("BLAH"); - assert!(s0.as_slice() == "BLAH"); - - let d0 = Atom::from_slice("zzzzzzzzzz"); - assert!(d0.as_slice() == "zzzzzzzzzz"); - - let d1 = Atom::from_slice("ZZZZZZZZZZ"); - assert!(d1.as_slice() == "ZZZZZZZZZZ"); - } - - #[test] - fn test_types() { - let s0 = Atom::from_slice(""); - assert!(s0.get_type_and_inline_len() == (Static, 0)); - - let s1 = Atom::from_slice("id"); - assert!(s1.get_type_and_inline_len() == (Static, 0)); - - let s1 = Atom::from_slice("body"); - assert!(s1.get_type_and_inline_len() == (Static, 0)); - - // "z" is a static atom - let i0 = Atom::from_slice("c"); - assert!(i0.get_type_and_inline_len() == (Inline, 1)); - - let i1 = Atom::from_slice("zz"); - assert!(i1.get_type_and_inline_len() == (Inline, 2)); - - let i2 = Atom::from_slice("zzz"); - assert!(i2.get_type_and_inline_len() == (Inline, 3)); - - let i3 = Atom::from_slice("zzzz"); - assert!(i3.get_type_and_inline_len() == (Inline, 4)); - - let i4 = Atom::from_slice("zzzzz"); - assert!(i4.get_type_and_inline_len() == (Inline, 5)); - - let i5 = Atom::from_slice("zzzzzz"); - assert!(i5.get_type_and_inline_len() == (Inline, 6)); - - let i6 = Atom::from_slice("zzzzzzz"); - assert!(i6.get_type_and_inline_len() == (Inline, 7)); - - let d0 = Atom::from_slice("zzzzzzzz"); - assert!(d0.get_type_and_inline_len() == (Dynamic, 0)); - - let d1 = Atom::from_slice("zzzzzzzzzzzzz"); - assert!(d1.get_type_and_inline_len() == (Dynamic, 0)); +// AsciiExt requires mutating methods, so we just implement the non-mutating ones. +// We don't need to implement is_ascii because there's no performance improvement +// over the one from &str. 
+impl Atom { + fn from_mutated_str(s: &str, f: F) -> Self { + let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit(); + let buffer = unsafe { &mut *buffer.as_mut_ptr() }; + + if let Some(buffer_prefix) = buffer.get_mut(..s.len()) { + buffer_prefix.copy_from_slice(s.as_bytes()); + let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) }; + f(as_str); + Atom::from(&*as_str) + } else { + let mut string = s.to_owned(); + f(&mut string); + Atom::from(string) + } } - #[test] - fn test_equality() { - let s0 = Atom::from_slice("fn"); - let s1 = Atom::from_slice("fn"); - let s2 = Atom::from_slice("loop"); - - let i0 = Atom::from_slice("blah"); - let i1 = Atom::from_slice("blah"); - let i2 = Atom::from_slice("blah2"); - - let d0 = Atom::from_slice("zzzzzzzz"); - let d1 = Atom::from_slice("zzzzzzzz"); - let d2 = Atom::from_slice("zzzzzzzzz"); - - assert!(s0 == s1); - assert!(s0 != s2); - - assert!(i0 == i1); - assert!(i0 != i2); - - assert!(d0 == d1); - assert!(d0 != d2); - - assert!(s0 != i0); - assert!(s0 != d0); - assert!(i0 != d0); + /// Like [`to_ascii_uppercase`]. + /// + /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase + pub fn to_ascii_uppercase(&self) -> Self { + for (i, b) in self.bytes().enumerate() { + if let b'a'..=b'z' = b { + return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase()); + } + } + self.clone() } - #[test] - fn ord() { - fn check(x: &str, y: &str) { - assert_eq!(x < y, Atom::from_slice(x) < Atom::from_slice(y)); - assert_eq!(x.cmp(&y), Atom::from_slice(x).cmp(&Atom::from_slice(y))); + /// Like [`to_ascii_lowercase`]. 
+ /// + /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase + pub fn to_ascii_lowercase(&self) -> Self { + for (i, b) in self.bytes().enumerate() { + if let b'A'..=b'Z' = b { + return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase()); + } } - - check("a", "body"); - check("asdf", "body"); - check("zasdf", "body"); - check("z", "body"); - - check("a", "bbbbb"); - check("asdf", "bbbbb"); - check("zasdf", "bbbbb"); - check("z", "bbbbb"); + self.clone() } - #[test] - fn clone() { - let s0 = Atom::from_slice("fn"); - let s1 = s0.clone(); - let s2 = Atom::from_slice("loop"); - - let i0 = Atom::from_slice("blah"); - let i1 = i0.clone(); - let i2 = Atom::from_slice("blah2"); - - let d0 = Atom::from_slice("zzzzzzzz"); - let d1 = d0.clone(); - let d2 = Atom::from_slice("zzzzzzzzz"); - - assert!(s0 == s1); - assert!(s0 != s2); - - assert!(i0 == i1); - assert!(i0 != i2); - - assert!(d0 == d1); - assert!(d0 != d2); - - assert!(s0 != i0); - assert!(s0 != d0); - assert!(i0 != d0); + /// Like [`eq_ignore_ascii_case`]. + /// + /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case + pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { + (self == other) || self.eq_str_ignore_ascii_case(&**other) } - #[test] - fn test_threads() { - for _ in range(0u32, 100u32) { - spawn(proc() { - let _ = Atom::from_slice("a dynamic string"); - let _ = Atom::from_slice("another string"); - }); - } + /// Like [`eq_ignore_ascii_case`], but takes an unhashed string as `other`. 
+ /// + /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case + pub fn eq_str_ignore_ascii_case(&self, other: &str) -> bool { + (&**self).eq_ignore_ascii_case(other) } +} - #[bench] - fn bench_strings(b: &mut Bencher) { - let mut strings0 = vec!(); - let mut strings1 = vec!(); - - for _ in range(0u32, 1000u32) { - strings0.push("a"); - strings1.push("b"); +#[inline(always)] +fn inline_atom_slice(x: &NonZeroU64) -> &[u8] { + let x: *const NonZeroU64 = x; + let mut data = x as *const u8; + // All except the lowest byte, which is first in little-endian, last in big-endian. + if cfg!(target_endian = "little") { + data = unsafe { data.offset(1) }; } + let len = 7; + unsafe { slice::from_raw_parts(data, len) } +} - let mut eq_count = 0u32; - - b.iter(|| { - for (s0, s1) in strings0.iter().zip(strings1.iter()) { - if s0 == s1 { - eq_count += 1; - } - } - }); - } - - #[bench] - fn bench_atoms(b: &mut Bencher) { - let mut atoms0 = vec!(); - let mut atoms1 = vec!(); - - for _ in range(0u32, 1000u32) { - atoms0.push(Atom::from_slice("a")); - atoms1.push(Atom::from_slice("b")); +#[inline(always)] +fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] { + let x: *mut u64 = x; + let mut data = x as *mut u8; + // All except the lowest byte, which is first in little-endian, last in big-endian. 
+ if cfg!(target_endian = "little") { + data = unsafe { data.offset(1) }; } - - let mut eq_count = 0u32; - - b.iter(|| { - for (a0, a1) in atoms0.iter().zip(atoms1.iter()) { - if a0 == a1 { - eq_count += 1; - } - } - }); - } - - #[test] - fn atom_macro() { - assert_eq!(atom!(body), Atom::from_slice("body")); - assert_eq!(atom!("body"), Atom::from_slice("body")); - assert_eq!(atom!("font-weight"), Atom::from_slice("font-weight")); - } - - #[test] - fn match_atom() { - assert_eq!(2u, match Atom::from_slice("head") { - atom!(br) => 1u, - atom!(html) | atom!(head) => 2u, - _ => 3u, - }); - - assert_eq!(3u, match Atom::from_slice("body") { - atom!(br) => 1u, - atom!(html) | atom!(head) => 2u, - _ => 3u, - }); - - assert_eq!(3u, match Atom::from_slice("zzzzzz") { - atom!(br) => 1u, - atom!(html) | atom!(head) => 2u, - _ => 3u, - }); - } + let len = 7; + unsafe { slice::from_raw_parts_mut(data, len) } } diff --git a/src/dynamic_set.rs b/src/dynamic_set.rs new file mode 100644 index 0000000..4442b4d --- /dev/null +++ b/src/dynamic_set.rs @@ -0,0 +1,112 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use parking_lot::Mutex; +use std::borrow::Cow; +use std::mem; +use std::ptr::NonNull; +use std::sync::atomic::AtomicIsize; +use std::sync::atomic::Ordering::SeqCst; +use std::sync::OnceLock; + +const NB_BUCKETS: usize = 1 << 12; // 4096 +const BUCKET_MASK: u32 = (1 << 12) - 1; + +pub(crate) struct Set { + buckets: Box<[Mutex>>]>, +} + +pub(crate) struct Entry { + pub(crate) string: Box, + pub(crate) hash: u32, + pub(crate) ref_count: AtomicIsize, + next_in_bucket: Option>, +} + +// Addresses are multiples of this, +// and therefore have TAG_MASK bits unset, available for tagging. 
+pub(crate) const ENTRY_ALIGNMENT: usize = 4; + +#[test] +fn entry_alignment_is_sufficient() { + assert!(mem::align_of::() >= ENTRY_ALIGNMENT); +} + +pub(crate) fn dynamic_set() -> &'static Set { + // NOTE: Using const initialization for buckets breaks the small-stack test. + // ``` + // // buckets: [Mutex>>; NB_BUCKETS], + // const MUTEX: Mutex>> = Mutex::new(None); + // let buckets = Box::new([MUTEX; NB_BUCKETS]); + // ``` + static DYNAMIC_SET: OnceLock = OnceLock::new(); + + DYNAMIC_SET.get_or_init(|| { + let buckets = (0..NB_BUCKETS).map(|_| Mutex::new(None)).collect(); + Set { buckets } + }) +} + +impl Set { + pub(crate) fn insert(&self, string: Cow, hash: u32) -> NonNull { + let bucket_index = (hash & BUCKET_MASK) as usize; + let mut linked_list = self.buckets[bucket_index].lock(); + + { + let mut ptr: Option<&mut Box> = linked_list.as_mut(); + + while let Some(entry) = ptr.take() { + if entry.hash == hash && *entry.string == *string { + if entry.ref_count.fetch_add(1, SeqCst) > 0 { + return NonNull::from(&mut **entry); + } + // Uh-oh. The pointer's reference count was zero, which means someone may try + // to free it. (Naive attempts to defend against this, for example having the + // destructor check to see whether the reference count is indeed zero, don't + // work due to ABA.) Thus we need to temporarily add a duplicate string to the + // list. 
+ entry.ref_count.fetch_sub(1, SeqCst); + break; + } + ptr = entry.next_in_bucket.as_mut(); + } + } + debug_assert!(mem::align_of::() >= ENTRY_ALIGNMENT); + let string = string.into_owned(); + let mut entry = Box::new(Entry { + next_in_bucket: linked_list.take(), + hash, + ref_count: AtomicIsize::new(1), + string: string.into_boxed_str(), + }); + let ptr = NonNull::from(&mut *entry); + *linked_list = Some(entry); + ptr + } + + pub(crate) fn remove(&self, ptr: *mut Entry) { + let value: &Entry = unsafe { &*ptr }; + let bucket_index = (value.hash & BUCKET_MASK) as usize; + + let mut linked_list = self.buckets[bucket_index].lock(); + debug_assert!(value.ref_count.load(SeqCst) == 0); + let mut current: &mut Option> = &mut linked_list; + + while let Some(entry_ptr) = current.as_mut() { + let entry_ptr: *mut Entry = &mut **entry_ptr; + if entry_ptr == ptr { + mem::drop(mem::replace(current, unsafe { + (*entry_ptr).next_in_bucket.take() + })); + break; + } + current = unsafe { &mut (*entry_ptr).next_in_bucket }; + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 3e50408..3cc29b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,33 +7,133 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![crate_name = "string_cache"] -#![crate_type = "rlib"] +//! +//! A library for interning things that are `AsRef<str>`. +//! +//! Some strings may be interned at compile time using the `string-cache-codegen` crate, or the +//! `EmptyStaticAtomSet` may be used that has no compile-time interned strings. An `Atom` is an +//! interned string for a given set (either `EmptyStaticAtomSet` or a generated `StaticAtomSet`). +//! +//! Generated `Atom`s will have associated macros to intern static strings at compile-time. +//! +//! # Examples +//! +//! Here are two examples, one with compile-time `Atom`s, and one without. +//! +//! ## With compile-time atoms +//! +//! In `Cargo.toml`: +//! ```toml +//! [dependencies] +//! string_cache = "0.9" +//! +//! 
[build-dependencies] +//! string_cache_codegen = "0.6" +//! ``` +//! +//! In `build.rs`: +//! +//! ```ignore +//! extern crate string_cache_codegen; +//! +//! use std::env; +//! use std::path::Path; +//! +//! fn main() { +//! string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!") +//! .atoms(&["foo", "bar"]) +//! .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs")) +//! .unwrap() +//! } +//! ``` +//! +//! In `lib.rs`: +//! +//! ```ignore +//! extern crate string_cache; +//! +//! mod foo { +//! include!(concat!(env!("OUT_DIR"), "/foo_atom.rs")); +//! } +//! +//! fn use_the_atom(t: &str) { +//! match *t { +//! foo_atom!("foo") => println!("Found foo!"), +//! foo_atom!("bar") => println!("Found bar!"), +//! // foo_atom!("baz") => println!("Found baz!"), - would be a compile time error +//! _ => { +//! println!("String not interned"); +//! // We can intern strings at runtime as well +//! foo::FooAtom::from(t) +//! } +//! } +//! } +//! ``` +//! +//! ## No compile-time atoms +//! +//! ``` +//! # extern crate string_cache; +//! use string_cache::DefaultAtom; +//! +//! # fn main() { +//! let mut interned_stuff = Vec::new(); +//! let text = "here is a sentence of text that will be tokenised and +//! interned and some repeated tokens is of text and"; +//! for word in text.split_whitespace() { +//! let seen_before = interned_stuff.iter() +//! // We can use impl PartialEq<T> where T is anything string-like +//! // to compare interned strings to either other interned strings +//! // or actual strings. Comparing two interned strings is very fast +//! // (normally a single cpu operation). +//! .filter(|interned_word| interned_word == &word) +//! .count(); +//! if seen_before > 0 { +//! println!(r#"Seen the word "{}" {} times"#, word, seen_before); +//! } else { +//! println!(r#"Not seen the word "{}" before"#, word); +//! } +//! // We use the impl From<(Cow<'a, str>, or &'a str, or String)> for +//! // Atom to intern a new string. +//! 
interned_stuff.push(DefaultAtom::from(word)); +//! } +//! # } +//! ``` +//! -#![feature(phase, macro_rules, default_type_params)] +#![cfg_attr(test, deny(warnings))] -extern crate sync; -extern crate debug; - -#[cfg(test)] -extern crate test; - -#[phase(plugin)] -extern crate phf_mac; -extern crate phf; +// Types, such as Atom, that impl Hash must follow the hash invariant: if two objects match +// with PartialEq, they must also have the same Hash. Clippy warns on types that derive one while +// manually impl-ing the other, because it seems easy for the two to drift apart, causing the +// invariant to be violated. +// +// But Atom is a newtype over NonZeroU64, and probably always will be, since cheap comparisons and +// copying are this library's purpose. So we know what the PartialEq comparison is going to do. +// +// The `get_hash` function, seen in `atom.rs`, consults that number, plus the global string interner +// tables. The only way for the resulting hash for two Atoms with the same inner 64-bit number to +// differ would be if the table entry changed between invocations, and that would be really bad. +#![allow(clippy::derive_hash_xor_eq)] -#[phase(plugin)] -extern crate string_cache_macros; +mod atom; +mod dynamic_set; +mod static_sets; +mod trivial_impls; pub use atom::Atom; +pub use static_sets::{EmptyStaticAtomSet, PhfStrSet, StaticAtomSet}; -pub mod atom; +/// Use this if you don’t care about static atoms. +pub type DefaultAtom = Atom; -// A private module so that macro-expanded idents like -// `::string_cache::atom::Atom` will also work in this crate. -// -// `libstd` uses the same trick. -#[doc(hidden)] -mod string_cache { - pub use atom; +// Some minor tests of internal layout here. +// See ../integration-tests for much more. + +/// Guard against accidental changes to the sizes of things. 
+#[test] +fn assert_sizes() { + use std::mem::size_of; + assert_eq!(size_of::(), 8); + assert_eq!(size_of::>(), size_of::(),); } diff --git a/src/static_sets.rs b/src/static_sets.rs new file mode 100644 index 0000000..f7f1799 --- /dev/null +++ b/src/static_sets.rs @@ -0,0 +1,64 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/// A static `PhfStrSet` +/// +/// This trait is implemented by static sets of interned strings generated using +/// `string_cache_codegen`, and `EmptyStaticAtomSet` for when strings will be added dynamically. +/// +/// It is used by the methods of [`Atom`] to check if a string is present in the static set. +/// +/// [`Atom`]: struct.Atom.html +pub trait StaticAtomSet: Ord { + /// Get the location of the static string set in the binary. + fn get() -> &'static PhfStrSet; + /// Get the index of the empty string, which is in every set and is used for `Atom::default`. + fn empty_string_index() -> u32; +} + +/// A string set created using a [perfect hash function], specifically +/// [Hash, Displace and Compress]. +/// +/// See the CHD document for the meaning of the struct fields. 
+/// +/// [perfect hash function]: https://en.wikipedia.org/wiki/Perfect_hash_function +/// [Hash, Displace and Compress]: http://cmph.sourceforge.net/papers/esa09.pdf +pub struct PhfStrSet { + #[doc(hidden)] + pub key: u64, + #[doc(hidden)] + pub disps: &'static [(u32, u32)], + #[doc(hidden)] + pub atoms: &'static [&'static str], + #[doc(hidden)] + pub hashes: &'static [u32], +} + +/// An empty static atom set for when only dynamic strings will be added +#[derive(PartialEq, Eq, PartialOrd, Ord)] +pub struct EmptyStaticAtomSet; + +impl StaticAtomSet for EmptyStaticAtomSet { + fn get() -> &'static PhfStrSet { + // The name is a lie: this set is not empty (it contains the empty string) + // but that’s only to avoid divisions by zero in rust-phf. + static SET: PhfStrSet = PhfStrSet { + key: 0, + disps: &[(0, 0)], + atoms: &[""], + // "" SipHash'd, and xored with u64_hash_to_u32. + hashes: &[0x3ddddef3], + }; + &SET + } + + fn empty_string_index() -> u32 { + 0 + } +} diff --git a/src/trivial_impls.rs b/src/trivial_impls.rs new file mode 100644 index 0000000..960dde0 --- /dev/null +++ b/src/trivial_impls.rs @@ -0,0 +1,119 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use crate::{Atom, StaticAtomSet}; +#[cfg(feature = "serde_support")] +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use std::borrow::Cow; +use std::fmt; + +impl ::precomputed_hash::PrecomputedHash for Atom { + fn precomputed_hash(&self) -> u32 { + self.get_hash() + } +} + +impl<'a, Static: StaticAtomSet> From<&'a Atom> for Atom { + fn from(atom: &'a Self) -> Self { + atom.clone() + } +} + +impl PartialEq for Atom { + fn eq(&self, other: &str) -> bool { + &self[..] 
== other + } +} + +impl PartialEq> for str { + fn eq(&self, other: &Atom) -> bool { + self == &other[..] + } +} + +impl PartialEq for Atom { + fn eq(&self, other: &String) -> bool { + self[..] == other[..] + } +} + +impl<'a, Static: StaticAtomSet> From<&'a str> for Atom { + #[inline] + fn from(string_to_add: &str) -> Self { + Atom::from(Cow::Borrowed(string_to_add)) + } +} + +impl From for Atom { + #[inline] + fn from(string_to_add: String) -> Self { + Atom::from(Cow::Owned(string_to_add)) + } +} + +impl fmt::Display for Atom { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + ::fmt(self, f) + } +} + +impl AsRef for Atom { + fn as_ref(&self) -> &str { + self + } +} + +#[cfg(feature = "serde_support")] +impl Serialize for Atom { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let string: &str = self.as_ref(); + string.serialize(serializer) + } +} + +#[cfg(feature = "serde_support")] +impl<'a, Static: StaticAtomSet> Deserialize<'a> for Atom { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'a>, + { + use serde::de; + use std::marker::PhantomData; + + struct AtomVisitor(PhantomData); + + impl<'de, Static: StaticAtomSet> de::Visitor<'de> for AtomVisitor { + type Value = Atom; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "an Atom") + } + + fn visit_str(self, v: &str) -> Result + where + E: de::Error, + { + Ok(Atom::from(v)) + } + + fn visit_string(self, v: String) -> Result + where + E: de::Error, + { + Ok(Atom::from(v)) + } + } + + deserializer.deserialize_str(AtomVisitor(PhantomData)) + } +} diff --git a/string-cache-codegen/Cargo.toml b/string-cache-codegen/Cargo.toml new file mode 100644 index 0000000..20eced9 --- /dev/null +++ b/string-cache-codegen/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "string_cache_codegen" +version = "0.6.1" # Also update ../README.md when making a semver-breaking change +authors = [ "The Servo Project Developers" ] 
+description = "A codegen library for string-cache, developed as part of the Servo project." +license = "MIT OR Apache-2.0" +repository = "https://github.com/servo/string-cache" +documentation = "https://docs.rs/string_cache_codegen/" +edition = "2018" + +[lib] +name = "string_cache_codegen" +path = "lib.rs" + +[dependencies] +phf_generator = "0.13" +phf_shared = "0.13" +proc-macro2 = "1" +quote = "1" diff --git a/string-cache-codegen/LICENSE-APACHE b/string-cache-codegen/LICENSE-APACHE new file mode 100644 index 0000000..16fe87b --- /dev/null +++ b/string-cache-codegen/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/string-cache-codegen/LICENSE-MIT b/string-cache-codegen/LICENSE-MIT new file mode 100644 index 0000000..807526f --- /dev/null +++ b/string-cache-codegen/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2012-2013 Mozilla Foundation + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/string-cache-codegen/lib.rs b/string-cache-codegen/lib.rs new file mode 100644 index 0000000..525ef3a --- /dev/null +++ b/string-cache-codegen/lib.rs @@ -0,0 +1,393 @@ +// Copyright 2016 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +//! A crate to create static string caches at compile time. +//! +//! # Examples +//! +//! With static atoms: +//! +//! In `Cargo.toml`: +//! +//! ```toml +//! [package] +//! build = "build.rs" +//! +//! [dependencies] +//! string_cache = "0.9" +//! +//! [build-dependencies] +//! string_cache_codegen = "0.6" +//! ``` +//! +//! In `build.rs`: +//! +//! ```no_run +//! extern crate string_cache_codegen; +//! +//! use std::env; +//! use std::path::Path; +//! +//! fn main() { +//! string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!") +//! .atoms(&["foo", "bar"]) +//! .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs")) +//! .unwrap() +//! } +//! ``` +//! +//! In `lib.rs`: +//! +//! ```ignore +//! extern crate string_cache; +//! +//! mod foo { +//! include!(concat!(env!("OUT_DIR"), "/foo_atom.rs")); +//! } +//! ``` +//! +//! The generated code will define a `FooAtom` type and a `foo_atom!` macro. +//! The macro can be used in expressions or patterns, with strings listed in `build.rs`. +//! For example: +//! +//! ```ignore +//! fn compute_something(input: &foo::FooAtom) -> u32 { +//! match *input { +//! foo_atom!("foo") => 1, +//! foo_atom!("bar") => 2, +//! _ => 3, +//! } +//! } +//! ``` +//! 
+
+#![recursion_limit = "128"]
+
+use proc_macro2::Ident;
+use quote::quote;
+use std::collections::BTreeSet;
+use std::fs::File;
+use std::io::{self, BufWriter, Write};
+use std::path::Path;
+
+/// A builder for a static atom set and relevant macros
+pub struct AtomType {
+    path: String,
+    atom_doc: Option<String>,
+    static_set_doc: Option<String>,
+    macro_name: String,
+    macro_doc: Option<String>,
+    atoms: BTreeSet<String>,
+}
+
+impl AtomType {
+    /// Constructs a new static atom set builder
+    ///
+    /// `path` is a path within a crate of the atom type that will be created.
+    /// e.g. `"FooAtom"` at the crate root or `"foo::Atom"` if the generated code
+    /// is included in a `foo` module.
+    ///
+    /// `macro_name` must end with `!`.
+    ///
+    /// For example, `AtomType::new("foo::FooAtom", "foo_atom!")` will generate:
+    ///
+    /// ```ignore
+    /// pub type FooAtom = ::string_cache::Atom<FooAtomStaticSet>;
+    /// pub struct FooAtomStaticSet;
+    /// impl ::string_cache::StaticAtomSet for FooAtomStaticSet {
+    ///     // ...
+    /// }
+    /// #[macro_export]
+    /// macro_rules! foo_atom {
+    ///     // Expands to: $crate::foo::FooAtom { … }
+    /// }
+    /// ```
+    ///
+    /// # Panics
+    ///
+    /// Panics if `macro_name` does not end with `!`.
+    pub fn new(path: &str, macro_name: &str) -> Self {
+        let macro_name = macro_name
+            .strip_suffix('!')
+            .expect("`macro_name` must end with '!'");
+        AtomType {
+            path: path.to_owned(),
+            macro_name: macro_name.to_owned(),
+            atom_doc: None,
+            static_set_doc: None,
+            macro_doc: None,
+            atoms: BTreeSet::new(),
+        }
+    }
+
+    /// Add some documentation to the generated Atom type alias.
+    ///
+    /// This can help the user know that the type uses interned strings.
+    ///
+    /// Note that `docs` should not contain the `///` at the front of normal docs.
+    pub fn with_atom_doc(&mut self, docs: &str) -> &mut Self {
+        self.atom_doc = Some(docs.to_owned());
+        self
+    }
+
+    /// Add some documentation to the generated static set.
+    ///
+    /// This can help the user know that this type is zero-sized and just references a static
+    /// lookup table, or point them to the `Atom` type alias for more info.
+    ///
+    /// Note that `docs` should not contain the `///` at the front of normal docs.
+    pub fn with_static_set_doc(&mut self, docs: &str) -> &mut Self {
+        self.static_set_doc = Some(docs.to_owned());
+        self
+    }
+
+    /// Add some documentation to the generated macro.
+    ///
+    /// Note that `docs` should not contain the `///` at the front of normal docs.
+    pub fn with_macro_doc(&mut self, docs: &str) -> &mut Self {
+        self.macro_doc = Some(docs.to_owned());
+        self
+    }
+
+    /// Adds an atom to the builder
+    pub fn atom(&mut self, s: &str) -> &mut Self {
+        self.atoms.insert(s.to_owned());
+        self
+    }
+
+    /// Adds multiple atoms to the builder
+    pub fn atoms<I>(&mut self, iter: I) -> &mut Self
+    where
+        I: IntoIterator,
+        I::Item: AsRef<str>,
+    {
+        self.atoms
+            .extend(iter.into_iter().map(|s| s.as_ref().to_owned()));
+        self
+    }
+
+    /// Write generated code to `destination`.
+    ///
+    /// The writer is flushed before returning, so that an error from a buffered
+    /// writer is reported to the caller instead of being lost when it is dropped.
+    pub fn write_to<W>(&mut self, mut destination: W) -> io::Result<()>
+    where
+        W: Write,
+    {
+        destination.write_all(self.to_formatted_string().as_bytes())?;
+        destination.flush()
+    }
+
+    /// Write generated code to destination [`Vec<u8>`] and return it as [`String`]
+    ///
+    /// Used mostly for testing or displaying a value.
+    #[cfg(test)]
+    pub fn write_to_string(&mut self, mut destination: Vec<u8>) -> io::Result<String> {
+        destination.write_all(self.to_formatted_string().as_bytes())?;
+        String::from_utf8(destination)
+            .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))
+    }
+
+    /// Renders the generated code as text.
+    ///
+    /// Some newlines are inserted to make the generated code slightly easier to read.
+    fn to_formatted_string(&mut self) -> String {
+        self.to_tokens()
+            .to_string()
+            .replace(" [ \"", "[\n\"")
+            .replace("\" , ", "\",\n")
+            .replace(" ( \"", "\n( \"")
+            .replace("; ", ";\n")
+    }
+
+    /// Builds the tokens for the type alias, the static set, the constants and the macro.
+    fn to_tokens(&mut self) -> proc_macro2::TokenStream {
+        fn new_term(string: &str) -> Ident {
+            Ident::new(string, proc_macro2::Span::call_site())
+        }
+
+        // Atoms accepted here also get a bare-identifier macro arm,
+        // in addition to the string literal arm every atom gets.
+        fn is_valid_ident(name: &str) -> bool {
+            let begins_with_letter_or_underscore = name
+                .chars()
+                .next()
+                .is_some_and(|c| c.is_alphabetic() || c == '_');
+            let is_alphanumeric = name.chars().all(|c| c.is_alphanumeric() || c == '_');
+
+            begins_with_letter_or_underscore && is_alphanumeric
+        }
+
+        // `impl Default for Atom` requires the empty string to be in the static set.
+        // This also makes sure the set is non-empty;
+        // an empty set would cause divisions by zero in rust-phf.
+        self.atoms.insert(String::new());
+
+        // Strings over 7 bytes + empty string added to static set.
+        // Otherwise stored inline.
+        let (static_strs, inline_strs): (Vec<_>, Vec<_>) = self
+            .atoms
+            .iter()
+            .map(String::as_str)
+            .partition(|s| s.len() > 7 || s.is_empty());
+
+        // Static strings
+        let hash_state = phf_generator::generate_hash(&static_strs);
+        let phf_generator::HashState { key, disps, map } = hash_state;
+        let (disps0, disps1): (Vec<_>, Vec<_>) = disps.into_iter().unzip();
+        let atoms: Vec<&str> = map.iter().map(|&idx| static_strs[idx]).collect();
+        let empty_string_index = atoms
+            .iter()
+            .position(|s| s.is_empty())
+            .expect("the empty string is always in the static set") as u32;
+        let indices = 0..atoms.len() as u32;
+
+        let atoms_for_idents: Vec<&str> = atoms
+            .iter()
+            .copied()
+            .filter(|x| is_valid_ident(x))
+            .collect();
+        let atom_idents: Vec<Ident> = atoms_for_idents.iter().map(|atom| new_term(atom)).collect();
+
+        let istrs_for_idents: Vec<&str> = inline_strs
+            .iter()
+            .copied()
+            .filter(|x| is_valid_ident(x))
+            .collect();
+        let istr_idents: Vec<Ident> = istrs_for_idents.iter().map(|atom| new_term(atom)).collect();
+
+        let hashes: Vec<u32> = atoms
+            .iter()
+            .map(|string| {
+                let hash = phf_shared::hash(string, &key);
+                (hash.g ^ hash.f1) as u32
+            })
+            .collect();
+
+        let (module, type_name) = match self.path.rsplit_once("::") {
+            Some((module, type_name)) => (format!("$crate::{}", module), type_name),
+            None => ("$crate".to_owned(), self.path.as_str()),
+        };
+        let doc_attr = |doc: Option<&String>| doc.map(|doc| quote!(#[doc = #doc]));
+        let atom_doc = doc_attr(self.atom_doc.as_ref());
+        let static_set_doc = doc_attr(self.static_set_doc.as_ref());
+        let macro_doc = doc_attr(self.macro_doc.as_ref());
+
+        let static_set_name = new_term(&format!("{}StaticSet", type_name));
+        let type_name = new_term(type_name);
+        let macro_name = new_term(&self.macro_name);
+        let module = module.parse::<proc_macro2::TokenStream>().unwrap();
+        let atom_prefix = format!("ATOM_{}_", type_name.to_string().to_uppercase());
+        let new_const_name = |atom: &str| {
+            let mut name = atom_prefix.clone();
+            for c in atom.chars() {
+                name.push_str(&format!("_{:02X}", c as u32))
+            }
+            new_term(&name)
+        };
+        let const_names: Vec<_> = atoms.iter().copied().map(new_const_name).collect();
+        let ident_const_names: Vec<_> = atoms_for_idents
+            .iter()
+            .copied()
+            .map(new_const_name)
+            .collect();
+        let ident_inline_const_names: Vec<_> = istrs_for_idents
+            .iter()
+            .copied()
+            .map(new_const_name)
+            .collect();
+
+        // Inline strings
+        let (inline_const_names, inline_values_and_lengths): (Vec<_>, Vec<_>) = inline_strs
+            .iter()
+            .map(|s| {
+                let const_name = new_const_name(s);
+                let mut value = 0u64;
+                for (index, c) in s.bytes().enumerate() {
+                    value |= u64::from(c) << (index * 8 + 8);
+                }
+                let len = s.len() as u8;
+
+                (const_name, (value, len))
+            })
+            .unzip();
+        let (inline_values, inline_lengths): (Vec<_>, Vec<_>) =
+            inline_values_and_lengths.into_iter().unzip();
+
+        quote! {
+            #atom_doc
+            pub type #type_name = ::string_cache::Atom<#static_set_name>;
+
+            #static_set_doc
+            #[derive(PartialEq, Eq, PartialOrd, Ord)]
+            pub struct #static_set_name;
+
+            impl ::string_cache::StaticAtomSet for #static_set_name {
+                fn get() -> &'static ::string_cache::PhfStrSet {
+                    static SET: ::string_cache::PhfStrSet = ::string_cache::PhfStrSet {
+                        key: #key,
+                        disps: &[#((#disps0, #disps1)),*],
+                        atoms: &[#(#atoms),*],
+                        hashes: &[#(#hashes),*]
+                    };
+                    &SET
+                }
+                fn empty_string_index() -> u32 {
+                    #empty_string_index
+                }
+            }
+
+            #(
+                pub const #const_names: #type_name = #type_name::pack_static(#indices);
+            )*
+            #(
+                pub const #inline_const_names: #type_name = #type_name::pack_inline(#inline_values, #inline_lengths);
+            )*
+
+            #macro_doc
+            #[macro_export]
+            macro_rules! #macro_name {
+                #(
+                    (#atoms) => { #module::#const_names };
+                )*
+                #(
+                    (#inline_strs) => { #module::#inline_const_names };
+                )*
+                #(
+                    (#atom_idents) => { #module::#ident_const_names };
+                )*
+                #(
+                    (#istr_idents) => { #module::#ident_inline_const_names };
+                )*
+            }
+        }
+    }
+
+    /// Create a new file at `path` and write generated code there.
+    ///
+    /// Typical usage:
+    /// `.write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs"))`
+    pub fn write_to_file(&mut self, path: &Path) -> io::Result<()> {
+        self.write_to(BufWriter::new(File::create(path)?))
+    }
+}
+
+#[test]
+fn test_iteration_order() {
+    let x1 = crate::AtomType::new("foo::Atom", "foo_atom!")
+        .atoms(&["x", "xlink", "svg", "test"])
+        .write_to_string(Vec::new())
+        .expect("write to string cache x1");
+
+    let x2 = crate::AtomType::new("foo::Atom", "foo_atom!")
+        .atoms(&["x", "xlink", "svg", "test"])
+        .write_to_string(Vec::new())
+        .expect("write to string cache x2");
+
+    assert_eq!(x1, x2);
+}
diff --git a/tests/small-stack.rs b/tests/small-stack.rs
new file mode 100644
index 0000000..bb607af
--- /dev/null
+++ b/tests/small-stack.rs
@@ -0,0 +1,17 @@
+// Regression test for https://github.com/servo/html5ever/issues/393
+//
+// Create a dynamic atom − causing initialization of the global hash map −
+// in a thread that has a small stack.
+//
+// This is a separate test program rather than a `#[test] fn` among others
+// to make sure that nothing else has already initialized the map in this process.
+fn main() {
+    std::thread::Builder::new()
+        .stack_size(50_000)
+        .spawn(|| {
+            let _atom = string_cache::DefaultAtom::from("12345678");
+        })
+        .unwrap()
+        .join()
+        .unwrap()
+}