diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..74ade77 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,65 @@ +name: CI +on: + push: + branches: ["main"] + pull_request: + merge_group: + types: [checks_requested] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +env: + RUST_BACKTRACE: 1 + SHELL: /bin/bash + +jobs: + ci: + name: Build and Test + runs-on: ubuntu-latest + + strategy: + matrix: + rust: [1.70.0, nightly, beta, stable] + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 2 + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + default: true + override: true + - name: Build + run: | + cargo build --no-default-features + cargo build + cargo build --features malloc_size_of + - uses: actions-rs/cargo@v1 + with: + command: test + args: --all + - name: Build codegen + run: | + cd string-cache-codegen && cargo build && cd .. + + if [ ${{ matrix.rust }} = nightly ]; then + cd integration-tests && cargo test --features unstable && cd ..; + fi + + + build_result: + name: Result + runs-on: ubuntu-latest + needs: + - "ci" + + steps: + - name: Mark the job as successful + run: exit 0 + if: success() + - name: Mark the job as unsuccessful + run: exit 1 + if: "!success()" diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e99d9a3..0000000 --- a/.travis.yml +++ /dev/null @@ -1,19 +0,0 @@ -sudo: false -language: rust -rust: - - nightly - - beta - - stable -os: - - linux - - osx -script: - - cargo build - - cargo test - - cargo test --features log-events - - "if [ $TRAVIS_RUST_VERSION = nightly ]; then cargo test --features unstable; fi" - - "if [ $TRAVIS_RUST_VERSION = nightly ]; then cargo test --features heap_size; fi" - - "cd examples/event-log/ && cargo build && cd ../.." - - "cd examples/summarize-events/ && cargo build && cd ../.." 
-notifications: - webhooks: http://build.servo.org:54856/travis diff --git a/Cargo.toml b/Cargo.toml index 1aa67e4..e73215e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,53 +1,41 @@ [package] - name = "string_cache" -version = "0.2.15" -authors = [ "The Servo Project Developers" ] +version = "0.9.0" # Also update README.md when making a semver-breaking change +authors = ["The Servo Project Developers"] description = "A string interning library for Rust, developed as part of the Servo project." -license = "MIT / Apache-2.0" +license = "MIT OR Apache-2.0" repository = "https://github.com/servo/string-cache" -documentation = "http://doc.servo.org/string_cache/" -build = "build.rs" +documentation = "https://docs.rs/string_cache" +edition = "2018" +rust-version = "1.70.0" + +# Do not `exclude` ./string-cache-codegen because we want to include +# ./string-cache-codegen/shared.rs, and `include` is a pain to use +# (It has to be exhaustive.) +# This means that packages for this crate include some unused files, +# but they’re not too big so that shouldn’t be a problem. [lib] name = "string_cache" -# https://github.com/rust-lang/cargo/issues/1512 -doctest = false - [features] - -# Enable event logging for generating benchmark traces. -# See examples/event-log. -log-events = ["rustc-serialize"] - -# Use unstable features to optimize space and time (memory and CPU usage). 
-unstable = [] - -# HeapSizeOf support -heap_size = ["heapsize", "heapsize_plugin"] +serde_support = ["serde"] +default = ["serde_support"] [dependencies] -lazy_static = "0.2" -serde = ">=0.6, <0.8" -phf_shared = "0.7.4" -debug_unreachable = "0.1.1" - -[dev-dependencies] -rand = "0.3" - -[dependencies.rustc-serialize] -version = "0.3" -optional = true - -[dependencies.heapsize] -version = ">=0.1.1, <0.4" -optional = true - -[dependencies.heapsize_plugin] -version = "0.1.4" -optional = true - -[build-dependencies] -phf_generator = "0.7.4" -phf_shared = "0.7.4" +precomputed-hash = "0.1" +serde = { version = "1", optional = true } +malloc_size_of = { version = "0.1", default-features = false, optional = true } +phf_shared = "0.13" +new_debug_unreachable = "1.0.2" +parking_lot = "0.12" + +[[test]] +name = "small-stack" +harness = false + +[workspace] +members = [ + "string-cache-codegen", + "integration-tests", +] diff --git a/README.md b/README.md index f18a7dd..429d1ec 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,78 @@ # string-cache -[![Build Status](https://travis-ci.org/servo/string-cache.svg?branch=master)](https://travis-ci.org/servo/string-cache) +[![Build Status](https://github.com/servo/string-cache/actions/workflows/ci.yml/badge.svg)](https://github.com/servo/string-cache/actions) -[Documentation](http://doc.servo.org/string_cache/) +[Documentation](https://docs.rs/string_cache/) A string interning library for Rust, developed as part of the [Servo](https://github.com/servo/servo) project. 
+ +## Simple usage + +In `Cargo.toml`: + +```toml +[dependencies] +string_cache = "0.9" +``` + +In `lib.rs`: + +```rust +extern crate string_cache; +use string_cache::DefaultAtom as Atom; +``` + +## With static atoms + +In `Cargo.toml`: + +```toml +[package] +build = "build.rs" + +[dependencies] +string_cache = "0.9" + +[build-dependencies] +string_cache_codegen = "0.6" +``` + +In `build.rs`: + +```rust +extern crate string_cache_codegen; + +use std::env; +use std::path::Path; + +fn main() { + string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!") + .atoms(&["foo", "bar"]) + .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs")) + .unwrap() +} +``` + +In `lib.rs`: + +```rust +extern crate string_cache; + +mod foo { + include!(concat!(env!("OUT_DIR"), "/foo_atom.rs")); +} +``` + +The generated code will define a `FooAtom` type and a `foo_atom!` macro. +The macro can be used in expression or patterns, with strings listed in `build.rs`. +For example: + +```rust +fn compute_something(input: &foo::FooAtom) -> u32 { + match *input { + foo_atom!("foo") => 1, + foo_atom!("bar") => 2, + _ => 3, + } +} +``` diff --git a/build.rs b/build.rs deleted file mode 100644 index 7571868..0000000 --- a/build.rs +++ /dev/null @@ -1,73 +0,0 @@ -extern crate phf_shared; -extern crate phf_generator; - -#[path = "src/shared.rs"] #[allow(dead_code)] mod shared; -#[path = "src/static_atom_list.rs"] mod static_atom_list; - -use std::env; -use std::fs::File; -use std::io::{BufWriter, Write}; -use std::mem; -use std::path::Path; -use std::slice; - -fn main() { - let hash_state = generate(); - write_static_atom_set(&hash_state); - write_atom_macro(&hash_state); -} - -fn generate() -> phf_generator::HashState { - let mut set = std::collections::HashSet::new(); - for atom in static_atom_list::ATOMS { - if !set.insert(atom) { - panic!("duplicate static atom `{:?}`", atom); - } - } - phf_generator::generate_hash(static_atom_list::ATOMS) -} - -fn 
write_static_atom_set(hash_state: &phf_generator::HashState) { - let path = Path::new(&std::env::var("OUT_DIR").unwrap()).join("static_atom_set.rs"); - let mut file = BufWriter::new(File::create(&path).unwrap()); - macro_rules! w { - ($($arg: expr),+) => { (writeln!(&mut file, $($arg),+).unwrap()) } - } - w!("pub static STATIC_ATOM_SET: StaticAtomSet = StaticAtomSet {{"); - w!(" key: {},", hash_state.key); - w!(" disps: &["); - for &(d1, d2) in &hash_state.disps { - w!(" ({}, {}),", d1, d2); - } - w!(" ],"); - w!(" atoms: &["); - for &idx in &hash_state.map { - w!(" {:?},", static_atom_list::ATOMS[idx]); - } - w!(" ],"); - w!("}};"); -} - -fn write_atom_macro(hash_state: &phf_generator::HashState) { - let set = shared::StaticAtomSet { - key: hash_state.key, - disps: leak(hash_state.disps.clone()), - atoms: leak(hash_state.map.iter().map(|&idx| static_atom_list::ATOMS[idx]).collect()), - }; - - let path = Path::new(&env::var("OUT_DIR").unwrap()).join("atom_macro.rs"); - let mut file = BufWriter::new(File::create(&path).unwrap()); - writeln!(file, r"#[macro_export]").unwrap(); - writeln!(file, r"macro_rules! atom {{").unwrap(); - for &s in set.iter() { - let data = shared::pack_static(set.get_index_or_hash(s).unwrap() as u32); - writeln!(file, r"({:?}) => {{ $crate::Atom {{ data: 0x{:x} }} }};", s, data).unwrap(); - } - writeln!(file, r"}}").unwrap(); -} - -fn leak(v: Vec) -> &'static [T] { - let slice = unsafe { slice::from_raw_parts(v.as_ptr(), v.len()) }; - mem::forget(v); - slice -} diff --git a/examples/event-log/Cargo.toml b/examples/event-log/Cargo.toml deleted file mode 100644 index 7edd5cf..0000000 --- a/examples/event-log/Cargo.toml +++ /dev/null @@ -1,9 +0,0 @@ -[package] - -name = "string-cache-event-log-example" -version = "0.0.0" -authors = [ "The Servo Project Developers" ] - -[dependencies.string_cache] -path = "../.." 
-features = ["log-events"] diff --git a/examples/event-log/README.md b/examples/event-log/README.md deleted file mode 100644 index b2deb39..0000000 --- a/examples/event-log/README.md +++ /dev/null @@ -1,4 +0,0 @@ -string-cache can record logs of what it's doing, which can be useful for -guiding future changes to the library. This project demonstrates how to build -string-cache with logging enabled (see `Cargo.toml`), and how to access the log -at runtime. diff --git a/examples/event-log/src/main.rs b/examples/event-log/src/main.rs deleted file mode 100644 index 89adfdf..0000000 --- a/examples/event-log/src/main.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -extern crate string_cache; - -use string_cache::Atom; -use string_cache::event; - -use std::io; -use std::io::prelude::*; - -fn main() { - println!("Reading stdin to end of file"); - let mut stdin = String::new(); - io::stdin().read_to_string(&mut stdin).unwrap(); - let mut atoms = vec![]; - for word in stdin.split(|c: char| c.is_whitespace()) { - atoms.push(Atom::from(word)); - } - - let log = event::LOG.lock().unwrap(); - - println!("Created {} atoms, logged {} events:", atoms.len(), log.len()); - for e in log.iter() { - println!("{:?}", e); - } -} diff --git a/examples/simple.rs b/examples/simple.rs new file mode 100644 index 0000000..f063b06 --- /dev/null +++ b/examples/simple.rs @@ -0,0 +1,26 @@ + + +use string_cache::DefaultAtom; + +fn main() { + let mut interned_stuff = Vec::new(); + let text = "here is a sentence of text that will be tokenised and interned and some repeated \ + tokens is of text and"; + for word in text.split_whitespace() { + let seen_before = interned_stuff + .iter() + // We can use 
impl PartialEq<T> where T is anything string-like to compare + // interned strings to either other interned strings or actual strings. Comparing two + // interned strings is very fast (normally a single cpu operation). + .filter(|interned_word| interned_word == &word) + .count(); + if seen_before > 0 { + println!(r#"Seen the word "{}" {} times"#, word, seen_before); + } else { + println!(r#"Not seen the word "{}" before"#, word); + } + // We use the `From` impls for Atom (from Cow<'a, str>, &'a str, or String) to intern a + // new string + interned_stuff.push(DefaultAtom::from(word)); + } +} diff --git a/examples/summarize-events/Cargo.toml b/examples/summarize-events/Cargo.toml deleted file mode 100644 index 7d2e7ba..0000000 --- a/examples/summarize-events/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] - -name = "string-cache-summarize-events" -version = "0.0.0" -authors = [ "The Servo Project Developers" ] - -[dependencies] -csv = "0" -rustc-serialize = "0" -phf_shared = "0.7.4" - -[dependencies.string_cache] -path = "../.." diff --git a/examples/summarize-events/src/main.rs b/examples/summarize-events/src/main.rs deleted file mode 100644 index 1b1aa64..0000000 --- a/examples/summarize-events/src/main.rs +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -extern crate csv; -extern crate string_cache; -extern crate rustc_serialize; -extern crate phf_shared; - -#[path = "../../../src/shared.rs"] -#[allow(dead_code)] -mod shared; - -use string_cache::Atom; - -use std::{env, cmp}; -use std::collections::hash_map::{HashMap, Entry}; -use std::path::Path; - -#[derive(RustcDecodable, Debug)] -struct Event { - event: String, - id: u64, - string: Option, -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] -enum Kind { - Dynamic, - Inline, - Static, -} - -impl Kind { - fn from_tag(tag: u8) -> Kind { - match tag { - shared::DYNAMIC_TAG => Kind::Dynamic, - shared::INLINE_TAG => Kind::Inline, - shared::STATIC_TAG => Kind::Static, - _ => panic!() - } - } - - fn to_tag(self) -> u8 { - match self { - Kind::Dynamic => shared::DYNAMIC_TAG, - Kind::Inline => shared::INLINE_TAG, - Kind::Static => shared::STATIC_TAG, - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] -struct Summary { - kind: Kind, - times: usize, -} - -fn main() { - let filename = env::args().skip(1).next() - .expect("Usage: string-cache-summarize-events foo.csv"); - let path = &Path::new(&filename); - let mut file = csv::Reader::from_file(path).unwrap(); - - // Over the lifetime of a program, one dynamic atom might get interned at - // several addresses, and one address may be used to intern several - // different strings. For this reason we must separately track the - // currently-allocated atoms and the summary of all atoms ever created. - let mut dynamic: HashMap = HashMap::new(); - let mut peak_dynamic = 0; - let mut summary: HashMap = HashMap::new(); - let mut inserts = 0; - - for record in file.decode() { - let ev: Event = record.unwrap(); - match &ev.event[..] { - "intern" => { - let tag = (ev.id & 0xf) as u8; - assert!(tag <= shared::STATIC_TAG); - - let string = match tag { - shared::DYNAMIC_TAG => dynamic[&ev.id].clone(), - - // FIXME: We really shouldn't be allowed to do this. 
It's a memory-safety - // hazard; the field is only public for the atom!() macro. - _ => Atom { data: ev.id }.to_string(), - }; - - match summary.entry(string) { - Entry::Occupied(entry) => entry.into_mut().times += 1, - Entry::Vacant(entry) => { - entry.insert(Summary { - kind: Kind::from_tag(tag), - times: 1, - }); - } - } - }, - - "insert" => { - assert!(!dynamic.contains_key(&ev.id)); - dynamic.insert(ev.id, ev.string.expect("no string to insert")); - peak_dynamic = cmp::max(peak_dynamic, dynamic.len()); - inserts += 1; - } - - "remove" => { - assert!(dynamic.contains_key(&ev.id)); - dynamic.remove(&ev.id); - } - - e => panic!("unknown event {}", e), - } - } - - // Get all records, in a stable order. - let mut summary: Vec<_> = summary.into_iter().collect(); - summary.sort_by(|&(ref a, _), &(ref b, _)| a.cmp(b)); - - // Sort by number of occurrences, descending. - summary.sort_by(|&(_, a), &(_, b)| b.times.cmp(&a.times)); - let longest_atom = summary.iter().map(|&(ref k, _)| k.len()) - .max().unwrap_or(0); - - let pad = |c, n| { - for _ in n..longest_atom { - print!("{}", c); - } - }; - - let mut total = 0; - let mut by_kind = [0, 0, 0]; - for &(_, Summary { kind, times }) in &summary { - total += times; - by_kind[kind.to_tag() as usize] += times; - } - - println!("\n"); - println!("kind times pct"); - println!("------- ------- ----"); - for (k, &n) in by_kind.iter().enumerate() { - let k: Kind = Kind::from_tag(k as u8); - print!("{:7?} {:7} {:4.1}", - k, n, 100.0 * (n as f64) / (total as f64)); - - match k { - Kind::Dynamic => println!(" {} inserts, peak size {}, miss rate {:4.1}%", - inserts, peak_dynamic, 100.0 * (inserts as f64) / (n as f64)), - _ => println!(""), - } - } - println!(""); - println!("total {:7}", total); - println!("\n"); - - pad(' ', 4); - println!("atom times kind"); - pad('-', 4); - println!("---- ------ -------"); - for (string, Summary { kind, times }) in summary { - pad(' ', string.chars().count()); - println!("{} {:6} {:?}", string, 
times, kind); - } -} diff --git a/integration-tests/Cargo.toml b/integration-tests/Cargo.toml new file mode 100644 index 0000000..4562747 --- /dev/null +++ b/integration-tests/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "integration_tests" +version = "0.0.1" +authors = [ "The Servo Project Developers" ] +build = "build.rs" +publish = false +edition = "2018" + +[lib] +doctest = false +test = true + +[features] + +# Use unstable features to optimize space and time (memory and CPU usage). +unstable = [] + +[dependencies] +string_cache = { version = "0.9", path = ".." } + +[dev-dependencies] +rand = { version = "0.8", features = ["small_rng"] } +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } + +[build-dependencies] +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } diff --git a/integration-tests/build.rs b/integration-tests/build.rs new file mode 100644 index 0000000..6293e4c --- /dev/null +++ b/integration-tests/build.rs @@ -0,0 +1,26 @@ +use string_cache_codegen; + +use std::env; +use std::path::Path; + +fn main() { + string_cache_codegen::AtomType::new("TestAtom", "test_atom!") + .atoms(&[ + "a", + "b", + "address", + "defaults", + "area", + "body", + "font-weight", + "br", + "html", + "head", + "id", + "❤", + "❤💯", + "❤💯❤💯", + ]) + .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("test_atom.rs")) + .unwrap() +} diff --git a/src/atom/bench.rs b/integration-tests/src/bench.rs similarity index 84% rename from src/atom/bench.rs rename to integration-tests/src/bench.rs index 96b0790..45e7199 100644 --- a/src/atom/bench.rs +++ b/integration-tests/src/bench.rs @@ -26,23 +26,20 @@ Furthermore, a large part of the point of interning is to make strings small and cheap to move around, which isn't reflected in these tests. 
*/ +use crate::TestAtom; -use atom::Atom; -use test::{Bencher, black_box}; +use test::{black_box, Bencher}; // Just shorthand -fn mk(x: &str) -> Atom { - Atom::from(x) +fn mk(x: &str) -> TestAtom { + TestAtom::from(x) } -macro_rules! check_type (($name:ident, $x:expr, $p:pat) => ( +macro_rules! check_type (($name:ident, $x:expr) => ( // NB: "cargo bench" does not run these! #[test] fn $name() { - match unsafe { $x.unpack() } { - $p => (), - _ => panic!("atom has wrong type"), - } + assert!($x, "atom has wrong type"); } )); @@ -62,12 +59,12 @@ macro_rules! bench_tiny_op (($name:ident, $op:ident, $ctor_x:expr, $ctor_y:expr) )); macro_rules! bench_one ( - (x_static $x:expr, $y:expr) => (check_type!(check_type_x, $x, Static(..));); - (x_inline $x:expr, $y:expr) => (check_type!(check_type_x, $x, Inline(..));); - (x_dynamic $x:expr, $y:expr) => (check_type!(check_type_x, $x, Dynamic(..));); - (y_static $x:expr, $y:expr) => (check_type!(check_type_y, $y, Static(..));); - (y_inline $x:expr, $y:expr) => (check_type!(check_type_y, $y, Inline(..));); - (y_dynamic $x:expr, $y:expr) => (check_type!(check_type_y, $y, Dynamic(..));); + (x_static $x:expr, $y:expr) => (check_type!(check_type_x, $x.is_static());); + (x_inline $x:expr, $y:expr) => (check_type!(check_type_x, $x.is_inline());); + (x_dynamic $x:expr, $y:expr) => (check_type!(check_type_x, $x.is_dynamic());); + (y_static $x:expr, $y:expr) => (check_type!(check_type_y, $y.is_static());); + (y_inline $x:expr, $y:expr) => (check_type!(check_type_y, $y.is_inline());); + (y_dynamic $x:expr, $y:expr) => (check_type!(check_type_y, $y.is_dynamic());); (is_static $x:expr, $y:expr) => (bench_one!(x_static $x, $y); bench_one!(y_static $x, $y);); (is_inline $x:expr, $y:expr) => (bench_one!(x_inline $x, $y); bench_one!(y_inline $x, $y);); (is_dynamic $x:expr, $y:expr) => (bench_one!(x_dynamic $x, $y); bench_one!(y_dynamic $x, $y);); @@ -81,7 +78,7 @@ macro_rules! 
bench_one ( fn intern(b: &mut Bencher) { let x = $x.to_string(); b.iter(|| { - black_box(Atom::from(&*x)); + black_box(TestAtom::from(&*x)); }); } ); @@ -134,8 +131,7 @@ macro_rules! bench_all ( use std::string::ToString; use std::iter::repeat; - use atom::Atom; - use atom::UnpackedAtom::{Static, Inline, Dynamic}; + use crate::TestAtom; use super::mk; @@ -146,10 +142,10 @@ macro_rules! bench_all ( ); ); -pub const longer_dynamic_a: &'static str - = "Thee Silver Mt. Zion Memorial Orchestra & Tra-La-La Band"; -pub const longer_dynamic_b: &'static str - = "Thee Silver Mt. Zion Memorial Orchestra & Tra-La-La Ban!"; +pub const longer_dynamic_a: &'static str = + "Thee Silver Mt. Zion Memorial Orchestra & Tra-La-La Band"; +pub const longer_dynamic_b: &'static str = + "Thee Silver Mt. Zion Memorial Orchestra & Tra-La-La Ban!"; bench_all!([eq ne lt clone_string] for short_string = "e", "f"); bench_all!([eq ne lt clone_string] for medium_string = "xyzzy01", "xyzzy02"); @@ -157,7 +153,7 @@ bench_all!([eq ne lt clone_string] for longer_string = super::longer_dynamic_a, super::longer_dynamic_b); bench_all!([eq ne intern as_ref clone is_static lt] - for static_atom = atom!("a"), atom!("b")); + for static_atom = test_atom!("defaults"), test_atom!("font-weight")); bench_all!([intern as_ref clone is_inline] for short_inline_atom = mk("e"), mk("f")); @@ -172,13 +168,13 @@ bench_all!([eq ne intern as_ref clone is_dynamic lt] for longer_dynamic_atom = mk(super::longer_dynamic_a), mk(super::longer_dynamic_b)); bench_all!([intern as_ref clone is_static] - for static_at_runtime = mk("a"), mk("b")); + for static_at_runtime = mk("defaults"), mk("font-weight")); bench_all!([ne lt x_static y_inline] - for static_vs_inline = atom!("a"), mk("f")); + for static_vs_inline = test_atom!("defaults"), mk("f")); bench_all!([ne lt x_static y_dynamic] - for static_vs_dynamic = atom!("a"), mk(super::longer_dynamic_b)); + for static_vs_dynamic = test_atom!("defaults"), mk(super::longer_dynamic_b)); 
bench_all!([ne lt x_inline y_dynamic] for inline_vs_dynamic = mk("e"), mk(super::longer_dynamic_b)); @@ -188,9 +184,9 @@ macro_rules! bench_rand ( ($name:ident, $len:expr) => ( fn $name(b: &mut Bencher) { use std::str; use rand; - use rand::Rng; + use rand::{RngCore, SeedableRng}; - let mut gen = rand::weak_rng(); + let mut gen = rand::rngs::SmallRng::from_entropy(); b.iter(|| { // We have to generate new atoms on every iter, because // the dynamic atom table isn't reset. @@ -205,12 +201,12 @@ macro_rules! bench_rand ( ($name:ident, $len:expr) => ( *n = (*n % 0x40) + 0x20; } let s = str::from_utf8(&buf[..]).unwrap(); - black_box(Atom::from(s)); + black_box(TestAtom::from(s)); }); } )); -bench_rand!(intern_rand_008, 8); -bench_rand!(intern_rand_032, 32); +bench_rand!(intern_rand_008, 8); +bench_rand!(intern_rand_032, 32); bench_rand!(intern_rand_128, 128); bench_rand!(intern_rand_512, 512); diff --git a/integration-tests/src/common-usage.rs b/integration-tests/src/common-usage.rs new file mode 100644 index 0000000..7b7380a --- /dev/null +++ b/integration-tests/src/common-usage.rs @@ -0,0 +1,19 @@ +/// Test common usage by popular dependents (html5ever, lalrpop, browserlists-rs), to ensure no API-surface breaking changes +/// Created after https://github.com/servo/string-cache/issues/271 +use std::collections::HashMap; + +use crate::Atom; +use crate::TestAtom; + +#[test] +fn usage_with_hashmap() { + let mut map: HashMap = HashMap::new(); + + map.insert(test_atom!("area"), 1); + map.insert("str_into".into(), 2); + map.insert("atom_from".into(), 3); + + assert_eq!(map.get(&"area".into()).unwrap(), &1); + assert_eq!(map.get(&"str_into".into()).unwrap(), &2); + assert_eq!(map.get(&Atom::from("atom_from")).unwrap(), &3); +} diff --git a/integration-tests/src/lib.rs b/integration-tests/src/lib.rs new file mode 100644 index 0000000..a788d93 --- /dev/null +++ b/integration-tests/src/lib.rs @@ -0,0 +1,316 @@ +// Copyright 2014 The Servo Project Developers. 
See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg(test)] +#![deny(warnings)] +#![allow(non_upper_case_globals)] +#![cfg_attr(feature = "unstable", feature(test))] + +#[cfg(feature = "unstable")] +extern crate test; + +use std::thread; +use string_cache::StaticAtomSet; + +include!(concat!(env!("OUT_DIR"), "/test_atom.rs")); +pub type Atom = TestAtom; + +#[test] +fn test_as_slice() { + let s0 = Atom::from(""); + assert!(s0.as_ref() == ""); + + let s1 = Atom::from("class"); + assert!(s1.as_ref() == "class"); + + let i0 = Atom::from("blah"); + assert!(i0.as_ref() == "blah"); + + let s0 = Atom::from("BLAH"); + assert!(s0.as_ref() == "BLAH"); + + let d0 = Atom::from("zzzzzzzzzz"); + assert!(d0.as_ref() == "zzzzzzzzzz"); + + let d1 = Atom::from("ZZZZZZZZZZ"); + assert!(d1.as_ref() == "ZZZZZZZZZZ"); +} + +#[test] +fn test_types() { + assert!(Atom::from("").is_static()); + assert!(Atom::from("defaults").is_static()); + assert!(Atom::from("font-weight").is_static()); + assert!(Atom::from("id").is_inline()); + assert!(Atom::from("body").is_inline()); + assert!(Atom::from("a").is_inline()); + assert!(Atom::from("address").is_inline()); + assert!(Atom::from("c").is_inline()); + assert!(Atom::from("zz").is_inline()); + assert!(Atom::from("zzz").is_inline()); + assert!(Atom::from("zzzz").is_inline()); + assert!(Atom::from("zzzzz").is_inline()); + assert!(Atom::from("zzzzzz").is_inline()); + assert!(Atom::from("zzzzzzz").is_inline()); + assert!(Atom::from("zzzzzzzz").is_dynamic()); + assert!(Atom::from("zzzzzzzzzzzzz").is_dynamic()); +} + +#[test] +fn test_equality() { + let s0 = Atom::from("fn"); + let s1 = Atom::from("fn"); + let s2 = Atom::from("loop"); + + let i0 = Atom::from("blah"); + let i1 = Atom::from("blah"); + let i2 = Atom::from("blah2"); + + let 
d0 = Atom::from("zzzzzzzz"); + let d1 = Atom::from("zzzzzzzz"); + let d2 = Atom::from("zzzzzzzzz"); + + assert!(s0 == s1); + assert!(s0 != s2); + + assert!(i0 == i1); + assert!(i0 != i2); + + assert!(d0 == d1); + assert!(d0 != d2); + + assert!(s0 != i0); + assert!(s0 != d0); + assert!(i0 != d0); +} + +#[test] +fn default() { + assert_eq!(TestAtom::default(), test_atom!("")); + assert_eq!(&*TestAtom::default(), ""); +} + +#[test] +fn ord() { + fn check(x: &str, y: &str) { + assert_eq!(x < y, Atom::from(x) < Atom::from(y)); + assert_eq!(x.cmp(y), Atom::from(x).cmp(&Atom::from(y))); + assert_eq!(x.partial_cmp(y), Atom::from(x).partial_cmp(&Atom::from(y))); + } + + check("a", "body"); + check("asdf", "body"); + check("zasdf", "body"); + check("z", "body"); + + check("a", "bbbbb"); + check("asdf", "bbbbb"); + check("zasdf", "bbbbb"); + check("z", "bbbbb"); +} + +#[test] +fn clone() { + let s0 = Atom::from("fn"); + let s1 = s0.clone(); + let s2 = Atom::from("loop"); + + let i0 = Atom::from("blah"); + let i1 = i0.clone(); + let i2 = Atom::from("blah2"); + + let d0 = Atom::from("zzzzzzzz"); + let d1 = d0.clone(); + let d2 = Atom::from("zzzzzzzzz"); + + assert!(s0 == s1); + assert!(s0 != s2); + + assert!(i0 == i1); + assert!(i0 != i2); + + assert!(d0 == d1); + assert!(d0 != d2); + + assert!(s0 != i0); + assert!(s0 != d0); + assert!(i0 != d0); +} + +macro_rules! assert_eq_fmt (($fmt:expr, $x:expr, $y:expr) => ({ + let x = $x; + let y = $y; + if x != y { + panic!("assertion failed: {} != {}", + format_args!($fmt, x), + format_args!($fmt, y)); + } +})); + +#[test] +fn repr() { + fn check(s: &str, data: u64) { + assert_eq_fmt!("0x{:016X}", Atom::from(s).unsafe_data(), data); + } + + fn check_static(s: &str, x: Atom) { + assert_eq_fmt!("0x{:016X}", x.unsafe_data(), Atom::from(s).unsafe_data()); + assert_eq!(0x2, x.unsafe_data() & 0xFFFF_FFFF); + // The index is unspecified by phf. 
+ assert!((x.unsafe_data() >> 32) <= TestAtomStaticSet::get().atoms.len() as u64); + } + + // This test is here to make sure we don't change atom representation + // by accident. It may need adjusting if there are changes to the + // static atom table, the tag values, etc. + + // Static atoms + check_static("defaults", test_atom!("defaults")); + check_static("font-weight", test_atom!("font-weight")); + + // Inline atoms + check("a", 0x0000_0000_0000_6111); + check("address", 0x7373_6572_6464_6171); + check("area", 0x0000_0061_6572_6141); + check("e", 0x0000_0000_0000_6511); + check("xyzzy", 0x0000_797A_7A79_7851); + check("xyzzy01", 0x3130_797A_7A79_7871); + + // Dynamic atoms. This is a pointer so we can't verify every bit. + assert_eq!(0x00, Atom::from("a dynamic string").unsafe_data() & 0xf); +} + +#[test] +fn test_threads() { + for _ in 0_u32..100 { + thread::spawn(move || { + let _ = Atom::from("a dynamic string"); + let _ = Atom::from("another string"); + }); + } +} + +#[test] +fn atom_macro() { + assert_eq!(test_atom!("a"), Atom::from("a")); + assert_eq!(test_atom!("body"), Atom::from("body")); + assert_eq!(test_atom!("address"), Atom::from("address")); + assert_eq!(test_atom!("❤"), Atom::from("❤")); + assert_eq!(test_atom!("❤💯"), Atom::from("❤💯")); + assert_eq!(test_atom!("font-weight"), Atom::from("font-weight")); + assert_eq!(test_atom!("❤💯❤💯"), Atom::from("❤💯❤💯")); +} + +#[test] +fn match_atom() { + assert_eq!( + 2, + match Atom::from("head") { + test_atom!("br") => 1, + test_atom!("html") | test_atom!("head") => 2, + _ => 3, + } + ); + + assert_eq!( + 3, + match Atom::from("body") { + test_atom!("br") => 1, + test_atom!("html") | test_atom!("head") => 2, + _ => 3, + } + ); + + assert_eq!( + 3, + match Atom::from("zzzzzz") { + test_atom!("br") => 1, + test_atom!("html") | test_atom!("head") => 2, + _ => 3, + } + ); +} + +#[test] +fn ensure_deref() { + // Ensure we can Deref to a &str + let atom = Atom::from("foobar"); + let _: &str = &atom; +} + +#[test] 
+fn ensure_as_ref() { + // Ensure we can as_ref to a &str + let atom = Atom::from("foobar"); + let _: &str = atom.as_ref(); +} + +#[test] +fn test_ascii_lowercase() { + assert_eq!(Atom::from("").to_ascii_lowercase(), Atom::from("")); + assert_eq!(Atom::from("aZ9").to_ascii_lowercase(), Atom::from("az9")); + assert_eq!( + Atom::from("The Quick Brown Fox!").to_ascii_lowercase(), + Atom::from("the quick brown fox!") + ); + assert_eq!( + Atom::from("JE VAIS À PARIS").to_ascii_lowercase(), + Atom::from("je vais À paris") + ); +} + +#[test] +fn test_ascii_uppercase() { + assert_eq!(Atom::from("").to_ascii_uppercase(), Atom::from("")); + assert_eq!(Atom::from("aZ9").to_ascii_uppercase(), Atom::from("AZ9")); + assert_eq!( + Atom::from("The Quick Brown Fox!").to_ascii_uppercase(), + Atom::from("THE QUICK BROWN FOX!") + ); + assert_eq!( + Atom::from("Je vais à Paris").to_ascii_uppercase(), + Atom::from("JE VAIS à PARIS") + ); +} + +#[test] +fn test_eq_ignore_ascii_case() { + assert!(Atom::from("").eq_ignore_ascii_case(&Atom::from(""))); + assert!(Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("aZ9"))); + assert!(Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("Az9"))); + assert!(Atom::from("The Quick Brown Fox!") + .eq_ignore_ascii_case(&Atom::from("THE quick BROWN fox!"))); + assert!(Atom::from("Je vais à Paris").eq_ignore_ascii_case(&Atom::from("je VAIS à PARIS"))); + assert!(!Atom::from("").eq_ignore_ascii_case(&Atom::from("az9"))); + assert!(!Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from(""))); + assert!(!Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("9Za"))); + assert!(!Atom::from("The Quick Brown Fox!") + .eq_ignore_ascii_case(&Atom::from("THE quick BROWN fox!!"))); + assert!(!Atom::from("Je vais à Paris").eq_ignore_ascii_case(&Atom::from("JE vais À paris"))); +} + +#[test] +fn test_from_string() { + assert!(Atom::from("camembert".to_owned()) == Atom::from("camembert")); +} + +#[test] +fn test_try_static() { + 
assert!(Atom::try_static("defaults").is_some()); + assert!(Atom::try_static("head").is_none()); + assert!(Atom::try_static("not in the static table").is_none()); +} + +#[cfg(test)] +#[path = "common-usage.rs"] +mod common_usage; + +#[cfg(all(test, feature = "unstable"))] +#[path = "bench.rs"] +mod bench; diff --git a/src/atom.rs b/src/atom.rs new file mode 100644 index 0000000..5a8aa7f --- /dev/null +++ b/src/atom.rs @@ -0,0 +1,415 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use crate::dynamic_set::{dynamic_set, Entry}; +use crate::static_sets::StaticAtomSet; +use debug_unreachable::debug_unreachable; + +use std::borrow::Cow; +use std::cmp::Ordering::{self, Equal}; +use std::fmt; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; +use std::mem; +use std::num::NonZeroU64; +use std::ops; +use std::slice; +use std::str; +use std::sync::atomic::Ordering::SeqCst; + +const DYNAMIC_TAG: u8 = 0b_00; +const INLINE_TAG: u8 = 0b_01; // len in upper nybble +const STATIC_TAG: u8 = 0b_10; +const TAG_MASK: u64 = 0b_11; +const LEN_OFFSET: u64 = 4; +const LEN_MASK: u64 = 0xF0; + +const MAX_INLINE_LEN: usize = 7; +const STATIC_SHIFT_BITS: usize = 32; + +/// Represents a string that has been interned. +/// +/// While the type definition for `Atom` indicates that it is generic on a particular +/// implementation of an atom set, you don't need to worry about this. Atoms can be static +/// and come from a `StaticAtomSet` generated by the `string_cache_codegen` crate, or they +/// can be dynamic and created by you on an `EmptyStaticAtomSet`. 
+/// +/// `Atom` implements `Clone` but not `Copy`, since internally atoms are reference-counted; +/// this means that you may need to `.clone()` an atom to keep copies to it in different +/// places, or when passing it to a function that takes an `Atom` rather than an `&Atom`. +/// +/// ## Creating an atom at runtime +/// +/// If you use `string_cache_codegen` to generate a precomputed list of atoms, your code +/// may then do something like read data from somewhere and extract tokens that need to be +/// compared to the atoms. In this case, you can use `Atom::from(&str)` or +/// `Atom::from(String)`. These create a reference-counted atom which will be +/// automatically freed when all references to it are dropped. +/// +/// This means that your application can safely have a loop which tokenizes data, creates +/// atoms from the tokens, and compares the atoms to a predefined set of keywords, without +/// running the risk of arbitrary memory consumption from creating large numbers of atoms — +/// as long as your application does not store clones of the atoms it creates along the +/// way. +/// +/// For example, the following is safe and will not consume arbitrary amounts of memory: +/// +/// ```ignore +/// let untrusted_data = "large amounts of text ..."; +/// +/// for token in untrusted_data.split_whitespace() { +/// let atom = Atom::from(token); // interns the string +/// +/// if atom == Atom::from("keyword") { +/// // handle that keyword +/// } else if atom == Atom::from("another_keyword") { +/// // handle that keyword +/// } else { +/// println!("unknown keyword"); +/// } +/// } // atom is dropped here, so it is not kept around in memory +/// ``` +#[derive(PartialEq, Eq)] +// NOTE: Deriving PartialEq requires that a given string must always be interned the same way. +pub struct Atom { + unsafe_data: NonZeroU64, + phantom: PhantomData, +} + +// This isn't really correct as the Atoms can technically take up space. 
But I guess it's ok +// as it is possible to measure the size of the atom set separately. +#[cfg(feature = "malloc_size_of")] +impl malloc_size_of::MallocSizeOf for Atom { + fn size_of(&self, _ops: &mut malloc_size_of::MallocSizeOfOps) -> usize { + 0 + } +} + +// FIXME: bound removed from the struct definition because of this error for pack_static: +// "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable" +// https://github.com/rust-lang/rust/issues/57563 +impl Atom { + /// For the atom!() macros + #[inline(always)] + #[doc(hidden)] + pub const fn pack_static(n: u32) -> Self { + Self { + unsafe_data: unsafe { + // STATIC_TAG ensures this is non-zero + NonZeroU64::new_unchecked((STATIC_TAG as u64) | ((n as u64) << STATIC_SHIFT_BITS)) + }, + phantom: PhantomData, + } + } + + /// For the atom!() macros + #[inline(always)] + #[doc(hidden)] + pub const fn pack_inline(mut n: u64, len: u8) -> Self { + if cfg!(target_endian = "big") { + // Reverse order of top 7 bytes. + // Bottom 8 bits of `n` are zero, and we need that to remain so. + // String data is stored in top 7 bytes, tag and length in bottom byte. + n = n.to_le() << 8; + } + + let data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET) | n; + Self { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } + + fn tag(&self) -> u8 { + (self.unsafe_data.get() & TAG_MASK) as u8 + } +} + +impl Atom { + /// Return the internal representation. For testing. + #[doc(hidden)] + pub fn unsafe_data(&self) -> u64 { + self.unsafe_data.get() + } + + /// Return true if this is a static Atom. For testing. + #[doc(hidden)] + pub fn is_static(&self) -> bool { + self.tag() == STATIC_TAG + } + + /// Return true if this is a dynamic Atom. For testing. + #[doc(hidden)] + pub fn is_dynamic(&self) -> bool { + self.tag() == DYNAMIC_TAG + } + + /// Return true if this is an inline Atom. For testing. 
+ #[doc(hidden)] + pub fn is_inline(&self) -> bool { + self.tag() == INLINE_TAG + } + + fn static_index(&self) -> u64 { + self.unsafe_data.get() >> STATIC_SHIFT_BITS + } + + /// Get the hash of the string as it is stored in the set. + pub fn get_hash(&self) -> u32 { + match self.tag() { + DYNAMIC_TAG => { + let entry = self.unsafe_data.get() as *const Entry; + unsafe { (*entry).hash } + } + STATIC_TAG => Static::get().hashes[self.static_index() as usize], + INLINE_TAG => { + let data = self.unsafe_data.get(); + // This may or may not be great... + ((data >> 32) ^ data) as u32 + } + _ => unsafe { debug_unreachable!() }, + } + } + + pub fn try_static(string_to_add: &str) -> Option { + Self::try_static_internal(string_to_add).ok() + } + + fn try_static_internal(string_to_add: &str) -> Result { + let static_set = Static::get(); + let hash = phf_shared::hash(&*string_to_add, &static_set.key); + let index = phf_shared::get_index(&hash, static_set.disps, static_set.atoms.len()); + + if static_set.atoms[index as usize] == string_to_add { + Ok(Self::pack_static(index)) + } else { + Err(hash) + } + } +} + +impl Default for Atom { + #[inline] + fn default() -> Self { + Atom::pack_static(Static::empty_string_index()) + } +} + +impl Hash for Atom { + #[inline] + fn hash(&self, state: &mut H) + where + H: Hasher, + { + state.write_u32(self.get_hash()) + } +} + +impl<'a, Static: StaticAtomSet> From> for Atom { + fn from(string_to_add: Cow<'a, str>) -> Self { + let len = string_to_add.len(); + if len == 0 { + Self::pack_static(Static::empty_string_index()) + } else if len <= MAX_INLINE_LEN { + let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); + { + let dest = inline_atom_slice_mut(&mut data); + dest[..len].copy_from_slice(string_to_add.as_bytes()); + } + Atom { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } else { + 
Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { + let ptr: std::ptr::NonNull = dynamic_set().insert(string_to_add, hash.g); + let data = ptr.as_ptr() as u64; + debug_assert!(0 == data & TAG_MASK); + Atom { + // The address of a ptr::NonNull is non-zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + }) + } + } +} + +impl Clone for Atom { + #[inline(always)] + fn clone(&self) -> Self { + if self.tag() == DYNAMIC_TAG { + let entry = self.unsafe_data.get() as *const Entry; + unsafe { &*entry }.ref_count.fetch_add(1, SeqCst); + } + Atom { ..*self } + } +} + +impl Drop for Atom { + #[inline] + fn drop(&mut self) { + if self.tag() == DYNAMIC_TAG { + let entry = self.unsafe_data.get() as *const Entry; + if unsafe { &*entry }.ref_count.fetch_sub(1, SeqCst) == 1 { + drop_slow(self) + } + } + + // Out of line to guide inlining. + fn drop_slow(this: &mut Atom) { + dynamic_set().remove(this.unsafe_data.get() as *mut Entry); + } + } +} + +impl ops::Deref for Atom { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + unsafe { + match self.tag() { + DYNAMIC_TAG => { + let entry = self.unsafe_data.get() as *const Entry; + &(*entry).string + } + INLINE_TAG => { + let len = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET; + debug_assert!(len as usize <= MAX_INLINE_LEN); + let src = inline_atom_slice(&self.unsafe_data); + str::from_utf8_unchecked(src.get_unchecked(..(len as usize))) + } + STATIC_TAG => Static::get().atoms[self.static_index() as usize], + _ => debug_unreachable!(), + } + } + } +} + +impl fmt::Debug for Atom { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let ty_str = unsafe { + match self.tag() { + DYNAMIC_TAG => "dynamic", + INLINE_TAG => "inline", + STATIC_TAG => "static", + _ => debug_unreachable!(), + } + }; + + write!(f, "Atom('{}' type={})", &*self, ty_str) + } +} + +impl PartialOrd for Atom { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + if 
self.unsafe_data == other.unsafe_data { + return Some(Equal); + } + self.as_ref().partial_cmp(other.as_ref()) + } +} + +impl Ord for Atom { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + if self.unsafe_data == other.unsafe_data { + return Equal; + } + self.as_ref().cmp(other.as_ref()) + } +} + +// AsciiExt requires mutating methods, so we just implement the non-mutating ones. +// We don't need to implement is_ascii because there's no performance improvement +// over the one from &str. +impl Atom { + fn from_mutated_str(s: &str, f: F) -> Self { + let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit(); + let buffer = unsafe { &mut *buffer.as_mut_ptr() }; + + if let Some(buffer_prefix) = buffer.get_mut(..s.len()) { + buffer_prefix.copy_from_slice(s.as_bytes()); + let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) }; + f(as_str); + Atom::from(&*as_str) + } else { + let mut string = s.to_owned(); + f(&mut string); + Atom::from(string) + } + } + + /// Like [`to_ascii_uppercase`]. + /// + /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase + pub fn to_ascii_uppercase(&self) -> Self { + for (i, b) in self.bytes().enumerate() { + if let b'a'..=b'z' = b { + return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase()); + } + } + self.clone() + } + + /// Like [`to_ascii_lowercase`]. + /// + /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase + pub fn to_ascii_lowercase(&self) -> Self { + for (i, b) in self.bytes().enumerate() { + if let b'A'..=b'Z' = b { + return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase()); + } + } + self.clone() + } + + /// Like [`eq_ignore_ascii_case`]. 
+ /// + /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case + pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { + (self == other) || self.eq_str_ignore_ascii_case(&**other) + } + + /// Like [`eq_ignore_ascii_case`], but takes an unhashed string as `other`. + /// + /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case + pub fn eq_str_ignore_ascii_case(&self, other: &str) -> bool { + (&**self).eq_ignore_ascii_case(other) + } +} + +#[inline(always)] +fn inline_atom_slice(x: &NonZeroU64) -> &[u8] { + let x: *const NonZeroU64 = x; + let mut data = x as *const u8; + // All except the lowest byte, which is first in little-endian, last in big-endian. + if cfg!(target_endian = "little") { + data = unsafe { data.offset(1) }; + } + let len = 7; + unsafe { slice::from_raw_parts(data, len) } +} + +#[inline(always)] +fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] { + let x: *mut u64 = x; + let mut data = x as *mut u8; + // All except the lowest byte, which is first in little-endian, last in big-endian. + if cfg!(target_endian = "little") { + data = unsafe { data.offset(1) }; + } + let len = 7; + unsafe { slice::from_raw_parts_mut(data, len) } +} diff --git a/src/atom/mod.rs b/src/atom/mod.rs deleted file mode 100644 index 2136a8b..0000000 --- a/src/atom/mod.rs +++ /dev/null @@ -1,779 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -#![allow(non_upper_case_globals)] - -#[cfg(feature = "heap_size")] -use heapsize::HeapSizeOf; - -use serde::{Deserialize, Deserializer, Serialize, Serializer}; - -use std::ascii::AsciiExt; -use std::borrow::Cow; -use std::cmp::Ordering::{self, Equal}; -use std::fmt; -use std::mem; -use std::ops; -use std::ptr; -use std::slice; -use std::str; -use std::sync::Mutex; -use std::sync::atomic::AtomicIsize; -use std::sync::atomic::Ordering::SeqCst; - -use shared::{STATIC_TAG, INLINE_TAG, DYNAMIC_TAG, TAG_MASK, MAX_INLINE_LEN, STATIC_SHIFT_BITS, - ENTRY_ALIGNMENT, pack_static, StaticAtomSet}; -use self::UnpackedAtom::{Dynamic, Inline, Static}; - -#[cfg(feature = "log-events")] -use event::Event; - -include!(concat!(env!("OUT_DIR"), "/static_atom_set.rs")); - -#[cfg(not(feature = "log-events"))] -macro_rules! log (($e:expr) => (())); - -const NB_BUCKETS: usize = 1 << 12; // 4096 -const BUCKET_MASK: u64 = (1 << 12) - 1; - -struct StringCache { - buckets: [Option>; NB_BUCKETS], -} - -#[cfg(feature = "heap_size")] -impl HeapSizeOf for StringCache { - fn heap_size_of_children(&self) -> usize { - self.buckets.iter().fold(0, |size, bucket| size + bucket.heap_size_of_children()) - } -} - -lazy_static! { - static ref STRING_CACHE: Mutex = Mutex::new(StringCache::new()); -} - -/// A token that represents the heap used by the dynamic string cache. 
-#[cfg(feature = "heap_size")] -pub struct StringCacheHeap; - -#[cfg(feature = "heap_size")] -impl HeapSizeOf for StringCacheHeap { - fn heap_size_of_children(&self) -> usize { - STRING_CACHE.lock().unwrap().heap_size_of_children() - } -} - -#[cfg_attr(feature = "heap_size", derive(HeapSizeOf))] -struct StringCacheEntry { - next_in_bucket: Option>, - hash: u64, - ref_count: AtomicIsize, - string: String, -} - -impl StringCacheEntry { - fn new(next: Option>, hash: u64, string: String) - -> StringCacheEntry { - StringCacheEntry { - next_in_bucket: next, - hash: hash, - ref_count: AtomicIsize::new(1), - string: string, - } - } -} - -impl StringCache { - fn new() -> StringCache { - StringCache { - buckets: unsafe { mem::zeroed() }, - } - } - - fn add(&mut self, string: Cow, hash: u64) -> *mut StringCacheEntry { - let bucket_index = (hash & BUCKET_MASK) as usize; - { - let mut ptr: Option<&mut Box> = - self.buckets[bucket_index].as_mut(); - - while let Some(entry) = ptr.take() { - if entry.hash == hash && entry.string == &*string { - if entry.ref_count.fetch_add(1, SeqCst) > 0 { - return &mut **entry; - } - // Uh-oh. The pointer's reference count was zero, which means someone may try - // to free it. (Naive attempts to defend against this, for example having the - // destructor check to see whether the reference count is indeed zero, don't - // work due to ABA.) Thus we need to temporarily add a duplicate string to the - // list. 
- entry.ref_count.fetch_sub(1, SeqCst); - break; - } - ptr = entry.next_in_bucket.as_mut(); - } - } - debug_assert!(mem::align_of::() >= ENTRY_ALIGNMENT); - let string = string.into_owned(); - let _string_clone = if cfg!(feature = "log-events") { - string.clone() - } else { - "".to_owned() - }; - let mut entry = Box::new(StringCacheEntry::new( - self.buckets[bucket_index].take(), hash, string)); - let ptr: *mut StringCacheEntry = &mut *entry; - self.buckets[bucket_index] = Some(entry); - log!(Event::Insert(ptr as u64, _string_clone)); - - ptr - } - - fn remove(&mut self, key: u64) { - let ptr = key as *mut StringCacheEntry; - let bucket_index = { - let value: &StringCacheEntry = unsafe { &*ptr }; - debug_assert!(value.ref_count.load(SeqCst) == 0); - (value.hash & BUCKET_MASK) as usize - }; - - - let mut current: &mut Option> = &mut self.buckets[bucket_index]; - - loop { - let entry_ptr: *mut StringCacheEntry = match current.as_mut() { - Some(entry) => &mut **entry, - None => break, - }; - if entry_ptr == ptr { - mem::drop(mem::replace(current, unsafe { (*entry_ptr).next_in_bucket.take() })); - break; - } - current = unsafe { &mut (*entry_ptr).next_in_bucket }; - } - - log!(Event::Remove(key)); - } -} - -// NOTE: Deriving Eq here implies that a given string must always -// be interned the same way. -#[cfg_attr(feature = "unstable", unsafe_no_drop_flag)] // See tests::atom_drop_is_idempotent -#[cfg_attr(feature = "heap_size", derive(HeapSizeOf))] -#[derive(Eq, Hash, PartialEq)] -pub struct Atom { - /// This field is public so that the `atom!()` macro can use it. - /// You should not otherwise access this field. - pub data: u64, -} - -impl Atom { - #[inline(always)] - unsafe fn unpack(&self) -> UnpackedAtom { - UnpackedAtom::from_packed(self.data) - } - - pub fn get_hash(&self) -> u32 { - ((self.data >> 32) ^ self.data) as u32 - } -} - -impl PartialEq for Atom { - fn eq(&self, other: &str) -> bool { - &self[..] 
== other - } -} - -impl PartialEq for str { - fn eq(&self, other: &Atom) -> bool { - self == &other[..] - } -} - -impl<'a> From> for Atom { - #[inline] - fn from(string_to_add: Cow<'a, str>) -> Atom { - let unpacked = match STATIC_ATOM_SET.get_index_or_hash(&*string_to_add) { - Ok(id) => Static(id as u32), - Err(hash) => { - let len = string_to_add.len(); - if len <= MAX_INLINE_LEN { - let mut buf: [u8; 7] = [0; 7]; - copy_memory(string_to_add.as_bytes(), &mut buf); - Inline(len as u8, buf) - } else { - Dynamic(STRING_CACHE.lock().unwrap().add(string_to_add, hash) as *mut ()) - } - } - }; - - let data = unsafe { unpacked.pack() }; - log!(Event::Intern(data)); - Atom { data: data } - } -} - -impl<'a> From<&'a str> for Atom { - #[inline] - fn from(string_to_add: &str) -> Atom { - Atom::from(Cow::Borrowed(string_to_add)) - } -} - -impl From for Atom { - #[inline] - fn from(string_to_add: String) -> Atom { - Atom::from(Cow::Owned(string_to_add)) - } -} - -impl Clone for Atom { - #[inline(always)] - fn clone(&self) -> Atom { - unsafe { - match from_packed_dynamic(self.data) { - Some(entry) => { - let entry = entry as *mut StringCacheEntry; - (*entry).ref_count.fetch_add(1, SeqCst); - }, - None => (), - } - } - Atom { - data: self.data - } - } -} - -impl Drop for Atom { - #[inline] - fn drop(&mut self) { - // Out of line to guide inlining. - fn drop_slow(this: &mut Atom) { - STRING_CACHE.lock().unwrap().remove(this.data); - } - - unsafe { - match from_packed_dynamic(self.data) { - Some(entry) => { - let entry = entry as *mut StringCacheEntry; - if (*entry).ref_count.fetch_sub(1, SeqCst) == 1 { - drop_slow(self); - } - } - _ => (), - } - } - } -} - - -impl ops::Deref for Atom { - type Target = str; - - #[inline] - fn deref(&self) -> &str { - unsafe { - match self.unpack() { - Inline(..) 
=> { - let buf = inline_orig_bytes(&self.data); - str::from_utf8(buf).unwrap() - }, - Static(idx) => STATIC_ATOM_SET.index(idx).expect("bad static atom"), - Dynamic(entry) => { - let entry = entry as *mut StringCacheEntry; - &(*entry).string - } - } - } - } -} - -impl fmt::Display for Atom { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - ::fmt(self, f) - } -} - -impl fmt::Debug for Atom { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let ty_str = unsafe { - match self.unpack() { - Dynamic(..) => "dynamic", - Inline(..) => "inline", - Static(..) => "static", - } - }; - - write!(f, "Atom('{}' type={})", &*self, ty_str) - } -} - -impl PartialOrd for Atom { - #[inline] - fn partial_cmp(&self, other: &Atom) -> Option { - if self.data == other.data { - return Some(Equal); - } - self.as_ref().partial_cmp(other.as_ref()) - } -} - -impl Ord for Atom { - #[inline] - fn cmp(&self, other: &Atom) -> Ordering { - if self.data == other.data { - return Equal; - } - self.as_ref().cmp(other.as_ref()) - } -} - -impl AsRef for Atom { - fn as_ref(&self) -> &str { - &self - } -} - -impl Serialize for Atom { - fn serialize(&self, serializer: &mut S) -> Result<(),S::Error> where S: Serializer { - let string: &str = self.as_ref(); - string.serialize(serializer) - } -} - -impl Deserialize for Atom { - fn deserialize(deserializer: &mut D) -> Result where D: Deserializer { - let string: String = try!(Deserialize::deserialize(deserializer)); - Ok(Atom::from(&*string)) - } -} - -// AsciiExt requires mutating methods, so we just implement the non-mutating ones. -// We don't need to implement is_ascii because there's no performance improvement -// over the one from &str. 
-impl Atom { - pub fn to_ascii_uppercase(&self) -> Atom { - if self.chars().all(char::is_uppercase) { - self.clone() - } else { - Atom::from(&*((&**self).to_ascii_uppercase())) - } - } - - pub fn to_ascii_lowercase(&self) -> Atom { - if self.chars().all(char::is_lowercase) { - self.clone() - } else { - Atom::from(&*((&**self).to_ascii_lowercase())) - } - } - - pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { - (self == other) || (&**self).eq_ignore_ascii_case(&**other) - } -} - -// Atoms use a compact representation which fits this enum in a single u64. -// Inlining avoids actually constructing the unpacked representation in memory. -#[allow(missing_copy_implementations)] -enum UnpackedAtom { - /// Pointer to a dynamic table entry. Must be 16-byte aligned! - Dynamic(*mut ()), - - /// Length + bytes of string. - Inline(u8, [u8; 7]), - - /// Index in static interning table. - Static(u32), -} - -struct RawSlice { - data: *const u8, - len: usize, -} - -#[cfg(target_endian = "little")] // Not implemented yet for big-endian -#[inline(always)] -unsafe fn inline_atom_slice(x: &u64) -> RawSlice { - let x: *const u64 = x; - RawSlice { - data: (x as *const u8).offset(1), - len: 7, - } -} - -impl UnpackedAtom { - #[inline(always)] - unsafe fn pack(self) -> u64 { - match self { - Static(n) => pack_static(n), - Dynamic(p) => { - let n = p as u64; - debug_assert!(0 == n & TAG_MASK); - n - } - Inline(len, buf) => { - debug_assert!((len as usize) <= MAX_INLINE_LEN); - let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << 4); - { - let raw_slice = inline_atom_slice(&mut data); - let dest: &mut [u8] = slice::from_raw_parts_mut( - raw_slice.data as *mut u8, raw_slice.len); - copy_memory(&buf[..], dest); - } - data - } - } - } - - #[inline(always)] - unsafe fn from_packed(data: u64) -> UnpackedAtom { - debug_assert!(DYNAMIC_TAG == 0); // Dynamic is untagged - - match (data & TAG_MASK) as u8 { - DYNAMIC_TAG => Dynamic(data as *mut ()), - STATIC_TAG => Static((data >> 
STATIC_SHIFT_BITS) as u32), - INLINE_TAG => { - let len = ((data & 0xf0) >> 4) as usize; - debug_assert!(len <= MAX_INLINE_LEN); - let mut buf: [u8; 7] = [0; 7]; - let raw_slice = inline_atom_slice(&data); - let src: &[u8] = slice::from_raw_parts(raw_slice.data, raw_slice.len); - copy_memory(src, &mut buf[..]); - Inline(len as u8, buf) - }, - _ => debug_unreachable!(), - } - } -} - -/// Used for a fast path in Clone and Drop. -#[inline(always)] -unsafe fn from_packed_dynamic(data: u64) -> Option<*mut ()> { - if (DYNAMIC_TAG as u64) == (data & TAG_MASK) { - Some(data as *mut ()) - } else { - None - } -} - -/// For as_slice on inline atoms, we need a pointer into the original -/// string contents. -/// -/// It's undefined behavior to call this on a non-inline atom!! -#[inline(always)] -unsafe fn inline_orig_bytes<'a>(data: &'a u64) -> &'a [u8] { - match UnpackedAtom::from_packed(*data) { - Inline(len, _) => { - let raw_slice = inline_atom_slice(&data); - let src: &[u8] = slice::from_raw_parts(raw_slice.data, raw_slice.len); - &src[..(len as usize)] - } - _ => debug_unreachable!(), - } -} - - -/// Copy of std::slice::bytes::copy_memory, which is unstable. -#[inline] -fn copy_memory(src: &[u8], dst: &mut [u8]) { - let len_src = src.len(); - assert!(dst.len() >= len_src); - // `dst` is unaliasable, so we know statically it doesn't overlap - // with `src`. 
- unsafe { - ptr::copy_nonoverlapping(src.as_ptr(), - dst.as_mut_ptr(), - len_src); - } -} - -#[cfg(all(test, feature = "unstable"))] -mod bench; - -#[cfg(test)] -mod tests { - use std::mem; - use std::thread; - use super::{Atom, StringCacheEntry, STATIC_ATOM_SET}; - use super::UnpackedAtom::{Dynamic, Inline, Static}; - use shared::ENTRY_ALIGNMENT; - - #[test] - fn test_as_slice() { - let s0 = Atom::from(""); - assert!(s0.as_ref() == ""); - - let s1 = Atom::from("class"); - assert!(s1.as_ref() == "class"); - - let i0 = Atom::from("blah"); - assert!(i0.as_ref() == "blah"); - - let s0 = Atom::from("BLAH"); - assert!(s0.as_ref() == "BLAH"); - - let d0 = Atom::from("zzzzzzzzzz"); - assert!(d0.as_ref() == "zzzzzzzzzz"); - - let d1 = Atom::from("ZZZZZZZZZZ"); - assert!(d1.as_ref() == "ZZZZZZZZZZ"); - } - - macro_rules! unpacks_to (($e:expr, $t:pat) => ( - match unsafe { Atom::from($e).unpack() } { - $t => (), - _ => panic!("atom has wrong type"), - } - )); - - #[test] - fn test_types() { - unpacks_to!("", Static(..)); - unpacks_to!("id", Static(..)); - unpacks_to!("body", Static(..)); - unpacks_to!("c", Inline(..)); // "z" is a static atom - unpacks_to!("zz", Inline(..)); - unpacks_to!("zzz", Inline(..)); - unpacks_to!("zzzz", Inline(..)); - unpacks_to!("zzzzz", Inline(..)); - unpacks_to!("zzzzzz", Inline(..)); - unpacks_to!("zzzzzzz", Inline(..)); - unpacks_to!("zzzzzzzz", Dynamic(..)); - unpacks_to!("zzzzzzzzzzzzz", Dynamic(..)); - } - - #[test] - fn test_equality() { - let s0 = Atom::from("fn"); - let s1 = Atom::from("fn"); - let s2 = Atom::from("loop"); - - let i0 = Atom::from("blah"); - let i1 = Atom::from("blah"); - let i2 = Atom::from("blah2"); - - let d0 = Atom::from("zzzzzzzz"); - let d1 = Atom::from("zzzzzzzz"); - let d2 = Atom::from("zzzzzzzzz"); - - assert!(s0 == s1); - assert!(s0 != s2); - - assert!(i0 == i1); - assert!(i0 != i2); - - assert!(d0 == d1); - assert!(d0 != d2); - - assert!(s0 != i0); - assert!(s0 != d0); - assert!(i0 != d0); - } - - #[test] - fn 
ord() { - fn check(x: &str, y: &str) { - assert_eq!(x < y, Atom::from(x) < Atom::from(y)); - assert_eq!(x.cmp(y), Atom::from(x).cmp(&Atom::from(y))); - assert_eq!(x.partial_cmp(y), Atom::from(x).partial_cmp(&Atom::from(y))); - } - - check("a", "body"); - check("asdf", "body"); - check("zasdf", "body"); - check("z", "body"); - - check("a", "bbbbb"); - check("asdf", "bbbbb"); - check("zasdf", "bbbbb"); - check("z", "bbbbb"); - } - - #[test] - fn clone() { - let s0 = Atom::from("fn"); - let s1 = s0.clone(); - let s2 = Atom::from("loop"); - - let i0 = Atom::from("blah"); - let i1 = i0.clone(); - let i2 = Atom::from("blah2"); - - let d0 = Atom::from("zzzzzzzz"); - let d1 = d0.clone(); - let d2 = Atom::from("zzzzzzzzz"); - - assert!(s0 == s1); - assert!(s0 != s2); - - assert!(i0 == i1); - assert!(i0 != i2); - - assert!(d0 == d1); - assert!(d0 != d2); - - assert!(s0 != i0); - assert!(s0 != d0); - assert!(i0 != d0); - } - - macro_rules! assert_eq_fmt (($fmt:expr, $x:expr, $y:expr) => ({ - let x = $x; - let y = $y; - if x != y { - panic!("assertion failed: {} != {}", - format_args!($fmt, x), - format_args!($fmt, y)); - } - })); - - #[test] - fn repr() { - fn check(s: &str, data: u64) { - assert_eq_fmt!("0x{:016X}", Atom::from(s).data, data); - } - - fn check_static(s: &str, x: Atom) { - assert_eq_fmt!("0x{:016X}", x.data, Atom::from(s).data); - assert_eq!(0x2, x.data & 0xFFFF_FFFF); - // The index is unspecified by phf. - assert!((x.data >> 32) <= STATIC_ATOM_SET.iter().len() as u64); - } - - // This test is here to make sure we don't change atom representation - // by accident. It may need adjusting if there are changes to the - // static atom table, the tag values, etc. - - // Static atoms - check_static("a", atom!("a")); - check_static("address", atom!("address")); - check_static("area", atom!("area")); - - // Inline atoms - check("e", 0x0000_0000_0000_6511); - check("xyzzy", 0x0000_797A_7A79_7851); - check("xyzzy01", 0x3130_797A_7A79_7871); - - // Dynamic atoms. 
This is a pointer so we can't verify every bit. - assert_eq!(0x00, Atom::from("a dynamic string").data & 0xf); - } - - #[test] - fn assert_sizes() { - // Guard against accidental changes to the sizes of things. - use std::mem; - assert_eq!(if cfg!(feature = "unstable") { 8 } else { 16 }, mem::size_of::()); - assert_eq!(48, mem::size_of::()); - } - - #[test] - fn test_threads() { - for _ in 0_u32..100 { - thread::spawn(move || { - let _ = Atom::from("a dynamic string"); - let _ = Atom::from("another string"); - }); - } - } - - #[test] - fn atom_macro() { - assert_eq!(atom!("body"), Atom::from("body")); - assert_eq!(atom!("font-weight"), Atom::from("font-weight")); - } - - #[test] - fn match_atom() { - assert_eq!(2, match Atom::from("head") { - atom!("br") => 1, - atom!("html") | atom!("head") => 2, - _ => 3, - }); - - assert_eq!(3, match Atom::from("body") { - atom!("br") => 1, - atom!("html") | atom!("head") => 2, - _ => 3, - }); - - assert_eq!(3, match Atom::from("zzzzzz") { - atom!("br") => 1, - atom!("html") | atom!("head") => 2, - _ => 3, - }); - } - - #[test] - fn ensure_deref() { - // Ensure we can Deref to a &str - let atom = Atom::from("foobar"); - let _: &str = &atom; - } - - #[test] - fn ensure_as_ref() { - // Ensure we can as_ref to a &str - let atom = Atom::from("foobar"); - let _: &str = atom.as_ref(); - } - - /// Atom uses #[unsafe_no_drop_flag] to stay small, so drop() may be called more than once. - /// In calls after the first one, the atom will be filled with a POST_DROP value. - /// drop() must be a no-op in this case. 
- #[cfg(feature = "unstable")] - #[test] - fn atom_drop_is_idempotent() { - use super::from_packed_dynamic; - unsafe { - assert_eq!(from_packed_dynamic(mem::POST_DROP_U64), None); - } - } - - #[test] - fn string_cache_entry_alignment_is_sufficient() { - assert!(mem::align_of::() >= ENTRY_ALIGNMENT); - } - - #[test] - fn test_ascii_lowercase() { - assert_eq!(Atom::from("").to_ascii_lowercase(), Atom::from("")); - assert_eq!(Atom::from("aZ9").to_ascii_lowercase(), Atom::from("az9")); - assert_eq!(Atom::from("The Quick Brown Fox!").to_ascii_lowercase(), Atom::from("the quick brown fox!")); - assert_eq!(Atom::from("JE VAIS À PARIS").to_ascii_lowercase(), Atom::from("je vais À paris")); - } - - #[test] - fn test_ascii_uppercase() { - assert_eq!(Atom::from("").to_ascii_uppercase(), Atom::from("")); - assert_eq!(Atom::from("aZ9").to_ascii_uppercase(), Atom::from("AZ9")); - assert_eq!(Atom::from("The Quick Brown Fox!").to_ascii_uppercase(), Atom::from("THE QUICK BROWN FOX!")); - assert_eq!(Atom::from("Je vais à Paris").to_ascii_uppercase(), Atom::from("JE VAIS à PARIS")); - } - - #[test] - fn test_eq_ignore_ascii_case() { - assert!(Atom::from("").eq_ignore_ascii_case(&Atom::from(""))); - assert!(Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("aZ9"))); - assert!(Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("Az9"))); - assert!(Atom::from("The Quick Brown Fox!").eq_ignore_ascii_case(&Atom::from("THE quick BROWN fox!"))); - assert!(Atom::from("Je vais à Paris").eq_ignore_ascii_case(&Atom::from("je VAIS à PARIS"))); - assert!(!Atom::from("").eq_ignore_ascii_case(&Atom::from("az9"))); - assert!(!Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from(""))); - assert!(!Atom::from("aZ9").eq_ignore_ascii_case(&Atom::from("9Za"))); - assert!(!Atom::from("The Quick Brown Fox!").eq_ignore_ascii_case(&Atom::from("THE quick BROWN fox!!"))); - assert!(!Atom::from("Je vais à Paris").eq_ignore_ascii_case(&Atom::from("JE vais À paris"))); - } - - #[test] - fn test_from_string() { - 
assert!(Atom::from("camembert".to_owned()) == Atom::from("camembert")); - } -} diff --git a/src/dynamic_set.rs b/src/dynamic_set.rs new file mode 100644 index 0000000..4442b4d --- /dev/null +++ b/src/dynamic_set.rs @@ -0,0 +1,112 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use parking_lot::Mutex; +use std::borrow::Cow; +use std::mem; +use std::ptr::NonNull; +use std::sync::atomic::AtomicIsize; +use std::sync::atomic::Ordering::SeqCst; +use std::sync::OnceLock; + +const NB_BUCKETS: usize = 1 << 12; // 4096 +const BUCKET_MASK: u32 = (1 << 12) - 1; + +pub(crate) struct Set { + buckets: Box<[Mutex>>]>, +} + +pub(crate) struct Entry { + pub(crate) string: Box, + pub(crate) hash: u32, + pub(crate) ref_count: AtomicIsize, + next_in_bucket: Option>, +} + +// Addresses are multiples of this, +// and therefore have TAG_MASK bits unset, available for tagging. +pub(crate) const ENTRY_ALIGNMENT: usize = 4; + +#[test] +fn entry_alignment_is_sufficient() { + assert!(mem::align_of::() >= ENTRY_ALIGNMENT); +} + +pub(crate) fn dynamic_set() -> &'static Set { + // NOTE: Using const initialization for buckets breaks the small-stack test. 
+ // ``` + // // buckets: [Mutex>>; NB_BUCKETS], + // const MUTEX: Mutex>> = Mutex::new(None); + // let buckets = Box::new([MUTEX; NB_BUCKETS]); + // ``` + static DYNAMIC_SET: OnceLock = OnceLock::new(); + + DYNAMIC_SET.get_or_init(|| { + let buckets = (0..NB_BUCKETS).map(|_| Mutex::new(None)).collect(); + Set { buckets } + }) +} + +impl Set { + pub(crate) fn insert(&self, string: Cow, hash: u32) -> NonNull { + let bucket_index = (hash & BUCKET_MASK) as usize; + let mut linked_list = self.buckets[bucket_index].lock(); + + { + let mut ptr: Option<&mut Box> = linked_list.as_mut(); + + while let Some(entry) = ptr.take() { + if entry.hash == hash && *entry.string == *string { + if entry.ref_count.fetch_add(1, SeqCst) > 0 { + return NonNull::from(&mut **entry); + } + // Uh-oh. The pointer's reference count was zero, which means someone may try + // to free it. (Naive attempts to defend against this, for example having the + // destructor check to see whether the reference count is indeed zero, don't + // work due to ABA.) Thus we need to temporarily add a duplicate string to the + // list. 
+ entry.ref_count.fetch_sub(1, SeqCst); + break; + } + ptr = entry.next_in_bucket.as_mut(); + } + } + debug_assert!(mem::align_of::() >= ENTRY_ALIGNMENT); + let string = string.into_owned(); + let mut entry = Box::new(Entry { + next_in_bucket: linked_list.take(), + hash, + ref_count: AtomicIsize::new(1), + string: string.into_boxed_str(), + }); + let ptr = NonNull::from(&mut *entry); + *linked_list = Some(entry); + ptr + } + + pub(crate) fn remove(&self, ptr: *mut Entry) { + let value: &Entry = unsafe { &*ptr }; + let bucket_index = (value.hash & BUCKET_MASK) as usize; + + let mut linked_list = self.buckets[bucket_index].lock(); + debug_assert!(value.ref_count.load(SeqCst) == 0); + let mut current: &mut Option> = &mut linked_list; + + while let Some(entry_ptr) = current.as_mut() { + let entry_ptr: *mut Entry = &mut **entry_ptr; + if entry_ptr == ptr { + mem::drop(mem::replace(current, unsafe { + (*entry_ptr).next_in_bucket.take() + })); + break; + } + current = unsafe { &mut (*entry_ptr).next_in_bucket }; + } + } +} diff --git a/src/event.rs b/src/event.rs deleted file mode 100644 index 79af4a1..0000000 --- a/src/event.rs +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use std::sync::Mutex; -use rustc_serialize::{Encoder, Encodable}; - -#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Debug)] -pub enum Event { - Intern(u64), - Insert(u64, String), - Remove(u64), -} - -lazy_static! { - pub static ref LOG: Mutex> - = Mutex::new(Vec::with_capacity(50_000)); -} - -pub fn log(e: Event) { - LOG.lock().unwrap().push(e); -} - -macro_rules! 
log (($e:expr) => (::event::log($e))); - -// Serialize by converting to this private struct, -// which produces more convenient output. - -#[derive(RustcEncodable)] -struct SerializeEvent<'a> { - event: &'static str, - id: u64, - string: Option<&'a String>, -} - -impl Encodable for Event { - fn encode(&self, s: &mut S) -> Result<(), S::Error> { - let (event, id, string) = match *self { - Event::Intern(id) => ("intern", id, None), - Event::Insert(id, ref s) => ("insert", id, Some(s)), - Event::Remove(id) => ("remove", id, None), - }; - - SerializeEvent { - event: event, - id: id, - string: string - }.encode(s) - } -} diff --git a/src/lib.rs b/src/lib.rs index 65ad039..3cc29b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,70 +7,133 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![crate_name = "string_cache"] -#![crate_type = "rlib"] +//! +//! A library for interning things that are `AsRef<str>`. +//! +//! Some strings may be interned at compile time using the `string-cache-codegen` crate, or the +//! `EmptyStaticAtomSet`, which has no compile-time interned strings, may be used. An `Atom` is an +//! interned string for a given set (either `EmptyStaticAtomSet` or a generated `StaticAtomSet`). +//! +//! Generated `Atom`s will have associated macros to intern static strings at compile-time. +//! +//! # Examples +//! +//! Here are two examples, one with compile-time `Atom`s, and one without. +//! +//! ## With compile-time atoms +//! +//! In `Cargo.toml`: +//! ```toml +//! [dependencies] +//! string_cache = "0.9" +//! +//! [build-dependencies] +//! string_cache_codegen = "0.6" +//! ``` +//! +//! In `build.rs`: +//! +//! ```ignore +//! extern crate string_cache_codegen; +//! +//! use std::env; +//! use std::path::Path; +//! +//! fn main() { +//! string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!") +//! .atoms(&["foo", "bar"]) +//! .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs")) +//!
.unwrap() +//! } +//! ``` +//! +//! In `lib.rs`: +//! +//! ```ignore +//! extern crate string_cache; +//! +//! mod foo { +//! include!(concat!(env!("OUT_DIR"), "/foo_atom.rs")); +//! } +//! +//! fn use_the_atom(t: &str) { +//! match *t { +//! foo_atom!("foo") => println!("Found foo!"), +//! foo_atom!("bar") => println!("Found bar!"), +//! // foo_atom!("baz") => println!("Found baz!"), - would be a compile time error +//! _ => { +//! println!("String not interned"); +//! // We can intern strings at runtime as well +//! foo::FooAtom::from(t) +//! } +//! } +//! } +//! ``` +//! +//! ## No compile-time atoms +//! +//! ``` +//! # extern crate string_cache; +//! use string_cache::DefaultAtom; +//! +//! # fn main() { +//! let mut interned_stuff = Vec::new(); +//! let text = "here is a sentence of text that will be tokenised and +//! interned and some repeated tokens is of text and"; +//! for word in text.split_whitespace() { +//! let seen_before = interned_stuff.iter() +//! // We can use impl PartialEq<T> where T is anything string-like +//! // to compare interned strings to either other interned strings +//! // or actual strings. Comparing two interned strings is very fast +//! // (normally a single CPU operation). +//! .filter(|interned_word| interned_word == &word) +//! .count(); +//! if seen_before > 0 { +//! println!(r#"Seen the word "{}" {} times"#, word, seen_before); +//! } else { +//! println!(r#"Not seen the word "{}" before"#, word); +//! } +//! // We use the impl From<(Cow<'a, str>, or &'a str, or String)> for +//! // Atom to intern a new string. +//! interned_stuff.push(DefaultAtom::from(word)); +//! } +//! # } +//! ``` +//!
#![cfg_attr(test, deny(warnings))] -#![cfg_attr(all(test, feature = "unstable"), feature(test, filling_drop))] -#![cfg_attr(feature = "unstable", feature(unsafe_no_drop_flag))] -#![cfg_attr(feature = "heap_size", feature(plugin, custom_derive))] -#![cfg_attr(feature = "heap_size", plugin(heapsize_plugin))] -#[cfg(all(test, feature = "unstable"))] extern crate test; -#[cfg(feature = "log-events")] extern crate rustc_serialize; -#[cfg(feature = "heap_size")] extern crate heapsize; -#[cfg(test)] extern crate rand; -#[macro_use] extern crate lazy_static; -#[macro_use] extern crate debug_unreachable; -extern crate serde; -extern crate phf_shared; - -pub use atom::Atom; -pub use namespace::{Namespace, QualName}; - -#[macro_export] -macro_rules! qualname { - ("", $local:tt) => { - $crate::namespace::QualName { - ns: ns!(), - local: atom!($local), - } - }; - ($ns:tt, $local:tt) => { - $crate::namespace::QualName { - ns: ns!($ns), - local: atom!($local), - } - } -} +// Types, such as Atom, that impl Hash must follow the hash invariant: if two objects match +// with PartialEq, they must also have the same Hash. Clippy warns on types that derive one while +// manually impl-ing the other, because it seems easy for the two to drift apart, causing the +// invariant to be violated. +// +// But Atom is a newtype over NonZeroU64, and probably always will be, since cheap comparisons and +// copying are this library's purpose. So we know what the PartialEq comparison is going to do. +// +// The `get_hash` function, seen in `atom.rs`, consults that number, plus the global string interner +// tables. The only way for the resulting hash for two Atoms with the same inner 64-bit number to +// differ would be if the table entry changed between invocations, and that would be really bad. +#![allow(clippy::derive_hash_xor_eq)] -#[macro_export] -macro_rules! 
ns { - () => { $crate::Namespace(atom!("")) }; - (html) => { $crate::Namespace(atom!("http://www.w3.org/1999/xhtml")) }; - (xml) => { $crate::Namespace(atom!("http://www.w3.org/XML/1998/namespace")) }; - (xmlns) => { $crate::Namespace(atom!("http://www.w3.org/2000/xmlns/")) }; - (xlink) => { $crate::Namespace(atom!("http://www.w3.org/1999/xlink")) }; - (svg) => { $crate::Namespace(atom!("http://www.w3.org/2000/svg")) }; - (mathml) => { $crate::Namespace(atom!("http://www.w3.org/1998/Math/MathML")) }; -} +mod atom; +mod dynamic_set; +mod static_sets; +mod trivial_impls; -include!(concat!(env!("OUT_DIR"), "/atom_macro.rs")); +pub use atom::Atom; +pub use static_sets::{EmptyStaticAtomSet, PhfStrSet, StaticAtomSet}; -#[cfg(feature = "log-events")] -#[macro_use] -pub mod event; +/// Use this if you don’t care about static atoms. +pub type DefaultAtom = Atom; -pub mod atom; -pub mod namespace; -pub mod shared; +// Some minor tests of internal layout here. +// See ../integration-tests for much more. -// A private module so that macro-expanded idents like -// `::string_cache::atom::Atom` will also work in this crate. -// -// `libstd` uses the same trick. -#[doc(hidden)] -mod string_cache { - pub use atom; - pub use namespace; +/// Guard against accidental changes to the sizes of things. +#[test] +fn assert_sizes() { + use std::mem::size_of; + assert_eq!(size_of::(), 8); + assert_eq!(size_of::>(), size_of::(),); } diff --git a/src/namespace.rs b/src/namespace.rs deleted file mode 100644 index 4b4d142..0000000 --- a/src/namespace.rs +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! **Note:** This may move as string-cache becomes less Web-specific. 
- -use atom::Atom; - -/// An atom that is meant to represent a namespace in the HTML / XML sense. -/// Whether a given string represents a namespace is contextual, so this is -/// a transparent wrapper that will not catch all mistakes. -#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Clone)] -#[cfg_attr(feature = "heap_size", derive(HeapSizeOf))] -pub struct Namespace(pub Atom); - -/// A name with a namespace. -#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Debug, Clone)] -#[cfg_attr(feature = "heap_size", derive(HeapSizeOf))] -pub struct QualName { - pub ns: Namespace, - pub local: Atom, -} - -impl QualName { - #[inline] - pub fn new(ns: Namespace, local: Atom) -> QualName { - QualName { - ns: ns, - local: local, - } - } -} - -#[cfg(test)] -mod tests { - use super::{Namespace, QualName}; - use Atom; - - #[test] - fn ns_macro() { - assert_eq!(ns!(), Namespace(Atom::from(""))); - - assert_eq!(ns!(html), Namespace(Atom::from("http://www.w3.org/1999/xhtml"))); - assert_eq!(ns!(xml), Namespace(Atom::from("http://www.w3.org/XML/1998/namespace"))); - assert_eq!(ns!(xmlns), Namespace(Atom::from("http://www.w3.org/2000/xmlns/"))); - assert_eq!(ns!(xlink), Namespace(Atom::from("http://www.w3.org/1999/xlink"))); - assert_eq!(ns!(svg), Namespace(Atom::from("http://www.w3.org/2000/svg"))); - assert_eq!(ns!(mathml), Namespace(Atom::from("http://www.w3.org/1998/Math/MathML"))); - } - - #[test] - fn qualname() { - assert_eq!(QualName::new(ns!(), atom!("")), - QualName { ns: ns!(), local: Atom::from("") }); - assert_eq!(QualName::new(ns!(xml), atom!("base")), - QualName { ns: ns!(xml), local: atom!("base") }); - } - - #[test] - fn qualname_macro() { - assert_eq!(qualname!("", ""), QualName { ns: ns!(), local: atom!("") }); - assert_eq!(qualname!(xml, "base"), QualName { ns: ns!(xml), local: atom!("base") }); - } -} diff --git a/src/shared.rs b/src/shared.rs deleted file mode 100644 index a653872..0000000 --- a/src/shared.rs +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2015 
The Servo Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use phf_shared; - -// FIXME(rust-lang/rust#18153): generate these from an enum -pub const DYNAMIC_TAG: u8 = 0b_00; -pub const INLINE_TAG: u8 = 0b_01; // len in upper nybble -pub const STATIC_TAG: u8 = 0b_10; -pub const TAG_MASK: u64 = 0b_11; -pub const ENTRY_ALIGNMENT: usize = 4; // Multiples have TAG_MASK bits unset, available for tagging. - -pub const MAX_INLINE_LEN: usize = 7; - -pub const STATIC_SHIFT_BITS: usize = 32; - -pub fn pack_static(n: u32) -> u64 { - (STATIC_TAG as u64) | ((n as u64) << STATIC_SHIFT_BITS) -} - -pub struct StaticAtomSet { - pub key: u64, - pub disps: &'static [(u32, u32)], - pub atoms: &'static [&'static str], -} - -impl StaticAtomSet { - #[inline] - pub fn get_index_or_hash(&self, s: &str) -> Result { - let hash = phf_shared::hash(s, self.key); - let index = phf_shared::get_index(hash, self.disps, self.atoms.len()); - if self.atoms[index as usize] == s { - Ok(index) - } else { - Err(hash) - } - } - - #[inline] - pub fn index(&self, i: u32) -> Option<&'static str> { - self.atoms.get(i as usize).map(|&s| s) - } - - #[inline] - pub fn iter(&self) -> ::std::slice::Iter<&'static str> { - self.atoms.iter() - } -} diff --git a/src/static_atom_list.rs b/src/static_atom_list.rs deleted file mode 100644 index ebb4965..0000000 --- a/src/static_atom_list.rs +++ /dev/null @@ -1,1263 +0,0 @@ -// Copyright 2014 The Servo Project Developers. See the -// COPYRIGHT file at the top-level directory of this distribution. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -pub static ATOMS: &'static [&'static str] = &[ - - // The order is not preserved by phf. - - "a", - "address", - "applet", - "area", - "article", - "aside", - "b", - "base", - "basefont", - "bgsound", - "big", - "blockquote", - "body", - "br", - "button", - "caption", - "col", - "colgroup", - "dd", - "dt", - "embed", - "form", - "frame", - "frameset", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "head", - "html", - "input", - "li", - "link", - "marquee", - "meta", - "noframes", - "noscript", - "object", - "optgroup", - "option", - "param", - "plaintext", - "pre", - "preload", - "rp", - "rt", - "script", - "select", - "source", - "style", - "svg", - "table", - "tbody", - "td", - "template", - "textarea", - "tfoot", - "th", - "thead", - "title", - "tr", - "track", - "xmp", - - "", - - "*", - - // XML namespaces known to the HTML syntax spec - "http://www.w3.org/1999/xhtml", - "http://www.w3.org/XML/1998/namespace", - "http://www.w3.org/2000/xmlns/", - "http://www.w3.org/1999/xlink", - "http://www.w3.org/2000/svg", - "http://www.w3.org/1998/Math/MathML", - - "#text", - "#comment", - "#document", - "#document-fragment", - - // User agent strings - "4.0", - "Gecko", - "Linux", - "Mac", - "Mozilla", - "Netscape", - "Win32", - - "abbr", - "abort", - "abs", - "accent", - "accent-height", - "accentunder", - "accept", - "accept-charset", - "accesskey", - "accumulate", - "acronym", - "action", - "actiontype", - "active", - "actuate", - "additive", - "afterscriptexecute", - "align", - "alignment-baseline", - "alignmentscope", - "alink", - "alphabetic", - "alt", - "altglyph", - "altGlyph", - "altglyphdef", - "altGlyphDef", - "altglyphitem", - "altGlyphItem", - "altimg", - "alttext", - "amplitude", - "and", - "animate", - "animatecolor", - "animateColor", - "animatemotion", - "animateMotion", - "animatetransform", - "animateTransform", - "animation", - "annotation", - "annotation-xml", - "apply", - "approx", - "arabic-form", - "arccos", - "arccosh", - "arccot", - 
"arccoth", - "arccsc", - "arccsch", - "archive", - "arcrole", - "arcsec", - "arcsech", - "arcsin", - "arcsinh", - "arctan", - "arctanh", - "arg", - "aria-activedescendant", - "aria-atomic", - "aria-autocomplete", - "aria-busy", - "aria-channel", - "aria-checked", - "aria-controls", - "aria-datatype", - "aria-describedby", - "aria-disabled", - "aria-dropeffect", - "aria-expanded", - "aria-flowto", - "aria-grab", - "aria-haspopup", - "aria-hidden", - "aria-invalid", - "aria-labelledby", - "aria-level", - "aria-live", - "aria-multiline", - "aria-multiselectable", - "aria-owns", - "aria-posinset", - "aria-pressed", - "aria-readonly", - "aria-relevant", - "aria-required", - "aria-secret", - "aria-selected", - "aria-setsize", - "aria-sort", - "aria-templateid", - "aria-valuemax", - "aria-valuemin", - "aria-valuenow", - "ascent", - "async", - "attributename", - "attributeName", - "attributetype", - "attributeType", - "audio", - "autocomplete", - "autofocus", - "autoplay", - "autosubmit", - "axis", - "azimuth", - "background", - "background-attachment", - "background-clip", - "background-color", - "background-image", - "background-origin", - "background-position", - "background-repeat", - "background-size", - "basefrequency", - "baseFrequency", - "baseline", - "baseline-shift", - "baseprofile", - "baseProfile", - "bbox", - "bdi", - "bdo", - "beforescriptexecute", - "beforeunload", - "begin", - "bevel", - "bevelled", - "bgcolor", - "bias", - "blink", - "blob", - "border", - "border-bottom", - "border-bottom-color", - "border-bottom-left-radius", - "border-bottom-right-radius", - "border-bottom-style", - "border-bottom-width", - "border-color", - "border-left", - "border-left-color", - "border-left-style", - "border-left-width", - "border-radius", - "border-right", - "border-right-color", - "border-right-style", - "border-right-width", - "border-style", - "border-top", - "border-top-color", - "border-top-left-radius", - "border-top-right-radius", - "border-top-style", - 
"border-top-width", - "border-width", - "bottom", - "butt", - "bvar", - "by", - "calcmode", - "calcMode", - "canvas", - "cap-height", - "card", - "cartesianproduct", - "ceiling", - "cellpadding", - "cellspacing", - "center", - "change", - "char", - "charoff", - "charset", - "checkbox", - "checked", - "ci", - "circle", - "cite", - "class", - "classid", - "clear", - "click", - "clip", - "clip-path", - "clippath", - "clipPath", - "clippathunits", - "clipPathUnits", - "clip-rule", - "close", - "closure", - "cn", - "code", - "codebase", - "codetype", - "codomain", - "color", - "color-interpolation", - "color-interpolation-filters", - "color-profile", - "color-rendering", - "cols", - "colspan", - "columnalign", - "column-count", - "columnlines", - "columns", - "columnspacing", - "columnspan", - "column-width", - "columnwidth", - "compact", - "complexes", - "compose", - "condition", - "conjugate", - "content", - "contenteditable", - "contentscripttype", - "contentScriptType", - "contentstyletype", - "contentStyleType", - "contextmenu", - "controls", - "coords", - "cos", - "cosh", - "cot", - "coth", - "crossorigin", - "csc", - "csch", - "csymbol", - "curl", - "cursor", - "customevent", - "cx", - "cy", - "d", - "data", - "datafld", - "dataformatas", - "datalist", - "datasrc", - "datatemplate", - "date", - "datetime", - "datetime-local", - "declare", - "default", - "defer", - "definition-src", - "definitionurl", - "definitionURL", - "defs", - "degree", - "del", - "depth", - "desc", - "descent", - "details", - "determinant", - "dfn", - "dialog", - "diff", - "diffuseconstant", - "diffuseConstant", - "dir", - "direction", - "dirname", - "disabled", - "discard", - "display", - "displaystyle", - "div", - "divergence", - "divide", - "divisor", - "dl", - "domain", - "domainofapplication", - "DOMContentLoaded", - "dominant-baseline", - "draggable", - "dur", - "dx", - "dy", - "edge", - "edgemode", - "edgeMode", - "elevation", - "ellipse", - "em", - "email", - "emptyset", - 
"enable-background", - "encoding", - "enctype", - "end", - "eq", - "equalcolumns", - "equalrows", - "equivalent", - "error", - "eulergamma", - "event", - "events", - "exists", - "exp", - "exponent", - "exponentiale", - "externalresourcesrequired", - "externalResourcesRequired", - "face", - "factorial", - "factorof", - "false", - "feblend", - "feBlend", - "fecolormatrix", - "feColorMatrix", - "fecomponenttransfer", - "feComponentTransfer", - "fecomposite", - "feComposite", - "feconvolvematrix", - "feConvolveMatrix", - "fediffuselighting", - "feDiffuseLighting", - "fedisplacementmap", - "feDisplacementMap", - "fedistantlight", - "feDistantLight", - "fedropshadow", - "feDropShadow", - "feflood", - "feFlood", - "fefunca", - "feFuncA", - "fefuncb", - "feFuncB", - "fefuncg", - "feFuncG", - "fefuncr", - "feFuncR", - "fegaussianblur", - "feGaussianBlur", - "feimage", - "feImage", - "femerge", - "feMerge", - "femergenode", - "feMergeNode", - "femorphology", - "feMorphology", - "fence", - "feoffset", - "feOffset", - "fepointlight", - "fePointLight", - "fespecularlighting", - "feSpecularLighting", - "fespotlight", - "feSpotLight", - "fetile", - "feTile", - "feturbulence", - "feTurbulence", - "fieldset", - "figcaption", - "figure", - "file", - "fill", - "fill-opacity", - "fill-rule", - "filter", - "filterres", - "filterRes", - "filterunits", - "filterUnits", - "float", - "flood", - "flood-color", - "flood-opacity", - "floor", - "fn", - "font", - "font-face", - "font-face-format", - "font-face-name", - "font-face-src", - "font-face-uri", - "font-family", - "fontfamily", - "font-size", - "fontsize", - "font-size-adjust", - "font-stretch", - "font-style", - "fontstyle", - "font-variant", - "font-weight", - "fontweight", - "footer", - "for", - "forall", - "foreignobject", - "foreignObject", - "formaction", - "format", - "formenctype", - "formmethod", - "formnovalidate", - "formtarget", - "frameborder", - "framespacing", - "from", - "fx", - "fy", - "g", - "g1", - "g2", - "gcd", - 
"geq", - "glyph", - "glyph-name", - "glyph-orientation-horizontal", - "glyph-orientation-vertical", - "glyphref", - "glyphRef", - "grad", - "gradienttransform", - "gradientTransform", - "gradientunits", - "gradientUnits", - "groupalign", - "gt", - "handler", - "hanging", - "header", - "headers", - "height", - "hgroup", - "hidden", - "hidefocus", - "high", - "hkern", - "horiz-adv-x", - "horiz-origin-x", - "horiz-origin-y", - "hr", - "href", - "hreflang", - "hspace", - "htmlevents", - "http-equiv", - "i", - "icon", - "id", - "ident", - "ideographic", - "iframe", - "image", - "image-rendering", - "imaginary", - "imaginaryi", - "img", - "implies", - "important", - "in", - "in2", - "index", - "infinity", - "inputmode", - "ins", - "int", - "integers", - "intercept", - "intersect", - "interval", - "invalid", - "inverse", - "irrelevant", - "isindex", - "ismap", - "k", - "k1", - "k2", - "k3", - "k4", - "kbd", - "kernelmatrix", - "kernelMatrix", - "kernelunitlength", - "kernelUnitLength", - "kerning", - "keyboardevent", - "keydown", - "keyevents", - "keygen", - "keypoints", - "keyPoints", - "keypress", - "keysplines", - "keySplines", - "keytimes", - "keyTimes", - "keyup", - "label", - "lambda", - "lang", - "language", - "laplacian", - "largeop", - "lcm", - "left", - "legend", - "lengthadjust", - "lengthAdjust", - "leq", - "letter-spacing", - "lighting-color", - "limit", - "limitingconeangle", - "limitingConeAngle", - "line", - "lineargradient", - "linearGradient", - "linebreak", - "line-height", - "linethickness", - "list", - "listener", - "listing", - "list-style", - "list-style-image", - "list-style-position", - "list-style-type", - "ln", - "load", - "loadstart", - "loadend", - "local", - "log", - "logbase", - "longdesc", - "loop", - "low", - "lowlimit", - "lowsrc", - "lquote", - "lspace", - "lt", - "macros", - "maction", - "main", - "maligngroup", - "malignmark", - "manifest", - "map", - "margin", - "margin-bottom", - "marginheight", - "margin-left", - "margin-right", - 
"margin-top", - "marginwidth", - "mark", - "marker", - "marker-end", - "markerheight", - "markerHeight", - "marker-mid", - "marker-start", - "markerunits", - "markerUnits", - "markerwidth", - "markerWidth", - "mask", - "maskcontentunits", - "maskContentUnits", - "maskunits", - "maskUnits", - "math", - "mathbackground", - "mathcolor", - "mathematical", - "mathsize", - "mathvariant", - "matrix", - "matrixrow", - "max", - "max-height", - "maxlength", - "maxsize", - "max-width", - "mean", - "media", - "median", - "mediummathspace", - "menclose", - "menu", - "menuitem", - "merror", - "message", - "messageevent", - "metadata", - "meter", - "method", - "mfenced", - "mfrac", - "mglyph", - "mi", - "min", - "min-height", - "minsize", - "minus", - "min-width", - "missing-glyph", - "miter", - "mlabeledtr", - "mmultiscripts", - "mn", - "mo", - "mode", - "moment", - "momentabout", - "month", - "mousedown", - "mouseevent", - "mouseevents", - "mouseover", - "mouseup", - "movablelimits", - "mover", - "mozbrowser", - "mpadded", - "mpath", - "mphantom", - "mprescripts", - "mroot", - "mrow", - "ms", - "mspace", - "msqrt", - "mstyle", - "msub", - "msubsup", - "msup", - "mtable", - "mtd", - "mtext", - "mtr", - "multicol", - "multipart/form-data", - "multiple", - "munder", - "munderover", - "name", - "nargs", - "naturalnumbers", - "nav", - "neq", - "nest", - "nextid", - "no message", - "nobr", - "noembed", - "nohref", - "none", - "noresize", - "noshade", - "not", - "notanumber", - "notation", - "notin", - "notprsubset", - "notsubset", - "novalidate", - "nowrap", - "number", - "numoctaves", - "numOctaves", - "occurrence", - "off", - "offset", - "ol", - "on", - "onabort", - "onactivate", - "onafterprint", - "onafterupdate", - "onbefordeactivate", - "onbeforeactivate", - "onbeforecopy", - "onbeforecut", - "onbeforeeditfocus", - "onbeforepaste", - "onbeforeprint", - "onbeforeunload", - "onbeforeupdate", - "onbegin", - "onblur", - "onbounce", - "oncellchange", - "onchange", - "onclick", - 
"oncontextmenu", - "oncontrolselect", - "oncopy", - "oncut", - "ondataavailable", - "ondatasetchanged", - "ondatasetcomplete", - "ondblclick", - "ondeactivate", - "ondrag", - "ondragdrop", - "ondragend", - "ondragenter", - "ondragleave", - "ondragover", - "ondragstart", - "ondrop", - "onend", - "onerror", - "onerrorupdate", - "onfilterchange", - "onfinish", - "onfocus", - "onfocusin", - "onfocusout", - "onformchange", - "onforminput", - "onhashchange", - "onhelp", - "oninput", - "oninvalid", - "onkeydown", - "onkeypress", - "onkeyup", - "onlanguagechange", - "onload", - "onlosecapture", - "onmessage", - "onmousedown", - "onmouseenter", - "onmouseleave", - "onmousemove", - "onmouseout", - "onmouseover", - "onmouseup", - "onmousewheel", - "onmove", - "onmoveend", - "onmovestart", - "onoffline", - "ononline", - "onpagehide", - "onpageshow", - "onpaste", - "onpopstate", - "onpropertychange", - "onreadystatechange", - "onrepeat", - "onreset", - "onresize", - "onrowenter", - "onrowexit", - "onrowsdelete", - "onrowsinserted", - "onscroll", - "onselect", - "onselectstart", - "onstart", - "onstop", - "onstorage", - "onsubmit", - "onunload", - "onzoom", - "opacity", - "open", - "operator", - "optimum", - "or", - "order", - "orient", - "orientation", - "origin", - "other", - "otherwise", - "outerproduct", - "outline", - "outline-color", - "outline-style", - "outline-width", - "output", - "overflow", - "overflow-wrap", - "overflow-x", - "overflow-y", - "overline-position", - "overline-thickness", - "p", - "padding", - "padding-bottom", - "padding-left", - "padding-right", - "padding-top", - "panose-1", - "partialdiff", - "password", - "path", - "pathlength", - "pathLength", - "pattern", - "patterncontentunits", - "patternContentUnits", - "patterntransform", - "patternTransform", - "patternunits", - "patternUnits", - "pi", - "piece", - "piecewise", - "ping", - "placeholder", - "plus", - "pointer-events", - "points", - "pointsatx", - "pointsAtX", - "pointsaty", - "pointsAtY", - 
"pointsatz", - "pointsAtZ", - "polygon", - "polyline", - "position", - "post", - "poster", - "power", - "prefetch", - "preservealpha", - "preserveAlpha", - "preserveaspectratio", - "preserveAspectRatio", - "primes", - "primitiveunits", - "primitiveUnits", - "product", - "profile", - "progress", - "prompt", - "prsubset", - "q", - "quotient", - "r", - "radialgradient", - "radialGradient", - "radio", - "radiogroup", - "radius", - "range", - "rationals", - "rb", - "readonly", - "readystatechange", - "real", - "reals", - "rect", - "refx", - "refX", - "refy", - "refY", - "rel", - "reln", - "rem", - "rendering-intent", - "repeat", - "repeatcount", - "repeatCount", - "repeatdur", - "repeatDur", - "repeat-max", - "repeat-min", - "repeat-start", - "repeat-template", - "replace", - "required", - "requiredextensions", - "requiredExtensions", - "requiredfeatures", - "requiredFeatures", - "reset", - "resize", - "restart", - "result", - "rev", - "right", - "role", - "root", - "rotate", - "round", - "rowalign", - "rowlines", - "rows", - "rowspacing", - "rowspan", - "rquote", - "rspace", - "rtc", - "ruby", - "rule", - "rules", - "rx", - "ry", - "s", - "samp", - "sandbox", - "scalarproduct", - "scale", - "scheme", - "scope", - "scoped", - "scriptlevel", - "scriptminsize", - "scriptsizemultiplier", - "scrolldelay", - "scrolling", - "sdev", - "seamless", - "search", - "sec", - "sech", - "section", - "seed", - "selected", - "selection", - "selector", - "semantics", - "sep", - "separator", - "separators", - "serif", - "set", - "setdiff", - "shape", - "shape-rendering", - "show", - "sin", - "sinh", - "size", - "sizes", - "slope", - "small", - "solidcolor", - "space", - "spacer", - "spacing", - "span", - "specification", - "specularconstant", - "specularConstant", - "specularexponent", - "specularExponent", - "speed", - "spreadmethod", - "spreadMethod", - "square", - "src", - "srcdoc", - "standby", - "start", - "startoffset", - "startOffset", - "stddeviation", - "stdDeviation", - "stemh", 
- "stemv", - "step", - "stitchtiles", - "stitchTiles", - "stop", - "stop-color", - "stop-opacity", - "storage", - "stretchy", - "strike", - "strikethrough-position", - "strikethrough-thickness", - "string", - "stroke", - "stroke-dasharray", - "stroke-dashoffset", - "stroke-linecap", - "stroke-linejoin", - "stroke-miterlimit", - "stroke-opacity", - "stroke-width", - "strong", - "sub", - "submit", - "subscriptshift", - "subset", - "sum", - "summary", - "sup", - "superscriptshift", - "surfacescale", - "surfaceScale", - "switch", - "symbol", - "symmetric", - "systemlanguage", - "systemLanguage", - "tabindex", - "table-layout", - "tablevalues", - "tableValues", - "tan", - "tanh", - "target", - "targetx", - "targetX", - "targety", - "targetY", - "tbreak", - "tel", - "tendsto", - "text", - "text/plain", - "text-align", - "text-anchor", - "text-decoration", - "textlength", - "textLength", - "text-orientation", - "textpath", - "textPath", - "text-rendering", - "thickmathspace", - "thinmathspace", - "time", - "times", - "to", - "top", - "touchevent", - "transform", - "transition-delay", - "transition-duration", - "transition-property", - "transitions", - "transition-timing-function", - "transpose", - "tref", - "true", - "tspan", - "tt", - "type", - "u", - "u1", - "u2", - "uievent", - "uievents", - "ul", - "underline-position", - "underline-thickness", - "unicode", - "unicode-bidi", - "unicode-range", - "union", - "units-per-em", - "unselectable", - "uplimit", - "url", - "use", - "usemap", - "UTF-8", - "valign", - "v-alphabetic", - "value", - "values", - "valuetype", - "var", - "variance", - "vector", - "vectorproduct", - "version", - "vert-adv-y", - "vertical-align", - "vert-origin-x", - "vert-origin-y", - "verythickmathspace", - "verythinmathspace", - "veryverythickmathspace", - "veryverythinmathspace", - "v-hanging", - "video", - "v-ideographic", - "view", - "viewbox", - "viewBox", - "viewtarget", - "viewTarget", - "visibility", - "vkern", - "vlink", - "v-mathematical", - 
"vspace", - "wbr", - "webglcontextcreationerror", - "week", - "when", - "white-space", - "width", - "widths", - "word-spacing", - "word-wrap", - "wrap", - "writing-mode", - "x", - "x1", - "x2", - "xchannelselector", - "xChannelSelector", - "x-height", - "xlink", - "xlink:actuate", - "xlink:arcrole", - "xlink:href", - "xlink:role", - "xlink:show", - "xlink:title", - "xlink:type", - "xml:base", - "xml:lang", - "xmlns", - "xmlns:xlink", - "xml:space", - "xor", - "xref", - "y", - "y1", - "y2", - "ychannelselector", - "yChannelSelector", - "z", - "zoomandpan", - "zoomAndPan", -]; diff --git a/src/static_sets.rs b/src/static_sets.rs new file mode 100644 index 0000000..f7f1799 --- /dev/null +++ b/src/static_sets.rs @@ -0,0 +1,64 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/// A static `PhfStrSet` +/// +/// This trait is implemented by static sets of interned strings generated using +/// `string_cache_codegen`, and `EmptyStaticAtomSet` for when strings will be added dynamically. +/// +/// It is used by the methods of [`Atom`] to check if a string is present in the static set. +/// +/// [`Atom`]: struct.Atom.html +pub trait StaticAtomSet: Ord { + /// Get the location of the static string set in the binary. + fn get() -> &'static PhfStrSet; + /// Get the index of the empty string, which is in every set and is used for `Atom::default`. + fn empty_string_index() -> u32; +} + +/// A string set created using a [perfect hash function], specifically +/// [Hash, Displace and Compress]. +/// +/// See the CHD document for the meaning of the struct fields. 
+/// +/// [perfect hash function]: https://en.wikipedia.org/wiki/Perfect_hash_function +/// [Hash, Displace and Compress]: http://cmph.sourceforge.net/papers/esa09.pdf +pub struct PhfStrSet { + #[doc(hidden)] + pub key: u64, + #[doc(hidden)] + pub disps: &'static [(u32, u32)], + #[doc(hidden)] + pub atoms: &'static [&'static str], + #[doc(hidden)] + pub hashes: &'static [u32], +} + +/// An empty static atom set for when only dynamic strings will be added +#[derive(PartialEq, Eq, PartialOrd, Ord)] +pub struct EmptyStaticAtomSet; + +impl StaticAtomSet for EmptyStaticAtomSet { + fn get() -> &'static PhfStrSet { + // The name is a lie: this set is not empty (it contains the empty string) + // but that’s only to avoid divisions by zero in rust-phf. + static SET: PhfStrSet = PhfStrSet { + key: 0, + disps: &[(0, 0)], + atoms: &[""], + // "" SipHash'd, and xored with u64_hash_to_u32. + hashes: &[0x3ddddef3], + }; + &SET + } + + fn empty_string_index() -> u32 { + 0 + } +} diff --git a/src/trivial_impls.rs b/src/trivial_impls.rs new file mode 100644 index 0000000..960dde0 --- /dev/null +++ b/src/trivial_impls.rs @@ -0,0 +1,119 @@ +// Copyright 2014 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use crate::{Atom, StaticAtomSet}; +#[cfg(feature = "serde_support")] +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use std::borrow::Cow; +use std::fmt; + +impl ::precomputed_hash::PrecomputedHash for Atom { + fn precomputed_hash(&self) -> u32 { + self.get_hash() + } +} + +impl<'a, Static: StaticAtomSet> From<&'a Atom> for Atom { + fn from(atom: &'a Self) -> Self { + atom.clone() + } +} + +impl PartialEq for Atom { + fn eq(&self, other: &str) -> bool { + &self[..]
== other + } +} + +impl PartialEq> for str { + fn eq(&self, other: &Atom) -> bool { + self == &other[..] + } +} + +impl PartialEq for Atom { + fn eq(&self, other: &String) -> bool { + self[..] == other[..] + } +} + +impl<'a, Static: StaticAtomSet> From<&'a str> for Atom { + #[inline] + fn from(string_to_add: &str) -> Self { + Atom::from(Cow::Borrowed(string_to_add)) + } +} + +impl From for Atom { + #[inline] + fn from(string_to_add: String) -> Self { + Atom::from(Cow::Owned(string_to_add)) + } +} + +impl fmt::Display for Atom { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + ::fmt(self, f) + } +} + +impl AsRef for Atom { + fn as_ref(&self) -> &str { + self + } +} + +#[cfg(feature = "serde_support")] +impl Serialize for Atom { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let string: &str = self.as_ref(); + string.serialize(serializer) + } +} + +#[cfg(feature = "serde_support")] +impl<'a, Static: StaticAtomSet> Deserialize<'a> for Atom { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'a>, + { + use serde::de; + use std::marker::PhantomData; + + struct AtomVisitor(PhantomData); + + impl<'de, Static: StaticAtomSet> de::Visitor<'de> for AtomVisitor { + type Value = Atom; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "an Atom") + } + + fn visit_str(self, v: &str) -> Result + where + E: de::Error, + { + Ok(Atom::from(v)) + } + + fn visit_string(self, v: String) -> Result + where + E: de::Error, + { + Ok(Atom::from(v)) + } + } + + deserializer.deserialize_str(AtomVisitor(PhantomData)) + } +} diff --git a/string-cache-codegen/Cargo.toml b/string-cache-codegen/Cargo.toml new file mode 100644 index 0000000..20eced9 --- /dev/null +++ b/string-cache-codegen/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "string_cache_codegen" +version = "0.6.1" # Also update ../README.md when making a semver-breaking change +authors = [ "The Servo Project Developers" ] 
+description = "A codegen library for string-cache, developed as part of the Servo project." +license = "MIT OR Apache-2.0" +repository = "https://github.com/servo/string-cache" +documentation = "https://docs.rs/string_cache_codegen/" +edition = "2018" + +[lib] +name = "string_cache_codegen" +path = "lib.rs" + +[dependencies] +phf_generator = "0.13" +phf_shared = "0.13" +proc-macro2 = "1" +quote = "1" diff --git a/string-cache-codegen/LICENSE-APACHE b/string-cache-codegen/LICENSE-APACHE new file mode 100644 index 0000000..16fe87b --- /dev/null +++ b/string-cache-codegen/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/string-cache-codegen/LICENSE-MIT b/string-cache-codegen/LICENSE-MIT new file mode 100644 index 0000000..807526f --- /dev/null +++ b/string-cache-codegen/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2012-2013 Mozilla Foundation + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/string-cache-codegen/lib.rs b/string-cache-codegen/lib.rs new file mode 100644 index 0000000..525ef3a --- /dev/null +++ b/string-cache-codegen/lib.rs @@ -0,0 +1,393 @@ +// Copyright 2016 The Servo Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +//! A crate to create static string caches at compile time. +//! +//! # Examples +//! +//! With static atoms: +//! +//! In `Cargo.toml`: +//! +//! ```toml +//! [package] +//! build = "build.rs" +//! +//! [dependencies] +//! string_cache = "0.9" +//! +//! [build-dependencies] +//! string_cache_codegen = "0.6" +//! ``` +//! +//! In `build.rs`: +//! +//! ```no_run +//! extern crate string_cache_codegen; +//! +//! use std::env; +//! use std::path::Path; +//! +//! fn main() { +//! string_cache_codegen::AtomType::new("foo::FooAtom", "foo_atom!") +//! .atoms(&["foo", "bar"]) +//! .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs")) +//! .unwrap() +//! } +//! ``` +//! +//! In `lib.rs`: +//! +//! ```ignore +//! extern crate string_cache; +//! +//! mod foo { +//! include!(concat!(env!("OUT_DIR"), "/foo_atom.rs")); +//! } +//! ``` +//! +//! The generated code will define a `FooAtom` type and a `foo_atom!` macro. +//! The macro can be used in expressions or patterns, with strings listed in `build.rs`. +//! For example: +//! +//! ```ignore +//! fn compute_something(input: &foo::FooAtom) -> u32 { +//! match *input { +//! foo_atom!("foo") => 1, +//! foo_atom!("bar") => 2, +//! _ => 3, +//! } +//! } +//! ``` +//!
+ +#![recursion_limit = "128"] + +use proc_macro2::Ident; +use quote::quote; +use std::collections::BTreeSet; +use std::fs::File; +use std::io::{self, BufWriter, Write}; +use std::path::Path; + +/// A builder for a static atom set and relevant macros +pub struct AtomType { + path: String, + atom_doc: Option, + static_set_doc: Option, + macro_name: String, + macro_doc: Option, + atoms: BTreeSet, +} + +impl AtomType { + /// Constructs a new static atom set builder + /// + /// `path` is a path within a crate of the atom type that will be created. + /// e.g. `"FooAtom"` at the crate root or `"foo::Atom"` if the generated code + /// is included in a `foo` module. + /// + /// `macro_name` must end with `!`. + /// + /// For example, `AtomType::new("foo::FooAtom", "foo_atom!")` will generate: + /// + /// ```ignore + /// pub type FooAtom = ::string_cache::Atom<FooAtomStaticSet>; + /// pub struct FooAtomStaticSet; + /// impl ::string_cache::StaticAtomSet for FooAtomStaticSet { + /// // ... + /// } + /// #[macro_export] + /// macro_rules! foo_atom { + /// // Expands to: $crate::foo::FooAtom { … } + /// } + /// ``` + pub fn new(path: &str, macro_name: &str) -> Self { + assert!(macro_name.ends_with("!"), "`macro_name` must end with '!'"); + AtomType { + path: path.to_owned(), + macro_name: macro_name[..macro_name.len() - "!".len()].to_owned(), + atom_doc: None, + static_set_doc: None, + macro_doc: None, + atoms: BTreeSet::new(), + } + } + + /// Add some documentation to the generated Atom type alias. + /// + /// This can help the user know that the type uses interned strings. + /// + /// Note that `docs` should not contain the `///` at the front of normal docs. + pub fn with_atom_doc(&mut self, docs: &str) -> &mut Self { + self.atom_doc = Some(docs.to_owned()); + self + } + + /// Add some documentation to the generated static set.
+ /// + /// This can help the user know that this type is zero-sized and just references a static + /// lookup table, or point them to the `Atom` type alias for more info. + /// + /// Note that `docs` should not contain the `///` at the front of normal docs. + pub fn with_static_set_doc(&mut self, docs: &str) -> &mut Self { + self.static_set_doc = Some(docs.to_owned()); + self + } + + /// Add some documentation to the generated macro. + /// + /// Note that `docs` should not contain the `///` at the front of normal docs. + pub fn with_macro_doc(&mut self, docs: &str) -> &mut Self { + self.macro_doc = Some(docs.to_owned()); + self + } + + /// Adds an atom to the builder + pub fn atom(&mut self, s: &str) -> &mut Self { + self.atoms.insert(s.to_owned()); + self + } + + /// Adds multiple atoms to the builder + pub fn atoms(&mut self, iter: I) -> &mut Self + where + I: IntoIterator, + I::Item: AsRef, + { + self.atoms + .extend(iter.into_iter().map(|s| s.as_ref().to_owned())); + self + } + + /// Write generated code to `destination`. + pub fn write_to(&mut self, mut destination: W) -> io::Result<()> + where + W: Write, + { + destination.write_all( + self.to_tokens() + .to_string() + // Insert some newlines to make the generated code slightly easier to read. + .replace(" [ \"", "[\n\"") + .replace("\" , ", "\",\n") + .replace(" ( \"", "\n( \"") + .replace("; ", ";\n") + .as_bytes(), + ) + } + + #[cfg(test)] + /// Write generated code to destination [`Vec`] and return it as [`String`] + /// + /// Used mostly for testing or displaying a value. + pub fn write_to_string(&mut self, mut destination: Vec) -> io::Result { + destination.write_all( + self.to_tokens() + .to_string() + // Insert some newlines to make the generated code slightly easier to read. 
+ .replace(" [ \"", "[\n\"") + .replace("\" , ", "\",\n") + .replace(" ( \"", "\n( \"") + .replace("; ", ";\n") + .as_bytes(), + )?; + let str = String::from_utf8(destination).unwrap(); + Ok(str) + } + + fn to_tokens(&mut self) -> proc_macro2::TokenStream { + // `impl Default for Atom` requires the empty string to be in the static set. + // This also makes sure the set is non-empty, + // since an empty set would cause divisions by zero in rust-phf. + self.atoms.insert(String::new()); + + // Strings over 7 bytes + empty string added to static set. + // Otherwise stored inline. + let (static_strs, inline_strs): (Vec<_>, Vec<_>) = self + .atoms + .iter() + .map(String::as_str) + .partition(|s| s.len() > 7 || s.is_empty()); + + // Static strings + let hash_state = phf_generator::generate_hash(&static_strs); + let phf_generator::HashState { key, disps, map } = hash_state; + let (disps0, disps1): (Vec<_>, Vec<_>) = disps.into_iter().unzip(); + let atoms: Vec<&str> = map.iter().map(|&idx| static_strs[idx]).collect(); + let empty_string_index = atoms.iter().position(|s| s.is_empty()).unwrap() as u32; + let indices = 0..atoms.len() as u32; + + fn is_valid_ident(name: &str) -> bool { + let begins_with_letter_or_underscore = name + .chars() + .next() + .is_some_and(|c| c.is_alphabetic() || c == '_'); + let is_alphanumeric = name.chars().all(|c| c.is_alphanumeric() || c == '_'); + + begins_with_letter_or_underscore && is_alphanumeric + } + + let atoms_for_idents: Vec<&str> = atoms + .iter() + .copied() + .filter(|x| is_valid_ident(x)) + .collect(); + let atom_idents: Vec = atoms_for_idents.iter().map(|atom| new_term(atom)).collect(); + + let istrs_for_idents: Vec<&str> = inline_strs + .iter() + .copied() + .filter(|x| is_valid_ident(x)) + .collect(); + let istr_idents: Vec = istrs_for_idents.iter().map(|atom| new_term(atom)).collect(); + + let hashes: Vec = atoms + .iter() + .map(|string| { + let hash = phf_shared::hash(string, &key); + (hash.g ^ hash.f1) as u32 + }) + .collect(); + + let
mut path_parts = self.path.rsplitn(2, "::"); + let type_name = path_parts.next().unwrap(); + let module = match path_parts.next() { + Some(m) => format!("$crate::{}", m), + None => format!("$crate"), + }; + let atom_doc = match self.atom_doc { + Some(ref doc) => quote!(#[doc = #doc]), + None => quote!(), + }; + let static_set_doc = match self.static_set_doc { + Some(ref doc) => quote!(#[doc = #doc]), + None => quote!(), + }; + let macro_doc = match self.macro_doc { + Some(ref doc) => quote!(#[doc = #doc]), + None => quote!(), + }; + fn new_term(string: &str) -> Ident { + Ident::new(string, proc_macro2::Span::call_site()) + } + let static_set_name = new_term(&format!("{}StaticSet", type_name)); + let type_name = new_term(type_name); + let macro_name = new_term(&*self.macro_name); + let module = module.parse::().unwrap(); + let atom_prefix = format!("ATOM_{}_", type_name.to_string().to_uppercase()); + let new_const_name = |atom: &str| { + let mut name = atom_prefix.clone(); + for c in atom.chars() { + name.push_str(&format!("_{:02X}", c as u32)) + } + new_term(&name) + }; + let const_names: Vec<_> = atoms.iter().copied().map(new_const_name).collect(); + let ident_const_names: Vec<_> = atoms_for_idents + .iter() + .copied() + .map(new_const_name) + .collect(); + let ident_inline_const_names: Vec<_> = istrs_for_idents + .iter() + .copied() + .map(new_const_name) + .collect(); + + // Inline strings + let (inline_const_names, inline_values_and_lengths): (Vec<_>, Vec<_>) = inline_strs + .iter() + .map(|s| { + let const_name = new_const_name(s); + + let mut value = 0u64; + for (index, c) in s.bytes().enumerate() { + value = value | ((c as u64) << (index * 8 + 8)); + } + + let len = s.len() as u8; + + (const_name, (value, len)) + }) + .unzip(); + let (inline_values, inline_lengths): (Vec<_>, Vec<_>) = + inline_values_and_lengths.into_iter().unzip(); + + quote! 
{ + #atom_doc + pub type #type_name = ::string_cache::Atom<#static_set_name>; + + #static_set_doc + #[derive(PartialEq, Eq, PartialOrd, Ord)] + pub struct #static_set_name; + + impl ::string_cache::StaticAtomSet for #static_set_name { + fn get() -> &'static ::string_cache::PhfStrSet { + static SET: ::string_cache::PhfStrSet = ::string_cache::PhfStrSet { + key: #key, + disps: &[#((#disps0, #disps1)),*], + atoms: &[#(#atoms),*], + hashes: &[#(#hashes),*] + }; + &SET + } + fn empty_string_index() -> u32 { + #empty_string_index + } + } + + #( + pub const #const_names: #type_name = #type_name::pack_static(#indices); + )* + #( + pub const #inline_const_names: #type_name = #type_name::pack_inline(#inline_values, #inline_lengths); + )* + + #macro_doc + #[macro_export] + macro_rules! #macro_name { + #( + (#atoms) => { #module::#const_names }; + )* + #( + (#inline_strs) => { #module::#inline_const_names }; + )* + #( + (#atom_idents) => { #module::#ident_const_names }; + )* + #( + (#istr_idents) => { #module::#ident_inline_const_names }; + )* + } + } + } + + /// Create a new file at `path` and write generated code there. 
+ /// + /// Typical usage: + /// `.write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("foo_atom.rs"))` + pub fn write_to_file(&mut self, path: &Path) -> io::Result<()> { + self.write_to(BufWriter::new(File::create(path)?)) + } +} + +#[test] +fn test_iteration_order() { + let x1 = crate::AtomType::new("foo::Atom", "foo_atom!") + .atoms(&["x", "xlink", "svg", "test"]) + .write_to_string(Vec::new()) + .expect("write to string cache x1"); + + let x2 = crate::AtomType::new("foo::Atom", "foo_atom!") + .atoms(&["x", "xlink", "svg", "test"]) + .write_to_string(Vec::new()) + .expect("write to string cache x2"); + + assert_eq!(x1, x2); +} diff --git a/tests/small-stack.rs b/tests/small-stack.rs new file mode 100644 index 0000000..bb607af --- /dev/null +++ b/tests/small-stack.rs @@ -0,0 +1,17 @@ +// Regression test for https://github.com/servo/html5ever/issues/393 +// +// Create a dynamic atom − causing initialization of the global hash map − +// in a thread that has a small stack. +// +// This is a separate test program rather than a `#[test] fn` among others +// to make sure that nothing else has already initialized the map in this process. +fn main() { + std::thread::Builder::new() + .stack_size(50_000) + .spawn(|| { + let _atom = string_cache::DefaultAtom::from("12345678"); + }) + .unwrap() + .join() + .unwrap() +}