diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..74ade77 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,65 @@ +name: CI +on: + push: + branches: ["main"] + pull_request: + merge_group: + types: [checks_requested] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +env: + RUST_BACKTRACE: 1 + SHELL: /bin/bash + +jobs: + ci: + name: Build and Test + runs-on: ubuntu-latest + + strategy: + matrix: + rust: [1.70.0, nightly, beta, stable] + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 2 + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + default: true + override: true + - name: Build + run: | + cargo build --no-default-features + cargo build + cargo build --features malloc_size_of + - uses: actions-rs/cargo@v1 + with: + command: test + args: --all + - name: Build codegen + run: | + cd string-cache-codegen && cargo build && cd .. + + if [ ${{ matrix.rust }} = nightly ]; then + cd integration-tests && cargo test --features unstable && cd ..; + fi + + + build_result: + name: Result + runs-on: ubuntu-latest + needs: + - "ci" + + steps: + - name: Mark the job as successful + run: exit 0 + if: success() + - name: Mark the job as unsuccessful + run: exit 1 + if: "!success()" diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 180d855..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -sudo: false -language: rust -rust: - - 1.36.0 - - nightly - - beta - - stable -os: - - linux -script: - - cargo build --no-default-features - - cargo build - - cargo test --all - - "cd string-cache-codegen && cargo build && cd .." - - "if [ $TRAVIS_RUST_VERSION = nightly ]; then cd integration-tests && cargo test --features unstable && cd ..; fi" diff --git a/Cargo.toml b/Cargo.toml index 01b9282..e73215e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,12 +1,13 @@ [package] name = "string_cache" -version = "0.8.1" # Also update README.md when making a semver-breaking change -authors = [ "The Servo Project Developers" ] +version = "0.9.0" # Also update README.md when making a semver-breaking change +authors = ["The Servo Project Developers"] description = "A string interning library for Rust, developed as part of the Servo project." -license = "MIT / Apache-2.0" +license = "MIT OR Apache-2.0" repository = "https://github.com/servo/string-cache" -documentation = "https://docs.rs/string_cache/" +documentation = "https://docs.rs/string_cache" edition = "2018" +rust-version = "1.70.0" # Do not `exclude` ./string-cache-codegen because we want to include # ./string-cache-codegen/shared.rs, and `include` is a pain to use @@ -23,10 +24,11 @@ default = ["serde_support"] [dependencies] precomputed-hash = "0.1" -lazy_static = "1" serde = { version = "1", optional = true } -phf_shared = "0.8" -new_debug_unreachable = "1.0" +malloc_size_of = { version = "0.1", default-features = false, optional = true } +phf_shared = "0.13" +new_debug_unreachable = "1.0.2" +parking_lot = "0.12" [[test]] name = "small-stack" diff --git a/README.md b/README.md index 9c9c8ac..429d1ec 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # string-cache -[![Build Status](https://travis-ci.com/servo/string-cache.svg?branch=master)](https://travis-ci.com/servo/string-cache) +[![Build Status](https://github.com/servo/string-cache/actions/workflows/ci.yml/badge.svg)](https://github.com/servo/string-cache/actions) [Documentation](https://docs.rs/string_cache/) @@ -12,7 +12,7 @@ In `Cargo.toml`: ```toml [dependencies] -string_cache = "0.8" +string_cache = "0.9" ``` In `lib.rs`: @@ -31,10 +31,10 @@ In `Cargo.toml`: build = "build.rs" [dependencies] -string_cache = "0.8" +string_cache = "0.9" [build-dependencies] -string_cache_codegen = "0.5" +string_cache_codegen = "0.6" ``` In `build.rs`: diff --git a/integration-tests/Cargo.toml b/integration-tests/Cargo.toml index 736e34a..4562747 100644 --- a/integration-tests/Cargo.toml +++ b/integration-tests/Cargo.toml @@ -16,11 +16,11 @@ test = true unstable = [] [dependencies] -string_cache = { version = "0.8", path = ".." } +string_cache = { version = "0.9", path = ".." } [dev-dependencies] -rand = "0.7" -string_cache_codegen = { version = "0.5", path = "../string-cache-codegen" } +rand = { version = "0.8", features = ["small_rng"] } +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } [build-dependencies] -string_cache_codegen = { version = "0.5", path = "../string-cache-codegen" } +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } diff --git a/integration-tests/build.rs b/integration-tests/build.rs index da40873..6293e4c 100644 --- a/integration-tests/build.rs +++ b/integration-tests/build.rs @@ -9,6 +9,7 @@ fn main() { "a", "b", "address", + "defaults", "area", "body", "font-weight", @@ -16,6 +17,9 @@ fn main() { "html", "head", "id", + "❤", + "❤💯", + "❤💯❤💯", ]) .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("test_atom.rs")) .unwrap() diff --git a/integration-tests/src/bench.rs b/integration-tests/src/bench.rs index 4d8f012..45e7199 100644 --- a/integration-tests/src/bench.rs +++ b/integration-tests/src/bench.rs @@ -153,7 +153,7 @@ bench_all!([eq ne lt clone_string] for longer_string = super::longer_dynamic_a, super::longer_dynamic_b); bench_all!([eq ne intern as_ref clone is_static lt] - for static_atom = test_atom!("a"), test_atom!("b")); + for static_atom = test_atom!("defaults"), test_atom!("font-weight")); bench_all!([intern as_ref clone is_inline] for short_inline_atom = mk("e"), mk("f")); @@ -168,13 +168,13 @@ bench_all!([eq ne intern as_ref clone is_dynamic lt] for longer_dynamic_atom = mk(super::longer_dynamic_a), mk(super::longer_dynamic_b)); bench_all!([intern as_ref clone is_static] - for static_at_runtime = mk("a"), mk("b")); + for static_at_runtime = mk("defaults"), mk("font-weight")); bench_all!([ne lt x_static y_inline] - for static_vs_inline = test_atom!("a"), mk("f")); + for static_vs_inline = test_atom!("defaults"), mk("f")); bench_all!([ne lt x_static y_dynamic] - for static_vs_dynamic = test_atom!("a"), mk(super::longer_dynamic_b)); + for static_vs_dynamic = test_atom!("defaults"), mk(super::longer_dynamic_b)); bench_all!([ne lt x_inline y_dynamic] for inline_vs_dynamic = mk("e"), mk(super::longer_dynamic_b)); diff --git a/integration-tests/src/common-usage.rs b/integration-tests/src/common-usage.rs new file mode 100644 index 0000000..7b7380a --- /dev/null +++ b/integration-tests/src/common-usage.rs @@ -0,0 +1,19 @@ +/// Test common usage by popular dependents (html5ever, lalrpop, browserlists-rs), to ensure no API-surface breaking changes +/// Created after https://github.com/servo/string-cache/issues/271 +use std::collections::HashMap; + +use crate::Atom; +use crate::TestAtom; + +#[test] +fn usage_with_hashmap() { + let mut map: HashMap = HashMap::new(); + + map.insert(test_atom!("area"), 1); + map.insert("str_into".into(), 2); + map.insert("atom_from".into(), 3); + + assert_eq!(map.get(&"area".into()).unwrap(), &1); + assert_eq!(map.get(&"str_into".into()).unwrap(), &2); + assert_eq!(map.get(&Atom::from("atom_from")).unwrap(), &3); +} diff --git a/integration-tests/src/lib.rs b/integration-tests/src/lib.rs index 1f2be87..a788d93 100644 --- a/integration-tests/src/lib.rs +++ b/integration-tests/src/lib.rs @@ -45,9 +45,12 @@ fn test_as_slice() { #[test] fn test_types() { assert!(Atom::from("").is_static()); - assert!(Atom::from("id").is_static()); - assert!(Atom::from("body").is_static()); - assert!(Atom::from("a").is_static()); + assert!(Atom::from("defaults").is_static()); + assert!(Atom::from("font-weight").is_static()); + assert!(Atom::from("id").is_inline()); + assert!(Atom::from("body").is_inline()); + assert!(Atom::from("a").is_inline()); + assert!(Atom::from("address").is_inline()); assert!(Atom::from("c").is_inline()); assert!(Atom::from("zz").is_inline()); assert!(Atom::from("zzz").is_inline()); @@ -168,11 +171,13 @@ fn repr() { // static atom table, the tag values, etc. // Static atoms - check_static("a", test_atom!("a")); - check_static("address", test_atom!("address")); - check_static("area", test_atom!("area")); + check_static("defaults", test_atom!("defaults")); + check_static("font-weight", test_atom!("font-weight")); // Inline atoms + check("a", 0x0000_0000_0000_6111); + check("address", 0x7373_6572_6464_6171); + check("area", 0x0000_0061_6572_6141); check("e", 0x0000_0000_0000_6511); check("xyzzy", 0x0000_797A_7A79_7851); check("xyzzy01", 0x3130_797A_7A79_7871); @@ -193,8 +198,13 @@ fn test_threads() { #[test] fn atom_macro() { + assert_eq!(test_atom!("a"), Atom::from("a")); assert_eq!(test_atom!("body"), Atom::from("body")); + assert_eq!(test_atom!("address"), Atom::from("address")); + assert_eq!(test_atom!("❤"), Atom::from("❤")); + assert_eq!(test_atom!("❤💯"), Atom::from("❤💯")); assert_eq!(test_atom!("font-weight"), Atom::from("font-weight")); + assert_eq!(test_atom!("❤💯❤💯"), Atom::from("❤💯❤💯")); } #[test] @@ -292,10 +302,15 @@ fn test_from_string() { #[test] fn test_try_static() { - assert!(Atom::try_static("head").is_some()); + assert!(Atom::try_static("defaults").is_some()); + assert!(Atom::try_static("head").is_none()); assert!(Atom::try_static("not in the static table").is_none()); } +#[cfg(test)] +#[path = "common-usage.rs"] +mod common_usage; + #[cfg(all(test, feature = "unstable"))] #[path = "bench.rs"] mod bench; diff --git a/src/atom.rs b/src/atom.rs index 6da0044..5a8aa7f 100644 --- a/src/atom.rs +++ b/src/atom.rs @@ -7,10 +7,10 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use crate::dynamic_set::{Entry, DYNAMIC_SET}; +use crate::dynamic_set::{dynamic_set, Entry}; use crate::static_sets::StaticAtomSet; use debug_unreachable::debug_unreachable; -use phf_shared; + use std::borrow::Cow; use std::cmp::Ordering::{self, Equal}; use std::fmt; @@ -82,6 +82,15 @@ pub struct Atom { phantom: PhantomData, } +// This isn't really correct as the Atoms can technically take up space. But I guess it's ok +// as it is possible to measure the size of the atom set separately/ +#[cfg(feature = "malloc_size_of")] +impl malloc_size_of::MallocSizeOf for Atom { + fn size_of(&self, _ops: &mut malloc_size_of::MallocSizeOfOps) -> usize { + 0 + } +} + // FIXME: bound removed from the struct definition before of this error for pack_static: // "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable" // https://github.com/rust-lang/rust/issues/57563 @@ -99,13 +108,32 @@ impl Atom { } } + /// For the atom!() macros + #[inline(always)] + #[doc(hidden)] + pub const fn pack_inline(mut n: u64, len: u8) -> Self { + if cfg!(target_endian = "big") { + // Reverse order of top 7 bytes. + // Bottom 8 bits of `n` are zero, and we need that to remain so. + // String data is stored in top 7 bytes, tag and length in bottom byte. + n = n.to_le() << 8; + } + + let data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET) | n; + Self { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } + fn tag(&self) -> u8 { (self.unsafe_data.get() & TAG_MASK) as u8 } } impl Atom { - /// Return the internal repersentation. For testing. + /// Return the internal representation. For testing. #[doc(hidden)] pub fn unsafe_data(&self) -> u64 { self.unsafe_data.get() @@ -186,22 +214,23 @@ impl Hash for Atom { impl<'a, Static: StaticAtomSet> From> for Atom { fn from(string_to_add: Cow<'a, str>) -> Self { - Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { - let len = string_to_add.len(); - if len <= MAX_INLINE_LEN { - let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); - { - let dest = inline_atom_slice_mut(&mut data); - dest[..len].copy_from_slice(string_to_add.as_bytes()) - } - Atom { - // INLINE_TAG ensures this is never zero - unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, - phantom: PhantomData, - } - } else { - let ptr: std::ptr::NonNull = - DYNAMIC_SET.lock().unwrap().insert(string_to_add, hash.g); + let len = string_to_add.len(); + if len == 0 { + Self::pack_static(Static::empty_string_index()) + } else if len <= MAX_INLINE_LEN { + let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); + { + let dest = inline_atom_slice_mut(&mut data); + dest[..len].copy_from_slice(string_to_add.as_bytes()); + } + Atom { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } else { + Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { + let ptr: std::ptr::NonNull = dynamic_set().insert(string_to_add, hash.g); let data = ptr.as_ptr() as u64; debug_assert!(0 == data & TAG_MASK); Atom { @@ -209,8 +238,8 @@ impl<'a, Static: StaticAtomSet> From> for Atom { unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, phantom: PhantomData, } - } - }) + }) + } } } @@ -237,10 +266,7 @@ impl Drop for Atom { // Out of line to guide inlining. fn drop_slow(this: &mut Atom) { - DYNAMIC_SET - .lock() - .unwrap() - .remove(this.unsafe_data.get() as *mut Entry); + dynamic_set().remove(this.unsafe_data.get() as *mut Entry); } } } @@ -258,8 +284,9 @@ impl ops::Deref for Atom { } INLINE_TAG => { let len = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET; + debug_assert!(len as usize <= MAX_INLINE_LEN); let src = inline_atom_slice(&self.unsafe_data); - str::from_utf8_unchecked(&src[..(len as usize)]) + str::from_utf8_unchecked(src.get_unchecked(..(len as usize))) } STATIC_TAG => Static::get().atoms[self.static_index() as usize], _ => debug_unreachable!(), @@ -365,28 +392,24 @@ impl Atom { #[inline(always)] fn inline_atom_slice(x: &NonZeroU64) -> &[u8] { - unsafe { let x: *const NonZeroU64 = x; let mut data = x as *const u8; // All except the lowest byte, which is first in little-endian, last in big-endian. if cfg!(target_endian = "little") { - data = data.offset(1); + data = unsafe { data.offset(1) }; } let len = 7; - slice::from_raw_parts(data, len) - } + unsafe { slice::from_raw_parts(data, len) } } #[inline(always)] -fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] { - unsafe { +fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] { let x: *mut u64 = x; let mut data = x as *mut u8; // All except the lowest byte, which is first in little-endian, last in big-endian. if cfg!(target_endian = "little") { - data = data.offset(1); + data = unsafe { data.offset(1) }; } let len = 7; - slice::from_raw_parts_mut(data, len) - } + unsafe { slice::from_raw_parts_mut(data, len) } } diff --git a/src/dynamic_set.rs b/src/dynamic_set.rs index 08c9dcd..4442b4d 100644 --- a/src/dynamic_set.rs +++ b/src/dynamic_set.rs @@ -7,19 +7,19 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use lazy_static::lazy_static; +use parking_lot::Mutex; use std::borrow::Cow; use std::mem; use std::ptr::NonNull; use std::sync::atomic::AtomicIsize; use std::sync::atomic::Ordering::SeqCst; -use std::sync::Mutex; +use std::sync::OnceLock; const NB_BUCKETS: usize = 1 << 12; // 4096 const BUCKET_MASK: u32 = (1 << 12) - 1; pub(crate) struct Set { - buckets: Box<[Option>; NB_BUCKETS]>, + buckets: Box<[Mutex>>]>, } pub(crate) struct Entry { @@ -38,25 +38,31 @@ fn entry_alignment_is_sufficient() { assert!(mem::align_of::() >= ENTRY_ALIGNMENT); } -lazy_static! { - pub(crate) static ref DYNAMIC_SET: Mutex = Mutex::new({ - type T = Option>; - let _static_assert_size_eq = std::mem::transmute::; - let vec = std::mem::ManuallyDrop::new(vec![0_usize; NB_BUCKETS]); - Set { - buckets: unsafe { Box::from_raw(vec.as_ptr() as *mut [T; NB_BUCKETS]) }, - } - }); +pub(crate) fn dynamic_set() -> &'static Set { + // NOTE: Using const initialization for buckets breaks the small-stack test. + // ``` + // // buckets: [Mutex>>; NB_BUCKETS], + // const MUTEX: Mutex>> = Mutex::new(None); + // let buckets = Box::new([MUTEX; NB_BUCKETS]); + // ``` + static DYNAMIC_SET: OnceLock = OnceLock::new(); + + DYNAMIC_SET.get_or_init(|| { + let buckets = (0..NB_BUCKETS).map(|_| Mutex::new(None)).collect(); + Set { buckets } + }) } impl Set { - pub(crate) fn insert(&mut self, string: Cow, hash: u32) -> NonNull { + pub(crate) fn insert(&self, string: Cow, hash: u32) -> NonNull { let bucket_index = (hash & BUCKET_MASK) as usize; + let mut linked_list = self.buckets[bucket_index].lock(); + { - let mut ptr: Option<&mut Box> = self.buckets[bucket_index].as_mut(); + let mut ptr: Option<&mut Box> = linked_list.as_mut(); while let Some(entry) = ptr.take() { - if entry.hash == hash && &*entry.string == &*string { + if entry.hash == hash && *entry.string == *string { if entry.ref_count.fetch_add(1, SeqCst) > 0 { return NonNull::from(&mut **entry); } @@ -74,31 +80,26 @@ impl Set { debug_assert!(mem::align_of::() >= ENTRY_ALIGNMENT); let string = string.into_owned(); let mut entry = Box::new(Entry { - next_in_bucket: self.buckets[bucket_index].take(), + next_in_bucket: linked_list.take(), hash, ref_count: AtomicIsize::new(1), string: string.into_boxed_str(), }); let ptr = NonNull::from(&mut *entry); - self.buckets[bucket_index] = Some(entry); - + *linked_list = Some(entry); ptr } - pub(crate) fn remove(&mut self, ptr: *mut Entry) { - let bucket_index = { - let value: &Entry = unsafe { &*ptr }; - debug_assert!(value.ref_count.load(SeqCst) == 0); - (value.hash & BUCKET_MASK) as usize - }; + pub(crate) fn remove(&self, ptr: *mut Entry) { + let value: &Entry = unsafe { &*ptr }; + let bucket_index = (value.hash & BUCKET_MASK) as usize; - let mut current: &mut Option> = &mut self.buckets[bucket_index]; + let mut linked_list = self.buckets[bucket_index].lock(); + debug_assert!(value.ref_count.load(SeqCst) == 0); + let mut current: &mut Option> = &mut linked_list; - loop { - let entry_ptr: *mut Entry = match current.as_mut() { - Some(entry) => &mut **entry, - None => break, - }; + while let Some(entry_ptr) = current.as_mut() { + let entry_ptr: *mut Entry = &mut **entry_ptr; if entry_ptr == ptr { mem::drop(mem::replace(current, unsafe { (*entry_ptr).next_in_bucket.take() diff --git a/src/lib.rs b/src/lib.rs index b4a8fd5..3cc29b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,10 +25,10 @@ //! In `Cargo.toml`: //! ```toml //! [dependencies] -//! string_cache = "0.8" +//! string_cache = "0.9" //! //! [dev-dependencies] -//! string_cache_codegen = "0.5" +//! string_cache_codegen = "0.6" //! ``` //! //! In `build.rs`: @@ -103,6 +103,19 @@ #![cfg_attr(test, deny(warnings))] +// Types, such as Atom, that impl Hash must follow the hash invariant: if two objects match +// with PartialEq, they must also have the same Hash. Clippy warns on types that derive one while +// manually impl-ing the other, because it seems easy for the two to drift apart, causing the +// invariant to be violated. +// +// But Atom is a newtype over NonZeroU64, and probably always will be, since cheap comparisons and +// copying are this library's purpose. So we know what the PartialEq comparison is going to do. +// +// The `get_hash` function, seen in `atom.rs`, consults that number, plus the global string interner +// tables. The only way for the resulting hash for two Atoms with the same inner 64-bit number to +// differ would be if the table entry changed between invocations, and that would be really bad. +#![allow(clippy::derive_hash_xor_eq)] + mod atom; mod dynamic_set; mod static_sets; diff --git a/src/trivial_impls.rs b/src/trivial_impls.rs index 4c055fd..960dde0 100644 --- a/src/trivial_impls.rs +++ b/src/trivial_impls.rs @@ -39,7 +39,7 @@ impl PartialEq> for str { impl PartialEq for Atom { fn eq(&self, other: &String) -> bool { - &self[..] == &other[..] + self[..] == other[..] } } @@ -66,7 +66,7 @@ impl fmt::Display for Atom { impl AsRef for Atom { fn as_ref(&self) -> &str { - &self + self } } @@ -87,7 +87,33 @@ impl<'a, Static: StaticAtomSet> Deserialize<'a> for Atom { where D: Deserializer<'a>, { - let string: String = Deserialize::deserialize(deserializer)?; - Ok(Atom::from(string)) + use serde::de; + use std::marker::PhantomData; + + struct AtomVisitor(PhantomData); + + impl<'de, Static: StaticAtomSet> de::Visitor<'de> for AtomVisitor { + type Value = Atom; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "an Atom") + } + + fn visit_str(self, v: &str) -> Result + where + E: de::Error, + { + Ok(Atom::from(v)) + } + + fn visit_string(self, v: String) -> Result + where + E: de::Error, + { + Ok(Atom::from(v)) + } + } + + deserializer.deserialize_str(AtomVisitor(PhantomData)) } } diff --git a/string-cache-codegen/Cargo.toml b/string-cache-codegen/Cargo.toml index d212d7e..20eced9 100644 --- a/string-cache-codegen/Cargo.toml +++ b/string-cache-codegen/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "string_cache_codegen" -version = "0.5.1" # Also update ../README.md when making a semver-breaking change +version = "0.6.1" # Also update ../README.md when making a semver-breaking change authors = [ "The Servo Project Developers" ] description = "A codegen library for string-cache, developed as part of the Servo project." -license = "MIT / Apache-2.0" +license = "MIT OR Apache-2.0" repository = "https://github.com/servo/string-cache" documentation = "https://docs.rs/string_cache_codegen/" edition = "2018" @@ -13,7 +13,7 @@ name = "string_cache_codegen" path = "lib.rs" [dependencies] -phf_generator = "0.8" -phf_shared = "0.8" +phf_generator = "0.13" +phf_shared = "0.13" proc-macro2 = "1" quote = "1" diff --git a/string-cache-codegen/lib.rs b/string-cache-codegen/lib.rs index 0fe4819..525ef3a 100644 --- a/string-cache-codegen/lib.rs +++ b/string-cache-codegen/lib.rs @@ -19,10 +19,10 @@ //! build = "build.rs" //! //! [dependencies] -//! string_cache = "0.8" +//! string_cache = "0.9" //! //! [build-dependencies] -//! string_cache_codegen = "0.5" +//! string_cache_codegen = "0.6" //! ``` //! //! In `build.rs`: @@ -68,8 +68,9 @@ #![recursion_limit = "128"] +use proc_macro2::Ident; use quote::quote; -use std::collections::HashSet; +use std::collections::BTreeSet; use std::fs::File; use std::io::{self, BufWriter, Write}; use std::path::Path; @@ -81,7 +82,7 @@ pub struct AtomType { static_set_doc: Option, macro_name: String, macro_doc: Option, - atoms: HashSet, + atoms: BTreeSet, } impl AtomType { @@ -114,7 +115,7 @@ impl AtomType { atom_doc: None, static_set_doc: None, macro_doc: None, - atoms: HashSet::new(), + atoms: BTreeSet::new(), } } @@ -181,20 +182,71 @@ impl AtomType { ) } + #[cfg(test)] + /// Write generated code to destination [`Vec`] and return it as [`String`] + /// + /// Used mostly for testing or displaying a value. + pub fn write_to_string(&mut self, mut destination: Vec) -> io::Result { + destination.write_all( + self.to_tokens() + .to_string() + // Insert some newlines to make the generated code slightly easier to read. + .replace(" [ \"", "[\n\"") + .replace("\" , ", "\",\n") + .replace(" ( \"", "\n( \"") + .replace("; ", ";\n") + .as_bytes(), + )?; + let str = String::from_utf8(destination).unwrap(); + Ok(str) + } + fn to_tokens(&mut self) -> proc_macro2::TokenStream { // `impl Default for Atom` requires the empty string to be in the static set. // This also makes sure the set in non-empty, // which would cause divisions by zero in rust-phf. self.atoms.insert(String::new()); - let atoms: Vec<&str> = self.atoms.iter().map(|s| &**s).collect(); - let hash_state = phf_generator::generate_hash(&atoms); + // Strings over 7 bytes + empty string added to static set. + // Otherwise stored inline. + let (static_strs, inline_strs): (Vec<_>, Vec<_>) = self + .atoms + .iter() + .map(String::as_str) + .partition(|s| s.len() > 7 || s.is_empty()); + + // Static strings + let hash_state = phf_generator::generate_hash(&static_strs); let phf_generator::HashState { key, disps, map } = hash_state; let (disps0, disps1): (Vec<_>, Vec<_>) = disps.into_iter().unzip(); - let atoms: Vec<&str> = map.iter().map(|&idx| atoms[idx]).collect(); + let atoms: Vec<&str> = map.iter().map(|&idx| static_strs[idx]).collect(); let empty_string_index = atoms.iter().position(|s| s.is_empty()).unwrap() as u32; let indices = 0..atoms.len() as u32; + fn is_valid_ident(name: &str) -> bool { + let begins_with_letter_or_underscore = name + .chars() + .next() + .is_some_and(|c| c.is_alphabetic() || c == '_'); + let is_alphanumeric = name.chars().all(|c| c.is_alphanumeric() || c == '_'); + + begins_with_letter_or_underscore && is_alphanumeric + } + + let atoms_for_idents: Vec<&str> = atoms + .iter() + .copied() + .filter(|x| is_valid_ident(x)) + .collect(); + let atom_idents: Vec = atoms_for_idents.iter().map(|atom| new_term(atom)).collect(); + + let istrs_for_idents: Vec<&str> = inline_strs + .iter() + .copied() + .filter(|x| is_valid_ident(x)) + .collect(); + let istr_idents: Vec = istrs_for_idents.iter().map(|atom| new_term(atom)).collect(); + let hashes: Vec = atoms .iter() .map(|string| { @@ -221,23 +273,51 @@ impl AtomType { Some(ref doc) => quote!(#[doc = #doc]), None => quote!(), }; - let new_term = - |string: &str| proc_macro2::Ident::new(string, proc_macro2::Span::call_site()); + fn new_term(string: &str) -> Ident { + Ident::new(string, proc_macro2::Span::call_site()) + } let static_set_name = new_term(&format!("{}StaticSet", type_name)); let type_name = new_term(type_name); let macro_name = new_term(&*self.macro_name); let module = module.parse::().unwrap(); let atom_prefix = format!("ATOM_{}_", type_name.to_string().to_uppercase()); - let const_names: Vec<_> = atoms + let new_const_name = |atom: &str| { + let mut name = atom_prefix.clone(); + for c in atom.chars() { + name.push_str(&format!("_{:02X}", c as u32)) + } + new_term(&name) + }; + let const_names: Vec<_> = atoms.iter().copied().map(new_const_name).collect(); + let ident_const_names: Vec<_> = atoms_for_idents + .iter() + .copied() + .map(new_const_name) + .collect(); + let ident_inline_const_names: Vec<_> = istrs_for_idents + .iter() + .copied() + .map(new_const_name) + .collect(); + + // Inline strings + let (inline_const_names, inline_values_and_lengths): (Vec<_>, Vec<_>) = inline_strs .iter() - .map(|atom| { - let mut name = atom_prefix.clone(); - for c in atom.chars() { - name.push_str(&format!("_{:02X}", c as u32)) + .map(|s| { + let const_name = new_const_name(s); + + let mut value = 0u64; + for (index, c) in s.bytes().enumerate() { + value = value | ((c as u64) << (index * 8 + 8)); } - new_term(&name) + + let len = s.len() as u8; + + (const_name, (value, len)) }) - .collect(); + .unzip(); + let (inline_values, inline_lengths): (Vec<_>, Vec<_>) = + inline_values_and_lengths.into_iter().unzip(); quote! { #atom_doc @@ -265,6 +345,9 @@ impl AtomType { #( pub const #const_names: #type_name = #type_name::pack_static(#indices); )* + #( + pub const #inline_const_names: #type_name = #type_name::pack_inline(#inline_values, #inline_lengths); + )* #macro_doc #[macro_export] @@ -272,6 +355,15 @@ impl AtomType { #( (#atoms) => { #module::#const_names }; )* + #( + (#inline_strs) => { #module::#inline_const_names }; + )* + #( + (#atom_idents) => { #module::#ident_const_names }; + )* + #( + (#istr_idents) => { #module::#ident_inline_const_names }; + )* } } } @@ -284,3 +376,18 @@ impl AtomType { self.write_to(BufWriter::new(File::create(path)?)) } } + +#[test] +fn test_iteration_order() { + let x1 = crate::AtomType::new("foo::Atom", "foo_atom!") + .atoms(&["x", "xlink", "svg", "test"]) + .write_to_string(Vec::new()) + .expect("write to string cache x1"); + + let x2 = crate::AtomType::new("foo::Atom", "foo_atom!") + .atoms(&["x", "xlink", "svg", "test"]) + .write_to_string(Vec::new()) + .expect("write to string cache x2"); + + assert_eq!(x1, x2); +} diff --git a/tests/small-stack.rs b/tests/small-stack.rs index 269cad7..bb607af 100644 --- a/tests/small-stack.rs +++ b/tests/small-stack.rs @@ -1,6 +1,6 @@ // Regression test for https://github.com/servo/html5ever/issues/393 // -// Create a dynamic atom − causing initialization of the golbal hash map − +// Create a dynamic atom − causing initialization of the global hash map − // in a thread that has a small stack. // // This is a separate test program rather than a `#[test] fn` among others @@ -9,7 +9,7 @@ fn main() { std::thread::Builder::new() .stack_size(50_000) .spawn(|| { - string_cache::DefaultAtom::from("12345678"); + let _atom = string_cache::DefaultAtom::from("12345678"); }) .unwrap() .join()