diff --git a/Cargo.lock b/Cargo.lock index 179949b262e..ba3a3204287 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3822,6 +3822,7 @@ dependencies = [ "reqwest 0.12.24", "rkyv", "rmp-serde", + "roaring", "seq-macro", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 10d170e2164..96f6919ca05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -217,6 +217,7 @@ reqwest-websocket = "0.5.0" rkyv = { version = "0.7.45", default-features = false } rmp-serde = "1.3.0" rmpv = "1.3.0" +roaring = "0.11.3" rstest = "0.15" # Make sure this is the same rustls version used by the `tonic` crate. # See the `ensure_default_crypto_provider` function. diff --git a/crates/dbsp/Cargo.toml b/crates/dbsp/Cargo.toml index c6da2493f09..636e755efb0 100644 --- a/crates/dbsp/Cargo.toml +++ b/crates/dbsp/Cargo.toml @@ -83,6 +83,7 @@ tracing = { workspace = true } snap = { workspace = true } enum-map = { workspace = true } fastbloom = { workspace = true } +roaring = { workspace = true } core_affinity = { workspace = true } indexmap = { workspace = true } feldera-storage = { workspace = true } @@ -165,6 +166,14 @@ harness = false name = "window_min" harness = false +[[bench]] +name = "filter_bitmap" +harness = false + +[[bench]] +name = "filter_predictor" +harness = false + [[example]] name = "orgchart" diff --git a/crates/dbsp/benches/filter_bitmap.rs b/crates/dbsp/benches/filter_bitmap.rs new file mode 100644 index 00000000000..3c6d6762c2f --- /dev/null +++ b/crates/dbsp/benches/filter_bitmap.rs @@ -0,0 +1,1431 @@ +//! Membership benchmark for `fastbloom` vs `roaring`. +//! +//! Examples: +//! `cargo bench -p dbsp --bench filter_bitmap -- --csv-output filter_bitmap.csv` +//! 
`cargo bench -p dbsp --bench filter_bitmap -- --key-types u32,u64 --key-spaces consecutive,full_range` + +use clap::{Parser, ValueEnum}; +use csv::Writer; +use dbsp::storage::file::BLOOM_FILTER_FALSE_POSITIVE_RATE; +use fastbloom::BloomFilter; +use rand::{RngCore, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use rand_distr::{Distribution, Normal}; +use roaring::{RoaringBitmap, RoaringTreemap}; +use serde::Serialize; +use std::{ + fmt::{Display, Formatter}, + fs::File, + mem::size_of_val, + path::PathBuf, + time::Instant, +}; + +const DEFAULT_BLOOM_SEED: u128 = 42; +const MIN_BLOOM_EXPECTED_ITEMS: u64 = 64; +const U32_KEY_SPACE_SIZE: u64 = u32::MAX as u64 + 1; +const DEFAULT_LOOKUP_LIMIT: u64 = 50_000_000; +const DEFAULT_KEY_EPS_VALUES: [f64; 6] = [1e-6, 1e-4, 1e-3, 1e-2, 1e-1, 5e-1]; + +// Mirror the spine_async size bands and include the near-full u32 domain case. +const DEFAULT_SPINE_LEVEL_SIZES: [u64; 6] = + [14_999, 99_999, 999_999, 9_999_999, 99_999_999, 999_999_999]; + +fn main() { + let args = Args::parse(); + let key_types = args.key_types(); + let key_spaces = args.key_spaces(); + let num_elements_list = args.num_elements(); + args.validate(&key_types, &key_spaces, &num_elements_list); + + let csv_file = File::create(&args.csv_output) + .unwrap_or_else(|error| panic!("failed to create {}: {error}", args.csv_output.display())); + let mut csv_writer = Writer::from_writer(csv_file); + + println!("benchmark=filter_bitmap"); + println!( + "num_elements={}", + num_elements_list + .iter() + .map(u64::to_string) + .collect::>() + .join(",") + ); + println!("repetitions={}", args.repetitions); + println!("insert_order={}", args.insert_order); + println!("lookup_order={}", args.lookup_order); + println!("insert_seed={}", args.insert_seed); + println!("lookup_seed={}", args.lookup_seed); + println!("key_space_seed={}", args.key_space_seed); + println!( + "key_types={}", + key_types + .iter() + .map(ToString::to_string) + .collect::>() + .join(",") + ); + println!( + 
"key_spaces={}", + key_spaces + .iter() + .map(ToString::to_string) + .collect::>() + .join(",") + ); + println!( + "key_eps={}", + args.key_eps() + .iter() + .map(ToString::to_string) + .collect::>() + .join(",") + ); + println!( + "structures={}", + args.structures + .iter() + .map(ToString::to_string) + .collect::>() + .join(",") + ); + println!( + "bloom_false_positive_rate={}", + args.bloom_false_positive_rate + ); + println!("bloom_seed={}", args.bloom_seed); + println!("csv_output={}", args.csv_output.display()); + println!(); + + for &key_type in &key_types { + for &key_space in &key_spaces { + for key_eps in args.key_eps_for(key_space) { + let config = BenchmarkConfig { + key_type, + key_space, + key_eps, + }; + + for &num_elements in &num_elements_list { + let lookup_count = args.lookup_count_for(num_elements); + let false_positive_lookup_count = + args.false_positive_lookup_count_for(config, num_elements, lookup_count); + let bloom_expected_items = args + .bloom_expected_items + .unwrap_or(num_elements) + .max(MIN_BLOOM_EXPECTED_ITEMS); + + for structure in &args.structures { + let result = match structure { + Structure::Bloom => benchmark_bloom( + &args, + config, + num_elements, + lookup_count, + false_positive_lookup_count, + bloom_expected_items, + ), + Structure::Roaring => { + benchmark_roaring(&args, config, num_elements, lookup_count) + } + }; + + print_report( + *structure, + config, + &result, + num_elements, + lookup_count, + false_positive_lookup_count, + ); + + csv_writer + .serialize(CsvRow::from_result( + CsvRowContext { + structure: *structure, + config, + args: &args, + num_elements, + lookup_count, + false_positive_lookup_count, + bloom_expected_items, + }, + &result, + )) + .expect("failed to write CSV row"); + csv_writer.flush().expect("failed to flush CSV writer"); + } + } + } + } + } +} + +#[derive(Parser, Debug, Clone)] +#[command(name = "filter_bitmap")] +#[command(about = "Benchmark fastbloom against roaring bitmap or treemap 
membership queries")] +struct Args { + /// Comma-separated input sizes. Underscores and `u32::MAX` are accepted. + #[arg(long, value_name = "CSV")] + num_elements: Option, + + /// Number of successful lookups to benchmark for each input size. + /// Defaults to min(num_elements, 50_000_000). + #[arg(long)] + lookup_count: Option, + + /// Number of negative lookups used to measure bloom false positives for each input size. + #[arg(long)] + false_positive_lookup_count: Option, + + /// Number of repeated benchmark runs used to compute min/avg/max/std. + #[arg(long, default_value_t = 3)] + repetitions: usize, + + /// Structures to benchmark. + #[arg(long, value_delimiter = ',', default_value = "bloom,roaring")] + structures: Vec, + + /// Key types to benchmark. + #[arg(long, value_delimiter = ',', default_value = "u32")] + key_types: Vec, + + /// Key-space models to benchmark. + /// + /// `consecutive` inserts keys from `0..n`. + /// `full_range` samples `n` distinct keys from the full type domain. + /// `half_normal` spreads `n` unique keys across `0..u32::MAX` with + /// a half-normal offset distribution controlled by `--key-eps`. + #[arg(long, value_delimiter = ',', default_value = "consecutive")] + key_spaces: Vec, + + /// Seed used by the full-range sampler and half-normal quantile phase. + #[arg(long, default_value_t = 2)] + key_space_seed: u64, + + /// Comma-separated epsilon values used by `--key-spaces half-normal`. + #[arg(long, value_name = "CSV")] + key_eps: Option, + + /// Insert order over the chosen keyset. + #[arg(long, default_value_t = Order::Sequential)] + insert_order: Order, + + /// Lookup order over the chosen keyset or sampled subset. + #[arg(long, default_value_t = Order::Random)] + lookup_order: Order, + + /// Seed used when `insert-order=random`. + #[arg(long, default_value_t = 0)] + insert_seed: u64, + + /// Seed used when `lookup-order=random`. + #[arg(long, default_value_t = 1)] + lookup_seed: u64, + + /// Bloom filter false-positive rate. 
Defaults to DBSP storage default.
+    #[arg(long, default_value_t = BLOOM_FILTER_FALSE_POSITIVE_RATE)]
+    bloom_false_positive_rate: f64,
+
+    /// Bloom filter seed. Defaults to DBSP storage seed.
+    #[arg(long, default_value_t = DEFAULT_BLOOM_SEED)]
+    bloom_seed: u128,
+
+    /// Backward-compatible alias for `--key-types u64`.
+    #[doc(hidden)]
+    #[arg(long, hide = true, default_value_t = false)]
+    u64_keys: bool,
+
+    /// Expected number of items passed to the bloom filter builder for each input size.
+    #[arg(long)]
+    bloom_expected_items: Option<u64>,
+
+    /// Output CSV path.
+    #[arg(long, default_value = "filter_bitmap.csv")]
+    csv_output: PathBuf,
+
+    // When running with `cargo bench` the binary gets the `--bench` flag, so we
+    // have to parse and ignore it so clap doesn't reject it.
+    #[doc(hidden)]
+    #[arg(long = "bench", hide = true)]
+    __bench: bool,
+}
+
+impl Args {
+    fn key_types(&self) -> Vec<KeyType> {
+        let raw = if self.u64_keys {
+            vec![KeyType::U64]
+        } else {
+            self.key_types.clone()
+        };
+        dedup(raw)
+    }
+
+    fn key_spaces(&self) -> Vec<KeySpace> {
+        dedup(self.key_spaces.clone())
+    }
+
+    fn key_eps(&self) -> Vec<f64> {
+        match &self.key_eps {
+            Some(csv) => parse_f64_csv(csv, "--key-eps"),
+            None => DEFAULT_KEY_EPS_VALUES.to_vec(),
+        }
+    }
+
+    fn key_eps_for(&self, key_space: KeySpace) -> Vec<Option<f64>> {
+        match key_space {
+            KeySpace::HalfNormal => self.key_eps().into_iter().map(Some).collect(),
+            _ => vec![None],
+        }
+    }
+
+    fn num_elements(&self) -> Vec<u64> {
+        match &self.num_elements {
+            Some(csv) => parse_u64_csv(csv),
+            None => DEFAULT_SPINE_LEVEL_SIZES.to_vec(),
+        }
+    }
+
+    fn lookup_count_for(&self, num_elements: u64) -> u64 {
+        self.lookup_count
+            .map(|lookup_count| lookup_count.min(num_elements))
+            .unwrap_or(num_elements.min(DEFAULT_LOOKUP_LIMIT))
+    }
+
+    fn false_positive_lookup_count_for(
+        &self,
+        config: BenchmarkConfig,
+        num_elements: u64,
+        _lookup_count: u64,
+    ) -> u64 {
+        self.false_positive_lookup_count
+            .map(|count| {
+                let max_false_positive_lookup_count =
config.max_false_positive_lookup_count(num_elements);
+                count.min(max_false_positive_lookup_count)
+            })
+            .unwrap_or(0)
+    }
+
+    fn validate(&self, key_types: &[KeyType], key_spaces: &[KeySpace], num_elements_list: &[u64]) {
+        let key_eps = self.key_eps();
+
+        assert!(
+            !num_elements_list.is_empty(),
+            "--num-elements must select at least one size"
+        );
+        assert!(
+            self.repetitions > 0,
+            "--repetitions must be greater than zero"
+        );
+        assert!(
+            !self.structures.is_empty(),
+            "--structures must select at least one structure"
+        );
+        assert!(
+            !key_types.is_empty(),
+            "--key-types must select at least one key type"
+        );
+        assert!(
+            !key_spaces.is_empty(),
+            "--key-spaces must select at least one key-space mode"
+        );
+        if key_spaces.contains(&KeySpace::HalfNormal) {
+            assert!(
+                !key_eps.is_empty(),
+                "--key-eps must select at least one epsilon for key-space half_normal"
+            );
+            for eps in &key_eps {
+                assert!(
+                    eps.is_finite() && *eps > 0.0,
+                    "--key-eps values must be finite and greater than zero"
+                );
+            }
+        }
+        assert!(
+            self.bloom_false_positive_rate > 0.0 && self.bloom_false_positive_rate < 1.0,
+            "--bloom-false-positive-rate must be between 0 and 1"
+        );
+
+        for &num_elements in num_elements_list {
+            assert!(
+                num_elements > 0,
+                "--num-elements values must be greater than zero"
+            );
+
+            for &key_type in key_types {
+                for &key_space in key_spaces {
+                    let config = BenchmarkConfig {
+                        key_type,
+                        key_space,
+                        key_eps: None,
+                    };
+                    config.validate_num_elements(num_elements);
+                }
+            }
+        }
+    }
+}
+
+fn dedup<T>(values: Vec<T>) -> Vec<T>
+where
+    T: PartialEq,
+{
+    let mut out = Vec::with_capacity(values.len());
+    for value in values {
+        if !out.contains(&value) {
+            out.push(value);
+        }
+    }
+    out
+}
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)]
+enum Structure {
+    #[value(name = "bloom")]
+    Bloom,
+    #[value(name = "roaring")]
+    Roaring,
+}
+
+impl Display for Structure {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
Self::Bloom => f.write_str("bloom"), + Self::Roaring => f.write_str("roaring"), + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)] +enum KeyType { + #[value(name = "u32")] + U32, + #[value(name = "u64")] + U64, +} + +impl Display for KeyType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::U32 => f.write_str("u32"), + Self::U64 => f.write_str("u64"), + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)] +enum KeySpace { + #[value(name = "consecutive")] + Consecutive, + #[value(name = "full_range", alias = "full-range")] + FullRange, + #[value(name = "half_normal", alias = "half-normal")] + HalfNormal, +} + +impl Display for KeySpace { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Consecutive => f.write_str("consecutive"), + Self::FullRange => f.write_str("full_range"), + Self::HalfNormal => f.write_str("half_normal"), + } + } +} + +#[derive(Clone, Copy, Debug)] +struct BenchmarkConfig { + key_type: KeyType, + key_space: KeySpace, + key_eps: Option, +} + +impl BenchmarkConfig { + fn validate_num_elements(self, num_elements: u64) { + match (self.key_type, self.key_space) { + (KeyType::U32, _) | (_, KeySpace::HalfNormal) => assert!( + num_elements <= U32_KEY_SPACE_SIZE, + "--num-elements values must be <= {} for this key type/key space", + U32_KEY_SPACE_SIZE + ), + (KeyType::U64, _) => {} + } + } + + fn max_false_positive_lookup_count(self, num_elements: u64) -> u64 { + match (self.key_type, self.key_space) { + (_, KeySpace::HalfNormal) => U32_KEY_SPACE_SIZE - num_elements, + (KeyType::U32, _) => U32_KEY_SPACE_SIZE - num_elements, + (KeyType::U64, _) => u64::MAX - num_elements + 1, + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)] +enum Order { + #[value(name = "sequential")] + Sequential, + #[value(name = "random")] + Random, +} + +impl Display for Order { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + 
Self::Sequential => f.write_str("sequential"), + Self::Random => f.write_str("random"), + } + } +} + +#[derive(Clone, Copy, Debug)] +struct AffinePermutation { + len: u64, + multiplier: u64, + offset: u64, +} + +impl AffinePermutation { + fn sequential(len: u64) -> Self { + Self { + len, + multiplier: 1, + offset: 0, + } + } + + fn random(len: u64, seed: u64) -> Self { + if len <= 1 { + return Self::sequential(len); + } + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let mut multiplier = (rng.next_u64() % len) | 1; + while gcd(multiplier, len) != 1 { + multiplier = (multiplier + 2) % len; + if multiplier == 0 { + multiplier = 1; + } + } + let offset = rng.next_u64() % len; + Self { + len, + multiplier, + offset, + } + } + + fn for_order(len: u64, order: Order, seed: u64) -> Self { + match order { + Order::Sequential => Self::sequential(len), + Order::Random => Self::random(len, seed), + } + } + + fn index_at(&self, position: u64) -> u64 { + debug_assert!(position < self.len); + (self + .multiplier + .wrapping_mul(position) + .wrapping_add(self.offset)) + % self.len + } +} + +#[derive(Clone, Copy, Debug)] +struct WrappingPermutation64 { + multiplier: u64, + offset: u64, +} + +impl WrappingPermutation64 { + fn sequential() -> Self { + Self { + multiplier: 1, + offset: 0, + } + } + + fn random(seed: u64) -> Self { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + Self { + multiplier: rng.next_u64() | 1, + offset: rng.next_u64(), + } + } + + fn for_order(order: Order, seed: u64) -> Self { + match order { + Order::Sequential => Self::sequential(), + Order::Random => Self::random(seed), + } + } + + fn index_at(&self, position: u64) -> u64 { + position + .wrapping_mul(self.multiplier) + .wrapping_add(self.offset) + } +} + +#[derive(Clone, Copy, Debug)] +struct HalfNormalKeySampler { + eps: f64, + seed: u64, +} + +impl HalfNormalKeySampler { + fn new(eps: f64, seed: u64) -> Self { + Self { eps, seed } + } + + fn present_keys_u32(&self, num_elements: u64) -> Vec { + let 
len = usize::try_from(num_elements).expect("num_elements must fit in usize"); + let mut rng = ChaCha8Rng::seed_from_u64(self.seed); + let sigma = self.eps * u32::MAX as f64; + let distribution = Normal::new(0.0, sigma) + .expect("half-normal epsilon must produce a positive standard deviation"); + let mut keys = Vec::with_capacity(len); + + for _ in 0..num_elements { + let sampled = distribution.sample(&mut rng).abs().round(); + keys.push(sampled.clamp(0.0, u32::MAX as f64) as u32); + } + + keys.sort_unstable(); + project_sorted_unique_u32_domain(&mut keys); + keys + } + + fn present_keys_u64(&self, num_elements: u64) -> Vec { + self.present_keys_u32(num_elements) + .into_iter() + .map(u64::from) + .collect() + } +} + +#[derive(Clone, Copy, Debug)] +enum U32KeySampler { + Consecutive, + FullRange(AffinePermutation), + HalfNormal(HalfNormalKeySampler), +} + +impl U32KeySampler { + fn new(key_space: KeySpace, _num_elements: u64, key_eps: Option, seed: u64) -> Self { + match key_space { + KeySpace::Consecutive => Self::Consecutive, + KeySpace::FullRange => Self::FullRange(AffinePermutation::for_order( + U32_KEY_SPACE_SIZE, + Order::Random, + seed, + )), + KeySpace::HalfNormal => Self::HalfNormal(HalfNormalKeySampler::new( + key_eps.expect("half_normal key space requires key_eps"), + seed, + )), + } + } + + fn present_key(&self, set_index: u64) -> u32 { + match self { + Self::Consecutive => set_index as u32, + Self::FullRange(permutation) => permutation.index_at(set_index) as u32, + Self::HalfNormal(_) => { + panic!("half_normal key space requires pre-generated keys") + } + } + } + + fn absent_key(&self, num_elements: u64, absent_index: u64) -> u32 { + let domain_index = num_elements + .checked_add(absent_index) + .expect("u32 absent-key generation overflowed"); + match self { + Self::Consecutive => domain_index as u32, + Self::FullRange(permutation) => permutation.index_at(domain_index) as u32, + Self::HalfNormal(_) => { + panic!("half_normal key space requires 
prepared absent keys") + } + } + } +} + +#[derive(Clone, Copy, Debug)] +enum U64KeySampler { + Consecutive, + FullRange(WrappingPermutation64), + HalfNormal(HalfNormalKeySampler), +} + +impl U64KeySampler { + fn new(key_space: KeySpace, _num_elements: u64, key_eps: Option, seed: u64) -> Self { + match key_space { + KeySpace::Consecutive => Self::Consecutive, + KeySpace::FullRange => { + Self::FullRange(WrappingPermutation64::for_order(Order::Random, seed)) + } + KeySpace::HalfNormal => Self::HalfNormal(HalfNormalKeySampler::new( + key_eps.expect("half_normal key space requires key_eps"), + seed, + )), + } + } + + fn present_key(&self, set_index: u64) -> u64 { + match self { + Self::Consecutive => set_index, + Self::FullRange(permutation) => permutation.index_at(set_index), + Self::HalfNormal(_) => { + panic!("half_normal key space requires pre-generated keys") + } + } + } + + fn absent_key(&self, num_elements: u64, absent_index: u64) -> u64 { + let domain_index = num_elements + .checked_add(absent_index) + .expect("u64 absent-key generation overflowed"); + match self { + Self::Consecutive => domain_index, + Self::FullRange(permutation) => permutation.index_at(domain_index), + Self::HalfNormal(_) => { + panic!("half_normal key space requires prepared absent keys") + } + } + } +} + +fn gcd(mut lhs: u64, mut rhs: u64) -> u64 { + while rhs != 0 { + let next = lhs % rhs; + lhs = rhs; + rhs = next; + } + lhs +} + +fn parse_u64_csv(csv: &str) -> Vec { + let mut out: Vec = csv + .split(',') + .filter(|entry| !entry.trim().is_empty()) + .map(|entry| { + parse_u64_token(entry.trim()) + .unwrap_or_else(|error| panic!("invalid u64 in --num-elements: {entry} ({error})")) + }) + .collect(); + out.sort_unstable(); + out.dedup(); + out +} + +fn parse_f64_csv(csv: &str, flag_name: &str) -> Vec { + let mut out: Vec = csv + .split(',') + .filter(|entry| !entry.trim().is_empty()) + .map(|entry| { + entry + .trim() + .parse::() + .unwrap_or_else(|error| panic!("invalid f64 in 
{flag_name}: {entry} ({error})")) + }) + .collect(); + out.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).expect("NaN was already rejected")); + out.dedup(); + out +} + +fn parse_u64_token(token: &str) -> Result { + match token { + "u32::MAX" | "u32_max" | "max_u32" => Ok(u32::MAX as u64), + _ => token + .replace('_', "") + .parse::() + .map_err(|error| error.to_string()), + } +} + +fn project_sorted_unique_u32_domain(keys: &mut [u32]) { + if keys.is_empty() { + return; + } + + for (index, key) in keys.iter_mut().enumerate() { + let min_key = u32::try_from(index).expect("key count exceeded u32 domain"); + if *key < min_key { + *key = min_key; + } + } + + for index in (0..keys.len()).rev() { + let tail = keys.len() - 1 - index; + let max_key = u32::MAX + .checked_sub(u32::try_from(tail).expect("key count exceeded u32 domain")) + .expect("tail adjustment underflowed"); + if keys[index] > max_key { + keys[index] = max_key; + } + if index + 1 < keys.len() && keys[index] >= keys[index + 1] { + keys[index] = keys[index + 1] - 1; + } + } + + debug_assert!(keys.windows(2).all(|window| window[0] < window[1])); +} + +fn absent_keys_from_sorted_present_u32(present_keys: &[u32], count: u64) -> Vec { + let target_len = usize::try_from(count).expect("false-positive lookup count must fit in usize"); + let mut absent_keys = Vec::with_capacity(target_len); + let mut candidate = 0u64; + + for &present_key in present_keys { + let present_key = present_key as u64; + while candidate < present_key && absent_keys.len() < target_len { + absent_keys.push(candidate as u32); + candidate += 1; + } + if absent_keys.len() == target_len { + return absent_keys; + } + candidate = present_key + .checked_add(1) + .expect("half-normal absent-key generation overflowed"); + } + + while absent_keys.len() < target_len { + absent_keys.push(candidate as u32); + candidate = candidate + .checked_add(1) + .expect("half-normal absent-key generation overflowed"); + } + + absent_keys +} + +fn 
absent_keys_from_sorted_present_u64(present_keys: &[u64], count: u64) -> Vec { + let target_len = usize::try_from(count).expect("false-positive lookup count must fit in usize"); + let mut absent_keys = Vec::with_capacity(target_len); + let mut candidate = 0u64; + + for &present_key in present_keys { + while candidate < present_key && absent_keys.len() < target_len { + absent_keys.push(candidate); + candidate += 1; + } + if absent_keys.len() == target_len { + return absent_keys; + } + candidate = present_key + .checked_add(1) + .expect("half-normal absent-key generation overflowed"); + } + + while absent_keys.len() < target_len { + absent_keys.push(candidate); + candidate = candidate + .checked_add(1) + .expect("half-normal absent-key generation overflowed"); + } + + absent_keys +} + +#[derive(Debug, Clone, Copy)] +struct SummaryStats { + min: f64, + avg: f64, + max: f64, + stddev: f64, +} + +impl SummaryStats { + fn from_samples(samples: &[f64]) -> Self { + let min = samples.iter().copied().fold(f64::INFINITY, f64::min); + let max = samples.iter().copied().fold(f64::NEG_INFINITY, f64::max); + let avg = samples.iter().sum::() / samples.len() as f64; + let variance = samples + .iter() + .map(|sample| { + let delta = *sample - avg; + delta * delta + }) + .sum::() + / samples.len() as f64; + Self { + min, + avg, + max, + stddev: variance.sqrt(), + } + } +} + +#[derive(Debug, Clone, Copy)] +struct BenchmarkResult { + insert_ns_per_element: SummaryStats, + lookup_ns_per_element: SummaryStats, + bytes_used: usize, + false_positive_rate_percent: Option, +} + +#[derive(Debug, Serialize)] +struct CsvRow { + structure: &'static str, + key_type: &'static str, + key_space: &'static str, + key_eps: Option, + num_elements: u64, + lookup_count: u64, + false_positive_lookup_count: u64, + repetitions: usize, + insert_order: &'static str, + lookup_order: &'static str, + insert_seed: u64, + lookup_seed: u64, + key_space_seed: u64, + bloom_false_positive_rate_target_percent: f64, + 
bloom_seed: u128, + bloom_expected_items: u64, + bytes_used: usize, + bytes_per_element: f64, + bits_per_element: Option, + insert_ns_per_element_min: f64, + insert_ns_per_element_avg: f64, + insert_ns_per_element_max: f64, + insert_ns_per_element_stddev: f64, + lookup_ns_per_element_min: f64, + lookup_ns_per_element_avg: f64, + lookup_ns_per_element_max: f64, + lookup_ns_per_element_stddev: f64, + false_positive_rate_percent_min: Option, + false_positive_rate_percent_avg: Option, + false_positive_rate_percent_max: Option, + false_positive_rate_percent_stddev: Option, +} + +#[derive(Clone, Copy)] +struct CsvRowContext<'a> { + structure: Structure, + config: BenchmarkConfig, + args: &'a Args, + num_elements: u64, + lookup_count: u64, + false_positive_lookup_count: u64, + bloom_expected_items: u64, +} + +impl CsvRow { + fn from_result(context: CsvRowContext<'_>, result: &BenchmarkResult) -> Self { + let bits_per_element = (context.structure == Structure::Bloom) + .then_some((result.bytes_used as f64 * 8.0) / context.num_elements as f64); + let false_positive_stats = result.false_positive_rate_percent; + + Self { + structure: context.structure.as_str(), + key_type: context.config.key_type.as_str(), + key_space: context.config.key_space.as_str(), + key_eps: context.config.key_eps, + num_elements: context.num_elements, + lookup_count: context.lookup_count, + false_positive_lookup_count: context.false_positive_lookup_count, + repetitions: context.args.repetitions, + insert_order: context.args.insert_order.as_str(), + lookup_order: context.args.lookup_order.as_str(), + insert_seed: context.args.insert_seed, + lookup_seed: context.args.lookup_seed, + key_space_seed: context.args.key_space_seed, + bloom_false_positive_rate_target_percent: context.args.bloom_false_positive_rate + * 100.0, + bloom_seed: context.args.bloom_seed, + bloom_expected_items: context.bloom_expected_items, + bytes_used: result.bytes_used, + bytes_per_element: result.bytes_used as f64 / 
context.num_elements as f64, + bits_per_element, + insert_ns_per_element_min: result.insert_ns_per_element.min, + insert_ns_per_element_avg: result.insert_ns_per_element.avg, + insert_ns_per_element_max: result.insert_ns_per_element.max, + insert_ns_per_element_stddev: result.insert_ns_per_element.stddev, + lookup_ns_per_element_min: result.lookup_ns_per_element.min, + lookup_ns_per_element_avg: result.lookup_ns_per_element.avg, + lookup_ns_per_element_max: result.lookup_ns_per_element.max, + lookup_ns_per_element_stddev: result.lookup_ns_per_element.stddev, + false_positive_rate_percent_min: false_positive_stats.map(|stats| stats.min), + false_positive_rate_percent_avg: false_positive_stats.map(|stats| stats.avg), + false_positive_rate_percent_max: false_positive_stats.map(|stats| stats.max), + false_positive_rate_percent_stddev: false_positive_stats.map(|stats| stats.stddev), + } + } +} + +impl Structure { + fn as_str(self) -> &'static str { + match self { + Self::Bloom => "bloom", + Self::Roaring => "roaring", + } + } +} + +impl KeyType { + fn as_str(self) -> &'static str { + match self { + Self::U32 => "u32", + Self::U64 => "u64", + } + } +} + +impl KeySpace { + fn as_str(self) -> &'static str { + match self { + Self::Consecutive => "consecutive", + Self::FullRange => "full_range", + Self::HalfNormal => "half_normal", + } + } +} + +impl Order { + fn as_str(self) -> &'static str { + match self { + Self::Sequential => "sequential", + Self::Random => "random", + } + } +} + +fn benchmark_bloom( + args: &Args, + config: BenchmarkConfig, + num_elements: u64, + lookup_count: u64, + false_positive_lookup_count: u64, + bloom_expected_items: u64, +) -> BenchmarkResult { + match config.key_type { + KeyType::U32 => benchmark_bloom_u32( + args, + config, + num_elements, + lookup_count, + false_positive_lookup_count, + bloom_expected_items, + ), + KeyType::U64 => benchmark_bloom_u64( + args, + config, + num_elements, + lookup_count, + false_positive_lookup_count, + 
bloom_expected_items, + ), + } +} + +fn benchmark_bloom_u32( + args: &Args, + config: BenchmarkConfig, + num_elements: u64, + lookup_count: u64, + false_positive_lookup_count: u64, + bloom_expected_items: u64, +) -> BenchmarkResult { + let mut insert_samples = Vec::with_capacity(args.repetitions); + let mut lookup_samples = Vec::with_capacity(args.repetitions); + let mut false_positive_rate_percent_samples = Vec::with_capacity(args.repetitions); + let mut bytes_used = 0; + let expected_items = + usize::try_from(bloom_expected_items).expect("bloom expected items must fit in usize"); + + for repetition in 0..args.repetitions { + let sampler = U32KeySampler::new( + config.key_space, + num_elements, + config.key_eps, + args.key_space_seed.wrapping_add(repetition as u64), + ); + let present_keys = sorted_present_keys_u32(sampler, num_elements); + let insert_permutation = AffinePermutation::for_order( + num_elements, + args.insert_order, + args.insert_seed.wrapping_add(repetition as u64), + ); + let lookup_permutation = AffinePermutation::for_order( + num_elements, + args.lookup_order, + args.lookup_seed.wrapping_add(repetition as u64), + ); + let false_positive_permutation = (false_positive_lookup_count > 0).then(|| { + AffinePermutation::for_order( + false_positive_lookup_count, + args.lookup_order, + args.lookup_seed.wrapping_add(repetition as u64), + ) + }); + let absent_keys = matches!(sampler, U32KeySampler::HalfNormal(_)).then(|| { + absent_keys_from_sorted_present_u32(&present_keys, false_positive_lookup_count) + }); + + let mut bloom = BloomFilter::with_false_pos(args.bloom_false_positive_rate) + .seed(&args.bloom_seed) + .expected_items(expected_items.max(MIN_BLOOM_EXPECTED_ITEMS as usize)); + + let insert_started = Instant::now(); + for index in 0..num_elements { + let key = present_keys[insert_permutation.index_at(index) as usize]; + bloom.insert(&key); + } + let insert_elapsed = insert_started.elapsed(); + + let lookup_started = Instant::now(); + let mut 
hits = 0u64; + for index in 0..lookup_count { + let key = present_keys[lookup_permutation.index_at(index) as usize]; + hits += u64::from(bloom.contains(&key)); + } + let lookup_elapsed = lookup_started.elapsed(); + + assert_eq!(hits, lookup_count, "expected all lookup keys to be present"); + + if let Some(false_positive_permutation) = false_positive_permutation { + let mut false_positives = 0u64; + for index in 0..false_positive_lookup_count { + let absent_index = false_positive_permutation.index_at(index); + let key = absent_keys.as_ref().map_or_else( + || sampler.absent_key(num_elements, absent_index), + |keys| keys[absent_index as usize], + ); + false_positives += u64::from(bloom.contains(&key)); + } + false_positive_rate_percent_samples + .push((false_positives as f64 / false_positive_lookup_count as f64) * 100.0); + } + + bytes_used = size_of_val(bloom.as_slice()); + insert_samples.push(insert_elapsed.as_nanos() as f64 / num_elements as f64); + lookup_samples.push(lookup_elapsed.as_nanos() as f64 / lookup_count as f64); + } + + BenchmarkResult { + insert_ns_per_element: SummaryStats::from_samples(&insert_samples), + lookup_ns_per_element: SummaryStats::from_samples(&lookup_samples), + bytes_used, + false_positive_rate_percent: (!false_positive_rate_percent_samples.is_empty()) + .then(|| SummaryStats::from_samples(&false_positive_rate_percent_samples)), + } +} + +fn benchmark_bloom_u64( + args: &Args, + config: BenchmarkConfig, + num_elements: u64, + lookup_count: u64, + false_positive_lookup_count: u64, + bloom_expected_items: u64, +) -> BenchmarkResult { + let mut insert_samples = Vec::with_capacity(args.repetitions); + let mut lookup_samples = Vec::with_capacity(args.repetitions); + let mut false_positive_rate_percent_samples = Vec::with_capacity(args.repetitions); + let mut bytes_used = 0; + let expected_items = + usize::try_from(bloom_expected_items).expect("bloom expected items must fit in usize"); + + for repetition in 0..args.repetitions { + let sampler 
= U64KeySampler::new( + config.key_space, + num_elements, + config.key_eps, + args.key_space_seed.wrapping_add(repetition as u64), + ); + let present_keys = sorted_present_keys_u64(sampler, num_elements); + let insert_permutation = AffinePermutation::for_order( + num_elements, + args.insert_order, + args.insert_seed.wrapping_add(repetition as u64), + ); + let lookup_permutation = AffinePermutation::for_order( + num_elements, + args.lookup_order, + args.lookup_seed.wrapping_add(repetition as u64), + ); + let false_positive_permutation = (false_positive_lookup_count > 0).then(|| { + AffinePermutation::for_order( + false_positive_lookup_count, + args.lookup_order, + args.lookup_seed.wrapping_add(repetition as u64), + ) + }); + let absent_keys = matches!(sampler, U64KeySampler::HalfNormal(_)).then(|| { + absent_keys_from_sorted_present_u64(&present_keys, false_positive_lookup_count) + }); + + let mut bloom = BloomFilter::with_false_pos(args.bloom_false_positive_rate) + .seed(&args.bloom_seed) + .expected_items(expected_items.max(MIN_BLOOM_EXPECTED_ITEMS as usize)); + + let insert_started = Instant::now(); + for index in 0..num_elements { + let key = present_keys[insert_permutation.index_at(index) as usize]; + bloom.insert(&key); + } + let insert_elapsed = insert_started.elapsed(); + + let lookup_started = Instant::now(); + let mut hits = 0u64; + for index in 0..lookup_count { + let key = present_keys[lookup_permutation.index_at(index) as usize]; + hits += u64::from(bloom.contains(&key)); + } + let lookup_elapsed = lookup_started.elapsed(); + + assert_eq!(hits, lookup_count, "expected all lookup keys to be present"); + + if let Some(false_positive_permutation) = false_positive_permutation { + let mut false_positives = 0u64; + for index in 0..false_positive_lookup_count { + let absent_index = false_positive_permutation.index_at(index); + let key = absent_keys.as_ref().map_or_else( + || sampler.absent_key(num_elements, absent_index), + |keys| keys[absent_index as usize], 
+ ); + false_positives += u64::from(bloom.contains(&key)); + } + false_positive_rate_percent_samples + .push((false_positives as f64 / false_positive_lookup_count as f64) * 100.0); + } + + bytes_used = size_of_val(bloom.as_slice()); + insert_samples.push(insert_elapsed.as_nanos() as f64 / num_elements as f64); + lookup_samples.push(lookup_elapsed.as_nanos() as f64 / lookup_count as f64); + } + + BenchmarkResult { + insert_ns_per_element: SummaryStats::from_samples(&insert_samples), + lookup_ns_per_element: SummaryStats::from_samples(&lookup_samples), + bytes_used, + false_positive_rate_percent: (!false_positive_rate_percent_samples.is_empty()) + .then(|| SummaryStats::from_samples(&false_positive_rate_percent_samples)), + } +} + +fn benchmark_roaring( + args: &Args, + config: BenchmarkConfig, + num_elements: u64, + lookup_count: u64, +) -> BenchmarkResult { + match config.key_type { + KeyType::U32 => benchmark_roaring_u32(args, config, num_elements, lookup_count), + KeyType::U64 => benchmark_roaring_u64(args, config, num_elements, lookup_count), + } +} + +fn benchmark_roaring_u32( + args: &Args, + config: BenchmarkConfig, + num_elements: u64, + lookup_count: u64, +) -> BenchmarkResult { + let mut insert_samples = Vec::with_capacity(args.repetitions); + let mut lookup_samples = Vec::with_capacity(args.repetitions); + let mut bytes_used = 0; + + for repetition in 0..args.repetitions { + let sampler = U32KeySampler::new( + config.key_space, + num_elements, + config.key_eps, + args.key_space_seed.wrapping_add(repetition as u64), + ); + let present_keys = sorted_present_keys_u32(sampler, num_elements); + let insert_permutation = AffinePermutation::for_order( + num_elements, + args.insert_order, + args.insert_seed.wrapping_add(repetition as u64), + ); + let lookup_permutation = AffinePermutation::for_order( + num_elements, + args.lookup_order, + args.lookup_seed.wrapping_add(repetition as u64), + ); + + let insert_started = Instant::now(); + let mut bitmap = 
RoaringBitmap::new(); + for index in 0..num_elements { + bitmap.insert(present_keys[insert_permutation.index_at(index) as usize]); + } + let insert_elapsed = insert_started.elapsed(); + let _ = bitmap.optimize(); + + let lookup_started = Instant::now(); + let mut hits = 0u64; + for index in 0..lookup_count { + let key = present_keys[lookup_permutation.index_at(index) as usize]; + hits += u64::from(bitmap.contains(key)); + } + let lookup_elapsed = lookup_started.elapsed(); + + assert_eq!(hits, lookup_count, "expected all lookup keys to be present"); + bytes_used = bitmap.serialized_size(); + insert_samples.push(insert_elapsed.as_nanos() as f64 / num_elements as f64); + lookup_samples.push(lookup_elapsed.as_nanos() as f64 / lookup_count as f64); + } + + BenchmarkResult { + insert_ns_per_element: SummaryStats::from_samples(&insert_samples), + lookup_ns_per_element: SummaryStats::from_samples(&lookup_samples), + bytes_used, + false_positive_rate_percent: None, + } +} + +fn benchmark_roaring_u64( + args: &Args, + config: BenchmarkConfig, + num_elements: u64, + lookup_count: u64, +) -> BenchmarkResult { + let mut insert_samples = Vec::with_capacity(args.repetitions); + let mut lookup_samples = Vec::with_capacity(args.repetitions); + let mut bytes_used = 0; + + for repetition in 0..args.repetitions { + let sampler = U64KeySampler::new( + config.key_space, + num_elements, + config.key_eps, + args.key_space_seed.wrapping_add(repetition as u64), + ); + let present_keys = sorted_present_keys_u64(sampler, num_elements); + let insert_permutation = AffinePermutation::for_order( + num_elements, + args.insert_order, + args.insert_seed.wrapping_add(repetition as u64), + ); + let lookup_permutation = AffinePermutation::for_order( + num_elements, + args.lookup_order, + args.lookup_seed.wrapping_add(repetition as u64), + ); + + let insert_started = Instant::now(); + let mut treemap = RoaringTreemap::new(); + for index in 0..num_elements { + 
treemap.insert(present_keys[insert_permutation.index_at(index) as usize]); + } + let insert_elapsed = insert_started.elapsed(); + + let lookup_started = Instant::now(); + let mut hits = 0u64; + for index in 0..lookup_count { + let key = present_keys[lookup_permutation.index_at(index) as usize]; + hits += u64::from(treemap.contains(key)); + } + let lookup_elapsed = lookup_started.elapsed(); + + assert_eq!(hits, lookup_count, "expected all lookup keys to be present"); + bytes_used = treemap.serialized_size(); + insert_samples.push(insert_elapsed.as_nanos() as f64 / num_elements as f64); + lookup_samples.push(lookup_elapsed.as_nanos() as f64 / lookup_count as f64); + } + + BenchmarkResult { + insert_ns_per_element: SummaryStats::from_samples(&insert_samples), + lookup_ns_per_element: SummaryStats::from_samples(&lookup_samples), + bytes_used, + false_positive_rate_percent: None, + } +} + +fn sorted_present_keys_u32(sampler: U32KeySampler, num_elements: u64) -> Vec { + match sampler { + U32KeySampler::HalfNormal(sampler) => sampler.present_keys_u32(num_elements), + _ => { + let mut present_keys = Vec::with_capacity( + usize::try_from(num_elements).expect("num_elements must fit in usize"), + ); + for set_index in 0..num_elements { + present_keys.push(sampler.present_key(set_index)); + } + present_keys.sort_unstable(); + present_keys + } + } +} + +fn sorted_present_keys_u64(sampler: U64KeySampler, num_elements: u64) -> Vec { + match sampler { + U64KeySampler::HalfNormal(sampler) => sampler.present_keys_u64(num_elements), + _ => { + let mut present_keys = Vec::with_capacity( + usize::try_from(num_elements).expect("num_elements must fit in usize"), + ); + for set_index in 0..num_elements { + present_keys.push(sampler.present_key(set_index)); + } + present_keys.sort_unstable(); + present_keys + } + } +} + +fn print_report( + structure: Structure, + config: BenchmarkConfig, + result: &BenchmarkResult, + num_elements: u64, + lookup_count: u64, + false_positive_lookup_count: 
u64, +) { + println!("structure={structure}"); + println!("key_type={}", config.key_type.as_str()); + println!("key_space={}", config.key_space.as_str()); + if let Some(key_eps) = config.key_eps { + println!("key_eps={key_eps}"); + } + println!("num_elements={num_elements}"); + println!("bytes_used={}", result.bytes_used); + println!( + "bytes_used_human={}", + format_bytes(result.bytes_used as f64) + ); + println!( + "bytes_per_element={}", + format_bytes(result.bytes_used as f64 / num_elements as f64) + ); + if structure == Structure::Bloom { + println!( + "bits_per_element={:.6}", + (result.bytes_used as f64 * 8.0) / num_elements as f64 + ); + } + print_stats("insert_ns_per_element", result.insert_ns_per_element); + print_stats("lookup_ns_per_element", result.lookup_ns_per_element); + println!("lookup_count={lookup_count}"); + if let Some(stats) = result.false_positive_rate_percent { + println!("false_positive_lookup_count={false_positive_lookup_count}"); + print_stats("false_positive_rate_percent", stats); + } + println!(); +} + +fn print_stats(label: &str, stats: SummaryStats) { + println!("{label}.min={:.6}", stats.min); + println!("{label}.avg={:.6}", stats.avg); + println!("{label}.max={:.6}", stats.max); + println!("{label}.stddev={:.6}", stats.stddev); +} + +fn format_bytes(bytes: f64) -> String { + const UNITS: [&str; 5] = ["B", "KiB", "MiB", "GiB", "TiB"]; + + let mut value = bytes; + let mut unit_index = 0; + while value >= 1024.0 && unit_index + 1 < UNITS.len() { + value /= 1024.0; + unit_index += 1; + } + + format!("{value:.6} {}", UNITS[unit_index]) +} diff --git a/crates/dbsp/benches/filter_predictor.rs b/crates/dbsp/benches/filter_predictor.rs new file mode 100644 index 00000000000..040202d09b3 --- /dev/null +++ b/crates/dbsp/benches/filter_predictor.rs @@ -0,0 +1,1932 @@ +//! Predictor benchmark for deciding between `fastbloom` and `roaring` on u32 keys. +//! +//! Examples: +//! 
`cargo bench -p dbsp --bench filter_predictor -- --csv-output filter_predictor.csv` +//! `cargo bench -p dbsp --bench filter_predictor -- --num-keys 99_999,999_999 --distributions gaussian,bimodal,exponential --gaussian-means 0.1,0.5,0.9 --gaussian-stddevs 1e-6,1e-4,1e-2` + +use clap::{Parser, ValueEnum}; +use csv::Writer; +use dbsp::storage::file::BLOOM_FILTER_FALSE_POSITIVE_RATE; +use fastbloom::BloomFilter; +use rand::{RngCore, SeedableRng, seq::index::sample}; +use rand_chacha::ChaCha8Rng; +use rand_distr::{Distribution, Exp, Normal}; +use roaring::RoaringBitmap; +use serde::Serialize; +use std::{ + collections::HashMap, + fmt::{Display, Formatter}, + fs::File, + mem::size_of_val, + path::PathBuf, + sync::{ + atomic::{AtomicUsize, Ordering}, + mpsc, + }, + thread, + time::Instant, +}; + +const DEFAULT_BLOOM_SEED: u128 = 42; +const DEFAULT_GAUSSIAN_MEAN_FRACTIONS: [f64; 1] = [0.5]; +const DEFAULT_GAUSSIAN_STDDEV_FRACTIONS: [f64; 10] = + [1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1]; +const DEFAULT_LOOKUP_LIMIT: u64 = 5_000_000; +const DEFAULT_SAMPLE_PERCENT: f64 = 0.1; +const DEFAULT_MIN_SAMPLE_SIZE: usize = 1_024; +const BIMODAL_LEFT_PEAK_FRAC: f64 = 0.25; +const BIMODAL_RIGHT_PEAK_FRAC: f64 = 0.75; +const MIN_BLOOM_EXPECTED_ITEMS: u64 = 64; +const U32_KEY_SPACE_SIZE: u64 = u32::MAX as u64 + 1; +const DEFAULT_NUM_KEYS: [u64; 10] = [ + 14_999, + 49_999, + 99_999, + 499_999, + 999_999, + 4_999_999, + 9_999_999, + 49_999_999, + 99_999_999, + 999_999_999, +]; + +// Build and memory mostly care about how much work or storage Roaring pays per +// touched 16-bit container, so these predictors stay intentionally simple and +// depend primarily on estimated keys per touched window. +const BUILD_ROARING_ESTIMATED_KEYS_PER_WINDOW_THRESHOLD: f64 = 4.0; +const MEMORY_ROARING_ESTIMATED_KEYS_PER_WINDOW_THRESHOLD: f64 = 32.0; + +// roaring-rs switches array containers to bitmap containers around 4096 keys. 
+// That transition materially changes lookup behavior, so the lookup predictor +// treats it as a first-class boundary. +const ROARING_BITMAP_CONTAINER_THRESHOLD: f64 = 4_096.0; + +// Lookup prediction is framed as a coarse cost proxy. If the estimated cost of +// reaching and searching a Roaring container stays below this budget, predict +// Roaring; otherwise predict Bloom. +const LOOKUP_ROARING_WINDOW_PROBABILITY_THRESHOLD: f64 = 0.1; +const LOOKUP_ROARING_BITMAP_WINDOW_PROBABILITY_PENALTY: f64 = 0.1; +const LOOKUP_ROARING_ARRAY_WINDOW_PROBABILITY_PENALTY_BASE: f64 = 0.25; +const LOOKUP_ROARING_ARRAY_WINDOW_PROBABILITY_PENALTY_PER_LOG2_KEY: f64 = 0.15; + +// Raw Chao1 fixes a real failure mode in sparse, very wide distributions, where +// the old uniform estimator badly under-counted touched windows and therefore +// over-predicted Roaring for random u32 lookups. Damping keeps that correction +// from overreacting on samples with only a small amount of singleton noise. +const TOUCHED_WINDOWS_CHAO1_DAMPING: f64 = 0.25; +const U32_WINDOW_COUNT: usize = 1 << 16; + +fn main() { + let args = Args::parse(); + let distributions = args.distributions(); + let num_keys_list = args.num_keys(); + let gaussian_means = args.gaussian_means(); + let gaussian_stddevs = args.gaussian_stddevs(); + args.validate( + &distributions, + &num_keys_list, + &gaussian_means, + &gaussian_stddevs, + ); + let run_configs = build_run_configs( + &args, + &distributions, + &num_keys_list, + &gaussian_means, + &gaussian_stddevs, + ); + let worker_threads = args.worker_threads(run_configs.len()); + + println!("benchmark=filter_predictor"); + println!( + "distributions={}", + distributions + .iter() + .map(|distribution| distribution.as_str()) + .collect::>() + .join(",") + ); + println!( + "num_keys={}", + num_keys_list + .iter() + .map(u64::to_string) + .collect::>() + .join(",") + ); + println!( + "gaussian_means={}", + gaussian_means + .iter() + .map(ToString::to_string) + .collect::>() + 
.join(",") + ); + println!( + "gaussian_stddevs={}", + gaussian_stddevs + .iter() + .map(ToString::to_string) + .collect::>() + .join(",") + ); + println!("repetitions={}", args.repetitions); + println!("distribution_seed={}", args.distribution_seed); + println!("sample_seed={}", args.sample_seed); + println!("lookup_seed={}", args.lookup_seed); + println!("threads={}", worker_threads); + println!("lookup_space={}", args.lookup_space.as_str()); + println!( + "sample_size_override_percent={}", + option_f64(args.sample_size) + ); + println!("lookup_count_override={}", option_u64(args.lookup_count)); + println!( + "bloom_false_positive_rate={}", + args.bloom_false_positive_rate + ); + println!("bloom_seed={}", args.bloom_seed); + println!( + "bloom_expected_items_override={}", + option_u64(args.bloom_expected_items) + ); + println!("csv_output={}", args.csv_output.display()); + println!(); + + let rows = execute_runs(&args, &run_configs, worker_threads); + + let csv_file = File::create(&args.csv_output) + .unwrap_or_else(|error| panic!("failed to create {}: {error}", args.csv_output.display())); + let mut csv_writer = Writer::from_writer(csv_file); + for row in &rows { + print_run_report(row); + csv_writer + .serialize(row) + .expect("failed to write filter predictor CSV row"); + } + csv_writer + .flush() + .expect("failed to flush filter predictor CSV"); + + let accuracy = summarize_accuracy(&rows); + print_summary(&rows, &accuracy); +} + +#[derive(Parser, Debug, Clone)] +#[command(name = "filter_predictor")] +#[command(about = "Benchmark a simple roaring-vs-bloom predictor on gaussian u32 keysets")] +struct Args { + /// Comma-separated key counts. Underscores and `u32::MAX` are accepted. + #[arg(long, value_name = "CSV")] + num_keys: Option, + + /// Comma-separated distribution families to run. + /// Supported values: `gaussian`, `consecutive`, `round_robin_window`, + /// `bimodal`, `exponential`. 
+ #[arg(long, value_name = "CSV")] + distributions: Option, + + /// Gaussian mean values expressed as fractions of `u32::MAX`. + /// Only used by the `gaussian` distribution family. + #[arg(long, value_name = "CSV")] + gaussian_means: Option, + + /// Spread parameters expressed as fractions of `u32::MAX`. + /// Used as: + /// - gaussian standard deviation for `gaussian` + /// - per-peak standard deviation for `bimodal` + /// - exponential scale for `exponential` + #[arg(long, value_name = "CSV")] + gaussian_stddevs: Option, + + /// Number of repeated runs per `(num_keys, mean, stddev)` configuration. + #[arg(long, default_value_t = 3)] + repetitions: usize, + + /// Number of benchmark configurations to run concurrently. + /// `1` keeps runs sequential. + #[arg(long, default_value_t = 1)] + threads: usize, + + /// Lookup workload. + /// `present` samples only keys from the batch. + /// `full_u32` samples random u32 keys from the full domain. + #[arg(long, value_enum, default_value_t = LookupSpace::FullU32)] + lookup_space: LookupSpace, + + /// Number of lookups to benchmark per run. + /// Defaults to `min(num_keys, 5_000_000)` for `present` and `5_000_000` + /// for `full_u32`. + #[arg(long)] + lookup_count: Option, + + /// Predictor sample size as a percentage of the batch. + /// For example, `0.1` samples 0.1% of the keys. + #[arg(long)] + sample_size: Option, + + /// Seed for gaussian key generation. + #[arg(long, default_value_t = 0)] + distribution_seed: u64, + + /// Seed for the predictor's internal sampling pass. + #[arg(long, default_value_t = 1)] + sample_seed: u64, + + /// Seed for randomized successful lookups. + #[arg(long, default_value_t = 2)] + lookup_seed: u64, + + /// Bloom filter false-positive rate. + #[arg(long, default_value_t = BLOOM_FILTER_FALSE_POSITIVE_RATE)] + bloom_false_positive_rate: f64, + + /// Bloom filter seed. 
+ #[arg(long, default_value_t = DEFAULT_BLOOM_SEED)] + bloom_seed: u128, + + /// Expected items passed to the bloom filter builder. + #[arg(long)] + bloom_expected_items: Option, + + /// Output CSV path. + #[arg(long, default_value = "filter_predictor.csv")] + csv_output: PathBuf, + + #[doc(hidden)] + #[arg(long = "bench", hide = true)] + __bench: bool, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, ValueEnum)] +enum LookupSpace { + Present, + FullU32, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, ValueEnum)] +enum DistributionKind { + Gaussian, + Consecutive, + RoundRobinWindow, + Bimodal, + Exponential, +} + +impl DistributionKind { + fn as_str(self) -> &'static str { + match self { + Self::Gaussian => "gaussian", + Self::Consecutive => "consecutive", + Self::RoundRobinWindow => "round_robin_window", + Self::Bimodal => "bimodal", + Self::Exponential => "exponential", + } + } + + fn uses_gaussian_mean(self) -> bool { + matches!(self, Self::Gaussian) + } + + fn uses_spread_param(self) -> bool { + matches!(self, Self::Gaussian | Self::Bimodal | Self::Exponential) + } +} + +const DEFAULT_DISTRIBUTIONS: [DistributionKind; 5] = [ + DistributionKind::Gaussian, + DistributionKind::Consecutive, + DistributionKind::RoundRobinWindow, + DistributionKind::Bimodal, + DistributionKind::Exponential, +]; + +impl LookupSpace { + fn as_str(self) -> &'static str { + match self { + Self::Present => "present", + Self::FullU32 => "full_u32", + } + } +} + +impl Args { + fn distributions(&self) -> Vec { + match &self.distributions { + Some(csv) => parse_distribution_csv(csv), + None => DEFAULT_DISTRIBUTIONS.to_vec(), + } + } + + fn num_keys(&self) -> Vec { + match &self.num_keys { + Some(csv) => parse_u64_csv(csv), + None => DEFAULT_NUM_KEYS.to_vec(), + } + } + + fn gaussian_means(&self) -> Vec { + match &self.gaussian_means { + Some(csv) => parse_f64_csv(csv, "--gaussian-means"), + None => DEFAULT_GAUSSIAN_MEAN_FRACTIONS.to_vec(), + } + } + + fn gaussian_stddevs(&self) -> Vec { + 
match &self.gaussian_stddevs { + Some(csv) => parse_f64_csv(csv, "--gaussian-stddevs"), + None => DEFAULT_GAUSSIAN_STDDEV_FRACTIONS.to_vec(), + } + } + + fn lookup_count_for(&self, num_keys: u64) -> u64 { + self.lookup_count + .map(|lookup_count| match self.lookup_space { + LookupSpace::Present => lookup_count.min(num_keys), + LookupSpace::FullU32 => lookup_count, + }) + .unwrap_or(match self.lookup_space { + LookupSpace::Present => num_keys.min(DEFAULT_LOOKUP_LIMIT), + LookupSpace::FullU32 => DEFAULT_LOOKUP_LIMIT, + }) + } + + fn sample_size_for(&self, num_keys: u64) -> usize { + match self.sample_size { + Some(sample_percent) => sample_count_from_percent(num_keys, sample_percent, 1), + None => default_sample_size(num_keys), + } + } + + fn worker_threads(&self, run_count: usize) -> usize { + self.threads.max(1).min(run_count.max(1)) + } + + fn validate( + &self, + distributions: &[DistributionKind], + num_keys_list: &[u64], + gaussian_means: &[f64], + gaussian_stddevs: &[f64], + ) { + assert!( + !distributions.is_empty(), + "--distributions must select at least one family" + ); + assert!( + !num_keys_list.is_empty(), + "--num-keys must select at least one size" + ); + if distributions + .iter() + .copied() + .any(DistributionKind::uses_gaussian_mean) + { + assert!( + !gaussian_means.is_empty(), + "--gaussian-means must select at least one value when gaussian is enabled" + ); + } + if distributions + .iter() + .copied() + .any(DistributionKind::uses_spread_param) + { + assert!( + !gaussian_stddevs.is_empty(), + "--gaussian-stddevs must select at least one value when gaussian, bimodal, or exponential is enabled" + ); + } + assert!( + self.repetitions > 0, + "--repetitions must be greater than zero" + ); + assert!(self.threads > 0, "--threads must be greater than zero"); + assert!( + self.bloom_false_positive_rate > 0.0 && self.bloom_false_positive_rate < 1.0, + "--bloom-false-positive-rate must be between 0 and 1" + ); + + for &num_keys in num_keys_list { + 
assert!(num_keys > 0, "--num-keys values must be greater than zero"); + assert!( + num_keys <= U32_KEY_SPACE_SIZE, + "--num-keys values must be <= {}", + U32_KEY_SPACE_SIZE + ); + } + for &gaussian_mean in gaussian_means { + assert!( + gaussian_mean.is_finite() && (0.0..=1.0).contains(&gaussian_mean), + "--gaussian-means values must be finite fractions in [0, 1]" + ); + } + for &gaussian_stddev in gaussian_stddevs { + assert!( + gaussian_stddev.is_finite() && gaussian_stddev > 0.0, + "--gaussian-stddevs values must be finite and greater than zero" + ); + } + if let Some(sample_percent) = self.sample_size { + assert!( + sample_percent.is_finite() && sample_percent > 0.0 && sample_percent <= 100.0, + "--sample-size must be a finite percentage in (0, 100]" + ); + } + if let Some(lookup_count) = self.lookup_count { + assert!(lookup_count > 0, "--lookup-count must be greater than zero"); + } + if let Some(bloom_expected_items) = self.bloom_expected_items { + assert!( + bloom_expected_items > 0, + "--bloom-expected-items must be greater than zero" + ); + } + } +} + +#[derive(Debug, Clone, Copy)] +struct GaussianDistribution { + mean_frac: f64, + stddev_frac: f64, +} + +impl GaussianDistribution { + fn mean_value(self) -> f64 { + self.mean_frac * u32::MAX as f64 + } + + fn stddev_value(self) -> f64 { + self.stddev_frac * u32::MAX as f64 + } +} + +#[derive(Debug, Clone, Copy)] +enum DistributionSpec { + Gaussian(GaussianDistribution), + Consecutive, + RoundRobinWindow, + Bimodal { stddev_frac: f64 }, + Exponential { scale_frac: f64 }, +} + +impl DistributionSpec { + fn as_str(self) -> &'static str { + match self { + Self::Gaussian(_) => "gaussian", + Self::Consecutive => "consecutive", + Self::RoundRobinWindow => "round_robin_window", + Self::Bimodal { .. } => "bimodal", + Self::Exponential { .. } => "exponential", + } + } + + fn parameter_name(self) -> &'static str { + match self { + Self::Gaussian(_) => "stddev_frac", + Self::Bimodal { .. 
} => "stddev_frac", + Self::Exponential { .. } => "scale_frac", + Self::Consecutive | Self::RoundRobinWindow => "none", + } + } + + fn parameter_frac(self) -> Option { + match self { + Self::Gaussian(distribution) => Some(distribution.stddev_frac), + Self::Bimodal { stddev_frac } => Some(stddev_frac), + Self::Exponential { scale_frac } => Some(scale_frac), + Self::Consecutive | Self::RoundRobinWindow => None, + } + } + + fn parameter_value(self) -> Option { + match self { + Self::Gaussian(distribution) => Some(distribution.stddev_value()), + Self::Bimodal { stddev_frac } => Some(stddev_frac * u32::MAX as f64), + Self::Exponential { scale_frac } => Some(scale_frac * u32::MAX as f64), + Self::Consecutive | Self::RoundRobinWindow => None, + } + } + + fn gaussian_mean_frac(self) -> Option { + match self { + Self::Gaussian(distribution) => Some(distribution.mean_frac), + Self::Consecutive + | Self::RoundRobinWindow + | Self::Bimodal { .. } + | Self::Exponential { .. } => None, + } + } + + fn gaussian_mean_value(self) -> Option { + match self { + Self::Gaussian(distribution) => Some(distribution.mean_value()), + Self::Consecutive + | Self::RoundRobinWindow + | Self::Bimodal { .. } + | Self::Exponential { .. } => None, + } + } + + fn gaussian_stddev_frac(self) -> Option { + match self { + Self::Gaussian(distribution) => Some(distribution.stddev_frac), + Self::Consecutive + | Self::RoundRobinWindow + | Self::Bimodal { .. } + | Self::Exponential { .. } => None, + } + } + + fn gaussian_stddev_value(self) -> Option { + match self { + Self::Gaussian(distribution) => Some(distribution.stddev_value()), + Self::Consecutive + | Self::RoundRobinWindow + | Self::Bimodal { .. } + | Self::Exponential { .. 
} => None, + } + } +} + +#[derive(Debug, Clone, Copy)] +struct RunConfig { + run_index: usize, + num_keys: u64, + distribution: DistributionSpec, + repetition: usize, + distribution_seed: u64, + sample_seed: u64, + lookup_seed: u64, +} + +fn build_run_configs( + args: &Args, + distributions: &[DistributionKind], + num_keys_list: &[u64], + gaussian_means: &[f64], + gaussian_stddevs: &[f64], +) -> Vec { + let mut run_configs = Vec::new(); + + for &num_keys in num_keys_list { + for &distribution_kind in distributions { + match distribution_kind { + DistributionKind::Gaussian => { + for &gaussian_mean_frac in gaussian_means { + for &gaussian_stddev_frac in gaussian_stddevs { + let distribution = DistributionSpec::Gaussian(GaussianDistribution { + mean_frac: gaussian_mean_frac, + stddev_frac: gaussian_stddev_frac, + }); + push_run_configs(&mut run_configs, args, num_keys, distribution); + } + } + } + DistributionKind::Consecutive => { + push_run_configs( + &mut run_configs, + args, + num_keys, + DistributionSpec::Consecutive, + ); + } + DistributionKind::RoundRobinWindow => { + push_run_configs( + &mut run_configs, + args, + num_keys, + DistributionSpec::RoundRobinWindow, + ); + } + DistributionKind::Bimodal => { + for &stddev_frac in gaussian_stddevs { + push_run_configs( + &mut run_configs, + args, + num_keys, + DistributionSpec::Bimodal { stddev_frac }, + ); + } + } + DistributionKind::Exponential => { + for &scale_frac in gaussian_stddevs { + push_run_configs( + &mut run_configs, + args, + num_keys, + DistributionSpec::Exponential { scale_frac }, + ); + } + } + } + } + } + + run_configs +} + +fn push_run_configs( + run_configs: &mut Vec, + args: &Args, + num_keys: u64, + distribution: DistributionSpec, +) { + for repetition in 0..args.repetitions { + run_configs.push(RunConfig { + run_index: run_configs.len(), + num_keys, + distribution, + repetition, + distribution_seed: args.distribution_seed.wrapping_add(repetition as u64), + sample_seed: 
args.sample_seed.wrapping_add(repetition as u64), + lookup_seed: args.lookup_seed.wrapping_add(repetition as u64), + }); + } +} + +fn execute_runs(args: &Args, run_configs: &[RunConfig], worker_threads: usize) -> Vec { + if worker_threads <= 1 { + return run_configs + .iter() + .copied() + .map(|run_config| run_single_config(args, run_config)) + .collect(); + } + + let next_index = AtomicUsize::new(0); + let (tx, rx) = mpsc::channel::<(usize, CsvRow)>(); + + thread::scope(|scope| { + for _ in 0..worker_threads { + let tx = tx.clone(); + let next_index = &next_index; + let run_configs = run_configs; + let args = args; + scope.spawn(move || { + loop { + let task_index = next_index.fetch_add(1, Ordering::Relaxed); + if task_index >= run_configs.len() { + break; + } + + let run_config = run_configs[task_index]; + let row = run_single_config(args, run_config); + tx.send((run_config.run_index, row)) + .expect("result receiver dropped unexpectedly"); + } + }); + } + + drop(tx); + + let mut rows_by_index: Vec> = std::iter::repeat_with(|| None) + .take(run_configs.len()) + .collect(); + for (run_index, row) in rx { + rows_by_index[run_index] = Some(row); + } + + rows_by_index + .into_iter() + .map(|row| row.expect("missing benchmark row")) + .collect() + }) +} + +fn run_single_config(args: &Args, run_config: RunConfig) -> CsvRow { + let generated_keys = generate_keys( + run_config.num_keys, + run_config.distribution, + run_config.distribution_seed, + ); + let batch = GeneratedBatch::new(generated_keys, run_config.sample_seed); + let lookup_count = args.lookup_count_for(run_config.num_keys); + let sample_size = args.sample_size_for(run_config.num_keys); + let sample_percent_of_batch = sample_size as f64 / run_config.num_keys as f64 * 100.0; + let bloom_expected_items = args + .bloom_expected_items + .unwrap_or(run_config.num_keys) + .max(MIN_BLOOM_EXPECTED_ITEMS); + + let predictor_stats = estimate_roaring_sample_stats(&batch, sample_size) + .expect("predictor sample should 
not be empty"); + let prediction = predict_filter_winner(&predictor_stats); + + let bloom = benchmark_bloom( + batch.keys(), + lookup_count, + run_config.lookup_seed, + args.lookup_space, + bloom_expected_items, + args.bloom_false_positive_rate, + args.bloom_seed, + ); + let roaring = benchmark_roaring( + batch.keys(), + lookup_count, + run_config.lookup_seed, + args.lookup_space, + ); + + let build_actual = actual_winner(bloom.build_ns_per_element, roaring.build_ns_per_element); + let lookup_actual = actual_winner(bloom.lookup_ns_per_element, roaring.lookup_ns_per_element); + let memory_actual = actual_winner(bloom.bytes_used as f64, roaring.bytes_used as f64); + + let build_prediction_correct = prediction.build_winner == build_actual; + let lookup_prediction_correct = prediction.lookup_winner == lookup_actual; + let memory_prediction_correct = prediction.memory_winner == memory_actual; + + CsvRow { + num_keys: run_config.num_keys, + distribution: run_config.distribution.as_str(), + distribution_param_name: run_config.distribution.parameter_name(), + distribution_param_frac: run_config.distribution.parameter_frac(), + distribution_param_value: run_config.distribution.parameter_value(), + gaussian_mean_frac: run_config.distribution.gaussian_mean_frac(), + gaussian_mean: run_config.distribution.gaussian_mean_value(), + gaussian_stddev_frac: run_config.distribution.gaussian_stddev_frac(), + gaussian_stddev: run_config.distribution.gaussian_stddev_value(), + repetition: run_config.repetition, + distribution_seed: run_config.distribution_seed, + sample_seed: run_config.sample_seed, + lookup_seed: run_config.lookup_seed, + lookup_space: args.lookup_space.as_str(), + lookup_count, + sample_size, + sample_percent_of_batch, + sample_fraction: sample_size as f64 / run_config.num_keys as f64, + bloom_false_positive_rate_target_percent: args.bloom_false_positive_rate * 100.0, + bloom_seed: args.bloom_seed, + bloom_expected_items, + predictor_sampled_keys: 
predictor_stats.sampled_keys, + predictor_distinct_windows: predictor_stats.distinct_windows, + predictor_avg_sample_keys_per_window: predictor_stats.avg_sample_keys_per_window, + predictor_same_window_rate: predictor_stats.same_window_rate, + predictor_estimated_keys_per_window: predictor_stats.estimated_keys_per_window, + predictor_estimated_touched_windows: predictor_stats.estimated_touched_windows, + predictor_estimated_window_fill_ratio: predictor_stats.estimated_window_fill_ratio, + predictor_density_score: prediction.density_score, + predictor_build_score: prediction.build_score, + predictor_lookup_score: prediction.lookup_score, + predictor_memory_score: prediction.memory_score, + predicted_build_winner: prediction.build_winner.as_str(), + predicted_lookup_winner: prediction.lookup_winner.as_str(), + predicted_memory_winner: prediction.memory_winner.as_str(), + bloom_build_ns_per_element: bloom.build_ns_per_element, + roaring_build_ns_per_element: roaring.build_ns_per_element, + build_ratio_bloom_over_roaring: bloom.build_ns_per_element / roaring.build_ns_per_element, + actual_build_winner: build_actual.as_str(), + build_prediction_correct, + bloom_lookup_ns_per_element: bloom.lookup_ns_per_element, + bloom_lookup_hits: bloom.lookup_hits, + bloom_lookup_hit_rate_percent: bloom.lookup_hits as f64 / lookup_count as f64 * 100.0, + roaring_lookup_ns_per_element: roaring.lookup_ns_per_element, + roaring_lookup_hits: roaring.lookup_hits, + roaring_lookup_hit_rate_percent: roaring.lookup_hits as f64 / lookup_count as f64 * 100.0, + lookup_ratio_bloom_over_roaring: bloom.lookup_ns_per_element + / roaring.lookup_ns_per_element, + actual_lookup_winner: lookup_actual.as_str(), + lookup_prediction_correct, + bloom_bytes_used: bloom.bytes_used, + roaring_bytes_used: roaring.bytes_used, + memory_ratio_bloom_over_roaring: bloom.bytes_used as f64 / roaring.bytes_used as f64, + actual_memory_winner: memory_actual.as_str(), + memory_prediction_correct, + } +} + 
+#[derive(Debug, Clone)] +struct GeneratedBatch { + keys: Vec, + sample_seed: u64, +} + +impl GeneratedBatch { + fn new(keys: Vec, sample_seed: u64) -> Self { + Self { keys, sample_seed } + } + + fn keys(&self) -> &[u32] { + &self.keys + } +} + +/// Minimal trait matching the predictor sketch. +pub trait SampleKeys { + fn sample_keys(&self, n: usize) -> Vec; + fn key_count(&self) -> usize; +} + +impl SampleKeys for GeneratedBatch { + fn sample_keys(&self, n: usize) -> Vec { + if self.keys.is_empty() { + return Vec::new(); + } + if n >= self.keys.len() { + return self.keys.clone(); + } + + let mut rng = ChaCha8Rng::seed_from_u64(self.sample_seed); + let mut indexes = sample(&mut rng, self.keys.len(), n).into_vec(); + indexes.sort_unstable(); + indexes.into_iter().map(|index| self.keys[index]).collect() + } + + fn key_count(&self) -> usize { + self.keys.len() + } +} + +#[derive(Debug, Clone)] +pub struct RoaringSampleStats { + /// Number of keys in the batch. + pub batch_keys: usize, + + /// Number of sampled keys actually returned. + pub sampled_keys: usize, + + /// Fraction of the batch included in the sample. + pub sample_fraction: f64, + + /// Number of distinct 16-bit windows (containers) touched by the sample. + pub distinct_windows: usize, + + /// Average number of sampled keys per touched window. + pub avg_sample_keys_per_window: f64, + + /// Fraction of adjacent sampled keys that stay in the same 2^16 window. + pub same_window_rate: f64, + + /// Estimated number of real keys per touched 16-bit window after + /// rescaling by the sample fraction. + pub estimated_keys_per_window: f64, + + /// Estimated number of distinct 16-bit windows touched by the full batch. + pub estimated_touched_windows: f64, + + /// Estimated occupancy of a touched window, normalized by 2^16. + pub estimated_window_fill_ratio: f64, +} + +/// Estimate Roaring-friendly batch structure from a small sample of keys. +/// +/// The estimator deliberately works in two layers: +/// 1. 
Sample `n` keys from the batch. +/// 2. Sort and dedup them so adjacency and per-window counts are stable. +/// 3. Bucket sampled keys by their high 16 bits, which matches Roaring's +/// top-level `u32` container layout. +/// 4. Compute sample-level statistics such as: +/// - sampled keys +/// - distinct touched windows +/// - average sampled keys per touched window +/// - adjacent-key same-window rate +/// 5. Rescale the sampled keys/window estimate by the sample fraction so large +/// dense batches do not look artificially sparse just because only a small +/// fraction of the batch was sampled. +/// 6. Estimate the full-batch touched-window count by combining: +/// - a uniform occupancy estimate, which works well when keys are spread +/// fairly evenly across windows +/// - a damped Chao1 correction, which helps when the sample is dominated by +/// singleton windows and the uniform estimate would under-count unseen +/// windows in sparse, wide distributions +/// 7. Derive the normalized window fill ratio from the estimated keys/window. +/// +/// Example: +/// - Suppose the batch contains `10_000` keys and we sample `1_000`. +/// - After sorting and deduping we still have `1_000` sampled keys, so the +/// sample fraction is `0.1`. +/// - If those sampled keys touch `50` distinct 16-bit windows, then the sample +/// average is `1_000 / 50 = 20` sampled keys per touched window. +/// - Rescaling by the sample fraction gives an estimated +/// `20 / 0.1 = 200` real keys per touched window. +/// - If most sampled windows are singletons, the Chao1-style correction will +/// push the touched-window estimate above the uniform estimate because the +/// sample is likely missing many windows entirely. +/// - If the sample instead shows repeated hits in the same windows, the uniform +/// estimate tends to dominate and the batch looks more Roaring-friendly. 
+pub fn estimate_roaring_sample_stats( + batch: &B, + n: usize, +) -> Option { + if n == 0 { + return None; + } + + let batch_keys = batch.key_count(); + if batch_keys == 0 { + return None; + } + + let mut keys = batch.sample_keys(n); + if keys.is_empty() { + return None; + } + + // Make adjacent-key and per-window statistics deterministic even if the + // caller samples in arbitrary order. + keys.sort_unstable(); + keys.dedup(); + + let sampled_keys = keys.len(); + if sampled_keys == 0 { + return None; + } + + let mut per_window: HashMap = HashMap::new(); + for &key in &keys { + let window = (key >> 16) as u16; + *per_window.entry(window).or_insert(0) += 1; + } + + let distinct_windows = per_window.len(); + let sample_fraction = sampled_keys as f64 / batch_keys as f64; + let avg_sample_keys_per_window = sampled_keys as f64 / distinct_windows as f64; + let same_window_rate = if sampled_keys > 1 { + (sampled_keys - distinct_windows) as f64 / (sampled_keys - 1) as f64 + } else { + 0.0 + }; + // The sampled average keys/window shrinks as batches get larger unless we + // scale it back up by the sample fraction. Without this rescaling, large + // but dense batches look artificially sparse and the predictor incorrectly + // drifts toward Bloom. + let estimated_keys_per_window = if sample_fraction > 0.0 { + (avg_sample_keys_per_window / sample_fraction).min(65_536.0) + } else { + 0.0 + }; + // Sparse, wide samples often show up as many singleton windows and very few + // doubletons. Those counts are exactly what the Chao1-style correction uses + // to estimate how many touched windows the sample likely missed entirely. 
+ let sample_singleton_windows = per_window.values().filter(|&&count| count == 1).count(); + let sample_doubleton_windows = per_window.values().filter(|&&count| count == 2).count(); + let estimated_touched_windows = estimate_touched_windows( + batch_keys, + sampled_keys, + distinct_windows, + sample_singleton_windows, + sample_doubleton_windows, + ); + let estimated_window_fill_ratio = estimated_keys_per_window / 65_536.0; + + Some(RoaringSampleStats { + batch_keys, + sampled_keys, + sample_fraction, + distinct_windows, + avg_sample_keys_per_window, + same_window_rate, + estimated_keys_per_window, + estimated_touched_windows, + estimated_window_fill_ratio, + }) +} + +/// Estimate how many distinct 16-bit Roaring windows the full batch touches. +/// +/// This function combines two signals: +/// 1. A uniform occupancy estimate that works well when touched windows are +/// fairly evenly populated. +/// 2. A Chao1-style unseen-window estimate that reacts when the sample is full +/// of singleton windows and therefore likely missing many windows entirely. +/// +/// Example: +/// - Suppose a batch of `10_000` keys is sampled down to `1_000` keys. +/// - The sample touches `50` distinct windows. +/// - If many of those `50` windows only appear once in the sample, that is a +/// hint that the sample is only seeing the tip of a much wider distribution. +/// - The uniform estimate might still say "roughly 70 windows total", while +/// Chao1 might say "closer to 200 windows total". +/// - We blend the two so sparse wide batches move upward, but not so far that +/// a little singleton noise completely dominates the estimate. +/// +/// This blend exists because the original uniform-only estimator was the main +/// reason the predictor failed on wide Gaussians: it under-counted touched +/// windows, which made random full-u32 Roaring lookups appear cheaper than +/// they really were. 
+fn estimate_touched_windows( + batch_keys: usize, + sampled_keys: usize, + distinct_windows: usize, + sample_singleton_windows: usize, + sample_doubleton_windows: usize, +) -> f64 { + if batch_keys == 0 || sampled_keys == 0 || distinct_windows == 0 { + return 0.0; + } + if sampled_keys >= batch_keys { + return distinct_windows as f64; + } + + let uniform_estimate = + estimate_uniform_touched_windows(batch_keys, sampled_keys, distinct_windows); + let chao1_estimate = estimate_chao1_touched_windows( + distinct_windows, + sample_singleton_windows, + sample_doubleton_windows, + ); + + // The original uniform estimate works well when occupancy is reasonably + // even, but it collapses badly on sparse wide Gaussians: it can turn a + // singleton-heavy sample into only ~1k touched windows, which then makes + // random full-u32 lookups look far more Roaring-friendly than they are. + // Blend in a damped Chao1 correction so unseen windows move the estimate in + // the right direction without letting Chao1 dominate every noisy sample. + arithmetic_blend( + uniform_estimate, + chao1_estimate, + TOUCHED_WINDOWS_CHAO1_DAMPING, + ) +} + +/// Estimate touched windows under a "roughly uniform occupancy" assumption. +/// +/// Intuition: +/// - Assume the full batch touches `W` windows and spreads keys across them +/// fairly evenly. +/// - Given the sample fraction, solve for the `W` that would yield the observed +/// sampled distinct-window count. +/// +/// Example: +/// - If a `10_000`-key batch is sampled at `10%`, and the sample sees `50` +/// distinct windows, this function asks: +/// "For what total window count would a 10% sample be expected to see about +/// 50 windows?" +/// - It binary-searches that answer between the sampled distinct count and the +/// theoretical maximum number of windows. +/// +/// This is the baseline estimator because it behaves sensibly on compact or +/// moderately regular distributions. 
It falls apart on sparse wide batches, +/// where many windows are touched so rarely that the sample never sees them. +fn estimate_uniform_touched_windows( + batch_keys: usize, + sampled_keys: usize, + distinct_windows: usize, +) -> f64 { + if batch_keys == 0 || sampled_keys == 0 || distinct_windows == 0 { + return 0.0; + } + if sampled_keys >= batch_keys { + return distinct_windows as f64; + } + + // This model assumes touched windows are roughly uniform and solves for the + // total window count that would yield the observed sampled distinct window + // count. It is a good baseline, but it systematically underestimates very + // sparse wide batches because those batches have many unseen windows. + let sample_fraction = sampled_keys as f64 / batch_keys as f64; + let mut low = distinct_windows as f64; + let mut high = batch_keys.min(U32_WINDOW_COUNT) as f64; + + if low >= high { + return low; + } + + let log_unseen = (-sample_fraction).ln_1p(); + for _ in 0..100 { + let mid = (low + high) * 0.5; + let avg_keys_per_window = batch_keys as f64 / mid; + let observed_windows = mid * (1.0 - (avg_keys_per_window * log_unseen).exp()); + + if observed_windows < distinct_windows as f64 { + low = mid; + } else { + high = mid; + } + } + + high +} + +/// Estimate touched windows with a Chao1-style unseen-species correction. +/// +/// Here the "species" are touched 16-bit windows: +/// - `distinct_windows` is how many windows the sample observed +/// - `sample_singleton_windows` counts windows seen exactly once +/// - `sample_doubleton_windows` counts windows seen exactly twice +/// +/// Example: +/// - If a sample touches `50` windows, with `35` singletons and `2` +/// doubletons, that pattern is strong evidence that many windows were missed +/// entirely. +/// - Chao1 turns that singleton-heavy shape into a larger touched-window +/// estimate than the uniform model would produce. 
+/// +/// Raw Chao1 is intentionally not used directly in the final predictor because +/// it can overreact when `f2` is tiny. We still compute it here because it is +/// the right directional correction for the sparse-wide failure mode. +fn estimate_chao1_touched_windows( + distinct_windows: usize, + sample_singleton_windows: usize, + sample_doubleton_windows: usize, +) -> f64 { + // Chao1 is a classic unseen-species estimator. Here the "species" are + // touched 16-bit windows, and singleton-heavy samples are evidence that the + // sample missed many windows entirely. + let chao1_estimate = if sample_doubleton_windows > 0 { + distinct_windows as f64 + + (sample_singleton_windows * sample_singleton_windows) as f64 + / (2.0 * sample_doubleton_windows as f64) + } else { + distinct_windows as f64 + + (sample_singleton_windows.saturating_mul(sample_singleton_windows.saturating_sub(1)) + / 2) as f64 + }; + + chao1_estimate + .max(distinct_windows as f64) + .min(U32_WINDOW_COUNT as f64) +} + +fn arithmetic_blend(current: f64, chao1: f64, alpha: f64) -> f64 { + // Raw Chao1 reacts strongly to singleton-heavy samples, which is useful for + // sparse wide batches but too aggressive to use directly. Blend it toward + // the previous uniform estimate so the predictor only partially trusts the + // unseen-window correction. 
+ current + alpha * (chao1 - current) +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +enum Winner { + Bloom, + Roaring, +} + +impl Winner { + fn as_str(self) -> &'static str { + match self { + Self::Bloom => "bloom", + Self::Roaring => "roaring", + } + } +} + +impl Display for Winner { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(Debug, Clone, Copy)] +struct PredictorOutput { + density_score: f64, + build_score: f64, + lookup_score: f64, + memory_score: f64, + build_winner: Winner, + lookup_winner: Winner, + memory_winner: Winner, +} + +/// Convert sampled structural estimates into coarse Bloom-vs-Roaring winners. +/// +/// The predictor intentionally uses different signals for different metrics: +/// - build: mostly "how many keys end up in each touched window?" +/// - memory: same question, but with a higher density threshold +/// - lookup: "how often does a random probe reach a touched window, and how +/// expensive is that container likely to be once it does?" +/// +/// Example: +/// - Suppose a batch looks dense after sampling, with many keys per touched +/// window and only a small touched-window fraction. That usually pushes all +/// three metrics toward Roaring. +/// - Suppose instead the batch is spread across a large fraction of the 16-bit +/// windows and each window only has a modest number of keys. That is the +/// "many sparse array containers" regime where lookup can flip toward Bloom. +/// +/// The lookup path is where most of the iterations happened: +/// - using only keys/window missed sparse-wide cases +/// - using touched windows without normalizing them was the wrong shape +/// - using a flat array penalty missed that `ArrayStore::contains()` gets +/// slower as array containers grow +/// +/// The current formula keeps the model simple while preserving those learned +/// corrections from the benchmark runs. 
/// Turns the sampled structural estimates into per-metric scores and
/// Bloom-vs-Roaring winners. A score of `>= 1.0` predicts Roaring for that
/// metric (see [`predicted_winner`]).
fn predict_filter_winner(stats: &RoaringSampleStats) -> PredictorOutput {
    let density_score = stats.estimated_window_fill_ratio;
    // Build and memory stay as simple density rules: if touched windows are
    // dense, Roaring tends to compress and build well; if they are sparse,
    // Bloom tends to be cheaper.
    let build_score =
        stats.estimated_keys_per_window / BUILD_ROARING_ESTIMATED_KEYS_PER_WINDOW_THRESHOLD;
    // For lookups we need more than density. Random u32 probes only pay inner
    // container cost when they land in a touched 16-bit window, so the touched
    // window estimate is normalized into a hit probability. If we omit this
    // term, the predictor cannot distinguish dense-in-a-few-windows batches
    // from equally dense batches spread across a large fraction of the domain.
    let lookup_window_probability =
        (stats.estimated_touched_windows / U32_WINDOW_COUNT as f64).clamp(0.0, 1.0);
    // roaring-rs switches between array and bitmap containers around 4096
    // elements. Bitmap containers are close to a constant-time bit test, but
    // array containers use binary search and get meaningfully slower as they
    // grow. Without this size-dependent array penalty, medium-N wide Gaussians
    // with many sparse array containers were still over-predicted as Roaring.
    let lookup_container_penalty =
        if stats.estimated_keys_per_window >= ROARING_BITMAP_CONTAINER_THRESHOLD {
            LOOKUP_ROARING_BITMAP_WINDOW_PROBABILITY_PENALTY
        } else {
            LOOKUP_ROARING_ARRAY_WINDOW_PROBABILITY_PENALTY_BASE
                + LOOKUP_ROARING_ARRAY_WINDOW_PROBABILITY_PENALTY_PER_LOG2_KEY
                    * (stats.estimated_keys_per_window + 1.0).log2()
        };
    // lookup_score >= 1.0 means the estimated Roaring lookup cost stays under
    // the current budget and we predict Roaring. The exact threshold is tuned
    // empirically from benchmark output; the important part is the shape above.
    let lookup_cost_proxy = lookup_window_probability * lookup_container_penalty;
    let lookup_score =
        LOOKUP_ROARING_WINDOW_PROBABILITY_THRESHOLD / lookup_cost_proxy.max(f64::MIN_POSITIVE);
    let memory_score =
        stats.estimated_keys_per_window / MEMORY_ROARING_ESTIMATED_KEYS_PER_WINDOW_THRESHOLD;

    PredictorOutput {
        density_score,
        build_score,
        lookup_score,
        memory_score,
        build_winner: predicted_winner(build_score),
        lookup_winner: predicted_winner(lookup_score),
        memory_winner: predicted_winner(memory_score),
    }
}

/// Maps a score to a winner: `>= 1.0` favors Roaring, otherwise Bloom.
fn predicted_winner(score: f64) -> Winner {
    if score >= 1.0 {
        Winner::Roaring
    } else {
        Winner::Bloom
    }
}

/// Timing and size results for one filter implementation in one run.
#[derive(Debug, Clone, Copy)]
struct Measurement {
    build_ns_per_element: f64,
    lookup_ns_per_element: f64,
    lookup_hits: u64,
    bytes_used: usize,
}

/// Builds a Bloom filter over `keys` and measures build time, lookup time,
/// hit count, and the filter's bit-storage footprint.
fn benchmark_bloom(
    keys: &[u32],
    lookup_count: u64,
    lookup_seed: u64,
    lookup_space: LookupSpace,
    bloom_expected_items: u64,
    bloom_false_positive_rate: f64,
    bloom_seed: u128,
) -> Measurement {
    let expected_items =
        usize::try_from(bloom_expected_items).expect("bloom expected items must fit in usize");
    // Clamp the sizing to a minimum so tiny batches still get a usable filter.
    let mut bloom = BloomFilter::with_false_pos(bloom_false_positive_rate)
        .seed(&bloom_seed)
        .expected_items(expected_items.max(MIN_BLOOM_EXPECTED_ITEMS as usize));

    let build_started = Instant::now();
    for &key in keys {
        bloom.insert(&key);
    }
    let build_elapsed = build_started.elapsed();

    let (lookup_elapsed, hits) =
        benchmark_lookup(keys, lookup_count, lookup_seed, lookup_space, |key| {
            bloom.contains(&key)
        });

    Measurement {
        build_ns_per_element: build_elapsed.as_nanos() as f64 / keys.len() as f64,
        lookup_ns_per_element: lookup_elapsed.as_nanos() as f64 / lookup_count as f64,
        lookup_hits: hits,
        // Only the bit array is counted, comparable to the serialized size
        // reported for Roaring below.
        bytes_used: size_of_val(bloom.as_slice()),
    }
}

/// Builds a Roaring bitmap over `keys` and measures build time, lookup time,
/// hit count, and serialized size.
fn benchmark_roaring(
    keys: &[u32],
    lookup_count: u64,
    lookup_seed: u64,
    lookup_space: LookupSpace,
) -> Measurement {
    let build_started = Instant::now();
    // Every generator in this benchmark produces strictly increasing keys
    // (consecutive, round-robin, and the projected random distributions), so
    // the fast sorted constructor applies.
    let mut bitmap = RoaringBitmap::from_sorted_iter(keys.iter().copied())
        .expect("sorted unique keys should build a roaring bitmap");
    let build_elapsed = build_started.elapsed();
    // Container optimization runs after the build clock stops: lookups and
    // serialized size benefit from it while the build metric does not pay for
    // it. The returned "did anything change" flag is irrelevant here.
    // NOTE(review): confirm excluding optimize() from build time is intended.
    let _ = bitmap.optimize();

    let (lookup_elapsed, hits) =
        benchmark_lookup(keys, lookup_count, lookup_seed, lookup_space, |key| {
            bitmap.contains(key)
        });

    Measurement {
        build_ns_per_element: build_elapsed.as_nanos() as f64 / keys.len() as f64,
        lookup_ns_per_element: lookup_elapsed.as_nanos() as f64 / lookup_count as f64,
        lookup_hits: hits,
        bytes_used: bitmap.serialized_size(),
    }
}

/// Runs `lookup_count` probes against `contains` and returns the elapsed
/// time together with the number of hits.
///
/// `Present` mode probes only keys known to be in the batch, visited in a
/// cheap pseudo-random order; `FullU32` mode probes uniform random `u32`s.
fn benchmark_lookup<F>(
    keys: &[u32],
    lookup_count: u64,
    lookup_seed: u64,
    lookup_space: LookupSpace,
    mut contains: F,
) -> (std::time::Duration, u64)
where
    F: FnMut(u32) -> bool,
{
    let lookup_started = Instant::now();
    let hits = match lookup_space {
        LookupSpace::Present => {
            // NOTE(review): if `lookup_count > keys.len()`, positions passed
            // to `index_at` exceed the permutation length; the modulo inside
            // wraps correctly in release builds but its
            // `debug_assert!(position < len)` would fire in debug builds —
            // confirm callers cap `lookup_count`.
            let lookup_permutation = AffinePermutation::random(keys.len() as u64, lookup_seed);
            let mut hits = 0u64;
            for index in 0..lookup_count {
                let key = keys[lookup_permutation.index_at(index) as usize];
                hits += u64::from(contains(key));
            }
            // Present-key probes must never miss; a miss would mean the
            // filter under test dropped a real key.
            assert_eq!(
                hits, lookup_count,
                "expected all present lookup keys to be present"
            );
            hits
        }
        LookupSpace::FullU32 => {
            let mut rng = ChaCha8Rng::seed_from_u64(lookup_seed);
            let mut hits = 0u64;
            for _ in 0..lookup_count {
                hits += u64::from(contains(rng.next_u32()));
            }
            hits
        }
    };
    (lookup_started.elapsed(), hits)
}

/// Lower measured value wins; exact ties go to Bloom.
fn actual_winner(bloom_value: f64, roaring_value: f64) -> Winner {
    if roaring_value < bloom_value {
        Winner::Roaring
    } else {
        Winner::Bloom
    }
}

/// One CSV output row per benchmark run. Field order defines the CSV column
/// order emitted by `csv::Writer`.
#[derive(Debug, Serialize)]
struct CsvRow {
    num_keys: u64,
    distribution: &'static str,
    distribution_param_name: &'static str,
    distribution_param_frac: Option<f64>,
    distribution_param_value: Option<f64>,
    gaussian_mean_frac: Option<f64>,
    gaussian_mean: Option<f64>,
    gaussian_stddev_frac: Option<f64>,
    gaussian_stddev: Option<f64>,
    repetition: usize,
    distribution_seed: u64,
+ sample_seed: u64, + lookup_seed: u64, + lookup_space: &'static str, + lookup_count: u64, + sample_size: usize, + sample_percent_of_batch: f64, + sample_fraction: f64, + bloom_false_positive_rate_target_percent: f64, + bloom_seed: u128, + bloom_expected_items: u64, + predictor_sampled_keys: usize, + predictor_distinct_windows: usize, + predictor_avg_sample_keys_per_window: f64, + predictor_same_window_rate: f64, + predictor_estimated_keys_per_window: f64, + predictor_estimated_touched_windows: f64, + predictor_estimated_window_fill_ratio: f64, + predictor_density_score: f64, + predictor_build_score: f64, + predictor_lookup_score: f64, + predictor_memory_score: f64, + predicted_build_winner: &'static str, + predicted_lookup_winner: &'static str, + predicted_memory_winner: &'static str, + bloom_build_ns_per_element: f64, + roaring_build_ns_per_element: f64, + build_ratio_bloom_over_roaring: f64, + actual_build_winner: &'static str, + build_prediction_correct: bool, + bloom_lookup_ns_per_element: f64, + bloom_lookup_hits: u64, + bloom_lookup_hit_rate_percent: f64, + roaring_lookup_ns_per_element: f64, + roaring_lookup_hits: u64, + roaring_lookup_hit_rate_percent: f64, + lookup_ratio_bloom_over_roaring: f64, + actual_lookup_winner: &'static str, + lookup_prediction_correct: bool, + bloom_bytes_used: usize, + roaring_bytes_used: usize, + memory_ratio_bloom_over_roaring: f64, + actual_memory_winner: &'static str, + memory_prediction_correct: bool, +} + +#[derive(Debug, Default)] +struct AccuracySummary { + runs: usize, + build_correct: usize, + lookup_correct: usize, + memory_correct: usize, +} + +fn summarize_accuracy(rows: &[CsvRow]) -> AccuracySummary { + let mut accuracy = AccuracySummary::default(); + + for row in rows { + accuracy.runs += 1; + accuracy.build_correct += usize::from(row.build_prediction_correct); + accuracy.lookup_correct += usize::from(row.lookup_prediction_correct); + accuracy.memory_correct += usize::from(row.memory_prediction_correct); + } + + 
accuracy +} + +fn print_summary(rows: &[CsvRow], accuracy: &AccuracySummary) { + let wrong_rows: Vec<&CsvRow> = rows + .iter() + .filter(|row| { + !row.build_prediction_correct + || !row.lookup_prediction_correct + || !row.memory_prediction_correct + }) + .collect(); + let wrong_metric_predictions = wrong_rows + .iter() + .map(|row| { + usize::from(!row.build_prediction_correct) + + usize::from(!row.lookup_prediction_correct) + + usize::from(!row.memory_prediction_correct) + }) + .sum::(); + + println!("summary.runs={}", accuracy.runs); + println!( + "accuracy.build={}/{}", + accuracy.build_correct, accuracy.runs + ); + println!( + "accuracy.lookup={}/{}", + accuracy.lookup_correct, accuracy.runs + ); + println!( + "accuracy.memory={}/{}", + accuracy.memory_correct, accuracy.runs + ); + println!("wrong_predictions.run_count={}", wrong_rows.len()); + println!( + "wrong_predictions.metric_count={}", + wrong_metric_predictions + ); + + for row in wrong_rows { + println!( + "wrong_prediction {} num_keys={} repetition={} sample_size={} sample_percent_of_batch={:.6}", + distribution_summary_fields(row), + row.num_keys, + row.repetition, + row.sample_size, + row.sample_percent_of_batch + ); + println!( + "wrong_prediction.predictor avg_sample_keys_per_window={:.6} same_window_rate={:.6} estimated_keys_per_window={:.6} estimated_touched_windows={:.6} estimated_window_fill_ratio={:.6}", + row.predictor_avg_sample_keys_per_window, + row.predictor_same_window_rate, + row.predictor_estimated_keys_per_window, + row.predictor_estimated_touched_windows, + row.predictor_estimated_window_fill_ratio + ); + + if !row.build_prediction_correct { + println!( + "wrong_prediction.build predicted={} actual={} score={:.6} bloom_over_roaring={:.6}", + row.predicted_build_winner, + row.actual_build_winner, + row.predictor_build_score, + row.build_ratio_bloom_over_roaring + ); + } + if !row.lookup_prediction_correct { + println!( + "wrong_prediction.lookup predicted={} actual={} score={:.6} 
bloom_over_roaring={:.6}", + row.predicted_lookup_winner, + row.actual_lookup_winner, + row.predictor_lookup_score, + row.lookup_ratio_bloom_over_roaring + ); + } + if !row.memory_prediction_correct { + println!( + "wrong_prediction.memory predicted={} actual={} score={:.6} bloom_over_roaring={:.6}", + row.predicted_memory_winner, + row.actual_memory_winner, + row.predictor_memory_score, + row.memory_ratio_bloom_over_roaring + ); + } + } +} + +fn print_run_report(row: &CsvRow) { + println!("distribution={}", row.distribution); + println!("distribution_param_name={}", row.distribution_param_name); + println!( + "distribution_param_frac={}", + option_f64(row.distribution_param_frac) + ); + println!( + "distribution_param_value={}", + option_f64(row.distribution_param_value) + ); + println!("num_keys={}", row.num_keys); + println!("gaussian_mean_frac={}", option_f64(row.gaussian_mean_frac)); + println!("gaussian_mean={}", option_f64(row.gaussian_mean)); + println!( + "gaussian_stddev_frac={}", + option_f64(row.gaussian_stddev_frac) + ); + println!("gaussian_stddev={}", option_f64(row.gaussian_stddev)); + println!("repetition={}", row.repetition); + println!("lookup_space={}", row.lookup_space); + println!("sample_size={}", row.sample_size); + println!("sample_percent_of_batch={:.6}", row.sample_percent_of_batch); + println!("lookup_count={}", row.lookup_count); + println!("predictor.sampled_keys={}", row.predictor_sampled_keys); + println!( + "predictor.distinct_windows={}", + row.predictor_distinct_windows + ); + println!( + "predictor.avg_sample_keys_per_window={:.6}", + row.predictor_avg_sample_keys_per_window + ); + println!( + "predictor.same_window_rate={:.6}", + row.predictor_same_window_rate + ); + println!( + "predictor.estimated_keys_per_window={:.6}", + row.predictor_estimated_keys_per_window + ); + println!( + "predictor.estimated_touched_windows={:.6}", + row.predictor_estimated_touched_windows + ); + println!( + 
"predictor.estimated_window_fill_ratio={:.6}", + row.predictor_estimated_window_fill_ratio + ); + println!("predictor.build_score={:.6}", row.predictor_build_score); + println!("predictor.lookup_score={:.6}", row.predictor_lookup_score); + println!("predictor.memory_score={:.6}", row.predictor_memory_score); + println!("predicted.build_winner={}", row.predicted_build_winner); + println!("predicted.lookup_winner={}", row.predicted_lookup_winner); + println!("predicted.memory_winner={}", row.predicted_memory_winner); + println!( + "bloom.build_ns_per_element={:.6}", + row.bloom_build_ns_per_element + ); + println!( + "roaring.build_ns_per_element={:.6}", + row.roaring_build_ns_per_element + ); + println!( + "build_ratio_bloom_over_roaring={:.6}", + row.build_ratio_bloom_over_roaring + ); + println!("actual.build_winner={}", row.actual_build_winner); + println!("build_prediction_correct={}", row.build_prediction_correct); + println!( + "bloom.lookup_ns_per_element={:.6}", + row.bloom_lookup_ns_per_element + ); + println!("bloom.lookup_hits={}", row.bloom_lookup_hits); + println!( + "bloom.lookup_hit_rate_percent={:.6}", + row.bloom_lookup_hit_rate_percent + ); + println!( + "roaring.lookup_ns_per_element={:.6}", + row.roaring_lookup_ns_per_element + ); + println!("roaring.lookup_hits={}", row.roaring_lookup_hits); + println!( + "roaring.lookup_hit_rate_percent={:.6}", + row.roaring_lookup_hit_rate_percent + ); + println!( + "lookup_ratio_bloom_over_roaring={:.6}", + row.lookup_ratio_bloom_over_roaring + ); + println!("actual.lookup_winner={}", row.actual_lookup_winner); + println!( + "lookup_prediction_correct={}", + row.lookup_prediction_correct + ); + println!("bloom.bytes_used={}", row.bloom_bytes_used); + println!("roaring.bytes_used={}", row.roaring_bytes_used); + println!( + "memory_ratio_bloom_over_roaring={:.6}", + row.memory_ratio_bloom_over_roaring + ); + println!("actual.memory_winner={}", row.actual_memory_winner); + println!( + 
"memory_prediction_correct={}", + row.memory_prediction_correct + ); + println!(); +} + +#[derive(Clone, Copy, Debug)] +struct AffinePermutation { + len: u64, + multiplier: u64, + offset: u64, +} + +impl AffinePermutation { + fn sequential(len: u64) -> Self { + Self { + len, + multiplier: 1, + offset: 0, + } + } + + fn random(len: u64, seed: u64) -> Self { + if len <= 1 { + return Self::sequential(len); + } + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let mut multiplier = (rng.next_u64() % len) | 1; + while gcd(multiplier, len) != 1 { + multiplier = (multiplier + 2) % len; + if multiplier == 0 { + multiplier = 1; + } + } + let offset = rng.next_u64() % len; + Self { + len, + multiplier, + offset, + } + } + + fn index_at(&self, position: u64) -> u64 { + debug_assert!(position < self.len); + (self + .multiplier + .wrapping_mul(position) + .wrapping_add(self.offset)) + % self.len + } +} + +fn gcd(mut lhs: u64, mut rhs: u64) -> u64 { + while rhs != 0 { + let next = lhs % rhs; + lhs = rhs; + rhs = next; + } + lhs +} + +fn generate_keys(num_keys: u64, distribution: DistributionSpec, seed: u64) -> Vec { + match distribution { + DistributionSpec::Gaussian(distribution) => { + generate_gaussian_keys(num_keys, distribution, seed) + } + DistributionSpec::Consecutive => generate_consecutive_keys(num_keys), + DistributionSpec::RoundRobinWindow => generate_round_robin_window_keys(num_keys), + DistributionSpec::Bimodal { stddev_frac } => { + generate_bimodal_keys(num_keys, stddev_frac, seed) + } + DistributionSpec::Exponential { scale_frac } => { + generate_exponential_keys(num_keys, scale_frac, seed) + } + } +} + +fn generate_gaussian_keys( + num_keys: u64, + distribution: GaussianDistribution, + seed: u64, +) -> Vec { + let len = usize::try_from(num_keys).expect("num_keys must fit in usize"); + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let normal = Normal::new(distribution.mean_value(), distribution.stddev_value()) + .expect("gaussian distribution should have a 
positive standard deviation"); + let mut keys = Vec::with_capacity(len); + + for _ in 0..num_keys { + let sampled = normal.sample(&mut rng).round(); + keys.push(sampled.clamp(0.0, u32::MAX as f64) as u32); + } + + keys.sort_unstable(); + project_sorted_unique_u32_domain(&mut keys); + keys +} + +fn generate_consecutive_keys(num_keys: u64) -> Vec { + let len = usize::try_from(num_keys).expect("num_keys must fit in usize"); + (0..len) + .map(|index| u32::try_from(index).expect("consecutive key exceeded u32 domain")) + .collect() +} + +fn generate_round_robin_window_keys(num_keys: u64) -> Vec { + let len = usize::try_from(num_keys).expect("num_keys must fit in usize"); + let mut keys = Vec::with_capacity(len); + let full_layers = num_keys / U32_WINDOW_COUNT as u64; + let partial_windows = num_keys % U32_WINDOW_COUNT as u64; + + for window in 0..U32_WINDOW_COUNT as u64 { + let keys_in_window = full_layers + u64::from(window < partial_windows); + let window_base = window << 16; + for low in 0..keys_in_window { + keys.push( + u32::try_from(window_base + low).expect("round-robin key exceeded u32 domain"), + ); + } + } + + debug_assert_eq!(keys.len(), len); + keys +} + +fn generate_bimodal_keys(num_keys: u64, stddev_frac: f64, seed: u64) -> Vec { + let len = usize::try_from(num_keys).expect("num_keys must fit in usize"); + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let left = Normal::new( + BIMODAL_LEFT_PEAK_FRAC * u32::MAX as f64, + stddev_frac * u32::MAX as f64, + ) + .expect("bimodal distribution should have a positive standard deviation"); + let right = Normal::new( + BIMODAL_RIGHT_PEAK_FRAC * u32::MAX as f64, + stddev_frac * u32::MAX as f64, + ) + .expect("bimodal distribution should have a positive standard deviation"); + let mut keys = Vec::with_capacity(len); + + for _ in 0..num_keys { + let sampled = if rng.next_u32() & 1 == 0 { + left.sample(&mut rng) + } else { + right.sample(&mut rng) + } + .round(); + keys.push(sampled.clamp(0.0, u32::MAX as f64) as u32); 
+ } + + keys.sort_unstable(); + project_sorted_unique_u32_domain(&mut keys); + keys +} + +fn generate_exponential_keys(num_keys: u64, scale_frac: f64, seed: u64) -> Vec { + let len = usize::try_from(num_keys).expect("num_keys must fit in usize"); + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let scale = (scale_frac * u32::MAX as f64).max(f64::MIN_POSITIVE); + let distribution = + Exp::new(1.0 / scale).expect("exponential distribution should have a positive scale"); + let mut keys = Vec::with_capacity(len); + + for _ in 0..num_keys { + let sampled = distribution.sample(&mut rng).round(); + keys.push(sampled.clamp(0.0, u32::MAX as f64) as u32); + } + + keys.sort_unstable(); + project_sorted_unique_u32_domain(&mut keys); + keys +} + +fn default_sample_size(num_keys: u64) -> usize { + sample_count_from_percent(num_keys, DEFAULT_SAMPLE_PERCENT, DEFAULT_MIN_SAMPLE_SIZE) +} + +fn parse_u64_csv(csv: &str) -> Vec { + let mut out: Vec = csv + .split(',') + .filter(|entry| !entry.trim().is_empty()) + .map(|entry| { + parse_u64_token(entry.trim()) + .unwrap_or_else(|error| panic!("invalid u64 in CSV: {entry} ({error})")) + }) + .collect(); + out.sort_unstable(); + out.dedup(); + out +} + +fn parse_f64_csv(csv: &str, flag_name: &str) -> Vec { + let mut out: Vec = csv + .split(',') + .filter(|entry| !entry.trim().is_empty()) + .map(|entry| { + entry + .trim() + .parse::() + .unwrap_or_else(|error| panic!("invalid f64 in {flag_name}: {entry} ({error})")) + }) + .collect(); + out.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).expect("NaN was already rejected")); + out.dedup(); + out +} + +fn parse_distribution_csv(csv: &str) -> Vec { + let mut out = Vec::new(); + + for token in csv.split(',').filter(|entry| !entry.trim().is_empty()) { + let normalized = token.trim().replace('_', "-"); + let distribution = DistributionKind::from_str(&normalized, true).unwrap_or_else(|error| { + panic!("invalid distribution in --distributions: {token} ({error})") + }); + if 
!out.contains(&distribution) { + out.push(distribution); + } + } + + out +} + +fn parse_u64_token(token: &str) -> Result { + match token { + "u32::MAX" | "u32_max" | "max_u32" => Ok(u32::MAX as u64), + _ => token + .replace('_', "") + .parse::() + .map_err(|error| error.to_string()), + } +} + +fn project_sorted_unique_u32_domain(keys: &mut [u32]) { + if keys.is_empty() { + return; + } + + for (index, key) in keys.iter_mut().enumerate() { + let min_key = u32::try_from(index).expect("key count exceeded u32 domain"); + if *key < min_key { + *key = min_key; + } + } + + for index in (0..keys.len()).rev() { + let tail = keys.len() - 1 - index; + let max_key = u32::MAX + .checked_sub(u32::try_from(tail).expect("key count exceeded u32 domain")) + .expect("tail adjustment underflowed"); + if keys[index] > max_key { + keys[index] = max_key; + } + if index + 1 < keys.len() && keys[index] >= keys[index + 1] { + keys[index] = keys[index + 1] - 1; + } + } + + debug_assert!(keys.windows(2).all(|window| window[0] < window[1])); +} + +fn option_u64(value: Option) -> String { + value + .map(|value| value.to_string()) + .unwrap_or_else(|| "auto".to_string()) +} + +fn option_f64(value: Option) -> String { + value + .map(|value| value.to_string()) + .unwrap_or_else(|| "auto".to_string()) +} + +fn distribution_summary_fields(row: &CsvRow) -> String { + let mut fields = format!("distribution={}", row.distribution); + if let Some(gaussian_mean_frac) = row.gaussian_mean_frac { + fields.push_str(&format!(" gaussian_mean_frac={gaussian_mean_frac}")); + } + if let Some(distribution_param_frac) = row.distribution_param_frac { + fields.push_str(&format!( + " {}={distribution_param_frac}", + row.distribution_param_name + )); + } + fields +} + +fn sample_count_from_percent(num_keys: u64, sample_percent: f64, min_sample_size: usize) -> usize { + let scaled = ((num_keys as f64) * (sample_percent / 100.0)).ceil() as u64; + let sample_size = scaled.max(min_sample_size as u64).min(num_keys); + 
usize::try_from(sample_size).expect("sample size must fit in usize") +} diff --git a/crates/dbsp/src/circuit/metadata.rs b/crates/dbsp/src/circuit/metadata.rs index 5522bd42e96..d0c3001b97b 100644 --- a/crates/dbsp/src/circuit/metadata.rs +++ b/crates/dbsp/src/circuit/metadata.rs @@ -136,6 +136,14 @@ pub const BLOOM_FILTER_MISSES_COUNT: MetricId = pub const BLOOM_FILTER_HIT_RATE_PERCENT: MetricId = MetricId(Cow::Borrowed("bloom_filter_hit_rate_percent")); pub const BLOOM_FILTER_SIZE_BYTES: MetricId = MetricId(Cow::Borrowed("bloom_filter_size_bytes")); +pub const ROARING_FILTER_HITS_COUNT: MetricId = + MetricId(Cow::Borrowed("roaring_filter_hits_count")); +pub const ROARING_FILTER_MISSES_COUNT: MetricId = + MetricId(Cow::Borrowed("roaring_filter_misses_count")); +pub const ROARING_FILTER_HIT_RATE_PERCENT: MetricId = + MetricId(Cow::Borrowed("roaring_filter_hit_rate_percent")); +pub const ROARING_FILTER_SIZE_BYTES: MetricId = + MetricId(Cow::Borrowed("roaring_filter_size_bytes")); pub const RANGE_FILTER_HITS_COUNT: MetricId = MetricId(Cow::Borrowed("range_filter_hits_count")); pub const RANGE_FILTER_MISSES_COUNT: MetricId = MetricId(Cow::Borrowed("range_filter_misses_count")); @@ -167,7 +175,7 @@ pub const PREFIX_BATCHES_STATS: MetricId = MetricId(Cow::Borrowed("prefix_batche pub const INPUT_INTEGRAL_RECORDS_COUNT: MetricId = MetricId(Cow::Borrowed("input_integral_records_count")); -pub const CIRCUIT_METRICS: [CircuitMetric; 70] = [ +pub const CIRCUIT_METRICS: [CircuitMetric; 74] = [ // State CircuitMetric { name: USED_MEMORY_BYTES, @@ -269,7 +277,7 @@ pub const CIRCUIT_METRICS: [CircuitMetric; 70] = [ name: BLOOM_FILTER_BITS_PER_KEY, category: CircuitMetricCategory::State, advanced: false, - description: "Average number of bits per key in the Bloom filter.", + description: "Average number of bits per key across batches that use a Bloom filter.", }, CircuitMetric { name: BLOOM_FILTER_SIZE_BYTES, @@ -295,6 +303,30 @@ pub const CIRCUIT_METRICS: [CircuitMetric; 70] = [ 
advanced: false, description: "Hit rate of the Bloom filter.", }, + CircuitMetric { + name: ROARING_FILTER_SIZE_BYTES, + category: CircuitMetricCategory::State, + advanced: false, + description: "Size of the bitmap filter in bytes.", + }, + CircuitMetric { + name: ROARING_FILTER_HITS_COUNT, + category: CircuitMetricCategory::State, + advanced: false, + description: "The number of hits across all bitmap filters. The hits are summed across the bitmap filters for all batches in the spine.", + }, + CircuitMetric { + name: ROARING_FILTER_MISSES_COUNT, + category: CircuitMetricCategory::State, + advanced: false, + description: "The number of misses across all bitmap filters. The misses are summed across the bitmap filters for all batches in the spine.", + }, + CircuitMetric { + name: ROARING_FILTER_HIT_RATE_PERCENT, + category: CircuitMetricCategory::State, + advanced: false, + description: "Hit rate of the bitmap filter.", + }, CircuitMetric { name: RANGE_FILTER_SIZE_BYTES, category: CircuitMetricCategory::State, diff --git a/crates/dbsp/src/dynamic/data.rs b/crates/dbsp/src/dynamic/data.rs index ce1645a1c27..f328ad90106 100644 --- a/crates/dbsp/src/dynamic/data.rs +++ b/crates/dbsp/src/dynamic/data.rs @@ -12,6 +12,7 @@ use crate::{ rkyv::SerializeDyn, }, hash::default_hash, + utils::SupportsRoaring, }; /// Defines the minimal set of operations that must be supported by @@ -19,7 +20,16 @@ use crate::{ /// /// This trait is object safe and can be invoked via dynamic dispatch. pub trait Data: - Comparable + Clonable + SerializeDyn + DeserializableDyn + Send + Sync + Debug + AsAny + SizeOf + Comparable + + Clonable + + SerializeDyn + + DeserializableDyn + + Send + + Sync + + Debug + + AsAny + + SizeOf + + SupportsRoaring { /// Compute a hash of the object using default hasher and seed. 
fn default_hash(&self) -> u64; diff --git a/crates/dbsp/src/storage.rs b/crates/dbsp/src/storage.rs index 25c5557567c..cb2fa45d11a 100644 --- a/crates/dbsp/src/storage.rs +++ b/crates/dbsp/src/storage.rs @@ -7,7 +7,6 @@ pub mod backend; pub mod buffer_cache; pub mod dirlock; pub mod file; -pub mod filter_stats; pub mod tracking_bloom_filter; use fdlimit::{Outcome::LimitRaised, raise_fd_limit}; diff --git a/crates/dbsp/src/storage/file.rs b/crates/dbsp/src/storage/file.rs index ec32a6b3729..e3abc88bfae 100644 --- a/crates/dbsp/src/storage/file.rs +++ b/crates/dbsp/src/storage/file.rs @@ -36,8 +36,10 @@ //! value and for sequential reads. It should be possible to disable indexing //! by data value for workloads that don't require it. //! -//! Layer files support approximate set membership query in `~O(1)` time using -//! [a filter block](format::FilterBlock). +//! Layer files support cheap key-membership tests using a per-batch filter +//! block. The default filter is Bloom-based; key types whose per-batch span +//! fits in `u32` can alternatively use an exact roaring bitmap filter by +//! storing keys relative to the batch minimum. //! //! Layer files should support 1 TB data size. //! @@ -98,6 +100,7 @@ use std::{ use std::{any::Any, sync::Arc}; use std::{fmt::Debug, ptr::NonNull}; +mod filter; pub mod format; mod item; pub mod reader; @@ -108,6 +111,10 @@ use crate::{ dynamic::{DataTrait, Erase, Factory, WithFactory}, storage::file::item::RefTup2Factory, }; +pub use filter::BatchKeyFilter; +pub use filter::FilterPlan; +pub use filter::TrackingRoaringBitmap; +pub use filter::{FilterKind, FilterStats, TrackingFilterStats}; pub use item::{ArchivedItem, Item, ItemFactory, WithItemFactory}; const BLOOM_FILTER_SEED: u128 = 42; @@ -577,9 +584,8 @@ impl Deserializer { pub fn new(version: u32) -> Self { // Proper error is returned in reader.rs, this is a sanity check. 
assert!( - version >= format::VERSION_NUMBER, - "Unable to read old (pre-v{}) checkpoint data on this feldera version, pipeline needs to backfilled to start.", - format::VERSION_NUMBER + version >= format::MIN_SUPPORTED_VERSION, + "Unable to read checkpoint data with unsupported old storage format version {version} on this feldera version.", ); Self { version, diff --git a/crates/dbsp/src/storage/file/filter.rs b/crates/dbsp/src/storage/file/filter.rs new file mode 100644 index 00000000000..2e3d71866e4 --- /dev/null +++ b/crates/dbsp/src/storage/file/filter.rs @@ -0,0 +1,266 @@ +mod bloom; +mod roaring; +mod stats; + +use crate::{ + Runtime, + dynamic::{DataTrait, DynVec}, + storage::tracking_bloom_filter::TrackingBloomFilter, + trace::{BatchReader, BatchReaderFactories, sample_keys_from_batches}, +}; +use dyn_clone::clone_box; +use rand::thread_rng; +use std::io; + +pub use roaring::TrackingRoaringBitmap; +pub(crate) use roaring::{ + FILTER_PLAN_MIN_SAMPLE_SIZE, FILTER_PLAN_SAMPLE_PERCENT, RoaringLookupSampleStats, +}; +pub use stats::{FilterKind, FilterStats, TrackingFilterStats}; + +/// In-memory representation of the per-batch key filter. +#[derive(Debug)] +pub enum BatchKeyFilter { + /// Probabilistic Bloom filter over key hashes. + Bloom(TrackingBloomFilter), + + /// Exact roaring bitmap for key types whose batch's range fits in `u32`. 
+ RoaringU32(TrackingRoaringBitmap), +} + +impl BatchKeyFilter { + pub(crate) fn new_bloom(estimated_keys: usize, bloom_false_positive_rate: f64) -> Self { + Self::Bloom(bloom::new_bloom_filter( + estimated_keys, + bloom_false_positive_rate, + )) + } + + pub(crate) fn new_roaring_u32(min: &K) -> Self + where + K: DataTrait + ?Sized, + { + Self::RoaringU32(TrackingRoaringBitmap::with_min(min)) + } + + pub(crate) fn deserialize_bloom(num_hashes: u32, data: Vec) -> Self { + Self::Bloom(bloom::deserialize_bloom_filter(num_hashes, data)) + } + + pub(crate) fn deserialize_roaring_u32(data: &[u8], min: &K) -> io::Result + where + K: DataTrait + ?Sized, + { + TrackingRoaringBitmap::deserialize_from(data, min).map(Self::RoaringU32) + } + + pub(crate) fn insert_key(&mut self, key: &K) + where + K: DataTrait + ?Sized, + { + match self { + Self::Bloom(filter) => { + filter.insert_hash(key.default_hash()); + } + Self::RoaringU32(filter) => { + filter.insert_key(key); + } + } + } + pub(crate) fn finalize(&mut self) { + match self { + Self::Bloom(_) => {} + Self::RoaringU32(filter) => filter.finalize(), + } + } +} + +/// Merge-time input used to choose the batch membership filter before writing. +/// +/// The writer must know upfront whether it is building Bloom or bitmap state, +/// because it cannot switch filters after the first key is written. The plan +/// therefore bundles: +/// - the merged batch bounds, which tell us whether min-offset roaring fits; +/// - a sampled subset of input keys, which lets us predict lookup behavior +/// when Bloom and roaring are both enabled. 
+pub struct FilterPlan +where + K: DataTrait + ?Sized, +{ + min: Box, + max: Box, + sampled_keys: Option>>, +} + +impl FilterPlan +where + K: DataTrait + ?Sized, +{ + fn sample_count_for_filter_plan(num_keys: usize) -> usize { + let scaled = ((num_keys as f64) * (FILTER_PLAN_SAMPLE_PERCENT / 100.0)).ceil() as usize; + scaled.max(FILTER_PLAN_MIN_SAMPLE_SIZE).min(num_keys) + } + + /// Builds a filter plan from the known minimum and maximum batch keys. + pub fn from_bounds(min: &K, max: &K) -> Self { + Self { + min: clone_box(min), + max: clone_box(max), + sampled_keys: None, + } + } + + #[cfg(test)] + pub(crate) fn with_sampled_keys(mut self, sampled_keys: Box>) -> Self { + self.sampled_keys = Some(sampled_keys); + self + } + + pub(crate) fn from_batches<'a, B, I>(batches: I) -> Option + where + B: BatchReader, + I: IntoIterator, + { + let batches: Vec<&'a B> = batches.into_iter().collect(); + let mut bounds: Option<(Box, Box)> = None; + for batch in &batches { + let (batch_min, batch_max) = batch.key_bounds()?; + match bounds.as_mut() { + Some((min, max)) => { + if batch_min < min.as_ref() { + *min = clone_box(batch_min); + } + if batch_max > max.as_ref() { + *max = clone_box(batch_max); + } + } + None => bounds = Some((clone_box(batch_min), clone_box(batch_max))), + } + } + + bounds.map(|(min, max)| { + let mut plan = Self { + min, + max, + sampled_keys: None, + }; + if plan.roaring_range_fits() { + plan.sampled_keys = Self::collect_sampled_keys_from_batches(&batches); + } + plan + }) + } + + fn collect_sampled_keys_from_batches(batches: &[&B]) -> Option>> + where + B: BatchReader, + { + let first_batch = batches.first()?; + let mut sampled_keys = first_batch.factories().keys_factory().default_box(); + let total_sample_size = batches + .iter() + .map(|batch| Self::sample_count_for_filter_plan(batch.key_count())) + .sum::(); + sampled_keys.reserve(total_sample_size); + + let mut rng = thread_rng(); + sample_keys_from_batches( + &first_batch.factories(), + batches, + 
&mut rng, + |batch| Self::sample_count_for_filter_plan(batch.key_count()), + sampled_keys.as_mut(), + ); + + (!sampled_keys.is_empty()).then_some(sampled_keys) + } + + fn roaring_range_fits(&self) -> bool { + self.min.supports_roaring32() && self.max.into_roaring_u32(self.min.as_data()).is_some() + } + + fn can_use_roaring(&self, enable_roaring: bool) -> bool { + enable_roaring && self.roaring_range_fits() + } + + fn predict_lookup_prefers_roaring(&self, estimated_keys: usize) -> bool { + let sampled_keys = match self.sampled_keys.as_ref() { + Some(sampled_keys) => sampled_keys, + None => return false, + }; + + let mut roaring_keys = Vec::with_capacity(sampled_keys.len()); + for index in 0..sampled_keys.len() { + let roaring_key = match sampled_keys + .index(index) + .into_roaring_u32(self.min.as_data()) + { + Some(roaring_key) => roaring_key, + None => return false, + }; + roaring_keys.push(roaring_key); + } + roaring_keys.sort_unstable(); + roaring_keys.dedup(); + + RoaringLookupSampleStats::from_sample(estimated_keys, &roaring_keys) + .map(|stats| stats.lookup_prefers_roaring()) + .unwrap_or(false) + } + + fn preferred_filter( + &self, + estimated_keys: usize, + enable_roaring: bool, + bloom_false_positive_rate: f64, + ) -> BatchKeyFilter { + if self.can_use_roaring(enable_roaring) + && self.predict_lookup_prefers_roaring(estimated_keys) + { + BatchKeyFilter::new_roaring_u32(self.min.as_ref()) + } else { + BatchKeyFilter::new_bloom(estimated_keys, bloom_false_positive_rate) + } + } + + /// Chooses the membership filter to build for a batch with `estimated_keys` + /// rows, using the enabled Bloom/roaring settings and an optional batch + /// bounds plan. 
+ pub fn decide_filter( + filter_plan: Option<&Self>, + estimated_keys: usize, + ) -> Option { + // Choose between Bloom, roaring, or no membership filter using the + // following rules: + // + // - If Bloom and roaring are both enabled, prefer roaring when the + // plan proves the batch range fits in `u32` and the sampled-key + // lookup predictor says roaring should beat Bloom. If sampling is + // unavailable or the predictor cannot run, fall back to Bloom. + // - If only Bloom is enabled, always build Bloom. + // - If only roaring is enabled, build roaring only when the plan + // proves the batch range fits in `u32`; otherwise build no + // membership filter. + // - If both are disabled, build no membership filter. + // + // The "no plan => no roaring" rule is intentional: without known + // batch bounds we cannot safely decide that min-offset roaring + // encoding will fit, and we do not allow switching filters after + // writing has started. + let enable_roaring = Runtime::with_dev_tweaks(|dev_tweaks| dev_tweaks.enable_roaring()); + let bloom_false_positive_rate = Runtime::with_dev_tweaks(|dev_tweaks| { + let rate = dev_tweaks.bloom_false_positive_rate(); + (rate > 0.0 && rate < 1.0).then_some(rate) + }); + match (bloom_false_positive_rate, filter_plan) { + (Some(rate), Some(filter_plan)) => { + Some(filter_plan.preferred_filter(estimated_keys, enable_roaring, rate)) + } + (Some(rate), None) => Some(BatchKeyFilter::new_bloom(estimated_keys, rate)), + (None, Some(filter_plan)) if filter_plan.can_use_roaring(enable_roaring) => { + Some(BatchKeyFilter::new_roaring_u32(filter_plan.min.as_ref())) + } + (None, _) => None, + } + } +} diff --git a/crates/dbsp/src/storage/file/filter/bloom.rs b/crates/dbsp/src/storage/file/filter/bloom.rs new file mode 100644 index 00000000000..698500a8784 --- /dev/null +++ b/crates/dbsp/src/storage/file/filter/bloom.rs @@ -0,0 +1,28 @@ +use crate::storage::tracking_bloom_filter::TrackingBloomFilter; +use fastbloom::BloomFilter; + +use 
super::super::BLOOM_FILTER_SEED; + +pub(super) fn new_bloom_filter( + estimated_keys: usize, + bloom_false_positive_rate: f64, +) -> TrackingBloomFilter { + TrackingBloomFilter::new( + BloomFilter::with_false_pos(bloom_false_positive_rate) + .seed(&BLOOM_FILTER_SEED) + .expected_items({ + // `.max(64)` works around a fastbloom bug that hangs when the + // expected number of items is zero (see + // ). + estimated_keys.max(64) + }), + ) +} + +pub(super) fn deserialize_bloom_filter(num_hashes: u32, data: Vec) -> TrackingBloomFilter { + TrackingBloomFilter::new( + BloomFilter::from_vec(data) + .seed(&BLOOM_FILTER_SEED) + .hashes(num_hashes), + ) +} diff --git a/crates/dbsp/src/storage/file/filter/roaring.rs b/crates/dbsp/src/storage/file/filter/roaring.rs new file mode 100644 index 00000000000..2c8ed0f65f3 --- /dev/null +++ b/crates/dbsp/src/storage/file/filter/roaring.rs @@ -0,0 +1,387 @@ +use crate::{ + dynamic::{DataTrait, DynData}, + storage::file::{FilterStats, TrackingFilterStats}, +}; +use dyn_clone::clone_box; +use roaring::RoaringBitmap; +use size_of::SizeOf; +use std::{collections::HashMap, io, mem::size_of_val}; + +/// Sample 0.1% of keys per batch when building a merge-time filter plan. +pub(crate) const FILTER_PLAN_SAMPLE_PERCENT: f64 = 0.1; +/// Never sample fewer than this many keys from a batch for the filter plan. +pub(crate) const FILTER_PLAN_MIN_SAMPLE_SIZE: usize = 1_024; + +/// Roaring bitmap wrapper that tracks hit/miss counts during membership probes. 
+#[derive(Debug)] +pub struct TrackingRoaringBitmap { + bitmap: RoaringBitmap, + min: Box, + tracking: TrackingFilterStats, +} + +impl TrackingRoaringBitmap { + pub(crate) fn new(bitmap: RoaringBitmap, min: &K) -> Self + where + K: DataTrait + ?Sized, + { + let mut filter = Self { + bitmap, + min: clone_box(min.as_data()), + tracking: TrackingFilterStats::new(0), + }; + filter.refresh_stats_size(); + filter + } + + pub(crate) fn with_min(min: &K) -> Self + where + K: DataTrait + ?Sized, + { + Self::new(RoaringBitmap::new(), min) + } + + pub(crate) fn insert(&mut self, value: u32) { + self.bitmap.insert(value); + } + + pub(crate) fn insert_key(&mut self, key: &K) + where + K: DataTrait + ?Sized, + { + self.insert(self.roaring_u32(key)); + } + + pub(crate) fn finalize(&mut self) { + self.bitmap.optimize(); + self.refresh_stats_size(); + } + + // Bloom filters allocate their backing bitset up front, so their tracked + // size is stable after construction. Roaring bitmaps grow as keys are + // inserted and can shrink again after `optimize()`, so refresh the tracked + // size once the batch is finalized instead of trying to maintain it on + // every insert. 
+ fn refresh_stats_size(&mut self) { + let min_size = self.min.size_of().total_bytes(); + self.tracking + .set_size_byte(size_of_val(&self.bitmap) + self.bitmap.serialized_size() + min_size); + } + + pub(crate) fn contains(&self, value: u32) -> bool { + let is_hit = self.bitmap.contains(value); + self.tracking.record(is_hit); + is_hit + } + + fn roaring_u32(&self, key: &K) -> u32 + where + K: DataTrait + ?Sized, + { + key.into_roaring_u32_checked(self.min.as_ref()) + } + + pub(crate) fn maybe_contains_key(&self, key: &K) -> bool + where + K: DataTrait + ?Sized, + { + self.contains(self.roaring_u32(key)) + } + + pub(crate) fn stats(&self) -> FilterStats { + self.tracking.stats() + } + + pub(crate) fn serialized_size(&self) -> usize { + self.bitmap.serialized_size() + } + + pub(crate) fn serialize_into(&self, writer: W) -> io::Result<()> { + self.bitmap.serialize_into(writer) + } + + pub(crate) fn deserialize_from(reader: R, min: &K) -> io::Result + where + R: io::Read, + K: DataTrait + ?Sized, + { + Ok(Self::new(RoaringBitmap::deserialize_from(reader)?, min)) + } +} + +/// Sample-derived summary of how a batch's key distribution maps onto +/// Roaring's container layout for lookup prediction. +/// +/// This exists because Roaring is not uniformly "better than Bloom": +/// - keys are first partitioned by their high 16 bits, so a `u32` domain is +/// split into `2^16` containers; +/// - within each touched container, roaring-rs keeps values in an array until +/// the container reaches about 4096 entries, then upgrades it to a bitmap; +/// - sparse batches therefore tend to pay binary-search costs in many small +/// array containers, while dense batches benefit from cheap bitmap probes. 
+/// +/// The predictor estimates those two things from a sample: +/// - how many 16-bit containers the batch likely touches +/// - how many keys each touched container likely holds +#[derive(Debug, Clone, Copy)] +pub(crate) struct RoaringLookupSampleStats { + // Estimated number of real keys per touched 16-bit window after rescaling + // the sampled keys/window by the sample fraction. + estimated_keys_per_window: f64, + // Estimated number of distinct 16-bit windows touched by the full batch. + estimated_touched_windows: f64, +} + +impl RoaringLookupSampleStats { + const ROARING_WINDOW_CAPACITY: f64 = 65_536.0; + const ROARING_BITMAP_CONTAINER_THRESHOLD: f64 = 4_096.0; + const LOOKUP_ROARING_WINDOW_PROBABILITY_THRESHOLD: f64 = 0.1; + const LOOKUP_ROARING_BITMAP_WINDOW_PROBABILITY_PENALTY: f64 = 0.1; + const LOOKUP_ROARING_ARRAY_WINDOW_PROBABILITY_PENALTY_BASE: f64 = 0.25; + const LOOKUP_ROARING_ARRAY_WINDOW_PROBABILITY_PENALTY_PER_LOG2_KEY: f64 = 0.15; + const TOUCHED_WINDOWS_CHAO1_DAMPING: f64 = 0.25; + const U32_WINDOW_COUNT: usize = 1 << 16; + + /// Estimate roaring friendliness for lookups from a small sample of keys. + /// + /// The estimator is based on `crates/dbsp/benches/filter_predictor.rs`: + /// 1. Bucket sampled keys by their high 16 bits, which matches Roaring's + /// top-level `u32` container layout. + /// 2. Rescale sampled keys/window by the sample fraction so large & dense + /// batches do not look artificially sparse. + /// 3. Estimate the full-batch touched-window count by combining a uniform + /// occupancy model with a Chao1 unseen-window correction. + /// + /// Example: + /// - If the batch has `1_000_000` keys and the sample contains `1_000`, + /// the sample fraction is `0.001` (`0.1%`). + /// - If those `1_000` sampled keys touch `50` windows, then the sampled + /// average is `20` keys/window and the rescaled estimate is + /// `20 / 0.001 = 20_000` real keys/window. 
+ /// - If many sampled windows are singletons, the Chao1 correction pushes + /// the touched-window estimate upward because the sample likely missed + /// many windows entirely. + pub(crate) fn from_sample(batch_keys: usize, sampled_keys: &[u32]) -> Option { + if batch_keys == 0 || sampled_keys.is_empty() { + return None; + } + + let sampled_key_count = sampled_keys.len(); + let mut per_window: HashMap = HashMap::new(); + for &key in sampled_keys { + let window = (key >> 16) as u16; + *per_window.entry(window).or_insert(0) += 1; + } + + let distinct_windows = per_window.len(); + if distinct_windows == 0 { + return None; + } + + let sample_fraction = sampled_key_count as f64 / batch_keys as f64; + if sample_fraction <= 0.0 { + return None; + } + + let avg_sample_keys_per_window = sampled_key_count as f64 / distinct_windows as f64; + // Without this rescaling, large but dense batches look artificially + // sparse and the predictor drifts toward Bloom. + let estimated_keys_per_window = + (avg_sample_keys_per_window / sample_fraction).min(Self::ROARING_WINDOW_CAPACITY); + // Sparse, wide samples often show up as many singleton windows and very + // few doubletons. Those are exactly the signals the Chao1 correction + // uses to estimate how many windows the sample likely missed entirely. + let sample_singleton_windows = per_window.values().filter(|&&count| count == 1).count(); + let sample_doubleton_windows = per_window.values().filter(|&&count| count == 2).count(); + let estimated_touched_windows = estimate_touched_windows( + batch_keys, + sampled_key_count, + distinct_windows, + sample_singleton_windows, + sample_doubleton_windows, + ); + + Some(Self { + estimated_keys_per_window, + estimated_touched_windows, + }) + } + + /// Predict whether lookup-heavy workloads should prefer Roaring. + /// + /// Random probes only pay container cost when they land in a touched + /// 16-bit window, so touched-window count is normalized into a + /// probability. 
Array containers get a size-dependent penalty because + /// `ArrayStore::contains()` gets slower as they grow, while bitmap + /// containers are treated as near-constant-time once the estimated + /// keys/window crosses Roaring's array-to-bitmap threshold. + pub(crate) fn lookup_prefers_roaring(&self) -> bool { + let lookup_window_probability = + (self.estimated_touched_windows / Self::U32_WINDOW_COUNT as f64).clamp(0.0, 1.0); + // roaring-rs switches between array and bitmap containers around 4096 + // elements. Bitmap containers are close to a constant-time bit test, + // but array containers use binary search and get meaningfully slower + // as they grow. + let lookup_container_penalty = + if self.estimated_keys_per_window >= Self::ROARING_BITMAP_CONTAINER_THRESHOLD { + Self::LOOKUP_ROARING_BITMAP_WINDOW_PROBABILITY_PENALTY + } else { + Self::LOOKUP_ROARING_ARRAY_WINDOW_PROBABILITY_PENALTY_BASE + + Self::LOOKUP_ROARING_ARRAY_WINDOW_PROBABILITY_PENALTY_PER_LOG2_KEY + * (self.estimated_keys_per_window + 1.0).log2() + }; + let lookup_cost_proxy = lookup_window_probability * lookup_container_penalty; + let lookup_score = Self::LOOKUP_ROARING_WINDOW_PROBABILITY_THRESHOLD + / lookup_cost_proxy.max(f64::MIN_POSITIVE); + lookup_score >= 1.0 + } +} + +/// Estimate how many distinct 16-bit Roaring windows the full batch touches. +/// +/// This combines: +/// 1. A uniform occupancy estimate that works well when windows are populated +/// fairly evenly. +/// 2. A Chao1-style unseen-window estimate that reacts when the sample is full +/// of singleton windows and is therefore likely missing many windows. +/// +/// The blend exists because just doing a uniform-only estimate under-counts +/// touched windows on sparse, wide distributions and makes random Roaring +/// lookups look cheaper than they are. 
+fn estimate_touched_windows( + batch_keys: usize, + sampled_keys: usize, + distinct_windows: usize, + sample_singleton_windows: usize, + sample_doubleton_windows: usize, +) -> f64 { + if batch_keys == 0 || sampled_keys == 0 || distinct_windows == 0 { + return 0.0; + } + if sampled_keys >= batch_keys { + return distinct_windows as f64; + } + + let uniform_estimate = + estimate_uniform_touched_windows(batch_keys, sampled_keys, distinct_windows); + let chao1_estimate = estimate_chao1_touched_windows( + distinct_windows, + sample_singleton_windows, + sample_doubleton_windows, + ); + blend_touched_window_estimates( + uniform_estimate, + chao1_estimate, + RoaringLookupSampleStats::TOUCHED_WINDOWS_CHAO1_DAMPING, + ) +} + +/// Estimate touched windows under a "roughly uniform occupancy" assumption. +/// +/// Intuition: +/// - assume the full batch touches `W` windows and spreads keys across them +/// fairly evenly; +/// - given the sample fraction, solve for the `W` that would yield the +/// observed sampled distinct-window count. +/// +/// This is the baseline estimate because it behaves sensibly on compact or +/// moderately regular distributions. It falls apart on sparse wide batches, +/// where many windows are touched so rarely that the sample never sees them. 
+fn estimate_uniform_touched_windows( + batch_keys: usize, + sampled_keys: usize, + distinct_windows: usize, +) -> f64 { + if batch_keys == 0 || sampled_keys == 0 || distinct_windows == 0 { + return 0.0; + } + if sampled_keys >= batch_keys { + return distinct_windows as f64; + } + + let sample_fraction = sampled_keys as f64 / batch_keys as f64; + let mut low = distinct_windows as f64; + let mut high = batch_keys.min(RoaringLookupSampleStats::U32_WINDOW_COUNT) as f64; + + if low >= high { + return low; + } + + let log_unseen = (-sample_fraction).ln_1p(); + for _ in 0..100 { + let mid = (low + high) * 0.5; + let avg_keys_per_window = batch_keys as f64 / mid; + let observed_windows = mid * (1.0 - (avg_keys_per_window * log_unseen).exp()); + + if observed_windows < distinct_windows as f64 { + low = mid; + } else { + high = mid; + } + } + + high +} + +/// Estimate touched windows with a Chao1-style unseen-species correction. +/// +/// Here the "species" are touched 16-bit windows: +/// - `distinct_windows` is how many windows the sample observed +/// - `sample_singleton_windows` counts windows seen exactly once +/// - `sample_doubleton_windows` counts windows seen exactly twice +/// +/// Raw Chao1 is intentionally not used directly in the final predictor because +/// it can overreact when `f2` is tiny. We still compute it because it pushes +/// the estimate upward in the cases we care about here: batches that touch +/// many 16-bit windows, but only a few sampled keys land in each window. 
+fn estimate_chao1_touched_windows( + distinct_windows: usize, + sample_singleton_windows: usize, + sample_doubleton_windows: usize, +) -> f64 { + let chao1_estimate = if sample_doubleton_windows > 0 { + distinct_windows as f64 + + (sample_singleton_windows * sample_singleton_windows) as f64 + / (2.0 * sample_doubleton_windows as f64) + } else { + distinct_windows as f64 + + (sample_singleton_windows.saturating_mul(sample_singleton_windows.saturating_sub(1)) + / 2) as f64 + }; + + chao1_estimate + .max(distinct_windows as f64) + .min(RoaringLookupSampleStats::U32_WINDOW_COUNT as f64) +} + +fn blend_touched_window_estimates(uniform_estimate: f64, chao1_estimate: f64, alpha: f64) -> f64 { + // Raw Chao1 reacts strongly to singleton-heavy samples, which is useful + // for sparse wide batches but too aggressive to use directly. Blend it + // toward the uniform estimate so the unseen-window correction only nudges + // the final estimate in the right direction. + uniform_estimate + alpha * (chao1_estimate - uniform_estimate) +} + +#[cfg(test)] +mod tests { + use super::TrackingRoaringBitmap; + use crate::storage::file::FilterStats; + + #[test] + fn tracking_roaring_bitmap_stats() { + let mut filter = TrackingRoaringBitmap::with_min((&0u32) as &crate::dynamic::DynData); + filter.insert(1); + filter.insert(3); + + assert!(filter.contains(1)); + assert!(!filter.contains(2)); + assert_eq!( + filter.stats(), + FilterStats { + size_byte: filter.stats().size_byte, + hits: 1, + misses: 1, + } + ); + } +} diff --git a/crates/dbsp/src/storage/filter_stats.rs b/crates/dbsp/src/storage/file/filter/stats.rs similarity index 87% rename from crates/dbsp/src/storage/filter_stats.rs rename to crates/dbsp/src/storage/file/filter/stats.rs index 54167363333..fbad894ee4c 100644 --- a/crates/dbsp/src/storage/filter_stats.rs +++ b/crates/dbsp/src/storage/file/filter/stats.rs @@ -1,6 +1,15 @@ use crossbeam::utils::CachePadded; use std::sync::atomic::{AtomicUsize, Ordering}; +#[derive(Clone, 
Copy, Debug, Default, Eq, PartialEq, Ord, PartialOrd)] +pub enum FilterKind { + #[default] + None, + Bloom, + Roaring, + Range, +} + /// Statistics about an in-memory key filter. /// /// The statistics implement addition such that they can be summed across @@ -59,6 +68,10 @@ impl TrackingFilterStats { } } + pub(crate) fn set_size_byte(&mut self, size_byte: usize) { + self.size_byte = size_byte; + } + /// Records the result of one filter probe. pub fn record(&self, is_hit: bool) { if is_hit { diff --git a/crates/dbsp/src/storage/file/format.rs b/crates/dbsp/src/storage/file/format.rs index b0b9cc66657..da7994c5542 100644 --- a/crates/dbsp/src/storage/file/format.rs +++ b/crates/dbsp/src/storage/file/format.rs @@ -75,12 +75,10 @@ //! //! Decompressing a compressed block yields the regular index or data block //! format starting with a [`BlockHeader`]. -use crate::storage::tracking_bloom_filter::TrackingBloomFilter; -use crate::storage::{buffer_cache::FBuf, file::BLOOM_FILTER_SEED}; +use crate::storage::buffer_cache::FBuf; use binrw::{BinRead, BinResult, BinWrite, Error as BinError, binrw, binwrite}; #[cfg(doc)] use crc32c; -use fastbloom::BloomFilter; use num_derive::FromPrimitive; use num_traits::FromPrimitive; use size_of::SizeOf; @@ -92,11 +90,15 @@ use size_of::SizeOf; /// - v3: Bloom filter format change. /// - v4: Tup None optimizations. /// - v5: Change in representation for Timestamp, ShortInterval +/// - v6: Roaring bitmap filter blocks. /// /// When a new version is created, make sure to generate new golden /// files for it in crate `storage-test-compat` to check for /// backwards compatibility. -pub const VERSION_NUMBER: u32 = 5; +pub const VERSION_NUMBER: u32 = 6; + +/// Oldest layer file format version this binary can read. +pub const MIN_SUPPORTED_VERSION: u32 = 5; /// Magic number for data blocks. 
pub const DATA_BLOCK_MAGIC: [u8; 4] = *b"LFDB"; @@ -107,8 +109,11 @@ pub const INDEX_BLOCK_MAGIC: [u8; 4] = *b"LFIB"; /// Magic number for the file trailer block. pub const FILE_TRAILER_BLOCK_MAGIC: [u8; 4] = *b"LFFT"; -/// Magic number for filter blocks. -pub const FILTER_BLOCK_MAGIC: [u8; 4] = *b"LFFB"; +/// Magic number for Bloom filter blocks. +pub const BLOOM_FILTER_BLOCK_MAGIC: [u8; 4] = *b"LFFB"; + +/// Magic number for roaring bitmap filter blocks. +pub const ROARING_BITMAP_FILTER_BLOCK_MAGIC: [u8; 4] = *b"LFFR"; /// 8-byte header at the beginning of each block. /// @@ -171,13 +176,13 @@ pub struct FileTrailer { #[br(count = n_columns)] pub columns: Vec, - /// File offset in bytes of the [FilterBlock]. + /// File offset in bytes of the filter block. /// /// This is 0 if there is no filter block, or if the filter block size is /// bigger than `i32::MAX`. pub filter_offset: u64, - /// Size in bytes of the [FilterBlock]. + /// Size in bytes of the filter block. /// /// This is 0 if there is no filter block, or if the filter block size is /// bigger than `i32::MAX`. @@ -205,7 +210,7 @@ pub struct FileTrailer { /// future expansion. pub incompatible_features: u64, - /// File offset in bytes of the [FilterBlock]. + /// File offset in bytes of the filter block. /// /// This is 0 if there is no filter block, or if the filter block size is /// less than `i32::MAX`. If this is nonzero, then @@ -213,7 +218,7 @@ pub struct FileTrailer { /// [FileTrailer::compatible_features]. pub filter_offset64: u64, - /// Size in bytes of the [FilterBlock]. + /// Size in bytes of the filter block. /// /// This is 0 if there is no filter block, or if the filter block size is /// less than `i32::MAX`. If this is nonzero, then @@ -243,6 +248,17 @@ impl FileTrailer { (self.compatible_features & feature) != 0 } + /// Returns the unknown incompatible features, if any. 
+ pub fn unknown_incompatible_features(&self) -> Option { + let unknown_incompatible_features = + self.incompatible_features & !INCOMPATIBLE_FEATURE_ROARING_FILTERS; + if unknown_incompatible_features != 0 { + Some(unknown_incompatible_features) + } else { + None + } + } + /// Returns true if this file trailer has a 64-bit filter. pub fn has_filter64(&self) -> bool { self.has_compatible_feature(COMPATIBLE_FEATURE_FILTER64) @@ -259,6 +275,10 @@ pub const COMPATIBLE_FEATURE_FILTER64: u64 = 1 << 0; /// deserialized as if its value is 0. Conversely, old readers will simply ignore the field. pub const COMPATIBLE_FEATURE_NEGATIVE_WEIGHT_COUNT: u64 = 1 << 1; +/// Bit set to 1 in [FileTrailer::incompatible_features] if the file contains +/// roaring bitmap membership filter blocks. +pub const INCOMPATIBLE_FEATURE_ROARING_FILTERS: u64 = 1 << 0; + /// Information about a column. /// /// Embedded inside the [`FileTrailer`] block. @@ -555,12 +575,15 @@ impl Compression { /// /// The Bloom filter contains a member for each key in column 0. #[binrw] -pub struct FilterBlock { +pub struct BloomFilterBlock { /// Block header with "LFFB" magic. - #[brw(assert(header.magic == FILTER_BLOCK_MAGIC, "filter block has bad magic"))] + #[brw(assert( + header.magic == BLOOM_FILTER_BLOCK_MAGIC, + "bloom filter block has bad magic" + ))] pub header: BlockHeader, - /// [BloomFilter::num_hashes]. + /// Number of hashes used by the Bloom filter. pub num_hashes: u32, /// Number of elements in `data`. @@ -572,24 +595,17 @@ pub struct FilterBlock { pub data: Vec, } -impl From for TrackingBloomFilter { - fn from(block: FilterBlock) -> Self { - TrackingBloomFilter::new( - BloomFilter::from_vec(block.data) - .seed(&BLOOM_FILTER_SEED) - .hashes(block.num_hashes), - ) - } -} - /// A block representing a Bloom filter (with data by reference). #[binwrite] -pub struct FilterBlockRef<'a> { +pub struct BloomFilterBlockRef<'a> { /// Block header with "LFFB" magic. 
- #[bw(assert(header.magic == FILTER_BLOCK_MAGIC, "filter block has bad magic"))] + #[bw(assert( + header.magic == BLOOM_FILTER_BLOCK_MAGIC, + "bloom filter block has bad magic" + ))] pub header: BlockHeader, - /// [BloomFilter::num_hashes]. + /// Number of hashes used by the Bloom filter. pub num_hashes: u32, /// Number of elements in `data`. @@ -600,12 +616,39 @@ pub struct FilterBlockRef<'a> { pub data: &'a [u64], } -impl<'a> From<&'a TrackingBloomFilter> for FilterBlockRef<'a> { - fn from(value: &'a TrackingBloomFilter) -> Self { - FilterBlockRef { - header: BlockHeader::new(&FILTER_BLOCK_MAGIC), - num_hashes: value.num_hashes(), - data: value.as_slice(), - } - } +/// A block representing a roaring bitmap filter. +#[binrw] +pub struct RoaringBitmapFilterBlock { + /// Block header with "LFFR" magic. + #[brw(assert( + header.magic == ROARING_BITMAP_FILTER_BLOCK_MAGIC, + "roaring filter block has bad magic" + ))] + pub header: BlockHeader, + + /// Number of bytes in `data`. + #[bw(try_calc(u64::try_from(data.len())))] + pub len: u64, + + /// Serialized roaring bitmap contents. + #[br(count = len)] + pub data: Vec, +} + +/// A block representing a roaring bitmap filter (with data by reference). +#[binwrite] +pub struct RoaringBitmapFilterBlockRef<'a> { + /// Block header with "LFFR" magic. + #[bw(assert( + header.magic == ROARING_BITMAP_FILTER_BLOCK_MAGIC, + "roaring filter block has bad magic" + ))] + pub header: BlockHeader, + + /// Number of bytes in `data`. + #[bw(try_calc(u64::try_from(data.len())))] + pub len: u64, + + /// Serialized roaring bitmap contents. + pub data: &'a [u8], } diff --git a/crates/dbsp/src/storage/file/reader.rs b/crates/dbsp/src/storage/file/reader.rs index edaba67ee38..65746c1ded7 100644 --- a/crates/dbsp/src/storage/file/reader.rs +++ b/crates/dbsp/src/storage/file/reader.rs @@ -2,22 +2,22 @@ //! //! [`Reader`] is the top-level interface for reading layer files. 
-use super::format::{Compression, FileTrailer}; -use super::{AnyFactories, Deserializer, Factories}; +use super::format::{BloomFilterBlock, Compression, FileTrailer, RoaringBitmapFilterBlock}; +use super::{AnyFactories, BatchKeyFilter, Deserializer, Factories}; use crate::dynamic::{DynVec, WeightTrait}; use crate::storage::buffer_cache::CacheAccess; -use crate::storage::file::format::{BatchMetadata, FilterBlock}; -use crate::storage::tracking_bloom_filter::TrackingBloomFilter; use crate::storage::{ backend::StorageError, buffer_cache::{BufferCache, FBuf}, file::format::{ - DataBlockHeader, FileTrailerColumn, IndexBlockHeader, NodeType, VERSION_NUMBER, Varint, + BLOOM_FILTER_BLOCK_MAGIC, BatchMetadata, DataBlockHeader, FileTrailerColumn, + IndexBlockHeader, MIN_SUPPORTED_VERSION, NodeType, ROARING_BITMAP_FILTER_BLOCK_MAGIC, + Varint, }, file::item::ArchivedItem, }; use crate::{ - dynamic::{DataTrait, DeserializeDyn, Factory}, + dynamic::{DataTrait, DeserializeDyn, DynData, Factory}, storage::{ backend::{BlockLocation, FileReader, InvalidBlockLocation, StorageBackend}, buffer_cache::{AtomicCacheStats, CacheStats}, @@ -112,12 +112,14 @@ pub enum CorruptionError { }, /// Invalid version number in file trailer. - #[error("File has invalid version {version} (expected {expected_version})")] + #[error( + "File has unsupported storage format version {version}; supported versions are {min_supported_version} and newer" + )] InvalidVersion { /// Version in file. version: u32, - /// Expected version ([`VERSION_NUMBER`]). - expected_version: u32, + /// Minimum supported version. + min_supported_version: u32, }, /// Invalid version number in file trailer. @@ -327,6 +329,35 @@ pub enum CorruptionError { /// Invalid filter block location. #[error("Invalid file block location ({0}).")] InvalidFilterLocation(InvalidBlockLocation), + + /// Filter block payload could not be decoded. 
+ #[error("Invalid {kind} filter encoding in block ({location}): {inner}")] + InvalidFilterEncoding { + /// Block location. + location: BlockLocation, + /// Filter kind. + kind: &'static str, + /// Underlying parse error. + inner: String, + }, + + /// Roaring bitmap filter block payload could not be decoded. + #[error("Invalid roaring bitmap filter encoding in block ({location}): {inner}")] + InvalidRoaringBitmapFilterEncoding { + /// Block location. + location: BlockLocation, + /// Underlying parse error. + inner: String, + }, + + /// Filter block magic is unknown. + #[error("Unknown filter block magic {magic:?} in block ({location}).")] + UnknownFilterBlockMagic { + /// Block location. + location: BlockLocation, + /// Unknown magic. + magic: [u8; 4], + }, } /// Reader for an array of [Varint]s in a storage file. @@ -1347,19 +1378,6 @@ struct Column { n_rows: u64, } -impl FilterBlock { - fn new(file_handle: &dyn FileReader, location: BlockLocation) -> Result { - let block = file_handle.read_block(location)?; - Self::read_le(&mut io::Cursor::new(block.as_slice())).map_err(|e| { - Error::Corruption(CorruptionError::Binrw { - location, - block_type: "filter", - inner: e.to_string(), - }) - }) - } -} - impl Column { fn new(factories: &AnyFactories, info: &FileTrailerColumn) -> Result { let FileTrailerColumn { @@ -1509,6 +1527,67 @@ fn decompress( Ok(raw) } +fn parse_filter_block BinRead = ()>>( + block: &FBuf, + location: BlockLocation, + block_type: &'static str, +) -> Result { + T::read_le(&mut io::Cursor::new(block.as_slice())).map_err(|e| { + Error::Corruption(CorruptionError::Binrw { + location, + block_type, + inner: e.to_string(), + }) + }) +} + +fn read_filter_block( + file_handle: &dyn FileReader, + location: BlockLocation, + roaring_min: Option<&DynData>, +) -> Result { + let block = file_handle.read_block(location)?; + if block.len() < 8 { + return Err(Error::Corruption(CorruptionError::InvalidFilterEncoding { + location, + kind: "unknown", + inner: 
format!("block too short: {} bytes", block.len()), + })); + } + + let mut magic = [0u8; 4]; + magic.copy_from_slice(&block[4..8]); + + match magic { + BLOOM_FILTER_BLOCK_MAGIC => { + let block: BloomFilterBlock = parse_filter_block(&block, location, "bloom filter")?; + Ok(BatchKeyFilter::deserialize_bloom( + block.num_hashes, + block.data, + )) + } + ROARING_BITMAP_FILTER_BLOCK_MAGIC => { + let block: RoaringBitmapFilterBlock = + parse_filter_block(&block, location, "roaring bitmap filter")?; + let roaring_min = roaring_min.ok_or_else(|| { + Error::Corruption(CorruptionError::InvalidRoaringBitmapFilterEncoding { + location, + inner: "roaring bitmap filter requires the batch minimum".to_string(), + }) + })?; + BatchKeyFilter::deserialize_roaring_u32(&block.data, roaring_min).map_err(|e| { + Error::Corruption(CorruptionError::InvalidRoaringBitmapFilterEncoding { + location, + inner: e.to_string(), + }) + }) + } + magic => Err(Error::Corruption( + CorruptionError::UnknownFilterBlockMagic { location, magic }, + )), + } +} + /// Layer file column specification. /// /// A column specification must take the form `K0, A0, N0`, where `(K0, A0)` is @@ -1554,6 +1633,7 @@ where pub struct Reader { file: ImmutableFileRef, columns: Vec, + membership_filter_location: Option, /// Additional metadata added to the file by the writer. pub(crate) metadata: BatchMetadata, @@ -1591,8 +1671,8 @@ where factories: &[&AnyFactories], cache: fn() -> Option>, file: Arc, - membership_filter: Option, - ) -> Result<(Self, Option), Error> { + membership_filter: Option, + ) -> Result<(Self, Option), Error> { let file_size = file.get_size()?; if file_size < 512 || (file_size % 512) != 0 { return Err(CorruptionError::InvalidFileSize(file_size).into()); @@ -1606,12 +1686,10 @@ where &stats, )?; - // v4/v5 isn't backwards compatible. do not attempt to support - // older formats. 
- if file_trailer.version < VERSION_NUMBER { + if file_trailer.version < MIN_SUPPORTED_VERSION { return Err(CorruptionError::InvalidVersion { version: file_trailer.version, - expected_version: VERSION_NUMBER, + min_supported_version: MIN_SUPPORTED_VERSION, } .into()); } @@ -1623,11 +1701,8 @@ where ); } - if file_trailer.incompatible_features != 0 { - return Err(CorruptionError::UnsupportedIncompatibleFeatures( - file_trailer.incompatible_features, - ) - .into()); + if let Some(features) = file_trailer.unknown_incompatible_features() { + return Err(CorruptionError::UnsupportedIncompatibleFeatures(features).into()); } assert_eq!(factories.len(), file_trailer.columns.len()); @@ -1659,34 +1734,26 @@ where .into()); } } - - fn read_filter_block( - file_handle: &dyn FileReader, - offset: u64, - size: usize, - ) -> Result { - Ok(FilterBlock::new( - file_handle, - BlockLocation::new(offset, size).map_err(|error: InvalidBlockLocation| { + let membership_filter_location = if file_trailer.has_filter64() { + Some( + BlockLocation::new( + file_trailer.filter_offset64, + file_trailer.filter_size64 as usize, + ) + .map_err(|error: InvalidBlockLocation| { Error::Corruption(CorruptionError::InvalidFilterLocation(error)) })?, - )? - .into()) - } - let membership_filter = if let Some(membership_filter) = membership_filter { - Some(membership_filter) - } else if file_trailer.has_filter64() { - Some(read_filter_block( - &*file, - file_trailer.filter_offset64, - file_trailer.filter_size64 as usize, - )?) + ) } else if file_trailer.filter_offset != 0 { - Some(read_filter_block( - &*file, - file_trailer.filter_offset, - file_trailer.filter_size as usize, - )?) 
+ Some( + BlockLocation::new( + file_trailer.filter_offset, + file_trailer.filter_size as usize, + ) + .map_err(|error: InvalidBlockLocation| { + Error::Corruption(CorruptionError::InvalidFilterLocation(error)) + })?, + ) } else { None }; @@ -1701,6 +1768,7 @@ where file_trailer.version, ), columns, + membership_filter_location, metadata: file_trailer.metadata.clone(), _phantom: PhantomData, }, @@ -1723,15 +1791,6 @@ where Self::new(factories, cache, storage_backend.open(path)?) } - pub(crate) fn open_with_filter( - factories: &[&AnyFactories], - cache: fn() -> Option>, - storage_backend: &dyn StorageBackend, - path: &StoragePath, - ) -> Result<(Self, Option), Error> { - Self::new_with_filter(factories, cache, storage_backend.open(path)?, None) - } - /// The number of columns in the layer file. /// /// This is a fixed value for any given `Reader`. @@ -1780,6 +1839,15 @@ where pub fn metadata(&self) -> &BatchMetadata { &self.metadata } + + fn read_membership_filter( + &self, + roaring_min: Option<&DynData>, + ) -> Result, Error> { + self.membership_filter_location + .map(|location| read_filter_block(&*self.file.file_handle, location, roaring_min)) + .transpose() + } } impl Reader<(&'static K, &'static A, N)> @@ -1788,6 +1856,19 @@ where A: DataTrait + ?Sized, (&'static K, &'static A, N): ColumnSpec, { + pub(crate) fn open_with_filter( + factories: &[&AnyFactories], + cache: fn() -> Option>, + storage_backend: &dyn StorageBackend, + path: &StoragePath, + ) -> Result<(Self, Option), Error> { + let reader = Self::open(factories, cache, storage_backend, path)?; + let key_range = reader.key_range()?; + let roaring_min = key_range.as_ref().map(|(min, _)| min.as_ref().as_data()); + let membership_filter = reader.read_membership_filter(roaring_min)?; + Ok((reader, membership_filter)) + } + /// Returns the min and max keys stored in column 0. 
/// /// The bounds are loaded from the root node when first requested and can diff --git a/crates/dbsp/src/storage/file/test.rs b/crates/dbsp/src/storage/file/test.rs index ba77ef7ded9..062c5cd3ead 100644 --- a/crates/dbsp/src/storage/file/test.rs +++ b/crates/dbsp/src/storage/file/test.rs @@ -1,28 +1,29 @@ -use std::{marker::PhantomData, sync::Arc}; +use std::{io::Cursor, marker::PhantomData, sync::Arc}; use crate::{ DBWeight, dynamic::{DataTrait, DowncastTrait, DynWeight, Factory, LeanVec, Vector, WithFactory}, storage::{ - backend::StorageBackend, + backend::{BlockLocation, StorageBackend}, buffer_cache::BufferCache, file::{ - format::{BatchMetadata, Compression}, + format::{ + BLOOM_FILTER_BLOCK_MAGIC, BatchMetadata, Compression, FileTrailer, + ROARING_BITMAP_FILTER_BLOCK_MAGIC, + }, reader::{BulkRows, FilteredKeys, Reader}, }, }, trace::{ BatchReaderFactories, Builder, VecIndexedWSetFactories, VecWSetFactories, - ord::{ - batch_filter::BatchFilters, - vec::{indexed_wset_batch::VecIndexedWSetBuilder, wset_batch::VecWSetBuilder}, - }, + filter::BatchFilters, + ord::vec::{indexed_wset_batch::VecIndexedWSetBuilder, wset_batch::VecWSetBuilder}, }, - utils::test::init_test_logger, + utils::{Tup1, test::init_test_logger}, }; use super::{ - Factories, + Factories, FilterPlan, reader::{ColumnSpec, RowGroup}, writer::{Parameters, Writer1, Writer2}, }; @@ -31,6 +32,7 @@ use crate::{ DBData, dynamic::{DynData, Erase}, }; +use binrw::BinRead; use feldera_types::config::{StorageConfig, StorageOptions}; use rand::{Rng, seq::SliceRandom, thread_rng}; use tempfile::tempdir; @@ -712,6 +714,73 @@ fn test_key_range( assert_eq!(max.downcast_checked::(), &expected_max); } +fn filter_block_magic(reader: &Reader<(&'static K, &'static A, N)>) -> Option<[u8; 4]> +where + K: DataTrait + ?Sized, + A: DataTrait + ?Sized, + (&'static K, &'static A, N): ColumnSpec, +{ + let file_size = reader.byte_size().unwrap() as usize; + let trailer_block = reader + .file_handle() + 
.read_block(BlockLocation::new((file_size - 512) as u64, 512).unwrap()) + .unwrap(); + let trailer = FileTrailer::read_le(&mut Cursor::new(trailer_block.as_slice())).unwrap(); + let offset = if trailer.has_filter64() { + trailer.filter_offset64 + } else { + trailer.filter_offset + }; + let size = if trailer.has_filter64() { + trailer.filter_size64 as usize + } else { + trailer.filter_size as usize + }; + if offset == 0 { + return None; + } + + let filter_block = reader + .file_handle() + .read_block(BlockLocation::new(offset, size).unwrap()) + .unwrap(); + let mut magic = [0u8; 4]; + magic.copy_from_slice(&filter_block[4..8]); + Some(magic) +} + +fn incompatible_features(reader: &Reader<(&'static K, &'static A, N)>) -> u64 +where + K: DataTrait + ?Sized, + A: DataTrait + ?Sized, + (&'static K, &'static A, N): ColumnSpec, +{ + let file_size = reader.byte_size().unwrap() as usize; + let trailer_block = reader + .file_handle() + .read_block(BlockLocation::new((file_size - 512) as u64, 512).unwrap()) + .unwrap(); + let trailer = FileTrailer::read_le(&mut Cursor::new(trailer_block.as_slice())).unwrap(); + trailer.incompatible_features +} + +fn sampled_filter_plan( + factories: &Factories, + keys: &[K], +) -> FilterPlan +where + K: DBData + Erase, +{ + let mut sampled_keys = factories.keys_factory.default_box(); + sampled_keys.reserve(keys.len()); + for key in keys { + sampled_keys.push_ref(key.erase()); + } + + FilterPlan::from_bounds(keys.first().unwrap().erase(), keys.last().unwrap().erase()) + .with_sampled_keys(sampled_keys) +} + fn test_two_columns(parameters: Parameters) where T: TwoColumns, @@ -734,7 +803,7 @@ where test_buffer_cache, &*storage_backend, parameters, - T::n0(), + FilterPlan::::decide_filter(None, T::n0()), ) .unwrap(); let n0 = T::n0(); @@ -800,7 +869,7 @@ where test_buffer_cache, &*storage_backend, parameters, - T::n0(), + FilterPlan::::decide_filter(None, T::n0()), ) .unwrap(); let n0 = T::n0(); @@ -945,7 +1014,7 @@ where test_buffer_cache, 
&*storage_backend, parameters.clone(), - n, + FilterPlan::::decide_filter(None, n), ) .unwrap(); for row in 0..n { @@ -956,7 +1025,7 @@ where let (reader, filters) = if reopen { println!("closing writer and reopening as reader"); let path = writer.path().clone(); - let (_file_handle, _bloom_filter, _key_bounds) = + let (_file_handle, _key_filter, _key_bounds) = writer.close(BatchMetadata::default()).unwrap(); let (reader, membership_filter) = Reader::open_with_filter( &[&factories.any_factories()], @@ -1006,7 +1075,7 @@ fn test_one_column_zset( test_buffer_cache, &*storage_backend, parameters.clone(), - n, + FilterPlan::::decide_filter(None, n), ) .unwrap(); for row in 0..n { @@ -1017,7 +1086,7 @@ fn test_one_column_zset( let reader = if reopen { println!("closing writer and reopening as reader"); let path = writer.path().clone(); - let (_file_handle, _bloom_filter, _key_bounds) = + let (_file_handle, _key_filter, _key_bounds) = writer.close(BatchMetadata::default()).unwrap(); Reader::open( &[&factories.any_factories()], @@ -1063,7 +1132,7 @@ fn one_column_key_range() { test_buffer_cache, &*storage_backend, Parameters::default(), - keys.len(), + FilterPlan::::decide_filter(None, keys.len()), ) .unwrap(); for key in keys { @@ -1072,7 +1141,7 @@ fn one_column_key_range() { let reader = if reopen { let path = writer.path().clone(); - let (_file_handle, _bloom_filter, _key_bounds) = + let (_file_handle, _key_filter, _key_bounds) = writer.close(BatchMetadata::default()).unwrap(); Reader::open( &[&factories.any_factories()], @@ -1099,6 +1168,390 @@ fn one_column_key_range() { } } +#[test] +fn test_bloom_filter_roundtrip_and_block_kind() { + init_test_logger(); + + for reopen in [false, true] { + let factories = Factories::::new::(); + let tempdir = tempdir().unwrap(); + let storage_backend = ::new( + &StorageConfig { + path: tempdir.path().to_string_lossy().to_string(), + cache: Default::default(), + }, + &StorageOptions::default(), + ) + .unwrap(); + + let mut writer = 
Writer1::new( + &factories, + test_buffer_cache, + &*storage_backend, + Parameters::default(), + FilterPlan::::decide_filter(None, 3), + ) + .unwrap(); + for key in [1i64, 3, 7] { + writer.write0((&key, &())).unwrap(); + } + + let (reader, filters) = if reopen { + let path = writer.path().clone(); + let (_file_handle, _key_filter, _key_bounds) = + writer.close(BatchMetadata::default()).unwrap(); + let (reader, membership_filter) = Reader::open_with_filter( + &[&factories.any_factories()], + test_buffer_cache, + &*storage_backend, + &path, + ) + .unwrap(); + let key_range = reader.key_range().unwrap().map(Into::into); + let filters = BatchFilters::from_file(key_range, membership_filter); + (reader, filters) + } else { + writer.into_reader(BatchMetadata::default()).unwrap() + }; + + for key in [1i64, 3, 7] { + assert!(filters.maybe_contains_key(key.erase(), None)); + } + assert_eq!(filter_block_magic(&reader), Some(BLOOM_FILTER_BLOCK_MAGIC)); + assert_eq!(incompatible_features(&reader), 0); + } +} + +#[test] +fn test_roaring_u32_filter_roundtrip_exact_and_block_kind() { + init_test_logger(); + + for reopen in [false, true] { + let factories = Factories::::new::(); + let tempdir = tempdir().unwrap(); + let storage_backend = ::new( + &StorageConfig { + path: tempdir.path().to_string_lossy().to_string(), + cache: Default::default(), + }, + &StorageOptions::default(), + ) + .unwrap(); + + let filter_plan = sampled_filter_plan(&factories, &[1u32, 3, 7]); + let mut writer = Writer1::new( + &factories, + test_buffer_cache, + &*storage_backend, + Parameters::default(), + FilterPlan::decide_filter(Some(&filter_plan), 3), + ) + .unwrap(); + for key in [1u32, 3, 7] { + writer.write0((&key, &())).unwrap(); + } + + let (reader, filters) = if reopen { + let path = writer.path().clone(); + let (_file_handle, _key_filter, _key_bounds) = + writer.close(BatchMetadata::default()).unwrap(); + let (reader, membership_filter) = Reader::open_with_filter( + &[&factories.any_factories()], + 
test_buffer_cache, + &*storage_backend, + &path, + ) + .unwrap(); + let key_range = reader.key_range().unwrap().map(Into::into); + let filters = BatchFilters::from_file(key_range, membership_filter); + (reader, filters) + } else { + writer.into_reader(BatchMetadata::default()).unwrap() + }; + + for key in [1u32, 3, 7] { + assert!(filters.maybe_contains_key(key.erase(), None)); + } + for key in [0u32, 2, 9] { + assert!(!filters.maybe_contains_key(key.erase(), None)); + } + assert_eq!( + filter_block_magic(&reader), + Some(ROARING_BITMAP_FILTER_BLOCK_MAGIC) + ); + assert_ne!(incompatible_features(&reader), 0); + } +} + +#[test] +fn test_roaring_tup1_i32_filter_roundtrip_exact_and_block_kind() { + init_test_logger(); + + for reopen in [false, true] { + let factories = Factories::::new::, ()>(); + let tempdir = tempdir().unwrap(); + let storage_backend = ::new( + &StorageConfig { + path: tempdir.path().to_string_lossy().to_string(), + cache: Default::default(), + }, + &StorageOptions::default(), + ) + .unwrap(); + + let filter_plan = sampled_filter_plan(&factories, &[Tup1(-7i32), Tup1(1), Tup1(3)]); + let mut writer = Writer1::new( + &factories, + test_buffer_cache, + &*storage_backend, + Parameters::default(), + FilterPlan::decide_filter(Some(&filter_plan), 3), + ) + .unwrap(); + for key in [Tup1(-7i32), Tup1(1), Tup1(3)] { + writer.write0((&key, &())).unwrap(); + } + + let (reader, filters) = if reopen { + let path = writer.path().clone(); + let (_file_handle, _key_filter, _key_bounds) = + writer.close(BatchMetadata::default()).unwrap(); + let (reader, membership_filter) = Reader::open_with_filter( + &[&factories.any_factories()], + test_buffer_cache, + &*storage_backend, + &path, + ) + .unwrap(); + let key_range = reader.key_range().unwrap().map(Into::into); + let filters = BatchFilters::from_file(key_range, membership_filter); + (reader, filters) + } else { + writer.into_reader(BatchMetadata::default()).unwrap() + }; + + for key in [Tup1(-7i32), Tup1(1), Tup1(3)] { 
+ assert!(filters.maybe_contains_key(key.erase(), None)); + } + for key in [Tup1(-8i32), Tup1(0), Tup1(9)] { + assert!(!filters.maybe_contains_key(key.erase(), None)); + } + assert_eq!( + filter_block_magic(&reader), + Some(ROARING_BITMAP_FILTER_BLOCK_MAGIC) + ); + assert_ne!(incompatible_features(&reader), 0); + } +} + +#[test] +fn test_writer_without_filter_plan_uses_bloom_filter() { + init_test_logger(); + + let factories = Factories::::new::(); + let tempdir = tempdir().unwrap(); + let storage_backend = ::new( + &StorageConfig { + path: tempdir.path().to_string_lossy().to_string(), + cache: Default::default(), + }, + &StorageOptions::default(), + ) + .unwrap(); + + let mut writer = Writer1::new( + &factories, + test_buffer_cache, + &*storage_backend, + Parameters::default(), + FilterPlan::::decide_filter(None, 2), + ) + .unwrap(); + for key in [5u32, 8] { + writer.write0((&key, &())).unwrap(); + } + + let (reader, _filters) = writer.into_reader(BatchMetadata::default()).unwrap(); + assert_eq!(filter_block_magic(&reader), Some(BLOOM_FILTER_BLOCK_MAGIC)); +} + +#[test] +fn test_filter_plan_without_sample_falls_back_to_bloom() { + init_test_logger(); + + let filter_plan = FilterPlan::from_bounds((&1u32) as &DynData, (&7u32) as &DynData); + assert!(matches!( + FilterPlan::decide_filter(Some(&filter_plan), 3), + Some(super::BatchKeyFilter::Bloom(_)) + )); +} + +#[test] +fn test_filter_plan_predictor_prefers_roaring_for_dense_sample() { + init_test_logger(); + + let factories = Factories::::new::(); + let keys: Vec = (0..50_000).collect(); + let filter_plan = sampled_filter_plan(&factories, keys.as_slice()); + + assert!(matches!( + FilterPlan::decide_filter(Some(&filter_plan), keys.len()), + Some(super::BatchKeyFilter::RoaringU32(_)) + )); +} + +#[test] +fn test_filter_plan_predictor_prefers_bloom_for_sparse_wide_sample() { + init_test_logger(); + + let factories = Factories::::new::(); + let keys: Vec = (0..50_000).map(|index| index << 16).collect(); + let 
filter_plan = sampled_filter_plan(&factories, keys.as_slice()); + + assert!(matches!( + FilterPlan::decide_filter(Some(&filter_plan), keys.len()), + Some(super::BatchKeyFilter::Bloom(_)) + )); +} + +#[test] +fn test_roaring_i64_filter_roundtrip_uses_batch_min_offset() { + init_test_logger(); + + for reopen in [false, true] { + let factories = Factories::::new::(); + let tempdir = tempdir().unwrap(); + let storage_backend = ::new( + &StorageConfig { + path: tempdir.path().to_string_lossy().to_string(), + cache: Default::default(), + }, + &StorageOptions::default(), + ) + .unwrap(); + + let min = (i64::from(u32::MAX) * 4) + 10; + let keys = [min, min + 3, min + 7]; + let filter_plan = sampled_filter_plan(&factories, &keys); + let mut writer = Writer1::new( + &factories, + test_buffer_cache, + &*storage_backend, + Parameters::default(), + FilterPlan::decide_filter(Some(&filter_plan), keys.len()), + ) + .unwrap(); + for key in keys { + writer.write0((&key, &())).unwrap(); + } + + let (reader, filters) = if reopen { + let path = writer.path().clone(); + let (_file_handle, _key_filter, _key_bounds) = + writer.close(BatchMetadata::default()).unwrap(); + let (reader, membership_filter) = Reader::open_with_filter( + &[&factories.any_factories()], + test_buffer_cache, + &*storage_backend, + &path, + ) + .unwrap(); + let key_range = reader.key_range().unwrap().map(Into::into); + let filters = BatchFilters::from_file(key_range, membership_filter); + (reader, filters) + } else { + writer.into_reader(BatchMetadata::default()).unwrap() + }; + + for key in keys { + assert!(filters.maybe_contains_key((&key) as &DynData, None)); + } + for key in [min - 1, min + 4, min + 9] { + assert!(!filters.maybe_contains_key((&key) as &DynData, None)); + } + assert_eq!( + filter_block_magic(&reader), + Some(ROARING_BITMAP_FILTER_BLOCK_MAGIC) + ); + } +} + +#[test] +fn test_roaring_u64_filter_roundtrip_uses_batch_min_offset() { + init_test_logger(); + + let factories = Factories::::new::(); + let 
tempdir = tempdir().unwrap(); + let storage_backend = ::new( + &StorageConfig { + path: tempdir.path().to_string_lossy().to_string(), + cache: Default::default(), + }, + &StorageOptions::default(), + ) + .unwrap(); + + let base = (u64::from(u32::MAX) << 8) + 11; + let keys = [base, base + 2, base + 9]; + let filter_plan = sampled_filter_plan(&factories, &keys); + let mut writer = Writer1::new( + &factories, + test_buffer_cache, + &*storage_backend, + Parameters::default(), + FilterPlan::decide_filter(Some(&filter_plan), keys.len()), + ) + .unwrap(); + for key in keys { + writer.write0((&key, &())).unwrap(); + } + + let (reader, filters) = writer.into_reader(BatchMetadata::default()).unwrap(); + for key in keys { + assert!(filters.maybe_contains_key((&key) as &DynData, None)); + } + for key in [base - 1, base + 3, base + 20] { + assert!(!filters.maybe_contains_key((&key) as &DynData, None)); + } + assert_eq!( + filter_block_magic(&reader), + Some(ROARING_BITMAP_FILTER_BLOCK_MAGIC) + ); +} + +#[test] +fn test_i64_keys_fallback_to_bloom_when_span_exceeds_u32() { + init_test_logger(); + + let factories = Factories::::new::(); + let tempdir = tempdir().unwrap(); + let storage_backend = ::new( + &StorageConfig { + path: tempdir.path().to_string_lossy().to_string(), + cache: Default::default(), + }, + &StorageOptions::default(), + ) + .unwrap(); + + let max = i64::from(u32::MAX) + 1; + let filter_plan = FilterPlan::from_bounds((&0i64) as &DynData, (&max) as &DynData); + let mut writer = Writer1::new( + &factories, + test_buffer_cache, + &*storage_backend, + Parameters::default(), + FilterPlan::decide_filter(Some(&filter_plan), 2), + ) + .unwrap(); + for key in [0i64, max] { + writer.write0((&key, &())).unwrap(); + } + + let (reader, _filters) = writer.into_reader(BatchMetadata::default()).unwrap(); + assert_eq!(filter_block_magic(&reader), Some(BLOOM_FILTER_BLOCK_MAGIC)); +} + fn test_i64_helper(parameters: Parameters) { init_test_logger(); test_one_column( diff --git 
a/crates/dbsp/src/storage/file/writer.rs b/crates/dbsp/src/storage/file/writer.rs index 189603f5a57..816c727a077 100644 --- a/crates/dbsp/src/storage/file/writer.rs +++ b/crates/dbsp/src/storage/file/writer.rs @@ -4,51 +4,44 @@ //! 2-column layer file. To write more columns, either add another `Writer` //! struct, which is easily done, or mark the currently private `Writer` as //! `pub`. +use super::format::Compression; +use super::{AnyFactories, BatchKeyFilter, Factories, reader::Reader}; use crate::storage::{ backend::{BlockLocation, FileReader, FileWriter, StorageBackend, StorageError}, buffer_cache::{BufferCache, FBuf, FBufSerializer, LimitExceeded}, file::{ - BLOOM_FILTER_SEED, SerializerInner, + SerializerInner, format::{ - BatchMetadata, BlockHeader, COMPATIBLE_FEATURE_FILTER64, + BatchMetadata, BlockHeader, BloomFilterBlockRef, COMPATIBLE_FEATURE_FILTER64, COMPATIBLE_FEATURE_NEGATIVE_WEIGHT_COUNT, DATA_BLOCK_MAGIC, DataBlockHeader, - FILE_TRAILER_BLOCK_MAGIC, FileTrailer, FileTrailerColumn, FilterBlockRef, FixedLen, - INDEX_BLOCK_MAGIC, IndexBlockHeader, NodeType, VERSION_NUMBER, Varint, + FILE_TRAILER_BLOCK_MAGIC, FileTrailer, FileTrailerColumn, FixedLen, + INCOMPATIBLE_FEATURE_ROARING_FILTERS, INDEX_BLOCK_MAGIC, IndexBlockHeader, NodeType, + ROARING_BITMAP_FILTER_BLOCK_MAGIC, RoaringBitmapFilterBlockRef, VERSION_NUMBER, Varint, }, reader::TreeNode, }, }; +use crate::{ + Runtime, + dynamic::{DataTrait, DeserializeDyn, SerializeDyn}, + storage::file::ItemFactory, + trace::filter::{BatchFilters, key_range::KeyRange}, +}; use binrw::{ BinWrite, io::{Cursor, NoSeek}, }; use crc32c::crc32c; -#[cfg(debug_assertions)] use dyn_clone::clone_box; -use fastbloom::BloomFilter; use feldera_buffer_cache::CacheEntry; use feldera_storage::StoragePath; use snap::raw::{Encoder, max_compress_len}; -use std::{ - cell::RefCell, - sync::{Arc, Once}, -}; +use std::{cell::RefCell, sync::Arc}; use std::{ marker::PhantomData, mem::{replace, take}, ops::Range, }; -use tracing::info; 
- -use super::format::Compression; -use super::{AnyFactories, Factories, reader::Reader}; -use crate::storage::tracking_bloom_filter::TrackingBloomFilter; -use crate::{ - Runtime, - dynamic::{DataTrait, DeserializeDyn, SerializeDyn}, - storage::file::ItemFactory, - trace::ord::{BatchFilters, key_range::KeyRange}, -}; struct VarintWriter { varint: Varint, @@ -1140,50 +1133,23 @@ impl BlockWriter { struct Writer { cache: fn() -> Option>, writer: BlockWriter, - bloom_filter: Option, + key_filter: Option, cws: Vec, finished_columns: Vec, serializer: SerializerInner, } impl Writer { - fn bloom_false_positive_rate() -> Option { - let rate = Runtime::with_dev_tweaks(|dev_tweaks| dev_tweaks.bloom_false_positive_rate()); - let rate = (rate > 0.0 && rate < 1.0).then_some(rate); - - static ONCE: Once = Once::new(); - ONCE.call_once(|| { - if let Some(rate) = rate { - info!("Using Bloom filter false positive rate {rate}"); - } else { - info!("Bloom filters disabled"); - } - }); - rate - } - pub fn new( factories: &[&AnyFactories], cache: fn() -> Option>, storage_backend: &dyn StorageBackend, parameters: Parameters, n_columns: usize, - estimated_keys: usize, + key_filter: Option, ) -> Result { assert_eq!(factories.len(), n_columns); - let bloom_filter = Self::bloom_false_positive_rate().map(|bloom_false_positive_rate| { - TrackingBloomFilter::new( - BloomFilter::with_false_pos(bloom_false_positive_rate) - .seed(&BLOOM_FILTER_SEED) - .expected_items({ - // `.max(64)` works around a fastbloom bug that hangs when the - // expected number of items is zero (see - // https://github.com/tomtomwombat/fastbloom/issues/17). 
- estimated_keys.max(64) - }), - ) - }); let parameters = Arc::new(parameters); let cws = factories .iter() @@ -1197,7 +1163,7 @@ impl Writer { cache().expect("Should have a buffer cache"), storage_backend.create_with_prefix(&worker.into())?, ), - bloom_filter, + key_filter, cws, finished_columns, serializer: SerializerInner::new(), @@ -1218,11 +1184,10 @@ impl Writer { None }; - if column == 0 { - // Add `key` to bloom filter. - if let Some(bloom_filter) = &mut self.bloom_filter { - bloom_filter.insert_hash(item.0.default_hash()); - } + if column == 0 + && let Some(key_filter) = &mut self.key_filter + { + key_filter.insert_key(item.0); } // Add `value` to row group for column. @@ -1252,22 +1217,50 @@ impl Writer { pub fn close( mut self, metadata: BatchMetadata, - ) -> Result<(Arc, Option), StorageError> { + ) -> Result<(Arc, Option), StorageError> { debug_assert_eq!(self.cws.len(), self.finished_columns.len()); - // Write the Bloom filter. - let filter_location = if let Some(bloom_filter) = &self.bloom_filter { - let filter_block = FilterBlockRef::from(bloom_filter); - // std::mem::size_of::() should be an - // upper bound: in-memory struct size + bloom payload bytes. - let estimated_block_size = (std::mem::size_of::() - + std::mem::size_of_val(filter_block.data)) - // our binrw min block size is 512 so we round it up to avoid another - // reallocation - .next_multiple_of(512); - self.writer - .write_block(filter_block.into_block(estimated_block_size), None)? - .1 + if let Some(key_filter) = &mut self.key_filter { + key_filter.finalize(); + } + + // Write the batch key filter. 
+ let mut incompatible_features = 0; + let filter_location = if let Some(key_filter) = &self.key_filter { + match key_filter { + BatchKeyFilter::Bloom(filter) => { + let filter_block = BloomFilterBlockRef { + header: BlockHeader::new( + &crate::storage::file::format::BLOOM_FILTER_BLOCK_MAGIC, + ), + num_hashes: filter.num_hashes(), + data: filter.as_slice(), + }; + let estimated_block_size = (std::mem::size_of::() + + std::mem::size_of_val(filter_block.data)) + .next_multiple_of(512); + self.writer + .write_block(filter_block.into_block(estimated_block_size), None)? + .1 + } + BatchKeyFilter::RoaringU32(filter) => { + incompatible_features |= INCOMPATIBLE_FEATURE_ROARING_FILTERS; + let mut data = Vec::with_capacity(filter.serialized_size()); + filter + .serialize_into(&mut data) + .map_err(|_| StorageError::RoaringBitmapFilter)?; + let filter_block = RoaringBitmapFilterBlockRef { + header: BlockHeader::new(&ROARING_BITMAP_FILTER_BLOCK_MAGIC), + data: &data, + }; + let estimated_block_size = (std::mem::size_of::() + + data.len()) + .next_multiple_of(512); + self.writer + .write_block(filter_block.into_block(estimated_block_size), None)? 
+ .1 + } + } } else { BlockLocation { offset: 0, size: 0 } }; @@ -1282,7 +1275,7 @@ impl Writer { filter_offset: 0, filter_size: 0, compatible_features: COMPATIBLE_FEATURE_NEGATIVE_WEIGHT_COUNT, - incompatible_features: 0, + incompatible_features, filter_offset64: 0, filter_size64: 0, metadata, @@ -1305,7 +1298,7 @@ impl Writer { self.writer .insert_cache_entry(location, Arc::new(file_trailer)); - Ok((self.writer.complete()?, self.bloom_filter)) + Ok((self.writer.complete()?, self.key_filter)) } pub fn n_columns(&self) -> usize { @@ -1354,7 +1347,7 @@ impl Writer { /// }, &StorageOptions::default()).unwrap(); /// let parameters = Parameters::default(); /// let mut file = -/// Writer1::new(&factories, || Some(Arc::new(BufferCache::new(1024 * 1024))), &*storage_backend, parameters, 1_000_000).unwrap(); +/// Writer1::new(&factories, || Some(Arc::new(BufferCache::new(1024 * 1024))), &*storage_backend, parameters, None).unwrap(); /// for i in 0..1000_u32 { /// file.write0((i.erase(), ().erase())).unwrap(); /// } @@ -1383,7 +1376,7 @@ where cache: fn() -> Option>, storage_backend: &dyn StorageBackend, parameters: Parameters, - estimated_keys: usize, + key_filter: Option, ) -> Result { Ok(Self { factories: factories.clone(), @@ -1393,7 +1386,7 @@ where storage_backend, parameters, 1, - estimated_keys, + key_filter, )?, _phantom: PhantomData, #[cfg(debug_assertions)] @@ -1434,7 +1427,7 @@ where ) -> Result< ( Arc, - Option, + Option, Option<(Box, Box)>, ), StorageError, @@ -1462,12 +1455,12 @@ where let any_factories = self.factories.any_factories(); let cache = self.inner.cache; - let (file_handle, bloom_filter, key_bounds) = self.close(metadata)?; + let (file_handle, key_filter, key_bounds) = self.close(metadata)?; let key_range = key_bounds .as_ref() .map(|(min, max)| KeyRange::from_refs(min.as_ref(), max.as_ref())); let (reader, membership_filter) = - Reader::new_with_filter(&[&any_factories], cache, file_handle, bloom_filter)?; + 
Reader::new_with_filter(&[&any_factories], cache, file_handle, key_filter)?; let filters = BatchFilters::from_file(key_range, membership_filter); Ok((reader, filters)) } @@ -1518,7 +1511,7 @@ where /// }, &StorageOptions::default()).unwrap(); /// let parameters = Parameters::default(); /// let mut file = -/// Writer2::new(&factories, &factories, || Some(Arc::new(BufferCache::new(1024 * 1024))), &*storage_backend, parameters, 1_000_000).unwrap(); +/// Writer2::new(&factories, &factories, || Some(Arc::new(BufferCache::new(1024 * 1024))), &*storage_backend, parameters, None).unwrap(); /// for i in 0..1000_u32 { /// for j in 0..10_u32 { /// file.write1((&j, &())).unwrap(); @@ -1558,7 +1551,7 @@ where cache: fn() -> Option>, storage_backend: &dyn StorageBackend, parameters: Parameters, - estimated_keys: usize, + key_filter: Option, ) -> Result { Ok(Self { factories0: factories0.clone(), @@ -1569,7 +1562,7 @@ where storage_backend, parameters, 2, - estimated_keys, + key_filter, )?, #[cfg(debug_assertions)] prev0: None, @@ -1640,7 +1633,7 @@ where ) -> Result< ( Arc, - Option, + Option, Option<(Box, Box)>, ), StorageError, @@ -1674,7 +1667,7 @@ where let any_factories0 = self.factories0.any_factories(); let any_factories1 = self.factories1.any_factories(); let cache = self.inner.cache; - let (file_handle, bloom_filter, key_bounds) = self.close(metadata)?; + let (file_handle, key_filter, key_bounds) = self.close(metadata)?; let key_range = key_bounds .as_ref() .map(|(min, max)| KeyRange::from_refs(min.as_ref(), max.as_ref())); @@ -1682,7 +1675,7 @@ where &[&any_factories0, &any_factories1], cache, file_handle, - bloom_filter, + key_filter, )?; let filters = BatchFilters::from_file(key_range, membership_filter); Ok((reader, filters)) diff --git a/crates/dbsp/src/storage/tracking_bloom_filter.rs b/crates/dbsp/src/storage/tracking_bloom_filter.rs index 7f226789112..a50cd65e4ad 100644 --- a/crates/dbsp/src/storage/tracking_bloom_filter.rs +++ 
b/crates/dbsp/src/storage/tracking_bloom_filter.rs @@ -1,4 +1,4 @@ -use crate::storage::filter_stats::{FilterStats, TrackingFilterStats}; +use crate::storage::file::{FilterStats, TrackingFilterStats}; use fastbloom::BloomFilter; /// Bloom filter which tracks the number of hits and misses when lookups are performed. @@ -53,7 +53,7 @@ impl TrackingBloomFilter { #[cfg(test)] mod tests { use super::TrackingBloomFilter; - use crate::storage::filter_stats::FilterStats; + use crate::storage::file::FilterStats; use fastbloom::BloomFilter; #[test] @@ -67,7 +67,7 @@ mod tests { FilterStats { size_byte: 96 + 8192 / 8, hits: 0, - misses: 0, + misses: 0 } ); filter.insert_hash(123); @@ -79,7 +79,7 @@ mod tests { FilterStats { size_byte: 96 + 8192 / 8, hits: 1, - misses: 2, + misses: 2 } ); } @@ -91,7 +91,7 @@ mod tests { FilterStats { size_byte: 0, hits: 0, - misses: 0, + misses: 0 } ); } diff --git a/crates/dbsp/src/trace.rs b/crates/dbsp/src/trace.rs index 573331b9aec..aad64648e9c 100644 --- a/crates/dbsp/src/trace.rs +++ b/crates/dbsp/src/trace.rs @@ -31,11 +31,12 @@ use crate::dynamic::{ClonableTrait, DynDataTyped, DynUnit, Weight}; use crate::storage::buffer_cache::CacheStats; use crate::storage::file::SerializerInner; pub use crate::storage::file::{DbspSerializer, Deserializable, Deserializer, Rkyv}; +use crate::storage::file::{FilterKind, FilterStats}; use crate::trace::cursor::{ DefaultPushCursor, FilteredMergeCursor, FilteredMergeCursorWithSnapshot, PushCursor, UnfilteredMergeCursor, }; -use crate::utils::IsNone; +use crate::utils::{IsNone, SupportsRoaring}; use crate::{dynamic::ArchivedDBData, storage::buffer_cache::FBuf}; use cursor::CursorFactory; use enum_map::Enum; @@ -52,7 +53,9 @@ pub mod cursor; pub mod filter; pub mod layers; pub mod ord; +mod sampling; pub mod spine_async; +pub(crate) use sampling::sample_keys_from_batches; pub use spine_async::{BatchReaderWithSnapshot, ListMerger, Spine, SpineSnapshot, WithSnapshot}; #[cfg(test)] @@ -77,7 +80,6 @@ use 
crate::{ algebra::MonoidValue, dynamic::{DataTrait, DynPair, DynVec, DynWeightedPairs, Erase, Factory, WeightTrait}, storage::file::reader::Error as ReaderError, - storage::filter_stats::FilterStats, }; pub use cursor::{Cursor, MergeCursor}; pub use filter::{Filter, GroupFilter}; @@ -102,6 +104,7 @@ pub trait DBData: + Debug + ArchivedDBData + IsNone + + SupportsRoaring + 'static { } @@ -119,6 +122,7 @@ impl DBData for T where + Debug + ArchivedDBData + IsNone + + SupportsRoaring + 'static { } @@ -473,17 +477,36 @@ where /// [Cursor::seek_key_exact] after the range filter. /// /// Today this is usually a Bloom filter. Batches without such a filter - /// should return `FilterStats::default()`. - fn membership_filter_stats(&self) -> FilterStats; + /// should return zero/default stats. + fn membership_filter_stats(&self) -> FilterStats { + FilterStats::default() + } + + /// Filter kind for the secondary membership filter used by + /// [Cursor::seek_key_exact]. + fn membership_filter_kind(&self) -> FilterKind { + FilterKind::None + } /// Statistics of the in-memory range filter used by /// [Cursor::seek_key_exact]. /// - /// Batches without a range filter should return `FilterStats::default()`. + /// Returns range-filter stats. Batches without a range filter should + /// return zeroed range stats. fn range_filter_stats(&self) -> FilterStats { FilterStats::default() } + /// Cached minimum and maximum keys for this batch, when available. + /// + /// File-backed batches materialize these bounds at write time. In-memory + /// batches can compute them from their ordered key storage. Merge builders + /// use these bounds to decide upfront whether a batch span can be encoded + /// into a roaring bitmap. + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + None + } + /// Where the batch's data is stored. fn location(&self) -> BatchLocation { BatchLocation::Memory @@ -534,7 +557,6 @@ where /// * The output sample contains keys sorted in ascending order. 
fn sample_keys(&self, rng: &mut RG, sample_size: usize, sample: &mut DynVec) where - Self::Time: PartialEq<()>, RG: Rng; /// Returns num_partitions-1 keys from the batch that partition the batch into num_partitions @@ -660,9 +682,15 @@ where fn membership_filter_stats(&self) -> FilterStats { (**self).membership_filter_stats() } + fn membership_filter_kind(&self) -> FilterKind { + (**self).membership_filter_kind() + } fn range_filter_stats(&self) -> FilterStats { (**self).range_filter_stats() } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + (**self).key_bounds() + } fn location(&self) -> BatchLocation { (**self).location() } @@ -674,7 +702,6 @@ where } fn sample_keys(&self, rng: &mut RG, sample_size: usize, sample: &mut DynVec) where - Self::Time: PartialEq<()>, RG: Rng, { (**self).sample_keys(rng, sample_size, sample) @@ -998,7 +1025,7 @@ where location: Option, ) -> Self where - B: BatchReader, + B: BatchReader, I: IntoIterator + Clone, { let _ = location; diff --git a/crates/dbsp/src/trace/filter.rs b/crates/dbsp/src/trace/filter.rs index 3c1ed2da1eb..5a6f3ae5e41 100644 --- a/crates/dbsp/src/trace/filter.rs +++ b/crates/dbsp/src/trace/filter.rs @@ -3,10 +3,15 @@ //! Filters are used by the garbage collector to discard unused records. //! We support different several types of filters for keys and values. 
+pub(crate) mod batch; +pub(crate) mod key_range; + use dyn_clone::DynClone; use crate::{circuit::metadata::MetaItem, dynamic::Factory}; +pub(crate) use batch::BatchFilters; + pub trait FilterFunc: Fn(&V) -> bool + DynClone + Send + Sync {} impl FilterFunc for F where F: Fn(&V) -> bool + Clone + Send + Sync + 'static {} diff --git a/crates/dbsp/src/trace/ord/batch_filter.rs b/crates/dbsp/src/trace/filter/batch.rs similarity index 80% rename from crates/dbsp/src/trace/ord/batch_filter.rs rename to crates/dbsp/src/trace/filter/batch.rs index 6a47f708398..91745d23783 100644 --- a/crates/dbsp/src/trace/ord/batch_filter.rs +++ b/crates/dbsp/src/trace/filter/batch.rs @@ -6,14 +6,15 @@ use crate::{ dynamic::{DataTrait, DynVec}, storage::{ - file::reader::FilteredKeys, - filter_stats::{FilterStats, TrackingFilterStats}, + file::{ + BatchKeyFilter, FilterKind, FilterStats, TrackingFilterStats, TrackingRoaringBitmap, + reader::FilteredKeys, + }, tracking_bloom_filter::TrackingBloomFilter, }, - trace::ord::key_range::KeyRange, + trace::filter::key_range::KeyRange, }; use size_of::SizeOf; -use smallvec::SmallVec; use std::sync::Arc; /// A cheap, in-memory precheck used by `seek_key_exact`. @@ -31,6 +32,9 @@ where /// filters pays that cost at most once. fn maybe_contains_key(&self, key: &K, hash: &mut Option) -> bool; + /// Filter kind for observability. + fn kind(&self) -> FilterKind; + /// Statistics for this filter. fn stats(&self) -> FilterStats; } @@ -109,14 +113,10 @@ where /// pay the hash or bloom lookup cost. pub(crate) fn from_file( key_range: Option>, - membership_filter: Option, + membership_filter: Option, ) -> Self { - Self::new( - key_range, - membership_filter - .map(Arc::new) - .map(|filter| filter as Arc>), - ) + let membership_filter = membership_filter.map(Arc::>::from); + Self::new(key_range, membership_filter) } /// Returns cumulative statistics for the range and membership filters. 
@@ -131,6 +131,13 @@ where } } + pub fn membership_filter_kind(&self) -> FilterKind { + self.membership_filter + .as_ref() + .map(|filter| filter.kind()) + .unwrap_or(FilterKind::None) + } + /// Returns the cached key bounds, when available. pub fn key_bounds(&self) -> Option<(&K, &K)> { self.range_filter.range.as_ref().map(|range| range.bounds()) @@ -141,9 +148,7 @@ where pub(crate) fn filtered_keys<'a>(&self, keys: &'a DynVec) -> FilteredKeys<'a, K> { debug_assert!(keys.is_sorted_by(&|a, b| a.cmp(b))); - // Preserve the old `FilteredKeys` heuristic: if too many keys pass, - // avoid allocating the index vector and just keep the original slice. - let mut filter_pass_keys = SmallVec::<[_; 50]>::new(); + let mut filter_pass_keys = Vec::with_capacity(keys.len().min(50)); for (index, key) in keys.dyn_iter().enumerate() { if self.maybe_contains_key(key, None) { filter_pass_keys.push(index); @@ -153,7 +158,7 @@ where } } - FilteredKeys::with_filter_pass_keys(keys, Some(filter_pass_keys.into_vec())) + FilteredKeys::with_filter_pass_keys(keys, Some(filter_pass_keys)) } /// Returns `false` only when `key` is definitely not present. 
@@ -208,6 +213,10 @@ where is_hit } + fn kind(&self) -> FilterKind { + FilterKind::Range + } + fn stats(&self) -> FilterStats { self.as_ref().stats() } @@ -222,17 +231,48 @@ where self.contains_hash(*hash) } + fn kind(&self) -> FilterKind { + FilterKind::Bloom + } + fn stats(&self) -> FilterStats { TrackingBloomFilter::stats(self) } } +impl BatchFilter for TrackingRoaringBitmap +where + K: DataTrait + ?Sized, +{ + fn maybe_contains_key(&self, key: &K, _hash: &mut Option) -> bool { + self.maybe_contains_key(key) + } + + fn kind(&self) -> FilterKind { + FilterKind::Roaring + } + + fn stats(&self) -> FilterStats { + TrackingRoaringBitmap::stats(self) + } +} + +impl From for Arc> +where + K: DataTrait + ?Sized, +{ + fn from(filter: BatchKeyFilter) -> Self { + match filter { + BatchKeyFilter::Bloom(filter) => Arc::new(filter), + BatchKeyFilter::RoaringU32(filter) => Arc::new(filter), + } + } +} + #[cfg(test)] mod tests { use super::{BatchFilter, TrackedRangeFilter}; - use crate::{ - dynamic::DynData, storage::filter_stats::FilterStats, trace::ord::key_range::KeyRange, - }; + use crate::{dynamic::DynData, storage::file::FilterStats, trace::filter::key_range::KeyRange}; use std::sync::Arc; #[test] diff --git a/crates/dbsp/src/trace/ord/key_range.rs b/crates/dbsp/src/trace/filter/key_range.rs similarity index 100% rename from crates/dbsp/src/trace/ord/key_range.rs rename to crates/dbsp/src/trace/filter/key_range.rs diff --git a/crates/dbsp/src/trace/ord.rs b/crates/dbsp/src/trace/ord.rs index e0bf2b0c689..1be1646b897 100644 --- a/crates/dbsp/src/trace/ord.rs +++ b/crates/dbsp/src/trace/ord.rs @@ -1,11 +1,7 @@ -pub(crate) mod batch_filter; pub mod fallback; pub mod file; -pub(crate) mod key_range; pub mod merge_batcher; pub mod vec; - -pub use batch_filter::{BatchFilterStats, BatchFilters}; pub use fallback::{ indexed_wset::{ FallbackIndexedWSet, FallbackIndexedWSet as OrdIndexedWSet, FallbackIndexedWSetBuilder, diff --git 
a/crates/dbsp/src/trace/ord/fallback/indexed_wset.rs b/crates/dbsp/src/trace/ord/fallback/indexed_wset.rs index 54dbbc0b4ac..872de33b6e8 100644 --- a/crates/dbsp/src/trace/ord/fallback/indexed_wset.rs +++ b/crates/dbsp/src/trace/ord/fallback/indexed_wset.rs @@ -1,6 +1,6 @@ use super::utils::{copy_to_builder, pick_merge_destination}; use crate::storage::file::SerializerInner; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::{FilterKind, FilterStats}; use crate::{ DBWeight, Error, NumEntries, algebra::{AddAssignByRef, AddByRef, NegByRef, ZRingValue}, @@ -283,6 +283,13 @@ where } } + fn membership_filter_kind(&self) -> FilterKind { + match &self.inner { + Inner::File(file) => file.membership_filter_kind(), + Inner::Vec(vec) => vec.membership_filter_kind(), + } + } + fn range_filter_stats(&self) -> FilterStats { match &self.inner { Inner::File(file) => file.range_filter_stats(), @@ -290,6 +297,13 @@ where } } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + match &self.inner { + Inner::File(file) => file.key_bounds(), + Inner::Vec(vec) => vec.key_bounds(), + } + } + #[inline] fn location(&self) -> BatchLocation { match &self.inner { @@ -514,17 +528,23 @@ where location: Option, ) -> Self where - B: BatchReader, + B: BatchReader, I: IntoIterator + Clone, { + let key_capacity = batches.clone().into_iter().map(|b| b.key_count()).sum(); + let value_capacity = batches.clone().into_iter().map(|b| b.len()).sum(); Self { factories: factories.clone(), - inner: BuilderInner::new( - factories, - batches.clone().into_iter().map(|b| b.key_count()).sum(), - batches.clone().into_iter().map(|b| b.len()).sum(), - pick_merge_destination(batches, location).into(), - ), + inner: match pick_merge_destination(batches.clone(), location) { + BatchLocation::Memory => BuilderInner::Vec(VecIndexedWSetBuilder::with_capacity( + &factories.vec_indexed_wset_factory, + key_capacity, + value_capacity, + )), + BatchLocation::Storage => 
BuilderInner::File(FileIndexedWSetBuilder::for_merge( + factories, batches, location, + )), + }, } } diff --git a/crates/dbsp/src/trace/ord/fallback/key_batch.rs b/crates/dbsp/src/trace/ord/fallback/key_batch.rs index e6725e7dcd2..d76d34fee3f 100644 --- a/crates/dbsp/src/trace/ord/fallback/key_batch.rs +++ b/crates/dbsp/src/trace/ord/fallback/key_batch.rs @@ -1,5 +1,5 @@ use super::utils::{copy_to_builder, pick_merge_destination}; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::{FilterKind, FilterStats}; use crate::{ DBData, DBWeight, NumEntries, Timestamp, dynamic::{ @@ -274,6 +274,14 @@ where } } + #[inline] + fn membership_filter_kind(&self) -> FilterKind { + match &self.inner { + Inner::File(file) => file.membership_filter_kind(), + Inner::Vec(vec) => vec.membership_filter_kind(), + } + } + #[inline] fn range_filter_stats(&self) -> FilterStats { match &self.inner { @@ -282,6 +290,13 @@ where } } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + match &self.inner { + Inner::File(file) => file.key_bounds(), + Inner::Vec(vec) => vec.key_bounds(), + } + } + #[inline] fn location(&self) -> BatchLocation { match &self.inner { @@ -299,7 +314,6 @@ where fn sample_keys(&self, rng: &mut RG, sample_size: usize, output: &mut DynVec) where - Self::Time: PartialEq<()>, RG: Rng, { match &self.inner { @@ -408,23 +422,23 @@ where location: Option, ) -> Self where - B: BatchReader, + B: BatchReader, I: IntoIterator + Clone, { let key_capacity = batches.clone().into_iter().map(|b| b.key_count()).sum(); let value_capacity = batches.clone().into_iter().map(|b| b.len()).sum(); Self { factories: factories.clone(), - inner: match pick_merge_destination(batches, location) { + inner: match pick_merge_destination(batches.clone(), location) { BatchLocation::Memory => BuilderInner::Vec(VecKeyBuilder::with_capacity( &factories.vec, key_capacity, value_capacity, )), - BatchLocation::Storage => BuilderInner::File(FileKeyBuilder::with_capacity( + 
BatchLocation::Storage => BuilderInner::File(FileKeyBuilder::for_merge( &factories.file, - key_capacity, - value_capacity, + batches, + location, )), }, } diff --git a/crates/dbsp/src/trace/ord/fallback/val_batch.rs b/crates/dbsp/src/trace/ord/fallback/val_batch.rs index f376a626681..96e94c38919 100644 --- a/crates/dbsp/src/trace/ord/fallback/val_batch.rs +++ b/crates/dbsp/src/trace/ord/fallback/val_batch.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use super::utils::{copy_to_builder, pick_merge_destination}; use crate::storage::buffer_cache::CacheStats; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::{FilterKind, FilterStats}; use crate::trace::cursor::{DelegatingCursor, PushCursor}; use crate::trace::ord::file::val_batch::FileValBuilder; use crate::trace::ord::vec::val_batch::VecValBuilder; @@ -281,6 +281,14 @@ where } } + #[inline] + fn membership_filter_kind(&self) -> FilterKind { + match &self.inner { + Inner::File(file) => file.membership_filter_kind(), + Inner::Vec(vec) => vec.membership_filter_kind(), + } + } + #[inline] fn range_filter_stats(&self) -> FilterStats { match &self.inner { @@ -289,6 +297,13 @@ where } } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + match &self.inner { + Inner::File(file) => file.key_bounds(), + Inner::Vec(vec) => vec.key_bounds(), + } + } + #[inline] fn location(&self) -> BatchLocation { match &self.inner { @@ -307,7 +322,6 @@ where fn sample_keys(&self, rng: &mut RG, sample_size: usize, output: &mut DynVec) where RG: Rng, - T: PartialEq<()>, { match &self.inner { Inner::Vec(vec) => vec.sample_keys(rng, sample_size, output), @@ -425,23 +439,23 @@ where location: Option, ) -> Self where - B: BatchReader, + B: BatchReader, I: IntoIterator + Clone, { let key_capacity = batches.clone().into_iter().map(|b| b.key_count()).sum(); let value_capacity = batches.clone().into_iter().map(|b| b.len()).sum(); Self { factories: factories.clone(), - inner: match pick_merge_destination(batches, location) { + 
inner: match pick_merge_destination(batches.clone(), location) { BatchLocation::Memory => BuilderInner::Vec(VecValBuilder::with_capacity( &factories.vec, key_capacity, value_capacity, )), - BatchLocation::Storage => BuilderInner::File(FileValBuilder::with_capacity( + BatchLocation::Storage => BuilderInner::File(FileValBuilder::for_merge( &factories.file, - key_capacity, - value_capacity, + batches, + location, )), }, } diff --git a/crates/dbsp/src/trace/ord/fallback/wset.rs b/crates/dbsp/src/trace/ord/fallback/wset.rs index c1de66b50e4..3757d52e7bd 100644 --- a/crates/dbsp/src/trace/ord/fallback/wset.rs +++ b/crates/dbsp/src/trace/ord/fallback/wset.rs @@ -1,5 +1,5 @@ use super::utils::{copy_to_builder, pick_merge_destination}; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::{FilterKind, FilterStats}; use crate::{ DBWeight, NumEntries, algebra::{AddAssignByRef, AddByRef, NegByRef, ZRingValue}, @@ -281,6 +281,13 @@ where } } + fn membership_filter_kind(&self) -> FilterKind { + match &self.inner { + Inner::File(file) => file.membership_filter_kind(), + Inner::Vec(vec) => vec.membership_filter_kind(), + } + } + fn range_filter_stats(&self) -> FilterStats { match &self.inner { Inner::File(file) => file.range_filter_stats(), @@ -288,6 +295,13 @@ where } } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + match &self.inner { + Inner::File(file) => file.key_bounds(), + Inner::Vec(vec) => vec.key_bounds(), + } + } + #[inline] fn location(&self) -> BatchLocation { match &self.inner { @@ -495,16 +509,22 @@ where location: Option, ) -> Self where - B: BatchReader, + B: BatchReader, I: IntoIterator + Clone, { + let key_capacity = batches.clone().into_iter().map(|b| b.key_count()).sum(); Self { factories: factories.clone(), - inner: BuilderInner::new( - factories, - batches.clone().into_iter().map(|b| b.key_count()).sum(), - pick_merge_destination(batches, location).into(), - ), + inner: match pick_merge_destination(batches.clone(), 
location) { + BatchLocation::Memory => BuilderInner::Vec(VecWSetBuilder::with_capacity( + &factories.vec_wset_factory, + key_capacity, + key_capacity, + )), + BatchLocation::Storage => { + BuilderInner::File(FileWSetBuilder::for_merge(factories, batches, location)) + } + }, } } diff --git a/crates/dbsp/src/trace/ord/file/indexed_wset_batch.rs b/crates/dbsp/src/trace/ord/file/indexed_wset_batch.rs index 4148e0e134c..f4d0819cfb7 100644 --- a/crates/dbsp/src/trace/ord/file/indexed_wset_batch.rs +++ b/crates/dbsp/src/trace/ord/file/indexed_wset_batch.rs @@ -1,5 +1,5 @@ use crate::storage::file::format::BatchMetadata; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::{FilterKind, FilterStats}; use crate::{ DBData, DBWeight, NumEntries, Runtime, algebra::{AddAssignByRef, AddByRef, NegByRef}, @@ -10,7 +10,7 @@ use crate::{ storage::{ buffer_cache::CacheStats, file::{ - Factories as FileFactories, + Factories as FileFactories, FilterPlan, reader::{BulkRows, Cursor as FileCursor, Error as ReaderError, Reader}, writer::Writer2, }, @@ -19,8 +19,9 @@ use crate::{ Batch, BatchFactories, BatchLocation, BatchReader, BatchReaderFactories, Builder, Cursor, FileValBatch, VecIndexedWSetFactories, WeightedItem, cursor::{CursorFactory, CursorFactoryWrapper, Pending, Position, PushCursor}, + filter::BatchFilters, merge_batches_by_reference, - ord::{batch_filter::BatchFilters, file::UnwrapStorage, merge_batcher::MergeBatcher}, + ord::{file::UnwrapStorage, merge_batcher::MergeBatcher}, }, }; use crate::{DynZWeight, ZWeight}; @@ -284,7 +285,7 @@ where Runtime::buffer_cache, &*Runtime::storage_backend().unwrap_storage(), Runtime::file_writer_parameters(), - self.key_count(), + FilterPlan::::decide_filter(None, self.key_count()), ) .unwrap_storage(); @@ -398,10 +399,18 @@ where self.filters.stats().membership_filter } + fn membership_filter_kind(&self) -> FilterKind { + self.filters.membership_filter_kind() + } + fn range_filter_stats(&self) -> FilterStats { 
self.filters.stats().range_filter } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + self.filters.key_bounds() + } + #[inline] fn location(&self) -> BatchLocation { BatchLocation::Storage @@ -913,7 +922,39 @@ where Runtime::buffer_cache, &*Runtime::storage_backend().unwrap_storage(), Runtime::file_writer_parameters(), - key_capacity, + FilterPlan::::decide_filter(None, key_capacity), + ) + .unwrap_storage(), + weight: factories.weight_factory().default_box(), + num_tuples: 0, + stats: BatchMetadata::default(), + } + } + + fn for_merge<'a, B, I>( + factories: &FileIndexedWSetFactories, + batches: I, + _location: Option, + ) -> Self + where + B: BatchReader, + I: IntoIterator + Clone, + { + let key_capacity = batches.clone().into_iter().map(|b| b.key_count()).sum(); + let filter_plan = FilterPlan::from_batches(batches.clone()); + let key_filter = filter_plan.map_or_else( + || FilterPlan::::decide_filter(None, key_capacity), + |filter_plan| FilterPlan::decide_filter(Some(&filter_plan), key_capacity), + ); + Self { + factories: factories.clone(), + writer: Writer2::new( + &factories.factories0, + &factories.factories1, + Runtime::buffer_cache, + &*Runtime::storage_backend().unwrap_storage(), + Runtime::file_writer_parameters(), + key_filter, ) .unwrap_storage(), weight: factories.weight_factory().default_box(), diff --git a/crates/dbsp/src/trace/ord/file/key_batch.rs b/crates/dbsp/src/trace/ord/file/key_batch.rs index 4c02b61b065..6ebbe942c77 100644 --- a/crates/dbsp/src/trace/ord/file/key_batch.rs +++ b/crates/dbsp/src/trace/ord/file/key_batch.rs @@ -1,5 +1,5 @@ use crate::storage::file::format::BatchMetadata; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::{FilterKind, FilterStats}; use crate::trace::cursor::Position; use crate::{ DBData, DBWeight, NumEntries, Runtime, Timestamp, @@ -10,7 +10,7 @@ use crate::{ storage::{ buffer_cache::CacheStats, file::{ - Factories as FileFactories, + Factories as FileFactories, FilterPlan, 
reader::{Cursor as FileCursor, Error as ReaderError, Reader}, writer::Writer2, }, @@ -18,7 +18,8 @@ use crate::{ trace::{ Batch, BatchFactories, BatchLocation, BatchReader, BatchReaderFactories, Builder, Cursor, WeightedItem, - ord::{batch_filter::BatchFilters, file::UnwrapStorage, merge_batcher::MergeBatcher}, + filter::BatchFilters, + ord::{file::UnwrapStorage, merge_batcher::MergeBatcher}, }, utils::Tup2, }; @@ -303,10 +304,18 @@ where self.filters.stats().membership_filter } + fn membership_filter_kind(&self) -> FilterKind { + self.filters.membership_filter_kind() + } + fn range_filter_stats(&self) -> FilterStats { self.filters.stats().range_filter } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + self.filters.key_bounds() + } + #[inline] fn location(&self) -> BatchLocation { BatchLocation::Storage @@ -673,7 +682,39 @@ where Runtime::buffer_cache, &*Runtime::storage_backend().unwrap_storage(), Runtime::file_writer_parameters(), - key_capacity, + FilterPlan::::decide_filter(None, key_capacity), + ) + .unwrap_storage(), + key: factories.opt_key_factory.default_box(), + num_tuples: 0, + stats: BatchMetadata::default(), + } + } + + fn for_merge<'a, B, I>( + factories: &FileKeyBatchFactories, + batches: I, + _location: Option, + ) -> Self + where + B: BatchReader, + I: IntoIterator + Clone, + { + let key_capacity = batches.clone().into_iter().map(|b| b.key_count()).sum(); + let filter_plan = FilterPlan::from_batches(batches.clone()); + let key_filter = filter_plan.map_or_else( + || FilterPlan::::decide_filter(None, key_capacity), + |filter_plan| FilterPlan::decide_filter(Some(&filter_plan), key_capacity), + ); + Self { + factories: factories.clone(), + writer: Writer2::new( + &factories.factories0, + &factories.factories1, + Runtime::buffer_cache, + &*Runtime::storage_backend().unwrap_storage(), + Runtime::file_writer_parameters(), + key_filter, ) .unwrap_storage(), key: factories.opt_key_factory.default_box(), diff --git 
a/crates/dbsp/src/trace/ord/file/val_batch.rs b/crates/dbsp/src/trace/ord/file/val_batch.rs index 6c3824a302f..34b1e8cedb4 100644 --- a/crates/dbsp/src/trace/ord/file/val_batch.rs +++ b/crates/dbsp/src/trace/ord/file/val_batch.rs @@ -1,5 +1,5 @@ use crate::storage::buffer_cache::CacheStats; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::{FilterKind, FilterStats}; use crate::trace::BatchLocation; use crate::trace::cursor::Position; use crate::trace::ord::file::UnwrapStorage; @@ -10,14 +10,14 @@ use crate::{ Factory, LeanVec, WeightTrait, WithFactory, }, storage::file::{ - Factories as FileFactories, + Factories as FileFactories, FilterPlan, format::BatchMetadata, reader::{Cursor as FileCursor, Error as ReaderError, Reader}, writer::Writer2, }, trace::{ Batch, BatchFactories, BatchReader, BatchReaderFactories, Builder, Cursor, WeightedItem, - ord::{batch_filter::BatchFilters, merge_batcher::MergeBatcher}, + filter::BatchFilters, ord::merge_batcher::MergeBatcher, }, utils::Tup2, }; @@ -325,10 +325,18 @@ where self.filters.stats().membership_filter } + fn membership_filter_kind(&self) -> FilterKind { + self.filters.membership_filter_kind() + } + fn range_filter_stats(&self) -> FilterStats { self.filters.stats().range_filter } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + self.filters.key_bounds() + } + #[inline] fn location(&self) -> BatchLocation { BatchLocation::Storage @@ -716,7 +724,39 @@ where Runtime::buffer_cache, &*Runtime::storage_backend().unwrap_storage(), Runtime::file_writer_parameters(), - key_capacity, + FilterPlan::::decide_filter(None, key_capacity), + ) + .unwrap_storage(), + time_diffs: factories.timediff_factory.default_box(), + num_tuples: 0, + stats: BatchMetadata::default(), + } + } + + fn for_merge<'a, B, I>( + factories: &FileValBatchFactories, + batches: I, + _location: Option, + ) -> Self + where + B: BatchReader, + I: IntoIterator + Clone, + { + let key_capacity = 
batches.clone().into_iter().map(|b| b.key_count()).sum(); + let filter_plan = FilterPlan::from_batches(batches.clone()); + let key_filter = filter_plan.map_or_else( + || FilterPlan::::decide_filter(None, key_capacity), + |filter_plan| FilterPlan::decide_filter(Some(&filter_plan), key_capacity), + ); + Self { + factories: factories.clone(), + writer: Writer2::new( + &factories.factories0, + &factories.factories1, + Runtime::buffer_cache, + &*Runtime::storage_backend().unwrap_storage(), + Runtime::file_writer_parameters(), + key_filter, ) .unwrap_storage(), time_diffs: factories.timediff_factory.default_box(), diff --git a/crates/dbsp/src/trace/ord/file/wset_batch.rs b/crates/dbsp/src/trace/ord/file/wset_batch.rs index 33a005f98d6..10bf4bd7698 100644 --- a/crates/dbsp/src/trace/ord/file/wset_batch.rs +++ b/crates/dbsp/src/trace/ord/file/wset_batch.rs @@ -1,5 +1,5 @@ use crate::storage::file::format::BatchMetadata; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::{FilterKind, FilterStats}; use crate::{ DBData, DBWeight, NumEntries, Runtime, algebra::{AddAssignByRef, AddByRef, NegByRef}, @@ -10,7 +10,7 @@ use crate::{ storage::{ buffer_cache::CacheStats, file::{ - Factories as FileFactories, + Factories as FileFactories, FilterPlan, reader::{BulkRows, Cursor as FileCursor, Error as ReaderError, Reader}, writer::Writer1, }, @@ -19,8 +19,9 @@ use crate::{ Batch, BatchFactories, BatchLocation, BatchReader, BatchReaderFactories, Builder, Cursor, DbspSerializer, Deserializer, FileKeyBatch, VecWSetFactories, WeightedItem, cursor::{CursorFactoryWrapper, Pending, Position, PushCursor}, + filter::BatchFilters, merge_batches_by_reference, - ord::{batch_filter::BatchFilters, file::UnwrapStorage, merge_batcher::MergeBatcher}, + ord::{file::UnwrapStorage, merge_batcher::MergeBatcher}, }, }; use crate::{DynZWeight, ZWeight}; @@ -258,7 +259,7 @@ where Runtime::buffer_cache, &*Runtime::storage_backend().unwrap(), Runtime::file_writer_parameters(), - 
self.key_count(), + FilterPlan::::decide_filter(None, self.key_count()), ) .unwrap_storage(); @@ -387,10 +388,18 @@ where self.filters.stats().membership_filter } + fn membership_filter_kind(&self) -> FilterKind { + self.filters.membership_filter_kind() + } + fn range_filter_stats(&self) -> FilterStats { self.filters.stats().range_filter } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + self.filters.key_bounds() + } + #[inline] fn location(&self) -> BatchLocation { BatchLocation::Storage @@ -826,7 +835,38 @@ where Runtime::buffer_cache, &*Runtime::storage_backend().unwrap_storage(), Runtime::file_writer_parameters(), - key_capacity, + FilterPlan::::decide_filter(None, key_capacity), + ) + .unwrap_storage(), + weight: factories.weight_factory().default_box(), + num_tuples: 0, + stats: BatchMetadata::default(), + } + } + + fn for_merge<'a, B, I>( + factories: & as BatchReader>::Factories, + batches: I, + _location: Option, + ) -> Self + where + B: BatchReader, + I: IntoIterator + Clone, + { + let key_capacity = batches.clone().into_iter().map(|b| b.key_count()).sum(); + let filter_plan = FilterPlan::from_batches(batches.clone()); + let key_filter = filter_plan.map_or_else( + || FilterPlan::::decide_filter(None, key_capacity), + |filter_plan| FilterPlan::decide_filter(Some(&filter_plan), key_capacity), + ); + Self { + factories: factories.clone(), + writer: Writer1::new( + &factories.file_factories, + Runtime::buffer_cache, + &*Runtime::storage_backend().unwrap_storage(), + Runtime::file_writer_parameters(), + key_filter, ) .unwrap_storage(), weight: factories.weight_factory().default_box(), diff --git a/crates/dbsp/src/trace/ord/vec/indexed_wset_batch.rs b/crates/dbsp/src/trace/ord/vec/indexed_wset_batch.rs index 092e87d6717..5d47c5e129f 100644 --- a/crates/dbsp/src/trace/ord/vec/indexed_wset_batch.rs +++ b/crates/dbsp/src/trace/ord/vec/indexed_wset_batch.rs @@ -1,5 +1,5 @@ +use crate::storage::file::FilterStats; use 
crate::storage::file::SerializerInner; -use crate::storage::filter_stats::FilterStats; use crate::trace::ord::merge_batcher::MergeBatcher; use crate::{ DBData, DBWeight, Error, NumEntries, @@ -461,9 +461,12 @@ where FilterStats::default() } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + Some((self.layer.keys.first()?, self.layer.keys.last()?)) + } + fn sample_keys(&self, rng: &mut RG, sample_size: usize, sample: &mut DynVec) where - Self::Time: PartialEq<()>, RG: Rng, { self.layer.sample_keys(rng, sample_size, sample); diff --git a/crates/dbsp/src/trace/ord/vec/key_batch.rs b/crates/dbsp/src/trace/ord/vec/key_batch.rs index a1598db2d41..c7c1f24a8ca 100644 --- a/crates/dbsp/src/trace/ord/vec/key_batch.rs +++ b/crates/dbsp/src/trace/ord/vec/key_batch.rs @@ -1,4 +1,4 @@ -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::FilterStats; use crate::trace::ord::merge_batcher::MergeBatcher; use crate::{ DBData, DBWeight, NumEntries, Timestamp, @@ -320,9 +320,12 @@ where FilterStats::default() } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + Some((self.layer.keys.first()?, self.layer.keys.last()?)) + } + fn sample_keys(&self, rng: &mut RG, sample_size: usize, sample: &mut DynVec) where - Self::Time: PartialEq<()>, RG: Rng, { self.layer.sample_keys(rng, sample_size, sample); diff --git a/crates/dbsp/src/trace/ord/vec/val_batch.rs b/crates/dbsp/src/trace/ord/vec/val_batch.rs index e19217ba530..b58383a63c7 100644 --- a/crates/dbsp/src/trace/ord/vec/val_batch.rs +++ b/crates/dbsp/src/trace/ord/vec/val_batch.rs @@ -1,5 +1,5 @@ use crate::ZWeight; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::FilterStats; use crate::trace::cursor::Position; use crate::trace::ord::merge_batcher::MergeBatcher; use crate::{ @@ -381,9 +381,12 @@ where FilterStats::default() } + fn key_bounds(&self) -> Option<(&Self::Key, &Self::Key)> { + Some((self.layer.keys.first()?, self.layer.keys.last()?)) + } + fn 
sample_keys(&self, rng: &mut RG, sample_size: usize, sample: &mut DynVec) where - Self::Time: PartialEq<()>, RG: Rng, { self.layer.sample_keys(rng, sample_size, sample); diff --git a/crates/dbsp/src/trace/ord/vec/wset_batch.rs b/crates/dbsp/src/trace/ord/vec/wset_batch.rs index 2e262d21ede..5c21bce1b9f 100644 --- a/crates/dbsp/src/trace/ord/vec/wset_batch.rs +++ b/crates/dbsp/src/trace/ord/vec/wset_batch.rs @@ -1,4 +1,4 @@ -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::FilterStats; use crate::{ DBData, DBWeight, NumEntries, algebra::{NegByRef, ZRingValue}, @@ -363,6 +363,10 @@ impl BatchReader for VecWSet Option<(&Self::Key, &Self::Key)> { + Some((self.layer.keys.first()?, self.layer.keys.last()?)) + } + fn sample_keys(&self, rng: &mut RG, sample_size: usize, sample: &mut DynVec) where RG: Rng, diff --git a/crates/dbsp/src/trace/sampling.rs b/crates/dbsp/src/trace/sampling.rs new file mode 100644 index 00000000000..1798c4bfc8d --- /dev/null +++ b/crates/dbsp/src/trace/sampling.rs @@ -0,0 +1,61 @@ +use crate::{ + dynamic::DynVec, + trace::{BatchReader, BatchReaderFactories, Cursor, cursor::CursorList}, +}; +use rand::Rng; + +/// Samples keys from a set of batches by invoking each batch's +/// [`BatchReader::sample_keys`] implementation and merging the results. +/// +/// `sample_size_for` decides how many keys to request from each batch. The +/// helper deduplicates keys across batches before appending them to `sample`, +/// which keeps it usable for overlapping inputs such as merge planning. 
+pub(crate) fn sample_keys_from_batches( + factories: &B::Factories, + batches: &[&B], + rng: &mut RG, + mut sample_size_for: F, + sample: &mut DynVec, +) where + B: BatchReader, + RG: Rng, + F: FnMut(&B) -> usize, +{ + if batches.is_empty() { + return; + } + + let total_sample_size = batches + .iter() + .map(|batch| sample_size_for(*batch)) + .sum::(); + if total_sample_size == 0 { + return; + } + + let mut intermediate = factories.keys_factory().default_box(); + let mut merged_cursor = CursorList::new( + factories.weight_factory(), + batches.iter().map(|batch| batch.cursor()).collect(), + ); + intermediate.reserve(total_sample_size); + + for batch in batches { + let sample_size = sample_size_for(*batch); + if sample_size == 0 { + continue; + } + batch.sample_keys(rng, sample_size, intermediate.as_mut()); + } + + intermediate.as_mut().sort_unstable(); + intermediate.dedup(); + for key in intermediate.dyn_iter_mut() { + merged_cursor.seek_key(key); + if let Some(current_key) = merged_cursor.get_key() + && current_key == key + { + sample.push_ref(key); + } + } +} diff --git a/crates/dbsp/src/trace/spine_async.rs b/crates/dbsp/src/trace/spine_async.rs index 11defddadf1..ff3f14eafe6 100644 --- a/crates/dbsp/src/trace/spine_async.rs +++ b/crates/dbsp/src/trace/spine_async.rs @@ -18,17 +18,19 @@ use crate::{ MERGING_MEMORY_RECORDS_COUNT, MERGING_SIZE_BYTES, MERGING_STORAGE_RECORDS_COUNT, MetaItem, MetricId, MetricReading, NEGATIVE_WEIGHT_COUNT, OperatorMeta, RANGE_FILTER_HIT_RATE_PERCENT, RANGE_FILTER_HITS_COUNT, RANGE_FILTER_MISSES_COUNT, - RANGE_FILTER_SIZE_BYTES, SPINE_BATCHES_COUNT, SPINE_STORAGE_SIZE_BYTES, + RANGE_FILTER_SIZE_BYTES, ROARING_FILTER_HIT_RATE_PERCENT, ROARING_FILTER_HITS_COUNT, + ROARING_FILTER_MISSES_COUNT, ROARING_FILTER_SIZE_BYTES, SPINE_BATCHES_COUNT, + SPINE_STORAGE_SIZE_BYTES, }, metrics::COMPACTION_STALL_TIME_NANOSECONDS, negative_weight_multiplier, runtime::{TOKIO_BUFFER_CACHE, TOKIO_WORKER_INDEX}, }, - dynamic::{DynVec, Factory, Weight}, + 
dynamic::{DynVec, Factory}, samply::SamplySpan, storage::{ buffer_cache::{BufferCache, CacheStats}, - filter_stats::FilterStats, + file::{FilterKind, FilterStats}, }, time::Timestamp, trace::{ @@ -36,6 +38,7 @@ use crate::{ cursor::{CursorList, Position}, merge_batches, ord::fallback::pick_insert_destination, + sample_keys_from_batches, spine_async::{ list_merger::ArcListMerger, push_merger::ArcPushMerger, snapshot::FetchList, }, @@ -68,7 +71,6 @@ use std::{ use std::{collections::VecDeque, sync::atomic::Ordering}; use std::{ fmt::{self, Debug, Display, Formatter}, - ops::DerefMut, sync::Condvar, }; use std::{ops::RangeInclusive, sync::Mutex}; @@ -780,19 +782,25 @@ where let mut cache_stats = spine_stats.cache_stats; let mut storage_size = 0; let mut merging_size = 0; - let mut membership_filter_stats = FilterStats::default(); + let mut membership_filter_stats = BTreeMap::::new(); let mut range_filter_stats = FilterStats::default(); - let mut storage_records = 0; + let mut bloom_filter_records = 0; for (batch, merging) in batches { cache_stats += batch.cache_stats(); - membership_filter_stats += batch.membership_filter_stats(); + let kind = batch.membership_filter_kind(); + if kind != FilterKind::None { + *membership_filter_stats.entry(kind).or_default() += + batch.membership_filter_stats(); + } + if kind == FilterKind::Bloom { + bloom_filter_records += batch.key_count(); + } range_filter_stats += batch.range_filter_stats(); let on_storage = batch.location() == BatchLocation::Storage; if on_storage || merging { let size = batch.approximate_byte_size(); if on_storage { storage_size += size; - storage_records += batch.key_count(); } if merging { merging_size += size; @@ -800,9 +808,16 @@ where } } - if storage_records > 0 { + let bloom_filter_stats = membership_filter_stats + .remove(&FilterKind::Bloom) + .unwrap_or_default(); + let roaring_filter_stats = membership_filter_stats + .remove(&FilterKind::Roaring) + .unwrap_or_default(); + + if bloom_filter_records > 0 { 
let bits_per_key = - membership_filter_stats.size_byte as f64 * 8.0 / storage_records as f64; + bloom_filter_stats.size_byte as f64 * 8.0 / bloom_filter_records as f64; let bits_per_key = bits_per_key as usize; meta.extend(metadata! { BLOOM_FILTER_BITS_PER_KEY => MetaItem::Int(bits_per_key) @@ -839,25 +854,48 @@ where MetricReading::new( BLOOM_FILTER_SIZE_BYTES, Vec::new(), - MetaItem::bytes(membership_filter_stats.size_byte), + MetaItem::bytes(bloom_filter_stats.size_byte), ), MetricReading::new( BLOOM_FILTER_HITS_COUNT, Vec::new(), - MetaItem::Count(membership_filter_stats.hits), + MetaItem::Count(bloom_filter_stats.hits), ), MetricReading::new( BLOOM_FILTER_MISSES_COUNT, Vec::new(), - MetaItem::Count(membership_filter_stats.misses), + MetaItem::Count(bloom_filter_stats.misses), ), MetricReading::new( BLOOM_FILTER_HIT_RATE_PERCENT, Vec::new(), MetaItem::Percent { - numerator: membership_filter_stats.hits as u64, - denominator: membership_filter_stats.hits as u64 - + membership_filter_stats.misses as u64, + numerator: bloom_filter_stats.hits as u64, + denominator: bloom_filter_stats.hits as u64 + bloom_filter_stats.misses as u64, + }, + ), + MetricReading::new( + ROARING_FILTER_SIZE_BYTES, + Vec::new(), + MetaItem::bytes(roaring_filter_stats.size_byte), + ), + MetricReading::new( + ROARING_FILTER_HITS_COUNT, + Vec::new(), + MetaItem::Count(roaring_filter_stats.hits), + ), + MetricReading::new( + ROARING_FILTER_MISSES_COUNT, + Vec::new(), + MetaItem::Count(roaring_filter_stats.misses), + ), + MetricReading::new( + ROARING_FILTER_HIT_RATE_PERCENT, + Vec::new(), + MetaItem::Percent { + numerator: roaring_filter_stats.hits as u64, + denominator: roaring_filter_stats.hits as u64 + + roaring_filter_stats.misses as u64, }, ), MetricReading::new( @@ -1291,57 +1329,6 @@ where } } -/// Samples `sample_size` keys from a set of batches. -/// -/// See [`BatchReader::sample_keys`](`crate::trace::BatchReader::sample_keys`) for more details. 
-pub(crate) fn sample_keys_from_batches( - factories: &B::Factories, - batches: &[Arc], - rng: &mut RG, - sample_size: usize, - sample: &mut DynVec, -) where - B: Batch, - B::Time: PartialEq<()>, - RG: Rng, -{ - let total_keys = batches.iter().map(|batch| batch.key_count()).sum::(); - - if sample_size == 0 || total_keys == 0 { - // Avoid division by zero. - return; - } - - // Sample each batch, picking the number of keys proportional to - // batch size. - let mut intermediate = factories.keys_factory().default_box(); - intermediate.reserve(sample_size); - - for batch in batches { - batch.sample_keys( - rng, - ((batch.key_count() as u128) * (sample_size as u128) / (total_keys as u128)) as usize, - intermediate.as_mut(), - ); - } - - // Drop duplicate keys and keys that appear with 0 weight, i.e., - // get canceled out across multiple batches. - intermediate.deref_mut().sort_unstable(); - intermediate.dedup(); - - let mut cursor = SpineCursor::new_cursor(factories, batches.to_vec()); - for key in intermediate.dyn_iter_mut() { - cursor.seek_key(key); - if let Some(current_key) = cursor.get_key() - && current_key == key - { - debug_assert!(cursor.val_valid() && !cursor.weight().is_zero()); - sample.push_ref(key); - } - } -} - impl BatchReader for Spine where B: Batch, @@ -1382,14 +1369,6 @@ where .sum() } - fn membership_filter_stats(&self) -> FilterStats { - self.merger - .get_batches() - .iter() - .map(|batch| batch.membership_filter_stats()) - .sum() - } - fn range_filter_stats(&self) -> FilterStats { self.merger .get_batches() @@ -1404,14 +1383,23 @@ where fn sample_keys(&self, rng: &mut RG, sample_size: usize, sample: &mut DynVec) where - Self::Time: PartialEq<()>, RG: Rng, { + let batches = self.merger.get_batches(); + let total_keys = batches.iter().map(|batch| batch.key_count()).sum::(); + let batch_refs: Vec<_> = batches.iter().map(Arc::as_ref).collect(); sample_keys_from_batches( &self.factories, - &self.merger.get_batches(), + &batch_refs, rng, - 
sample_size, + |batch| { + if sample_size == 0 || total_keys == 0 { + 0 + } else { + ((batch.key_count() as u128) * (sample_size as u128) / (total_keys as u128)) + as usize + } + }, sample, ); } diff --git a/crates/dbsp/src/trace/spine_async/snapshot.rs b/crates/dbsp/src/trace/spine_async/snapshot.rs index 8f7e080779d..92c0ea12a52 100644 --- a/crates/dbsp/src/trace/spine_async/snapshot.rs +++ b/crates/dbsp/src/trace/spine_async/snapshot.rs @@ -13,10 +13,12 @@ use size_of::SizeOf; use super::SpineCursor; use crate::NumEntries; use crate::dynamic::{DynVec, Factory}; -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::FilterStats; use crate::trace::cursor::{CursorFactory, CursorList}; -use crate::trace::spine_async::sample_keys_from_batches; -use crate::trace::{Batch, BatchReader, BatchReaderFactories, Cursor, Spine, merge_batches}; +use crate::trace::{ + Batch, BatchReader, BatchReaderFactories, Cursor, Spine, merge_batches, + sample_keys_from_batches, +}; pub trait WithSnapshot: Sized { type Batch: Batch; @@ -228,27 +230,32 @@ where .fold(0, |acc, batch| acc + batch.approximate_byte_size()) } - fn membership_filter_stats(&self) -> FilterStats { - self.batches - .iter() - .map(|b| b.membership_filter_stats()) - .sum() - } - fn range_filter_stats(&self) -> FilterStats { self.batches.iter().map(|b| b.range_filter_stats()).sum() } fn sample_keys(&self, rng: &mut RG, sample_size: usize, sample: &mut DynVec) where - Self::Time: PartialEq<()>, RG: Rng, { + let total_keys = self + .batches + .iter() + .map(|batch| batch.key_count()) + .sum::(); + let batch_refs: Vec<_> = self.batches.iter().map(Arc::as_ref).collect(); sample_keys_from_batches( &self.factories, - self.batches.as_slice(), + &batch_refs, rng, - sample_size, + |batch| { + if sample_size == 0 || total_keys == 0 { + 0 + } else { + ((batch.key_count() as u128) * (sample_size as u128) / (total_keys as u128)) + as usize + } + }, sample, ); } diff --git a/crates/dbsp/src/trace/test.rs 
b/crates/dbsp/src/trace/test.rs index b1849209bd9..94dc4a07068 100644 --- a/crates/dbsp/src/trace/test.rs +++ b/crates/dbsp/src/trace/test.rs @@ -12,14 +12,15 @@ use size_of::SizeOf; use crate::{ DynZWeight, Runtime, ZWeight, algebra::{ - IndexedZSet, NegByRef, OrdIndexedZSet, OrdIndexedZSetFactories, OrdZSet, OrdZSetFactories, - ZBatch, ZSet, + AddByRef, IndexedZSet, NegByRef, OrdIndexedZSet, OrdIndexedZSetFactories, OrdZSet, + OrdZSetFactories, ZBatch, ZSet, }, circuit::{CircuitConfig, mkconfig}, dynamic::{DowncastTrait, DynData, DynUnit, DynWeightedPairs, Erase, LeanVec, pair::DynPair}, + storage::{buffer_cache::CacheStats, file::FilterKind}, trace::{ - Batch, BatchReader, BatchReaderFactories, Builder, FileIndexedWSetFactories, - FileWSetFactories, GroupFilter, Spine, Trace, + Batch, BatchLocation, BatchReader, BatchReaderFactories, Builder, FileIndexedWSetFactories, + FileWSetFactories, GroupFilter, ListMerger, Spine, Trace, cursor::{Cursor, CursorPair}, ord::{ FileIndexedWSet, FileKeyBatch, FileKeyBatchFactories, FileValBatch, @@ -31,7 +32,7 @@ use crate::{ assert_trace_eq, test_batch_sampling, test_trace_sampling, }, }, - utils::{Tup2, Tup3, Tup4}, + utils::{Tup1, Tup2, Tup3, Tup4}, }; use super::Filter; @@ -828,6 +829,13 @@ where F: FnOnce() + Clone + Send + 'static, { let (_temp_dir, config) = mkconfig(); + run_in_circuit_with_storage_config(config, f); +} + +fn run_in_circuit_with_storage_config(config: CircuitConfig, f: F) +where + F: FnOnce() + Clone + Send + 'static, +{ let count = Arc::new(AtomicUsize::new(0)); Runtime::init_circuit(config, { let count = count.clone(); @@ -843,6 +851,145 @@ where assert_eq!(count.load(Ordering::Relaxed), 1); } +fn total_cache_accesses(stats: CacheStats) -> u64 { + stats + .0 + .iter() + .map(|(_, accesses)| accesses.iter().map(|(_, counts)| counts.count).sum::()) + .sum() +} + +fn build_file_wset_u32(keys: &[u32]) -> FileWSet { + let factories = >::new::(); + let mut builder = + as 
Batch>::Builder::with_capacity(&factories, keys.len(), 0); + + for key in keys { + let weight: ZWeight = 1; + builder.push_time_diff(&(), weight.erase()); + builder.push_key(key.erase()); + } + + builder.done() +} + +fn build_file_wset_tup1_i32(keys: &[i32]) -> FileWSet { + let factories = >::new::, (), ZWeight>(); + let mut builder = + as Batch>::Builder::with_capacity(&factories, keys.len(), 0); + + for key in keys { + let weight: ZWeight = 1; + builder.push_time_diff(&(), weight.erase()); + builder.push_key(Tup1(*key).erase()); + } + + builder.done() +} + +fn build_fallback_wset_i32(keys: &[i32]) -> crate::trace::FallbackWSet { + let factories = + >::new::(); + let mut erased_tuples = zset_tuples(keys.iter().copied().map(|key| Tup2(key, 1)).collect()); + crate::trace::FallbackWSet::::dyn_from_tuples( + &factories, + (), + &mut erased_tuples, + ) +} + +#[test] +fn test_file_wset_roaring_u32_seek_key_exact_skips_absent_reads() { + let (_temp_dir, mut config) = mkconfig(); + config.dev_tweaks.enable_roaring = Some(true); + + run_in_circuit_with_storage_config(config, move || { + let batch = build_file_wset_u32(&[1, 3, 7]); + let mut cursor = batch.cursor(); + let before = total_cache_accesses(batch.cache_stats()); + + let missing = 2u32; + assert!(!cursor.seek_key_exact(missing.erase(), None)); + assert_eq!(total_cache_accesses(batch.cache_stats()), before); + + let present = 3u32; + assert!(cursor.seek_key_exact(present.erase(), None)); + }); +} + +#[test] +fn test_file_wset_tup1_i32_roaring_seek_key_exact_skips_absent_reads() { + let (_temp_dir, mut config) = mkconfig(); + config.dev_tweaks.enable_roaring = Some(true); + + run_in_circuit_with_storage_config(config, move || { + let batch = build_file_wset_tup1_i32(&[-7, 1, 3]); + let mut cursor = batch.cursor(); + let before = total_cache_accesses(batch.cache_stats()); + + let missing = Tup1(2i32); + assert!(!cursor.seek_key_exact(missing.erase(), None)); + assert_eq!(total_cache_accesses(batch.cache_stats()), 
before); + + let present = Tup1(3i32); + assert!(cursor.seek_key_exact(present.erase(), None)); + }); +} + +#[test] +fn test_file_wset_roaring_filter_rebuilt_after_merge() { + let (_temp_dir, mut config) = mkconfig(); + config.dev_tweaks.enable_roaring = Some(true); + + run_in_circuit_with_storage_config(config, move || { + let lhs = build_file_wset_u32(&[1, 5]); + let rhs = build_file_wset_u32(&[3, 7]); + let merged = lhs.add_by_ref(&rhs); + + let mut cursor = merged.cursor(); + let before = total_cache_accesses(merged.cache_stats()); + + let missing = 4u32; + assert!(!cursor.seek_key_exact(missing.erase(), None)); + assert_eq!(total_cache_accesses(merged.cache_stats()), before); + + let present = 7u32; + assert!(cursor.seek_key_exact(present.erase(), None)); + }); +} + +#[test] +fn test_fallback_wset_roaring_filter_rebuilt_after_storage_merge() { + let (_temp_dir, mut config) = mkconfig(); + config.dev_tweaks.enable_roaring = Some(true); + config.storage.as_mut().unwrap().options.min_storage_bytes = Some(0); + + run_in_circuit_with_storage_config(config, move || { + let lhs = build_fallback_wset_i32(&[1, 5]); + let rhs = build_fallback_wset_i32(&[3, 7]); + let factories = + >::new::(); + let merged: crate::trace::FallbackWSet = ListMerger::merge( + &factories, + as Batch>::Builder::for_merge( + &factories, + [&lhs, &rhs], + Some(BatchLocation::Storage), + ), + vec![lhs.merge_cursor(None, None), rhs.merge_cursor(None, None)], + ); + + assert_eq!(merged.membership_filter_kind(), FilterKind::Roaring); + + let mut cursor = merged.cursor(); + let before = total_cache_accesses(merged.cache_stats()); + + let missing = 4i32; + assert!(!cursor.seek_key_exact(missing.erase(), None)); + assert_eq!(total_cache_accesses(merged.cache_stats()), before); + }); +} + proptest! 
{ #![proptest_config(ProptestConfig::with_cases(1000))] diff --git a/crates/dbsp/src/trace/test/test_batch.rs b/crates/dbsp/src/trace/test/test_batch.rs index 99d64417552..34bc1728ca8 100644 --- a/crates/dbsp/src/trace/test/test_batch.rs +++ b/crates/dbsp/src/trace/test/test_batch.rs @@ -3,7 +3,7 @@ //! So far, only methods/traits used in tests have been implemented. #![allow(clippy::type_complexity)] -use crate::storage::filter_stats::FilterStats; +use crate::storage::file::FilterStats; use crate::{ DBData, DBWeight, NumEntries, Timestamp, dynamic::{ diff --git a/crates/dbsp/src/utils.rs b/crates/dbsp/src/utils.rs index 6849a75f3b2..a8528842407 100644 --- a/crates/dbsp/src/utils.rs +++ b/crates/dbsp/src/utils.rs @@ -6,6 +6,7 @@ mod consolidation; mod graph; mod is_none; mod sort; +mod supports_roaring; pub mod tuple; #[cfg(test)] @@ -31,6 +32,7 @@ pub use consolidation::{ pub use graph::components; pub use is_none::IsNone; +pub use supports_roaring::SupportsRoaring; #[allow(unused_imports)] pub use dot::{DotEdgeAttributes, DotNodeAttributes}; diff --git a/crates/dbsp/src/utils/supports_roaring.rs b/crates/dbsp/src/utils/supports_roaring.rs new file mode 100644 index 00000000000..0fa1388b807 --- /dev/null +++ b/crates/dbsp/src/utils/supports_roaring.rs @@ -0,0 +1,262 @@ +//! Trait for key types that can be mapped into a roaring bitmap domain. 
+ +use crate::dynamic::{BSet, DowncastTrait, DynData, LeanVec}; +use crate::time::UnitTimestamp; +use std::collections::BTreeMap; +use std::rc::Rc; +use std::sync::Arc; +use uuid::Uuid; + +pub trait SupportsRoaring { + #[inline] + fn supports_roaring32(&self) -> bool { + false + } + + #[inline] + fn roaring_u32_offset(&self, _min: &Self) -> Option + where + Self: Sized, + { + None + } + + #[inline] + fn into_roaring_u32(&self, _min: &DynData) -> Option { + None + } + + #[inline] + fn into_roaring_u32_checked(&self, min: &DynData) -> u32 { + self.into_roaring_u32(min) + .expect("roaring-u32 filter was selected for a key outside the planned batch range") + } +} + +#[macro_export] +macro_rules! never_roaring_filter { + ($($ty:ty),* $(,)?) => { + $( + impl $crate::utils::SupportsRoaring for $ty {} + )* + }; +} + +never_roaring_filter!( + (), + bool, + char, + i8, + i16, + i128, + u8, + u16, + u128, + f32, + f64, + usize, + isize, + String, + UnitTimestamp, + Uuid +); + +impl SupportsRoaring for u32 { + #[inline] + fn supports_roaring32(&self) -> bool { + true + } + + #[inline] + fn roaring_u32_offset(&self, min: &Self) -> Option { + self.checked_sub(*min) + } + + #[inline] + fn into_roaring_u32(&self, min: &DynData) -> Option { + self.roaring_u32_offset(min.downcast_checked::()) + } +} + +impl SupportsRoaring for i32 { + #[inline] + fn supports_roaring32(&self) -> bool { + true + } + + #[inline] + fn roaring_u32_offset(&self, min: &Self) -> Option { + let diff = i64::from(*self) - i64::from(*min); + (0..=i64::from(u32::MAX)) + .contains(&diff) + .then_some(diff as u32) + } + + #[inline] + fn into_roaring_u32(&self, min: &DynData) -> Option { + self.roaring_u32_offset(min.downcast_checked::()) + } +} + +impl SupportsRoaring for u64 { + #[inline] + fn supports_roaring32(&self) -> bool { + true + } + + #[inline] + fn roaring_u32_offset(&self, min: &Self) -> Option { + self.checked_sub(*min) + .filter(|diff| *diff <= u64::from(u32::MAX)) + .map(|diff| diff as u32) + } + + 
#[inline] + fn into_roaring_u32(&self, min: &DynData) -> Option { + self.roaring_u32_offset(min.downcast_checked::()) + } +} + +impl SupportsRoaring for i64 { + #[inline] + fn supports_roaring32(&self) -> bool { + true + } + + #[inline] + fn roaring_u32_offset(&self, min: &Self) -> Option { + let diff = i128::from(*self) - i128::from(*min); + (0..=i128::from(u32::MAX)) + .contains(&diff) + .then_some(diff as u32) + } + + #[inline] + fn into_roaring_u32(&self, min: &DynData) -> Option { + self.roaring_u32_offset(min.downcast_checked::()) + } +} + +impl SupportsRoaring for Option {} + +#[macro_export] +macro_rules! never_roaring_filter_1 { + ($($wrapper:ident),* $(,)?) => { + $( + impl $crate::utils::SupportsRoaring for $wrapper {} + )* + }; +} + +never_roaring_filter_1!(Vec, LeanVec, BSet); + +#[macro_export] +macro_rules! delegate_supports_roaring { + ($($wrapper:ident),* $(,)?) => { + $( + impl $crate::utils::SupportsRoaring for $wrapper { + #[inline] + fn supports_roaring32(&self) -> bool { + self.as_ref().supports_roaring32() + } + + #[inline] + fn roaring_u32_offset(&self, min: &Self) -> Option { + self.as_ref().roaring_u32_offset(min.as_ref()) + } + + #[inline] + fn into_roaring_u32(&self, min: &$crate::dynamic::DynData) -> Option { + self.as_ref().into_roaring_u32(min) + } + + #[inline] + fn into_roaring_u32_checked(&self, min: &$crate::dynamic::DynData) -> u32 { + self.as_ref().into_roaring_u32_checked(min) + } + } + )* + }; +} + +delegate_supports_roaring!(Box, Rc, Arc); + +#[macro_export] +macro_rules! 
never_roaring_filter_tuples { + ($($name:ident),+) => { + impl<$($name),+> SupportsRoaring for ($($name,)+) {} + }; +} + +never_roaring_filter_tuples!(A); +never_roaring_filter_tuples!(A, B); +never_roaring_filter_tuples!(A, B, C); +never_roaring_filter_tuples!(A, B, C, D); +never_roaring_filter_tuples!(A, B, C, D, E); +never_roaring_filter_tuples!(A, B, C, D, E, F); + +impl SupportsRoaring for BTreeMap {} + +impl SupportsRoaring for crate::utils::Tup1 { + #[inline] + fn supports_roaring32(&self) -> bool { + self.0.supports_roaring32() + } + + #[inline] + fn roaring_u32_offset(&self, min: &Self) -> Option { + self.0.roaring_u32_offset(&min.0) + } + + #[inline] + fn into_roaring_u32(&self, min: &DynData) -> Option { + self.roaring_u32_offset(min.downcast_checked::()) + } + + #[inline] + fn into_roaring_u32_checked(&self, min: &DynData) -> u32 { + self.roaring_u32_offset(min.downcast_checked::()) + .expect("roaring-u32 filter was selected for a key outside the planned batch range") + } +} + +#[cfg(test)] +mod test { + use super::SupportsRoaring; + use crate::{dynamic::DynData, utils::Tup1}; + + #[test] + fn supported_roaring_keys() { + assert!(7u32.supports_roaring32()); + assert_eq!(7u32.into_roaring_u32((&0u32) as &DynData), Some(7)); + + assert!((-7i32).supports_roaring32()); + assert_eq!((-7i32).into_roaring_u32((&-10i32) as &DynData), Some(3)); + + assert!(Tup1(-7i32).supports_roaring32()); + assert_eq!( + Tup1(-7i32).into_roaring_u32((&Tup1(-10i32)) as &DynData), + Some(3) + ); + + assert!(11u64.supports_roaring32()); + assert_eq!(11u64.into_roaring_u32((&9u64) as &DynData), Some(2)); + + assert!((-2i64).supports_roaring32()); + assert_eq!((-2i64).into_roaring_u32((&-5i64) as &DynData), Some(3)); + } + + #[test] + fn unsupported_roaring_keys() { + assert!(!"feldera".to_string().supports_roaring32()); + assert_eq!( + "feldera" + .to_string() + .into_roaring_u32((&String::new()) as &DynData), + None + ); + + assert_eq!(11u64.into_roaring_u32((&(u64::MAX - 1)) as 
&DynData), None); + assert_eq!(5i64.into_roaring_u32((&10i64) as &DynData), None); + } +} diff --git a/crates/feldera-macros/src/lib.rs b/crates/feldera-macros/src/lib.rs index 0461482f405..c4b3d9086bd 100644 --- a/crates/feldera-macros/src/lib.rs +++ b/crates/feldera-macros/src/lib.rs @@ -1,4 +1,4 @@ -//! Procedural macros for Feldera tuple types and `IsNone`. +//! Procedural macros for Feldera tuple types and utility traits. //! //! The `declare_tuple!` macro decides which layout to use based on tuple size //! and the active storage format rules. @@ -51,6 +51,8 @@ pub fn derive_not_none(item: TokenStream) -> TokenStream { inner } } + + impl #impl_generics ::dbsp::utils::SupportsRoaring for #ident #ty_generics #where_clause {} }; TokenStream::from(expanded) diff --git a/crates/feldera-macros/src/tuples.rs b/crates/feldera-macros/src/tuples.rs index 6de257791cb..256f7b5e7a9 100644 --- a/crates/feldera-macros/src/tuples.rs +++ b/crates/feldera-macros/src/tuples.rs @@ -247,6 +247,14 @@ pub(super) fn declare_tuple_impl(tuple: TupleDef) -> TokenStream2 { } }; + let roaring_u32_key_impl = if num_elements == 1 { + quote! {} + } else { + quote! { + impl<#(#generics),*> ::dbsp::utils::SupportsRoaring for #name<#(#generics),*> {} + } + }; + let sparse_get_methods = fields .iter() .enumerate() @@ -969,6 +977,7 @@ pub(super) fn declare_tuple_impl(tuple: TupleDef) -> TokenStream2 { #copy_impl #checkpoint_impl #not_an_option + #roaring_u32_key_impl }); expanded diff --git a/crates/feldera-types/src/config/dev_tweaks.rs b/crates/feldera-types/src/config/dev_tweaks.rs index 057daa4c3cb..91679b15cb2 100644 --- a/crates/feldera-types/src/config/dev_tweaks.rs +++ b/crates/feldera-types/src/config/dev_tweaks.rs @@ -170,6 +170,11 @@ pub struct DevTweaks { #[serde(skip_serializing_if = "Option::is_none")] pub bloom_false_positive_rate: Option, + /// Whether file-backed batches may use roaring membership filters when the + /// key type supports them. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub enable_roaring: Option, + /// Maximum batch size in records for level 0 merges. #[serde(skip_serializing_if = "Option::is_none")] pub max_level0_batch_size_records: Option, @@ -240,6 +245,9 @@ impl DevTweaks { pub fn bloom_false_positive_rate(&self) -> f64 { self.bloom_false_positive_rate.unwrap_or(0.0001) } + pub fn enable_roaring(&self) -> bool { + self.enable_roaring.unwrap_or(true) + } pub fn negative_weight_multiplier(&self) -> u16 { self.negative_weight_multiplier.unwrap_or(0) } diff --git a/crates/fxp/src/dbsp_impl.rs b/crates/fxp/src/dbsp_impl.rs index 37f8ec9160c..d7e999435c6 100644 --- a/crates/fxp/src/dbsp_impl.rs +++ b/crates/fxp/src/dbsp_impl.rs @@ -1,6 +1,6 @@ use dbsp::NumEntries; use dbsp::algebra::{HasOne, HasZero, MulByRef, OptionWeightType}; -use dbsp::utils::IsNone; +use dbsp::utils::{IsNone, SupportsRoaring}; use feldera_types::serde_with_context::{ DeserializeWithContext, SerializeWithContext, SqlSerdeConfig, serde_config::DecimalFormat, }; @@ -38,6 +38,8 @@ impl IsNone for Fixed { } } +impl SupportsRoaring for Fixed {} + impl OptionWeightType for Fixed {} impl OptionWeightType for &Fixed {} diff --git a/crates/nexmark/src/queries/q9.rs b/crates/nexmark/src/queries/q9.rs index 9aa8b12abfc..2e3040c1d22 100644 --- a/crates/nexmark/src/queries/q9.rs +++ b/crates/nexmark/src/queries/q9.rs @@ -43,6 +43,7 @@ pub struct Q9Output( ); dbsp::never_none!(Q9Output); +dbsp::never_roaring_filter!(Q9Output); type Q9Stream = Stream>; diff --git a/crates/sqllib/tests/tuple_proptest.rs b/crates/sqllib/tests/tuple_proptest.rs index a42fa01dc2e..cb206c97d95 100644 --- a/crates/sqllib/tests/tuple_proptest.rs +++ b/crates/sqllib/tests/tuple_proptest.rs @@ -8,6 +8,7 @@ use dbsp::storage::backend::memory_impl::MemoryBackend; use dbsp::storage::buffer_cache::BufferCache; use dbsp::storage::file::Factories; use dbsp::storage::file::{ + FilterPlan, format::BatchMetadata, writer::{Parameters, Writer1}, }; @@ 
-308,8 +309,14 @@ where let backend = MemoryBackend::new(); let factories = Factories::::new::(); let parameters = Parameters::default(); - let mut writer = Writer1::new(&factories, buffer_cache, &backend, parameters, values.len()) - .map_err(|err| TestCaseError::fail(format!("writer init failed: {err:?}")))?; + let mut writer = Writer1::new( + &factories, + buffer_cache, + &backend, + parameters, + FilterPlan::::decide_filter(None, values.len()), + ) + .map_err(|err| TestCaseError::fail(format!("writer init failed: {err:?}")))?; let aux = (); for value in &values { diff --git a/crates/storage-test-compat/golden-files/golden-batch-v6-large.feldera b/crates/storage-test-compat/golden-files/golden-batch-v6-large.feldera new file mode 100644 index 00000000000..c97031e2785 Binary files /dev/null and b/crates/storage-test-compat/golden-files/golden-batch-v6-large.feldera differ diff --git a/crates/storage-test-compat/golden-files/golden-batch-v6-small.feldera b/crates/storage-test-compat/golden-files/golden-batch-v6-small.feldera new file mode 100644 index 00000000000..849285b51a3 Binary files /dev/null and b/crates/storage-test-compat/golden-files/golden-batch-v6-small.feldera differ diff --git a/crates/storage-test-compat/golden-files/golden-batch-v6-snappy-large.feldera b/crates/storage-test-compat/golden-files/golden-batch-v6-snappy-large.feldera new file mode 100644 index 00000000000..62a8c83bfef Binary files /dev/null and b/crates/storage-test-compat/golden-files/golden-batch-v6-snappy-large.feldera differ diff --git a/crates/storage-test-compat/golden-files/golden-batch-v6-snappy-small.feldera b/crates/storage-test-compat/golden-files/golden-batch-v6-snappy-small.feldera new file mode 100644 index 00000000000..5e2a8d1349a Binary files /dev/null and b/crates/storage-test-compat/golden-files/golden-batch-v6-snappy-small.feldera differ diff --git a/crates/storage-test-compat/src/bin/golden-writer.rs b/crates/storage-test-compat/src/bin/golden-writer.rs index 
ba8f2a2a8d5..41f6f2cc81e 100644 --- a/crates/storage-test-compat/src/bin/golden-writer.rs +++ b/crates/storage-test-compat/src/bin/golden-writer.rs @@ -13,7 +13,7 @@ use dbsp::storage::file::format::BatchMetadata; use dbsp::storage::file::format::Compression; use dbsp::storage::file::format::VERSION_NUMBER; use dbsp::storage::file::writer::{Parameters, Writer1}; -use dbsp::storage::file::Factories; +use dbsp::storage::file::{Factories, FilterPlan}; use feldera_types::config::{StorageConfig, StorageOptions}; use storage_test_compat::{ @@ -102,7 +102,7 @@ where buffer_cache, &*storage_backend, parameters, - rows, + FilterPlan::::decide_filter(None, rows), )?; for row in 0..rows { @@ -112,7 +112,7 @@ where } let tmp_path = writer.path().clone(); - let (_file_handle, _bloom_filter, _key_bounds) = writer.close(BatchMetadata::default())?; + let (_file_handle, _key_filter, _key_bounds) = writer.close(BatchMetadata::default())?; let content = storage_backend.read(&tmp_path)?; storage_backend.write(&output_storage_path, (*content).clone())?; storage_backend.delete(&tmp_path)?; diff --git a/crates/storage/src/error.rs b/crates/storage/src/error.rs index f45874795fe..b4e3dab6070 100644 --- a/crates/storage/src/error.rs +++ b/crates/storage/src/error.rs @@ -37,10 +37,14 @@ pub enum StorageError { /// Cannot perform operation because storage is not enabled. #[error("Cannot perform operation because storage is not enabled.")] StorageDisabled, - /// Error while creating a bloom filter. - #[error("Failed to serialize/deserialize bloom filter.")] + /// Error while creating a batch key filter. + #[error("Failed to serialize/deserialize batch key filter.")] BloomFilter, + /// Error while serializing a roaring bitmap batch key filter. + #[error("Failed to serialize roaring bitmap batch key filter.")] + RoaringBitmapFilter, + /// Path is not valid in storage. 
/// /// Storage paths may not be absolute, may not start with a drive letter (on @@ -147,7 +151,7 @@ impl StorageError { StorageError::NoPersistentId(_) => ErrorKind::Other, StorageError::CheckpointNotFound(_) => ErrorKind::NotFound, StorageError::StorageDisabled => ErrorKind::Other, - StorageError::BloomFilter => ErrorKind::Other, + StorageError::BloomFilter | StorageError::RoaringBitmapFilter => ErrorKind::Other, StorageError::InvalidPath(_) => ErrorKind::Other, StorageError::InvalidURL(_) => ErrorKind::Other, StorageError::ObjectStore { kind, .. } => *kind, diff --git a/scripts/plot_filter_bitmap.py b/scripts/plot_filter_bitmap.py new file mode 100644 index 00000000000..684dc31198d --- /dev/null +++ b/scripts/plot_filter_bitmap.py @@ -0,0 +1,855 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "pandas>=2.2", +# "plotly>=5.24", +# "kaleido>=0.2.1", +# ] +# /// + +from __future__ import annotations + +import argparse +import math +from pathlib import Path + +import pandas as pd +import plotly.graph_objects as go +import plotly.io as pio +from plotly.subplots import make_subplots + + +KEY_TYPE_ORDER = ["u32", "u64"] +KEY_SPACE_ORDER = ["consecutive", "full_range", "half_normal"] +STRUCTURE_ORDER = ["bloom", "roaring"] +METRICS = [ + ( + "insert_ns_per_element_avg", + "insert_ns_per_element_min", + "insert_ns_per_element_max", + "Insert Time", + "Insert Time (ns/element)", + "ns", + ), + ( + "lookup_ns_per_element_avg", + "lookup_ns_per_element_min", + "lookup_ns_per_element_max", + "Lookup Time", + "Lookup Time (ns/element)", + "ns", + ), + ("bytes_used", None, None, "Memory Usage", "Memory Usage (bytes)", "bytes"), +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Plot filter_bitmap.csv comparisons for bloom vs roaring." 
+ ) + parser.add_argument( + "--input", + type=Path, + default=Path("crates/dbsp/filter_bitmap.csv"), + help="Input CSV produced by crates/dbsp/benches/filter_bitmap.rs", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("filter_bitmap_plots"), + help="Directory to write plots into", + ) + parser.add_argument( + "--write-png", + action="store_true", + help="Also export PNG images with Kaleido. Requires a working non-snap Chrome/Chromium.", + ) + return parser.parse_args() + + +def format_structure(name: str) -> str: + return { + "bloom": "fastbloom", + "roaring": "roaring", + }.get(name, name) + + +def format_key_type(name: str) -> str: + return { + "u32": "u32 Keys", + "u64": "u64 Keys", + }.get(name, name) + + +def format_key_space(name: str, key_type: str) -> str: + if name == "consecutive": + return "K={0..N}" + if name == "full_range": + max_label = "2^32" if key_type == "u32" else "2^64" + return f"K={{0..{max_label}}}" + if name == "half_normal": + return "Half-normal K={0..2^32}" + return name + + +def format_distribution_key_space(name: str) -> str: + if name == "half_normal": + return "Half-normal K={0..2^32}" + return name.replace("_", " ").title() + + +def format_num_elements(value: int) -> str: + return f"{value:,}" + + +def format_key_eps(value: float) -> str: + return f"{value:g}" + + +def format_bytes(value: float) -> str: + units = ["B", "KiB", "MiB", "GiB", "TiB"] + unit_index = 0 + while value >= 1024.0 and unit_index + 1 < len(units): + value /= 1024.0 + unit_index += 1 + return f"{value:.2f} {units[unit_index]}" + + +def format_ns_per_element(value: float) -> str: + return f"{value:.2f} ns" + + +def format_ratio(value: float) -> str: + return f"{value:.2f}x" + + +def metric_formatter(kind: str): + if kind == "bytes": + return format_bytes + return format_ns_per_element + + +def ordered_values(values: pd.Series, preferred_order: list[str]) -> list[str]: + present = {str(value) for value in values.dropna().unique()} + 
ordered = [value for value in preferred_order if value in present] + extras = sorted(present - set(preferred_order)) + return ordered + extras + + +def prepare_frame(frame: pd.DataFrame) -> pd.DataFrame: + frame = frame.copy() + + if "key_type" not in frame.columns: + frame["key_type"] = "u32" + if "key_space" not in frame.columns: + frame["key_space"] = "consecutive" + if "key_eps" not in frame.columns: + frame["key_eps"] = pd.NA + + numeric_columns = [ + "key_eps", + "num_elements", + "lookup_count", + "false_positive_lookup_count", + "repetitions", + "insert_seed", + "lookup_seed", + "key_space_seed", + "bloom_false_positive_rate_target_percent", + "bloom_seed", + "bloom_expected_items", + "bytes_used", + "bytes_per_element", + "bits_per_element", + "insert_ns_per_element_min", + "insert_ns_per_element_avg", + "insert_ns_per_element_max", + "insert_ns_per_element_stddev", + "lookup_ns_per_element_min", + "lookup_ns_per_element_avg", + "lookup_ns_per_element_max", + "lookup_ns_per_element_stddev", + "false_positive_rate_percent_min", + "false_positive_rate_percent_avg", + "false_positive_rate_percent_max", + "false_positive_rate_percent_stddev", + ] + for column in numeric_columns: + if column in frame.columns: + frame[column] = pd.to_numeric(frame[column], errors="coerce") + + group_columns = ["structure", "key_type", "key_space", "key_eps", "num_elements"] + agg_spec: dict[str, str] = {} + for column in frame.columns: + if column in group_columns: + continue + if not pd.api.types.is_numeric_dtype(frame[column]): + agg_spec[column] = "first" + elif column.endswith("_min"): + agg_spec[column] = "min" + elif column.endswith("_max"): + agg_spec[column] = "max" + elif column.endswith("_avg") or column.endswith("_stddev"): + agg_spec[column] = "mean" + elif column in { + "bytes_used", + "bytes_per_element", + "bits_per_element", + "bloom_false_positive_rate_target_percent", + }: + agg_spec[column] = "mean" + else: + agg_spec[column] = "first" + + frame = 
frame.groupby(group_columns, as_index=False, dropna=False).agg(agg_spec) + return frame.sort_values(group_columns) + + +def build_category_order( + frame: pd.DataFrame, + key_spaces: list[str], +) -> list[tuple[int, str]]: + ordered_sizes = sorted(int(value) for value in frame["num_elements"].unique()) + categories: list[tuple[int, str]] = [] + for size in ordered_sizes: + for key_space in key_spaces: + if ((frame["num_elements"] == size) & (frame["key_space"] == key_space)).any(): + categories.append((size, key_space)) + return categories + + +def category_axis(categories: list[tuple[int, str]], key_type: str) -> list[list[str]]: + return [ + [format_num_elements(size) for size, _ in categories], + [format_key_space(key_space, key_type) for _, key_space in categories], + ] + + +def build_metric_figure( + frame: pd.DataFrame, + y_column: str, + y_min_column: str | None, + y_max_column: str | None, + y_label: str, + title: str, + formatter, +) -> go.Figure: + key_types = ordered_values(frame["key_type"], KEY_TYPE_ORDER) + key_spaces = ordered_values(frame["key_space"], KEY_SPACE_ORDER) + colors = { + "bloom": "#0f766e", + "roaring": "#c2410c", + } + + fig = make_subplots( + rows=max(1, len(key_types)), + cols=1, + shared_xaxes=False, + vertical_spacing=0.18, + row_titles=[format_key_type(key_type) for key_type in key_types], + ) + + for row_index, key_type in enumerate(key_types, start=1): + row_frame = frame[frame["key_type"] == key_type] + categories = build_category_order(row_frame, key_spaces) + x_axis = category_axis(categories, key_type) + + for structure in STRUCTURE_ORDER: + structure_frame = ( + row_frame[row_frame["structure"] == structure] + .set_index(["num_elements", "key_space"]) + .sort_index() + ) + if structure_frame.empty: + continue + + y_values = [] + text_values = [] + error_plus = [] + error_minus = [] + for category in categories: + if category in structure_frame.index: + value = float(structure_frame.loc[category, y_column]) + 
y_values.append(value) + text_values.append(formatter(value)) + if y_min_column is not None and y_max_column is not None: + min_value = float(structure_frame.loc[category, y_min_column]) + max_value = float(structure_frame.loc[category, y_max_column]) + error_minus.append(max(0.0, value - min_value)) + error_plus.append(max(0.0, max_value - value)) + else: + error_minus.append(None) + error_plus.append(None) + else: + y_values.append(None) + text_values.append("") + error_minus.append(None) + error_plus.append(None) + + fig.add_trace( + go.Bar( + name=format_structure(structure), + x=x_axis, + y=y_values, + text=text_values, + textposition="outside", + cliponaxis=False, + marker_color=colors[structure], + showlegend=row_index == 1, + offsetgroup=structure, + legendgroup=structure, + error_y=( + dict( + type="data", + symmetric=False, + array=error_plus, + arrayminus=error_minus, + thickness=1.2, + width=3, + color="#334155", + ) + if y_min_column is not None and y_max_column is not None + else None + ), + ), + row=row_index, + col=1, + ) + + fig.update_xaxes(title_text="Input Size / Key Space", row=row_index, col=1) + fig.update_yaxes(title_text=y_label, type="log", row=row_index, col=1) + + fig.update_layout( + title=title, + barmode="group", + template="plotly_white", + width=max(1100, 160 * max(1, len(build_category_order(frame, key_spaces)))), + height=500 * max(1, len(key_types)), + legend_title_text="Structure", + margin=dict(t=90, r=30, b=80, l=80), + ) + return fig + + +def relative_frame(frame: pd.DataFrame, metric: str) -> pd.DataFrame: + if frame.empty: + return pd.DataFrame() + + pivot = ( + frame.pivot_table( + index=["key_type", "key_space", "key_eps", "num_elements"], + columns="structure", + values=metric, + aggfunc="first", + ) + .rename(columns={"bloom": "bloom_value", "roaring": "roaring_value"}) + .reset_index() + ) + if pivot.empty or {"bloom_value", "roaring_value"} - set(pivot.columns): + return pd.DataFrame() + + pivot = 
pivot.dropna(subset=["bloom_value", "roaring_value"]).copy() + if pivot.empty: + return pivot + + pivot["relative_factor"] = pivot["bloom_value"] / pivot["roaring_value"] + pivot["log2_relative_factor"] = pivot["relative_factor"].map(math.log2) + return pivot + + +def heatmap_tick_values(z_bound: float) -> list[float]: + step = 0.5 if z_bound <= 2.0 else 1.0 + tick_count = int(round((2 * z_bound) / step)) + return [(-z_bound + step * index) for index in range(tick_count + 1)] + + +def build_relative_heatmap_figure( + frame: pd.DataFrame, + metric: str, + title: str, + value_formatter, + colorbar_title: str, +) -> go.Figure | None: + ratio_frame = relative_frame(frame, metric) + if ratio_frame.empty: + return None + + key_types = ordered_values(ratio_frame["key_type"], KEY_TYPE_ORDER) + key_spaces = ordered_values(ratio_frame["key_space"], KEY_SPACE_ORDER) + max_abs_log2 = ratio_frame["log2_relative_factor"].abs().max() + z_bound = max(0.5, math.ceil(float(max_abs_log2) * 2.0) / 2.0) + tick_values = heatmap_tick_values(z_bound) + tick_text = [format_ratio(2**value) for value in tick_values] + + fig = make_subplots( + rows=max(1, len(key_types)), + cols=max(1, len(key_spaces)), + row_titles=[format_key_type(key_type) for key_type in key_types], + column_titles=[format_distribution_key_space(key_space) for key_space in key_spaces], + horizontal_spacing=0.08, + vertical_spacing=0.16, + ) + + max_num_values = 1 + max_eps_values = 1 + + for row_index, key_type in enumerate(key_types, start=1): + for col_index, key_space in enumerate(key_spaces, start=1): + subplot_frame = ratio_frame[ + (ratio_frame["key_type"] == key_type) + & (ratio_frame["key_space"] == key_space) + ] + if subplot_frame.empty: + continue + + eps_values = sorted(float(value) for value in subplot_frame["key_eps"].dropna().unique()) + num_values = sorted(int(value) for value in subplot_frame["num_elements"].unique()) + max_num_values = max(max_num_values, len(num_values)) + max_eps_values = 
max(max_eps_values, len(eps_values)) + + log2_table = ( + subplot_frame.pivot(index="key_eps", columns="num_elements", values="log2_relative_factor") + .reindex(index=eps_values, columns=num_values) + ) + ratio_table = ( + subplot_frame.pivot(index="key_eps", columns="num_elements", values="relative_factor") + .reindex(index=eps_values, columns=num_values) + ) + bloom_table = ( + subplot_frame.pivot(index="key_eps", columns="num_elements", values="bloom_value") + .reindex(index=eps_values, columns=num_values) + ) + roaring_table = ( + subplot_frame.pivot(index="key_eps", columns="num_elements", values="roaring_value") + .reindex(index=eps_values, columns=num_values) + ) + + text = [ + [ + format_ratio(value) if pd.notna(value) else "" + for value in row_values + ] + for row_values in ratio_table.values + ] + customdata = [ + [ + [ + value_formatter(bloom_value) if pd.notna(bloom_value) else "", + value_formatter(roaring_value) if pd.notna(roaring_value) else "", + ] + for bloom_value, roaring_value in zip(bloom_row, roaring_row) + ] + for bloom_row, roaring_row in zip(bloom_table.values, roaring_table.values) + ] + + fig.add_trace( + go.Heatmap( + x=[format_num_elements(value) for value in num_values], + y=[format_key_eps(value) for value in eps_values], + z=log2_table.values, + text=text, + customdata=customdata, + texttemplate="%{text}", + hoverongaps=False, + xgap=1, + ygap=1, + coloraxis="coloraxis", + hovertemplate=( + "num_elements=%{x}
" + "key_eps=%{y}
" + f"{colorbar_title}=%{{text}}
" + "fastbloom=%{customdata[0]}
" + "roaring=%{customdata[1]}" + "" + ), + ), + row=row_index, + col=col_index, + ) + + fig.update_xaxes(title_text="num_elements", row=row_index, col=col_index) + fig.update_yaxes(title_text="key_eps", row=row_index, col=col_index) + + fig.update_layout( + title=title, + template="plotly_white", + width=max(950, 280 * len(key_spaces) + 110 * max_num_values * len(key_spaces)), + height=max(480, 220 * len(key_types) + 70 * max_eps_values * len(key_types)), + margin=dict(t=110, r=40, b=80, l=90), + coloraxis=dict( + colorscale=[ + (0.0, "#b91c1c"), + (0.5, "#f8fafc"), + (1.0, "#15803d"), + ], + cmin=-z_bound, + cmax=z_bound, + colorbar=dict( + title=colorbar_title, + tickvals=tick_values, + ticktext=tick_text, + ), + ), + ) + return fig + + +def build_summary_figure(frame: pd.DataFrame) -> go.Figure: + key_types = ordered_values(frame["key_type"], KEY_TYPE_ORDER) + key_spaces = ordered_values(frame["key_space"], KEY_SPACE_ORDER) + colors = { + "bloom": "#0f766e", + "roaring": "#c2410c", + } + + fig = make_subplots( + rows=max(1, len(key_types)), + cols=3, + subplot_titles=[ + metric_title + for _ in key_types + for _, _, _, metric_title, _, _ in METRICS + ], + row_titles=[format_key_type(key_type) for key_type in key_types], + horizontal_spacing=0.06, + vertical_spacing=0.18, + ) + + for row_index, key_type in enumerate(key_types, start=1): + row_frame = frame[frame["key_type"] == key_type] + categories = build_category_order(row_frame, key_spaces) + x_axis = category_axis(categories, key_type) + + for col_index, ( + metric, + metric_min, + metric_max, + _metric_title, + y_label, + kind, + ) in enumerate(METRICS, start=1): + formatter = metric_formatter(kind) + for structure in STRUCTURE_ORDER: + structure_frame = ( + row_frame[row_frame["structure"] == structure] + .set_index(["num_elements", "key_space"]) + .sort_index() + ) + if structure_frame.empty: + continue + + y_values = [] + text_values = [] + error_plus = [] + error_minus = [] + for category in categories: 
+ if category in structure_frame.index: + value = float(structure_frame.loc[category, metric]) + y_values.append(value) + text_values.append(formatter(value)) + if metric_min is not None and metric_max is not None: + min_value = float(structure_frame.loc[category, metric_min]) + max_value = float(structure_frame.loc[category, metric_max]) + error_minus.append(max(0.0, value - min_value)) + error_plus.append(max(0.0, max_value - value)) + else: + error_minus.append(None) + error_plus.append(None) + else: + y_values.append(None) + text_values.append("") + error_minus.append(None) + error_plus.append(None) + + fig.add_trace( + go.Bar( + name=format_structure(structure), + x=x_axis, + y=y_values, + text=text_values, + textposition="outside", + cliponaxis=False, + marker_color=colors[structure], + showlegend=row_index == 1 and col_index == 1, + offsetgroup=structure, + legendgroup=structure, + error_y=( + dict( + type="data", + symmetric=False, + array=error_plus, + arrayminus=error_minus, + thickness=1.2, + width=3, + color="#334155", + ) + if metric_min is not None and metric_max is not None + else None + ), + ), + row=row_index, + col=col_index, + ) + + fig.update_yaxes(title_text=y_label, type="log", row=row_index, col=col_index) + fig.update_xaxes( + title_text="Input Size / Key Space", + row=row_index, + col=col_index, + ) + + fig.update_layout( + title="filter_bitmap Summary", + barmode="group", + template="plotly_white", + width=max(1900, 260 * max(1, len(build_category_order(frame, key_spaces)))), + height=640 * max(1, len(key_types)), + legend_title_text="Structure", + margin=dict(t=110, r=30, b=90, l=70), + ) + return fig + + +def write_figure(fig: go.Figure, base_path: Path, write_png: bool) -> None: + fig.write_html(base_path.with_suffix(".html")) + if write_png: + try: + fig.write_image(base_path.with_suffix(".png"), scale=2) + except Exception as exc: # pragma: no cover - depends on local browser setup. 
+ print(f"warning: failed to write {base_path.with_suffix('.png')}: {exc}") + + +def write_summary_dashboard( + sections: list[tuple[str, go.Figure]], + output_path: Path, +) -> None: + if not sections: + return + + grouped_summary = next( + ((title, figure) for title, figure in sections if title == "Grouped Summary"), + None, + ) + heatmap_sections = [ + (title, figure) for title, figure in sections if title != "Grouped Summary" + ] + + html_parts = [ + "", + "", + "", + " ", + " ", + " filter_bitmap Summary", + " ", + "", + "", + "
", + "

filter_bitmap Summary

", + ] + + next_plotlyjs_mode = "cdn" + + if grouped_summary is not None: + title, figure = grouped_summary + figure_html = pio.to_html( + figure, + full_html=False, + include_plotlyjs=next_plotlyjs_mode, + ) + next_plotlyjs_mode = False + html_parts.extend( + [ + "
", + f"

{title}

", + figure_html, + "
", + ] + ) + + if heatmap_sections: + html_parts.extend( + [ + "
", + "

Roaring Advantage Heatmaps

", + "
", + ] + ) + for title, figure in heatmap_sections: + figure_html = pio.to_html( + figure, + full_html=False, + include_plotlyjs=next_plotlyjs_mode, + ) + next_plotlyjs_mode = False + html_parts.extend( + [ + "
", + f"

{title}

", + figure_html, + "
", + ] + ) + html_parts.extend( + [ + "
", + "
", + ] + ) + + html_parts.extend(["
", "", ""]) + output_path.write_text("\n".join(html_parts), encoding="utf-8") + + +def main() -> None: + args = parse_args() + if not args.input.exists(): + raise SystemExit(f"input CSV not found: {args.input}") + + frame = pd.read_csv(args.input) + if frame.empty: + raise SystemExit(f"input CSV is empty: {args.input}") + + required_columns = { + "structure", + "num_elements", + "insert_ns_per_element_avg", + "lookup_ns_per_element_avg", + "bytes_used", + } + missing_columns = sorted(required_columns - set(frame.columns)) + if missing_columns: + raise SystemExit( + f"input CSV is missing required columns: {', '.join(missing_columns)}" + ) + + frame = prepare_frame(frame) + args.output_dir.mkdir(parents=True, exist_ok=True) + + standard_frame = frame[frame["key_eps"].isna()].copy() + distribution_frame = frame[frame["key_eps"].notna()].copy() + summary_sections: list[tuple[str, go.Figure]] = [] + + if not standard_frame.empty: + insert_figure = build_metric_figure( + standard_frame, + "insert_ns_per_element_avg", + "insert_ns_per_element_min", + "insert_ns_per_element_max", + "Insert Time (ns/element)", + "filter_bitmap: Insert Time", + format_ns_per_element, + ) + write_figure( + insert_figure, + args.output_dir / "filter_bitmap_insert_ns_per_element", + args.write_png, + ) + + lookup_figure = build_metric_figure( + standard_frame, + "lookup_ns_per_element_avg", + "lookup_ns_per_element_min", + "lookup_ns_per_element_max", + "Lookup Time (ns/element)", + "filter_bitmap: Lookup Time", + format_ns_per_element, + ) + write_figure( + lookup_figure, + args.output_dir / "filter_bitmap_lookup_ns_per_element", + args.write_png, + ) + + memory_figure = build_metric_figure( + standard_frame, + "bytes_used", + None, + None, + "Memory Usage (bytes)", + "filter_bitmap: Memory Usage", + format_bytes, + ) + write_figure( + memory_figure, + args.output_dir / "filter_bitmap_memory_bytes", + args.write_png, + ) + + summary_figure = build_summary_figure(standard_frame) + 
write_figure( + summary_figure, + args.output_dir / "filter_bitmap_summary", + args.write_png, + ) + summary_sections.append(("Grouped Summary", summary_figure)) + + if not distribution_frame.empty: + insert_heatmap = build_relative_heatmap_figure( + distribution_frame, + "insert_ns_per_element_avg", + "filter_bitmap: Roaring Insert Advantage Heatmap", + format_ns_per_element, + "fastbloom / roaring", + ) + if insert_heatmap is not None: + write_figure( + insert_heatmap, + args.output_dir / "filter_bitmap_insert_advantage_heatmap", + args.write_png, + ) + summary_sections.append(("Roaring Insert Advantage Heatmap", insert_heatmap)) + + lookup_heatmap = build_relative_heatmap_figure( + distribution_frame, + "lookup_ns_per_element_avg", + "filter_bitmap: Roaring Lookup Advantage Heatmap", + format_ns_per_element, + "fastbloom / roaring", + ) + if lookup_heatmap is not None: + write_figure( + lookup_heatmap, + args.output_dir / "filter_bitmap_lookup_advantage_heatmap", + args.write_png, + ) + summary_sections.append(("Roaring Lookup Advantage Heatmap", lookup_heatmap)) + + memory_heatmap = build_relative_heatmap_figure( + distribution_frame, + "bytes_used", + "filter_bitmap: Roaring Memory Advantage Heatmap", + format_bytes, + "fastbloom / roaring", + ) + if memory_heatmap is not None: + write_figure( + memory_heatmap, + args.output_dir / "filter_bitmap_memory_advantage_heatmap", + args.write_png, + ) + summary_sections.append(("Roaring Memory Advantage Heatmap", memory_heatmap)) + + write_summary_dashboard( + summary_sections, + args.output_dir / "filter_bitmap_summary.html", + ) + + print(f"wrote plots to {args.output_dir}") + + +if __name__ == "__main__": + main()