Skip to content

Commit 9bad56b

Browse files
feat[bench]: add more random access (#6371)
Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent d4e6c55 commit 9bad56b

14 files changed

Lines changed: 983 additions & 216 deletions

File tree

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

benchmarks-website-v2/server.js

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -271,9 +271,21 @@ async function refresh() {
271271
}
272272
if (!groups[group]) continue;
273273

274-
const [query, series] = b.name.split("/");
275-
const seriesName = rename(series || "default");
276-
const chartName = formatQuery(query);
274+
// Random access names have the form: random-access/{dataset}/{pattern}/{format}
275+
// Historical random access names: random-access/{format}
276+
// Other benchmarks use: {query}/{series}
277+
let seriesName, chartName;
278+
const parts = b.name.split("/");
279+
if (group === "Random Access" && parts.length === 4) {
280+
chartName = `${parts[1]}/${parts[2]}`.toUpperCase().replace(/[_-]/g, " ");
281+
seriesName = rename(parts[3] || "default");
282+
} else if (group === "Random Access" && parts.length === 2) {
283+
chartName = "RANDOM ACCESS";
284+
seriesName = rename(parts[1] || "default");
285+
} else {
286+
seriesName = rename(parts[1] || "default");
287+
chartName = formatQuery(parts[0]);
288+
}
277289
if (chartName.includes("PARQUET-UNC")) continue;
278290

279291
// Skip throughput metrics (keep only time/size)
@@ -286,7 +298,7 @@ async function refresh() {
286298
else unit = "ns";
287299
}
288300

289-
const sortPos = query.match(/q(\d+)$/i)?.[1]
301+
const sortPos = parts[0].match(/q(\d+)$/i)?.[1]
290302
? parseInt(RegExp.$1, 10)
291303
: 0;
292304
const idx = commitIdx.get(commit.id);

benchmarks-website-v2/src/config.js

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@ export const ENGINE_RENAMES = {
7474
"duckdb:duckdb": "duckdb:duckdb",
7575
"duckdb:vortex-compact": "duckdb:vortex-compact",
7676
"vortex-tokio-local-disk": "vortex-nvme",
77+
"vortex-compact-tokio-local-disk": "vortex-compact-nvme",
7778
"lance-tokio-local-disk": "lance-nvme",
7879
"parquet-tokio-local-disk": "parquet-nvme",
7980
lance: "lance",
@@ -89,6 +90,7 @@ const BESPOKE_CONFIGS = [
8990
name: "Random Access",
9091
renamedDatasets: {
9192
"vortex-tokio-local-disk": "vortex-nvme",
93+
"vortex-compact-tokio-local-disk": "vortex-compact-nvme",
9294
"lance-tokio-local-disk": "lance-nvme",
9395
"parquet-tokio-local-disk": "parquet-nvme",
9496
},
@@ -242,6 +244,7 @@ export const ENGINE_LABELS = {
242244
// Series color map
243245
export const SERIES_COLOR_MAP = {
244246
"vortex-nvme": "#19a508",
247+
"vortex-compact-nvme": "#15850a",
245248
"parquet-nvme": "#ef7f1d",
246249
"lance-nvme": "#3B82F6",
247250
"datafusion:arrow": "#7a27b1",

benchmarks/lance-bench/src/random_access.rs

Lines changed: 62 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -12,14 +12,19 @@ use lance::dataset::WriteParams;
1212
use lance_encoding::version::LanceFileVersion;
1313
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
1414
use vortex_bench::Format;
15-
use vortex_bench::datasets::taxi_data::taxi_data_parquet;
15+
use vortex_bench::datasets::feature_vectors;
16+
use vortex_bench::datasets::nested_lists;
17+
use vortex_bench::datasets::nested_structs;
18+
use vortex_bench::datasets::taxi_data;
1619
use vortex_bench::idempotent_async;
1720
use vortex_bench::random_access::RandomAccessor;
21+
use vortex_bench::random_access::data_path;
1822

19-
pub async fn taxi_data_lance() -> anyhow::Result<PathBuf> {
20-
idempotent_async("taxi/taxi.lance", |output_fname| async move {
21-
let parquet_path = taxi_data_parquet().await?;
22-
23+
/// Convert a parquet file to lance format.
24+
///
25+
/// Uses `idempotent_async` to skip conversion if the output already exists.
26+
async fn parquet_to_lance_file(parquet_path: PathBuf, lance_path: &str) -> anyhow::Result<PathBuf> {
27+
idempotent_async(lance_path, |output_fname| async move {
2328
let file = File::open(&parquet_path)?;
2429
let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
2530
let reader = builder.build()?;
@@ -39,13 +44,58 @@ pub async fn taxi_data_lance() -> anyhow::Result<PathBuf> {
3944
.await
4045
}
4146

47+
pub async fn taxi_data_lance() -> anyhow::Result<PathBuf> {
48+
let parquet_path = taxi_data::taxi_data_parquet().await?;
49+
parquet_to_lance_file(parquet_path, &data_path(taxi_data::DATASET, Format::Lance)).await
50+
}
51+
52+
pub async fn feature_vectors_lance() -> anyhow::Result<PathBuf> {
53+
let parquet_path = feature_vectors::feature_vectors_parquet().await?;
54+
parquet_to_lance_file(
55+
parquet_path,
56+
&data_path(feature_vectors::DATASET, Format::Lance),
57+
)
58+
.await
59+
}
60+
61+
pub async fn nested_lists_lance() -> anyhow::Result<PathBuf> {
62+
let parquet_path = nested_lists::nested_lists_parquet().await?;
63+
parquet_to_lance_file(
64+
parquet_path,
65+
&data_path(nested_lists::DATASET, Format::Lance),
66+
)
67+
.await
68+
}
69+
70+
pub async fn nested_structs_lance() -> anyhow::Result<PathBuf> {
71+
let parquet_path = nested_structs::nested_structs_parquet().await?;
72+
parquet_to_lance_file(
73+
parquet_path,
74+
&data_path(nested_structs::DATASET, Format::Lance),
75+
)
76+
.await
77+
}
78+
79+
/// Random accessor for Lance format files.
80+
///
81+
/// The dataset handle is opened at construction time and reused across `take()` calls.
4282
pub struct LanceRandomAccessor {
43-
path: PathBuf,
83+
name: String,
84+
dataset: Dataset,
4485
}
4586

4687
impl LanceRandomAccessor {
47-
pub fn new(path: PathBuf) -> Self {
48-
Self { path }
88+
/// Open a Lance dataset and return a ready-to-use accessor.
89+
pub async fn open(path: PathBuf, name: impl Into<String>) -> anyhow::Result<Self> {
90+
let dataset = Dataset::open(
91+
path.to_str()
92+
.ok_or_else(|| anyhow!("Invalid dataset path"))?,
93+
)
94+
.await?;
95+
Ok(Self {
96+
name: name.into(),
97+
dataset,
98+
})
4999
}
50100
}
51101

@@ -56,22 +106,12 @@ impl RandomAccessor for LanceRandomAccessor {
56106
}
57107

58108
fn name(&self) -> &str {
59-
"random-access/lance-tokio-local-disk"
60-
}
61-
62-
fn path(&self) -> &PathBuf {
63-
&self.path
109+
&self.name
64110
}
65111

66-
async fn take(&self, indices: Vec<u64>) -> anyhow::Result<usize> {
67-
let dataset = Dataset::open(
68-
self.path
69-
.to_str()
70-
.ok_or_else(|| anyhow!("Invalid dataset path"))?,
71-
)
72-
.await?;
73-
let projection = ProjectionRequest::from_schema(dataset.schema().clone()); // All columns.
74-
let result = dataset.take(indices.as_slice(), projection).await?;
112+
async fn take(&self, indices: &[u64]) -> anyhow::Result<usize> {
113+
let projection = ProjectionRequest::from_schema(self.dataset.schema().clone());
114+
let result = self.dataset.take(indices, projection).await?;
75115
Ok(result.num_rows())
76116
}
77117
}

benchmarks/random-access-bench/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,12 @@ publish = false
1616

1717
[dependencies]
1818
anyhow = { workspace = true }
19+
async-trait = { workspace = true }
1920
clap = { workspace = true, features = ["derive"] }
2021
indicatif = { workspace = true }
2122
lance-bench = { path = "../lance-bench", optional = true }
23+
rand = { workspace = true }
24+
rand_distr = { workspace = true }
2225
tokio = { workspace = true, features = ["full"] }
2326
vortex = { workspace = true }
2427
vortex-bench = { workspace = true }

0 commit comments

Comments
 (0)