Skip to content

Commit 86ebf3d

Browse files
authored
Support building duckdb from a hash, including CI (#6658)
This is preliminary work to enable testing pre-release and hash-bashed versions of DuckDB with Vortex locally and in CI. - Enable release builds in CI from a hash to run benchmarks - Build httpfs statically for hash-based builds, otherwise DuckDB will try to download it and fail. - Enable curl development headers installation for httpfs if build.rs for vortex-duckb changed - Test both httpfs backends in e2e scan test. - Fix open_read's implementation incorrectly adding a slash between host and filename, which lead to a double-slash error with CURL backend duckdb/duckdb-httpfs#261 Signed-off-by: Mikhail Kot <mikhail@spiraldb.com>
1 parent 3579cf6 commit 86ebf3d

10 files changed

Lines changed: 172 additions & 36 deletions

File tree

.github/workflows/bench-pr.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,16 @@ jobs:
5454
- uses: actions/checkout@v6
5555
with:
5656
ref: ${{ github.event.pull_request.head.sha }}
57+
- uses: dorny/paths-filter@v3
58+
id: duckdb-buildfile-changed
59+
with:
60+
filters: |
61+
changed:
62+
- 'vortex-duckdb/build.rs'
63+
- name: Install curl dev headers for duckdb source build
64+
if: steps.duckdb-buildfile-changed.outputs.changed == 'true'
65+
run: |
66+
sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev
5767
- uses: ./.github/actions/setup-rust
5868
with:
5969
repo-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/bench.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,16 @@ jobs:
5656
with:
5757
sccache: s3
5858
- uses: actions/checkout@v6
59+
- uses: dorny/paths-filter@v3
60+
id: duckdb-buildfile-changed
61+
with:
62+
filters: |
63+
changed:
64+
- 'vortex-duckdb/build.rs'
65+
- name: Install curl dev headers for duckdb source build
66+
if: steps.duckdb-buildfile-changed.outputs.changed == 'true'
67+
run: |
68+
sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev
5969
- uses: ./.github/actions/setup-rust
6070
with:
6171
repo-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/ci.yml

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,29 @@ jobs:
188188
- uses: ./.github/actions/setup-rust
189189
with:
190190
repo-token: ${{ secrets.GITHUB_TOKEN }}
191+
# DuckDB build from duckdb/, if building httpfs statically, fails unless
192+
# you have cURL dev headers present. Ideally we'd see these loaded during
193+
# DuckDB setup itself, but this is a tricky change, see
194+
# https://github.com/duckdb/duckdb/issues/21073
195+
#
196+
# This does NOT check for DUCKDB_VERSION changes, and its use in CI
197+
# is generally discouraged. Please change build.rs.
198+
#
199+
# Ideally I'd use a awalsh128/cache-apt-pkgs-action@v1.5.2,
200+
# as apt-get takes around 22s, but this specific action is broken, see
201+
# https://github.com/awalsh128/cache-apt-pkgs-action/issues/181
202+
#
203+
# TODO(myrrc): this should all go away once we have AMI in CI.
204+
- uses: dorny/paths-filter@v3
205+
id: duckdb-buildfile-changed
206+
with:
207+
filters: |
208+
changed:
209+
- 'vortex-duckdb/build.rs'
210+
- name: Install curl dev headers for duckdb source build
211+
if: steps.duckdb-buildfile-changed.outputs.changed == 'true'
212+
run: |
213+
sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev
191214
- name: Docs
192215
run: |
193216
RUSTDOCFLAGS="-D warnings" cargo doc --no-deps
@@ -287,6 +310,16 @@ jobs:
287310
repo-token: ${{ secrets.GITHUB_TOKEN }}
288311
- name: Install nightly for fmt
289312
run: rustup toolchain install $NIGHTLY_TOOLCHAIN --component rustfmt
313+
- uses: dorny/paths-filter@v3
314+
id: duckdb-buildfile-changed
315+
with:
316+
filters: |
317+
changed:
318+
- 'vortex-duckdb/build.rs'
319+
- name: Install curl dev headers for duckdb source build
320+
if: steps.duckdb-buildfile-changed.outputs.changed == 'true'
321+
run: |
322+
sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev
290323
- name: Rust Lint - Format
291324
run: cargo +$NIGHTLY_TOOLCHAIN fmt --all --check
292325
- name: Rustc check
@@ -411,6 +444,16 @@ jobs:
411444
uses: taiki-e/install-action@v2
412445
with:
413446
tool: nextest
447+
- uses: dorny/paths-filter@v3
448+
id: duckdb-buildfile-changed
449+
with:
450+
filters: |
451+
changed:
452+
- 'vortex-duckdb/build.rs'
453+
- name: Install curl dev headers for duckdb source build
454+
if: steps.duckdb-buildfile-changed.outputs.changed == 'true'
455+
run: |
456+
sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev
414457
- name: Rust Tests
415458
if: ${{ matrix.suite == 'tests' }}
416459
run: |
@@ -477,8 +520,7 @@ jobs:
477520
components: "rust-src, rustfmt, clippy, llvm-tools-preview"
478521
- name: Install build dependencies
479522
run: |
480-
sudo apt-get update
481-
sudo apt-get install -y ninja-build cmake
523+
sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build cmake libcurl4-openssl-dev
482524
- name: Install nextest
483525
uses: taiki-e/install-action@v2
484526
with:
@@ -612,6 +654,16 @@ jobs:
612654
uses: taiki-e/install-action@v2
613655
with:
614656
tool: nextest
657+
- uses: dorny/paths-filter@v3
658+
id: duckdb-buildfile-changed
659+
with:
660+
filters: |
661+
changed:
662+
- 'vortex-duckdb/build.rs'
663+
- name: Install curl dev headers for duckdb source build
664+
if: matrix.os != 'windows-x64' && steps.duckdb-buildfile-changed.outputs.changed == 'true'
665+
run: |
666+
sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev
615667
- name: Rust Tests (Windows)
616668
if: matrix.os == 'windows-x64'
617669
run: |
@@ -770,11 +822,20 @@ jobs:
770822
uses: ./.github/actions/setup-rust
771823
with:
772824
repo-token: ${{ secrets.GITHUB_TOKEN }}
825+
- uses: dorny/paths-filter@v3
826+
id: duckdb-buildfile-changed
827+
with:
828+
filters: |
829+
changed:
830+
- 'vortex-duckdb/build.rs'
831+
- name: Install curl dev headers for duckdb source build
832+
if: steps.duckdb-buildfile-changed.outputs.changed == 'true'
833+
run: |
834+
sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev
773835
- name: Run sqllogictest tests
774836
run: |
775837
cargo test -p vortex-sqllogictest --test sqllogictests
776838
777-
778839
wasm-integration:
779840
name: "wasm-integration"
780841
runs-on: ubuntu-latest

.github/workflows/sql-benchmarks.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,18 @@ jobs:
115115

116116
- uses: actions/checkout@v6
117117
if: inputs.mode != 'pr'
118+
119+
- uses: dorny/paths-filter@v3
120+
id: duckdb-buildfile-changed
121+
with:
122+
filters: |
123+
changed:
124+
- 'vortex-duckdb/build.rs'
125+
- name: Install curl dev headers for duckdb source build
126+
if: steps.duckdb-buildfile-changed.outputs.changed == 'true'
127+
run: |
128+
sudo apt-get update -y -qq && sudo apt-get install -y -qq libcurl4-openssl-dev
129+
118130
- uses: ./.github/actions/setup-rust
119131
with:
120132
repo-token: ${{ secrets.GITHUB_TOKEN }}

benchmarks/duckdb-bench/src/lib.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,6 @@ impl DuckClient {
8787
let connection = db.connect()?;
8888
vortex_duckdb::initialize(&db)?;
8989

90-
// Install and load httpfs so DuckDB can access remote files (S3, GCS, HTTP).
91-
connection.query("INSTALL httpfs;")?;
92-
connection.query("LOAD httpfs;")?;
93-
9490
// Enable Parquet metadata cache for all benchmark runs.
9591
//
9692
// `parquet_metadata_cache` is an extension-specific option that's

vortex-bench/src/tpcds/duckdb.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ pub fn generate_tpcds(base_dir: PathBuf, scale_factor: String) -> Result<PathBuf
2929
let sql_script = format!(
3030
"SET autoinstall_known_extensions=1;\
3131
SET autoload_known_extensions=1;\
32-
INSTALL tpcds;\
33-
LOAD tpcds;\
3432
CALL dsdgen(sf={scale_factor});\
3533
EXPORT DATABASE '{output_dir}' (FORMAT PARQUET);",
3634
scale_factor = scale_factor,

vortex-duckdb/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,10 @@ By default, our tests use a precompiled build which means you don't get an
7373

7474
./target/release/duckdb will be a duckdb instance with vortex-duckdb already
7575
loaded.
76+
77+
## Testing a custom DuckDB tag
78+
79+
Change `DUCKDB_VERSION` environment variable value to a preferred hash or commit
80+
(local build), or change build.rs (for testing in CI). If you use a commit,
81+
DuckDB needs to link httpfs statically so you also need to install CURL
82+
development headers (e.g. `libcurl4-openssl-dev`).

vortex-duckdb/build.rs

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ use std::process::Command;
1616
use bindgen::Abi;
1717
use once_cell::sync::Lazy;
1818

19+
// Changing this part or build.rs in general runs a "build duckdb from source"
20+
// job in the CI, with all dependencies and benchmarks using this source-based build.
21+
// It's not ideal, but allows us to check pre-release stuff for breaking things or performance
22+
// regressions.
1923
static DUCKDB_VERSION: Lazy<DuckDBVersion> = Lazy::new(|| {
2024
// Override the DuckDB version via environment variable in case of an extension build.
2125
// `DUCKDB_VERSION` is set by the extension build in the `duckdb-vortex` repo.
@@ -252,7 +256,15 @@ fn extract_duckdb_source(source_dir: &Path) -> Result<PathBuf, Box<dyn std::erro
252256
}
253257

254258
/// Build DuckDB from source. Used for commit hashes or when VX_DUCKDB_DEBUG is set.
255-
fn build_duckdb(duckdb_source_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
259+
fn build_duckdb(
260+
duckdb_source_dir: &Path,
261+
version: &DuckDBVersion,
262+
debug: bool,
263+
) -> Result<PathBuf, Box<dyn std::error::Error>> {
264+
let build_type = match debug {
265+
true => "debug",
266+
false => "release",
267+
};
256268
// Check for ninja
257269
if Command::new("ninja").arg("--version").output().is_err() {
258270
return Err(
@@ -262,10 +274,12 @@ fn build_duckdb(duckdb_source_dir: &Path) -> Result<PathBuf, Box<dyn std::error:
262274

263275
let inner_dir_name = DUCKDB_VERSION.archive_inner_dir_name();
264276
let duckdb_repo_dir = duckdb_source_dir.join(&inner_dir_name);
265-
let build_dir = duckdb_repo_dir.join("build").join("debug");
277+
let build_dir = duckdb_repo_dir.join("build").join(build_type);
266278

267-
// Check if already built
268279
let lib_dir = build_dir.join("src");
280+
let lib_dir_str = lib_dir.display();
281+
println!("cargo:info=Checking if DuckDB is already built in {lib_dir_str}",);
282+
269283
let already_built = lib_dir.join("libduckdb.dylib").exists()
270284
|| lib_dir.join("libduckdb.so").exists()
271285
|| lib_dir
@@ -286,12 +300,26 @@ fn build_duckdb(duckdb_source_dir: &Path) -> Result<PathBuf, Box<dyn std::error:
286300
("1", "0")
287301
};
288302

303+
let mut envs = vec![
304+
("GEN", "ninja"),
305+
("DISABLE_SANITIZER", asan_option),
306+
("THREADSAN", tsan_option),
307+
("BUILD_SHELL", "false"),
308+
("BUILD_UNITTESTS", "false"),
309+
("ENABLE_UNITTEST_CPP_TESTS", "false"),
310+
];
311+
312+
// If we're building from a commit (likely a pre-release), we need to
313+
// build extensions statically. Otherwise DuckDB tries to load them
314+
// from an http endpoint with version 0.0.1 (all non-tagged builds)
315+
// which doesn't exists. httpfs also requires CURL dev headers
316+
if matches!(version, DuckDBVersion::Commit(_)) {
317+
envs.push(("BUILD_EXTENSIONS", "httpfs;parquet;tpch;tpcds;jemalloc"));
318+
};
319+
289320
let output = Command::new("make")
290321
.current_dir(&duckdb_repo_dir)
291-
.env("GEN", "ninja")
292-
.env("DISABLE_SANITIZER", asan_option)
293-
.env("THREADSAN", tsan_option)
294-
.arg("debug")
322+
.envs(envs)
295323
.output()?;
296324

297325
if !output.status.success() {
@@ -375,15 +403,21 @@ fn main() {
375403
drop(fs::remove_dir_all(&duckdb_symlink));
376404
std::os::unix::fs::symlink(&extracted_source_path, &duckdb_symlink).unwrap();
377405

378-
// Determine whether to build from source or use prebuilt libraries
379406
let use_debug_build =
380407
env::var("VX_DUCKDB_DEBUG").is_ok_and(|v| matches!(v.as_str(), "1" | "true"));
408+
println!("cargo:info=DuckDB debug build: {use_debug_build}");
381409

382410
let library_path = if use_debug_build || !DUCKDB_VERSION.is_release() {
383411
// Build from source for:
384412
// - Commit hashes (no prebuilt available)
385413
// - When VX_DUCKDB_DEBUG=1 (user wants debug build)
386-
build_duckdb(&extracted_source_path).unwrap()
414+
match build_duckdb(&extracted_source_path, &DUCKDB_VERSION, use_debug_build) {
415+
Ok(path) => path,
416+
Err(err) => {
417+
println!("cargo:error={err}");
418+
panic!("duckdb build failed");
419+
}
420+
}
387421
} else {
388422
// Download prebuilt libraries for release versions
389423
let archive_path = download_duckdb_lib_archive().unwrap();

vortex-duckdb/src/e2e_test/vortex_scan_test.rs

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -361,8 +361,10 @@ fn test_vortex_scan_over_http() {
361361
let listener = TcpListener::bind("127.0.0.1:0").unwrap();
362362
let addr = listener.local_addr().unwrap();
363363

364+
// Spawn 10 threads because DuckDB does HEAD and GET requests with retries,
365+
// thus 2 threads, one for each implementation, aren't enough
364366
std::thread::spawn(move || {
365-
for _ in 0..2 {
367+
for _ in 0..10 {
366368
if let Ok((mut stream, _)) = listener.accept() {
367369
let response = format!(
368370
"HTTP/1.1 200 OK\r\nContent-Length: {}\r\n\r\n",
@@ -376,24 +378,30 @@ fn test_vortex_scan_over_http() {
376378

377379
let conn = database_connection();
378380
conn.query("SET vortex_filesystem = 'duckdb';").unwrap();
379-
conn.query("INSTALL httpfs;").unwrap();
380-
conn.query("LOAD httpfs;").unwrap();
381+
for httpfs_impl in ["httplib", "curl"] {
382+
println!("Testing httpfs client implementation: {httpfs_impl}");
383+
conn.query(&format!(
384+
"SET httpfs_client_implementation = '{httpfs_impl}';"
385+
))
386+
.unwrap();
381387

382-
let url = format!(
383-
"http://{}/{}",
384-
addr,
385-
file.path().file_name().unwrap().to_string_lossy()
386-
);
388+
let url = format!(
389+
"http://{}/{}",
390+
addr,
391+
file.path().file_name().unwrap().to_string_lossy()
392+
);
393+
println!("url={url}, file={}", file.path().display());
387394

388-
let result = conn
389-
.query(&format!("SELECT COUNT(*) FROM read_vortex('{url}')"))
390-
.unwrap();
391-
let chunk = result.into_iter().next().unwrap();
392-
let count = chunk
393-
.get_vector(0)
394-
.as_slice_with_len::<i64>(chunk.len().as_())[0];
395+
let result = conn
396+
.query(&format!("SELECT COUNT(*) FROM read_vortex('{url}')"))
397+
.unwrap();
398+
let chunk = result.into_iter().next().unwrap();
399+
let count = chunk
400+
.get_vector(0)
401+
.as_slice_with_len::<i64>(chunk.len().as_())[0];
395402

396-
assert_eq!(count, 3);
403+
assert_eq!(count, 3);
404+
}
397405
}
398406

399407
#[test]

vortex-duckdb/src/filesystem.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ impl FileSystem for DuckDbFileSystem {
124124
fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult<FileListing>> {
125125
let mut directory_url = self.base_url.clone();
126126
if !prefix.is_empty() {
127-
directory_url.set_path(&format!("{}/{}", directory_url.path(), prefix));
127+
directory_url.set_path(prefix);
128128
}
129129

130130
// DuckDB's ListFiles expects bare paths for local files, but full URLs
@@ -159,7 +159,7 @@ impl FileSystem for DuckDbFileSystem {
159159

160160
async fn open_read(&self, path: &str) -> VortexResult<Arc<dyn VortexReadAt>> {
161161
let mut url = self.base_url.clone();
162-
url.set_path(&format!("{}/{}", url.path(), path));
162+
url.set_path(path);
163163
let reader = unsafe { DuckDbFsReader::open_url(self.ctx.as_ptr(), &url)? };
164164
Ok(Arc::new(reader))
165165
}

0 commit comments

Comments
 (0)