{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "d9515c08-c3cc-4b67-8298-cb5e6da8fae8", "metadata": {}, "outputs": [], "source": [ "!python -m pip install duckdb" ] }, { "cell_type": "markdown", "id": "de891a8f-ef46-418e-81f1-bb400a20ffe0", "metadata": {}, "source": [ "# Getting Started With DuckDB" ] }, { "cell_type": "code", "execution_count": null, "id": "542e9da3-c459-45de-84d6-7b6287422e4e", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "duckdb.sql(\"SELECT 'whistling_duck' AS waterfowl, 'whistle' AS call\")" ] }, { "cell_type": "markdown", "id": "3c9020b1-a8b7-4dde-a70d-7ddc35d23dac", "metadata": {}, "source": [ "## Creating a Database From a Data Source" ] }, { "cell_type": "code", "execution_count": null, "id": "2ed567c9-06e3-4793-a43e-a6a9a985b4a8", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "conn = duckdb.connect(database=\"presidents.db\")\n", "\n", "presidents_relation = conn.read_parquet(\"presidents.parquet\")\n", "\n", "conn.sql(\n", " \"\"\"\n", " SELECT sequence, last_name, first_name\n", " FROM presidents_relation\n", " WHERE sequence <= 2\n", " \"\"\"\n", ").show()\n", "\n", "presidents_relation.to_table(\"presidents\")\n", "\n", "conn.close()" ] }, { "cell_type": "code", "execution_count": null, "id": "88f82cd9-9056-4cdc-9ff1-9827d0900792", "metadata": {}, "outputs": [], "source": [ "with duckdb.connect(database=\"presidents.db\") as conn:\n", " conn.sql(\n", " \"\"\"\n", " SELECT last_name, first_name\n", " FROM presidents\n", " WHERE last_name = 'Adams' \n", " \"\"\"\n", " ).show()" ] }, { "cell_type": "markdown", "id": "a4d1b371-f16e-4a81-85fe-37411b3b2824", "metadata": {}, "source": [ "**This code won't work.**\n", "\n", "```python\n", "with duckdb.connect(database=\"presidents.db\") as conn:\n", " conn.sql(\"SELECT * FROM presidents_relation\")\n", "```" ] }, { "cell_type": "markdown", "id": "d87920b4-cf7b-462b-8f89-7ab40d6ba8fa", "metadata": {}, "source": [ "## Correcting Your Data Import Errors" ] }, { "cell_type": "code", "execution_count": null, "id": "491262f1-8841-459a-9fff-4d9b23caa4bb", "metadata": { "scrolled": true }, "outputs": [], "source": [ "import duckdb\n", "\n", "with duckdb.connect(database=\"presidents.db\") as conn:\n", " presidents_relation = conn.read_csv(\"presidents.csv\")\n", " print(presidents_relation.limit(2))" ] }, { "cell_type": "code", "execution_count": null, "id": "f1b48835-4f78-4be1-93b9-4b563a91238b", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "with duckdb.connect(database=\"presidents.db\") as conn:\n", " presidents_relation = conn.read_csv(\n", " \"presidents.csv\", date_format=\"%B %d %Y\"\n", " )\n", " print(presidents_relation.dtypes)" ] }, { "cell_type": "markdown", "id": "281fc954-cbe8-4e13-bc35-13ead7d28fcb", "metadata": {}, "source": [ "# Querying the Database\n", "## Querying Tables" ] }, { "cell_type": "code", "execution_count": null, "id": "d8342da7-a20e-49f7-a156-18b875734f52", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "with duckdb.connect(database=\"presidents.db\") as conn:\n", " conn.read_json(\"parties.json\").to_table(\"parties\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3b795c6e-df4b-4fd7-b6cd-aff308740c1c", "metadata": {}, "outputs": [], "source": [ "with duckdb.connect(\"presidents.db\") as conn:\n", " print(\n", " conn.sql(\n", " \"\"\"\n", " SELECT first_name, last_name, party_name\n", " FROM presidents\n", " JOIN parties\n", " ON presidents.party_id = parties.party_id\n", " WHERE party_name = 'Whig'\n", " ORDER BY last_name DESC\n", " \"\"\"\n", " )\n", " )" ] }, { "cell_type": "markdown", "id": "5e7da17a-6653-4c6b-85e9-61bbaad49ce5", "metadata": {}, "source": [ "## Querying Relations" ] }, { "cell_type": "code", "execution_count": null, "id": "11437a3f-c64e-4ba1-ab14-6d98f808c5f1", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "presidents = duckdb.read_parquet(\"presidents.parquet\")\n", "parties = duckdb.read_json(\"parties.json\")\n", "\n", "duckdb.sql(\n", " \"\"\"\n", " SELECT first_name, last_name, party_name\n", " FROM presidents\n", " JOIN parties\n", " ON presidents.party_id = parties.party_id\n", " WHERE party_name = 'Whig'\n", " ORDER BY last_name DESC\n", " \"\"\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "f2223996-ed7f-49cb-8aa7-34c4e4cf45c6", "metadata": { "scrolled": true }, "outputs": [], "source": [ "import duckdb\n", "\n", "presidents = duckdb.read_parquet(\"presidents.parquet\").set_alias(\"presidents\")\n", "parties = duckdb.read_json(\"parties.json\").set_alias(\"parties\")\n", "\n", "(\n", " presidents.join(parties, \"presidents.party_id = parties.party_id\")\n", " .select(\"first_name\", \"last_name\", \"party_name\")\n", " .filter(\"party_name = 'Whig'\")\n", " .order(\"last_name DESC\")\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "7fffdaf5-447c-4918-9809-0a8f50620b05", "metadata": {}, "outputs": [], "source": [ "leaders = duckdb.read_parquet(\"presidents.parquet\").set_alias(\"usa_presidents\")\n", "faction = duckdb.read_json(\"parties.json\").set_alias(\"political_parties\")\n", "\n", "(\n", " leaders.join(\n", " faction, \"usa_presidents.party_id = political_parties.party_id\"\n", " )\n", " .select(\"first_name\", \"last_name\", \"party_name\")\n", " .filter(\"party_name = 'Whig'\")\n", " .order(\"last_name DESC\")\n", ")" ] }, { "cell_type": "markdown", "id": "8c300c6e-1b5e-41e6-bbdf-04ec2ce04b8c", "metadata": {}, "source": [ "# Using Concurrency\n", "## Performing Concurrent Reads" ] }, { "cell_type": "code", "execution_count": null, "id": "03637669-43fb-4593-91e6-c44b425399b3", "metadata": {}, "outputs": [], "source": [ "from concurrent.futures import ThreadPoolExecutor\n", "\n", "import duckdb\n", "\n", "\n", "def read_data(thread_id):\n", " print(f\"Thread {thread_id} starting its read.\")\n", " with duckdb.connect(\"presidents.db\") as conn:\n", " conn.sql(\n", " \"\"\"\n", " SELECT first_name, last_name\n", " FROM presidents\n", " WHERE sequence = 1\n", " \"\"\"\n", " ).show()\n", " print(f\"Thread {thread_id} ending its read.\")\n", "\n", "\n", "def concurrent_read():\n", " with ThreadPoolExecutor(max_workers=3) as executor:\n", " executor.map(read_data, range(3))\n", "\n", "\n", "concurrent_read()" ] }, { "cell_type": "markdown", "id": "c231f1c1-8aa4-4efe-b54e-4c0ff53d1d7e", "metadata": {}, "source": [ "## Performing Concurrent Writes" ] }, { "cell_type": "code", "execution_count": null, "id": "65384b11-a04d-456a-956d-49ccc5057bf4", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "\n", "def update_data(thread_id):\n", " new_name = f\"George ({thread_id})\"\n", " with duckdb.connect(\"presidents.db\") as conn:\n", " print(f\"Thread {thread_id} starting its update.\")\n", " conn.sql(\n", " f\"\"\"\n", " UPDATE presidents\n", " SET first_name = '{new_name}'\n", " WHERE sequence = 1\n", " \"\"\"\n", " )\n", " print(f\"Thread {thread_id} ending its update.\")\n", "\n", "\n", "def concurrent_update():\n", " with ThreadPoolExecutor(max_workers=3) as executor:\n", " executor.map(update_data, range(3))\n", "\n", "\n", "concurrent_update()" ] }, { "cell_type": "code", "execution_count": null, "id": "892a7a41-89b2-4ceb-8e6c-221d417520d0", "metadata": {}, "outputs": [], "source": [ "with duckdb.connect(\"presidents.db\") as conn:\n", " print(\n", " conn.sql(\n", " \"\"\"\n", " SELECT last_name, first_name\n", " FROM presidents\n", " WHERE sequence = 1\n", " \"\"\"\n", " )\n", " )" ] }, { "cell_type": "markdown", "id": "bd5f57cf-b8a0-4833-9675-bcc7a6907124", "metadata": {}, "source": [ "# Integrating DuckDB Within Your Python Environment\n", "## Creating Python Functions for DuckDB to Use" ] }, { "cell_type": "code", "execution_count": null, "id": "4ecd1c32-608a-43a1-a3fd-25519585a4e5", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "\n", "def short_name(first_name: str, last_name: str) -> str:\n", " return f\"{first_name[0]}. {last_name}\"" ] }, { "cell_type": "code", "execution_count": null, "id": "49f50c03-2557-4637-98c9-e114a47de8f9", "metadata": {}, "outputs": [], "source": [ "short_name(\"Abraham\", \"Lincoln\")" ] }, { "cell_type": "markdown", "id": "b2772cd3-0f3e-48ab-99c2-949468cb57e8", "metadata": {}, "source": [ "**Only run this code if you need to update your existing `short_name()` function**\n", "\n", "```python\n", "duckdb.remove_function(\"short_name\")\n", "```" ] }, { "cell_type": "code", "execution_count": null, "id": "e001fe4a-498a-496e-901d-871b922e4231", "metadata": {}, "outputs": [], "source": [ "duckdb.create_function(\"short_name\", short_name)" ] }, { "cell_type": "code", "execution_count": null, "id": "c6a94bdc-4a89-41b3-921a-8142b43dcebe", "metadata": {}, "outputs": [], "source": [ "presidents = duckdb.read_parquet(\"presidents.parquet\")\n", "\n", "duckdb.sql(\n", " \"\"\" \n", " SELECT short_name(first_name::VARCHAR, last_name::VARCHAR) AS name,\n", " (term_end - term_start) AS \"days in office\"\n", " FROM presidents\n", " \"\"\"\n", ").limit(3)" ] }, { "cell_type": "markdown", "id": "2181faed-0d54-470c-9174-f8f0a681069f", "metadata": {}, "source": [ "# Using Polars and pandas With DuckDB" ] }, { "cell_type": "code", "execution_count": null, "id": "3934e0e3-7875-40ad-9882-a06c87030c6f", "metadata": {}, "outputs": [], "source": [ "!python -m pip install pandas polars pyarrow" ] }, { "cell_type": "code", "execution_count": null, "id": "77a11ef6-308e-4279-94f1-a38b1b046874", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "with duckdb.connect(\"presidents.db\") as conn:\n", " pandas_presidents = conn.sql(\n", " \"\"\"\n", " SELECT last_name, first_name\n", " FROM presidents\n", " WHERE sequence BETWEEN 2 AND 5\n", " \"\"\"\n", " ).df()\n", "\n", "pandas_presidents" ] }, { "cell_type": "code", "execution_count": null, "id": "d348d5dc-ff6a-4a61-9924-24ff345a63bd", "metadata": {}, "outputs": [], "source": [ "import duckdb\n", "\n", "presidents = duckdb.read_parquet(\"presidents.parquet\").set_alias(\"presidents\")\n", "parties = duckdb.read_json(\"parties.json\").set_alias(\"parties\")\n", "\n", "(\n", " presidents.join(parties, \"presidents.party_id = parties.party_id\")\n", " .select(\"first_name\", \"last_name\", \"party_name\")\n", " .order(\"last_name DESC\")\n", ").pl().head(3)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 5 }