From 887291d2f759cc6ea8b14cefb44e8fadfb93fd6a Mon Sep 17 00:00:00 2001 From: Diego Ripley Date: Sat, 21 Jun 2025 00:54:26 -0400 Subject: [PATCH] Read all DGUIDs from subset parquet output (100,000 records each) --- .../statcan_products/duck_experiment.ipynb | 293 ++++++++++++++---- 1 file changed, 233 insertions(+), 60 deletions(-) diff --git a/experiments/statcan_products/duck_experiment.ipynb b/experiments/statcan_products/duck_experiment.ipynb index 6666b94..1950421 100644 --- a/experiments/statcan_products/duck_experiment.ipynb +++ b/experiments/statcan_products/duck_experiment.ipynb @@ -2,10 +2,24 @@ "cells": [ { "cell_type": "code", - "execution_count": 41, + "execution_count": 1, "id": "5e04e469-d3eb-42ca-b548-5e6f1fa6af9d", - "metadata": {}, - "outputs": [], + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n" + ] + } + ], "source": [ "import buckaroo\n", "import duckdb\n", @@ -14,9 +28,15 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "id": "97c8e92b-21e4-4cc5-8dbe-7b42361ce3f9", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "outputs": [], "source": [ "con = duckdb.connect()" @@ -24,55 +44,65 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 9, "id": "e02f2416-fd16-444b-8fd4-eec2cecee5a7", - "metadata": {}, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 36, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "con.execute(\"\"\"\n", - "DESCRIBE '/data/tables/output/en/testing/36100670.parquet';\n", + "DESCRIBE '/data/tables/output/en/11100025.parquet';\n", "\"\"\")" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 10, "id": "fafa7ce7-8619-4951-8c73-7bfbc66dc92f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[('REF_DATE', 'DATE', 'YES', None, None, None),\n", + "[('REF_DATE', 'VARCHAR', 'YES', None, None, None),\n", + " ('REF_START_DATE', 'TIMESTAMP_NS', 'YES', None, None, None),\n", + " ('REF_END_DATE', 'TIMESTAMP_NS', 'YES', None, None, None),\n", " ('GEO', 'VARCHAR', 'YES', None, None, None),\n", " ('DGUID', 'VARCHAR', 'YES', None, None, None),\n", - " ('Seasonality', 'VARCHAR', 'YES', None, None, None),\n", - " ('Selected credit estimates', 'VARCHAR', 'YES', None, None, None),\n", + " ('Selected characteristics', 'VARCHAR', 'YES', None, None, None),\n", + " ('Low income threshold', 'VARCHAR', 'YES', None, None, None),\n", + " ('Years in low income', 'VARCHAR', 'YES', None, None, None),\n", + " ('Statistics', 'VARCHAR', 'YES', None, None, None),\n", " ('UOM', 'VARCHAR', 'YES', None, None, None),\n", - " ('UOM_ID', 'TINYINT', 'YES', None, None, None),\n", + " ('UOM_ID', 'SMALLINT', 'YES', None, None, None),\n", " ('SCALAR_FACTOR', 'VARCHAR', 'YES', None, None, None),\n", " ('SCALAR_ID', 'TINYINT', 'YES', None, None, None),\n", " ('VECTOR', 'VARCHAR', 'YES', None, None, None),\n", " ('COORDINATE', 'VARCHAR', 'YES', None, None, None),\n", - " ('VALUE', 'INTEGER', 'YES', None, None, None),\n", + " ('VALUE', 'DOUBLE', 'YES', None, None, None),\n", " ('STATUS', 'VARCHAR', 'YES', None, None, None),\n", - " ('SYMBOL', 'TINYINT', 'YES', None, None, None),\n", - " ('TERMINATED', 'TINYINT', 'YES', None, None, None),\n", + " ('SYMBOL', 'VARCHAR', 'YES', None, None, None),\n", + " ('TERMINATED', 'VARCHAR', 'YES', None, None, None),\n", " ('DECIMALS', 'TINYINT', 'YES', None, None, None)]" ] }, - "execution_count": 37, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -83,62 +113,205 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 5, "id": "a4ed2881-91b7-4473-b246-a969ef59efba", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d937312a19f44fc2a8f87bcae8d0faca", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "con.execute(\"SELECT DISTINCT REF_DATE FROM '/data/tables/output/en/testing/36100670.parquet' ORDER BY REF_DATE\").pl()" + "#con.execute(\"SELECT DISTINCT REF_DATE FROM '/data/tables/output/en/testing/36100670.parquet' ORDER BY REF_DATE\").pl()" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 41, "id": "f400feee-efb6-421a-b518-1f9c0fc21bcb", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c2edc671f3014e4aae90a2a5b0577bfa", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], "source": [ - "con.execute(\"\"\"\n", - "SELECT REF_DATE, DGUID, Seasonality, 'Selected credit estimates', VALUE\n", - "FROM '/data/tables/output/en/testing/36100670.parquet'\n", - "WHERE REF_DATE > DATE '2018-01-01'\n", + "test = con.execute(\"\"\"\n", + "SELECT DISTINCT substring(DGUID, 1, 9) AS table, GEO\n", + "FROM '/data/tables/output/en/11100025.parquet'\n", "\"\"\").pl()" ] }, + { + "cell_type": "code", + "execution_count": 42, + "id": "a187c850-981e-4348-a57d-2f25e57cf9db", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f5c04f09c7134ecfa633b956711be823", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "test" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ff3eb520-c78b-4976-b734-840dc0fa53ab", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "SELECT *\n", + "FROM parquet_file_metadata('/data/tables/output/en/43100011.parquet');\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "30c8fc7e-cafd-43ec-a58f-aed4128df594", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7dc3ff8ea49441a5a548b223de7c823f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[('/data/tables/output/en/43100011.parquet',\n", + " 'parquet-cpp-arrow version 20.0.0',\n", + " 172033460,\n", + " 165,\n", + " 2,\n", + " None,\n", + " None)]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "10f3238b-36a6-4033-a299-fc04190e9d63", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "SELECT DISTINCT DGUID, GEO\n", + "FROM read_parquet('/data/tables/output/en/*.parquet')\n", + "ORDER BY DGUID ASC\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "dcf81727-340c-4bcc-aec3-7133e0010eda", + "metadata": {}, + "outputs": [], + "source": [ + "distinct_dguid_geo = con.fetch_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0d2f2de4-e033-4181-aaf5-9e8c3fd335b1", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ff5226caa7f947c593bf41fc302a2e52", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "BuckarooInfiniteWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], …" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "distinct_dguid_geo" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "e2e00108-bc8f-4b8b-91e0-f7bf9b44c59d", + "id": "cfd1621f-9c4e-46ba-a64e-1c191ff27b35", "metadata": {}, "outputs": [], "source": []