diff --git a/experiments/statcan_products/duck_experiment.ipynb b/experiments/statcan_products/duck_experiment.ipynb index 1950421..b26e36b 100644 --- a/experiments/statcan_products/duck_experiment.ipynb +++ b/experiments/statcan_products/duck_experiment.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 12, "id": "5e04e469-d3eb-42ca-b548-5e6f1fa6af9d", "metadata": { "editable": true, @@ -11,15 +11,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n" - ] - } - ], + "outputs": [], "source": [ "import buckaroo\n", "import duckdb\n", @@ -28,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "id": "97c8e92b-21e4-4cc5-8dbe-7b42361ce3f9", "metadata": { "editable": true, @@ -44,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "id": "e02f2416-fd16-444b-8fd4-eec2cecee5a7", "metadata": { "editable": true, @@ -57,10 +49,10 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 9, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -73,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "id": "fafa7ce7-8619-4951-8c73-7bfbc66dc92f", "metadata": {}, "outputs": [ @@ -102,7 +94,7 @@ " ('DECIMALS', 'TINYINT', 'YES', None, None, None)]" ] }, - "execution_count": 10, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -113,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 16, "id": "a4ed2881-91b7-4473-b246-a969ef59efba", "metadata": {}, "outputs": [], @@ -123,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 17, "id": "f400feee-efb6-421a-b518-1f9c0fc21bcb", "metadata": { "editable": true, @@ -142,7 +134,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 18, "id": "a187c850-981e-4348-a57d-2f25e57cf9db", "metadata": { "editable": true, @@ -155,7 +147,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f5c04f09c7134ecfa633b956711be823", + "model_id": "ea3f2ee32a4d4ba1970f9f373b86fc02", "version_major": 2, "version_minor": 1 }, @@ -173,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 19, "id": "ff3eb520-c78b-4976-b734-840dc0fa53ab", "metadata": { "editable": true, @@ -184,14 +176,15 @@ }, "outputs": [ { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" + "ename": "IOException", + "evalue": "IO Error: No files found that match the pattern \"/data/tables/output/en/43100011.parquet\"", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mIOException\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[19]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mcon\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\"\"\u001b[39;49m\n\u001b[32m 2\u001b[39m \u001b[33;43mSELECT *\u001b[39;49m\n\u001b[32m 3\u001b[39m \u001b[33;43mFROM parquet_file_metadata(\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m/data/tables/output/en/43100011.parquet\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m);\u001b[39;49m\n\u001b[32m 4\u001b[39m \u001b[33;43m\"\"\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[31mIOException\u001b[39m: IO Error: No files found that match the pattern \"/data/tables/output/en/43100011.parquet\"" + ] } ], "source": [ @@ -203,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 20, "id": "30c8fc7e-cafd-43ec-a58f-aed4128df594", "metadata": { "editable": true, @@ -214,34 +207,15 @@ }, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7dc3ff8ea49441a5a548b223de7c823f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[('/data/tables/output/en/43100011.parquet',\n", - " 'parquet-cpp-arrow version 20.0.0',\n", - " 172033460,\n", - " 165,\n", - " 2,\n", - " None,\n", - " None)]" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" + "ename": "InvalidInputException", + "evalue": "Invalid Input Error: No open result set", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mInvalidInputException\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[20]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mcon\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfetchall\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[31mInvalidInputException\u001b[39m: Invalid Input Error: No open result set" + ] } ], "source": [ @@ -250,17 +224,17 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 21, "id": "10f3238b-36a6-4033-a299-fc04190e9d63", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 9, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -275,10 +249,25 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 22, "id": "dcf81727-340c-4bcc-aec3-7133e0010eda", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1298d824f74c4fc59151d5b746d8e864", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "distinct_dguid_geo = con.fetch_df()" ] @@ -310,9 +299,256 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "cfd1621f-9c4e-46ba-a64e-1c191ff27b35", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "SELECT DISTINCT REF_DATE, REF_START_DATE, REF_END_DATE\n", + "FROM read_parquet('/data/tables/output/en/*.parquet')\n", + "ORDER BY REF_START_DATE ASC\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "5b5fed59-b900-47a0-8ab5-15e557f02dbf", + "metadata": {}, + "outputs": [], + "source": [ + "distinct_ref_date = con.fetch_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "41fbfb30-a471-4304-a985-9367ab589107", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0eaef40bea7f4a29810b29c1648f0ec1", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "BuckarooInfiniteWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], …" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "distinct_ref_date" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "51a6663f-55e8-45c0-ab0e-dc85a5f69278", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "SELECT * FROM parquet_file_metadata('/data/tables/output/en/*.parquet')\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "07fc9ad3-1512-48cc-975f-5a9e7f633d30", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "02c94fd7d5d245baa91972617f80026a", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "BuckarooInfiniteWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], …" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "con.fetch_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "2c01e753-5c54-403a-be2e-c65d3d4bdc89", + "metadata": {}, + "outputs": [], + "source": [ + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "fa16862e-60b6-4ae7-a8f2-55bdcc5edb68", + "metadata": {}, + "outputs": [], + "source": [ + "output_parquet_files = glob.glob(\"/data/tables/output/en/*.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "e5edf922-f945-413e-8aca-2929eb6b8d2f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/data/tables/output/en/10100164.parquet\n", + "/data/tables/output/en/23100049.parquet\n", + "/data/tables/output/en/23100050.parquet\n", + "/data/tables/output/en/36100658.parquet\n", + "/data/tables/output/en/36100374.parquet\n", + "/data/tables/output/en/36100396.parquet\n", + "/data/tables/output/en/36100397.parquet\n", + "/data/tables/output/en/38100104.parquet\n", + "/data/tables/output/en/38100105.parquet\n" + ] + } + ], + "source": [ + "for parquet_file in output_parquet_files:\n", + " con.execute(f\"DESCRIBE '{parquet_file}'\")\n", + " value = con.fetch_df()\n", + " length_value_column = len(value[value['column_name'] == 'VALUE'][['column_name', 'column_type']])\n", + " if length_value_column == 0:\n", + " print(parquet_file) " + ] + }, + { + "cell_type": "markdown", + "id": "2f83e0e6-6126-44ca-974b-db92a97472cf", + "metadata": {}, + "source": [ + "# Issue because there's a `Value` and `VALUE` columns in the CSV\n", + "`Value` and `VALUE` columns end up as `Value` and `VALUE_1` columns" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "fc588dd7-a0d2-4b79-9c27-a6502ff99053", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('REF_DATE', 'VARCHAR', 'YES', None, None, None),\n", + " ('REF_START_DATE', 'TIMESTAMP_NS', 'YES', None, None, None),\n", + " ('REF_END_DATE', 'TIMESTAMP_NS', 'YES', None, None, None),\n", + " ('GEO', 'VARCHAR', 'YES', None, None, None),\n", + " ('DGUID', 'VARCHAR', 'YES', None, None, None),\n", + " ('Value', 'VARCHAR', 'YES', None, None, None),\n", + " ('Type of cannabis', 'VARCHAR', 'YES', None, None, None),\n", + " ('UOM', 'VARCHAR', 'YES', None, None, None),\n", + " ('UOM_ID', 'SMALLINT', 'YES', None, None, None),\n", + " ('SCALAR_FACTOR', 'VARCHAR', 'YES', None, None, None),\n", + " ('SCALAR_ID', 'TINYINT', 'YES', None, None, None),\n", + " ('VECTOR', 'VARCHAR', 'YES', None, None, None),\n", + " ('COORDINATE', 'VARCHAR', 'YES', None, None, None),\n", + " ('VALUE_1', 'BIGINT', 'YES', None, None, None),\n", + " ('STATUS', 'VARCHAR', 'YES', None, None, None),\n", + " ('SYMBOL', 'VARCHAR', 'YES', None, None, None),\n", + " ('TERMINATED', 'VARCHAR', 'YES', None, None, None),\n", + " ('DECIMALS', 'TINYINT', 'YES', None, None, None)]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"DESCRIBE '/data/tables/output/en/10100164.parquet'\")\n", + "con.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "fef3d2e2-91c5-4fdf-bb00-3928c8f52d11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('REF_DATE', 'VARCHAR', 'YES', None, None, None),\n", + " ('REF_START_DATE', 'TIMESTAMP_NS', 'YES', None, None, None),\n", + " ('REF_END_DATE', 'TIMESTAMP_NS', 'YES', None, None, None),\n", + " ('GEO', 'VARCHAR', 'YES', None, None, None),\n", + " ('DGUID', 'VARCHAR', 'YES', None, None, None),\n", + " ('Mainline companies', 'VARCHAR', 'YES', None, None, None),\n", + " ('Property accounts', 'VARCHAR', 'YES', None, None, None),\n", + " ('Value', 'VARCHAR', 'YES', None, None, None),\n", + " ('UOM', 'VARCHAR', 'YES', None, None, None),\n", + " ('UOM_ID', 'SMALLINT', 'YES', None, None, None),\n", + " ('SCALAR_FACTOR', 'VARCHAR', 'YES', None, None, None),\n", + " ('SCALAR_ID', 'TINYINT', 'YES', None, None, None),\n", + " ('VECTOR', 'VARCHAR', 'YES', None, None, None),\n", + " ('COORDINATE', 'VARCHAR', 'YES', None, None, None),\n", + " ('VALUE_1', 'BIGINT', 'YES', None, None, None),\n", + " ('STATUS', 'VARCHAR', 'YES', None, None, None),\n", + " ('SYMBOL', 'VARCHAR', 'YES', None, None, None),\n", + " ('TERMINATED', 'VARCHAR', 'YES', None, None, None),\n", + " ('DECIMALS', 'TINYINT', 'YES', None, None, None)]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"DESCRIBE '/data/tables/output/en/23100049.parquet'\")\n", + "con.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b472b408-4633-429d-824b-0d846db6e9db", + "metadata": {}, "outputs": [], "source": [] }