diff --git a/experiments/statcan_products/date_parsing.ipynb b/experiments/statcan_products/date_parsing.ipynb new file mode 100644 index 0000000..27214a1 --- /dev/null +++ b/experiments/statcan_products/date_parsing.ipynb @@ -0,0 +1,108 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e4d2c52d-38be-4f84-a0bf-6bd8cb577ad9", + "metadata": {}, + "source": [ + "# Purpose\n", + "I need to find out what all possible date formats are for the \"REF_DATE\" field so that when I write the parquet file people will be able to filter on it\n", + "\n", + "These are all the dates I have encountered:\n", + "- Just the year. Example found on productId 36100608: 2024\n", + "- Year and month. Example found on productId 14100443: 2024-07\n", + "- Year, month, date. Example found on productId 33100036: 2025-06-17\n", + "- Range. Example found on productId 17100022: 2013/2014" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "43916131-250b-4fdf-9c75-39024822529c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "from multiprocessing import Pool\n", + "import sqlite3\n", + "import polars as pl\n", + "\n", + "data_folder = \"/data/tables\"\n", + "input_folder = f\"{data_folder}/input\"\n", + "scratch_folder = f\"{data_folder}/scratch\"\n", + "output_folder = f\"{data_folder}/output\"\n", + "\n", + "con = sqlite3.connect(f\"{data_folder}/dates.db\")\n", + "cur = con.cursor()\n", + "cur.executescript(\"\"\"\n", + " DROP TABLE IF EXISTS dates;\n", + " CREATE TABLE IF NOT EXISTS dates (\n", + " product_id TEXT,\n", + " date TEXT\n", + " );\n", + "\"\"\")\n", + "con.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "553d58ff-81a4-45e6-a96e-6ea9ebb166fe", + "metadata": {}, + "outputs": [], + "source": [ + "def find_unique_dates(filepath):\n", + " \"\"\"\n", + " Finds unique dates for a table and writes to SQLite table\n", + " \"\"\"\n", + " product_id = filepath.split(\"/\")[-1].split(\".parquet\")[0]\n", + " print(f\"Processing {product_id}\")\n", + " try:\n", + " result = (\n", + " pl.scan_parquet(filepath)\n", + " .select(\"REF_DATE\")\n", + " .collect()\n", + " )\n", + " except Exception:\n", + " return\n", + " unique_dates = [(product_id, x) for x in result['REF_DATE'].unique(maintain_order=True).to_list()]\n", + " cur.executemany(\"INSERT INTO dates VALUES(?, ?)\", unique_dates)\n", + " con.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "507c069d-472f-44a9-872b-e690225fad17", + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == '__main__':\n", + " files_to_process = glob.glob(f\"{output_folder}/en/*.parquet\")\n", + " with Pool() as p:\n", + " p.map(find_unique_dates, files_to_process)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/statcan_products/date_parsing.py b/experiments/statcan_products/date_parsing.py new file mode 100644 index 0000000..a757d9a --- /dev/null +++ b/experiments/statcan_products/date_parsing.py @@ -0,0 +1,47 @@ +""" +I need to find out what all possible date formats are for the "REF_DATE" field so that when I write the parquet file people will be able to filter on it +""" + +import glob +from multiprocessing import Pool +import sqlite3 +import polars as pl + +data_folder = "/data/tables" +input_folder = f"{data_folder}/input" +scratch_folder = f"{data_folder}/scratch" +output_folder = f"{data_folder}/output" + +con = sqlite3.connect(f"{data_folder}/dates.db") +cur = con.cursor() +cur.executescript(""" + DROP TABLE IF EXISTS dates; + CREATE TABLE IF NOT EXISTS dates ( + product_id TEXT, + date TEXT + ); +""") +con.commit() + +def find_unique_dates(filepath): + """ + Finds unique dates for a table and writes to SQLite table + """ + product_id = filepath.split("/")[-1].split(".parquet")[0] + print(f"Processing {product_id}") + try: + result = ( + pl.scan_parquet(filepath) + .select("REF_DATE") + .collect() + ) + except Exception: + return + unique_dates = [(product_id, x) for x in result['REF_DATE'].unique(maintain_order=True).to_list()] + cur.executemany("INSERT INTO dates VALUES(?, ?)", unique_dates) + con.commit() + +if __name__ == '__main__': + files_to_process = glob.glob(f"{output_folder}/en/other/en/*.parquet") + with Pool() as p: + p.map(find_unique_dates, files_to_process) \ No newline at end of file diff --git a/experiments/statcan_products/duck_experiment.ipynb b/experiments/statcan_products/duck_experiment.ipynb new file mode 100644 index 0000000..6666b94 --- /dev/null +++ b/experiments/statcan_products/duck_experiment.ipynb @@ -0,0 +1,168 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 41, + "id": "5e04e469-d3eb-42ca-b548-5e6f1fa6af9d", + "metadata": {}, + "outputs": [], + "source": [ + "import buckaroo\n", + "import duckdb\n", + "import polars as pl" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "97c8e92b-21e4-4cc5-8dbe-7b42361ce3f9", + "metadata": {}, + "outputs": [], + "source": [ + "con = duckdb.connect()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "e02f2416-fd16-444b-8fd4-eec2cecee5a7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "DESCRIBE '/data/tables/output/en/testing/36100670.parquet';\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "fafa7ce7-8619-4951-8c73-7bfbc66dc92f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('REF_DATE', 'DATE', 'YES', None, None, None),\n", + " ('GEO', 'VARCHAR', 'YES', None, None, None),\n", + " ('DGUID', 'VARCHAR', 'YES', None, None, None),\n", + " ('Seasonality', 'VARCHAR', 'YES', None, None, None),\n", + " ('Selected credit estimates', 'VARCHAR', 'YES', None, None, None),\n", + " ('UOM', 'VARCHAR', 'YES', None, None, None),\n", + " ('UOM_ID', 'TINYINT', 'YES', None, None, None),\n", + " ('SCALAR_FACTOR', 'VARCHAR', 'YES', None, None, None),\n", + " ('SCALAR_ID', 'TINYINT', 'YES', None, None, None),\n", + " ('VECTOR', 'VARCHAR', 'YES', None, None, None),\n", + " ('COORDINATE', 'VARCHAR', 'YES', None, None, None),\n", + " ('VALUE', 'INTEGER', 'YES', None, None, None),\n", + " ('STATUS', 'VARCHAR', 'YES', None, None, None),\n", + " ('SYMBOL', 'TINYINT', 'YES', None, None, None),\n", + " ('TERMINATED', 'TINYINT', 'YES', None, None, None),\n", + " ('DECIMALS', 'TINYINT', 'YES', None, None, None)]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "a4ed2881-91b7-4473-b246-a969ef59efba", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d937312a19f44fc2a8f87bcae8d0faca", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "con.execute(\"SELECT DISTINCT REF_DATE FROM '/data/tables/output/en/testing/36100670.parquet' ORDER BY REF_DATE\").pl()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "f400feee-efb6-421a-b518-1f9c0fc21bcb", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c2edc671f3014e4aae90a2a5b0577bfa", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "SELECT REF_DATE, DGUID, Seasonality, 'Selected credit estimates', VALUE\n", + "FROM '/data/tables/output/en/testing/36100670.parquet'\n", + "WHERE REF_DATE > DATE '2018-01-01'\n", + "\"\"\").pl()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2e00108-bc8f-4b8b-91e0-f7bf9b44c59d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/statcan_products/parse_dates_properly.ipynb b/experiments/statcan_products/parse_dates_properly.ipynb new file mode 100644 index 0000000..bbf9d65 --- /dev/null +++ b/experiments/statcan_products/parse_dates_properly.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d287c593-ab51-4042-b7a6-634916f7075f", + "metadata": {}, + "source": [ + "# Purpose\n", + "I need to find out what all possible date formats are for the \"REF_DATE\" field so that when I write the parquet file people will be able to filter on it\n", + "\n", + "These are all the dates I have encountered:\n", + "- Just the year. Example found on productId 36100608: 2024\n", + "- Year and month. Example found on productId 14100443: 2024-07\n", + "- Year, month, date. Example found on productId 33100036: 2025-06-17\n", + "- Range. Example found on productId 17100022: 2013/2014" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "57b802e1-ce9c-4e2e-abf5-aa29ba7e77f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 16)
REF_DATEGEODGUIDRestriction levelVaccination statusUOMUOM_IDSCALAR_FACTORSCALAR_IDVECTORCOORDINATEVALUESTATUSSYMBOLTERMINATEDDECIMALS
strstrstrstrstrstri16stri8strstrf64i8i8i8i8
"2020-01-01""Newfoundland and Labrador""2016A000210""Restriction index""Total population""Index"160"units"0"v1331468081""1.1.1"1.67nullnullnull2
"2020-01-01""Newfoundland and Labrador""2016A000210""Restriction index""Vaccinated persons""Index"160"units"0"v1331468097""1.1.2"1.67nullnullnull2
"2020-01-01""Newfoundland and Labrador""2016A000210""Restriction index""Unvaccinated persons""Index"160"units"0"v1331468113""1.1.3"1.67nullnullnull2
"2020-01-01""Newfoundland and Labrador""2016A000210""School closing""Total population""Index"160"units"0"v1331468082""1.2.1"0.0nullnullnull2
"2020-01-01""Newfoundland and Labrador""2016A000210""School closing""Vaccinated persons""Index"160"units"0"v1331468098""1.2.2"0.0nullnullnull2
" + ], + "text/plain": [ + "shape: (5, 16)\n", + "┌────────────┬─────────────┬────────────┬────────────┬───┬────────┬────────┬────────────┬──────────┐\n", + "│ REF_DATE ┆ GEO ┆ DGUID ┆ Restrictio ┆ … ┆ STATUS ┆ SYMBOL ┆ TERMINATED ┆ DECIMALS │\n", + "│ --- ┆ --- ┆ --- ┆ n level ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ --- ┆ ┆ i8 ┆ i8 ┆ i8 ┆ i8 │\n", + "│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n", + "╞════════════╪═════════════╪════════════╪════════════╪═══╪════════╪════════╪════════════╪══════════╡\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└────────────┴─────────────┴────────────┴────────────┴───┴────────┴────────┴────────────┴──────────┘" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "\n", + "data_folder = \"/data/tables\"\n", + "input_folder = f\"{data_folder}/input\"\n", + "scratch_folder = f\"{data_folder}/scratch\"\n", + "output_folder = f\"{data_folder}/output\"\n", + "\n", + "def normalize_ref_date(series):\n", + " # Always cast to string first\n", + " series = series.cast(pl.Utf8)\n", + " \n", + " # Try parsing as full date (YYYY-MM-DD)\n", + " full = series.str.strptime(pl.Date, \"%Y-%m-%d\", strict=False)\n", + "\n", + " # For nulls, try parsing as year-month (YYYY-MM)\n", + " ym = series.str.strptime(pl.Date, \"%Y-%m\", strict=False).dt.replace(day=1)\n", + " full = full.fill_null(ym)\n", + "\n", + " # For remaining nulls, try just year (YYYY)\n", + " y = series.str.strptime(pl.Date, \"%Y\", strict=False).dt.replace(month=1, day=1)\n", + " full = full.fill_null(y)\n", + "\n", + " return full\n", + "\n", + "filepath = f\"{output_folder}/en/other/en/33100496.parquet\"\n", + "df = pl.read_parquet(filepath)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "6e0bb9eb-0638-48c6-9308-bc5a846ff27f", + "metadata": {}, + "outputs": [], + "source": [ + "skip_calculating_ref_date = False\n", + "if df.schema[\"REF_DATE\"] == pl.String:\n", + " if df[\"REF_DATE\"].str.contains(\"/\").any():\n", + " # Skip the calculating of the field\n", + " skip_calculating_ref_date = True" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "5f93ba75-5c1c-4a7f-b526-89d7589f3834", + "metadata": {}, + "outputs": [], + "source": [ + "if skip_calculating_ref_date == False:\n", + " df = df.with_columns([\n", + " normalize_ref_date(pl.col(\"REF_DATE\")).alias(\"REF_DATE\")\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "d54d297c-ae68-4a44-96c6-457873ea2d10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 16)
REF_DATEGEODGUIDRestriction levelVaccination statusUOMUOM_IDSCALAR_FACTORSCALAR_IDVECTORCOORDINATEVALUESTATUSSYMBOLTERMINATEDDECIMALS
datestrstrstrstrstri16stri8strstrf64i8i8i8i8
2020-01-01"Newfoundland and Labrador""2016A000210""Restriction index""Total population""Index"160"units"0"v1331468081""1.1.1"1.67nullnullnull2
2020-01-01"Newfoundland and Labrador""2016A000210""Restriction index""Vaccinated persons""Index"160"units"0"v1331468097""1.1.2"1.67nullnullnull2
2020-01-01"Newfoundland and Labrador""2016A000210""Restriction index""Unvaccinated persons""Index"160"units"0"v1331468113""1.1.3"1.67nullnullnull2
2020-01-01"Newfoundland and Labrador""2016A000210""School closing""Total population""Index"160"units"0"v1331468082""1.2.1"0.0nullnullnull2
2020-01-01"Newfoundland and Labrador""2016A000210""School closing""Vaccinated persons""Index"160"units"0"v1331468098""1.2.2"0.0nullnullnull2
" + ], + "text/plain": [ + "shape: (5, 16)\n", + "┌────────────┬─────────────┬────────────┬────────────┬───┬────────┬────────┬────────────┬──────────┐\n", + "│ REF_DATE ┆ GEO ┆ DGUID ┆ Restrictio ┆ … ┆ STATUS ┆ SYMBOL ┆ TERMINATED ┆ DECIMALS │\n", + "│ --- ┆ --- ┆ --- ┆ n level ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ date ┆ str ┆ str ┆ --- ┆ ┆ i8 ┆ i8 ┆ i8 ┆ i8 │\n", + "│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n", + "╞════════════╪═════════════╪════════════╪════════════╪═══╪════════╪════════╪════════════╪══════════╡\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", + "│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n", + "│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└────────────┴─────────────┴────────────┴────────────┴───┴────────┴────────┴────────────┴──────────┘" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c7a9dd4-5001-4174-9074-8cb14721f79c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/statcan_products/process_product_test.ipynb b/experiments/statcan_products/process_product_test.ipynb index 089b9e6..9362329 100644 --- a/experiments/statcan_products/process_product_test.ipynb +++ b/experiments/statcan_products/process_product_test.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 188, + "execution_count": 209, "id": "98859cd6-6fa4-4aef-a113-455699524fae", "metadata": {}, "outputs": [], @@ -39,8 +39,8 @@ "output_folder = f\"{data_folder}/output\"\n", "\n", "\n", - "if not os.path.exists(\"processing.db\"):\n", - " con = sqlite3.connect('processing.db')\n", + "if not os.path.exists(f\"{data_folder}/processing.db\"):\n", + " con = sqlite3.connect(f\"{data_folder}/processing.db\")\n", " cur = con.cursor()\n", " cur.executescript(\"\"\"\n", " CREATE TABLE IF NOT EXISTS downloaded (\n", @@ -55,13 +55,13 @@ " \"\"\")\n", " con.commit()\n", "else:\n", - " con = sqlite3.connect('processing.db')\n", + " con = sqlite3.connect(f\"{data_folder}/processing.db\")\n", " cur = con.cursor()" ] }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 210, "id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591", "metadata": {}, "outputs": [], @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 211, "id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f", "metadata": {}, "outputs": [], @@ -93,12 +93,17 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 212, "id": "0af9a4b3-7b59-460b-b933-504919d4bd2a", "metadata": {}, "outputs": [], "source": [ "def update_last_downloaded(product_id):\n", + " \"\"\"\n", + " Updates SQLite database with the last time the table was updated\n", + " The datetime is in Eastern timezone, so have to convert to UTC to\n", + " be consistent with https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n", + " \"\"\"\n", " filepath = f\"{input_folder}/metadata/{product_id}.json\"\n", " print(f\"Reading metadata {filepath}\")\n", " with open(filepath, 'r') as fp:\n", @@ -123,7 +128,20 @@ }, { "cell_type": "code", - "execution_count": 170, + "execution_count": 213, + "id": "4b7996d2-75ab-4173-a17a-64fb7ab63740", + "metadata": {}, + "outputs": [], + "source": [ + "def update_last_processed(product_id):\n", + " time_finished_processing = datetime.now().isoformat()\n", + " cur.execute(\"UPDATE downloaded SET last_processed = ? WHERE product_id = ?\", (time_finished_processing, product_id))\n", + " con.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 214, "id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b", "metadata": {}, "outputs": [], @@ -166,159 +184,23 @@ " for result in results:\n", " product_id = result[0]\n", " print(f\"Updating product_id: {product_id}\")\n", + " download_cube(product_id)\n", " process_cube(product_id)" ] }, { "cell_type": "code", - "execution_count": 186, - "id": "c0d46f38-242c-4685-b8b6-4e046c23aec5", + "execution_count": 215, + "id": "dc5573ef-734b-44d8-a4c4-0df19d655975", "metadata": {}, "outputs": [], "source": [ - "cur.execute(\"\"\"\n", - " SELECT a.product_id, a.last_updated, b.last_updated\n", - " FROM downloaded AS a,\n", - " cubes AS b\n", - " WHERE a.product_id = b.product_id\n", - " AND b.last_updated > a.last_updated\n", - "\"\"\")\n", - "difference = pd.DataFrame(cur.fetchall(), columns=[\"product_id\", \"release_time_metadata\", \"release_time_cubelist\"])" + "#update_tables()" ] }, { "cell_type": "code", - "execution_count": 189, - "id": "a5e12a7a-9891-4d58-9368-e31b274feb1d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
product_idrelease_time_metadatarelease_time_cubelist
0101000062025-05-21T12:30:00+00:002025-06-17T12:30:00+00:00
1101001082025-05-13T12:30:00+00:002025-06-12T12:30:00+00:00
2101001322025-06-06T12:30:00+00:002025-06-13T12:30:00+00:00
3101001362025-06-10T12:30:00+00:002025-06-16T12:30:00+00:00
4101001382025-06-06T12:30:00+00:002025-06-13T12:30:00+00:00
............
92381002352025-03-13T12:30:00+00:002025-06-12T12:30:00+00:00
93381002362025-03-13T12:30:00+00:002025-06-12T12:30:00+00:00
94381002372025-03-13T12:30:00+00:002025-06-12T12:30:00+00:00
95381002382025-03-13T12:30:00+00:002025-06-12T12:30:00+00:00
96381001642023-09-13T12:30:00+00:002025-06-17T12:30:00+00:00
\n", - "

97 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " product_id release_time_metadata release_time_cubelist\n", - "0 10100006 2025-05-21T12:30:00+00:00 2025-06-17T12:30:00+00:00\n", - "1 10100108 2025-05-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n", - "2 10100132 2025-06-06T12:30:00+00:00 2025-06-13T12:30:00+00:00\n", - "3 10100136 2025-06-10T12:30:00+00:00 2025-06-16T12:30:00+00:00\n", - "4 10100138 2025-06-06T12:30:00+00:00 2025-06-13T12:30:00+00:00\n", - ".. ... ... ...\n", - "92 38100235 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n", - "93 38100236 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n", - "94 38100237 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n", - "95 38100238 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n", - "96 38100164 2023-09-13T12:30:00+00:00 2025-06-17T12:30:00+00:00\n", - "\n", - "[97 rows x 3 columns]" - ] - }, - "execution_count": 189, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "difference" - ] - }, - { - "cell_type": "code", - "execution_count": 193, + "execution_count": 228, "id": "144e3716-b0e7-4a39-9a25-ededea506f4f", "metadata": {}, "outputs": [], @@ -368,6 +250,9 @@ " return result\n", "\n", "def download_cube(product_id, language=\"en\"):\n", + " \"\"\"\n", + " Downloads the English CSV for a specific table\n", + " \"\"\"\n", " download_url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/en\"\n", " response = requests.get(download_url).json()\n", " zip_url = response['object']\n", @@ -388,6 +273,11 @@ " progress_bar.close()\n", " \n", "def process_cube(product_id, language=\"en\"):\n", + " cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n", + " result = cur.fetchone()\n", + " if result:\n", + " print(f\"Already processed {product_id}\")\n", + " return\n", " extract_zipfile(product_id, language)\n", " \"\"\"\n", " The pandas column reader is better than the Polars one\n", @@ -395,12 +285,11 @@ " https://www150.statcan.gc.ca/n1/tbl/csv/98100404-eng.zip\n", " \"\"\"\n", " # Get metadata\n", - " metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n", - " metadata = get_cube_metadata(product_id)\n", - " print(f\"Writing metadata file {metadata_file}\")\n", - " with open(metadata_file, \"w\") as outfile:\n", - " json.dump(metadata, outfile)\n", - " update_last_downloaded(product_id)\n", + " #metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n", + " #metadata = get_cube_metadata(product_id)\n", + " #print(f\"Writing metadata file {metadata_file}\")\n", + " #with open(metadata_file, \"w\") as outfile:\n", + " # json.dump(metadata, outfile)\n", " # Read CSV using Pandas\n", " product_csv = f\"{scratch_folder}/{product_id}.csv\"\n", " parameters = {\n", @@ -410,22 +299,32 @@ " print(f\"Reading {product_csv} as a Pandas dataframe\")\n", " df = pd.read_csv(product_csv, **parameters)\n", " df = convert_to_lowest_type(df)\n", + " # Fix mixed types. This happened in productId 11100147\n", + " for col in df.select_dtypes(include=[\"object\"]).columns:\n", + " types_in_col = df[col].map(type).unique()\n", + " # If there's any non-str type in the column\n", + " if any(t != str for t in types_in_col):\n", + " print(f\"Fixing mixed types in column: {col} — types found: {types_in_col}\")\n", + " # Replace non-str values with None\n", + " df[col] = df[col].where(df[col].map(type) == str, None).astype(\"string\")\n", " print(\"Import Pandas dataframe as a Polars dataframe\")\n", " df = pl.from_pandas(df)\n", " output_parquet = f\"{output_folder}/{language}/{product_id}.parquet\"\n", " print(f\"Exporting dataframe as parquet to {output_parquet}\")\n", - " df.write_parquet(f\"{output_folder}/{language}/{product_id}.parquet\",\n", + " df.write_parquet(output_parquet,\n", " compression='zstd',\n", " compression_level=22)\n", " # Remove the scratch files\n", " print(\"Removing scratch files\")\n", " os.remove(f\"{scratch_folder}/{product_id}.csv\")\n", - " os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\") \n" + " os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\")\n", + " update_last_downloaded(product_id)\n", + " update_last_processed(product_id)" ] }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 229, "id": "6be0842e-c1e0-45b4-90e5-0be28b963aed", "metadata": {}, "outputs": [ @@ -433,54 +332,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Extracting /data/tables/input/en/10100111.zip to /data/tables/scratch\n", - "Writing metadata file /data/tables/input/metadata/10100111.json\n", - "Reading metadata /data/tables/input/metadata/10100111.json\n", - "('10100111', '2025-06-18T12:30:00+00:00')\n", - "Reading /data/tables/scratch/10100111.csv as a Pandas dataframe\n", - "Converting dataframe to optimal data types\n", - "Import Pandas dataframe as a Polars dataframe\n", - "Exporting dataframe as parquet to /data/tables/output/en/10100111.parquet\n", - "Removing scratch files\n" + "Already processed 11100147\n" ] } ], "source": [ - "process_cube(\"10100111\")" + "process_cube(\"11100147\")" ] - }, - { - "cell_type": "code", - "execution_count": 194, - "id": "8a193195-bfa6-4df3-adab-0f88025276da", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100164-eng.zip to /data/tables/input/en/38100164.zip\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data/tables/input/en/38100164.zip: 100%|██████████████████| 980k/980k [00:00<00:00, 1.27MB/s]\n" - ] - } - ], - "source": [ - "download_cube(\"38100164\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc4ae317-3e3e-4c27-a2fe-7e6bd12f673f", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {