Continue work on processing data tables

This commit is contained in:
Diego Ripley
2025-06-19 15:58:30 -04:00
parent ab8f40c708
commit f6d88c5fd0
5 changed files with 589 additions and 201 deletions
@@ -0,0 +1,108 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e4d2c52d-38be-4f84-a0bf-6bd8cb577ad9",
"metadata": {},
"source": [
"# Purpose\n",
"I need to find out what all possible date formats are for the \"REF_DATE\" field so that when I write the parquet file people will be able to filter on it\n",
"\n",
"These are all the dates I have encountered:\n",
"- Just the year. Example found on productId 36100608: 2024\n",
"- Year and month. Example found on productId 14100443: 2024-07\n",
"- Year, month, date. Example found on productId 33100036: 2025-06-17\n",
"- Range. Example found on productId 17100022: 2013/2014"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "43916131-250b-4fdf-9c75-39024822529c",
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"from multiprocessing import Pool\n",
"import sqlite3\n",
"import polars as pl\n",
"\n",
"data_folder = \"/data/tables\"\n",
"input_folder = f\"{data_folder}/input\"\n",
"scratch_folder = f\"{data_folder}/scratch\"\n",
"output_folder = f\"{data_folder}/output\"\n",
"\n",
"con = sqlite3.connect(f\"{data_folder}/dates.db\")\n",
"cur = con.cursor()\n",
"cur.executescript(\"\"\"\n",
" DROP TABLE IF EXISTS dates;\n",
" CREATE TABLE IF NOT EXISTS dates (\n",
" product_id TEXT,\n",
" date TEXT\n",
" );\n",
"\"\"\")\n",
"con.commit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "553d58ff-81a4-45e6-a96e-6ea9ebb166fe",
"metadata": {},
"outputs": [],
"source": [
"def find_unique_dates(filepath):\n",
" \"\"\"\n",
" Finds unique dates for a table and writes to SQLite table\n",
" \"\"\"\n",
" product_id = filepath.split(\"/\")[-1].split(\".parquet\")[0]\n",
" print(f\"Processing {product_id}\")\n",
" try:\n",
" result = (\n",
" pl.scan_parquet(filepath)\n",
" .select(\"REF_DATE\")\n",
" .collect()\n",
" )\n",
" except Exception:\n",
" return\n",
" unique_dates = [(product_id, x) for x in result['REF_DATE'].unique(maintain_order=True).to_list()]\n",
" cur.executemany(\"INSERT INTO dates VALUES(?, ?)\", unique_dates)\n",
" con.commit()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "507c069d-472f-44a9-872b-e690225fad17",
"metadata": {},
"outputs": [],
"source": [
"if __name__ == '__main__':\n",
" files_to_process = glob.glob(f\"{output_folder}/en/*.parquet\")\n",
" with Pool() as p:\n",
" p.map(find_unique_dates, files_to_process)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -0,0 +1,47 @@
"""
I need to find out what all possible date formats are for the "REF_DATE" field so that when I write the parquet file people will be able to filter on it
"""
import glob
from multiprocessing import Pool
import sqlite3
import polars as pl
data_folder = "/data/tables"
input_folder = f"{data_folder}/input"
scratch_folder = f"{data_folder}/scratch"
output_folder = f"{data_folder}/output"
con = sqlite3.connect(f"{data_folder}/dates.db")
cur = con.cursor()
cur.executescript("""
DROP TABLE IF EXISTS dates;
CREATE TABLE IF NOT EXISTS dates (
product_id TEXT,
date TEXT
);
""")
con.commit()
def find_unique_dates(filepath):
"""
Finds unique dates for a table and writes to SQLite table
"""
product_id = filepath.split("/")[-1].split(".parquet")[0]
print(f"Processing {product_id}")
try:
result = (
pl.scan_parquet(filepath)
.select("REF_DATE")
.collect()
)
except Exception:
return
unique_dates = [(product_id, x) for x in result['REF_DATE'].unique(maintain_order=True).to_list()]
cur.executemany("INSERT INTO dates VALUES(?, ?)", unique_dates)
con.commit()
if __name__ == '__main__':
files_to_process = glob.glob(f"{output_folder}/en/other/en/*.parquet")
with Pool() as p:
p.map(find_unique_dates, files_to_process)
@@ -0,0 +1,168 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 41,
"id": "5e04e469-d3eb-42ca-b548-5e6f1fa6af9d",
"metadata": {},
"outputs": [],
"source": [
"import buckaroo\n",
"import duckdb\n",
"import polars as pl"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "97c8e92b-21e4-4cc5-8dbe-7b42361ce3f9",
"metadata": {},
"outputs": [],
"source": [
"con = duckdb.connect()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "e02f2416-fd16-444b-8fd4-eec2cecee5a7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f3bbc156230>"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.execute(\"\"\"\n",
"DESCRIBE '/data/tables/output/en/testing/36100670.parquet';\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "fafa7ce7-8619-4951-8c73-7bfbc66dc92f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('REF_DATE', 'DATE', 'YES', None, None, None),\n",
" ('GEO', 'VARCHAR', 'YES', None, None, None),\n",
" ('DGUID', 'VARCHAR', 'YES', None, None, None),\n",
" ('Seasonality', 'VARCHAR', 'YES', None, None, None),\n",
" ('Selected credit estimates', 'VARCHAR', 'YES', None, None, None),\n",
" ('UOM', 'VARCHAR', 'YES', None, None, None),\n",
" ('UOM_ID', 'TINYINT', 'YES', None, None, None),\n",
" ('SCALAR_FACTOR', 'VARCHAR', 'YES', None, None, None),\n",
" ('SCALAR_ID', 'TINYINT', 'YES', None, None, None),\n",
" ('VECTOR', 'VARCHAR', 'YES', None, None, None),\n",
" ('COORDINATE', 'VARCHAR', 'YES', None, None, None),\n",
" ('VALUE', 'INTEGER', 'YES', None, None, None),\n",
" ('STATUS', 'VARCHAR', 'YES', None, None, None),\n",
" ('SYMBOL', 'TINYINT', 'YES', None, None, None),\n",
" ('TERMINATED', 'TINYINT', 'YES', None, None, None),\n",
" ('DECIMALS', 'TINYINT', 'YES', None, None, None)]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con.fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "a4ed2881-91b7-4473-b246-a969ef59efba",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d937312a19f44fc2a8f87bcae8d0faca",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"con.execute(\"SELECT DISTINCT REF_DATE FROM '/data/tables/output/en/testing/36100670.parquet' ORDER BY REF_DATE\").pl()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "f400feee-efb6-421a-b518-1f9c0fc21bcb",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c2edc671f3014e4aae90a2a5b0577bfa",
"version_major": 2,
"version_minor": 1
},
"text/plain": [
"PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"con.execute(\"\"\"\n",
"SELECT REF_DATE, DGUID, Seasonality, 'Selected credit estimates', VALUE\n",
"FROM '/data/tables/output/en/testing/36100670.parquet'\n",
"WHERE REF_DATE > DATE '2018-01-01'\n",
"\"\"\").pl()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2e00108-bc8f-4b8b-91e0-f7bf9b44c59d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -0,0 +1,207 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d287c593-ab51-4042-b7a6-634916f7075f",
"metadata": {},
"source": [
"# Purpose\n",
"I need to find out what all possible date formats are for the \"REF_DATE\" field so that when I write the parquet file people will be able to filter on it\n",
"\n",
"These are all the dates I have encountered:\n",
"- Just the year. Example found on productId 36100608: 2024\n",
"- Year and month. Example found on productId 14100443: 2024-07\n",
"- Year, month, date. Example found on productId 33100036: 2025-06-17\n",
"- Range. Example found on productId 17100022: 2013/2014"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "57b802e1-ce9c-4e2e-abf5-aa29ba7e77f4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (5, 16)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>REF_DATE</th><th>GEO</th><th>DGUID</th><th>Restriction level</th><th>Vaccination status</th><th>UOM</th><th>UOM_ID</th><th>SCALAR_FACTOR</th><th>SCALAR_ID</th><th>VECTOR</th><th>COORDINATE</th><th>VALUE</th><th>STATUS</th><th>SYMBOL</th><th>TERMINATED</th><th>DECIMALS</th></tr><tr><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>i16</td><td>str</td><td>i8</td><td>str</td><td>str</td><td>f64</td><td>i8</td><td>i8</td><td>i8</td><td>i8</td></tr></thead><tbody><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Total population&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468081&quot;</td><td>&quot;1.1.1&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Vaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468097&quot;</td><td>&quot;1.1.2&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Unvaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468113&quot;</td><td>&quot;1.1.3&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;School closing&quot;</td><td>&quot;Total population&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468082&quot;</td><td>&quot;1.2.1&quot;</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;School closing&quot;</td><td>&quot;Vaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468098&quot;</td><td>&quot;1.2.2&quot;</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (5, 16)\n",
"┌────────────┬─────────────┬────────────┬────────────┬───┬────────┬────────┬────────────┬──────────┐\n",
"│ REF_DATE ┆ GEO ┆ DGUID ┆ Restrictio ┆ … ┆ STATUS ┆ SYMBOL ┆ TERMINATED ┆ DECIMALS │\n",
"│ --- ┆ --- ┆ --- ┆ n level ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ str ┆ --- ┆ ┆ i8 ┆ i8 ┆ i8 ┆ i8 │\n",
"│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n",
"╞════════════╪═════════════╪════════════╪════════════╪═══╪════════╪════════╪════════════╪══════════╡\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"└────────────┴─────────────┴────────────┴────────────┴───┴────────┴────────┴────────────┴──────────┘"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import polars as pl\n",
"\n",
"data_folder = \"/data/tables\"\n",
"input_folder = f\"{data_folder}/input\"\n",
"scratch_folder = f\"{data_folder}/scratch\"\n",
"output_folder = f\"{data_folder}/output\"\n",
"\n",
"def normalize_ref_date(series):\n",
" # Always cast to string first\n",
" series = series.cast(pl.Utf8)\n",
" \n",
" # Try parsing as full date (YYYY-MM-DD)\n",
" full = series.str.strptime(pl.Date, \"%Y-%m-%d\", strict=False)\n",
"\n",
" # For nulls, try parsing as year-month (YYYY-MM)\n",
" ym = series.str.strptime(pl.Date, \"%Y-%m\", strict=False).dt.replace(day=1)\n",
" full = full.fill_null(ym)\n",
"\n",
" # For remaining nulls, try just year (YYYY)\n",
" y = series.str.strptime(pl.Date, \"%Y\", strict=False).dt.replace(month=1, day=1)\n",
" full = full.fill_null(y)\n",
"\n",
" return full\n",
"\n",
"filepath = f\"{output_folder}/en/other/en/33100496.parquet\"\n",
"df = pl.read_parquet(filepath)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 108,
"id": "6e0bb9eb-0638-48c6-9308-bc5a846ff27f",
"metadata": {},
"outputs": [],
"source": [
"skip_calculating_ref_date = False\n",
"if df.schema[\"REF_DATE\"] == pl.String:\n",
" if df[\"REF_DATE\"].str.contains(\"/\").any():\n",
" # Skip the calculating of the field\n",
" skip_calculating_ref_date = True"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "5f93ba75-5c1c-4a7f-b526-89d7589f3834",
"metadata": {},
"outputs": [],
"source": [
"if skip_calculating_ref_date == False:\n",
" df = df.with_columns([\n",
" normalize_ref_date(pl.col(\"REF_DATE\")).alias(\"REF_DATE\")\n",
" ])"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "d54d297c-ae68-4a44-96c6-457873ea2d10",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (5, 16)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>REF_DATE</th><th>GEO</th><th>DGUID</th><th>Restriction level</th><th>Vaccination status</th><th>UOM</th><th>UOM_ID</th><th>SCALAR_FACTOR</th><th>SCALAR_ID</th><th>VECTOR</th><th>COORDINATE</th><th>VALUE</th><th>STATUS</th><th>SYMBOL</th><th>TERMINATED</th><th>DECIMALS</th></tr><tr><td>date</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>i16</td><td>str</td><td>i8</td><td>str</td><td>str</td><td>f64</td><td>i8</td><td>i8</td><td>i8</td><td>i8</td></tr></thead><tbody><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Total population&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468081&quot;</td><td>&quot;1.1.1&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Vaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468097&quot;</td><td>&quot;1.1.2&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Unvaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468113&quot;</td><td>&quot;1.1.3&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;School closing&quot;</td><td>&quot;Total population&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468082&quot;</td><td>&quot;1.2.1&quot;</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;School closing&quot;</td><td>&quot;Vaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468098&quot;</td><td>&quot;1.2.2&quot;</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (5, 16)\n",
"┌────────────┬─────────────┬────────────┬────────────┬───┬────────┬────────┬────────────┬──────────┐\n",
"│ REF_DATE ┆ GEO ┆ DGUID ┆ Restrictio ┆ … ┆ STATUS ┆ SYMBOL ┆ TERMINATED ┆ DECIMALS │\n",
"│ --- ┆ --- ┆ --- ┆ n level ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ date ┆ str ┆ str ┆ --- ┆ ┆ i8 ┆ i8 ┆ i8 ┆ i8 │\n",
"│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n",
"╞════════════╪═════════════╪════════════╪════════════╪═══╪════════╪════════╪════════════╪══════════╡\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
"│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"└────────────┴─────────────┴────────────┴────────────┴───┴────────┴────────┴────────────┴──────────┘"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c7a9dd4-5001-4174-9074-8cb14721f79c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -14,7 +14,7 @@
},
{
"cell_type": "code",
"execution_count": 188,
"execution_count": 209,
"id": "98859cd6-6fa4-4aef-a113-455699524fae",
"metadata": {},
"outputs": [],
@@ -39,8 +39,8 @@
"output_folder = f\"{data_folder}/output\"\n",
"\n",
"\n",
"if not os.path.exists(\"processing.db\"):\n",
" con = sqlite3.connect('processing.db')\n",
"if not os.path.exists(f\"{data_folder}/processing.db\"):\n",
" con = sqlite3.connect(f\"{data_folder}/processing.db\")\n",
" cur = con.cursor()\n",
" cur.executescript(\"\"\"\n",
" CREATE TABLE IF NOT EXISTS downloaded (\n",
@@ -55,13 +55,13 @@
" \"\"\")\n",
" con.commit()\n",
"else:\n",
" con = sqlite3.connect('processing.db')\n",
" con = sqlite3.connect(f\"{data_folder}/processing.db\")\n",
" cur = con.cursor()"
]
},
{
"cell_type": "code",
"execution_count": 96,
"execution_count": 210,
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
"metadata": {},
"outputs": [],
@@ -83,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 211,
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
"metadata": {},
"outputs": [],
@@ -93,12 +93,17 @@
},
{
"cell_type": "code",
"execution_count": 177,
"execution_count": 212,
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
"metadata": {},
"outputs": [],
"source": [
"def update_last_downloaded(product_id):\n",
" \"\"\"\n",
" Updates SQLite database with the last time the table was updated\n",
" The datetime is in Eastern timezone, so have to convert to UTC to\n",
" be consistent with https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n",
" \"\"\"\n",
" filepath = f\"{input_folder}/metadata/{product_id}.json\"\n",
" print(f\"Reading metadata {filepath}\")\n",
" with open(filepath, 'r') as fp:\n",
@@ -123,7 +128,20 @@
},
{
"cell_type": "code",
"execution_count": 170,
"execution_count": 213,
"id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
"metadata": {},
"outputs": [],
"source": [
"def update_last_processed(product_id):\n",
" time_finished_processing = datetime.now().isoformat()\n",
" cur.execute(\"UPDATE downloaded SET last_processed = ? WHERE product_id = ?\", (time_finished_processing, product_id))\n",
" con.commit()"
]
},
{
"cell_type": "code",
"execution_count": 214,
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
"metadata": {},
"outputs": [],
@@ -166,159 +184,23 @@
" for result in results:\n",
" product_id = result[0]\n",
" print(f\"Updating product_id: {product_id}\")\n",
" download_cube(product_id)\n",
" process_cube(product_id)"
]
},
{
"cell_type": "code",
"execution_count": 186,
"id": "c0d46f38-242c-4685-b8b6-4e046c23aec5",
"execution_count": 215,
"id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
"metadata": {},
"outputs": [],
"source": [
"cur.execute(\"\"\"\n",
" SELECT a.product_id, a.last_updated, b.last_updated\n",
" FROM downloaded AS a,\n",
" cubes AS b\n",
" WHERE a.product_id = b.product_id\n",
" AND b.last_updated > a.last_updated\n",
"\"\"\")\n",
"difference = pd.DataFrame(cur.fetchall(), columns=[\"product_id\", \"release_time_metadata\", \"release_time_cubelist\"])"
"#update_tables()"
]
},
{
"cell_type": "code",
"execution_count": 189,
"id": "a5e12a7a-9891-4d58-9368-e31b274feb1d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>product_id</th>\n",
" <th>release_time_metadata</th>\n",
" <th>release_time_cubelist</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10100006</td>\n",
" <td>2025-05-21T12:30:00+00:00</td>\n",
" <td>2025-06-17T12:30:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10100108</td>\n",
" <td>2025-05-13T12:30:00+00:00</td>\n",
" <td>2025-06-12T12:30:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>10100132</td>\n",
" <td>2025-06-06T12:30:00+00:00</td>\n",
" <td>2025-06-13T12:30:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>10100136</td>\n",
" <td>2025-06-10T12:30:00+00:00</td>\n",
" <td>2025-06-16T12:30:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10100138</td>\n",
" <td>2025-06-06T12:30:00+00:00</td>\n",
" <td>2025-06-13T12:30:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>38100235</td>\n",
" <td>2025-03-13T12:30:00+00:00</td>\n",
" <td>2025-06-12T12:30:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>38100236</td>\n",
" <td>2025-03-13T12:30:00+00:00</td>\n",
" <td>2025-06-12T12:30:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>38100237</td>\n",
" <td>2025-03-13T12:30:00+00:00</td>\n",
" <td>2025-06-12T12:30:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>38100238</td>\n",
" <td>2025-03-13T12:30:00+00:00</td>\n",
" <td>2025-06-12T12:30:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>38100164</td>\n",
" <td>2023-09-13T12:30:00+00:00</td>\n",
" <td>2025-06-17T12:30:00+00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>97 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" product_id release_time_metadata release_time_cubelist\n",
"0 10100006 2025-05-21T12:30:00+00:00 2025-06-17T12:30:00+00:00\n",
"1 10100108 2025-05-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
"2 10100132 2025-06-06T12:30:00+00:00 2025-06-13T12:30:00+00:00\n",
"3 10100136 2025-06-10T12:30:00+00:00 2025-06-16T12:30:00+00:00\n",
"4 10100138 2025-06-06T12:30:00+00:00 2025-06-13T12:30:00+00:00\n",
".. ... ... ...\n",
"92 38100235 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
"93 38100236 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
"94 38100237 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
"95 38100238 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
"96 38100164 2023-09-13T12:30:00+00:00 2025-06-17T12:30:00+00:00\n",
"\n",
"[97 rows x 3 columns]"
]
},
"execution_count": 189,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"difference"
]
},
{
"cell_type": "code",
"execution_count": 193,
"execution_count": 228,
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
"metadata": {},
"outputs": [],
@@ -368,6 +250,9 @@
" return result\n",
"\n",
"def download_cube(product_id, language=\"en\"):\n",
" \"\"\"\n",
" Downloads the English CSV for a specific table\n",
" \"\"\"\n",
" download_url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/en\"\n",
" response = requests.get(download_url).json()\n",
" zip_url = response['object']\n",
@@ -388,6 +273,11 @@
" progress_bar.close()\n",
" \n",
"def process_cube(product_id, language=\"en\"):\n",
" cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
" result = cur.fetchone()\n",
" if result:\n",
" print(f\"Already processed {product_id}\")\n",
" return\n",
" extract_zipfile(product_id, language)\n",
" \"\"\"\n",
" The pandas column reader is better than the Polars one\n",
@@ -395,12 +285,11 @@
" https://www150.statcan.gc.ca/n1/tbl/csv/98100404-eng.zip\n",
" \"\"\"\n",
" # Get metadata\n",
" metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n",
" metadata = get_cube_metadata(product_id)\n",
" print(f\"Writing metadata file {metadata_file}\")\n",
" with open(metadata_file, \"w\") as outfile:\n",
" json.dump(metadata, outfile)\n",
" update_last_downloaded(product_id)\n",
" #metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n",
" #metadata = get_cube_metadata(product_id)\n",
" #print(f\"Writing metadata file {metadata_file}\")\n",
" #with open(metadata_file, \"w\") as outfile:\n",
" # json.dump(metadata, outfile)\n",
" # Read CSV using Pandas\n",
" product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
" parameters = {\n",
@@ -410,22 +299,32 @@
" print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
" df = pd.read_csv(product_csv, **parameters)\n",
" df = convert_to_lowest_type(df)\n",
" # Fix mixed types. This happened in productId 11100147\n",
" for col in df.select_dtypes(include=[\"object\"]).columns:\n",
" types_in_col = df[col].map(type).unique()\n",
" # If there's any non-str type in the column\n",
" if any(t != str for t in types_in_col):\n",
" print(f\"Fixing mixed types in column: {col} — types found: {types_in_col}\")\n",
" # Replace non-str values with None\n",
" df[col] = df[col].where(df[col].map(type) == str, None).astype(\"string\")\n",
" print(\"Import Pandas dataframe as a Polars dataframe\")\n",
" df = pl.from_pandas(df)\n",
" output_parquet = f\"{output_folder}/{language}/{product_id}.parquet\"\n",
" print(f\"Exporting dataframe as parquet to {output_parquet}\")\n",
" df.write_parquet(f\"{output_folder}/{language}/{product_id}.parquet\",\n",
" df.write_parquet(output_parquet,\n",
" compression='zstd',\n",
" compression_level=22)\n",
" # Remove the scratch files\n",
" print(\"Removing scratch files\")\n",
" os.remove(f\"{scratch_folder}/{product_id}.csv\")\n",
" os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\") \n"
" os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\")\n",
" update_last_downloaded(product_id)\n",
" update_last_processed(product_id)"
]
},
{
"cell_type": "code",
"execution_count": 164,
"execution_count": 229,
"id": "6be0842e-c1e0-45b4-90e5-0be28b963aed",
"metadata": {},
"outputs": [
@@ -433,54 +332,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting /data/tables/input/en/10100111.zip to /data/tables/scratch\n",
"Writing metadata file /data/tables/input/metadata/10100111.json\n",
"Reading metadata /data/tables/input/metadata/10100111.json\n",
"('10100111', '2025-06-18T12:30:00+00:00')\n",
"Reading /data/tables/scratch/10100111.csv as a Pandas dataframe\n",
"Converting dataframe to optimal data types\n",
"Import Pandas dataframe as a Polars dataframe\n",
"Exporting dataframe as parquet to /data/tables/output/en/10100111.parquet\n",
"Removing scratch files\n"
"Already processed 11100147\n"
]
}
],
"source": [
"process_cube(\"10100111\")"
"process_cube(\"11100147\")"
]
},
{
"cell_type": "code",
"execution_count": 194,
"id": "8a193195-bfa6-4df3-adab-0f88025276da",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100164-eng.zip to /data/tables/input/en/38100164.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/38100164.zip: 100%|██████████████████| 980k/980k [00:00<00:00, 1.27MB/s]\n"
]
}
],
"source": [
"download_cube(\"38100164\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc4ae317-3e3e-4c27-a2fe-7e6bd12f673f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {