Continue work on processing data tables

2026-06-13 14:10:55 +02:00 · 2025-06-19 15:58:30 -04:00
parent ab8f40c708
commit f6d88c5fd0
5 changed files with 589 additions and 201 deletions
@@ -0,0 +1,108 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e4d2c52d-38be-4f84-a0bf-6bd8cb577ad9",
+   "metadata": {},
+   "source": [
+    "# Purpose\n",
+    "I need to find out what all possible date formats are for the \"REF_DATE\" field so that when I write the parquet file people will be able to filter on it\n",
+    "\n",
+    "These are all the dates I have encountered:\n",
+    "- Just the year. Example found on productId 36100608: 2024\n",
+    "- Year and month. Example found on productId 14100443: 2024-07\n",
+    "- Year, month, date. Example found on productId 33100036: 2025-06-17\n",
+    "- Range. Example found on productId 17100022: 2013/2014"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "43916131-250b-4fdf-9c75-39024822529c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "from multiprocessing import Pool\n",
+    "import sqlite3\n",
+    "import polars as pl\n",
+    "\n",
+    "data_folder = \"/data/tables\"\n",
+    "input_folder = f\"{data_folder}/input\"\n",
+    "scratch_folder = f\"{data_folder}/scratch\"\n",
+    "output_folder = f\"{data_folder}/output\"\n",
+    "\n",
+    "con = sqlite3.connect(f\"{data_folder}/dates.db\")\n",
+    "cur = con.cursor()\n",
+    "cur.executescript(\"\"\"\n",
+    "    DROP TABLE IF EXISTS dates;\n",
+    "    CREATE TABLE IF NOT EXISTS dates (\n",
+    "      product_id TEXT,\n",
+    "      date TEXT\n",
+    "    );\n",
+    "\"\"\")\n",
+    "con.commit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "553d58ff-81a4-45e6-a96e-6ea9ebb166fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_unique_dates(filepath):\n",
+    "    \"\"\"\n",
+    "    Finds unique dates for a table and writes to SQLite table\n",
+    "    \"\"\"\n",
+    "    product_id = filepath.split(\"/\")[-1].split(\".parquet\")[0]\n",
+    "    print(f\"Processing {product_id}\")\n",
+    "    try:\n",
+    "        result = (\n",
+    "        pl.scan_parquet(filepath)\n",
+    "            .select(\"REF_DATE\")\n",
+    "            .collect()\n",
+    "        )\n",
+    "    except Exception:\n",
+    "        return\n",
+    "    unique_dates = [(product_id, x) for x in result['REF_DATE'].unique(maintain_order=True).to_list()]\n",
+    "    cur.executemany(\"INSERT INTO dates VALUES(?, ?)\", unique_dates)\n",
+    "    con.commit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "507c069d-472f-44a9-872b-e690225fad17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if __name__ == '__main__':\n",
+    "    files_to_process = glob.glob(f\"{output_folder}/en/*.parquet\")\n",
+    "    with Pool() as p:\n",
+    "        p.map(find_unique_dates, files_to_process)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,47 @@
+"""
+I need to find out what all possible date formats are for the "REF_DATE" field so that when I write the parquet file people will be able to filter on it
+"""
+
+import glob
+from multiprocessing import Pool
+import sqlite3
+import polars as pl
+
+data_folder = "/data/tables"
+input_folder = f"{data_folder}/input"
+scratch_folder = f"{data_folder}/scratch"
+output_folder = f"{data_folder}/output"
+
+con = sqlite3.connect(f"{data_folder}/dates.db")
+cur = con.cursor()
+cur.executescript("""
+    DROP TABLE IF EXISTS dates;
+    CREATE TABLE IF NOT EXISTS dates (
+      product_id TEXT,
+      date TEXT
+    );
+""")
+con.commit()
+
+def find_unique_dates(filepath):
+    """
+    Finds unique dates for a table and writes to SQLite table
+    """
+    product_id = filepath.split("/")[-1].split(".parquet")[0]
+    print(f"Processing {product_id}")
+    try:
+        result = (
+        pl.scan_parquet(filepath)
+            .select("REF_DATE")
+            .collect()
+        )
+    except Exception:
+        return
+    unique_dates = [(product_id, x) for x in result['REF_DATE'].unique(maintain_order=True).to_list()]
+    cur.executemany("INSERT INTO dates VALUES(?, ?)", unique_dates)
+    con.commit()
+
+if __name__ == '__main__':
+    files_to_process = glob.glob(f"{output_folder}/en/other/en/*.parquet")
+    with Pool() as p:
+        p.map(find_unique_dates, files_to_process)
@@ -0,0 +1,168 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "5e04e469-d3eb-42ca-b548-5e6f1fa6af9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import buckaroo\n",
+    "import duckdb\n",
+    "import polars as pl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "97c8e92b-21e4-4cc5-8dbe-7b42361ce3f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "con = duckdb.connect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "e02f2416-fd16-444b-8fd4-eec2cecee5a7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<duckdb.duckdb.DuckDBPyConnection at 0x7f3bbc156230>"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "con.execute(\"\"\"\n",
+    "DESCRIBE '/data/tables/output/en/testing/36100670.parquet';\n",
+    "\"\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "fafa7ce7-8619-4951-8c73-7bfbc66dc92f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('REF_DATE', 'DATE', 'YES', None, None, None),\n",
+       " ('GEO', 'VARCHAR', 'YES', None, None, None),\n",
+       " ('DGUID', 'VARCHAR', 'YES', None, None, None),\n",
+       " ('Seasonality', 'VARCHAR', 'YES', None, None, None),\n",
+       " ('Selected credit estimates', 'VARCHAR', 'YES', None, None, None),\n",
+       " ('UOM', 'VARCHAR', 'YES', None, None, None),\n",
+       " ('UOM_ID', 'TINYINT', 'YES', None, None, None),\n",
+       " ('SCALAR_FACTOR', 'VARCHAR', 'YES', None, None, None),\n",
+       " ('SCALAR_ID', 'TINYINT', 'YES', None, None, None),\n",
+       " ('VECTOR', 'VARCHAR', 'YES', None, None, None),\n",
+       " ('COORDINATE', 'VARCHAR', 'YES', None, None, None),\n",
+       " ('VALUE', 'INTEGER', 'YES', None, None, None),\n",
+       " ('STATUS', 'VARCHAR', 'YES', None, None, None),\n",
+       " ('SYMBOL', 'TINYINT', 'YES', None, None, None),\n",
+       " ('TERMINATED', 'TINYINT', 'YES', None, None, None),\n",
+       " ('DECIMALS', 'TINYINT', 'YES', None, None, None)]"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "con.fetchall()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "a4ed2881-91b7-4473-b246-a969ef59efba",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d937312a19f44fc2a8f87bcae8d0faca",
+       "version_major": 2,
+       "version_minor": 1
+      },
+      "text/plain": [
+       "PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "con.execute(\"SELECT DISTINCT REF_DATE FROM '/data/tables/output/en/testing/36100670.parquet' ORDER BY REF_DATE\").pl()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "f400feee-efb6-421a-b518-1f9c0fc21bcb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c2edc671f3014e4aae90a2a5b0577bfa",
+       "version_major": 2,
+       "version_minor": 1
+      },
+      "text/plain": [
+       "PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "con.execute(\"\"\"\n",
+    "SELECT REF_DATE, DGUID, Seasonality, 'Selected credit estimates', VALUE\n",
+    "FROM '/data/tables/output/en/testing/36100670.parquet'\n",
+    "WHERE REF_DATE > DATE '2018-01-01'\n",
+    "\"\"\").pl()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2e00108-bc8f-4b8b-91e0-f7bf9b44c59d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,207 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d287c593-ab51-4042-b7a6-634916f7075f",
+   "metadata": {},
+   "source": [
+    "# Purpose\n",
+    "I need to find out what all possible date formats are for the \"REF_DATE\" field so that when I write the parquet file people will be able to filter on it\n",
+    "\n",
+    "These are all the dates I have encountered:\n",
+    "- Just the year. Example found on productId 36100608: 2024\n",
+    "- Year and month. Example found on productId 14100443: 2024-07\n",
+    "- Year, month, date. Example found on productId 33100036: 2025-06-17\n",
+    "- Range. Example found on productId 17100022: 2013/2014"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "id": "57b802e1-ce9c-4e2e-abf5-aa29ba7e77f4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div><style>\n",
+       ".dataframe > thead > tr,\n",
+       ".dataframe > tbody > tr {\n",
+       "  text-align: right;\n",
+       "  white-space: pre-wrap;\n",
+       "}\n",
+       "</style>\n",
+       "<small>shape: (5, 16)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>REF_DATE</th><th>GEO</th><th>DGUID</th><th>Restriction level</th><th>Vaccination status</th><th>UOM</th><th>UOM_ID</th><th>SCALAR_FACTOR</th><th>SCALAR_ID</th><th>VECTOR</th><th>COORDINATE</th><th>VALUE</th><th>STATUS</th><th>SYMBOL</th><th>TERMINATED</th><th>DECIMALS</th></tr><tr><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>i16</td><td>str</td><td>i8</td><td>str</td><td>str</td><td>f64</td><td>i8</td><td>i8</td><td>i8</td><td>i8</td></tr></thead><tbody><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Total population&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468081&quot;</td><td>&quot;1.1.1&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Vaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468097&quot;</td><td>&quot;1.1.2&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Unvaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468113&quot;</td><td>&quot;1.1.3&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;School closing&quot;</td><td>&quot;Total population&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468082&quot;</td><td>&quot;1.2.1&quot;</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>&quot;2020-01-01&quot;</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;School closing&quot;</td><td>&quot;Vaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468098&quot;</td><td>&quot;1.2.2&quot;</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr></tbody></table></div>"
+      ],
+      "text/plain": [
+       "shape: (5, 16)\n",
+       "┌────────────┬─────────────┬────────────┬────────────┬───┬────────┬────────┬────────────┬──────────┐\n",
+       "│ REF_DATE   ┆ GEO         ┆ DGUID      ┆ Restrictio ┆ … ┆ STATUS ┆ SYMBOL ┆ TERMINATED ┆ DECIMALS │\n",
+       "│ ---        ┆ ---         ┆ ---        ┆ n level    ┆   ┆ ---    ┆ ---    ┆ ---        ┆ ---      │\n",
+       "│ str        ┆ str         ┆ str        ┆ ---        ┆   ┆ i8     ┆ i8     ┆ i8         ┆ i8       │\n",
+       "│            ┆             ┆            ┆ str        ┆   ┆        ┆        ┆            ┆          │\n",
+       "╞════════════╪═════════════╪════════════╪════════════╪═══╪════════╪════════╪════════════╪══════════╡\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ n index    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ n index    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ n index    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School     ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ closing    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School     ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ closing    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "└────────────┴─────────────┴────────────┴────────────┴───┴────────┴────────┴────────────┴──────────┘"
+      ]
+     },
+     "execution_count": 107,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import polars as pl\n",
+    "\n",
+    "data_folder = \"/data/tables\"\n",
+    "input_folder = f\"{data_folder}/input\"\n",
+    "scratch_folder = f\"{data_folder}/scratch\"\n",
+    "output_folder = f\"{data_folder}/output\"\n",
+    "\n",
+    "def normalize_ref_date(series):\n",
+    "    # Always cast to string first\n",
+    "    series = series.cast(pl.Utf8)\n",
+    "   \n",
+    "    # Try parsing as full date (YYYY-MM-DD)\n",
+    "    full = series.str.strptime(pl.Date, \"%Y-%m-%d\", strict=False)\n",
+    "\n",
+    "    # For nulls, try parsing as year-month (YYYY-MM)\n",
+    "    ym = series.str.strptime(pl.Date, \"%Y-%m\", strict=False).dt.replace(day=1)\n",
+    "    full = full.fill_null(ym)\n",
+    "\n",
+    "    # For remaining nulls, try just year (YYYY)\n",
+    "    y = series.str.strptime(pl.Date, \"%Y\", strict=False).dt.replace(month=1, day=1)\n",
+    "    full = full.fill_null(y)\n",
+    "\n",
+    "    return full\n",
+    "\n",
+    "filepath = f\"{output_folder}/en/other/en/33100496.parquet\"\n",
+    "df = pl.read_parquet(filepath)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "id": "6e0bb9eb-0638-48c6-9308-bc5a846ff27f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "skip_calculating_ref_date = False\n",
+    "if df.schema[\"REF_DATE\"] == pl.String:\n",
+    "    if df[\"REF_DATE\"].str.contains(\"/\").any():\n",
+    "        # Skip the calculating of the field\n",
+    "        skip_calculating_ref_date = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "id": "5f93ba75-5c1c-4a7f-b526-89d7589f3834",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if skip_calculating_ref_date == False:\n",
+    "    df = df.with_columns([\n",
+    "        normalize_ref_date(pl.col(\"REF_DATE\")).alias(\"REF_DATE\")\n",
+    "    ])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "id": "d54d297c-ae68-4a44-96c6-457873ea2d10",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div><style>\n",
+       ".dataframe > thead > tr,\n",
+       ".dataframe > tbody > tr {\n",
+       "  text-align: right;\n",
+       "  white-space: pre-wrap;\n",
+       "}\n",
+       "</style>\n",
+       "<small>shape: (5, 16)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>REF_DATE</th><th>GEO</th><th>DGUID</th><th>Restriction level</th><th>Vaccination status</th><th>UOM</th><th>UOM_ID</th><th>SCALAR_FACTOR</th><th>SCALAR_ID</th><th>VECTOR</th><th>COORDINATE</th><th>VALUE</th><th>STATUS</th><th>SYMBOL</th><th>TERMINATED</th><th>DECIMALS</th></tr><tr><td>date</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>i16</td><td>str</td><td>i8</td><td>str</td><td>str</td><td>f64</td><td>i8</td><td>i8</td><td>i8</td><td>i8</td></tr></thead><tbody><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Total population&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468081&quot;</td><td>&quot;1.1.1&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Vaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468097&quot;</td><td>&quot;1.1.2&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;Restriction index&quot;</td><td>&quot;Unvaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468113&quot;</td><td>&quot;1.1.3&quot;</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;School closing&quot;</td><td>&quot;Total population&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468082&quot;</td><td>&quot;1.2.1&quot;</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>&quot;Newfoundland and Labrador&quot;</td><td>&quot;2016A000210&quot;</td><td>&quot;School closing&quot;</td><td>&quot;Vaccinated persons&quot;</td><td>&quot;Index&quot;</td><td>160</td><td>&quot;units&quot;</td><td>0</td><td>&quot;v1331468098&quot;</td><td>&quot;1.2.2&quot;</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr></tbody></table></div>"
+      ],
+      "text/plain": [
+       "shape: (5, 16)\n",
+       "┌────────────┬─────────────┬────────────┬────────────┬───┬────────┬────────┬────────────┬──────────┐\n",
+       "│ REF_DATE   ┆ GEO         ┆ DGUID      ┆ Restrictio ┆ … ┆ STATUS ┆ SYMBOL ┆ TERMINATED ┆ DECIMALS │\n",
+       "│ ---        ┆ ---         ┆ ---        ┆ n level    ┆   ┆ ---    ┆ ---    ┆ ---        ┆ ---      │\n",
+       "│ date       ┆ str         ┆ str        ┆ ---        ┆   ┆ i8     ┆ i8     ┆ i8         ┆ i8       │\n",
+       "│            ┆             ┆            ┆ str        ┆   ┆        ┆        ┆            ┆          │\n",
+       "╞════════════╪═════════════╪════════════╪════════════╪═══╪════════╪════════╪════════════╪══════════╡\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ n index    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ n index    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ n index    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School     ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ closing    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School     ┆ … ┆ null   ┆ null   ┆ null       ┆ 2        │\n",
+       "│            ┆ d and       ┆ 0          ┆ closing    ┆   ┆        ┆        ┆            ┆          │\n",
+       "│            ┆ Labrador    ┆            ┆            ┆   ┆        ┆        ┆            ┆          │\n",
+       "└────────────┴─────────────┴────────────┴────────────┴───┴────────┴────────┴────────────┴──────────┘"
+      ]
+     },
+     "execution_count": 110,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c7a9dd4-5001-4174-9074-8cb14721f79c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -14,7 +14,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 188,
+   "execution_count": 209,
   "id": "98859cd6-6fa4-4aef-a113-455699524fae",
   "metadata": {},
   "outputs": [],
@@ -39,8 +39,8 @@
    "output_folder = f\"{data_folder}/output\"\n",
    "\n",
    "\n",
-    "if not os.path.exists(\"processing.db\"):\n",
-    "    con = sqlite3.connect('processing.db')\n",
+    "if not os.path.exists(f\"{data_folder}/processing.db\"):\n",
+    "    con = sqlite3.connect(f\"{data_folder}/processing.db\")\n",
    "    cur = con.cursor()\n",
    "    cur.executescript(\"\"\"\n",
    "        CREATE TABLE IF NOT EXISTS downloaded (\n",
@@ -55,13 +55,13 @@
    "    \"\"\")\n",
    "    con.commit()\n",
    "else:\n",
-    "    con = sqlite3.connect('processing.db')\n",
+    "    con = sqlite3.connect(f\"{data_folder}/processing.db\")\n",
    "    cur = con.cursor()"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 96,
+   "execution_count": 210,
   "id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
   "metadata": {},
   "outputs": [],
@@ -83,7 +83,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 97,
+   "execution_count": 211,
   "id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
   "metadata": {},
   "outputs": [],
@@ -93,12 +93,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 177,
+   "execution_count": 212,
   "id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def update_last_downloaded(product_id):\n",
+    "    \"\"\"\n",
+    "    Updates SQLite database with the last time the table was updated\n",
+    "    The datetime is in Eastern timezone, so have to convert to UTC to\n",
+    "    be consistent with https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n",
+    "    \"\"\"\n",
    "    filepath = f\"{input_folder}/metadata/{product_id}.json\"\n",
    "    print(f\"Reading metadata {filepath}\")\n",
    "    with open(filepath, 'r') as fp:\n",
@@ -123,7 +128,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 170,
+   "execution_count": 213,
+   "id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def update_last_processed(product_id):\n",
+    "    time_finished_processing = datetime.now().isoformat()\n",
+    "    cur.execute(\"UPDATE downloaded SET last_processed = ? WHERE product_id = ?\", (time_finished_processing, product_id))\n",
+    "    con.commit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 214,
   "id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
   "metadata": {},
   "outputs": [],
@@ -166,159 +184,23 @@
    "    for result in results:\n",
    "        product_id = result[0]\n",
    "        print(f\"Updating product_id: {product_id}\")\n",
+    "        download_cube(product_id)\n",
    "        process_cube(product_id)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 186,
-   "id": "c0d46f38-242c-4685-b8b6-4e046c23aec5",
+   "execution_count": 215,
+   "id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
   "metadata": {},
   "outputs": [],
   "source": [
-    "cur.execute(\"\"\"\n",
-    "    SELECT a.product_id, a.last_updated, b.last_updated\n",
-    "    FROM downloaded AS a,\n",
-    "         cubes AS b\n",
-    "    WHERE a.product_id = b.product_id\n",
-    "    AND b.last_updated > a.last_updated\n",
-    "\"\"\")\n",
-    "difference = pd.DataFrame(cur.fetchall(), columns=[\"product_id\", \"release_time_metadata\", \"release_time_cubelist\"])"
+    "#update_tables()"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 189,
-   "id": "a5e12a7a-9891-4d58-9368-e31b274feb1d",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>product_id</th>\n",
-       "      <th>release_time_metadata</th>\n",
-       "      <th>release_time_cubelist</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>10100006</td>\n",
-       "      <td>2025-05-21T12:30:00+00:00</td>\n",
-       "      <td>2025-06-17T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>10100108</td>\n",
-       "      <td>2025-05-13T12:30:00+00:00</td>\n",
-       "      <td>2025-06-12T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>10100132</td>\n",
-       "      <td>2025-06-06T12:30:00+00:00</td>\n",
-       "      <td>2025-06-13T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>10100136</td>\n",
-       "      <td>2025-06-10T12:30:00+00:00</td>\n",
-       "      <td>2025-06-16T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>10100138</td>\n",
-       "      <td>2025-06-06T12:30:00+00:00</td>\n",
-       "      <td>2025-06-13T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>92</th>\n",
-       "      <td>38100235</td>\n",
-       "      <td>2025-03-13T12:30:00+00:00</td>\n",
-       "      <td>2025-06-12T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>93</th>\n",
-       "      <td>38100236</td>\n",
-       "      <td>2025-03-13T12:30:00+00:00</td>\n",
-       "      <td>2025-06-12T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>94</th>\n",
-       "      <td>38100237</td>\n",
-       "      <td>2025-03-13T12:30:00+00:00</td>\n",
-       "      <td>2025-06-12T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>95</th>\n",
-       "      <td>38100238</td>\n",
-       "      <td>2025-03-13T12:30:00+00:00</td>\n",
-       "      <td>2025-06-12T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>96</th>\n",
-       "      <td>38100164</td>\n",
-       "      <td>2023-09-13T12:30:00+00:00</td>\n",
-       "      <td>2025-06-17T12:30:00+00:00</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>97 rows × 3 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   product_id      release_time_metadata      release_time_cubelist\n",
-       "0    10100006  2025-05-21T12:30:00+00:00  2025-06-17T12:30:00+00:00\n",
-       "1    10100108  2025-05-13T12:30:00+00:00  2025-06-12T12:30:00+00:00\n",
-       "2    10100132  2025-06-06T12:30:00+00:00  2025-06-13T12:30:00+00:00\n",
-       "3    10100136  2025-06-10T12:30:00+00:00  2025-06-16T12:30:00+00:00\n",
-       "4    10100138  2025-06-06T12:30:00+00:00  2025-06-13T12:30:00+00:00\n",
-       "..        ...                        ...                        ...\n",
-       "92   38100235  2025-03-13T12:30:00+00:00  2025-06-12T12:30:00+00:00\n",
-       "93   38100236  2025-03-13T12:30:00+00:00  2025-06-12T12:30:00+00:00\n",
-       "94   38100237  2025-03-13T12:30:00+00:00  2025-06-12T12:30:00+00:00\n",
-       "95   38100238  2025-03-13T12:30:00+00:00  2025-06-12T12:30:00+00:00\n",
-       "96   38100164  2023-09-13T12:30:00+00:00  2025-06-17T12:30:00+00:00\n",
-       "\n",
-       "[97 rows x 3 columns]"
-      ]
-     },
-     "execution_count": 189,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "difference"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 193,
+   "execution_count": 228,
   "id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
   "metadata": {},
   "outputs": [],
@@ -368,6 +250,9 @@
    "    return result\n",
    "\n",
    "def download_cube(product_id, language=\"en\"):\n",
+    "    \"\"\"\n",
+    "    Downloads the English CSV for a specific table\n",
+    "    \"\"\"\n",
    "    download_url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/en\"\n",
    "    response = requests.get(download_url).json()\n",
    "    zip_url = response['object']\n",
@@ -388,6 +273,11 @@
    "        progress_bar.close()\n",
    "        \n",
    "def process_cube(product_id, language=\"en\"):\n",
+    "    cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
+    "    result = cur.fetchone()\n",
+    "    if result:\n",
+    "        print(f\"Already processed {product_id}\")\n",
+    "        return\n",
    "    extract_zipfile(product_id, language)\n",
    "    \"\"\"\n",
    "    The pandas column reader is better than the Polars one\n",
@@ -395,12 +285,11 @@
    "    https://www150.statcan.gc.ca/n1/tbl/csv/98100404-eng.zip\n",
    "    \"\"\"\n",
    "    # Get metadata\n",
-    "    metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n",
-    "    metadata = get_cube_metadata(product_id)\n",
-    "    print(f\"Writing metadata file {metadata_file}\")\n",
-    "    with open(metadata_file, \"w\") as outfile:\n",
-    "        json.dump(metadata, outfile)\n",
-    "    update_last_downloaded(product_id)\n",
+    "    #metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n",
+    "    #metadata = get_cube_metadata(product_id)\n",
+    "    #print(f\"Writing metadata file {metadata_file}\")\n",
+    "    #with open(metadata_file, \"w\") as outfile:\n",
+    "    #    json.dump(metadata, outfile)\n",
    "    # Read CSV using Pandas\n",
    "    product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
    "    parameters = {\n",
@@ -410,22 +299,32 @@
    "    print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
    "    df = pd.read_csv(product_csv, **parameters)\n",
    "    df = convert_to_lowest_type(df)\n",
+    "    # Fix mixed types. This happened in productId 11100147\n",
+    "    for col in df.select_dtypes(include=[\"object\"]).columns:\n",
+    "        types_in_col = df[col].map(type).unique()\n",
+    "        # If there's any non-str type in the column\n",
+    "        if any(t != str for t in types_in_col):\n",
+    "            print(f\"Fixing mixed types in column: {col} — types found: {types_in_col}\")\n",
+    "            # Replace non-str values with None\n",
+    "            df[col] = df[col].where(df[col].map(type) == str, None).astype(\"string\")\n",
    "    print(\"Import Pandas dataframe as a Polars dataframe\")\n",
    "    df = pl.from_pandas(df)\n",
    "    output_parquet = f\"{output_folder}/{language}/{product_id}.parquet\"\n",
    "    print(f\"Exporting dataframe as parquet to {output_parquet}\")\n",
-    "    df.write_parquet(f\"{output_folder}/{language}/{product_id}.parquet\",\n",
+    "    df.write_parquet(output_parquet,\n",
    "                     compression='zstd',\n",
    "                     compression_level=22)\n",
    "    # Remove the scratch files\n",
    "    print(\"Removing scratch files\")\n",
    "    os.remove(f\"{scratch_folder}/{product_id}.csv\")\n",
-    "    os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\")   \n"
+    "    os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\")\n",
+    "    update_last_downloaded(product_id)\n",
+    "    update_last_processed(product_id)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 164,
+   "execution_count": 229,
   "id": "6be0842e-c1e0-45b4-90e5-0be28b963aed",
   "metadata": {},
   "outputs": [
@@ -433,54 +332,13 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Extracting /data/tables/input/en/10100111.zip to /data/tables/scratch\n",
-      "Writing metadata file /data/tables/input/metadata/10100111.json\n",
-      "Reading metadata /data/tables/input/metadata/10100111.json\n",
-      "('10100111', '2025-06-18T12:30:00+00:00')\n",
-      "Reading /data/tables/scratch/10100111.csv as a Pandas dataframe\n",
-      "Converting dataframe to optimal data types\n",
-      "Import Pandas dataframe as a Polars dataframe\n",
-      "Exporting dataframe as parquet to /data/tables/output/en/10100111.parquet\n",
-      "Removing scratch files\n"
+      "Already processed 11100147\n"
     ]
    }
   ],
   "source": [
-    "process_cube(\"10100111\")"
+    "process_cube(\"11100147\")"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 194,
-   "id": "8a193195-bfa6-4df3-adab-0f88025276da",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100164-eng.zip to /data/tables/input/en/38100164.zip\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/data/tables/input/en/38100164.zip: 100%|██████████████████| 980k/980k [00:00<00:00, 1.27MB/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "download_cube(\"38100164\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fc4ae317-3e3e-4c27-a2fe-7e6bd12f673f",
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {