diff --git a/experiments/statcan_products/process_product_test.ipynb b/experiments/statcan_products/process_product_test.ipynb new file mode 100644 index 0000000..089b9e6 --- /dev/null +++ b/experiments/statcan_products/process_product_test.ipynb @@ -0,0 +1,507 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8b767621-b96b-4eaa-a16a-0329bab29c0f", + "metadata": {}, + "source": [ + "# Notes\n", + "- Make sure Statistics Canada knows about these issues:\n", + " - When downloading the XML for product_id 98100404, only the structure document is returned, not the data document\n", + " - The releaseTime value is in Eastern Time (America/Toronto) for https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata and in UTC for https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n", + " - The releaseTime value returned by https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite differs from the one returned by https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata , for example for productId 10100007" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "id": "98859cd6-6fa4-4aef-a113-455699524fae", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "import glob\n", + "from multiprocessing import Pool\n", + "import json\n", + "import os\n", + "import sqlite3\n", + "import zipfile\n", + "from zoneinfo import ZoneInfo\n", + "\n", + "import pandas as pd\n", + "import polars as pl\n", + "import requests\n", + "from tqdm import tqdm\n", + "\n", + "data_folder = \"/data/tables\"\n", + "input_folder = f\"{data_folder}/input\"\n", + "scratch_folder = f\"{data_folder}/scratch\"\n", + "output_folder = f\"{data_folder}/output\"\n", + "\n", + "\n", + "if not os.path.exists(\"processing.db\"):\n", + " con = sqlite3.connect('processing.db')\n", + " cur = con.cursor()\n", + " cur.executescript(\"\"\"\n", + " CREATE TABLE IF NOT EXISTS downloaded (\n", + " product_id TEXT PRIMARY KEY,\n", + " last_updated TEXT\n", + " );\n", + "\n", + " CREATE 
TABLE IF NOT EXISTS cubes (\n", + " product_id TEXT PRIMARY KEY,\n", + " last_updated TEXT\n", + " );\n", + " \"\"\")\n", + " con.commit()\n", + "else:\n", + " con = sqlite3.connect('processing.db')\n", + " cur = con.cursor()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591", + "metadata": {}, + "outputs": [], + "source": [ + "def setup():\n", + " \"\"\"\n", + " Makes data folders\n", + " \"\"\"\n", + " folders_to_create = [data_folder, input_folder, \n", + " scratch_folder, output_folder,\n", + " f\"{input_folder}/en\", f\"{output_folder}/en\",\n", + " f\"{input_folder}/fr\", f\"{output_folder}/fr\",\n", + " f\"{input_folder}/metadata\"]\n", + " for folder in folders_to_create:\n", + " if not os.path.exists(folder):\n", + " print(f\"Making folder {folder}\")\n", + " os.mkdir(folder)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f", + "metadata": {}, + "outputs": [], + "source": [ + "setup()" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "id": "0af9a4b3-7b59-460b-b933-504919d4bd2a", + "metadata": {}, + "outputs": [], + "source": [ + "def update_last_downloaded(product_id):\n", + " filepath = f\"{input_folder}/metadata/{product_id}.json\"\n", + " print(f\"Reading metadata {filepath}\")\n", + " with open(filepath, 'r') as fp:\n", + " metadata = json.load(fp)\n", + " product_id = metadata.get(\"object\").get(\"productId\")\n", + " last_updated = metadata.get(\"object\").get(\"releaseTime\")\n", + " # Convert last_updated from Eastern Time to UTC since /getAllCubesListLite uses UTC\n", + " last_updated = datetime.strptime(last_updated, \"%Y-%m-%dT%H:%M\")\n", + " last_updated = last_updated.replace(tzinfo=ZoneInfo(\"America/Toronto\"))\n", + " last_updated = last_updated.astimezone(ZoneInfo(\"UTC\")).isoformat()\n", + " \n", + " data = (product_id, last_updated)\n", + " cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", 
(product_id,))\n", + " result = cur.fetchone()\n", + " if not result:\n", + " cur.execute(\"INSERT INTO downloaded VALUES (?, ?)\", data)\n", + " else:\n", + " cur.execute(\"UPDATE downloaded SET last_updated = ? WHERE product_id = ?\", (last_updated, product_id))\n", + "\n", + " con.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b", + "metadata": {}, + "outputs": [], + "source": [ + "def update_tables():\n", + " \"\"\"\n", + " This currently does not work as expected because Statistics Canada has discrepancies.\n", + " The \"releaseTime\" listed in https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n", + " for every productId is not the same as \"releaseTime\" listed when making a POST to\n", + " https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata , for example:\n", + " [{\"productId\":10100007}]\n", + " \"\"\"\n", + " cur.execute(\"\"\"\n", + " DELETE FROM cubes;\n", + " \"\"\")\n", + " con.commit()\n", + " response = requests.get(\"https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\").json()\n", + " cubes_metadata = pl.from_dicts(response)[['productId', 'releaseTime']]\n", + " cubes_metadata = cubes_metadata.rename({\"productId\": \"product_id\", \"releaseTime\": \"last_updated\"})\n", + " cubes_metadata = cubes_metadata.rows()\n", + " cubes_metadata_new = []\n", + " for cube in cubes_metadata:\n", + " product_id, last_updated = cube\n", + " # Update the date field so it is formatted the same as the date field in the downloaded table. NOTE(review): strptime() returns a naive datetime, which astimezone() interprets as system-local time rather than UTC - confirm.\n", + " last_updated = datetime.strptime(last_updated, \"%Y-%m-%dT%H:%M:%SZ\").astimezone(ZoneInfo(\"UTC\"))\n", + " last_updated = last_updated.isoformat()\n", + " cubes_metadata_new.append((product_id, last_updated))\n", + " \n", + " cur.executemany(\"INSERT INTO cubes VALUES(?, ?)\", cubes_metadata_new)\n", + " con.commit()\n", + "\n", + " cur.execute(\"\"\"\n", + " SELECT a.product_id\n", + " FROM downloaded AS a,\n", + " cubes AS b\n", + " WHERE 
a.product_id = b.product_id\n", + " AND b.last_updated > a.last_updated\n", + " \"\"\")\n", + " results = cur.fetchall()\n", + " for result in results:\n", + " product_id = result[0]\n", + " print(f\"Updating product_id: {product_id}\")\n", + " process_cube(product_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "id": "c0d46f38-242c-4685-b8b6-4e046c23aec5", + "metadata": {}, + "outputs": [], + "source": [ + "cur.execute(\"\"\"\n", + " SELECT a.product_id, a.last_updated, b.last_updated\n", + " FROM downloaded AS a,\n", + " cubes AS b\n", + " WHERE a.product_id = b.product_id\n", + " AND b.last_updated > a.last_updated\n", + "\"\"\")\n", + "difference = pd.DataFrame(cur.fetchall(), columns=[\"product_id\", \"release_time_metadata\", \"release_time_cubelist\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "id": "a5e12a7a-9891-4d58-9368-e31b274feb1d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | product_id | \n", + "release_time_metadata | \n", + "release_time_cubelist | \n", + "
|---|---|---|---|
| 0 | \n", + "10100006 | \n", + "2025-05-21T12:30:00+00:00 | \n", + "2025-06-17T12:30:00+00:00 | \n", + "
| 1 | \n", + "10100108 | \n", + "2025-05-13T12:30:00+00:00 | \n", + "2025-06-12T12:30:00+00:00 | \n", + "
| 2 | \n", + "10100132 | \n", + "2025-06-06T12:30:00+00:00 | \n", + "2025-06-13T12:30:00+00:00 | \n", + "
| 3 | \n", + "10100136 | \n", + "2025-06-10T12:30:00+00:00 | \n", + "2025-06-16T12:30:00+00:00 | \n", + "
| 4 | \n", + "10100138 | \n", + "2025-06-06T12:30:00+00:00 | \n", + "2025-06-13T12:30:00+00:00 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 92 | \n", + "38100235 | \n", + "2025-03-13T12:30:00+00:00 | \n", + "2025-06-12T12:30:00+00:00 | \n", + "
| 93 | \n", + "38100236 | \n", + "2025-03-13T12:30:00+00:00 | \n", + "2025-06-12T12:30:00+00:00 | \n", + "
| 94 | \n", + "38100237 | \n", + "2025-03-13T12:30:00+00:00 | \n", + "2025-06-12T12:30:00+00:00 | \n", + "
| 95 | \n", + "38100238 | \n", + "2025-03-13T12:30:00+00:00 | \n", + "2025-06-12T12:30:00+00:00 | \n", + "
| 96 | \n", + "38100164 | \n", + "2023-09-13T12:30:00+00:00 | \n", + "2025-06-17T12:30:00+00:00 | \n", + "
97 rows × 3 columns
\n", + "