Will compare against the productIds available at  https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite
This commit is contained in:
Diego Ripley
2025-06-27 09:11:42 -04:00
parent b88a2272b4
commit 6ad2e2c4d6
18 changed files with 75040 additions and 0 deletions
@@ -0,0 +1,707 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a9b38b9a-cc9a-464b-83c5-156bee74e053",
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"import sqlite3"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "56054a29-f6c8-43b6-a331-c38d53246a4c",
"metadata": {},
"outputs": [],
"source": [
"# Open the SQLite database that holds the `downloaded` and `cubes` tables queried below.\n",
"con = sqlite3.connect(database=\"/data/tables/processing.db\")\n",
"cur = con.cursor()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9c8ce39e-36e1-4aec-86fa-8c64514f52eb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('98100297', None), ('98100103', None)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of `downloaded` whose last_processed is still NULL.\n",
"cur.execute(\"\"\"\n",
"SELECT product_id, last_processed FROM downloaded\n",
"WHERE last_processed IS NULL\n",
"\"\"\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8dddc0c2-377e-4ecc-a2b5-031c9008e7f5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(7889,)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Total number of rows in `downloaded`.\n",
"cur.execute(\"\"\"\n",
"SELECT count(*) FROM downloaded\n",
"\"\"\").fetchall()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c351ce04-a543-4fb1-b174-eb5d50ed0fe1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7889\n"
]
}
],
"source": [
"cur.execute(\"\"\"\n",
"SELECT DISTINCT product_id FROM downloaded\n",
"\"\"\")\n",
"\n",
"# Every row is a 1-tuple, so unpack it to keep the bare product_id.\n",
"product_ids_processed = [product_id for (product_id,) in cur]\n",
"print(len(product_ids_processed))"
]
},
{
"cell_type": "markdown",
"id": "e821aac3-fddf-49de-bea6-936ade6fda61",
"metadata": {},
"source": [
"# This is the entire productIds universe according to Statistics Canada"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "63c4818f-7edc-4105-a376-a7ae70212f70",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7917\n"
]
}
],
"source": [
"# Number of distinct product_ids listed in `cubes`.\n",
"cube_id_rows = cur.execute(\"\"\"\n",
"SELECT DISTINCT product_id FROM cubes\n",
"\"\"\").fetchall()\n",
"print(len(cube_id_rows))"
]
},
{
"cell_type": "markdown",
"id": "fa537a9e-7ed1-4bc3-885e-30fd2aab8f7f",
"metadata": {},
"source": [
"# The remaining productIds that I need to download"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "3cf6b247-95b2-42b2-9ef5-c7b046a40d73",
"metadata": {},
"outputs": [],
"source": [
"# Products listed in `cubes` that have no matching row in `downloaded`.\n",
"# NOT EXISTS replaces NOT IN: with NOT IN, a single NULL product_id coming out\n",
"# of the subquery makes the predicate NULL for every row, so nothing is returned.\n",
"cur.execute(\"\"\"\n",
"SELECT DISTINCT c.product_id FROM cubes AS c\n",
"WHERE c.product_id IS NOT NULL\n",
"AND NOT EXISTS (\n",
"    SELECT 1 FROM downloaded AS d WHERE d.product_id = c.product_id\n",
")\n",
"\"\"\")\n",
"\n",
"to_download = [x[0] for x in cur.fetchall()]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "3aba4d9f-aaf4-40b0-a802-b8a77517c1a9",
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"import glob\n",
"from multiprocessing import Pool\n",
"import json\n",
"import os\n",
"import sqlite3\n",
"import zipfile\n",
"from zoneinfo import ZoneInfo\n",
"\n",
"import pandas as pd\n",
"import requests\n",
"from tqdm import tqdm\n",
"\n",
"data_folder = \"/data/tables\"\n",
"input_folder = f\"{data_folder}/input\"\n",
"scratch_folder = f\"{data_folder}/scratch\"\n",
"output_folder = f\"{data_folder}/output\"\n",
"\n",
"\n",
"def download_cube(product_id, language=\"en\"):\n",
"    \"\"\"\n",
"    Download the full-table CSV zip of one product from the StatCan WDS API.\n",
"\n",
"    The getFullTableDownloadCSV endpoint is asked for the zip URL, and that zip\n",
"    is then streamed to {input_folder}/{language}/june_25_2025/{product_id}.zip.\n",
"\n",
"    Args:\n",
"        product_id: productId of the table to download.\n",
"        language: language segment used in both the WDS request and the\n",
"            destination folder. Defaults to \"en\".\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: if either request returns an error status.\n",
"        KeyError: if the WDS JSON reply has no \"object\" entry.\n",
"    \"\"\"\n",
"    # The language is part of the endpoint path; it was previously pinned to\n",
"    # \"en\" no matter what the caller passed.\n",
"    download_url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/{language}\"\n",
"    lookup = requests.get(download_url)\n",
"    lookup.raise_for_status()\n",
"    zip_url = lookup.json()['object']\n",
"\n",
"    zip_file_name = f\"{input_folder}/{language}/june_25_2025/{product_id}.zip\"\n",
"    # Create the dated destination folder on first use so open() cannot fail on it.\n",
"    os.makedirs(os.path.dirname(zip_file_name), exist_ok=True)\n",
"    print(f\"Downloading {zip_url} to {zip_file_name}\")\n",
"\n",
"    # A None header value tells requests to leave that header out of the request.\n",
"    with requests.get(zip_url, stream=True, headers={\"user-agent\": None}) as response:\n",
"        # Stop before touching the disk so an error page is never saved as a .zip.\n",
"        response.raise_for_status()\n",
"        # Both context managers close even if the transfer fails part-way.\n",
"        with open(zip_file_name, \"wb\") as handle, tqdm(\n",
"            desc=zip_file_name,\n",
"            total=int(response.headers.get(\"content-length\", 0)),\n",
"            unit=\"B\",\n",
"            unit_scale=True\n",
"        ) as progress_bar:\n",
"            # 1 MiB chunks keep the write count manageable for multi-gigabyte tables.\n",
"            for chunk in response.iter_content(chunk_size=1024 * 1024):\n",
"                if chunk:  # filter out keep-alive new chunks\n",
"                    handle.write(chunk)\n",
"                    progress_bar.update(len(chunk))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "38e30a9b-c185-4111-be6b-3f1dc704b15e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100147-eng.zip to /data/tables/input/en/june_25_2025/12100147.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100147.zip: 100%|█████████████████| 312M/312M [07:33<00:00, 688kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100148-eng.zip to /data/tables/input/en/june_25_2025/12100148.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100148.zip: 10\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100149-eng.zip to /data/tables/input/en/june_25_2025/12100149.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100149.zip: 100%|██████████████| 1.42G/1.42G [14:39<00:00, 1.62MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100150-eng.zip to /data/tables/input/en/june_25_2025/12100150.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100150.zip: 100%|████████████████| 317M/317M [04:48<00:00, 1.10MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100151-eng.zip to /data/tables/input/en/june_25_2025/12100151.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100151.zip: 100%|██████████████| 2.13G/2.13G [24:42<00:00, 1.43MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100152-eng.zip to /data/tables/input/en/june_25_2025/12100152.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100152.zip: 100%|██████████████| 6.94G/6.94G [57:20<00:00, 2.02MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100153-eng.zip to /data/tables/input/en/june_25_2025/12100153.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100153.zip: 3.48kB [00:00, 17.1MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100154-eng.zip to /data/tables/input/en/june_25_2025/12100154.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100154.zip: 3.48kB [00:00, 19.5MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100155-eng.zip to /data/tables/input/en/june_25_2025/12100155.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100155.zip: 3.48kB [00:00, 16.2MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100156-eng.zip to /data/tables/input/en/june_25_2025/12100156.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/12100156.zip: 3.48kB [00:00, 16.3MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100442-eng.zip to /data/tables/input/en/june_25_2025/13100442.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/13100442.zip: 100%|██████████████| 1.02M/1.02M [00:00<00:00, 1.15MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100958-eng.zip to /data/tables/input/en/june_25_2025/13100958.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/13100958.zip: 100%|██████████████| 11.8k/11.8k [00:00<00:00, 52.3MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/33100852-eng.zip to /data/tables/input/en/june_25_2025/33100852.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/33100852.zip: 100%|██████████████| 5.26k/5.26k [00:00<00:00, 26.0MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100293-eng.zip to /data/tables/input/en/june_25_2025/34100293.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/34100293.zip: 100%|██████████████| 49.3M/49.3M [00:47<00:00, 1.04MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100294-eng.zip to /data/tables/input/en/june_25_2025/34100294.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/34100294.zip: 100%|███████████████| 50.6k/50.6k [00:00<00:00, 102MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100295-eng.zip to /data/tables/input/en/june_25_2025/34100295.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/34100295.zip: 100%|███████████████| 40.9k/40.9k [00:00<00:00, 101MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100271-eng.zip to /data/tables/input/en/june_25_2025/37100271.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/37100271.zip: 100%|███████████████| 5.92M/5.92M [00:07<00:00, 780kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100272-eng.zip to /data/tables/input/en/june_25_2025/37100272.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/37100272.zip: 100%|███████████████| 14.8M/14.8M [00:19<00:00, 763kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100273-eng.zip to /data/tables/input/en/june_25_2025/37100273.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/37100273.zip: 100%|███████████████| 9.07M/9.07M [00:12<00:00, 711kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100274-eng.zip to /data/tables/input/en/june_25_2025/37100274.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/37100274.zip: 100%|██████████████| 23.5M/23.5M [00:07<00:00, 3.03MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100289-eng.zip to /data/tables/input/en/june_25_2025/37100289.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/37100289.zip: 100%|█████████████████| 544k/544k [00:01<00:00, 470kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100290-eng.zip to /data/tables/input/en/june_25_2025/37100290.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/37100290.zip: 100%|███████████████| 10.4M/10.4M [00:11<00:00, 927kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100291-eng.zip to /data/tables/input/en/june_25_2025/37100291.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/37100291.zip: 100%|████████████████| 129M/129M [01:41<00:00, 1.28MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100292-eng.zip to /data/tables/input/en/june_25_2025/37100292.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/37100292.zip: 100%|██████████████| 1.57M/1.57M [00:00<00:00, 4.13MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100182-eng.zip to /data/tables/input/en/june_25_2025/38100182.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/38100182.zip: 100%|█████████████████| 403k/403k [00:00<00:00, 937kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100183-eng.zip to /data/tables/input/en/june_25_2025/38100183.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/38100183.zip: 100%|████████████████| 105k/105k [00:00<00:00, 2.88MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100110-eng.zip to /data/tables/input/en/june_25_2025/45100110.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/45100110.zip: 100%|███████████████| 16.2M/16.2M [00:19<00:00, 817kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100111-eng.zip to /data/tables/input/en/june_25_2025/45100111.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/45100111.zip: 100%|███████████████| 5.20M/5.20M [00:11<00:00, 435kB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/46100092-eng.zip to /data/tables/input/en/june_25_2025/46100092.zip\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/tables/input/en/june_25_2025/46100092.zip: 100%|██████████████| 73.9k/73.9k [00:00<00:00, 1.90MB/s]\n"
]
}
],
"source": [
"# Download every missing product, one at a time. A failed request is reported\n",
"# and remembered instead of aborting the rest of a batch that can run for hours.\n",
"failed_downloads = []\n",
"for product_id in to_download:\n",
"    try:\n",
"        download_cube(product_id)\n",
"    except requests.RequestException as error:\n",
"        print(f\"Failed to download {product_id}: {error}\")\n",
"        failed_downloads.append(product_id)"
]
},
{
"cell_type": "markdown",
"id": "ee91149b-5349-41f5-b7f8-1a310f272c89",
"metadata": {},
"source": [
"# The remaining productIds that I need to process from input data directory"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "b9c7c4d7-31db-4f29-95cf-a718a660e2e8",
"metadata": {},
"outputs": [],
"source": [
"# Zip files in the input folder whose product_id has no row in `downloaded` yet.\n",
"# A set gives constant-time membership tests instead of scanning the full id\n",
"# list once per file.\n",
"already_processed = set(product_ids_processed)\n",
"remaining_to_process = []\n",
"for file in glob.glob(\"/data/tables/input/en/*.zip\"):\n",
"    product_id = file.split(\"/\")[-1].split(\".zip\")[0]\n",
"    if product_id not in already_processed:\n",
"        remaining_to_process.append(product_id)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a70f84cb-b978-4642-bf29-ce0d98342a9f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n"
]
}
],
"source": [
"print(len(remaining_to_process))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "bab154cc-8fe7-49b1-a67e-581d9ad8334b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['13100442']\n"
]
}
],
"source": [
"print(remaining_to_process)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af6efc2a-b10c-40af-acb4-8ab94e2bf59a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}