mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Scraping the table names from https://www150.statcan.gc.ca/n1/en/type/data
Will compare against the productIds available at https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite
This commit is contained in:
@@ -0,0 +1,707 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "a9b38b9a-cc9a-464b-83c5-156bee74e053",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import glob\n",
|
||||
"import sqlite3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "56054a29-f6c8-43b6-a331-c38d53246a4c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"con = sqlite3.connect(\"/data/tables/processing.db\")\n",
|
||||
"cur = con.cursor()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "9c8ce39e-36e1-4aec-86fa-8c64514f52eb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('98100297', None), ('98100103', None)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT product_id, last_processed FROM downloaded\n",
|
||||
"WHERE last_processed IS NULL\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"cur.fetchall()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "8dddc0c2-377e-4ecc-a2b5-031c9008e7f5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(7889,)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT count(*) FROM downloaded\n",
|
||||
"\"\"\")\n",
|
||||
"cur.fetchall()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "c351ce04-a543-4fb1-b174-eb5d50ed0fe1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"7889\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT DISTINCT product_id FROM downloaded\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"product_ids_processed = [x[0] for x in cur.fetchall()]\n",
|
||||
"print(len(product_ids_processed))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e821aac3-fddf-49de-bea6-936ade6fda61",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# This is the entire universe of productIds according to Statistics Canada"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "63c4818f-7edc-4105-a376-a7ae70212f70",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"7917\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT DISTINCT product_id FROM cubes\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"print(len(cur.fetchall()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fa537a9e-7ed1-4bc3-885e-30fd2aab8f7f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The remaining productIds that I need to download"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "3cf6b247-95b2-42b2-9ef5-c7b046a40d73",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT DISTINCT product_id FROM cubes\n",
|
||||
"WHERE product_id NOT IN (SELECT product_id FROM downloaded)\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"to_download = [x[0] for x in cur.fetchall()]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "3aba4d9f-aaf4-40b0-a802-b8a77517c1a9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"import glob\n",
|
||||
"from multiprocessing import Pool\n",
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"import sqlite3\n",
|
||||
"import zipfile\n",
|
||||
"from zoneinfo import ZoneInfo\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"import requests\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"\n",
|
||||
"data_folder = \"/data/tables\"\n",
|
||||
"input_folder = f\"{data_folder}/input\"\n",
|
||||
"scratch_folder = f\"{data_folder}/scratch\"\n",
|
||||
"output_folder = f\"{data_folder}/output\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def download_cube(product_id, language=\"en\"):\n",
|
||||
" \"\"\"\n",
|
||||
" Downloads the English CSV for a specific table\n",
|
||||
" \"\"\"\n",
|
||||
" download_url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/en\"\n",
|
||||
" response = requests.get(download_url).json()\n",
|
||||
" zip_url = response['object']\n",
|
||||
" zip_file_name = f\"{input_folder}/{language}/june_25_2025/{product_id}.zip\"\n",
|
||||
" print(f\"Downloading {zip_url} to {zip_file_name}\")\n",
|
||||
" response = requests.get(zip_url, stream=True, headers={\"user-agent\": None})\n",
|
||||
" progress_bar = tqdm(\n",
|
||||
" desc=zip_file_name,\n",
|
||||
" total=int(response.headers.get(\"content-length\", 0)),\n",
|
||||
" unit=\"B\",\n",
|
||||
" unit_scale=True\n",
|
||||
" )\n",
|
||||
" with open(zip_file_name, \"wb\") as handle:\n",
|
||||
" for chunk in response.iter_content(chunk_size=512):\n",
|
||||
" if chunk: # filter out keep-alive new chunks\n",
|
||||
" handle.write(chunk)\n",
|
||||
" progress_bar.update(len(chunk))\n",
|
||||
" progress_bar.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "38e30a9b-c185-4111-be6b-3f1dc704b15e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100147-eng.zip to /data/tables/input/en/june_25_2025/12100147.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100147.zip: 100%|█████████████████| 312M/312M [07:33<00:00, 688kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100148-eng.zip to /data/tables/input/en/june_25_2025/12100148.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100148.zip: 10\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100149-eng.zip to /data/tables/input/en/june_25_2025/12100149.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100149.zip: 100%|██████████████| 1.42G/1.42G [14:39<00:00, 1.62MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100150-eng.zip to /data/tables/input/en/june_25_2025/12100150.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100150.zip: 100%|████████████████| 317M/317M [04:48<00:00, 1.10MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100151-eng.zip to /data/tables/input/en/june_25_2025/12100151.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100151.zip: 100%|██████████████| 2.13G/2.13G [24:42<00:00, 1.43MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100152-eng.zip to /data/tables/input/en/june_25_2025/12100152.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100152.zip: 100%|██████████████| 6.94G/6.94G [57:20<00:00, 2.02MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100153-eng.zip to /data/tables/input/en/june_25_2025/12100153.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100153.zip: 3.48kB [00:00, 17.1MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100154-eng.zip to /data/tables/input/en/june_25_2025/12100154.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100154.zip: 3.48kB [00:00, 19.5MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100155-eng.zip to /data/tables/input/en/june_25_2025/12100155.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100155.zip: 3.48kB [00:00, 16.2MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100156-eng.zip to /data/tables/input/en/june_25_2025/12100156.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100156.zip: 3.48kB [00:00, 16.3MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100442-eng.zip to /data/tables/input/en/june_25_2025/13100442.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/13100442.zip: 100%|██████████████| 1.02M/1.02M [00:00<00:00, 1.15MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100958-eng.zip to /data/tables/input/en/june_25_2025/13100958.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/13100958.zip: 100%|██████████████| 11.8k/11.8k [00:00<00:00, 52.3MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/33100852-eng.zip to /data/tables/input/en/june_25_2025/33100852.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/33100852.zip: 100%|██████████████| 5.26k/5.26k [00:00<00:00, 26.0MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100293-eng.zip to /data/tables/input/en/june_25_2025/34100293.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/34100293.zip: 100%|██████████████| 49.3M/49.3M [00:47<00:00, 1.04MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100294-eng.zip to /data/tables/input/en/june_25_2025/34100294.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/34100294.zip: 100%|███████████████| 50.6k/50.6k [00:00<00:00, 102MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100295-eng.zip to /data/tables/input/en/june_25_2025/34100295.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/34100295.zip: 100%|███████████████| 40.9k/40.9k [00:00<00:00, 101MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100271-eng.zip to /data/tables/input/en/june_25_2025/37100271.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100271.zip: 100%|███████████████| 5.92M/5.92M [00:07<00:00, 780kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100272-eng.zip to /data/tables/input/en/june_25_2025/37100272.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100272.zip: 100%|███████████████| 14.8M/14.8M [00:19<00:00, 763kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100273-eng.zip to /data/tables/input/en/june_25_2025/37100273.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100273.zip: 100%|███████████████| 9.07M/9.07M [00:12<00:00, 711kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100274-eng.zip to /data/tables/input/en/june_25_2025/37100274.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100274.zip: 100%|██████████████| 23.5M/23.5M [00:07<00:00, 3.03MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100289-eng.zip to /data/tables/input/en/june_25_2025/37100289.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100289.zip: 100%|█████████████████| 544k/544k [00:01<00:00, 470kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100290-eng.zip to /data/tables/input/en/june_25_2025/37100290.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100290.zip: 100%|███████████████| 10.4M/10.4M [00:11<00:00, 927kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100291-eng.zip to /data/tables/input/en/june_25_2025/37100291.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100291.zip: 100%|████████████████| 129M/129M [01:41<00:00, 1.28MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100292-eng.zip to /data/tables/input/en/june_25_2025/37100292.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100292.zip: 100%|██████████████| 1.57M/1.57M [00:00<00:00, 4.13MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100182-eng.zip to /data/tables/input/en/june_25_2025/38100182.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/38100182.zip: 100%|█████████████████| 403k/403k [00:00<00:00, 937kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100183-eng.zip to /data/tables/input/en/june_25_2025/38100183.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/38100183.zip: 100%|████████████████| 105k/105k [00:00<00:00, 2.88MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100110-eng.zip to /data/tables/input/en/june_25_2025/45100110.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/45100110.zip: 100%|███████████████| 16.2M/16.2M [00:19<00:00, 817kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100111-eng.zip to /data/tables/input/en/june_25_2025/45100111.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/45100111.zip: 100%|███████████████| 5.20M/5.20M [00:11<00:00, 435kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/46100092-eng.zip to /data/tables/input/en/june_25_2025/46100092.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/46100092.zip: 100%|██████████████| 73.9k/73.9k [00:00<00:00, 1.90MB/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for product_id in to_download:\n",
|
||||
" download_cube(product_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ee91149b-5349-41f5-b7f8-1a310f272c89",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The remaining productIds that I need to process from the input data directory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "b9c7c4d7-31db-4f29-95cf-a718a660e2e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remaining_to_process = []\n",
|
||||
"for file in glob.glob(\"/data/tables/input/en/*.zip\"):\n",
|
||||
" product_id = file.split(\"/\")[-1].split(\".zip\")[0]\n",
|
||||
" if product_id not in product_ids_processed:\n",
|
||||
" remaining_to_process.append(product_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "a70f84cb-b978-4642-bf29-ce0d98342a9f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(len(remaining_to_process))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "bab154cc-8fe7-49b1-a67e-581d9ad8334b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['13100442']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(remaining_to_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "af6efc2a-b10c-40af-acb4-8ab94e2bf59a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user