Scraping the table names from https://www150.statcan.gc.ca/n1/en/type/data

Will compare against the productIds available at https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite
2026-06-13 14:10:55 +02:00 · 2025-06-27 09:11:42 -04:00
parent b88a2272b4
commit 6ad2e2c4d6
18 changed files with 75040 additions and 0 deletions
@@ -0,0 +1,93 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "561e3485-50c3-4177-861a-5989ae437b33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "import aria2p\n",
+    "\n",
+    "# initialization, these are the default values\n",
+    "aria2 = aria2p.API(\n",
+    "    aria2p.Client(\n",
+    "        host=\"http://localhost\",\n",
+    "        port=6800,\n",
+    "        secret=\"\"\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e30a5be2-a498-4f3c-9a94-a7a2659f258d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper function to wait for download to finish\n",
+    "def wait_for_download(download):\n",
+    "    print(f\"Waiting for download: {download.name}\")\n",
+    "    while download.is_active:\n",
+    "        print(f\"Progress: {download.progress_string()} - {download.status}\")\n",
+    "        time.sleep(10)\n",
+    "        download = aria2.get_download(download.gid)  # refresh state\n",
+    "    print(f\"Download completed: {download.name} - Status: {download.status}\")\n",
+    "    return download"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9fc190bd-3c68-48d7-902e-770bbd69fc38",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Waiting for download: test.parquet\n",
+      "Progress: 0.00% - active\n"
+     ]
+    }
+   ],
+   "source": [
+    "urls = ['https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/db_2021.parquet']\n",
+    "\n",
+    "download = aria2.add_uris(urls, options={\"out\": \"test.parquet\"})\n",
+    "download = wait_for_download(download)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8000ee01-d10c-44d9-a826-2d3c44dabd7a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,2 @@
+# You'll need to stop the crawler after the last page, as it will just restart at page 1
+# -O overwrites tables.json; -o would append to an existing file, which leaves invalid JSON on a re-run
+scrapy crawl statcan_tables -O tables.json
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class ScrapeStatcanTablesItem(scrapy.Item):
+    """Item model for the scrape_statcan_tables project.
+
+    No fields are declared yet. Add them as class attributes, for example
+    ``name = scrapy.Field()``.
+    """
@@ -0,0 +1,100 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class ScrapeStatcanTablesSpiderMiddleware:
+    """Pass-through spider middleware for the scrape_statcan_tables project.
+
+    Every hook forwards what it receives unchanged. Not all methods need to
+    be defined: if a method is missing, Scrapy acts as if the spider
+    middleware does not modify the passed objects.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and subscribe it to ``spider_opened``.
+
+        Scrapy calls this to instantiate the middleware (not the spider).
+        """
+        middleware = cls()
+        crawler.signals.connect(
+            middleware.spider_opened, signal=signals.spider_opened
+        )
+        return middleware
+
+    def process_spider_input(self, response, spider):
+        """Handle a response on its way through the middleware into the spider.
+
+        Should return None or raise an exception; this one always returns
+        None.
+        """
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        """Yield every Request or item the spider produced, unchanged.
+
+        Called with the results returned from the spider after it has
+        processed ``response``.
+        """
+        yield from result
+
+    def process_spider_exception(self, response, exception, spider):
+        """Decline to handle an exception raised by the spider.
+
+        Called when a spider or a ``process_spider_input()`` method of
+        another spider middleware raises. Should return None or an iterable
+        of Request or item objects; this one always returns None.
+        """
+        return None
+
+    async def process_start(self, start):
+        """Forward every start item or request unchanged.
+
+        Called with an async iterator over the spider ``start()`` method or
+        the matching method of an earlier spider middleware.
+        """
+        async for item_or_request in start:
+            yield item_or_request
+
+    def spider_opened(self, spider):
+        """Log that ``spider`` has been opened."""
+        # Pass the name as a logging argument rather than %-formatting it
+        # here: formatting is deferred until the record is emitted, and a
+        # tuple-valued name cannot break the format string.
+        spider.logger.info("Spider opened: %s", spider.name)
+
+
+class ScrapeStatcanTablesDownloaderMiddleware:
+    """Pass-through downloader middleware for the scrape_statcan_tables project.
+
+    Every hook forwards what it receives unchanged. Not all methods need to
+    be defined: if a method is missing, Scrapy acts as if the downloader
+    middleware does not modify the passed objects.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and subscribe it to ``spider_opened``.
+
+        Scrapy calls this to instantiate the middleware (not the spider).
+        """
+        middleware = cls()
+        crawler.signals.connect(
+            middleware.spider_opened, signal=signals.spider_opened
+        )
+        return middleware
+
+    def process_request(self, request, spider):
+        """Let every request continue through the downloader untouched.
+
+        Called for each request that goes through the downloader middleware.
+        Must either:
+        - return None: continue processing this request
+        - or return a Response object
+        - or return a Request object
+        - or raise IgnoreRequest: process_exception() methods of
+          installed downloader middleware will be called
+
+        This implementation always returns None.
+        """
+        return None
+
+    def process_response(self, request, response, spider):
+        """Return the downloaded response unchanged.
+
+        Called with the response returned from the downloader.
+        Must either:
+        - return a Response object
+        - return a Request object
+        - or raise IgnoreRequest
+        """
+        return response
+
+    def process_exception(self, request, exception, spider):
+        """Decline to handle a download exception.
+
+        Called when a download handler or a process_request() method of
+        another downloader middleware raises an exception.
+        Must either:
+        - return None: continue processing this exception
+        - return a Response object: stops process_exception() chain
+        - return a Request object: stops process_exception() chain
+
+        This implementation always returns None.
+        """
+        return None
+
+    def spider_opened(self, spider):
+        """Log that ``spider`` has been opened."""
+        # Pass the name as a logging argument rather than %-formatting it
+        # here: formatting is deferred until the record is emitted, and a
+        # tuple-valued name cannot break the format string.
+        spider.logger.info("Spider opened: %s", spider.name)
@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class ScrapeStatcanTablesPipeline:
+    """Identity item pipeline: every item is handed on untouched."""
+
+    def process_item(self, item, spider):
+        """Return ``item`` exactly as received; ``spider`` is not used."""
+        return item
@@ -0,0 +1,93 @@
+# Scrapy settings for scrape_statcan_tables project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+# Name of the bot implemented by this Scrapy project
+BOT_NAME = "scrape_statcan_tables"
+
+# Modules where Scrapy looks for spiders
+SPIDER_MODULES = ["scrape_statcan_tables.spiders"]
+# Module where the `scrapy genspider` command creates new spiders
+NEWSPIDER_MODULE = "scrape_statcan_tables.spiders"
+
+# No Scrapy add-ons are enabled
+ADDONS = {}
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = "scrape_statcan_tables (+http://www.yourdomain.com)"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#    "Accept-Language": "en",
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    "scrape_statcan_tables.middlewares.ScrapeStatcanTablesSpiderMiddleware": 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    "scrape_statcan_tables.middlewares.ScrapeStatcanTablesDownloaderMiddleware": 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    "scrape_statcan_tables.pipelines.ScrapeStatcanTablesPipeline": 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = "httpcache"
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+# Feed exports are written as UTF-8
+FEED_EXPORT_ENCODING = "utf-8"
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
@@ -0,0 +1,26 @@
+import scrapy
+import re
+
+
+class StatCanDataTablesSpider(scrapy.Spider):
+    """Collect table IDs from the Statistics Canada data listing.
+
+    Each listing page is searched for ``<span>Table:</span>`` markers and
+    the text that follows is emitted as ``{'table_id': ...}``. The spider
+    then follows the ``rel=next`` pagination link.
+
+    Every table ID is emitted once per crawl. Pagination stops at the first
+    page that lists table IDs but none that are new, so the crawl ends on
+    its own if the listing wraps back to a page already visited.
+    """
+
+    name = "statcan_tables"
+    allowed_domains = ["www150.statcan.gc.ca"]
+    start_urls = [
+        "https://www150.statcan.gc.ca/n1/en/type/data?count=100#tables"
+    ]
+
+    # Captures the text after "<span>Table:</span>", stopping at a tag or
+    # line break. Finds values like "12-10-0134-01" or "45-67-8910-02".
+    TABLE_ID_RE = re.compile(r'<span>Table:</span>\s*([^<\n\r]+)')
+
+    def __init__(self, *args, **kwargs):
+        """Initialise the spider with an empty record of emitted table IDs."""
+        super().__init__(*args, **kwargs)
+        # Table IDs already emitted during this crawl
+        self._seen_table_ids = set()
+
+    def parse(self, response):
+        """Yield unseen table IDs from ``response``, then the next page.
+
+        Yields one ``{'table_id': <str>}`` dict per table ID not emitted
+        earlier in the crawl, followed by a request for the ``rel=next``
+        link when one exists. Yields nothing when the page lists table IDs
+        but all of them were already emitted.
+        """
+        page_ids = [
+            match.strip() for match in self.TABLE_ID_RE.findall(response.text)
+        ]
+        # dict.fromkeys keeps page order while dropping repeats within the page
+        new_ids = list(dict.fromkeys(
+            table_id for table_id in page_ids
+            if table_id not in self._seen_table_ids
+        ))
+
+        # NOTE(review): presumably the last page's "next" link points back
+        # to the first page - confirm against the live site. A page whose
+        # table IDs were all seen before is treated as that wrap-around.
+        if page_ids and not new_ids:
+            self.logger.info(
+                "No new table IDs on %s; stopping pagination", response.url
+            )
+            return
+
+        self._seen_table_ids.update(new_ids)
+        for table_id in new_ids:
+            yield {
+                'table_id': table_id
+            }
+
+        # Follow the "Next" pagination link
+        next_page = response.css('a[rel=next]::attr(href)').get()
+        if next_page:
+            yield response.follow(next_page, callback=self.parse)
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = scrape_statcan_tables.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = scrape_statcan_tables
@@ -0,0 +1,707 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a9b38b9a-cc9a-464b-83c5-156bee74e053",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "import sqlite3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "56054a29-f6c8-43b6-a331-c38d53246a4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "con = sqlite3.connect(\"/data/tables/processing.db\")\n",
+    "cur = con.cursor()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9c8ce39e-36e1-4aec-86fa-8c64514f52eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('98100297', None), ('98100103', None)]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cur.execute(\"\"\"\n",
+    "SELECT product_id, last_processed FROM downloaded\n",
+    "WHERE last_processed IS NULL\n",
+    "\"\"\")\n",
+    "\n",
+    "cur.fetchall()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "8dddc0c2-377e-4ecc-a2b5-031c9008e7f5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[(7889,)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cur.execute(\"\"\"\n",
+    "SELECT count(*) FROM downloaded\n",
+    "\"\"\")\n",
+    "cur.fetchall()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "c351ce04-a543-4fb1-b174-eb5d50ed0fe1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "7889\n"
+     ]
+    }
+   ],
+   "source": [
+    "cur.execute(\"\"\"\n",
+    "SELECT DISTINCT product_id FROM downloaded\n",
+    "\"\"\")\n",
+    "\n",
+    "product_ids_processed = [x[0] for x in cur.fetchall()]\n",
+    "print(len(product_ids_processed))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e821aac3-fddf-49de-bea6-936ade6fda61",
+   "metadata": {},
+   "source": [
+    "# This is the entire productIds universe according to Statistics Canada"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "63c4818f-7edc-4105-a376-a7ae70212f70",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "7917\n"
+     ]
+    }
+   ],
+   "source": [
+    "cur.execute(\"\"\"\n",
+    "SELECT DISTINCT product_id FROM cubes\n",
+    "\"\"\")\n",
+    "\n",
+    "print(len(cur.fetchall()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fa537a9e-7ed1-4bc3-885e-30fd2aab8f7f",
+   "metadata": {},
+   "source": [
+    "# The remaining productIds that I need to download"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "3cf6b247-95b2-42b2-9ef5-c7b046a40d73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cur.execute(\"\"\"\n",
+    "SELECT DISTINCT product_id FROM cubes\n",
+    "WHERE product_id NOT IN (SELECT product_id FROM downloaded)\n",
+    "\"\"\")\n",
+    "\n",
+    "to_download = [x[0] for x in cur.fetchall()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "3aba4d9f-aaf4-40b0-a802-b8a77517c1a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "import glob\n",
+    "from multiprocessing import Pool\n",
+    "import json\n",
+    "import os\n",
+    "import sqlite3\n",
+    "import zipfile\n",
+    "from zoneinfo import ZoneInfo\n",
+    "\n",
+    "import pandas as pd\n",
+    "import requests\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "data_folder = \"/data/tables\"\n",
+    "input_folder = f\"{data_folder}/input\"\n",
+    "scratch_folder = f\"{data_folder}/scratch\"\n",
+    "output_folder = f\"{data_folder}/output\"\n",
+    "\n",
+    "\n",
+    "def download_cube(product_id, language=\"en\"):\n",
+    "    \"\"\"\n",
+    "    Downloads the English CSV for a specific table\n",
+    "    \"\"\"\n",
+    "    download_url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/en\"\n",
+    "    response = requests.get(download_url).json()\n",
+    "    zip_url = response['object']\n",
+    "    zip_file_name = f\"{input_folder}/{language}/june_25_2025/{product_id}.zip\"\n",
+    "    print(f\"Downloading {zip_url} to {zip_file_name}\")\n",
+    "    response = requests.get(zip_url, stream=True, headers={\"user-agent\": None})\n",
+    "    progress_bar = tqdm(\n",
+    "        desc=zip_file_name,\n",
+    "        total=int(response.headers.get(\"content-length\", 0)),\n",
+    "        unit=\"B\",\n",
+    "        unit_scale=True\n",
+    "    )\n",
+    "    with open(zip_file_name, \"wb\") as handle:\n",
+    "        for chunk in response.iter_content(chunk_size=512):\n",
+    "            if chunk:  # filter out keep-alive new chunks\n",
+    "                handle.write(chunk)\n",
+    "                progress_bar.update(len(chunk))\n",
+    "        progress_bar.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "38e30a9b-c185-4111-be6b-3f1dc704b15e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100147-eng.zip to /data/tables/input/en/june_25_2025/12100147.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100147.zip: 100%|█████████████████| 312M/312M [07:33<00:00, 688kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100148-eng.zip to /data/tables/input/en/june_25_2025/12100148.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100148.zip: 10\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100149-eng.zip to /data/tables/input/en/june_25_2025/12100149.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100149.zip: 100%|██████████████| 1.42G/1.42G [14:39<00:00, 1.62MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100150-eng.zip to /data/tables/input/en/june_25_2025/12100150.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100150.zip: 100%|████████████████| 317M/317M [04:48<00:00, 1.10MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100151-eng.zip to /data/tables/input/en/june_25_2025/12100151.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100151.zip: 100%|██████████████| 2.13G/2.13G [24:42<00:00, 1.43MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100152-eng.zip to /data/tables/input/en/june_25_2025/12100152.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100152.zip: 100%|██████████████| 6.94G/6.94G [57:20<00:00, 2.02MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100153-eng.zip to /data/tables/input/en/june_25_2025/12100153.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100153.zip: 3.48kB [00:00, 17.1MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100154-eng.zip to /data/tables/input/en/june_25_2025/12100154.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100154.zip: 3.48kB [00:00, 19.5MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100155-eng.zip to /data/tables/input/en/june_25_2025/12100155.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100155.zip: 3.48kB [00:00, 16.2MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100156-eng.zip to /data/tables/input/en/june_25_2025/12100156.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/12100156.zip: 3.48kB [00:00, 16.3MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100442-eng.zip to /data/tables/input/en/june_25_2025/13100442.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/13100442.zip: 100%|██████████████| 1.02M/1.02M [00:00<00:00, 1.15MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100958-eng.zip to /data/tables/input/en/june_25_2025/13100958.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/13100958.zip: 100%|██████████████| 11.8k/11.8k [00:00<00:00, 52.3MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/33100852-eng.zip to /data/tables/input/en/june_25_2025/33100852.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/33100852.zip: 100%|██████████████| 5.26k/5.26k [00:00<00:00, 26.0MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100293-eng.zip to /data/tables/input/en/june_25_2025/34100293.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/34100293.zip: 100%|██████████████| 49.3M/49.3M [00:47<00:00, 1.04MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100294-eng.zip to /data/tables/input/en/june_25_2025/34100294.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/34100294.zip: 100%|███████████████| 50.6k/50.6k [00:00<00:00, 102MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100295-eng.zip to /data/tables/input/en/june_25_2025/34100295.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/34100295.zip: 100%|███████████████| 40.9k/40.9k [00:00<00:00, 101MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100271-eng.zip to /data/tables/input/en/june_25_2025/37100271.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/37100271.zip: 100%|███████████████| 5.92M/5.92M [00:07<00:00, 780kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100272-eng.zip to /data/tables/input/en/june_25_2025/37100272.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/37100272.zip: 100%|███████████████| 14.8M/14.8M [00:19<00:00, 763kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100273-eng.zip to /data/tables/input/en/june_25_2025/37100273.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/37100273.zip: 100%|███████████████| 9.07M/9.07M [00:12<00:00, 711kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100274-eng.zip to /data/tables/input/en/june_25_2025/37100274.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/37100274.zip: 100%|██████████████| 23.5M/23.5M [00:07<00:00, 3.03MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100289-eng.zip to /data/tables/input/en/june_25_2025/37100289.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/37100289.zip: 100%|█████████████████| 544k/544k [00:01<00:00, 470kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100290-eng.zip to /data/tables/input/en/june_25_2025/37100290.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/37100290.zip: 100%|███████████████| 10.4M/10.4M [00:11<00:00, 927kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100291-eng.zip to /data/tables/input/en/june_25_2025/37100291.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/37100291.zip: 100%|████████████████| 129M/129M [01:41<00:00, 1.28MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100292-eng.zip to /data/tables/input/en/june_25_2025/37100292.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/37100292.zip: 100%|██████████████| 1.57M/1.57M [00:00<00:00, 4.13MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100182-eng.zip to /data/tables/input/en/june_25_2025/38100182.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/38100182.zip: 100%|█████████████████| 403k/403k [00:00<00:00, 937kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100183-eng.zip to /data/tables/input/en/june_25_2025/38100183.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/38100183.zip: 100%|████████████████| 105k/105k [00:00<00:00, 2.88MB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100110-eng.zip to /data/tables/input/en/june_25_2025/45100110.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/45100110.zip: 100%|███████████████| 16.2M/16.2M [00:19<00:00, 817kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100111-eng.zip to /data/tables/input/en/june_25_2025/45100111.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/45100111.zip: 100%|███████████████| 5.20M/5.20M [00:11<00:00, 435kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading https://www150.statcan.gc.ca/n1/tbl/csv/46100092-eng.zip to /data/tables/input/en/june_25_2025/46100092.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/data/tables/input/en/june_25_2025/46100092.zip: 100%|██████████████| 73.9k/73.9k [00:00<00:00, 1.90MB/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for product_id in to_download:\n",
+    "    download_cube(product_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ee91149b-5349-41f5-b7f8-1a310f272c89",
+   "metadata": {},
+   "source": [
+    "# The remaining productIds that I need to process from input data directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "b9c7c4d7-31db-4f29-95cf-a718a660e2e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "remaining_to_process = []\n",
+    "for file in glob.glob(\"/data/tables/input/en/*.zip\"):\n",
+    "    product_id = file.split(\"/\")[-1].split(\".zip\")[0]\n",
+    "    if product_id not in product_ids_processed:\n",
+    "        remaining_to_process.append(product_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "a70f84cb-b978-4642-bf29-ce0d98342a9f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(remaining_to_process))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "bab154cc-8fe7-49b1-a67e-581d9ad8334b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['13100442']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(remaining_to_process)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af6efc2a-b10c-40af-acb4-8ab94e2bf59a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}