mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Scraping the table names from https://www150.statcan.gc.ca/n1/en/type/data
Will compare against the productIds available at https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite
This commit is contained in:
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -0,0 +1,93 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "561e3485-50c3-4177-861a-5989ae437b33",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"import aria2p\n",
|
||||
"\n",
|
||||
"# initialization, these are the default values\n",
|
||||
"aria2 = aria2p.API(\n",
|
||||
" aria2p.Client(\n",
|
||||
" host=\"http://localhost\",\n",
|
||||
" port=6800,\n",
|
||||
" secret=\"\"\n",
|
||||
" )\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "e30a5be2-a498-4f3c-9a94-a7a2659f258d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Helper function to wait for download to finish\n",
|
||||
"def wait_for_download(download):\n",
|
||||
" print(f\"Waiting for download: {download.name}\")\n",
|
||||
" while download.is_active:\n",
|
||||
" print(f\"Progress: {download.progress_string()} - {download.status}\")\n",
|
||||
" time.sleep(10)\n",
|
||||
" download = aria2.get_download(download.gid) # refresh state\n",
|
||||
" print(f\"Download completed: {download.name} - Status: {download.status}\")\n",
|
||||
" return download"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9fc190bd-3c68-48d7-902e-770bbd69fc38",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Waiting for download: test.parquet\n",
|
||||
"Progress: 0.00% - active\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"urls = ['https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/db_2021.parquet']\n",
|
||||
"\n",
|
||||
"download = aria2.add_uris(urls, options={\"out\": \"test.parquet\"})\n",
|
||||
"download = wait_for_download(download)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8000ee01-d10c-44d9-a826-2d3c44dabd7a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,2 @@
|
||||
# You'll need to stop the crawler after the last page, as it will just restart at page 1
|
||||
scrapy crawl statcan_tables -o tables.json
|
||||
@@ -0,0 +1,12 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class ScrapeStatcanTablesItem(scrapy.Item):
    """Placeholder item model for the scrape_statcan_tables project.

    No fields are declared yet. Declare them as class attributes, for
    example ``name = scrapy.Field()``.
    """
|
||||
@@ -0,0 +1,100 @@
|
||||
# Define here the models for your spider middleware
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
|
||||
class ScrapeStatcanTablesSpiderMiddleware:
    """Pass-through spider middleware for the scrape_statcan_tables project.

    Every hook hands its input back unchanged. Not all methods need to be
    defined: if a method is not defined, Scrapy acts as if the spider
    middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Args:
            crawler: Object whose ``signals.connect`` is used to register
                the ``spider_opened`` handler.

        Returns:
            The newly created middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Handle a response on its way through the middleware into the spider.

        Should return None or raise an exception; this implementation
        always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of ``result`` unchanged.

        Called with the results returned from the spider after it has
        processed the response. Must return an iterable of Request or
        item objects.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Handle an exception raised by a spider or ``process_spider_input()``.

        Should return either None or an iterable of Request or item
        objects; this implementation returns None.
        """
        return None

    async def process_start(self, start):
        """Re-yield every element of the async iterator ``start`` unchanged.

        Called with an async iterator over the spider ``start()`` method or
        the matching method of an earlier spider middleware.
        """
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Hand the name to the logger as an argument instead of formatting
        # with "%" up front, so interpolation is left to the logging call.
        spider.logger.info("Spider opened: %s", spider.name)
|
||||
|
||||
|
||||
class ScrapeStatcanTablesDownloaderMiddleware:
    """Pass-through downloader middleware for the scrape_statcan_tables project.

    Every hook leaves requests and responses untouched. Not all methods
    need to be defined: if a method is not defined, Scrapy acts as if the
    downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Args:
            crawler: Object whose ``signals.connect`` is used to register
                the ``spider_opened`` handler.

        Returns:
            The newly created middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def process_request(self, request, spider):
        """Handle a request passing through the downloader middleware.

        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Return ``response`` unchanged.

        Called with the response returned from the downloader. Must either:
        - return a Response object
        - return a Request object
        - or raise IgnoreRequest
        """
        return response

    def process_exception(self, request, exception, spider):
        """Handle an exception from a download handler or ``process_request()``.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain

        This implementation returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Hand the name to the logger as an argument instead of formatting
        # with "%" up front, so interpolation is left to the logging call.
        spider.logger.info("Spider opened: %s", spider.name)
|
||||
@@ -0,0 +1,13 @@
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
|
||||
class ScrapeStatcanTablesPipeline:
    """No-op item pipeline: each item is handed back unmodified."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item
|
||||
@@ -0,0 +1,93 @@
|
||||
# Scrapy settings for scrape_statcan_tables project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
BOT_NAME = "scrape_statcan_tables"
|
||||
|
||||
SPIDER_MODULES = ["scrape_statcan_tables.spiders"]
|
||||
NEWSPIDER_MODULE = "scrape_statcan_tables.spiders"
|
||||
|
||||
ADDONS = {}
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
#USER_AGENT = "scrape_statcan_tables (+http://www.yourdomain.com)"
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = True
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
#CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
#DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
#COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
#TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Override the default request headers:
|
||||
#DEFAULT_REQUEST_HEADERS = {
|
||||
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
# "Accept-Language": "en",
|
||||
#}
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
#SPIDER_MIDDLEWARES = {
|
||||
# "scrape_statcan_tables.middlewares.ScrapeStatcanTablesSpiderMiddleware": 543,
|
||||
#}
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
#DOWNLOADER_MIDDLEWARES = {
|
||||
# "scrape_statcan_tables.middlewares.ScrapeStatcanTablesDownloaderMiddleware": 543,
|
||||
#}
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||
#EXTENSIONS = {
|
||||
# "scrapy.extensions.telnet.TelnetConsole": None,
|
||||
#}
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
#ITEM_PIPELINES = {
|
||||
# "scrape_statcan_tables.pipelines.ScrapeStatcanTablesPipeline": 300,
|
||||
#}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||
#AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
#AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
#HTTPCACHE_ENABLED = True
|
||||
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||
#HTTPCACHE_DIR = "httpcache"
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
|
||||
|
||||
# Set settings whose default value is deprecated to a future-proof value
|
||||
FEED_EXPORT_ENCODING = "utf-8"
|
||||
@@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
@@ -0,0 +1,26 @@
|
||||
import scrapy
|
||||
import re
|
||||
|
||||
|
||||
class StatCanDataTablesSpider(scrapy.Spider):
    """Spider that collects table identifiers from the StatCan data listing.

    Starting from the listing URL, it yields one ``{'table_id': ...}`` dict
    per match found on a page and follows the ``rel=next`` link for as long
    as the page provides one.
    """

    name = "statcan_tables"
    allowed_domains = ["www150.statcan.gc.ca"]
    start_urls = ["https://www150.statcan.gc.ca/n1/en/type/data?count=100#tables"]

    def parse(self, response):
        """Yield the table ids found in ``response``, then the next-page request.

        Args:
            response: Page response; its ``text``, ``css`` and ``follow``
                members are used.

        Yields:
            A ``{'table_id': str}`` dict for every match, followed by a
            request for the next page when a ``rel=next`` link is present.
        """
        # Capture whatever follows "<span>Table:</span>", stopping at the
        # next tag or line break, e.g. "12-10-0134-01".
        table_pattern = r'\<span\>Table:\<\/span\>\s*([^<\n\r]+)'
        for found in re.finditer(table_pattern, response.text):
            # Strip surrounding whitespace from the captured identifier.
            yield {'table_id': found.group(1).strip()}

        # Follow the "Next" pagination link; stop when the page has none.
        next_href = response.css('a[rel=next]::attr(href)').get()
        if not next_href:
            return
        yield response.follow(next_href, callback=self.parse)
|
||||
@@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = scrape_statcan_tables.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = scrape_statcan_tables
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -0,0 +1,707 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "a9b38b9a-cc9a-464b-83c5-156bee74e053",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import glob\n",
|
||||
"import sqlite3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "56054a29-f6c8-43b6-a331-c38d53246a4c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"con = sqlite3.connect(\"/data/tables/processing.db\")\n",
|
||||
"cur = con.cursor()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "9c8ce39e-36e1-4aec-86fa-8c64514f52eb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('98100297', None), ('98100103', None)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT product_id, last_processed FROM downloaded\n",
|
||||
"WHERE last_processed IS NULL\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"cur.fetchall()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "8dddc0c2-377e-4ecc-a2b5-031c9008e7f5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(7889,)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT count(*) FROM downloaded\n",
|
||||
"\"\"\")\n",
|
||||
"cur.fetchall()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "c351ce04-a543-4fb1-b174-eb5d50ed0fe1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"7889\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT DISTINCT product_id FROM downloaded\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"product_ids_processed = [x[0] for x in cur.fetchall()]\n",
|
||||
"print(len(product_ids_processed))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e821aac3-fddf-49de-bea6-936ade6fda61",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# This is the entire productIds universe according to Statistics Canada"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "63c4818f-7edc-4105-a376-a7ae70212f70",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"7917\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT DISTINCT product_id FROM cubes\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"print(len(cur.fetchall()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fa537a9e-7ed1-4bc3-885e-30fd2aab8f7f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The remaining productIds that I need to download"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "3cf6b247-95b2-42b2-9ef5-c7b046a40d73",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
"SELECT DISTINCT product_id FROM cubes\n",
|
||||
"WHERE product_id NOT IN (SELECT product_id FROM downloaded)\n",
|
||||
"\"\"\")\n",
|
||||
"\n",
|
||||
"to_download = [x[0] for x in cur.fetchall()]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "3aba4d9f-aaf4-40b0-a802-b8a77517c1a9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"import glob\n",
|
||||
"from multiprocessing import Pool\n",
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"import sqlite3\n",
|
||||
"import zipfile\n",
|
||||
"from zoneinfo import ZoneInfo\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"import requests\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"\n",
|
||||
"data_folder = \"/data/tables\"\n",
|
||||
"input_folder = f\"{data_folder}/input\"\n",
|
||||
"scratch_folder = f\"{data_folder}/scratch\"\n",
|
||||
"output_folder = f\"{data_folder}/output\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def download_cube(product_id, language=\"en\"):\n",
|
||||
" \"\"\"\n",
|
||||
" Downloads the English CSV for a specific table\n",
|
||||
" \"\"\"\n",
|
||||
" download_url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/en\"\n",
|
||||
" response = requests.get(download_url).json()\n",
|
||||
" zip_url = response['object']\n",
|
||||
" zip_file_name = f\"{input_folder}/{language}/june_25_2025/{product_id}.zip\"\n",
|
||||
" print(f\"Downloading {zip_url} to {zip_file_name}\")\n",
|
||||
" response = requests.get(zip_url, stream=True, headers={\"user-agent\": None})\n",
|
||||
" progress_bar = tqdm(\n",
|
||||
" desc=zip_file_name,\n",
|
||||
" total=int(response.headers.get(\"content-length\", 0)),\n",
|
||||
" unit=\"B\",\n",
|
||||
" unit_scale=True\n",
|
||||
" )\n",
|
||||
" with open(zip_file_name, \"wb\") as handle:\n",
|
||||
" for chunk in response.iter_content(chunk_size=512):\n",
|
||||
" if chunk: # filter out keep-alive new chunks\n",
|
||||
" handle.write(chunk)\n",
|
||||
" progress_bar.update(len(chunk))\n",
|
||||
" progress_bar.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "38e30a9b-c185-4111-be6b-3f1dc704b15e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100147-eng.zip to /data/tables/input/en/june_25_2025/12100147.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100147.zip: 100%|█████████████████| 312M/312M [07:33<00:00, 688kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100148-eng.zip to /data/tables/input/en/june_25_2025/12100148.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100148.zip: 10\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100149-eng.zip to /data/tables/input/en/june_25_2025/12100149.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100149.zip: 100%|██████████████| 1.42G/1.42G [14:39<00:00, 1.62MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100150-eng.zip to /data/tables/input/en/june_25_2025/12100150.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100150.zip: 100%|████████████████| 317M/317M [04:48<00:00, 1.10MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100151-eng.zip to /data/tables/input/en/june_25_2025/12100151.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100151.zip: 100%|██████████████| 2.13G/2.13G [24:42<00:00, 1.43MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100152-eng.zip to /data/tables/input/en/june_25_2025/12100152.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100152.zip: 100%|██████████████| 6.94G/6.94G [57:20<00:00, 2.02MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100153-eng.zip to /data/tables/input/en/june_25_2025/12100153.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100153.zip: 3.48kB [00:00, 17.1MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100154-eng.zip to /data/tables/input/en/june_25_2025/12100154.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100154.zip: 3.48kB [00:00, 19.5MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100155-eng.zip to /data/tables/input/en/june_25_2025/12100155.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100155.zip: 3.48kB [00:00, 16.2MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100156-eng.zip to /data/tables/input/en/june_25_2025/12100156.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/12100156.zip: 3.48kB [00:00, 16.3MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100442-eng.zip to /data/tables/input/en/june_25_2025/13100442.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/13100442.zip: 100%|██████████████| 1.02M/1.02M [00:00<00:00, 1.15MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100958-eng.zip to /data/tables/input/en/june_25_2025/13100958.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/13100958.zip: 100%|██████████████| 11.8k/11.8k [00:00<00:00, 52.3MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/33100852-eng.zip to /data/tables/input/en/june_25_2025/33100852.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/33100852.zip: 100%|██████████████| 5.26k/5.26k [00:00<00:00, 26.0MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100293-eng.zip to /data/tables/input/en/june_25_2025/34100293.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/34100293.zip: 100%|██████████████| 49.3M/49.3M [00:47<00:00, 1.04MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100294-eng.zip to /data/tables/input/en/june_25_2025/34100294.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/34100294.zip: 100%|███████████████| 50.6k/50.6k [00:00<00:00, 102MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100295-eng.zip to /data/tables/input/en/june_25_2025/34100295.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/34100295.zip: 100%|███████████████| 40.9k/40.9k [00:00<00:00, 101MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100271-eng.zip to /data/tables/input/en/june_25_2025/37100271.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100271.zip: 100%|███████████████| 5.92M/5.92M [00:07<00:00, 780kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100272-eng.zip to /data/tables/input/en/june_25_2025/37100272.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100272.zip: 100%|███████████████| 14.8M/14.8M [00:19<00:00, 763kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100273-eng.zip to /data/tables/input/en/june_25_2025/37100273.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100273.zip: 100%|███████████████| 9.07M/9.07M [00:12<00:00, 711kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100274-eng.zip to /data/tables/input/en/june_25_2025/37100274.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100274.zip: 100%|██████████████| 23.5M/23.5M [00:07<00:00, 3.03MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100289-eng.zip to /data/tables/input/en/june_25_2025/37100289.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100289.zip: 100%|█████████████████| 544k/544k [00:01<00:00, 470kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100290-eng.zip to /data/tables/input/en/june_25_2025/37100290.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100290.zip: 100%|███████████████| 10.4M/10.4M [00:11<00:00, 927kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100291-eng.zip to /data/tables/input/en/june_25_2025/37100291.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100291.zip: 100%|████████████████| 129M/129M [01:41<00:00, 1.28MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100292-eng.zip to /data/tables/input/en/june_25_2025/37100292.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/37100292.zip: 100%|██████████████| 1.57M/1.57M [00:00<00:00, 4.13MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100182-eng.zip to /data/tables/input/en/june_25_2025/38100182.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/38100182.zip: 100%|█████████████████| 403k/403k [00:00<00:00, 937kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100183-eng.zip to /data/tables/input/en/june_25_2025/38100183.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/38100183.zip: 100%|████████████████| 105k/105k [00:00<00:00, 2.88MB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100110-eng.zip to /data/tables/input/en/june_25_2025/45100110.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/45100110.zip: 100%|███████████████| 16.2M/16.2M [00:19<00:00, 817kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100111-eng.zip to /data/tables/input/en/june_25_2025/45100111.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/45100111.zip: 100%|███████████████| 5.20M/5.20M [00:11<00:00, 435kB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/46100092-eng.zip to /data/tables/input/en/june_25_2025/46100092.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/june_25_2025/46100092.zip: 100%|██████████████| 73.9k/73.9k [00:00<00:00, 1.90MB/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for product_id in to_download:\n",
|
||||
" download_cube(product_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ee91149b-5349-41f5-b7f8-1a310f272c89",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The remaining productIds that I need to process from input data directory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "b9c7c4d7-31db-4f29-95cf-a718a660e2e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remaining_to_process = []\n",
|
||||
"for file in glob.glob(\"/data/tables/input/en/*.zip\"):\n",
|
||||
" product_id = file.split(\"/\")[-1].split(\".zip\")[0]\n",
|
||||
" if product_id not in product_ids_processed:\n",
|
||||
" remaining_to_process.append(product_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "a70f84cb-b978-4642-bf29-ce0d98342a9f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(len(remaining_to_process))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "bab154cc-8fe7-49b1-a67e-581d9ad8334b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['13100442']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(remaining_to_process)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "af6efc2a-b10c-40af-acb4-8ab94e2bf59a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user