mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Experiment with Jupyter notebook on downloading and processing statcan cubes
This commit is contained in:
@@ -0,0 +1,507 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "8b767621-b96b-4eaa-a16a-0329bab29c0f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Notes\n",
|
||||||
|
"- Make sure Statistics Canada knows about these issues:\n",
|
||||||
|
" - When downloading the XML for product_id 98100404 it just returns the structure document, not the data document\n",
|
||||||
|
" - The releaseTime value is in Eastern Time (America/Toronto) for https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata but in UTC for https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n",
|
||||||
|
" - The releaseTime value returned by https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite differs from the one returned by https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata (for example, for productId 10100007)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 188,
|
||||||
|
"id": "98859cd6-6fa4-4aef-a113-455699524fae",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from datetime import datetime\n",
|
||||||
|
"import glob\n",
|
||||||
|
"from multiprocessing import Pool\n",
|
||||||
|
"import json\n",
|
||||||
|
"import os\n",
|
||||||
|
"import sqlite3\n",
|
||||||
|
"import zipfile\n",
|
||||||
|
"from zoneinfo import ZoneInfo\n",
|
||||||
|
"\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import polars as pl\n",
|
||||||
|
"import requests\n",
|
||||||
|
"from tqdm import tqdm\n",
|
||||||
|
"\n",
|
||||||
|
"data_folder = \"/data/tables\"\n",
|
||||||
|
"input_folder = f\"{data_folder}/input\"\n",
|
||||||
|
"scratch_folder = f\"{data_folder}/scratch\"\n",
|
||||||
|
"output_folder = f\"{data_folder}/output\"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"if not os.path.exists(\"processing.db\"):\n",
|
||||||
|
" con = sqlite3.connect('processing.db')\n",
|
||||||
|
" cur = con.cursor()\n",
|
||||||
|
" cur.executescript(\"\"\"\n",
|
||||||
|
" CREATE TABLE IF NOT EXISTS downloaded (\n",
|
||||||
|
" product_id TEXT PRIMARY KEY,\n",
|
||||||
|
" last_updated TEXT\n",
|
||||||
|
" );\n",
|
||||||
|
"\n",
|
||||||
|
" CREATE TABLE IF NOT EXISTS cubes (\n",
|
||||||
|
" product_id TEXT PRIMARY KEY,\n",
|
||||||
|
" last_updated TEXT\n",
|
||||||
|
" );\n",
|
||||||
|
" \"\"\")\n",
|
||||||
|
" con.commit()\n",
|
||||||
|
"else:\n",
|
||||||
|
" con = sqlite3.connect('processing.db')\n",
|
||||||
|
" cur = con.cursor()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 96,
|
||||||
|
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def setup():\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Makes data folders\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" folders_to_create = [data_folder, input_folder, \n",
|
||||||
|
" scratch_folder, output_folder,\n",
|
||||||
|
" f\"{input_folder}/en\", f\"{output_folder}/en\",\n",
|
||||||
|
" f\"{input_folder}/fr\", f\"{output_folder}/fr\",\n",
|
||||||
|
" f\"{input_folder}/metadata\"]\n",
|
||||||
|
" for folder in folders_to_create:\n",
|
||||||
|
" if not os.path.exists(folder):\n",
|
||||||
|
" print(f\"Making folder {folder}\")\n",
|
||||||
|
" os.mkdir(folder)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 97,
|
||||||
|
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"setup()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 177,
|
||||||
|
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def update_last_downloaded(product_id):\n",
|
||||||
|
" filepath = f\"{input_folder}/metadata/{product_id}.json\"\n",
|
||||||
|
" print(f\"Reading metadata {filepath}\")\n",
|
||||||
|
" with open(filepath, 'r') as fp:\n",
|
||||||
|
" metadata = json.load(fp)\n",
|
||||||
|
" product_id = metadata.get(\"object\").get(\"productId\")\n",
|
||||||
|
" last_updated = metadata.get(\"object\").get(\"releaseTime\")\n",
|
||||||
|
" # Convert last_updated to UTC since /getAllCubesListLite uses UTC\n",
|
||||||
|
" last_updated = datetime.strptime(last_updated, \"%Y-%m-%dT%H:%M\")\n",
|
||||||
|
" last_updated = last_updated.replace(tzinfo=ZoneInfo(\"America/Toronto\"))\n",
|
||||||
|
" last_updated = last_updated.astimezone(ZoneInfo(\"UTC\")).isoformat()\n",
|
||||||
|
" \n",
|
||||||
|
" data = (product_id, last_updated)\n",
|
||||||
|
" cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
||||||
|
" result = cur.fetchone()\n",
|
||||||
|
" if not result:\n",
|
||||||
|
" cur.execute(\"INSERT INTO downloaded VALUES (?, ?)\", data)\n",
|
||||||
|
" else:\n",
|
||||||
|
" cur.execute(\"UPDATE downloaded SET last_updated = ? WHERE product_id = ?\", (last_updated, product_id))\n",
|
||||||
|
"\n",
|
||||||
|
" con.commit()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 170,
|
||||||
|
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def update_tables():\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" This currently does not work as expected because Statistics Canada has discrepancies.\n",
|
||||||
|
" The \"releaseTime\" listed in https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n",
|
||||||
|
" for every productId is not the same as \"releaseTime\" listed when making a POST to\n",
|
||||||
|
" https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata , for example:\n",
|
||||||
|
" [{\"productId\":10100007}]\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" cur.execute(\"\"\"\n",
|
||||||
|
" DELETE FROM cubes;\n",
|
||||||
|
" \"\"\")\n",
|
||||||
|
" con.commit()\n",
|
||||||
|
" response = requests.get(\"https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\").json()\n",
|
||||||
|
" cubes_metadata = pl.from_dicts(response)[['productId', 'releaseTime']]\n",
|
||||||
|
" cubes_metadata = cubes_metadata.rename({\"productId\": \"product_id\", \"releaseTime\": \"last_updated\"})\n",
|
||||||
|
" cubes_metadata = cubes_metadata.rows()\n",
|
||||||
|
" cubes_metadata_new = []\n",
|
||||||
|
" for cube in cubes_metadata:\n",
|
||||||
|
" product_id, last_updated = cube\n",
|
||||||
|
" # Update the date field so it is formatted the same as date field in downloaded table\n",
|
||||||
|
" last_updated = datetime.strptime(last_updated, \"%Y-%m-%dT%H:%M:%SZ\").astimezone(ZoneInfo(\"UTC\"))\n",
|
||||||
|
" last_updated = last_updated.isoformat()\n",
|
||||||
|
" cubes_metadata_new.append((product_id, last_updated))\n",
|
||||||
|
" \n",
|
||||||
|
" cur.executemany(\"INSERT INTO cubes VALUES(?, ?)\", cubes_metadata_new)\n",
|
||||||
|
" con.commit()\n",
|
||||||
|
"\n",
|
||||||
|
" cur.execute(\"\"\"\n",
|
||||||
|
" SELECT a.product_id\n",
|
||||||
|
" FROM downloaded AS a,\n",
|
||||||
|
" cubes AS b\n",
|
||||||
|
" WHERE a.product_id = b.product_id\n",
|
||||||
|
" AND b.last_updated > a.last_updated\n",
|
||||||
|
" \"\"\")\n",
|
||||||
|
" results = cur.fetchall()\n",
|
||||||
|
" for result in results:\n",
|
||||||
|
" product_id = result[0]\n",
|
||||||
|
" print(f\"Updating product_id: {product_id}\")\n",
|
||||||
|
" process_cube(product_id)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 186,
|
||||||
|
"id": "c0d46f38-242c-4685-b8b6-4e046c23aec5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"cur.execute(\"\"\"\n",
|
||||||
|
" SELECT a.product_id, a.last_updated, b.last_updated\n",
|
||||||
|
" FROM downloaded AS a,\n",
|
||||||
|
" cubes AS b\n",
|
||||||
|
" WHERE a.product_id = b.product_id\n",
|
||||||
|
" AND b.last_updated > a.last_updated\n",
|
||||||
|
"\"\"\")\n",
|
||||||
|
"difference = pd.DataFrame(cur.fetchall(), columns=[\"product_id\", \"release_time_metadata\", \"release_time_cubelist\"])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 189,
|
||||||
|
"id": "a5e12a7a-9891-4d58-9368-e31b274feb1d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>product_id</th>\n",
|
||||||
|
" <th>release_time_metadata</th>\n",
|
||||||
|
" <th>release_time_cubelist</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>10100006</td>\n",
|
||||||
|
" <td>2025-05-21T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-17T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>10100108</td>\n",
|
||||||
|
" <td>2025-05-13T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>10100132</td>\n",
|
||||||
|
" <td>2025-06-06T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-13T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>10100136</td>\n",
|
||||||
|
" <td>2025-06-10T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-16T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>10100138</td>\n",
|
||||||
|
" <td>2025-06-06T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-13T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>92</th>\n",
|
||||||
|
" <td>38100235</td>\n",
|
||||||
|
" <td>2025-03-13T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>93</th>\n",
|
||||||
|
" <td>38100236</td>\n",
|
||||||
|
" <td>2025-03-13T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>94</th>\n",
|
||||||
|
" <td>38100237</td>\n",
|
||||||
|
" <td>2025-03-13T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>95</th>\n",
|
||||||
|
" <td>38100238</td>\n",
|
||||||
|
" <td>2025-03-13T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>96</th>\n",
|
||||||
|
" <td>38100164</td>\n",
|
||||||
|
" <td>2023-09-13T12:30:00+00:00</td>\n",
|
||||||
|
" <td>2025-06-17T12:30:00+00:00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>97 rows × 3 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" product_id release_time_metadata release_time_cubelist\n",
|
||||||
|
"0 10100006 2025-05-21T12:30:00+00:00 2025-06-17T12:30:00+00:00\n",
|
||||||
|
"1 10100108 2025-05-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||||
|
"2 10100132 2025-06-06T12:30:00+00:00 2025-06-13T12:30:00+00:00\n",
|
||||||
|
"3 10100136 2025-06-10T12:30:00+00:00 2025-06-16T12:30:00+00:00\n",
|
||||||
|
"4 10100138 2025-06-06T12:30:00+00:00 2025-06-13T12:30:00+00:00\n",
|
||||||
|
".. ... ... ...\n",
|
||||||
|
"92 38100235 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||||
|
"93 38100236 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||||
|
"94 38100237 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||||
|
"95 38100238 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||||
|
"96 38100164 2023-09-13T12:30:00+00:00 2025-06-17T12:30:00+00:00\n",
|
||||||
|
"\n",
|
||||||
|
"[97 rows x 3 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 189,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"difference"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 193,
|
||||||
|
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def convert_to_lowest_type(df):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Convert columns to the best possible dtypes\n",
|
||||||
|
" For example, if the column is numerical and has a maximum value of 32,000 \n",
|
||||||
|
" we can assign it a type of int16\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" print(\"Converting dataframe to optimal data types\")\n",
|
||||||
|
" params = {\n",
|
||||||
|
" 'convert_string': False,\n",
|
||||||
|
" 'convert_boolean': False\n",
|
||||||
|
" }\n",
|
||||||
|
" df = df.convert_dtypes(**params)\n",
|
||||||
|
"\n",
|
||||||
|
" dtypes = pd.DataFrame(df.dtypes)\n",
|
||||||
|
" # Downcast to the smallest numerical dtype\n",
|
||||||
|
" for row in dtypes.itertuples():\n",
|
||||||
|
" column = row[0]\n",
|
||||||
|
" the_type = str(row[1])\n",
|
||||||
|
" # Skipping downcasting Float64 as there were issues with decimal places\n",
|
||||||
|
" # For example, instead of a value being 65.4, it turned into 65.4000015258789\n",
|
||||||
|
" if the_type == 'Float64':\n",
|
||||||
|
" continue\n",
|
||||||
|
" elif the_type == 'Int64':\n",
|
||||||
|
" df[column] = pd.to_numeric(df[column], downcast='integer')\n",
|
||||||
|
"\n",
|
||||||
|
" return df\n",
|
||||||
|
"\n",
|
||||||
|
"def extract_zipfile(product_id, language):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" It is faster to extract the zip file and read the CSV than to open it\n",
|
||||||
|
" via zipfile and then Pandas\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" zip_file = f\"{input_folder}/{language}/{product_id}.zip\"\n",
|
||||||
|
" with zipfile.ZipFile(zip_file) as myzip:\n",
|
||||||
|
" print(f\"Extracting {zip_file} to {scratch_folder}\")\n",
|
||||||
|
" myzip.extractall(path=scratch_folder)\n",
|
||||||
|
"\n",
|
||||||
|
"def get_cube_metadata(product_id):\n",
|
||||||
|
" url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata\"\n",
|
||||||
|
" cubes_payload = [{\"productId\": product_id}]\n",
|
||||||
|
" result = requests.post(url, json=cubes_payload)\n",
|
||||||
|
" result = result.json()[0]\n",
|
||||||
|
" return result\n",
|
||||||
|
"\n",
|
||||||
|
"def download_cube(product_id, language=\"en\"):\n",
|
||||||
|
" download_url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/en\"\n",
|
||||||
|
" response = requests.get(download_url).json()\n",
|
||||||
|
" zip_url = response['object']\n",
|
||||||
|
" zip_file_name = f\"{input_folder}/{language}/{product_id}.zip\"\n",
|
||||||
|
" print(f\"Downloading {zip_url} to {zip_file_name}\")\n",
|
||||||
|
" response = requests.get(zip_url, stream=True, headers={\"user-agent\": None})\n",
|
||||||
|
" progress_bar = tqdm(\n",
|
||||||
|
" desc=zip_file_name,\n",
|
||||||
|
" total=int(response.headers.get(\"content-length\", 0)),\n",
|
||||||
|
" unit=\"B\",\n",
|
||||||
|
" unit_scale=True\n",
|
||||||
|
" )\n",
|
||||||
|
" with open(zip_file_name, \"wb\") as handle:\n",
|
||||||
|
" for chunk in response.iter_content(chunk_size=512):\n",
|
||||||
|
" if chunk: # filter out keep-alive new chunks\n",
|
||||||
|
" handle.write(chunk)\n",
|
||||||
|
" progress_bar.update(len(chunk))\n",
|
||||||
|
" progress_bar.close()\n",
|
||||||
|
" \n",
|
||||||
|
"def process_cube(product_id, language=\"en\"):\n",
|
||||||
|
" extract_zipfile(product_id, language)\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" The pandas column reader is better than the Polars one\n",
|
||||||
|
" Here is an example where polars was not reading it right:\n",
|
||||||
|
" https://www150.statcan.gc.ca/n1/tbl/csv/98100404-eng.zip\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" # Get metadata\n",
|
||||||
|
" metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n",
|
||||||
|
" metadata = get_cube_metadata(product_id)\n",
|
||||||
|
" print(f\"Writing metadata file {metadata_file}\")\n",
|
||||||
|
" with open(metadata_file, \"w\") as outfile:\n",
|
||||||
|
" json.dump(metadata, outfile)\n",
|
||||||
|
" update_last_downloaded(product_id)\n",
|
||||||
|
" # Read CSV using Pandas\n",
|
||||||
|
" product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
|
||||||
|
" parameters = {\n",
|
||||||
|
" \"engine\": \"c\",\n",
|
||||||
|
" \"low_memory\": True\n",
|
||||||
|
" }\n",
|
||||||
|
" print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
|
||||||
|
" df = pd.read_csv(product_csv, **parameters)\n",
|
||||||
|
" df = convert_to_lowest_type(df)\n",
|
||||||
|
" print(\"Import Pandas dataframe as a Polars dataframe\")\n",
|
||||||
|
" df = pl.from_pandas(df)\n",
|
||||||
|
" output_parquet = f\"{output_folder}/{language}/{product_id}.parquet\"\n",
|
||||||
|
" print(f\"Exporting dataframe as parquet to {output_parquet}\")\n",
|
||||||
|
" df.write_parquet(f\"{output_folder}/{language}/{product_id}.parquet\",\n",
|
||||||
|
" compression='zstd',\n",
|
||||||
|
" compression_level=22)\n",
|
||||||
|
" # Remove the scratch files\n",
|
||||||
|
" print(\"Removing scratch files\")\n",
|
||||||
|
" os.remove(f\"{scratch_folder}/{product_id}.csv\")\n",
|
||||||
|
" os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\") \n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 164,
|
||||||
|
"id": "6be0842e-c1e0-45b4-90e5-0be28b963aed",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Extracting /data/tables/input/en/10100111.zip to /data/tables/scratch\n",
|
||||||
|
"Writing metadata file /data/tables/input/metadata/10100111.json\n",
|
||||||
|
"Reading metadata /data/tables/input/metadata/10100111.json\n",
|
||||||
|
"('10100111', '2025-06-18T12:30:00+00:00')\n",
|
||||||
|
"Reading /data/tables/scratch/10100111.csv as a Pandas dataframe\n",
|
||||||
|
"Converting dataframe to optimal data types\n",
|
||||||
|
"Import Pandas dataframe as a Polars dataframe\n",
|
||||||
|
"Exporting dataframe as parquet to /data/tables/output/en/10100111.parquet\n",
|
||||||
|
"Removing scratch files\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"process_cube(\"10100111\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 194,
|
||||||
|
"id": "8a193195-bfa6-4df3-adab-0f88025276da",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100164-eng.zip to /data/tables/input/en/38100164.zip\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/data/tables/input/en/38100164.zip: 100%|██████████████████| 980k/980k [00:00<00:00, 1.27MB/s]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"download_cube(\"38100164\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "fc4ae317-3e3e-4c27-a2fe-7e6bd12f673f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user