mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Continue work on processing data tables
This commit is contained in:
@@ -0,0 +1,108 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e4d2c52d-38be-4f84-a0bf-6bd8cb577ad9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Purpose\n",
|
||||
"I need to find out what all possible date formats are for the \"REF_DATE\" field so that when I write the parquet file people will be able to filter on it\n",
|
||||
"\n",
|
||||
"These are all the dates I have encountered:\n",
|
||||
"- Just the year. Example found on productId 36100608: 2024\n",
|
||||
"- Year and month. Example found on productId 14100443: 2024-07\n",
|
||||
"- Year, month, date. Example found on productId 33100036: 2025-06-17\n",
|
||||
"- Range. Example found on productId 17100022: 2013/2014"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "43916131-250b-4fdf-9c75-39024822529c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import glob\n",
|
||||
"from multiprocessing import Pool\n",
|
||||
"import sqlite3\n",
|
||||
"import polars as pl\n",
|
||||
"\n",
|
||||
"data_folder = \"/data/tables\"\n",
|
||||
"input_folder = f\"{data_folder}/input\"\n",
|
||||
"scratch_folder = f\"{data_folder}/scratch\"\n",
|
||||
"output_folder = f\"{data_folder}/output\"\n",
|
||||
"\n",
|
||||
"con = sqlite3.connect(f\"{data_folder}/dates.db\")\n",
|
||||
"cur = con.cursor()\n",
|
||||
"cur.executescript(\"\"\"\n",
|
||||
" DROP TABLE IF EXISTS dates;\n",
|
||||
" CREATE TABLE IF NOT EXISTS dates (\n",
|
||||
" product_id TEXT,\n",
|
||||
" date TEXT\n",
|
||||
" );\n",
|
||||
"\"\"\")\n",
|
||||
"con.commit()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "553d58ff-81a4-45e6-a96e-6ea9ebb166fe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def find_unique_dates(filepath):\n",
|
||||
" \"\"\"\n",
|
||||
" Finds unique dates for a table and writes to SQLite table\n",
|
||||
" \"\"\"\n",
|
||||
" product_id = filepath.split(\"/\")[-1].split(\".parquet\")[0]\n",
|
||||
" print(f\"Processing {product_id}\")\n",
|
||||
" try:\n",
|
||||
" result = (\n",
|
||||
" pl.scan_parquet(filepath)\n",
|
||||
" .select(\"REF_DATE\")\n",
|
||||
" .collect()\n",
|
||||
" )\n",
|
||||
" except Exception:\n",
|
||||
" return\n",
|
||||
" unique_dates = [(product_id, x) for x in result['REF_DATE'].unique(maintain_order=True).to_list()]\n",
|
||||
" cur.executemany(\"INSERT INTO dates VALUES(?, ?)\", unique_dates)\n",
|
||||
" con.commit()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "507c069d-472f-44a9-872b-e690225fad17",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if __name__ == '__main__':\n",
|
||||
" files_to_process = glob.glob(f\"{output_folder}/en/*.parquet\")\n",
|
||||
" with Pool() as p:\n",
|
||||
" p.map(find_unique_dates, files_to_process)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
"""
|
||||
I need to find out what all possible date formats are for the "REF_DATE" field so that when I write the parquet file people will be able to filter on it
|
||||
"""
|
||||
|
||||
import glob
|
||||
from multiprocessing import Pool
|
||||
import sqlite3
|
||||
import polars as pl
|
||||
|
||||
data_folder = "/data/tables"
|
||||
input_folder = f"{data_folder}/input"
|
||||
scratch_folder = f"{data_folder}/scratch"
|
||||
output_folder = f"{data_folder}/output"
|
||||
|
||||
con = sqlite3.connect(f"{data_folder}/dates.db")
|
||||
cur = con.cursor()
|
||||
cur.executescript("""
|
||||
DROP TABLE IF EXISTS dates;
|
||||
CREATE TABLE IF NOT EXISTS dates (
|
||||
product_id TEXT,
|
||||
date TEXT
|
||||
);
|
||||
""")
|
||||
con.commit()
|
||||
|
||||
def find_unique_dates(filepath):
|
||||
"""
|
||||
Finds unique dates for a table and writes to SQLite table
|
||||
"""
|
||||
product_id = filepath.split("/")[-1].split(".parquet")[0]
|
||||
print(f"Processing {product_id}")
|
||||
try:
|
||||
result = (
|
||||
pl.scan_parquet(filepath)
|
||||
.select("REF_DATE")
|
||||
.collect()
|
||||
)
|
||||
except Exception:
|
||||
return
|
||||
unique_dates = [(product_id, x) for x in result['REF_DATE'].unique(maintain_order=True).to_list()]
|
||||
cur.executemany("INSERT INTO dates VALUES(?, ?)", unique_dates)
|
||||
con.commit()
|
||||
|
||||
if __name__ == '__main__':
|
||||
files_to_process = glob.glob(f"{output_folder}/en/other/en/*.parquet")
|
||||
with Pool() as p:
|
||||
p.map(find_unique_dates, files_to_process)
|
||||
@@ -0,0 +1,168 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"id": "5e04e469-d3eb-42ca-b548-5e6f1fa6af9d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import buckaroo\n",
|
||||
"import duckdb\n",
|
||||
"import polars as pl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "97c8e92b-21e4-4cc5-8dbe-7b42361ce3f9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"con = duckdb.connect()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"id": "e02f2416-fd16-444b-8fd4-eec2cecee5a7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<duckdb.duckdb.DuckDBPyConnection at 0x7f3bbc156230>"
|
||||
]
|
||||
},
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"con.execute(\"\"\"\n",
|
||||
"DESCRIBE '/data/tables/output/en/testing/36100670.parquet';\n",
|
||||
"\"\"\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"id": "fafa7ce7-8619-4951-8c73-7bfbc66dc92f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('REF_DATE', 'DATE', 'YES', None, None, None),\n",
|
||||
" ('GEO', 'VARCHAR', 'YES', None, None, None),\n",
|
||||
" ('DGUID', 'VARCHAR', 'YES', None, None, None),\n",
|
||||
" ('Seasonality', 'VARCHAR', 'YES', None, None, None),\n",
|
||||
" ('Selected credit estimates', 'VARCHAR', 'YES', None, None, None),\n",
|
||||
" ('UOM', 'VARCHAR', 'YES', None, None, None),\n",
|
||||
" ('UOM_ID', 'TINYINT', 'YES', None, None, None),\n",
|
||||
" ('SCALAR_FACTOR', 'VARCHAR', 'YES', None, None, None),\n",
|
||||
" ('SCALAR_ID', 'TINYINT', 'YES', None, None, None),\n",
|
||||
" ('VECTOR', 'VARCHAR', 'YES', None, None, None),\n",
|
||||
" ('COORDINATE', 'VARCHAR', 'YES', None, None, None),\n",
|
||||
" ('VALUE', 'INTEGER', 'YES', None, None, None),\n",
|
||||
" ('STATUS', 'VARCHAR', 'YES', None, None, None),\n",
|
||||
" ('SYMBOL', 'TINYINT', 'YES', None, None, None),\n",
|
||||
" ('TERMINATED', 'TINYINT', 'YES', None, None, None),\n",
|
||||
" ('DECIMALS', 'TINYINT', 'YES', None, None, None)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"con.fetchall()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "a4ed2881-91b7-4473-b246-a969ef59efba",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "d937312a19f44fc2a8f87bcae8d0faca",
|
||||
"version_major": 2,
|
||||
"version_minor": 1
|
||||
},
|
||||
"text/plain": [
|
||||
"PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"con.execute(\"SELECT DISTINCT REF_DATE FROM '/data/tables/output/en/testing/36100670.parquet' ORDER BY REF_DATE\").pl()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"id": "f400feee-efb6-421a-b518-1f9c0fc21bcb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "c2edc671f3014e4aae90a2a5b0577bfa",
|
||||
"version_major": 2,
|
||||
"version_minor": 1
|
||||
},
|
||||
"text/plain": [
|
||||
"PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"con.execute(\"\"\"\n",
|
||||
"SELECT REF_DATE, DGUID, Seasonality, 'Selected credit estimates', VALUE\n",
|
||||
"FROM '/data/tables/output/en/testing/36100670.parquet'\n",
|
||||
"WHERE REF_DATE > DATE '2018-01-01'\n",
|
||||
"\"\"\").pl()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e2e00108-bc8f-4b8b-91e0-f7bf9b44c59d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,207 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d287c593-ab51-4042-b7a6-634916f7075f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Purpose\n",
|
||||
"I need to find out what all possible date formats are for the \"REF_DATE\" field so that when I write the parquet file people will be able to filter on it\n",
|
||||
"\n",
|
||||
"These are all the dates I have encountered:\n",
|
||||
"- Just the year. Example found on productId 36100608: 2024\n",
|
||||
"- Year and month. Example found on productId 14100443: 2024-07\n",
|
||||
"- Year, month, date. Example found on productId 33100036: 2025-06-17\n",
|
||||
"- Range. Example found on productId 17100022: 2013/2014"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 107,
|
||||
"id": "57b802e1-ce9c-4e2e-abf5-aa29ba7e77f4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div><style>\n",
|
||||
".dataframe > thead > tr,\n",
|
||||
".dataframe > tbody > tr {\n",
|
||||
" text-align: right;\n",
|
||||
" white-space: pre-wrap;\n",
|
||||
"}\n",
|
||||
"</style>\n",
|
||||
"<small>shape: (5, 16)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>REF_DATE</th><th>GEO</th><th>DGUID</th><th>Restriction level</th><th>Vaccination status</th><th>UOM</th><th>UOM_ID</th><th>SCALAR_FACTOR</th><th>SCALAR_ID</th><th>VECTOR</th><th>COORDINATE</th><th>VALUE</th><th>STATUS</th><th>SYMBOL</th><th>TERMINATED</th><th>DECIMALS</th></tr><tr><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>i16</td><td>str</td><td>i8</td><td>str</td><td>str</td><td>f64</td><td>i8</td><td>i8</td><td>i8</td><td>i8</td></tr></thead><tbody><tr><td>"2020-01-01"</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"Restriction index"</td><td>"Total population"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468081"</td><td>"1.1.1"</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>"2020-01-01"</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"Restriction index"</td><td>"Vaccinated persons"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468097"</td><td>"1.1.2"</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>"2020-01-01"</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"Restriction index"</td><td>"Unvaccinated persons"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468113"</td><td>"1.1.3"</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>"2020-01-01"</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"School closing"</td><td>"Total population"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468082"</td><td>"1.2.1"</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>"2020-01-01"</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"School closing"</td><td>"Vaccinated persons"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468098"</td><td>"1.2.2"</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr></tbody></table></div>"
|
||||
],
|
||||
"text/plain": [
|
||||
"shape: (5, 16)\n",
|
||||
"┌────────────┬─────────────┬────────────┬────────────┬───┬────────┬────────┬────────────┬──────────┐\n",
|
||||
"│ REF_DATE ┆ GEO ┆ DGUID ┆ Restrictio ┆ … ┆ STATUS ┆ SYMBOL ┆ TERMINATED ┆ DECIMALS │\n",
|
||||
"│ --- ┆ --- ┆ --- ┆ n level ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
|
||||
"│ str ┆ str ┆ str ┆ --- ┆ ┆ i8 ┆ i8 ┆ i8 ┆ i8 │\n",
|
||||
"│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"╞════════════╪═════════════╪════════════╪════════════╪═══╪════════╪════════╪════════════╪══════════╡\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"└────────────┴─────────────┴────────────┴────────────┴───┴────────┴────────┴────────────┴──────────┘"
|
||||
]
|
||||
},
|
||||
"execution_count": 107,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import polars as pl\n",
|
||||
"\n",
|
||||
"data_folder = \"/data/tables\"\n",
|
||||
"input_folder = f\"{data_folder}/input\"\n",
|
||||
"scratch_folder = f\"{data_folder}/scratch\"\n",
|
||||
"output_folder = f\"{data_folder}/output\"\n",
|
||||
"\n",
|
||||
"def normalize_ref_date(series):\n",
|
||||
" # Always cast to string first\n",
|
||||
" series = series.cast(pl.Utf8)\n",
|
||||
" \n",
|
||||
" # Try parsing as full date (YYYY-MM-DD)\n",
|
||||
" full = series.str.strptime(pl.Date, \"%Y-%m-%d\", strict=False)\n",
|
||||
"\n",
|
||||
" # For nulls, try parsing as year-month (YYYY-MM)\n",
|
||||
" ym = series.str.strptime(pl.Date, \"%Y-%m\", strict=False).dt.replace(day=1)\n",
|
||||
" full = full.fill_null(ym)\n",
|
||||
"\n",
|
||||
" # For remaining nulls, try just year (YYYY)\n",
|
||||
" y = series.str.strptime(pl.Date, \"%Y\", strict=False).dt.replace(month=1, day=1)\n",
|
||||
" full = full.fill_null(y)\n",
|
||||
"\n",
|
||||
" return full\n",
|
||||
"\n",
|
||||
"filepath = f\"{output_folder}/en/other/en/33100496.parquet\"\n",
|
||||
"df = pl.read_parquet(filepath)\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 108,
|
||||
"id": "6e0bb9eb-0638-48c6-9308-bc5a846ff27f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"skip_calculating_ref_date = False\n",
|
||||
"if df.schema[\"REF_DATE\"] == pl.String:\n",
|
||||
" if df[\"REF_DATE\"].str.contains(\"/\").any():\n",
|
||||
" # Skip the calculating of the field\n",
|
||||
" skip_calculating_ref_date = True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 109,
|
||||
"id": "5f93ba75-5c1c-4a7f-b526-89d7589f3834",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if skip_calculating_ref_date == False:\n",
|
||||
" df = df.with_columns([\n",
|
||||
" normalize_ref_date(pl.col(\"REF_DATE\")).alias(\"REF_DATE\")\n",
|
||||
" ])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 110,
|
||||
"id": "d54d297c-ae68-4a44-96c6-457873ea2d10",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div><style>\n",
|
||||
".dataframe > thead > tr,\n",
|
||||
".dataframe > tbody > tr {\n",
|
||||
" text-align: right;\n",
|
||||
" white-space: pre-wrap;\n",
|
||||
"}\n",
|
||||
"</style>\n",
|
||||
"<small>shape: (5, 16)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>REF_DATE</th><th>GEO</th><th>DGUID</th><th>Restriction level</th><th>Vaccination status</th><th>UOM</th><th>UOM_ID</th><th>SCALAR_FACTOR</th><th>SCALAR_ID</th><th>VECTOR</th><th>COORDINATE</th><th>VALUE</th><th>STATUS</th><th>SYMBOL</th><th>TERMINATED</th><th>DECIMALS</th></tr><tr><td>date</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>i16</td><td>str</td><td>i8</td><td>str</td><td>str</td><td>f64</td><td>i8</td><td>i8</td><td>i8</td><td>i8</td></tr></thead><tbody><tr><td>2020-01-01</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"Restriction index"</td><td>"Total population"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468081"</td><td>"1.1.1"</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"Restriction index"</td><td>"Vaccinated persons"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468097"</td><td>"1.1.2"</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"Restriction index"</td><td>"Unvaccinated persons"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468113"</td><td>"1.1.3"</td><td>1.67</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"School closing"</td><td>"Total population"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468082"</td><td>"1.2.1"</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr><tr><td>2020-01-01</td><td>"Newfoundland and Labrador"</td><td>"2016A000210"</td><td>"School closing"</td><td>"Vaccinated persons"</td><td>"Index"</td><td>160</td><td>"units"</td><td>0</td><td>"v1331468098"</td><td>"1.2.2"</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>2</td></tr></tbody></table></div>"
|
||||
],
|
||||
"text/plain": [
|
||||
"shape: (5, 16)\n",
|
||||
"┌────────────┬─────────────┬────────────┬────────────┬───┬────────┬────────┬────────────┬──────────┐\n",
|
||||
"│ REF_DATE ┆ GEO ┆ DGUID ┆ Restrictio ┆ … ┆ STATUS ┆ SYMBOL ┆ TERMINATED ┆ DECIMALS │\n",
|
||||
"│ --- ┆ --- ┆ --- ┆ n level ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
|
||||
"│ date ┆ str ┆ str ┆ --- ┆ ┆ i8 ┆ i8 ┆ i8 ┆ i8 │\n",
|
||||
"│ ┆ ┆ ┆ str ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"╞════════════╪═════════════╪════════════╪════════════╪═══╪════════╪════════╪════════════╪══════════╡\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ Restrictio ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ n index ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ 2020-01-01 ┆ Newfoundlan ┆ 2016A00021 ┆ School ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
|
||||
"│ ┆ d and ┆ 0 ┆ closing ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"│ ┆ Labrador ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||
"└────────────┴─────────────┴────────────┴────────────┴───┴────────┴────────┴────────────┴──────────┘"
|
||||
]
|
||||
},
|
||||
"execution_count": 110,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3c7a9dd4-5001-4174-9074-8cb14721f79c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -14,7 +14,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 188,
|
||||
"execution_count": 209,
|
||||
"id": "98859cd6-6fa4-4aef-a113-455699524fae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -39,8 +39,8 @@
|
||||
"output_folder = f\"{data_folder}/output\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if not os.path.exists(\"processing.db\"):\n",
|
||||
" con = sqlite3.connect('processing.db')\n",
|
||||
"if not os.path.exists(f\"{data_folder}/processing.db\"):\n",
|
||||
" con = sqlite3.connect(f\"{data_folder}/processing.db\")\n",
|
||||
" cur = con.cursor()\n",
|
||||
" cur.executescript(\"\"\"\n",
|
||||
" CREATE TABLE IF NOT EXISTS downloaded (\n",
|
||||
@@ -55,13 +55,13 @@
|
||||
" \"\"\")\n",
|
||||
" con.commit()\n",
|
||||
"else:\n",
|
||||
" con = sqlite3.connect('processing.db')\n",
|
||||
" con = sqlite3.connect(f\"{data_folder}/processing.db\")\n",
|
||||
" cur = con.cursor()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 96,
|
||||
"execution_count": 210,
|
||||
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -83,7 +83,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 97,
|
||||
"execution_count": 211,
|
||||
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -93,12 +93,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 177,
|
||||
"execution_count": 212,
|
||||
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def update_last_downloaded(product_id):\n",
|
||||
" \"\"\"\n",
|
||||
" Updates SQLite database with the last time the table was updated\n",
|
||||
" The datetime is in Eastern timezone, so have to convert to UTC to\n",
|
||||
" be consistent with https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n",
|
||||
" \"\"\"\n",
|
||||
" filepath = f\"{input_folder}/metadata/{product_id}.json\"\n",
|
||||
" print(f\"Reading metadata {filepath}\")\n",
|
||||
" with open(filepath, 'r') as fp:\n",
|
||||
@@ -123,7 +128,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 170,
|
||||
"execution_count": 213,
|
||||
"id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def update_last_processed(product_id):\n",
|
||||
" time_finished_processing = datetime.now().isoformat()\n",
|
||||
" cur.execute(\"UPDATE downloaded SET last_processed = ? WHERE product_id = ?\", (time_finished_processing, product_id))\n",
|
||||
" con.commit()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 214,
|
||||
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -166,159 +184,23 @@
|
||||
" for result in results:\n",
|
||||
" product_id = result[0]\n",
|
||||
" print(f\"Updating product_id: {product_id}\")\n",
|
||||
" download_cube(product_id)\n",
|
||||
" process_cube(product_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 186,
|
||||
"id": "c0d46f38-242c-4685-b8b6-4e046c23aec5",
|
||||
"execution_count": 215,
|
||||
"id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cur.execute(\"\"\"\n",
|
||||
" SELECT a.product_id, a.last_updated, b.last_updated\n",
|
||||
" FROM downloaded AS a,\n",
|
||||
" cubes AS b\n",
|
||||
" WHERE a.product_id = b.product_id\n",
|
||||
" AND b.last_updated > a.last_updated\n",
|
||||
"\"\"\")\n",
|
||||
"difference = pd.DataFrame(cur.fetchall(), columns=[\"product_id\", \"release_time_metadata\", \"release_time_cubelist\"])"
|
||||
"#update_tables()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 189,
|
||||
"id": "a5e12a7a-9891-4d58-9368-e31b274feb1d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>product_id</th>\n",
|
||||
" <th>release_time_metadata</th>\n",
|
||||
" <th>release_time_cubelist</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>10100006</td>\n",
|
||||
" <td>2025-05-21T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-17T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>10100108</td>\n",
|
||||
" <td>2025-05-13T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>10100132</td>\n",
|
||||
" <td>2025-06-06T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-13T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>10100136</td>\n",
|
||||
" <td>2025-06-10T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-16T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>10100138</td>\n",
|
||||
" <td>2025-06-06T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-13T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>92</th>\n",
|
||||
" <td>38100235</td>\n",
|
||||
" <td>2025-03-13T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>93</th>\n",
|
||||
" <td>38100236</td>\n",
|
||||
" <td>2025-03-13T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>94</th>\n",
|
||||
" <td>38100237</td>\n",
|
||||
" <td>2025-03-13T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>95</th>\n",
|
||||
" <td>38100238</td>\n",
|
||||
" <td>2025-03-13T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-12T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>96</th>\n",
|
||||
" <td>38100164</td>\n",
|
||||
" <td>2023-09-13T12:30:00+00:00</td>\n",
|
||||
" <td>2025-06-17T12:30:00+00:00</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>97 rows × 3 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" product_id release_time_metadata release_time_cubelist\n",
|
||||
"0 10100006 2025-05-21T12:30:00+00:00 2025-06-17T12:30:00+00:00\n",
|
||||
"1 10100108 2025-05-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||
"2 10100132 2025-06-06T12:30:00+00:00 2025-06-13T12:30:00+00:00\n",
|
||||
"3 10100136 2025-06-10T12:30:00+00:00 2025-06-16T12:30:00+00:00\n",
|
||||
"4 10100138 2025-06-06T12:30:00+00:00 2025-06-13T12:30:00+00:00\n",
|
||||
".. ... ... ...\n",
|
||||
"92 38100235 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||
"93 38100236 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||
"94 38100237 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||
"95 38100238 2025-03-13T12:30:00+00:00 2025-06-12T12:30:00+00:00\n",
|
||||
"96 38100164 2023-09-13T12:30:00+00:00 2025-06-17T12:30:00+00:00\n",
|
||||
"\n",
|
||||
"[97 rows x 3 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 189,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"difference"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 193,
|
||||
"execution_count": 228,
|
||||
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -368,6 +250,9 @@
|
||||
" return result\n",
|
||||
"\n",
|
||||
"def download_cube(product_id, language=\"en\"):\n",
|
||||
" \"\"\"\n",
|
||||
" Downloads the English CSV for a specific table\n",
|
||||
" \"\"\"\n",
|
||||
" download_url = f\"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{product_id}/en\"\n",
|
||||
" response = requests.get(download_url).json()\n",
|
||||
" zip_url = response['object']\n",
|
||||
@@ -388,6 +273,11 @@
|
||||
" progress_bar.close()\n",
|
||||
" \n",
|
||||
"def process_cube(product_id, language=\"en\"):\n",
|
||||
" cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
||||
" result = cur.fetchone()\n",
|
||||
" if result:\n",
|
||||
" print(f\"Already processed {product_id}\")\n",
|
||||
" return\n",
|
||||
" extract_zipfile(product_id, language)\n",
|
||||
" \"\"\"\n",
|
||||
" The pandas column reader is better than the Polars one\n",
|
||||
@@ -395,12 +285,11 @@
|
||||
" https://www150.statcan.gc.ca/n1/tbl/csv/98100404-eng.zip\n",
|
||||
" \"\"\"\n",
|
||||
" # Get metadata\n",
|
||||
" metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n",
|
||||
" metadata = get_cube_metadata(product_id)\n",
|
||||
" print(f\"Writing metadata file {metadata_file}\")\n",
|
||||
" with open(metadata_file, \"w\") as outfile:\n",
|
||||
" json.dump(metadata, outfile)\n",
|
||||
" update_last_downloaded(product_id)\n",
|
||||
" #metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n",
|
||||
" #metadata = get_cube_metadata(product_id)\n",
|
||||
" #print(f\"Writing metadata file {metadata_file}\")\n",
|
||||
" #with open(metadata_file, \"w\") as outfile:\n",
|
||||
" # json.dump(metadata, outfile)\n",
|
||||
" # Read CSV using Pandas\n",
|
||||
" product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
|
||||
" parameters = {\n",
|
||||
@@ -410,22 +299,32 @@
|
||||
" print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
|
||||
" df = pd.read_csv(product_csv, **parameters)\n",
|
||||
" df = convert_to_lowest_type(df)\n",
|
||||
" # Fix mixed types. This happened in productId 11100147\n",
|
||||
" for col in df.select_dtypes(include=[\"object\"]).columns:\n",
|
||||
" types_in_col = df[col].map(type).unique()\n",
|
||||
" # If there's any non-str type in the column\n",
|
||||
" if any(t != str for t in types_in_col):\n",
|
||||
" print(f\"Fixing mixed types in column: {col} — types found: {types_in_col}\")\n",
|
||||
" # Replace non-str values with None\n",
|
||||
" df[col] = df[col].where(df[col].map(type) == str, None).astype(\"string\")\n",
|
||||
" print(\"Import Pandas dataframe as a Polars dataframe\")\n",
|
||||
" df = pl.from_pandas(df)\n",
|
||||
" output_parquet = f\"{output_folder}/{language}/{product_id}.parquet\"\n",
|
||||
" print(f\"Exporting dataframe as parquet to {output_parquet}\")\n",
|
||||
" df.write_parquet(f\"{output_folder}/{language}/{product_id}.parquet\",\n",
|
||||
" df.write_parquet(output_parquet,\n",
|
||||
" compression='zstd',\n",
|
||||
" compression_level=22)\n",
|
||||
" # Remove the scratch files\n",
|
||||
" print(\"Removing scratch files\")\n",
|
||||
" os.remove(f\"{scratch_folder}/{product_id}.csv\")\n",
|
||||
" os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\") \n"
|
||||
" os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\")\n",
|
||||
" update_last_downloaded(product_id)\n",
|
||||
" update_last_processed(product_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 164,
|
||||
"execution_count": 229,
|
||||
"id": "6be0842e-c1e0-45b4-90e5-0be28b963aed",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -433,54 +332,13 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting /data/tables/input/en/10100111.zip to /data/tables/scratch\n",
|
||||
"Writing metadata file /data/tables/input/metadata/10100111.json\n",
|
||||
"Reading metadata /data/tables/input/metadata/10100111.json\n",
|
||||
"('10100111', '2025-06-18T12:30:00+00:00')\n",
|
||||
"Reading /data/tables/scratch/10100111.csv as a Pandas dataframe\n",
|
||||
"Converting dataframe to optimal data types\n",
|
||||
"Import Pandas dataframe as a Polars dataframe\n",
|
||||
"Exporting dataframe as parquet to /data/tables/output/en/10100111.parquet\n",
|
||||
"Removing scratch files\n"
|
||||
"Already processed 11100147\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"process_cube(\"10100111\")"
|
||||
"process_cube(\"11100147\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 194,
|
||||
"id": "8a193195-bfa6-4df3-adab-0f88025276da",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100164-eng.zip to /data/tables/input/en/38100164.zip\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/data/tables/input/en/38100164.zip: 100%|██████████████████| 980k/980k [00:00<00:00, 1.27MB/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"download_cube(\"38100164\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fc4ae317-3e3e-4c27-a2fe-7e6bd12f673f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
Reference in New Issue
Block a user