This commit is contained in:
Diego Ripley
2025-05-30 16:11:45 +00:00
parent 6aedfe7c46
commit daf5a2d154
+71 -29
View File
@@ -2,26 +2,39 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 50, "execution_count": 10,
"id": "fc8ca6f9", "id": "fc8ca6f9",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import gc\n", "import gc\n",
"import glob\n", "import glob\n",
"import os\n",
"\n", "\n",
"import duckdb\n", "import duckdb\n",
"from IPython.core.interactiveshell import InteractiveShell \n", "from IPython.core.interactiveshell import InteractiveShell \n",
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"import sqlalchemy\n", "from sqlalchemy import create_engine\n",
"\n", "\n",
"# Enable multiple outputs per cell\n", "# Enable multiple outputs per cell\n",
"InteractiveShell.ast_node_interactivity = \"all\"\n", "InteractiveShell.ast_node_interactivity = \"all\"\n",
"# Show all columns\n", "# Show all columns\n",
"pd.set_option('display.max_columns', None)\n", "pd.set_option('display.max_columns', None)\n",
"\n", "\n",
"data_dir = '/data/census_of_population/output/2021/tabular'" "data_dir = '/data/census_of_population/output/2021/tabular'\n",
"\n",
"# PostgreSQL DB\n",
"DATABASE = os.environ.get(\"POSTGRES_DB\")\n",
"USER = os.environ.get(\"POSTGRES_USER\")\n",
"PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n",
"\n",
"engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}\")\n",
"\n",
"# DuckDB\n",
"con = duckdb.connect()\n",
"con.install_extension(\"spatial\")\n",
"con.load_extension(\"spatial\")"
] ]
}, },
{ {
@@ -41,7 +54,8 @@
"- 9.0 Forward sortation areas (FSAs)\n", "- 9.0 Forward sortation areas (FSAs)\n",
"- 10.0 Health regions (HRs)\n", "- 10.0 Health regions (HRs)\n",
" - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n", " - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n",
" - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm" " - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm\n",
"- 11.0 Dissemination Blocks (DBs)"
] ]
}, },
{ {
@@ -124,7 +138,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 21,
"id": "6071a2fe", "id": "6071a2fe",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -263,10 +277,6 @@
} }
], ],
"source": [ "source": [
"con = duckdb.connect()\n",
"con.install_extension(\"spatial\")\n",
"con.load_extension(\"spatial\")\n",
"\n",
"# Get the dguid per level of geography\n", "# Get the dguid per level of geography\n",
"con.sql(\"\"\"\n", "con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS country_2021;\n", "DROP TABLE IF EXISTS country_2021;\n",
@@ -1303,36 +1313,68 @@
"DROP TABLE IF EXISTS fsa_2021;\n", "DROP TABLE IF EXISTS fsa_2021;\n",
"DROP TABLE IF EXISTS hr_2022;\n", "DROP TABLE IF EXISTS hr_2022;\n",
"CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n", "CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n",
"\n",
"CREATE \n",
"\"\"\")\n", "\"\"\")\n",
"con.commit()\n", "con.commit()\n",
"\n", "\n",
"# Convert the duckdb tables to pandas dataframe\n", "# Convert the duckdb tables to pandas dataframe\n",
"hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()\n", "hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()"
]
},
{
"cell_type": "markdown",
"id": "846e1be5-cb81-4d5e-a1e5-47a5bb1401b5",
"metadata": {},
"source": [
"## 11.0 Dissemination Blocks (DBs)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "c850b121-5d3f-492f-be94-cb7258552ebd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f6df0455b30>"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sql = \"\"\"\n",
"SELECT db_dguid, \n",
"db_pop_2021 AS count_total_1, \n",
"db_total_private_dwell_2021 AS count_total_4,\n",
"db_usual_residents_dwellings_2021 AS count_total_5\n",
"FROM silver.gaf_2021;\n",
"\"\"\"\n",
"\n",
"cop_df = pd.read_sql_query(sql=sql, con=engine, index_col='db_dguid')\n",
"\n",
"# Get the dguid per level of geography\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS db_2021;\n",
"CREATE TABLE db_2021 AS SELECT db_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/db_2021.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Convert the duckdb tables to pandas dataframe\n",
"db_dguid = con.sql(\"SELECT * FROM db_2021\").to_df()\n",
"\n", "\n",
"# Join the Census of Population dataframe to each geographic level\n", "# Join the Census of Population dataframe to each geographic level\n",
"cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n", "cop_db = cop_df.join(db_dguid.set_index('dguid'), on='db_dguid', how='inner')\n",
"\n",
"del(fsa_dguid)\n",
"del(cop_df)\n",
"gc.collect()\n",
"\n", "\n",
"# Convert columns to lowest dtypes\n", "# Convert columns to lowest dtypes\n",
"cop_fsa = convert_to_lowest_type(cop_fsa)\n", "cop_db = convert_to_lowest_type(cop_db)\n",
"\n",
"# Drop NA columns\n",
"print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n",
"drop_na_columns(cop_fsa)\n",
"print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n",
"\n", "\n",
"# Export\n", "# Export\n",
"cop_fsa = cop_fsa.reset_index()\n", "cop_db = cop_db.reset_index()\n",
"cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n", "cop_db.to_parquet(path=f'{data_dir}/db_2021.parquet', index=False, compression='zstd')"
"cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n",
"\n",
"del(cop_fsa)\n",
"gc.collect()"
] ]
} }
], ],