mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Fix #6
This commit is contained in:
@@ -2,26 +2,39 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"execution_count": 10,
|
||||
"id": "fc8ca6f9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import gc\n",
|
||||
"import glob\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import duckdb\n",
|
||||
"from IPython.core.interactiveshell import InteractiveShell \n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import sqlalchemy\n",
|
||||
"from sqlalchemy import create_engine\n",
|
||||
"\n",
|
||||
"# Enable multiple outputs per cell\n",
|
||||
"InteractiveShell.ast_node_interactivity = \"all\"\n",
|
||||
"# Show all columns\n",
|
||||
"pd.set_option('display.max_columns', None)\n",
|
||||
"\n",
|
||||
"data_dir = '/data/census_of_population/output/2021/tabular'"
|
||||
"data_dir = '/data/census_of_population/output/2021/tabular'\n",
|
||||
"\n",
|
||||
"# PostgreSQL DB\n",
|
||||
"DATABASE = os.environ.get(\"POSTGRES_DB\")\n",
|
||||
"USER = os.environ.get(\"POSTGRES_USER\")\n",
|
||||
"PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n",
|
||||
"\n",
|
||||
"engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}\")\n",
|
||||
"\n",
|
||||
"# DuckDB\n",
|
||||
"con = duckdb.connect()\n",
|
||||
"con.install_extension(\"spatial\")\n",
|
||||
"con.load_extension(\"spatial\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -41,7 +54,8 @@
|
||||
"- 9.0 Forward sortation areas (FSAs)\n",
|
||||
"- 10.0 Health regions (HRs)\n",
|
||||
" - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n",
|
||||
" - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm"
|
||||
" - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm\n",
|
||||
"- 11.0 Dissemination Blocks (DBs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -124,7 +138,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 21,
|
||||
"id": "6071a2fe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -263,10 +277,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"con = duckdb.connect()\n",
|
||||
"con.install_extension(\"spatial\")\n",
|
||||
"con.load_extension(\"spatial\")\n",
|
||||
"\n",
|
||||
"# Get the dguid per level of geography\n",
|
||||
"con.sql(\"\"\"\n",
|
||||
"DROP TABLE IF EXISTS country_2021;\n",
|
||||
@@ -1303,36 +1313,68 @@
|
||||
"DROP TABLE IF EXISTS fsa_2021;\n",
|
||||
"DROP TABLE IF EXISTS hr_2022;\n",
|
||||
"CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n",
|
||||
"\n",
|
||||
"CREATE \n",
|
||||
"\"\"\")\n",
|
||||
"con.commit()\n",
|
||||
"\n",
|
||||
"# Convert the duckdb tables to pandas dataframe\n",
|
||||
"hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()\n",
|
||||
"hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "846e1be5-cb81-4d5e-a1e5-47a5bb1401b5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 11.0 Dissemination Blocks (DBs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "c850b121-5d3f-492f-be94-cb7258552ebd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<duckdb.duckdb.DuckDBPyConnection at 0x7f6df0455b30>"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sql = \"\"\"\n",
|
||||
"SELECT db_dguid, \n",
|
||||
"db_pop_2021 AS count_total_1, \n",
|
||||
"db_total_private_dwell_2021 AS count_total_4,\n",
|
||||
"db_usual_residents_dwellings_2021 AS count_total_5\n",
|
||||
"FROM silver.gaf_2021;\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"cop_df = pd.read_sql_query(sql=sql, con=engine, index_col='db_dguid')\n",
|
||||
"\n",
|
||||
"# Get the dguid per level of geography\n",
|
||||
"con.sql(\"\"\"\n",
|
||||
"DROP TABLE IF EXISTS db_2021;\n",
|
||||
"CREATE TABLE db_2021 AS SELECT db_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/db_2021.parquet';\n",
|
||||
"\"\"\")\n",
|
||||
"con.commit()\n",
|
||||
"\n",
|
||||
"# Convert the duckdb tables to pandas dataframe\n",
|
||||
"db_dguid = con.sql(\"SELECT * FROM db_2021\").to_df()\n",
|
||||
"\n",
|
||||
"# Join the Census of Population dataframe to each geographic level\n",
|
||||
"cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
||||
"\n",
|
||||
"del(fsa_dguid)\n",
|
||||
"del(cop_df)\n",
|
||||
"gc.collect()\n",
|
||||
"cop_db = cop_df.join(db_dguid.set_index('dguid'), on='db_dguid', how='inner')\n",
|
||||
"\n",
|
||||
"# Convert columns to lowest dtypes\n",
|
||||
"cop_fsa = convert_to_lowest_type(cop_fsa)\n",
|
||||
"\n",
|
||||
"# Drop NA columns\n",
|
||||
"print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n",
|
||||
"drop_na_columns(cop_fsa)\n",
|
||||
"print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n",
|
||||
"cop_db = convert_to_lowest_type(cop_db)\n",
|
||||
"\n",
|
||||
"# Export\n",
|
||||
"cop_fsa = cop_fsa.reset_index()\n",
|
||||
"cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n",
|
||||
"cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n",
|
||||
"\n",
|
||||
"del(cop_fsa)\n",
|
||||
"gc.collect()"
|
||||
"cop_db = cop_db.reset_index()\n",
|
||||
"cop_db.to_parquet(path=f'{data_dir}/db_2021.parquet', index=False, compression='zstd')"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user