From daf5a2d1546fca8e1608c586a0dd94c085bc6ff6 Mon Sep 17 00:00:00 2001 From: Diego Ripley Date: Fri, 30 May 2025 16:11:45 +0000 Subject: [PATCH] Fix #6 --- census_of_population/process_2021.ipynb | 100 +++++++++++++++++------- 1 file changed, 71 insertions(+), 29 deletions(-) diff --git a/census_of_population/process_2021.ipynb b/census_of_population/process_2021.ipynb index 4008a10..ad0ff86 100644 --- a/census_of_population/process_2021.ipynb +++ b/census_of_population/process_2021.ipynb @@ -2,26 +2,39 @@ "cells": [ { "cell_type": "code", - "execution_count": 50, + "execution_count": 10, "id": "fc8ca6f9", "metadata": {}, "outputs": [], "source": [ "import gc\n", "import glob\n", + "import os\n", "\n", "import duckdb\n", "from IPython.core.interactiveshell import InteractiveShell \n", "import numpy as np\n", "import pandas as pd\n", - "import sqlalchemy\n", + "from sqlalchemy import create_engine\n", "\n", "# Enable multiple outputs per cell\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "# Show all columns\n", "pd.set_option('display.max_columns', None)\n", "\n", - "data_dir = '/data/census_of_population/output/2021/tabular'" + "data_dir = '/data/census_of_population/output/2021/tabular'\n", + "\n", + "# PostgreSQL DB\n", + "DATABASE = os.environ.get(\"POSTGRES_DB\")\n", + "USER = os.environ.get(\"POSTGRES_USER\")\n", + "PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n", + "\n", + "engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}\")\n", + "\n", + "# DuckDB\n", + "con = duckdb.connect()\n", + "con.install_extension(\"spatial\")\n", + "con.load_extension(\"spatial\")" ] }, { @@ -41,7 +54,8 @@ "- 9.0 Forward sortation areas (FSAs)\n", "- 10.0 Health regions (HRs)\n", " - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n", - " - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm" + " - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm\n", + "- 11.0 Dissemination Blocks (DBs)" ] }, { @@ -124,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 21, "id": "6071a2fe", "metadata": {}, "outputs": [], @@ -263,10 +277,6 @@ } ], "source": [ - "con = duckdb.connect()\n", - "con.install_extension(\"spatial\")\n", - "con.load_extension(\"spatial\")\n", - "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS country_2021;\n", @@ -1303,36 +1313,68 @@ "DROP TABLE IF EXISTS fsa_2021;\n", "DROP TABLE IF EXISTS hr_2022;\n", "CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n", - "\n", - "CREATE \n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", - "hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()\n", + "hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()" + ] + }, + { + "cell_type": "markdown", + "id": "846e1be5-cb81-4d5e-a1e5-47a5bb1401b5", + "metadata": {}, + "source": [ + "## 11.0 Dissemination Blocks (DBs)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c850b121-5d3f-492f-be94-cb7258552ebd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql = \"\"\"\n", + "SELECT db_dguid, \n", + "db_pop_2021 AS count_total_1, \n", + "db_total_private_dwell_2021 AS count_total_4,\n", + "db_usual_residents_dwellings_2021 AS count_total_5\n", + "FROM silver.gaf_2021;\n", + "\"\"\"\n", + "\n", + "cop_df = pd.read_sql_query(sql=sql, con=engine, index_col='db_dguid')\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS db_2021;\n", + "CREATE TABLE db_2021 AS SELECT db_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/db_2021.parquet';\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "db_dguid = con.sql(\"SELECT * FROM db_2021\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", - "cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n", - "\n", - "del(fsa_dguid)\n", - "del(cop_df)\n", - "gc.collect()\n", + "cop_db = cop_df.join(db_dguid.set_index('dguid'), on='db_dguid', how='inner')\n", "\n", "# Convert columns to lowest dtypes\n", - "cop_fsa = convert_to_lowest_type(cop_fsa)\n", - "\n", - "# Drop NA columns\n", - "print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n", - "drop_na_columns(cop_fsa)\n", - "print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n", + "cop_db = convert_to_lowest_type(cop_db)\n", "\n", "# Export\n", - "cop_fsa = cop_fsa.reset_index()\n", - "cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n", - "cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n", - "\n", - "del(cop_fsa)\n", - "gc.collect()" + "cop_db = cop_db.reset_index()\n", + "cop_db.to_parquet(path=f'{data_dir}/db_2021.parquet', index=False, compression='zstd')" ] } ],