From daf5a2d1546fca8e1608c586a0dd94c085bc6ff6 Mon Sep 17 00:00:00 2001
From: Diego Ripley <diego@diegoripley.ca>
Date: Fri, 30 May 2025 16:11:45 +0000
Subject: [PATCH] Add 2021 dissemination block (DB) processing (fix #6)

---
 census_of_population/process_2021.ipynb | 100 +++++++++++++++++-------
 1 file changed, 71 insertions(+), 29 deletions(-)

diff --git a/census_of_population/process_2021.ipynb b/census_of_population/process_2021.ipynb
index 4008a10..ad0ff86 100644
--- a/census_of_population/process_2021.ipynb
+++ b/census_of_population/process_2021.ipynb
@@ -2,26 +2,39 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 10,
    "id": "fc8ca6f9",
    "metadata": {},
    "outputs": [],
    "source": [
     "import gc\n",
     "import glob\n",
+    "import os\n",
     "\n",
     "import duckdb\n",
     "from IPython.core.interactiveshell import InteractiveShell  \n",
     "import numpy as np\n",
     "import pandas as pd\n",
-    "import sqlalchemy\n",
+    "from sqlalchemy import create_engine\n",
     "\n",
     "# Enable multiple outputs per cell\n",
     "InteractiveShell.ast_node_interactivity = \"all\"\n",
     "# Show all columns\n",
     "pd.set_option('display.max_columns', None)\n",
     "\n",
-    "data_dir = '/data/census_of_population/output/2021/tabular'"
+    "data_dir = '/data/census_of_population/output/2021/tabular'\n",
+    "\n",
+    "# PostgreSQL connection (credentials are read from environment variables)\n",
+    "DATABASE = os.environ.get(\"POSTGRES_DB\")\n",
+    "USER = os.environ.get(\"POSTGRES_USER\")\n",
+    "PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n",
+    "\n",
+    "engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}\")\n",
+    "\n",
+    "# DuckDB connection with the spatial extension installed and loaded\n",
+    "con = duckdb.connect()\n",
+    "con.install_extension(\"spatial\")\n",
+    "con.load_extension(\"spatial\")"
    ]
   },
   {
@@ -41,7 +54,8 @@
     "- 9.0 Forward sortation areas (FSAs)\n",
     "- 10.0 Health regions (HRs)\n",
     "   - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n",
-    "   - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm"
+    "   - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm\n",
+    "- 11.0 Dissemination Blocks (DBs)"
    ]
   },
   {
@@ -124,7 +138,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 21,
    "id": "6071a2fe",
    "metadata": {},
    "outputs": [],
@@ -263,10 +277,6 @@
     }
    ],
    "source": [
-    "con = duckdb.connect()\n",
-    "con.install_extension(\"spatial\")\n",
-    "con.load_extension(\"spatial\")\n",
-    "\n",
     "# Get the dguid per level of geography\n",
     "con.sql(\"\"\"\n",
     "DROP TABLE IF EXISTS country_2021;\n",
@@ -1303,36 +1313,68 @@
     "DROP TABLE IF EXISTS fsa_2021;\n",
     "DROP TABLE IF EXISTS hr_2022;\n",
     "CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n",
-    "\n",
-    "CREATE \n",
     "\"\"\")\n",
     "con.commit()\n",
     "\n",
     "# Convert the duckdb tables to pandas dataframe\n",
-    "hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()\n",
+    "hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "846e1be5-cb81-4d5e-a1e5-47a5bb1401b5",
+   "metadata": {},
+   "source": [
+    "## 11.0 Dissemination Blocks (DBs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "c850b121-5d3f-492f-be94-cb7258552ebd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<duckdb.duckdb.DuckDBPyConnection at 0x7f6df0455b30>"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sql = \"\"\"\n",
+    "SELECT db_dguid, \n",
+    "db_pop_2021 AS count_total_1, \n",
+    "db_total_private_dwell_2021 AS count_total_4,\n",
+    "db_usual_residents_dwellings_2021 AS count_total_5\n",
+    "FROM silver.gaf_2021;\n",
+    "\"\"\"\n",
+    "\n",
+    "cop_df = pd.read_sql_query(sql=sql, con=engine, index_col='db_dguid')\n",
+    "\n",
+    "# Get the dguid of each dissemination block\n",
+    "con.sql(\"\"\"\n",
+    "DROP TABLE IF EXISTS db_2021;\n",
+    "CREATE TABLE db_2021 AS SELECT db_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/db_2021.parquet';\n",
+    "\"\"\")\n",
+    "con.commit()\n",
+    "\n",
+    "# Convert the DuckDB table to a pandas DataFrame\n",
+    "db_dguid = con.sql(\"SELECT * FROM db_2021\").to_df()\n",
     "\n",
     "# Join the Census of Population dataframe to each geographic level\n",
-    "cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n",
-    "\n",
-    "del(fsa_dguid)\n",
-    "del(cop_df)\n",
-    "gc.collect()\n",
+    "cop_db = cop_df.join(db_dguid.set_index('dguid'), on='db_dguid', how='inner')\n",
     "\n",
     "# Convert columns to lowest dtypes\n",
-    "cop_fsa = convert_to_lowest_type(cop_fsa)\n",
-    "\n",
-    "# Drop NA columns\n",
-    "print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n",
-    "drop_na_columns(cop_fsa)\n",
-    "print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n",
+    "cop_db = convert_to_lowest_type(cop_db)\n",
     "\n",
     "# Export\n",
-    "cop_fsa = cop_fsa.reset_index()\n",
-    "cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n",
-    "cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n",
-    "\n",
-    "del(cop_fsa)\n",
-    "gc.collect()"
+    "cop_db = cop_db.reset_index()\n",
+    "cop_db.to_parquet(path=f'{data_dir}/db_2021.parquet', index=False, compression='zstd')"
    ]
   }
  ],