Initial commit

2026-06-13 14:10:55 +02:00 · 2025-05-24 13:37:31 -04:00
commit f93e4d0cec
108 changed files with 11689 additions and 0 deletions
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Download the 2021 geographic attribute file archive listed in files.txt.
+#
+# Environment:
+#   DATA_FOLDER  root folder that holds the downloaded/extracted data
+#   SCRIPT_DIR   folder that contains geographic_attribute_file/files.txt
+
+# Abort early when a required variable is missing; otherwise the paths below
+# would silently resolve relative to the filesystem root.
+: "${DATA_FOLDER:?DATA_FOLDER must be set}"
+: "${SCRIPT_DIR:?SCRIPT_DIR must be set}"
+
+GAF_FOLDER="${DATA_FOLDER}/geographic_attribute_file"
+INPUT_FOLDER="${GAF_FOLDER}/input"
+
+if [ ! -d "${GAF_FOLDER}" ]
+then
+    echo "Making directory ${GAF_FOLDER}/"
+fi
+# mkdir -p is idempotent, so run it unconditionally: this also recreates a
+# missing input/extracted/output subfolder when the parent already exists.
+# The brace list stays outside the quotes so that it is still expanded.
+mkdir -p "${GAF_FOLDER}"/{input,extracted,output}
+
+echo "Downloading 2021 geographic attribute file"
+# -x16: up to 16 connections per server.
+# --auto-file-renaming=false: do not save a second copy under a new name when
+# the target file already exists.
+aria2c -x16 -i "${SCRIPT_DIR}/geographic_attribute_file/files.txt" --dir="${INPUT_FOLDER}" --auto-file-renaming=false
@@ -0,0 +1,2 @@
+# 2021 geographic attribute file. Reference guide: https://www150.statcan.gc.ca/n1/pub/92-151-g/92-151-g2021001-eng.htm
+https://www12.statcan.gc.ca/census-recensement/2021/geo/aip-pia/attribute-attribs/files-fichiers/2021_92-151_X.zip
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Extract the downloaded 2021 geographic attribute file archive and hand the
+# extracted CSV to process.py.
+#
+# Environment:
+#   DATA_FOLDER  root folder that holds the downloaded/extracted data
+
+# Abort early when DATA_FOLDER is missing; otherwise the paths below would
+# silently resolve relative to the filesystem root.
+: "${DATA_FOLDER:?DATA_FOLDER must be set}"
+
+INPUT_FOLDER="${DATA_FOLDER}/geographic_attribute_file/input"
+EXTRACTED_FOLDER="${DATA_FOLDER}/geographic_attribute_file/extracted"
+
+# Unzip the 2021 archive and run the loader on the extracted CSV.
+# Returns non-zero when the CSV is missing after extraction or the loader fails.
+import_2021() {
+    local archive="${INPUT_FOLDER}/2021_92-151_X.zip"
+    local csv="${EXTRACTED_FOLDER}/2021_92-151_X.csv"
+
+    echo "Extracting ${archive}"
+    # -q: quiet; -n: never overwrite files that were already extracted.
+    unzip -q -n "${archive}" -d "${EXTRACTED_FOLDER}"
+
+    # Do not start the loader when extraction did not produce the CSV.
+    if [ ! -f "${csv}" ]
+    then
+        echo "Expected ${csv} after extraction, but it does not exist" >&2
+        return 1
+    fi
+
+    # NOTE: the loader path is relative, so this script has to be run from the
+    # folder that contains geographic_attribute_file/.
+    python geographic_attribute_file/process.py "${csv}"
+}
+
+import_2021
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+# coding: utf-8
+import os
+import sys
+from urllib.parse import quote
+
+import numpy as np
+import pandas as pd
+from sqlalchemy import create_engine
+
+filename = sys.argv[1]
+
+DATABASE = os.environ.get("POSTGRES_DB")
+USER = os.environ.get("POSTGRES_USER")
+PASSWORD = os.environ.get("POSTGRES_PASSWORD")
+
+engine = create_engine(f"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}")
+
+"""
+Data dictionary is here:
+https://web.archive.org/web/20240918110218/https://www150.statcan.gc.ca/n1/pub/92-151-g/2021001/tbl/tbl4_1-eng.htm
+
+This processes the entire DGUID hierarchy and other useful fields
+"""
+print(f"Processing {filename}")
+params = {
+    'filepath_or_buffer': filename,
+    'encoding': 'latin-1',
+    'usecols': ['PRDGUID_PRIDUGD', 'CDDGUID_DRIDUGD', 
+                'FEDDGUID_CEFIDUGD', 'CSDDGUID_SDRIDUGD', 
+                'DPLDGUID_LDIDUGD', 'ERDGUID_REIDUGD',
+                'CCSDGUID_SRUIDUGD', 'SACTYPE_CSSGENRE', 'SACCODE_CSSCODE', 'CMAPDGUID_RMRPIDUGD', 'CMADGUID_RMRIDUGD',
+                'CMATYPE_RMRGENRE', 'CTDGUID_SRIDUGD', 'POPCTRRAPDGUID_CTRPOPRRPIDUGD', 'POPCTRRADGUID_CTRPOPRRIDUGD',
+                'DADGUID_ADIDUGD', 'DBDGUID_IDIDUGD',
+                # 2021 Block population, private dwellings, usual residents
+                'DBPOP2021_IDPOP2021', 'DBTDWELL2021_IDTLOG2021', 'DBURDWELL2021_IDRHLOG2021',
+                # 2021 Census Indian reserve refusal flag
+                'DBIR2021_IDRI2021',
+                'ADADGUID_ADAIDUGD'
+               ],
+    # Apparently they have to be int64 because there's NA values
+    'dtype': {
+        'DBPOP2021_IDPOP2021': "Int64",
+        'DBTDWELL2021_IDTLOG2021': "Int64",
+        'DBURDWELL2021_IDRHLOG2021': "Int64"
+    }
+}
+gaf_2021_df = pd.read_csv(**params)
+
+# Rename columns, replace french portion
+gaf_2021_df.rename(columns={
+    'PRDGUID_PRIDUGD': 'pr_dguid',
+    'CDDGUID_DRIDUGD': 'cd_dguid',
+    'FEDDGUID_CEFIDUGD': 'fed_dguid',
+    'CSDDGUID_SDRIDUGD': 'csd_dguid',
+    'DPLDGUID_LDIDUGD': 'dpl_dguid',
+    'ERDGUID_REIDUGD': 'er_dguid',
+    'CCSDGUID_SRUIDUGD': 'ccs_dguid',
+    'SACTYPE_CSSGENRE': 'sac_type',
+    'SACCODE_CSSCODE': 'sac_code',
+    'CMAPDGUID_RMRPIDUGD': 'cma_p_dguid',
+    'CMADGUID_RMRIDUGD': 'cma_dguid',
+    'CTDGUID_SRIDUGD': 'ct_dguid',
+    'POPCTRRAPDGUID_CTRPOPRRPIDUGD': 'pop_ctr_p_dguid',
+    'POPCTRRADGUID_CTRPOPRRIDUGD': 'pop_ctr_dguid',
+    'DADGUID_ADIDUGD': 'da_dguid',
+    'DBDGUID_IDIDUGD': 'db_dguid',
+    'DBPOP2021_IDPOP2021': 'db_pop_2021',
+    # This one needs work
+    'DBTDWELL2021_IDTLOG2021': 'db_total_private_dwell_2021',
+    # I don't particularly like this one
+    'DBURDWELL2021_IDRHLOG2021': 'db_usual_residents_dwellings_2021',
+    'DBIR2021_IDRI2021': 'db_ir_2021',
+    'ADADGUID_ADAIDUGD': 'ada_dguid'
+}, inplace=True)
+
+columns_ordered = ['pr_dguid', 'fed_dguid', 'er_dguid', 'cd_dguid',
+                   'dpl_dguid', 'ccs_dguid', 'csd_dguid', 'sac_type', 'sac_code',
+                   'cma_p_dguid', 'cma_dguid',
+                   'pop_ctr_p_dguid', 'pop_ctr_dguid', 
+                   'ada_dguid', 'ct_dguid', 'da_dguid', 'db_dguid',
+                   'db_pop_2021', 'db_total_private_dwell_2021', 'db_usual_residents_dwellings_2021', 'db_ir_2021']
+
+gaf_2021_df = gaf_2021_df.reindex(columns_ordered, axis=1)
+print("Loading 2021 geographic attribute file to PostgreSQL as gaf_2021")
+gaf_2021_df.to_sql(name='gaf_2021', 
+                   con=engine, 
+                   index=False, 
+                   chunksize=50000,
+                   if_exists='replace',
+                   schema='silver'
+                  )