Files
d4c-datapkg-statistical/dissemination_geographies_relationship_file/process.py
T
Diego Ripley f93e4d0cec Initial commit
2025-05-24 13:37:31 -04:00

60 lines
2.1 KiB
Python

#!/usr/bin/env python
# coding: utf-8
import os
import sys
import pandas as pd
from sqlalchemy import create_engine
# Loads the 2021 Dissemination Geographies Relationship File (a CSV whose path
# is the first command-line argument) into Postgres as
# silver.dissemination_geographies_relationship_2021.
#
# Data dictionary is here:
# https://web.archive.org/web/20250413152017/https://www150.statcan.gc.ca/n1/pub/98-26-0003/982600032021001-eng.htm
# This processes the entire DGUID hierarchy and other useful fields

# Connection target; only the credentials and database name come from the
# environment (POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB).
DB_HOST = "db"
DB_PORT = 5432

TABLE_NAME = "dissemination_geographies_relationship_2021"
TABLE_SCHEMA = "silver"

# Bilingual source header -> short English column name (drops the French half).
COLUMN_RENAMES = {
    "PRDGUID_PRIDUGD": "pr_dguid",
    "CDDGUID_DRIDUGD": "cd_dguid",
    "FEDDGUID_CEFIDUGD": "fed_dguid",
    "CSDDGUID_SDRIDUGD": "csd_dguid",
    "ERDGUID_REIDUGD": "er_dguid",
    "CARDGUID_RARIDUGD": "car_dguid",
    "CCSDGUID_SRUIDUGD": "ccs_dguid",
    "DADGUID_ADIDUGD": "da_dguid",
    "DBDGUID_IDIDUGD": "db_dguid",
    "ADADGUID_ADAIDUGD": "ada_dguid",
    "DPLDGUID_LDIDUGD": "dpl_dguid",
    "CMAPDGUID_RMRPIDUGD": "cma_p_dguid",
    "CMADGUID_RMRIDUGD": "cma_dguid",
    "CTDGUID_SRIDUGD": "ct_dguid",
    "POPCTRPDGUID_CTRPOPPIDUGD": "pop_ctr_p_dguid",
    "POPCTRDGUID_CTRPOPIDUGD": "pop_ctr_dguid",
}

# Column order of the table written to the database.
COLUMNS_ORDERED = [
    "pr_dguid", "fed_dguid", "er_dguid", "car_dguid", "cd_dguid",
    "dpl_dguid", "ccs_dguid", "csd_dguid",
    "cma_p_dguid", "cma_dguid",
    "pop_ctr_p_dguid", "pop_ctr_dguid",
    "ada_dguid", "ct_dguid", "da_dguid", "db_dguid",
]


def build_database_url(user, password, database, host=DB_HOST, port=DB_PORT) -> str:
    """Return a ``postgresql://`` URL with the credentials percent-encoded.

    Args:
        user: Database role name.
        password: Role password; ``None`` or ``""`` omits the password part.
        database: Database name, inserted as-is.
        host: Server host name.
        port: Server port.

    Returns:
        A URL string suitable for ``sqlalchemy.create_engine``.
    """
    from urllib.parse import quote

    # safe="" also encodes "/", and spaces become %20 rather than "+", so
    # characters such as "@", ":" and "/" in credentials cannot be mistaken
    # for URL delimiters.
    credentials = quote(user, safe="")
    if password:
        credentials += ":" + quote(password, safe="")
    return f"postgresql://{credentials}@{host}:{port}/{database}"


def tidy_columns(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Rename the bilingual headers and return only the ordered DGUID columns.

    Args:
        raw_df: Frame as read from the relationship-file CSV.

    Returns:
        A new frame whose columns are exactly ``COLUMNS_ORDERED``.

    Raises:
        ValueError: If any expected column is absent after renaming.
    """
    renamed = raw_df.rename(columns=COLUMN_RENAMES)
    # reindex() would silently add an all-NaN column for any absent name, and
    # the load below replaces the existing table, so refuse to continue.
    missing = [col for col in COLUMNS_ORDERED if col not in renamed.columns]
    if missing:
        raise ValueError(
            "Input CSV is missing expected column(s): " + ", ".join(missing)
        )
    return renamed.reindex(COLUMNS_ORDERED, axis=1)


def main(argv=None) -> int:
    """Load the CSV named by the first argument into the database.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. Arguments after the first are ignored.

    Returns:
        Process exit status: 0 on success, 2 if the CSV path is missing,
        1 if a required environment variable is missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print(f"usage: {sys.argv[0]} <dgr_2021_csv>", file=sys.stderr)
        return 2
    dgr_2021_csv = args[0]

    database = os.environ.get("POSTGRES_DB")
    user = os.environ.get("POSTGRES_USER")
    password = os.environ.get("POSTGRES_PASSWORD")
    # Check before reading the CSV so a misconfigured environment fails fast
    # instead of connecting as the literal user "None".
    unset = [
        name
        for name, value in (("POSTGRES_DB", database), ("POSTGRES_USER", user))
        if not value
    ]
    if unset:
        print(
            "Missing required environment variable(s): " + ", ".join(unset),
            file=sys.stderr,
        )
        return 1

    print(f"Processing {dgr_2021_csv}")
    # dtype=str keeps every value as text so identifiers are never coerced to
    # numbers by type inference; empty cells are still read as missing.
    dgr_2021_df = tidy_columns(pd.read_csv(dgr_2021_csv, dtype=str))

    engine = create_engine(build_database_url(user, password, database))
    print("Loading processed 2021 dissemination geographies relationship file to database as dissemination_geographies_relationship_2021")
    try:
        dgr_2021_df.to_sql(
            name=TABLE_NAME,
            con=engine,
            index=False,
            chunksize=50000,
            if_exists="replace",
            schema=TABLE_SCHEMA,
        )
    finally:
        # Release pooled connections even if the load fails.
        engine.dispose()
    return 0


if __name__ == "__main__":
    sys.exit(main())