mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Initial commit
This commit is contained in:
+11
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Download the 2021 dissemination geographies relationship file.
#
# Reads from the environment:
#   DATA_FOLDER - root folder under which this dataset's input/extracted/output
#                 folders are created
#   SCRIPT_DIR  - folder containing
#                 dissemination_geographies_relationship_file/files.txt

if [ ! -d "${DATA_FOLDER}/dissemination_geographies_relationship_file" ]
then
    echo "Making directory ${DATA_FOLDER}/dissemination_geographies_relationship_file/"
    # Quote only the variable part: brace expansion is not performed inside
    # double quotes, so {input,extracted,output} must stay outside them.
    mkdir -p "${DATA_FOLDER}/dissemination_geographies_relationship_file"/{input,extracted,output}
fi

INPUT_FOLDER="${DATA_FOLDER}/dissemination_geographies_relationship_file/input"

echo "Downloading 2021 dissemination geographies relationship file"
# files.txt lists the URLs to fetch. --auto-file-renaming=false stops aria2c
# from saving a second copy under an alternate name (file.1, file.2, ...) when
# the target file already exists in INPUT_FOLDER.
aria2c -x16 -i "${SCRIPT_DIR}/dissemination_geographies_relationship_file/files.txt" --dir="${INPUT_FOLDER}" --auto-file-renaming=false
|
||||
@@ -0,0 +1,2 @@
|
||||
# 2021. Here is the reference guide https://web.archive.org/web/20250413152017/https://www150.statcan.gc.ca/n1/pub/98-26-0003/982600032021001-eng.htm
|
||||
https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/dguid-idugd/files-fichiers/2021_98260004.zip
|
||||
+12
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
# Extract the downloaded 2021 dissemination geographies relationship file and
# hand the resulting CSV to process.py.
#
# Reads DATA_FOLDER from the environment. The path to process.py is relative,
# so this script has to be run from the directory that contains
# dissemination_geographies_relationship_file/.

INPUT_FOLDER="${DATA_FOLDER}/dissemination_geographies_relationship_file/input"
EXTRACTED_FOLDER="${DATA_FOLDER}/dissemination_geographies_relationship_file/extracted"

# Unzip the 2021 archive into EXTRACTED_FOLDER and run process.py on the CSV.
import_2021() {
    echo "Unzipping 2021 dissemination geographies relationship file"
    # -n: never overwrite files that were already extracted by a previous run.
    # Paths are quoted so a DATA_FOLDER containing spaces is not word-split.
    unzip -n "${INPUT_FOLDER}/2021_98260004.zip" -d "${EXTRACTED_FOLDER}"
    python dissemination_geographies_relationship_file/process.py "${EXTRACTED_FOLDER}/2021_98260004.csv"
}

import_2021
|
||||
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env python
# coding: utf-8
"""Load the 2021 dissemination geographies relationship file into PostgreSQL.

Usage: process.py <path to the extracted CSV>

Data dictionary is here:
https://web.archive.org/web/20250413152017/https://www150.statcan.gc.ca/n1/pub/98-26-0003/982600032021001-eng.htm

The DGUID column of every geographic level is renamed to a short English name
and written, in a fixed column order, to the table
silver.dissemination_geographies_relationship_2021. Columns of the CSV that
are not listed in ``columns_ordered`` are dropped.

Connection settings are read from the POSTGRES_DB, POSTGRES_USER and
POSTGRES_PASSWORD environment variables; the server is reached at db:5432.
"""
import os
import sys
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

dgr_2021_csv = sys.argv[1]

DATABASE = os.environ.get("POSTGRES_DB")
USER = os.environ.get("POSTGRES_USER")
PASSWORD = os.environ.get("POSTGRES_PASSWORD")

# Fail before reading the CSV if the connection settings are incomplete;
# otherwise the literal text "None" would end up inside the connection URL.
_missing_settings = [
    name
    for name, value in (
        ("POSTGRES_DB", DATABASE),
        ("POSTGRES_USER", USER),
        ("POSTGRES_PASSWORD", PASSWORD),
    )
    if value is None
]
if _missing_settings:
    raise SystemExit(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
    )

# Percent-encode the credentials so characters such as '@', ':', '/' or '%'
# in the user name or password cannot be mistaken for URL delimiters.
engine = create_engine(
    f"postgresql://{quote(USER, safe='')}:{quote(PASSWORD, safe='')}"
    f"@db:5432/{DATABASE}"
)

print(f"Processing {dgr_2021_csv}")
dgr_2021_df = pd.read_csv(dgr_2021_csv)

# Rename columns, remove french portion
dgr_2021_df = dgr_2021_df.rename(columns={
    'PRDGUID_PRIDUGD': 'pr_dguid',
    'CDDGUID_DRIDUGD': 'cd_dguid',
    'FEDDGUID_CEFIDUGD': 'fed_dguid',
    'CSDDGUID_SDRIDUGD': 'csd_dguid',
    'ERDGUID_REIDUGD': 'er_dguid',
    'CARDGUID_RARIDUGD': 'car_dguid',
    'CCSDGUID_SRUIDUGD': 'ccs_dguid',
    'DADGUID_ADIDUGD': 'da_dguid',
    'DBDGUID_IDIDUGD': 'db_dguid',
    'ADADGUID_ADAIDUGD': 'ada_dguid',
    'DPLDGUID_LDIDUGD': 'dpl_dguid',
    'CMAPDGUID_RMRPIDUGD': 'cma_p_dguid',
    'CMADGUID_RMRIDUGD': 'cma_dguid',
    'CTDGUID_SRIDUGD': 'ct_dguid',
    'POPCTRPDGUID_CTRPOPPIDUGD': 'pop_ctr_p_dguid',
    'POPCTRDGUID_CTRPOPIDUGD': 'pop_ctr_dguid',
})

# Column order of the database table.
columns_ordered = ['pr_dguid', 'fed_dguid', 'er_dguid', 'car_dguid', 'cd_dguid',
                   'dpl_dguid', 'ccs_dguid', 'csd_dguid',
                   'cma_p_dguid', 'cma_dguid',
                   'pop_ctr_p_dguid', 'pop_ctr_dguid',
                   'ada_dguid', 'ct_dguid', 'da_dguid', 'db_dguid']

# reindex keeps only the listed columns, in this order. A listed name that is
# absent from the CSV becomes an all-NaN column rather than raising.
dgr_2021_df = dgr_2021_df.reindex(columns_ordered, axis=1)
print("Loading processed 2021 dissemination geographies relationship file to database as dissemination_geographies_relationship_2021")
# if_exists='replace' drops an existing table of the same name before writing;
# rows are inserted in batches of 50000.
dgr_2021_df.to_sql(name='dissemination_geographies_relationship_2021',
                   con=engine,
                   index=False,
                   chunksize=50000,
                   if_exists='replace',
                   schema='silver'
                   )
|
||||
Reference in New Issue
Block a user