Process 2024-12 national address register. Still need to make some improvements

This commit is contained in:
Diego Ripley
2025-06-06 22:49:47 +00:00
parent 2350d6d8d7
commit 5e26ec282e
12 changed files with 765 additions and 248 deletions
+5
View File
@@ -48,3 +48,8 @@ census_of_population/process.sh
census_of_agriculture/download.sh
## 7.2 Process Census of Agriculture ##
census_of_agriculture/process.sh
#### 8.0 National Address Register ####
## 8.1 Download National Address Register ##
national_address_register/download.sh
## 8.2 Load National Address Register ##
national_address_register/process.sh
+11 -3
View File
@@ -2,8 +2,16 @@
if [ ! -d "${DATA_FOLDER}/national_address_register" ]
then
echo "Making directory ${DATA_FOLDER}/national_address_register/"
mkdir -p ${DATA_FOLDER}/national_address_register/{input,extracted,output,scratch}
mkdir -p ${DATA_FOLDER}/national_address_register/{input,extracted,output}/{2024-12,2024-06,2023,2022}
fi
echo "Downloading national address register files"
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files.txt" --dir=$DATA_FOLDER/national_address_register/input --auto-file-renaming=false
INPUT_FOLDER="${DATA_FOLDER}/national_address_register/input"
echo "Downloading 2024-12 vintage of national address register files"
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files_2024_12.txt" --dir=${INPUT_FOLDER}/2024-12 --auto-file-renaming=false
echo "Downloading 2024-06 vintage of national address register files"
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files_2024_06.txt" --dir=${INPUT_FOLDER}/2024-06 --auto-file-renaming=false
echo "Downloading 2023 vintage of national address register files"
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files_2023.txt" --dir=${INPUT_FOLDER}/2023 --auto-file-renaming=false
echo "Downloading 2022 vintage of national address register files"
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files_2022.txt" --dir=${INPUT_FOLDER}/2022 --auto-file-renaming=false
+155
View File
@@ -0,0 +1,155 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "b6e053ec",
"metadata": {},
"outputs": [],
"source": [
"DATA_FOLDER=/data\n",
"\n",
"source ../.env"
]
},
{
"cell_type": "markdown",
"id": "12eca225-3d05-4bb7-95fa-7b9df694f53d",
"metadata": {},
"source": [
"# 1. Export National Address Register File (2024-12 vintage)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4c5bb532",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exporting silver.nar_2024_12 table to /data/national_address_register/output/2024-12/nar_2024_12.parquet\n"
]
}
],
"source": [
"output_folder=\"${DATA_FOLDER}/national_address_register/output/2024-12\"\n",
"output_file=\"${output_folder}/nar_2024_12.parquet\"\n",
"echo \"Exporting silver.nar_2024_12 table to ${output_file}\" \n",
"ogr2ogr \\\n",
" -lco COMPRESSION=\"ZSTD\" \\\n",
" -lco CREATOR=\"www.dataforcanada.org\" \\\n",
" -lco WRITE_COVERING_BBOX=\"YES\" \\\n",
" -lco SORT_BY_BBOX=\"YES\" \\\n",
" -f Parquet \\\n",
" ${output_file} \\\n",
" \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n",
" \"silver.nar_2024_12\""
]
},
{
"cell_type": "markdown",
"id": "245cd8fb-da36-4476-8e5c-4d3665e901d7",
"metadata": {},
"source": [
"# 2. Export National Address Register File (2024-06 vintage)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f3e2a5b5-9b31-4f82-88da-414bf23ba515",
"metadata": {},
"outputs": [],
"source": [
"output_folder=\"${DATA_FOLDER}/national_address_register/output/2024-06\"\n",
"output_file=\"${output_folder}/nar_2024_06.parquet\"\n",
"echo \"Exporting silver.nar_2024_06 table to ${output_file}\" \n",
"ogr2ogr \\\n",
" -lco COMPRESSION=\"ZSTD\" \\\n",
" -lco CREATOR=\"www.dataforcanada.org\" \\\n",
" -lco WRITE_COVERING_BBOX=\"YES\" \\\n",
" -lco SORT_BY_BBOX=\"YES\" \\\n",
" -f Parquet \\\n",
" ${output_file} \\\n",
" \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n",
" \"silver.nar_2024_06\""
]
},
{
"cell_type": "markdown",
"id": "56ef7755-06ef-473c-8a1f-0129f6b1dc28",
"metadata": {},
"source": [
"# 3. Export National Address Register File (2023 vintage)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97df2904-c18d-4132-bc28-f0f454c48c87",
"metadata": {},
"outputs": [],
"source": [
"output_folder=\"${DATA_FOLDER}/national_address_register/output/2023\"\n",
"output_file=\"${output_folder}/nar_2023.parquet\"\n",
"echo \"Exporting silver.nar_2023 table to ${output_file}\"\n",
"ogr2ogr \\\n",
" -lco COMPRESSION=\"ZSTD\" \\\n",
" -lco CREATOR=\"www.dataforcanada.org\" \\\n",
" -lco WRITE_COVERING_BBOX=\"YES\" \\\n",
" -lco SORT_BY_BBOX=\"YES\" \\\n",
" -f Parquet \\\n",
" ${output_file} \\\n",
" \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n",
" \"silver.nar_2023\""
]
},
{
"cell_type": "markdown",
"id": "f1bfd838-19aa-4de7-9ca1-61b1215d7865",
"metadata": {},
"source": [
"# 4. Export National Address Register File (2022 vintage)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2f1210f-0d9c-45de-a019-a438ff9dfa4a",
"metadata": {},
"outputs": [],
"source": [
"output_folder=\"${DATA_FOLDER}/national_address_register/output/2022\"\n",
"output_file=\"${output_folder}/nar_2022.parquet\"\n",
"echo \"Exporting silver.nar_2022 table to ${output_file}\"\n",
"ogr2ogr \\\n",
" -lco COMPRESSION=\"ZSTD\" \\\n",
" -lco CREATOR=\"www.dataforcanada.org\" \\\n",
" -lco WRITE_COVERING_BBOX=\"YES\" \\\n",
" -lco SORT_BY_BBOX=\"YES\" \\\n",
" -f Parquet \\\n",
" ${output_file} \\\n",
" \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n",
" \"silver.nar_2022\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Bash",
"language": "bash",
"name": "bash"
},
"language_info": {
"codemirror_mode": "shell",
"file_extension": ".sh",
"mimetype": "text/x-sh",
"name": "bash"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
-167
View File
@@ -1,167 +0,0 @@
#!/bin/bash
import_to_postgis() {
local filepath=$1
local table_name=$2
local extra_parameters=${@:3}
# Virtual file system
if [[ ${filepath: -4} = '.zip' ]]; then
local filepath="/vsizip/${filepath}"
fi
echo "Importing ${filepath}"
ogr2ogr \
--config PG_USE_COPY YES \
-overwrite \
-f "PostgreSQL" \
"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432" \
-lco GEOMETRY_NAME=geom \
-progress \
-gt 500000 \
-t_srs EPSG:3857 \
-nln ${table_name} \
${extra_parameters} \
${filepath}
}
concatenate_csvs() {
# Concatenates all of the CSVs in the directory
local input_directory=$1
local output_file=$2
for address_file in $(ls ${input_directory}/*.csv);
do
echo "Processing ${address_file}. Adding to ${output_file}"
tail -n +2 $address_file >> ${output_file}
done
}
INPUT_FOLDER="${DATA_FOLDER}/national_address_register/input"
EXTRACTED_FOLDER="${DATA_FOLDER}/national_address_register/extracted"
SCRATCH_FOLDER="${DATA_FOLDER}/national_address_register/scratch"
import_202412() {
# Process 202412
# Extract files
echo "Extracting ${INPUT_FOLDER}/202412.zip"
unzip -q -n ${INPUT_FOLDER}/202412.zip -d ${EXTRACTED_FOLDER}/202412
if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv ]
then
echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv"
echo "LOC_GUID,ADDR_GUID,APT_NO_LABEL,CIVIC_NO,CIVIC_NO_SUFFIX,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STREET_DIR,MAIL_MUN_NAME,MAIL_PROV_ABVN,MAIL_POSTAL_CODE,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv
fi
if [ $(head ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv | wc -l) -ne 10 ]
then
echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv"
concatenate_csvs "${EXTRACTED_FOLDER}/202412/Addresses" "${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv"
fi
if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv ]
then
echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv"
echo "LOC_GUID,CSD_CODE,FED_CODE,FED_ENG_NAME,FED_FRE_NAME,ER_CODE,ER_ENG_NAME,ER_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv
fi
if [ $(head ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv | wc -l) -ne 10 ]
then
echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv"
concatenate_csvs "${EXTRACTED_FOLDER}/202412/Locations" "${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv"
fi
python national_address_register/process.py ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv 202412 utf-8
}
import_202406() {
# Process 202406
echo "Extracting ${INPUT_FOLDER}/2024.zip"
unzip -q -n ${INPUT_FOLDER}/2024.zip -d ${EXTRACTED_FOLDER}/202406
if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv ]
then
echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv"
echo "LOC_GUID,ADDR_GUID,APT_NO_LABEL,CIVIC_NO,CIVIC_NO_SUFFIX,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STREET_DIR,MAIL_MUN_NAME,MAIL_PROV_ABVN,MAIL_POSTAL_CODE,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv
fi
if [ $(head ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv | wc -l) -ne 10 ]
then
echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv"
concatenate_csvs "${EXTRACTED_FOLDER}/202406/Addresses" "${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv"
fi
if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv ]
then
echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv"
echo "LOC_GUID,CSD_CODE,FED_CODE,FED_ENG_NAME,FED_FRE_NAME,ER_CODE,ER_ENG_NAME,ER_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv
fi
if [ $(head ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv | wc -l) -ne 10 ]
then
echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv"
concatenate_csvs "${EXTRACTED_FOLDER}/202406/Locations" "${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv"
fi
python national_address_register/process.py ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv 202406 utf-8
}
import_2023() {
# Process 2023
echo "Extracting ${INPUT_FOLDER}/2023.zip"
unzip -q -n ${INPUT_FOLDER}/2023.zip -d ${EXTRACTED_FOLDER}/2023
if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv ]
then
echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv"
echo "LOC_GUID,ADDR_GUID,APT_NO_LABEL,CIVIC_NO,CIVIC_NO_SUFFIX,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STEET_DIR,MAIL_MUN_NAME,MAIL_PROV_ABVN,MAIL_POSTAL_CODE,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv
fi
if [ $(head ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv | wc -l) -ne 10 ]
then
echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv"
concatenate_csvs "${EXTRACTED_FOLDER}/2023/Addresses" "${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv"
fi
if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv ]
then
echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv"
echo "LOC_GUID,CSD_CODE,FED_2021_CODE,FED_2021_ENG_NAME,FED_2021_FRE_NAME,ER_2021_CODE,ER_2021_ENG_NAME,ER_2021_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv
fi
if [ $(head ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv | wc -l) -ne 10 ]
then
echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv"
concatenate_csvs "${EXTRACTED_FOLDER}/2022/Locations" "${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv"
fi
python national_address_register/process.py ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv 2023 latin-1
}
import_2022() {
# Process 2022
echo "Extracting ${INPUT_FOLDER}/2022.zip"
unzip -q -n ${INPUT_FOLDER}/2022.zip -d ${EXTRACTED_FOLDER}/2022
if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv ]
then
echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv"
echo "LOC_GUID,ADDR_GUID,CIVIC_NO,CIVIC_NO_SUFFIX,APT_NO_LABEL,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STREET_DIR,MAIL_MUN_NAME,MAIL_POSTAL_CODE,MAIL_PROV_ABVN,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv
fi
if [ $(head ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv | wc -l) -ne 10 ]
then
echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv"
concatenate_csvs "${EXTRACTED_FOLDER}/2022/Addresses" "${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv"
fi
if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv ]
then
echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv"
echo "LOC_GUID,CSD_CODE,FED_2016_CODE,FED_2016_ENG_NAME,FED_2016_FRE_NAME,ER_2016_CODE,ER_2016_ENG_NAME,ER_2016_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv
fi
if [ $(head ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv | wc -l) -ne 10 ]
then
echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv"
concatenate_csvs "${EXTRACTED_FOLDER}/2022/Locations" "${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv"
fi
python national_address_register/process.py ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv 2022 latin-1
}
import_202412
import_202406
import_2023
import_2022
@@ -1,8 +0,0 @@
# December 2024
https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/202412.zip
# June 2024
https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/2024.zip
# 2023
https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/2023.zip
# 2022
https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/2022.zip
@@ -0,0 +1,2 @@
# 2022
https://data.dataforcanada.org/archive/statistics_canada/national_address_register/2022/2022.zip
@@ -0,0 +1,2 @@
# 2023
https://data.dataforcanada.org/archive/statistics_canada/national_address_register/2023/2023.zip
@@ -0,0 +1,2 @@
# June 2024
https://data.dataforcanada.org/archive/statistics_canada/national_address_register/2024-06/2024.zip
@@ -0,0 +1,2 @@
# December 2024
https://data.dataforcanada.org/archive/statistics_canada/national_address_register/2024-12/202412.zip
-69
View File
@@ -1,69 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
import os
import sys
import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine
statcan_nar_addresses_csv = sys.argv[1]
statcan_nar_locations_csv = sys.argv[2]
vintage = sys.argv[3]
encoding = sys.argv[4]
print(f"Reading {statcan_nar_addresses_csv}")
statcan_nar_addresses = pd.read_csv(filepath_or_buffer=statcan_nar_addresses_csv,
dtype={
"CIVIC_NO": "Int32",
"PROV_CODE": object,
"BU_USE": "Int8",
"BG_DLS_LSD": object,
"BG_DLS_QTR": object,
"BG_DLS_SCTN": object,
"BG_DLS_TWNSHP": object,
"BG_DLS_RNG": object,
"BG_DLS_MRD": object
},
encoding=encoding)
print(f"Reading {statcan_nar_locations_csv}")
statcan_nar_locations = pd.read_csv(filepath_or_buffer=statcan_nar_locations_csv,
usecols=["LOC_GUID",
"REPPOINT_LATITUDE",
"REPPOINT_LONGITUDE"],
encoding=encoding)
print(f"Combining {statcan_nar_addresses_csv} and {statcan_nar_locations_csv}")
statcan_nar_addresses_combined = pd.merge(statcan_nar_addresses,
statcan_nar_locations,
on="LOC_GUID", how="inner")
del statcan_nar_addresses
del statcan_nar_locations
DATABASE = os.environ.get("POSTGRES_DB")
HOST = os.environ.get("WAREHOUSE_PG_HOST")
USER = os.environ.get("POSTGRES_USER")
PASSWORD = os.environ.get("POSTGRES_PASSWORD")
engine = create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:5432/{DATABASE}")
print("Creating geodataframe from combined address file")
gdf = gpd.GeoDataFrame(
statcan_nar_addresses_combined,
geometry=gpd.points_from_xy(statcan_nar_addresses_combined.REPPOINT_LONGITUDE,
statcan_nar_addresses_combined.REPPOINT_LATITUDE),
crs="EPSG:4326"
)
print("Dropping 'REPPOINT_LATITUDE', 'REPPOINT_LONGITUDE' from geodataframe")
gdf.drop(columns=["REPPOINT_LATITUDE", "REPPOINT_LONGITUDE"],
inplace=True)
print("Converting geodataframe to EPSG:3857")
gdf.to_crs(3857, inplace=True)
print(f"Loading geodatframe to PostgreSQL as statcan_nar_addresses_combined_{vintage}")
gdf.to_postgis(name=f"statcan_nar_addresses_combined_{vintage}",
con=engine,
chunksize=150000)
+38
View File
@@ -0,0 +1,38 @@
#!/bin/bash
INPUT_FOLDER="${DATA_FOLDER}/national_address_register/input"
EXTRACTED_FOLDER="${DATA_FOLDER}/national_address_register/extracted"
process_202412() {
# Process 2024-12 vintage
# Extract files
echo "Extracting ${INPUT_FOLDER}/2024-12/202412.zip to ${EXTRACTED_FOLDER}/2024-12"
unzip -q -n ${INPUT_FOLDER}/2024-12/202412.zip -d ${EXTRACTED_FOLDER}/2024-12
jupyter execute process_2024_12.ipynb
}
process_202406() {
# Process 2024-06 vintage
echo "Extracting ${INPUT_FOLDER}/2024.zip"
unzip -q -n ${INPUT_FOLDER}/2024.zip -d ${EXTRACTED_FOLDER}/2024-06
# Encoding is utf-8
}
process_2023() {
# Process 2023
echo "Extracting ${INPUT_FOLDER}/2023.zip"
unzip -q -n ${INPUT_FOLDER}/2023.zip -d ${EXTRACTED_FOLDER}/2023
# Encoding is latin-1
}
process_2022() {
# Process 2022
echo "Extracting ${INPUT_FOLDER}/2022.zip"
unzip -q -n ${INPUT_FOLDER}/2022.zip -d ${EXTRACTED_FOLDER}/2022
# Encoding is latin-1
}
process_202412
#process_202406
#process_2023
#process_2022
@@ -0,0 +1,547 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "11c50f98-ddeb-4af0-b69a-743f915fa904",
"metadata": {},
"source": [
"# Experimenting with processing this file. Still need to figure out how to structure this file"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "95feb0e4-b3b6-4235-8c8d-bc3652e82b3a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n"
]
}
],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"import gc\n",
"import glob\n",
"import os\n",
"import sys \n",
"\n",
"import buckaroo\n",
"import duckdb\n",
"from IPython.core.interactiveshell import InteractiveShell \n",
"import geopandas as gpd\n",
"import pandas as pd\n",
"from sqlalchemy import create_engine\n",
"from sqlalchemy import text\n",
"\n",
"# Enable multiple outputs per cell\n",
"InteractiveShell.ast_node_interactivity = \"all\"\n",
"# Show all columns\n",
"pd.set_option('display.max_columns', None)\n",
"\n",
"DATABASE = os.environ.get(\"POSTGRES_DB\")\n",
"USER = os.environ.get(\"POSTGRES_USER\")\n",
"PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n",
"\n",
"engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e10bc182-d0d6-4aa3-99b3-2fc396f9ce34",
"metadata": {},
"outputs": [],
"source": [
"input_folder = '/data/national_address_register/extracted'"
]
},
{
"cell_type": "markdown",
"id": "7f170d27-14ca-488a-811d-1c9836264bb6",
"metadata": {},
"source": [
"# 1. Process 2024-12 vintage"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b27e0edc-59ca-4a63-8bff-1d85909aa174",
"metadata": {},
"outputs": [],
"source": [
"nar_addresses_csvs = glob.glob(f'{input_folder}/2024-12/Addresses/*.csv')\n",
"nar_locations_csvs = glob.glob(f'{input_folder}/2024-12/Locations/*.csv')\n",
"encoding = 'utf-8'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3fcca833-8547-43b1-b5c7-e5f463ef0bbc",
"metadata": {},
"outputs": [],
"source": [
"def process_nar_locations_csvs(csvs_to_process, encoding):\n",
" \"\"\"\n",
" 1. Reads subset of fields for National Address Register locations\n",
" 2. Appends all of the processed CSVs as one dataframe\n",
" \"\"\"\n",
" dataframes_to_concatenate = []\n",
" for filename in csvs_to_process:\n",
" print(f\"Processing {filename}\")\n",
" params = {\n",
" 'filepath_or_buffer': filename,\n",
" 'encoding': encoding,\n",
" 'usecols': ['LOC_GUID', \n",
" 'REPPOINT_LATITUDE', \n",
" 'REPPOINT_LONGITUDE'\n",
" ]\n",
" }\n",
" nar_location_df = pd.read_csv(**params)\n",
" # Lowercase columns\n",
" nar_location_df.columns = [x.lower() for x in nar_location_df.columns]\n",
" dataframes_to_concatenate.append(nar_location_df)\n",
" \n",
" print(\"Concatenating all dataframes into one\")\n",
" nar_locations_df = pd.concat(dataframes_to_concatenate)\n",
" \n",
" return nar_locations_df\n",
"\n",
"def process_nar_addresses_csvs(csvs_to_process, encoding):\n",
" \"\"\"\n",
" 1. Reads subset of fields for National Address Register addresses\n",
" 2. Appends all of the processed CSVs as one dataframe\n",
" \"\"\"\n",
" dataframes_to_concatenate = []\n",
" for filename in csvs_to_process:\n",
" print(f\"Processing {filename}\")\n",
" params = {\n",
" 'filepath_or_buffer': filename,\n",
" 'encoding': encoding,\n",
" 'usecols': ['LOC_GUID', \n",
" 'ADDR_GUID', \n",
" 'APT_NO_LABEL',\n",
" 'CIVIC_NO',\n",
" 'CIVIC_NO_SUFFIX',\n",
" 'OFFICIAL_STREET_NAME',\n",
" 'OFFICIAL_STREET_TYPE',\n",
" 'OFFICIAL_STREET_DIR',\n",
" 'MAIL_STREET_NAME',\n",
" 'MAIL_STREET_TYPE',\n",
" 'MAIL_STREET_DIR',\n",
" 'MAIL_MUN_NAME',\n",
" 'MAIL_POSTAL_CODE',\n",
" 'BG_DLS_LSD',\n",
" 'BG_DLS_QTR',\n",
" 'BG_DLS_SCTN',\n",
" 'BG_DLS_RNG',\n",
" 'BG_DLS_MRD',\n",
" # Removing since REPPOINT_LATITUDE and REPPOINT_LONGITUDE seem to have same purpose\n",
" #'BG_X',\n",
" #'BG_Y',\n",
" 'BU_USE',\n",
" 'BU_N_CIVIC_ADD'\n",
" ],\n",
" 'dtype': {\n",
" \"CIVIC_NO\": \"Int32\", \n",
" \"PROV_CODE\": object,\n",
" \"BU_USE\": \"Int8\",\n",
" \"BG_DLS_LSD\": object,\n",
" \"BG_DLS_QTR\": object,\n",
" \"BG_DLS_SCTN\": object,\n",
" \"BG_DLS_TWNSHP\": object,\n",
" \"BG_DLS_RNG\": object,\n",
" \"BG_DLS_MRD\": object\n",
" }\n",
" }\n",
" nar_address_df = pd.read_csv(**params)\n",
" # Lowercase columns\n",
" nar_address_df.columns = [x.lower() for x in nar_address_df.columns]\n",
" dataframes_to_concatenate.append(nar_address_df)\n",
" \n",
" print(\"Concatenating all dataframes into one\")\n",
" nar_addresses_df = pd.concat(dataframes_to_concatenate, ignore_index=True)\n",
" \n",
" return nar_addresses_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f9bb3a4d-d913-4b9d-aa49-7ec2c4dfef60",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_10.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_11.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_12.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_13.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_24_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_24_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_24_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_24_part_4.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_4.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_5.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_46.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_47.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_48_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_48_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_59_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_59_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_60.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_61.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_62.csv\n",
"Concatenating all dataframes into one\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_10.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_11.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_12.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_13.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_943/484766080.py:74: DtypeWarning: Columns (27) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" nar_address_df = pd.read_csv(**params)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_4.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_5.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_4.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_5.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_6.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_7.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_46.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_47.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_48_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_48_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_48_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_59_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_59_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_59_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_60.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_61.csv\n",
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_62.csv\n",
"Concatenating all dataframes into one\n"
]
}
],
"source": [
"nar_locations = process_nar_locations_csvs(nar_locations_csvs, encoding)\n",
"nar_addresses = process_nar_addresses_csvs(nar_addresses_csvs, encoding)"
]
},
{
"cell_type": "markdown",
"id": "b49f6602-9bda-410f-82a2-54a5711311b0",
"metadata": {},
"source": [
"# TODO\n",
"- look into why there are locations with empty reppoint_latitude and reppoint_longitude\n",
" - There are 84,285 records that have an empty reppoint_latitude and reppoint_longitude"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f3600645-c350-473f-81ea-0bd9ebe70e37",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Combining nar_addresses and nar_locations\n"
]
},
{
"data": {
"text/plain": [
"40"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(\"Combining nar_addresses and nar_locations\")\n",
"nar_addresses_combined = duckdb.sql(\"\"\"\n",
"SELECT a.addr_guid, a.apt_no_label, a.civic_no, a.civic_no_suffix, a.official_street_name, a.mail_street_name, a.official_street_type, a.mail_street_type,\n",
" a.official_street_dir AS official_street_direction, a.mail_street_dir AS mail_street_direction, a.mail_postal_code, a.mail_mun_name AS mail_municipality_name, \n",
" a.bu_n_civic_add, a.bu_use,\n",
" a.bg_dls_lsd, a.bg_dls_qtr, a.bg_dls_sctn, a.bg_dls_rng, a.bg_dls_mrd,\n",
" b.reppoint_latitude, b.reppoint_longitude\n",
"FROM nar_addresses AS a,\n",
" nar_locations AS b\n",
"WHERE a.loc_guid = b.loc_guid AND b.reppoint_latitude IS NOT NULL\n",
"\"\"\").df()\n",
"\n",
"del nar_addresses\n",
"del nar_locations\n",
"gc.collect()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0d35eec6-8ab3-4e7a-a7fd-2915333e27f9",
"metadata": {},
"outputs": [],
"source": [
"gdf = gpd.GeoDataFrame(\n",
" nar_addresses_combined, \n",
" geometry=gpd.points_from_xy(nar_addresses_combined.reppoint_longitude,\n",
" nar_addresses_combined.reppoint_latitude),\n",
" crs=\"EPSG:4326\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "614ad773-adb3-413c-8b8f-da721afc85cd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dropping 'reppoint_latitude', 'reppoint_longitude' from geodataframe\n"
]
}
],
"source": [
"print(\"Dropping 'reppoint_latitude', 'reppoint_longitude' from geodataframe\")\n",
"gdf.drop(columns=[\"reppoint_latitude\", \"reppoint_longitude\"], \n",
" inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "cac024f8-6eb5-48ea-88ad-289edf505ba3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"del nar_addresses_combined\n",
"gc.collect()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d2f81e61-1fc7-4518-ad1f-d515e16ce22c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading geodataframe to PostgreSQL as bronze.nar_2024_12\n"
]
}
],
"source": [
"print(\"Loading geodataframe to PostgreSQL as bronze.nar_2024_12\")\n",
"gdf.to_postgis(name=\"nar_2024_12\", \n",
" schema='bronze',\n",
" con=engine,\n",
" chunksize=150000)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b7fb6976-0b95-445e-b1e8-022c39bce25b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"584"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"del(gdf)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "384cd07b-c79f-42e8-81d6-ecbe78f167b1",
"metadata": {},
"source": [
"## Link to 2021 geographies\n",
"There are 10 records that were not linked to 2021 geographies"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f7f56297-150b-4ad2-a5c2-2ee5e477f978",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"DROP TABLE IF EXISTS silver.nar_2024_12;\n",
"CREATE TABLE silver.nar_2024_12 AS\n",
"SELECT DISTINCT\n",
" b.country_dguid,\n",
" b.country_en_name,\n",
" b.country_fr_name,\n",
" b.country_en_abbreviation,\n",
" b.country_fr_abbreviation,\n",
" b.grc_dguid,\n",
" b.grc_en_name,\n",
" b.grc_fr_name,\n",
" b.pr_dguid,\n",
" b.pr_en_name,\n",
" b.pr_fr_name,\n",
" b.pr_en_abbreviation,\n",
" b.pr_fr_abbreviation,\n",
" b.pr_iso_code,\n",
" b.car_dguid,\n",
" b.car_en_name,\n",
" b.car_fr_name,\n",
" b.er_dguid,\n",
" b.er_name,\n",
" b.cd_dguid,\n",
" b.cd_name,\n",
" b.cd_type,\n",
" b.ccs_dguid,\n",
" b.ccs_name,\n",
" b.cma_dguid,\n",
" b.cma_p_dguid,\n",
" b.cma_name,\n",
" b.cma_type,\n",
" b.csd_dguid,\n",
" b.csd_name,\n",
" b.csd_type,\n",
" b.sac_type,\n",
" b.sac_code,\n",
" b.fed_dguid,\n",
" b.fed_name,\n",
" b.fed_en_name,\n",
" b.fed_fr_name,\n",
" b.ct_dguid,\n",
" b.ada_dguid,\n",
" b.da_dguid,\n",
" b.db_dguid,\n",
" a.addr_guid,\n",
" a.apt_no_label,\n",
" a.civic_no,\n",
" a.civic_no_suffix,\n",
" a.official_street_name, \n",
" a.mail_street_name, \n",
" a.official_street_type,\n",
" a.mail_street_type,\n",
" a.official_street_direction,\n",
" a.mail_street_direction,\n",
" a.mail_postal_code,\n",
" a.mail_municipality_name,\n",
" a.bu_n_civic_add,\n",
" a.bu_use,\n",
" a.bg_dls_lsd,\n",
" a.bg_dls_qtr,\n",
" a.bg_dls_sctn,\n",
" a.bg_dls_rng,\n",
" a.bg_dls_mrd,\n",
" a.geometry AS geom\n",
"FROM bronze.nar_2024_12 AS a,\n",
" silver.db_2021_digital AS b\n",
"WHERE ST_Intersects(a.geometry, b.geom);\n",
"\n",
"-- Create spatial index\n",
"CREATE INDEX nar_2024_12_geom_idx ON silver.nar_2024_12 USING gist (geom) WITH (\n",
" fillfactor = 100\n",
");\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ae7babb4-a3e6-4c0b-93a6-3b52a4f89a1b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<sqlalchemy.engine.cursor.CursorResult at 0x7f38310c66d0>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with engine.connect() as conn:\n",
" conn.execute(text(sql))\n",
" conn.commit()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}