mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Process 2024-12 national address register. Still need to make some improvements
This commit is contained in:
@@ -47,4 +47,9 @@ census_of_population/process.sh
|
|||||||
## 7.1 Download Census of Population ##
|
## 7.1 Download Census of Population ##
|
||||||
census_of_agriculture/download.sh
|
census_of_agriculture/download.sh
|
||||||
## 7.2 Process Census of Agriculture ##
|
## 7.2 Process Census of Agriculture ##
|
||||||
census_of_agriculture/process.sh
|
census_of_agriculture/process.sh
|
||||||
|
#### 8.0 National Address Register ####
|
||||||
|
## 8.1 Download National Address Register ##
|
||||||
|
national_address_register/download.sh
|
||||||
|
## 8.2 Load National Address Register ##
|
||||||
|
national_address_register/process.sh
|
||||||
|
|||||||
@@ -2,8 +2,16 @@
|
|||||||
if [ ! -d "${DATA_FOLDER}/national_address_register" ]
|
if [ ! -d "${DATA_FOLDER}/national_address_register" ]
|
||||||
then
|
then
|
||||||
echo "Making directory ${DATA_FOLDER}/national_address_register/"
|
echo "Making directory ${DATA_FOLDER}/national_address_register/"
|
||||||
mkdir -p ${DATA_FOLDER}/national_address_register/{input,extracted,output,scratch}
|
mkdir -p ${DATA_FOLDER}/national_address_register/{input,extracted,output}/{2024-12,2024-06,2023,2022}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Downloading national address register files"
|
INPUT_FOLDER="${DATA_FOLDER}/national_address_register/input"
|
||||||
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files.txt" --dir=$DATA_FOLDER/national_address_register/input --auto-file-renaming=false
|
|
||||||
|
echo "Downloading 2024-12 vintage of national address register files"
|
||||||
|
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files_2024_12.txt" --dir=${INPUT_FOLDER}/2024-12 --auto-file-renaming=false
|
||||||
|
echo "Downloading 2024-06 vintage of national address register files"
|
||||||
|
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files_2024_06.txt" --dir=${INPUT_FOLDER}/2024-06 --auto-file-renaming=false
|
||||||
|
echo "Downloading 2023 vintage of national address register files"
|
||||||
|
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files_2023.txt" --dir=${INPUT_FOLDER}/2023 --auto-file-renaming=false
|
||||||
|
echo "Downloading 2022 vintage of national address register files"
|
||||||
|
aria2c -x16 -i "${SCRIPT_DIR}/national_address_register/national_address_register_files_2022.txt" --dir=${INPUT_FOLDER}/2022 --auto-file-renaming=false
|
||||||
@@ -0,0 +1,155 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "b6e053ec",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"DATA_FOLDER=/data\n",
|
||||||
|
"\n",
|
||||||
|
"source ../.env"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "12eca225-3d05-4bb7-95fa-7b9df694f53d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# 1. Export National Address Register File (2024-12 vintage)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "4c5bb532",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Exporting silver.nar_2024_12 table to /data/national_address_register/output/2024-12/nar_2024_12.parquet\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"output_folder=\"${DATA_FOLDER}/national_address_register/output/2024-12\"\n",
|
||||||
|
"output_file=\"${output_folder}/nar_2024_12.parquet\"\n",
|
||||||
|
"echo \"Exporting silver.nar_2024_12 table to ${output_file}\" \n",
|
||||||
|
"ogr2ogr \\\n",
|
||||||
|
" -lco COMPRESSION=\"ZSTD\" \\\n",
|
||||||
|
" -lco CREATOR=\"www.dataforcanada.org\" \\\n",
|
||||||
|
" -lco WRITE_COVERING_BBOX=\"YES\" \\\n",
|
||||||
|
" -lco SORT_BY_BBOX=\"YES\" \\\n",
|
||||||
|
" -f Parquet \\\n",
|
||||||
|
" ${output_file} \\\n",
|
||||||
|
" \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n",
|
||||||
|
" \"silver.nar_2024_12\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "245cd8fb-da36-4476-8e5c-4d3665e901d7",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# 2. Export National Address Register File (2024-06 vintage)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f3e2a5b5-9b31-4f82-88da-414bf23ba515",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"output_folder=\"${DATA_FOLDER}/national_address_register/output/2024-06\"\n",
|
||||||
|
"output_file=\"${output_folder}/nar_2024_06.parquet\"\n",
|
||||||
|
"echo \"Exporting silver.nar_2024_06 table to ${output_file}\" \n",
|
||||||
|
"ogr2ogr \\\n",
|
||||||
|
" -lco COMPRESSION=\"ZSTD\" \\\n",
|
||||||
|
" -lco CREATOR=\"www.dataforcanada.org\" \\\n",
|
||||||
|
" -lco WRITE_COVERING_BBOX=\"YES\" \\\n",
|
||||||
|
" -lco SORT_BY_BBOX=\"YES\" \\\n",
|
||||||
|
" -f Parquet \\\n",
|
||||||
|
" ${output_file} \\\n",
|
||||||
|
" \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n",
|
||||||
|
" \"silver.nar_2024_06\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "56ef7755-06ef-473c-8a1f-0129f6b1dc28",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# 3. Export National Address Register File (2023 vintage)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "97df2904-c18d-4132-bc28-f0f454c48c87",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"output_folder=\"${DATA_FOLDER}/national_address_register/output/2023\"\n",
|
||||||
|
"output_file=\"${output_folder}/nar_2023.parquet\"\n",
|
||||||
|
"echo \"Exporting silver.nar_2023 table to ${output_file}\"\n",
|
||||||
|
"ogr2ogr \\\n",
|
||||||
|
" -lco COMPRESSION=\"ZSTD\" \\\n",
|
||||||
|
" -lco CREATOR=\"www.dataforcanada.org\" \\\n",
|
||||||
|
" -lco WRITE_COVERING_BBOX=\"YES\" \\\n",
|
||||||
|
" -lco SORT_BY_BBOX=\"YES\" \\\n",
|
||||||
|
" -f Parquet \\\n",
|
||||||
|
" ${output_file} \\\n",
|
||||||
|
" \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n",
|
||||||
|
" \"silver.nar_2023\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f1bfd838-19aa-4de7-9ca1-61b1215d7865",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# 4. Export National Address Register File (2022 vintage)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "d2f1210f-0d9c-45de-a019-a438ff9dfa4a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"output_folder=\"${DATA_FOLDER}/national_address_register/output/2022\"\n",
|
||||||
|
"output_file=\"${output_folder}/nar_2022.parquet\"\n",
|
||||||
|
"echo \"Exporting silver.nar_2022 table to ${output_file}\"\n",
|
||||||
|
"ogr2ogr \\\n",
|
||||||
|
" -lco COMPRESSION=\"ZSTD\" \\\n",
|
||||||
|
" -lco CREATOR=\"www.dataforcanada.org\" \\\n",
|
||||||
|
" -lco WRITE_COVERING_BBOX=\"YES\" \\\n",
|
||||||
|
" -lco SORT_BY_BBOX=\"YES\" \\\n",
|
||||||
|
" -f Parquet \\\n",
|
||||||
|
" ${output_file} \\\n",
|
||||||
|
" \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n",
|
||||||
|
" \"silver.nar_2022\""
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Bash",
|
||||||
|
"language": "bash",
|
||||||
|
"name": "bash"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": "shell",
|
||||||
|
"file_extension": ".sh",
|
||||||
|
"mimetype": "text/x-sh",
|
||||||
|
"name": "bash"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
@@ -1,167 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
#######################################
# Import a vector dataset into PostGIS with ogr2ogr, overwriting the target
# table and reprojecting to EPSG:3857.
# Globals:   POSTGRES_DB, POSTGRES_USER, POSTGRES_PASSWORD (read)
# Arguments: $1 - path of the dataset to import; a path ending in ".zip" is
#                 read through GDAL's /vsizip/ virtual file system
#            $2 - destination table name (passed to -nln)
#            $3… - extra ogr2ogr options, appended before the source path
# Outputs:   progress message on stdout, plus whatever ogr2ogr prints
# Returns:   exit status of ogr2ogr
#######################################
import_to_postgis() {
  local filepath=$1
  local table_name=$2
  # Extra options are kept as one space-joined string and word-split on
  # purpose below, so callers may pass them either as separate arguments or
  # as a single "-opt value" string.
  local extra_parameters="${*:3}"

  # Virtual file system
  if [[ "${filepath}" == *.zip ]]; then
    filepath="/vsizip/${filepath}"
  fi

  echo "Importing ${filepath}"
  # shellcheck disable=SC2086 — extra_parameters is intentionally word-split
  ogr2ogr \
    --config PG_USE_COPY YES \
    -overwrite \
    -f "PostgreSQL" \
    "PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432" \
    -lco GEOMETRY_NAME=geom \
    -progress \
    -gt 500000 \
    -t_srs EPSG:3857 \
    -nln "${table_name}" \
    ${extra_parameters} \
    "${filepath}"
}
|
|
||||||
|
|
||||||
#######################################
# Append the data rows (everything after the first line) of every *.csv in a
# directory to a single output file.
# Arguments: $1 - directory containing the CSV files
#            $2 - file the rows are appended to
# Outputs:   one progress line per CSV on stdout
#######################################
concatenate_csvs() {
  local input_directory=$1
  local output_file=$2
  local address_file

  # Glob directly instead of parsing `ls`, so names with whitespace survive.
  for address_file in "${input_directory}"/*.csv; do
    # An unmatched glob stays literal; skip it so an empty directory is a no-op.
    [[ -e "${address_file}" ]] || continue
    echo "Processing ${address_file}. Adding to ${output_file}"
    # tail -n +2 drops each file's own header line.
    tail -n +2 -- "${address_file}" >> "${output_file}"
  done
}
|
|
||||||
|
|
||||||
# Working directories for the National Address Register, all rooted at
# DATA_FOLDER (not defined in this script; presumably supplied by the caller's
# environment — confirm).
INPUT_FOLDER="${DATA_FOLDER}/national_address_register/input"
|
|
||||||
EXTRACTED_FOLDER="${DATA_FOLDER}/national_address_register/extracted"
|
|
||||||
SCRATCH_FOLDER="${DATA_FOLDER}/national_address_register/scratch"
|
|
||||||
|
|
||||||
#######################################
# Extract the 202412 archive, build one combined Addresses CSV and one
# combined Locations CSV in the scratch folder, then hand both to process.py.
# Globals:   INPUT_FOLDER, EXTRACTED_FOLDER, SCRATCH_FOLDER (read)
# Outputs:   progress messages on stdout
#######################################
import_202412() {
  local archive="${INPUT_FOLDER}/202412.zip"
  local extracted="${EXTRACTED_FOLDER}/202412"
  local addresses_csv="${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv"
  local locations_csv="${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv"

  # Extract files (-n: never overwrite files that are already extracted)
  echo "Extracting ${archive}"
  unzip -q -n "${archive}" -d "${extracted}"

  if [[ ! -f "${addresses_csv}" ]]; then
    echo "Adding header file to ${addresses_csv}"
    echo "LOC_GUID,ADDR_GUID,APT_NO_LABEL,CIVIC_NO,CIVIC_NO_SUFFIX,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STREET_DIR,MAIL_MUN_NAME,MAIL_PROV_ABVN,MAIL_POSTAL_CODE,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > "${addresses_csv}"
  fi

  # `head` prints at most 10 lines, so anything other than 10 means the file
  # holds fewer than 10 lines and the data rows still have to be appended.
  if (( $(head "${addresses_csv}" | wc -l) != 10 )); then
    echo "Appending Addresses CSVs to ${addresses_csv}"
    concatenate_csvs "${extracted}/Addresses" "${addresses_csv}"
  fi

  if [[ ! -f "${locations_csv}" ]]; then
    echo "Adding header file to ${locations_csv}"
    echo "LOC_GUID,CSD_CODE,FED_CODE,FED_ENG_NAME,FED_FRE_NAME,ER_CODE,ER_ENG_NAME,ER_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > "${locations_csv}"
  fi

  if (( $(head "${locations_csv}" | wc -l) != 10 )); then
    echo "Appending Locations CSVs to ${locations_csv}"
    concatenate_csvs "${extracted}/Locations" "${locations_csv}"
  fi

  python national_address_register/process.py "${addresses_csv}" "${locations_csv}" 202412 utf-8
}
|
|
||||||
|
|
||||||
#######################################
# Extract the June 2024 archive (2024.zip), build one combined Addresses CSV
# and one combined Locations CSV in the scratch folder, then hand both to
# process.py.
# Globals:   INPUT_FOLDER, EXTRACTED_FOLDER, SCRATCH_FOLDER (read)
# Outputs:   progress messages on stdout
#######################################
import_202406() {
  local archive="${INPUT_FOLDER}/2024.zip"
  local extracted="${EXTRACTED_FOLDER}/202406"
  local addresses_csv="${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv"
  local locations_csv="${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv"

  # -n: never overwrite files that are already extracted
  echo "Extracting ${archive}"
  unzip -q -n "${archive}" -d "${extracted}"

  if [[ ! -f "${addresses_csv}" ]]; then
    echo "Adding header file to ${addresses_csv}"
    echo "LOC_GUID,ADDR_GUID,APT_NO_LABEL,CIVIC_NO,CIVIC_NO_SUFFIX,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STREET_DIR,MAIL_MUN_NAME,MAIL_PROV_ABVN,MAIL_POSTAL_CODE,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > "${addresses_csv}"
  fi

  # `head` prints at most 10 lines, so anything other than 10 means the file
  # holds fewer than 10 lines and the data rows still have to be appended.
  if (( $(head "${addresses_csv}" | wc -l) != 10 )); then
    echo "Appending Addresses CSVs to ${addresses_csv}"
    concatenate_csvs "${extracted}/Addresses" "${addresses_csv}"
  fi

  if [[ ! -f "${locations_csv}" ]]; then
    echo "Adding header file to ${locations_csv}"
    echo "LOC_GUID,CSD_CODE,FED_CODE,FED_ENG_NAME,FED_FRE_NAME,ER_CODE,ER_ENG_NAME,ER_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > "${locations_csv}"
  fi

  if (( $(head "${locations_csv}" | wc -l) != 10 )); then
    echo "Appending Locations CSVs to ${locations_csv}"
    concatenate_csvs "${extracted}/Locations" "${locations_csv}"
  fi

  python national_address_register/process.py "${addresses_csv}" "${locations_csv}" 202406 utf-8
}
|
|
||||||
|
|
||||||
#######################################
# Extract the 2023 archive, build one combined Addresses CSV and one combined
# Locations CSV in the scratch folder, then hand both to process.py.
# Globals:   INPUT_FOLDER, EXTRACTED_FOLDER, SCRATCH_FOLDER (read)
# Outputs:   progress messages on stdout
#######################################
import_2023() {
  local archive="${INPUT_FOLDER}/2023.zip"
  local extracted="${EXTRACTED_FOLDER}/2023"
  local addresses_csv="${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv"
  local locations_csv="${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv"

  # -n: never overwrite files that are already extracted
  echo "Extracting ${archive}"
  unzip -q -n "${archive}" -d "${extracted}"

  if [[ ! -f "${addresses_csv}" ]]; then
    echo "Adding header file to ${addresses_csv}"
    # NOTE(review): this header spells the column "MAIL_STEET_DIR" while the
    # other vintages use "MAIL_STREET_DIR" — confirm whether that is intended.
    echo "LOC_GUID,ADDR_GUID,APT_NO_LABEL,CIVIC_NO,CIVIC_NO_SUFFIX,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STEET_DIR,MAIL_MUN_NAME,MAIL_PROV_ABVN,MAIL_POSTAL_CODE,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > "${addresses_csv}"
  fi

  # `head` prints at most 10 lines, so anything other than 10 means the file
  # holds fewer than 10 lines and the data rows still have to be appended.
  if (( $(head "${addresses_csv}" | wc -l) != 10 )); then
    echo "Appending Addresses CSVs to ${addresses_csv}"
    concatenate_csvs "${extracted}/Addresses" "${addresses_csv}"
  fi

  if [[ ! -f "${locations_csv}" ]]; then
    echo "Adding header file to ${locations_csv}"
    echo "LOC_GUID,CSD_CODE,FED_2021_CODE,FED_2021_ENG_NAME,FED_2021_FRE_NAME,ER_2021_CODE,ER_2021_ENG_NAME,ER_2021_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > "${locations_csv}"
  fi

  if (( $(head "${locations_csv}" | wc -l) != 10 )); then
    echo "Appending Locations CSVs to ${locations_csv}"
    # Locations must come from this vintage's own extraction directory.
    concatenate_csvs "${extracted}/Locations" "${locations_csv}"
  fi

  python national_address_register/process.py "${addresses_csv}" "${locations_csv}" 2023 latin-1
}
|
|
||||||
|
|
||||||
#######################################
# Extract the 2022 archive, build one combined Addresses CSV and one combined
# Locations CSV in the scratch folder, then hand both to process.py.
# Globals:   INPUT_FOLDER, EXTRACTED_FOLDER, SCRATCH_FOLDER (read)
# Outputs:   progress messages on stdout
#######################################
import_2022() {
  local archive="${INPUT_FOLDER}/2022.zip"
  local extracted="${EXTRACTED_FOLDER}/2022"
  local addresses_csv="${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv"
  local locations_csv="${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv"

  # -n: never overwrite files that are already extracted
  echo "Extracting ${archive}"
  unzip -q -n "${archive}" -d "${extracted}"

  if [[ ! -f "${addresses_csv}" ]]; then
    echo "Adding header file to ${addresses_csv}"
    # Column order differs from the later vintages (CIVIC_NO before
    # APT_NO_LABEL, MAIL_POSTAL_CODE before MAIL_PROV_ABVN).
    echo "LOC_GUID,ADDR_GUID,CIVIC_NO,CIVIC_NO_SUFFIX,APT_NO_LABEL,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STREET_DIR,MAIL_MUN_NAME,MAIL_POSTAL_CODE,MAIL_PROV_ABVN,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > "${addresses_csv}"
  fi

  # `head` prints at most 10 lines, so anything other than 10 means the file
  # holds fewer than 10 lines and the data rows still have to be appended.
  if (( $(head "${addresses_csv}" | wc -l) != 10 )); then
    echo "Appending Addresses CSVs to ${addresses_csv}"
    concatenate_csvs "${extracted}/Addresses" "${addresses_csv}"
  fi

  if [[ ! -f "${locations_csv}" ]]; then
    echo "Adding header file to ${locations_csv}"
    echo "LOC_GUID,CSD_CODE,FED_2016_CODE,FED_2016_ENG_NAME,FED_2016_FRE_NAME,ER_2016_CODE,ER_2016_ENG_NAME,ER_2016_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > "${locations_csv}"
  fi

  if (( $(head "${locations_csv}" | wc -l) != 10 )); then
    echo "Appending Locations CSVs to ${locations_csv}"
    concatenate_csvs "${extracted}/Locations" "${locations_csv}"
  fi

  python national_address_register/process.py "${addresses_csv}" "${locations_csv}" 2022 latin-1
}
|
|
||||||
|
|
||||||
# Run the import for every vintage, newest first.
import_202412
|
|
||||||
import_202406
|
|
||||||
import_2023
|
|
||||||
import_2022
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
# December 2024
|
|
||||||
https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/202412.zip
|
|
||||||
# June 2024
|
|
||||||
https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/2024.zip
|
|
||||||
# 2023
|
|
||||||
https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/2023.zip
|
|
||||||
# 2022
|
|
||||||
https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/2022.zip
|
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
# 2022
|
||||||
|
https://data.dataforcanada.org/archive/statistics_canada/national_address_register/2022/2022.zip
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
# 2023
|
||||||
|
https://data.dataforcanada.org/archive/statistics_canada/national_address_register/2023/2023.zip
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
# June 2024
|
||||||
|
https://data.dataforcanada.org/archive/statistics_canada/national_address_register/2024-06/2024.zip
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
# December 2024
|
||||||
|
https://data.dataforcanada.org/archive/statistics_canada/national_address_register/2024-12/202412.zip
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
#!/usr/bin/env python
# coding: utf-8
"""Join one vintage's address and location CSVs and load the result into PostGIS.

Command-line arguments (positional):
    1. path of the combined addresses CSV
    2. path of the combined locations CSV
    3. vintage label, used as the suffix of the destination table name
    4. text encoding of both CSV files

Connection settings are read from the POSTGRES_DB, WAREHOUSE_PG_HOST,
POSTGRES_USER and POSTGRES_PASSWORD environment variables.
"""
import os
import sys
from urllib.parse import quote

import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine

# Fail with a usage message instead of a bare IndexError when arguments are missing.
if len(sys.argv) < 5:
    sys.exit(f"Usage: {sys.argv[0]} ADDRESSES_CSV LOCATIONS_CSV VINTAGE ENCODING")

statcan_nar_addresses_csv = sys.argv[1]
statcan_nar_locations_csv = sys.argv[2]
vintage = sys.argv[3]
encoding = sys.argv[4]

print(f"Reading {statcan_nar_addresses_csv}")
statcan_nar_addresses = pd.read_csv(filepath_or_buffer=statcan_nar_addresses_csv,
                                    dtype={
                                        # Nullable integer dtypes so blank cells stay missing
                                        "CIVIC_NO": "Int32",
                                        "PROV_CODE": object,
                                        "BU_USE": "Int8",
                                        "BG_DLS_LSD": object,
                                        "BG_DLS_QTR": object,
                                        "BG_DLS_SCTN": object,
                                        "BG_DLS_TWNSHP": object,
                                        "BG_DLS_RNG": object,
                                        "BG_DLS_MRD": object
                                    },
                                    encoding=encoding)

print(f"Reading {statcan_nar_locations_csv}")
# Only the join key and the coordinates are needed from the locations file.
statcan_nar_locations = pd.read_csv(filepath_or_buffer=statcan_nar_locations_csv,
                                    usecols=["LOC_GUID",
                                             "REPPOINT_LATITUDE",
                                             "REPPOINT_LONGITUDE"],
                                    encoding=encoding)

print(f"Combining {statcan_nar_addresses_csv} and {statcan_nar_locations_csv}")
# Inner join: addresses without a matching location row are dropped.
statcan_nar_addresses_combined = pd.merge(statcan_nar_addresses,
                                          statcan_nar_locations,
                                          on="LOC_GUID", how="inner")

# Release the two source frames before building the geodataframe.
del statcan_nar_addresses
del statcan_nar_locations

DATABASE = os.environ.get("POSTGRES_DB")
HOST = os.environ.get("WAREHOUSE_PG_HOST")
USER = os.environ.get("POSTGRES_USER")
PASSWORD = os.environ.get("POSTGRES_PASSWORD")

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# user name or password cannot corrupt the connection URL.
engine = create_engine(
    f"postgresql://{quote(str(USER), safe='')}:{quote(str(PASSWORD), safe='')}"
    f"@{HOST}:5432/{DATABASE}"
)

print("Creating geodataframe from combined address file")
gdf = gpd.GeoDataFrame(
    statcan_nar_addresses_combined,
    geometry=gpd.points_from_xy(statcan_nar_addresses_combined.REPPOINT_LONGITUDE,
                                statcan_nar_addresses_combined.REPPOINT_LATITUDE),
    crs="EPSG:4326"
)

print("Dropping 'REPPOINT_LATITUDE', 'REPPOINT_LONGITUDE' from geodataframe")
gdf.drop(columns=["REPPOINT_LATITUDE", "REPPOINT_LONGITUDE"],
         inplace=True)

print("Converting geodataframe to EPSG:3857")
gdf.to_crs(3857, inplace=True)
print(f"Loading geodatframe to PostgreSQL as statcan_nar_addresses_combined_{vintage}")
gdf.to_postgis(name=f"statcan_nar_addresses_combined_{vintage}",
               con=engine,
               chunksize=150000)
|
|
||||||
Executable
+38
@@ -0,0 +1,38 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Download and extraction roots for the National Address Register, both under
# DATA_FOLDER (not defined in this script; presumably supplied by the caller's
# environment — confirm).
INPUT_FOLDER="${DATA_FOLDER}/national_address_register/input"
|
||||||
|
EXTRACTED_FOLDER="${DATA_FOLDER}/national_address_register/extracted"
|
||||||
|
|
||||||
|
#######################################
# Process the 2024-12 vintage: extract the archive, then execute the
# processing notebook.
# Globals:   INPUT_FOLDER, EXTRACTED_FOLDER (read)
# Outputs:   progress message on stdout
#######################################
process_202412() {
  local archive="${INPUT_FOLDER}/2024-12/202412.zip"
  local destination="${EXTRACTED_FOLDER}/2024-12"

  # Extract files (-n: never overwrite files that are already extracted)
  echo "Extracting ${archive} to ${destination}"
  unzip -q -n "${archive}" -d "${destination}"
  # NOTE(review): the notebook path is relative to the current working
  # directory — confirm the script is always started from the notebook's folder.
  jupyter execute process_2024_12.ipynb
}
|
||||||
|
|
||||||
|
#######################################
# Process the 2024-06 vintage: extract the archive into its vintage folder.
# The archive is read from the per-vintage input folder that the download
# step now uses (--dir=${INPUT_FOLDER}/2024-06).
# Globals:   INPUT_FOLDER, EXTRACTED_FOLDER (read)
# Outputs:   progress message on stdout
#######################################
process_202406() {
  local archive="${INPUT_FOLDER}/2024-06/2024.zip"
  local destination="${EXTRACTED_FOLDER}/2024-06"

  # -n: never overwrite files that are already extracted
  echo "Extracting ${archive} to ${destination}"
  unzip -q -n "${archive}" -d "${destination}"
  # Encoding is utf-8
}
|
||||||
|
|
||||||
|
#######################################
# Process the 2023 vintage: extract the archive into its vintage folder.
# The archive is read from the per-vintage input folder that the download
# step now uses (--dir=${INPUT_FOLDER}/2023).
# Globals:   INPUT_FOLDER, EXTRACTED_FOLDER (read)
# Outputs:   progress message on stdout
#######################################
process_2023() {
  local archive="${INPUT_FOLDER}/2023/2023.zip"
  local destination="${EXTRACTED_FOLDER}/2023"

  # -n: never overwrite files that are already extracted
  echo "Extracting ${archive} to ${destination}"
  unzip -q -n "${archive}" -d "${destination}"
  # Encoding is latin-1
}
|
||||||
|
|
||||||
|
#######################################
# Process the 2022 vintage: extract the archive into its vintage folder.
# The archive is read from the per-vintage input folder that the download
# step now uses (--dir=${INPUT_FOLDER}/2022).
# Globals:   INPUT_FOLDER, EXTRACTED_FOLDER (read)
# Outputs:   progress message on stdout
#######################################
process_2022() {
  local archive="${INPUT_FOLDER}/2022/2022.zip"
  local destination="${EXTRACTED_FOLDER}/2022"

  # -n: never overwrite files that are already extracted
  echo "Extracting ${archive} to ${destination}"
  unzip -q -n "${archive}" -d "${destination}"
  # Encoding is latin-1
}
|
||||||
|
|
||||||
|
# Only the 2024-12 vintage is processed; the calls for the other vintages are
# commented out.
process_202412
|
||||||
|
#process_202406
|
||||||
|
#process_2023
|
||||||
|
#process_2022
|
||||||
@@ -0,0 +1,547 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "11c50f98-ddeb-4af0-b69a-743f915fa904",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Experimenting with processing this file. Still need to figure out how to structure this file"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "95feb0e4-b3b6-4235-8c8d-bc3652e82b3a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#!/usr/bin/env python\n",
|
||||||
|
"# coding: utf-8\n",
|
||||||
|
"import gc\n",
|
||||||
|
"import glob\n",
|
||||||
|
"import os\n",
|
||||||
|
"import sys \n",
|
||||||
|
"\n",
|
||||||
|
"import buckaroo\n",
|
||||||
|
"import duckdb\n",
|
||||||
|
"from IPython.core.interactiveshell import InteractiveShell \n",
|
||||||
|
"import geopandas as gpd\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sqlalchemy import create_engine\n",
|
||||||
|
"from sqlalchemy import text\n",
|
||||||
|
"\n",
|
||||||
|
"# Enable multiple outputs per cell\n",
|
||||||
|
"InteractiveShell.ast_node_interactivity = \"all\"\n",
|
||||||
|
"# Show all columns\n",
|
||||||
|
"pd.set_option('display.max_columns', None)\n",
|
||||||
|
"\n",
|
||||||
|
"DATABASE = os.environ.get(\"POSTGRES_DB\")\n",
|
||||||
|
"USER = os.environ.get(\"POSTGRES_USER\")\n",
|
||||||
|
"PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n",
|
||||||
|
"\n",
|
||||||
|
"engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "e10bc182-d0d6-4aa3-99b3-2fc396f9ce34",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"input_folder = '/data/national_address_register/extracted'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7f170d27-14ca-488a-811d-1c9836264bb6",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# 1. Process 2024-12 vintage"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "b27e0edc-59ca-4a63-8bff-1d85909aa174",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"nar_addresses_csvs = glob.glob(f'{input_folder}/2024-12/Addresses/*.csv')\n",
|
||||||
|
"nar_locations_csvs = glob.glob(f'{input_folder}/2024-12/Locations/*.csv')\n",
|
||||||
|
"encoding = 'utf-8'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "3fcca833-8547-43b1-b5c7-e5f463ef0bbc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def process_nar_locations_csvs(csvs_to_process, encoding):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" 1. Reads subset of fields for National Address Register locations\n",
|
||||||
|
" 2. Appends all of the processed CSVs as one dataframe\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" dataframes_to_concatenate = []\n",
|
||||||
|
" for filename in csvs_to_process:\n",
|
||||||
|
" print(f\"Processing {filename}\")\n",
|
||||||
|
" params = {\n",
|
||||||
|
" 'filepath_or_buffer': filename,\n",
|
||||||
|
" 'encoding': encoding,\n",
|
||||||
|
" 'usecols': ['LOC_GUID', \n",
|
||||||
|
" 'REPPOINT_LATITUDE', \n",
|
||||||
|
" 'REPPOINT_LONGITUDE'\n",
|
||||||
|
" ]\n",
|
||||||
|
" }\n",
|
||||||
|
" nar_location_df = pd.read_csv(**params)\n",
|
||||||
|
" # Lowercase columns\n",
|
||||||
|
" nar_location_df.columns = [x.lower() for x in nar_location_df.columns]\n",
|
||||||
|
" dataframes_to_concatenate.append(nar_location_df)\n",
|
||||||
|
" \n",
|
||||||
|
" print(\"Concatenating all dataframes into one\")\n",
|
||||||
|
" nar_locations_df = pd.concat(dataframes_to_concatenate)\n",
|
||||||
|
" \n",
|
||||||
|
" return nar_locations_df\n",
|
||||||
|
"\n",
|
||||||
|
"def process_nar_addresses_csvs(csvs_to_process, encoding):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" 1. Reads subset of fields for National Address Register addresses\n",
|
||||||
|
" 2. Appends all of the processed CSVs as one dataframe\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" dataframes_to_concatenate = []\n",
|
||||||
|
" for filename in csvs_to_process:\n",
|
||||||
|
" print(f\"Processing {filename}\")\n",
|
||||||
|
" params = {\n",
|
||||||
|
" 'filepath_or_buffer': filename,\n",
|
||||||
|
" 'encoding': encoding,\n",
|
||||||
|
" 'usecols': ['LOC_GUID', \n",
|
||||||
|
" 'ADDR_GUID', \n",
|
||||||
|
" 'APT_NO_LABEL',\n",
|
||||||
|
" 'CIVIC_NO',\n",
|
||||||
|
" 'CIVIC_NO_SUFFIX',\n",
|
||||||
|
" 'OFFICIAL_STREET_NAME',\n",
|
||||||
|
" 'OFFICIAL_STREET_TYPE',\n",
|
||||||
|
" 'OFFICIAL_STREET_DIR',\n",
|
||||||
|
" 'MAIL_STREET_NAME',\n",
|
||||||
|
" 'MAIL_STREET_TYPE',\n",
|
||||||
|
" 'MAIL_STREET_DIR',\n",
|
||||||
|
" 'MAIL_MUN_NAME',\n",
|
||||||
|
" 'MAIL_POSTAL_CODE',\n",
|
||||||
|
" 'BG_DLS_LSD',\n",
|
||||||
|
" 'BG_DLS_QTR',\n",
|
||||||
|
" 'BG_DLS_SCTN',\n",
|
||||||
|
" 'BG_DLS_RNG',\n",
|
||||||
|
" 'BG_DLS_MRD',\n",
|
||||||
|
" # Removing since REPPOINT_LATITUDE and REPPOINT_LONGITUDE seem to have same purpose\n",
|
||||||
|
" #'BG_X',\n",
|
||||||
|
" #'BG_Y',\n",
|
||||||
|
" 'BU_USE',\n",
|
||||||
|
" 'BU_N_CIVIC_ADD'\n",
|
||||||
|
" ],\n",
|
||||||
|
" 'dtype': {\n",
|
||||||
|
" \"CIVIC_NO\": \"Int32\", \n",
|
||||||
|
" \"PROV_CODE\": object,\n",
|
||||||
|
" \"BU_USE\": \"Int8\",\n",
|
||||||
|
" \"BG_DLS_LSD\": object,\n",
|
||||||
|
" \"BG_DLS_QTR\": object,\n",
|
||||||
|
" \"BG_DLS_SCTN\": object,\n",
|
||||||
|
" \"BG_DLS_TWNSHP\": object,\n",
|
||||||
|
" \"BG_DLS_RNG\": object,\n",
|
||||||
|
" \"BG_DLS_MRD\": object\n",
|
||||||
|
" }\n",
|
||||||
|
" }\n",
|
||||||
|
" nar_address_df = pd.read_csv(**params)\n",
|
||||||
|
" # Lowercase columns\n",
|
||||||
|
" nar_address_df.columns = [x.lower() for x in nar_address_df.columns]\n",
|
||||||
|
" dataframes_to_concatenate.append(nar_address_df)\n",
|
||||||
|
" \n",
|
||||||
|
" print(\"Concatenating all dataframes into one\")\n",
|
||||||
|
" nar_addresses_df = pd.concat(dataframes_to_concatenate, ignore_index=True)\n",
|
||||||
|
" \n",
|
||||||
|
" return nar_addresses_df"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "f9bb3a4d-d913-4b9d-aa49-7ec2c4dfef60",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_10.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_11.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_12.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_13.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_24_part_1.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_24_part_2.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_24_part_3.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_24_part_4.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_1.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_2.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_3.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_4.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_35_part_5.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_46.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_47.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_48_part_1.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_48_part_2.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_59_part_1.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_59_part_2.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_60.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_61.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Locations/Location_62.csv\n",
|
||||||
|
"Concatenating all dataframes into one\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_10.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_11.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_12.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_13.csv\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/tmp/ipykernel_943/484766080.py:74: DtypeWarning: Columns (27) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||||
|
" nar_address_df = pd.read_csv(**params)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_1.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_2.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_3.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_4.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_24_part_5.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_1.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_2.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_3.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_4.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_5.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_6.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_35_part_7.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_46.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_47.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_48_part_1.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_48_part_2.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_48_part_3.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_59_part_1.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_59_part_2.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_59_part_3.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_60.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_61.csv\n",
|
||||||
|
"Processing /data/national_address_register/extracted/2024-12/Addresses/Address_62.csv\n",
|
||||||
|
"Concatenating all dataframes into one\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nar_locations = process_nar_locations_csvs(nar_locations_csvs, encoding)\n",
|
||||||
|
"nar_addresses = process_nar_addresses_csvs(nar_addresses_csvs, encoding)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "b49f6602-9bda-410f-82a2-54a5711311b0",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# TODO\n",
|
||||||
|
"- Look into why some locations have an empty reppoint_latitude and reppoint_longitude\n",
|
||||||
|
"  - There are 84,285 records that have an empty reppoint_latitude and reppoint_longitude; note that the join below keeps only rows where `b.reppoint_latitude IS NOT NULL`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "f3600645-c350-473f-81ea-0bd9ebe70e37",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Combining nar_addresses and nar_locations\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"40"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\"Combining nar_addresses and nar_locations\")\n",
|
||||||
|
"nar_addresses_combined = duckdb.sql(\"\"\"\n",
|
||||||
|
"SELECT a.addr_guid, a.apt_no_label, a.civic_no, a.civic_no_suffix, a.official_street_name, a.mail_street_name, a.official_street_type, a.mail_street_type,\n",
|
||||||
|
" a.official_street_dir AS official_street_direction, a.mail_street_dir AS mail_street_direction, a.mail_postal_code, a.mail_mun_name AS mail_municipality_name, \n",
|
||||||
|
" a.bu_n_civic_add, a.bu_use,\n",
|
||||||
|
" a.bg_dls_lsd, a.bg_dls_qtr, a.bg_dls_sctn, a.bg_dls_rng, a.bg_dls_mrd,\n",
|
||||||
|
" b.reppoint_latitude, b.reppoint_longitude\n",
|
||||||
|
"FROM nar_addresses AS a,\n",
|
||||||
|
" nar_locations AS b\n",
|
||||||
|
"WHERE a.loc_guid = b.loc_guid AND b.reppoint_latitude IS NOT NULL\n",
|
||||||
|
"\"\"\").df()\n",
|
||||||
|
"\n",
|
||||||
|
"del nar_addresses\n",
|
||||||
|
"del nar_locations\n",
|
||||||
|
"gc.collect()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "0d35eec6-8ab3-4e7a-a7fd-2915333e27f9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"gdf = gpd.GeoDataFrame(\n",
|
||||||
|
" nar_addresses_combined, \n",
|
||||||
|
" geometry=gpd.points_from_xy(nar_addresses_combined.reppoint_longitude,\n",
|
||||||
|
" nar_addresses_combined.reppoint_latitude),\n",
|
||||||
|
" crs=\"EPSG:4326\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "614ad773-adb3-413c-8b8f-da721afc85cd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Dropping 'reppoint_latitude', 'reppoint_longitude' from geodataframe\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\"Dropping 'reppoint_latitude', 'reppoint_longitude' from geodataframe\")\n",
|
||||||
|
"gdf.drop(columns=[\"reppoint_latitude\", \"reppoint_longitude\"], \n",
|
||||||
|
" inplace=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "cac024f8-6eb5-48ea-88ad-289edf505ba3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"del nar_addresses_combined\n",
|
||||||
|
"gc.collect()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "d2f81e61-1fc7-4518-ad1f-d515e16ce22c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Loading geodataframe to PostgreSQL as bronze.nar_2024_12\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\"Loading geodataframe to PostgreSQL as bronze.nar_2024_12\")\n",
|
||||||
|
"gdf.to_postgis(name=\"nar_2024_12\", \n",
|
||||||
|
" schema='bronze',\n",
|
||||||
|
" con=engine,\n",
|
||||||
|
" chunksize=150000)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "b7fb6976-0b95-445e-b1e8-022c39bce25b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"584"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"del(gdf)\n",
|
||||||
|
"gc.collect()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "384cd07b-c79f-42e8-81d6-ecbe78f167b1",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Link to 2021 geographies\n",
|
||||||
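|
"10 records were not linked to 2021 geographies: they do not intersect any geometry in silver.db_2021_digital, so the ST_Intersects join below leaves them out of silver.nar_2024_12"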
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "f7f56297-150b-4ad2-a5c2-2ee5e477f978",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"sql = \"\"\"\n",
|
||||||
|
"DROP TABLE IF EXISTS silver.nar_2024_12;\n",
|
||||||
|
"CREATE TABLE silver.nar_2024_12 AS\n",
|
||||||
|
"SELECT DISTINCT\n",
|
||||||
|
" b.country_dguid,\n",
|
||||||
|
" b.country_en_name,\n",
|
||||||
|
" b.country_fr_name,\n",
|
||||||
|
" b.country_en_abbreviation,\n",
|
||||||
|
" b.country_fr_abbreviation,\n",
|
||||||
|
" b.grc_dguid,\n",
|
||||||
|
" b.grc_en_name,\n",
|
||||||
|
" b.grc_fr_name,\n",
|
||||||
|
" b.pr_dguid,\n",
|
||||||
|
" b.pr_en_name,\n",
|
||||||
|
" b.pr_fr_name,\n",
|
||||||
|
" b.pr_en_abbreviation,\n",
|
||||||
|
" b.pr_fr_abbreviation,\n",
|
||||||
|
" b.pr_iso_code,\n",
|
||||||
|
" b.car_dguid,\n",
|
||||||
|
" b.car_en_name,\n",
|
||||||
|
" b.car_fr_name,\n",
|
||||||
|
" b.er_dguid,\n",
|
||||||
|
" b.er_name,\n",
|
||||||
|
" b.cd_dguid,\n",
|
||||||
|
" b.cd_name,\n",
|
||||||
|
" b.cd_type,\n",
|
||||||
|
" b.ccs_dguid,\n",
|
||||||
|
" b.ccs_name,\n",
|
||||||
|
" b.cma_dguid,\n",
|
||||||
|
" b.cma_p_dguid,\n",
|
||||||
|
" b.cma_name,\n",
|
||||||
|
" b.cma_type,\n",
|
||||||
|
" b.csd_dguid,\n",
|
||||||
|
" b.csd_name,\n",
|
||||||
|
" b.csd_type,\n",
|
||||||
|
" b.sac_type,\n",
|
||||||
|
" b.sac_code,\n",
|
||||||
|
" b.fed_dguid,\n",
|
||||||
|
" b.fed_name,\n",
|
||||||
|
" b.fed_en_name,\n",
|
||||||
|
" b.fed_fr_name,\n",
|
||||||
|
" b.ct_dguid,\n",
|
||||||
|
" b.ada_dguid,\n",
|
||||||
|
" b.da_dguid,\n",
|
||||||
|
" b.db_dguid,\n",
|
||||||
|
" a.addr_guid,\n",
|
||||||
|
" a.apt_no_label,\n",
|
||||||
|
" a.civic_no,\n",
|
||||||
|
" a.civic_no_suffix,\n",
|
||||||
|
" a.official_street_name, \n",
|
||||||
|
" a.mail_street_name, \n",
|
||||||
|
" a.official_street_type,\n",
|
||||||
|
" a.mail_street_type,\n",
|
||||||
|
" a.official_street_direction,\n",
|
||||||
|
" a.mail_street_direction,\n",
|
||||||
|
" a.mail_postal_code,\n",
|
||||||
|
" a.mail_municipality_name,\n",
|
||||||
|
" a.bu_n_civic_add,\n",
|
||||||
|
" a.bu_use,\n",
|
||||||
|
" a.bg_dls_lsd,\n",
|
||||||
|
" a.bg_dls_qtr,\n",
|
||||||
|
" a.bg_dls_sctn,\n",
|
||||||
|
" a.bg_dls_rng,\n",
|
||||||
|
" a.bg_dls_mrd,\n",
|
||||||
|
" a.geometry AS geom\n",
|
||||||
|
"FROM bronze.nar_2024_12 AS a,\n",
|
||||||
|
" silver.db_2021_digital AS b\n",
|
||||||
|
"WHERE ST_Intersects(a.geometry, b.geom);\n",
|
||||||
|
"\n",
|
||||||
|
"-- Create spatial index\n",
|
||||||
|
"CREATE INDEX nar_2024_12_geom_idx ON silver.nar_2024_12 USING gist (geom) WITH (\n",
|
||||||
|
" fillfactor = 100\n",
|
||||||
|
");\n",
|
||||||
|
"\"\"\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "ae7babb4-a3e6-4c0b-93a6-3b52a4f89a1b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<sqlalchemy.engine.cursor.CursorResult at 0x7f38310c66d0>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with engine.connect() as conn:\n",
|
||||||
|
" conn.execute(text(sql))\n",
|
||||||
|
" conn.commit()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user