Process 2024-06 national address register

This commit is contained in:
Diego Ripley
2025-06-07 15:39:28 +00:00
parent b3c5f8767f
commit 4d16ef8232
3 changed files with 559 additions and 9 deletions
+7 -8
View File
@@ -5,7 +5,6 @@ EXTRACTED_FOLDER="${DATA_FOLDER}/national_address_register/extracted"
process_202412() { process_202412() {
# Process 2024-12 vintage # Process 2024-12 vintage
# Extract files
echo "Extracting ${INPUT_FOLDER}/2024-12/202412.zip to ${EXTRACTED_FOLDER}/2024-12" echo "Extracting ${INPUT_FOLDER}/2024-12/202412.zip to ${EXTRACTED_FOLDER}/2024-12"
unzip -q -n ${INPUT_FOLDER}/2024-12/202412.zip -d ${EXTRACTED_FOLDER}/2024-12 unzip -q -n ${INPUT_FOLDER}/2024-12/202412.zip -d ${EXTRACTED_FOLDER}/2024-12
jupyter execute process_2024_12.ipynb jupyter execute process_2024_12.ipynb
@@ -13,26 +12,26 @@ process_202412() {
process_202406() { process_202406() {
# Process 2024-06 vintage # Process 2024-06 vintage
echo "Extracting ${INPUT_FOLDER}/2024.zip" echo "Extracting ${INPUT_FOLDER}/2024-06/2024.zip"
unzip -q -n ${INPUT_FOLDER}/2024.zip -d ${EXTRACTED_FOLDER}/2024-06 unzip -q -n ${INPUT_FOLDER}/2024-06/2024.zip -d ${EXTRACTED_FOLDER}/2024-06
# Encoding is utf-8 jupyter execute process_2024_12.ipynb
} }
process_2023() { process_2023() {
# Process 2023 # Process 2023
echo "Extracting ${INPUT_FOLDER}/2023.zip" echo "Extracting ${INPUT_FOLDER}/2023/2023.zip"
unzip -q -n ${INPUT_FOLDER}/2023.zip -d ${EXTRACTED_FOLDER}/2023 unzip -q -n ${INPUT_FOLDER}/2023/2023.zip -d ${EXTRACTED_FOLDER}/2023
# Encoding is latin-1 # Encoding is latin-1
} }
process_2022() { process_2022() {
# Process 2022 # Process 2022
echo "Extracting ${INPUT_FOLDER}/2022.zip" echo "Extracting ${INPUT_FOLDER}/2022/2022.zip"
unzip -q -n ${INPUT_FOLDER}/2022.zip -d ${EXTRACTED_FOLDER}/2022 unzip -q -n ${INPUT_FOLDER}/2022.zip -d ${EXTRACTED_FOLDER}/2022
# Encoding is latin-1 # Encoding is latin-1
} }
process_202412 process_202412
#process_202406 process_202406
#process_2023 #process_2023
#process_2022 #process_2022
@@ -0,0 +1,549 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "11c50f98-ddeb-4af0-b69a-743f915fa904",
"metadata": {},
"source": [
"# Experimenting with processing this file. Still need to figure out how to structure this file\n",
"In summary:\n",
"- there are some types that should be fixed. For example: `sac_type` should not be `Integer64`, `bu_use` should be `Int8`, `civic_no` should be `Int32`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "95feb0e4-b3b6-4235-8c8d-bc3652e82b3a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n"
]
}
],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"import gc\n",
"import glob\n",
"import os\n",
"import sys \n",
"\n",
"import buckaroo\n",
"import duckdb\n",
"from IPython.core.interactiveshell import InteractiveShell \n",
"import geopandas as gpd\n",
"import pandas as pd\n",
"from sqlalchemy import create_engine\n",
"from sqlalchemy import text\n",
"\n",
"# Enable multiple outputs per cell\n",
"InteractiveShell.ast_node_interactivity = \"all\"\n",
"# Show all columns\n",
"pd.set_option('display.max_columns', None)\n",
"\n",
"DATABASE = os.environ.get(\"POSTGRES_DB\")\n",
"USER = os.environ.get(\"POSTGRES_USER\")\n",
"PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n",
"\n",
"engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "e10bc182-d0d6-4aa3-99b3-2fc396f9ce34",
"metadata": {},
"outputs": [],
"source": [
"input_folder = '/data/national_address_register/extracted'"
]
},
{
"cell_type": "markdown",
"id": "7f170d27-14ca-488a-811d-1c9836264bb6",
"metadata": {},
"source": [
"# 1. Process 2024-06 vintage"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b27e0edc-59ca-4a63-8bff-1d85909aa174",
"metadata": {},
"outputs": [],
"source": [
"nar_addresses_csvs = glob.glob(f'{input_folder}/2024-06/Addresses/*.csv')\n",
"nar_locations_csvs = glob.glob(f'{input_folder}/2024-06/Locations/*.csv')\n",
"encoding = 'utf-8'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "3fcca833-8547-43b1-b5c7-e5f463ef0bbc",
"metadata": {},
"outputs": [],
"source": [
"def process_nar_locations_csvs(csvs_to_process, encoding):\n",
" \"\"\"\n",
" 1. Reads subset of fields for National Address Register locations\n",
" 2. Appends all of the processed CSVs as one dataframe\n",
" \"\"\"\n",
" dataframes_to_concatenate = []\n",
" for filename in csvs_to_process:\n",
" print(f\"Processing {filename}\")\n",
" params = {\n",
" 'filepath_or_buffer': filename,\n",
" 'encoding': encoding,\n",
" 'usecols': ['LOC_GUID', \n",
" 'REPPOINT_LATITUDE', \n",
" 'REPPOINT_LONGITUDE'\n",
" ]\n",
" }\n",
" nar_location_df = pd.read_csv(**params)\n",
" # Lowercase columns\n",
" nar_location_df.columns = [x.lower() for x in nar_location_df.columns]\n",
" dataframes_to_concatenate.append(nar_location_df)\n",
" \n",
" print(\"Concatenating all dataframes into one\")\n",
" nar_locations_df = pd.concat(dataframes_to_concatenate)\n",
" \n",
" return nar_locations_df\n",
"\n",
"def process_nar_addresses_csvs(csvs_to_process, encoding):\n",
" \"\"\"\n",
" 1. Reads subset of fields for National Address Register addresses\n",
" 2. Appends all of the processed CSVs as one dataframe\n",
" \"\"\"\n",
" dataframes_to_concatenate = []\n",
" for filename in csvs_to_process:\n",
" print(f\"Processing {filename}\")\n",
" params = {\n",
" 'filepath_or_buffer': filename,\n",
" 'encoding': encoding,\n",
" 'usecols': ['LOC_GUID', \n",
" 'ADDR_GUID', \n",
" 'APT_NO_LABEL',\n",
" 'CIVIC_NO',\n",
" 'CIVIC_NO_SUFFIX',\n",
" 'OFFICIAL_STREET_NAME',\n",
" 'OFFICIAL_STREET_TYPE',\n",
" 'OFFICIAL_STREET_DIR',\n",
" 'MAIL_STREET_NAME',\n",
" 'MAIL_STREET_TYPE',\n",
" 'MAIL_STEET_DIR',\n",
" 'MAIL_MUN_NAME',\n",
" 'MAIL_POSTAL_CODE',\n",
" 'BG_DLS_LSD',\n",
" 'BG_DLS_QTR',\n",
" 'BG_DLS_SCTN',\n",
" 'BG_DLS_RNG',\n",
" 'BG_DLS_MRD',\n",
" # Removing since REPPOINT_LATITUDE and REPPOINT_LONGITUDE seem to have same purpose\n",
" #'BG_X',\n",
" #'BG_Y',\n",
" 'BU_USE',\n",
" 'BU_N_CIVIC_ADD'\n",
" ],\n",
" 'dtype': {\n",
" \"CIVIC_NO\": \"Int32\", \n",
" \"PROV_CODE\": object,\n",
" \"BU_USE\": \"Int8\",\n",
" \"BG_DLS_LSD\": object,\n",
" \"BG_DLS_QTR\": object,\n",
" \"BG_DLS_SCTN\": object,\n",
" \"BG_DLS_TWNSHP\": object,\n",
" \"BG_DLS_RNG\": object,\n",
" \"BG_DLS_MRD\": object\n",
" }\n",
" }\n",
" nar_address_df = pd.read_csv(**params)\n",
" # Lowercase columns\n",
" nar_address_df.columns = [x.lower() for x in nar_address_df.columns]\n",
" dataframes_to_concatenate.append(nar_address_df)\n",
" \n",
" print(\"Concatenating all dataframes into one\")\n",
" nar_addresses_df = pd.concat(dataframes_to_concatenate, ignore_index=True)\n",
" \n",
" return nar_addresses_df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f9bb3a4d-d913-4b9d-aa49-7ec2c4dfef60",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_10.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_11.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_12.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_13.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_24_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_24_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_24_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_24_part_4.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_35_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_35_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_35_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_35_part_4.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_35_part_5.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_46.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_47.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_48_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_48_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_59_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_59_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_60.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_61.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Locations/Location_62.csv\n",
"Concatenating all dataframes into one\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_10.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_11.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_12.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_13.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_24_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_24_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_24_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_24_part_4.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_24_part_5.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_35_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_35_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_35_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_35_part_4.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_35_part_5.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_35_part_6.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_35_part_7.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_46.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_47.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_48_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_48_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_48_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_59_part_1.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_59_part_2.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_59_part_3.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_60.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_61.csv\n",
"Processing /data/national_address_register/extracted/2024-06/Addresses/Address_62.csv\n",
"Concatenating all dataframes into one\n"
]
}
],
"source": [
"nar_locations = process_nar_locations_csvs(nar_locations_csvs, encoding)\n",
"nar_addresses = process_nar_addresses_csvs(nar_addresses_csvs, encoding)"
]
},
{
"cell_type": "markdown",
"id": "b49f6602-9bda-410f-82a2-54a5711311b0",
"metadata": {},
"source": [
"# TODO\n",
"- look into why there are locations with empty reppoint_latitude and reppoint_longitude\n",
" - There are 84,285 records that have an empty reppoint_latitude and reppoint_longitude"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f3600645-c350-473f-81ea-0bd9ebe70e37",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Combining nar_addresses and nar_locations\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f927771f74bd4f81a7ca98cbf227b958",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"616"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(\"Combining nar_addresses and nar_locations\")\n",
"nar_addresses_combined = duckdb.sql(\"\"\"\n",
"SELECT a.addr_guid, a.apt_no_label, a.civic_no, a.civic_no_suffix, a.official_street_name, a.mail_street_name, a.official_street_type, a.mail_street_type,\n",
" a.official_street_dir AS official_street_direction, a.mail_steet_dir AS mail_street_direction, a.mail_postal_code, a.mail_mun_name AS mail_municipality_name, \n",
" a.bu_n_civic_add, a.bu_use,\n",
" a.bg_dls_lsd, a.bg_dls_qtr, a.bg_dls_sctn, a.bg_dls_rng, a.bg_dls_mrd,\n",
" b.reppoint_latitude, b.reppoint_longitude\n",
"FROM nar_addresses AS a,\n",
" nar_locations AS b\n",
"WHERE a.loc_guid = b.loc_guid AND b.reppoint_latitude IS NOT NULL\n",
"\"\"\").df()\n",
"\n",
"del nar_addresses\n",
"del nar_locations\n",
"gc.collect()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "0d35eec6-8ab3-4e7a-a7fd-2915333e27f9",
"metadata": {},
"outputs": [],
"source": [
"gdf = gpd.GeoDataFrame(\n",
" nar_addresses_combined, \n",
" geometry=gpd.points_from_xy(nar_addresses_combined.reppoint_longitude,\n",
" nar_addresses_combined.reppoint_latitude),\n",
" crs=\"EPSG:4326\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "614ad773-adb3-413c-8b8f-da721afc85cd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dropping 'reppoint_latitude', 'reppoint_longitude' from geodataframe\n"
]
}
],
"source": [
"print(\"Dropping 'reppoint_latitude', 'reppoint_longitude' from geodataframe\")\n",
"gdf.drop(columns=[\"reppoint_latitude\", \"reppoint_longitude\"], \n",
" inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "cac024f8-6eb5-48ea-88ad-289edf505ba3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"del nar_addresses_combined\n",
"gc.collect()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "d2f81e61-1fc7-4518-ad1f-d515e16ce22c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading geodataframe to PostgreSQL as bronze.nar_2024_06\n"
]
}
],
"source": [
"print(\"Loading geodataframe to PostgreSQL as bronze.nar_2024_06\")\n",
"gdf.to_postgis(name=\"nar_2024_06\", \n",
" schema='bronze',\n",
" con=engine,\n",
" chunksize=150000)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "b7fb6976-0b95-445e-b1e8-022c39bce25b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3477"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"del(gdf)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "384cd07b-c79f-42e8-81d6-ecbe78f167b1",
"metadata": {},
"source": [
"## Link to 2021 geographies\n",
"There are 10 records that were not linked to 2021 geographies"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "f7f56297-150b-4ad2-a5c2-2ee5e477f978",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"DROP TABLE IF EXISTS silver.nar_2024_06;\n",
"CREATE TABLE silver.nar_2024_06 AS\n",
"SELECT DISTINCT\n",
" b.country_dguid,\n",
" b.country_en_name,\n",
" b.country_fr_name,\n",
" b.country_en_abbreviation,\n",
" b.country_fr_abbreviation,\n",
" b.grc_dguid,\n",
" b.grc_en_name,\n",
" b.grc_fr_name,\n",
" b.pr_dguid,\n",
" b.pr_en_name,\n",
" b.pr_fr_name,\n",
" b.pr_en_abbreviation,\n",
" b.pr_fr_abbreviation,\n",
" b.pr_iso_code,\n",
" b.car_dguid,\n",
" b.car_en_name,\n",
" b.car_fr_name,\n",
" b.er_dguid,\n",
" b.er_name,\n",
" b.cd_dguid,\n",
" b.cd_name,\n",
" b.cd_type,\n",
" b.ccs_dguid,\n",
" b.ccs_name,\n",
" b.cma_dguid,\n",
" b.cma_p_dguid,\n",
" b.cma_name,\n",
" b.cma_type,\n",
" b.csd_dguid,\n",
" b.csd_name,\n",
" b.csd_type,\n",
" b.sac_type,\n",
" b.sac_code,\n",
" b.fed_dguid,\n",
" b.fed_name,\n",
" b.fed_en_name,\n",
" b.fed_fr_name,\n",
" b.ct_dguid,\n",
" b.ada_dguid,\n",
" b.da_dguid,\n",
" b.db_dguid,\n",
" a.addr_guid,\n",
" a.apt_no_label,\n",
" a.civic_no,\n",
" a.civic_no_suffix,\n",
" a.official_street_name, \n",
" a.mail_street_name, \n",
" a.official_street_type,\n",
" a.mail_street_type,\n",
" a.official_street_direction,\n",
" a.mail_street_direction,\n",
" a.mail_postal_code,\n",
" a.mail_municipality_name,\n",
" a.bu_n_civic_add,\n",
" a.bu_use,\n",
" a.bg_dls_lsd,\n",
" a.bg_dls_qtr,\n",
" a.bg_dls_sctn,\n",
" a.bg_dls_rng,\n",
" a.bg_dls_mrd,\n",
" a.geometry AS geom\n",
"FROM bronze.nar_2024_06 AS a,\n",
" silver.db_2021_digital AS b\n",
"WHERE ST_Intersects(a.geometry, b.geom);\n",
"\n",
"-- Create spatial index\n",
"CREATE INDEX nar_2024_06_geom_idx ON silver.nar_2024_06 USING gist (geom) WITH (\n",
" fillfactor = 100\n",
");\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "ae7babb4-a3e6-4c0b-93a6-3b52a4f89a1b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<sqlalchemy.engine.cursor.CursorResult at 0x7f523a52d940>"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with engine.connect() as conn:\n",
" conn.execute(text(sql))\n",
" conn.commit()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -5,7 +5,9 @@
"id": "11c50f98-ddeb-4af0-b69a-743f915fa904", "id": "11c50f98-ddeb-4af0-b69a-743f915fa904",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Experimenting with processing this file. Still need to figure out how to structure this file" "# Experimenting with processing this file. Still need to figure out how to structure this file\n",
"In summary:\n",
"- there are some types that should be fixed. For example: `sac_type` should not be `Integer64`, `bu_use` should be `Int8`, `civic_no` should be `Int32`"
] ]
}, },
{ {