From f93e4d0cec877e6f5e38614b59aadc59fa8cdece Mon Sep 17 00:00:00 2001 From: Diego Ripley Date: Sat, 24 May 2025 13:37:31 -0400 Subject: [PATCH] Initial commit --- .devcontainer/devcontainer.json | 26 + .env | 5 + .gitignore | 3 + Dockerfile | 54 + LICENSE | 21 + README.md | 34 + boundaries/README.md | 46 + boundaries/aggregate_dissemination_areas.sql | 59 + boundaries/boundary_files_2001.txt | 16 + boundaries/boundary_files_2006.txt | 16 + boundaries/boundary_files_2011.txt | 17 + boundaries/boundary_files_2016.txt | 18 + boundaries/boundary_files_2021.txt | 18 + boundaries/census_agricultural_regions.sql | 49 + .../census_consolidated_subdivisions.sql | 55 + boundaries/census_divisions.sql | 57 + boundaries/census_metropolitan_areas.sql | 56 + boundaries/census_subdivisions.sql | 71 + boundaries/census_tracts.sql | 52 + boundaries/country.sql | 131 ++ boundaries/designated_places.sql | 48 + boundaries/dissemination_areas.sql | 66 + boundaries/dissemination_blocks.sql | 73 + boundaries/download.sh | 19 + boundaries/economic_regions.sql | 49 + boundaries/federal_electoral_districts.sql | 50 + boundaries/forward_sortation_areas.sql | 43 + boundaries/geographic_regions_of_canada.sql | 471 ++++ boundaries/load.sh | 290 +++ boundaries/organize.sql | 608 +++++ boundaries/population_centres.sql | 52 + boundaries/process.sh | 18 + boundaries/provinces_and_territories.sql | 113 + census_of_agriculture/README.md | 1 + .../census_of_agriculture_2016.txt | 9 + .../census_of_agriculture_2021.txt | 9 + census_of_agriculture/download.sh | 15 + census_of_agriculture/process.sh | 5 + census_of_agriculture/process_2016.ipynb | 1979 +++++++++++++++++ census_of_agriculture/process_2021.ipynb | 1793 +++++++++++++++ census_of_population/README.md | 5 + .../census_of_population_files_2001.txt | 4 + .../census_of_population_files_2006.txt | 4 + .../census_of_population_files_2011.txt | 45 + .../census_of_population_files_2016.txt | 31 + .../census_of_population_files_2021.txt | 36 + 
census_of_population/download.sh | 24 + census_of_population/process.sh | 155 ++ census_of_population/process_2021.ipynb | 1360 +++++++++++ .../download.sh | 11 + .../files.txt | 2 + .../load.sh | 12 + .../process.py | 60 + docker-compose.yml | 32 + experiments/boundaries_spatial_checks.ipynb | 140 ++ experiments/duckdb_census_of_population.ipynb | 242 ++ experiments/export_2021_boundaries.ipynb | 922 ++++++++ experiments/index.html | 88 + experiments/index_da.html | 90 + experiments/lonboard_duckdb.ipynb | 152 ++ experiments/make_tiles.sh | 3 + experiments/make_tiles_da.sh | 1 + experiments/vector_tiles.ipynb | 110 + geographic_attribute_file/download.sh | 11 + geographic_attribute_file/files.txt | 2 + geographic_attribute_file/load.sh | 12 + geographic_attribute_file/process.py | 91 + geosuite/download.sh | 11 + geosuite/files.txt | 10 + geosuite/load.sh | 12 + geosuite/problems.ipynb | 283 +++ geosuite/process.py | 113 + health_regions/download.sh | 23 + health_regions/load.sh | 48 + health_regions/organize.sql | 126 ++ hydro/README.md | 2 + hydro/download.sh | 17 + hydro/hydro_2006.txt | 4 + hydro/hydro_2011.txt | 4 + hydro/hydro_2016.txt | 4 + hydro/hydro_2021.txt | 15 + hydro/load.sh | 70 + hydro/process.sh | 18 + main.sh | 50 + national_address_register/download.sh | 9 + national_address_register/load.sh | 167 ++ .../national_address_register_files.txt | 8 + national_address_register/process.py | 69 + .../files_opendatabase_addresses.txt | 10 + .../files_opendatabase_buildings.txt | 8 + .../files_opendatabase_cultural.txt | 1 + .../files_opendatabase_educational.txt | 1 + .../files_opendatabase_greenhouses.txt | 1 + .../files_opendatabase_healthcare.txt | 1 + .../files_opendatabase_recreation.txt | 1 + open_databases/import_opendatabase.sh | 96 + open_databases/standardize.sql | 112 + road_network_files/README.md | 4 + road_network_files/download.sh | 19 + road_network_files/load.sh | 62 + road_network_files/organize.sql | 48 + road_network_files/process.sh 
| 1 + road_network_files/road_network_file_2001.txt | 4 + road_network_files/road_network_file_2006.txt | 2 + road_network_files/road_network_file_2011.txt | 2 + road_network_files/road_network_file_2016.txt | 2 + road_network_files/road_network_file_2021.txt | 2 + road_network_files/road_network_files.sql | 89 + 108 files changed, 11689 insertions(+) create mode 100644 .devcontainer/devcontainer.json create mode 100644 .env create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 README.md create mode 100644 boundaries/README.md create mode 100644 boundaries/aggregate_dissemination_areas.sql create mode 100644 boundaries/boundary_files_2001.txt create mode 100644 boundaries/boundary_files_2006.txt create mode 100644 boundaries/boundary_files_2011.txt create mode 100644 boundaries/boundary_files_2016.txt create mode 100644 boundaries/boundary_files_2021.txt create mode 100644 boundaries/census_agricultural_regions.sql create mode 100644 boundaries/census_consolidated_subdivisions.sql create mode 100644 boundaries/census_divisions.sql create mode 100644 boundaries/census_metropolitan_areas.sql create mode 100644 boundaries/census_subdivisions.sql create mode 100644 boundaries/census_tracts.sql create mode 100644 boundaries/country.sql create mode 100644 boundaries/designated_places.sql create mode 100644 boundaries/dissemination_areas.sql create mode 100644 boundaries/dissemination_blocks.sql create mode 100755 boundaries/download.sh create mode 100644 boundaries/economic_regions.sql create mode 100644 boundaries/federal_electoral_districts.sql create mode 100644 boundaries/forward_sortation_areas.sql create mode 100644 boundaries/geographic_regions_of_canada.sql create mode 100755 boundaries/load.sh create mode 100644 boundaries/organize.sql create mode 100644 boundaries/population_centres.sql create mode 100755 boundaries/process.sh create mode 100644 boundaries/provinces_and_territories.sql create mode 100644 
census_of_agriculture/README.md create mode 100644 census_of_agriculture/census_of_agriculture_2016.txt create mode 100644 census_of_agriculture/census_of_agriculture_2021.txt create mode 100755 census_of_agriculture/download.sh create mode 100755 census_of_agriculture/process.sh create mode 100644 census_of_agriculture/process_2016.ipynb create mode 100644 census_of_agriculture/process_2021.ipynb create mode 100644 census_of_population/README.md create mode 100644 census_of_population/census_of_population_files_2001.txt create mode 100644 census_of_population/census_of_population_files_2006.txt create mode 100644 census_of_population/census_of_population_files_2011.txt create mode 100644 census_of_population/census_of_population_files_2016.txt create mode 100644 census_of_population/census_of_population_files_2021.txt create mode 100755 census_of_population/download.sh create mode 100755 census_of_population/process.sh create mode 100644 census_of_population/process_2021.ipynb create mode 100755 dissemination_geographies_relationship_file/download.sh create mode 100755 dissemination_geographies_relationship_file/files.txt create mode 100755 dissemination_geographies_relationship_file/load.sh create mode 100644 dissemination_geographies_relationship_file/process.py create mode 100644 docker-compose.yml create mode 100644 experiments/boundaries_spatial_checks.ipynb create mode 100644 experiments/duckdb_census_of_population.ipynb create mode 100644 experiments/export_2021_boundaries.ipynb create mode 100755 experiments/index.html create mode 100755 experiments/index_da.html create mode 100644 experiments/lonboard_duckdb.ipynb create mode 100644 experiments/make_tiles.sh create mode 100644 experiments/make_tiles_da.sh create mode 100644 experiments/vector_tiles.ipynb create mode 100755 geographic_attribute_file/download.sh create mode 100755 geographic_attribute_file/files.txt create mode 100755 geographic_attribute_file/load.sh create mode 100755 
geographic_attribute_file/process.py create mode 100755 geosuite/download.sh create mode 100755 geosuite/files.txt create mode 100755 geosuite/load.sh create mode 100644 geosuite/problems.ipynb create mode 100644 geosuite/process.py create mode 100755 health_regions/download.sh create mode 100755 health_regions/load.sh create mode 100644 health_regions/organize.sql create mode 100644 hydro/README.md create mode 100755 hydro/download.sh create mode 100644 hydro/hydro_2006.txt create mode 100644 hydro/hydro_2011.txt create mode 100644 hydro/hydro_2016.txt create mode 100644 hydro/hydro_2021.txt create mode 100755 hydro/load.sh create mode 100755 hydro/process.sh create mode 100755 main.sh create mode 100755 national_address_register/download.sh create mode 100755 national_address_register/load.sh create mode 100644 national_address_register/national_address_register_files.txt create mode 100755 national_address_register/process.py create mode 100644 open_databases/files_opendatabase_addresses.txt create mode 100644 open_databases/files_opendatabase_buildings.txt create mode 100644 open_databases/files_opendatabase_cultural.txt create mode 100644 open_databases/files_opendatabase_educational.txt create mode 100644 open_databases/files_opendatabase_greenhouses.txt create mode 100644 open_databases/files_opendatabase_healthcare.txt create mode 100644 open_databases/files_opendatabase_recreation.txt create mode 100644 open_databases/import_opendatabase.sh create mode 100644 open_databases/standardize.sql create mode 100644 road_network_files/README.md create mode 100755 road_network_files/download.sh create mode 100755 road_network_files/load.sh create mode 100644 road_network_files/organize.sql create mode 100755 road_network_files/process.sh create mode 100644 road_network_files/road_network_file_2001.txt create mode 100644 road_network_files/road_network_file_2006.txt create mode 100644 road_network_files/road_network_file_2011.txt create mode 100644 
road_network_files/road_network_file_2016.txt create mode 100644 road_network_files/road_network_file_2021.txt create mode 100644 road_network_files/road_network_files.sql diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..00e15fc --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,26 @@ +{ + "name": "process-statcan-data", + "dockerComposeFile": "../docker-compose.yml", + "service": "devcontainer", + "workspaceFolder": "/workspace", + "shutdownAction": "stopCompose", + "forwardPorts": [ + 5432, + 8888 + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-toolsai.jupyter", + "vsls-contrib.gistfs", + "vscode-icons-team.vscode-icons", + "dorzey.vscode-sqlfluff" + ], + "settings": { + "python.pythonPath": "/root/.venv/bin/python", + "python.defaultInterpreterPath": "/root/.venv/bin/python" + } + } + } +} \ No newline at end of file diff --git a/.env b/.env new file mode 100644 index 0000000..b11671b --- /dev/null +++ b/.env @@ -0,0 +1,5 @@ +POSTGRES_PASSWORD=floral-retype-viewless +POSTGRES_USER=statcan +POSTGRES_DB=statcan +# For Jupyter Notebook +DATA_FOLDER=/data \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2f1dd28 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +data/ +.ipynb_checkpoints +__pycache__/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..831cf13 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,54 @@ +FROM ghcr.io/osgeo/gdal:ubuntu-full-3.11.0 + +USER root + +RUN apt-get update -y \ + && apt-get -y install gcc g++ make libsqlite3-dev zlib1g-dev + +# Utilities (package lists are refreshed in this layer so a cached update layer cannot go stale) +RUN apt-get update -y && apt-get install -y neovim \ + python3-neovim \ + htop \ + tmux \ + git \ + aria2 \ + postgresql-client \ + zip + +# tippecanoe: build the pinned release, install it, then remove the checkout from where it was cloned +RUN git clone https://github.com/felt/tippecanoe.git \ + && cd tippecanoe \ + && git checkout 2.78.0 \ + && make -j \ + && make install \ + && cd .. && rm -rf tippecanoe + +# Install uv
+RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh + +# Create virtual environment and install Python packages +RUN uv venv ~/.venv \ + && cd ~ \ + && uv pip install 'geopandas[all]' duckdb psycopg2-binary jupyterlab lonboard click stats-can openpyxl ordered-set sqlfluff + +# Bash Kernel +RUN cd ~ \ + && uv pip install bash_kernel \ + && /root/.venv/bin/python -m bash_kernel.install + +# Install DuckDB +RUN mkdir -p ~/.local/bin \ + && curl https://install.duckdb.org | sh + +# Install rclone +RUN curl https://rclone.org/install.sh | bash + +# On login, activate the virtual environment and add $HOME/.local/bin to PATH ($HOME, not ~: a quoted tilde is stored literally) +RUN echo 'source /root/.venv/bin/activate' > ~/.bashrc \ + && echo 'export PATH="$HOME/.local/bin:${PATH}"' >> ~/.bashrc + +RUN mkdir /data + +RUN apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && uv cache clean diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..741bf17 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Diego Ripley. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..10ae8a6 --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +## Table of Contents +- [About](#about) +- [How to Run](#how-to-run) +- [License](#license) + +## About + +**process-statcan-data** is a set of scripts that helps you load all Statistics Canada data required for data analysis. Specifically, it loads: + +- **Boundaries**: Geographic boundaries from 2001 to 2021. +- **Road Network Files**: Road network files from 2001 to 2021. +- **Health Regions**: Health Regions from 2003 to 2023. +- **National Address Register**: National Address Register files from 2022 to 2024. +- **Census of Population**: Census of Population data from 2001 to 2021. +- **Census of Agriculture**: Census of Agriculture from 2001 to 2021. +- **National Household Survey**: National Household Survey from 2011 to 2016. + +## How to Run + +We use a Dev Container to run the code: + +```shell +# Clone the repository +git clone https://github.com/dataforcanada/process-statcan-data.git + +# Navigate to the project directory +cd process-statcan-data +``` + +## License + +This product is distributed under an MIT license. + +[Back to top](#top) diff --git a/boundaries/README.md b/boundaries/README.md new file mode 100644 index 0000000..9c6143b --- /dev/null +++ b/boundaries/README.md @@ -0,0 +1,46 @@ +# TODO +- Process 2023 Federal Electoral Districts +- For `load.sh` + - Finish processing 2001 data + +- For `country.sql` + - Create `country_2001` from 2001 geometries.
Need to finish `load.sh` + - Add English abbreviation for all years + - Add French abbreviation for all years + +- For `geographic_regions_of_canada.sql` + - Add other years (2016, 2011, 2006, 2001) + - Add GRC abbreviation english + - Add GRC abbreviation french + - According to this, Territories DGUID should be `2021A00016` https://www150.statcan.gc.ca/n1/en/geo?geotext=Territories%20%5BRegion%5D&geocode=A00016 + - According to the link above, British Columbia DGUID should be `2021A00015` + +- For `er_2021`, split `er_name` into English and French components. There's some records that are separated by `/` + - South Coast--Burin Peninsula / Côte-sud--Burin Peninsula + - West Coast--Northern Peninsula--Labrador / Côte-ouest--Northern Peninsula--Labrador + - Prince Edward Island / Île-du-Prince-Édouard + +- For `cma_2021`, split `cma_name` into English and French components. There's some records that are separated by `/` + - Greater Sudbury / Grand Sudbury + - Ottawa - Gatineau (Ontario part / partie de l'Ontario) + +- For `ccs_2021`, split `ccs_name` into English and French components. There's some records that are separated by `/` + - West Nipissing / Nipissing Ouest + - French River / Rivière des Français + - Greater Sudbury / Grand Sudbury + - The Nation / La Nation + +- For `csd_2021`, split `csd_name` into English and French components. There's some records that are separated by `/` + - The Nation / La Nation + - West Nipissing / Nipissing Ouest + - Greater Sudbury / Grand Sudbury + - Beaubassin East / Beaubassin-est + +- For `csd_2021`, figure out what level of geography the sac_code and sac_type belongs to so I can name it appropriately + +- For `pop_ctr_2021`, split `pop_ctr_name` into English and French components. There's one record that is separated by `/` + - Grand Falls / Grand-Sault + +- For `dpl_2021`, split `dpl_name` into English and French components. 
There's records that are separated by `/` + - Saint Irénée and Alderwood / Saint Irénée et Alderwood + - `Sainte-Anne-de-Kent part B / partie B` - this one would need to be split into `Sainte-Anne-de-Kent part B` and `Sainte-Anne-de-Kent partie B` diff --git a/boundaries/aggregate_dissemination_areas.sql b/boundaries/aggregate_dissemination_areas.sql new file mode 100644 index 0000000..61cead5 --- /dev/null +++ b/boundaries/aggregate_dissemination_areas.sql @@ -0,0 +1,59 @@ +/* +Aggregate Dissemination Areas (ADA): builds silver.ada_2021 from the 2021 relationship table, census division and CMA attributes, and the ADA geometries in bronze.lada000a21a_e +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20240731061904/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo053 +*/ + +DROP TABLE IF EXISTS silver.ada_2021; +CREATE TABLE silver.ada_2021 AS +SELECT DISTINCT -- presumably collapses finer-grained relationship rows to one per ADA/CD/CMA combination; verify against the relationship table + cd.country_dguid, + cd.country_en_name, + cd.country_fr_name, + cd.country_en_abbreviation, + cd.country_fr_abbreviation, + cd.grc_dguid, + cd.grc_en_name, + cd.grc_fr_name, + cd.pr_dguid, + cd.pr_en_name, + cd.pr_fr_name, + cd.pr_en_abbreviation, + cd.pr_fr_abbreviation, + cd.pr_iso_code, + cd.car_dguid, + cd.car_en_name, + cd.car_fr_name, + cd.cd_dguid, + cd.cd_name, + cd.cd_type, + cma.cma_dguid, + cma.cma_p_dguid, + cma.cma_name, + cma.cma_type, + dgr.ada_dguid, + ada.geom +FROM silver.dissemination_geographies_relationship_2021 AS dgr +LEFT JOIN silver.cd_2021 AS cd + ON dgr.cd_dguid = cd.cd_dguid +LEFT JOIN silver.cma_2021 AS cma + ON -- concat() ignores NULL arguments, so a NULL cma_dguid or cma_p_dguid does not turn the comparison into NULL + concat(dgr.cma_dguid, dgr.cma_p_dguid) + = concat(cma.cma_dguid, cma.cma_p_dguid) +LEFT JOIN bronze.lada000a21a_e AS ada + ON dgr.ada_dguid = ada.dguid; + +-- Repair invalid geometries in place; rows that are already valid (or have no geometry) are left untouched +UPDATE + silver.ada_2021 +SET + geom = st_makevalid(geom) +WHERE + NOT st_isvalid(geom); + +-- Create spatial index (GiST on geom, pages packed with fillfactor = 100) +CREATE INDEX ada_2021_geom_idx ON silver.ada_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/boundary_files_2001.txt b/boundaries/boundary_files_2001.txt new file mode 100644 index 0000000..1dc4d8f --- /dev/null +++
b/boundaries/boundary_files_2001.txt @@ -0,0 +1,16 @@ +# https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2001-eng.cfm +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gpr_000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gpr_000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gfed000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gfed000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/ger_000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/ger_000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gcd_000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcd_000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gcar000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcar000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gccs000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gccs000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gcsd000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcsd000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gcma000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcma000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gct_000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gct_000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gda_000b01m_e.zip 
https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gda_000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gdb_000a01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gdb_000a01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gdpl000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gdpl000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gua_000b01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gua_000b01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/gecu000e01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gecu000e01m_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2001/geca000e01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/geca000e01m_e.zip \ No newline at end of file diff --git a/boundaries/boundary_files_2006.txt b/boundaries/boundary_files_2006.txt new file mode 100644 index 0000000..02c1ea8 --- /dev/null +++ b/boundaries/boundary_files_2006.txt @@ -0,0 +1,16 @@ +# https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2006-eng.cfm +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gpr_000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gpr_000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gfed000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gfed000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/ger_000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/ger_000a06a_e.zip 
+https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gcd_000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcd_000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gcar000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcar000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gccs000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gccs000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gcsd000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcsd000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gcma000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcma000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gct_000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gct_000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gda_000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gda_000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gdb_000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gdb_000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gdpl000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gdpl000a06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gua_000a06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gua_000a06a_e.zip 
+https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/gecu000e06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gecu000e06a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2006/geca000e06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/geca000e06a_e.zip \ No newline at end of file diff --git a/boundaries/boundary_files_2011.txt b/boundaries/boundary_files_2011.txt new file mode 100644 index 0000000..711a757 --- /dev/null +++ b/boundaries/boundary_files_2011.txt @@ -0,0 +1,17 @@ +# https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2011-eng.cfm +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gpr_000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gpr_000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gfed000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gfed000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/ger_000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/ger_000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gcd_000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcd_000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gcar000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcar000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gccs000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gccs000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gcsd000a11a_e.zip 
https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcsd000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gcma000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gcma000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gct_000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gct_000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gda_000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gda_000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gdb_000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gdb_000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gdpl000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gdpl000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gpc_000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gpc_000a11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gecu000e11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gecu000e11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/geca000e11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/geca000e11a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2011/gfsa000a11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gfsa000a11a_e.zip \ No newline at end of file diff --git a/boundaries/boundary_files_2016.txt b/boundaries/boundary_files_2016.txt new file mode 100644 index 0000000..d48a30b --- 
/dev/null +++ b/boundaries/boundary_files_2016.txt @@ -0,0 +1,18 @@ +# https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2016-eng.cfm +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lpr_000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lpr_000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lfed000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lfed000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/ler_000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/ler_000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lcd_000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lcd_000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lada000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lada000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lcar000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lcar000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lccs000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lccs000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lcsd000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lcsd000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lcma000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lcma000a16a_e.zip 
+https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lct_000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lct_000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lda_000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lda_000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/ldb_000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/ldb_000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/ldpl000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/ldpl000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lpc_000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lpc_000a16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lecu000e16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lecu000e16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/geca000e16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/geca000e16a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2016/lfsa000a16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lfsa000a16a_e.zip \ No newline at end of file diff --git a/boundaries/boundary_files_2021.txt b/boundaries/boundary_files_2021.txt new file mode 100644 index 0000000..6891750 --- /dev/null +++ b/boundaries/boundary_files_2021.txt @@ -0,0 +1,18 @@ +# https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/index2021-eng.cfm?year=21 
+https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lpr_000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lpr_000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lcd_000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lcd_000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lfed000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lfed000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lcsd000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lcsd000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/ldpl000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/ldpl000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lfsa000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lfsa000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/ler_000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/ler_000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lcar000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lcar000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lccs000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lccs000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lcma000a21a_e.zip 
https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lcma000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lct_000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lct_000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lpc_000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lpc_000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lda_000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lda_000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/ldb_000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/ldb_000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lada000a21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lada000a21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/lecu000e21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lecu000e21a_e.zip +https://data.dataforcanada.org/archive/statistics_canada/boundaries/2021/leca000e21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/leca000e21a_e.zip \ No newline at end of file diff --git a/boundaries/census_agricultural_regions.sql b/boundaries/census_agricultural_regions.sql new file mode 100644 index 0000000..0f4ec33 --- /dev/null +++ b/boundaries/census_agricultural_regions.sql @@ -0,0 +1,49 @@ +/* +Census Agricultural Regions +*/ + +/* 2021 +Definition here: 
https://web.archive.org/web/20250401192328/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/definition-eng.cfm?ID=geo006 +*/ + +DROP TABLE IF EXISTS silver.car_2021; +CREATE TABLE silver.car_2021 AS +SELECT DISTINCT + pr.country_dguid, + pr.country_en_name, + pr.country_fr_name, + pr.country_en_abbreviation, + pr.country_fr_abbreviation, + pr.grc_dguid, + pr.grc_en_name, + pr.grc_fr_name, + pr.pr_dguid, + pr.pr_en_name, + pr.pr_fr_name, + pr.pr_en_abbreviation, + pr.pr_fr_abbreviation, + pr.pr_iso_code, + dgr.car_dguid, + car.carename AS car_en_name, + car.carfname AS car_fr_name, + car.geom +FROM + silver.pr_2021 AS pr, + silver.dissemination_geographies_relationship_2021 AS dgr, + bronze.lcar000a21a_e AS car +WHERE + pr.pr_dguid = dgr.pr_dguid + AND car.dguid = dgr.car_dguid; + +-- Make geometries valid +UPDATE + silver.car_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX car_2021_geom_idx ON silver.car_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/census_consolidated_subdivisions.sql b/boundaries/census_consolidated_subdivisions.sql new file mode 100644 index 0000000..3bb8d63 --- /dev/null +++ b/boundaries/census_consolidated_subdivisions.sql @@ -0,0 +1,55 @@ +/* +Census Consolidated Subdivisions +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20250401192303/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo007 +*/ + +DROP TABLE IF EXISTS silver.ccs_2021; +CREATE TABLE silver.ccs_2021 AS +SELECT DISTINCT + cd.country_dguid, + cd.country_en_name, + cd.country_fr_name, + cd.country_en_abbreviation, + cd.country_fr_abbreviation, + cd.grc_dguid, + cd.grc_en_name, + cd.grc_fr_name, + cd.pr_dguid, + cd.pr_en_name, + cd.pr_fr_name, + cd.pr_en_abbreviation, + cd.pr_fr_abbreviation, + cd.pr_iso_code, + cd.car_dguid, + cd.car_en_name, + cd.car_fr_name, + cd.cd_dguid, + cd.cd_name, + cd.cd_type, + ccs.dguid AS 
ccs_dguid, + ccs.ccsname AS ccs_name, + ccs.geom +FROM + silver.cd_2021 AS cd, + silver.dissemination_geographies_relationship_2021 AS dgr, + bronze.lccs000a21a_e AS ccs +WHERE + cd.cd_dguid = dgr.cd_dguid + AND + ccs.dguid = dgr.ccs_dguid; + +-- Make geometries valid +UPDATE + silver.ccs_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX ccs_2021_geom_idx ON silver.ccs_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/census_divisions.sql b/boundaries/census_divisions.sql new file mode 100644 index 0000000..795a5a9 --- /dev/null +++ b/boundaries/census_divisions.sql @@ -0,0 +1,57 @@ +/* +Census Divisions +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20250131082459/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/definition-eng.cfm?ID=geo008#moreinfo +*/ + +DROP TABLE IF EXISTS silver.cd_2021; +CREATE TABLE silver.cd_2021 AS +SELECT DISTINCT + pr.country_dguid, + pr.country_en_name, + pr.country_fr_name, + pr.country_en_abbreviation, + pr.country_fr_abbreviation, + pr.grc_dguid, + pr.grc_en_name, + pr.grc_fr_name, + pr.pr_dguid, + pr.pr_en_name, + pr.pr_fr_name, + pr.pr_en_abbreviation, + pr.pr_fr_abbreviation, + pr.pr_iso_code, + dgr.car_dguid, + car.carename AS car_en_name, + car.carfname AS car_fr_name, + cd.dguid AS cd_dguid, + cd.cdname AS cd_name, + cd.cdtype AS cd_type, + cd.geom +FROM + silver.pr_2021 AS pr, + bronze.lcd_000a21a_e AS cd, + silver.dissemination_geographies_relationship_2021 AS dgr, + bronze.lcar000a21a_e AS car +WHERE + pr.pr_dguid = dgr.pr_dguid + AND cd.dguid = dgr.cd_dguid + AND dgr.car_dguid = car.dguid; + +-- Make geometries valid +UPDATE + silver.cd_2021 +SET + geom = ST_MAKEVALID(geom) +WHERE + ST_ISVALID(geom) = 'f'; + +-- Create spatial index +CREATE INDEX cd_2021_geom_idx ON silver.cd_2021 +USING gist (geom) WITH (fillfactor = 100); + +/* 2016 +Definition here: 
https://web.archive.org/web/20250304001456/https://www12.statcan.gc.ca/census-recensement/2016/ref/dict/geo008-eng.cfm +*/ diff --git a/boundaries/census_metropolitan_areas.sql b/boundaries/census_metropolitan_areas.sql new file mode 100644 index 0000000..d0c9672 --- /dev/null +++ b/boundaries/census_metropolitan_areas.sql @@ -0,0 +1,56 @@ +/* +Census Metropolitan Areas +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20250518133322/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo009 +*/ + +DROP TABLE IF EXISTS silver.cma_2021; +CREATE TABLE silver.cma_2021 AS +SELECT DISTINCT + pr.country_dguid, + pr.country_en_name, + pr.country_fr_name, + pr.country_en_abbreviation, + pr.country_fr_abbreviation, + pr.grc_dguid, + pr.grc_en_name, + pr.grc_fr_name, + pr.pr_dguid, + pr.pr_en_name, + pr.pr_fr_name, + pr.pr_en_abbreviation, + pr.pr_fr_abbreviation, + pr.pr_iso_code, + cma.dguid AS cma_dguid, + cma.dguidp AS cma_p_dguid, + cma.cmaname AS cma_name, + cma.cmatype AS cma_type, + cma.geom +FROM + silver.pr_2021 AS pr, + bronze.lcma000a21a_e AS cma, + silver.dissemination_geographies_relationship_2021 AS dgr +WHERE + pr.pr_dguid = dgr.pr_dguid + AND + concat( + cma.dguid, + cma.dguidp) = concat( + dgr.cma_dguid, + dgr.cma_p_dguid + ); + +-- Make geometries valid +UPDATE + silver.cma_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX cma_2021_geom_idx ON silver.cma_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/census_subdivisions.sql b/boundaries/census_subdivisions.sql new file mode 100644 index 0000000..7319a87 --- /dev/null +++ b/boundaries/census_subdivisions.sql @@ -0,0 +1,71 @@ +/* +Census Subdivisions +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20240526213705/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo012 +*/ + +DROP TABLE IF EXISTS silver.csd_2021; +CREATE TABLE 
silver.csd_2021 AS +SELECT DISTINCT + cd.country_dguid, + cd.country_en_name, + cd.country_fr_name, + cd.country_en_abbreviation, + cd.country_fr_abbreviation, + cd.grc_dguid, + cd.grc_en_name, + cd.grc_fr_name, + cd.pr_dguid, + cd.pr_en_name, + cd.pr_fr_name, + cd.pr_en_abbreviation, + cd.pr_fr_abbreviation, + cd.pr_iso_code, + cd.car_dguid, + cd.car_en_name, + cd.car_fr_name, + er.er_dguid, + er.er_name, + cd.cd_dguid, + cd.cd_name, + cd.cd_type, + ccs.ccs_dguid, + ccs.ccs_name, + gaf.cma_dguid, + gaf.cma_p_dguid, + cma.cma_name, + cma.cma_type, + gaf.csd_dguid, + csd.csdname AS csd_name, + csd.csdtype AS csd_type, + gaf.sac_type, + gaf.sac_code, + csd.geom +FROM silver.gaf_2021 AS gaf +LEFT JOIN silver.cma_2021 AS cma + ON + concat(gaf.cma_dguid, gaf.cma_p_dguid) + = concat(cma.cma_dguid, cma.cma_p_dguid) +LEFT JOIN silver.cd_2021 AS cd + ON gaf.cd_dguid = cd.cd_dguid +LEFT JOIN silver.er_2021 AS er + ON gaf.er_dguid = er.er_dguid +LEFT JOIN silver.ccs_2021 AS ccs + ON gaf.ccs_dguid = ccs.ccs_dguid +LEFT JOIN bronze.lcsd000a21a_e AS csd + ON gaf.csd_dguid = csd.dguid; + +-- Make geometries valid +UPDATE + silver.csd_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX csd_2021_geom_idx ON silver.csd_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/census_tracts.sql b/boundaries/census_tracts.sql new file mode 100644 index 0000000..61ce0fd --- /dev/null +++ b/boundaries/census_tracts.sql @@ -0,0 +1,52 @@ +/* +Census Tracts +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20241013011815/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo013 +*/ + +DROP TABLE IF EXISTS silver.ct_2021; +CREATE TABLE silver.ct_2021 AS +SELECT DISTINCT + cma.country_dguid, + cma.country_en_name, + cma.country_fr_name, + cma.country_en_abbreviation, + cma.country_fr_abbreviation, + cma.grc_dguid, + cma.grc_en_name, + cma.grc_fr_name, + cma.pr_dguid, + 
cma.pr_en_name, + cma.pr_fr_name, + cma.pr_en_abbreviation, + cma.pr_fr_abbreviation, + cma.pr_iso_code, + cma.cma_dguid, + cma.cma_p_dguid, + cma.cma_name, + cma.cma_type, + ct.dguid AS ct_dguid, + ct.geom +FROM silver.dissemination_geographies_relationship_2021 AS dgr +LEFT JOIN bronze.lct_000a21a_e AS ct + ON dgr.ct_dguid = ct.dguid +LEFT JOIN silver.cma_2021 AS cma + ON + concat(dgr.cma_dguid, dgr.cma_p_dguid) + = concat(cma.cma_dguid, cma.cma_p_dguid) +WHERE dgr.ct_dguid IS NOT null; + +-- Make geometries valid +UPDATE + silver.ct_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX ct_2021_geom_idx ON silver.ct_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/country.sql b/boundaries/country.sql new file mode 100644 index 0000000..5b67631 --- /dev/null +++ b/boundaries/country.sql @@ -0,0 +1,131 @@ +/* +Canada +*/ + +-- 2021 Canada; +DROP TABLE IF EXISTS silver.country_2021; +CREATE TABLE silver.country_2021 AS +SELECT DISTINCT + '2021A000011124' AS country_dguid, + 'Canada' AS country_en_name, + 'Canada' AS country_fr_name, + 'CAN' AS country_en_abbreviation, + 'CAN' AS country_fr_abbreviation, + ST_UNION(geom) AS geom +FROM + bronze.lpr_000a21a_e; + +-- Make geometries valid +UPDATE + silver.country_2021 +SET + geom = ST_MAKEVALID(geom) +WHERE + ST_ISVALID(geom) = 'f'; + +-- Create spatial index +CREATE INDEX country_2021_geom_idx ON silver.country_2021 +USING gist (geom) WITH (fillfactor = 100); + +-- 2016 Canada; +DROP TABLE IF EXISTS silver.country_2016; +CREATE TABLE silver.country_2016 AS +SELECT DISTINCT + '2016A000011124' AS country_dguid, + 'Canada' AS country_en_name, + 'Canada' AS country_fr_name, + 'CAN' AS country_en_abbreviation, + 'CAN' AS country_fr_abbreviation, + ST_UNION(geom) AS geom +FROM + bronze.lpr_000a16a_e; + +-- Make geometries valid +UPDATE + silver.country_2016 +SET + geom = ST_MAKEVALID(geom) +WHERE + ST_ISVALID(geom) = 'f'; + +-- Create 
spatial index +CREATE INDEX country_2016_geom_idx ON silver.country_2016 +USING gist (geom) WITH (fillfactor = 100); + +-- 2011 Canada; +DROP TABLE IF EXISTS silver.country_2011; +CREATE TABLE silver.country_2011 AS +SELECT DISTINCT + '2011A000011124' AS country_dguid, + 'Canada' AS country_en_name, + 'Canada' AS country_fr_name, + 'CAN' AS country_en_abbreviation, + 'CAN' AS country_fr_abbreviation, + st_union(geom) AS geom +FROM + bronze.gpr_000a11a_e; + +-- Make geometries valid +UPDATE + silver.country_2011 +SET + geom = ST_MAKEVALID(geom) +WHERE + ST_ISVALID(geom) = 'f'; + +-- Create spatial index +CREATE INDEX country_2011_geom_idx ON silver.country_2011 +USING gist (geom) WITH (fillfactor = 100); + +-- 2006 Canada; +DROP TABLE IF EXISTS silver.country_2006; +CREATE TABLE silver.country_2006 AS +SELECT DISTINCT + '2006A000011124' AS country_dguid, + 'Canada' AS country_en_name, + 'Canada' AS country_fr_name, + 'CAN' AS country_en_abbreviation, + 'CAN' AS country_fr_abbreviation, + ST_UNION(geom) AS geom +FROM + bronze.gpr_000a06a_e; + +-- Make geometries valid +UPDATE + silver.country_2006 +SET + geom = ST_MAKEVALID(geom) +WHERE + ST_ISVALID(geom) = 'f'; + +-- Create spatial index +CREATE INDEX country_2006_geom_idx ON silver.country_2006 +USING gist (geom) WITH (fillfactor = 100); + +-- 2001 Canada; +-- TODO +/* +-- Clean Provinces and Territories layer; +UPDATE + bronze.gpr_000a06a_e +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; +DROP TABLE IF EXISTS country_2001; + +CREATE TABLE country_2001 AS + SELECT + DISTINCT '2001A000011124' AS country_dguid, + 'Canada' AS country_en_name, + 'Canada' AS country_fr_name, + 'CAN' AS country_en_abbreviation, + 'CAN' AS country_fr_abbreviation, + st_union(geom) AS geom +FROM + lpr_000a21a_e; + +CREATE INDEX country_2001_geom_idx ON +country_2001 + USING GIST(geom) WITH (FILLFACTOR = 100); +*/ diff --git a/boundaries/designated_places.sql b/boundaries/designated_places.sql new file mode 100644 index 
0000000..acf0ab7 --- /dev/null +++ b/boundaries/designated_places.sql @@ -0,0 +1,48 @@ +/* +Designated Places +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20240731061904/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo018 +*/ + +DROP TABLE IF EXISTS silver.dpl_2021; +CREATE TABLE silver.dpl_2021 AS +SELECT DISTINCT + pr.country_dguid, + pr.country_en_name, + pr.country_fr_name, + pr.country_en_abbreviation, + pr.country_fr_abbreviation, + pr.grc_dguid, + pr.grc_en_name, + pr.grc_fr_name, + pr.pr_dguid, + pr.pr_en_name, + pr.pr_fr_name, + pr.pr_en_abbreviation, + pr.pr_fr_abbreviation, + pr.pr_iso_code, + dgr.dpl_dguid, + dpl.dplname AS dpl_name, + dpl.dpltype AS dpl_type, + dpl.geom +FROM silver.dissemination_geographies_relationship_2021 AS dgr +LEFT JOIN silver.pr_2021 AS pr + ON dgr.pr_dguid = pr.pr_dguid +LEFT JOIN bronze.ldpl000a21a_e AS dpl + ON dgr.dpl_dguid = dpl.dguid +WHERE dgr.dpl_dguid IS NOT null; + +-- Make geometries valid +UPDATE + silver.dpl_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX dpl_2021_geom_idx ON silver.dpl_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/dissemination_areas.sql b/boundaries/dissemination_areas.sql new file mode 100644 index 0000000..62b2377 --- /dev/null +++ b/boundaries/dissemination_areas.sql @@ -0,0 +1,66 @@ +/* +Dissemination Areas +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20240731061905/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/definition-eng.cfm?ID=geo021 +*/ + +DROP TABLE IF EXISTS silver.da_2021; +CREATE TABLE silver.da_2021 AS +SELECT DISTINCT + csd.country_dguid, + csd.country_en_name, + csd.country_fr_name, + csd.country_en_abbreviation, + csd.country_fr_abbreviation, + csd.grc_dguid, + csd.grc_en_name, + csd.grc_fr_name, + csd.pr_dguid, + csd.pr_en_name, + csd.pr_fr_name, + csd.pr_en_abbreviation, + 
csd.pr_fr_abbreviation, + csd.pr_iso_code, + csd.car_dguid, + csd.car_en_name, + csd.car_fr_name, + csd.er_dguid, + csd.er_name, + csd.cd_dguid, + csd.cd_name, + csd.cd_type, + csd.ccs_dguid, + csd.ccs_name, + csd.cma_dguid, + csd.cma_p_dguid, + csd.cma_name, + csd.cma_type, + csd.csd_dguid, + csd.csd_name, + csd.csd_type, + csd.sac_type, + csd.sac_code, + dgr.ct_dguid, + dgr.ada_dguid, + dgr.da_dguid, + da.geom +FROM silver.dissemination_geographies_relationship_2021 AS dgr +LEFT JOIN silver.csd_2021 AS csd + ON dgr.csd_dguid = csd.csd_dguid +LEFT JOIN bronze.lda_000a21a_e AS da + ON dgr.da_dguid = da.dguid; + +-- Make geometries valid in silver.da_2021 (the table created above, not the bronze source) +UPDATE + silver.da_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX da_2021_geom_idx ON silver.da_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/dissemination_blocks.sql b/boundaries/dissemination_blocks.sql new file mode 100644 index 0000000..7ba3d4e --- /dev/null +++ b/boundaries/dissemination_blocks.sql @@ -0,0 +1,73 @@ +/* +Dissemination Blocks +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20250212081621/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/definition-eng.cfm?ID=geo014 +*/ + +DROP TABLE IF EXISTS silver.db_2021; +CREATE TABLE silver.db_2021 AS +SELECT DISTINCT + csd.country_dguid, + csd.country_en_name, + csd.country_fr_name, + csd.country_en_abbreviation, + csd.country_fr_abbreviation, + csd.grc_dguid, + csd.grc_en_name, + csd.grc_fr_name, + csd.pr_dguid, + csd.pr_en_name, + csd.pr_fr_name, + csd.pr_en_abbreviation, + csd.pr_fr_abbreviation, + csd.pr_iso_code, + csd.car_dguid, + csd.car_en_name, + csd.car_fr_name, + csd.er_dguid, + csd.er_name, + csd.cd_dguid, + csd.cd_name, + csd.cd_type, + csd.ccs_dguid, + csd.ccs_name, + csd.cma_dguid, + csd.cma_p_dguid, + csd.cma_name, + csd.cma_type, + csd.csd_dguid, + csd.csd_name, + csd.csd_type, + csd.sac_type, + csd.sac_code, + dgr.fed_dguid, +
fed.fed_name, + fed.fed_en_name, + fed.fed_fr_name, + dgr.ct_dguid, + dgr.ada_dguid, + dgr.da_dguid, + dgr.db_dguid, + db.geom +FROM silver.dissemination_geographies_relationship_2021 AS dgr +LEFT JOIN silver.csd_2021 AS csd + ON dgr.csd_dguid = csd.csd_dguid +LEFT JOIN silver.fed_2021_2013 AS fed + ON dgr.fed_dguid = fed.fed_dguid +LEFT JOIN bronze.ldb_000a21a_e AS db + ON dgr.db_dguid = db.dguid; + +-- Make geometries valid +UPDATE + silver.db_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX db_2021_geom_idx ON silver.db_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/download.sh b/boundaries/download.sh new file mode 100755 index 0000000..8550fb0 --- /dev/null +++ b/boundaries/download.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ ! -d "${DATA_FOLDER}/boundaries" ] +then + echo "Making directory ${DATA_FOLDER}/boundaries/" + mkdir -p "${DATA_FOLDER}"/boundaries/{input,extracted,output}/{2021,2016,2011,2006,2001} +fi + +INPUT_FOLDER="${DATA_FOLDER}/boundaries/input" + +echo "Downloading 2021 boundaries" +aria2c -x16 -i "${SCRIPT_DIR}/boundaries/boundary_files_2021.txt" --dir="${INPUT_FOLDER}/2021" --auto-file-renaming=false +echo "Downloading 2016 boundaries" +aria2c -x16 -i "${SCRIPT_DIR}/boundaries/boundary_files_2016.txt" --dir="${INPUT_FOLDER}/2016" --auto-file-renaming=false +echo "Downloading 2011 boundaries" +aria2c -x16 -i "${SCRIPT_DIR}/boundaries/boundary_files_2011.txt" --dir="${INPUT_FOLDER}/2011" --auto-file-renaming=false +echo "Downloading 2006 boundaries" +aria2c -x16 -i "${SCRIPT_DIR}/boundaries/boundary_files_2006.txt" --dir="${INPUT_FOLDER}/2006" --auto-file-renaming=false +echo "Downloading 2001 boundaries" +aria2c -x16 -i "${SCRIPT_DIR}/boundaries/boundary_files_2001.txt" --dir="${INPUT_FOLDER}/2001" --auto-file-renaming=false diff --git a/boundaries/economic_regions.sql b/boundaries/economic_regions.sql new file mode 100644 index 0000000..27a16fc --- /dev/null +++
b/boundaries/economic_regions.sql @@ -0,0 +1,49 @@ +/* +Economic Regions +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20250518132130/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo022 +*/ + +DROP TABLE IF EXISTS silver.er_2021; +CREATE TABLE silver.er_2021 AS +SELECT DISTINCT + pr.country_dguid, + pr.country_en_name, + pr.country_fr_name, + pr.country_en_abbreviation, + pr.country_fr_abbreviation, + pr.grc_dguid, + pr.grc_en_name, + pr.grc_fr_name, + pr.pr_dguid, + pr.pr_en_name, + pr.pr_fr_name, + pr.pr_en_abbreviation, + pr.pr_fr_abbreviation, + pr.pr_iso_code, + er.dguid AS er_dguid, + er.ername AS er_name, + er.geom +FROM + silver.pr_2021 AS pr, + silver.dissemination_geographies_relationship_2021 AS dgr, + bronze.ler_000a21a_e AS er +WHERE + pr.pr_dguid = dgr.pr_dguid + AND + er.dguid = dgr.er_dguid; + +-- Make geometries valid +UPDATE + silver.er_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX er_2021_geom_idx ON silver.er_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/federal_electoral_districts.sql b/boundaries/federal_electoral_districts.sql new file mode 100644 index 0000000..12cf509 --- /dev/null +++ b/boundaries/federal_electoral_districts.sql @@ -0,0 +1,50 @@ +/* +Federal Electoral Districts +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20240731061905/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo025 +*/ + +-- 2021 vintage, 2013 representation order; +DROP TABLE IF EXISTS silver.fed_2021_2013; +CREATE TABLE silver.fed_2021_2013 AS +SELECT DISTINCT + pr.country_dguid, + pr.country_en_name, + pr.country_fr_name, + pr.country_en_abbreviation, + pr.country_fr_abbreviation, + pr.grc_dguid, + pr.grc_en_name, + pr.grc_fr_name, + pr.pr_dguid, + pr.pr_en_name, + pr.pr_fr_name, + pr.pr_en_abbreviation, + pr.pr_fr_abbreviation, + pr.pr_iso_code, + 
dgr.fed_dguid, + fed.fedname AS fed_name, + fed.fedename AS fed_en_name, + fed.fedfname AS fed_fr_name, + fed.geom +FROM silver.dissemination_geographies_relationship_2021 AS dgr +LEFT JOIN silver.pr_2021 AS pr + ON dgr.pr_dguid = pr.pr_dguid +LEFT JOIN bronze.lfed000a21a_e AS fed + ON dgr.fed_dguid = fed.dguid +WHERE dgr.fed_dguid IS NOT null; + +-- Make geometries valid +UPDATE + silver.fed_2021_2013 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX fed_2021_2013_geom_idx ON silver.fed_2021_2013 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/forward_sortation_areas.sql b/boundaries/forward_sortation_areas.sql new file mode 100644 index 0000000..ef05844 --- /dev/null +++ b/boundaries/forward_sortation_areas.sql @@ -0,0 +1,43 @@ +/* +Forward Sortation Areas +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20241102112247/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/Definition-eng.cfm?ID=geo036 +*/ + +DROP TABLE IF EXISTS silver.fsa_2021; +CREATE TABLE silver.fsa_2021 AS +SELECT DISTINCT + pr.country_dguid, + pr.country_en_name, + pr.country_fr_name, + pr.country_en_abbreviation, + pr.country_fr_abbreviation, + pr.grc_dguid, + pr.grc_en_name, + pr.grc_fr_name, + pr.pr_dguid, + pr.pr_en_name, + pr.pr_fr_name, + pr.pr_en_abbreviation, + pr.pr_fr_abbreviation, + pr.pr_iso_code, + fsa.dguid AS fsa_dguid, + fsa.geom +FROM bronze.lfsa000a21a_e AS fsa, + silver.pr_2021 AS pr +WHERE concat('2021A0002', fsa.pruid) = pr.pr_dguid; + +-- Make geometries valid +UPDATE + silver.fsa_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX fsa_2021_geom_idx ON silver.fsa_2021 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/boundaries/geographic_regions_of_canada.sql b/boundaries/geographic_regions_of_canada.sql new file mode 100644 index 0000000..c31fb0d --- /dev/null +++ 
b/boundaries/geographic_regions_of_canada.sql @@ -0,0 +1,471 @@ +/* +Geographic Regions of Canada +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20240624230708/https://www150.statcan.gc.ca/n1/pub/92-195-x/2021001/geo/region/region-eng.htm +*/ + +-- With geometries; +DROP TABLE IF EXISTS silver.grc_2021; +CREATE TABLE silver.grc_2021 AS +WITH territories AS ( + SELECT + '2021A00016' AS grc_dguid, + 'Territories' AS grc_en_name, + 'Territoires' AS grc_fr_name, + ST_UNION(geom) AS geom + FROM + bronze.lpr_000a21a_e + WHERE + pruid IN ('60', '61', '62') +), + +atlantic AS ( + SELECT + '2021A00011' AS grc_dguid, + 'Atlantic' AS grc_en_name, + 'Atlantique' AS grc_fr_name, + ST_UNION(geom) AS geom + FROM + bronze.lpr_000a21a_e + WHERE + pruid IN ('10', '11', '12', '13') +), + +prairies AS ( + SELECT + '2021A00014' AS grc_dguid, + 'Prairies' AS grc_en_name, + 'Prairies' AS grc_fr_name, + ST_UNION(geom) AS geom + FROM + bronze.lpr_000a21a_e + WHERE + pruid IN ('48', '47', '46') +), + +the_rest AS ( + SELECT + CASE + WHEN pruid = '59' THEN '2021A00015' + WHEN pruid = '35' THEN '2021A00013' + WHEN pruid = '24' THEN '2021A00012' + END AS grc_dguid, + prename AS grc_en_name, + prfname AS grc_fr_name, + geom + FROM + bronze.lpr_000a21a_e + WHERE + pruid IN ('59', '35', '24') +), + +final AS ( + SELECT + country.country_dguid, + country.country_en_name, + country.country_fr_name, + country.country_en_abbreviation, + country.country_fr_abbreviation, + territories.* + FROM + territories, + silver.country_2021 AS country + UNION + SELECT + country.country_dguid, + country.country_en_name, + country.country_fr_name, + country.country_en_abbreviation, + country.country_fr_abbreviation, + prairies.* + FROM + prairies, + silver.country_2021 AS country + UNION + SELECT + country.country_dguid, + country.country_en_name, + country.country_fr_name, + country.country_en_abbreviation, + country.country_fr_abbreviation, + atlantic.* + FROM + atlantic, + silver.country_2021 AS 
country + UNION + SELECT + country.country_dguid, + country.country_en_name, + country.country_fr_name, + country.country_en_abbreviation, + country.country_fr_abbreviation, + the_rest.* + FROM + the_rest, + silver.country_2021 AS country +) + +SELECT * +FROM + final; + +-- Make geometries valid +UPDATE + silver.grc_2021 +SET + geom = ST_MAKEVALID(geom) +WHERE + ST_ISVALID(geom) = 'f'; + +-- Create spatial index +CREATE INDEX grc_2021_geom_idx ON silver.grc_2021 +USING gist (geom) WITH (fillfactor = 100); + +-- 2021 without geometries, and with pr_dguid; +DROP TABLE IF EXISTS silver.grc_pr_2021; +CREATE TABLE silver.grc_pr_2021 AS +WITH territories AS ( + SELECT + '2021A00016' AS grc_dguid, + 'Territories' AS grc_en_name, + 'Territoires' AS grc_fr_name, + dguid AS pr_dguid + FROM + bronze.lpr_000a21a_e + WHERE + pruid IN ('60', '61', '62') +), + +atlantic AS ( + SELECT + '2021A00011' AS grc_dguid, + 'Atlantic' AS grc_en_name, + 'Atlantique' AS grc_fr_name, + dguid AS pr_dguid + FROM + bronze.lpr_000a21a_e + WHERE + pruid IN ('10', '11', '12', '13') +), + +prairies AS ( + SELECT + '2021A00014' AS grc_dguid, + 'Prairies' AS grc_en_name, + 'Prairies' AS grc_fr_name, + dguid AS pr_dguid + FROM + bronze.lpr_000a21a_e + WHERE + pruid IN ('48', '47', '46') +), + +the_rest AS ( + SELECT + CASE + WHEN pruid = '59' THEN '2021A00015' + WHEN pruid = '35' THEN '2021A00013' + WHEN pruid = '24' THEN '2021A00012' + END AS grc_dguid, + prename AS grc_en_name, + prfname AS grc_fr_name, + dguid AS pr_dguid + FROM + bronze.lpr_000a21a_e + WHERE + pruid IN ('59', '35', '24') +), + +final AS ( + SELECT + country.country_dguid, + country.country_en_name, + country.country_fr_name, + country.country_en_abbreviation, + country.country_fr_abbreviation, + territories.* + FROM + territories, + silver.country_2021 AS country + UNION + SELECT + country.country_dguid, + country.country_en_name, + country.country_fr_name, + country.country_en_abbreviation, + country.country_fr_abbreviation, + 
prairies.*
+    FROM
+        prairies,
+        silver.country_2021 AS country
+    UNION
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        atlantic.*
+    FROM
+        atlantic,
+        silver.country_2021 AS country
+    UNION
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        the_rest.*
+    FROM
+        the_rest,
+        silver.country_2021 AS country
+)
+
+SELECT *
+FROM
+    final;
+
+/* 2016
+Definition here: https://web.archive.org/web/20240224030001/https://www12.statcan.gc.ca/census-recensement/2016/ref/dict/geo027a-eng.cfm
+*/
+
+-- With geometries;
+-- Builds one row per 2016 geographic region of Canada from the 2016
+-- provinces/territories layer. Territories, Atlantic and Prairies are
+-- dissolved from several pruid values with ST_UNION; pruid 59, 35 and 24
+-- each form a region of their own and keep the province names.
+DROP TABLE IF EXISTS silver.grc_2016;
+CREATE TABLE silver.grc_2016 AS
+WITH territories AS (
+    SELECT
+        '2016A00016' AS grc_dguid,
+        'Territories' AS grc_en_name,
+        'Territoires' AS grc_fr_name,
+        ST_UNION(geom) AS geom
+    FROM
+        bronze.lpr_000a16a_e
+    WHERE
+        pruid IN ('60', '61', '62')
+),
+
+atlantic AS (
+    SELECT
+        '2016A00011' AS grc_dguid,
+        'Atlantic' AS grc_en_name,
+        'Atlantique' AS grc_fr_name,
+        ST_UNION(geom) AS geom
+    FROM
+        bronze.lpr_000a16a_e
+    WHERE
+        pruid IN ('10', '11', '12', '13')
+),
+
+prairies AS (
+    SELECT
+        '2016A00014' AS grc_dguid,
+        'Prairies' AS grc_en_name,
+        'Prairies' AS grc_fr_name,
+        ST_UNION(geom) AS geom
+    FROM
+        bronze.lpr_000a16a_e
+    WHERE
+        pruid IN ('48', '47', '46')
+),
+
+-- Single-province regions: the region takes the province's own names.
+the_rest AS (
+    SELECT
+        CASE
+            WHEN pruid = '59' THEN '2016A00015'
+            WHEN pruid = '35' THEN '2016A00013'
+            WHEN pruid = '24' THEN '2016A00012'
+        END AS grc_dguid,
+        prename AS grc_en_name,
+        prfname AS grc_fr_name,
+        geom
+    FROM
+        bronze.lpr_000a16a_e
+    WHERE
+        pruid IN ('59', '35', '24')
+),
+
+-- Prefix every region with the country attributes.
+-- The branches carry different grc_dguid values, so UNION ALL is used: there
+-- is nothing to de-duplicate and it avoids comparing geometries.
+-- NOTE(review): assumes silver.country_2016 holds exactly one row; with more
+-- rows the CROSS JOIN repeats every region once per country row.
+final AS (
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        territories.*
+    FROM
+        territories
+    CROSS JOIN silver.country_2016 AS country
+    UNION ALL
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        prairies.*
+    FROM
+        prairies
+    CROSS JOIN silver.country_2016 AS country
+    UNION ALL
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        atlantic.*
+    FROM
+        atlantic
+    CROSS JOIN silver.country_2016 AS country
+    UNION ALL
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        the_rest.*
+    FROM
+        the_rest
+    CROSS JOIN silver.country_2016 AS country
+)
+
+SELECT *
+FROM
+    final;
+
+-- Make geometries valid
+UPDATE
+    silver.grc_2016
+SET
+    geom = ST_MAKEVALID(geom)
+WHERE
+    NOT ST_ISVALID(geom);
+
+-- Create spatial index
+CREATE INDEX grc_2016_geom_idx ON silver.grc_2016
+USING gist (geom) WITH (fillfactor = 100);
+
+-- 2016 without geometries, and with pr_dguid;
+-- Lookup table: one row per province/territory (pr_dguid) with the region
+-- it belongs to.
+DROP TABLE IF EXISTS silver.grc_pr_2016;
+
+CREATE TABLE silver.grc_pr_2016 AS
+WITH territories AS (
+    SELECT
+        '2016A00016' AS grc_dguid,
+        'Territories' AS grc_en_name,
+        'Territoires' AS grc_fr_name,
+        CONCAT('2016A0002', pruid) AS pr_dguid
+    FROM
+        bronze.lpr_000a16a_e
+    WHERE
+        pruid IN ('60', '61', '62')
+),
+
+atlantic AS (
+    SELECT
+        '2016A00011' AS grc_dguid,
+        'Atlantic' AS grc_en_name,
+        'Atlantique' AS grc_fr_name,
+        CONCAT('2016A0002', pruid) AS pr_dguid
+    FROM
+        bronze.lpr_000a16a_e
+    WHERE
+        pruid IN ('10', '11', '12', '13')
+),
+
+prairies AS (
+    SELECT
+        '2016A00014' AS grc_dguid,
+        'Prairies' AS grc_en_name,
+        'Prairies' AS grc_fr_name,
+        CONCAT('2016A0002', pruid) AS pr_dguid
+    FROM
+        bronze.lpr_000a16a_e
+    WHERE
+        pruid IN ('48', '47', '46')
+),
+
+the_rest AS (
+    SELECT
+        CASE
+            WHEN pruid = '59' THEN '2016A00015'
+            WHEN pruid = '35' THEN '2016A00013'
+            WHEN pruid = '24' THEN '2016A00012'
+        END AS grc_dguid,
+        prename AS grc_en_name,
+        prfname AS grc_fr_name,
+        CONCAT('2016A0002', pruid) AS pr_dguid
+    FROM
+        bronze.lpr_000a16a_e
+    WHERE
+        pruid IN ('59', '35', '24')
+),
+
+-- Branches cover disjoint pruid sets, hence UNION ALL.
+-- NOTE(review): assumes silver.country_2016 holds exactly one row.
+final AS (
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        territories.*
+    FROM
+        territories
+    CROSS JOIN silver.country_2016 AS country
+    UNION ALL
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        prairies.*
+    FROM
+        prairies
+    CROSS JOIN silver.country_2016 AS country
+    UNION ALL
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        atlantic.*
+    FROM
+        atlantic
+    CROSS JOIN silver.country_2016 AS country
+    UNION ALL
+    SELECT
+        country.country_dguid,
+        country.country_en_name,
+        country.country_fr_name,
+        country.country_en_abbreviation,
+        country.country_fr_abbreviation,
+        the_rest.*
+    FROM
+        the_rest
+    CROSS JOIN silver.country_2016 AS country
+)
+
+SELECT *
+FROM
+    final;
+
+/* 2011
+Definition here: https://web.archive.org/web/20240214024306/https://www12.statcan.gc.ca/census-recensement/2011/ref/dict/geo027a-eng.cfm
+*/
+
+/*
+2006
+No definition in 2006 dictionary https://www12.statcan.gc.ca/census-recensement/2006/ref/dict/azindex-eng.cfm
+*/
+
+/*
+2001
+
+Census Dictionary is available here https://www12.statcan.gc.ca/access_acces/archive.action-eng.cfm?/english/census01/products/reference/dict/appendices/92-378-XIE02002.pdf
+*/
diff --git a/boundaries/load.sh b/boundaries/load.sh
new file mode 100755
index 0000000..5a208f8
--- /dev/null
+++ b/boundaries/load.sh
@@ -0,0 +1,290 @@
+#!/bin/bash
+
+# Import one vector dataset into PostGIS with ogr2ogr.
+#   $1   path to the dataset; a path ending in ".zip" is read in place
+#        through GDAL's /vsizip/ virtual file system
+#   $2   destination table name (passed to -nln)
+#   $3+  extra ogr2ogr options, e.g. "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze";
+#        they are split on whitespace, so one option must not contain spaces
+# Reads POSTGRES_DB, POSTGRES_USER and POSTGRES_PASSWORD from the environment.
+# Output is reprojected to EPSG:4326, the geometry column is named "geom" and
+# an existing table with the same name is overwritten.
+# Returns the exit status of ogr2ogr.
+import_to_postgis() {
+    local filepath="$1"
+    local table_name="$2"
+    local -a extra_parameters
+    # Callers pass all extra options as a single string; split it into an
+    # array so it can be expanded quoted (no pathname expansion).
+    read -r -a extra_parameters <<< "${*:3}"
+
+    # Handle zip files using GDAL's virtual file system
+    if [[ "${filepath: -4}" == ".zip" ]]; then
+        filepath="/vsizip/${filepath}"
+    fi
+
+    echo "Importing ${filepath} into table ${table_name}"
+    ogr2ogr \
+        --config PG_USE_COPY YES \
+        -lco "OVERWRITE=YES" \
+        -f "PostgreSQL" \
+        "PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432" \
+        -lco GEOMETRY_NAME=geom \
+        -progress \
+        -gt 500000 \
+        -t_srs EPSG:4326 \
+        -nln "${table_name}" \
+        "${extra_parameters[@]}" \
+        "${filepath}"
+}
+
+# Same arguments as import_to_postgis, but runs the import with
+# PGCLIENTENCODING=LATIN-1. The variable is exported inside a subshell, so it
+# never leaks into later imports.
+import_latin1_to_postgis() {
+    (
+        export PGCLIENTENCODING=LATIN-1
+        import_to_postgis "$@"
+    )
+}
+
+# Define input folders
+INPUT_FOLDER="${DATA_FOLDER}/boundaries/input"
+EXTRACTED_FOLDER="${DATA_FOLDER}/boundaries/extracted"
+
+# Load the 2021 boundary files into schema bronze.
+import_data_2021() {
+    # Source: https://web.archive.org/web/20230307163203/https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/index2021-eng.cfm?year=21
+    # Provinces/territories
+    import_to_postgis "${INPUT_FOLDER}/2021/lpr_000a21a_e.zip" lpr_000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census divisions
+    import_to_postgis "${INPUT_FOLDER}/2021/lcd_000a21a_e.zip" lcd_000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Federal electoral districts (2013 Representation Order)
+    import_to_postgis "${INPUT_FOLDER}/2021/lfed000a21a_e.zip" lfed000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census subdivisions
+    import_to_postgis "${INPUT_FOLDER}/2021/lcsd000a21a_e.zip" lcsd000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Designated places
+    import_to_postgis "${INPUT_FOLDER}/2021/ldpl000a21a_e.zip" ldpl000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Forward sortation areas
+    unzip -n "${INPUT_FOLDER}/2021/lfsa000a21a_e.zip" -d "${EXTRACTED_FOLDER}/2021"
+    import_to_postgis "${EXTRACTED_FOLDER}/2021/lfsa000a21a_e/lfsa000a21a_e.shp" lfsa000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Economic regions
+    import_to_postgis "${INPUT_FOLDER}/2021/ler_000a21a_e.zip" ler_000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # There's issues with the Census agricultural regions encoding, statcan did not export the shapefile properly
+    # Census agricultural regions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2021/lcar000a21a_e.zip" lcar000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census consolidated subdivisions
+    import_to_postgis "${INPUT_FOLDER}/2021/lccs000a21a_e.zip" lccs000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census metropolitan areas and census agglomerations
+    import_to_postgis "${INPUT_FOLDER}/2021/lcma000a21a_e.zip" lcma000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census tracts
+    import_to_postgis "${INPUT_FOLDER}/2021/lct_000a21a_e.zip" lct_000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Population centres
+    import_to_postgis "${INPUT_FOLDER}/2021/lpc_000a21a_e.zip" lpc_000a21a_e "-lco SCHEMA=bronze"
+    # Dissemination areas
+    import_to_postgis "${INPUT_FOLDER}/2021/lda_000a21a_e.zip" lda_000a21a_e "-lco SCHEMA=bronze"
+    # Dissemination blocks
+    import_to_postgis "${INPUT_FOLDER}/2021/ldb_000a21a_e.zip" ldb_000a21a_e "-lco SCHEMA=bronze"
+    # Aggregate dissemination areas
+    import_to_postgis "${INPUT_FOLDER}/2021/lada000a21a_e.zip" lada000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Population ecumene
+    # NOTE(review): the table is named lecu000a21a_e while the shapefile is
+    # lecu000e21a_e (every other layer reuses the file name) - confirm whether
+    # this is a typo before renaming; consumers may rely on the current name.
+    unzip -n "${INPUT_FOLDER}/2021/lecu000e21a_e.zip" -d "${EXTRACTED_FOLDER}/2021"
+    import_to_postgis "${EXTRACTED_FOLDER}/2021/lecu000e21a_e.shp" lecu000a21a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Agricultural ecumene
+    unzip -n "${INPUT_FOLDER}/2021/leca000e21a_e.zip" -d "${EXTRACTED_FOLDER}/2021"
+    import_to_postgis "${EXTRACTED_FOLDER}/2021/leca000e21a_e.shp" leca000e21a_e "-lco SCHEMA=bronze"
+}
+
+# Load the 2016 boundary files into schema bronze.
+import_data_2016() {
+    # Source: https://web.archive.org/web/20230120140926/https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2016-eng.cfm
+    # Provinces/territories
+    import_to_postgis "${INPUT_FOLDER}/2016/lpr_000a16a_e.zip" lpr_000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Federal electoral districts (2013 Representation Order)
+    import_to_postgis "${INPUT_FOLDER}/2016/lfed000a16a_e.zip" lfed000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Economic regions
+    import_to_postgis "${INPUT_FOLDER}/2016/ler_000a16a_e.zip" ler_000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census divisions
+    import_to_postgis "${INPUT_FOLDER}/2016/lcd_000a16a_e.zip" lcd_000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Aggregate dissemination areas
+    import_to_postgis "${INPUT_FOLDER}/2016/lada000a16a_e.zip" lada000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census agricultural regions
+    import_to_postgis "${INPUT_FOLDER}/2016/lcar000a16a_e.zip" lcar000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census consolidated subdivisions
+    import_to_postgis "${INPUT_FOLDER}/2016/lccs000a16a_e.zip" lccs000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census subdivisions
+    import_to_postgis "${INPUT_FOLDER}/2016/lcsd000a16a_e.zip" lcsd000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census metropolitan areas and census agglomerations
+    import_to_postgis "${INPUT_FOLDER}/2016/lcma000a16a_e.zip" lcma000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census tracts
+    import_to_postgis "${INPUT_FOLDER}/2016/lct_000a16a_e.zip" lct_000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Dissemination areas
+    import_to_postgis "${INPUT_FOLDER}/2016/lda_000a16a_e.zip" lda_000a16a_e "-lco SCHEMA=bronze"
+    # Dissemination blocks
+    import_to_postgis "${INPUT_FOLDER}/2016/ldb_000a16a_e.zip" ldb_000a16a_e "-lco SCHEMA=bronze"
+    # Designated places
+    import_to_postgis "${INPUT_FOLDER}/2016/ldpl000a16a_e.zip" ldpl000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Population centres
+    import_to_postgis "${INPUT_FOLDER}/2016/lpc_000a16a_e.zip" lpc_000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Population ecumene
+    unzip -n "${INPUT_FOLDER}/2016/lecu000e16a_e.zip" -d "${EXTRACTED_FOLDER}/2016"
+    import_to_postgis "${EXTRACTED_FOLDER}/2016/lecu000e16a_e.shp" lecu000e16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Agricultural ecumene
+    unzip -n "${INPUT_FOLDER}/2016/geca000e16a_e.zip" -d "${EXTRACTED_FOLDER}/2016"
+    import_to_postgis "${EXTRACTED_FOLDER}/2016/lagecu000e16a_e.shp" lagecu000e16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Forward sortation areas
+    import_to_postgis "${INPUT_FOLDER}/2016/lfsa000a16a_e.zip" lfsa000a16a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+}
+
+# Load the 2011 boundary files into schema bronze. Every layer except the two
+# ecumene shapefiles is imported with a LATIN-1 client encoding.
+import_data_2011() {
+    # Source: https://web.archive.org/web/20230110163150/https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2011-eng.cfm
+    # Provinces/territories
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gpr_000a11a_e.zip" gpr_000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Federal electoral districts (2003 Representation Order)
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gfed000a11a_e.zip" gfed000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Economic regions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/ger_000a11a_e.zip" ger_000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census divisions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gcd_000a11a_e.zip" gcd_000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census agricultural regions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gcar000a11a_e.zip" gcar000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census consolidated subdivisions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gccs000a11a_e.zip" gccs000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census subdivisions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gcsd000a11a_e.zip" gcsd000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census metropolitan areas and census agglomerations
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gcma000a11a_e.zip" gcma000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census tracts
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gct_000a11a_e.zip" gct_000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Dissemination areas
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gda_000a11a_e.zip" gda_000a11a_e "-lco SCHEMA=bronze"
+    # Dissemination blocks
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gdb_000a11a_e.zip" gdb_000a11a_e "-lco SCHEMA=bronze"
+    # Designated places
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gdpl000a11a_e.zip" gdpl000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Population centres
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gpc_000a11a_e.zip" gpc_000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Population ecumene
+    unzip -n "${INPUT_FOLDER}/2011/gecu000e11a_e.zip" -d "${EXTRACTED_FOLDER}/2011"
+    import_to_postgis "${EXTRACTED_FOLDER}/2011/gecu000e11a_e.shp" gecu000e11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Agricultural ecumene
+    unzip -n "${INPUT_FOLDER}/2011/geca000e11a_e.zip" -d "${EXTRACTED_FOLDER}/2011"
+    import_to_postgis "${EXTRACTED_FOLDER}/2011/geca000e11a_e.shp" geca000e11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Forward sortation areas
+    import_latin1_to_postgis "${INPUT_FOLDER}/2011/gfsa000a11a_e.zip" gfsa000a11a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+}
+
+# Load the 2006 boundary files into schema bronze.
+import_data_2006() {
+    # Source: https://web.archive.org/web/20221218043125/https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2006-eng.cfm
+    # Provinces/territories
+    import_latin1_to_postgis "${INPUT_FOLDER}/2006/gpr_000a06a_e.zip" gpr_000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Federal electoral districts (2003 Representation Order)
+    import_latin1_to_postgis "${INPUT_FOLDER}/2006/gfed000a06a_e.zip" gfed000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Economic regions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2006/ger_000a06a_e.zip" ger_000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census divisions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2006/gcd_000a06a_e.zip" gcd_000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census agricultural regions
+    #import_to_postgis "${INPUT_FOLDER}/2006/gcar000a06a_e.zip" gcar000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census consolidated subdivisions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2006/gccs000a06a_e.zip" gccs000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census subdivisions
+    import_latin1_to_postgis "${INPUT_FOLDER}/2006/gcsd000a06a_e.zip" gcsd000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census metropolitan areas and census agglomerations
+    import_latin1_to_postgis "${INPUT_FOLDER}/2006/gcma000a06a_e.zip" gcma000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census tracts
+    import_to_postgis "${INPUT_FOLDER}/2006/gct_000a06a_e.zip" gct_000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Dissemination areas
+    import_to_postgis "${INPUT_FOLDER}/2006/gda_000a06a_e.zip" gda_000a06a_e "-lco SCHEMA=bronze"
+    # Dissemination blocks
+    import_to_postgis "${INPUT_FOLDER}/2006/gdb_000a06a_e.zip" gdb_000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Designated places
+    import_latin1_to_postgis "${INPUT_FOLDER}/2006/gdpl000a06a_e.zip" gdpl000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Urban areas
+    import_latin1_to_postgis "${INPUT_FOLDER}/2006/gua_000a06a_e.zip" gua_000a06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Population ecumene
+    unzip -n "${INPUT_FOLDER}/2006/gecu000e06a_e.zip" -d "${EXTRACTED_FOLDER}/2006"
+    import_to_postgis "${EXTRACTED_FOLDER}/2006/gecu000e06a_e.shp" gecu000e06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Agricultural ecumene
+    # NOTE(review): this is the only archive extracted with -o (overwrite)
+    # instead of -n - confirm that is intentional.
+    unzip -o "${INPUT_FOLDER}/2006/geca000e06a_e.zip" -d "${EXTRACTED_FOLDER}/2006"
+    import_to_postgis "${EXTRACTED_FOLDER}/2006/geca000e06a_e.shp" geca000e06a_e "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+}
+
+# Load the 2001 boundary files into *_tmp tables in schema bronze.
+# Not called at the bottom of this script (the call is commented out).
+import_data_2001() {
+    # Source: https://web.archive.org/web/20221218043135/https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2001-eng.cfm
+    # TODO: merge geometries by unique identifier for all but the block
+    # Provinces/Territories
+    unzip -n "${INPUT_FOLDER}/2001/gpr_000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gpr_000b02m_e/gpr_000b02m_e.MID" gpr_000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Federal Electoral Districts (1996 and 2003 Representation Orders)
+    unzip -n "${INPUT_FOLDER}/2001/gfed000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gfed000b02m_e/gfed000b02m_e.MID" gfed000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Economic Regions
+    unzip -n "${INPUT_FOLDER}/2001/ger_000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/ger_000b02m_e/ger_000b02m_e.MID" ger_000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census Divisions
+    unzip -n "${INPUT_FOLDER}/2001/gcd_000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gcd_000b02m_e/gcd_000b02m_e.MID" gcd_000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census Agricultural Regions
+    unzip -n "${INPUT_FOLDER}/2001/gcar000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gcar000b03m_e/gcar000b03m_e.mid" gcar000b03m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census Consolidated Subdivisions
+    unzip -n "${INPUT_FOLDER}/2001/gccs000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gccs000b02m_e/gccs000b02m_e.MID" gccs000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census Subdivisions
+    unzip -n "${INPUT_FOLDER}/2001/gcsd000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gcsd000b02m_e/gcsd000b02m_e.MID" gcsd000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census Metropolitan Areas and Census Agglomerations
+    unzip -n "${INPUT_FOLDER}/2001/gcma000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gcma000b02m_e/gcma000b02m_e.MID" gcma000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Census Tracts
+    unzip -n "${INPUT_FOLDER}/2001/gct_000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gct_000b02m_e/gct_000b02m_e.MID" gct_000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Dissemination Areas
+    unzip -n "${INPUT_FOLDER}/2001/gda_000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gda_000b02m_e/gda_000b02m_e.MID" gda_000b02m_e_tmp "-lco SCHEMA=bronze"
+    # Blocks (no need to merge geometries)
+    unzip -n "${INPUT_FOLDER}/2001/gdb_000a01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gbl_000d02m_e/gbl_000d02m_e.TAB" gbl_000d02m_e_tmp "-lco SCHEMA=bronze"
+    # Designated Places
+    unzip -n "${INPUT_FOLDER}/2001/gdpl000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gdpl000b02m_e/gdpl000b02m_e.MID" gdpl000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # Urban Areas
+    # NOTE(review): table name gua000b02m_e_tmp drops the underscore of the
+    # file name gua_000b02m_e - confirm before relying on either spelling.
+    unzip -n "${INPUT_FOLDER}/2001/gua_000b01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/gua_000b02m_e/gua_000b02m_e.MID" gua000b02m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+    # TODO
+    # Population Ecumene
+    # Agricultural Ecumene
+    unzip -n "${INPUT_FOLDER}/2001/geca000e01m_e.zip" -d "${EXTRACTED_FOLDER}/2001"
+    import_to_postgis "${EXTRACTED_FOLDER}/2001/geca000e03m_e/geca000e03m_e.mid" geca000e03m_e_tmp "-nlt PROMOTE_TO_MULTI -lco SCHEMA=bronze"
+}
+
+# Execute all import functions
+import_data_2021
+import_data_2016
+import_data_2011
+import_data_2006
+#import_data_2001
diff --git a/boundaries/organize.sql b/boundaries/organize.sql
new file mode 100644
index 0000000..99062c4
--- /dev/null
+++ b/boundaries/organize.sql
@@ -0,0 +1,608 @@
+-- Each section below builds a statcan_<level>_<year> table with a dguid
+-- column from a raw boundary table, indexes its geometry and then drops the
+-- raw table. The 2001 sources (*_tmp) are dissolved per identifier with
+-- st_union.
+-- NOTE(review): boundaries/load.sh imports into schema "bronze" under the
+-- bare file name, while this script reads unqualified "statcan_"-prefixed
+-- tables - confirm which loader feeds this script.
+
+/*
+PR
+*/
+
+/* 2001 Provinces and Territories */
+drop table if exists statcan_pr_2001;
+create table statcan_pr_2001 as
+    -- group by already yields one row per province, so no distinct is needed
+    select pruid, prename, prfname, preabbr, prfabbr,
+        concat('2001A0002', pruid) as dguid,
+        st_union(geom) as geom
+    from statcan_gpr_000b02m_e_tmp
+    group by pruid, prename, prfname, preabbr, prfabbr;
+
+create index statcan_pr_2001_geom_idx on statcan_pr_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gpr_000b02m_e_tmp;
+
+/* 2006 Provinces and Territories */
+drop table if exists statcan_pr_2006;
+create table statcan_pr_2006 as
+    select pruid, prename, prfname, preabbr, prfabbr,
+        concat('2006A0002', pruid) as dguid,
+        geom
+    from statcan_gpr_000a06a_e;
+
+create index statcan_pr_2006_geom_idx on statcan_pr_2006 using GIST(geom) with (FILLFACTOR=100);
+-- drop the 2006 source (this used to re-drop the 2001 temporary table)
+drop table if exists statcan_gpr_000a06a_e;
+
+/* 2011 Provinces and Territories */
+drop table if exists statcan_pr_2011;
+create table statcan_pr_2011 as
+    select pruid, prename, prfname, preabbr, prfabbr,
+        concat('2011A0002', pruid) as dguid,
+        geom
+    from statcan_gpr_000a11a_e;
+
+create index statcan_pr_2011_geom_idx on statcan_pr_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gpr_000a11a_e;
+
+/* 2016 Provinces and Territories */
+drop table if exists statcan_pr_2016;
+create table statcan_pr_2016 as
+    select pruid, prename, prfname, preabbr, prfabbr,
+        concat('2016A0002', pruid) as dguid,
+        geom
+    from statcan_lpr_000a16a_e;
+
+create index statcan_pr_2016_geom_idx on statcan_pr_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lpr_000a16a_e;
+
+/*
+CD
+*/
+--- 2001 Census Divisions;
+drop table if exists statcan_cd_2001;
+create table statcan_cd_2001 as
+    select pruid, cduid, cdname, cdtype,
+        concat('2001A0003', cduid) as dguid,
+        st_union(geom) as geom
+    from statcan_gcd_000b02m_e_tmp
+    group by pruid, cduid, cdname, cdtype;
+
+create index statcan_cd_2001_geom_idx on statcan_cd_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcd_000b02m_e_tmp;
+
+--- 2006 Census Divisions;
+drop table if exists statcan_cd_2006;
+create table statcan_cd_2006 as
+    select pruid, cduid, cdname, cdtype,
+        concat('2006A0003', cduid) as dguid,
+        geom
+    from statcan_gcd_000a06a_e;
+
+create index statcan_cd_2006_geom_idx on statcan_cd_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcd_000a06a_e;
+
+--- 2011 Census Divisions;
+drop table if exists statcan_cd_2011;
+create table statcan_cd_2011 as
+    select pruid, cduid, cdname, cdtype, concat('2011A0003', cduid) as dguid,
+        geom
+    from statcan_gcd_000a11a_e;
+
+create index statcan_cd_2011_geom_idx on statcan_cd_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcd_000a11a_e;
+
+--- 2016 Census Divisions;
+drop table if exists statcan_cd_2016;
+create table statcan_cd_2016 as
+    select pruid, cduid, cdname, cdtype,
+        concat('2016A0003', cduid) as dguid,
+        geom
+    from statcan_lcd_000a16a_e;
+
+create index statcan_cd_2016_geom_idx on statcan_cd_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lcd_000a16a_e;
+
+/*
+ER
+*/
+--- 2001 Economic Regions;
+drop table if exists statcan_er_2001;
+create table statcan_er_2001 as
+    select pruid, eruid, ername,
+        concat('2001S0500', eruid) as dguid,
+        st_union(geom) as geom
+    from statcan_ger_000b02m_e_tmp
+    group by pruid, eruid, ername;
+
+create index statcan_er_2001_geom_idx on statcan_er_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_ger_000b02m_e_tmp;
+
+--- 2006 Economic Regions;
+drop table if exists statcan_er_2006;
+create table statcan_er_2006 as
+    select distinct pruid, eruid, ername,
+        concat('2006S0500', eruid) as dguid, geom
+    from statcan_ger_000a06a_e;
+
+create index statcan_er_2006_geom_idx on statcan_er_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_ger_000a06a_e;
+
+--- 2011 Economic Regions;
+drop table if exists statcan_er_2011;
+create table statcan_er_2011 as
+    select distinct pruid, eruid, ername,
+        concat('2011S0500', eruid) as dguid, geom
+    from statcan_ger_000a11a_e;
+
+create index statcan_er_2011_geom_idx on statcan_er_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_ger_000a11a_e;
+
+--- 2016 Economic Regions;
+drop table if exists statcan_er_2016;
+create table statcan_er_2016 as
+    select distinct pruid, eruid, ername,
+        concat('2016S0500', eruid) as dguid, geom
+    from statcan_ler_000a16a_e;
+
+create index statcan_er_2016_geom_idx on statcan_er_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_ler_000a16a_e;
+
+/*
+CCS
+*/
+
+--- 2001 Census Consolidated Subdivision;
+drop table if exists statcan_ccs_2001;
+create table statcan_ccs_2001 as
+    select pruid, ccsuid, ccsname,
+        concat('2001S0502', ccsuid) as dguid,
+        st_union(geom) as geom
+    from statcan_gccs000b02m_e_tmp
+    group by pruid, ccsuid, ccsname;
+
+create index statcan_ccs_2001_geom_idx on statcan_ccs_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gccs000b02m_e_tmp;
+
+--- 2006 Census Consolidated Subdivision;
+drop table if exists statcan_ccs_2006;
+create table statcan_ccs_2006 as
+    select distinct pruid, ccsuid, ccsname,
+        concat('2006S0502', ccsuid) as dguid, geom
+    from statcan_gccs000a06a_e;
+
+create index statcan_ccs_2006_geom_idx on statcan_ccs_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gccs000a06a_e;
+
+--- 2011 Census Consolidated Subdivision;
+drop table if exists statcan_ccs_2011;
+create table statcan_ccs_2011 as
+    select distinct pruid, cduid, ccsuid, ccsname,
+        concat('2011S0502', ccsuid) as dguid, geom
+    from statcan_gccs000a11a_e;
+
+create index statcan_ccs_2011_geom_idx on statcan_ccs_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gccs000a11a_e;
+
+--- 2016 Census Consolidated Subdivision;
+-- Reads the CCS layer (lccs). This section used to read and then drop the
+-- CSD layer (lcsd), which the 2016 Census Subdivisions section still needs.
+drop table if exists statcan_ccs_2016;
+create table statcan_ccs_2016 as
+    select distinct pruid, cduid, ccsuid, ccsname,
+        concat('2016S0502', ccsuid) as dguid, geom
+    from statcan_lccs000a16a_e;
+
+create index statcan_ccs_2016_geom_idx on statcan_ccs_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lccs000a16a_e;
+
+/*
+CSD
+*/
+
+/* 2001 Census Subdivisions */
+drop table if exists statcan_csd_2001;
+create table statcan_csd_2001 as
+    select pruid, csduid, csdname, csdtype, eruid,
+        concat('2001A0005', csduid) as dguid,
+        st_union(geom) as geom
+    from statcan_gcsd000b02m_e_tmp
+    group by pruid, csduid, csdname, csdtype, eruid;
+
+create index statcan_csd_2001_geom_idx on statcan_csd_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcsd000b02m_e_tmp;
+
+/* 2006 Census Subdivisions */
+drop table if exists statcan_csd_2006;
+create table statcan_csd_2006 as
+    select pruid, csduid, csdname, csdtype, concat(pruid, cmauid) as cmapuid,
+        concat('2006A0005', csduid) as dguid,
+        geom
+    from statcan_gcsd000a06a_e;
+
+create index statcan_csd_2006_geom_idx on statcan_csd_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcsd000a06a_e;
+
+/* 2011 Census Subdivisions */
+drop table if exists statcan_csd_2011;
+create table statcan_csd_2011 as
+    select pruid, cduid, ccsuid, csduid, csdname, csdtype, eruid, cmapuid,
+        concat('2011A0005', csduid) as dguid,
+        geom
+    from statcan_gcsd000a11a_e;
+
+create index statcan_csd_2011_geom_idx on statcan_csd_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcsd000a11a_e;
+
+/* 2016 Census Subdivisions */
+drop table if exists statcan_csd_2016;
+create table statcan_csd_2016 as
+    select pruid, cduid, ccsuid, csduid, csdname, csdtype, eruid, cmapuid,
+        concat('2016A0005', csduid) as dguid,
+        geom
+    from statcan_lcsd000a16a_e;
+
+create index statcan_csd_2016_geom_idx on statcan_csd_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lcsd000a16a_e;
+
+/*
+FED
+*/
+
+--- 2003 Federal Electoral Districts;
+drop table if exists statcan_fed_2003;
+create table statcan_fed_2003 as
+    select distinct pruid, feduid, fedname, fedename, fedfname,
+        concat('2003A0004', feduid) as dguid, geom
+    from statcan_gfed000a11a_e;
+
+create index statcan_fed_2003_geom_idx on statcan_fed_2003 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gfed000a11a_e;
+
+--- 2013 Federal Electoral Districts;
+drop table if exists statcan_fed_2013;
+create table statcan_fed_2013 as
+    select distinct pruid, feduid, fedname, fedename, fedfname,
+        concat('2013A0004', feduid) as dguid, geom
+    from statcan_lfed000a21a_e;
+
+create index statcan_fed_2013_geom_idx on statcan_fed_2013 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lfed000a21a_e;
+drop table if exists statcan_lfed000a16a_e;
+
+/*
+Census Agricultural Regions
+*/
+
+--- 2001 Census Agricultural Regions;
+-- NOTE(review): unlike the other 2001 layers in this file, this one is not
+-- dissolved with st_union / group by - confirm whether that is intentional.
+drop table if exists statcan_car_2001;
+create table statcan_car_2001 as
+    select distinct pruid, caruid, carname, aguid, water,
+        concat('2001S0501', caruid) as dguid, geom
+    from statcan_gcar000b03m_e_tmp;
+
+create index statcan_car_2001_geom_idx on statcan_car_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcar000b03m_e_tmp;
+
+--- 2006 Census Agricultural Regions;
+drop table if exists statcan_car_2006;
+create table statcan_car_2006 as
+    select distinct pruid, caruid, carname, aguid,
+        concat('2006S0501', caruid) as dguid, geom
+    from statcan_gcar000a06a_e;
+
+create index statcan_car_2006_geom_idx on statcan_car_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcar000a06a_e;
+
+--- 2011 Census Agricultural Regions;
+drop table if exists statcan_car_2011;
+create table statcan_car_2011 as
+    select distinct pruid, caruid, carname, aguid,
+        concat('2011S0501', caruid) as dguid, geom
+    from statcan_gcar000a11a_e;
+
+create index statcan_car_2011_geom_idx on statcan_car_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcar000a11a_e;
+
+--- 2016 Census Agricultural Regions;
+drop table if exists statcan_car_2016;
+create table statcan_car_2016 as
+    select distinct pruid, caruid, carename, carfname,
+        concat('2016S0501', caruid) as dguid, geom
+    from statcan_lcar000a16a_e;
+
+create index statcan_car_2016_geom_idx on statcan_car_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lcar000a16a_e;
+
+/*
+Designated Places
+*/
+
+--- 2001 Designated Places;
+drop table if exists statcan_dpl_2001;
+create table statcan_dpl_2001 as
+    -- geometries are repaired with st_makevalid before being dissolved;
+    -- group by already yields one row per place, so no distinct is needed
+    select pruid, dpluid, dplname, dpltype,
+        concat('2001A0006', dpluid) as dguid,
+        st_union(st_makevalid(geom)) as geom
+    from statcan_gdpl000b02m_e_tmp
+    group by pruid, dpluid, dplname, dpltype;
+
+create index statcan_dpl_2001_geom_idx on statcan_dpl_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gdpl000b02m_e_tmp;
+
+--- 2006 Designated Places;
+drop table if exists statcan_dpl_2006;
+create table statcan_dpl_2006 as
+    select distinct pruid, csduid, dpluid, dplname, dpltype,
+        concat('2006A0006', dpluid) as dguid, geom
+    from statcan_gdpl000a06a_e;
+
+create index statcan_dpl_2006_geom_idx on statcan_dpl_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gdpl000a06a_e;
+
+--- 2011 Designated Places;
+drop table if exists statcan_dpl_2011;
+create table statcan_dpl_2011 as
+    select distinct pruid, dpluid, dplname, dpltype,
+        concat('2011A0006', dpluid) as dguid, geom
+    from statcan_gdpl000a11a_e;
+
+create index statcan_dpl_2011_geom_idx on statcan_dpl_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gdpl000a11a_e;
+
+--- 2016 Designated Places;
+drop table if exists statcan_dpl_2016;
+create table statcan_dpl_2016 as
+    select distinct pruid, dpluid, dplname, dpltype,
+        concat('2016A0006', dpluid) as dguid, geom
+    from statcan_ldpl000a16a_e;
+
+create index statcan_dpl_2016_geom_idx on statcan_dpl_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_ldpl000a16a_e;
+
+/*
+FSA
+*/
+
+--- 2011 FSA;
+drop table if exists statcan_fsa_2011;
+create table statcan_fsa_2011 as
+    select distinct pruid, cfsauid, concat('2011A0011', cfsauid) as dguid, geom
+    from statcan_gfsa000a11a_e;
+
+create index statcan_fsa_2011_geom_idx on statcan_fsa_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gfsa000a11a_e;
+
+--- 2016 FSA;
+drop table if exists statcan_fsa_2016;
+create table statcan_fsa_2016 as
+    select distinct pruid, cfsauid, concat('2016A0011', cfsauid) as dguid, geom
+    from statcan_lfsa000a16a_e;
+
+create index statcan_fsa_2016_geom_idx on statcan_fsa_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lfsa000a16a_e;
+
+/*
+CMA
+*/
+
+--- 2001 CMA;
+drop table if exists statcan_cma_2001;
+create table statcan_cma_2001 as
+    -- cmapuid is derived as pruid followed by cmauid
+    select cmauid, concat(pruid, cmauid) as cmapuid, cmaname, cmatype, pruid,
+        st_union(geom) as geom
+    from statcan_gcma000b02m_e_tmp
+    group by cmaname, cmauid, cmatype, pruid;
+
+create index statcan_cma_2001_geom_idx on statcan_cma_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcma000b02m_e_tmp;
+
+--- 2006 CMA;
+-- cmapuid is built as concat(pruid, cmauid), the same derivation used for the
+-- 2006 CSD, CT and DA tables and for the 2001 CMA table, so the values can be
+-- joined. Previously the bare cmauid was exposed under the name cmapuid.
+-- NOTE(review): confirm no consumer relied on cmapuid holding the bare cmauid.
+drop table if exists statcan_cma_2006;
+create table statcan_cma_2006 as
+    select cmauid, concat(pruid, cmauid) as cmapuid, cmaname, cmatype, pruid, geom
+    from statcan_gcma000a06a_e;
+
+create index statcan_cma_2006_geom_idx on statcan_cma_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcma000a06a_e;
+
+--- 2011 CMA;
+drop table if exists statcan_cma_2011;
+create table statcan_cma_2011 as
+    select cmauid, cmapuid, cmaname, cmatype, pruid, geom
+    from statcan_gcma000a11a_e;
+
+create index statcan_cma_2011_geom_idx on statcan_cma_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gcma000a11a_e;
+
+--- 2016 CMA;
+drop table if exists statcan_cma_2016;
+create table statcan_cma_2016 as
+    select cmauid, cmapuid, cmaname, cmatype, pruid, geom
+    from statcan_lcma000a16a_e;
+
+create index statcan_cma_2016_geom_idx on statcan_cma_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lcma000a16a_e;
+
+/*
+Population Centres
+*/
+
+--- 2011 Population Centres;
+-- The source table is kept here: the "Population Centres Part" section below
+-- reads it again and drops it afterwards.
+drop table if exists statcan_pc_2011;
+create table statcan_pc_2011 as
+    select distinct pruid, cmapuid, pcuid, pcpuid, pcname, pctype,
+        concat('2011S0510', pcuid) as dguid, geom
+    from statcan_gpc_000a11a_e;
+
+create index statcan_pc_2011_geom_idx on statcan_pc_2011 using GIST(geom) with (FILLFACTOR=100);
+
+--- 2016 Population Centres;
+drop table if exists statcan_pc_2016;
+create table statcan_pc_2016 as
+    select distinct pruid, cmapuid, pcuid, pcpuid, pcname, pctype,
+        concat('2016S0510', pcuid) as dguid, geom
+    from statcan_lpc_000a16a_e;
+
+create index statcan_pc_2016_geom_idx on statcan_pc_2016 using GIST(geom) with (FILLFACTOR=100);
+
+/*
+Population Centres Part
+*/
+
+--- 2011 Population Centres Part;
+-- Same columns as statcan_pc_2011, but dguid is built from pcpuid.
+drop table if exists statcan_pc_part_2011;
+create table statcan_pc_part_2011 as
+    select distinct pruid, cmapuid, pcuid, pcpuid, pcname, pctype,
+        concat('2011S0511', pcpuid) as dguid, geom
+    from statcan_gpc_000a11a_e;
+
+create index statcan_pc_part_2011_geom_idx on statcan_pc_part_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gpc_000a11a_e;
+
+--- 2016 Population Centres Part;
+drop table if exists statcan_pc_part_2016;
+create table statcan_pc_part_2016 as
+    select distinct pruid, cmapuid, pcuid, pcpuid, pcname, pctype,
+        concat('2016S0511', pcpuid) as dguid, geom
+    from statcan_lpc_000a16a_e;
+
+create index statcan_pc_part_2016_geom_idx on statcan_pc_part_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lpc_000a16a_e;
+
+/*
+Census Tracts
+*/
+--- 2001 Census Tracts;
+drop table if exists statcan_ct_2001;
+create table statcan_ct_2001 as
+    select pruid, ctuid, ctname, concat(pruid, cmauid) as cmapuid,
+        concat('2001S0507', ctuid) as dguid, st_union(geom) as geom
+    from statcan_gct_000b02m_e_tmp
+    group by pruid, ctuid, ctname, cmauid;
+
+create index statcan_ct_2001_geom_idx on statcan_ct_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gct_000b02m_e_tmp;
+
+--- 2006 Census Tracts;
+drop table if exists statcan_ct_2006;
+create table statcan_ct_2006 as
+    select distinct pruid, ctuid, concat(pruid, cmauid) as cmapuid,
+        concat('2006S0507', ctuid) as dguid, geom
+    from statcan_gct_000a06a_e;
+
+create index statcan_ct_2006_geom_idx on statcan_ct_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gct_000a06a_e;
+
+--- 2011 Census Tracts;
+drop table if exists statcan_ct_2011;
+create table statcan_ct_2011 as
+    select distinct pruid, ctuid, ctname, cmapuid,
+        concat('2011S0507', ctuid) as dguid, geom
+    from statcan_gct_000a11a_e;
+
+create index statcan_ct_2011_geom_idx on statcan_ct_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gct_000a11a_e;
+
+--- 2016 Census Tracts;
+drop table if exists statcan_ct_2016;
+create table statcan_ct_2016 as
+    select distinct pruid, ctuid, ctname, cmapuid,
+        concat('2016S0507', ctuid) as dguid, geom
+    from statcan_lct_000a16a_e;
+
+create index statcan_ct_2016_geom_idx on statcan_ct_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lct_000a16a_e;
+
+/*
+DA
+*/
+--- 2001 Dissemination Areas;
+drop table if exists statcan_da_2001;
+create table statcan_da_2001 as
+    select pruid, csduid, concat(pruid, cmauid) as cmapuid, dauid,
+        concat('2001S0512', dauid) as dguid,
+        st_union(geom) as geom
+    from statcan_gda_000b02m_e_tmp
+    group by pruid, csduid, cmauid, dauid;
+
+create index statcan_da_2001_geom_idx on statcan_da_2001 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gda_000b02m_e_tmp;
+
+--- 2006 Dissemination Area;
+drop table if exists statcan_da_2006;
+create table statcan_da_2006 as
+    select distinct pruid, cduid, ccsuid, csduid, eruid, concat(pruid, cmauid) as cmapuid, ctuid, dauid,
+        concat('2006S0512', dauid) as dguid, geom
+    from statcan_gda_000a06a_e;
+
+create index statcan_da_2006_geom_idx on statcan_da_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gda_000a06a_e;
+
+--- 2011 Dissemination Area;
+drop table if exists statcan_da_2011;
+create table statcan_da_2011 as
+    select distinct pruid, cduid, ccsuid, csduid, eruid, cmapuid, ctuid, dauid,
+        concat('2011S0512', dauid) as dguid, geom
+    from statcan_gda_000a11a_e;
+
+create index statcan_da_2011_geom_idx on statcan_da_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_gda_000a11a_e;
+
+--- 2016 Dissemination Area;
+drop table if exists statcan_da_2016;
+create table
statcan_da_2016 as + select distinct pruid, cduid, ccsuid, csduid, eruid, cmapuid, ctuid, adauid, dauid, + concat('2016S0512', dauid) as dguid, geom + from statcan_lda_000a16a_e; + +create index statcan_da_2016_geom_idx on statcan_da_2016 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_lda_000a16a_e; + +/* +DB +*/ + +--- 2001 Dissemination Blocks; +drop table if exists statcan_db_2001; +create table statcan_db_2001 as + select distinct pruid, csduid, concat(pruid, cmauid) as cmapuid, ctname, dauid, blockuid as dbuid, + concat('2001S0513', blockuid) as dguid, geom + from statcan_gbl_000d02m_e_tmp; + +create index statcan_db_2001_geom_idx on statcan_db_2001 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_gbl_000d02m_e_tmp; + +--- 2006 Dissemination Blocks; +drop table if exists statcan_db_2006; +create table statcan_db_2006 as + select distinct pruid, cduid, ccsuid, csduid, eruid, concat(pruid, cmauid) as cmapuid, ctuid, dauid, dbuid, + concat('2006S0513', dbuid) as dguid, geom + from statcan_gdb_000a06a_e; + +create index statcan_db_2006_geom_idx on statcan_db_2006 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_gdb_000a06a_e; + +--- 2011 Dissemination Blocks; +drop table if exists statcan_db_2011; +create table statcan_db_2011 as + select distinct pruid, cduid, ccsuid, csduid, eruid, feduid, + concat(pruid, cmauid) as cmapuid, ctuid, dauid, dbuid, + concat('2011S0513', dbuid) as dguid, geom + from statcan_gdb_000a11a_e; + +create index statcan_db_2011_geom_idx on statcan_db_2011 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_gdb_000a11a_e; + +--- 2016 Dissemination Blocks; +drop table if exists statcan_db_2016; +create table statcan_db_2016 as + select distinct pruid, cduid, ccsuid, csduid, eruid, feduid, cmapuid, ctuid, adauid, dauid, dbuid, + concat('2016S0513', dbuid) as dguid, geom + from statcan_ldb_000a16a_e; + +create index statcan_db_2016_geom_idx on statcan_db_2016 
using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_ldb_000a16a_e; + +/* +ADA +*/ + +-- 2016 Aggregate Dissemination Areas; +drop table if exists statcan_ada_2016; +create table statcan_ada_2016 as + select distinct pruid, cduid, adauid, + concat('2016S0516', adauid) as dguid, geom + from statcan_lada000a16a_e; + +create index statcan_ada_2016_geom_idx on statcan_ada_2016 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_lada000a16a_e; \ No newline at end of file diff --git a/boundaries/population_centres.sql b/boundaries/population_centres.sql new file mode 100644 index 0000000..ae42742 --- /dev/null +++ b/boundaries/population_centres.sql @@ -0,0 +1,52 @@ +/* +Population Centres +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20250324222733/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/definition-eng.cfm?ID=geo049a +*/ + +DROP TABLE IF EXISTS silver.pop_ctr_2021; +CREATE TABLE silver.pop_ctr_2021 AS +SELECT DISTINCT + pr.country_dguid, + pr.country_en_name, + pr.country_fr_name, + pr.country_en_abbreviation, + pr.country_fr_abbreviation, + pr.grc_dguid, + pr.grc_en_name, + pr.grc_fr_name, + pr.pr_dguid, + pr.pr_en_name, + pr.pr_fr_name, + pr.pr_en_abbreviation, + pr.pr_fr_abbreviation, + pr.pr_iso_code, + gaf.pop_ctr_dguid, + gaf.pop_ctr_p_dguid, + pop_ctr.pcname AS pop_ctr_name, + pop_ctr.pctype AS pop_ctr_type, + pop_ctr.pcclass AS pop_ctr_class, + pop_ctr.geom +FROM silver.gaf_2021 AS gaf +LEFT JOIN bronze.lpc_000a21a_e AS pop_ctr + ON + concat(gaf.pop_ctr_dguid, gaf.pop_ctr_p_dguid) + = concat(pop_ctr.dguid, pop_ctr.dguidp) +LEFT JOIN silver.pr_2021 AS pr + ON gaf.pr_dguid = pr.pr_dguid +WHERE gaf.pop_ctr_dguid IS NOT null OR gaf.pop_ctr_p_dguid IS NOT null; + +-- Make geometries valid +UPDATE + silver.pop_ctr_2021 +SET + geom = st_makevalid(geom) +WHERE + st_isvalid(geom) = 'f'; + +-- Create spatial index +CREATE INDEX pop_ctr_2021_geom_idx ON silver.pop_ctr_2021 USING gist ( + geom +) WITH 
(fillfactor = 100); diff --git a/boundaries/process.sh b/boundaries/process.sh new file mode 100755 index 0000000..6a07f7f --- /dev/null +++ b/boundaries/process.sh @@ -0,0 +1,18 @@ +# Standardizes field names, builds up hierarchy for datasets, create Canada and Geographic Regions of Canada +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/country.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/geographic_regions_of_canada.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/provinces_and_territories.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/census_divisions.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/economic_regions.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/census_agricultural_regions.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/forward_sortation_areas.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/federal_electoral_districts.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/census_consolidated_subdivisions.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/census_metropolitan_areas.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/census_subdivisions.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/population_centres.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/designated_places.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/census_tracts.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/aggregate_dissemination_areas.sql +psql
"postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/dissemination_areas.sql +psql "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f boundaries/dissemination_blocks.sql \ No newline at end of file diff --git a/boundaries/provinces_and_territories.sql b/boundaries/provinces_and_territories.sql new file mode 100644 index 0000000..fbe3d02 --- /dev/null +++ b/boundaries/provinces_and_territories.sql @@ -0,0 +1,113 @@ +/* +Provinces and territories +*/ + +/* 2021 +Definition here: https://web.archive.org/web/20240402175445/https://www150.statcan.gc.ca/n1/pub/92-195-x/2021001/geo/prov/prov-eng.htm +*/ + +DROP TABLE IF EXISTS silver.pr_2021; +CREATE TABLE silver.pr_2021 AS +SELECT DISTINCT + grc.country_dguid, + grc.country_en_name, + grc.country_fr_name, + grc.country_en_abbreviation, + grc.country_fr_abbreviation, + grc.grc_dguid, + grc.grc_en_name, + grc.grc_fr_name, + pr.dguid AS pr_dguid, + pr.prename AS pr_en_name, + pr.prfname AS pr_fr_name, + pr.preabbr AS pr_en_abbreviation, + pr.prfabbr AS pr_fr_abbreviation, + CASE + WHEN pr.pruid = '10' THEN 'NL' + WHEN pr.pruid = '11' THEN 'PE' + WHEN pr.pruid = '12' THEN 'NS' + WHEN pr.pruid = '13' THEN 'NB' + WHEN pr.pruid = '24' THEN 'QC' + WHEN pr.pruid = '35' THEN 'ON' + WHEN pr.pruid = '46' THEN 'MB' + WHEN pr.pruid = '47' THEN 'SK' + WHEN pr.pruid = '48' THEN 'AB' + WHEN pr.pruid = '59' THEN 'BC' + WHEN pr.pruid = '60' THEN 'YT' + WHEN pr.pruid = '61' THEN 'NT' + WHEN pr.pruid = '62' THEN 'NU' + END AS pr_iso_code, + pr.geom +FROM + bronze.lpr_000a21a_e AS pr, + silver.grc_pr_2021 AS grc +WHERE + grc.pr_dguid = pr.dguid; + +-- Make geometries valid +UPDATE + silver.pr_2021 +SET + geom = ST_MAKEVALID(geom) +WHERE + ST_ISVALID(geom) = 'f'; + +-- Create spatial index (target is schema-qualified: the table was created as silver.pr_2021 above) +CREATE INDEX pr_2021_geom_idx ON silver.pr_2021 USING gist (geom) WITH ( + fillfactor = 100 +); + +/* 2016 +Definition here:
https://web.archive.org/web/20241104061057/https://www12.statcan.gc.ca/census-recensement/2016/ref/dict/geo038-eng.cfm +*/ + +DROP TABLE IF EXISTS silver.pr_2016; +CREATE TABLE silver.pr_2016 AS +SELECT DISTINCT + grc.country_dguid, + grc.country_en_name, + grc.country_fr_name, + grc.country_en_abbreviation, + grc.country_fr_abbreviation, + grc.grc_dguid, + grc.grc_en_name, + grc.grc_fr_name, + CONCAT('2016A0002', pr.pruid) AS pr_dguid, + pr.prename AS pr_en_name, + pr.prfname AS pr_fr_name, + pr.preabbr AS pr_en_abbreviation, + pr.prfabbr AS pr_fr_abbreviation, + CASE + WHEN pr.pruid = '10' THEN 'NL' + WHEN pr.pruid = '11' THEN 'PE' + WHEN pr.pruid = '12' THEN 'NS' + WHEN pr.pruid = '13' THEN 'NB' + WHEN pr.pruid = '24' THEN 'QC' + WHEN pr.pruid = '35' THEN 'ON' + WHEN pr.pruid = '46' THEN 'MB' + WHEN pr.pruid = '47' THEN 'SK' + WHEN pr.pruid = '48' THEN 'AB' + WHEN pr.pruid = '59' THEN 'BC' + WHEN pr.pruid = '60' THEN 'YT' + WHEN pr.pruid = '61' THEN 'NT' + WHEN pr.pruid = '62' THEN 'NU' + END AS pr_iso_code, + pr.geom +FROM + bronze.lpr_000a16a_e AS pr, + silver.grc_pr_2016 AS grc +WHERE + grc.pr_dguid = CONCAT('2016A0002', pr.pruid); + +-- Make geometries valid +UPDATE + silver.pr_2016 +SET + geom = ST_MAKEVALID(geom) +WHERE + ST_ISVALID(geom) = 'f'; + +-- Create spatial index (target is schema-qualified: the table was created as silver.pr_2016 above) +CREATE INDEX pr_2016_geom_idx ON silver.pr_2016 USING gist (geom) WITH ( + fillfactor = 100 +); diff --git a/census_of_agriculture/README.md b/census_of_agriculture/README.md new file mode 100644 index 0000000..b6f8f21 --- /dev/null +++ b/census_of_agriculture/README.md @@ -0,0 +1 @@ +- See email that I sent to Statistics Canada titled Reporting Mistakes in Census of Agriculture: Data Linked to Geographic Boundaries for mistakes in the Census of Agriculture data \ No newline at end of file diff --git a/census_of_agriculture/census_of_agriculture_2016.txt b/census_of_agriculture/census_of_agriculture_2016.txt new file mode 100644 index 0000000..7855ddc --- /dev/null +++
b/census_of_agriculture/census_of_agriculture_2016.txt @@ -0,0 +1,9 @@ +# https://open.canada.ca/data/en/dataset/b944bd53-49e5-4a80-83e5-1048d3abf38d + +# Download variables descriptions +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2016/CEAG16_VariablesDescriptions_REAG16_EN_FR.xlsx +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2016/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2016/CEAG16_Crops_Cultures_REAG16.gdb.zip +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2016/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2016/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2016/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip \ No newline at end of file diff --git a/census_of_agriculture/census_of_agriculture_2021.txt b/census_of_agriculture/census_of_agriculture_2021.txt new file mode 100644 index 0000000..e03cb48 --- /dev/null +++ b/census_of_agriculture/census_of_agriculture_2021.txt @@ -0,0 +1,9 @@ +# https://open.canada.ca/data/en/dataset/b944bd53-49e5-4a80-83e5-1048d3abf38d + +# Download variables descriptions +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2021/CEAG21_VariablesDescriptions_REAG21_EN_FR.xlsx 
+https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2021/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2021/CEAG21_Crops_Cultures_REAG21.gdb.zip +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2021/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2021/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip +https://ftp.maps.canada.ca/pub/statcan_statcan/Agriculture_Agriculture/census_of_agriculture-recensement_agriculture/2021/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip \ No newline at end of file diff --git a/census_of_agriculture/download.sh b/census_of_agriculture/download.sh new file mode 100755 index 0000000..9ea4836 --- /dev/null +++ b/census_of_agriculture/download.sh @@ -0,0 +1,15 @@ +#!/bin/bash +if [ ! 
-d "${DATA_FOLDER}/census_of_agriculture" ] +then + echo "Making directory ${DATA_FOLDER}/census_of_agriculture/" + mkdir -p "${DATA_FOLDER}"/census_of_agriculture/{input,extracted,output}/{2021,2016,2011,2001} # variable quoted, brace lists left unquoted so they still expand + mkdir -p "${DATA_FOLDER}"/census_of_agriculture/output/{2021,2016,2011,2001}/{tabular,spatial} +fi + +INPUT_FOLDER="${DATA_FOLDER}/census_of_agriculture/input" + +echo "Downloading 2021 Census of Agriculture" +aria2c -x16 -i "${SCRIPT_DIR}/census_of_agriculture/census_of_agriculture_2021.txt" --dir="${INPUT_FOLDER}/2021" --auto-file-renaming=false + +echo "Downloading 2016 Census of Agriculture" +aria2c -x16 -i "${SCRIPT_DIR}/census_of_agriculture/census_of_agriculture_2016.txt" --dir="${INPUT_FOLDER}/2016" --auto-file-renaming=false \ No newline at end of file diff --git a/census_of_agriculture/process.sh b/census_of_agriculture/process.sh new file mode 100755 index 0000000..4a081a0 --- /dev/null +++ b/census_of_agriculture/process.sh @@ -0,0 +1,5 @@ +#!/bin/bash +echo "Processing 2021 Census of Agriculture" +jupyter execute census_of_agriculture/process_2021.ipynb +echo "Processing 2016 Census of Agriculture" +jupyter execute census_of_agriculture/process_2016.ipynb \ No newline at end of file diff --git a/census_of_agriculture/process_2016.ipynb b/census_of_agriculture/process_2016.ipynb new file mode 100644 index 0000000..91777ef --- /dev/null +++ b/census_of_agriculture/process_2016.ipynb @@ -0,0 +1,1979 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 65, + "id": "a06cc1c7-7826-4270-8c04-48ad4de90bc9", + "metadata": {}, + "outputs": [], + "source": [ + "import gc\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell \n", + "import geopandas as gpd\n", + "from ordered_set import OrderedSet\n", + "import pandas as pd\n", + "\n", + "# Enable multiple outputs per cell\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "# Show all columns\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { +
"cell_type": "code", + "execution_count": 66, + "id": "6e4dad8c-afec-4bb6-aee2-20b9b9c5a7a5", + "metadata": {}, + "outputs": [], + "source": [ + "input_data_dir = '/data/census_of_agriculture/input/2016'\n", + "output_data_dir = '/data/census_of_agriculture/output/2016/tabular'" + ] + }, + { + "cell_type": "markdown", + "id": "c049bbce-8dcb-418c-b7cb-7015f920a39a", + "metadata": {}, + "source": [ + "# 1.0 Process Excel sheet with column names and descriptions\n", + "The compilation of all of the file geodatabase dataset columns should match this dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "0026e968-c4a4-4dc3-9f5c-cdf9706bb8e9", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Reading Excel sheet with variables\")\n", + "\n", + "data_description = pd.read_excel(f'{input_data_dir}/CEAG16_VariablesDescriptions_REAG16_EN_FR.xlsx', skiprows=3,\n", + " usecols=['Variables', 'Long description of the variables'])\n", + "data_description.rename(columns={'Variables': 'variables', 'Long description of the variables': 'description_en'}, inplace=True)\n", + "data_description['variables'] = data_description['variables'].str.lower()\n", + "\n", + "# There are duplicate variables that are identical. For example, opermore_n\n", + "data_description = data_description.groupby(['variables', 'description_en']).last().reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "ae758963-b639-44a9-9904-d7438594a729", + "metadata": {}, + "source": [ + "# 2.0 Process Provinces and Territories\n", + "## 2.1 Process Agricultural Operations\n", + "**TODO:** \n", + "- Mistakes:\n", + " - There is a duplicate of `farms_n` column that is named `farms_n1`. 
The values are identical for both columns" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "2b8467e8-b979-423c-856c-80242f9a8443", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'farms_n1',\n", + " 'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_length'}" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n", + "\n", + "ao_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n", + "# Lowercase column names\n", + "ao_pr.columns = [x.lower() for x in ao_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "ao_pr['geo_pruid'] = '2016A0002' + ao_pr['geo_pruid']\n", + "ao_pr.rename(columns={'geo_pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(ao_pr.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(ao_pr.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "ao_pr = ao_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "ao_pr = ao_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "54b1ab9d-68c7-4751-a17e-e5ad321fd72f", + "metadata": {}, + "source": [ + "## 2.2 Process Crop Cultures" + ] + }, + { + 
"cell_type": "code", + "execution_count": 30, + "id": "0dfa08f0-2d8d-4279-bd22-2f127d596c3f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n", + "\n", + "cc_pr = gpd.read_file(dataset, \n", + " layer='lpr_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "cc_pr.columns = [x.lower() for x in cc_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "cc_pr['geo_pruid'] = '2016A0002' + cc_pr['geo_pruid']\n", + "cc_pr.rename(columns={'geo_pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(cc_pr.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(cc_pr.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "cc_pr = cc_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "cc_pr = cc_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "076eb1e3-b319-47ff-b959-cae6dd600081", + "metadata": {}, + "source": [ + "## 2.3 Process Farm Operators\n", + "**TODO:** \n", + "- Mistakes:\n", + " - Column `more_avg_a` should be called 
`more_avg_age`\n", + " - Column `one_avg_ag` should be called `one_avg_age`\n", + " - On the Excel sheet, there are four `OPER_N`, with the same definition\n", + " - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n", + " - No idea what `opermore_1` is supposed to be\n", + " - Column `operone_n1` is duplicate of `operone_n`" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "de1dbde2-e820-438f-ae9d-b54455b10ac4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'oper_n1',\n", + " 'oper_n2',\n", + " 'oper_n3',\n", + " 'oper_n4',\n", + " 'opermore_1',\n", + " 'operone_n1',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n", + "\n", + "fo_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "fo_pr.columns = [x.lower() for x in fo_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "fo_pr['geo_pruid'] = '2016A0002' + fo_pr['geo_pruid']\n", + "fo_pr.rename(columns={'geo_pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Fix mistakes\n", + "fo_pr.rename(columns={\n", + " 'more_avg_a': 'more_avg_age',\n", + " 'one_avg_ag': 'one_avg_age'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the 
variables list\")\n", + "set(fo_pr.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(fo_pr.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "fo_pr = fo_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "fo_pr = fo_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "39161daf-5d6a-47e0-9211-0cc0137e2c9e", + "metadata": {}, + "source": [ + "## 2.4 Process Livestock Poultry Bees" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "79731af0-56f2-4840-b903-49410512d410", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n", + "\n", + "lpb_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "lpb_pr.columns = [x.lower() for x in lpb_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "lpb_pr['geo_pruid'] = '2016A0002' + lpb_pr['geo_pruid']\n", + "lpb_pr.rename(columns={'geo_pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + 
"set(lpb_pr.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(lpb_pr.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "lpb_pr = lpb_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "lpb_pr = lpb_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "6a33cfe2-4bbb-46a7-81fb-494a61663a86", + "metadata": {}, + "source": [ + "## 2.5 Process Use Tenure Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "328daec4-f6de-42c4-b11d-5ba287b7a94d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n", + "\n", + "utp_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "utp_pr.columns = [x.lower() for x in utp_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "utp_pr['geo_pruid'] = '2016A0002' + utp_pr['geo_pruid']\n", + "utp_pr.rename(columns={'geo_pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(utp_pr.columns) - 
set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(utp_pr.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "utp_pr = utp_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "utp_pr = utp_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "4963cfbb-71c3-4ce7-98f1-3c6ad3d90fde", + "metadata": {}, + "source": [ + "## 2.6 Join the DataFrames and Export" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "11b5355a-2293-4751-a364-54a0963ed662", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr_en', 'geo_descr_fr'}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid\n" + ] + }, + { + "data": { + "text/plain": [ + "{'pr_dguid'}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Merging all Province and Territories dataframes into one\")\n", + "pr_merge = ao_pr.merge(cc_pr, how='inner', on='pr_dguid') \\\n", + " .merge(fo_pr, how='inner', on='pr_dguid') \\\n", + " .merge(lpb_pr, how='inner', on='pr_dguid') \\\n", + " .merge(utp_pr, how='inner', on='pr_dguid')\n", + "\n", + "# Check that there are expected number of columns. 
The only difference should be geo_descr_en and geo_descr_fr\n", + "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", + "set(data_description['variables']) - set(pr_merge.columns)\n", + "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid\")\n", + "set(pr_merge.columns) - set(data_description['variables'])\n", + "\n", + "# Export\n", + "print(\"Exporting pr_2016.parquet\")\n", + "pr_merge.to_parquet(f'{output_data_dir}/pr_2016.parquet', index=False, compression='zstd')\n", + "\n", + "# Create country as well\n", + "# TODO: check whether -1 values in the provincial columns (presumably placeholder codes, not real counts; confirm) end up subtracted from the country sums below\n", + "country = pd.read_parquet(f'{output_data_dir}/pr_2016.parquet')\n", + "country['pr_dguid'] = '2016A000011124'\n", + "country.rename(columns={'pr_dguid': 'country_dguid'}, inplace=True)\n", + "country = country.groupby(['country_dguid']).sum()\n", + "country.reset_index(inplace=True)\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "country = country.convert_dtypes(**params)\n", + "print(\"Exporting country_2016.parquet\")\n", + "country.to_parquet(f'{output_data_dir}/country_2016.parquet', index=False, compression='zstd')\n", + "\n", + "del(ao_pr)\n", + "del(cc_pr)\n", + "del(fo_pr)\n", + "del(lpb_pr)\n", + "del(utp_pr)\n", + "del(pr_merge)\n", + "del(country)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "bd78f553-52b4-427e-bf40-7ee4101d65ac", + "metadata": {}, + "source": [ + "# 3.0 Process Census Agricultural Regions" + ] + }, + { + "cell_type": "markdown", + "id": "ac1383f5-1340-4dd8-9346-0d21ab506886", + "metadata": {}, + "source": [ + "## 3.1 Process Agricultural Operations\n", + "**TODO:** \n", + "- Mistakes:\n", + " - There is a duplicate of `farms_n` column that is named `farms_n1`. 
The values are identical for both columns" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "740de275-a602-4a18-9f7a-708db0e4e529", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'farms_n1',\n", + " 'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_length'}" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')\n", + "\n", + "ao_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "ao_car.columns = [x.lower() for x in ao_car.columns]\n", + "\n", + "# Calculate dguid\n", + "ao_car['geo_caruid'] = '2016S0501' + ao_car['geo_caruid']\n", + "ao_car.rename(columns={'geo_caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(ao_car.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(ao_car.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "ao_car = ao_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "ao_car = ao_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "c4254a8e-30cf-43a0-8a08-b7ac1ab9689c", + "metadata": {}, + "source": [ + "## 3.2 Process Crop 
Cultures" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "92b08284-0adb-43fd-a89f-19395afb9bd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcar_000b16a_ceag16_n feature class')\n", + "\n", + "cc_car = gpd.read_file(dataset, layer='lcar_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "cc_car.columns = [x.lower() for x in cc_car.columns]\n", + "\n", + "# Calculate dguid\n", + "cc_car['geo_caruid'] = '2016S0501' + cc_car['geo_caruid']\n", + "cc_car.rename(columns={'geo_caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(cc_car.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(cc_car.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "cc_car = cc_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "cc_car = cc_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "babc334e-47c0-435d-919c-2df54a70130a", + "metadata": {}, + "source": [ + "## 3.3 Process Farm Operators\n", + "**TODO:** \n", + "- Mistakes:\n", + " - 
Column `more_avg_a` should be called `more_avg_age`\n", + " - Column `one_avg_ag` should be called `one_avg_age`\n", + " - On the Excel sheet, there are four `OPER_N`, with the same definition\n", + " - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n", + " - No idea what `opermore_1` is supposed to be\n", + " - Column `operone_n1` is duplicate of `operone_n`" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "47afb1ab-f4a9-4581-8222-b4a52ff0eb4a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'oper_n1',\n", + " 'oper_n2',\n", + " 'oper_n3',\n", + " 'oper_n4',\n", + " 'opermore_1',\n", + " 'operone_n1',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')\n", + "\n", + "fo_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "fo_car.columns = [x.lower() for x in fo_car.columns]\n", + "\n", + "# Calculate dguid\n", + "fo_car['geo_caruid'] = '2016S0501' + fo_car['geo_caruid']\n", + "fo_car.rename(columns={'geo_caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Fix mistakes\n", + "fo_car.rename(columns={\n", + " 'more_avg_a': 'more_avg_age',\n", + " 'one_avg_ag': 'one_avg_age'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns 
that are on the geodataframe but not on the variables list\")\n", + "set(fo_car.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(fo_car.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "fo_car = fo_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "fo_car = fo_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "03bf9372-c2da-46b9-8da9-64c358cb51df", + "metadata": {}, + "source": [ + "## 3.4 Process Livestock Poultry Bees" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "abf3ff3f-6fbc-482f-9ef8-f7ae6c289884", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')\n", + "\n", + "lpb_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "lpb_car.columns = [x.lower() for x in lpb_car.columns]\n", + "\n", + "# Calculate dguid\n", + "lpb_car['geo_caruid'] = '2016S0501' + lpb_car['geo_caruid']\n", + "lpb_car.rename(columns={'geo_caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the 
geodataframe but not on the variables list\")\n", + "set(lpb_car.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(lpb_car.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "lpb_car = lpb_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "lpb_car = lpb_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "a3962c9c-3b5b-4fd8-a6ca-8e004159a521", + "metadata": {}, + "source": [ + "## 3.5 Process Use Tenure Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "16678c91-7132-4ed2-981b-f60b0d81bc73", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcar_000b16a_ceag16_n feature class')\n", + "\n", + "utp_car = gpd.read_file(dataset, layer='lcar_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "utp_car.columns = [x.lower() for x in utp_car.columns]\n", + "\n", + "# Calculate dguid\n", + "utp_car['geo_caruid'] = '2016S0501' + utp_car['geo_caruid']\n", + "utp_car.rename(columns={'geo_caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the 
geodataframe but not on the variables list\")\n", + "set(utp_car.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(utp_car.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "utp_car = utp_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "utp_car = utp_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "9b99344d-edfc-4f51-ab9f-8a56100394b0", + "metadata": {}, + "source": [ + "## 3.6 Join the DataFrames and Export" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "d1cfb3c1-c94e-4f4a-9c90-e61c43a791a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr_en', 'geo_descr_fr'}" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be car_dguid\n" + ] + }, + { + "data": { + "text/plain": [ + "{'car_dguid'}" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Merging all Census Agricultural Regions dataframes into one\")\n", + "car_merge = ao_car.merge(cc_car, how='inner', on='car_dguid') \\\n", + " .merge(fo_car, how='inner', on='car_dguid') \\\n", + " .merge(lpb_car, how='inner', on='car_dguid') \\\n", + " .merge(utp_car, how='inner', on='car_dguid')\n", + "\n", + "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n", + "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", + "set(data_description['variables']) - set(car_merge.columns)\n", + "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid\")\n", + "set(car_merge.columns) - set(data_description['variables'])\n", + "\n", + "# Export\n", + "print(\"Exporting car_2016.parquet\")\n", + "car_merge.to_parquet(f'{output_data_dir}/car_2016.parquet', index=False, compression='zstd')\n", + "\n", + "del(ao_car)\n", + "del(cc_car)\n", + "del(fo_car)\n", + "del(lpb_car)\n", + "del(utp_car)\n", + "del(car_merge)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "5ec65522-e754-40b9-994d-e21c5bae88ac", + "metadata": {}, + "source": [ + "# 4.0 Process Census Divisions\n", + "## 4.1 Process Agricultural Operations\n", + "**TODO:** \n", + "- Mistakes:\n", + " - There is a duplicate of `farms_n` column that is named `farms_n1`. 
The values are identical for both columns" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "02f789b8-e3d8-4359-97b1-46be400bc019", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'farms_n1',\n", + " 'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_length'}" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n", + "\n", + "ao_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "ao_cd.columns = [x.lower() for x in ao_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "ao_cd['geo_cduid'] = '2016A0003' + ao_cd['geo_cduid']\n", + "ao_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(ao_cd.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(ao_cd.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "ao_cd = ao_cd[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "ao_cd = ao_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "b22c32c1-bf09-42f6-ad9f-15e19e88f125", + "metadata": {}, + "source": [ + "## 4.2 Process Crop Cultures" + ] + 
}, + { + "cell_type": "code", + "execution_count": 42, + "id": "05283ef4-d1a9-48aa-8b34-d11123e19dc1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n", + "\n", + "cc_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "cc_cd.columns = [x.lower() for x in cc_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "cc_cd['geo_cduid'] = '2016A0003' + cc_cd['geo_cduid']\n", + "cc_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(cc_cd.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(cc_cd.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "cc_cd = cc_cd[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "cc_cd = cc_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "52241e8d-b468-4dd6-99b9-342f9a6fde94", + "metadata": {}, + "source": [ + "## 4.3 Process Farm Operators\n", + "**TODO:** \n", + "- Mistakes:\n", + " - Column `more_avg_a` should be called 
`more_avg_age`\n", + " - Column `one_avg_ag` should be called `one_avg_age`\n", + " - On the Excel sheet, there are four `OPER_N`, with the same definition\n", + " - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n", + " - No idea what `opermore_1` is supposed to be\n", + " - Column `operone_n1` is duplicate of `operone_n`" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "cf5b26d1-7315-4e9d-85fe-473a68cc5f0f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'oper_n1',\n", + " 'oper_n2',\n", + " 'oper_n3',\n", + " 'oper_n4',\n", + " 'opermore_1',\n", + " 'operone_n1',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n", + "\n", + "fo_cd = gpd.read_file(dataset, \n", + " layer='lcd_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "fo_cd.columns = [x.lower() for x in fo_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "fo_cd['geo_cduid'] = '2016A0003' + fo_cd['geo_cduid']\n", + "fo_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Fix mistakes\n", + "fo_cd.rename(columns={\n", + " 'more_avg_a': 'more_avg_age',\n", + " 'one_avg_ag': 'one_avg_age'}, inplace=True)\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on 
the variables list\")\n", + "set(fo_cd.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(fo_cd.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "fo_cd = fo_cd[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "fo_cd = fo_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "f68f6fc6-f757-4f21-b32a-7070d4bcc3ec", + "metadata": {}, + "source": [ + "## 4.4 Process Livestock Poultry Bees" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "e3cd38c6-6563-4067-8280-32277b00a33b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n", + "\n", + "lpb_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "lpb_cd.columns = [x.lower() for x in lpb_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "lpb_cd['geo_cduid'] = '2016A0003' + lpb_cd['geo_cduid']\n", + "lpb_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + 
"set(lpb_cd.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(lpb_cd.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "lpb_cd = lpb_cd[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "lpb_cd = lpb_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "e06083ab-9a15-4e7f-82e1-721c9ee1d4f1", + "metadata": {}, + "source": [ + "## 4.5 Process Use Tenure Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "efe9a9da-9f18-43f7-8ba7-5f2730743358", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n", + "\n", + "utp_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "utp_cd.columns = [x.lower() for x in utp_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "utp_cd['geo_cduid'] = '2016A0003' + utp_cd['geo_cduid']\n", + "utp_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(utp_cd.columns) - 
set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(utp_cd.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "utp_cd = utp_cd[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "utp_cd = utp_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "1f2bf2f9-8638-441a-bd54-1e988861dc96", + "metadata": {}, + "source": [ + "## 4.6 Join the DataFrames and Export" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "029991c3-c3c9-4047-89b6-db0b019f3261", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr_en', 'geo_descr_fr'}" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\n" + ] + }, + { + "data": { + "text/plain": [ + "{'cd_dguid'}" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Merging all Census Divisions dataframes into one\")\n", + "cd_merge = ao_cd.merge(cc_cd, how='inner', on='cd_dguid') \\\n", + " .merge(fo_cd, how='inner', on='cd_dguid') \\\n", + " .merge(lpb_cd, how='inner', on='cd_dguid') \\\n", + " .merge(utp_cd, how='inner', on='cd_dguid')\n", + "\n", + "# Check that there are expected number of columns. 
The only difference should be geo_descr_en and geo_descr_fr\n", + "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", + "set(data_description['variables']) - set(cd_merge.columns)\n", + "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\")\n", + "set(cd_merge.columns) - set(data_description['variables'])\n", + "\n", + "# Export\n", + "cd_merge.to_parquet(f'{output_data_dir}/cd_2016.parquet', index=False, compression='zstd')\n", + "\n", + "del(ao_cd)\n", + "del(cc_cd)\n", + "del(fo_cd)\n", + "del(lpb_cd)\n", + "del(utp_cd)\n", + "del(cd_merge)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "c5d3a2fd-3d8b-461f-9933-19c13bcd01ba", + "metadata": {}, + "source": [ + "# 5.0 Process Census Consolidated Subdivisions\n", + "## 5.1 Process Agricultural Operations\n", + "**TODO:** \n", + "- Mistakes:\n", + " - There is a duplicate of `farms_n` column that is named `farms_n1`. 
The values are identical for both columns" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "f0a5858b-4fbc-4993-a9ed-b17c1606c8bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'farms_n1',\n", + " 'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_length'}" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')\n", + "\n", + "ao_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "ao_ccs.columns = [x.lower() for x in ao_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "ao_ccs['ccsuid'] = '2016S0502' + ao_ccs['ccsuid']\n", + "ao_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(ao_ccs.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(ao_ccs.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "ao_ccs = ao_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "ao_ccs = ao_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "d667b9b4-36fc-489d-9430-d8d099926d2f", + "metadata": {}, + "source": [ + "## 5.2 Process Crop Cultures" + 
] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "7352d371-cf2a-4846-b239-f11ed0f5dd66", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs_000b16a_ceag16_n feature class')\n", + "\n", + "cc_ccs = gpd.read_file(dataset, layer='lccs_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "cc_ccs.columns = [x.lower() for x in cc_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "cc_ccs['geo_ccsuid'] = '2016S0502' + cc_ccs['geo_ccsuid']\n", + "cc_ccs.rename(columns={'geo_ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(cc_ccs.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(cc_ccs.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "cc_ccs = cc_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "cc_ccs = cc_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "ab1841c1-9ebb-41b4-ac04-a998c249b5aa", + "metadata": {}, + "source": [ + "## 5.3 Process Farm Operators\n", + "**TODO:** \n", + "- Mistakes:\n", + " - Column 
`more_avg_a` should be called `more_avg_age`\n", + " - Column `one_avg_ag` should be called `one_avg_age`\n", + " - On the Excel sheet, there are four `OPER_N`, with the same definition\n", + " - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n", + " - No idea what `opermore_1` is supposed to be\n", + " - Column `operone_n1` is duplicate of `operone_n`" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "c047128d-ba23-4701-9907-9199b0f68549", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'oper_n1',\n", + " 'oper_n2',\n", + " 'oper_n3',\n", + " 'oper_n4',\n", + " 'opermore_1',\n", + " 'operone_n1',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')\n", + "\n", + "fo_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "fo_ccs.columns = [x.lower() for x in fo_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "fo_ccs['ccsuid'] = '2016S0502' + fo_ccs['ccsuid']\n", + "fo_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Fix mistakes\n", + "fo_ccs.rename(columns={\n", + " 'more_avg_a': 'more_avg_age',\n", + " 'one_avg_ag': 'one_avg_age'}, inplace=True)\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the 
geodataframe but not on the variables list\")\n", + "set(fo_ccs.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(fo_ccs.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "fo_ccs = fo_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "fo_ccs = fo_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "ae7f58ae-f7b2-4eb3-b4b4-62e9605a290c", + "metadata": {}, + "source": [ + "## 5.4 Process Livestock Poultry Bees" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "69e39b2b-7032-406b-9686-da84463dce5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')\n", + "\n", + "lpb_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "lpb_ccs.columns = [x.lower() for x in lpb_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "lpb_ccs['ccsuid'] = '2016S0502' + lpb_ccs['ccsuid']\n", + "lpb_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the 
variables list\")\n", + "set(lpb_ccs.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(lpb_ccs.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "lpb_ccs = lpb_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "lpb_ccs = lpb_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "d7343f35-7a86-4cfd-ba10-d81d5347f276", + "metadata": {}, + "source": [ + "## 5.5 Process Use Tenure Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "19b31cd4-37ae-4541-8e15-17e698256c98", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr1',\n", + " 'geo_descr_',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs_000b16a_ceag16_n feature class')\n", + "\n", + "utp_ccs = gpd.read_file(dataset, layer='lccs_000b16a_ceag16_n')\n", + "\n", + "# Lowercase column names\n", + "utp_ccs.columns = [x.lower() for x in utp_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "utp_ccs['ccsuid'] = '2016S0502' + utp_ccs['ccsuid']\n", + "utp_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "# Select the variables\n", + "variable_names = list(data_description['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables 
list\")\n", + "set(utp_ccs.columns) - set(variable_names)\n", + "\n", + "variable_names = list(OrderedSet(variable_names) & OrderedSet(utp_ccs.columns))\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "utp_ccs = utp_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "utp_ccs = utp_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "aec1734e-6ade-4849-a160-8a849af15265", + "metadata": {}, + "source": [ + "## 5.6 Join the DataFrames and Export" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "11bd0ad8-a207-43ba-ad27-826e79e4b678", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr_en', 'geo_descr_fr'}" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be cd_dguid\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ccs_dguid'}" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Merging all Census Consolidated Subdivisions dataframes into one\")\n", + "ccs_merge = ao_ccs.merge(cc_ccs, how='inner', on='ccs_dguid') \\\n", + " .merge(fo_ccs, how='inner', on='ccs_dguid') \\\n", + " .merge(lpb_ccs, how='inner', on='ccs_dguid') \\\n", + " .merge(utp_ccs, how='inner', on='ccs_dguid')\n", + "\n", + "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n", + "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", + "set(data_description['variables']) - set(ccs_merge.columns)\n", + "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be ccs_dguid\")\n", + "set(ccs_merge.columns) - set(data_description['variables'])\n", + "\n", + "# Export\n", + "print(\"Exporting ccs_2016.parquet\")\n", + "ccs_merge.to_parquet(f'{output_data_dir}/ccs_2016.parquet', index=False, compression='zstd')\n", + "\n", + "del(ao_ccs)\n", + "del(cc_ccs)\n", + "del(fo_ccs)\n", + "del(lpb_ccs)\n", + "del(utp_ccs)\n", + "del(ccs_merge)\n", + "gc.collect()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/census_of_agriculture/process_2021.ipynb b/census_of_agriculture/process_2021.ipynb new file mode 100644 index 0000000..e94d927 --- /dev/null +++ b/census_of_agriculture/process_2021.ipynb @@ -0,0 +1,1793 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 40, + "id": "a06cc1c7-7826-4270-8c04-48ad4de90bc9", + "metadata": {}, + "outputs": [], + "source": [ + "import gc\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell \n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "\n", + "# Enable multiple outputs per cell\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "# Show all columns\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "6e4dad8c-afec-4bb6-aee2-20b9b9c5a7a5", + "metadata": {}, + "outputs": [], + "source": [ + "input_data_dir = '/data/census_of_agriculture/input/2021'\n", + "output_data_dir = '/data/census_of_agriculture/output/2021/tabular'" + ] + }, + { + "cell_type": "markdown", + "id": "c049bbce-8dcb-418c-b7cb-7015f920a39a", + "metadata": {}, + "source": [ + "# 
1.0 Process Excel sheet with column names and descriptions\n", + "Taken together, the columns of all the file geodatabase datasets should match the variables listed in this sheet" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "0026e968-c4a4-4dc3-9f5c-cdf9706bb8e9", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Reading Excel sheet with variables\")\n", + "\n", + "data_description = pd.read_excel(f'{input_data_dir}/CEAG21_VariablesDescriptions_REAG21_EN_FR.xlsx', skiprows=2,\n", + " usecols=['2021 Variables', 'Categories', '2021 Long description of the variables (EN)'])\n", + "data_description.rename(columns={'2021 Variables': 'variables', 'Categories': 'categories', '2021 Long description of the variables (EN)': 'description_en'}, inplace=True)\n", + "data_description['variables'] = data_description['variables'].str.lower()" + ] + }, + { + "cell_type": "markdown", + "id": "ae758963-b639-44a9-9904-d7438594a729", + "metadata": {}, + "source": [ + "# 2.0 Process Provinces and Territories\n", + "## 2.1 Process Agricultural Operations\n", + "**TODO:** \n", + "- Figure out the -1 values\n", + "- Figure out why `valoeq` is float and not integer\n", + " - It is float because Nova Scotia has a value of 82047377.0000004\n", + "- Figure out why the data types are integer64" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "2b8467e8-b979-423c-856c-80242f9a8443", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", 
+ "\n", + "ao_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", + "# Lowercase column names\n", + "ao_pr.columns = [x.lower() for x in ao_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "ao_pr['pruid'] = '2021A0002' + ao_pr['pruid']\n", + "ao_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for agricultural operations\n", + "variable_names = list(data_description[data_description['categories'] == 'Agricultural operations']['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(ao_pr.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "ao_pr = ao_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "ao_pr = ao_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "54b1ab9d-68c7-4751-a17e-e5ad321fd72f", + "metadata": {}, + "source": [ + "## 2.2 Process Crop Cultures" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "0dfa08f0-2d8d-4279-bd22-2f127d596c3f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geometry',\n", + " 'prename',\n", + " 'prfname',\n", + " 'shape_area',\n", + " 'shape_area_1',\n", + " 'shape_length',\n", + " 'shape_length_1'}" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", + "\n", + "cc_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", 
+ "\n", + "# Lowercase column names\n", + "cc_pr.columns = [x.lower() for x in cc_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "cc_pr['pruid'] = '2021A0002' + cc_pr['pruid']\n", + "cc_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for crop cultures\n", + "variable_names = list(data_description[data_description['categories'] == 'Crops']['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(cc_pr.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "cc_pr = cc_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "cc_pr = cc_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "076eb1e3-b319-47ff-b959-cae6dd600081", + "metadata": {}, + "source": [ + "## 2.3 Process Farm Operators" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "de1dbde2-e820-438f-ae9d-b54455b10ac4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", + "\n", + "fo_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "fo_pr.columns = [x.lower() for x in fo_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "fo_pr['pruid'] 
= '2021A0002' + fo_pr['pruid']\n", + "fo_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for farm operators\n", + "variable_names = list(data_description[data_description['categories'] == 'Farm operators']['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(fo_pr.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "fo_pr = fo_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "fo_pr = fo_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "39161daf-5d6a-47e0-9211-0cc0137e2c9e", + "metadata": {}, + "source": [ + "## 2.4 Process Livestock Poultry Bees" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "79731af0-56f2-4840-b903-49410512d410", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", + "\n", + "lpb_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "lpb_pr.columns = [x.lower() for x in lpb_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "lpb_pr['pruid'] = '2021A0002' + lpb_pr['pruid']\n", + "lpb_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Select the 
variables for livestock poultry bees\n", + "variable_names = list(data_description[data_description['categories'] == 'Livestock Poultry Bees']['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(lpb_pr.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "lpb_pr = lpb_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "lpb_pr = lpb_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "6a33cfe2-4bbb-46a7-81fb-494a61663a86", + "metadata": {}, + "source": [ + "## 2.5 Process Use Tenure Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "328daec4-f6de-42c4-b11d-5ba287b7a94d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", + "\n", + "utp_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "utp_pr.columns = [x.lower() for x in utp_pr.columns]\n", + "\n", + "# Calculate dguid\n", + "utp_pr['pruid'] = '2021A0002' + utp_pr['pruid']\n", + "utp_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for tenure practices\n", + "variable_names = 
list(data_description[data_description['categories'] == 'Use Tenure Practices']['variables'])\n", + "variable_names.insert(0, 'pr_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(utp_pr.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "utp_pr = utp_pr[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "utp_pr = utp_pr.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "4963cfbb-71c3-4ce7-98f1-3c6ad3d90fde", + "metadata": {}, + "source": [ + "## 2.6 Join the DataFrames and Export" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "11b5355a-2293-4751-a364-54a0963ed662", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr_en', 'geo_descr_fr'}" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be pr_dguid\n" + ] + }, + { + "data": { + "text/plain": [ + "{'pr_dguid'}" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "308" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Merging all Province and Territories dataframes into one\")\n", + "pr_merge = ao_pr.merge(cc_pr, how='inner', on='pr_dguid') \\\n", + " .merge(fo_pr, how='inner', on='pr_dguid') \\\n", + " .merge(lpb_pr, how='inner', on='pr_dguid') \\\n", + " .merge(utp_pr, how='inner', on='pr_dguid')\n", + "\n", + "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n", + "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", + "set(data_description['variables']) - set(pr_merge.columns)\n", + "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be pr_dguid\")\n", + "set(pr_merge.columns) - set(data_description['variables'])\n", + "\n", + "# Export\n", + "print(\"Exporting pr_2021.parquet\")\n", + "pr_merge.to_parquet(f'{output_data_dir}/pr_2021.parquet', index=False, compression='zstd')\n", + "\n", + "# Create country as well\n", + "# TODO: check whether -1 values are being subtracted from the sums; note that sum() below is applied to every column, so any average/median columns (e.g. more_avg_age) are summed too, which is not meaningful\n", + "country = pd.read_parquet(f'{output_data_dir}/pr_2021.parquet')\n", + "country['pr_dguid'] = '2021A000011124'\n", + "country.rename(columns={'pr_dguid': 'country_dguid'}, inplace=True)\n", + "country = country.groupby(['country_dguid']).sum()\n", + "country.reset_index(inplace=True)\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "country = country.convert_dtypes(**params)\n", + "print(\"Exporting country_2021.parquet\")\n", + "country.to_parquet(f'{output_data_dir}/country_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(ao_pr)\n", + "del(cc_pr)\n", + "del(fo_pr)\n", + "del(lpb_pr)\n", + "del(utp_pr)\n", + "del(pr_merge)\n", + "del(country)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "bd78f553-52b4-427e-bf40-7ee4101d65ac", + "metadata": {}, + "source": [ + "# 3.0 Process Census Agricultural Regions" + ] + }, + { + "cell_type": "markdown", + "id": "ac1383f5-1340-4dd8-9346-0d21ab506886", + "metadata": {}, + "source": [ + "## 3.1 Process Agricultural Operations" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "740de275-a602-4a18-9f7a-708db0e4e529", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ +
"dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')\n", + "\n", + "ao_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "ao_car.columns = [x.lower() for x in ao_car.columns]\n", + "\n", + "# Calculate dguid\n", + "ao_car['caruid'] = '2021S0501' + ao_car['caruid']\n", + "ao_car.rename(columns={'caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for agricultural operations\n", + "variable_names = list(data_description[data_description['categories'] == 'Agricultural operations']['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(ao_car.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "ao_car = ao_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "ao_car = ao_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "c4254a8e-30cf-43a0-8a08-b7ac1ab9689c", + "metadata": {}, + "source": [ + "## 3.2 Process Crop Cultures" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "92b08284-0adb-43fd-a89f-19395afb9bd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing 
{dataset}, lcar000b21a_e_ceag21_n feature class')\n", + "\n", + "cc_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "cc_car.columns = [x.lower() for x in cc_car.columns]\n", + "\n", + "# Calculate dguid\n", + "cc_car['caruid'] = '2021S0501' + cc_car['caruid']\n", + "cc_car.rename(columns={'caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for crop cultures\n", + "variable_names = list(data_description[data_description['categories'] == 'Crops']['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(cc_car.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "cc_car = cc_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "cc_car = cc_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "babc334e-47c0-435d-919c-2df54a70130a", + "metadata": {}, + "source": [ + "## 3.3 Process Farm Operators\n", + "The Census of Agriculture release contains mistakes in this file; the following column names are truncated:\n", + "- `more_avg_age` is now `more_avg_a` in the file\n", + "- `more_med_age` is now `more_med_a` in the file\n", + "- `one_avg_age` is now `one_avg_ag` in the file\n", + "- `one_med_age` is now `one_med_ag` in the file\n", + "- `plan_nodis_n` is now `plan_nodis` in the file" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "47afb1ab-f4a9-4581-8222-b4a52ff0eb4a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'carename',\n", + " 'carfname',\n", + " 'geometry',\n", + " 'shape_area',\n", + " 
'shape_leng',\n", + " 'shape_length'}" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')\n", + "\n", + "fo_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "fo_car.columns = [x.lower() for x in fo_car.columns]\n", + "\n", + "# Calculate dguid\n", + "fo_car['caruid'] = '2021S0501' + fo_car['caruid']\n", + "fo_car.rename(columns={'caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for farm operators\n", + "variable_names = list(data_description[data_description['categories'] == 'Farm operators']['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Rename mistakes\n", + "fo_car.rename(columns={\n", + " 'more_avg_a': 'more_avg_age',\n", + " 'more_med_a': 'more_med_age',\n", + " 'one_avg_ag': 'one_avg_age',\n", + " 'one_med_ag': 'one_med_age',\n", + " 'plan_nodis': 'plan_nodis_n'\n", + "}, inplace=True)\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(fo_car.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "fo_car = fo_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "fo_car = fo_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "03bf9372-c2da-46b9-8da9-64c358cb51df", + "metadata": {}, + "source": [ + "## 3.4 Process Livestock Poultry Bees" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "abf3ff3f-6fbc-482f-9ef8-f7ae6c289884", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick 
check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')\n", + "\n", + "lpb_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "lpb_car.columns = [x.lower() for x in lpb_car.columns]\n", + "\n", + "# Calculate dguid\n", + "lpb_car['caruid'] = '2021S0501' + lpb_car['caruid']\n", + "lpb_car.rename(columns={'caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for livestock poultry bees\n", + "variable_names = list(data_description[data_description['categories'] == 'Livestock Poultry Bees']['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(lpb_car.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "lpb_car = lpb_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "lpb_car = lpb_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "a3962c9c-3b5b-4fd8-a6ca-8e004159a521", + "metadata": {}, + "source": [ + "## 3.5 Process Use Tenure Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "16678c91-7132-4ed2-981b-f60b0d81bc73", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, 
+ { + "data": { + "text/plain": [ + "{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')\n", + "\n", + "utp_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "utp_car.columns = [x.lower() for x in utp_car.columns]\n", + "\n", + "# Calculate dguid\n", + "utp_car['caruid'] = '2021S0501' + utp_car['caruid']\n", + "utp_car.rename(columns={'caruid':'car_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for tenure practices\n", + "variable_names = list(data_description[data_description['categories'] == 'Use Tenure Practices']['variables'])\n", + "variable_names.insert(0, 'car_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(utp_car.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "utp_car = utp_car[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "utp_car = utp_car.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "9b99344d-edfc-4f51-ab9f-8a56100394b0", + "metadata": {}, + "source": [ + "## 3.6 Join the DataFrames and Export" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "d1cfb3c1-c94e-4f4a-9c90-e61c43a791a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between variables defined in Excel sheet and dataframe. 
The only difference should be geo_descr_en and geo_descr_fr\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr_en', 'geo_descr_fr'}" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid\n" + ] + }, + { + "data": { + "text/plain": [ + "{'car_dguid'}" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Merging all Census Agricultural Regions dataframes into one\")\n", + "car_merge = ao_car.merge(cc_car, how='inner', on='car_dguid') \\\n", + " .merge(fo_car, how='inner', on='car_dguid') \\\n", + " .merge(lpb_car, how='inner', on='car_dguid') \\\n", + " .merge(utp_car, how='inner', on='car_dguid')\n", + "\n", + "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n", + "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", + "set(data_description['variables']) - set(car_merge.columns)\n", + "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be car_dguid\")\n", + "set(car_merge.columns) - set(data_description['variables'])\n", + "\n", + "# Export\n", + "print(\"Exporting car_2021.parquet\")\n", + "car_merge.to_parquet(f'{output_data_dir}/car_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(ao_car)\n", + "del(cc_car)\n", + "del(fo_car)\n", + "del(lpb_car)\n", + "del(utp_car)\n", + "del(car_merge)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "5ec65522-e754-40b9-994d-e21c5bae88ac", + "metadata": {}, + "source": [ + "# 4.0 Process Census Divisions\n", + "## 4.1 Process Agricultural Operations" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "02f789b8-e3d8-4359-97b1-46be400bc019", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", + "\n", + "ao_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "ao_cd.columns = [x.lower() for x in ao_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "ao_cd['cduid'] = '2021A0003' + ao_cd['cduid']\n", + "ao_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for agricultural operations\n", + "variable_names = list(data_description[data_description['categories'] == 'Agricultural operations']['variables'])\n", + "variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables 
list\")\n", + "set(ao_cd.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "ao_cd = ao_cd[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "ao_cd = ao_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "b22c32c1-bf09-42f6-ad9f-15e19e88f125", + "metadata": {}, + "source": [ + "## 4.2 Process Crop Cultures" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "05283ef4-d1a9-48aa-8b34-d11123e19dc1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", + "\n", + "cc_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "cc_cd.columns = [x.lower() for x in cc_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "cc_cd['cduid'] = '2021A0003' + cc_cd['cduid']\n", + "cc_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for crops\n", + "variable_names = list(data_description[data_description['categories'] == 'Crops']['variables'])\n", + "variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(cc_cd.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "cc_cd = cc_cd[variable_names]\n", +
"\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "cc_cd = cc_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "52241e8d-b468-4dd6-99b9-342f9a6fde94", + "metadata": {}, + "source": [ + "## 4.3 Process Farm Operators\n", + "Census of Agriculture release made a mistake in this file:\n", + "- `more_avg_age` is now `more_avg_a` in the file\n", + "- `more_med_age` is now `more_med_a` in the file\n", + "- `one_avg_age` is now `one_avg_ag` in the file\n", + "- `one_med_age` is now `one_med_ag` in the file\n", + "- `plan_nodis_n` is now `plan_nodis` in the file" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "cf5b26d1-7315-4e9d-85fe-473a68cc5f0f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_leng', 'shape_length'}" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", + "\n", + "fo_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "fo_cd.columns = [x.lower() for x in fo_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "fo_cd['cduid'] = '2021A0003' + fo_cd['cduid']\n", + "fo_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for agricultural operations\n", + "variable_names = list(data_description[data_description['categories'] == 'Farm operators']['variables'])\n", + "variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Rename mistakes\n", + "fo_cd.rename(columns={\n", + " 
'more_avg_a': 'more_avg_age',\n", + " 'more_med_a': 'more_med_age',\n", + " 'one_avg_ag': 'one_avg_age',\n", + " 'one_med_ag': 'one_med_age',\n", + " 'plan_nodis': 'plan_nodis_n'\n", + "}, inplace=True)\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(fo_cd.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "fo_cd = fo_cd[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "fo_cd = fo_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "f68f6fc6-f757-4f21-b32a-7070d4bcc3ec", + "metadata": {}, + "source": [ + "## 4.4 Process Livestock Poultry Bees" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "e3cd38c6-6563-4067-8280-32277b00a33b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", + "\n", + "lpb_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "lpb_cd.columns = [x.lower() for x in lpb_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "lpb_cd['cduid'] = '2021A0003' + lpb_cd['cduid']\n", + "lpb_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for livestock poultry bees\n", + "variable_names = 
list(data_description[data_description['categories'] == 'Livestock Poultry Bees']['variables'])\n", + "variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(lpb_cd.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "lpb_cd = lpb_cd[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "lpb_cd = lpb_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "e06083ab-9a15-4e7f-82e1-721c9ee1d4f1", + "metadata": {}, + "source": [ + "## 4.5 Process Use Tenure Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "efe9a9da-9f18-43f7-8ba7-5f2730743358", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", + "\n", + "utp_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "utp_cd.columns = [x.lower() for x in utp_cd.columns]\n", + "\n", + "# Calculate dguid\n", + "utp_cd['cduid'] = '2021A0003' + utp_cd['cduid']\n", + "utp_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for tenure practices\n", + "variable_names = list(data_description[data_description['categories'] == 'Use Tenure Practices']['variables'])\n", + 
"variable_names.insert(0, 'cd_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(utp_cd.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "utp_cd = utp_cd[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "utp_cd = utp_cd.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "1f2bf2f9-8638-441a-bd54-1e988861dc96", + "metadata": {}, + "source": [ + "## 4.6 Join the DataFrames and Export" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "029991c3-c3c9-4047-89b6-db0b019f3261", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr_en', 'geo_descr_fr'}" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be cd_dguid\n" + ] + }, + { + "data": { + "text/plain": [ + "{'cd_dguid'}" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Merging all Census Divisions dataframes into one\")\n", + "cd_merge = ao_cd.merge(cc_cd, how='inner', on='cd_dguid') \\\n", + " .merge(fo_cd, how='inner', on='cd_dguid') \\\n", + " .merge(lpb_cd, how='inner', on='cd_dguid') \\\n", + " .merge(utp_cd, how='inner', on='cd_dguid')\n", + "\n", + "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n", + "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", + "set(data_description['variables']) - set(cd_merge.columns)\n", + "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be cd_dguid\")\n", + "set(cd_merge.columns) - set(data_description['variables'])\n", + "\n", + "# Export\n", + "print(\"Exporting cd_2021.parquet\")\n", + "cd_merge.to_parquet(f'{output_data_dir}/cd_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(ao_cd)\n", + "del(cc_cd)\n", + "del(fo_cd)\n", + "del(lpb_cd)\n", + "del(utp_cd)\n", + "del(cd_merge)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "c5d3a2fd-3d8b-461f-9933-19c13bcd01ba", + "metadata": {}, + "source": [ + "# 5.0 Process Consolidated Subdivisions\n", + "## 5.1 Process Agricultural Operations" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "f0a5858b-4fbc-4993-a9ed-b17c1606c8bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", + "\n", + "ao_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "ao_ccs.columns = [x.lower() for x in ao_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "ao_ccs['ccsuid'] = '2021S0502' + ao_ccs['ccsuid']\n", + "ao_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for agricultural operations\n", + "variable_names = list(data_description[data_description['categories'] == 'Agricultural operations']['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the 
variables list\")\n", + "set(ao_ccs.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "ao_ccs = ao_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "ao_ccs = ao_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "d667b9b4-36fc-489d-9430-d8d099926d2f", + "metadata": {}, + "source": [ + "## 5.2 Process Crop Cultures" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "7352d371-cf2a-4846-b239-f11ed0f5dd66", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", + "\n", + "cc_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "cc_ccs.columns = [x.lower() for x in cc_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "cc_ccs['ccsuid'] = '2021S0502' + cc_ccs['ccsuid']\n", + "cc_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for agricultural operations\n", + "variable_names = list(data_description[data_description['categories'] == 'Crops']['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(cc_ccs.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "cc_ccs = 
cc_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "cc_ccs = cc_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "ab1841c1-9ebb-41b4-ac04-a998c249b5aa", + "metadata": {}, + "source": [ + "## 5.3 Process Farm Operators" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "c047128d-ba23-4701-9907-9199b0f68549", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", + "\n", + "fo_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "fo_ccs.columns = [x.lower() for x in fo_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "fo_ccs['ccsuid'] = '2021S0502' + fo_ccs['ccsuid']\n", + "fo_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for farm operators\n", + "variable_names = list(data_description[data_description['categories'] == 'Farm operators']['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(fo_ccs.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "fo_ccs = fo_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", +
" 'convert_boolean': False\n", + "}\n", + "fo_ccs = fo_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "ae7f58ae-f7b2-4eb3-b4b4-62e9605a290c", + "metadata": {}, + "source": [ + "## 5.4 Process Livestock Poultry Bees" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "69e39b2b-7032-406b-9686-da84463dce5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", + "\n", + "lpb_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "lpb_ccs.columns = [x.lower() for x in lpb_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "lpb_ccs['ccsuid'] = '2021S0502' + lpb_ccs['ccsuid']\n", + "lpb_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "\n", + "# Select the variables for livestock poultry bees\n", + "variable_names = list(data_description[data_description['categories'] == 'Livestock Poultry Bees']['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(lpb_ccs.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "lpb_ccs = lpb_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "lpb_ccs = 
lpb_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": "d7343f35-7a86-4cfd-ba10-d81d5347f276", + "metadata": {}, + "source": [ + "## 5.5 Process Use Tenure Practices" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "19b31cd4-37ae-4541-8e15-17e698256c98", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Quick check on columns that are on the geodataframe but not on the variables list\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'\n", + "\n", + "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", + "\n", + "utp_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", + "\n", + "# Lowercase column names\n", + "utp_ccs.columns = [x.lower() for x in utp_ccs.columns]\n", + "\n", + "# Calculate dguid\n", + "utp_ccs['ccsuid'] = '2021S0502' + utp_ccs['ccsuid']\n", + "utp_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", + "\n", + "# Select the variables for tenure practices\n", + "variable_names = list(data_description[data_description['categories'] == 'Use Tenure Practices']['variables'])\n", + "variable_names.insert(0, 'ccs_dguid')\n", + "\n", + "# Quick check\n", + "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", + "set(utp_ccs.columns) - set(variable_names)\n", + "\n", + "# Get rid of the geometry column and shape area, length\n", + "utp_ccs = utp_ccs[variable_names]\n", + "\n", + "# Convert to lowest data type\n", + "params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + "}\n", + "utp_ccs = utp_ccs.convert_dtypes(**params)" + ] + }, + { + "cell_type": "markdown", + "id": 
"aec1734e-6ade-4849-a160-8a849af15265", + "metadata": {}, + "source": [ + "## 5.6 Join the DataFrames and Export" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "11bd0ad8-a207-43ba-ad27-826e79e4b678", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" + ] + }, + { + "data": { + "text/plain": [ + "{'geo_descr_en', 'geo_descr_fr'}" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ccs_dguid'}" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Merging all Census Consolidated Subdivisions dataframes into one\")\n", + "ccs_merge = ao_ccs.merge(cc_ccs, how='inner', on='ccs_dguid') \\\n", + " .merge(fo_ccs, how='inner', on='ccs_dguid') \\\n", + " .merge(lpb_ccs, how='inner', on='ccs_dguid') \\\n", + " .merge(utp_ccs, how='inner', on='ccs_dguid')\n", + "\n", + "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n", + "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", + "set(data_description['variables']) - set(ccs_merge.columns)\n", + "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be ccs_dguid\")\n", + "set(ccs_merge.columns) - set(data_description['variables'])\n", + "\n", + "# Export\n", + "print(\"Exporting ccs_2021.parquet\")\n", + "ccs_merge.to_parquet(f'{output_data_dir}/ccs_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(ao_ccs)\n", + "del(cc_ccs)\n", + "del(fo_ccs)\n", + "del(lpb_ccs)\n", + "del(utp_ccs)\n", + "del(ccs_merge)\n", + "gc.collect()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/census_of_population/README.md b/census_of_population/README.md new file mode 100644 index 0000000..71cc5d2 --- /dev/null +++ b/census_of_population/README.md @@ -0,0 +1,5 @@ +# TODO +- Get download links for 2001 and 2006 Census +- For `process_2021.ipynb` + - Finish processing CMA + - Finish processing HR and Local health integration networks \ No newline at end of file diff --git a/census_of_population/census_of_population_files_2001.txt b/census_of_population/census_of_population_files_2001.txt new file mode 100644 index 0000000..edcc3a5 --- /dev/null +++ b/census_of_population/census_of_population_files_2001.txt @@ -0,0 +1,4 @@ +### 2001 ### + +# https://www12.statcan.gc.ca/english/census01/products/standard/popdwell/Tables.cfm +# This one does not have all geographic levels as CSV: https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/prof/details/download-telecharger/comprehensive/comp-csv-tab-dwnld-tlchrgr.cfm?Lang=E#tabs2011 diff --git a/census_of_population/census_of_population_files_2006.txt b/census_of_population/census_of_population_files_2006.txt new file mode 100644 index 0000000..e1c8e72
--- /dev/null +++ b/census_of_population/census_of_population_files_2006.txt @@ -0,0 +1,4 @@ +### 2006 ### + +# https://www12.statcan.gc.ca/census-recensement/2006/dp-pd/prof/rel/index-eng.cfm +# This one does not have all geographic levels as CSV: https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/prof/details/download-telecharger/comprehensive/comp-csv-tab-dwnld-tlchrgr.cfm?Lang=E#tabs2011 diff --git a/census_of_population/census_of_population_files_2011.txt b/census_of_population/census_of_population_files_2011.txt new file mode 100644 index 0000000..b6097f3 --- /dev/null +++ b/census_of_population/census_of_population_files_2011.txt @@ -0,0 +1,45 @@ +### 2011 ### + +# https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/prof/details/download-telecharger/comprehensive/comp-csv-tab-dwnld-tlchrgr.cfm?Lang=E#tabs2011 + +# Canada, provinces, territories (PRs) (98-316-XWE2011001-101_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-101_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-101_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-101_CSV.zip + +# Census divisions (CDs) (98-316-XWE2011001-701_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-701_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-701_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-701_CSV.zip + +# Census subdivisions (CSDs) (98-316-XWE2011001-301_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-301_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-301_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-301_CSV.zip + +# Census metropolitan areas (CMAs), tracted census agglomerations (CAs) 
(98-316-XWE2011001-201_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-201_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-201_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-201_CSV.zip + +# Census tracts (CTs) (98-316-XWE2011001-401_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-401_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-401_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-401_CSV.zip + +# Federal electoral districts (FEDs) (2003 representation order) (98-316-XWE2011001-501_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-501_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-501_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-501_CSV.zip + +# Federal electoral districts (FEDs) (2013 representation order) (98-316-XWE2011001-511_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-511_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-511_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-511_CSV.zip + +# Economic regions (ERs) (98-316-XWE2011001-901_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-901_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-901_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-901_CSV.zip + +# Designated places (DPLs) (98-316-XWE2011001-1301_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-1301_CSV.zip 
https://zenodo.org/records/15344413/files/98-316-XWE2011001-1301_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-1301_CSV.zip + +# Population centres (POPCTRs) (98-316-XWE2011001-801_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-801_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-801_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-801_CSV.zip + +# Dissemination areas (DAs) (98-316-XWE2011001-1501_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-1501_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-1501_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-1501_CSV.zip + +# Dissolved census subdivisions (Dissolved CSDs) (98-316-XWE2011001-1401_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-1401_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-1401_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-1401_CSV.zip + +# Forward sortation areas (FSAs) (98-316-XWE2011001-1601_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-1601_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-1601_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-1601_CSV.zip + +# Health regions (HRs) (December 2013) (98-316-XWE2011001-1701_CSV.zip) +https://archive.org/download/canadas-2011-census-of-population/98-316-XWE2011001-1701_CSV.zip https://zenodo.org/records/15344413/files/98-316-XWE2011001-1701_CSV.zip?download=1 
https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2011/98-316-XWE2011001-1701_CSV.zip diff --git a/census_of_population/census_of_population_files_2016.txt b/census_of_population/census_of_population_files_2016.txt new file mode 100644 index 0000000..007e73b --- /dev/null +++ b/census_of_population/census_of_population_files_2016.txt @@ -0,0 +1,31 @@ +### 2016 ### + +# Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs) (98-401-X2016044_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016044_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016044_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016044_eng_CSV.zip + +# Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs) (98-401-X2016043_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016043_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016043_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016043_eng_CSV.zip + +# Economic regions (ERs) (98-401-X2016049_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016049_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016049_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016049_eng_CSV.zip + +# Population centres (POPCTRs) (98-401-X2016048_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016048_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016048_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016048_eng_CSV.zip + +# Canada, provinces, territories and federal electoral 
districts (FEDs) (2013 Representation Order) (98-401-X2016045_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016045_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016045_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016045_eng_CSV.zip + +# Designated places (DPLs) (98-401-X2016047_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016047_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016047_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016047_eng_CSV.zip + +# Aggregate dissemination areas (ADAs) (98-401-X2016050_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016050_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016050_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016050_eng_CSV.zip + +# Forward Sortation Areas (FSAs) (98-401-X2016046_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016046_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016046_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016046_eng_CSV.zip + +# Dissolved Census subdivisions (CSDs) (98-401-X2016057_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016057_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016057_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016057_eng_CSV.zip + +# Health Regions (HRs) (98-401-X2016058_eng_CSV.zip) +https://archive.org/download/canadas-2016-census-of-population/98-401-X2016058_eng_CSV.zip https://zenodo.org/records/15343658/files/98-401-X2016058_eng_CSV.zip?download=1 
https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2016/98-401-X2016058_eng_CSV.zip \ No newline at end of file diff --git a/census_of_population/census_of_population_files_2021.txt b/census_of_population/census_of_population_files_2021.txt new file mode 100644 index 0000000..bb9ae3e --- /dev/null +++ b/census_of_population/census_of_population_files_2021.txt @@ -0,0 +1,36 @@ +### 2021 ### + +# https://www12.statcan.gc.ca/census-recensement/2021/dp-pd/prof/details/download-telecharger.cfm?Lang=E + +# Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs) (98-401-X2021006_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021006_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021006_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021006_eng_CSV.zip + +# Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs) (98-401-X2021007_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021007_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021007_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021007_eng_CSV.zip + +# Economic regions (ERs) (98-401-X2021008_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021008_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021008_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021008_eng_CSV.zip + +# Population centres (POPCTRs) (98-401-X2021009_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021009_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021009_eng_CSV.zip?download=1 
https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021009_eng_CSV.zip + +# Federal electoral districts, 2013 representation order (98-401-X2021010_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021010_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021010_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021010_eng_CSV.zip + +# Federal electoral districts, 2023 representation order (98-401-X2021029_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021029_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021029_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021029_eng_CSV.zip + +# Designated places (DPLs) (98-401-X2021011_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021011_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021011_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021011_eng_CSV.zip + +# Aggregate dissemination areas (ADAs) (98-401-X2021012_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021012_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021012_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021012_eng_CSV.zip + +# Forward Sortation Areas (FSAs) (98-401-X2021013_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021013_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021013_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021013_eng_CSV.zip + +# Dissolved census subdivisions (Dissolved CSDs) (98-401-X2021014_eng_CSV.zip) 
+https://archive.org/download/canadas-2021-census-of-population/98-401-X2021014_eng_CSV.zip https://zenodo.org/records/15334624/files/98-401-X2021014_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021014_eng_CSV.zip + +# Health Regions (HRs) and Home and Community Care Support Services (HCCCSS) 2022 (98-401-X2021015_eng_CSV.zip) +https://archive.org/download/canadas-2021-census-of-population/98-401-X2021015_eng_CSV.zip https://zenodo.org/records/15333781/files/98-401-X2021015_eng_CSV.zip?download=1 https://data.dataforcanada.org/archive/statistics_canada/census_of_population/2021/98-401-X2021015_eng_CSV.zip \ No newline at end of file diff --git a/census_of_population/download.sh b/census_of_population/download.sh new file mode 100755 index 0000000..8952d72 --- /dev/null +++ b/census_of_population/download.sh @@ -0,0 +1,24 @@ +#!/bin/bash +if [ ! -d "${DATA_FOLDER}/census_of_population" ] +then + echo "Making directory ${DATA_FOLDER}/census_of_population/" + mkdir -p ${DATA_FOLDER}/census_of_population/{input,extracted,output}/{2021,2016,2011,2001} + mkdir -p ${DATA_FOLDER}/census_of_population/output/{2021,2016,2011,2001}/{tabular,spatial} +fi + +INPUT_FOLDER="${DATA_FOLDER}/census_of_population/input" + +echo "Downloading 2021 Census of Population" +aria2c -x16 -i "${SCRIPT_DIR}/census_of_population/census_of_population_files_2021.txt" --dir=${INPUT_FOLDER}/2021 --auto-file-renaming=false + +echo "Downloading 2016 Census of Population" +aria2c -x16 -i "${SCRIPT_DIR}/census_of_population/census_of_population_files_2016.txt" --dir=${INPUT_FOLDER}/2016 --auto-file-renaming=false + +echo "Downloading 2011 Census of Population" +aria2c -x16 -i "${SCRIPT_DIR}/census_of_population/census_of_population_files_2011.txt" --dir=${INPUT_FOLDER}/2011 --auto-file-renaming=false + +#echo "Downloading 2006 Census of Population" +#aria2c -x16 -i 
"${SCRIPT_DIR}/census_of_population/census_of_population_files_2006.txt" --dir=${INPUT_FOLDER}/2006 --auto-file-renaming=false +# +#echo "Downloading 2001 Census of Population" +#aria2c -x16 -i "${SCRIPT_DIR}/census_of_population/census_of_population_files_2001.txt" --dir=${INPUT_FOLDER}/2001 --auto-file-renaming=false \ No newline at end of file diff --git a/census_of_population/process.sh b/census_of_population/process.sh new file mode 100755 index 0000000..c2f3e05 --- /dev/null +++ b/census_of_population/process.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +INPUT_FOLDER="${DATA_FOLDER}/census_of_population/input" +EXTRACTED_FOLDER="${DATA_FOLDER}/census_of_population/extracted" + +process_2021() { + echo "Processing 2021 Census of Population" + jupyter execute census_of_population/process_2021.ipynb +} + +extract_2021() { + local INPUT_FOLDER="${INPUT_FOLDER}/2021" + local EXTRACTED_FOLDER="${EXTRACTED_FOLDER}/2021" + echo "Extracting Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs) (${INPUT_FOLDER}/98-401-X2021006_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2021006_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021006_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021006_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021006_eng_CSV + + echo "Extracting Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs) (${INPUT_FOLDER}/98-401-X2021007_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2021007_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021007_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021007_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021007_eng_CSV + + echo "Extracting Economic regions (ERs) (${INPUT_FOLDER}/98-401-X2021008_eng_CSV.zip). 
Extracting to ${EXTRACTED_FOLDER}/98-401-X2021008_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021008_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021008_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021008_eng_CSV + + echo "Extracting Population centres (POPCTRs) (${INPUT_FOLDER}/98-401-X2021009_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2021009_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021009_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021009_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021009_eng_CSV + + echo "Extracting Federal electoral districts, 2013 representation order (${INPUT_FOLDER}/98-401-X2021010_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2021010_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021010_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021010_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021010_eng_CSV + + echo "Extracting Federal electoral districts, 2023 representation order (${INPUT_FOLDER}/98-401-X2021029_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2021029_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021029_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021029_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021029_eng_CSV + + echo "Extracting Designated places (DPLs) (${INPUT_FOLDER}/98-401-X2021011_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2021011_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021011_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021011_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021011_eng_CSV + + echo "Extracting Aggregate dissemination areas (ADAs) (${INPUT_FOLDER}/98-401-X2021012_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2021012_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021012_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021012_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021012_eng_CSV + + echo "Extracting Forward Sortation Areas (FSAs) (${INPUT_FOLDER}/98-401-X2021013_eng_CSV.zip). 
Extracting to ${EXTRACTED_FOLDER}/98-401-X2021013_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021013_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021013_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021013_eng_CSV + + echo "Extracting Dissolved census subdivisions (Dissolved CSDs) (98-401-X2021014_eng_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021014_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021014_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021014_eng_CSV + + echo "Extracting Health Regions (HRs) and Home and Community Care Support Services (HCCCSS) 2022 (${INPUT_FOLDER}/98-401-X2021015_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2021015_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2021015_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2021015_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2021015_eng_CSV +} + +extract_2016() { + local INPUT_FOLDER="${INPUT_FOLDER}/2016" + local EXTRACTED_FOLDER="${EXTRACTED_FOLDER}/2016" + echo "Extracting Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs) (${INPUT_FOLDER}/98-401-X2016044_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2016044_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2016044_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2016044_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2016044_eng_CSV + + echo "Extracting Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs) (${INPUT_FOLDER}/98-401-X2016043_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2016043_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2016043_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2016043_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2016043_eng_CSV + + echo "Extracting Economic regions (ERs) (${INPUT_FOLDER}/98-401-X2016049_eng_CSV.zip). 
Extracting to ${EXTRACTED_FOLDER}/98-401-X2016049_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2016049_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2016049_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2016049_eng_CSV + + echo "Extracting Population centres (POPCTRs) (${INPUT_FOLDER}/98-401-X2016048_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2016048_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2016048_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2016048_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2016048_eng_CSV + + echo "Extracting Canada, provinces, territories and federal electoral districts (FEDs) (2013 Representation Order) (${INPUT_FOLDER}/98-401-X2016045_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2016045_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2016045_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2016045_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2016045_eng_CSV + + echo "Extracting Designated places (DPLs) (${INPUT_FOLDER}/98-401-X2016047_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2016047_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2016047_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2016047_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2016047_eng_CSV + + echo "Extracting Aggregate dissemination areas (ADAs) (${INPUT_FOLDER}/98-401-X2016050_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2016050_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2016050_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2016050_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2016050_eng_CSV + + echo "Extracting Forward Sortation Areas (FSAs) (${INPUT_FOLDER}/98-401-X2016046_eng_CSV.zip). Extracting to ${EXTRACTED_FOLDER}/98-401-X2016046_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2016046_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2016046_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2016046_eng_CSV + + echo "Extracting Health Regions (HRs) (${INPUT_FOLDER}/98-401-X2016058_eng_CSV.zip). 
Extracting to ${EXTRACTED_FOLDER}/98-401-X2016058_eng_CSV" + mkdir -p "${EXTRACTED_FOLDER}/98-401-X2016058_eng_CSV" + unzip -q -n ${INPUT_FOLDER}/98-401-X2016058_eng_CSV.zip -d ${EXTRACTED_FOLDER}/98-401-X2016058_eng_CSV +} + +extract_2011() { + local INPUT_FOLDER="${INPUT_FOLDER}/2011" + local EXTRACTED_FOLDER="${EXTRACTED_FOLDER}/2011" + + echo "Extracting Canada, provinces, territories (PRs) (98-316-XWE2011001-101_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-316-XWE2011001-101_CSV" + unzip -q -n ${INPUT_FOLDER}/98-316-XWE2011001-101_CSV.zip -d ${EXTRACTED_FOLDER}/98-316-XWE2011001-101_CSV + + echo "Extracting Census divisions (CDs) (98-316-XWE2011001-701_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-316-XWE2011001-701_CSV" + unzip -q -n ${INPUT_FOLDER}/98-316-XWE2011001-701_CSV.zip -d ${EXTRACTED_FOLDER}/98-316-XWE2011001-701_CSV + + echo "Extracting Census metropolitan areas (CMAs), tracted census agglomerations (CAs) (98-316-XWE2011001-201_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-316-XWE2011001-201_CSV" + unzip -q -n ${INPUT_FOLDER}/98-316-XWE2011001-201_CSV.zip -d ${EXTRACTED_FOLDER}/98-316-XWE2011001-201_CSV + + echo "Extracting Census tracts (CTs) (98-316-XWE2011001-401_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-316-XWE2011001-401_CSV" + unzip -q -n ${INPUT_FOLDER}/98-316-XWE2011001-401_CSV.zip -d ${EXTRACTED_FOLDER}/98-316-XWE2011001-401_CSV + + echo "Extracting Federal electoral districts (2003 representation order) (98-316-XWE2011001-501_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-316-XWE2011001-501_CSV" + unzip -q -n ${INPUT_FOLDER}/98-316-XWE2011001-501_CSV.zip -d ${EXTRACTED_FOLDER}/98-316-XWE2011001-501_CSV + + echo "Extracting Federal electoral districts (2013 representation order) (98-316-XWE2011001-511_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-316-XWE2011001-511_CSV" + unzip -q -n ${INPUT_FOLDER}/98-316-XWE2011001-511_CSV.zip -d ${EXTRACTED_FOLDER}/98-316-XWE2011001-511_CSV + + echo "Extracting Economic regions (ERs) 
(98-316-XWE2011001-901_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-316-XWE2011001-901_CSV" + unzip -q -n ${INPUT_FOLDER}/98-316-XWE2011001-901_CSV.zip -d ${EXTRACTED_FOLDER}/98-316-XWE2011001-901_CSV + + echo "Extracting Designated places (98-316-XWE2011001-1301_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-316-XWE2011001-1301_CSV" + unzip -q -n ${INPUT_FOLDER}/98-316-XWE2011001-1301_CSV.zip -d ${EXTRACTED_FOLDER}/98-316-XWE2011001-1301_CSV + + echo "Extracting Population centres (POPCTRs) (98-316-XWE2011001-801_CSV.zip)" + mkdir -p "${EXTRACTED_FOLDER}/98-316-XWE2011001-801_CSV" + unzip -q -n ${INPUT_FOLDER}/98-316-XWE2011001-801_CSV.zip -d ${EXTRACTED_FOLDER}/98-316-XWE2011001-801_CSV +} + +extract_2006() { + local INPUT_FOLDER="${INPUT_FOLDER}/2006" + local EXTRACTED_FOLDER="${EXTRACTED_FOLDER}/2006" +} + +extract_2001() { + local INPUT_FOLDER="${INPUT_FOLDER}/2001" + local EXTRACTED_FOLDER="${EXTRACTED_FOLDER}/2001" +} + +extract_2021 +extract_2016 +extract_2011 +extract_2006 + +process_2021 \ No newline at end of file diff --git a/census_of_population/process_2021.ipynb b/census_of_population/process_2021.ipynb new file mode 100644 index 0000000..ac49d1a --- /dev/null +++ b/census_of_population/process_2021.ipynb @@ -0,0 +1,1360 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 50, + "id": "fc8ca6f9", + "metadata": {}, + "outputs": [], + "source": [ + "import gc\n", + "import glob\n", + "\n", + "import duckdb\n", + "from IPython.core.interactiveshell import InteractiveShell \n", + "import numpy as np\n", + "import pandas as pd\n", + "import sqlalchemy\n", + "\n", + "# Enable multiple outputs per cell\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "# Show all columns\n", + "pd.set_option('display.max_columns', None)\n", + "\n", + "data_dir = '/data/census_of_population/output/2021/tabular'" + ] + }, + { + "cell_type": "markdown", + "id": "3d20a1f5", + "metadata": {}, + "source": [ + "# Datasets\n", + "- 1.0 Canada, provinces, 
territories, census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)\n", + "- 2.0 Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)\n", + "- 3.0 Economic regions (ERs)\n", + "- 4.0 Population centres (POPCTRs)\n", + "- 5.0 Canada, provinces, territories and federal electoral districts (FEDs) (2013 Representation Order). **Just process FEDs**\n", + "- 6.0 Canada, provinces, territories and federal electoral districts (FEDs) (2023 Representation Order). **Just process FEDs**\n", + "- 7.0 Designated places (DPLs)\n", + "- 8.0 Aggregate dissemination areas (ADAs)\n", + "- 9.0 Forward sortation areas (FSAs)\n", + "- 10.0 Health regions (HRs)\n", + " - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n", + " - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b88a0db8", + "metadata": {}, + "outputs": [], + "source": [ + "def process_cop_csv(csvs_to_process):\n", + " \"\"\"\n", + " 1. Reads subset of fields for Census of Population CSV files\n", + " 2. Pivots on characteristic_id\n", + " 3. 
Appends all of the processed CSVs as one dataframe\n", + " \"\"\"\n", + " dataframes_to_concatenate = []\n", + " for filename in csvs_to_process:\n", + " print(f\"Processing {filename}\")\n", + " params = {\n", + " 'filepath_or_buffer': filename,\n", + " 'encoding': 'latin-1',\n", + " 'usecols': ['DGUID', \n", + " 'CHARACTERISTIC_ID', \n", + " 'C1_COUNT_TOTAL',\n", + " 'C2_COUNT_MEN+',\n", + " 'C3_COUNT_WOMEN+'\n", + " ],\n", + " 'dtype': {\n", + " 'CHARACTERISTIC_ID': np.int16\n", + " }\n", + " }\n", + " cop_df = pd.read_csv(**params)\n", + " cop_df.rename(columns={\n", + " 'C1_COUNT_TOTAL': 'count_total',\n", + " 'C2_COUNT_MEN+': 'count_men', \n", + " 'C3_COUNT_WOMEN+': 'count_women',\n", + " 'DGUID': 'dguid'\n", + " }, inplace=True)\n", + "\n", + " cop_df = cop_df.pivot(index='dguid', columns='CHARACTERISTIC_ID')\n", + "\n", + " # Flatten the hierarchical index\n", + " # https://stackoverflow.com/questions/14507794/how-to-flatten-a-hierarchical-index-in-columns/57630176#57630176\n", + " level_one = cop_df.columns.get_level_values(0).astype(str)\n", + " level_two = cop_df.columns.get_level_values(1).astype(str)\n", + " column_separator = ['_' if x != '' else '' for x in level_two]\n", + " cop_df.columns = level_one + column_separator + level_two\n", + " dataframes_to_concatenate.append(cop_df)\n", + " \n", + " print(\"Concatenating all dataframes into one\")\n", + " cop_df = pd.concat(dataframes_to_concatenate)\n", + " \n", + " return cop_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cba01571", + "metadata": {}, + "outputs": [], + "source": [ + "def drop_na_columns(dataframe):\n", + " \"\"\"\n", + " Delete columns where there are no values.\n", + " There are cases where there are values for the count_total\n", + " columns, but no values for the count_men and count_women columns\n", + " \"\"\"\n", + " columns_to_drop = []\n", + " for field in dataframe.columns:\n", + " minimum_value = dataframe[field].min()\n", + " maximum_value = 
dataframe[field].max()\n", + " if pd.isna(minimum_value) and pd.isna(maximum_value):\n", + " columns_to_drop.append(field)\n", + "\n", + " if columns_to_drop:\n", + " print(\"Dropping columns that don't have values\")\n", + " dataframe.drop(columns=columns_to_drop, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6071a2fe", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_to_lowest_type(df):\n", + " \"\"\"\n", + " Convert columns to the best possible dtypes\n", + " For example, if the column is numerical and has a maximum value of 32,000 we can assign it a type of int16\n", + " \"\"\"\n", + " params = {\n", + " 'convert_string': False,\n", + " 'convert_boolean': False\n", + " }\n", + " df = df.convert_dtypes(**params)\n", + "\n", + " dtypes = pd.DataFrame(df.dtypes)\n", + " \n", + " # Downcast to the smallest numerical dtype\n", + " for row in dtypes.itertuples():\n", + " column = row[0]\n", + " the_type = str(row[1])\n", + " \n", + " # Skipping downcasting Float64 as there were issues with decimal places\n", + " # For example, instead of a value being 65.4, it turned into 65.4000015258789\n", + " if the_type == 'Float64':\n", + " continue \n", + " elif the_type == 'Int64':\n", + " df[column] = pd.to_numeric(df[column], downcast='integer')\n", + "\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "86ef0ae5", + "metadata": {}, + "source": [ + "# Start processing\n", + "## 1.0 Process Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e648921d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Atlantic.csv\n", + "Processing 
/data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_BritishColumbia.csv\n", + "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Ontario.csv\n", + "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Prairies.csv\n", + "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Quebec.csv\n", + "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Territories.csv\n", + "Concatenating all dataframes into one\n" + ] + } + ], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)" + ] + }, + { + "cell_type": "markdown", + "id": "9a893e04", + "metadata": {}, + "source": [ + "# Remove duplicates\n", + "- For example, for some reason, they included Canada (dguid 2021A000011124) 6 times (once per CSV), so we need to get unique values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5712f497", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(f\"Number of records before {len(cop_df)}\")\n", + "print(\"Before:\")\n", + "cop_df[cop_df.index == '2021A000011124']\n", + "\n", + "# Get unique records\n", + "cop_df = cop_df.groupby(cop_df.index).last()\n", + "print(f\"Number of records after {len(cop_df)}\")\n", + "cop_df[cop_df.index == '2021A000011124']" + ] + }, + { + "cell_type": "markdown", + "id": "f987a804", + "metadata": {}, + "source": [ + "Get unique records" + ] + }, + { + "cell_type": "markdown", + "id": "0e0e6cb8", + "metadata": {}, + "source": [ + "# Split the Census of Population dataframe by geographic level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdee50fc", + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con = duckdb.connect()\n", + "con.install_extension(\"spatial\")\n", + "con.load_extension(\"spatial\")\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS country_2021;\n", + "CREATE TABLE country_2021 AS SELECT country_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/country_2021.parquet';\n", + "\n", + "DROP TABLE IF EXISTS pr_2021;\n", + "CREATE TABLE pr_2021 AS SELECT pr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pr_2021.parquet';\n", + "\n", + "DROP TABLE IF EXISTS cd_2021;\n", + "CREATE TABLE cd_2021 AS SELECT cd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cd_2021.parquet';\n", + "\n", + "DROP TABLE IF EXISTS csd_2021;\n", + "CREATE TABLE csd_2021 AS SELECT csd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/csd_2021.parquet';\n", + "\n", + "DROP TABLE IF EXISTS da_2021;\n", + "CREATE TABLE da_2021 AS SELECT da_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/da_2021.parquet';\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "country_dguid = con.sql(\"SELECT * FROM country_2021\").to_df()\n", + "pr_dguid = con.sql(\"SELECT * FROM pr_2021\").to_df()\n", + "cd_dguid = con.sql(\"SELECT * FROM cd_2021\").to_df()\n", + "csd_dguid = con.sql(\"SELECT * FROM csd_2021\").to_df()\n", + "da_dguid = con.sql(\"SELECT * FROM 
da_2021\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic level\n", + "cop_country = cop_df.join(country_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "cop_pr = cop_df.join(pr_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "cop_cd = cop_df.join(cd_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "cop_csd = cop_df.join(csd_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "cop_da = cop_df.join(da_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "\n", + "del(cop_df)\n", + "del(country_dguid)\n", + "del(pr_dguid)\n", + "del(cd_dguid)\n", + "del(csd_dguid)\n", + "del(da_dguid)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "2ca0a313-6530-4ecf-becd-8bedc31fdeed", + "metadata": {}, + "source": [ + "# Convert dataframe columns to lowest dtype" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "96aed739", + "metadata": {}, + "outputs": [], + "source": [ + "cop_country = convert_to_lowest_type(cop_country)\n", + "cop_pr = convert_to_lowest_type(cop_pr)\n", + "cop_cd = convert_to_lowest_type(cop_cd)\n", + "cop_csd = convert_to_lowest_type(cop_csd)\n", + "cop_da = convert_to_lowest_type(cop_da)" + ] + }, + { + "cell_type": "markdown", + "id": "9de07a61", + "metadata": {}, + "source": [ + "# Delete columns where there are no values" + ] + }, + { + "cell_type": "markdown", + "id": "7da3c692", + "metadata": {}, + "source": [ + "## Number of columns before" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7c6531d5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Country- length: 7893\n", + "PR- length: 7893\n", + "CD- length: 7893\n", + "CSD- length: 7893\n", + "DA- length: 7893\n" + ] + } + ], + "source": [ + "print(f\"Country- length: {len(cop_country.columns)}\")\n", + "print(f\"PR- length: {len(cop_pr.columns)}\")\n", + "print(f\"CD- length: {len(cop_cd.columns)}\")\n", + 
"print(f\"CSD- length: {len(cop_csd.columns)}\")\n", + "print(f\"DA- length: {len(cop_da.columns)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a971d90c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dropping columns that don't have values\n", + "Dropping columns that don't have values\n", + "Dropping columns that don't have values\n", + "Dropping columns that don't have values\n", + "Dropping columns that don't have values\n" + ] + } + ], + "source": [ + "drop_na_columns(cop_country)\n", + "drop_na_columns(cop_pr)\n", + "drop_na_columns(cop_cd)\n", + "drop_na_columns(cop_csd)\n", + "drop_na_columns(cop_da)" + ] + }, + { + "cell_type": "markdown", + "id": "511023c8", + "metadata": {}, + "source": [ + "## Number of columns after" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1b003396", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Country- length: 7433\n", + "PR- length: 7433\n", + "CD- length: 7433\n", + "CSD- length: 7433\n", + "DA- length: 7431\n" + ] + } + ], + "source": [ + "print(f\"Country- length: {len(cop_country.columns)}\")\n", + "print(f\"PR- length: {len(cop_pr.columns)}\")\n", + "print(f\"CD- length: {len(cop_cd.columns)}\")\n", + "print(f\"CSD- length: {len(cop_csd.columns)}\")\n", + "print(f\"DA- length: {len(cop_da.columns)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "fa72bcce", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Country\n", + "cop_country = cop_country.reset_index()\n", + "cop_country.rename(columns={'dguid': 'country_dguid'}, inplace=True)\n", + "cop_country.to_parquet(path=f'{data_dir}/country_2021.parquet', index=False, compression='zstd')\n", + "\n", + "# Provinces and Territories\n", + "cop_pr = cop_pr.reset_index()\n", + "cop_pr.rename(columns={'dguid': 
'pr_dguid'}, inplace=True)\n", + "cop_pr.to_parquet(path=f'{data_dir}/pr_2021.parquet', index=False, compression='zstd')\n", + "\n", + "# Census Divisions\n", + "cop_cd = cop_cd.reset_index()\n", + "cop_cd.rename(columns={'dguid': 'cd_dguid'}, inplace=True)\n", + "cop_cd.to_parquet(path=f'{data_dir}/cd_2021.parquet', index=False, compression='zstd')\n", + "\n", + "# Census Subdivisions\n", + "cop_csd = cop_csd.reset_index()\n", + "cop_csd.rename(columns={'dguid': 'csd_dguid'}, inplace=True)\n", + "cop_csd.to_parquet(path=f'{data_dir}/csd_2021.parquet', index=False, compression='zstd')\n", + "\n", + "# Dissemination Areas\n", + "cop_da = cop_da.reset_index()\n", + "cop_da.rename(columns={'dguid': 'da_dguid'}, inplace=True)\n", + "cop_da.to_parquet(path=f'{data_dir}/da_2021.parquet', index=False, compression='zstd')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "441a7834-594f-400c-85a3-a353c8bdf202", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del(cop_country)\n", + "del(cop_pr)\n", + "del(cop_cd)\n", + "del(cop_csd)\n", + "del(cop_da)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "6c7b31ef-9657-4a2e-bdf7-8d75313c10ef", + "metadata": {}, + "source": [ + "## 2.0 Process Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)" + ] + }, + { + "cell_type": "markdown", + "id": "91b81c24-c160-4c38-8713-8d1f06d18624", + "metadata": {}, + "source": [ + "# TODO: Finish processing CMA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8c5ebfd-d5b7-4700-8006-3fa96402dc94", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/98-401-X2021007_English_CSV_data.csv\n", + "Concatenating all dataframes into one\n" + ] 
+ }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "244615" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS country_2021;\n", + "DROP TABLE IF EXISTS pr_2021;\n", + "DROP TABLE IF EXISTS cd_2021;\n", + "DROP TABLE IF EXISTS csd_2021;\n", + "DROP TABLE IF EXISTS da_2021;\n", + "\n", + "DROP TABLE IF EXISTS cma_2021;\n", + "CREATE TABLE cma_2021 AS SELECT cma_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cma_2021.parquet';\n", + "\n", + "DROP TABLE IF EXISTS ct_2021;\n", + "CREATE TABLE ct_2021 AS SELECT ct_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ct_2021.parquet';\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "cma_dguid = con.sql(\"SELECT * FROM cma_2021\").to_df()\n", + "ct_dguid = con.sql(\"SELECT * FROM ct_2021\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic level\n", + "# There's going to be missing links\n", + "cop_cma = cop_df.join(cma_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "cop_ct = cop_df.join(ct_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "\n", + "del(ct_dguid)\n", + "del(cop_df)\n", + "gc.collect()\n", + "\n", + "# Convert columns to lowest dtypes\n", + "cop_ct = convert_to_lowest_type(cop_ct)\n", + "\n", + "# Drop NA columns\n", + "print(f\"CT - Number of Columns BEFORE: {len(cop_ct.columns)}\")\n", + "drop_na_columns(cop_ct)\n", + 
"print(f\"CT - Number of Columns AFTER: {len(cop_ct.columns)}\")\n", + "\n", + "# Export\n", + "# Census Tracts\n", + "cop_ct = cop_ct.reset_index()\n", + "cop_ct.rename(columns={'dguid': 'ct_dguid'}, inplace=True)\n", + "cop_ct.to_parquet(path=f'{data_dir}/ct_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(cop_ct)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "f7f5e3fc-a6b1-46bc-a299-f0dd0d6db897", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## 3.0 Process Economic regions (ERs) (98-401-X2021008_eng_CSV)\n", + "This file also includes Provinces and Territories and Country" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fa4d456-90ed-4b31-8c91-5d0c8b1faef5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/98-401-X2021008_English_CSV_data.csv\n", + "Concatenating all dataframes into one\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS cma_2021;\n", + "DROP TABLE IF EXISTS ct_2021;\n", + "\n", + "DROP TABLE IF EXISTS er_2021;\n", + "CREATE TABLE er_2021 AS SELECT er_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/er_2021.parquet';\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "er_dguid = con.sql(\"SELECT * FROM er_2021\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic 
level\n", + "cop_er = cop_df.join(er_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "\n", + "del(er_dguid)\n", + "del(cop_df)\n", + "gc.collect()\n", + "\n", + "# Convert columns to lowest dtypes\n", + "cop_er = convert_to_lowest_type(cop_er)\n", + "\n", + "# Drop NA columns\n", + "print(f\"ER - Number of Columns BEFORE: {len(cop_er.columns)}\")\n", + "drop_na_columns(cop_er)\n", + "print(f\"CT - Number of Columns AFTER: {len(cop_er.columns)}\")\n", + "\n", + "# Export\n", + "# Economic Regions\n", + "cop_er = cop_er.reset_index()\n", + "cop_er.rename(columns={'dguid': 'er_dguid'}, inplace=True)\n", + "cop_er.to_parquet(path=f'{data_dir}/er_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(cop_er)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "5a57d285-6567-4288-a632-ce96e02ca23c", + "metadata": {}, + "source": [ + "## 4.0 Process Population centres (POPCTRs)\n", + "### There are 1026 DGUIDs in the Census of Population data, but there should be 1030\n", + "They also use the pop_ctr_dguid and not the pop_ctr_p_dguid. 
So, there's no way to differentiate between Ottawa, and Gatineau for pop_ctr_dguid 2021S05100616" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cef1f94a-bec2-45bb-95f1-f06a8f04b470", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/98-401-X2021009_English_CSV_data.csv\n", + "Concatenating all dataframes into one\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS er_2021;\n", + "\n", + "DROP TABLE IF EXISTS pop_ctr_2021;\n", + "CREATE TABLE pop_ctr_2021 AS SELECT pop_ctr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pop_ctr_2021.parquet';\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "pop_ctr_dguid = con.sql(\"SELECT * FROM pop_ctr_2021\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic level\n", + "cop_pop_ctr = cop_df.join(pop_ctr_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "\n", + "del(pop_ctr_dguid)\n", + "del(cop_df)\n", + "gc.collect()\n", + "\n", + "# Convert columns to lowest dtypes\n", + "cop_pop_ctr = convert_to_lowest_type(cop_pop_ctr)\n", + "\n", + "# Drop NA columns\n", + "print(f\"POP CTR - Number of Columns BEFORE: {len(cop_pop_ctr.columns)}\")\n", + "drop_na_columns(cop_pop_ctr)\n", + "print(f\"POP CTR- Number of Columns AFTER: {len(cop_pop_ctr.columns)}\")\n", + "\n", + 
"# Export\n", + "# Population Centers\n", + "cop_pop_ctr = cop_pop_ctr.reset_index()\n", + "cop_pop_ctr.rename(columns={'dguid': 'pop_ctr_dguid'}, inplace=True)\n", + "cop_pop_ctr.to_parquet(path=f'{data_dir}/pop_ctr_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(cop_pop_ctr)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "9b196a95-62c1-449f-8e44-08bb81719750", + "metadata": {}, + "source": [ + "## 5.0 Process Federal electoral districts (FEDs) (2013 Representation Order)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2240a33a-a100-440e-9c55-c2a3509523ea", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/98-401-X2021010_English_CSV_data.csv\n", + "Concatenating all dataframes into one\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FED - Number of Columns BEFORE: 7893\n", + "Dropping columns that don't have values\n", + "FED - Number of Columns AFTER: 7433\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS pop_ctr_2021;\n", + "\n", + "DROP TABLE IF EXISTS fed_2013;\n", + "CREATE TABLE fed_2013 AS SELECT fed_dguid AS dguid FROM 
'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2021.parquet';\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "fed_dguid = con.sql(\"SELECT * FROM fed_2013\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic level\n", + "cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "\n", + "del(fed_dguid)\n", + "del(cop_df)\n", + "gc.collect()\n", + "\n", + "# Convert columns to lowest dtypes\n", + "cop_fed = convert_to_lowest_type(cop_fed)\n", + "\n", + "# Drop NA columns\n", + "print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n", + "drop_na_columns(cop_fed)\n", + "print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n", + "\n", + "# Export\n", + "cop_fed = cop_fed.reset_index()\n", + "cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n", + "cop_fed.to_parquet(path=f'{data_dir}/fed_2013.parquet', index=False, compression='zstd')\n", + "\n", + "del(cop_fed)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "dada67c8-8f4b-4795-a97b-3d68887fc582", + "metadata": {}, + "source": [ + "## 6.0 Process Federal electoral districts (FEDs) (2023 Representation Order)\n", + "There should be 343 2023 FEDs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb3d4506-cf3a-446f-9ff8-2a8e408630a4", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/98-401-X2021029_English_CSV_data.csv\n", + "Concatenating all dataframes into one\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "31" + ] + }, + "execution_count": 
100, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FED - Number of Columns BEFORE: 7894\n", + "Dropping columns that don't have values\n", + "FED - Number of Columns AFTER: 7427\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS fed_2013;\n", + "\n", + "/*\n", + "DROP TABLE IF EXISTS fed_2023;\n", + "CREATE TABLE fed_2023 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2023.parquet';\n", + "*/\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "#fed_dguid = con.sql(\"SELECT * FROM fed_2023\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic level\n", + "#cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "cop_df = cop_df.reset_index()\n", + "cop_df = cop_df[cop_df['dguid'].str.contains(\"2023\")]\n", + "\n", + "# Convert columns to lowest dtypes\n", + "cop_fed = convert_to_lowest_type(cop_df)\n", + "\n", + "del(cop_df)\n", + "gc.collect()\n", + "\n", + "# Drop NA columns\n", + "print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n", + "drop_na_columns(cop_fed)\n", + "print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n", + "\n", + "# Export\n", + "cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n", + "cop_fed.to_parquet(path=f'{data_dir}/fed_2023.parquet', index=False, compression='zstd')\n", + "\n", + "del(cop_fed)\n", + "gc.collect()" + ] + }, + { + "cell_type": 
"markdown", + "id": "3cf32961-f1a4-44e2-8676-bb10263190b9", + "metadata": {}, + "source": [ + "## 7.0 Process Designated places (DPLs)\n", + "There should be 1685 DPLs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "958a12c8-2d4d-474a-8af7-81a6b71e24d1", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/98-401-X2021011_English_CSV_data.csv\n", + "Concatenating all dataframes into one\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DPL - Number of Columns BEFORE: 7893\n", + "Dropping columns that don't have values\n", + "DPL - Number of Columns AFTER: 7433\n" + ] + } + ], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS dpl_2021;\n", + "CREATE TABLE dpl_2021 AS SELECT dpl_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/dpl_2021.parquet';\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "dpl_dguid = con.sql(\"SELECT * FROM dpl_2021\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic level\n", + "cop_dpl = cop_df.join(dpl_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "\n", + "del(dpl_dguid)\n", + "del(cop_df)\n", + "gc.collect()\n", + "\n", + "# Convert 
columns to lowest dtypes\n", + "cop_dpl = convert_to_lowest_type(cop_dpl)\n", + "\n", + "# Drop NA columns\n", + "print(f\"DPL - Number of Columns BEFORE: {len(cop_dpl.columns)}\")\n", + "drop_na_columns(cop_dpl)\n", + "print(f\"DPL - Number of Columns AFTER: {len(cop_dpl.columns)}\")\n", + "\n", + "# Export\n", + "cop_dpl = cop_dpl.reset_index()\n", + "cop_dpl.rename(columns={'dguid': 'dpl_dguid'}, inplace=True)\n", + "cop_dpl.to_parquet(path=f'{data_dir}/dpl_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(cop_dpl)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "bceb5f4a-4019-4d25-8671-b0854233e109", + "metadata": {}, + "source": [ + "## 8.0 Process Aggregate dissemination areas (ADAs)\n", + "There should be 5433 ADAs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c846535-aff6-434e-ada0-51e739d89d23", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/98-401-X2021012_English_CSV_data.csv\n", + "Concatenating all dataframes into one\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS dpl_2021;\n", + "DROP TABLE IF EXISTS ada_2021;\n", + "CREATE TABLE ada_2021 AS SELECT ada_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ada_2021.parquet';\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "ada_dguid = 
con.sql(\"SELECT * FROM ada_2021\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic level\n", + "cop_ada = cop_df.join(ada_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "\n", + "del(ada_dguid)\n", + "del(cop_df)\n", + "gc.collect()\n", + "\n", + "# Convert columns to lowest dtypes\n", + "cop_ada = convert_to_lowest_type(cop_ada)\n", + "\n", + "# Drop NA columns\n", + "print(f\"ADA - Number of Columns BEFORE: {len(cop_ada.columns)}\")\n", + "drop_na_columns(cop_ada)\n", + "print(f\"ADA - Number of Columns AFTER: {len(cop_ada.columns)}\")\n", + "\n", + "# Export\n", + "cop_ada = cop_ada.reset_index()\n", + "cop_ada.rename(columns={'dguid': 'ada_dguid'}, inplace=True)\n", + "cop_ada.to_parquet(path=f'{data_dir}/ada_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(cop_ada)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "6d66d8ae-e3ca-4014-961c-4af3c577880d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## 9.0 Process Forward sortation areas (FSAs)\n", + "There should be 1643 FSAs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdee5c1a-9914-42ab-b759-e7685d5e627c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/98-401-X2021013_English_CSV_data.csv\n", + "Concatenating all dataframes into one\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "684" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FSA - Number of Columns BEFORE: 7893\n", + "Dropping columns that don't have values\n", + "FSA - Number of Columns AFTER: 7429\n" + ] + }, + 
{ + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS ada_2021;\n", + "DROP TABLE IF EXISTS fsa_2021;\n", + "CREATE TABLE fsa_2021 AS SELECT fsa_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fsa_2021.parquet';\n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "fsa_dguid = con.sql(\"SELECT * FROM fsa_2021\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic level\n", + "cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "\n", + "del(fsa_dguid)\n", + "del(cop_df)\n", + "gc.collect()\n", + "\n", + "# Convert columns to lowest dtypes\n", + "cop_fsa = convert_to_lowest_type(cop_fsa)\n", + "\n", + "# Drop NA columns\n", + "print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n", + "drop_na_columns(cop_fsa)\n", + "print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n", + "\n", + "# Export\n", + "cop_fsa = cop_fsa.reset_index()\n", + "cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n", + "cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(cop_fsa)\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "ddb67445-e9dc-489f-b2cd-7a7969393d5a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## 10.0 Process Health regions (HRs) and Local health integration networks\n", + "Start looking here https://www150.statcan.gc.ca/n1/en/catalogue/82-402-X" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "id": "1e4c3ddf-4279-4ae7-8a8a-f43ae97d59b4", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021015_eng_CSV/*English_CSV_data*\")\n", + "cop_df = process_cop_csv(csvs_to_process)\n", + "\n", + "# Get the dguid per level of geography\n", + "con.sql(\"\"\"\n", + "DROP TABLE IF EXISTS fsa_2021;\n", + "DROP TABLE IF EXISTS hr_2022;\n", + "CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n", + "\n", + "CREATE \n", + "\"\"\")\n", + "con.commit()\n", + "\n", + "# Convert the duckdb tables to pandas dataframe\n", + "hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()\n", + "\n", + "# Join the Census of Population dataframe to each geographic level\n", + "cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n", + "\n", + "del(fsa_dguid)\n", + "del(cop_df)\n", + "gc.collect()\n", + "\n", + "# Convert columns to lowest dtypes\n", + "cop_fsa = convert_to_lowest_type(cop_fsa)\n", + "\n", + "# Drop NA columns\n", + "print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n", + "drop_na_columns(cop_fsa)\n", + "print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n", + "\n", + "# Export\n", + "cop_fsa = cop_fsa.reset_index()\n", + "cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n", + "cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n", + "\n", + "del(cop_fsa)\n", + "gc.collect()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dissemination_geographies_relationship_file/download.sh b/dissemination_geographies_relationship_file/download.sh new file mode 100755 index 0000000..1e25d0f --- /dev/null +++ b/dissemination_geographies_relationship_file/download.sh @@ -0,0 +1,11 @@ +#!/bin/bash +if [ ! -d "${DATA_FOLDER}/dissemination_geographies_relationship_file" ] +then + echo "Making directory ${DATA_FOLDER}/dissemination_geographies_relationship_file/" + mkdir -p "${DATA_FOLDER}/dissemination_geographies_relationship_file/"{input,extracted,output} +fi + +INPUT_FOLDER="${DATA_FOLDER}/dissemination_geographies_relationship_file/input" + +echo "Downloading 2021 dissemination geographies relationship file" +aria2c -x16 -i "${SCRIPT_DIR}/dissemination_geographies_relationship_file/files.txt" --dir="${INPUT_FOLDER}" --auto-file-renaming=false \ No newline at end of file diff --git a/dissemination_geographies_relationship_file/files.txt b/dissemination_geographies_relationship_file/files.txt new file mode 100755 index 0000000..af87e17 --- /dev/null +++ b/dissemination_geographies_relationship_file/files.txt @@ -0,0 +1,2 @@ +# 2021. 
Here is the reference guide https://web.archive.org/web/20250413152017/https://www150.statcan.gc.ca/n1/pub/98-26-0003/982600032021001-eng.htm +https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/dguid-idugd/files-fichiers/2021_98260004.zip \ No newline at end of file diff --git a/dissemination_geographies_relationship_file/load.sh b/dissemination_geographies_relationship_file/load.sh new file mode 100755 index 0000000..6dd35b8 --- /dev/null +++ b/dissemination_geographies_relationship_file/load.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +INPUT_FOLDER="${DATA_FOLDER}/dissemination_geographies_relationship_file/input" +EXTRACTED_FOLDER="${DATA_FOLDER}/dissemination_geographies_relationship_file/extracted" + +import_2021() { + echo "Unzipping 2021 dissemination geographies relationship file" + unzip -n "${INPUT_FOLDER}/2021_98260004.zip" -d "${EXTRACTED_FOLDER}" + python dissemination_geographies_relationship_file/process.py "${EXTRACTED_FOLDER}/2021_98260004.csv" +} + +import_2021 diff --git a/dissemination_geographies_relationship_file/process.py b/dissemination_geographies_relationship_file/process.py new file mode 100644 index 0000000..c5910d0 --- /dev/null +++ b/dissemination_geographies_relationship_file/process.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# coding: utf-8 +import os +import sys + +import pandas as pd +from sqlalchemy import create_engine + +dgr_2021_csv = sys.argv[1] + +DATABASE = os.environ.get("POSTGRES_DB") +USER = os.environ.get("POSTGRES_USER") +PASSWORD = os.environ.get("POSTGRES_PASSWORD") + +engine = create_engine(f"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}") + +""" +Data dictionary is here: +https://web.archive.org/web/20250413152017/https://www150.statcan.gc.ca/n1/pub/98-26-0003/982600032021001-eng.htm + +This processes the entire DGUID hierarchy and other useful fields +""" +print(f"Processing {dgr_2021_csv}") +dgr_2021_df = pd.read_csv(dgr_2021_csv) + +# Rename columns, remove french portion +dgr_2021_df.rename(columns={ + 
'PRDGUID_PRIDUGD': 'pr_dguid', + 'CDDGUID_DRIDUGD': 'cd_dguid', + 'FEDDGUID_CEFIDUGD': 'fed_dguid', + 'CSDDGUID_SDRIDUGD': 'csd_dguid', + 'ERDGUID_REIDUGD': 'er_dguid', + 'CARDGUID_RARIDUGD': 'car_dguid', + 'CCSDGUID_SRUIDUGD': 'ccs_dguid', + 'DADGUID_ADIDUGD': 'da_dguid', + 'DBDGUID_IDIDUGD': 'db_dguid', + 'ADADGUID_ADAIDUGD': 'ada_dguid', + 'DPLDGUID_LDIDUGD': 'dpl_dguid', + 'CMAPDGUID_RMRPIDUGD': 'cma_p_dguid', + 'CMADGUID_RMRIDUGD': 'cma_dguid', + 'CTDGUID_SRIDUGD': 'ct_dguid', + 'POPCTRPDGUID_CTRPOPPIDUGD': 'pop_ctr_p_dguid', + 'POPCTRDGUID_CTRPOPIDUGD': 'pop_ctr_dguid', +}, inplace=True) + +columns_ordered = ['pr_dguid', 'fed_dguid', 'er_dguid', 'car_dguid', 'cd_dguid', + 'dpl_dguid', 'ccs_dguid', 'csd_dguid', + 'cma_p_dguid', 'cma_dguid', + 'pop_ctr_p_dguid', 'pop_ctr_dguid', + 'ada_dguid', 'ct_dguid', 'da_dguid', 'db_dguid'] + +dgr_2021_df = dgr_2021_df.reindex(columns_ordered, axis=1) +print("Loading processed 2021 dissemination geographies relationship file to database as dissemination_geographies_relationship_2021") +dgr_2021_df.to_sql(name='dissemination_geographies_relationship_2021', + con=engine, + index=False, + chunksize=50000, + if_exists='replace', + schema='silver' + ) \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..feb5682 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,32 @@ +name: process-statcan-data + +services: + devcontainer: + container_name: devcontainer + user: root + build: + context: . 
+ dockerfile: Dockerfile + volumes: + - .:/workspace + - ./data:/data + env_file: .env + ports: + - 8888:8888 + command: sleep infinity + db: + image: postgis/postgis:17-3.5-alpine + container_name: db + restart: unless-stopped + volumes: + - pgdata:/var/lib/postgresql/data + env_file: .env + ports: + - 5432:5432 + +networks: + default: + name: dev-container + +volumes: + pgdata: \ No newline at end of file diff --git a/experiments/boundaries_spatial_checks.ipynb b/experiments/boundaries_spatial_checks.ipynb new file mode 100644 index 0000000..8ed580a --- /dev/null +++ b/experiments/boundaries_spatial_checks.ipynb @@ -0,0 +1,140 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 30, + "id": "3849cfa2-bb10-4323-95a3-51d205f497d5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "import buckaroo\n", + "import geopandas as gpd\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "from lonboard import viz\n", + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "\n", + "# Enable multiple outputs per cell\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "# Show all columns\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "7e8b2e03-594c-4d8d-8914-ff4fe68117ac", + "metadata": {}, + "outputs": [], + "source": [ + "DATABASE = os.getenv('POSTGRES_DB')\n", + "USER = os.getenv('POSTGRES_USER')\n", + "PASSWORD = os.getenv('POSTGRES_PASSWORD')" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "dca70ec0-35c5-4c51-9e6a-5131ed46522c", + "metadata": {}, + "outputs": [], + "source": [ + "engine = create_engine(f'postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}')" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 34, + "id": "08935c71-ff7b-4a2a-aa9d-029b4231cc71", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "95c1fb1edb9e440d916fa8cd3496bb46", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "GeopandasBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'],…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sql = \"\"\"\n", + "SELECT * FROM silver.pr_2016;\n", + "\"\"\"\n", + "df = gpd.read_postgis(sql, con=engine)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "03ff2898-1318-4859-8249-d2c3a05117a7", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "775652f4d632485abaf8babe7c671b25", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "Map(basemap_style=" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "DROP TABLE IF EXISTS geo_data;\n", + "CREATE TABLE geo_data AS\n", + "SELECT da.da_dguid, da_cop.* EXCLUDE dguid, da.geom FROM 'https://data.dataforcanada.org/processed/statistics_canada/census_of_population/2021/tabular/da_2021.parquet' AS da_cop,\n", + "'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/da_2021.parquet' AS da\n", + "WHERE da_cop.dguid = da.da_dguid\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "33a3819e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(57936,)]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"SELECT count(*) FROM geo_data;\")\n", + "con.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "6d38118b", + "metadata": {}, + 
"outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ee1b19d9ad3c4a9483ba3cef3600ba8e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "COPY geo_data TO 'da_2021_cop.gdb'\n", + "WITH (\n", + " FORMAT GDAL,\n", + " DRIVER 'OpenFileGDB',\n", + " GEOMETRY_TYPE 'POLYGON',\n", + " SRS 'EPSG:4326'\n", + ");\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "76ada8f9", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e4d6cb52cb864a0cac1941bb25315b79", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "COPY geo_data TO 'da_2021_cop.geojson'\n", + "WITH (\n", + " FORMAT GDAL,\n", + " DRIVER 'GeoJSON',\n", + " GEOMETRY_TYPE 'POLYGON',\n", + " SRS 'EPSG:4326'\n", + ");\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c32ab0f8", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c226f348d3f14c0abbeb2bfa907a7db6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + 
"data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "COPY geo_data TO 'da_2021_cop_geom.parquet' (FORMAT PARQUET);\n", + "\"\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/export_2021_boundaries.ipynb b/experiments/export_2021_boundaries.ipynb new file mode 100644 index 0000000..561dc88 --- /dev/null +++ b/experiments/export_2021_boundaries.ipynb @@ -0,0 +1,922 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b6e053ec", + "metadata": {}, + "outputs": [], + "source": [ + "DATA_FOLDER=/data\n", + "\n", + "source ../.env" + ] + }, + { + "cell_type": "markdown", + "id": "12eca225-3d05-4bb7-95fa-7b9df694f53d", + "metadata": {}, + "source": [ + "# 1. 
Export Digital Boundary Files" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "22183463", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "output_folder=\"${DATA_FOLDER}/boundaries/output/2021/digital_boundary_files\"\n", + "mkdir -p ${output_folder}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a0372dac", + "metadata": {}, + "outputs": [], + "source": [ + "files=('country_2021'\n", + " 'grc_2021'\n", + " 'pr_2021'\n", + " 'er_2021'\n", + " 'car_2021'\n", + " 'cd_2021'\n", + " 'ccs_2021'\n", + " 'cma_2021'\n", + " 'csd_2021'\n", + " 'fed_2021_2013'\n", + " 'dpl_2021'\n", + " 'fsa_2021'\n", + " 'pop_ctr_2021'\n", + " 'ct_2021'\n", + " 'da_2021'\n", + " 'db_2021'\n", + " 'ada_2021'\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "04a2c69a-6204-4fa5-b6e2-f0375f51f425", + "metadata": {}, + "source": [ + "## Export as Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4c5bb532", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Exporting silver.country_2021 table to /data/boundaries/output/2021/digital_boundary_files/country_2021.parquet\n", + "Exporting silver.grc_2021 table to /data/boundaries/output/2021/digital_boundary_files/grc_2021.parquet\n", + "Exporting silver.pr_2021 table to /data/boundaries/output/2021/digital_boundary_files/pr_2021.parquet\n", + "Exporting silver.er_2021 table to /data/boundaries/output/2021/digital_boundary_files/er_2021.parquet\n", + "Exporting silver.car_2021 table to /data/boundaries/output/2021/digital_boundary_files/car_2021.parquet\n", + "Exporting silver.cd_2021 table to /data/boundaries/output/2021/digital_boundary_files/cd_2021.parquet\n", + "Exporting silver.ccs_2021 table to /data/boundaries/output/2021/digital_boundary_files/ccs_2021.parquet\n", + "Exporting silver.cma_2021 table to 
/data/boundaries/output/2021/digital_boundary_files/cma_2021.parquet\n", + "Exporting silver.csd_2021 table to /data/boundaries/output/2021/digital_boundary_files/csd_2021.parquet\n", + "Exporting silver.fed_2021_2013 table to /data/boundaries/output/2021/digital_boundary_files/fed_2021_2013.parquet\n", + "Exporting silver.dpl_2021 table to /data/boundaries/output/2021/digital_boundary_files/dpl_2021.parquet\n", + "Exporting silver.fsa_2021 table to /data/boundaries/output/2021/digital_boundary_files/fsa_2021.parquet\n", + "Exporting silver.pop_ctr_2021 table to /data/boundaries/output/2021/digital_boundary_files/pop_ctr_2021.parquet\n", + "Exporting silver.ct_2021 table to /data/boundaries/output/2021/digital_boundary_files/ct_2021.parquet\n", + "Exporting silver.da_2021 table to /data/boundaries/output/2021/digital_boundary_files/da_2021.parquet\n", + "Exporting silver.db_2021 table to /data/boundaries/output/2021/digital_boundary_files/db_2021.parquet\n", + "Exporting silver.ada_2021 table to /data/boundaries/output/2021/digital_boundary_files/ada_2021.parquet\n" + ] + } + ], + "source": [ + "for file in ${files[@]}\n", + "do\n", + " output_file=\"${output_folder}/${file}.parquet\"\n", + " echo \"Exporting silver.${file} table to ${output_file}\" \n", + " ogr2ogr \\\n", + " -lco COMPRESSION=\"ZSTD\" \\\n", + " -lco CREATOR=\"www.dataforcanada.org\" \\\n", + " -lco WRITE_COVERING_BBOX=\"YES\" \\\n", + " -lco SORT_BY_BBOX=\"YES\" \\\n", + " -f Parquet \\\n", + " ${output_file} \\\n", + " \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n", + " \"silver.${file}\"\n", + "done" + ] + }, + { + "cell_type": "markdown", + "id": "77af4081-b070-4c78-bbd9-ab2bcbb28881", + "metadata": {}, + "source": [ + "## Export as FlatGeobuf" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c012df55", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Exporting 
country_2021 table to /data/boundaries/output/2021/digital_boundary_files/country_2021.fgb\n", + "Exporting grc_2021 table to /data/boundaries/output/2021/digital_boundary_files/grc_2021.fgb\n", + "Exporting pr_2021 table to /data/boundaries/output/2021/digital_boundary_files/pr_2021.fgb\n", + "Exporting er_2021 table to /data/boundaries/output/2021/digital_boundary_files/er_2021.fgb\n", + "Exporting car_2021 table to /data/boundaries/output/2021/digital_boundary_files/car_2021.fgb\n", + "Exporting cd_2021 table to /data/boundaries/output/2021/digital_boundary_files/cd_2021.fgb\n", + "Exporting ccs_2021 table to /data/boundaries/output/2021/digital_boundary_files/ccs_2021.fgb\n", + "Exporting cma_2021 table to /data/boundaries/output/2021/digital_boundary_files/cma_2021.fgb\n", + "Exporting csd_2021 table to /data/boundaries/output/2021/digital_boundary_files/csd_2021.fgb\n", + "Exporting fed_2021_2013 table to /data/boundaries/output/2021/digital_boundary_files/fed_2021_2013.fgb\n", + "Exporting dpl_2021 table to /data/boundaries/output/2021/digital_boundary_files/dpl_2021.fgb\n", + "Exporting fsa_2021 table to /data/boundaries/output/2021/digital_boundary_files/fsa_2021.fgb\n", + "Exporting pop_ctr_2021 table to /data/boundaries/output/2021/digital_boundary_files/pop_ctr_2021.fgb\n", + "Exporting ct_2021 table to /data/boundaries/output/2021/digital_boundary_files/ct_2021.fgb\n", + "Exporting da_2021 table to /data/boundaries/output/2021/digital_boundary_files/da_2021.fgb\n", + "Exporting db_2021 table to /data/boundaries/output/2021/digital_boundary_files/db_2021.fgb\n", + "Exporting ada_2021 table to /data/boundaries/output/2021/digital_boundary_files/ada_2021.fgb\n" + ] + } + ], + "source": [ + "for file in ${files[@]}\n", + "do\n", + " output_file=\"${output_folder}/${file}.fgb\"\n", + " echo \"Exporting ${file} table to ${output_file}\" \n", + " ogr2ogr \\\n", + " -f FlatGeobuf \\\n", + " -lco TITLE=\"${file}\" \\\n", + " ${output_file} \\\n", + " 
\"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n", + " \"silver.${file}\" \\\n", + " -nln ${file}\n", + "done" + ] + }, + { + "cell_type": "markdown", + "id": "b90b3919", + "metadata": {}, + "source": [ + "## Export as File Geodatabase" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "64780573", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Exporting country_2021 table to /data/boundaries/output/2021/digital_boundary_files/country_2021.gdb\n", + "ERROR 6: Unsupported geometry type\n", + "ERROR 1: Terminating translation prematurely after failed\n", + "translation of layer silver.country_2021 (use -skipfailures to skip errors)\n", + "Exporting grc_2021 table to /data/boundaries/output/2021/digital_boundary_files/grc_2021.gdb\n", + "ERROR 6: Unsupported geometry type\n", + "ERROR 1: Terminating translation prematurely after failed\n", + "translation of layer silver.grc_2021 (use -skipfailures to skip errors)\n", + "Exporting pr_2021 table to /data/boundaries/output/2021/digital_boundary_files/pr_2021.gdb\n", + "Exporting er_2021 table to /data/boundaries/output/2021/digital_boundary_files/er_2021.gdb\n", + "Exporting car_2021 table to /data/boundaries/output/2021/digital_boundary_files/car_2021.gdb\n", + "Exporting cd_2021 table to /data/boundaries/output/2021/digital_boundary_files/cd_2021.gdb\n", + "Exporting ccs_2021 table to /data/boundaries/output/2021/digital_boundary_files/ccs_2021.gdb\n", + "Exporting cma_2021 table to /data/boundaries/output/2021/digital_boundary_files/cma_2021.gdb\n", + "Exporting csd_2021 table to /data/boundaries/output/2021/digital_boundary_files/csd_2021.gdb\n", + "Exporting fed_2021_2013 table to /data/boundaries/output/2021/digital_boundary_files/fed_2021_2013.gdb\n", + "Exporting dpl_2021 table to /data/boundaries/output/2021/digital_boundary_files/dpl_2021.gdb\n", + "Exporting fsa_2021 
table to /data/boundaries/output/2021/digital_boundary_files/fsa_2021.gdb\n", + "Exporting pop_ctr_2021 table to /data/boundaries/output/2021/digital_boundary_files/pop_ctr_2021.gdb\n", + "Exporting ct_2021 table to /data/boundaries/output/2021/digital_boundary_files/ct_2021.gdb\n", + "Exporting da_2021 table to /data/boundaries/output/2021/digital_boundary_files/da_2021.gdb\n", + "Exporting db_2021 table to /data/boundaries/output/2021/digital_boundary_files/db_2021.gdb\n", + "Exporting ada_2021 table to /data/boundaries/output/2021/digital_boundary_files/ada_2021.gdb\n" + ] + } + ], + "source": [ + "for file in ${files[@]}\n", + "do\n", + " output_file=\"${output_folder}/${file}.gdb\"\n", + " echo \"Exporting ${file} table to ${output_file}\" \n", + " ogr2ogr \\\n", + " -f OpenFileGDB \\\n", + " -lco TARGET_ARCGIS_VERSION=\"ARCGIS_PRO_3_2_OR_LATER\" \\\n", + " ${output_file} \\\n", + " \"PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432\" \\\n", + " \"silver.${file}\" \\\n", + " -nln ${file}\n", + "done" + ] + }, + { + "cell_type": "markdown", + "id": "69d3a9be-6fa5-437d-b851-f53457d49333", + "metadata": {}, + "source": [ + "### Zip File Geodatabases" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "69c6203b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Zipping country_2021.gdb\n", + " adding: country_2021.gdb/ (stored 0%)\n", + " adding: country_2021.gdb/gdb (stored 0%)\n", + " adding: country_2021.gdb/timestamps (deflated 98%)\n", + " adding: country_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: country_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: country_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: country_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: country_2021.gdb/a00000003.gdbtable (deflated 42%)\n", + " adding: country_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: 
country_2021.gdb/a00000004.gdbtable (deflated 44%)\n", + " adding: country_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: country_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: country_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: country_2021.gdb/a00000006.gdbtable (deflated 33%)\n", + " adding: country_2021.gdb/a00000006.gdbtablx (deflated 72%)\n", + " adding: country_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: country_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + "Zipping grc_2021.gdb\n", + " adding: grc_2021.gdb/ (stored 0%)\n", + " adding: grc_2021.gdb/gdb (stored 0%)\n", + " adding: grc_2021.gdb/timestamps (deflated 98%)\n", + " adding: grc_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: grc_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: grc_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: grc_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: grc_2021.gdb/a00000003.gdbtable (deflated 42%)\n", + " adding: grc_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: grc_2021.gdb/a00000004.gdbtable (deflated 44%)\n", + " adding: grc_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: grc_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: grc_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: grc_2021.gdb/a00000006.gdbtable (deflated 33%)\n", + " adding: grc_2021.gdb/a00000006.gdbtablx (deflated 72%)\n", + " adding: grc_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: grc_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + "Zipping pr_2021.gdb\n", + " adding: pr_2021.gdb/ (stored 0%)\n", + " adding: pr_2021.gdb/gdb (stored 0%)\n", + " adding: pr_2021.gdb/timestamps (deflated 98%)\n", + " adding: pr_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: pr_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: pr_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: pr_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " 
adding: pr_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: pr_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: pr_2021.gdb/a00000004.gdbtable (deflated 77%)\n", + " adding: pr_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: pr_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: pr_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: pr_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: pr_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: pr_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: pr_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: pr_2021.gdb/a00000009.gdbtable (deflated 7%)\n", + " adding: pr_2021.gdb/a00000009.gdbtablx (deflated 98%)\n", + " adding: pr_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: pr_2021.gdb/a00000009.spx (deflated 97%)\n", + "Zipping er_2021.gdb\n", + " adding: er_2021.gdb/ (stored 0%)\n", + " adding: er_2021.gdb/gdb (stored 0%)\n", + " adding: er_2021.gdb/timestamps (deflated 98%)\n", + " adding: er_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: er_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: er_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: er_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: er_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: er_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: er_2021.gdb/a00000004.gdbtable (deflated 78%)\n", + " adding: er_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: er_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: er_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: er_2021.gdb/a00000006.gdbtable (deflated 22%)\n", + " adding: er_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: er_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: er_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: er_2021.gdb/a00000009.gdbtable (deflated 8%)\n", + " adding: er_2021.gdb/a00000009.gdbtablx 
(deflated 94%)\n", + " adding: er_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: er_2021.gdb/a00000009.spx (deflated 95%)\n", + "Zipping car_2021.gdb\n", + " adding: car_2021.gdb/ (stored 0%)\n", + " adding: car_2021.gdb/gdb (stored 0%)\n", + " adding: car_2021.gdb/timestamps (deflated 98%)\n", + " adding: car_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: car_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: car_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: car_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: car_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: car_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: car_2021.gdb/a00000004.gdbtable (deflated 78%)\n", + " adding: car_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: car_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: car_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: car_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: car_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: car_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: car_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: car_2021.gdb/a00000009.gdbtable (deflated 9%)\n", + " adding: car_2021.gdb/a00000009.gdbtablx (deflated 94%)\n", + " adding: car_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: car_2021.gdb/a00000009.spx (deflated 95%)\n", + "Zipping cd_2021.gdb\n", + " adding: cd_2021.gdb/ (stored 0%)\n", + " adding: cd_2021.gdb/gdb (stored 0%)\n", + " adding: cd_2021.gdb/timestamps (deflated 98%)\n", + " adding: cd_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: cd_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: cd_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: cd_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: cd_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: cd_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: 
cd_2021.gdb/a00000004.gdbtable (deflated 80%)\n", + " adding: cd_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: cd_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: cd_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: cd_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: cd_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: cd_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: cd_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: cd_2021.gdb/a00000009.gdbtable (deflated 9%)\n", + " adding: cd_2021.gdb/a00000009.gdbtablx (deflated 81%)\n", + " adding: cd_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: cd_2021.gdb/a00000009.spx (deflated 86%)\n", + "Zipping ccs_2021.gdb\n", + " adding: ccs_2021.gdb/ (stored 0%)\n", + " adding: ccs_2021.gdb/gdb (stored 0%)\n", + " adding: ccs_2021.gdb/timestamps (deflated 98%)\n", + " adding: ccs_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: ccs_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: ccs_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: ccs_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: ccs_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: ccs_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: ccs_2021.gdb/a00000004.gdbtable (deflated 81%)\n", + " adding: ccs_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: ccs_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: ccs_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: ccs_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: ccs_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: ccs_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: ccs_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: ccs_2021.gdb/a00000009.gdbtable (deflated 12%)\n", + " adding: ccs_2021.gdb/a00000009.gdbtablx (deflated 52%)\n", + " adding: ccs_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: 
ccs_2021.gdb/a00000009.spx (deflated 90%)\n", + "Zipping cma_2021.gdb\n", + " adding: cma_2021.gdb/ (stored 0%)\n", + " adding: cma_2021.gdb/gdb (stored 0%)\n", + " adding: cma_2021.gdb/timestamps (deflated 98%)\n", + " adding: cma_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: cma_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: cma_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: cma_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: cma_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: cma_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: cma_2021.gdb/a00000004.gdbtable (deflated 79%)\n", + " adding: cma_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: cma_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: cma_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: cma_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: cma_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: cma_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: cma_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: cma_2021.gdb/a00000009.gdbtable (deflated 9%)\n", + " adding: cma_2021.gdb/a00000009.gdbtablx (deflated 90%)\n", + " adding: cma_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: cma_2021.gdb/a00000009.spx (deflated 88%)\n", + "Zipping csd_2021.gdb\n", + " adding: csd_2021.gdb/ (stored 0%)\n", + " adding: csd_2021.gdb/gdb (stored 0%)\n", + " adding: csd_2021.gdb/timestamps (deflated 98%)\n", + " adding: csd_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: csd_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: csd_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: csd_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: csd_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: csd_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: csd_2021.gdb/a00000004.gdbtable (deflated 85%)\n", + " adding: 
csd_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: csd_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: csd_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: csd_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: csd_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: csd_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: csd_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: csd_2021.gdb/a00000009.gdbtable (deflated 14%)\n", + " adding: csd_2021.gdb/a00000009.gdbtablx (deflated 53%)\n", + " adding: csd_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: csd_2021.gdb/a00000009.spx (deflated 89%)\n", + "Zipping fed_2021_2013.gdb\n", + " adding: fed_2021_2013.gdb/ (stored 0%)\n", + " adding: fed_2021_2013.gdb/gdb (stored 0%)\n", + " adding: fed_2021_2013.gdb/timestamps (deflated 98%)\n", + " adding: fed_2021_2013.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: fed_2021_2013.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: fed_2021_2013.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: fed_2021_2013.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: fed_2021_2013.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: fed_2021_2013.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: fed_2021_2013.gdb/a00000004.gdbtable (deflated 79%)\n", + " adding: fed_2021_2013.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: fed_2021_2013.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: fed_2021_2013.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: fed_2021_2013.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: fed_2021_2013.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: fed_2021_2013.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: fed_2021_2013.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: fed_2021_2013.gdb/a00000009.gdbtable (deflated 9%)\n", + " adding: fed_2021_2013.gdb/a00000009.gdbtablx (deflated 79%)\n", + " adding: 
fed_2021_2013.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: fed_2021_2013.gdb/a00000009.spx (deflated 94%)\n", + "Zipping dpl_2021.gdb\n", + " adding: dpl_2021.gdb/ (stored 0%)\n", + " adding: dpl_2021.gdb/gdb (stored 0%)\n", + " adding: dpl_2021.gdb/timestamps (deflated 98%)\n", + " adding: dpl_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: dpl_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: dpl_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: dpl_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: dpl_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: dpl_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: dpl_2021.gdb/a00000004.gdbtable (deflated 78%)\n", + " adding: dpl_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: dpl_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: dpl_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: dpl_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: dpl_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: dpl_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: dpl_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: dpl_2021.gdb/a00000009.gdbtable (deflated 15%)\n", + " adding: dpl_2021.gdb/a00000009.gdbtablx (deflated 55%)\n", + " adding: dpl_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: dpl_2021.gdb/a00000009.spx (deflated 85%)\n", + "Zipping fsa_2021.gdb\n", + " adding: fsa_2021.gdb/ (stored 0%)\n", + " adding: fsa_2021.gdb/gdb (stored 0%)\n", + " adding: fsa_2021.gdb/timestamps (deflated 98%)\n", + " adding: fsa_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: fsa_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: fsa_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: fsa_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: fsa_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: fsa_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: 
fsa_2021.gdb/a00000004.gdbtable (deflated 77%)\n", + " adding: fsa_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: fsa_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: fsa_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: fsa_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: fsa_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: fsa_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: fsa_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: fsa_2021.gdb/a00000009.gdbtable (deflated 8%)\n", + " adding: fsa_2021.gdb/a00000009.gdbtablx (deflated 54%)\n", + " adding: fsa_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: fsa_2021.gdb/a00000009.spx (deflated 89%)\n", + "Zipping pop_ctr_2021.gdb\n", + " adding: pop_ctr_2021.gdb/ (stored 0%)\n", + " adding: pop_ctr_2021.gdb/gdb (stored 0%)\n", + " adding: pop_ctr_2021.gdb/timestamps (deflated 98%)\n", + " adding: pop_ctr_2021.gdb/a00000001.gdbtable (deflated 33%)\n", + " adding: pop_ctr_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: pop_ctr_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: pop_ctr_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: pop_ctr_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: pop_ctr_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: pop_ctr_2021.gdb/a00000004.gdbtable (deflated 80%)\n", + " adding: pop_ctr_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: pop_ctr_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: pop_ctr_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: pop_ctr_2021.gdb/a00000006.gdbtable (deflated 22%)\n", + " adding: pop_ctr_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: pop_ctr_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: pop_ctr_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: pop_ctr_2021.gdb/a00000009.gdbtable (deflated 10%)\n", + " adding: pop_ctr_2021.gdb/a00000009.gdbtablx (deflated 72%)\n", + 
" adding: pop_ctr_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: pop_ctr_2021.gdb/a00000009.spx (deflated 85%)\n", + "Zipping ct_2021.gdb\n", + " adding: ct_2021.gdb/ (stored 0%)\n", + " adding: ct_2021.gdb/gdb (stored 0%)\n", + " adding: ct_2021.gdb/timestamps (deflated 98%)\n", + " adding: ct_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: ct_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: ct_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: ct_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: ct_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: ct_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: ct_2021.gdb/a00000004.gdbtable (deflated 80%)\n", + " adding: ct_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: ct_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: ct_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: ct_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: ct_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: ct_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: ct_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: ct_2021.gdb/a00000009.gdbtable (deflated 17%)\n", + " adding: ct_2021.gdb/a00000009.gdbtablx (deflated 52%)\n", + " adding: ct_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: ct_2021.gdb/a00000009.spx (deflated 88%)\n", + "Zipping da_2021.gdb\n", + " adding: da_2021.gdb/ (stored 0%)\n", + " adding: da_2021.gdb/gdb (stored 0%)\n", + " adding: da_2021.gdb/timestamps (deflated 98%)\n", + " adding: da_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: da_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: da_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: da_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: da_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: da_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: da_2021.gdb/a00000004.gdbtable (deflated 
85%)\n", + " adding: da_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: da_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: da_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: da_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: da_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: da_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: da_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: da_2021.gdb/a00000009.gdbtable (deflated 31%)\n", + " adding: da_2021.gdb/a00000009.gdbtablx (deflated 45%)\n", + " adding: da_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: da_2021.gdb/a00000009.spx (deflated 88%)\n", + "Zipping db_2021.gdb\n", + " adding: db_2021.gdb/ (stored 0%)\n", + " adding: db_2021.gdb/gdb (stored 0%)\n", + " adding: db_2021.gdb/timestamps (deflated 98%)\n", + " adding: db_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: db_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: db_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: db_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: db_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: db_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: db_2021.gdb/a00000004.gdbtable (deflated 86%)\n", + " adding: db_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: db_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: db_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: db_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: db_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: db_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: db_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: db_2021.gdb/a00000009.gdbtable (deflated 59%)\n", + " adding: db_2021.gdb/a00000009.gdbtablx (deflated 45%)\n", + " adding: db_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: db_2021.gdb/a00000009.spx (deflated 88%)\n", + "Zipping ada_2021.gdb\n", + " 
adding: ada_2021.gdb/ (stored 0%)\n", + " adding: ada_2021.gdb/gdb (stored 0%)\n", + " adding: ada_2021.gdb/timestamps (deflated 98%)\n", + " adding: ada_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: ada_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: ada_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: ada_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: ada_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: ada_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: ada_2021.gdb/a00000004.gdbtable (deflated 82%)\n", + " adding: ada_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: ada_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: ada_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: ada_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: ada_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: ada_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: ada_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: ada_2021.gdb/a00000009.gdbtable (deflated 13%)\n", + " adding: ada_2021.gdb/a00000009.gdbtablx (deflated 51%)\n", + " adding: ada_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: ada_2021.gdb/a00000009.spx (deflated 89%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " adding: fed_2021.gdb/a00000004.gdbtable (deflated 79%)\n", + " adding: fed_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: fed_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: fed_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: fed_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: fed_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: fed_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: fed_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: fed_2021.gdb/a00000009.gdbtable (deflated 9%)\n", + " adding: fed_2021.gdb/a00000009.gdbtablx (deflated 79%)\n", + " adding: 
fed_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: fed_2021.gdb/a00000009.spx (deflated 94%)\n", + "Zipping car_2021.gdb\n", + " adding: car_2021.gdb/ (stored 0%)\n", + " adding: car_2021.gdb/gdb (stored 0%)\n", + " adding: car_2021.gdb/timestamps (deflated 98%)\n", + " adding: car_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: car_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: car_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: car_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: car_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: car_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: car_2021.gdb/a00000004.gdbtable (deflated 78%)\n", + " adding: car_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: car_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: car_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: car_2021.gdb/a00000006.gdbtable (deflated 22%)\n", + " adding: car_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: car_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: car_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: car_2021.gdb/a00000009.gdbtable (deflated 9%)\n", + " adding: car_2021.gdb/a00000009.gdbtablx (deflated 94%)\n", + " adding: car_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: car_2021.gdb/a00000009.spx (deflated 95%)\n", + "Zipping dpl_2021.gdb\n", + " adding: dpl_2021.gdb/ (stored 0%)\n", + " adding: dpl_2021.gdb/gdb (stored 0%)\n", + " adding: dpl_2021.gdb/timestamps (deflated 98%)\n", + " adding: dpl_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: dpl_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: dpl_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: dpl_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: dpl_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: dpl_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: 
dpl_2021.gdb/a00000004.gdbtable (deflated 78%)\n", + " adding: dpl_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: dpl_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: dpl_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: dpl_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: dpl_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: dpl_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: dpl_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: dpl_2021.gdb/a00000009.gdbtable (deflated 14%)\n", + " adding: dpl_2021.gdb/a00000009.gdbtablx (deflated 55%)\n", + " adding: dpl_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: dpl_2021.gdb/a00000009.spx (deflated 83%)\n", + "Zipping fsa_2021.gdb\n", + " adding: fsa_2021.gdb/ (stored 0%)\n", + " adding: fsa_2021.gdb/gdb (stored 0%)\n", + " adding: fsa_2021.gdb/timestamps (deflated 98%)\n", + " adding: fsa_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: fsa_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: fsa_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: fsa_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: fsa_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: fsa_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: fsa_2021.gdb/a00000004.gdbtable (deflated 77%)\n", + " adding: fsa_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: fsa_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: fsa_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: fsa_2021.gdb/a00000006.gdbtable (deflated 22%)\n", + " adding: fsa_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: fsa_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: fsa_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: fsa_2021.gdb/a00000009.gdbtable (deflated 8%)\n", + " adding: fsa_2021.gdb/a00000009.gdbtablx (deflated 54%)\n", + " adding: fsa_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: 
fsa_2021.gdb/a00000009.spx (deflated 89%)\n", + "Zipping cma_2021.gdb\n", + " adding: cma_2021.gdb/ (stored 0%)\n", + " adding: cma_2021.gdb/gdb (stored 0%)\n", + " adding: cma_2021.gdb/timestamps (deflated 98%)\n", + " adding: cma_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: cma_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: cma_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: cma_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: cma_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: cma_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: cma_2021.gdb/a00000004.gdbtable (deflated 79%)\n", + " adding: cma_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: cma_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: cma_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: cma_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: cma_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: cma_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: cma_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: cma_2021.gdb/a00000009.gdbtable (deflated 9%)\n", + " adding: cma_2021.gdb/a00000009.gdbtablx (deflated 89%)\n", + " adding: cma_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: cma_2021.gdb/a00000009.spx (deflated 88%)\n", + "Zipping pc_2021.gdb\n", + " adding: pc_2021.gdb/ (stored 0%)\n", + " adding: pc_2021.gdb/gdb (stored 0%)\n", + " adding: pc_2021.gdb/timestamps (deflated 98%)\n", + " adding: pc_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: pc_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: pc_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: pc_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: pc_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: pc_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: pc_2021.gdb/a00000004.gdbtable (deflated 80%)\n", + " adding: pc_2021.gdb/a00000004.gdbtablx (deflated 
99%)\n", + " adding: pc_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: pc_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: pc_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: pc_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: pc_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: pc_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: pc_2021.gdb/a00000009.gdbtable (deflated 10%)\n", + " adding: pc_2021.gdb/a00000009.gdbtablx (deflated 72%)\n", + " adding: pc_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: pc_2021.gdb/a00000009.spx (deflated 85%)\n", + "Zipping ct_2021.gdb\n", + " adding: ct_2021.gdb/ (stored 0%)\n", + " adding: ct_2021.gdb/gdb (stored 0%)\n", + " adding: ct_2021.gdb/timestamps (deflated 98%)\n", + " adding: ct_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: ct_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: ct_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: ct_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: ct_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: ct_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: ct_2021.gdb/a00000004.gdbtable (deflated 80%)\n", + " adding: ct_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: ct_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: ct_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: ct_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: ct_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: ct_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: ct_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: ct_2021.gdb/a00000009.gdbtable (deflated 17%)\n", + " adding: ct_2021.gdb/a00000009.gdbtablx (deflated 52%)\n", + " adding: ct_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: ct_2021.gdb/a00000009.spx (deflated 88%)\n", + "Zipping da_2021.gdb\n", + " adding: da_2021.gdb/ (stored 0%)\n", + " adding: 
da_2021.gdb/gdb (stored 0%)\n", + " adding: da_2021.gdb/timestamps (deflated 98%)\n", + " adding: da_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: da_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: da_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: da_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: da_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: da_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: da_2021.gdb/a00000004.gdbtable (deflated 85%)\n", + " adding: da_2021.gdb/a00000004.gdbtablx (deflated 99%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " adding: da_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: da_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: da_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: da_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: da_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: da_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: da_2021.gdb/a00000009.gdbtable (deflated 31%)\n", + " adding: da_2021.gdb/a00000009.gdbtablx (deflated 45%)\n", + " adding: da_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: da_2021.gdb/a00000009.spx (deflated 88%)\n", + "Zipping db_2021.gdb\n", + " adding: db_2021.gdb/ (stored 0%)\n", + " adding: db_2021.gdb/gdb (stored 0%)\n", + " adding: db_2021.gdb/timestamps (deflated 98%)\n", + " adding: db_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: db_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: db_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: db_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: db_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: db_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: db_2021.gdb/a00000004.gdbtable (deflated 86%)\n", + " adding: db_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: db_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " 
adding: db_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: db_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: db_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: db_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: db_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: db_2021.gdb/a00000009.gdbtable (deflated 59%)\n", + " adding: db_2021.gdb/a00000009.gdbtablx (deflated 45%)\n", + " adding: db_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: db_2021.gdb/a00000009.spx (deflated 88%)\n", + "Zipping ada_2021.gdb\n", + " adding: ada_2021.gdb/ (stored 0%)\n", + " adding: ada_2021.gdb/gdb (stored 0%)\n", + " adding: ada_2021.gdb/timestamps (deflated 98%)\n", + " adding: ada_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: ada_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: ada_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: ada_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: ada_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: ada_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: ada_2021.gdb/a00000004.gdbtable (deflated 82%)\n", + " adding: ada_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: ada_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: ada_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: ada_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: ada_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: ada_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: ada_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: ada_2021.gdb/a00000009.gdbtable (deflated 13%)\n", + " adding: ada_2021.gdb/a00000009.gdbtablx (deflated 51%)\n", + " adding: ada_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: ada_2021.gdb/a00000009.spx (deflated 89%)\n", + "Zipping pn_2021.gdb\n", + " adding: pn_2021.gdb/ (stored 0%)\n", + " adding: pn_2021.gdb/gdb (stored 0%)\n", + " adding: pn_2021.gdb/timestamps 
(deflated 98%)\n", + " adding: pn_2021.gdb/a00000001.gdbtable (deflated 34%)\n", + " adding: pn_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: pn_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: pn_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: pn_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: pn_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: pn_2021.gdb/a00000004.gdbtable (deflated 87%)\n", + " adding: pn_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: pn_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: pn_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: pn_2021.gdb/a00000006.gdbtable (deflated 23%)\n", + " adding: pn_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: pn_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: pn_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: pn_2021.gdb/a00000009.gdbtable (deflated 93%)\n", + " adding: pn_2021.gdb/a00000009.gdbtablx (deflated 46%)\n", + " adding: pn_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: pn_2021.gdb/a00000009.spx (deflated 84%)\n", + "Zipping road_2021.gdb\n", + " adding: road_2021.gdb/ (stored 0%)\n", + " adding: road_2021.gdb/gdb (stored 0%)\n", + " adding: road_2021.gdb/timestamps (deflated 98%)\n", + " adding: road_2021.gdb/a00000001.gdbtable (deflated 35%)\n", + " adding: road_2021.gdb/a00000001.gdbtablx (deflated 99%)\n", + " adding: road_2021.gdb/a00000002.gdbtable (deflated 68%)\n", + " adding: road_2021.gdb/a00000002.gdbtablx (deflated 97%)\n", + " adding: road_2021.gdb/a00000003.gdbtable (deflated 56%)\n", + " adding: road_2021.gdb/a00000003.gdbtablx (deflated 99%)\n", + " adding: road_2021.gdb/a00000004.gdbtable (deflated 90%)\n", + " adding: road_2021.gdb/a00000004.gdbtablx (deflated 99%)\n", + " adding: road_2021.gdb/a00000005.gdbtable (deflated 37%)\n", + " adding: road_2021.gdb/a00000005.gdbtablx (deflated 97%)\n", + " adding: road_2021.gdb/a00000006.gdbtable 
(deflated 23%)\n", + " adding: road_2021.gdb/a00000006.gdbtablx (deflated 99%)\n", + " adding: road_2021.gdb/a00000007.gdbtable (deflated 47%)\n", + " adding: road_2021.gdb/a00000007.gdbtablx (deflated 98%)\n", + " adding: road_2021.gdb/a00000009.gdbtable (deflated 79%)\n", + " adding: road_2021.gdb/a00000009.gdbtablx (deflated 45%)\n", + " adding: road_2021.gdb/a00000009.gdbindexes (deflated 37%)\n", + " adding: road_2021.gdb/a00000009.spx (deflated 85%)\n" + ] + } + ], + "source": [ + "cd ${output_folder}\n", + "for file in ${files[@]}\n", + "do\n", + " output_file=\"${file}.gdb\"\n", + " echo \"Zipping ${output_file}\"\n", + " zip -r \"${output_file}.zip\" ${output_file}\n", + " rm -rf ${output_file}\n", + "done" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bde70a8-e4ae-450b-9929-de35970172ab", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Bash", + "language": "bash", + "name": "bash" + }, + "language_info": { + "codemirror_mode": "shell", + "file_extension": ".sh", + "mimetype": "text/x-sh", + "name": "bash" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experiments/index.html b/experiments/index.html new file mode 100755 index 0000000..862789c --- /dev/null +++ b/experiments/index.html @@ -0,0 +1,88 @@ + + + + + Add a vector tile source + + + + + + + + + +
+ + + + \ No newline at end of file diff --git a/experiments/index_da.html b/experiments/index_da.html new file mode 100755 index 0000000..a20dbe3 --- /dev/null +++ b/experiments/index_da.html @@ -0,0 +1,90 @@ + + + + + Add a vector tile source + + + + + + + + + +
+ + + + \ No newline at end of file diff --git a/experiments/lonboard_duckdb.ipynb b/experiments/lonboard_duckdb.ipynb new file mode 100644 index 0000000..28ff73a --- /dev/null +++ b/experiments/lonboard_duckdb.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a1aa3974-2590-4c35-8278-29bf8269a3b5", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "from lonboard import viz" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "68fc8e11-6463-4ad5-b70e-61910cb6cfbe", + "metadata": {}, + "outputs": [], + "source": [ + "con = duckdb.connect()\n", + "con.install_extension(\"spatial\")\n", + "con.load_extension(\"spatial\")" + ] + }, + { + "cell_type": "markdown", + "id": "10c0d819-c86a-4c6c-afed-396136685d0d", + "metadata": {}, + "source": [ + "# 2021 Dissemination Blocks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d7ee1da-4fa7-4d66-bac1-b2e384bd4110", + "metadata": {}, + "outputs": [], + "source": [ + "sql = \"SELECT * FROM 'https://files.sisyphus.ca/db_2021.parquet';\"\n", + "query = con.sql(sql)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da57c688-ca49-4fa7-8178-58e0cc9cb351", + "metadata": {}, + "outputs": [], + "source": [ + "viz(query, con=con)" + ] + }, + { + "cell_type": "markdown", + "id": "3efc0288-73ea-4b19-b0d1-261c863bf330", + "metadata": {}, + "source": [ + "# 2021 Road Network File" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "70b951ad-9b8f-47b6-ad6d-8f6e2ae77a56", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "af33fca355de497586bc9d1e80cef1a7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "31ed9c9423ee4601a4ec42b9a787aa44", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1389e30ae8964a8f9af5a1cf3899077c", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "Map(basemap_style= 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[43mattributes_types\u001b[49m))\n", + "\u001b[0;31mNameError\u001b[0m: name 'attributes_types' is not defined" + ] + } + ], + "source": [ + "print(' '.join(attributes_types))" + ] + }, + { + "cell_type": "markdown", + "id": "160d5b8f", + "metadata": {}, + "source": [ + "# Then we edit the metadata for the generated vector tiles to remove the long generator_options\n", + "\n", + "Edit metadata.json and remove generator_options\n", + "```\n", + "pmtiles show test.pmtiles --metadata > metadata.json\n", + "pmtiles edit test.pmtiles --metadata=metadata.json\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9280e3d2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/geographic_attribute_file/download.sh b/geographic_attribute_file/download.sh new file mode 100755 index 0000000..5a91083 --- /dev/null +++ 
b/geographic_attribute_file/download.sh @@ -0,0 +1,11 @@ +#!/bin/bash +if [ ! -d "${DATA_FOLDER}/geographic_attribute_file" ] +then + echo "Making directory ${DATA_FOLDER}/geographic_attribute_file/" + mkdir -p ${DATA_FOLDER}/geographic_attribute_file/{input,extracted,output} +fi + +INPUT_FOLDER="${DATA_FOLDER}/geographic_attribute_file/input" + +echo "Downloading 2021 geographic attribute file" +aria2c -x16 -i "${SCRIPT_DIR}/geographic_attribute_file/files.txt" --dir=${INPUT_FOLDER} --auto-file-renaming=false diff --git a/geographic_attribute_file/files.txt b/geographic_attribute_file/files.txt new file mode 100755 index 0000000..be2cba2 --- /dev/null +++ b/geographic_attribute_file/files.txt @@ -0,0 +1,2 @@ +# 2021. Here is the reference guide https://www150.statcan.gc.ca/n1/pub/92-151-g/92-151-g2021001-eng.htm +https://www12.statcan.gc.ca/census-recensement/2021/geo/aip-pia/attribute-attribs/files-fichiers/2021_92-151_X.zip \ No newline at end of file diff --git a/geographic_attribute_file/load.sh b/geographic_attribute_file/load.sh new file mode 100755 index 0000000..a28c5d7 --- /dev/null +++ b/geographic_attribute_file/load.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +INPUT_FOLDER="${DATA_FOLDER}/geographic_attribute_file/input" +EXTRACTED_FOLDER="${DATA_FOLDER}/geographic_attribute_file/extracted" + +import_2021() { + echo "Extracting ${INPUT_FOLDER}/2021_92-151_X.zip" + unzip -q -n ${INPUT_FOLDER}/2021_92-151_X.zip -d ${EXTRACTED_FOLDER} + python geographic_attribute_file/process.py ${EXTRACTED_FOLDER}/2021_92-151_X.csv +} + +import_2021 diff --git a/geographic_attribute_file/process.py b/geographic_attribute_file/process.py new file mode 100755 index 0000000..7c9a251 --- /dev/null +++ b/geographic_attribute_file/process.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# coding: utf-8 +import os +import sys + +import numpy as np +import pandas as pd +from sqlalchemy import create_engine + +filename = sys.argv[1] + +DATABASE = os.environ.get("POSTGRES_DB") +USER = 
os.environ.get("POSTGRES_USER") +PASSWORD = os.environ.get("POSTGRES_PASSWORD") + +engine = create_engine(f"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}") + +""" +Data dictionary is here: +https://web.archive.org/web/20240918110218/https://www150.statcan.gc.ca/n1/pub/92-151-g/2021001/tbl/tbl4_1-eng.htm + +This processes the entire DGUID hierarchy and other useful fields +""" +print(f"Processing {filename}") +params = { + 'filepath_or_buffer': filename, + 'encoding': 'latin-1', + 'usecols': ['PRDGUID_PRIDUGD', 'CDDGUID_DRIDUGD', + 'FEDDGUID_CEFIDUGD', 'CSDDGUID_SDRIDUGD', + 'DPLDGUID_LDIDUGD', 'ERDGUID_REIDUGD', + 'CCSDGUID_SRUIDUGD', 'SACTYPE_CSSGENRE', 'SACCODE_CSSCODE', 'CMAPDGUID_RMRPIDUGD', 'CMADGUID_RMRIDUGD', + 'CMATYPE_RMRGENRE', 'CTDGUID_SRIDUGD', 'POPCTRRAPDGUID_CTRPOPRRPIDUGD', 'POPCTRRADGUID_CTRPOPRRIDUGD', + 'DADGUID_ADIDUGD', 'DBDGUID_IDIDUGD', + # 2021 Block population, private dwellings, usual residents + 'DBPOP2021_IDPOP2021', 'DBTDWELL2021_IDTLOG2021', 'DBURDWELL2021_IDRHLOG2021', + # 2021 Census Indian reserve refusal flag + 'DBIR2021_IDRI2021', + 'ADADGUID_ADAIDUGD' + ], + # Apparently they have to be int64 because there's NA values + 'dtype': { + 'DBPOP2021_IDPOP2021': "Int64", + 'DBTDWELL2021_IDTLOG2021': "Int64", + 'DBURDWELL2021_IDRHLOG2021': "Int64" + } +} +gaf_2021_df = pd.read_csv(**params) + +# Rename columns, replace french portion +gaf_2021_df.rename(columns={ + 'PRDGUID_PRIDUGD': 'pr_dguid', + 'CDDGUID_DRIDUGD': 'cd_dguid', + 'FEDDGUID_CEFIDUGD': 'fed_dguid', + 'CSDDGUID_SDRIDUGD': 'csd_dguid', + 'DPLDGUID_LDIDUGD': 'dpl_dguid', + 'ERDGUID_REIDUGD': 'er_dguid', + 'CCSDGUID_SRUIDUGD': 'ccs_dguid', + 'SACTYPE_CSSGENRE': 'sac_type', + 'SACCODE_CSSCODE': 'sac_code', + 'CMAPDGUID_RMRPIDUGD': 'cma_p_dguid', + 'CMADGUID_RMRIDUGD': 'cma_dguid', + 'CTDGUID_SRIDUGD': 'ct_dguid', + 'POPCTRRAPDGUID_CTRPOPRRPIDUGD': 'pop_ctr_p_dguid', + 'POPCTRRADGUID_CTRPOPRRIDUGD': 'pop_ctr_dguid', + 'DADGUID_ADIDUGD': 'da_dguid', + 'DBDGUID_IDIDUGD': 
'db_dguid', + 'DBPOP2021_IDPOP2021': 'db_pop_2021', + # This one needs work + 'DBTDWELL2021_IDTLOG2021': 'db_total_private_dwell_2021', + # I don't particularly like this one + 'DBURDWELL2021_IDRHLOG2021': 'db_usual_residents_dwellings_2021', + 'DBIR2021_IDRI2021': 'db_ir_2021', + 'ADADGUID_ADAIDUGD': 'ada_dguid' +}, inplace=True) + +columns_ordered = ['pr_dguid', 'fed_dguid', 'er_dguid', 'cd_dguid', + 'dpl_dguid', 'ccs_dguid', 'csd_dguid', 'sac_type', 'sac_code', + 'cma_p_dguid', 'cma_dguid', + 'pop_ctr_p_dguid', 'pop_ctr_dguid', + 'ada_dguid', 'ct_dguid', 'da_dguid', 'db_dguid', + 'db_pop_2021', 'db_total_private_dwell_2021', 'db_usual_residents_dwellings_2021', 'db_ir_2021'] + +gaf_2021_df = gaf_2021_df.reindex(columns_ordered, axis=1) +print("Loading 2021 geographic attribute file to PostgreSQL as gaf_2021") +gaf_2021_df.to_sql(name='gaf_2021', + con=engine, + index=False, + chunksize=50000, + if_exists='replace', + schema='silver' + ) \ No newline at end of file diff --git a/geosuite/download.sh b/geosuite/download.sh new file mode 100755 index 0000000..404fb6d --- /dev/null +++ b/geosuite/download.sh @@ -0,0 +1,11 @@ +#!/bin/bash +if [ ! -d "${DATA_FOLDER}/geosuite" ] +then + echo "Making directory ${DATA_FOLDER}/geosuite/" + mkdir -p ${DATA_FOLDER}/geosuite/{input,extracted,output} +fi + +INPUT_FOLDER="${DATA_FOLDER}/geosuite/input" + +echo "Downloading geosuite files" +aria2c -x16 -i "${SCRIPT_DIR}/geosuite/files.txt" --dir=${INPUT_FOLDER} --auto-file-renaming=false diff --git a/geosuite/files.txt b/geosuite/files.txt new file mode 100755 index 0000000..04ba510 --- /dev/null +++ b/geosuite/files.txt @@ -0,0 +1,10 @@ +# 2021. Here is the reference guide https://web.archive.org/web/20240809014903/https://www150.statcan.gc.ca/n1/pub/92-150-g/92-150-g2021001-eng.htm +https://www12.statcan.gc.ca/census-recensement/2021/geo/aip-pia/geosuite/files-fichiers/2021_92-150-X_eng.zip +# 2016. 
Here is the reference guide https://web.archive.org/web/20250115043056/https://www150.statcan.gc.ca/n1/pub/92-150-g/92-150-g2016002-eng.htm +https://www12.statcan.gc.ca/census-recensement/2016/geo/ref/geosuite/files-fichiers/GeoSuite_2016_92-150_XBB_eng.zip +# 2011. Here is the reference guide https://www150.statcan.gc.ca/n1/pub/92-150-g/92-150-g2011001-eng.htm +https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2011_92-150_XBB_eng.zip +# 2006. Here is the reference guide +https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2006_92-150_XBB_eng.zip +# 2001. Here is the reference guide +https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/92F0150WCB2001000.zip \ No newline at end of file diff --git a/geosuite/load.sh b/geosuite/load.sh new file mode 100755 index 0000000..70b0935 --- /dev/null +++ b/geosuite/load.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +INPUT_FOLDER="${DATA_FOLDER}/geosuite/input" +EXTRACTED_FOLDER="${DATA_FOLDER}/geosuite/extracted" + +import_2021() { + echo "Unzipping 2021 geosuite data" + unzip -n "${INPUT_FOLDER}/2021_92-150-X_eng.zip" -d ${EXTRACTED_FOLDER} + python geosuite/process.py ${EXTRACTED_FOLDER}/2021_92-150-X_eng/PN.csv +} + +import_2021 \ No newline at end of file diff --git a/geosuite/problems.ipynb b/geosuite/problems.ipynb new file mode 100644 index 0000000..e18b5b6 --- /dev/null +++ b/geosuite/problems.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "dacb31a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading /home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv\n" + ] + } + ], + "source": [ + "#!/usr/bin/env python\n", + "# coding: utf-8\n", + "import os\n", + "import sys\n", + "\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "from sqlalchemy import text\n", + "\n", + 
"placenames_2021_csv = \"/home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv\"\n", + "\n", + "DATABASE = os.environ.get(\"POSTGRES_DB\")\n", + "HOST = os.environ.get(\"WAREHOUSE_PG_HOST\")\n", + "USER = os.environ.get(\"POSTGRES_USER\")\n", + "PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n", + "\n", + "#engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@{HOST}:5432/{DATABASE}\")\n", + "\n", + "print(f\"Reading {placenames_2021_csv}\")\n", + "placenames = pd.read_csv(filepath_or_buffer=placenames_2021_csv,\n", + " encoding='latin-1',\n", + " usecols=['PNdguid', 'PNname', 'PNsource', 'PNrplat', 'PNrplong'])\n", + "\n", + "placenames.rename(columns={\n", + " 'PNdguid': 'pn_dguid',\n", + " 'PNname': 'pn_name',\n", + " 'PNsource': 'pn_source',\n", + " 'PNrplat': 'latitude',\n", + " 'PNrplong': 'longitude'\n", + "}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d2d4d385", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "rec.array([(4269, '2021S0515005422', 'Cascapédia\\x96Saint-Jules', 1, 48.25, -65.9166667)],\n", + " dtype=[('index', '> ${output_file} + done +} + +INPUT_FOLDER="${DATA_FOLDER}/national_address_register/input" +EXTRACTED_FOLDER="${DATA_FOLDER}/national_address_register/extracted" +SCRATCH_FOLDER="${DATA_FOLDER}/national_address_register/scratch" + +import_202412() { + # Process 202412 + # Extract files + echo "Extracting ${INPUT_FOLDER}/202412.zip" + unzip -q -n ${INPUT_FOLDER}/202412.zip -d ${EXTRACTED_FOLDER}/202412 + if [ ! 
-f ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv ] + then + echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv" + echo "LOC_GUID,ADDR_GUID,APT_NO_LABEL,CIVIC_NO,CIVIC_NO_SUFFIX,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STREET_DIR,MAIL_MUN_NAME,MAIL_PROV_ABVN,MAIL_POSTAL_CODE,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv + fi + + if [ $(head ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv | wc -l) -ne 10 ] + then + echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv" + concatenate_csvs "${EXTRACTED_FOLDER}/202412/Addresses" "${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv" + fi + + if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv ] + then + echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv" + echo "LOC_GUID,CSD_CODE,FED_CODE,FED_ENG_NAME,FED_FRE_NAME,ER_CODE,ER_ENG_NAME,ER_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv + fi + + if [ $(head ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv | wc -l) -ne 10 ] + then + echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv" + concatenate_csvs "${EXTRACTED_FOLDER}/202412/Locations" "${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv" + fi + python national_address_register/process.py ${SCRATCH_FOLDER}/statcan_nar_addresses_202412.csv ${SCRATCH_FOLDER}/statcan_nar_locations_202412.csv 202412 utf-8 +} + +import_202406() { + # Process 202406 + echo "Extracting ${INPUT_FOLDER}/2024.zip" + unzip -q -n ${INPUT_FOLDER}/2024.zip -d ${EXTRACTED_FOLDER}/202406 + if [ ! 
-f ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv ] + then + echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv" + echo "LOC_GUID,ADDR_GUID,APT_NO_LABEL,CIVIC_NO,CIVIC_NO_SUFFIX,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STREET_DIR,MAIL_MUN_NAME,MAIL_PROV_ABVN,MAIL_POSTAL_CODE,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv + fi + + if [ $(head ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv | wc -l) -ne 10 ] + then + echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv" + concatenate_csvs "${EXTRACTED_FOLDER}/202406/Addresses" "${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv" + fi + + if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv ] + then + echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv" + echo "LOC_GUID,CSD_CODE,FED_CODE,FED_ENG_NAME,FED_FRE_NAME,ER_CODE,ER_ENG_NAME,ER_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv + fi + + if [ $(head ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv | wc -l) -ne 10 ] + then + echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv" + concatenate_csvs "${EXTRACTED_FOLDER}/202406/Locations" "${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv" + fi + python national_address_register/process.py ${SCRATCH_FOLDER}/statcan_nar_addresses_202406.csv ${SCRATCH_FOLDER}/statcan_nar_locations_202406.csv 202406 utf-8 +} + +import_2023() { + # Process 2023 + echo "Extracting ${INPUT_FOLDER}/2023.zip" + unzip -q -n ${INPUT_FOLDER}/2023.zip -d ${EXTRACTED_FOLDER}/2023 + if [ ! 
-f ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv ] + then + echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv" + echo "LOC_GUID,ADDR_GUID,APT_NO_LABEL,CIVIC_NO,CIVIC_NO_SUFFIX,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STEET_DIR,MAIL_MUN_NAME,MAIL_PROV_ABVN,MAIL_POSTAL_CODE,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv + fi + + if [ $(head ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv | wc -l) -ne 10 ] + then + echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv" + concatenate_csvs "${EXTRACTED_FOLDER}/2023/Addresses" "${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv" + fi + + if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv ] + then + echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv" + echo "LOC_GUID,CSD_CODE,FED_2021_CODE,FED_2021_ENG_NAME,FED_2021_FRE_NAME,ER_2021_CODE,ER_2021_ENG_NAME,ER_2021_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv + fi + + if [ $(head ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv | wc -l) -ne 10 ] + then + echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv" + concatenate_csvs "${EXTRACTED_FOLDER}/2023/Locations" "${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv" + fi + python national_address_register/process.py ${SCRATCH_FOLDER}/statcan_nar_addresses_2023.csv ${SCRATCH_FOLDER}/statcan_nar_locations_2023.csv 2023 latin-1 +} + +import_2022() { + # Process 2022 + echo "Extracting ${INPUT_FOLDER}/2022.zip" + unzip -q -n ${INPUT_FOLDER}/2022.zip -d ${EXTRACTED_FOLDER}/2022 + if [ !
-f ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv ] + then + echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv" + echo "LOC_GUID,ADDR_GUID,CIVIC_NO,CIVIC_NO_SUFFIX,APT_NO_LABEL,OFFICIAL_STREET_NAME,OFFICIAL_STREET_TYPE,OFFICIAL_STREET_DIR,PROV_CODE,CSD_ENG_NAME,CSD_FRE_NAME,CSD_TYPE_ENG_CODE,CSD_TYPE_FRE_CODE,MAIL_STREET_NAME,MAIL_STREET_TYPE,MAIL_STREET_DIR,MAIL_MUN_NAME,MAIL_POSTAL_CODE,MAIL_PROV_ABVN,BG_DLS_LSD,BG_DLS_QTR,BG_DLS_SCTN,BG_DLS_TWNSHP,BG_DLS_RNG,BG_DLS_MRD,BG_X,BG_Y,BU_N_CIVIC_ADD,BU_USE" > ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv + fi + + if [ $(head ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv | wc -l) -ne 10 ] + then + echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv" + concatenate_csvs "${EXTRACTED_FOLDER}/2022/Addresses" "${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv" + fi + + if [ ! -f ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv ] + then + echo "Adding header file to ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv" + echo "LOC_GUID,CSD_CODE,FED_2016_CODE,FED_2016_ENG_NAME,FED_2016_FRE_NAME,ER_2016_CODE,ER_2016_ENG_NAME,ER_2016_FRE_NAME,REPPOINT_LATITUDE,REPPOINT_LONGITUDE" > ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv + fi + + if [ $(head ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv | wc -l) -ne 10 ] + then + echo "Appending Addresses CSVs to ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv" + concatenate_csvs "${EXTRACTED_FOLDER}/2022/Locations" "${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv" + fi + python national_address_register/process.py ${SCRATCH_FOLDER}/statcan_nar_addresses_2022.csv ${SCRATCH_FOLDER}/statcan_nar_locations_2022.csv 2022 latin-1 +} + +import_202412 +import_202406 +import_2023 +import_2022 \ No newline at end of file diff --git a/national_address_register/national_address_register_files.txt b/national_address_register/national_address_register_files.txt new file mode 100644 index 0000000..c02b883 --- /dev/null +++ 
b/national_address_register/national_address_register_files.txt @@ -0,0 +1,8 @@ +# December 2024 +https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/202412.zip +# June 2024 +https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/2024.zip +# 2023 +https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/2023.zip +# 2022 +https://www150.statcan.gc.ca/n1/pub/46-26-0002/2022001/2022.zip \ No newline at end of file diff --git a/national_address_register/process.py b/national_address_register/process.py new file mode 100755 index 0000000..7cc744a --- /dev/null +++ b/national_address_register/process.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# coding: utf-8 +import os +import sys + +import geopandas as gpd +import pandas as pd +from sqlalchemy import create_engine + +statcan_nar_addresses_csv = sys.argv[1] +statcan_nar_locations_csv = sys.argv[2] +vintage = sys.argv[3] +encoding = sys.argv[4] + +print(f"Reading {statcan_nar_addresses_csv}") +statcan_nar_addresses = pd.read_csv(filepath_or_buffer=statcan_nar_addresses_csv, + dtype={ + "CIVIC_NO": "Int32", + "PROV_CODE": object, + "BU_USE": "Int8", + "BG_DLS_LSD": object, + "BG_DLS_QTR": object, + "BG_DLS_SCTN": object, + "BG_DLS_TWNSHP": object, + "BG_DLS_RNG": object, + "BG_DLS_MRD": object + }, + encoding=encoding) + +print(f"Reading {statcan_nar_locations_csv}") +statcan_nar_locations = pd.read_csv(filepath_or_buffer=statcan_nar_locations_csv, + usecols=["LOC_GUID", + "REPPOINT_LATITUDE", + "REPPOINT_LONGITUDE"], + encoding=encoding) + +print(f"Combining {statcan_nar_addresses_csv} and {statcan_nar_locations_csv}") +statcan_nar_addresses_combined = pd.merge(statcan_nar_addresses, + statcan_nar_locations, + on="LOC_GUID", how="inner") + +del statcan_nar_addresses +del statcan_nar_locations + +DATABASE = os.environ.get("POSTGRES_DB") +HOST = os.environ.get("WAREHOUSE_PG_HOST") +USER = os.environ.get("POSTGRES_USER") +PASSWORD = os.environ.get("POSTGRES_PASSWORD") + +engine = 
create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:5432/{DATABASE}") + +print("Creating geodataframe from combined address file") +gdf = gpd.GeoDataFrame( + statcan_nar_addresses_combined, + geometry=gpd.points_from_xy(statcan_nar_addresses_combined.REPPOINT_LONGITUDE, + statcan_nar_addresses_combined.REPPOINT_LATITUDE), + crs="EPSG:4326" +) + +print("Dropping 'REPPOINT_LATITUDE', 'REPPOINT_LONGITUDE' from geodataframe") +gdf.drop(columns=["REPPOINT_LATITUDE", "REPPOINT_LONGITUDE"], + inplace=True) + +print("Converting geodataframe to EPSG:3857") +gdf.to_crs(3857, inplace=True) +print(f"Loading geodatframe to PostgreSQL as statcan_nar_addresses_combined_{vintage}") +gdf.to_postgis(name=f"statcan_nar_addresses_combined_{vintage}", + con=engine, + chunksize=150000) diff --git a/open_databases/files_opendatabase_addresses.txt b/open_databases/files_opendatabase_addresses.txt new file mode 100644 index 0000000..9661319 --- /dev/null +++ b/open_databases/files_opendatabase_addresses.txt @@ -0,0 +1,10 @@ +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_AB_v1.zip +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_BC_v1.zip +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_MB_v1.zip +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_NB_v1.zip +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_NT_v1.zip +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_NS_v1.zip +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_ON_v1.zip +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_PE_v1.zip +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_QC_v1.zip +https://www150.statcan.gc.ca/n1/en/pub/46-26-0001/2021001/ODA_SK_v1.zip \ No newline at end of file diff --git a/open_databases/files_opendatabase_buildings.txt b/open_databases/files_opendatabase_buildings.txt new file mode 100644 index 0000000..4fd116e --- /dev/null +++ b/open_databases/files_opendatabase_buildings.txt 
@@ -0,0 +1,8 @@ +https://www150.statcan.gc.ca/n1/en/pub/34-26-0001/2018001/ODB_v2_Alberta.zip?st=0J_AsIyy +https://www150.statcan.gc.ca/n1/en/pub/34-26-0001/2018001/ODB_v2_BritishColumbia.zip +https://www150.statcan.gc.ca/n1/en/pub/34-26-0001/2018001/ODB_v2_NewBrunswick.zip?st=k35-Ygwr +https://www150.statcan.gc.ca/n1/en/pub/34-26-0001/2018001/ODB_v2_NorthwestTerritories.zip?st=SXozU436 +https://www150.statcan.gc.ca/n1/en/pub/34-26-0001/2018001/ODB_v2_NovaScotia.zip +https://www150.statcan.gc.ca/n1/en/pub/34-26-0001/2018001/ODB_v2_Ontario.zip +https://www150.statcan.gc.ca/n1/en/pub/34-26-0001/2018001/ODB_v2_Quebec.zip +https://www150.statcan.gc.ca/n1/en/pub/34-26-0001/2018001/ODB_v2_Saskatchewan.zip \ No newline at end of file diff --git a/open_databases/files_opendatabase_cultural.txt b/open_databases/files_opendatabase_cultural.txt new file mode 100644 index 0000000..51a262e --- /dev/null +++ b/open_databases/files_opendatabase_cultural.txt @@ -0,0 +1 @@ +https://www150.statcan.gc.ca/n1/en/pub/21-26-0001/2020001/ODCAF_V1.0.zip \ No newline at end of file diff --git a/open_databases/files_opendatabase_educational.txt b/open_databases/files_opendatabase_educational.txt new file mode 100644 index 0000000..3161708 --- /dev/null +++ b/open_databases/files_opendatabase_educational.txt @@ -0,0 +1 @@ +https://www150.statcan.gc.ca/n1/en/pub/37-26-0001/2022001/ODEF_v2.1.zip \ No newline at end of file diff --git a/open_databases/files_opendatabase_greenhouses.txt b/open_databases/files_opendatabase_greenhouses.txt new file mode 100644 index 0000000..70fb308 --- /dev/null +++ b/open_databases/files_opendatabase_greenhouses.txt @@ -0,0 +1 @@ +https://www150.statcan.gc.ca/n1/en/pub/32-26-0005/2023001/ODG_V1.zip \ No newline at end of file diff --git a/open_databases/files_opendatabase_healthcare.txt b/open_databases/files_opendatabase_healthcare.txt new file mode 100644 index 0000000..f8e5052 --- /dev/null +++ b/open_databases/files_opendatabase_healthcare.txt @@ -0,0 +1 @@ 
+https://www150.statcan.gc.ca/n1/en/pub/13-26-0001/2020001/ODHF_v1.1.zip \ No newline at end of file diff --git a/open_databases/files_opendatabase_recreation.txt b/open_databases/files_opendatabase_recreation.txt new file mode 100644 index 0000000..711ca77 --- /dev/null +++ b/open_databases/files_opendatabase_recreation.txt @@ -0,0 +1 @@ +https://www150.statcan.gc.ca/n1/en/pub/21-26-0002/2021001/ODRSF_v1.0.zip \ No newline at end of file diff --git a/open_databases/import_opendatabase.sh b/open_databases/import_opendatabase.sh new file mode 100644 index 0000000..b03173d --- /dev/null +++ b/open_databases/import_opendatabase.sh @@ -0,0 +1,96 @@ +#!/bin/bash + +DATA_FOLDER=/home/ripledi/Documents/projects/process-statcan-spatial-data/data + +source credentials.sh + +export_postgis_single() { + local filepath=$1 + local table_name=$2 + local extra_parameters=${@:3} + + # Virtual file system + if [[ ${filepath: -4} = '.zip' ]]; then + local filepath="/vsizip/${filepath}" + fi + + echo "Importing ${filepath}" + ogr2ogr \ + --config PG_USE_COPY YES \ + -overwrite \ + -f "PostgreSQL" \ + "PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432" \ + -progress \ + -gt 500000 \ + -t_srs EPSG:4326 \ + -nln ${table_name} \ + ${extra_parameters} \ + ${filepath} +} + +export_open_database_of_greenhouses() { + export PGCLIENTENCODING=UTF-8; + export_postgis_single ${DATA_FOLDER}/ODG_V1/odg_v1.shp statcan_odg_tmp +} + +export_open_database_of_buildings() { + # Open Database of Buildings + export PGCLIENTENCODING=UTF-8; + export_postgis_single ${DATA_FOLDER}/ODB_Alberta/odb_alberta.shp statcan_odb_tmp "-nlt PROMOTE_TO_MULTI" + export_postgis_single ${DATA_FOLDER}/ODB_BritishColumbia/odb_britishcolumbia.shp statcan_odb_tmp "-append -nlt PROMOTE_TO_MULTI" + export_postgis_single ${DATA_FOLDER}/ODB_NewBrunswick/odb_newbrunswick.shp statcan_odb_tmp "-append -nlt PROMOTE_TO_MULTI" + export_postgis_single 
${DATA_FOLDER}/ODB_NorthwestTerritories/odb_northwestterritories.shp statcan_odb_tmp "-append -nlt PROMOTE_TO_MULTI" + export_postgis_single ${DATA_FOLDER}/ODB_NovaScotia/odb_novascotia.shp statcan_odb_tmp "-append -nlt PROMOTE_TO_MULTI" + export_postgis_single ${DATA_FOLDER}/ODB_Ontario/odb_ontario.shp statcan_odb_tmp "-append -nlt PROMOTE_TO_MULTI" + export_postgis_single ${DATA_FOLDER}/ODB_Quebec/odb_quebec.shp statcan_odb_tmp "-append -nlt PROMOTE_TO_MULTI" + export_postgis_single ${DATA_FOLDER}/ODB_Saskatchewan/odb_saskatchewan.shp statcan_odb_tmp "-append -nlt PROMOTE_TO_MULTI" +} + +export_open_database_of_educational_facilities() { + export PGCLIENTENCODING=LATIN-1; + export_postgis_single ${DATA_FOLDER}/ODEF_v2.1_EN/ODEF_v2_1.csv statcan_odef_tmp "-oo X_POSSIBLE_NAMES=Longitude, -oo Y_POSSIBLE_NAMES=Latitude -s_srs EPSG:4326" +} + +export_open_database_of_healthcare_facilities() { + export PGCLIENTENCODING=LATIN-1; + # TODO: process further + # There are issues with the characters in this file, example <97> + export_postgis_single ${DATA_FOLDER}/ODHF_v1.1/odhf_v1.1.csv statcan_odhf_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326" +} + +export_open_database_of_cultural_and_art_facilities() { + export PGCLIENTENCODING=LATIN-1; + # TODO: process further + export_postgis_single ${DATA_FOLDER}/ODCAF_V1.0/ODCAF_v1.0.csv statcan_odcaf_tmp "-oo X_POSSIBLE_NAMES=Longitude, -oo Y_POSSIBLE_NAMES=Latitude -s_srs EPSG:4326" +} + +export_open_database_of_addresses() { + # PGCLIENTENCODING=UTF-8 seems to have fixed all of the issues + export PGCLIENTENCODING=UTF-8; + export_postgis_single ${DATA_FOLDER}/ODA_AB_v1.csv statcan_oda_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326" + export_postgis_single ${DATA_FOLDER}/ODA_BC_v1.csv statcan_oda_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326 -append" + export_postgis_single ${DATA_FOLDER}/ODA_MB_v1.csv statcan_oda_tmp 
"-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326 -append" + export_postgis_single ${DATA_FOLDER}/ODA_NB_v1.csv statcan_oda_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326 -append" + export_postgis_single ${DATA_FOLDER}/ODA_NS_v1.csv statcan_oda_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326 -append" + export_postgis_single ${DATA_FOLDER}/ODA_NT_v1.csv statcan_oda_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326 -append" + export_postgis_single ${DATA_FOLDER}/ODA_ON_v1.csv statcan_oda_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326 -append" + export_postgis_single ${DATA_FOLDER}/ODA_PE_v1.csv statcan_oda_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326 -append" + export_postgis_single ${DATA_FOLDER}/ODA_QC_v1.csv statcan_oda_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326 -append" + export_postgis_single ${DATA_FOLDER}/ODA_SK_v1.csv statcan_oda_tmp "-oo X_POSSIBLE_NAMES=longitude, -oo Y_POSSIBLE_NAMES=latitude -s_srs EPSG:4326 -append" +} + +export_open_database_of_recreational_and_sport_facilities() { + export PGCLIENTENCODING=LATIN-1; + # TODO: process further + export_postgis_single ${DATA_FOLDER}/ODRSF_V1.0/ODRSF_v1.0.csv statcan_odrsf_tmp "-oo X_POSSIBLE_NAMES=Longitude, -oo Y_POSSIBLE_NAMES=Latitude -s_srs EPSG:4326" +} + + + +#export_open_database_of_greenhouses +#export_open_database_of_buildings +#export_open_database_of_educational_facilities +#export_open_database_of_healthcare_facilities +#export_open_database_of_cultural_and_art_facilities +#export_open_database_of_addresses +export_open_database_of_recreational_and_sport_facilities diff --git a/open_databases/standardize.sql b/open_databases/standardize.sql new file mode 100644 index 0000000..b0ef650 --- /dev/null +++ b/open_databases/standardize.sql @@ -0,0 
+1,112 @@ +/* Open Databases */ + +/* Open Database of Greenhouses */ +drop table if exists statcan_odg_2023; +create table statcan_odg_2023 as +select b.dguid as prdguid, b.prename as provincenameenglish, a.imagedate, a.datasource as provider, wkb_geometry as geom +from statcan_odg_tmp as a, + statcan_pr_2021 as b + where st_intersects(a.wkb_geometry, b.geom); + +create index statcan_odg_2023_geom_idx on statcan_odg_2023 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_odg_tmp; + +/* Open Database of Buildings */ +drop table if exists statcan_odb_2019; +create table statcan_odb_2019 as +select b.dguid as csddguid, b.csdname, a.data_prov as data_provider, a.build_id, a.wkb_geometry as geom +from statcan_odb_tmp as a, + statcan_csd_2021 as b + where st_intersects(a.wkb_geometry, b.geom); + +create index statcan_odb_2019_geom_idx on statcan_odb_2019 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_odb_tmp; + +/* Open Database of Educational Facilities */ +drop table if exists statcan_odef_2022; +create table statcan_odef_2022 as +select index, source_id, facility_name, facility_type, authority_name, isced010, isced020, isced1, isced2, isced3, isced4plus, olms_status, unit, street_no, street_name, city, prov_terr, + postal_code, a.pruid, csdname, csduid, geo_source, provider, cmaname, cmauid, wkb_geometry as geom +from statcan_odef_tmp as a, + statcan_pr_2021 as b + where st_intersects(a.wkb_geometry, b.geom); + +create index statcan_odef_2022_geom_idx on statcan_odef_2022 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_odef_tmp; + +/* Open Database of Healthcare Facilities */ +drop table if exists statcan_odhf_2020; +create table statcan_odhf_2020 as +select index, facility_name, source_facility_type, odhf_facility_type, provider, unit, street_no, street_name, postal_code, + b.dguid as csddguid, b.csdname, c.dguid as prdguid, c.prename, a.wkb_geometry as geom +from statcan_odhf_tmp as a, + statcan_csd_2021 as b, + statcan_pr_2021 as c +where
a.wkb_geometry is not null + and st_intersects(a.wkb_geometry, b.geom) + and st_intersects(a.wkb_geometry, c.geom); + +create index statcan_odhf_2020_geom_idx on statcan_odhf_2020 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_odhf_tmp; + +/* Open Database of Cultural and Art Facilities */ +drop table if exists statcan_odcaf_2020; +create table statcan_odcaf_2020 as +select index, facility_name, source_facility_type, odcaf_facility_type, provider, unit, street_no, + street_name, postal_code, city, prov_terr, csd_name, csduid, pruid, wkb_geometry as geom + from statcan_odcaf_tmp; + +create index statcan_odcaf_2020_geom_idx on statcan_odcaf_2020 using GIST(geom) with (FILLFACTOR=100); + +update statcan_odcaf_2020 + set facility_name = '' + where facility_name = '..'; +update statcan_odcaf_2020 + set source_facility_type = '' + where source_facility_type = '..'; +update statcan_odcaf_2020 + set unit = '' + where unit = '..'; +update statcan_odcaf_2020 + set street_no = '' + where street_no = '..'; +update statcan_odcaf_2020 + set street_name = '' + where street_name = '..'; +update statcan_odcaf_2020 + set postal_code = '' + where postal_code = '..'; +update statcan_odcaf_2020 + set city = '' + where city = '..'; +update statcan_odcaf_2020 + set prov_terr = '' + where prov_terr = '..'; +update statcan_odcaf_2020 + set csd_name = '' + where csd_name = '..'; +update statcan_odcaf_2020 + set csduid = '' + where csduid = '..'; +update statcan_odcaf_2020 + set pruid = '' + where pruid = '..'; + +drop table if exists statcan_odcaf_tmp; + +/* Open Database of Addresses */ +drop table if exists statcan_oda_2021; +create table statcan_oda_2021 as +select a.id, a.street_no, a.street, a.unit, a.postal_code, b.dguid as csddguid, b.csdname, c.dguid as prdguid, a.provider, wkb_geometry as geom +from statcan_oda_tmp as a, + statcan_csd_2021 as b, + statcan_pr_2021 as c + where st_intersects(a.wkb_geometry, b.geom) + and b.pruid =
c.pruid; + +create index statcan_oda_2021_geom_idx on statcan_oda_2021 using GIST(geom) with (FILLFACTOR=100); +drop table if exists statcan_oda_tmp; + +/* Open Database of Recreational and Sport Facilities */ +-- TODO; \ No newline at end of file diff --git a/road_network_files/README.md b/road_network_files/README.md new file mode 100644 index 0000000..304fb13 --- /dev/null +++ b/road_network_files/README.md @@ -0,0 +1,4 @@ +# TODO +- Process 2006 Geographic Attribute File Road Network +- Process 2001 road network +- Change loading of Census Road Network files as a function \ No newline at end of file diff --git a/road_network_files/download.sh b/road_network_files/download.sh new file mode 100755 index 0000000..c68cd8e --- /dev/null +++ b/road_network_files/download.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ ! -d "${DATA_FOLDER}/road_network_files" ] +then + echo "Making directory ${DATA_FOLDER}/road_network_files/" + mkdir -p ${DATA_FOLDER}/road_network_files/{input,extracted,output}/{2021,2016,2011,2006,2001} +fi + +INPUT_FOLDER="${DATA_FOLDER}/road_network_files/input" + +echo "Downloading 2021 road network file" +aria2c -x16 -i "${SCRIPT_DIR}/road_network_files/road_network_file_2021.txt" --dir="${INPUT_FOLDER}/2021" --auto-file-renaming=false +echo "Downloading 2016 road network file" +aria2c -x16 -i "${SCRIPT_DIR}/road_network_files/road_network_file_2016.txt" --dir="${INPUT_FOLDER}/2016" --auto-file-renaming=false +echo "Downloading 2011 road network file" +aria2c -x16 -i "${SCRIPT_DIR}/road_network_files/road_network_file_2011.txt" --dir="${INPUT_FOLDER}/2011" --auto-file-renaming=false +echo "Downloading 2006 road network file" +aria2c -x16 -i "${SCRIPT_DIR}/road_network_files/road_network_file_2006.txt" --dir="${INPUT_FOLDER}/2006" --auto-file-renaming=false +echo "Downloading 2001 road network file" +aria2c -x16 -i "${SCRIPT_DIR}/road_network_files/road_network_file_2001.txt" --dir="${INPUT_FOLDER}/2001" --auto-file-renaming=false diff --git
a/road_network_files/load.sh b/road_network_files/load.sh new file mode 100755 index 0000000..1352c23 --- /dev/null +++ b/road_network_files/load.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +import_to_postgis() { + local filepath="$1" + local table_name="$2" + local extra_parameters="${@:3}" + + # Handle zip files using GDAL's virtual file system + if [[ "${filepath: -4}" == ".zip" ]]; then + filepath="/vsizip/${filepath}" + fi + + echo "Importing ${filepath} into table bronze.${table_name}" + ogr2ogr \ + --config PG_USE_COPY YES \ + -lco "OVERWRITE=YES" \ + -f "PostgreSQL" \ + "PG:host=db dbname=${POSTGRES_DB} user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} port=5432" \ + -lco GEOMETRY_NAME=geom \ + -progress \ + -gt 500000 \ + -t_srs EPSG:4326 \ + -nln "${table_name}" \ + ${extra_parameters} \ + "${filepath}" +} + +# Define input folders +INPUT_FOLDER="${DATA_FOLDER}/road_network_files/input" +EXTRACTED_FOLDER="${DATA_FOLDER}/road_network_files/extracted" + +### 2021 Road Network File ### +# https://web.archive.org/web/20230307163203/https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/index2021-eng.cfm?year=21 +export PGCLIENTENCODING=LATIN-1; +import_to_postgis "${INPUT_FOLDER}/2021/lrnf000r21a_e.zip" lrnf000r21a_e "-lco COLUMN_TYPES=afl_val=integer,atl_val=integer,afr_val=integer,atr_val=integer -lco SCHEMA=bronze" +unset PGCLIENTENCODING + +### 2016 Road Network File ### +# https://web.archive.org/web/20230120140926/https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2016-eng.cfm +import_to_postgis "${INPUT_FOLDER}/2016/lrnf000r16a_e.zip" lrnf000r16a_e "-lco COLUMN_TYPES=afl_val=integer,atl_val=integer,afr_val=integer,atr_val=integer -lco SCHEMA=bronze" + +### 2011 Road Network File ### +# https://web.archive.org/web/20230110163150/https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2011-eng.cfm +export PGCLIENTENCODING=LATIN-1; +import_to_postgis 
"${INPUT_FOLDER}/2011/grnf000r11a_e.zip" grnf000r11a_e "-lco COLUMN_TYPES=afl_val=integer,atl_val=integer,afr_val=integer,atr_val=integer -lco SCHEMA=bronze"
+unset PGCLIENTENCODING
+
+### 2006 Road Network File ###
+# https://web.archive.org/web/20221218043125/https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2006-eng.cfm
+export PGCLIENTENCODING=LATIN-1;
+import_to_postgis "${INPUT_FOLDER}/2006/grnf000r06a_e.zip" grnf000r06a_e "-lco COLUMN_TYPES=addr_fm_le=integer,addr_to_le=integer,addr_fm_rg=integer,addr_to_rg=integer -lco SCHEMA=bronze"
+unset PGCLIENTENCODING
+
+# TODO - get working
+### 2001 Road Network Files ###
+# https://web.archive.org/web/20221218043135/https://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/bound-limit-2001-eng.cfm
+# Census Road Network file
+#unzip -n ${INPUT_FOLDER}/grnf000r01m_e.zip -d ${EXTRACTED_FOLDER}
+#import_to_postgis ${EXTRACTED_FOLDER}/grnf000r02ml_e.MID statcan_grnf000r02ml_e_tmp
+## Census Skeletal Road Network File
+#unzip -n ${INPUT_FOLDER}/gsrn000r01m_e.zip -d ${EXTRACTED_FOLDER}
+#import_to_postgis ${DATA_FOLDER}/gsrn000r02m_e.MID statcan_gsrn000r02m_e_tmp
\ No newline at end of file
diff --git a/road_network_files/organize.sql b/road_network_files/organize.sql
new file mode 100644
index 0000000..132dbc8
--- /dev/null
+++ b/road_network_files/organize.sql
@@ -0,0 +1,69 @@
+/*
+Road Network
+
+Builds one statcan_rn_<year> table per census year from the raw road network
+import, adds a GiST index on geom, then drops the raw import.
+
+Each year runs in its own transaction: the raw import is dropped at the end of
+a successful run, so on a re-run the "create table ... as" fails, and the
+rollback keeps the statcan_rn_<year> table that the earlier run produced.
+*/
+
+--- 2016 Road Network;
+--- ngduid is renamed to ngd_uid, the column name used by the 2011 table;
+begin;
+drop table if exists statcan_rn_2016;
+create table statcan_rn_2016 as
+  select ngduid as ngd_uid, "name", type, dir, afl_val, atl_val, afr_val, atr_val,
+  csduid_l, csduid_r, concat(pruid_l, cmauid_l) as cmapuid_l, concat(pruid_r, cmauid_r) as cmapuid_r, pruid_l, pruid_r, rank, class, geom
+  from statcan_lrnf000r16a_e;
+
+create index statcan_rn_2016_geom_idx on statcan_rn_2016 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_lrnf000r16a_e;
+commit;
+
+--- 2011 Road Network;
+begin;
+drop table if exists statcan_rn_2011;
+create table statcan_rn_2011 as
+  select ngd_uid, "name", type, dir, afl_val, atl_val, afr_val, atr_val,
+  csduid_l, csduid_r, concat(pruid_l, cmauid_l) as cmapuid_l, concat(pruid_r, cmauid_r) as cmapuid_r, pruid_l, pruid_r, rank, class, geom
+  from statcan_grnf000r11a_e;
+
+create index statcan_rn_2011_geom_idx on statcan_rn_2011 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_grnf000r11a_e;
+commit;
+
+--- 2006 Road Network;
+--- direction and the address columns are renamed to the names used by the 2011 and 2016 tables;
+begin;
+drop table if exists statcan_rn_2006;
+create table statcan_rn_2006 as
+  select rb_uid, "name", type, direction as dir, addr_fm_le as afl_val, addr_to_le as atl_val, addr_fm_rg as afr_val, addr_to_rg as atr_val,
+  geom
+  from statcan_grnf000r06a_e;
+
+create index statcan_rn_2006_geom_idx on statcan_rn_2006 using GIST(geom) with (FILLFACTOR=100);
+drop table if exists statcan_grnf000r06a_e;
+commit;
+
+--- TODO;
+--- 2001 Road Network;
+--- The 2001 import is still commented out in load.sh, so the raw table may not
+--- exist. Build the 2001 table only when it does, instead of failing here;
+do $$
+begin
+  if to_regclass('statcan_grnf000r02ml_e_tmp') is null then
+    raise notice 'statcan_grnf000r02ml_e_tmp is not loaded, skipping the 2001 road network';
+    return;
+  end if;
+
+  drop table if exists statcan_rn_2001_tmp;
+  create table statcan_rn_2001_tmp as
+  select arc_id, "name", type, direction as dir, addr_fm_left as afl_val,
+    addr_to_left as atl_val, addr_fm_rght as afr_val, addr_to_rght as atr_val,
+    geom
+  from statcan_grnf000r02ml_e_tmp
+  where class = 'U';
+end
+$$;
diff --git a/road_network_files/process.sh b/road_network_files/process.sh
new file mode 100755
index 0000000..9402819
--- /dev/null
+++ b/road_network_files/process.sh
@@ -0,0 +1,2 @@
+# ON_ERROR_STOP makes psql stop at the first SQL error and exit non-zero instead of carrying on and returning 0.
+psql -v ON_ERROR_STOP=1 "postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@db/$POSTGRES_DB" -f road_network_files/road_network_files.sql
diff --git a/road_network_files/road_network_file_2001.txt b/road_network_files/road_network_file_2001.txt
new file mode 100644
index 0000000..d75c56a
--- /dev/null
+++ b/road_network_files/road_network_file_2001.txt
@@ -0,0 +1,4 @@
+# 2001 Census Road Network File
+https://data.dataforcanada.org/archive/statistics_canada/road_network_files/2001/grnf000r01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/RNF-FRR/files-fichiers/grnf000r01m_e.zip
+# 2001 Census Skeletal Road Network File
+https://data.dataforcanada.org/archive/statistics_canada/road_network_files/2001/gsrn000r01m_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/RNF-FRR/files-fichiers/gsrn000r01m_e.zip \ No newline at end of file diff --git a/road_network_files/road_network_file_2006.txt b/road_network_files/road_network_file_2006.txt new file mode 100644 index 0000000..968b356 --- /dev/null +++ b/road_network_files/road_network_file_2006.txt @@ -0,0 +1,2 @@ +# 2006 Census Road Network and Geographic Attribute File Road Network +https://data.dataforcanada.org/archive/statistics_canada/road_network_files/2006/grgf000r06a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/RNF-FRR/files-fichiers/grgf000r06a_e.zip \ No newline at end of file diff --git a/road_network_files/road_network_file_2011.txt b/road_network_files/road_network_file_2011.txt new file mode 100644 index 0000000..dda0400 --- /dev/null +++ b/road_network_files/road_network_file_2011.txt @@ -0,0 +1,2 @@ +# 2011 Census Road Network File +https://data.dataforcanada.org/archive/statistics_canada/road_network_files/2011/grnf000r11a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/RNF-FRR/files-fichiers/grnf000r11a_e.zip \ No newline at end of file diff --git a/road_network_files/road_network_file_2016.txt b/road_network_files/road_network_file_2016.txt new file mode 100644 index 0000000..902ed1c --- /dev/null +++ b/road_network_files/road_network_file_2016.txt @@ -0,0 +1,2 @@ +# 2016 Census Road Network File +https://data.dataforcanada.org/archive/statistics_canada/road_network_files/2016/lrnf000r16a_e.zip https://www12.statcan.gc.ca/census-recensement/2011/geo/RNF-FRR/files-fichiers/2016/lrnf000r16a_e.zip \ No newline at end of file diff --git a/road_network_files/road_network_file_2021.txt b/road_network_files/road_network_file_2021.txt new file mode 100644 index 0000000..a884d6c --- /dev/null +++ b/road_network_files/road_network_file_2021.txt @@ -0,0 +1,2 @@ +# 2021 Census Road 
Network File. Geodatabase was not working so fall back to shapefile
+https://data.dataforcanada.org/archive/statistics_canada/road_network_files/2021/lrnf000r21a_e.zip https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/RNF-FRR/files-fichiers/lrnf000r21a_e.zip
\ No newline at end of file
diff --git a/road_network_files/road_network_files.sql b/road_network_files/road_network_files.sql
new file mode 100644
index 0000000..b83a94e
--- /dev/null
+++ b/road_network_files/road_network_files.sql
@@ -0,0 +1,102 @@
+/*
+Road Network Files
+
+Builds silver.road_2021 from the 2021 road segments in bronze.lrnf000r21a_e,
+joined to silver.csd_2021 once for the census subdivision (CSD) on the left
+side of each segment and once for the CSD on its right side.
+*/
+
+/* 2021
+Definition here: https://web.archive.org/web/20240215050313/https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/az/definition-eng.cfm?ID=geo041
+*/
+
+-- Rebuild inside a single transaction: if any statement fails, the DROP is
+-- rolled back and the previous silver.road_2021 (if any) is kept.
+BEGIN;
+
+DROP TABLE IF EXISTS silver.road_2021;
+CREATE TABLE silver.road_2021 AS
+SELECT
+    -- Country attributes are exposed once rather than per side. Fall back to the
+    -- right-side CSD so they are not NULL when only the right side matches.
+    COALESCE(left_side.country_dguid, right_side.country_dguid) AS country_dguid,
+    COALESCE(left_side.country_en_name, right_side.country_en_name) AS country_en_name,
+    COALESCE(left_side.country_fr_name, right_side.country_fr_name) AS country_fr_name,
+    COALESCE(left_side.country_en_abbreviation, right_side.country_en_abbreviation) AS country_en_abbreviation,
+    COALESCE(left_side.country_fr_abbreviation, right_side.country_fr_abbreviation) AS country_fr_abbreviation,
+    left_side.grc_dguid AS grc_dguid_left,
+    right_side.grc_dguid AS grc_dguid_right,
+    left_side.grc_en_name AS grc_en_name_left,
+    right_side.grc_en_name AS grc_en_name_right,
+    left_side.grc_fr_name AS grc_fr_name_left,
+    right_side.grc_fr_name AS grc_fr_name_right,
+    left_side.pr_dguid AS pr_dguid_left,
+    right_side.pr_dguid AS pr_dguid_right,
+    left_side.pr_en_name AS pr_en_name_left,
+    right_side.pr_en_name AS pr_en_name_right,
+    left_side.pr_fr_name AS pr_fr_name_left,
+    right_side.pr_fr_name AS pr_fr_name_right,
+    left_side.pr_iso_code AS pr_iso_code_left,
+    right_side.pr_iso_code AS pr_iso_code_right,
+    left_side.car_dguid AS car_dguid_left,
+    right_side.car_dguid AS car_dguid_right,
+    left_side.car_en_name AS car_en_name_left,
+    right_side.car_en_name AS car_en_name_right,
+    left_side.car_fr_name AS car_fr_name_left,
+    right_side.car_fr_name AS car_fr_name_right,
+    left_side.er_dguid AS er_dguid_left,
+    right_side.er_dguid AS er_dguid_right,
+    left_side.er_name AS er_name_left,
+    right_side.er_name AS er_name_right,
+    left_side.cd_dguid AS cd_dguid_left,
+    right_side.cd_dguid AS cd_dguid_right,
+    left_side.cd_name AS cd_name_left,
+    right_side.cd_name AS cd_name_right,
+    left_side.cd_type AS cd_type_left,
+    right_side.cd_type AS cd_type_right,
+    left_side.ccs_dguid AS ccs_dguid_left,
+    right_side.ccs_dguid AS ccs_dguid_right,
+    left_side.ccs_name AS ccs_name_left,
+    right_side.ccs_name AS ccs_name_right,
+    left_side.cma_dguid AS cma_dguid_left,
+    right_side.cma_dguid AS cma_dguid_right,
+    left_side.cma_p_dguid AS cma_p_dguid_left,
+    right_side.cma_p_dguid AS cma_p_dguid_right,
+    left_side.cma_name AS cma_name_left,
+    right_side.cma_name AS cma_name_right,
+    left_side.cma_type AS cma_type_left,
+    right_side.cma_type AS cma_type_right,
+    left_side.csd_dguid AS csd_dguid_left,
+    right_side.csd_dguid AS csd_dguid_right,
+    left_side.csd_name AS csd_name_left,
+    right_side.csd_name AS csd_name_right,
+    left_side.csd_type AS csd_type_left,
+    right_side.csd_type AS csd_type_right,
+    left_side.sac_type AS sac_type_left,
+    right_side.sac_type AS sac_type_right,
+    left_side.sac_code AS sac_code_left,
+    right_side.sac_code AS sac_code_right,
+    road.ngd_uid AS road_ngd_uid,
+    road.name AS road_name,
+    road.type AS road_type,
+    road.dir AS road_direction,
+    road.afl_val AS road_address_from_left_value,
+    road.atl_val AS road_address_to_left_value,
+    road.afr_val AS road_address_from_right_value,
+    road.atr_val AS road_address_to_right_value,
+    road.rank AS road_rank,
+    road.class AS road_class,
+    road.geom
+FROM bronze.lrnf000r21a_e AS road
+-- LEFT JOINs keep road segments whose left or right CSD has no match.
+LEFT JOIN silver.csd_2021 AS left_side
+    ON road.csddguid_l = left_side.csd_dguid
+LEFT JOIN silver.csd_2021 AS right_side
+    ON road.csddguid_r = right_side.csd_dguid;
+
+-- Create spatial index
+CREATE INDEX road_2021_geom_idx ON
+silver.road_2021
+USING gist (geom) WITH (fillfactor = 100);
+
+COMMIT;