mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Initial commit
This commit is contained in:
Executable
+11
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Download the GeoSuite archives listed in geosuite/files.txt into
# ${DATA_FOLDER}/geosuite/input.
#
# Reads DATA_FOLDER and SCRIPT_DIR; neither is set by this script.

# Stop with a clear message instead of silently operating on "/geosuite"
# (or reading "/geosuite/files.txt") when a required variable is missing.
: "${DATA_FOLDER:?DATA_FOLDER must be set}"
: "${SCRIPT_DIR:?SCRIPT_DIR must be set}"

GEOSUITE_FOLDER="${DATA_FOLDER}/geosuite"
INPUT_FOLDER="${GEOSUITE_FOLDER}/input"

if [ ! -d "${GEOSUITE_FOLDER}" ]
then
    echo "Making directory ${GEOSUITE_FOLDER}/"
fi

# mkdir -p does nothing for directories that already exist, so it runs
# unconditionally: this also recreates a missing input/extracted/output
# subfolder when geosuite/ itself is already present. The brace list must
# stay outside the quotes for the expansion to happen.
mkdir -p "${GEOSUITE_FOLDER}/"{input,extracted,output}

echo "Downloading geosuite files"
# Paths are quoted so a DATA_FOLDER or SCRIPT_DIR containing spaces is
# passed to aria2c as a single argument.
aria2c -x16 -i "${SCRIPT_DIR}/geosuite/files.txt" --dir="${INPUT_FOLDER}" --auto-file-renaming=false
|
||||
Executable
+10
@@ -0,0 +1,10 @@
|
||||
# 2021. Here is the reference guide https://web.archive.org/web/20240809014903/https://www150.statcan.gc.ca/n1/pub/92-150-g/92-150-g2021001-eng.htm
|
||||
https://www12.statcan.gc.ca/census-recensement/2021/geo/aip-pia/geosuite/files-fichiers/2021_92-150-X_eng.zip
|
||||
# 2016. Here is the reference guide https://web.archive.org/web/20250115043056/https://www150.statcan.gc.ca/n1/pub/92-150-g/92-150-g2016002-eng.htm
|
||||
https://www12.statcan.gc.ca/census-recensement/2016/geo/ref/geosuite/files-fichiers/GeoSuite_2016_92-150_XBB_eng.zip
|
||||
# 2011. Here is the reference guide https://www150.statcan.gc.ca/n1/pub/92-150-g/92-150-g2011001-eng.htm
|
||||
https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2011_92-150_XBB_eng.zip
|
||||
# 2006. Reference guide: TODO - link not yet recorded
|
||||
https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/2006_92-150_XBB_eng.zip
|
||||
# 2001. Reference guide: TODO - link not yet recorded
|
||||
https://www12.statcan.gc.ca/census-recensement/2011/geo/ref/files-fichiers/92F0150WCB2001000.zip
|
||||
Executable
+12
@@ -0,0 +1,12 @@
|
||||
#!/bin/bash
# Extract the downloaded 2021 GeoSuite archive and hand its place-name CSV
# to geosuite/process.py.
#
# Reads DATA_FOLDER (and SCRIPT_DIR when it is set); neither is set here.

INPUT_FOLDER="${DATA_FOLDER}/geosuite/input"
EXTRACTED_FOLDER="${DATA_FOLDER}/geosuite/extracted"

# Unzip the 2021 archive into EXTRACTED_FOLDER, then run process.py on the
# extracted PN.csv.
import_2021() {
    echo "Unzipping 2021 geosuite data"
    # -n: never overwrite files that were already extracted.
    # Paths are quoted so folders containing spaces stay one argument.
    unzip -n "${INPUT_FOLDER}/2021_92-150-X_eng.zip" -d "${EXTRACTED_FOLDER}"
    # Resolve process.py against SCRIPT_DIR, as the download script does for
    # files.txt; fall back to the current directory when SCRIPT_DIR is unset,
    # which is how the path was resolved before.
    python "${SCRIPT_DIR:-.}/geosuite/process.py" "${EXTRACTED_FOLDER}/2021_92-150-X_eng/PN.csv"
}

import_2021
|
||||
@@ -0,0 +1,283 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "dacb31a5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Reading /home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#!/usr/bin/env python\n",
|
||||
"# coding: utf-8\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"import geopandas as gpd\n",
|
||||
"import pandas as pd\n",
|
||||
"from sqlalchemy import create_engine\n",
|
||||
"from sqlalchemy import text\n",
|
||||
"\n",
|
||||
"placenames_2021_csv = \"/home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv\"\n",
|
||||
"\n",
|
||||
"DATABASE = os.environ.get(\"POSTGRES_DB\")\n",
|
||||
"HOST = os.environ.get(\"WAREHOUSE_PG_HOST\")\n",
|
||||
"USER = os.environ.get(\"POSTGRES_USER\")\n",
|
||||
"PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n",
|
||||
"\n",
|
||||
"#engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@{HOST}:5432/{DATABASE}\")\n",
|
||||
"\n",
|
||||
"print(f\"Reading {placenames_2021_csv}\")\n",
|
||||
"placenames = pd.read_csv(filepath_or_buffer=placenames_2021_csv,\n",
|
||||
" encoding='latin-1',\n",
|
||||
" usecols=['PNdguid', 'PNname', 'PNsource', 'PNrplat', 'PNrplong'])\n",
|
||||
"\n",
|
||||
"placenames.rename(columns={\n",
|
||||
" 'PNdguid': 'pn_dguid',\n",
|
||||
" 'PNname': 'pn_name',\n",
|
||||
" 'PNsource': 'pn_source',\n",
|
||||
" 'PNrplat': 'latitude',\n",
|
||||
" 'PNrplong': 'longitude'\n",
|
||||
"}, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "d2d4d385",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"rec.array([(4269, '2021S0515005422', 'Cascapédia\\x96Saint-Jules', 1, 48.25, -65.9166667)],\n",
|
||||
" dtype=[('index', '<i8'), ('pn_dguid', 'O'), ('pn_name', 'O'), ('pn_source', '<i8'), ('latitude', '<f8'), ('longitude', '<f8')])"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"placenames[placenames['pn_dguid'] == '2021S0515005422'].to_records()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "5110c35c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"special_unicodes = []\n",
|
||||
"for record in placenames.to_records():\n",
|
||||
" pn_dguid = record[1]\n",
|
||||
" pn_name = record[2]\n",
|
||||
" if r'\\x' in repr(pn_name):\n",
|
||||
" special_unicodes.append((pn_dguid, pn_name))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "1d880cb1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"19"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#print(special_unicodes)\n",
|
||||
"len(special_unicodes)\n",
|
||||
"#dguids_affected = [x[0] for x in special_unicodes]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "7b320444",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['2021S0515005422',\n",
|
||||
" '2021S0515007864',\n",
|
||||
" '2021S0515017557',\n",
|
||||
" '2021S0515019487',\n",
|
||||
" '2021S0515019731',\n",
|
||||
" '2021S0515022795',\n",
|
||||
" '2021S0515024311',\n",
|
||||
" '2021S0515028429',\n",
|
||||
" '2021S0515030028',\n",
|
||||
" '2021S0515030168',\n",
|
||||
" '2021S0515030432',\n",
|
||||
" '2021S0515031197',\n",
|
||||
" '2021S0515031295',\n",
|
||||
" '2021S0515031660',\n",
|
||||
" '2021S0515032370',\n",
|
||||
" '2021S0515038300',\n",
|
||||
" '2021S0515038389',\n",
|
||||
" '2021S0515040448',\n",
|
||||
" '2021S0515040522']"
|
||||
]
|
||||
},
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dguids_affected"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "b36377ad",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"r'\\x' in r'Cascapédia\\x96Saint-Jules'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a68fe5c3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Creating geodataframe from placenames file\")\n",
|
||||
"gdf = gpd.GeoDataFrame(\n",
|
||||
" placenames, \n",
|
||||
" geometry=gpd.points_from_xy(placenames.longitude,\n",
|
||||
" placenames.latitude),\n",
|
||||
" crs=\"EPSG:4326\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Dropping 'latitude', 'longitude' from geodataframe\")\n",
|
||||
"gdf.drop(columns=[\"latitude\", \"longitude\"], \n",
|
||||
" inplace=True)\n",
|
||||
"\n",
|
||||
"print(f\"Loading geodatframe to PostgreSQL as statcan_pn_2021_tmp\")\n",
|
||||
"gdf.to_postgis(name=f\"statcan_pn_2021_tmp\", \n",
|
||||
" con=engine,\n",
|
||||
" chunksize=150000,\n",
|
||||
" if_exists='replace')\n",
|
||||
"\n",
|
||||
"print(\"Creating statcan_pn_2021\")\n",
|
||||
"sql = \"\"\"\n",
|
||||
"DROP TABLE IF EXISTS statcan_pn_2021;\n",
|
||||
"\n",
|
||||
"CREATE TABLE statcan_pn_2021 AS\n",
|
||||
"SELECT \n",
|
||||
"db.country_dguid,\n",
|
||||
"db.country_en_name, \n",
|
||||
"db.country_fr_name,\n",
|
||||
"db.country_en_abbreviation,\n",
|
||||
"db.country_fr_abbreviation,\n",
|
||||
"db.grc_dguid,\n",
|
||||
"db.grc_en_name,\n",
|
||||
"db.grc_fr_name,\n",
|
||||
"db.pr_dguid,\n",
|
||||
"db.pr_en_name,\n",
|
||||
"db.pr_fr_name,\n",
|
||||
"db.pr_en_abbreviation,\n",
|
||||
"db.pr_fr_abbreviation,\n",
|
||||
"db.pr_iso_code,\n",
|
||||
"db.car_dguid,\n",
|
||||
"db.car_en_name,\n",
|
||||
"db.car_fr_name,\n",
|
||||
"db.er_dguid,\n",
|
||||
"db.er_name,\n",
|
||||
"db.cd_dguid,\n",
|
||||
"db.cd_name,\n",
|
||||
"db.cd_type,\n",
|
||||
"db.ccs_dguid,\n",
|
||||
"db.ccs_name,\n",
|
||||
"db.cma_dguid,\n",
|
||||
"db.cma_p_dguid,\n",
|
||||
"db.cma_name,\n",
|
||||
"db.cma_type,\n",
|
||||
"db.csd_dguid,\n",
|
||||
"db.csd_name,\n",
|
||||
"db.csd_type,\n",
|
||||
"db.sac_type,\n",
|
||||
"db.sac_code,\n",
|
||||
"db.fed_dguid,\n",
|
||||
"db.fed_name,\n",
|
||||
"db.fed_en_name,\n",
|
||||
"db.fed_fr_name,\n",
|
||||
"db.ct_dguid,\n",
|
||||
"db.ada_dguid,\n",
|
||||
"db.da_dguid,\n",
|
||||
"db.db_dguid,\n",
|
||||
"placenames.pn_dguid,\n",
|
||||
"placenames.pn_name,\n",
|
||||
"placenames.pn_source,\n",
|
||||
"placenames.geometry as geom\n",
|
||||
"FROM statcan_pn_2021_tmp as placenames,\n",
|
||||
" statcan_db_2021 as db\n",
|
||||
"WHERE ST_Intersects(placenames.geometry, db.geom);\n",
|
||||
"\n",
|
||||
"CREATE INDEX statcan_pn_2021_geom_idx ON\n",
|
||||
"statcan_pn_2021 \n",
|
||||
"\tUSING GIST(geom) WITH (FILLFACTOR = 100);\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"with engine.connect() as conn:\n",
|
||||
" conn.execute(text(sql))\n",
|
||||
" conn.commit()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
import os
|
||||
import sys
|
||||
|
||||
import geopandas as gpd
|
||||
import pandas as pd
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy import text
|
||||
|
||||
DATABASE = os.environ.get("POSTGRES_DB")
USER = os.environ.get("POSTGRES_USER")
PASSWORD = os.environ.get("POSTGRES_PASSWORD")

# Decoding PN.csv as ISO-8859-1 leaves byte 0x96 as an unprintable C1 control
# character inside place names (seen as 'Cascap\xe9dia\x96Saint-Jules').
# In Windows-1252 that byte is an en dash, so the file is read as cp1252.
# NOTE(review): cp1252 leaves a handful of bytes undefined; if the file ever
# contains one, read_csv raises UnicodeDecodeError instead of passing it on.
SOURCE_ENCODING = "cp1252"

# Source CSV column -> column name used in the database tables.
COLUMN_NAMES = {
    "PNdguid": "pn_dguid",
    "PNname": "pn_name",
    "PNsource": "pn_source",
    "PNrplat": "latitude",
    "PNrplong": "longitude",
}

CREATE_PN_2021_SQL = """
DROP TABLE IF EXISTS silver.pn_2021;

CREATE TABLE silver.pn_2021 AS
SELECT
db.country_dguid,
db.country_en_name,
db.country_fr_name,
db.country_en_abbreviation,
db.country_fr_abbreviation,
db.grc_dguid,
db.grc_en_name,
db.grc_fr_name,
db.pr_dguid,
db.pr_en_name,
db.pr_fr_name,
db.pr_en_abbreviation,
db.pr_fr_abbreviation,
db.pr_iso_code,
db.car_dguid,
db.car_en_name,
db.car_fr_name,
db.er_dguid,
db.er_name,
db.cd_dguid,
db.cd_name,
db.cd_type,
db.ccs_dguid,
db.ccs_name,
db.cma_dguid,
db.cma_p_dguid,
db.cma_name,
db.cma_type,
db.csd_dguid,
db.csd_name,
db.csd_type,
db.sac_type,
db.sac_code,
db.fed_dguid,
db.fed_name,
db.fed_en_name,
db.fed_fr_name,
db.ct_dguid,
db.ada_dguid,
db.da_dguid,
db.db_dguid,
placenames.pn_dguid,
placenames.pn_name,
placenames.pn_source,
placenames.geometry as geom
FROM bronze.pn_2021_tmp as placenames,
     silver.db_2021 as db
WHERE ST_Intersects(placenames.geometry, db.geom);

CREATE INDEX pn_2021_geom_idx ON
silver.pn_2021
    USING GIST(geom) WITH (FILLFACTOR = 100);
"""


def read_placenames(csv_path):
    """Read the place-name CSV and return it with database column names.

    Args:
        csv_path: Path (or binary file-like object) of the PN.csv file.

    Returns:
        A DataFrame holding only the columns listed in ``COLUMN_NAMES``,
        renamed to pn_dguid, pn_name, pn_source, latitude and longitude.
    """
    placenames = pd.read_csv(
        filepath_or_buffer=csv_path,
        encoding=SOURCE_ENCODING,
        usecols=list(COLUMN_NAMES),
    )
    return placenames.rename(columns=COLUMN_NAMES)


def make_engine():
    """Create the SQLAlchemy engine for the PostgreSQL server at db:5432.

    The URL is assembled with ``URL.create`` rather than string formatting
    so that a password containing characters such as ``@``, ``/`` or ``:``
    cannot corrupt the connection string.
    """
    # Local import keeps this block self-contained; sqlalchemy is already a
    # dependency of this script.
    from sqlalchemy.engine import URL

    url = URL.create(
        drivername="postgresql",
        username=USER,
        password=PASSWORD,
        host="db",
        port=5432,
        database=DATABASE,
    )
    return create_engine(url)


def main(argv):
    """Load the place names into bronze.pn_2021_tmp and build silver.pn_2021.

    Args:
        argv: Command-line arguments; ``argv[1]`` is the path of PN.csv.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else "process.py"
        sys.exit(f"usage: {prog} PLACENAMES_CSV")
    placenames_2021_csv = argv[1]

    engine = make_engine()

    print(f"Reading {placenames_2021_csv}")
    placenames = read_placenames(placenames_2021_csv)

    print("Creating geodataframe from placenames file")
    gdf = gpd.GeoDataFrame(
        placenames,
        geometry=gpd.points_from_xy(placenames.longitude,
                                    placenames.latitude),
        crs="EPSG:4326",
    )

    print("Dropping 'latitude', 'longitude' from geodataframe")
    gdf = gdf.drop(columns=["latitude", "longitude"])

    print("Loading geodataframe to PostgreSQL as bronze.pn_2021_tmp")
    gdf.to_postgis(
        name="pn_2021_tmp",
        con=engine,
        chunksize=150000,
        if_exists="replace",
        schema="bronze",
    )

    print("Creating silver.pn_2021")
    # engine.begin() commits when the block succeeds and rolls back if the
    # statements raise.
    with engine.begin() as conn:
        conn.execute(text(CREATE_PN_2021_SQL))


if __name__ == "__main__":
    main(sys.argv)
|
||||
Reference in New Issue
Block a user