Files
d4c-datapkg-statistical/geosuite/problems.ipynb
T
Diego Ripley f93e4d0cec Initial commit
2025-05-24 13:37:31 -04:00

284 lines
7.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "dacb31a5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading /home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv\n"
]
}
],
"source": [
"#!/usr/bin/env python\n",
"# coding: utf-8\n",
"import os\n",
"import sys\n",
"\n",
"import geopandas as gpd\n",
"import pandas as pd\n",
"from sqlalchemy import create_engine\n",
"from sqlalchemy import text\n",
"\n",
"placenames_2021_csv = \"/home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv\"\n",
"\n",
"DATABASE = os.environ.get(\"POSTGRES_DB\")\n",
"HOST = os.environ.get(\"WAREHOUSE_PG_HOST\")\n",
"USER = os.environ.get(\"POSTGRES_USER\")\n",
"PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n",
"\n",
"#engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@{HOST}:5432/{DATABASE}\")\n",
"\n",
"print(f\"Reading {placenames_2021_csv}\")\n",
"placenames = pd.read_csv(filepath_or_buffer=placenames_2021_csv,\n",
" encoding='latin-1',\n",
" usecols=['PNdguid', 'PNname', 'PNsource', 'PNrplat', 'PNrplong'])\n",
"\n",
"placenames.rename(columns={\n",
" 'PNdguid': 'pn_dguid',\n",
" 'PNname': 'pn_name',\n",
" 'PNsource': 'pn_source',\n",
" 'PNrplat': 'latitude',\n",
" 'PNrplong': 'longitude'\n",
"}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d2d4d385",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rec.array([(4269, '2021S0515005422', 'Cascapédia\\x96Saint-Jules', 1, 48.25, -65.9166667)],\n",
" dtype=[('index', '<i8'), ('pn_dguid', 'O'), ('pn_name', 'O'), ('pn_source', '<i8'), ('latitude', '<f8'), ('longitude', '<f8')])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"placenames[placenames['pn_dguid'] == '2021S0515005422'].to_records()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "5110c35c",
"metadata": {},
"outputs": [],
"source": [
"special_unicodes = []\n",
"for record in placenames.to_records():\n",
" pn_dguid = record[1]\n",
" pn_name = record[2]\n",
" if r'\\x' in repr(pn_name):\n",
" special_unicodes.append((pn_dguid, pn_name))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "1d880cb1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"19"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#print(special_unicodes)\n",
"len(special_unicodes)\n",
"#dguids_affected = [x[0] for x in special_unicodes]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "7b320444",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['2021S0515005422',\n",
" '2021S0515007864',\n",
" '2021S0515017557',\n",
" '2021S0515019487',\n",
" '2021S0515019731',\n",
" '2021S0515022795',\n",
" '2021S0515024311',\n",
" '2021S0515028429',\n",
" '2021S0515030028',\n",
" '2021S0515030168',\n",
" '2021S0515030432',\n",
" '2021S0515031197',\n",
" '2021S0515031295',\n",
" '2021S0515031660',\n",
" '2021S0515032370',\n",
" '2021S0515038300',\n",
" '2021S0515038389',\n",
" '2021S0515040448',\n",
" '2021S0515040522']"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dguids_affected"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "b36377ad",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r'\\x' in r'Cascapédia\\x96Saint-Jules'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a68fe5c3",
"metadata": {},
"outputs": [],
"source": [
"print(\"Creating geodataframe from placenames file\")\n",
"gdf = gpd.GeoDataFrame(\n",
" placenames, \n",
" geometry=gpd.points_from_xy(placenames.longitude,\n",
" placenames.latitude),\n",
" crs=\"EPSG:4326\"\n",
")\n",
"\n",
"print(\"Dropping 'latitude', 'longitude' from geodataframe\")\n",
"gdf.drop(columns=[\"latitude\", \"longitude\"], \n",
" inplace=True)\n",
"\n",
"print(f\"Loading geodatframe to PostgreSQL as statcan_pn_2021_tmp\")\n",
"gdf.to_postgis(name=f\"statcan_pn_2021_tmp\", \n",
" con=engine,\n",
" chunksize=150000,\n",
" if_exists='replace')\n",
"\n",
"print(\"Creating statcan_pn_2021\")\n",
"sql = \"\"\"\n",
"DROP TABLE IF EXISTS statcan_pn_2021;\n",
"\n",
"CREATE TABLE statcan_pn_2021 AS\n",
"SELECT \n",
"db.country_dguid,\n",
"db.country_en_name, \n",
"db.country_fr_name,\n",
"db.country_en_abbreviation,\n",
"db.country_fr_abbreviation,\n",
"db.grc_dguid,\n",
"db.grc_en_name,\n",
"db.grc_fr_name,\n",
"db.pr_dguid,\n",
"db.pr_en_name,\n",
"db.pr_fr_name,\n",
"db.pr_en_abbreviation,\n",
"db.pr_fr_abbreviation,\n",
"db.pr_iso_code,\n",
"db.car_dguid,\n",
"db.car_en_name,\n",
"db.car_fr_name,\n",
"db.er_dguid,\n",
"db.er_name,\n",
"db.cd_dguid,\n",
"db.cd_name,\n",
"db.cd_type,\n",
"db.ccs_dguid,\n",
"db.ccs_name,\n",
"db.cma_dguid,\n",
"db.cma_p_dguid,\n",
"db.cma_name,\n",
"db.cma_type,\n",
"db.csd_dguid,\n",
"db.csd_name,\n",
"db.csd_type,\n",
"db.sac_type,\n",
"db.sac_code,\n",
"db.fed_dguid,\n",
"db.fed_name,\n",
"db.fed_en_name,\n",
"db.fed_fr_name,\n",
"db.ct_dguid,\n",
"db.ada_dguid,\n",
"db.da_dguid,\n",
"db.db_dguid,\n",
"placenames.pn_dguid,\n",
"placenames.pn_name,\n",
"placenames.pn_source,\n",
"placenames.geometry as geom\n",
"FROM statcan_pn_2021_tmp as placenames,\n",
" statcan_db_2021 as db\n",
"WHERE ST_Intersects(placenames.geometry, db.geom);\n",
"\n",
"CREATE INDEX statcan_pn_2021_geom_idx ON\n",
"statcan_pn_2021 \n",
"\tUSING GIST(geom) WITH (FILLFACTOR = 100);\n",
"\"\"\"\n",
"\n",
"with engine.connect() as conn:\n",
" conn.execute(text(sql))\n",
" conn.commit()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}