mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Initial commit
This commit is contained in:
@@ -0,0 +1,283 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "dacb31a5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Reading /home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#!/usr/bin/env python\n",
|
||||
"# coding: utf-8\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"import geopandas as gpd\n",
|
||||
"import pandas as pd\n",
|
||||
"from sqlalchemy import create_engine\n",
|
||||
"from sqlalchemy import text\n",
|
||||
"\n",
|
||||
"placenames_2021_csv = \"/home/jovyan/work/process-statcan-data/data/geosuite/extracted/2021_92-150-X_eng/PN.csv\"\n",
|
||||
"\n",
|
||||
"DATABASE = os.environ.get(\"POSTGRES_DB\")\n",
|
||||
"HOST = os.environ.get(\"WAREHOUSE_PG_HOST\")\n",
|
||||
"USER = os.environ.get(\"POSTGRES_USER\")\n",
|
||||
"PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n",
|
||||
"\n",
|
||||
"#engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@{HOST}:5432/{DATABASE}\")\n",
|
||||
"\n",
|
||||
"print(f\"Reading {placenames_2021_csv}\")\n",
|
||||
"placenames = pd.read_csv(filepath_or_buffer=placenames_2021_csv,\n",
|
||||
" encoding='latin-1',\n",
|
||||
" usecols=['PNdguid', 'PNname', 'PNsource', 'PNrplat', 'PNrplong'])\n",
|
||||
"\n",
|
||||
"placenames.rename(columns={\n",
|
||||
" 'PNdguid': 'pn_dguid',\n",
|
||||
" 'PNname': 'pn_name',\n",
|
||||
" 'PNsource': 'pn_source',\n",
|
||||
" 'PNrplat': 'latitude',\n",
|
||||
" 'PNrplong': 'longitude'\n",
|
||||
"}, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "d2d4d385",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"rec.array([(4269, '2021S0515005422', 'Cascapédia\\x96Saint-Jules', 1, 48.25, -65.9166667)],\n",
|
||||
" dtype=[('index', '<i8'), ('pn_dguid', 'O'), ('pn_name', 'O'), ('pn_source', '<i8'), ('latitude', '<f8'), ('longitude', '<f8')])"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"placenames[placenames['pn_dguid'] == '2021S0515005422'].to_records()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "5110c35c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"special_unicodes = []\n",
|
||||
"for record in placenames.to_records():\n",
|
||||
" pn_dguid = record[1]\n",
|
||||
" pn_name = record[2]\n",
|
||||
" if r'\\x' in repr(pn_name):\n",
|
||||
" special_unicodes.append((pn_dguid, pn_name))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "1d880cb1",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"19"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#print(special_unicodes)\n",
|
||||
"len(special_unicodes)\n",
|
||||
"#dguids_affected = [x[0] for x in special_unicodes]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "7b320444",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['2021S0515005422',\n",
|
||||
" '2021S0515007864',\n",
|
||||
" '2021S0515017557',\n",
|
||||
" '2021S0515019487',\n",
|
||||
" '2021S0515019731',\n",
|
||||
" '2021S0515022795',\n",
|
||||
" '2021S0515024311',\n",
|
||||
" '2021S0515028429',\n",
|
||||
" '2021S0515030028',\n",
|
||||
" '2021S0515030168',\n",
|
||||
" '2021S0515030432',\n",
|
||||
" '2021S0515031197',\n",
|
||||
" '2021S0515031295',\n",
|
||||
" '2021S0515031660',\n",
|
||||
" '2021S0515032370',\n",
|
||||
" '2021S0515038300',\n",
|
||||
" '2021S0515038389',\n",
|
||||
" '2021S0515040448',\n",
|
||||
" '2021S0515040522']"
|
||||
]
|
||||
},
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dguids_affected"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "b36377ad",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"r'\\x' in r'Cascapédia\\x96Saint-Jules'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a68fe5c3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Creating geodataframe from placenames file\")\n",
|
||||
"gdf = gpd.GeoDataFrame(\n",
|
||||
" placenames, \n",
|
||||
" geometry=gpd.points_from_xy(placenames.longitude,\n",
|
||||
" placenames.latitude),\n",
|
||||
" crs=\"EPSG:4326\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Dropping 'latitude', 'longitude' from geodataframe\")\n",
|
||||
"gdf.drop(columns=[\"latitude\", \"longitude\"], \n",
|
||||
" inplace=True)\n",
|
||||
"\n",
|
||||
"print(f\"Loading geodatframe to PostgreSQL as statcan_pn_2021_tmp\")\n",
|
||||
"gdf.to_postgis(name=f\"statcan_pn_2021_tmp\", \n",
|
||||
" con=engine,\n",
|
||||
" chunksize=150000,\n",
|
||||
" if_exists='replace')\n",
|
||||
"\n",
|
||||
"print(\"Creating statcan_pn_2021\")\n",
|
||||
"sql = \"\"\"\n",
|
||||
"DROP TABLE IF EXISTS statcan_pn_2021;\n",
|
||||
"\n",
|
||||
"CREATE TABLE statcan_pn_2021 AS\n",
|
||||
"SELECT \n",
|
||||
"db.country_dguid,\n",
|
||||
"db.country_en_name, \n",
|
||||
"db.country_fr_name,\n",
|
||||
"db.country_en_abbreviation,\n",
|
||||
"db.country_fr_abbreviation,\n",
|
||||
"db.grc_dguid,\n",
|
||||
"db.grc_en_name,\n",
|
||||
"db.grc_fr_name,\n",
|
||||
"db.pr_dguid,\n",
|
||||
"db.pr_en_name,\n",
|
||||
"db.pr_fr_name,\n",
|
||||
"db.pr_en_abbreviation,\n",
|
||||
"db.pr_fr_abbreviation,\n",
|
||||
"db.pr_iso_code,\n",
|
||||
"db.car_dguid,\n",
|
||||
"db.car_en_name,\n",
|
||||
"db.car_fr_name,\n",
|
||||
"db.er_dguid,\n",
|
||||
"db.er_name,\n",
|
||||
"db.cd_dguid,\n",
|
||||
"db.cd_name,\n",
|
||||
"db.cd_type,\n",
|
||||
"db.ccs_dguid,\n",
|
||||
"db.ccs_name,\n",
|
||||
"db.cma_dguid,\n",
|
||||
"db.cma_p_dguid,\n",
|
||||
"db.cma_name,\n",
|
||||
"db.cma_type,\n",
|
||||
"db.csd_dguid,\n",
|
||||
"db.csd_name,\n",
|
||||
"db.csd_type,\n",
|
||||
"db.sac_type,\n",
|
||||
"db.sac_code,\n",
|
||||
"db.fed_dguid,\n",
|
||||
"db.fed_name,\n",
|
||||
"db.fed_en_name,\n",
|
||||
"db.fed_fr_name,\n",
|
||||
"db.ct_dguid,\n",
|
||||
"db.ada_dguid,\n",
|
||||
"db.da_dguid,\n",
|
||||
"db.db_dguid,\n",
|
||||
"placenames.pn_dguid,\n",
|
||||
"placenames.pn_name,\n",
|
||||
"placenames.pn_source,\n",
|
||||
"placenames.geometry as geom\n",
|
||||
"FROM statcan_pn_2021_tmp as placenames,\n",
|
||||
" statcan_db_2021 as db\n",
|
||||
"WHERE ST_Intersects(placenames.geometry, db.geom);\n",
|
||||
"\n",
|
||||
"CREATE INDEX statcan_pn_2021_geom_idx ON\n",
|
||||
"statcan_pn_2021 \n",
|
||||
"\tUSING GIST(geom) WITH (FILLFACTOR = 100);\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"with engine.connect() as conn:\n",
|
||||
" conn.execute(text(sql))\n",
|
||||
" conn.commit()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user