mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 22:20:56 +02:00
1361 lines
42 KiB
Plaintext
1361 lines
42 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 50,
|
|
"id": "fc8ca6f9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import gc\n",
|
|
"import glob\n",
|
|
"\n",
|
|
"import duckdb\n",
|
|
"from IPython.core.interactiveshell import InteractiveShell \n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import sqlalchemy\n",
|
|
"\n",
|
|
"# Enable multiple outputs per cell\n",
|
|
"InteractiveShell.ast_node_interactivity = \"all\"\n",
|
|
"# Show all columns\n",
|
|
"pd.set_option('display.max_columns', None)\n",
|
|
"\n",
|
|
"data_dir = '/data/census_of_population/output/2021/tabular'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3d20a1f5",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Datasets\n",
|
|
"- 1.0 Canada, provinces, territories, census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)\n",
|
|
"- 2.0 Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)\n",
|
|
"- 3.0 Economic regions (ERs)\n",
|
|
"- 4.0 Population centres (POPCTRs)\n",
|
|
"- 5.0 Canada, provinces, territories and federal electoral districts (FEDs) (2013 Representation Order). **Just process FEDs**\n",
|
|
"- 6.0 Canada, provinces, territories and federal electoral districts (FEDs) (2023 Representation Order). **Just process FEDs**\n",
|
|
"- 7.0 Designated places (DPLs)\n",
|
|
"- 8.0 Aggregate dissemination areas (ADAs)\n",
|
|
"- 9.0 Forward sortation areas (FSAs)\n",
|
|
"- 10.0 Health regions (HRs)\n",
|
|
" - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n",
|
|
" - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "b88a0db8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def process_cop_csv(csvs_to_process):\n",
|
|
" \"\"\"\n",
|
|
" 1. Reads subset of fields for Census of Population CSV files\n",
|
|
" 2. Pivots on characteristic_id\n",
|
|
" 3. Appends all of the processed CSVs as one dataframe\n",
|
|
" \"\"\"\n",
|
|
" dataframes_to_concatenate = []\n",
|
|
" for filename in csvs_to_process:\n",
|
|
" print(f\"Processing {filename}\")\n",
|
|
" params = {\n",
|
|
" 'filepath_or_buffer': filename,\n",
|
|
" 'encoding': 'latin-1',\n",
|
|
" 'usecols': ['DGUID', \n",
|
|
" 'CHARACTERISTIC_ID', \n",
|
|
" 'C1_COUNT_TOTAL',\n",
|
|
" 'C2_COUNT_MEN+',\n",
|
|
" 'C3_COUNT_WOMEN+'\n",
|
|
" ],\n",
|
|
" 'dtype': {\n",
|
|
" 'CHARACTERISTIC_ID': np.int16\n",
|
|
" }\n",
|
|
" }\n",
|
|
" cop_df = pd.read_csv(**params)\n",
|
|
" cop_df.rename(columns={\n",
|
|
" 'C1_COUNT_TOTAL': 'count_total',\n",
|
|
" 'C2_COUNT_MEN+': 'count_men', \n",
|
|
" 'C3_COUNT_WOMEN+': 'count_women',\n",
|
|
" 'DGUID': 'dguid'\n",
|
|
" }, inplace=True)\n",
|
|
"\n",
|
|
" cop_df = cop_df.pivot(index='dguid', columns='CHARACTERISTIC_ID')\n",
|
|
"\n",
|
|
" # Flatten the hierarchical index\n",
|
|
" # https://stackoverflow.com/questions/14507794/how-to-flatten-a-hierarchical-index-in-columns/57630176#57630176\n",
|
|
" level_one = cop_df.columns.get_level_values(0).astype(str)\n",
|
|
" level_two = cop_df.columns.get_level_values(1).astype(str)\n",
|
|
" column_separator = ['_' if x != '' else '' for x in level_two]\n",
|
|
" cop_df.columns = level_one + column_separator + level_two\n",
|
|
" dataframes_to_concatenate.append(cop_df)\n",
|
|
" \n",
|
|
" print(\"Concatenating all dataframes into one\")\n",
|
|
" cop_df = pd.concat(dataframes_to_concatenate)\n",
|
|
" \n",
|
|
" return cop_df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "cba01571",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def drop_na_columns(dataframe):\n",
|
|
" \"\"\"\n",
|
|
" Delete columns where there are no values.\n",
|
|
" There are cases where there are values for the count_total\n",
|
|
" columns, but no values for the count_men and count_women columns\n",
|
|
" \"\"\"\n",
|
|
" columns_to_drop = []\n",
|
|
" for field in dataframe.columns:\n",
|
|
" minimum_value = dataframe[field].min()\n",
|
|
" maximum_value = dataframe[field].max()\n",
|
|
" if pd.isna(minimum_value) and pd.isna(maximum_value):\n",
|
|
" columns_to_drop.append(field)\n",
|
|
"\n",
|
|
" if columns_to_drop:\n",
|
|
" print(\"Dropping columns that don't have values\")\n",
|
|
" dataframe.drop(columns=columns_to_drop, inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "6071a2fe",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def convert_to_lowest_type(df):\n",
|
|
" \"\"\"\n",
|
|
" Convert columns to the best possible dtypes\n",
|
|
" For example, if the column is numerical and has a maximum value of 32,000 we can assign it a type of int16\n",
|
|
" \"\"\"\n",
|
|
" params = {\n",
|
|
" 'convert_string': False,\n",
|
|
" 'convert_boolean': False\n",
|
|
" }\n",
|
|
" df = df.convert_dtypes(**params)\n",
|
|
"\n",
|
|
" dtypes = pd.DataFrame(df.dtypes)\n",
|
|
" \n",
|
|
" # Downcast to the smallest numerical dtype\n",
|
|
" for row in dtypes.itertuples():\n",
|
|
" column = row[0]\n",
|
|
" the_type = str(row[1])\n",
|
|
" \n",
|
|
" # Skipping downcasting Float64 as there were issues with decimal places\n",
|
|
" # For example, instead of a value being 65.4, it turned into 65.4000015258789\n",
|
|
" if the_type == 'Float64':\n",
|
|
" continue \n",
|
|
" elif the_type == 'Int64':\n",
|
|
" df[column] = pd.to_numeric(df[column], downcast='integer')\n",
|
|
"\n",
|
|
" return df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "86ef0ae5",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Start processing\n",
|
|
"## 1.0 Process Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "e648921d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Atlantic.csv\n",
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_BritishColumbia.csv\n",
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Ontario.csv\n",
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Prairies.csv\n",
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Quebec.csv\n",
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Territories.csv\n",
|
|
"Concatenating all dataframes into one\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9a893e04",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Remove duplicates\n",
|
|
"- For example, Canada (dguid 2021A000011124) is included 6 times (once per CSV), so we need to keep only unique records"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5712f497",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(f\"Number of records before {len(cop_df)}\")\n",
|
|
"print(\"Before:\")\n",
|
|
"cop_df[cop_df.index == '2021A000011124']\n",
|
|
"\n",
|
|
"# Get unique records\n",
|
|
"cop_df = cop_df.groupby(cop_df.index).last()\n",
|
|
"print(f\"Number of records after {len(cop_df)}\")\n",
|
|
"cop_df[cop_df.index == '2021A000011124']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f987a804",
|
|
"metadata": {},
|
|
"source": [
|
|
"Get unique records"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0e0e6cb8",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Split the Census of Population dataframe by geographic level"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "bdee50fc",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"con = duckdb.connect()\n",
|
|
"con.install_extension(\"spatial\")\n",
|
|
"con.load_extension(\"spatial\")\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS country_2021;\n",
|
|
"CREATE TABLE country_2021 AS SELECT country_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/country_2021.parquet';\n",
|
|
"\n",
|
|
"DROP TABLE IF EXISTS pr_2021;\n",
|
|
"CREATE TABLE pr_2021 AS SELECT pr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pr_2021.parquet';\n",
|
|
"\n",
|
|
"DROP TABLE IF EXISTS cd_2021;\n",
|
|
"CREATE TABLE cd_2021 AS SELECT cd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cd_2021.parquet';\n",
|
|
"\n",
|
|
"DROP TABLE IF EXISTS csd_2021;\n",
|
|
"CREATE TABLE csd_2021 AS SELECT csd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/csd_2021.parquet';\n",
|
|
"\n",
|
|
"DROP TABLE IF EXISTS da_2021;\n",
|
|
"CREATE TABLE da_2021 AS SELECT da_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/da_2021.parquet';\n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"country_dguid = con.sql(\"SELECT * FROM country_2021\").to_df()\n",
|
|
"pr_dguid = con.sql(\"SELECT * FROM pr_2021\").to_df()\n",
|
|
"cd_dguid = con.sql(\"SELECT * FROM cd_2021\").to_df()\n",
|
|
"csd_dguid = con.sql(\"SELECT * FROM csd_2021\").to_df()\n",
|
|
"da_dguid = con.sql(\"SELECT * FROM da_2021\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"cop_country = cop_df.join(country_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"cop_pr = cop_df.join(pr_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"cop_cd = cop_df.join(cd_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"cop_csd = cop_df.join(csd_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"cop_da = cop_df.join(da_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"\n",
|
|
"del(cop_df)\n",
|
|
"del(country_dguid)\n",
|
|
"del(pr_dguid)\n",
|
|
"del(cd_dguid)\n",
|
|
"del(csd_dguid)\n",
|
|
"del(da_dguid)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2ca0a313-6530-4ecf-becd-8bedc31fdeed",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Convert dataframe columns to lowest dtype"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "96aed739",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cop_country = convert_to_lowest_type(cop_country)\n",
|
|
"cop_pr = convert_to_lowest_type(cop_pr)\n",
|
|
"cop_cd = convert_to_lowest_type(cop_cd)\n",
|
|
"cop_csd = convert_to_lowest_type(cop_csd)\n",
|
|
"cop_da = convert_to_lowest_type(cop_da)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9de07a61",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Delete columns where there are no values"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "7da3c692",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Number of columns before"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "7c6531d5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Country- length: 7893\n",
|
|
"PR- length: 7893\n",
|
|
"CD- length: 7893\n",
|
|
"CSD- length: 7893\n",
|
|
"DA- length: 7893\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(f\"Country- length: {len(cop_country.columns)}\")\n",
|
|
"print(f\"PR- length: {len(cop_pr.columns)}\")\n",
|
|
"print(f\"CD- length: {len(cop_cd.columns)}\")\n",
|
|
"print(f\"CSD- length: {len(cop_csd.columns)}\")\n",
|
|
"print(f\"DA- length: {len(cop_da.columns)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "a971d90c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Dropping columns that don't have values\n",
|
|
"Dropping columns that don't have values\n",
|
|
"Dropping columns that don't have values\n",
|
|
"Dropping columns that don't have values\n",
|
|
"Dropping columns that don't have values\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"drop_na_columns(cop_country)\n",
|
|
"drop_na_columns(cop_pr)\n",
|
|
"drop_na_columns(cop_cd)\n",
|
|
"drop_na_columns(cop_csd)\n",
|
|
"drop_na_columns(cop_da)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "511023c8",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Number of columns after"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "1b003396",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Country- length: 7433\n",
|
|
"PR- length: 7433\n",
|
|
"CD- length: 7433\n",
|
|
"CSD- length: 7433\n",
|
|
"DA- length: 7431\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(f\"Country- length: {len(cop_country.columns)}\")\n",
|
|
"print(f\"PR- length: {len(cop_pr.columns)}\")\n",
|
|
"print(f\"CD- length: {len(cop_cd.columns)}\")\n",
|
|
"print(f\"CSD- length: {len(cop_csd.columns)}\")\n",
|
|
"print(f\"DA- length: {len(cop_da.columns)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"id": "fa72bcce",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Country\n",
|
|
"cop_country = cop_country.reset_index()\n",
|
|
"cop_country.rename(columns={'dguid': 'country_dguid'}, inplace=True)\n",
|
|
"cop_country.to_parquet(path=f'{data_dir}/country_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"# Provinces and Territories\n",
|
|
"cop_pr = cop_pr.reset_index()\n",
|
|
"cop_pr.rename(columns={'dguid': 'pr_dguid'}, inplace=True)\n",
|
|
"cop_pr.to_parquet(path=f'{data_dir}/pr_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"# Census Divisions\n",
|
|
"cop_cd = cop_cd.reset_index()\n",
|
|
"cop_cd.rename(columns={'dguid': 'cd_dguid'}, inplace=True)\n",
|
|
"cop_cd.to_parquet(path=f'{data_dir}/cd_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"# Census Subdivisions\n",
|
|
"cop_csd = cop_csd.reset_index()\n",
|
|
"cop_csd.rename(columns={'dguid': 'csd_dguid'}, inplace=True)\n",
|
|
"cop_csd.to_parquet(path=f'{data_dir}/csd_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"# Dissemination Areas\n",
|
|
"cop_da = cop_da.reset_index()\n",
|
|
"cop_da.rename(columns={'dguid': 'da_dguid'}, inplace=True)\n",
|
|
"cop_da.to_parquet(path=f'{data_dir}/da_2021.parquet', index=False, compression='zstd')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"id": "441a7834-594f-400c-85a3-a353c8bdf202",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0"
|
|
]
|
|
},
|
|
"execution_count": 28,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"del(cop_country)\n",
|
|
"del(cop_pr)\n",
|
|
"del(cop_cd)\n",
|
|
"del(cop_csd)\n",
|
|
"del(cop_da)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "6c7b31ef-9657-4a2e-bdf7-8d75313c10ef",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2.0 Process Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "91b81c24-c160-4c38-8713-8d1f06d18624",
|
|
"metadata": {},
|
|
"source": [
|
|
"# TODO: Finish processing CMA"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b8c5ebfd-d5b7-4700-8006-3fa96402dc94",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/98-401-X2021007_English_CSV_data.csv\n",
|
|
"Concatenating all dataframes into one\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
|
|
]
|
|
},
|
|
"execution_count": 51,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"244615"
|
|
]
|
|
},
|
|
"execution_count": 51,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS country_2021;\n",
|
|
"DROP TABLE IF EXISTS pr_2021;\n",
|
|
"DROP TABLE IF EXISTS cd_2021;\n",
|
|
"DROP TABLE IF EXISTS csd_2021;\n",
|
|
"DROP TABLE IF EXISTS da_2021;\n",
|
|
"\n",
|
|
"DROP TABLE IF EXISTS cma_2021;\n",
|
|
"CREATE TABLE cma_2021 AS SELECT cma_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cma_2021.parquet';\n",
|
|
"\n",
|
|
"DROP TABLE IF EXISTS ct_2021;\n",
|
|
"CREATE TABLE ct_2021 AS SELECT ct_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ct_2021.parquet';\n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"cma_dguid = con.sql(\"SELECT * FROM cma_2021\").to_df()\n",
|
|
"ct_dguid = con.sql(\"SELECT * FROM ct_2021\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"# There are going to be missing links\n",
|
|
"cop_cma = cop_df.join(cma_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"cop_ct = cop_df.join(ct_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"\n",
|
|
"del(ct_dguid)\n",
|
|
"del(cop_df)\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# Convert columns to lowest dtypes\n",
|
|
"cop_ct = convert_to_lowest_type(cop_ct)\n",
|
|
"\n",
|
|
"# Drop NA columns\n",
|
|
"print(f\"CT - Number of Columns BEFORE: {len(cop_ct.columns)}\")\n",
|
|
"drop_na_columns(cop_ct)\n",
|
|
"print(f\"CT - Number of Columns AFTER: {len(cop_ct.columns)}\")\n",
|
|
"\n",
|
|
"# Export\n",
|
|
"# Census Tracts\n",
|
|
"cop_ct = cop_ct.reset_index()\n",
|
|
"cop_ct.rename(columns={'dguid': 'ct_dguid'}, inplace=True)\n",
|
|
"cop_ct.to_parquet(path=f'{data_dir}/ct_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"del(cop_ct)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f7f5e3fc-a6b1-46bc-a299-f0dd0d6db897",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## 3.0 Process Economic regions (ERs) (98-401-X2021008_eng_CSV)\n",
|
|
"This file also includes the country, provinces and territories"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8fa4d456-90ed-4b31-8c91-5d0c8b1faef5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/98-401-X2021008_English_CSV_data.csv\n",
|
|
"Concatenating all dataframes into one\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
|
|
]
|
|
},
|
|
"execution_count": 74,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS cma_2021;\n",
|
|
"DROP TABLE IF EXISTS ct_2021;\n",
|
|
"\n",
|
|
"DROP TABLE IF EXISTS er_2021;\n",
|
|
"CREATE TABLE er_2021 AS SELECT er_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/er_2021.parquet';\n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"er_dguid = con.sql(\"SELECT * FROM er_2021\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"cop_er = cop_df.join(er_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"\n",
|
|
"del(er_dguid)\n",
|
|
"del(cop_df)\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# Convert columns to lowest dtypes\n",
|
|
"cop_er = convert_to_lowest_type(cop_er)\n",
|
|
"\n",
|
|
"# Drop NA columns\n",
|
|
"print(f\"ER - Number of Columns BEFORE: {len(cop_er.columns)}\")\n",
|
|
"drop_na_columns(cop_er)\n",
|
|
"print(f\"ER - Number of Columns AFTER: {len(cop_er.columns)}\")\n",
|
|
"\n",
|
|
"# Export\n",
|
|
"# Economic Regions\n",
|
|
"cop_er = cop_er.reset_index()\n",
|
|
"cop_er.rename(columns={'dguid': 'er_dguid'}, inplace=True)\n",
|
|
"cop_er.to_parquet(path=f'{data_dir}/er_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"del(cop_er)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5a57d285-6567-4288-a632-ce96e02ca23c",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 4.0 Process Population centres (POPCTRs)\n",
|
|
"### There are 1026 DGUIDs in the Census of Population data, but there should be 1030\n",
|
|
"The data also uses the pop_ctr_dguid and not the pop_ctr_p_dguid, so there is no way to differentiate between Ottawa and Gatineau for pop_ctr_dguid 2021S05100616"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cef1f94a-bec2-45bb-95f1-f06a8f04b470",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/98-401-X2021009_English_CSV_data.csv\n",
|
|
"Concatenating all dataframes into one\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
|
|
]
|
|
},
|
|
"execution_count": 76,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS er_2021;\n",
|
|
"\n",
|
|
"DROP TABLE IF EXISTS pop_ctr_2021;\n",
|
|
"CREATE TABLE pop_ctr_2021 AS SELECT pop_ctr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pop_ctr_2021.parquet';\n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"pop_ctr_dguid = con.sql(\"SELECT * FROM pop_ctr_2021\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"cop_pop_ctr = cop_df.join(pop_ctr_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"\n",
|
|
"del(pop_ctr_dguid)\n",
|
|
"del(cop_df)\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# Convert columns to lowest dtypes\n",
|
|
"cop_pop_ctr = convert_to_lowest_type(cop_pop_ctr)\n",
|
|
"\n",
|
|
"# Drop NA columns\n",
|
|
"print(f\"POP CTR - Number of Columns BEFORE: {len(cop_pop_ctr.columns)}\")\n",
|
|
"drop_na_columns(cop_pop_ctr)\n",
|
|
"print(f\"POP CTR - Number of Columns AFTER: {len(cop_pop_ctr.columns)}\")\n",
|
|
"\n",
|
|
"# Export\n",
|
|
"# Population Centers\n",
|
|
"cop_pop_ctr = cop_pop_ctr.reset_index()\n",
|
|
"cop_pop_ctr.rename(columns={'dguid': 'pop_ctr_dguid'}, inplace=True)\n",
|
|
"cop_pop_ctr.to_parquet(path=f'{data_dir}/pop_ctr_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"del(cop_pop_ctr)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9b196a95-62c1-449f-8e44-08bb81719750",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 5.0 Process Federal electoral districts (FEDs) (2013 Representation Order)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2240a33a-a100-440e-9c55-c2a3509523ea",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/98-401-X2021010_English_CSV_data.csv\n",
|
|
"Concatenating all dataframes into one\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
|
|
]
|
|
},
|
|
"execution_count": 90,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0"
|
|
]
|
|
},
|
|
"execution_count": 90,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"FED - Number of Columns BEFORE: 7893\n",
|
|
"Dropping columns that don't have values\n",
|
|
"FED - Number of Columns AFTER: 7433\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0"
|
|
]
|
|
},
|
|
"execution_count": 90,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS pop_ctr_2021;\n",
|
|
"\n",
|
|
"DROP TABLE IF EXISTS fed_2013;\n",
|
|
"CREATE TABLE fed_2013 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2021_2013.parquet';\n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"fed_dguid = con.sql(\"SELECT * FROM fed_2013\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"\n",
|
|
"del(fed_dguid)\n",
|
|
"del(cop_df)\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# Convert columns to lowest dtypes\n",
|
|
"cop_fed = convert_to_lowest_type(cop_fed)\n",
|
|
"\n",
|
|
"# Drop NA columns\n",
|
|
"print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n",
|
|
"drop_na_columns(cop_fed)\n",
|
|
"print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n",
|
|
"\n",
|
|
"# Export\n",
|
|
"cop_fed = cop_fed.reset_index()\n",
|
|
"cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n",
|
|
"cop_fed.to_parquet(path=f'{data_dir}/fed_2013.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"del(cop_fed)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "dada67c8-8f4b-4795-a97b-3d68887fc582",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 6.0 Process Federal electoral districts (FEDs) (2023 Representation Order)\n",
|
|
"There should be 343 2023 FEDs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fb3d4506-cf3a-446f-9ff8-2a8e408630a4",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/98-401-X2021029_English_CSV_data.csv\n",
|
|
"Concatenating all dataframes into one\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
|
|
]
|
|
},
|
|
"execution_count": 100,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"31"
|
|
]
|
|
},
|
|
"execution_count": 100,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"FED - Number of Columns BEFORE: 7894\n",
|
|
"Dropping columns that don't have values\n",
|
|
"FED - Number of Columns AFTER: 7427\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0"
|
|
]
|
|
},
|
|
"execution_count": 100,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS fed_2013;\n",
|
|
"\n",
|
|
"/*\n",
|
|
"DROP TABLE IF EXISTS fed_2023;\n",
|
|
"CREATE TABLE fed_2023 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2023.parquet';\n",
|
|
"*/\n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"#fed_dguid = con.sql(\"SELECT * FROM fed_2023\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"#cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"cop_df = cop_df.reset_index()\n",
|
|
"cop_df = cop_df[cop_df['dguid'].str.contains(\"2023\")]\n",
|
|
"\n",
|
|
"# Convert columns to lowest dtypes\n",
|
|
"cop_fed = convert_to_lowest_type(cop_df)\n",
|
|
"\n",
|
|
"del(cop_df)\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# Drop NA columns\n",
|
|
"print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n",
|
|
"drop_na_columns(cop_fed)\n",
|
|
"print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n",
|
|
"\n",
|
|
"# Export\n",
|
|
"cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n",
|
|
"cop_fed.to_parquet(path=f'{data_dir}/fed_2023.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"del(cop_fed)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3cf32961-f1a4-44e2-8676-bb10263190b9",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 7.0 Process Designated places (DPLs)\n",
|
|
"There should be 1685 DPLs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "958a12c8-2d4d-474a-8af7-81a6b71e24d1",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/98-401-X2021011_English_CSV_data.csv\n",
|
|
"Concatenating all dataframes into one\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
|
|
]
|
|
},
|
|
"execution_count": 106,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0"
|
|
]
|
|
},
|
|
"execution_count": 106,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"DPL - Number of Columns BEFORE: 7893\n",
|
|
"Dropping columns that don't have values\n",
|
|
"DPL - Number of Columns AFTER: 7433\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS dpl_2021;\n",
|
|
"CREATE TABLE dpl_2021 AS SELECT dpl_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/dpl_2021.parquet';\n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"dpl_dguid = con.sql(\"SELECT * FROM dpl_2021\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"cop_dpl = cop_df.join(dpl_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"\n",
|
|
"del(dpl_dguid)\n",
|
|
"del(cop_df)\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# Convert columns to lowest dtypes\n",
|
|
"cop_dpl = convert_to_lowest_type(cop_dpl)\n",
|
|
"\n",
|
|
"# Drop NA columns\n",
|
|
"print(f\"DPL - Number of Columns BEFORE: {len(cop_dpl.columns)}\")\n",
|
|
"drop_na_columns(cop_dpl)\n",
|
|
"print(f\"DPL - Number of Columns AFTER: {len(cop_dpl.columns)}\")\n",
|
|
"\n",
|
|
"# Export\n",
|
|
"cop_dpl = cop_dpl.reset_index()\n",
|
|
"cop_dpl.rename(columns={'dguid': 'dpl_dguid'}, inplace=True)\n",
|
|
"cop_dpl.to_parquet(path=f'{data_dir}/dpl_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"del(cop_dpl)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "bceb5f4a-4019-4d25-8671-b0854233e109",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 8.0 Process Aggregate dissemination areas (ADAs)\n",
|
|
"There should be 5433 ADAs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8c846535-aff6-434e-ada0-51e739d89d23",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/98-401-X2021012_English_CSV_data.csv\n",
|
|
"Concatenating all dataframes into one\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
|
|
]
|
|
},
|
|
"execution_count": 109,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS dpl_2021;\n",
|
|
"DROP TABLE IF EXISTS ada_2021;\n",
|
|
"CREATE TABLE ada_2021 AS SELECT ada_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ada_2021.parquet';\n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"ada_dguid = con.sql(\"SELECT * FROM ada_2021\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"cop_ada = cop_df.join(ada_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"\n",
|
|
"del(ada_dguid)\n",
|
|
"del(cop_df)\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# Convert columns to lowest dtypes\n",
|
|
"cop_ada = convert_to_lowest_type(cop_ada)\n",
|
|
"\n",
|
|
"# Drop NA columns\n",
|
|
"print(f\"ADA - Number of Columns BEFORE: {len(cop_ada.columns)}\")\n",
|
|
"drop_na_columns(cop_ada)\n",
|
|
"print(f\"ADA - Number of Columns AFTER: {len(cop_ada.columns)}\")\n",
|
|
"\n",
|
|
"# Export\n",
|
|
"cop_ada = cop_ada.reset_index()\n",
|
|
"cop_ada.rename(columns={'dguid': 'ada_dguid'}, inplace=True)\n",
|
|
"cop_ada.to_parquet(path=f'{data_dir}/ada_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"del(cop_ada)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "6d66d8ae-e3ca-4014-961c-4af3c577880d",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## 9.0 Process Forward sortation areas (FSAs)\n",
|
|
"There should be 1643 FSAs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "bdee5c1a-9914-42ab-b759-e7685d5e627c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Processing /data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/98-401-X2021013_English_CSV_data.csv\n",
|
|
"Concatenating all dataframes into one\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
|
|
]
|
|
},
|
|
"execution_count": 116,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"684"
|
|
]
|
|
},
|
|
"execution_count": 116,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"FSA - Number of Columns BEFORE: 7893\n",
|
|
"Dropping columns that don't have values\n",
|
|
"FSA - Number of Columns AFTER: 7429\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0"
|
|
]
|
|
},
|
|
"execution_count": 116,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS ada_2021;\n",
|
|
"DROP TABLE IF EXISTS fsa_2021;\n",
|
|
"CREATE TABLE fsa_2021 AS SELECT fsa_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fsa_2021.parquet';\n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"fsa_dguid = con.sql(\"SELECT * FROM fsa_2021\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"\n",
|
|
"del(fsa_dguid)\n",
|
|
"del(cop_df)\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# Convert columns to lowest dtypes\n",
|
|
"cop_fsa = convert_to_lowest_type(cop_fsa)\n",
|
|
"\n",
|
|
"# Drop NA columns\n",
|
|
"print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n",
|
|
"drop_na_columns(cop_fsa)\n",
|
|
"print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n",
|
|
"\n",
|
|
"# Export\n",
|
|
"cop_fsa = cop_fsa.reset_index()\n",
|
|
"cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n",
|
|
"cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"del(cop_fsa)\n",
|
|
"gc.collect()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ddb67445-e9dc-489f-b2cd-7a7969393d5a",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"## 10.0 Process Health regions (HRs) and Local health integration networks\n",
|
|
"Start looking here https://www150.statcan.gc.ca/n1/en/catalogue/82-402-X"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1e4c3ddf-4279-4ae7-8a8a-f43ae97d59b4",
|
|
"metadata": {
|
|
"editable": true,
|
|
"slideshow": {
|
|
"slide_type": ""
|
|
},
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021015_eng_CSV/*English_CSV_data*\")\n",
|
|
"cop_df = process_cop_csv(csvs_to_process)\n",
|
|
"\n",
|
|
"# Get the dguid per level of geography\n",
|
|
"con.sql(\"\"\"\n",
|
|
"DROP TABLE IF EXISTS fsa_2021;\n",
|
|
"DROP TABLE IF EXISTS hr_2022;\n",
|
|
"CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n",
|
|
"\n",
|
|
"CREATE \n",
|
|
"\"\"\")\n",
|
|
"con.commit()\n",
|
|
"\n",
|
|
"# Convert the duckdb tables to pandas dataframe\n",
|
|
"hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()\n",
|
|
"\n",
|
|
"# Join the Census of Population dataframe to each geographic level\n",
|
|
"cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n",
|
|
"\n",
|
|
"del(fsa_dguid)\n",
|
|
"del(cop_df)\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# Convert columns to lowest dtypes\n",
|
|
"cop_fsa = convert_to_lowest_type(cop_fsa)\n",
|
|
"\n",
|
|
"# Drop NA columns\n",
|
|
"print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n",
|
|
"drop_na_columns(cop_fsa)\n",
|
|
"print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n",
|
|
"\n",
|
|
"# Export\n",
|
|
"cop_fsa = cop_fsa.reset_index()\n",
|
|
"cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n",
|
|
"cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n",
|
|
"\n",
|
|
"del(cop_fsa)\n",
|
|
"gc.collect()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|