Files
d4c-datapkg-statistical/census_of_population/process_2021.ipynb
T
Diego Ripley f93e4d0cec Initial commit
2025-05-24 13:37:31 -04:00

1361 lines
42 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 50,
"id": "fc8ca6f9",
"metadata": {},
"outputs": [],
"source": [
"import gc\n",
"import glob\n",
"\n",
"import duckdb\n",
"from IPython.core.interactiveshell import InteractiveShell \n",
"import numpy as np\n",
"import pandas as pd\n",
"import sqlalchemy\n",
"\n",
"# Enable multiple outputs per cell\n",
"InteractiveShell.ast_node_interactivity = \"all\"\n",
"# Show all columns\n",
"pd.set_option('display.max_columns', None)\n",
"\n",
"data_dir = '/data/census_of_population/output/2021/tabular'"
]
},
{
"cell_type": "markdown",
"id": "3d20a1f5",
"metadata": {},
"source": [
"# Datasets\n",
"- 1.0 Canada, provinces, territories, census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)\n",
"- 2.0 Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)\n",
"- 3.0 Economic regions (ERs)\n",
"- 4.0 Population centres (POPCTRs)\n",
"- 5.0 Canada, provinces, territories and federal electoral districts (FEDs) (2013 Representation Order). **Just process FEDs**\n",
"- 6.0 Canada, provinces, territories and federal electoral districts (FEDs) (2023 Representation Order). **Just process FEDs**\n",
"- 7.0 Designated places (DPLs)\n",
"- 8.0 Aggregate dissemination areas (ADAs)\n",
"- 9.0 Forward sortation areas (FSAs)\n",
"- 10.0 Health regions (HRs)\n",
" - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n",
" - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b88a0db8",
"metadata": {},
"outputs": [],
"source": [
"def process_cop_csv(csvs_to_process):\n",
" \"\"\"\n",
" 1. Reads subset of fields for Census of Population CSV files\n",
" 2. Pivots on characteristic_id\n",
" 3. Appends all of the processed CSVs as one dataframe\n",
" \"\"\"\n",
" dataframes_to_concatenate = []\n",
" for filename in csvs_to_process:\n",
" print(f\"Processing {filename}\")\n",
" params = {\n",
" 'filepath_or_buffer': filename,\n",
" 'encoding': 'latin-1',\n",
" 'usecols': ['DGUID', \n",
" 'CHARACTERISTIC_ID', \n",
" 'C1_COUNT_TOTAL',\n",
" 'C2_COUNT_MEN+',\n",
" 'C3_COUNT_WOMEN+'\n",
" ],\n",
" 'dtype': {\n",
" 'CHARACTERISTIC_ID': np.int16\n",
" }\n",
" }\n",
" cop_df = pd.read_csv(**params)\n",
" cop_df.rename(columns={\n",
" 'C1_COUNT_TOTAL': 'count_total',\n",
" 'C2_COUNT_MEN+': 'count_men', \n",
" 'C3_COUNT_WOMEN+': 'count_women',\n",
" 'DGUID': 'dguid'\n",
" }, inplace=True)\n",
"\n",
" cop_df = cop_df.pivot(index='dguid', columns='CHARACTERISTIC_ID')\n",
"\n",
" # Flatten the hierarchical index\n",
" # https://stackoverflow.com/questions/14507794/how-to-flatten-a-hierarchical-index-in-columns/57630176#57630176\n",
" level_one = cop_df.columns.get_level_values(0).astype(str)\n",
" level_two = cop_df.columns.get_level_values(1).astype(str)\n",
" column_separator = ['_' if x != '' else '' for x in level_two]\n",
" cop_df.columns = level_one + column_separator + level_two\n",
" dataframes_to_concatenate.append(cop_df)\n",
" \n",
" print(\"Concatenating all dataframes into one\")\n",
" cop_df = pd.concat(dataframes_to_concatenate)\n",
" \n",
" return cop_df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cba01571",
"metadata": {},
"outputs": [],
"source": [
"def drop_na_columns(dataframe):\n",
" \"\"\"\n",
" Delete columns where there are no values.\n",
" There are cases where there are values for the count_total\n",
" columns, but no values for the count_men and count_women columns\n",
" \"\"\"\n",
" columns_to_drop = []\n",
" for field in dataframe.columns:\n",
" minimum_value = dataframe[field].min()\n",
" maximum_value = dataframe[field].max()\n",
" if pd.isna(minimum_value) and pd.isna(maximum_value):\n",
" columns_to_drop.append(field)\n",
"\n",
" if columns_to_drop:\n",
" print(\"Dropping columns that don't have values\")\n",
" dataframe.drop(columns=columns_to_drop, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "6071a2fe",
"metadata": {},
"outputs": [],
"source": [
"def convert_to_lowest_type(df):\n",
" \"\"\"\n",
" Convert columns to the best possible dtypes\n",
" For example, if the column is numerical and has a maximum value of 32,000 we can assign it a type of int16\n",
" \"\"\"\n",
" params = {\n",
" 'convert_string': False,\n",
" 'convert_boolean': False\n",
" }\n",
" df = df.convert_dtypes(**params)\n",
"\n",
" dtypes = pd.DataFrame(df.dtypes)\n",
" \n",
" # Downcast to the smallest numerical dtype\n",
" for row in dtypes.itertuples():\n",
" column = row[0]\n",
" the_type = str(row[1])\n",
" \n",
" # Skipping downcasting Float64 as there were issues with decimal places\n",
" # For example, instead of a value being 65.4, it turned into 65.4000015258789\n",
" if the_type == 'Float64':\n",
" continue \n",
" elif the_type == 'Int64':\n",
" df[column] = pd.to_numeric(df[column], downcast='integer')\n",
"\n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "86ef0ae5",
"metadata": {},
"source": [
"# Start processing\n",
"## 1.0 Process Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e648921d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Atlantic.csv\n",
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_BritishColumbia.csv\n",
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Ontario.csv\n",
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Prairies.csv\n",
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Quebec.csv\n",
"Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Territories.csv\n",
"Concatenating all dataframes into one\n"
]
}
],
"source": [
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)"
]
},
{
"cell_type": "markdown",
"id": "9a893e04",
"metadata": {},
"source": [
"# Remove duplicates\n",
"- For example, for some reason, they included Canada (dguid 2021A000011124) 6 times (once per CSV), so we need to get unique values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5712f497",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print(f\"Number of records before {len(cop_df)}\")\n",
"print(\"Before:\")\n",
"cop_df[cop_df.index == '2021A000011124']\n",
"\n",
"# Get unique records\n",
"cop_df = cop_df.groupby(cop_df.index).last()\n",
"print(f\"Number of records after {len(cop_df)}\")\n",
"cop_df[cop_df.index == '2021A000011124']"
]
},
{
"cell_type": "markdown",
"id": "f987a804",
"metadata": {},
"source": [
"Get unique records"
]
},
{
"cell_type": "markdown",
"id": "0e0e6cb8",
"metadata": {},
"source": [
"# Split the Census of Population dataframe by geographic level"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdee50fc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con = duckdb.connect()\n",
"con.install_extension(\"spatial\")\n",
"con.load_extension(\"spatial\")\n",
"\n",
"# Get the dguid per level of geography\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS country_2021;\n",
"CREATE TABLE country_2021 AS SELECT country_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/country_2021.parquet';\n",
"\n",
"DROP TABLE IF EXISTS pr_2021;\n",
"CREATE TABLE pr_2021 AS SELECT pr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pr_2021.parquet';\n",
"\n",
"DROP TABLE IF EXISTS cd_2021;\n",
"CREATE TABLE cd_2021 AS SELECT cd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cd_2021.parquet';\n",
"\n",
"DROP TABLE IF EXISTS csd_2021;\n",
"CREATE TABLE csd_2021 AS SELECT csd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/csd_2021.parquet';\n",
"\n",
"DROP TABLE IF EXISTS da_2021;\n",
"CREATE TABLE da_2021 AS SELECT da_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/da_2021.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Convert the duckdb tables to pandas dataframe\n",
"country_dguid = con.sql(\"SELECT * FROM country_2021\").to_df()\n",
"pr_dguid = con.sql(\"SELECT * FROM pr_2021\").to_df()\n",
"cd_dguid = con.sql(\"SELECT * FROM cd_2021\").to_df()\n",
"csd_dguid = con.sql(\"SELECT * FROM csd_2021\").to_df()\n",
"da_dguid = con.sql(\"SELECT * FROM da_2021\").to_df()\n",
"\n",
"# Join the Census of Population dataframe to each geographic level\n",
"cop_country = cop_df.join(country_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"cop_pr = cop_df.join(pr_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"cop_cd = cop_df.join(cd_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"cop_csd = cop_df.join(csd_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"cop_da = cop_df.join(da_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"\n",
"del(cop_df)\n",
"del(country_dguid)\n",
"del(pr_dguid)\n",
"del(cd_dguid)\n",
"del(csd_dguid)\n",
"del(da_dguid)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "2ca0a313-6530-4ecf-becd-8bedc31fdeed",
"metadata": {},
"source": [
"# Convert dataframe columns to lowest dtype"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "96aed739",
"metadata": {},
"outputs": [],
"source": [
"cop_country = convert_to_lowest_type(cop_country)\n",
"cop_pr = convert_to_lowest_type(cop_pr)\n",
"cop_cd = convert_to_lowest_type(cop_cd)\n",
"cop_csd = convert_to_lowest_type(cop_csd)\n",
"cop_da = convert_to_lowest_type(cop_da)"
]
},
{
"cell_type": "markdown",
"id": "9de07a61",
"metadata": {},
"source": [
"# Delete columns where there are no values"
]
},
{
"cell_type": "markdown",
"id": "7da3c692",
"metadata": {},
"source": [
"## Number of columns before"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7c6531d5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Country- length: 7893\n",
"PR- length: 7893\n",
"CD- length: 7893\n",
"CSD- length: 7893\n",
"DA- length: 7893\n"
]
}
],
"source": [
"print(f\"Country- length: {len(cop_country.columns)}\")\n",
"print(f\"PR- length: {len(cop_pr.columns)}\")\n",
"print(f\"CD- length: {len(cop_cd.columns)}\")\n",
"print(f\"CSD- length: {len(cop_csd.columns)}\")\n",
"print(f\"DA- length: {len(cop_da.columns)}\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a971d90c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dropping columns that don't have values\n",
"Dropping columns that don't have values\n",
"Dropping columns that don't have values\n",
"Dropping columns that don't have values\n",
"Dropping columns that don't have values\n"
]
}
],
"source": [
"drop_na_columns(cop_country)\n",
"drop_na_columns(cop_pr)\n",
"drop_na_columns(cop_cd)\n",
"drop_na_columns(cop_csd)\n",
"drop_na_columns(cop_da)"
]
},
{
"cell_type": "markdown",
"id": "511023c8",
"metadata": {},
"source": [
"## Number of columns after"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1b003396",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Country- length: 7433\n",
"PR- length: 7433\n",
"CD- length: 7433\n",
"CSD- length: 7433\n",
"DA- length: 7431\n"
]
}
],
"source": [
"print(f\"Country- length: {len(cop_country.columns)}\")\n",
"print(f\"PR- length: {len(cop_pr.columns)}\")\n",
"print(f\"CD- length: {len(cop_cd.columns)}\")\n",
"print(f\"CSD- length: {len(cop_csd.columns)}\")\n",
"print(f\"DA- length: {len(cop_da.columns)}\")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "fa72bcce",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Country\n",
"cop_country = cop_country.reset_index()\n",
"cop_country.rename(columns={'dguid': 'country_dguid'}, inplace=True)\n",
"cop_country.to_parquet(path=f'{data_dir}/country_2021.parquet', index=False, compression='zstd')\n",
"\n",
"# Provinces and Territories\n",
"cop_pr = cop_pr.reset_index()\n",
"cop_pr.rename(columns={'dguid': 'pr_dguid'}, inplace=True)\n",
"cop_pr.to_parquet(path=f'{data_dir}/pr_2021.parquet', index=False, compression='zstd')\n",
"\n",
"# Census Divisions\n",
"cop_cd = cop_cd.reset_index()\n",
"cop_cd.rename(columns={'dguid': 'cd_dguid'}, inplace=True)\n",
"cop_cd.to_parquet(path=f'{data_dir}/cd_2021.parquet', index=False, compression='zstd')\n",
"\n",
"# Census Subdivisions\n",
"cop_csd = cop_csd.reset_index()\n",
"cop_csd.rename(columns={'dguid': 'csd_dguid'}, inplace=True)\n",
"cop_csd.to_parquet(path=f'{data_dir}/csd_2021.parquet', index=False, compression='zstd')\n",
"\n",
"# Dissemination Areas\n",
"cop_da = cop_da.reset_index()\n",
"cop_da.rename(columns={'dguid': 'da_dguid'}, inplace=True)\n",
"cop_da.to_parquet(path=f'{data_dir}/da_2021.parquet', index=False, compression='zstd')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "441a7834-594f-400c-85a3-a353c8bdf202",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"del(cop_country)\n",
"del(cop_pr)\n",
"del(cop_cd)\n",
"del(cop_csd)\n",
"del(cop_da)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "6c7b31ef-9657-4a2e-bdf7-8d75313c10ef",
"metadata": {},
"source": [
"## 2.0 Process Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)"
]
},
{
"cell_type": "markdown",
"id": "91b81c24-c160-4c38-8713-8d1f06d18624",
"metadata": {},
"source": [
"# TODO: Finish processing CMA"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8c5ebfd-d5b7-4700-8006-3fa96402dc94",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/98-401-X2021007_English_CSV_data.csv\n",
"Concatenating all dataframes into one\n"
]
},
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"244615"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)\n",
"\n",
"# Get the dguid per level of geography\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS country_2021;\n",
"DROP TABLE IF EXISTS pr_2021;\n",
"DROP TABLE IF EXISTS cd_2021;\n",
"DROP TABLE IF EXISTS csd_2021;\n",
"DROP TABLE IF EXISTS da_2021;\n",
"\n",
"DROP TABLE IF EXISTS cma_2021;\n",
"CREATE TABLE cma_2021 AS SELECT cma_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cma_2021.parquet';\n",
"\n",
"DROP TABLE IF EXISTS ct_2021;\n",
"CREATE TABLE ct_2021 AS SELECT ct_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ct_2021.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Convert the duckdb tables to pandas dataframe\n",
"cma_dguid = con.sql(\"SELECT * FROM cma_2021\").to_df()\n",
"ct_dguid = con.sql(\"SELECT * FROM ct_2021\").to_df()\n",
"\n",
"# Join the Census of Population dataframe to each geographic level\n",
"# There's going to be missing links\n",
"cop_cma = cop_df.join(cma_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"cop_ct = cop_df.join(ct_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"\n",
"del(ct_dguid)\n",
"del(cop_df)\n",
"gc.collect()\n",
"\n",
"# Convert columns to lowest dtypes\n",
"cop_ct = convert_to_lowest_type(cop_ct)\n",
"\n",
"# Drop NA columns\n",
"print(f\"CT - Number of Columns BEFORE: {len(cop_ct.columns)}\")\n",
"drop_na_columns(cop_ct)\n",
"print(f\"CT - Number of Columns AFTER: {len(cop_ct.columns)}\")\n",
"\n",
"# Export\n",
"# Census Tracts\n",
"cop_ct = cop_ct.reset_index()\n",
"cop_ct.rename(columns={'dguid': 'ct_dguid'}, inplace=True)\n",
"cop_ct.to_parquet(path=f'{data_dir}/ct_2021.parquet', index=False, compression='zstd')\n",
"\n",
"del(cop_ct)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "f7f5e3fc-a6b1-46bc-a299-f0dd0d6db897",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## 3.0 Process Economic regions (ERs) (98-401-X2021008_eng_CSV)\n",
"This file also includes Provinces and Territories and Country"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8fa4d456-90ed-4b31-8c91-5d0c8b1faef5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/98-401-X2021008_English_CSV_data.csv\n",
"Concatenating all dataframes into one\n"
]
},
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)\n",
"\n",
"# Get the dguid per level of geography\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS cma_2021;\n",
"DROP TABLE IF EXISTS ct_2021;\n",
"\n",
"DROP TABLE IF EXISTS er_2021;\n",
"CREATE TABLE er_2021 AS SELECT er_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/er_2021.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Convert the duckdb tables to pandas dataframe\n",
"er_dguid = con.sql(\"SELECT * FROM er_2021\").to_df()\n",
"\n",
"# Join the Census of Population dataframe to each geographic level\n",
"cop_er = cop_df.join(er_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"\n",
"del(er_dguid)\n",
"del(cop_df)\n",
"gc.collect()\n",
"\n",
"# Convert columns to lowest dtypes\n",
"cop_er = convert_to_lowest_type(cop_er)\n",
"\n",
"# Drop NA columns\n",
"print(f\"ER - Number of Columns BEFORE: {len(cop_er.columns)}\")\n",
"drop_na_columns(cop_er)\n",
"print(f\"CT - Number of Columns AFTER: {len(cop_er.columns)}\")\n",
"\n",
"# Export\n",
"# Economic Regions\n",
"cop_er = cop_er.reset_index()\n",
"cop_er.rename(columns={'dguid': 'er_dguid'}, inplace=True)\n",
"cop_er.to_parquet(path=f'{data_dir}/er_2021.parquet', index=False, compression='zstd')\n",
"\n",
"del(cop_er)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "5a57d285-6567-4288-a632-ce96e02ca23c",
"metadata": {},
"source": [
"## 4.0 Process Population centres (POPCTRs)\n",
"### There are 1026 DGUIDs in the Census of Population data, but there should be 1030\n",
"They also use the pop_ctr_dguid and not the pop_ctr_p_dguid. So, there's no way to differentiate between Ottawa, and Gatineau for pop_ctr_dguid 2021S05100616"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cef1f94a-bec2-45bb-95f1-f06a8f04b470",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/98-401-X2021009_English_CSV_data.csv\n",
"Concatenating all dataframes into one\n"
]
},
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)\n",
"\n",
"# Get the dguid per level of geography\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS er_2021;\n",
"\n",
"DROP TABLE IF EXISTS pop_ctr_2021;\n",
"CREATE TABLE pop_ctr_2021 AS SELECT pop_ctr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pop_ctr_2021.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Convert the duckdb tables to pandas dataframe\n",
"pop_ctr_dguid = con.sql(\"SELECT * FROM pop_ctr_2021\").to_df()\n",
"\n",
"# Join the Census of Population dataframe to each geographic level\n",
"cop_pop_ctr = cop_df.join(pop_ctr_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"\n",
"del(pop_ctr_dguid)\n",
"del(cop_df)\n",
"gc.collect()\n",
"\n",
"# Convert columns to lowest dtypes\n",
"cop_pop_ctr = convert_to_lowest_type(cop_pop_ctr)\n",
"\n",
"# Drop NA columns\n",
"print(f\"POP CTR - Number of Columns BEFORE: {len(cop_pop_ctr.columns)}\")\n",
"drop_na_columns(cop_pop_ctr)\n",
"print(f\"POP CTR- Number of Columns AFTER: {len(cop_pop_ctr.columns)}\")\n",
"\n",
"# Export\n",
"# Population Centers\n",
"cop_pop_ctr = cop_pop_ctr.reset_index()\n",
"cop_pop_ctr.rename(columns={'dguid': 'pop_ctr_dguid'}, inplace=True)\n",
"cop_pop_ctr.to_parquet(path=f'{data_dir}/pop_ctr_2021.parquet', index=False, compression='zstd')\n",
"\n",
"del(cop_pop_ctr)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "9b196a95-62c1-449f-8e44-08bb81719750",
"metadata": {},
"source": [
"## 5.0 Process Federal electoral districts (FEDs) (2013 Representation Order)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2240a33a-a100-440e-9c55-c2a3509523ea",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/98-401-X2021010_English_CSV_data.csv\n",
"Concatenating all dataframes into one\n"
]
},
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"FED - Number of Columns BEFORE: 7893\n",
"Dropping columns that don't have values\n",
"FED - Number of Columns AFTER: 7433\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)\n",
"\n",
"# Get the dguid per level of geography\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS pop_ctr_2021;\n",
"\n",
"DROP TABLE IF EXISTS fed_2013;\n",
"CREATE TABLE fed_2013 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2021.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Convert the duckdb tables to pandas dataframe\n",
"fed_dguid = con.sql(\"SELECT * FROM fed_2013\").to_df()\n",
"\n",
"# Join the Census of Population dataframe to each geographic level\n",
"cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"\n",
"del(fed_dguid)\n",
"del(cop_df)\n",
"gc.collect()\n",
"\n",
"# Convert columns to lowest dtypes\n",
"cop_fed = convert_to_lowest_type(cop_fed)\n",
"\n",
"# Drop NA columns\n",
"print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n",
"drop_na_columns(cop_fed)\n",
"print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n",
"\n",
"# Export\n",
"cop_fed = cop_fed.reset_index()\n",
"cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n",
"cop_fed.to_parquet(path=f'{data_dir}/fed_2013.parquet', index=False, compression='zstd')\n",
"\n",
"del(cop_fed)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "dada67c8-8f4b-4795-a97b-3d68887fc582",
"metadata": {},
"source": [
"## 6.0 Process Federal electoral districts (FEDs) (2023 Representation Order)\n",
"There should be 343 2023 FEDs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb3d4506-cf3a-446f-9ff8-2a8e408630a4",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/98-401-X2021029_English_CSV_data.csv\n",
"Concatenating all dataframes into one\n"
]
},
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"31"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"FED - Number of Columns BEFORE: 7894\n",
"Dropping columns that don't have values\n",
"FED - Number of Columns AFTER: 7427\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)\n",
"\n",
"# Get the dguid per level of geography\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS fed_2013;\n",
"\n",
"/*\n",
"DROP TABLE IF EXISTS fed_2023;\n",
"CREATE TABLE fed_2023 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2023.parquet';\n",
"*/\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Convert the duckdb tables to pandas dataframe\n",
"#fed_dguid = con.sql(\"SELECT * FROM fed_2023\").to_df()\n",
"\n",
"# Join the Census of Population dataframe to each geographic level\n",
"#cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"cop_df = cop_df.reset_index()\n",
"cop_df = cop_df[cop_df['dguid'].str.contains(\"2023\")]\n",
"\n",
"# Convert columns to lowest dtypes\n",
"cop_fed = convert_to_lowest_type(cop_df)\n",
"\n",
"del(cop_df)\n",
"gc.collect()\n",
"\n",
"# Drop NA columns\n",
"print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n",
"drop_na_columns(cop_fed)\n",
"print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n",
"\n",
"# Export\n",
"cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n",
"cop_fed.to_parquet(path=f'{data_dir}/fed_2023.parquet', index=False, compression='zstd')\n",
"\n",
"del(cop_fed)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "3cf32961-f1a4-44e2-8676-bb10263190b9",
"metadata": {},
"source": [
"## 7.0 Process Designated places (DPLs)\n",
"There should be 1685 DPLs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "958a12c8-2d4d-474a-8af7-81a6b71e24d1",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/98-401-X2021011_English_CSV_data.csv\n",
"Concatenating all dataframes into one\n"
]
},
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"DPL - Number of Columns BEFORE: 7893\n",
"Dropping columns that don't have values\n",
"DPL - Number of Columns AFTER: 7433\n"
]
}
],
"source": [
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)\n",
"\n",
"# Get the dguid per level of geography\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS dpl_2021;\n",
"CREATE TABLE dpl_2021 AS SELECT dpl_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/dpl_2021.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Convert the duckdb tables to pandas dataframe\n",
"dpl_dguid = con.sql(\"SELECT * FROM dpl_2021\").to_df()\n",
"\n",
"# Join the Census of Population dataframe to each geographic level\n",
"cop_dpl = cop_df.join(dpl_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"\n",
"del(dpl_dguid)\n",
"del(cop_df)\n",
"gc.collect()\n",
"\n",
"# Convert columns to lowest dtypes\n",
"cop_dpl = convert_to_lowest_type(cop_dpl)\n",
"\n",
"# Drop NA columns\n",
"print(f\"DPL - Number of Columns BEFORE: {len(cop_dpl.columns)}\")\n",
"drop_na_columns(cop_dpl)\n",
"print(f\"DPL - Number of Columns AFTER: {len(cop_dpl.columns)}\")\n",
"\n",
"# Export\n",
"cop_dpl = cop_dpl.reset_index()\n",
"cop_dpl.rename(columns={'dguid': 'dpl_dguid'}, inplace=True)\n",
"cop_dpl.to_parquet(path=f'{data_dir}/dpl_2021.parquet', index=False, compression='zstd')\n",
"\n",
"del(cop_dpl)\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "bceb5f4a-4019-4d25-8671-b0854233e109",
"metadata": {},
"source": [
"## 8.0 Process Aggregate dissemination areas (ADAs)\n",
"There should be 5433 ADAs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c846535-aff6-434e-ada0-51e739d89d23",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/98-401-X2021012_English_CSV_data.csv\n",
"Concatenating all dataframes into one\n"
]
},
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the ADA Census of Population CSV(s) into one dataframe\n",
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)\n",
"\n",
"# Rebuild the DuckDB table holding the ADA dguids\n",
"# (dpl_2021 is dropped here as well if it still exists)\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS dpl_2021;\n",
"DROP TABLE IF EXISTS ada_2021;\n",
"CREATE TABLE ada_2021 AS SELECT ada_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ada_2021.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Bring the dguid table into pandas\n",
"ada_dguid = con.sql(\"SELECT * FROM ada_2021\").to_df()\n",
"\n",
"# Inner join against the dguid-only frame: keeps just the rows whose dguid is a known ADA\n",
"cop_ada = cop_df.join(ada_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"\n",
"# Free the join inputs before the remaining steps\n",
"del ada_dguid, cop_df\n",
"gc.collect()\n",
"\n",
"# Downcast the columns with the notebook's helper\n",
"cop_ada = convert_to_lowest_type(cop_ada)\n",
"\n",
"# Drop NA columns; the result is not reassigned, so the helper is relied on to modify cop_ada in place\n",
"print(f\"ADA - Number of Columns BEFORE: {cop_ada.shape[1]}\")\n",
"drop_na_columns(cop_ada)\n",
"print(f\"ADA - Number of Columns AFTER: {cop_ada.shape[1]}\")\n",
"\n",
"# Export: reset the index, rename dguid to ada_dguid, write zstd-compressed parquet\n",
"cop_ada = cop_ada.reset_index().rename(columns={'dguid': 'ada_dguid'})\n",
"cop_ada.to_parquet(path=f'{data_dir}/ada_2021.parquet', index=False, compression='zstd')\n",
"\n",
"# Release the exported frame\n",
"del cop_ada\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "6d66d8ae-e3ca-4014-961c-4af3c577880d",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## 9.0 Process Forward sortation areas (FSAs)\n",
"There should be 1643 FSAs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdee5c1a-9914-42ab-b759-e7685d5e627c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing /data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/98-401-X2021013_English_CSV_data.csv\n",
"Concatenating all dataframes into one\n"
]
},
{
"data": {
"text/plain": [
"<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"684"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"FSA - Number of Columns BEFORE: 7893\n",
"Dropping columns that don't have values\n",
"FSA - Number of Columns AFTER: 7429\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the FSA Census of Population CSV(s) into one dataframe\n",
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)\n",
"\n",
"# Rebuild the DuckDB table holding the FSA dguids\n",
"# (the ada_2021 table from the ADA section is dropped here as well)\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS ada_2021;\n",
"DROP TABLE IF EXISTS fsa_2021;\n",
"CREATE TABLE fsa_2021 AS SELECT fsa_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fsa_2021.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Bring the dguid table into pandas\n",
"fsa_dguid = con.sql(\"SELECT * FROM fsa_2021\").to_df()\n",
"\n",
"# Inner join against the dguid-only frame: keeps just the rows whose dguid is a known FSA\n",
"cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"\n",
"# Free the join inputs before the remaining steps\n",
"del fsa_dguid, cop_df\n",
"gc.collect()\n",
"\n",
"# Downcast the columns with the notebook's helper\n",
"cop_fsa = convert_to_lowest_type(cop_fsa)\n",
"\n",
"# Drop NA columns; the result is not reassigned, so the helper is relied on to modify cop_fsa in place\n",
"print(f\"FSA - Number of Columns BEFORE: {cop_fsa.shape[1]}\")\n",
"drop_na_columns(cop_fsa)\n",
"print(f\"FSA - Number of Columns AFTER: {cop_fsa.shape[1]}\")\n",
"\n",
"# Export: reset the index, rename dguid to fsa_dguid, write zstd-compressed parquet\n",
"cop_fsa = cop_fsa.reset_index().rename(columns={'dguid': 'fsa_dguid'})\n",
"cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n",
"\n",
"# Release the exported frame\n",
"del cop_fsa\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "ddb67445-e9dc-489f-b2cd-7a7969393d5a",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## 10.0 Process Health regions (HRs) and Local health integration networks\n",
"\n",
"**TODO:** Start by reviewing the Statistics Canada health region reference material (catalogue 82-402-X): https://www150.statcan.gc.ca/n1/en/catalogue/82-402-X"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e4c3ddf-4279-4ae7-8a8a-f43ae97d59b4",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Read the Health regions Census of Population CSV(s) into one dataframe\n",
"csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021015_eng_CSV/*English_CSV_data*\")\n",
"cop_df = process_cop_csv(csvs_to_process)\n",
"\n",
"# Get the dguid per level of geography\n",
"# (the fsa_2021 table from the FSA section is dropped here as well)\n",
"# TODO: the second health geography named in the section heading (Local health integration networks)\n",
"# has no dguid table yet; until one is added, only rows whose dguid is in hr_2022 survive the join below\n",
"con.sql(\"\"\"\n",
"DROP TABLE IF EXISTS fsa_2021;\n",
"DROP TABLE IF EXISTS hr_2022;\n",
"CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n",
"\"\"\")\n",
"con.commit()\n",
"\n",
"# Convert the duckdb tables to pandas dataframe\n",
"hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()\n",
"\n",
"# Join the Census of Population dataframe to the health region dguids;\n",
"# the inner join against the dguid-only frame keeps just the rows whose dguid is a known HR\n",
"cop_hr = cop_df.join(hr_dguid.set_index('dguid'), on='dguid', how='inner')\n",
"\n",
"# Free the join inputs before the remaining steps\n",
"del hr_dguid, cop_df\n",
"gc.collect()\n",
"\n",
"# Convert columns to lowest dtypes\n",
"cop_hr = convert_to_lowest_type(cop_hr)\n",
"\n",
"# Drop NA columns; the result is not reassigned, so the helper is relied on to modify cop_hr in place\n",
"print(f\"HR - Number of Columns BEFORE: {len(cop_hr.columns)}\")\n",
"drop_na_columns(cop_hr)\n",
"print(f\"HR - Number of Columns AFTER: {len(cop_hr.columns)}\")\n",
"\n",
"# Export under the health region name so the FSA parquet written by the previous section is not overwritten\n",
"cop_hr = cop_hr.reset_index().rename(columns={'dguid': 'hr_dguid'})\n",
"cop_hr.to_parquet(path=f'{data_dir}/hr_2022.parquet', index=False, compression='zstd')\n",
"\n",
"del cop_hr\n",
"gc.collect()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}