{ "cells": [ { "cell_type": "code", "execution_count": 10, "id": "fc8ca6f9", "metadata": {}, "outputs": [], "source": [ "import gc\n", "import glob\n", "import os\n", "\n", "import duckdb\n", "from IPython.core.interactiveshell import InteractiveShell \n", "import numpy as np\n", "import pandas as pd\n", "from sqlalchemy import create_engine\n", "\n", "# Enable multiple outputs per cell\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "# Show all columns\n", "pd.set_option('display.max_columns', None)\n", "\n", "data_dir = '/data/census_of_population/output/2021/tabular'\n", "\n", "# PostgreSQL DB\n", "DATABASE = os.environ.get(\"POSTGRES_DB\")\n", "USER = os.environ.get(\"POSTGRES_USER\")\n", "PASSWORD = os.environ.get(\"POSTGRES_PASSWORD\")\n", "\n", "engine = create_engine(f\"postgresql://{USER}:{PASSWORD}@db:5432/{DATABASE}\")\n", "\n", "# DuckDB\n", "con = duckdb.connect()\n", "con.install_extension(\"spatial\")\n", "con.load_extension(\"spatial\")" ] }, { "cell_type": "markdown", "id": "3d20a1f5", "metadata": {}, "source": [ "# Datasets\n", "- 1.0 Canada, provinces, territories, census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)\n", "- 2.0 Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)\n", "- 3.0 Economic regions (ERs)\n", "- 4.0 Population centres (POPCTRs)\n", "- 5.0 Canada, provinces, territories and federal electoral districts (FEDs) (2013 Representation Order). **Just process FEDs**\n", "- 6.0 Canada, provinces, territories and federal electoral districts (FEDs) (2023 Representation Order). **Just process FEDs**\n", "- 7.0 Designated places (DPLs)\n", "- 8.0 Aggregate dissemination areas (ADAs)\n", "- 9.0 Forward sortation areas (FSAs)\n", "- 10.0 Health regions (HRs)\n", " - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n", " - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm\n", "- 11.0 Dissemination Blocks (DBs)" ] }, { "cell_type": "code", "execution_count": 2, "id": "b88a0db8", "metadata": {}, "outputs": [], "source": [ "def process_cop_csv(csvs_to_process):\n", " \"\"\"\n", " 1. Reads subset of fields for Census of Population CSV files\n", " 2. Pivots on characteristic_id\n", " 3. Appends all of the processed CSVs as one dataframe\n", " \"\"\"\n", " dataframes_to_concatenate = []\n", " for filename in csvs_to_process:\n", " print(f\"Processing {filename}\")\n", " params = {\n", " 'filepath_or_buffer': filename,\n", " 'encoding': 'latin-1',\n", " 'usecols': ['DGUID', \n", " 'CHARACTERISTIC_ID', \n", " 'C1_COUNT_TOTAL',\n", " 'C2_COUNT_MEN+',\n", " 'C3_COUNT_WOMEN+'\n", " ],\n", " 'dtype': {\n", " 'CHARACTERISTIC_ID': np.int16\n", " }\n", " }\n", " cop_df = pd.read_csv(**params)\n", " cop_df.rename(columns={\n", " 'C1_COUNT_TOTAL': 'count_total',\n", " 'C2_COUNT_MEN+': 'count_men', \n", " 'C3_COUNT_WOMEN+': 'count_women',\n", " 'DGUID': 'dguid'\n", " }, inplace=True)\n", "\n", " cop_df = cop_df.pivot(index='dguid', columns='CHARACTERISTIC_ID')\n", "\n", " # Flatten the hierarchical index\n", " # https://stackoverflow.com/questions/14507794/how-to-flatten-a-hierarchical-index-in-columns/57630176#57630176\n", " level_one = cop_df.columns.get_level_values(0).astype(str)\n", " level_two = cop_df.columns.get_level_values(1).astype(str)\n", " column_separator = ['_' if x != '' else '' for x in level_two]\n", " cop_df.columns = level_one + column_separator + level_two\n", " dataframes_to_concatenate.append(cop_df)\n", " \n", " print(\"Concatenating all dataframes into one\")\n", " cop_df = pd.concat(dataframes_to_concatenate)\n", " \n", " return cop_df" ] }, { "cell_type": "code", "execution_count": 3, "id": "cba01571", "metadata": {}, "outputs": [], "source": [ "def drop_na_columns(dataframe):\n", " \"\"\"\n", " Delete columns where there are no values.\n", " There are cases where there are values for the count_total\n", " columns, but no values for the count_men and count_women columns\n", " \"\"\"\n", " columns_to_drop = []\n", " for field in dataframe.columns:\n", " minimum_value = dataframe[field].min()\n", " maximum_value = dataframe[field].max()\n", " if pd.isna(minimum_value) and pd.isna(maximum_value):\n", " columns_to_drop.append(field)\n", "\n", " if columns_to_drop:\n", " print(\"Dropping columns that don't have values\")\n", " dataframe.drop(columns=columns_to_drop, inplace=True)" ] }, { "cell_type": "code", "execution_count": 21, "id": "6071a2fe", "metadata": {}, "outputs": [], "source": [ "def convert_to_lowest_type(df):\n", " \"\"\"\n", " Convert columns to the best possible dtypes\n", " For example, if the column is numerical and has a maximum value of 32,000 we can assign it a type of int16\n", " \"\"\"\n", " params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", " }\n", " df = df.convert_dtypes(**params)\n", "\n", " dtypes = pd.DataFrame(df.dtypes)\n", " \n", " # Downcast to the smallest numerical dtype\n", " for row in dtypes.itertuples():\n", " column = row[0]\n", " the_type = str(row[1])\n", " \n", " # Skipping downcasting Float64 as there were issues with decimal places\n", " # For example, instead of a value being 65.4, it turned into 65.4000015258789\n", " if the_type == 'Float64':\n", " continue \n", " elif the_type == 'Int64':\n", " df[column] = pd.to_numeric(df[column], downcast='integer')\n", "\n", " return df" ] }, { "cell_type": "markdown", "id": "86ef0ae5", "metadata": {}, "source": [ "# Start processing\n", "## 1.0 Process Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)" ] }, { "cell_type": "code", "execution_count": 5, "id": "e648921d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Atlantic.csv\n", "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_BritishColumbia.csv\n", "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Ontario.csv\n", "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Prairies.csv\n", "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Quebec.csv\n", "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Territories.csv\n", "Concatenating all dataframes into one\n" ] } ], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)" ] }, { "cell_type": "markdown", "id": "9a893e04", "metadata": {}, "source": [ "# Remove duplicates\n", "- For example, for some reason, they included Canada (dguid 2021A000011124) 6 times (once per CSV), so we need to get unique values" ] }, { "cell_type": "code", "execution_count": null, "id": "5712f497", "metadata": { "scrolled": true }, "outputs": [], "source": [ "print(f\"Number of records before {len(cop_df)}\")\n", "print(\"Before:\")\n", "cop_df[cop_df.index == '2021A000011124']\n", "\n", "# Get unique records\n", "cop_df = cop_df.groupby(cop_df.index).last()\n", "print(f\"Number of records after {len(cop_df)}\")\n", "cop_df[cop_df.index == '2021A000011124']" ] }, { "cell_type": "markdown", "id": "f987a804", "metadata": {}, "source": [ "Get unique records" ] }, { "cell_type": "markdown", "id": "0e0e6cb8", "metadata": {}, "source": [ "# Split the Census of Population dataframe by geographic level" ] }, { "cell_type": "code", "execution_count": null, "id": "bdee50fc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS country_2021;\n", "CREATE TABLE country_2021 AS SELECT country_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/country_2021.parquet';\n", "\n", "DROP TABLE IF EXISTS pr_2021;\n", "CREATE TABLE pr_2021 AS SELECT pr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pr_2021.parquet';\n", "\n", "DROP TABLE IF EXISTS cd_2021;\n", "CREATE TABLE cd_2021 AS SELECT cd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cd_2021.parquet';\n", "\n", "DROP TABLE IF EXISTS csd_2021;\n", "CREATE TABLE csd_2021 AS SELECT csd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/csd_2021.parquet';\n", "\n", "DROP TABLE IF EXISTS da_2021;\n", "CREATE TABLE da_2021 AS SELECT da_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/da_2021.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "country_dguid = con.sql(\"SELECT * FROM country_2021\").to_df()\n", "pr_dguid = con.sql(\"SELECT * FROM pr_2021\").to_df()\n", "cd_dguid = con.sql(\"SELECT * FROM cd_2021\").to_df()\n", "csd_dguid = con.sql(\"SELECT * FROM csd_2021\").to_df()\n", "da_dguid = con.sql(\"SELECT * FROM da_2021\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "cop_country = cop_df.join(country_dguid.set_index('dguid'), on='dguid', how='inner')\n", "cop_pr = cop_df.join(pr_dguid.set_index('dguid'), on='dguid', how='inner')\n", "cop_cd = cop_df.join(cd_dguid.set_index('dguid'), on='dguid', how='inner')\n", "cop_csd = cop_df.join(csd_dguid.set_index('dguid'), on='dguid', how='inner')\n", "cop_da = cop_df.join(da_dguid.set_index('dguid'), on='dguid', how='inner')\n", "\n", "del(cop_df)\n", "del(country_dguid)\n", "del(pr_dguid)\n", "del(cd_dguid)\n", "del(csd_dguid)\n", "del(da_dguid)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "2ca0a313-6530-4ecf-becd-8bedc31fdeed", "metadata": {}, "source": [ "# Convert dataframe columns to lowest dtype" ] }, { "cell_type": "code", "execution_count": 9, "id": "96aed739", "metadata": {}, "outputs": [], "source": [ "cop_country = convert_to_lowest_type(cop_country)\n", "cop_pr = convert_to_lowest_type(cop_pr)\n", "cop_cd = convert_to_lowest_type(cop_cd)\n", "cop_csd = convert_to_lowest_type(cop_csd)\n", "cop_da = convert_to_lowest_type(cop_da)" ] }, { "cell_type": "markdown", "id": "9de07a61", "metadata": {}, "source": [ "# Delete columns where there are no values" ] }, { "cell_type": "markdown", "id": "7da3c692", "metadata": {}, "source": [ "## Number of columns before" ] }, { "cell_type": "code", "execution_count": 11, "id": "7c6531d5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Country- length: 7893\n", "PR- length: 7893\n", "CD- length: 7893\n", "CSD- length: 7893\n", "DA- length: 7893\n" ] } ], "source": [ "print(f\"Country- length: {len(cop_country.columns)}\")\n", "print(f\"PR- length: {len(cop_pr.columns)}\")\n", "print(f\"CD- length: {len(cop_cd.columns)}\")\n", "print(f\"CSD- length: {len(cop_csd.columns)}\")\n", "print(f\"DA- length: {len(cop_da.columns)}\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "a971d90c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dropping columns that don't have values\n", "Dropping columns that don't have values\n", "Dropping columns that don't have values\n", "Dropping columns that don't have values\n", "Dropping columns that don't have values\n" ] } ], "source": [ "drop_na_columns(cop_country)\n", "drop_na_columns(cop_pr)\n", "drop_na_columns(cop_cd)\n", "drop_na_columns(cop_csd)\n", "drop_na_columns(cop_da)" ] }, { "cell_type": "markdown", "id": "511023c8", "metadata": {}, "source": [ "## Number of columns after" ] }, { "cell_type": "code", "execution_count": 13, "id": "1b003396", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Country- length: 7433\n", "PR- length: 7433\n", "CD- length: 7433\n", "CSD- length: 7433\n", "DA- length: 7431\n" ] } ], "source": [ "print(f\"Country- length: {len(cop_country.columns)}\")\n", "print(f\"PR- length: {len(cop_pr.columns)}\")\n", "print(f\"CD- length: {len(cop_cd.columns)}\")\n", "print(f\"CSD- length: {len(cop_csd.columns)}\")\n", "print(f\"DA- length: {len(cop_da.columns)}\")" ] }, { "cell_type": "code", "execution_count": 27, "id": "fa72bcce", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "# Country\n", "cop_country = cop_country.reset_index()\n", "cop_country.rename(columns={'dguid': 'country_dguid'}, inplace=True)\n", "cop_country.to_parquet(path=f'{data_dir}/country_2021.parquet', index=False, compression='zstd')\n", "\n", "# Provinces and Territories\n", "cop_pr = cop_pr.reset_index()\n", "cop_pr.rename(columns={'dguid': 'pr_dguid'}, inplace=True)\n", "cop_pr.to_parquet(path=f'{data_dir}/pr_2021.parquet', index=False, compression='zstd')\n", "\n", "# Census Divisions\n", "cop_cd = cop_cd.reset_index()\n", "cop_cd.rename(columns={'dguid': 'cd_dguid'}, inplace=True)\n", "cop_cd.to_parquet(path=f'{data_dir}/cd_2021.parquet', index=False, compression='zstd')\n", "\n", "# Census Subdivisions\n", "cop_csd = cop_csd.reset_index()\n", "cop_csd.rename(columns={'dguid': 'csd_dguid'}, inplace=True)\n", "cop_csd.to_parquet(path=f'{data_dir}/csd_2021.parquet', index=False, compression='zstd')\n", "\n", "# Dissemination Areas\n", "cop_da = cop_da.reset_index()\n", "cop_da.rename(columns={'dguid': 'da_dguid'}, inplace=True)\n", "cop_da.to_parquet(path=f'{data_dir}/da_2021.parquet', index=False, compression='zstd')" ] }, { "cell_type": "code", "execution_count": 28, "id": "441a7834-594f-400c-85a3-a353c8bdf202", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "del(cop_country)\n", "del(cop_pr)\n", "del(cop_cd)\n", "del(cop_csd)\n", "del(cop_da)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "6c7b31ef-9657-4a2e-bdf7-8d75313c10ef", "metadata": {}, "source": [ "## 2.0 Process Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)" ] }, { "cell_type": "markdown", "id": "91b81c24-c160-4c38-8713-8d1f06d18624", "metadata": {}, "source": [ "# TODO: Finish processing CMA" ] }, { "cell_type": "code", "execution_count": null, "id": "b8c5ebfd-d5b7-4700-8006-3fa96402dc94", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/98-401-X2021007_English_CSV_data.csv\n", "Concatenating all dataframes into one\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "244615" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS country_2021;\n", "DROP TABLE IF EXISTS pr_2021;\n", "DROP TABLE IF EXISTS cd_2021;\n", "DROP TABLE IF EXISTS csd_2021;\n", "DROP TABLE IF EXISTS da_2021;\n", "\n", "DROP TABLE IF EXISTS cma_2021;\n", "CREATE TABLE cma_2021 AS SELECT cma_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cma_2021.parquet';\n", "\n", "DROP TABLE IF EXISTS ct_2021;\n", "CREATE TABLE ct_2021 AS SELECT ct_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ct_2021.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "cma_dguid = con.sql(\"SELECT * FROM cma_2021\").to_df()\n", "ct_dguid = con.sql(\"SELECT * FROM ct_2021\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "# There's going to be missing links\n", "cop_cma = cop_df.join(cma_dguid.set_index('dguid'), on='dguid', how='inner')\n", "cop_ct = cop_df.join(ct_dguid.set_index('dguid'), on='dguid', how='inner')\n", "\n", "del(ct_dguid)\n", "del(cop_df)\n", "gc.collect()\n", "\n", "# Convert columns to lowest dtypes\n", "cop_ct = convert_to_lowest_type(cop_ct)\n", "\n", "# Drop NA columns\n", "print(f\"CT - Number of Columns BEFORE: {len(cop_ct.columns)}\")\n", "drop_na_columns(cop_ct)\n", "print(f\"CT - Number of Columns AFTER: {len(cop_ct.columns)}\")\n", "\n", "# Export\n", "# Census Tracts\n", "cop_ct = cop_ct.reset_index()\n", "cop_ct.rename(columns={'dguid': 'ct_dguid'}, inplace=True)\n", "cop_ct.to_parquet(path=f'{data_dir}/ct_2021.parquet', index=False, compression='zstd')\n", "\n", "del(cop_ct)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "f7f5e3fc-a6b1-46bc-a299-f0dd0d6db897", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## 3.0 Process Economic regions (ERs) (98-401-X2021008_eng_CSV)\n", "This file also includes Provinces and Territories and Country" ] }, { "cell_type": "code", "execution_count": null, "id": "8fa4d456-90ed-4b31-8c91-5d0c8b1faef5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/98-401-X2021008_English_CSV_data.csv\n", "Concatenating all dataframes into one\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS cma_2021;\n", "DROP TABLE IF EXISTS ct_2021;\n", "\n", "DROP TABLE IF EXISTS er_2021;\n", "CREATE TABLE er_2021 AS SELECT er_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/er_2021.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "er_dguid = con.sql(\"SELECT * FROM er_2021\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "cop_er = cop_df.join(er_dguid.set_index('dguid'), on='dguid', how='inner')\n", "\n", "del(er_dguid)\n", "del(cop_df)\n", "gc.collect()\n", "\n", "# Convert columns to lowest dtypes\n", "cop_er = convert_to_lowest_type(cop_er)\n", "\n", "# Drop NA columns\n", "print(f\"ER - Number of Columns BEFORE: {len(cop_er.columns)}\")\n", "drop_na_columns(cop_er)\n", "print(f\"CT - Number of Columns AFTER: {len(cop_er.columns)}\")\n", "\n", "# Export\n", "# Economic Regions\n", "cop_er = cop_er.reset_index()\n", "cop_er.rename(columns={'dguid': 'er_dguid'}, inplace=True)\n", "cop_er.to_parquet(path=f'{data_dir}/er_2021.parquet', index=False, compression='zstd')\n", "\n", "del(cop_er)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "5a57d285-6567-4288-a632-ce96e02ca23c", "metadata": {}, "source": [ "## 4.0 Process Population centres (POPCTRs)\n", "### There are 1026 DGUIDs in the Census of Population data, but there should be 1030\n", "They also use the pop_ctr_dguid and not the pop_ctr_p_dguid. So, there's no way to differentiate between Ottawa, and Gatineau for pop_ctr_dguid 2021S05100616" ] }, { "cell_type": "code", "execution_count": null, "id": "cef1f94a-bec2-45bb-95f1-f06a8f04b470", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/98-401-X2021009_English_CSV_data.csv\n", "Concatenating all dataframes into one\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS er_2021;\n", "\n", "DROP TABLE IF EXISTS pop_ctr_2021;\n", "CREATE TABLE pop_ctr_2021 AS SELECT pop_ctr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pop_ctr_2021.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "pop_ctr_dguid = con.sql(\"SELECT * FROM pop_ctr_2021\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "cop_pop_ctr = cop_df.join(pop_ctr_dguid.set_index('dguid'), on='dguid', how='inner')\n", "\n", "del(pop_ctr_dguid)\n", "del(cop_df)\n", "gc.collect()\n", "\n", "# Convert columns to lowest dtypes\n", "cop_pop_ctr = convert_to_lowest_type(cop_pop_ctr)\n", "\n", "# Drop NA columns\n", "print(f\"POP CTR - Number of Columns BEFORE: {len(cop_pop_ctr.columns)}\")\n", "drop_na_columns(cop_pop_ctr)\n", "print(f\"POP CTR- Number of Columns AFTER: {len(cop_pop_ctr.columns)}\")\n", "\n", "# Export\n", "# Population Centers\n", "cop_pop_ctr = cop_pop_ctr.reset_index()\n", "cop_pop_ctr.rename(columns={'dguid': 'pop_ctr_dguid'}, inplace=True)\n", "cop_pop_ctr.to_parquet(path=f'{data_dir}/pop_ctr_2021.parquet', index=False, compression='zstd')\n", "\n", "del(cop_pop_ctr)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "9b196a95-62c1-449f-8e44-08bb81719750", "metadata": {}, "source": [ "## 5.0 Process Federal electoral districts (FEDs) (2013 Representation Order)" ] }, { "cell_type": "code", "execution_count": null, "id": "2240a33a-a100-440e-9c55-c2a3509523ea", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/98-401-X2021010_English_CSV_data.csv\n", "Concatenating all dataframes into one\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "FED - Number of Columns BEFORE: 7893\n", "Dropping columns that don't have values\n", "FED - Number of Columns AFTER: 7433\n" ] }, { "data": { "text/plain": [ "0" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS pop_ctr_2021;\n", "\n", "DROP TABLE IF EXISTS fed_2013;\n", "CREATE TABLE fed_2013 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2021_2013.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "fed_dguid = con.sql(\"SELECT * FROM fed_2013\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n", "\n", "del(fed_dguid)\n", "del(cop_df)\n", "gc.collect()\n", "\n", "# Convert columns to lowest dtypes\n", "cop_fed = convert_to_lowest_type(cop_fed)\n", "\n", "# Drop NA columns\n", "print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n", "drop_na_columns(cop_fed)\n", "print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n", "\n", "# Export\n", "cop_fed = cop_fed.reset_index()\n", "cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n", "cop_fed.to_parquet(path=f'{data_dir}/fed_2013.parquet', index=False, compression='zstd')\n", "\n", "del(cop_fed)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "dada67c8-8f4b-4795-a97b-3d68887fc582", "metadata": {}, "source": [ "## 6.0 Process Federal electoral districts (FEDs) (2023 Representation Order)\n", "There should be 343 2023 FEDs" ] }, { "cell_type": "code", "execution_count": null, "id": "fb3d4506-cf3a-446f-9ff8-2a8e408630a4", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/98-401-X2021029_English_CSV_data.csv\n", "Concatenating all dataframes into one\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "31" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "FED - Number of Columns BEFORE: 7894\n", "Dropping columns that don't have values\n", "FED - Number of Columns AFTER: 7427\n" ] }, { "data": { "text/plain": [ "0" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS fed_2013;\n", "\n", "/*\n", "DROP TABLE IF EXISTS fed_2023;\n", "CREATE TABLE fed_2023 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2023.parquet';\n", "*/\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "#fed_dguid = con.sql(\"SELECT * FROM fed_2023\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "#cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n", "cop_df = cop_df.reset_index()\n", "cop_df = cop_df[cop_df['dguid'].str.contains(\"2023\")]\n", "\n", "# Convert columns to lowest dtypes\n", "cop_fed = convert_to_lowest_type(cop_df)\n", "\n", "del(cop_df)\n", "gc.collect()\n", "\n", "# Drop NA columns\n", "print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n", "drop_na_columns(cop_fed)\n", "print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n", "\n", "# Export\n", "cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n", "cop_fed.to_parquet(path=f'{data_dir}/fed_2023.parquet', index=False, compression='zstd')\n", "\n", "del(cop_fed)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "3cf32961-f1a4-44e2-8676-bb10263190b9", "metadata": {}, "source": [ "## 7.0 Process Designated places (DPLs)\n", "There should be 1685 DPLs" ] }, { "cell_type": "code", "execution_count": null, "id": "958a12c8-2d4d-474a-8af7-81a6b71e24d1", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/98-401-X2021011_English_CSV_data.csv\n", "Concatenating all dataframes into one\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "DPL - Number of Columns BEFORE: 7893\n", "Dropping columns that don't have values\n", "DPL - Number of Columns AFTER: 7433\n" ] } ], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS dpl_2021;\n", "CREATE TABLE dpl_2021 AS SELECT dpl_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/dpl_2021.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "dpl_dguid = con.sql(\"SELECT * FROM dpl_2021\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "cop_dpl = cop_df.join(dpl_dguid.set_index('dguid'), on='dguid', how='inner')\n", "\n", "del(dpl_dguid)\n", "del(cop_df)\n", "gc.collect()\n", "\n", "# Convert columns to lowest dtypes\n", "cop_dpl = convert_to_lowest_type(cop_dpl)\n", "\n", "# Drop NA columns\n", "print(f\"DPL - Number of Columns BEFORE: {len(cop_dpl.columns)}\")\n", "drop_na_columns(cop_dpl)\n", "print(f\"DPL - Number of Columns AFTER: {len(cop_dpl.columns)}\")\n", "\n", "# Export\n", "cop_dpl = cop_dpl.reset_index()\n", "cop_dpl.rename(columns={'dguid': 'dpl_dguid'}, inplace=True)\n", "cop_dpl.to_parquet(path=f'{data_dir}/dpl_2021.parquet', index=False, compression='zstd')\n", "\n", "del(cop_dpl)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "bceb5f4a-4019-4d25-8671-b0854233e109", "metadata": {}, "source": [ "## 8.0 Process Aggregate dissemination areas (ADAs)\n", "There should be 5433 ADAs" ] }, { "cell_type": "code", "execution_count": null, "id": "8c846535-aff6-434e-ada0-51e739d89d23", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/98-401-X2021012_English_CSV_data.csv\n", "Concatenating all dataframes into one\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS dpl_2021;\n", "DROP TABLE IF EXISTS ada_2021;\n", "CREATE TABLE ada_2021 AS SELECT ada_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ada_2021.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "ada_dguid = con.sql(\"SELECT * FROM ada_2021\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "cop_ada = cop_df.join(ada_dguid.set_index('dguid'), on='dguid', how='inner')\n", "\n", "del(ada_dguid)\n", "del(cop_df)\n", "gc.collect()\n", "\n", "# Convert columns to lowest dtypes\n", "cop_ada = convert_to_lowest_type(cop_ada)\n", "\n", "# Drop NA columns\n", "print(f\"ADA - Number of Columns BEFORE: {len(cop_ada.columns)}\")\n", "drop_na_columns(cop_ada)\n", "print(f\"ADA - Number of Columns AFTER: {len(cop_ada.columns)}\")\n", "\n", "# Export\n", "cop_ada = cop_ada.reset_index()\n", "cop_ada.rename(columns={'dguid': 'ada_dguid'}, inplace=True)\n", "cop_ada.to_parquet(path=f'{data_dir}/ada_2021.parquet', index=False, compression='zstd')\n", "\n", "del(cop_ada)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "6d66d8ae-e3ca-4014-961c-4af3c577880d", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## 9.0 Process Forward sortation areas (FSAs)\n", "There should be 1643 FSAs" ] }, { "cell_type": "code", "execution_count": null, "id": "bdee5c1a-9914-42ab-b759-e7685d5e627c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing /data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/98-401-X2021013_English_CSV_data.csv\n", "Concatenating all dataframes into one\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "684" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "FSA - Number of Columns BEFORE: 7893\n", "Dropping columns that don't have values\n", "FSA - Number of Columns AFTER: 7429\n" ] }, { "data": { "text/plain": [ "0" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS ada_2021;\n", "DROP TABLE IF EXISTS fsa_2021;\n", "CREATE TABLE fsa_2021 AS SELECT fsa_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fsa_2021.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "fsa_dguid = con.sql(\"SELECT * FROM fsa_2021\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n", "\n", "del(fsa_dguid)\n", "del(cop_df)\n", "gc.collect()\n", "\n", "# Convert columns to lowest dtypes\n", "cop_fsa = convert_to_lowest_type(cop_fsa)\n", "\n", "# Drop NA columns\n", "print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n", "drop_na_columns(cop_fsa)\n", "print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n", "\n", "# Export\n", "cop_fsa = cop_fsa.reset_index()\n", "cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n", "cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n", "\n", "del(cop_fsa)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "ddb67445-e9dc-489f-b2cd-7a7969393d5a", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## 10.0 Process Health regions (HRs) and Local health integration networks\n", "Start looking here https://www150.statcan.gc.ca/n1/en/catalogue/82-402-X" ] }, { "cell_type": "code", "execution_count": null, "id": "1e4c3ddf-4279-4ae7-8a8a-f43ae97d59b4", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021015_eng_CSV/*English_CSV_data*\")\n", "cop_df = process_cop_csv(csvs_to_process)\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS fsa_2021;\n", "DROP TABLE IF EXISTS hr_2022;\n", "CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()" ] }, { "cell_type": "markdown", "id": "846e1be5-cb81-4d5e-a1e5-47a5bb1401b5", "metadata": {}, "source": [ "## 11.0 Dissemination Blocks (DBs)" ] }, { "cell_type": "code", "execution_count": 25, "id": "c850b121-5d3f-492f-be94-cb7258552ebd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sql = \"\"\"\n", "SELECT db_dguid, \n", "db_pop_2021 AS count_total_1, \n", "db_total_private_dwell_2021 AS count_total_4,\n", "db_usual_residents_dwellings_2021 AS count_total_5\n", "FROM silver.gaf_2021;\n", "\"\"\"\n", "\n", "cop_df = pd.read_sql_query(sql=sql, con=engine, index_col='db_dguid')\n", "\n", "# Get the dguid per level of geography\n", "con.sql(\"\"\"\n", "DROP TABLE IF EXISTS db_2021;\n", "CREATE TABLE db_2021 AS SELECT db_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/db_2021.parquet';\n", "\"\"\")\n", "con.commit()\n", "\n", "# Convert the duckdb tables to pandas dataframe\n", "db_dguid = con.sql(\"SELECT * FROM db_2021\").to_df()\n", "\n", "# Join the Census of Population dataframe to each geographic level\n", "cop_db = cop_df.join(db_dguid.set_index('dguid'), on='db_dguid', how='inner')\n", "\n", "# Convert columns to lowest dtypes\n", "cop_db = convert_to_lowest_type(cop_db)\n", "\n", "# Export\n", "cop_db = cop_db.reset_index()\n", "cop_db.to_parquet(path=f'{data_dir}/db_2021.parquet', index=False, compression='zstd')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }