d4c-datapkg-statistical/census_of_population/process_2021.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "fc8ca6f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import glob\n",
    "\n",
    "import duckdb\n",
    "from IPython.core.interactiveshell import InteractiveShell  \n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sqlalchemy\n",
    "\n",
    "# Enable multiple outputs per cell\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "# Show all columns\n",
    "pd.set_option('display.max_columns', None)\n",
    "\n",
    "data_dir = '/data/census_of_population/output/2021/tabular'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3d20a1f5",
   "metadata": {},
   "source": [
    "# Datasets\n",
    "- 1.0 Canada, provinces, territories, census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)\n",
    "- 2.0 Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)\n",
    "- 3.0 Economic regions (ERs)\n",
    "- 4.0 Population centres (POPCTRs)\n",
    "- 5.0 Canada, provinces, territories and federal electoral districts (FEDs) (2013 Representation Order). **Just process FEDs**\n",
    "- 6.0 Canada, provinces, territories and federal electoral districts (FEDs) (2023 Representation Order). **Just process FEDs**\n",
    "- 7.0 Designated places (DPLs)\n",
    "- 8.0 Aggregate dissemination areas (ADAs)\n",
    "- 9.0 Forward sortation areas (FSAs)\n",
    "- 10.0 Health regions (HRs)\n",
    "   - The Health Regions CSV also has a `Home and Community Care Support Services` `GEO_LEVEL`\n",
    "   - Need to take a look at the HRs files listed here https://www150.statcan.gc.ca/n1/pub/82-402-x/2023001/hrbf-flrs-eng.htm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b88a0db8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_cop_csv(csvs_to_process):\n",
    "    \"\"\"\n",
    "    1. Reads subset of fields for Census of Population CSV files\n",
    "    2. Pivots on characteristic_id\n",
    "    3. Appends all of the processed CSVs as one dataframe\n",
    "    \"\"\"\n",
    "    dataframes_to_concatenate = []\n",
    "    for filename in csvs_to_process:\n",
    "        print(f\"Processing {filename}\")\n",
    "        params = {\n",
    "            'filepath_or_buffer': filename,\n",
    "            'encoding': 'latin-1',\n",
    "            'usecols': ['DGUID', \n",
    "                        'CHARACTERISTIC_ID', \n",
    "                        'C1_COUNT_TOTAL',\n",
    "                        'C2_COUNT_MEN+',\n",
    "                        'C3_COUNT_WOMEN+'\n",
    "                        ],\n",
    "            'dtype': {\n",
    "                'CHARACTERISTIC_ID': np.int16\n",
    "            }\n",
    "        }\n",
    "        cop_df = pd.read_csv(**params)\n",
    "        cop_df.rename(columns={\n",
    "            'C1_COUNT_TOTAL': 'count_total',\n",
    "            'C2_COUNT_MEN+': 'count_men', \n",
    "            'C3_COUNT_WOMEN+': 'count_women',\n",
    "            'DGUID': 'dguid'\n",
    "        }, inplace=True)\n",
    "\n",
    "        cop_df = cop_df.pivot(index='dguid', columns='CHARACTERISTIC_ID')\n",
    "\n",
    "        # Flatten the hierarchical index\n",
    "        # https://stackoverflow.com/questions/14507794/how-to-flatten-a-hierarchical-index-in-columns/57630176#57630176\n",
    "        level_one = cop_df.columns.get_level_values(0).astype(str)\n",
    "        level_two = cop_df.columns.get_level_values(1).astype(str)\n",
    "        column_separator = ['_' if x != '' else '' for x in level_two]\n",
    "        cop_df.columns = level_one + column_separator + level_two\n",
    "        dataframes_to_concatenate.append(cop_df)\n",
    "        \n",
    "    print(\"Concatenating all dataframes into one\")\n",
    "    cop_df = pd.concat(dataframes_to_concatenate)\n",
    "    \n",
    "    return cop_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cba01571",
   "metadata": {},
   "outputs": [],
   "source": [
    "def drop_na_columns(dataframe):\n",
    "    \"\"\"\n",
    "    Delete columns where there are no values.\n",
    "    There are cases where there are values for the count_total\n",
    "    columns, but no values for the count_men and count_women columns\n",
    "    \"\"\"\n",
    "    columns_to_drop = []\n",
    "    for field in dataframe.columns:\n",
    "        minimum_value = dataframe[field].min()\n",
    "        maximum_value = dataframe[field].max()\n",
    "        if pd.isna(minimum_value) and pd.isna(maximum_value):\n",
    "            columns_to_drop.append(field)\n",
    "\n",
    "    if columns_to_drop:\n",
    "        print(\"Dropping columns that don't have values\")\n",
    "        dataframe.drop(columns=columns_to_drop, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6071a2fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_to_lowest_type(df):\n",
    "    \"\"\"\n",
    "    Convert columns to the best possible dtypes\n",
    "    For example, if the column is numerical and has a maximum value of 32,000 we can assign it a type of int16\n",
    "    \"\"\"\n",
    "    params = {\n",
    "        'convert_string': False,\n",
    "        'convert_boolean': False\n",
    "    }\n",
    "    df = df.convert_dtypes(**params)\n",
    "\n",
    "    dtypes = pd.DataFrame(df.dtypes)\n",
    "    \n",
    "    # Downcast to the smallest numerical dtype\n",
    "    for row in dtypes.itertuples():\n",
    "        column = row[0]\n",
    "        the_type = str(row[1])\n",
    "        \n",
    "        # Skipping downcasting Float64 as there were issues with decimal places\n",
    "        # For example, instead of a value being 65.4, it turned into 65.4000015258789\n",
    "        if the_type == 'Float64':\n",
    "            continue          \n",
    "        elif the_type == 'Int64':\n",
    "            df[column] = pd.to_numeric(df[column], downcast='integer')\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86ef0ae5",
   "metadata": {},
   "source": [
    "# Start processing\n",
    "## 1.0 Process Canada, provinces, territories (PRs), census divisions (CDs), census subdivisions (CSDs) and dissemination areas (DAs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e648921d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Atlantic.csv\n",
      "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_BritishColumbia.csv\n",
      "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Ontario.csv\n",
      "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Prairies.csv\n",
      "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Quebec.csv\n",
      "Processing /data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/98-401-X2021006_English_CSV_data_Territories.csv\n",
      "Concatenating all dataframes into one\n"
     ]
    }
   ],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021006_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a893e04",
   "metadata": {},
   "source": [
    "# Remove duplicates\n",
    "- For example, for some reason, they included Canada (dguid 2021A000011124) 6 times (once per CSV), so we need to get unique values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5712f497",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(f\"Number of records before {len(cop_df)}\")\n",
    "print(\"Before:\")\n",
    "cop_df[cop_df.index == '2021A000011124']\n",
    "\n",
    "# Get unique records\n",
    "cop_df = cop_df.groupby(cop_df.index).last()\n",
    "print(f\"Number of records after {len(cop_df)}\")\n",
    "cop_df[cop_df.index == '2021A000011124']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f987a804",
   "metadata": {},
   "source": [
    "Get unique records"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0e0e6cb8",
   "metadata": {},
   "source": [
    "# Split the Census of Population dataframe by geographic level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bdee50fc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "con = duckdb.connect()\n",
    "con.install_extension(\"spatial\")\n",
    "con.load_extension(\"spatial\")\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS country_2021;\n",
    "CREATE TABLE country_2021 AS SELECT country_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/country_2021.parquet';\n",
    "\n",
    "DROP TABLE IF EXISTS pr_2021;\n",
    "CREATE TABLE pr_2021 AS SELECT pr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pr_2021.parquet';\n",
    "\n",
    "DROP TABLE IF EXISTS cd_2021;\n",
    "CREATE TABLE cd_2021 AS SELECT cd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cd_2021.parquet';\n",
    "\n",
    "DROP TABLE IF EXISTS csd_2021;\n",
    "CREATE TABLE csd_2021 AS SELECT csd_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/csd_2021.parquet';\n",
    "\n",
    "DROP TABLE IF EXISTS da_2021;\n",
    "CREATE TABLE da_2021 AS SELECT da_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/da_2021.parquet';\n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "country_dguid = con.sql(\"SELECT * FROM country_2021\").to_df()\n",
    "pr_dguid = con.sql(\"SELECT * FROM pr_2021\").to_df()\n",
    "cd_dguid = con.sql(\"SELECT * FROM cd_2021\").to_df()\n",
    "csd_dguid = con.sql(\"SELECT * FROM csd_2021\").to_df()\n",
    "da_dguid = con.sql(\"SELECT * FROM da_2021\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "cop_country = cop_df.join(country_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "cop_pr = cop_df.join(pr_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "cop_cd = cop_df.join(cd_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "cop_csd = cop_df.join(csd_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "cop_da = cop_df.join(da_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "\n",
    "del(cop_df)\n",
    "del(country_dguid)\n",
    "del(pr_dguid)\n",
    "del(cd_dguid)\n",
    "del(csd_dguid)\n",
    "del(da_dguid)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ca0a313-6530-4ecf-becd-8bedc31fdeed",
   "metadata": {},
   "source": [
    "# Convert dataframe columns to lowest dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "96aed739",
   "metadata": {},
   "outputs": [],
   "source": [
    "cop_country = convert_to_lowest_type(cop_country)\n",
    "cop_pr = convert_to_lowest_type(cop_pr)\n",
    "cop_cd = convert_to_lowest_type(cop_cd)\n",
    "cop_csd = convert_to_lowest_type(cop_csd)\n",
    "cop_da = convert_to_lowest_type(cop_da)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9de07a61",
   "metadata": {},
   "source": [
    "# Delete columns where there are no values"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7da3c692",
   "metadata": {},
   "source": [
    "## Number of columns before"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7c6531d5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Country- length: 7893\n",
      "PR- length: 7893\n",
      "CD- length: 7893\n",
      "CSD- length: 7893\n",
      "DA- length: 7893\n"
     ]
    }
   ],
   "source": [
    "print(f\"Country- length: {len(cop_country.columns)}\")\n",
    "print(f\"PR- length: {len(cop_pr.columns)}\")\n",
    "print(f\"CD- length: {len(cop_cd.columns)}\")\n",
    "print(f\"CSD- length: {len(cop_csd.columns)}\")\n",
    "print(f\"DA- length: {len(cop_da.columns)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a971d90c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dropping columns that don't have values\n",
      "Dropping columns that don't have values\n",
      "Dropping columns that don't have values\n",
      "Dropping columns that don't have values\n",
      "Dropping columns that don't have values\n"
     ]
    }
   ],
   "source": [
    "drop_na_columns(cop_country)\n",
    "drop_na_columns(cop_pr)\n",
    "drop_na_columns(cop_cd)\n",
    "drop_na_columns(cop_csd)\n",
    "drop_na_columns(cop_da)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "511023c8",
   "metadata": {},
   "source": [
    "## Number of columns after"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1b003396",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Country- length: 7433\n",
      "PR- length: 7433\n",
      "CD- length: 7433\n",
      "CSD- length: 7433\n",
      "DA- length: 7431\n"
     ]
    }
   ],
   "source": [
    "print(f\"Country- length: {len(cop_country.columns)}\")\n",
    "print(f\"PR- length: {len(cop_pr.columns)}\")\n",
    "print(f\"CD- length: {len(cop_cd.columns)}\")\n",
    "print(f\"CSD- length: {len(cop_csd.columns)}\")\n",
    "print(f\"DA- length: {len(cop_da.columns)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "fa72bcce",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Country\n",
    "cop_country = cop_country.reset_index()\n",
    "cop_country.rename(columns={'dguid': 'country_dguid'}, inplace=True)\n",
    "cop_country.to_parquet(path=f'{data_dir}/country_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "# Provinces and Territories\n",
    "cop_pr = cop_pr.reset_index()\n",
    "cop_pr.rename(columns={'dguid': 'pr_dguid'}, inplace=True)\n",
    "cop_pr.to_parquet(path=f'{data_dir}/pr_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "# Census Divisions\n",
    "cop_cd = cop_cd.reset_index()\n",
    "cop_cd.rename(columns={'dguid': 'cd_dguid'}, inplace=True)\n",
    "cop_cd.to_parquet(path=f'{data_dir}/cd_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "# Census Subdivisions\n",
    "cop_csd = cop_csd.reset_index()\n",
    "cop_csd.rename(columns={'dguid': 'csd_dguid'}, inplace=True)\n",
    "cop_csd.to_parquet(path=f'{data_dir}/csd_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "# Dissemination Areas\n",
    "cop_da = cop_da.reset_index()\n",
    "cop_da.rename(columns={'dguid': 'da_dguid'}, inplace=True)\n",
    "cop_da.to_parquet(path=f'{data_dir}/da_2021.parquet', index=False, compression='zstd')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "441a7834-594f-400c-85a3-a353c8bdf202",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del(cop_country)\n",
    "del(cop_pr)\n",
    "del(cop_cd)\n",
    "del(cop_csd)\n",
    "del(cop_da)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c7b31ef-9657-4a2e-bdf7-8d75313c10ef",
   "metadata": {},
   "source": [
    "## 2.0 Process Census metropolitan areas (CMAs), tracted census agglomerations (CAs) and census tracts (CTs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "91b81c24-c160-4c38-8713-8d1f06d18624",
   "metadata": {},
   "source": [
    "# TODO: Finish processing CMA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8c5ebfd-d5b7-4700-8006-3fa96402dc94",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/98-401-X2021007_English_CSV_data.csv\n",
      "Concatenating all dataframes into one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "244615"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021007_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS country_2021;\n",
    "DROP TABLE IF EXISTS pr_2021;\n",
    "DROP TABLE IF EXISTS cd_2021;\n",
    "DROP TABLE IF EXISTS csd_2021;\n",
    "DROP TABLE IF EXISTS da_2021;\n",
    "\n",
    "DROP TABLE IF EXISTS cma_2021;\n",
    "CREATE TABLE cma_2021 AS SELECT cma_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/cma_2021.parquet';\n",
    "\n",
    "DROP TABLE IF EXISTS ct_2021;\n",
    "CREATE TABLE ct_2021 AS SELECT ct_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ct_2021.parquet';\n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "cma_dguid = con.sql(\"SELECT * FROM cma_2021\").to_df()\n",
    "ct_dguid = con.sql(\"SELECT * FROM ct_2021\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "# There's going to be missing links\n",
    "cop_cma = cop_df.join(cma_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "cop_ct = cop_df.join(ct_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "\n",
    "del(ct_dguid)\n",
    "del(cop_df)\n",
    "gc.collect()\n",
    "\n",
    "# Convert columns to lowest dtypes\n",
    "cop_ct = convert_to_lowest_type(cop_ct)\n",
    "\n",
    "# Drop NA columns\n",
    "print(f\"CT - Number of Columns BEFORE: {len(cop_ct.columns)}\")\n",
    "drop_na_columns(cop_ct)\n",
    "print(f\"CT - Number of Columns AFTER: {len(cop_ct.columns)}\")\n",
    "\n",
    "# Export\n",
    "# Census Tracts\n",
    "cop_ct = cop_ct.reset_index()\n",
    "cop_ct.rename(columns={'dguid': 'ct_dguid'}, inplace=True)\n",
    "cop_ct.to_parquet(path=f'{data_dir}/ct_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(cop_ct)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f7f5e3fc-a6b1-46bc-a299-f0dd0d6db897",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "## 3.0 Process Economic regions (ERs) (98-401-X2021008_eng_CSV)\n",
    "This file also includes Provinces and Territories and Country"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fa4d456-90ed-4b31-8c91-5d0c8b1faef5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/98-401-X2021008_English_CSV_data.csv\n",
      "Concatenating all dataframes into one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021008_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS cma_2021;\n",
    "DROP TABLE IF EXISTS ct_2021;\n",
    "\n",
    "DROP TABLE IF EXISTS er_2021;\n",
    "CREATE TABLE er_2021 AS SELECT er_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/er_2021.parquet';\n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "er_dguid = con.sql(\"SELECT * FROM er_2021\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "cop_er = cop_df.join(er_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "\n",
    "del(er_dguid)\n",
    "del(cop_df)\n",
    "gc.collect()\n",
    "\n",
    "# Convert columns to lowest dtypes\n",
    "cop_er = convert_to_lowest_type(cop_er)\n",
    "\n",
    "# Drop NA columns\n",
    "print(f\"ER - Number of Columns BEFORE: {len(cop_er.columns)}\")\n",
    "drop_na_columns(cop_er)\n",
    "print(f\"CT - Number of Columns AFTER: {len(cop_er.columns)}\")\n",
    "\n",
    "# Export\n",
    "# Economic Regions\n",
    "cop_er = cop_er.reset_index()\n",
    "cop_er.rename(columns={'dguid': 'er_dguid'}, inplace=True)\n",
    "cop_er.to_parquet(path=f'{data_dir}/er_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(cop_er)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a57d285-6567-4288-a632-ce96e02ca23c",
   "metadata": {},
   "source": [
    "## 4.0 Process Population centres (POPCTRs)\n",
    "### There are 1026 DGUIDs in the Census of Population data, but there should be 1030\n",
    "They also use the pop_ctr_dguid and not the pop_ctr_p_dguid. So, there's no way to differentiate between Ottawa, and Gatineau for pop_ctr_dguid 2021S05100616"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cef1f94a-bec2-45bb-95f1-f06a8f04b470",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/98-401-X2021009_English_CSV_data.csv\n",
      "Concatenating all dataframes into one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021009_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS er_2021;\n",
    "\n",
    "DROP TABLE IF EXISTS pop_ctr_2021;\n",
    "CREATE TABLE pop_ctr_2021 AS SELECT pop_ctr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/pop_ctr_2021.parquet';\n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "pop_ctr_dguid = con.sql(\"SELECT * FROM pop_ctr_2021\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "cop_pop_ctr = cop_df.join(pop_ctr_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "\n",
    "del(pop_ctr_dguid)\n",
    "del(cop_df)\n",
    "gc.collect()\n",
    "\n",
    "# Convert columns to lowest dtypes\n",
    "cop_pop_ctr = convert_to_lowest_type(cop_pop_ctr)\n",
    "\n",
    "# Drop NA columns\n",
    "print(f\"POP CTR - Number of Columns BEFORE: {len(cop_pop_ctr.columns)}\")\n",
    "drop_na_columns(cop_pop_ctr)\n",
    "print(f\"POP CTR- Number of Columns AFTER: {len(cop_pop_ctr.columns)}\")\n",
    "\n",
    "# Export\n",
    "# Population Centers\n",
    "cop_pop_ctr = cop_pop_ctr.reset_index()\n",
    "cop_pop_ctr.rename(columns={'dguid': 'pop_ctr_dguid'}, inplace=True)\n",
    "cop_pop_ctr.to_parquet(path=f'{data_dir}/pop_ctr_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(cop_pop_ctr)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b196a95-62c1-449f-8e44-08bb81719750",
   "metadata": {},
   "source": [
    "## 5.0 Process Federal electoral districts (FEDs) (2013 Representation Order)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2240a33a-a100-440e-9c55-c2a3509523ea",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/98-401-X2021010_English_CSV_data.csv\n",
      "Concatenating all dataframes into one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FED - Number of Columns BEFORE: 7893\n",
      "Dropping columns that don't have values\n",
      "FED - Number of Columns AFTER: 7433\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021010_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS pop_ctr_2021;\n",
    "\n",
    "DROP TABLE IF EXISTS fed_2013;\n",
    "CREATE TABLE fed_2013 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2021_2013.parquet';\n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "fed_dguid = con.sql(\"SELECT * FROM fed_2013\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "\n",
    "del(fed_dguid)\n",
    "del(cop_df)\n",
    "gc.collect()\n",
    "\n",
    "# Convert columns to lowest dtypes\n",
    "cop_fed = convert_to_lowest_type(cop_fed)\n",
    "\n",
    "# Drop NA columns\n",
    "print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n",
    "drop_na_columns(cop_fed)\n",
    "print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n",
    "\n",
    "# Export\n",
    "cop_fed = cop_fed.reset_index()\n",
    "cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n",
    "cop_fed.to_parquet(path=f'{data_dir}/fed_2013.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(cop_fed)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dada67c8-8f4b-4795-a97b-3d68887fc582",
   "metadata": {},
   "source": [
    "## 6.0 Process Federal electoral districts (FEDs) (2023 Representation Order)\n",
    "There should be 343 2023 FEDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb3d4506-cf3a-446f-9ff8-2a8e408630a4",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/98-401-X2021029_English_CSV_data.csv\n",
      "Concatenating all dataframes into one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "31"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FED - Number of Columns BEFORE: 7894\n",
      "Dropping columns that don't have values\n",
      "FED - Number of Columns AFTER: 7427\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021029_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS fed_2013;\n",
    "\n",
    "/*\n",
    "DROP TABLE IF EXISTS fed_2023;\n",
    "CREATE TABLE fed_2023 AS SELECT fed_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fed_2023.parquet';\n",
    "*/\n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "#fed_dguid = con.sql(\"SELECT * FROM fed_2023\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "#cop_fed = cop_df.join(fed_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "cop_df = cop_df.reset_index()\n",
    "cop_df = cop_df[cop_df['dguid'].str.contains(\"2023\")]\n",
    "\n",
    "# Convert columns to lowest dtypes\n",
    "cop_fed = convert_to_lowest_type(cop_df)\n",
    "\n",
    "del(cop_df)\n",
    "gc.collect()\n",
    "\n",
    "# Drop NA columns\n",
    "print(f\"FED - Number of Columns BEFORE: {len(cop_fed.columns)}\")\n",
    "drop_na_columns(cop_fed)\n",
    "print(f\"FED - Number of Columns AFTER: {len(cop_fed.columns)}\")\n",
    "\n",
    "# Export\n",
    "cop_fed.rename(columns={'dguid': 'fed_dguid'}, inplace=True)\n",
    "cop_fed.to_parquet(path=f'{data_dir}/fed_2023.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(cop_fed)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3cf32961-f1a4-44e2-8676-bb10263190b9",
   "metadata": {},
   "source": [
    "## 7.0 Process Designated places (DPLs)\n",
    "There should be 1685 DPLs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "958a12c8-2d4d-474a-8af7-81a6b71e24d1",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/98-401-X2021011_English_CSV_data.csv\n",
      "Concatenating all dataframes into one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DPL - Number of Columns BEFORE: 7893\n",
      "Dropping columns that don't have values\n",
      "DPL - Number of Columns AFTER: 7433\n"
     ]
    }
   ],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021011_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS dpl_2021;\n",
    "CREATE TABLE dpl_2021 AS SELECT dpl_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/dpl_2021.parquet';\n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "dpl_dguid = con.sql(\"SELECT * FROM dpl_2021\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "cop_dpl = cop_df.join(dpl_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "\n",
    "del(dpl_dguid)\n",
    "del(cop_df)\n",
    "gc.collect()\n",
    "\n",
    "# Convert columns to lowest dtypes\n",
    "cop_dpl = convert_to_lowest_type(cop_dpl)\n",
    "\n",
    "# Drop NA columns\n",
    "print(f\"DPL - Number of Columns BEFORE: {len(cop_dpl.columns)}\")\n",
    "drop_na_columns(cop_dpl)\n",
    "print(f\"DPL - Number of Columns AFTER: {len(cop_dpl.columns)}\")\n",
    "\n",
    "# Export\n",
    "cop_dpl = cop_dpl.reset_index()\n",
    "cop_dpl.rename(columns={'dguid': 'dpl_dguid'}, inplace=True)\n",
    "cop_dpl.to_parquet(path=f'{data_dir}/dpl_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(cop_dpl)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bceb5f4a-4019-4d25-8671-b0854233e109",
   "metadata": {},
   "source": [
    "## 8.0 Process Aggregate dissemination areas (ADAs)\n",
    "There should be 5433 ADAs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c846535-aff6-434e-ada0-51e739d89d23",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/98-401-X2021012_English_CSV_data.csv\n",
      "Concatenating all dataframes into one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021012_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS dpl_2021;\n",
    "DROP TABLE IF EXISTS ada_2021;\n",
    "CREATE TABLE ada_2021 AS SELECT ada_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/ada_2021.parquet';\n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "ada_dguid = con.sql(\"SELECT * FROM ada_2021\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "cop_ada = cop_df.join(ada_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "\n",
    "del(ada_dguid)\n",
    "del(cop_df)\n",
    "gc.collect()\n",
    "\n",
    "# Convert columns to lowest dtypes\n",
    "cop_ada = convert_to_lowest_type(cop_ada)\n",
    "\n",
    "# Drop NA columns\n",
    "print(f\"ADA - Number of Columns BEFORE: {len(cop_ada.columns)}\")\n",
    "drop_na_columns(cop_ada)\n",
    "print(f\"ADA - Number of Columns AFTER: {len(cop_ada.columns)}\")\n",
    "\n",
    "# Export\n",
    "cop_ada = cop_ada.reset_index()\n",
    "cop_ada.rename(columns={'dguid': 'ada_dguid'}, inplace=True)\n",
    "cop_ada.to_parquet(path=f'{data_dir}/ada_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(cop_ada)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d66d8ae-e3ca-4014-961c-4af3c577880d",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "## 9.0 Process Forward sortation areas (FSAs)\n",
    "There should be 1643 FSAs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bdee5c1a-9914-42ab-b759-e7685d5e627c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing /data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/98-401-X2021013_English_CSV_data.csv\n",
      "Concatenating all dataframes into one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<duckdb.duckdb.DuckDBPyConnection at 0x7f2d643eedf0>"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "684"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FSA - Number of Columns BEFORE: 7893\n",
      "Dropping columns that don't have values\n",
      "FSA - Number of Columns AFTER: 7429\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021013_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS ada_2021;\n",
    "DROP TABLE IF EXISTS fsa_2021;\n",
    "CREATE TABLE fsa_2021 AS SELECT fsa_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/fsa_2021.parquet';\n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "fsa_dguid = con.sql(\"SELECT * FROM fsa_2021\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "\n",
    "del(fsa_dguid)\n",
    "del(cop_df)\n",
    "gc.collect()\n",
    "\n",
    "# Convert columns to lowest dtypes\n",
    "cop_fsa = convert_to_lowest_type(cop_fsa)\n",
    "\n",
    "# Drop NA columns\n",
    "print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n",
    "drop_na_columns(cop_fsa)\n",
    "print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n",
    "\n",
    "# Export\n",
    "cop_fsa = cop_fsa.reset_index()\n",
    "cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n",
    "cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(cop_fsa)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddb67445-e9dc-489f-b2cd-7a7969393d5a",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "## 10.0 Process Health regions (HRs) and Local health integration networks\n",
    "Start looking here https://www150.statcan.gc.ca/n1/en/catalogue/82-402-X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e4c3ddf-4279-4ae7-8a8a-f43ae97d59b4",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "csvs_to_process = glob.glob(\"/data/census_of_population/extracted/2021/98-401-X2021015_eng_CSV/*English_CSV_data*\")\n",
    "cop_df = process_cop_csv(csvs_to_process)\n",
    "\n",
    "# Get the dguid per level of geography\n",
    "con.sql(\"\"\"\n",
    "DROP TABLE IF EXISTS fsa_2021;\n",
    "DROP TABLE IF EXISTS hr_2022;\n",
    "CREATE TABLE hr_2022 AS SELECT hr_dguid AS dguid FROM 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/hr_2022.parquet';\n",
    "\n",
    "CREATE \n",
    "\"\"\")\n",
    "con.commit()\n",
    "\n",
    "# Convert the duckdb tables to pandas dataframe\n",
    "hr_dguid = con.sql(\"SELECT * FROM hr_2022\").to_df()\n",
    "\n",
    "# Join the Census of Population dataframe to each geographic level\n",
    "cop_fsa = cop_df.join(fsa_dguid.set_index('dguid'), on='dguid', how='inner')\n",
    "\n",
    "del(fsa_dguid)\n",
    "del(cop_df)\n",
    "gc.collect()\n",
    "\n",
    "# Convert columns to lowest dtypes\n",
    "cop_fsa = convert_to_lowest_type(cop_fsa)\n",
    "\n",
    "# Drop NA columns\n",
    "print(f\"FSA - Number of Columns BEFORE: {len(cop_fsa.columns)}\")\n",
    "drop_na_columns(cop_fsa)\n",
    "print(f\"FSA - Number of Columns AFTER: {len(cop_fsa.columns)}\")\n",
    "\n",
    "# Export\n",
    "cop_fsa = cop_fsa.reset_index()\n",
    "cop_fsa.rename(columns={'dguid': 'fsa_dguid'}, inplace=True)\n",
    "cop_fsa.to_parquet(path=f'{data_dir}/fsa_2021.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(cop_fsa)\n",
    "gc.collect()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}