d4c-datapkg-statistical/census_of_agriculture/process_2016.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "a06cc1c7-7826-4270-8c04-48ad4de90bc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "\n",
    "from IPython.core.interactiveshell import InteractiveShell  \n",
    "import geopandas as gpd\n",
    "from ordered_set import OrderedSet\n",
    "import pandas as pd\n",
    "\n",
    "# Echo the value of every bare expression in a cell, not only the last one;\n",
    "# the quick-check lines in later cells rely on this to show their result.\n",
    "InteractiveShell.ast_node_interactivity = 'all'\n",
    "# Never truncate wide frames when they are displayed\n",
    "pd.options.display.max_columns = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "6e4dad8c-afec-4bb6-aee2-20b9b9c5a7a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder the 2016 source files are read from, and folder the Parquet tables are written to\n",
    "input_data_dir = \"/data/census_of_agriculture/input/2016\"\n",
    "output_data_dir = \"/data/census_of_agriculture/output/2016/tabular\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c049bbce-8dcb-418c-b7cb-7015f920a39a",
   "metadata": {},
   "source": [
    "# 1.0 Process Excel sheet with column names and descriptions\n",
    "The combined columns of all the file geodatabase datasets should match the variables listed in this sheet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "0026e968-c4a4-4dc3-9f5c-cdf9706bb8e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Reading Excel sheet with variables\")\n",
    "\n",
    "# Load only the variable name and its long description (three rows are skipped\n",
    "# before the header), then switch to the snake_case column names used below.\n",
    "data_description = pd.read_excel(\n",
    "    f'{input_data_dir}/CEAG16_VariablesDescriptions_REAG16_EN_FR.xlsx',\n",
    "    skiprows=3,\n",
    "    usecols=['Variables', 'Long description of the variables'],\n",
    ").rename(columns={\n",
    "    'Variables': 'variables',\n",
    "    'Long description of the variables': 'description_en',\n",
    "})\n",
    "\n",
    "# Geodatabase column names are lowercased later on, so lowercase the variable names too\n",
    "data_description['variables'] = data_description['variables'].str.lower()\n",
    "\n",
    "# Collapse repeated (variable, description) pairs, e.g. opermore_n, into a single row each\n",
    "data_description = (\n",
    "    data_description\n",
    "    .groupby(['variables', 'description_en'])\n",
    "    .last()\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae758963-b639-44a9-9904-d7438594a729",
   "metadata": {},
   "source": [
    "# 2.0 Process Provinces and Territories\n",
    "## 2.1 Process Agricultural Operations\n",
    "**TODO:** \n",
    "- Mistakes:\n",
    "    - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "2b8467e8-b979-423c-856c-80242f9a8443",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'farms_n1',\n",
       " 'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "ao_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
    "ao_pr.columns = ao_pr.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "ao_pr = ao_pr.rename(columns={'geo_pruid': 'pr_dguid'})\n",
    "ao_pr['pr_dguid'] = '2016A0002' + ao_pr['pr_dguid']\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['pr_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(ao_pr.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(ao_pr.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and any other unlisted column\n",
    "ao_pr = ao_pr[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "ao_pr = ao_pr.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "54b1ab9d-68c7-4751-a17e-e5ad321fd72f",
   "metadata": {},
   "source": [
    "## 2.2 Process Crop Cultures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "0dfa08f0-2d8d-4279-bd22-2f127d596c3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "cc_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
    "cc_pr.columns = cc_pr.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "cc_pr = cc_pr.rename(columns={'geo_pruid': 'pr_dguid'})\n",
    "cc_pr['pr_dguid'] = '2016A0002' + cc_pr['pr_dguid']\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['pr_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(cc_pr.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(cc_pr.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and any other unlisted column\n",
    "cc_pr = cc_pr[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "cc_pr = cc_pr.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "076eb1e3-b319-47ff-b959-cae6dd600081",
   "metadata": {},
   "source": [
    "## 2.3 Process Farm Operators\n",
    "**TODO:** \n",
    "- Mistakes:\n",
    "    - Column `more_avg_a` should be called `more_avg_age`\n",
    "    - Column `one_avg_ag` should be called `one_avg_age`\n",
    "    - The Excel sheet lists `OPER_N` four times, each with the same definition\n",
    "        - In the dataset, this column appears five times, as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n",
    "    - `opermore_1` is not described in the Excel sheet, so its meaning is unknown\n",
    "    - Column `operone_n1` is a duplicate of `operone_n`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "de1dbde2-e820-438f-ae9d-b54455b10ac4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'oper_n1',\n",
       " 'oper_n2',\n",
       " 'oper_n3',\n",
       " 'oper_n4',\n",
       " 'opermore_1',\n",
       " 'operone_n1',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "fo_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
    "fo_pr.columns = fo_pr.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "fo_pr = fo_pr.rename(columns={'geo_pruid': 'pr_dguid'})\n",
    "fo_pr['pr_dguid'] = '2016A0002' + fo_pr['pr_dguid']\n",
    "\n",
    "# Give the two misnamed average-age columns the names they should have\n",
    "fo_pr = fo_pr.rename(columns={\n",
    "    'more_avg_a': 'more_avg_age',\n",
    "    'one_avg_ag': 'one_avg_age',\n",
    "})\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['pr_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(fo_pr.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(fo_pr.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and the unlisted duplicates\n",
    "fo_pr = fo_pr[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "fo_pr = fo_pr.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39161daf-5d6a-47e0-9211-0cc0137e2c9e",
   "metadata": {},
   "source": [
    "## 2.4 Process Livestock Poultry Bees"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "79731af0-56f2-4840-b903-49410512d410",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "lpb_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
    "lpb_pr.columns = lpb_pr.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "lpb_pr = lpb_pr.rename(columns={'geo_pruid': 'pr_dguid'})\n",
    "lpb_pr['pr_dguid'] = '2016A0002' + lpb_pr['pr_dguid']\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['pr_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(lpb_pr.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(lpb_pr.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and any other unlisted column\n",
    "lpb_pr = lpb_pr[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "lpb_pr = lpb_pr.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a33cfe2-4bbb-46a7-81fb-494a61663a86",
   "metadata": {},
   "source": [
    "## 2.5 Process Use Tenure Practices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "328daec4-f6de-42c4-b11d-5ba287b7a94d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "utp_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
    "utp_pr.columns = utp_pr.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "utp_pr = utp_pr.rename(columns={'geo_pruid': 'pr_dguid'})\n",
    "utp_pr['pr_dguid'] = '2016A0002' + utp_pr['pr_dguid']\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['pr_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(utp_pr.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(utp_pr.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and any other unlisted column\n",
    "utp_pr = utp_pr[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "utp_pr = utp_pr.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4963cfbb-71c3-4ce7-98f1-3c6ad3d90fde",
   "metadata": {},
   "source": [
    "## 2.6 Join the DataFrames and Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "11b5355a-2293-4751-a364-54a0963ed662",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr_en', 'geo_descr_fr'}"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'pr_dguid'}"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Merging all Province and Territories dataframes into one\")\n",
    "# Each theme table is expected to hold exactly one row per pr_dguid.\n",
    "# validate='one_to_one' makes pandas raise MergeError on duplicated keys\n",
    "# instead of silently multiplying rows in the joined frame.\n",
    "pr_merge = (\n",
    "    ao_pr\n",
    "    .merge(cc_pr, how='inner', on='pr_dguid', validate='one_to_one')\n",
    "    .merge(fo_pr, how='inner', on='pr_dguid', validate='one_to_one')\n",
    "    .merge(lpb_pr, how='inner', on='pr_dguid', validate='one_to_one')\n",
    "    .merge(utp_pr, how='inner', on='pr_dguid', validate='one_to_one')\n",
    ")\n",
    "\n",
    "# Compare the joined columns with the Excel variable list in both directions.\n",
    "# The two sets are still displayed, and are now also enforced before anything is exported.\n",
    "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n",
    "missing_cols = set(data_description['variables']) - set(pr_merge.columns)\n",
    "missing_cols\n",
    "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid\")\n",
    "extra_cols = set(pr_merge.columns) - set(data_description['variables'])\n",
    "extra_cols\n",
    "assert missing_cols == {'geo_descr_en', 'geo_descr_fr'}, f'Unexpected variables missing from dataframe: {missing_cols}'\n",
    "assert extra_cols == {'pr_dguid'}, f'Unexpected columns in dataframe: {extra_cols}'\n",
    "\n",
    "# Export\n",
    "print(\"Exporting pr_2016.parquet\")\n",
    "pr_merge.to_parquet(f'{output_data_dir}/pr_2016.parquet', index=False, compression='zstd')\n",
    "\n",
    "# Create country as well\n",
    "# TODO: check if -1 values subtracted from the sum\n",
    "# NOTE(review): every non-key column is summed as-is, so any -1 placeholders and any\n",
    "# per-region averages (e.g. more_avg_age / one_avg_age, if present) are added up too.\n",
    "# Confirm this is intended before relying on the country totals.\n",
    "country = pd.read_parquet(f'{output_data_dir}/pr_2016.parquet')\n",
    "# Give every row the same key so the groupby below collapses them into a single country row\n",
    "country['pr_dguid'] = '2016A000011124'\n",
    "country = country.rename(columns={'pr_dguid': 'country_dguid'})\n",
    "country = country.groupby(['country_dguid']).sum().reset_index()\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "country = country.convert_dtypes(**params)\n",
    "print(\"Exporting country_2016.parquet\")\n",
    "country.to_parquet(f'{output_data_dir}/country_2016.parquet', index=False, compression='zstd')\n",
    "\n",
    "# Release the province-level frames before the next geography is processed\n",
    "del ao_pr, cc_pr, fo_pr, lpb_pr, utp_pr, pr_merge, country\n",
    "del missing_cols, extra_cols\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd78f553-52b4-427e-bf40-7ee4101d65ac",
   "metadata": {},
   "source": [
    "# 3.0 Process Census Agricultural Regions"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ac1383f5-1340-4dd8-9346-0d21ab506886",
   "metadata": {},
   "source": [
    "## 3.1 Process Agricultural Operations\n",
    "**TODO:** \n",
    "- Mistakes:\n",
    "    - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "740de275-a602-4a18-9f7a-708db0e4e529",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'farms_n1',\n",
       " 'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "ao_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')\n",
    "ao_car.columns = ao_car.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "ao_car = ao_car.rename(columns={'geo_caruid': 'car_dguid'})\n",
    "ao_car['car_dguid'] = '2016S0501' + ao_car['car_dguid']\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['car_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(ao_car.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(ao_car.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and any other unlisted column\n",
    "ao_car = ao_car[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "ao_car = ao_car.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4254a8e-30cf-43a0-8a08-b7ac1ab9689c",
   "metadata": {},
   "source": [
    "## 3.2 Process Crop Cultures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "92b08284-0adb-43fd-a89f-19395afb9bd0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcar_000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "cc_car = gpd.read_file(dataset, layer='lcar_000b16a_ceag16_n')\n",
    "cc_car.columns = cc_car.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "cc_car = cc_car.rename(columns={'geo_caruid': 'car_dguid'})\n",
    "cc_car['car_dguid'] = '2016S0501' + cc_car['car_dguid']\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['car_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(cc_car.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(cc_car.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and any other unlisted column\n",
    "cc_car = cc_car[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "cc_car = cc_car.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "babc334e-47c0-435d-919c-2df54a70130a",
   "metadata": {},
   "source": [
    "## 3.3 Process Farm Operators\n",
    "**TODO:** \n",
    "- Mistakes:\n",
    "    - Column `more_avg_a` should be called `more_avg_age`\n",
    "    - Column `one_avg_ag` should be called `one_avg_age`\n",
    "    - The Excel sheet lists `OPER_N` four times, each with the same definition\n",
    "        - In the dataset, this column appears five times, as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n",
    "    - `opermore_1` is not described in the Excel sheet, so its meaning is unknown\n",
    "    - Column `operone_n1` is a duplicate of `operone_n`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "47afb1ab-f4a9-4581-8222-b4a52ff0eb4a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'oper_n1',\n",
       " 'oper_n2',\n",
       " 'oper_n3',\n",
       " 'oper_n4',\n",
       " 'opermore_1',\n",
       " 'operone_n1',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "fo_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')\n",
    "fo_car.columns = fo_car.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "fo_car = fo_car.rename(columns={'geo_caruid': 'car_dguid'})\n",
    "fo_car['car_dguid'] = '2016S0501' + fo_car['car_dguid']\n",
    "\n",
    "# Give the two misnamed average-age columns the names they should have\n",
    "fo_car = fo_car.rename(columns={\n",
    "    'more_avg_a': 'more_avg_age',\n",
    "    'one_avg_ag': 'one_avg_age',\n",
    "})\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['car_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(fo_car.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(fo_car.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and the unlisted duplicates\n",
    "fo_car = fo_car[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "fo_car = fo_car.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03bf9372-c2da-46b9-8da9-64c358cb51df",
   "metadata": {},
   "source": [
    "## 3.4 Process Livestock Poultry Bees"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "abf3ff3f-6fbc-482f-9ef8-f7ae6c289884",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "lpb_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')\n",
    "lpb_car.columns = lpb_car.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "lpb_car = lpb_car.rename(columns={'geo_caruid': 'car_dguid'})\n",
    "lpb_car['car_dguid'] = '2016S0501' + lpb_car['car_dguid']\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['car_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(lpb_car.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(lpb_car.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and any other unlisted column\n",
    "lpb_car = lpb_car[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "lpb_car = lpb_car.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3962c9c-3b5b-4fd8-a6ca-8e004159a521",
   "metadata": {},
   "source": [
    "## 3.5 Process Use Tenure Practices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "16678c91-7132-4ed2-981b-f60b0d81bc73",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcar_000b16a_ceag16_n feature class')\n",
    "\n",
    "# Read the feature class and normalise its column names to lowercase\n",
    "utp_car = gpd.read_file(dataset, layer='lcar_000b16a_ceag16_n')\n",
    "utp_car.columns = utp_car.columns.str.lower()\n",
    "\n",
    "# Build the dguid: rename the id column, then prepend the prefix to its values\n",
    "utp_car = utp_car.rename(columns={'geo_caruid': 'car_dguid'})\n",
    "utp_car['car_dguid'] = '2016S0501' + utp_car['car_dguid']\n",
    "\n",
    "# Candidate columns: the key first, followed by every variable from the Excel sheet\n",
    "variable_names = ['car_dguid', *data_description['variables']]\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(utp_car.columns).difference(variable_names)\n",
    "\n",
    "# Keep the candidate order, restricted to the columns this feature class actually has\n",
    "variable_names = list(\n",
    "    OrderedSet(variable_names) & OrderedSet(utp_car.columns)\n",
    ")\n",
    "\n",
    "# Selecting only those columns drops geometry, the shape columns and any other unlisted column\n",
    "utp_car = utp_car[variable_names]\n",
    "\n",
    "# Let pandas pick NA-aware dtypes, without converting to its string or boolean dtypes\n",
    "params = dict(convert_string=False, convert_boolean=False)\n",
    "utp_car = utp_car.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b99344d-edfc-4f51-ab9f-8a56100394b0",
   "metadata": {},
   "source": [
    "## 3.6 Join the DataFrames and Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "d1cfb3c1-c94e-4f4a-9c90-e61c43a791a3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr_en', 'geo_descr_fr'}"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'car_dguid'}"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Merging all Census Agricultural Regions dataframes into one\")\n",
    "# Each theme table is expected to hold exactly one row per car_dguid.\n",
    "# validate='one_to_one' makes pandas raise MergeError on duplicated keys\n",
    "# instead of silently multiplying rows in the joined frame.\n",
    "car_merge = (\n",
    "    ao_car\n",
    "    .merge(cc_car, how='inner', on='car_dguid', validate='one_to_one')\n",
    "    .merge(fo_car, how='inner', on='car_dguid', validate='one_to_one')\n",
    "    .merge(lpb_car, how='inner', on='car_dguid', validate='one_to_one')\n",
    "    .merge(utp_car, how='inner', on='car_dguid', validate='one_to_one')\n",
    ")\n",
    "\n",
    "# Compare the joined columns with the Excel variable list in both directions.\n",
    "# The two sets are still displayed, and are now also enforced before anything is exported.\n",
    "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n",
    "missing_cols = set(data_description['variables']) - set(car_merge.columns)\n",
    "missing_cols\n",
    "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid\")\n",
    "extra_cols = set(car_merge.columns) - set(data_description['variables'])\n",
    "extra_cols\n",
    "assert missing_cols == {'geo_descr_en', 'geo_descr_fr'}, f'Unexpected variables missing from dataframe: {missing_cols}'\n",
    "assert extra_cols == {'car_dguid'}, f'Unexpected columns in dataframe: {extra_cols}'\n",
    "\n",
    "# Export\n",
    "print(\"Exporting car_2016.parquet\")\n",
    "car_merge.to_parquet(f'{output_data_dir}/car_2016.parquet', index=False, compression='zstd')\n",
    "\n",
    "# Release the region-level frames before the next geography is processed\n",
    "del ao_car, cc_car, fo_car, lpb_car, utp_car, car_merge\n",
    "del missing_cols, extra_cols\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ec65522-e754-40b9-994d-e21c5bae88ac",
   "metadata": {},
   "source": [
    "# 4.0 Process Census Divisions\n",
    "## 4.1 Process Agricultural Operations\n",
    "**TODO:** \n",
    "- Mistakes:\n",
    "    - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "02f789b8-e3d8-4359-97b1-46be400bc019",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'farms_n1',\n",
       " 'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
    "\n",
    "ao_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "ao_cd.columns = [x.lower() for x in ao_cd.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "ao_cd['geo_cduid'] = '2016A0003' + ao_cd['geo_cduid']\n",
    "ao_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'cd_dguid')\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(ao_cd.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(ao_cd.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "ao_cd = ao_cd[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "ao_cd = ao_cd.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b22c32c1-bf09-42f6-ad9f-15e19e88f125",
   "metadata": {},
   "source": [
    "## 4.2 Process Crop Cultures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "05283ef4-d1a9-48aa-8b34-d11123e19dc1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
    "\n",
    "cc_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "cc_cd.columns = [x.lower() for x in cc_cd.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "cc_cd['geo_cduid'] = '2016A0003' + cc_cd['geo_cduid']\n",
    "cc_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'cd_dguid')\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(cc_cd.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(cc_cd.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "cc_cd = cc_cd[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "cc_cd = cc_cd.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52241e8d-b468-4dd6-99b9-342f9a6fde94",
   "metadata": {},
   "source": [
    "## 4.3 Process Farm Operators\n",
    "**TODO:** \n",
    "- Mistakes:\n",
    "    - Column `more_avg_a` should be called `more_avg_age`\n",
    "    - Column `one_avg_ag` should be called `one_avg_age`\n",
    "    - On the Excel sheet, there are four `OPER_N`, with the same definition\n",
    "        - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n",
    "    - No idea what `opermore_1` is supposed to be\n",
    "    - Column `operone_n1` is duplicate of `operone_n`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "cf5b26d1-7315-4e9d-85fe-473a68cc5f0f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'oper_n1',\n",
       " 'oper_n2',\n",
       " 'oper_n3',\n",
       " 'oper_n4',\n",
       " 'opermore_1',\n",
       " 'operone_n1',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
    "\n",
    "fo_cd = gpd.read_file(dataset, \n",
    "                      layer='lcd_000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "fo_cd.columns = [x.lower() for x in fo_cd.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "fo_cd['geo_cduid'] = '2016A0003' + fo_cd['geo_cduid']\n",
    "fo_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'cd_dguid')\n",
    "\n",
    "# Fix mistakes\n",
    "fo_cd.rename(columns={\n",
    "    'more_avg_a': 'more_avg_age',\n",
    "    'one_avg_ag': 'one_avg_age'}, inplace=True)\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(fo_cd.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(fo_cd.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "fo_cd = fo_cd[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "fo_cd = fo_cd.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f68f6fc6-f757-4f21-b32a-7070d4bcc3ec",
   "metadata": {},
   "source": [
    "## 4.4 Process Livestock Poultry Bees"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "e3cd38c6-6563-4067-8280-32277b00a33b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
    "\n",
    "lpb_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "lpb_cd.columns = [x.lower() for x in lpb_cd.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "lpb_cd['geo_cduid'] = '2016A0003' + lpb_cd['geo_cduid']\n",
    "lpb_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'cd_dguid')\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(lpb_cd.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(lpb_cd.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "lpb_cd = lpb_cd[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "lpb_cd = lpb_cd.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e06083ab-9a15-4e7f-82e1-721c9ee1d4f1",
   "metadata": {},
   "source": [
    "## 4.5 Process Use Tenure Practices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "efe9a9da-9f18-43f7-8ba7-5f2730743358",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
    "\n",
    "utp_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "utp_cd.columns = [x.lower() for x in utp_cd.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "utp_cd['geo_cduid'] = '2016A0003' + utp_cd['geo_cduid']\n",
    "utp_cd.rename(columns={'geo_cduid':'cd_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'cd_dguid')\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(utp_cd.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(utp_cd.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "utp_cd = utp_cd[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "utp_cd = utp_cd.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f2bf2f9-8638-441a-bd54-1e988861dc96",
   "metadata": {},
   "source": [
    "## 4.6 Join the DataFrames and Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "029991c3-c3c9-4047-89b6-db0b019f3261",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr_en', 'geo_descr_fr'}"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'cd_dguid'}"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Merging all Census Divisions dataframes into one\")\n",
    "cd_merge = ao_cd.merge(cc_cd, how='inner', on='cd_dguid') \\\n",
    "                .merge(fo_cd, how='inner', on='cd_dguid') \\\n",
    "                .merge(lpb_cd, how='inner', on='cd_dguid') \\\n",
    "                .merge(utp_cd, how='inner', on='cd_dguid')\n",
    "\n",
    "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n",
    "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n",
    "set(data_description['variables']) - set(cd_merge.columns)\n",
    "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\")\n",
    "set(cd_merge.columns) - set(data_description['variables'])\n",
    "\n",
    "# Export\n",
    "cd_merge.to_parquet(f'{output_data_dir}/cd_2016.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(ao_cd)\n",
    "del(cc_cd)\n",
    "del(fo_cd)\n",
    "del(lpb_cd)\n",
    "del(utp_cd)\n",
    "del(cd_merge)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5d3a2fd-3d8b-461f-9933-19c13bcd01ba",
   "metadata": {},
   "source": [
    "# 5.0 Process Consolidated Subdivisions\n",
    "## 5.1 Process Agricultural Operations\n",
    "**TODO:** \n",
    "- Mistakes:\n",
    "    - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "f0a5858b-4fbc-4993-a9ed-b17c1606c8bd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'farms_n1',\n",
       " 'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')\n",
    "\n",
    "ao_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "ao_ccs.columns = [x.lower() for x in ao_ccs.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "ao_ccs['ccsuid'] = '2016S0502' + ao_ccs['ccsuid']\n",
    "ao_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'ccs_dguid')\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(ao_ccs.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(ao_ccs.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "ao_ccs = ao_ccs[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "ao_ccs = ao_ccs.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d667b9b4-36fc-489d-9430-d8d099926d2f",
   "metadata": {},
   "source": [
    "## 5.2 Process Crop Cultures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "7352d371-cf2a-4846-b239-f11ed0f5dd66",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lccs_000b16a_ceag16_n feature class')\n",
    "\n",
    "cc_ccs = gpd.read_file(dataset, layer='lccs_000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "cc_ccs.columns = [x.lower() for x in cc_ccs.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "cc_ccs['geo_ccsuid'] = '2016S0502' + cc_ccs['geo_ccsuid']\n",
    "cc_ccs.rename(columns={'geo_ccsuid':'ccs_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'ccs_dguid')\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(cc_ccs.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(cc_ccs.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "cc_ccs = cc_ccs[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "cc_ccs = cc_ccs.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab1841c1-9ebb-41b4-ac04-a998c249b5aa",
   "metadata": {},
   "source": [
    "## 5.3 Process Farm Operators\n",
    "**TODO:** \n",
    "- Mistakes:\n",
    "    - Column `more_avg_a` should be called `more_avg_age`\n",
    "    - Column `one_avg_ag` should be called `one_avg_age`\n",
    "    - On the Excel sheet, there are four `OPER_N`, with the same definition\n",
    "        - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n",
    "    - No idea what `opermore_1` is supposed to be\n",
    "    - Column `operone_n1` is duplicate of `operone_n`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "c047128d-ba23-4701-9907-9199b0f68549",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'oper_n1',\n",
       " 'oper_n2',\n",
       " 'oper_n3',\n",
       " 'oper_n4',\n",
       " 'opermore_1',\n",
       " 'operone_n1',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')\n",
    "\n",
    "fo_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "fo_ccs.columns = [x.lower() for x in fo_ccs.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "fo_ccs['ccsuid'] = '2016S0502' + fo_ccs['ccsuid']\n",
    "fo_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'ccs_dguid')\n",
    "\n",
    "# Fix mistakes\n",
    "fo_ccs.rename(columns={\n",
    "    'more_avg_a': 'more_avg_age',\n",
    "    'one_avg_ag': 'one_avg_age'}, inplace=True)\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(fo_ccs.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(fo_ccs.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "fo_ccs = fo_ccs[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "fo_ccs = fo_ccs.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae7f58ae-f7b2-4eb3-b4b4-62e9605a290c",
   "metadata": {},
   "source": [
    "## 5.4 Process Livestock Poultry Bees"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "69e39b2b-7032-406b-9686-da84463dce5c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')\n",
    "\n",
    "lpb_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "lpb_ccs.columns = [x.lower() for x in lpb_ccs.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "lpb_ccs['ccsuid'] = '2016S0502' + lpb_ccs['ccsuid']\n",
    "lpb_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'ccs_dguid')\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(lpb_ccs.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(lpb_ccs.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "lpb_ccs = lpb_ccs[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "lpb_ccs = lpb_ccs.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7343f35-7a86-4cfd-ba10-d81d5347f276",
   "metadata": {},
   "source": [
    "## 5.5 Process Use Tenure Practices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "19b31cd4-37ae-4541-8e15-17e698256c98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quick check on columns that are on the geodataframe but not on the variables list\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr1',\n",
       " 'geo_descr_',\n",
       " 'geometry',\n",
       " 'shape_area',\n",
       " 'shape_leng',\n",
       " 'shape_length'}"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n",
    "\n",
    "print(f'Processing {dataset}, lccs_000b16a_ceag16_n feature class')\n",
    "\n",
    "utp_ccs = gpd.read_file(dataset, layer='lccs_000b16a_ceag16_n')\n",
    "\n",
    "# Lowercase column names\n",
    "utp_ccs.columns = [x.lower() for x in utp_ccs.columns]\n",
    "\n",
    "# Calculate dguid\n",
    "utp_ccs['ccsuid'] = '2016S0502' + utp_ccs['ccsuid']\n",
    "utp_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n",
    "\n",
    "# Select the variables\n",
    "variable_names = list(data_description['variables'])\n",
    "variable_names.insert(0, 'ccs_dguid')\n",
    "\n",
    "# Quick check\n",
    "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
    "set(utp_ccs.columns) -  set(variable_names)\n",
    "\n",
    "variable_names = list(OrderedSet(variable_names) & OrderedSet(utp_ccs.columns))\n",
    "\n",
    "# Get rid of the geometry column and shape area, length\n",
    "utp_ccs = utp_ccs[variable_names]\n",
    "\n",
    "# Convert to lowest data type\n",
    "params = {\n",
    "    'convert_string': False,\n",
    "    'convert_boolean': False\n",
    "}\n",
    "utp_ccs = utp_ccs.convert_dtypes(**params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aec1734e-6ade-4849-a160-8a849af15265",
   "metadata": {},
   "source": [
    "## 5.6 Join the DataFrames and Export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "11bd0ad8-a207-43ba-ad27-826e79e4b678",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'geo_descr_en', 'geo_descr_fr'}"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'ccs_dguid'}"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Merging all Census Consolidated Subdivisions dataframes into one\")\n",
    "ccs_merge = ao_ccs.merge(cc_ccs, how='inner', on='ccs_dguid') \\\n",
    "                  .merge(fo_ccs, how='inner', on='ccs_dguid') \\\n",
    "                  .merge(lpb_ccs, how='inner', on='ccs_dguid') \\\n",
    "                  .merge(utp_ccs, how='inner', on='ccs_dguid')\n",
    "\n",
    "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n",
    "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n",
    "set(data_description['variables']) - set(ccs_merge.columns)\n",
    "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\")\n",
    "set(ccs_merge.columns) - set(data_description['variables'])\n",
    "\n",
    "# Export\n",
    "print(\"Exporting ccs_2016.parquet\")\n",
    "ccs_merge.to_parquet(f'{output_data_dir}/ccs_2016.parquet', index=False, compression='zstd')\n",
    "\n",
    "del(ao_ccs)\n",
    "del(cc_ccs)\n",
    "del(fo_ccs)\n",
    "del(lpb_ccs)\n",
    "del(utp_ccs)\n",
    "del(ccs_merge)\n",
    "gc.collect()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}