{ "cells": [ { "cell_type": "code", "execution_count": 40, "id": "a06cc1c7-7826-4270-8c04-48ad4de90bc9", "metadata": {}, "outputs": [], "source": [ "import gc\n", "\n", "from IPython.core.interactiveshell import InteractiveShell \n", "import geopandas as gpd\n", "import pandas as pd\n", "\n", "# Enable multiple outputs per cell\n", "InteractiveShell.ast_node_interactivity = \"all\"\n", "# Show all columns\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "code", "execution_count": 41, "id": "6e4dad8c-afec-4bb6-aee2-20b9b9c5a7a5", "metadata": {}, "outputs": [], "source": [ "input_data_dir = '/data/census_of_agriculture/input/2021'\n", "output_data_dir = '/data/census_of_agriculture/output/2021/tabular'" ] }, { "cell_type": "markdown", "id": "c049bbce-8dcb-418c-b7cb-7015f920a39a", "metadata": {}, "source": [ "# 1.0 Process Excel sheet with column names and descriptions\n", "Taken together, the columns of all the file geodatabase datasets should match the variables listed in this sheet." ] }, { "cell_type": "code", "execution_count": 42, "id": "0026e968-c4a4-4dc3-9f5c-cdf9706bb8e9", "metadata": {}, "outputs": [], "source": [ "print(\"Reading Excel sheet with variables\")\n", "\n", "data_description = pd.read_excel(f'{input_data_dir}/CEAG21_VariablesDescriptions_REAG21_EN_FR.xlsx', skiprows=2,\n", " usecols=['2021 Variables', 'Categories', '2021 Long description of the variables (EN)'])\n", "data_description.rename(columns={'2021 Variables': 'variables', 'Categories': 'categories', '2021 Long description of the variables (EN)': 'description_en'}, inplace=True)\n", "data_description['variables'] = data_description['variables'].str.lower()" ] }, { "cell_type": "markdown", "id": "ae758963-b639-44a9-9904-d7438594a729", "metadata": {}, "source": [ "# 2.0 Process Provinces and Territories\n", "## 2.1 Process Agricultural Operations\n", "**TODO:** \n", "- Figure out the -1 values\n", "- Figure out why `valoeq` is float and not integer\n", " - It is float because Nova Scotia has a 
value of 82047377.0000004\n", "- Figure out why the data types are integer64" ] }, { "cell_type": "code", "execution_count": 43, "id": "2b8467e8-b979-423c-856c-80242f9a8443", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", "\n", "ao_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", "# Lowercase column names\n", "ao_pr.columns = [x.lower() for x in ao_pr.columns]\n", "\n", "# Calculate dguid\n", "ao_pr['pruid'] = '2021A0002' + ao_pr['pruid']\n", "ao_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", "\n", "# Select the variables for agricultural operations\n", "variable_names = list(data_description[data_description['categories'] == 'Agricultural operations']['variables'])\n", "variable_names.insert(0, 'pr_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(ao_pr.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "ao_pr = ao_pr[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "ao_pr = ao_pr.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "54b1ab9d-68c7-4751-a17e-e5ad321fd72f", "metadata": {}, "source": [ "## 2.2 Process Crop Cultures" ] }, { "cell_type": "code", "execution_count": 44, "id": "0dfa08f0-2d8d-4279-bd22-2f127d596c3f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": 
[ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'geometry',\n", " 'prename',\n", " 'prfname',\n", " 'shape_area',\n", " 'shape_area_1',\n", " 'shape_length',\n", " 'shape_length_1'}" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", "\n", "cc_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "cc_pr.columns = [x.lower() for x in cc_pr.columns]\n", "\n", "# Calculate dguid\n", "cc_pr['pruid'] = '2021A0002' + cc_pr['pruid']\n", "cc_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", "\n", "# Select the variables for crop cultures\n", "variable_names = list(data_description[data_description['categories'] == 'Crops']['variables'])\n", "variable_names.insert(0, 'pr_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(cc_pr.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "cc_pr = cc_pr[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "cc_pr = cc_pr.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "076eb1e3-b319-47ff-b959-cae6dd600081", "metadata": {}, "source": [ "## 2.3 Process Farm Operators" ] }, { "cell_type": "code", "execution_count": 45, "id": "de1dbde2-e820-438f-ae9d-b54455b10ac4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}" ] }, "execution_count": 45, "metadata": 
{}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", "\n", "fo_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "fo_pr.columns = [x.lower() for x in fo_pr.columns]\n", "\n", "# Calculate dguid\n", "fo_pr['pruid'] = '2021A0002' + fo_pr['pruid']\n", "fo_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", "\n", "# Select the variables for farm operators\n", "variable_names = list(data_description[data_description['categories'] == 'Farm operators']['variables'])\n", "variable_names.insert(0, 'pr_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(fo_pr.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "fo_pr = fo_pr[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "fo_pr = fo_pr.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "39161daf-5d6a-47e0-9211-0cc0137e2c9e", "metadata": {}, "source": [ "## 2.4 Process Livestock Poultry Bees" ] }, { "cell_type": "code", "execution_count": 46, "id": "79731af0-56f2-4840-b903-49410512d410", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", "\n", "lpb_pr = 
gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "lpb_pr.columns = [x.lower() for x in lpb_pr.columns]\n", "\n", "# Calculate dguid\n", "lpb_pr['pruid'] = '2021A0002' + lpb_pr['pruid']\n", "lpb_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", "\n", "# Select the variables for livestock poultry bees\n", "variable_names = list(data_description[data_description['categories'] == 'Livestock Poultry Bees']['variables'])\n", "variable_names.insert(0, 'pr_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(lpb_pr.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "lpb_pr = lpb_pr[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "lpb_pr = lpb_pr.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "6a33cfe2-4bbb-46a7-81fb-494a61663a86", "metadata": {}, "source": [ "## 2.5 Process Use Tenure Practices" ] }, { "cell_type": "code", "execution_count": 47, "id": "328daec4-f6de-42c4-b11d-5ba287b7a94d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'geometry', 'prename', 'prfname', 'shape_area', 'shape_length'}" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lpr_000b21a_e_ceag21_n feature class')\n", "\n", "utp_pr = gpd.read_file(dataset, layer='lpr_000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "utp_pr.columns = [x.lower() for x in utp_pr.columns]\n", "\n", "# Calculate dguid\n", "utp_pr['pruid'] = '2021A0002' + 
utp_pr['pruid']\n", "utp_pr.rename(columns={'pruid':'pr_dguid'}, inplace=True)\n", "\n", "# Select the variables for tenure practices\n", "variable_names = list(data_description[data_description['categories'] == 'Use Tenure Practices']['variables'])\n", "variable_names.insert(0, 'pr_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(utp_pr.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "utp_pr = utp_pr[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "utp_pr = utp_pr.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "4963cfbb-71c3-4ce7-98f1-3c6ad3d90fde", "metadata": {}, "source": [ "## 2.6 Join the DataFrames and Export" ] }, { "cell_type": "code", "execution_count": 48, "id": "11b5355a-2293-4751-a364-54a0963ed662", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" ] }, { "data": { "text/plain": [ "{'geo_descr_en', 'geo_descr_fr'}" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be pr_dguid\n" ] }, { "data": { "text/plain": [ "{'pr_dguid'}" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "308" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Merging all Province and Territories dataframes into one\")\n", "# validate='one_to_one' makes pandas raise MergeError when pr_dguid is duplicated in either frame,\n", "# instead of silently multiplying rows in the merged table\n", "pr_merge = ao_pr.merge(cc_pr, how='inner', on='pr_dguid', validate='one_to_one') \\\n", " .merge(fo_pr, how='inner', on='pr_dguid', validate='one_to_one') \\\n", " .merge(lpb_pr, how='inner', on='pr_dguid', validate='one_to_one') \\\n", " .merge(utp_pr, how='inner', on='pr_dguid', validate='one_to_one')\n", "\n", "# Check that the expected columns are present. The only difference should be geo_descr_en and geo_descr_fr\n", "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", "set(data_description['variables']) - set(pr_merge.columns)\n", "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be pr_dguid\")\n", "set(pr_merge.columns) - set(data_description['variables'])\n", "\n", "# Export\n", "print(\"Exporting pr_2021.parquet\")\n", "pr_merge.to_parquet(f'{output_data_dir}/pr_2021.parquet', index=False, compression='zstd')\n", "\n", "# Create country as well: every province/territory row gets the same key, then all columns are summed\n", "# TODO: check if -1 values subtracted from the sum\n", "# NOTE(review): sum() treats -1 like any other number, so each -1 lowers the national total by one\n", "# NOTE(review): averages and medians (e.g. more_avg_age, more_med_age) are summed too - confirm this is intended\n", "country = pd.read_parquet(f'{output_data_dir}/pr_2021.parquet')\n", "country['pr_dguid'] = '2021A000011124'\n", "country.rename(columns={'pr_dguid': 'country_dguid'}, inplace=True)\n", "country = country.groupby(['country_dguid']).sum()\n", "country.reset_index(inplace=True)\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "country = country.convert_dtypes(**params)\n", "print(\"Exporting country_2021.parquet\")\n", "country.to_parquet(f'{output_data_dir}/country_2021.parquet', index=False, compression='zstd')\n", "\n", "# Release the province-level frames before the next geography level is loaded\n", "del ao_pr, cc_pr, fo_pr, lpb_pr, utp_pr, pr_merge, country\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "bd78f553-52b4-427e-bf40-7ee4101d65ac", "metadata": {}, "source": [ "# 3.0 Process Census Agricultural Regions" ] }, { "cell_type": "markdown", "id": "ac1383f5-1340-4dd8-9346-0d21ab506886", "metadata": {}, "source": [ "## 3.1 Process Agricultural Operations" ] }, { "cell_type": "code", "execution_count": 49, "id": "740de275-a602-4a18-9f7a-708db0e4e529", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, 
lcar000b21a_e_ceag21_n feature class')\n", "\n", "ao_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "ao_car.columns = [x.lower() for x in ao_car.columns]\n", "\n", "# Calculate dguid\n", "ao_car['caruid'] = '2021S0501' + ao_car['caruid']\n", "ao_car.rename(columns={'caruid':'car_dguid'}, inplace=True)\n", "\n", "# Select the variables for agricultural operations\n", "variable_names = list(data_description[data_description['categories'] == 'Agricultural operations']['variables'])\n", "variable_names.insert(0, 'car_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(ao_car.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "ao_car = ao_car[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "ao_car = ao_car.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "c4254a8e-30cf-43a0-8a08-b7ac1ab9689c", "metadata": {}, "source": [ "## 3.2 Process Crop Cultures" ] }, { "cell_type": "code", "execution_count": 50, "id": "92b08284-0adb-43fd-a89f-19395afb9bd0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')\n", "\n", "cc_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "cc_car.columns = [x.lower() for x in cc_car.columns]\n", "\n", "# Calculate dguid\n", "cc_car['caruid'] = 
'2021S0501' + cc_car['caruid']\n", "cc_car.rename(columns={'caruid':'car_dguid'}, inplace=True)\n", "\n", "# Select the variables for crop cultures\n", "variable_names = list(data_description[data_description['categories'] == 'Crops']['variables'])\n", "variable_names.insert(0, 'car_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(cc_car.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "cc_car = cc_car[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "cc_car = cc_car.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "babc334e-47c0-435d-919c-2df54a70130a", "metadata": {}, "source": [ "## 3.3 Process Farm Operators\n", "Census of Agriculture release made a mistake in this file:\n", "- `more_avg_age` is now `more_avg_a` in the file\n", "- `more_med_age` is now `more_med_a` in the file\n", "- `one_avg_age` is now `one_avg_ag` in the file\n", "- `one_med_age` is now `one_med_ag` in the file\n", "- `plan_nodis_n` is now `plan_nodis` in the file" ] }, { "cell_type": "code", "execution_count": 51, "id": "47afb1ab-f4a9-4581-8222-b4a52ff0eb4a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'carename',\n", " 'carfname',\n", " 'geometry',\n", " 'shape_area',\n", " 'shape_leng',\n", " 'shape_length'}" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')\n", "\n", "fo_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", 
"fo_car.columns = [x.lower() for x in fo_car.columns]\n", "\n", "# Calculate dguid\n", "fo_car['caruid'] = '2021S0501' + fo_car['caruid']\n", "fo_car.rename(columns={'caruid':'car_dguid'}, inplace=True)\n", "\n", "# Select the variables for farm operators\n", "variable_names = list(data_description[data_description['categories'] == 'Farm operators']['variables'])\n", "variable_names.insert(0, 'car_dguid')\n", "\n", "# Rename mistakes\n", "fo_car.rename(columns={\n", " 'more_avg_a': 'more_avg_age',\n", " 'more_med_a': 'more_med_age',\n", " 'one_avg_ag': 'one_avg_age',\n", " 'one_med_ag': 'one_med_age',\n", " 'plan_nodis': 'plan_nodis_n'\n", "}, inplace=True)\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(fo_car.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "fo_car = fo_car[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "fo_car = fo_car.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "03bf9372-c2da-46b9-8da9-64c358cb51df", "metadata": {}, "source": [ "## 3.4 Process Livestock Poultry Bees" ] }, { "cell_type": "code", "execution_count": 52, "id": "abf3ff3f-6fbc-482f-9ef8-f7ae6c289884", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')\n", "\n", "lpb_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", "\n", "# Lowercase 
column names\n", "lpb_car.columns = [x.lower() for x in lpb_car.columns]\n", "\n", "# Calculate dguid\n", "lpb_car['caruid'] = '2021S0501' + lpb_car['caruid']\n", "lpb_car.rename(columns={'caruid':'car_dguid'}, inplace=True)\n", "\n", "# Select the variables for livestock poultry bees\n", "variable_names = list(data_description[data_description['categories'] == 'Livestock Poultry Bees']['variables'])\n", "variable_names.insert(0, 'car_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(lpb_car.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "lpb_car = lpb_car[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "lpb_car = lpb_car.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "a3962c9c-3b5b-4fd8-a6ca-8e004159a521", "metadata": {}, "source": [ "## 3.5 Process Use Tenure Practices" ] }, { "cell_type": "code", "execution_count": 53, "id": "16678c91-7132-4ed2-981b-f60b0d81bc73", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'carename', 'carfname', 'geometry', 'shape_area', 'shape_length'}" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lcar000b21a_e_ceag21_n feature class')\n", "\n", "utp_car = gpd.read_file(dataset, layer='lcar000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "utp_car.columns = [x.lower() for x in utp_car.columns]\n", "\n", "# Calculate dguid\n", "utp_car['caruid'] = '2021S0501' + utp_car['caruid']\n", "utp_car.rename(columns={'caruid':'car_dguid'}, 
inplace=True)\n", "\n", "# Select the variables for tenure practices\n", "variable_names = list(data_description[data_description['categories'] == 'Use Tenure Practices']['variables'])\n", "variable_names.insert(0, 'car_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(utp_car.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "utp_car = utp_car[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "utp_car = utp_car.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "9b99344d-edfc-4f51-ab9f-8a56100394b0", "metadata": {}, "source": [ "## 3.6 Join the DataFrames and Export" ] }, { "cell_type": "code", "execution_count": 54, "id": "d1cfb3c1-c94e-4f4a-9c90-e61c43a791a3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" ] }, { "data": { "text/plain": [ "{'geo_descr_en', 'geo_descr_fr'}" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be car_dguid\n" ] }, { "data": { "text/plain": [ "{'car_dguid'}" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Merging all Census Agricultural Regions dataframes into one\")\n", "# validate='one_to_one' makes pandas raise MergeError when car_dguid is duplicated in either frame,\n", "# instead of silently multiplying rows in the merged table\n", "car_merge = ao_car.merge(cc_car, how='inner', on='car_dguid', validate='one_to_one') \\\n", " .merge(fo_car, how='inner', on='car_dguid', validate='one_to_one') \\\n", " .merge(lpb_car, how='inner', on='car_dguid', validate='one_to_one') \\\n", " .merge(utp_car, how='inner', on='car_dguid', validate='one_to_one')\n", "\n", "# Check that the expected columns are present. The only difference should be geo_descr_en and geo_descr_fr\n", "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", "set(data_description['variables']) - set(car_merge.columns)\n", "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be car_dguid\")\n", "set(car_merge.columns) - set(data_description['variables'])\n", "\n", "# Export\n", "print(\"Exporting car_2021.parquet\")\n", "car_merge.to_parquet(f'{output_data_dir}/car_2021.parquet', index=False, compression='zstd')\n", "\n", "# Release the region-level frames before the next geography level is loaded\n", "del ao_car, cc_car, fo_car, lpb_car, utp_car, car_merge\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "5ec65522-e754-40b9-994d-e21c5bae88ac", "metadata": {}, "source": [ "# 4.0 Process Census Divisions\n", "## 4.1 Process Agricultural Operations" ] }, { "cell_type": "code", "execution_count": 55, "id": "02f789b8-e3d8-4359-97b1-46be400bc019", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", "\n", "ao_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "ao_cd.columns = [x.lower() for x in ao_cd.columns]\n", "\n", "# Calculate dguid\n", "ao_cd['cduid'] = '2021A0003' + ao_cd['cduid']\n", "ao_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", "\n", "# Select the variables for agricultural operations\n", "variable_names = list(data_description[data_description['categories'] == 'Agricultural operations']['variables'])\n", "variable_names.insert(0, 'cd_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(ao_cd.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "ao_cd = 
ao_cd[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "ao_cd = ao_cd.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "b22c32c1-bf09-42f6-ad9f-15e19e88f125", "metadata": {}, "source": [ "## 4.2 Process Crop Cultures" ] }, { "cell_type": "code", "execution_count": 56, "id": "05283ef4-d1a9-48aa-8b34-d11123e19dc1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", "\n", "cc_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "cc_cd.columns = [x.lower() for x in cc_cd.columns]\n", "\n", "# Calculate dguid\n", "cc_cd['cduid'] = '2021A0003' + cc_cd['cduid']\n", "cc_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", "\n", "# Select the variables for crop cultures\n", "variable_names = list(data_description[data_description['categories'] == 'Crops']['variables'])\n", "variable_names.insert(0, 'cd_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(cc_cd.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "cc_cd = cc_cd[variable_names]\n", "\n", "# Convert to pandas nullable (pd.NA-aware) dtypes; convert_dtypes does not downcast bit widths\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "cc_cd = cc_cd.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "52241e8d-b468-4dd6-99b9-342f9a6fde94", "metadata": {}, "source": [ "## 
4.3 Process Farm Operators\n", "Census of Agriculture release made a mistake in this file:\n", "- `more_avg_age` is now `more_avg_a` in the file\n", "- `more_med_age` is now `more_med_a` in the file\n", "- `one_avg_age` is now `one_avg_ag` in the file\n", "- `one_med_age` is now `one_med_ag` in the file\n", "- `plan_nodis_n` is now `plan_nodis` in the file" ] }, { "cell_type": "code", "execution_count": 57, "id": "cf5b26d1-7315-4e9d-85fe-473a68cc5f0f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_leng', 'shape_length'}" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", "\n", "fo_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "fo_cd.columns = [x.lower() for x in fo_cd.columns]\n", "\n", "# Calculate dguid\n", "fo_cd['cduid'] = '2021A0003' + fo_cd['cduid']\n", "fo_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", "\n", "# Select the variables for farm operators\n", "variable_names = list(data_description[data_description['categories'] == 'Farm operators']['variables'])\n", "variable_names.insert(0, 'cd_dguid')\n", "\n", "# Rename mistakes\n", "fo_cd.rename(columns={\n", " 'more_avg_a': 'more_avg_age',\n", " 'more_med_a': 'more_med_age',\n", " 'one_avg_ag': 'one_avg_age',\n", " 'one_med_ag': 'one_med_age',\n", " 'plan_nodis': 'plan_nodis_n'\n", "}, inplace=True)\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(fo_cd.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape 
area, length\n", "fo_cd = fo_cd[variable_names]\n", "\n", "# Convert to lowest data type\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "fo_cd = fo_cd.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "f68f6fc6-f757-4f21-b32a-7070d4bcc3ec", "metadata": {}, "source": [ "## 4.4 Process Livestock Poultry Bees" ] }, { "cell_type": "code", "execution_count": 58, "id": "e3cd38c6-6563-4067-8280-32277b00a33b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", "\n", "lpb_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "lpb_cd.columns = [x.lower() for x in lpb_cd.columns]\n", "\n", "# Calculate dguid\n", "lpb_cd['cduid'] = '2021A0003' + lpb_cd['cduid']\n", "lpb_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", "\n", "# Select the variables for livestock poultry bees\n", "variable_names = list(data_description[data_description['categories'] == 'Livestock Poultry Bees']['variables'])\n", "variable_names.insert(0, 'cd_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(lpb_cd.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "lpb_cd = lpb_cd[variable_names]\n", "\n", "# Convert to lowest data type\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "lpb_cd = lpb_cd.convert_dtypes(**params)" ] }, { "cell_type": 
"markdown", "id": "e06083ab-9a15-4e7f-82e1-721c9ee1d4f1", "metadata": {}, "source": [ "## 4.5 Process Use Tenure Practices" ] }, { "cell_type": "code", "execution_count": 59, "id": "efe9a9da-9f18-43f7-8ba7-5f2730743358", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'cdname', 'drnom', 'geometry', 'shape_area', 'shape_length'}" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lcd000b21a_e_ceag21_n feature class')\n", "\n", "utp_cd = gpd.read_file(dataset, layer='lcd000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "utp_cd.columns = [x.lower() for x in utp_cd.columns]\n", "\n", "# Calculate dguid\n", "utp_cd['cduid'] = '2021A0003' + utp_cd['cduid']\n", "utp_cd.rename(columns={'cduid':'cd_dguid'}, inplace=True)\n", "\n", "# Select the variables for tenure practices\n", "variable_names = list(data_description[data_description['categories'] == 'Use Tenure Practices']['variables'])\n", "variable_names.insert(0, 'cd_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(utp_cd.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "utp_cd = utp_cd[variable_names]\n", "\n", "# Convert to pandas nullable dtypes (e.g. Int64); widths are not downcast, strings and booleans are left as-is\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "utp_cd = utp_cd.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "1f2bf2f9-8638-441a-bd54-1e988861dc96", "metadata": {}, "source": [ "## 4.6 Join the DataFrames and Export" ] }, { "cell_type": "code", "execution_count": 60, "id": "029991c3-c3c9-4047-89b6-db0b019f3261", "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" ] }, { "data": { "text/plain": [ "{'geo_descr_en', 'geo_descr_fr'}" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\n" ] }, { "data": { "text/plain": [ "{'cd_dguid'}" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Merging all Census Divisions dataframes into one\")\n", "cd_merge = ao_cd.merge(cc_cd, how='inner', on='cd_dguid') \\\n", " .merge(fo_cd, how='inner', on='cd_dguid') \\\n", " .merge(lpb_cd, how='inner', on='cd_dguid') \\\n", " .merge(utp_cd, how='inner', on='cd_dguid')\n", "\n", "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n", "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", "set(data_description['variables']) - set(cd_merge.columns)\n", "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be cd_dguid\")\n", "set(cd_merge.columns) - set(data_description['variables'])\n", "\n", "# Export\n", "print(\"Exporting cd_2021.parquet\")\n", "cd_merge.to_parquet(f'{output_data_dir}/cd_2021.parquet', index=False, compression='zstd')\n", "\n", "del(ao_cd)\n", "del(cc_cd)\n", "del(fo_cd)\n", "del(lpb_cd)\n", "del(utp_cd)\n", "del(cd_merge)\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "c5d3a2fd-3d8b-461f-9933-19c13bcd01ba", "metadata": {}, "source": [ "# 5.0 Process Consolidated Subdivisions\n", "## 5.1 Process Agricultural Operations" ] }, { "cell_type": "code", "execution_count": 61, "id": "f0a5858b-4fbc-4993-a9ed-b17c1606c8bd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_AgriculturalOperations_ExploitationsAgricoles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", "\n", "ao_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "ao_ccs.columns = [x.lower() for x in ao_ccs.columns]\n", "\n", "# Calculate dguid\n", "ao_ccs['ccsuid'] = '2021S0502' + ao_ccs['ccsuid']\n", "ao_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", "\n", "# Select the variables for agricultural operations\n", "variable_names = list(data_description[data_description['categories'] == 'Agricultural operations']['variables'])\n", "variable_names.insert(0, 'ccs_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(ao_ccs.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", 
"ao_ccs = ao_ccs[variable_names]\n", "\n", "# Convert to pandas nullable dtypes (e.g. Int64); widths are not downcast, strings and booleans are left as-is\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "ao_ccs = ao_ccs.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "d667b9b4-36fc-489d-9430-d8d099926d2f", "metadata": {}, "source": [ "## 5.2 Process Crop Cultures" ] }, { "cell_type": "code", "execution_count": 62, "id": "7352d371-cf2a-4846-b239-f11ed0f5dd66", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_Crops_Cultures_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", "\n", "cc_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "cc_ccs.columns = [x.lower() for x in cc_ccs.columns]\n", "\n", "# Calculate dguid\n", "cc_ccs['ccsuid'] = '2021S0502' + cc_ccs['ccsuid']\n", "cc_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", "\n", "# Select the variables for crops\n", "variable_names = list(data_description[data_description['categories'] == 'Crops']['variables'])\n", "variable_names.insert(0, 'ccs_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(cc_ccs.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "cc_ccs = cc_ccs[variable_names]\n", "\n", "# Convert to pandas nullable dtypes (e.g. Int64); widths are not downcast, strings and booleans are left as-is\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "cc_ccs = cc_ccs.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "ab1841c1-9ebb-41b4-ac04-a998c249b5aa", 
"metadata": {}, "source": [ "## 5.3 Process Farm Operators" ] }, { "cell_type": "code", "execution_count": 63, "id": "c047128d-ba23-4701-9907-9199b0f68549", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_FarmOperators_ExploitantsAgricoles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", "\n", "fo_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "fo_ccs.columns = [x.lower() for x in fo_ccs.columns]\n", "\n", "# Calculate dguid\n", "fo_ccs['ccsuid'] = '2021S0502' + fo_ccs['ccsuid']\n", "fo_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", "\n", "# Select the variables for farm operators\n", "variable_names = list(data_description[data_description['categories'] == 'Farm operators']['variables'])\n", "variable_names.insert(0, 'ccs_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(fo_ccs.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "fo_ccs = fo_ccs[variable_names]\n", "\n", "# Convert to pandas nullable dtypes (e.g. Int64); widths are not downcast, strings and booleans are left as-is\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "fo_ccs = fo_ccs.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "ae7f58ae-f7b2-4eb3-b4b4-62e9605a290c", "metadata": {}, "source": [ "## 5.4 Process Livestock Poultry Bees" ] }, { "cell_type": "code", "execution_count": 64, "id": "69e39b2b-7032-406b-9686-da84463dce5c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_LivestockPoultryBees_BetailVolailleAbeilles_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", "\n", "lpb_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "lpb_ccs.columns = [x.lower() for x in lpb_ccs.columns]\n", "\n", "# Calculate dguid\n", "lpb_ccs['ccsuid'] = '2021S0502' + lpb_ccs['ccsuid']\n", "lpb_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", "\n", "\n", "# Select the variables for livestock poultry bees\n", "variable_names = list(data_description[data_description['categories'] == 'Livestock Poultry Bees']['variables'])\n", "variable_names.insert(0, 'ccs_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(lpb_ccs.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "lpb_ccs = lpb_ccs[variable_names]\n", "\n", "# Convert to pandas nullable dtypes (e.g. Int64); widths are not downcast, strings and booleans are left as-is\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "lpb_ccs = lpb_ccs.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "d7343f35-7a86-4cfd-ba10-d81d5347f276", "metadata": {}, "source": [ "## 5.5 Process Use Tenure Practices" ] }, { "cell_type": "code", "execution_count": 65, "id": "19b31cd4-37ae-4541-8e15-17e698256c98", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quick check on columns that are on the geodataframe but not on the variables list\n" ] }, { "data": { "text/plain": [ "{'ccsname', 'geometry', 'shape_area', 'shape_length', 'srunom'}" ] }, "execution_count": 65, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = f'{input_data_dir}/CEAG21_UseTenurePractices_UtilisationOccupationPratiques_REAG21.gdb.zip'\n", "\n", "print(f'Processing {dataset}, lccs000b21a_e_ceag21_n feature class')\n", "\n", "utp_ccs = gpd.read_file(dataset, layer='lccs000b21a_e_ceag21_n')\n", "\n", "# Lowercase column names\n", "utp_ccs.columns = [x.lower() for x in utp_ccs.columns]\n", "\n", "# Calculate dguid\n", "utp_ccs['ccsuid'] = '2021S0502' + utp_ccs['ccsuid']\n", "utp_ccs.rename(columns={'ccsuid':'ccs_dguid'}, inplace=True)\n", "\n", "# Select the variables for tenure practices\n", "variable_names = list(data_description[data_description['categories'] == 'Use Tenure Practices']['variables'])\n", "variable_names.insert(0, 'ccs_dguid')\n", "\n", "# Quick check\n", "print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n", "set(utp_ccs.columns) - set(variable_names)\n", "\n", "# Get rid of the geometry column and shape area, length\n", "utp_ccs = utp_ccs[variable_names]\n", "\n", "# Convert to pandas nullable dtypes (e.g. Int64); widths are not downcast, strings and booleans are left as-is\n", "params = {\n", " 'convert_string': False,\n", " 'convert_boolean': False\n", "}\n", "utp_ccs = utp_ccs.convert_dtypes(**params)" ] }, { "cell_type": "markdown", "id": "aec1734e-6ade-4849-a160-8a849af15265", "metadata": {}, "source": [ "## 5.6 Join the DataFrames and Export" ] }, { "cell_type": "code", "execution_count": 66, "id": "11bd0ad8-a207-43ba-ad27-826e79e4b678", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n" ] }, { "data": { "text/plain": [ "{'geo_descr_en', 'geo_descr_fr'}" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "Checking differences between columns in dataframe and variables defined in Excel sheet. 
Only difference should be cd_dguid\n" ] }, { "data": { "text/plain": [ "{'ccs_dguid'}" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Merging all Census Consolidated Subdivisions dataframes into one\")\n", "# Inner joins on ccs_dguid: a subdivision missing from any one dataset is dropped from the merged result\n", "ccs_merge = ao_ccs.merge(cc_ccs, how='inner', on='ccs_dguid') \\\n", " .merge(fo_ccs, how='inner', on='ccs_dguid') \\\n", " .merge(lpb_ccs, how='inner', on='ccs_dguid') \\\n", " .merge(utp_ccs, how='inner', on='ccs_dguid')\n", "\n", "# Check that there are expected number of columns. The only difference should be geo_descr_en and geo_descr_fr\n", "print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n", "set(data_description['variables']) - set(ccs_merge.columns)\n", "# Reverse check: the join key ccs_dguid is the only merged column that is not listed in the Excel sheet\n", "print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be ccs_dguid\")\n", "set(ccs_merge.columns) - set(data_description['variables'])\n", "\n", "# Export\n", "print(\"Exporting ccs_2021.parquet\")\n", "ccs_merge.to_parquet(f'{output_data_dir}/ccs_2021.parquet', index=False, compression='zstd')\n", "\n", "# Drop the per-dataset frames and the merged frame, then force a garbage collection pass\n", "del(ao_ccs)\n", "del(cc_ccs)\n", "del(fo_ccs)\n", "del(lpb_ccs)\n", "del(utp_ccs)\n", "del(ccs_merge)\n", "gc.collect()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }