Files
d4c-datapkg-statistical/census_of_agriculture/process_2016.ipynb
T
Diego Ripley f93e4d0cec Initial commit
2025-05-24 13:37:31 -04:00

1980 lines
59 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 65,
"id": "a06cc1c7-7826-4270-8c04-48ad4de90bc9",
"metadata": {},
"outputs": [],
"source": [
"import gc\n",
"\n",
"from IPython.core.interactiveshell import InteractiveShell \n",
"import geopandas as gpd\n",
"from ordered_set import OrderedSet\n",
"import pandas as pd\n",
"\n",
"# Display the value of every bare expression in a cell, not only the last one\n",
"InteractiveShell.ast_node_interactivity = \"all\"\n",
"# Never truncate the column list when a DataFrame is displayed\n",
"pd.options.display.max_columns = None"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "6e4dad8c-afec-4bb6-aee2-20b9b9c5a7a5",
"metadata": {},
"outputs": [],
"source": [
"# Folder the 2016 source files are read from, and folder the tabular outputs are written to\n",
"input_data_dir, output_data_dir = (\n",
"    '/data/census_of_agriculture/input/2016',\n",
"    '/data/census_of_agriculture/output/2016/tabular',\n",
")"
]
},
{
"cell_type": "markdown",
"id": "c049bbce-8dcb-418c-b7cb-7015f920a39a",
"metadata": {},
"source": [
"# 1.0 Process Excel sheet with column names and descriptions\n",
"Taken together, the columns of all the file geodatabase datasets should match the variables listed in this sheet."
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "0026e968-c4a4-4dc3-9f5c-cdf9706bb8e9",
"metadata": {},
"outputs": [],
"source": [
"print(\"Reading Excel sheet with variables\")\n",
"\n",
"# The first three rows of the sheet are skipped; only the variable code and its\n",
"# long description are read, renamed, and the codes lowercased\n",
"data_description = (\n",
"    pd.read_excel(\n",
"        f'{input_data_dir}/CEAG16_VariablesDescriptions_REAG16_EN_FR.xlsx',\n",
"        skiprows=3,\n",
"        usecols=['Variables', 'Long description of the variables'],\n",
"    )\n",
"    .rename(columns={'Variables': 'variables',\n",
"                     'Long description of the variables': 'description_en'})\n",
"    .assign(variables=lambda sheet: sheet['variables'].str.lower())\n",
"    # There are duplicate variables that are identical. For example, opermore_n.\n",
"    # Grouping on both columns collapses each identical pair into a single row\n",
"    .groupby(['variables', 'description_en'])\n",
"    .last()\n",
"    .reset_index()\n",
")"
]
},
{
"cell_type": "markdown",
"id": "ae758963-b639-44a9-9904-d7438594a729",
"metadata": {},
"source": [
"# 2.0 Process Provinces and Territories\n",
"## 2.1 Process Agricultural Operations\n",
"**TODO:** \n",
"- Mistakes:\n",
" - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "2b8467e8-b979-423c-856c-80242f9a8443",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'farms_n1',\n",
" 'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_length'}"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Agricultural Operations geodatabase, lpr_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
"\n",
"ao_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"ao_pr.columns = ao_pr.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_pruid with '2016A0002', then give the column its final name\n",
"ao_pr = (\n",
"    ao_pr\n",
"    .assign(geo_pruid='2016A0002' + ao_pr['geo_pruid'])\n",
"    .rename(columns={'geo_pruid': 'pr_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['pr_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(ao_pr.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(ao_pr.columns))\n",
"ao_pr = ao_pr[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"ao_pr = ao_pr.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "54b1ab9d-68c7-4751-a17e-e5ad321fd72f",
"metadata": {},
"source": [
"## 2.2 Process Crop Cultures"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "0dfa08f0-2d8d-4279-bd22-2f127d596c3f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Crops geodatabase, lpr_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
"\n",
"cc_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"cc_pr.columns = cc_pr.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_pruid with '2016A0002', then give the column its final name\n",
"cc_pr = (\n",
"    cc_pr\n",
"    .assign(geo_pruid='2016A0002' + cc_pr['geo_pruid'])\n",
"    .rename(columns={'geo_pruid': 'pr_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['pr_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(cc_pr.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(cc_pr.columns))\n",
"cc_pr = cc_pr[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"cc_pr = cc_pr.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "076eb1e3-b319-47ff-b959-cae6dd600081",
"metadata": {},
"source": [
"## 2.3 Process Farm Operators\n",
"**TODO:** \n",
"- Mistakes:\n",
" - Column `more_avg_a` should be called `more_avg_age`\n",
" - Column `one_avg_ag` should be called `one_avg_age`\n",
" - On the Excel sheet, there are four `OPER_N`, with the same definition\n",
" - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n",
" - No idea what `opermore_1` is supposed to be\n",
" - Column `operone_n1` is duplicate of `operone_n`"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "de1dbde2-e820-438f-ae9d-b54455b10ac4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'oper_n1',\n",
" 'oper_n2',\n",
" 'oper_n3',\n",
" 'oper_n4',\n",
" 'opermore_1',\n",
" 'operone_n1',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Farm Operators geodatabase, lpr_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
"\n",
"fo_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"fo_pr.columns = fo_pr.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_pruid with '2016A0002'; the same rename also\n",
"# fixes the two average-age column names so they match the variables list\n",
"fo_pr = (\n",
"    fo_pr\n",
"    .assign(geo_pruid='2016A0002' + fo_pr['geo_pruid'])\n",
"    .rename(columns={\n",
"        'geo_pruid': 'pr_dguid',\n",
"        'more_avg_a': 'more_avg_age',\n",
"        'one_avg_ag': 'one_avg_age',\n",
"    })\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['pr_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(fo_pr.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(fo_pr.columns))\n",
"fo_pr = fo_pr[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"fo_pr = fo_pr.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "39161daf-5d6a-47e0-9211-0cc0137e2c9e",
"metadata": {},
"source": [
"## 2.4 Process Livestock Poultry Bees"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "79731af0-56f2-4840-b903-49410512d410",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Livestock, Poultry and Bees geodatabase, lpr_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
"\n",
"lpb_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"lpb_pr.columns = lpb_pr.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_pruid with '2016A0002', then give the column its final name\n",
"lpb_pr = (\n",
"    lpb_pr\n",
"    .assign(geo_pruid='2016A0002' + lpb_pr['geo_pruid'])\n",
"    .rename(columns={'geo_pruid': 'pr_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['pr_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(lpb_pr.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(lpb_pr.columns))\n",
"lpb_pr = lpb_pr[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"lpb_pr = lpb_pr.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "6a33cfe2-4bbb-46a7-81fb-494a61663a86",
"metadata": {},
"source": [
"## 2.5 Process Use Tenure Practices"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "328daec4-f6de-42c4-b11d-5ba287b7a94d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use, Tenure and Practices geodatabase, lpr_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lpr_000b16a_ceag16_n feature class')\n",
"\n",
"utp_pr = gpd.read_file(dataset, layer='lpr_000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"utp_pr.columns = utp_pr.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_pruid with '2016A0002', then give the column its final name\n",
"utp_pr = (\n",
"    utp_pr\n",
"    .assign(geo_pruid='2016A0002' + utp_pr['geo_pruid'])\n",
"    .rename(columns={'geo_pruid': 'pr_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['pr_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(utp_pr.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(utp_pr.columns))\n",
"utp_pr = utp_pr[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"utp_pr = utp_pr.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "4963cfbb-71c3-4ce7-98f1-3c6ad3d90fde",
"metadata": {},
"source": [
"## 2.6 Join the DataFrames and Export"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "11b5355a-2293-4751-a364-54a0963ed662",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr_en', 'geo_descr_fr'}"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid\n"
]
},
{
"data": {
"text/plain": [
"{'pr_dguid'}"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Join the five province/territory tables on pr_dguid, verify the merged schema\n",
"# against the Excel variable list, export it, then derive a single country row.\n",
"print(\"Merging all Province and Territories dataframes into one\")\n",
"pr_merge = (\n",
"    ao_pr\n",
"    .merge(cc_pr, how='inner', on='pr_dguid')\n",
"    .merge(fo_pr, how='inner', on='pr_dguid')\n",
"    .merge(lpb_pr, how='inner', on='pr_dguid')\n",
"    .merge(utp_pr, how='inner', on='pr_dguid')\n",
")\n",
"\n",
"# Schema checks. The differences are still displayed for inspection, and the\n",
"# assertions stop the cell before anything is exported if they are not as expected.\n",
"print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n",
"variables_not_in_frame = set(data_description['variables']) - set(pr_merge.columns)\n",
"variables_not_in_frame\n",
"assert variables_not_in_frame <= {'geo_descr_en', 'geo_descr_fr'}, (\n",
"    f'Documented variables missing from pr_merge: {sorted(variables_not_in_frame)}'\n",
")\n",
"\n",
"print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be pr_dguid\")\n",
"columns_not_in_variables = set(pr_merge.columns) - set(data_description['variables'])\n",
"columns_not_in_variables\n",
"# Anything besides the key here would include the _x/_y columns pandas creates\n",
"# when two of the merged tables share a non-key column name\n",
"assert columns_not_in_variables == {'pr_dguid'}, (\n",
"    f'Unexpected columns in pr_merge: {sorted(columns_not_in_variables)}'\n",
")\n",
"\n",
"# Export\n",
"print(\"Exporting pr_2016.parquet\")\n",
"pr_merge.to_parquet(f'{output_data_dir}/pr_2016.parquet', index=False, compression='zstd')\n",
"\n",
"# Create country as well: every row gets the same country DGUID and the rows are summed\n",
"# TODO: check whether -1 values end up being subtracted from the sum\n",
"#       (sum() adds them like any other value)\n",
"# NOTE(review): sum() is applied to every column, so non-additive variables\n",
"#       (e.g. more_avg_age / one_avg_age, if present) are summed too - confirm this is intended\n",
"country = (\n",
"    pd.read_parquet(f'{output_data_dir}/pr_2016.parquet')\n",
"    .assign(pr_dguid='2016A000011124')\n",
"    .rename(columns={'pr_dguid': 'country_dguid'})\n",
"    .groupby(['country_dguid'])\n",
"    .sum()\n",
"    .reset_index()\n",
")\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = {\n",
"    'convert_string': False,\n",
"    'convert_boolean': False\n",
"}\n",
"country = country.convert_dtypes(**params)\n",
"print(\"Exporting country_2016.parquet\")\n",
"country.to_parquet(f'{output_data_dir}/country_2016.parquet', index=False, compression='zstd')\n",
"\n",
"# Release the intermediate frames before the next geography level is processed\n",
"del ao_pr, cc_pr, fo_pr, lpb_pr, utp_pr, pr_merge, country\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "bd78f553-52b4-427e-bf40-7ee4101d65ac",
"metadata": {},
"source": [
"# 3.0 Process Census Agricultural Regions"
]
},
{
"cell_type": "markdown",
"id": "ac1383f5-1340-4dd8-9346-0d21ab506886",
"metadata": {},
"source": [
"## 3.1 Process Agricultural Operations\n",
"**TODO:** \n",
"- Mistakes:\n",
" - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "740de275-a602-4a18-9f7a-708db0e4e529",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'farms_n1',\n",
" 'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_length'}"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Agricultural Operations geodatabase, lcar000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')\n",
"\n",
"ao_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"ao_car.columns = ao_car.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_caruid with '2016S0501', then give the column its final name\n",
"ao_car = (\n",
"    ao_car\n",
"    .assign(geo_caruid='2016S0501' + ao_car['geo_caruid'])\n",
"    .rename(columns={'geo_caruid': 'car_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['car_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(ao_car.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(ao_car.columns))\n",
"ao_car = ao_car[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"ao_car = ao_car.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "c4254a8e-30cf-43a0-8a08-b7ac1ab9689c",
"metadata": {},
"source": [
"## 3.2 Process Crop Cultures"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "92b08284-0adb-43fd-a89f-19395afb9bd0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Crops geodatabase, lcar_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcar_000b16a_ceag16_n feature class')\n",
"\n",
"cc_car = gpd.read_file(dataset, layer='lcar_000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"cc_car.columns = cc_car.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_caruid with '2016S0501', then give the column its final name\n",
"cc_car = (\n",
"    cc_car\n",
"    .assign(geo_caruid='2016S0501' + cc_car['geo_caruid'])\n",
"    .rename(columns={'geo_caruid': 'car_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['car_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(cc_car.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(cc_car.columns))\n",
"cc_car = cc_car[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"cc_car = cc_car.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "babc334e-47c0-435d-919c-2df54a70130a",
"metadata": {},
"source": [
"## 3.3 Process Farm Operators\n",
"**TODO:** \n",
"- Mistakes:\n",
" - Column `more_avg_a` should be called `more_avg_age`\n",
" - Column `one_avg_ag` should be called `one_avg_age`\n",
" - On the Excel sheet, there are four `OPER_N`, with the same definition\n",
" - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n",
" - No idea what `opermore_1` is supposed to be\n",
" - Column `operone_n1` is duplicate of `operone_n`"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "47afb1ab-f4a9-4581-8222-b4a52ff0eb4a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'oper_n1',\n",
" 'oper_n2',\n",
" 'oper_n3',\n",
" 'oper_n4',\n",
" 'opermore_1',\n",
" 'operone_n1',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Farm Operators geodatabase, lcar000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')\n",
"\n",
"fo_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"fo_car.columns = fo_car.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_caruid with '2016S0501'; the same rename also\n",
"# fixes the two average-age column names so they match the variables list\n",
"fo_car = (\n",
"    fo_car\n",
"    .assign(geo_caruid='2016S0501' + fo_car['geo_caruid'])\n",
"    .rename(columns={\n",
"        'geo_caruid': 'car_dguid',\n",
"        'more_avg_a': 'more_avg_age',\n",
"        'one_avg_ag': 'one_avg_age',\n",
"    })\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['car_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(fo_car.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(fo_car.columns))\n",
"fo_car = fo_car[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"fo_car = fo_car.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "03bf9372-c2da-46b9-8da9-64c358cb51df",
"metadata": {},
"source": [
"## 3.4 Process Livestock Poultry Bees"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "abf3ff3f-6fbc-482f-9ef8-f7ae6c289884",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Livestock, Poultry and Bees geodatabase, lcar000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcar000b16a_ceag16_n feature class')\n",
"\n",
"lpb_car = gpd.read_file(dataset, layer='lcar000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"lpb_car.columns = lpb_car.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_caruid with '2016S0501', then give the column its final name\n",
"lpb_car = (\n",
"    lpb_car\n",
"    .assign(geo_caruid='2016S0501' + lpb_car['geo_caruid'])\n",
"    .rename(columns={'geo_caruid': 'car_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['car_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(lpb_car.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(lpb_car.columns))\n",
"lpb_car = lpb_car[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"lpb_car = lpb_car.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "a3962c9c-3b5b-4fd8-a6ca-8e004159a521",
"metadata": {},
"source": [
"## 3.5 Process Use Tenure Practices"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "16678c91-7132-4ed2-981b-f60b0d81bc73",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use, Tenure and Practices geodatabase, lcar_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcar_000b16a_ceag16_n feature class')\n",
"\n",
"utp_car = gpd.read_file(dataset, layer='lcar_000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"utp_car.columns = utp_car.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_caruid with '2016S0501', then give the column its final name\n",
"utp_car = (\n",
"    utp_car\n",
"    .assign(geo_caruid='2016S0501' + utp_car['geo_caruid'])\n",
"    .rename(columns={'geo_caruid': 'car_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['car_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(utp_car.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(utp_car.columns))\n",
"utp_car = utp_car[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"utp_car = utp_car.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "9b99344d-edfc-4f51-ab9f-8a56100394b0",
"metadata": {},
"source": [
"## 3.6 Join the DataFrames and Export"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "d1cfb3c1-c94e-4f4a-9c90-e61c43a791a3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr_en', 'geo_descr_fr'}"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid\n"
]
},
{
"data": {
"text/plain": [
"{'car_dguid'}"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Join the five Census Agricultural Region tables on car_dguid, verify the merged\n",
"# schema against the Excel variable list, then export it.\n",
"print(\"Merging all Census Agricultural Regions dataframes into one\")\n",
"car_merge = (\n",
"    ao_car\n",
"    .merge(cc_car, how='inner', on='car_dguid')\n",
"    .merge(fo_car, how='inner', on='car_dguid')\n",
"    .merge(lpb_car, how='inner', on='car_dguid')\n",
"    .merge(utp_car, how='inner', on='car_dguid')\n",
")\n",
"\n",
"# Schema checks. The differences are still displayed for inspection, and the\n",
"# assertions stop the cell before anything is exported if they are not as expected.\n",
"print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n",
"variables_not_in_frame = set(data_description['variables']) - set(car_merge.columns)\n",
"variables_not_in_frame\n",
"assert variables_not_in_frame <= {'geo_descr_en', 'geo_descr_fr'}, (\n",
"    f'Documented variables missing from car_merge: {sorted(variables_not_in_frame)}'\n",
")\n",
"\n",
"print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be car_dguid\")\n",
"columns_not_in_variables = set(car_merge.columns) - set(data_description['variables'])\n",
"columns_not_in_variables\n",
"# Anything besides the key here would include the _x/_y columns pandas creates\n",
"# when two of the merged tables share a non-key column name\n",
"assert columns_not_in_variables == {'car_dguid'}, (\n",
"    f'Unexpected columns in car_merge: {sorted(columns_not_in_variables)}'\n",
")\n",
"\n",
"# Export\n",
"print(\"Exporting car_2016.parquet\")\n",
"car_merge.to_parquet(f'{output_data_dir}/car_2016.parquet', index=False, compression='zstd')\n",
"\n",
"# Release the intermediate frames before the next geography level is processed\n",
"del ao_car, cc_car, fo_car, lpb_car, utp_car, car_merge\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "5ec65522-e754-40b9-994d-e21c5bae88ac",
"metadata": {},
"source": [
"# 4.0 Process Census Divisions\n",
"## 4.1 Process Agricultural Operations\n",
"**TODO:** \n",
"- Mistakes:\n",
" - There is a duplicate of `farms_n` column that is named `farms_n1`. The values are identical for both columns"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "02f789b8-e3d8-4359-97b1-46be400bc019",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'farms_n1',\n",
" 'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_length'}"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Agricultural Operations geodatabase, lcd_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
"\n",
"ao_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"ao_cd.columns = ao_cd.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_cduid with '2016A0003', then give the column its final name\n",
"ao_cd = (\n",
"    ao_cd\n",
"    .assign(geo_cduid='2016A0003' + ao_cd['geo_cduid'])\n",
"    .rename(columns={'geo_cduid': 'cd_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['cd_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(ao_cd.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(ao_cd.columns))\n",
"ao_cd = ao_cd[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"ao_cd = ao_cd.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "b22c32c1-bf09-42f6-ad9f-15e19e88f125",
"metadata": {},
"source": [
"## 4.2 Process Crop Cultures"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "05283ef4-d1a9-48aa-8b34-d11123e19dc1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Crops geodatabase, lcd_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
"\n",
"cc_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n",
"\n",
"# Column names are lowercased before they are matched against the variables list\n",
"cc_cd.columns = cc_cd.columns.str.lower()\n",
"\n",
"# Build the DGUID by prefixing geo_cduid with '2016A0003', then give the column its final name\n",
"cc_cd = (\n",
"    cc_cd\n",
"    .assign(geo_cduid='2016A0003' + cc_cd['geo_cduid'])\n",
"    .rename(columns={'geo_cduid': 'cd_dguid'})\n",
")\n",
"\n",
"# Candidate columns: the DGUID key first, then every variable from the Excel sheet\n",
"variable_names = ['cd_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(cc_cd.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidates that exist in the layer; this drops the geometry column\n",
"# and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names).intersection(cc_cd.columns))\n",
"cc_cd = cc_cd[variable_names]\n",
"\n",
"# Let pandas pick the best-fitting nullable dtypes, leaving strings and booleans untouched\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"cc_cd = cc_cd.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "52241e8d-b468-4dd6-99b9-342f9a6fde94",
"metadata": {},
"source": [
"## 4.3 Process Farm Operators\n",
"**TODO:** \n",
"- Mistakes:\n",
" - Column `more_avg_a` should be called `more_avg_age`\n",
" - Column `one_avg_ag` should be called `one_avg_age`\n",
" - On the Excel sheet, there are four `OPER_N`, with the same definition\n",
" - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n",
" - No idea what `opermore_1` is supposed to be\n",
" - Column `operone_n1` is duplicate of `operone_n`"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "cf5b26d1-7315-4e9d-85fe-473a68cc5f0f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'oper_n1',\n",
" 'oper_n2',\n",
" 'oper_n3',\n",
" 'oper_n4',\n",
" 'opermore_1',\n",
" 'operone_n1',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Farm Operators file geodatabase: read the lcd_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
"fo_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n",
"\n",
"# Lowercase every column name\n",
"fo_cd.columns = list(map(str.lower, fo_cd.columns))\n",
"\n",
"# Build the DGUID by prefixing geo_cduid with '2016A0003' and expose it as cd_dguid;\n",
"# the same rename also corrects the two misnamed average-age columns\n",
"fo_cd = fo_cd.assign(geo_cduid='2016A0003' + fo_cd['geo_cduid']).rename(columns={\n",
"    'geo_cduid': 'cd_dguid',\n",
"    'more_avg_a': 'more_avg_age',\n",
"    'one_avg_ag': 'one_avg_age'})\n",
"\n",
"# Candidate columns: cd_dguid first, followed by the variables listed in data_description\n",
"variable_names = ['cd_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(fo_cd.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidate columns that exist in the layer; this drops the geometry\n",
"# column and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names) & OrderedSet(fo_cd.columns))\n",
"fo_cd = fo_cd[variable_names]\n",
"\n",
"# convert_dtypes picks the best dtypes that support pd.NA; string and boolean\n",
"# conversion are switched off\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"fo_cd = fo_cd.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "f68f6fc6-f757-4f21-b32a-7070d4bcc3ec",
"metadata": {},
"source": [
"## 4.4 Process Livestock Poultry Bees"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "e3cd38c6-6563-4067-8280-32277b00a33b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Livestock Poultry Bees file geodatabase: read the lcd_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
"lpb_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n",
"\n",
"# Lowercase every column name\n",
"lpb_cd.columns = list(map(str.lower, lpb_cd.columns))\n",
"\n",
"# Build the DGUID by prefixing geo_cduid with '2016A0003' and expose it as cd_dguid\n",
"lpb_cd = lpb_cd.assign(geo_cduid='2016A0003' + lpb_cd['geo_cduid']).rename(\n",
"    columns={'geo_cduid': 'cd_dguid'})\n",
"\n",
"# Candidate columns: cd_dguid first, followed by the variables listed in data_description\n",
"variable_names = ['cd_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(lpb_cd.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidate columns that exist in the layer; this drops the geometry\n",
"# column and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names) & OrderedSet(lpb_cd.columns))\n",
"lpb_cd = lpb_cd[variable_names]\n",
"\n",
"# convert_dtypes picks the best dtypes that support pd.NA; string and boolean\n",
"# conversion are switched off\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"lpb_cd = lpb_cd.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "e06083ab-9a15-4e7f-82e1-721c9ee1d4f1",
"metadata": {},
"source": [
"## 4.5 Process Use Tenure Practices"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "efe9a9da-9f18-43f7-8ba7-5f2730743358",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use Tenure Practices file geodatabase: read the lcd_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lcd_000b16a_ceag16_n feature class')\n",
"utp_cd = gpd.read_file(dataset, layer='lcd_000b16a_ceag16_n')\n",
"\n",
"# Lowercase every column name\n",
"utp_cd.columns = list(map(str.lower, utp_cd.columns))\n",
"\n",
"# Build the DGUID by prefixing geo_cduid with '2016A0003' and expose it as cd_dguid\n",
"utp_cd = utp_cd.assign(geo_cduid='2016A0003' + utp_cd['geo_cduid']).rename(\n",
"    columns={'geo_cduid': 'cd_dguid'})\n",
"\n",
"# Candidate columns: cd_dguid first, followed by the variables listed in data_description\n",
"variable_names = ['cd_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(utp_cd.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidate columns that exist in the layer; this drops the geometry\n",
"# column and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names) & OrderedSet(utp_cd.columns))\n",
"utp_cd = utp_cd[variable_names]\n",
"\n",
"# convert_dtypes picks the best dtypes that support pd.NA; string and boolean\n",
"# conversion are switched off\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"utp_cd = utp_cd.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "1f2bf2f9-8638-441a-bd54-1e988861dc96",
"metadata": {},
"source": [
"## 4.6 Join the DataFrames and Export"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "029991c3-c3c9-4047-89b6-db0b019f3261",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr_en', 'geo_descr_fr'}"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\n"
]
},
{
"data": {
"text/plain": [
"{'cd_dguid'}"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Join the five Census Division tables on cd_dguid. Inner joins keep only the\n",
"# DGUIDs that are present in every table.\n",
"print(\"Merging all Census Divisions dataframes into one\")\n",
"cd_merge = (\n",
"    ao_cd.merge(cc_cd, how='inner', on='cd_dguid')\n",
"    .merge(fo_cd, how='inner', on='cd_dguid')\n",
"    .merge(lpb_cd, how='inner', on='cd_dguid')\n",
"    .merge(utp_cd, how='inner', on='cd_dguid')\n",
")\n",
"\n",
"# Check 1: variables documented in the Excel sheet that are missing from the merged frame.\n",
"# The only difference should be geo_descr_en and geo_descr_fr\n",
"print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n",
"set(data_description['variables']) - set(cd_merge.columns)\n",
"# Check 2: merged columns that are not documented in the Excel sheet\n",
"print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\")\n",
"set(cd_merge.columns) - set(data_description['variables'])\n",
"\n",
"# Export (progress message added for consistency with the ccs_2016 export cell)\n",
"print(\"Exporting cd_2016.parquet\")\n",
"cd_merge.to_parquet(f'{output_data_dir}/cd_2016.parquet', index=False, compression='zstd')\n",
"\n",
"# Release the Census Division frames before the next section is processed\n",
"del ao_cd, cc_cd, fo_cd, lpb_cd, utp_cd, cd_merge\n",
"gc.collect()"
]
},
{
"cell_type": "markdown",
"id": "c5d3a2fd-3d8b-461f-9933-19c13bcd01ba",
"metadata": {},
"source": [
"# 5.0 Process Consolidated Subdivisions\n",
"## 5.1 Process Agricultural Operations\n",
"**TODO:** \n",
"- Mistakes:\n",
"  - The `farms_n` column is duplicated as `farms_n1`; both columns hold identical values"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "f0a5858b-4fbc-4993-a9ed-b17c1606c8bd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'farms_n1',\n",
" 'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_length'}"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Agricultural Operations file geodatabase: read the lccs000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_AgriculturalOperations_ExploitationsAgricoles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')\n",
"ao_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')\n",
"\n",
"# Lowercase every column name\n",
"ao_ccs.columns = list(map(str.lower, ao_ccs.columns))\n",
"\n",
"# Build the DGUID by prefixing ccsuid with '2016S0502' and expose it as ccs_dguid\n",
"ao_ccs = ao_ccs.assign(ccsuid='2016S0502' + ao_ccs['ccsuid']).rename(\n",
"    columns={'ccsuid': 'ccs_dguid'})\n",
"\n",
"# Candidate columns: ccs_dguid first, followed by the variables listed in data_description\n",
"variable_names = ['ccs_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(ao_ccs.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidate columns that exist in the layer; this drops the geometry\n",
"# column and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names) & OrderedSet(ao_ccs.columns))\n",
"ao_ccs = ao_ccs[variable_names]\n",
"\n",
"# convert_dtypes picks the best dtypes that support pd.NA; string and boolean\n",
"# conversion are switched off\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"ao_ccs = ao_ccs.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "d667b9b4-36fc-489d-9430-d8d099926d2f",
"metadata": {},
"source": [
"## 5.2 Process Crop Cultures"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "7352d371-cf2a-4846-b239-f11ed0f5dd66",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Crops file geodatabase: read the lccs_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_Crops_Cultures_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lccs_000b16a_ceag16_n feature class')\n",
"cc_ccs = gpd.read_file(dataset, layer='lccs_000b16a_ceag16_n')\n",
"\n",
"# Lowercase every column name\n",
"cc_ccs.columns = list(map(str.lower, cc_ccs.columns))\n",
"\n",
"# Build the DGUID by prefixing geo_ccsuid with '2016S0502' and expose it as ccs_dguid\n",
"cc_ccs = cc_ccs.assign(geo_ccsuid='2016S0502' + cc_ccs['geo_ccsuid']).rename(\n",
"    columns={'geo_ccsuid': 'ccs_dguid'})\n",
"\n",
"# Candidate columns: ccs_dguid first, followed by the variables listed in data_description\n",
"variable_names = ['ccs_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(cc_ccs.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidate columns that exist in the layer; this drops the geometry\n",
"# column and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names) & OrderedSet(cc_ccs.columns))\n",
"cc_ccs = cc_ccs[variable_names]\n",
"\n",
"# convert_dtypes picks the best dtypes that support pd.NA; string and boolean\n",
"# conversion are switched off\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"cc_ccs = cc_ccs.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "ab1841c1-9ebb-41b4-ac04-a998c249b5aa",
"metadata": {},
"source": [
"## 5.3 Process Farm Operators\n",
"**TODO:** \n",
"- Mistakes:\n",
" - Column `more_avg_a` should be called `more_avg_age`\n",
" - Column `one_avg_ag` should be called `one_avg_age`\n",
" - On the Excel sheet, there are four `OPER_N`, with the same definition\n",
" - On the dataset, this column is replicated five times as `oper_n`, `oper_n1`, `oper_n2`, `oper_n3`, `oper_n4`\n",
"  - Column `opermore_1` is not on the Excel variables list; its meaning is unknown\n",
"  - Column `operone_n1` is a duplicate of `operone_n`"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "c047128d-ba23-4701-9907-9199b0f68549",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'oper_n1',\n",
" 'oper_n2',\n",
" 'oper_n3',\n",
" 'oper_n4',\n",
" 'opermore_1',\n",
" 'operone_n1',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Farm Operators file geodatabase: read the lccs000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_FarmOperators_ExploitantsAgricoles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')\n",
"fo_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')\n",
"\n",
"# Lowercase every column name\n",
"fo_ccs.columns = list(map(str.lower, fo_ccs.columns))\n",
"\n",
"# Build the DGUID by prefixing ccsuid with '2016S0502' and expose it as ccs_dguid;\n",
"# the same rename also corrects the two misnamed average-age columns\n",
"fo_ccs = fo_ccs.assign(ccsuid='2016S0502' + fo_ccs['ccsuid']).rename(columns={\n",
"    'ccsuid': 'ccs_dguid',\n",
"    'more_avg_a': 'more_avg_age',\n",
"    'one_avg_ag': 'one_avg_age'})\n",
"\n",
"# Candidate columns: ccs_dguid first, followed by the variables listed in data_description\n",
"variable_names = ['ccs_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(fo_ccs.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidate columns that exist in the layer; this drops the geometry\n",
"# column and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names) & OrderedSet(fo_ccs.columns))\n",
"fo_ccs = fo_ccs[variable_names]\n",
"\n",
"# convert_dtypes picks the best dtypes that support pd.NA; string and boolean\n",
"# conversion are switched off\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"fo_ccs = fo_ccs.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "ae7f58ae-f7b2-4eb3-b4b4-62e9605a290c",
"metadata": {},
"source": [
"## 5.4 Process Livestock Poultry Bees"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "69e39b2b-7032-406b-9686-da84463dce5c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Livestock Poultry Bees file geodatabase: read the lccs000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_LivestockPoultryBees_BetailVolailleAbeilles_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lccs000b16a_ceag16_n feature class')\n",
"lpb_ccs = gpd.read_file(dataset, layer='lccs000b16a_ceag16_n')\n",
"\n",
"# Lowercase every column name\n",
"lpb_ccs.columns = list(map(str.lower, lpb_ccs.columns))\n",
"\n",
"# Build the DGUID by prefixing ccsuid with '2016S0502' and expose it as ccs_dguid\n",
"lpb_ccs = lpb_ccs.assign(ccsuid='2016S0502' + lpb_ccs['ccsuid']).rename(\n",
"    columns={'ccsuid': 'ccs_dguid'})\n",
"\n",
"# Candidate columns: ccs_dguid first, followed by the variables listed in data_description\n",
"variable_names = ['ccs_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(lpb_ccs.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidate columns that exist in the layer; this drops the geometry\n",
"# column and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names) & OrderedSet(lpb_ccs.columns))\n",
"lpb_ccs = lpb_ccs[variable_names]\n",
"\n",
"# convert_dtypes picks the best dtypes that support pd.NA; string and boolean\n",
"# conversion are switched off\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"lpb_ccs = lpb_ccs.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "d7343f35-7a86-4cfd-ba10-d81d5347f276",
"metadata": {},
"source": [
"## 5.5 Process Use Tenure Practices"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "19b31cd4-37ae-4541-8e15-17e698256c98",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quick check on columns that are on the geodataframe but not on the variables list\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr1',\n",
" 'geo_descr_',\n",
" 'geometry',\n",
" 'shape_area',\n",
" 'shape_leng',\n",
" 'shape_length'}"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use Tenure Practices file geodatabase: read the lccs_000b16a_ceag16_n feature class\n",
"dataset = f'{input_data_dir}/CEAG16_UseTenurePractices_UtilisationOccupationPratiques_REAG16.gdb.zip'\n",
"print(f'Processing {dataset}, lccs_000b16a_ceag16_n feature class')\n",
"utp_ccs = gpd.read_file(dataset, layer='lccs_000b16a_ceag16_n')\n",
"\n",
"# Lowercase every column name\n",
"utp_ccs.columns = list(map(str.lower, utp_ccs.columns))\n",
"\n",
"# Build the DGUID by prefixing ccsuid with '2016S0502' and expose it as ccs_dguid\n",
"utp_ccs = utp_ccs.assign(ccsuid='2016S0502' + utp_ccs['ccsuid']).rename(\n",
"    columns={'ccsuid': 'ccs_dguid'})\n",
"\n",
"# Candidate columns: ccs_dguid first, followed by the variables listed in data_description\n",
"variable_names = ['ccs_dguid', *data_description['variables']]\n",
"\n",
"# Quick check\n",
"print(\"Quick check on columns that are on the geodataframe but not on the variables list\")\n",
"set(utp_ccs.columns).difference(variable_names)\n",
"\n",
"# Keep only the candidate columns that exist in the layer; this drops the geometry\n",
"# column and the shape area/length columns\n",
"variable_names = list(OrderedSet(variable_names) & OrderedSet(utp_ccs.columns))\n",
"utp_ccs = utp_ccs[variable_names]\n",
"\n",
"# convert_dtypes picks the best dtypes that support pd.NA; string and boolean\n",
"# conversion are switched off\n",
"params = dict(convert_string=False, convert_boolean=False)\n",
"utp_ccs = utp_ccs.convert_dtypes(**params)"
]
},
{
"cell_type": "markdown",
"id": "aec1734e-6ade-4849-a160-8a849af15265",
"metadata": {},
"source": [
"## 5.6 Join the DataFrames and Export"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "11bd0ad8-a207-43ba-ad27-826e79e4b678",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\n"
]
},
{
"data": {
"text/plain": [
"{'geo_descr_en', 'geo_descr_fr'}"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be cd_dguid\n"
]
},
{
"data": {
"text/plain": [
"{'ccs_dguid'}"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Join the five Census Consolidated Subdivision tables on ccs_dguid. Inner joins keep\n",
"# only the DGUIDs that are present in every table.\n",
"print(\"Merging all Census Consolidated Subdivisions dataframes into one\")\n",
"ccs_merge = (\n",
"    ao_ccs.merge(cc_ccs, how='inner', on='ccs_dguid')\n",
"    .merge(fo_ccs, how='inner', on='ccs_dguid')\n",
"    .merge(lpb_ccs, how='inner', on='ccs_dguid')\n",
"    .merge(utp_ccs, how='inner', on='ccs_dguid')\n",
")\n",
"\n",
"# Check 1: variables documented in the Excel sheet that are missing from the merged frame.\n",
"# The only difference should be geo_descr_en and geo_descr_fr\n",
"print(\"Checking differences between variables defined in Excel sheet and dataframe. The only difference should be geo_descr_en and geo_descr_fr\")\n",
"set(data_description['variables']) - set(ccs_merge.columns)\n",
"# Check 2: merged columns that are not documented in the Excel sheet. The key column\n",
"# here is ccs_dguid (the message previously named cd_dguid, copied from the CD section)\n",
"print(\"Checking differences between columns in dataframe and variables defined in Excel sheet. Only difference should be ccs_dguid\")\n",
"set(ccs_merge.columns) - set(data_description['variables'])\n",
"\n",
"# Export\n",
"print(\"Exporting ccs_2016.parquet\")\n",
"ccs_merge.to_parquet(f'{output_data_dir}/ccs_2016.parquet', index=False, compression='zstd')\n",
"\n",
"# Release the Census Consolidated Subdivision frames now that the export is done\n",
"del ao_ccs, cc_ccs, fo_ccs, lpb_ccs, utp_ccs, ccs_merge\n",
"gc.collect()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}