DuckDB issue with duplicate column names (ex. 'Value' and 'VALUE' are treated the same)

This commit is contained in:
Diego Ripley
2025-06-25 15:30:36 +00:00
parent e929850d4a
commit b71a7b326e
@@ -0,0 +1,484 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a2239884-1380-45bb-8ad4-648ef2c5b46b",
"metadata": {},
"source": [
"DuckDB treats all column names in a case insensitive manner. So \"Value\" and \"VALUE\" are treated the same, thus \"VALUE\" becomes \"VALUE_1\"\n",
"\n",
"An example of this happening is productId 38100105"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "5909b14d-1f07-46bc-84bf-09f269e15e41",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"import glob\n",
"import pprint\n",
"\n",
"import duckdb\n",
"import pyarrow.parquet as pq"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "04e61bd3-ab4c-46aa-9c0b-de949699ca0a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>REF_DATE</th>\n",
" <th>REF_START_DATE</th>\n",
" <th>REF_END_DATE</th>\n",
" <th>GEO</th>\n",
" <th>DGUID</th>\n",
" <th>Value</th>\n",
" <th>UOM</th>\n",
" <th>UOM_ID</th>\n",
" <th>SCALAR_FACTOR</th>\n",
" <th>SCALAR_ID</th>\n",
" <th>VECTOR</th>\n",
" <th>COORDINATE</th>\n",
" <th>VALUE_1</th>\n",
" <th>STATUS</th>\n",
" <th>SYMBOL</th>\n",
" <th>TERMINATED</th>\n",
" <th>DECIMALS</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1961</td>\n",
" <td>1961-01-01</td>\n",
" <td>1961-12-31</td>\n",
" <td>Newfoundland and Labrador</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822242</td>\n",
" <td>2.1</td>\n",
" <td>470.0</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1961</td>\n",
" <td>1961-01-01</td>\n",
" <td>1961-12-31</td>\n",
" <td>Newfoundland and Labrador</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822243</td>\n",
" <td>2.2</td>\n",
" <td>539.1</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1961</td>\n",
" <td>1961-01-01</td>\n",
" <td>1961-12-31</td>\n",
" <td>Nova Scotia</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822244</td>\n",
" <td>3.1</td>\n",
" <td>0.0</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1961</td>\n",
" <td>1961-01-01</td>\n",
" <td>1961-12-31</td>\n",
" <td>Nova Scotia</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822245</td>\n",
" <td>3.2</td>\n",
" <td>76.6</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1961</td>\n",
" <td>1961-01-01</td>\n",
" <td>1961-12-31</td>\n",
" <td>New Brunswick</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822246</td>\n",
" <td>4.1</td>\n",
" <td>637.9</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>753</th>\n",
" <td>2010</td>\n",
" <td>2010-01-01</td>\n",
" <td>2010-12-31</td>\n",
" <td>Canada</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822241</td>\n",
" <td>1.2</td>\n",
" <td>124971.2</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>754</th>\n",
" <td>2011</td>\n",
" <td>2011-01-01</td>\n",
" <td>2011-12-31</td>\n",
" <td>Canada</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822240</td>\n",
" <td>1.1</td>\n",
" <td>120498.5</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>755</th>\n",
" <td>2011</td>\n",
" <td>2011-01-01</td>\n",
" <td>2011-12-31</td>\n",
" <td>Canada</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822241</td>\n",
" <td>1.2</td>\n",
" <td>120498.5</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>756</th>\n",
" <td>2012</td>\n",
" <td>2012-01-01</td>\n",
" <td>2012-12-31</td>\n",
" <td>Canada</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822240</td>\n",
" <td>1.1</td>\n",
" <td>113132.6</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>757</th>\n",
" <td>2012</td>\n",
" <td>2012-01-01</td>\n",
" <td>2012-12-31</td>\n",
" <td>Canada</td>\n",
" <td>None</td>\n",
" <td>Present value calculation, timber stocks, meth...</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>millions</td>\n",
" <td>6</td>\n",
" <td>v3822241</td>\n",
" <td>1.2</td>\n",
" <td>113132.6</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>758 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" REF_DATE REF_START_DATE REF_END_DATE GEO DGUID \\\n",
"0 1961 1961-01-01 1961-12-31 Newfoundland and Labrador None \n",
"1 1961 1961-01-01 1961-12-31 Newfoundland and Labrador None \n",
"2 1961 1961-01-01 1961-12-31 Nova Scotia None \n",
"3 1961 1961-01-01 1961-12-31 Nova Scotia None \n",
"4 1961 1961-01-01 1961-12-31 New Brunswick None \n",
".. ... ... ... ... ... \n",
"753 2010 2010-01-01 2010-12-31 Canada None \n",
"754 2011 2011-01-01 2011-12-31 Canada None \n",
"755 2011 2011-01-01 2011-12-31 Canada None \n",
"756 2012 2012-01-01 2012-12-31 Canada None \n",
"757 2012 2012-01-01 2012-12-31 Canada None \n",
"\n",
" Value UOM UOM_ID \\\n",
"0 Present value calculation, timber stocks, meth... Dollars 81 \n",
"1 Present value calculation, timber stocks, meth... Dollars 81 \n",
"2 Present value calculation, timber stocks, meth... Dollars 81 \n",
"3 Present value calculation, timber stocks, meth... Dollars 81 \n",
"4 Present value calculation, timber stocks, meth... Dollars 81 \n",
".. ... ... ... \n",
"753 Present value calculation, timber stocks, meth... Dollars 81 \n",
"754 Present value calculation, timber stocks, meth... Dollars 81 \n",
"755 Present value calculation, timber stocks, meth... Dollars 81 \n",
"756 Present value calculation, timber stocks, meth... Dollars 81 \n",
"757 Present value calculation, timber stocks, meth... Dollars 81 \n",
"\n",
" SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE_1 STATUS SYMBOL \\\n",
"0 millions 6 v3822242 2.1 470.0 None None \n",
"1 millions 6 v3822243 2.2 539.1 None None \n",
"2 millions 6 v3822244 3.1 0.0 None None \n",
"3 millions 6 v3822245 3.2 76.6 None None \n",
"4 millions 6 v3822246 4.1 637.9 None None \n",
".. ... ... ... ... ... ... ... \n",
"753 millions 6 v3822241 1.2 124971.2 None None \n",
"754 millions 6 v3822240 1.1 120498.5 None None \n",
"755 millions 6 v3822241 1.2 120498.5 None None \n",
"756 millions 6 v3822240 1.1 113132.6 None None \n",
"757 millions 6 v3822241 1.2 113132.6 None None \n",
"\n",
" TERMINATED DECIMALS \n",
"0 None 1 \n",
"1 None 1 \n",
"2 None 1 \n",
"3 None 1 \n",
"4 None 1 \n",
".. ... ... \n",
"753 None 1 \n",
"754 None 1 \n",
"755 None 1 \n",
"756 None 1 \n",
"757 None 1 \n",
"\n",
"[758 rows x 17 columns]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"con = duckdb.connect()\n",
"\n",
"issue = con.execute(\"SELECT * FROM '/data/tables/output/en/june_20_2025/38100105.parquet'\").df()\n",
"\n",
"issue"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "cbf8953d-8523-42e8-b28c-6b464869ce61",
"metadata": {},
"outputs": [],
"source": [
"files = glob.glob(\"/data/tables/output/en/june_20_2025/*.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "d52074f9-7746-4569-9aea-57c204eda2eb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'10100164': ['VALUE'],\n",
" '13100902': ['STATUS'],\n",
" '13100904': ['STATUS'],\n",
" '23100049': ['VALUE'],\n",
" '23100050': ['VALUE'],\n",
" '23100268': ['STATUS'],\n",
" '36100374': ['VALUE'],\n",
" '36100396': ['VALUE'],\n",
" '36100397': ['VALUE'],\n",
" '36100658': ['VALUE'],\n",
" '38100104': ['VALUE'],\n",
" '38100105': ['VALUE']}\n"
]
}
],
"source": [
"duplicate_column_names = {}\n",
"for file in files:\n",
" # Open the Parquet file metadata\n",
" dataset = pq.ParquetFile(file)\n",
" # Get the column names\n",
" column_names = [x.upper() for x in dataset.schema.names]\n",
" count_of_column_names = [x for x in column_names if column_names.count(x) > 1]\n",
" if count_of_column_names:\n",
" product_id = file.split('/')[-1].split('.parquet')[0]\n",
" duplicate_column_names[product_id] = list(set(count_of_column_names))\n",
"\n",
"pprint.pprint(duplicate_column_names)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "85dc8ce4-3ba5-4db8-bc2c-9ee101296e84",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/data/tables/output/en/june_20_2025/11100235.parquet\n",
"['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2012 Version 1.1', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n",
"/data/tables/output/en/june_20_2025/11100236.parquet\n",
"['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2012 Version 1.1', 'North American Industry Classification System (NAICS)', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n",
"/data/tables/output/en/june_20_2025/20100014.parquet\n",
"['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2012 Version 1.2', 'Financial variables', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n",
"/data/tables/output/en/june_20_2025/20100015.parquet\n",
"['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2022 Version 1.0', 'Financial variables', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n",
"/data/tables/output/en/june_20_2025/23100313.parquet\n",
"['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'Distance-capacity public transit service area', 'Location', 'Gender', 'Demographic and socio-economic', 'Sustainable Development Goals (SDGs) 11.2.1 indicator', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n"
]
}
],
"source": [
"for file in files:\n",
" # Open the Parquet file metadata\n",
" dataset = pq.ParquetFile(file)\n",
" # Get the column names\n",
" column_names = dataset.schema.names\n",
" has_dot = ['.' in x for x in column_names if '.' in x]\n",
" if has_dot:\n",
" print(file)\n",
" print(column_names)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebc2fb59-105b-425a-9c92-ad04be934df6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}