From b71a7b326e7d97f882bd861b8e9ae182261feef1 Mon Sep 17 00:00:00 2001 From: Diego Ripley Date: Wed, 25 Jun 2025 15:30:36 +0000 Subject: [PATCH] DuckDB issue with duplicate column names (ex. 'Value' and 'VALUE' are treated the same) --- .../check_duplicate_column_names.ipynb | 484 ++++++++++++++++++ 1 file changed, 484 insertions(+) create mode 100644 experiments/statcan_products/check_duplicate_column_names.ipynb diff --git a/experiments/statcan_products/check_duplicate_column_names.ipynb b/experiments/statcan_products/check_duplicate_column_names.ipynb new file mode 100644 index 0000000..bb49ba9 --- /dev/null +++ b/experiments/statcan_products/check_duplicate_column_names.ipynb @@ -0,0 +1,484 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a2239884-1380-45bb-8ad4-648ef2c5b46b", + "metadata": {}, + "source": [ + "DuckDB compares column names case-insensitively, so a table that has both a \"Value\" column and a \"VALUE\" column ends up with two columns of the same name. DuckDB keeps the first one as \"Value\" and renames the second one to \"VALUE_1\".\n", + "\n", + "An example of this is productId 38100105, queried in the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "5909b14d-1f07-46bc-84bf-09f269e15e41", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "import glob\n", + "import pprint\n", + "\n", + "import duckdb\n", + "import pyarrow.parquet as pq" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "04e61bd3-ab4c-46aa-9c0b-de949699ca0a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
REF_DATEREF_START_DATEREF_END_DATEGEODGUIDValueUOMUOM_IDSCALAR_FACTORSCALAR_IDVECTORCOORDINATEVALUE_1STATUSSYMBOLTERMINATEDDECIMALS
019611961-01-011961-12-31Newfoundland and LabradorNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222422.1470.0NoneNoneNone1
119611961-01-011961-12-31Newfoundland and LabradorNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222432.2539.1NoneNoneNone1
219611961-01-011961-12-31Nova ScotiaNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222443.10.0NoneNoneNone1
319611961-01-011961-12-31Nova ScotiaNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222453.276.6NoneNoneNone1
419611961-01-011961-12-31New BrunswickNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222464.1637.9NoneNoneNone1
......................................................
75320102010-01-012010-12-31CanadaNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222411.2124971.2NoneNoneNone1
75420112011-01-012011-12-31CanadaNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222401.1120498.5NoneNoneNone1
75520112011-01-012011-12-31CanadaNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222411.2120498.5NoneNoneNone1
75620122012-01-012012-12-31CanadaNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222401.1113132.6NoneNoneNone1
75720122012-01-012012-12-31CanadaNonePresent value calculation, timber stocks, meth...Dollars81millions6v38222411.2113132.6NoneNoneNone1
\n", + "

758 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " REF_DATE REF_START_DATE REF_END_DATE GEO DGUID \\\n", + "0 1961 1961-01-01 1961-12-31 Newfoundland and Labrador None \n", + "1 1961 1961-01-01 1961-12-31 Newfoundland and Labrador None \n", + "2 1961 1961-01-01 1961-12-31 Nova Scotia None \n", + "3 1961 1961-01-01 1961-12-31 Nova Scotia None \n", + "4 1961 1961-01-01 1961-12-31 New Brunswick None \n", + ".. ... ... ... ... ... \n", + "753 2010 2010-01-01 2010-12-31 Canada None \n", + "754 2011 2011-01-01 2011-12-31 Canada None \n", + "755 2011 2011-01-01 2011-12-31 Canada None \n", + "756 2012 2012-01-01 2012-12-31 Canada None \n", + "757 2012 2012-01-01 2012-12-31 Canada None \n", + "\n", + " Value UOM UOM_ID \\\n", + "0 Present value calculation, timber stocks, meth... Dollars 81 \n", + "1 Present value calculation, timber stocks, meth... Dollars 81 \n", + "2 Present value calculation, timber stocks, meth... Dollars 81 \n", + "3 Present value calculation, timber stocks, meth... Dollars 81 \n", + "4 Present value calculation, timber stocks, meth... Dollars 81 \n", + ".. ... ... ... \n", + "753 Present value calculation, timber stocks, meth... Dollars 81 \n", + "754 Present value calculation, timber stocks, meth... Dollars 81 \n", + "755 Present value calculation, timber stocks, meth... Dollars 81 \n", + "756 Present value calculation, timber stocks, meth... Dollars 81 \n", + "757 Present value calculation, timber stocks, meth... Dollars 81 \n", + "\n", + " SCALAR_FACTOR SCALAR_ID VECTOR COORDINATE VALUE_1 STATUS SYMBOL \\\n", + "0 millions 6 v3822242 2.1 470.0 None None \n", + "1 millions 6 v3822243 2.2 539.1 None None \n", + "2 millions 6 v3822244 3.1 0.0 None None \n", + "3 millions 6 v3822245 3.2 76.6 None None \n", + "4 millions 6 v3822246 4.1 637.9 None None \n", + ".. ... ... ... ... ... ... ... 
\n", + "753 millions 6 v3822241 1.2 124971.2 None None \n", + "754 millions 6 v3822240 1.1 120498.5 None None \n", + "755 millions 6 v3822241 1.2 120498.5 None None \n", + "756 millions 6 v3822240 1.1 113132.6 None None \n", + "757 millions 6 v3822241 1.2 113132.6 None None \n", + "\n", + " TERMINATED DECIMALS \n", + "0 None 1 \n", + "1 None 1 \n", + "2 None 1 \n", + "3 None 1 \n", + "4 None 1 \n", + ".. ... ... \n", + "753 None 1 \n", + "754 None 1 \n", + "755 None 1 \n", + "756 None 1 \n", + "757 None 1 \n", + "\n", + "[758 rows x 17 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con = duckdb.connect()\n", + "\n", + "issue = con.execute(\"SELECT * FROM '/data/tables/output/en/june_20_2025/38100105.parquet'\").df()\n", + "\n", + "issue" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "cbf8953d-8523-42e8-b28c-6b464869ce61", + "metadata": {}, + "outputs": [], + "source": [ + "files = sorted(glob.glob(\"/data/tables/output/en/june_20_2025/*.parquet\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "d52074f9-7746-4569-9aea-57c204eda2eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'10100164': ['VALUE'],\n", + " '13100902': ['STATUS'],\n", + " '13100904': ['STATUS'],\n", + " '23100049': ['VALUE'],\n", + " '23100050': ['VALUE'],\n", + " '23100268': ['STATUS'],\n", + " '36100374': ['VALUE'],\n", + " '36100396': ['VALUE'],\n", + " '36100397': ['VALUE'],\n", + " '36100658': ['VALUE'],\n", + " '38100104': ['VALUE'],\n", + " '38100105': ['VALUE']}\n" + ] + } + ], + "source": [ + "duplicate_column_names = {}\n", + "for file in files:\n", + "    # Open the Parquet file metadata\n", + "    dataset = pq.ParquetFile(file)\n", + "    # Count names case-insensitively, the way DuckDB compares them\n", + "    name_counts = Counter(x.upper() for x in dataset.schema.names)\n", + "    repeated_names = sorted(x for x, n in name_counts.items() if n > 1)\n",
+ "    if repeated_names:\n", + "        product_id = file.split('/')[-1].split('.parquet')[0]\n", + "        duplicate_column_names[product_id] = repeated_names\n", + "\n", + "pprint.pprint(duplicate_column_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "85dc8ce4-3ba5-4db8-bc2c-9ee101296e84", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/data/tables/output/en/june_20_2025/11100235.parquet\n", + "['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2012 Version 1.1', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n", + "/data/tables/output/en/june_20_2025/11100236.parquet\n", + "['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2012 Version 1.1', 'North American Industry Classification System (NAICS)', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n", + "/data/tables/output/en/june_20_2025/20100014.parquet\n", + "['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2012 Version 1.2', 'Financial variables', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n", + "/data/tables/output/en/june_20_2025/20100015.parquet\n", + "['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 'North American Product Classification System (NAPCS) Canada 2022 Version 1.0', 'Financial variables', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n", + "/data/tables/output/en/june_20_2025/23100313.parquet\n", + "['REF_DATE', 'REF_START_DATE', 'REF_END_DATE', 'GEO', 'DGUID', 
'Distance-capacity public transit service area', 'Location', 'Gender', 'Demographic and socio-economic', 'Sustainable Development Goals (SDGs) 11.2.1 indicator', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']\n" + ] + } + ], + "source": [ + "for file in files:\n", + "    # Open the Parquet file metadata\n", + "    dataset = pq.ParquetFile(file)\n", + "    # Get the column names\n", + "    column_names = dataset.schema.names\n", + "    # Report files where any column name contains a dot\n", + "    if any('.' in x for x in column_names):\n", + "        print(file)\n", + "        print(column_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebc2fb59-105b-425a-9c92-ad04be934df6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}