From 8875722d10e2730d41b083fb2efcd8983a439610 Mon Sep 17 00:00:00 2001 From: Diego Ripley Date: Sat, 21 Jun 2025 18:01:16 +0000 Subject: [PATCH] Made changes to processing of data tables --- .../process_files_multiprocessing.py | 5 +- .../process_product_test.ipynb | 549 ++++++++++++------ 2 files changed, 376 insertions(+), 178 deletions(-) diff --git a/experiments/statcan_products/process_files_multiprocessing.py b/experiments/statcan_products/process_files_multiprocessing.py index 73c7cad..1581125 100644 --- a/experiments/statcan_products/process_files_multiprocessing.py +++ b/experiments/statcan_products/process_files_multiprocessing.py @@ -178,7 +178,7 @@ def convert_to_lowest_type(df): for row in dtypes.itertuples(): column = row[0] the_type = str(row[1]) - if the_type == 'int64': + if the_type == 'Int64': df[column] = pd.to_numeric(df[column], downcast='integer') return df @@ -229,6 +229,8 @@ def process_cube(product_id, language="en"): - productId 43100011 has all with DECIMAL = 1 (float64) - productId 17100009 has DECIMAL = 0 (int64) - productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64) + - productId 10100164 has two columns named the same "Value" and "VALUE". It is processed fine with the read_csv, and when it is exported as parquet. + DuckDB has an issue with it, but Pandas and Polars are able to handle "Value" and "VALUE" """ cur.execute("SELECT product_id FROM downloaded WHERE product_id = ?", (product_id,)) result = cur.fetchone() @@ -268,6 +270,7 @@ def process_cube(product_id, language="en"): if column in columns: parameters["dtype"][column] = 'int16' + # The remaining columns should be string, with the exception of VALUE for column in columns: if column not in columns_always_int_8 and column not in columns_always_int_16 and column != "VALUE": parameters["dtype"][column] = 'string' diff --git a/experiments/statcan_products/process_product_test.ipynb b/experiments/statcan_products/process_product_test.ipynb index ea26878..886bab3 100644 --- a/experiments/statcan_products/process_product_test.ipynb +++ b/experiments/statcan_products/process_product_test.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 1, "id": "98859cd6-6fa4-4aef-a113-455699524fae", "metadata": { "editable": true, @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 2, "id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591", "metadata": {}, "outputs": [], @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f", "metadata": {}, "outputs": [], @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 4, "id": "0af9a4b3-7b59-460b-b933-504919d4bd2a", "metadata": { "editable": true, @@ -150,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 5, "id": "4b7996d2-75ab-4173-a17a-64fb7ab63740", "metadata": {}, "outputs": [], @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 6, "id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b", "metadata": {}, "outputs": [], @@ -212,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 7, "id": "dc5573ef-734b-44d8-a4c4-0df19d655975", "metadata": {}, "outputs": [], @@ -222,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 8, "id": "eddf6501-8428-44cc-8d2d-e245803a3943", "metadata": { "editable": true, @@ -296,7 +296,7 @@ " for row in dtypes.itertuples():\n", " column = row[0]\n", " the_type = str(row[1])\n", - " if the_type == 'int64':\n", + " if the_type == 'Int64':\n", " df[column] = pd.to_numeric(df[column], downcast='integer')\n", "\n", " return df" @@ -304,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 10, "id": "144e3716-b0e7-4a39-9a25-ededea506f4f", "metadata": {}, "outputs": [], @@ -352,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 40, "id": "858e405e-7c02-4193-8abe-f23951761b09", "metadata": { "editable": true, @@ -366,15 +366,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Reading /data/tables/scratch/13100102.csv\n", - "Index(['REF_DATE', 'GEO', 'DGUID',\n", - " 'North American Industry Classification System (NAICS)',\n", - " 'Summary statistics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',\n", - " 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED',\n", - " 'DECIMALS'],\n", - " dtype='object')\n", - "{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'North American Industry Classification System (NAICS)': 'string', 'Summary statistics': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n", - "Reading /data/tables/scratch/13100102.csv as a Pandas dataframe\n" + "Already processed 10100164\n", + "Extracting /data/tables/input/en/10100164.zip to /data/tables/scratch\n", + "Reading /data/tables/scratch/10100164.csv\n", + "Index(['REF_DATE', 'GEO', 'DGUID', 'Value', 'Type of cannabis', 'UOM',\n", + " 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE',\n", + " 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'],\n", + " dtype='object')\n" ] } ], @@ -384,9 +382,11 @@ "- productId 43100011 has all with DECIMAL = 1 (float64)\n", "- productId 17100009 has DECIMAL = 0 (int64)\n", "- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)\n", + "- productId 10100164 has two columns named the same \"Value\" and \"VALUE\". It is processed fine with the read_csv, and when it is exported as parquet.\n", + "DuckDB has an issue with it, but Pandas and Polars are able to handle \"Value\" and \"VALUE\"\n", "\"\"\"\n", "\n", - "product_id = \"13100102\"\n", + "product_id = \"10100164\"\n", "#def process_cube(product_id, language=\"en\"):\n", "language = \"en\"\n", "cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n", @@ -394,7 +394,7 @@ "if result:\n", " print(f\"Already processed {product_id}\")\n", " #return\n", - "#extract_zipfile(product_id, language)\n", + "extract_zipfile(product_id, language)\n", "\"\"\"\n", "The pandas column reader is better than the Polars one\n", "Here is an example where polars was not reading it right:\n", @@ -417,8 +417,25 @@ "}\n", "\n", "columns = pd.read_csv(product_csv, nrows=0).columns\n", - "print(columns)\n", - "\n", + "print(columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "c1f89175-78e5-4e95-8a3e-3b65f0cb4b2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'Value': 'string', 'Type of cannabis': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n", + "Reading /data/tables/scratch/10100164.csv as a Pandas dataframe\n" + ] + } + ], + "source": [ "columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n", "for column in columns_always_int_8:\n", " if column in columns:\n", @@ -429,6 +446,7 @@ " if column in columns:\n", " parameters[\"dtype\"][column] = 'int16'\n", "\n", + "# The remaining columns should be string, with the exception of VALUE\n", "for column in columns:\n", " if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n", " parameters[\"dtype\"][column] = 'string'\n", @@ -443,7 +461,221 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 31, + "id": "87ff5f69-ca1f-40e0-ac73-c73dd8c1bd4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
REF_DATEGEODGUIDValueType of cannabisUOMUOM_IDSCALAR_FACTORSCALAR_IDVECTORCOORDINATEVALUESTATUSSYMBOLTERMINATEDDECIMALS
02021/2022Canada2021A000011124Value of salesTotal cannabis productsDollars81thousands3v14904366601.1.14027928.0<NA><NA><NA>0
12021/2022Canada2021A000011124Value of salesDried cannabisDollars81thousands3v14904366471.1.22861838.0<NA><NA><NA>0
22021/2022Canada2021A000011124Value of salesInhaled cannabis extractsDollars81thousands3v14904366481.1.3729178.0<NA><NA><NA>0
32021/2022Canada2021A000011124Value of salesIngested cannabis extractsDollars81thousands3v14904366491.1.4158283.0<NA><NA><NA>0
42021/2022Canada2021A000011124Value of salesSolid cannabis ediblesDollars81thousands3v14904366501.1.5166336.0<NA><NA><NA>0
\n", + "
" + ], + "text/plain": [ + " REF_DATE GEO DGUID Value \\\n", + "0 2021/2022 Canada 2021A000011124 Value of sales \n", + "1 2021/2022 Canada 2021A000011124 Value of sales \n", + "2 2021/2022 Canada 2021A000011124 Value of sales \n", + "3 2021/2022 Canada 2021A000011124 Value of sales \n", + "4 2021/2022 Canada 2021A000011124 Value of sales \n", + "\n", + " Type of cannabis UOM UOM_ID SCALAR_FACTOR SCALAR_ID \\\n", + "0 Total cannabis products Dollars 81 thousands 3 \n", + "1 Dried cannabis Dollars 81 thousands 3 \n", + "2 Inhaled cannabis extracts Dollars 81 thousands 3 \n", + "3 Ingested cannabis extracts Dollars 81 thousands 3 \n", + "4 Solid cannabis edibles Dollars 81 thousands 3 \n", + "\n", + " VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS \n", + "0 v1490436660 1.1.1 4027928.0 0 \n", + "1 v1490436647 1.1.2 2861838.0 0 \n", + "2 v1490436648 1.1.3 729178.0 0 \n", + "3 v1490436649 1.1.4 158283.0 0 \n", + "4 v1490436650 1.1.5 166336.0 0 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "fde0e149-d146-4516-8966-0989e4ccf290", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "REF_DATE string[python]\n", + "GEO string[python]\n", + "DGUID string[python]\n", + "Value string[python]\n", + "Type of cannabis string[python]\n", + "UOM string[python]\n", + "UOM_ID int16\n", + "SCALAR_FACTOR string[python]\n", + "SCALAR_ID int8\n", + "VECTOR string[python]\n", + "COORDINATE string[python]\n", + "VALUE float64\n", + "STATUS string[python]\n", + "SYMBOL string[python]\n", + "TERMINATED string[python]\n", + "DECIMALS int8\n", + "dtype: object" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 33, "id": "7579a135-1dfe-4fc0-991b-4b261d6577e0", "metadata": {}, "outputs": [ @@ -451,8 +683,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[1]\n", - "{'VALUE': 'float64'}\n" + "[0]\n", + "{'VALUE': 'Int64'}\n" ] } ], @@ -469,19 +701,36 @@ " print(convert_dict)\n", " df = df.astype(convert_dict)\n", "elif 0 in (unique_decimal_values):\n", - " if df[\"VALUE\"].dtype != \"int64\":\n", + " if df[\"VALUE\"].dtype != \"Int64\":\n", " # If DECIMALS = [0]\n", - " convert_dict = {\"VALUE\": \"int64\"}\n", + " convert_dict = {\"VALUE\": \"Int64\"}\n", " print(convert_dict)\n", - " df = df.astype(convert_dict)\n", - "\n", - "df = convert_to_lowest_type(df)\n", + " df = df.astype(convert_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "fe87b3b0-4e04-41a8-a704-75cb9829b0a5", + "metadata": {}, + "outputs": [], + "source": [ + "df = convert_to_lowest_type(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "74507546-4080-4962-88fc-58c1e3943d17", + "metadata": {}, + "outputs": [], + "source": [ "df = compute_ref_date_bounds(df)" ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 39, "id": "6c2781b3-8eea-4317-a8c0-083d97ee04fc", "metadata": {}, "outputs": [ @@ -511,8 +760,8 @@ " REF_END_DATE\n", " GEO\n", " DGUID\n", - " North American Industry Classification System (NAICS)\n", - " Summary statistics\n", + " Value\n", + " Type of cannabis\n", " UOM\n", " UOM_ID\n", " SCALAR_FACTOR\n", @@ -529,144 +778,144 @@ " \n", " \n", " 0\n", - " 2014\n", - " 2014-01-01\n", - " 2014-12-31\n", + " 2021/2022\n", + " NaT\n", + " NaT\n", " Canada\n", - " 2016A000011124\n", - " Nursing and residential care facilities [623]\n", - " Operating revenue\n", + " 2021A000011124\n", + " Value of sales\n", + " Total cannabis products\n", " Dollars\n", " 81\n", - " millions\n", - " 6\n", - " v114809189\n", + " thousands\n", + " 3\n", + " v1490436660\n", " 1.1.1\n", - " 9310.7\n", + " 4027928\n", " <NA>\n", " <NA>\n", " <NA>\n", - " 1\n", + " 0\n", " \n", " \n", " 1\n", - " 2014\n", - " 2014-01-01\n", - " 2014-12-31\n", + " 2021/2022\n", + " NaT\n", + " NaT\n", " Canada\n", - " 2016A000011124\n", - " Nursing and residential care facilities [623]\n", - " Operating expenses\n", + " 2021A000011124\n", + " Value of sales\n", + " Dried cannabis\n", " Dollars\n", " 81\n", - " millions\n", - " 6\n", - " v114809190\n", + " thousands\n", + " 3\n", + " v1490436647\n", " 1.1.2\n", - " 8499.5\n", + " 2861838\n", " <NA>\n", " <NA>\n", " <NA>\n", - " 1\n", + " 0\n", " \n", " \n", " 2\n", - " 2014\n", - " 2014-01-01\n", - " 2014-12-31\n", + " 2021/2022\n", + " NaT\n", + " NaT\n", " Canada\n", - " 2016A000011124\n", - " Nursing and residential care facilities [623]\n", - " Salaries, wages, commissions and benefits\n", + " 2021A000011124\n", + " Value of sales\n", + " Inhaled cannabis extracts\n", " Dollars\n", " 81\n", - " millions\n", - " 6\n", - " v114809191\n", + " thousands\n", + " 3\n", + " v1490436648\n", " 1.1.3\n", - " 4630.3\n", + " 729178\n", " <NA>\n", " <NA>\n", " <NA>\n", - " 1\n", + " 0\n", " \n", " \n", " 3\n", - " 2014\n", - " 2014-01-01\n", - " 2014-12-31\n", + " 2021/2022\n", + " NaT\n", + " NaT\n", " Canada\n", - " 2016A000011124\n", - " Nursing and residential care facilities [623]\n", - " Operating profit margin\n", - " Percent\n", - " 239\n", - " units\n", - " 0\n", - " v114809192\n", + " 2021A000011124\n", + " Value of sales\n", + " Ingested cannabis extracts\n", + " Dollars\n", + " 81\n", + " thousands\n", + " 3\n", + " v1490436649\n", " 1.1.4\n", - " 8.7\n", + " 158283\n", " <NA>\n", " <NA>\n", " <NA>\n", - " 1\n", + " 0\n", " \n", " \n", " 4\n", - " 2014\n", - " 2014-01-01\n", - " 2014-12-31\n", - " Newfoundland and Labrador\n", - " 2016A000210\n", - " Nursing and residential care facilities [623]\n", - " Operating revenue\n", + " 2021/2022\n", + " NaT\n", + " NaT\n", + " Canada\n", + " 2021A000011124\n", + " Value of sales\n", + " Solid cannabis edibles\n", " Dollars\n", " 81\n", - " millions\n", - " 6\n", - " v114809193\n", - " 2.1.1\n", - " 97.9\n", + " thousands\n", + " 3\n", + " v1490436650\n", + " 1.1.5\n", + " 166336\n", " <NA>\n", " <NA>\n", " <NA>\n", - " 1\n", + " 0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " REF_DATE REF_START_DATE REF_END_DATE GEO \\\n", - "0 2014 2014-01-01 2014-12-31 Canada \n", - "1 2014 2014-01-01 2014-12-31 Canada \n", - "2 2014 2014-01-01 2014-12-31 Canada \n", - "3 2014 2014-01-01 2014-12-31 Canada \n", - "4 2014 2014-01-01 2014-12-31 Newfoundland and Labrador \n", + " REF_DATE REF_START_DATE REF_END_DATE GEO DGUID \\\n", + "0 2021/2022 NaT NaT Canada 2021A000011124 \n", + "1 2021/2022 NaT NaT Canada 2021A000011124 \n", + "2 2021/2022 NaT NaT Canada 2021A000011124 \n", + "3 2021/2022 NaT NaT Canada 2021A000011124 \n", + "4 2021/2022 NaT NaT Canada 2021A000011124 \n", "\n", - " DGUID North American Industry Classification System (NAICS) \\\n", - "0 2016A000011124 Nursing and residential care facilities [623] \n", - "1 2016A000011124 Nursing and residential care facilities [623] \n", - "2 2016A000011124 Nursing and residential care facilities [623] \n", - "3 2016A000011124 Nursing and residential care facilities [623] \n", - "4 2016A000210 Nursing and residential care facilities [623] \n", + " Value Type of cannabis UOM UOM_ID SCALAR_FACTOR \\\n", + "0 Value of sales Total cannabis products Dollars 81 thousands \n", + "1 Value of sales Dried cannabis Dollars 81 thousands \n", + "2 Value of sales Inhaled cannabis extracts Dollars 81 thousands \n", + "3 Value of sales Ingested cannabis extracts Dollars 81 thousands \n", + "4 Value of sales Solid cannabis edibles Dollars 81 thousands \n", "\n", - " Summary statistics UOM UOM_ID SCALAR_FACTOR \\\n", - "0 Operating revenue Dollars 81 millions \n", - "1 Operating expenses Dollars 81 millions \n", - "2 Salaries, wages, commissions and benefits Dollars 81 millions \n", - "3 Operating profit margin Percent 239 units \n", - "4 Operating revenue Dollars 81 millions \n", + " SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED \\\n", + "0 3 v1490436660 1.1.1 4027928 \n", + "1 3 v1490436647 1.1.2 2861838 \n", + "2 3 v1490436648 1.1.3 729178 \n", + "3 3 v1490436649 1.1.4 158283 \n", + "4 3 v1490436650 1.1.5 166336 \n", "\n", - " SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS \n", - "0 6 v114809189 1.1.1 9310.7 1 \n", - "1 6 v114809190 1.1.2 8499.5 1 \n", - "2 6 v114809191 1.1.3 4630.3 1 \n", - "3 0 v114809192 1.1.4 8.7 1 \n", - "4 6 v114809193 2.1.1 97.9 1 " + " DECIMALS \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " ] }, - "execution_count": 80, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -677,7 +926,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 17, "id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9", "metadata": { "editable": true, @@ -691,7 +940,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Exporting dataframe as parquet to /data/tables/output/en/13100102.parquet\n" + "Exporting dataframe as parquet to /data/tables/output/en/10100164_test.parquet\n" ] } ], @@ -710,7 +959,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "788bc668-8057-4e06-91a3-b99991e0a410", "metadata": { "editable": true, @@ -719,28 +968,7 @@ }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Removing scratch files\n", - "Reading metadata /data/tables/input/metadata/43100011.json\n" - ] - }, - { - "ename": "OperationalError", - "evalue": "no such column: last_processed", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mOperationalError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[26]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 4\u001b[39m os.remove(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mscratch_folder\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mproduct_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m_MetaData.csv\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 5\u001b[39m update_last_downloaded(product_id)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43mupdate_last_processed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 3\u001b[39m, in \u001b[36mupdate_last_processed\u001b[39m\u001b[34m(product_id)\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mupdate_last_processed\u001b[39m(product_id):\n\u001b[32m 2\u001b[39m time_finished_processing = datetime.now().isoformat()\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43mcur\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mUPDATE downloaded SET last_processed = ? WHERE product_id = ?\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtime_finished_processing\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m con.commit()\n", - "\u001b[31mOperationalError\u001b[39m: no such column: last_processed" - ] - } - ], + "outputs": [], "source": [ "# Remove the scratch files\n", "print(\"Removing scratch files\")\n", @@ -749,39 +977,6 @@ "update_last_downloaded(product_id)\n", "update_last_processed(product_id)" ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "06fb89ad-77ba-46db-bb88-15f5636d707d", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'process_cube' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[40]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocess_cube\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33m43100011\u001b[39m\u001b[33m\"\u001b[39m)\n", - "\u001b[31mNameError\u001b[39m: name 'process_cube' is not defined" - ] - } - ], - "source": [ - "process_cube(\"37100216\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9cc04f15-006f-4a3a-9610-65736820ba84", - "metadata": {}, - "outputs": [], - "source": [ - "# This one has multiple DECIMAL precision values\n", - "process_cube(\"43100011)" - ] } ], "metadata": {