Made changes to processing of data tables

2026-06-13 14:10:55 +02:00 · 2025-06-21 18:01:16 +00:00
parent 7c8211cb5f
commit 8875722d10
2 changed files with 376 additions and 178 deletions
@@ -178,7 +178,7 @@ def convert_to_lowest_type(df):
    for row in dtypes.itertuples():
        column = row[0]
        the_type = str(row[1])
-        if the_type == 'int64':
+        if the_type == 'Int64':
            df[column] = pd.to_numeric(df[column], downcast='integer')
    return df
@@ -229,6 +229,8 @@ def process_cube(product_id, language="en"):
    - productId 43100011 has all with DECIMAL = 1 (float64)
    - productId 17100009 has DECIMAL = 0 (int64)
    - productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)
    - productId 10100164 has two columns whose names differ only in case: "Value" and "VALUE". read_csv reads it fine, and it exports to parquet without problems.
    DuckDB has an issue with these column names, but Pandas and Polars are able to handle both "Value" and "VALUE".
    """
    cur.execute("SELECT product_id FROM downloaded WHERE product_id = ?", (product_id,))
    result = cur.fetchone()
@@ -268,6 +270,7 @@ def process_cube(product_id, language="en"):
        if column in columns:
            parameters["dtype"][column] = 'int16'
    # The remaining columns should be string, with the exception of VALUE
    for column in columns:
        if column not in columns_always_int_8 and column not in columns_always_int_16 and column != "VALUE":
            parameters["dtype"][column] = 'string'
@@ -25,7 +25,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 1,
   "id": "98859cd6-6fa4-4aef-a113-455699524fae",
   "metadata": {
    "editable": true,
@@ -77,7 +77,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 2,
   "id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
   "metadata": {},
   "outputs": [],
@@ -99,7 +99,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 3,
   "id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
   "metadata": {},
   "outputs": [],
@@ -109,7 +109,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 4,
   "id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
   "metadata": {
    "editable": true,
@@ -150,7 +150,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 5,
   "id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
   "metadata": {},
   "outputs": [],
@@ -163,7 +163,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 6,
   "id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
   "metadata": {},
   "outputs": [],
@@ -212,7 +212,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 7,
   "id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
   "metadata": {},
   "outputs": [],
@@ -222,7 +222,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 8,
   "id": "eddf6501-8428-44cc-8d2d-e245803a3943",
   "metadata": {
    "editable": true,
@@ -296,7 +296,7 @@
    "    for row in dtypes.itertuples():\n",
    "        column = row[0]\n",
    "        the_type = str(row[1])\n",
-    "        if the_type == 'int64':\n",
+    "        if the_type == 'Int64':\n",
    "            df[column] = pd.to_numeric(df[column], downcast='integer')\n",
    "\n",
    "    return df"
@@ -304,7 +304,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 10,
   "id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
   "metadata": {},
   "outputs": [],
@@ -352,7 +352,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 40,
   "id": "858e405e-7c02-4193-8abe-f23951761b09",
   "metadata": {
    "editable": true,
@@ -366,15 +366,13 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Reading /data/tables/scratch/13100102.csv\n",
+      "Already processed 10100164\n",
-      "Index(['REF_DATE', 'GEO', 'DGUID',\n",
+      "Extracting /data/tables/input/en/10100164.zip to /data/tables/scratch\n",
-      "       'North American Industry Classification System (NAICS)',\n",
+      "Reading /data/tables/scratch/10100164.csv\n",
-      "       'Summary statistics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',\n",
+      "Index(['REF_DATE', 'GEO', 'DGUID', 'Value', 'Type of cannabis', 'UOM',\n",
-      "       'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED',\n",
+      "       'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE',\n",
-      "       'DECIMALS'],\n",
+      "       'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'],\n",
-      "      dtype='object')\n",
+      "      dtype='object')\n"
      "{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'North American Industry Classification System (NAICS)': 'string', 'Summary statistics': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n",
      "Reading /data/tables/scratch/13100102.csv as a Pandas dataframe\n"
     ]
    }
   ],
@@ -384,9 +382,11 @@
    "- productId 43100011 has all with DECIMAL = 1 (float64)\n",
    "- productId 17100009 has DECIMAL = 0 (int64)\n",
    "- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)\n",
    "- productId 10100164 has two columns whose names differ only in case: \"Value\" and \"VALUE\". read_csv reads it fine, and it exports to parquet without problems.\n",
    "DuckDB has an issue with these column names, but Pandas and Polars are able to handle both \"Value\" and \"VALUE\".\n",
    "\"\"\"\n",
    "\n",
-    "product_id = \"13100102\"\n",
+    "product_id = \"10100164\"\n",
    "#def process_cube(product_id, language=\"en\"):\n",
    "language = \"en\"\n",
    "cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
@@ -394,7 +394,7 @@
    "if result:\n",
    "    print(f\"Already processed {product_id}\")\n",
    "    #return\n",
-    "#extract_zipfile(product_id, language)\n",
+    "extract_zipfile(product_id, language)\n",
    "\"\"\"\n",
    "The pandas column reader is better than the Polars one\n",
    "Here is an example where polars was not reading it right:\n",
@@ -417,8 +417,25 @@
    "}\n",
    "\n",
    "columns = pd.read_csv(product_csv, nrows=0).columns\n",
-    "print(columns)\n",
+    "print(columns)"
-    "\n",
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "c1f89175-78e5-4e95-8a3e-3b65f0cb4b2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'Value': 'string', 'Type of cannabis': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n",
      "Reading /data/tables/scratch/10100164.csv as a Pandas dataframe\n"
     ]
    }
   ],
   "source": [
    "columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n",
    "for column in columns_always_int_8:\n",
    "    if column in columns:\n",
@@ -429,6 +446,7 @@
    "    if column in columns:\n",
    "        parameters[\"dtype\"][column] = 'int16'\n",
    "\n",
    "# The remaining columns should be string, with the exception of VALUE\n",
    "for column in columns:\n",
    "    if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n",
    "        parameters[\"dtype\"][column] = 'string'\n",
@@ -443,7 +461,221 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 31,
   "id": "87ff5f69-ca1f-40e0-ac73-c73dd8c1bd4d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>REF_DATE</th>\n",
       "      <th>GEO</th>\n",
       "      <th>DGUID</th>\n",
       "      <th>Value</th>\n",
       "      <th>Type of cannabis</th>\n",
       "      <th>UOM</th>\n",
       "      <th>UOM_ID</th>\n",
       "      <th>SCALAR_FACTOR</th>\n",
       "      <th>SCALAR_ID</th>\n",
       "      <th>VECTOR</th>\n",
       "      <th>COORDINATE</th>\n",
       "      <th>VALUE</th>\n",
       "      <th>STATUS</th>\n",
       "      <th>SYMBOL</th>\n",
       "      <th>TERMINATED</th>\n",
       "      <th>DECIMALS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2021/2022</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2021A000011124</td>\n",
       "      <td>Value of sales</td>\n",
       "      <td>Total cannabis products</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
       "      <td>thousands</td>\n",
       "      <td>3</td>\n",
       "      <td>v1490436660</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>4027928.0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2021/2022</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2021A000011124</td>\n",
       "      <td>Value of sales</td>\n",
       "      <td>Dried cannabis</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
       "      <td>thousands</td>\n",
       "      <td>3</td>\n",
       "      <td>v1490436647</td>\n",
       "      <td>1.1.2</td>\n",
       "      <td>2861838.0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2021/2022</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2021A000011124</td>\n",
       "      <td>Value of sales</td>\n",
       "      <td>Inhaled cannabis extracts</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
       "      <td>thousands</td>\n",
       "      <td>3</td>\n",
       "      <td>v1490436648</td>\n",
       "      <td>1.1.3</td>\n",
       "      <td>729178.0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2021/2022</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2021A000011124</td>\n",
       "      <td>Value of sales</td>\n",
       "      <td>Ingested cannabis extracts</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
       "      <td>thousands</td>\n",
       "      <td>3</td>\n",
       "      <td>v1490436649</td>\n",
       "      <td>1.1.4</td>\n",
       "      <td>158283.0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2021/2022</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2021A000011124</td>\n",
       "      <td>Value of sales</td>\n",
       "      <td>Solid cannabis edibles</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
       "      <td>thousands</td>\n",
       "      <td>3</td>\n",
       "      <td>v1490436650</td>\n",
       "      <td>1.1.5</td>\n",
       "      <td>166336.0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    REF_DATE     GEO           DGUID           Value  \\\n",
       "0  2021/2022  Canada  2021A000011124  Value of sales   \n",
       "1  2021/2022  Canada  2021A000011124  Value of sales   \n",
       "2  2021/2022  Canada  2021A000011124  Value of sales   \n",
       "3  2021/2022  Canada  2021A000011124  Value of sales   \n",
       "4  2021/2022  Canada  2021A000011124  Value of sales   \n",
       "\n",
       "             Type of cannabis      UOM  UOM_ID SCALAR_FACTOR  SCALAR_ID  \\\n",
       "0     Total cannabis products  Dollars      81     thousands          3   \n",
       "1              Dried cannabis  Dollars      81     thousands          3   \n",
       "2   Inhaled cannabis extracts  Dollars      81     thousands          3   \n",
       "3  Ingested cannabis extracts  Dollars      81     thousands          3   \n",
       "4      Solid cannabis edibles  Dollars      81     thousands          3   \n",
       "\n",
       "        VECTOR COORDINATE      VALUE STATUS SYMBOL TERMINATED  DECIMALS  \n",
       "0  v1490436660      1.1.1  4027928.0   <NA>   <NA>       <NA>         0  \n",
       "1  v1490436647      1.1.2  2861838.0   <NA>   <NA>       <NA>         0  \n",
       "2  v1490436648      1.1.3   729178.0   <NA>   <NA>       <NA>         0  \n",
       "3  v1490436649      1.1.4   158283.0   <NA>   <NA>       <NA>         0  \n",
       "4  v1490436650      1.1.5   166336.0   <NA>   <NA>       <NA>         0  "
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "fde0e149-d146-4516-8966-0989e4ccf290",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "REF_DATE            string[python]\n",
       "GEO                 string[python]\n",
       "DGUID               string[python]\n",
       "Value               string[python]\n",
       "Type of cannabis    string[python]\n",
       "UOM                 string[python]\n",
       "UOM_ID                       int16\n",
       "SCALAR_FACTOR       string[python]\n",
       "SCALAR_ID                     int8\n",
       "VECTOR              string[python]\n",
       "COORDINATE          string[python]\n",
       "VALUE                      float64\n",
       "STATUS              string[python]\n",
       "SYMBOL              string[python]\n",
       "TERMINATED          string[python]\n",
       "DECIMALS                      int8\n",
       "dtype: object"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "7579a135-1dfe-4fc0-991b-4b261d6577e0",
   "metadata": {},
   "outputs": [
@@ -451,8 +683,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[1]\n",
+      "[0]\n",
-      "{'VALUE': 'float64'}\n"
+      "{'VALUE': 'Int64'}\n"
     ]
    }
   ],
@@ -469,19 +701,36 @@
    "    print(convert_dict)\n",
    "    df = df.astype(convert_dict)\n",
    "elif 0 in (unique_decimal_values):\n",
-    "    if df[\"VALUE\"].dtype != \"int64\":\n",
+    "    if df[\"VALUE\"].dtype != \"Int64\":\n",
    "        # If DECIMALS = [0]\n",
-    "        convert_dict = {\"VALUE\": \"int64\"}\n",
+    "        convert_dict = {\"VALUE\": \"Int64\"}\n",
    "        print(convert_dict)\n",
-    "        df = df.astype(convert_dict)\n",
+    "        df = df.astype(convert_dict)"
-    "\n",
+   ]
-    "df = convert_to_lowest_type(df)\n",
+  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "fe87b3b0-4e04-41a8-a704-75cb9829b0a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = convert_to_lowest_type(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "74507546-4080-4962-88fc-58c1e3943d17",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = compute_ref_date_bounds(df)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 39,
   "id": "6c2781b3-8eea-4317-a8c0-083d97ee04fc",
   "metadata": {},
   "outputs": [
@@ -511,8 +760,8 @@
       "      <th>REF_END_DATE</th>\n",
       "      <th>GEO</th>\n",
       "      <th>DGUID</th>\n",
-       "      <th>North American Industry Classification System (NAICS)</th>\n",
+       "      <th>Value</th>\n",
-       "      <th>Summary statistics</th>\n",
+       "      <th>Type of cannabis</th>\n",
       "      <th>UOM</th>\n",
       "      <th>UOM_ID</th>\n",
       "      <th>SCALAR_FACTOR</th>\n",
@@ -529,144 +778,144 @@
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
-       "      <td>2014</td>\n",
+       "      <td>2021/2022</td>\n",
-       "      <td>2014-01-01</td>\n",
+       "      <td>NaT</td>\n",
-       "      <td>2014-12-31</td>\n",
+       "      <td>NaT</td>\n",
       "      <td>Canada</td>\n",
-       "      <td>2016A000011124</td>\n",
+       "      <td>2021A000011124</td>\n",
-       "      <td>Nursing and residential care facilities [623]</td>\n",
+       "      <td>Value of sales</td>\n",
-       "      <td>Operating revenue</td>\n",
+       "      <td>Total cannabis products</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
-       "      <td>millions</td>\n",
+       "      <td>thousands</td>\n",
-       "      <td>6</td>\n",
+       "      <td>3</td>\n",
-       "      <td>v114809189</td>\n",
+       "      <td>v1490436660</td>\n",
       "      <td>1.1.1</td>\n",
-       "      <td>9310.7</td>\n",
+       "      <td>4027928</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>1</td>\n",
+       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
-       "      <td>2014</td>\n",
+       "      <td>2021/2022</td>\n",
-       "      <td>2014-01-01</td>\n",
+       "      <td>NaT</td>\n",
-       "      <td>2014-12-31</td>\n",
+       "      <td>NaT</td>\n",
       "      <td>Canada</td>\n",
-       "      <td>2016A000011124</td>\n",
+       "      <td>2021A000011124</td>\n",
-       "      <td>Nursing and residential care facilities [623]</td>\n",
+       "      <td>Value of sales</td>\n",
-       "      <td>Operating expenses</td>\n",
+       "      <td>Dried cannabis</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
-       "      <td>millions</td>\n",
+       "      <td>thousands</td>\n",
-       "      <td>6</td>\n",
+       "      <td>3</td>\n",
-       "      <td>v114809190</td>\n",
+       "      <td>v1490436647</td>\n",
       "      <td>1.1.2</td>\n",
-       "      <td>8499.5</td>\n",
+       "      <td>2861838</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>1</td>\n",
+       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
-       "      <td>2014</td>\n",
+       "      <td>2021/2022</td>\n",
-       "      <td>2014-01-01</td>\n",
+       "      <td>NaT</td>\n",
-       "      <td>2014-12-31</td>\n",
+       "      <td>NaT</td>\n",
       "      <td>Canada</td>\n",
-       "      <td>2016A000011124</td>\n",
+       "      <td>2021A000011124</td>\n",
-       "      <td>Nursing and residential care facilities [623]</td>\n",
+       "      <td>Value of sales</td>\n",
-       "      <td>Salaries, wages, commissions and benefits</td>\n",
+       "      <td>Inhaled cannabis extracts</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
-       "      <td>millions</td>\n",
+       "      <td>thousands</td>\n",
-       "      <td>6</td>\n",
+       "      <td>3</td>\n",
-       "      <td>v114809191</td>\n",
+       "      <td>v1490436648</td>\n",
       "      <td>1.1.3</td>\n",
-       "      <td>4630.3</td>\n",
+       "      <td>729178</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>1</td>\n",
+       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
-       "      <td>2014</td>\n",
+       "      <td>2021/2022</td>\n",
-       "      <td>2014-01-01</td>\n",
+       "      <td>NaT</td>\n",
-       "      <td>2014-12-31</td>\n",
+       "      <td>NaT</td>\n",
       "      <td>Canada</td>\n",
-       "      <td>2016A000011124</td>\n",
+       "      <td>2021A000011124</td>\n",
-       "      <td>Nursing and residential care facilities [623]</td>\n",
+       "      <td>Value of sales</td>\n",
-       "      <td>Operating profit margin</td>\n",
+       "      <td>Ingested cannabis extracts</td>\n",
-       "      <td>Percent</td>\n",
+       "      <td>Dollars</td>\n",
-       "      <td>239</td>\n",
+       "      <td>81</td>\n",
-       "      <td>units</td>\n",
+       "      <td>thousands</td>\n",
-       "      <td>0</td>\n",
+       "      <td>3</td>\n",
-       "      <td>v114809192</td>\n",
+       "      <td>v1490436649</td>\n",
       "      <td>1.1.4</td>\n",
-       "      <td>8.7</td>\n",
+       "      <td>158283</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>1</td>\n",
+       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
-       "      <td>2014</td>\n",
+       "      <td>2021/2022</td>\n",
-       "      <td>2014-01-01</td>\n",
+       "      <td>NaT</td>\n",
-       "      <td>2014-12-31</td>\n",
+       "      <td>NaT</td>\n",
-       "      <td>Newfoundland and Labrador</td>\n",
+       "      <td>Canada</td>\n",
-       "      <td>2016A000210</td>\n",
+       "      <td>2021A000011124</td>\n",
-       "      <td>Nursing and residential care facilities [623]</td>\n",
+       "      <td>Value of sales</td>\n",
-       "      <td>Operating revenue</td>\n",
+       "      <td>Solid cannabis edibles</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
-       "      <td>millions</td>\n",
+       "      <td>thousands</td>\n",
-       "      <td>6</td>\n",
+       "      <td>3</td>\n",
-       "      <td>v114809193</td>\n",
+       "      <td>v1490436650</td>\n",
-       "      <td>2.1.1</td>\n",
+       "      <td>1.1.5</td>\n",
-       "      <td>97.9</td>\n",
+       "      <td>166336</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>1</td>\n",
+       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
-       "  REF_DATE REF_START_DATE REF_END_DATE                        GEO  \\\n",
+       "    REF_DATE REF_START_DATE REF_END_DATE     GEO           DGUID  \\\n",
-       "0     2014     2014-01-01   2014-12-31                     Canada   \n",
+       "0  2021/2022            NaT          NaT  Canada  2021A000011124   \n",
-       "1     2014     2014-01-01   2014-12-31                     Canada   \n",
+       "1  2021/2022            NaT          NaT  Canada  2021A000011124   \n",
-       "2     2014     2014-01-01   2014-12-31                     Canada   \n",
+       "2  2021/2022            NaT          NaT  Canada  2021A000011124   \n",
-       "3     2014     2014-01-01   2014-12-31                     Canada   \n",
+       "3  2021/2022            NaT          NaT  Canada  2021A000011124   \n",
-       "4     2014     2014-01-01   2014-12-31  Newfoundland and Labrador   \n",
+       "4  2021/2022            NaT          NaT  Canada  2021A000011124   \n",
       "\n",
-       "            DGUID North American Industry Classification System (NAICS)  \\\n",
+       "            Value            Type of cannabis      UOM  UOM_ID SCALAR_FACTOR  \\\n",
-       "0  2016A000011124      Nursing and residential care facilities [623]      \n",
+       "0  Value of sales     Total cannabis products  Dollars      81     thousands   \n",
-       "1  2016A000011124      Nursing and residential care facilities [623]      \n",
+       "1  Value of sales              Dried cannabis  Dollars      81     thousands   \n",
-       "2  2016A000011124      Nursing and residential care facilities [623]      \n",
+       "2  Value of sales   Inhaled cannabis extracts  Dollars      81     thousands   \n",
-       "3  2016A000011124      Nursing and residential care facilities [623]      \n",
+       "3  Value of sales  Ingested cannabis extracts  Dollars      81     thousands   \n",
-       "4     2016A000210      Nursing and residential care facilities [623]      \n",
+       "4  Value of sales      Solid cannabis edibles  Dollars      81     thousands   \n",
       "\n",
-       "                          Summary statistics      UOM  UOM_ID SCALAR_FACTOR  \\\n",
+       "   SCALAR_ID       VECTOR COORDINATE    VALUE STATUS SYMBOL TERMINATED  \\\n",
-       "0                          Operating revenue  Dollars      81      millions   \n",
+       "0          3  v1490436660      1.1.1  4027928   <NA>   <NA>       <NA>   \n",
-       "1                         Operating expenses  Dollars      81      millions   \n",
+       "1          3  v1490436647      1.1.2  2861838   <NA>   <NA>       <NA>   \n",
-       "2  Salaries, wages, commissions and benefits  Dollars      81      millions   \n",
+       "2          3  v1490436648      1.1.3   729178   <NA>   <NA>       <NA>   \n",
-       "3                    Operating profit margin  Percent     239         units   \n",
+       "3          3  v1490436649      1.1.4   158283   <NA>   <NA>       <NA>   \n",
-       "4                          Operating revenue  Dollars      81      millions   \n",
+       "4          3  v1490436650      1.1.5   166336   <NA>   <NA>       <NA>   \n",
       "\n",
-       "   SCALAR_ID      VECTOR COORDINATE   VALUE STATUS SYMBOL TERMINATED  DECIMALS  \n",
+       "   DECIMALS  \n",
-       "0          6  v114809189      1.1.1  9310.7   <NA>   <NA>       <NA>         1  \n",
+       "0         0  \n",
-       "1          6  v114809190      1.1.2  8499.5   <NA>   <NA>       <NA>         1  \n",
+       "1         0  \n",
-       "2          6  v114809191      1.1.3  4630.3   <NA>   <NA>       <NA>         1  \n",
+       "2         0  \n",
-       "3          0  v114809192      1.1.4     8.7   <NA>   <NA>       <NA>         1  \n",
+       "3         0  \n",
-       "4          6  v114809193      2.1.1    97.9   <NA>   <NA>       <NA>         1  "
+       "4         0  "
      ]
     },
-     "execution_count": 80,
+     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -677,7 +926,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 17,
   "id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9",
   "metadata": {
    "editable": true,
@@ -691,7 +940,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Exporting dataframe as parquet to /data/tables/output/en/13100102.parquet\n"
+      "Exporting dataframe as parquet to /data/tables/output/en/10100164_test.parquet\n"
     ]
    }
   ],
@@ -710,7 +959,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
   "id": "788bc668-8057-4e06-91a3-b99991e0a410",
   "metadata": {
    "editable": true,
@@ -719,28 +968,7 @@
    },
    "tags": []
   },
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Removing scratch files\n",
      "Reading metadata /data/tables/input/metadata/43100011.json\n"
     ]
    },
    {
     "ename": "OperationalError",
     "evalue": "no such column: last_processed",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mOperationalError\u001b[39m                          Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[26]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m      4\u001b[39m os.remove(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mscratch_folder\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mproduct_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m_MetaData.csv\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m      5\u001b[39m update_last_downloaded(product_id)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43mupdate_last_processed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 3\u001b[39m, in \u001b[36mupdate_last_processed\u001b[39m\u001b[34m(product_id)\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mupdate_last_processed\u001b[39m(product_id):\n\u001b[32m      2\u001b[39m     time_finished_processing = datetime.now().isoformat()\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m     \u001b[43mcur\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mUPDATE downloaded SET last_processed = ? WHERE product_id = ?\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtime_finished_processing\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m      4\u001b[39m     con.commit()\n",
      "\u001b[31mOperationalError\u001b[39m: no such column: last_processed"
     ]
    }
   ],
   "source": [
    "# Remove the scratch files\n",
    "print(\"Removing scratch files\")\n",
@@ -749,39 +977,6 @@
    "update_last_downloaded(product_id)\n",
    "update_last_processed(product_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "06fb89ad-77ba-46db-bb88-15f5636d707d",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'process_cube' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[40]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocess_cube\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33m43100011\u001b[39m\u001b[33m\"\u001b[39m)\n",
      "\u001b[31mNameError\u001b[39m: name 'process_cube' is not defined"
     ]
    }
   ],
   "source": [
    "process_cube(\"37100216\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cc04f15-006f-4a3a-9610-65736820ba84",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This one has multiple DECIMAL precision values\n",
    "process_cube(\"43100011)"
   ]
  }
 ],
 "metadata": {