Made changes

2026-06-13 14:10:55 +02:00 · 2025-06-20 17:32:01 -04:00
parent 5a95616b3c
commit 72ca6c87e1
2 changed files with 407 additions and 59 deletions
@@ -8,7 +8,6 @@ import zipfile
 from zoneinfo import ZoneInfo
 import pandas as pd
 import polars as pl
 import requests
 from tqdm import tqdm
@@ -17,7 +16,6 @@ input_folder = f"{data_folder}/input"
 scratch_folder = f"{data_folder}/scratch"
 output_folder = f"{data_folder}/output"
 if not os.path.exists(f"{data_folder}/processing.db"):
    con = sqlite3.connect(f"{data_folder}/processing.db")
    cur = con.cursor()
@@ -42,7 +40,7 @@ def setup():
    """
    Makes data folders
    """
-    folders_to_create = [data_folder, input_folder,
+    folders_to_create = [data_folder, input_folder, 
                         scratch_folder, output_folder,
                         f"{input_folder}/en", f"{output_folder}/en",
                         f"{input_folder}/fr", f"{output_folder}/fr",
@@ -125,29 +123,62 @@ def update_tables():
        download_cube(product_id)
        process_cube(product_id)
 def compute_ref_date_bounds(df):
    """
    TODO: There are cases where the REF_DATE is a range, ex. 2023/2024.
    For productId 17100022 the period is from July 1 to June 30 (seen in the metadata), so can't just 
    use January 1, 2023 and December 31, 2024
    """
    series = df["REF_DATE"]
    # Initialize the two new columns with NaT
    df["REF_START_DATE"] = pd.NaT
    df["REF_END_DATE"] = pd.NaT
    # Skip rows that contain slashes
    valid_mask = ~series.str.contains("/", na=False)
    # Case 1: YYYY-MM-DD
    full_mask = valid_mask & series.str.fullmatch(r"\d{4}-\d{2}-\d{2}")
    parsed_full = pd.to_datetime(series[full_mask], format="%Y-%m-%d", errors="coerce")
    df.loc[full_mask, "REF_START_DATE"] = parsed_full
    df.loc[full_mask, "REF_END_DATE"] = parsed_full
    # Case 2: YYYY-MM
    month_mask = valid_mask & series.str.fullmatch(r"\d{4}-\d{2}")
    parsed_month = pd.to_datetime(series[month_mask], format="%Y-%m", errors="coerce")
    df.loc[month_mask, "REF_START_DATE"] = parsed_month
    df.loc[month_mask, "REF_END_DATE"] = parsed_month + pd.to_timedelta(
        parsed_month.dt.days_in_month - 1, unit='D'
    )
    # Case 3: YYYY
    year_mask = valid_mask & series.str.fullmatch(r"\d{4}")
    parsed_year = pd.to_datetime(series[year_mask], format="%Y", errors="coerce")
    df.loc[year_mask, "REF_START_DATE"] = parsed_year
    df.loc[year_mask, "REF_END_DATE"] = parsed_year + pd.offsets.YearEnd(0)
    # Move columns after REF_DATE
    ref_idx = df.columns.get_loc("REF_DATE")
    cols = list(df.columns)
    cols.remove("REF_START_DATE")
    cols.remove("REF_END_DATE")
    cols[ref_idx + 1:ref_idx + 1] = ["REF_START_DATE", "REF_END_DATE"]
    return df[cols]
 def convert_to_lowest_type(df):
    """
    Convert columns to the best possible dtypes
-    For example, if the column is numerical and has a maximum value of 32,000
+    For example, if the column is numerical and has a maximum value of 32,000 
    we can assign it a type of int16
    """
    print("Converting dataframe to optimal data types")
    params = {
        'convert_string': False,
        'convert_boolean': False
    }
    df = df.convert_dtypes(**params)
    dtypes = pd.DataFrame(df.dtypes)
    # Downcast to the smallest numerical dtype
    for row in dtypes.itertuples():
        column = row[0]
        the_type = str(row[1])
-        # Skipping downcasting Float64 as there were issues with decimal places
+        if the_type == 'int64':
        # For example, instead of a value being 65.4, it turned into 65.4000015258789
        if the_type == 'Float64':
            continue
        elif the_type == 'Int64':
            df[column] = pd.to_numeric(df[column], downcast='integer')
    return df
@@ -193,6 +224,17 @@ def download_cube(product_id, language="en"):
        progress_bar.close()
 def process_cube(product_id, language="en"):
    """
    Examples: 
    - productId 43100011 has all with DECIMAL = 1 (float64)
    - productId 17100009 has DECIMAL = 0 (int64)
    - productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)
    """
    cur.execute("SELECT product_id FROM downloaded WHERE product_id = ?", (product_id,))
    result = cur.fetchone()
    if result:
        print(f"Already processed {product_id}")
        return
    extract_zipfile(product_id, language)
    """
    The pandas column reader is better than the Polars one
@@ -207,20 +249,63 @@ def process_cube(product_id, language="en"):
    #    json.dump(metadata, outfile)
    # Read CSV using Pandas
    product_csv = f"{scratch_folder}/{product_id}.csv"
    print(f"Reading {product_csv}")
    parameters = {
        "engine": "c",
-        "low_memory": True
+        "low_memory": True,
        "nrows": 100000,
        "dtype": {}
    }
    columns = pd.read_csv(product_csv, nrows=0).columns
    columns_always_int_8 = ["DECIMALS", "SCALAR_ID"]
    for column in columns_always_int_8:
        if column in columns:
            parameters["dtype"][column] = 'int8'
    columns_always_int_16 = ["UOM_ID"]
    for column in columns_always_int_16:
        if column in columns:
            parameters["dtype"][column] = 'int16'
    for column in columns:
        if column not in columns_always_int_8 and column not in columns_always_int_16 and column != "VALUE":
            parameters["dtype"][column] = 'string'
    if not parameters["dtype"]:
        del parameters["dtype"]
    print(f"Reading {product_csv} as a Pandas dataframe")
    df = pd.read_csv(product_csv, **parameters)
    unique_decimal_values = df["DECIMALS"].unique()
    if any(unique_decimal_values):
        """
        A table can have both float and integer in the VALUE field. 
        productId 11100025 is an example
        So if we have unique values for DECIMALS to be [0,1], then we convert to float64
        """
        convert_dict = {"VALUE": "float64"}
        print(convert_dict)
        df = df.astype(convert_dict)
    elif 0 in (unique_decimal_values):
        if df["VALUE"].dtype != "Int64":
            # If DECIMALS = [0]
            convert_dict = {"VALUE": "Int64"}
            print(convert_dict)
            df = df.astype(convert_dict)
    df = convert_to_lowest_type(df)
-    print("Import Pandas dataframe as a Polars dataframe")
+    df = compute_ref_date_bounds(df)
    df = pl.from_pandas(df)
    output_parquet = f"{output_folder}/{language}/{product_id}.parquet"
    print(f"Exporting dataframe as parquet to {output_parquet}")
-    df.write_parquet(output_parquet,
+    parameters = {
-                     compression='zstd',
+        "path": output_parquet,
-                     compression_level=22)
+        "engine": "pyarrow",
        "compression": "zstd",
        "index": False,
        "compression_level": 22
    }
    df.to_parquet(**parameters)
    # Remove the scratch files
    print("Removing scratch files")
    os.remove(f"{scratch_folder}/{product_id}.csv")
@@ -228,6 +313,7 @@ def process_cube(product_id, language="en"):
    update_last_downloaded(product_id)
    update_last_processed(product_id)
 if __name__ == '__main__':
    setup()
    files_to_process = glob.glob(f"{input_folder}/en/*.zip")
@@ -235,5 +321,7 @@ if __name__ == '__main__':
    files_to_process = [x.split("/")[-1].split(".zip")[0] for x in files_to_process]
    to_process = len(files_to_process)
    print(f"Processing {to_process}")
-    with Pool(4) as p:
+    #for product_id in files_to_process:
-        p.map(process_cube, files_to_process)
+    #    process_cube(product_id)
    with Pool(processes=16) as p:
        p.map(process_cube, files_to_process, chunksize=8)
@@ -25,7 +25,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 20,
   "id": "98859cd6-6fa4-4aef-a113-455699524fae",
   "metadata": {
    "editable": true,
@@ -77,7 +77,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 21,
   "id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
   "metadata": {},
   "outputs": [],
@@ -99,7 +99,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 22,
   "id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
   "metadata": {},
   "outputs": [],
@@ -109,7 +109,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 23,
   "id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
   "metadata": {
    "editable": true,
@@ -136,12 +136,12 @@
    "    last_updated = datetime.strptime(last_updated, \"%Y-%m-%dT%H:%M\")\n",
    "    last_updated = last_updated.replace(tzinfo=ZoneInfo(\"America/Toronto\"))\n",
    "    last_updated = last_updated.astimezone(ZoneInfo(\"UTC\")).isoformat()\n",
-    "    \n",
+    "\n",
    "    data = (product_id, last_updated)\n",
    "    cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
    "    result = cur.fetchone()\n",
    "    if not result:\n",
-    "        cur.execute(\"INSERT INTO downloaded VALUES (?, ?)\", data)\n",
+    "        cur.execute(\"INSERT INTO downloaded (product_id, last_updated) VALUES (?, ?)\", data)\n",
    "    else:\n",
    "        cur.execute(\"UPDATE downloaded SET last_updated = ? WHERE product_id = ?\", (last_updated, product_id))\n",
    "\n",
@@ -150,7 +150,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 24,
   "id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
   "metadata": {},
   "outputs": [],
@@ -163,7 +163,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 25,
   "id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
   "metadata": {},
   "outputs": [],
@@ -212,7 +212,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 26,
   "id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
   "metadata": {},
   "outputs": [],
@@ -222,7 +222,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
   "id": "eddf6501-8428-44cc-8d2d-e245803a3943",
   "metadata": {
    "editable": true,
@@ -280,7 +280,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 28,
   "id": "e67642d3-cc6c-4c5d-b5a3-2fe18364ad71",
   "metadata": {},
   "outputs": [],
@@ -304,7 +304,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 29,
   "id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
   "metadata": {},
   "outputs": [],
@@ -352,7 +352,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 78,
   "id": "858e405e-7c02-4193-8abe-f23951761b09",
   "metadata": {
    "editable": true,
@@ -361,9 +361,32 @@
    },
    "tags": []
   },
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading /data/tables/scratch/13100102.csv\n",
      "Index(['REF_DATE', 'GEO', 'DGUID',\n",
      "       'North American Industry Classification System (NAICS)',\n",
      "       'Summary statistics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',\n",
      "       'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED',\n",
      "       'DECIMALS'],\n",
      "      dtype='object')\n",
      "{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'North American Industry Classification System (NAICS)': 'string', 'Summary statistics': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n",
      "Reading /data/tables/scratch/13100102.csv as a Pandas dataframe\n"
     ]
    }
   ],
   "source": [
-    "product_id = \"43100011\"\n",
+    "\"\"\"\n",
    "Examples: \n",
    "- productId 43100011 has all with DECIMAL = 1 (float64)\n",
    "- productId 17100009 has DECIMAL = 0 (int64)\n",
    "- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)\n",
    "\"\"\"\n",
    "\n",
    "product_id = \"13100102\"\n",
    "#def process_cube(product_id, language=\"en\"):\n",
    "language = \"en\"\n",
    "cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
@@ -385,13 +408,16 @@
    "#    json.dump(metadata, outfile)\n",
    "# Read CSV using Pandas\n",
    "product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
    "print(f\"Reading {product_csv}\")\n",
    "parameters = {\n",
    "    \"engine\": \"c\",\n",
    "    \"low_memory\": True,\n",
-    "    #\"nrows\": 1000000,\n",
+    "    \"nrows\": 1000000,\n",
    "    \"dtype\": {}\n",
    "}\n",
    "\n",
    "columns = pd.read_csv(product_csv, nrows=0).columns\n",
    "print(columns)\n",
    "\n",
    "columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n",
    "for column in columns_always_int_8:\n",
@@ -407,24 +433,46 @@
    "    if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n",
    "        parameters[\"dtype\"][column] = 'string'\n",
    "\n",
    "print(parameters)\n",
    "if not parameters[\"dtype\"]:\n",
    "    del parameters[\"dtype\"]\n",
    "\n",
    "print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
-    "df = pd.read_csv(product_csv, **parameters)\n",
+    "df = pd.read_csv(product_csv, **parameters)"
-    "\n",
+   ]
-    "unique_decimal_values = len(df[\"DECIMALS\"].unique())\n",
+  },
-    "if unique_decimal_values > 1:\n",
+  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "7579a135-1dfe-4fc0-991b-4b261d6577e0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1]\n",
      "{'VALUE': 'float64'}\n"
     ]
    }
   ],
   "source": [
    "unique_decimal_values = df[\"DECIMALS\"].unique()\n",
    "print(unique_decimal_values)\n",
    "if any(unique_decimal_values):\n",
    "    \"\"\"\n",
-    "    Example of when you can have a table with DECIMALS = 0 and DECIMALS = 1, productId 11100025\n",
+    "    A table can have both float and integer in the VALUE field. \n",
    "    productId 11100025 is an example\n",
    "    So if we have unique values for DECIMALS to be [0,1], then we convert to float64\n",
    "    \"\"\"\n",
    "    # Turn to float if not already\n",
    "    convert_dict = {\"VALUE\": \"float64\"}\n",
    "    print(convert_dict)\n",
    "    df = df.astype(convert_dict)\n",
-    "elif unique_decimal_values == 1:\n",
+    "elif 0 in (unique_decimal_values):\n",
    "    if df[\"VALUE\"].dtype != \"int64\":\n",
-    "        # Turn to int64 if not already\n",
+    "        # If DECIMALS = [0]\n",
    "        convert_dict = {\"VALUE\": \"int64\"}\n",
    "        print(convert_dict)\n",
    "        df = df.astype(convert_dict)\n",
    "\n",
    "df = convert_to_lowest_type(df)\n",
@@ -433,7 +481,203 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 80,
   "id": "6c2781b3-8eea-4317-a8c0-083d97ee04fc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>REF_DATE</th>\n",
       "      <th>REF_START_DATE</th>\n",
       "      <th>REF_END_DATE</th>\n",
       "      <th>GEO</th>\n",
       "      <th>DGUID</th>\n",
       "      <th>North American Industry Classification System (NAICS)</th>\n",
       "      <th>Summary statistics</th>\n",
       "      <th>UOM</th>\n",
       "      <th>UOM_ID</th>\n",
       "      <th>SCALAR_FACTOR</th>\n",
       "      <th>SCALAR_ID</th>\n",
       "      <th>VECTOR</th>\n",
       "      <th>COORDINATE</th>\n",
       "      <th>VALUE</th>\n",
       "      <th>STATUS</th>\n",
       "      <th>SYMBOL</th>\n",
       "      <th>TERMINATED</th>\n",
       "      <th>DECIMALS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2014</td>\n",
       "      <td>2014-01-01</td>\n",
       "      <td>2014-12-31</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2016A000011124</td>\n",
       "      <td>Nursing and residential care facilities [623]</td>\n",
       "      <td>Operating revenue</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
       "      <td>millions</td>\n",
       "      <td>6</td>\n",
       "      <td>v114809189</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>9310.7</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2014</td>\n",
       "      <td>2014-01-01</td>\n",
       "      <td>2014-12-31</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2016A000011124</td>\n",
       "      <td>Nursing and residential care facilities [623]</td>\n",
       "      <td>Operating expenses</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
       "      <td>millions</td>\n",
       "      <td>6</td>\n",
       "      <td>v114809190</td>\n",
       "      <td>1.1.2</td>\n",
       "      <td>8499.5</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2014</td>\n",
       "      <td>2014-01-01</td>\n",
       "      <td>2014-12-31</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2016A000011124</td>\n",
       "      <td>Nursing and residential care facilities [623]</td>\n",
       "      <td>Salaries, wages, commissions and benefits</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
       "      <td>millions</td>\n",
       "      <td>6</td>\n",
       "      <td>v114809191</td>\n",
       "      <td>1.1.3</td>\n",
       "      <td>4630.3</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2014</td>\n",
       "      <td>2014-01-01</td>\n",
       "      <td>2014-12-31</td>\n",
       "      <td>Canada</td>\n",
       "      <td>2016A000011124</td>\n",
       "      <td>Nursing and residential care facilities [623]</td>\n",
       "      <td>Operating profit margin</td>\n",
       "      <td>Percent</td>\n",
       "      <td>239</td>\n",
       "      <td>units</td>\n",
       "      <td>0</td>\n",
       "      <td>v114809192</td>\n",
       "      <td>1.1.4</td>\n",
       "      <td>8.7</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2014</td>\n",
       "      <td>2014-01-01</td>\n",
       "      <td>2014-12-31</td>\n",
       "      <td>Newfoundland and Labrador</td>\n",
       "      <td>2016A000210</td>\n",
       "      <td>Nursing and residential care facilities [623]</td>\n",
       "      <td>Operating revenue</td>\n",
       "      <td>Dollars</td>\n",
       "      <td>81</td>\n",
       "      <td>millions</td>\n",
       "      <td>6</td>\n",
       "      <td>v114809193</td>\n",
       "      <td>2.1.1</td>\n",
       "      <td>97.9</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  REF_DATE REF_START_DATE REF_END_DATE                        GEO  \\\n",
       "0     2014     2014-01-01   2014-12-31                     Canada   \n",
       "1     2014     2014-01-01   2014-12-31                     Canada   \n",
       "2     2014     2014-01-01   2014-12-31                     Canada   \n",
       "3     2014     2014-01-01   2014-12-31                     Canada   \n",
       "4     2014     2014-01-01   2014-12-31  Newfoundland and Labrador   \n",
       "\n",
       "            DGUID North American Industry Classification System (NAICS)  \\\n",
       "0  2016A000011124      Nursing and residential care facilities [623]      \n",
       "1  2016A000011124      Nursing and residential care facilities [623]      \n",
       "2  2016A000011124      Nursing and residential care facilities [623]      \n",
       "3  2016A000011124      Nursing and residential care facilities [623]      \n",
       "4     2016A000210      Nursing and residential care facilities [623]      \n",
       "\n",
       "                          Summary statistics      UOM  UOM_ID SCALAR_FACTOR  \\\n",
       "0                          Operating revenue  Dollars      81      millions   \n",
       "1                         Operating expenses  Dollars      81      millions   \n",
       "2  Salaries, wages, commissions and benefits  Dollars      81      millions   \n",
       "3                    Operating profit margin  Percent     239         units   \n",
       "4                          Operating revenue  Dollars      81      millions   \n",
       "\n",
       "   SCALAR_ID      VECTOR COORDINATE   VALUE STATUS SYMBOL TERMINATED  DECIMALS  \n",
       "0          6  v114809189      1.1.1  9310.7   <NA>   <NA>       <NA>         1  \n",
       "1          6  v114809190      1.1.2  8499.5   <NA>   <NA>       <NA>         1  \n",
       "2          6  v114809191      1.1.3  4630.3   <NA>   <NA>       <NA>         1  \n",
       "3          0  v114809192      1.1.4     8.7   <NA>   <NA>       <NA>         1  \n",
       "4          6  v114809193      2.1.1    97.9   <NA>   <NA>       <NA>         1  "
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9",
   "metadata": {
    "editable": true,
@@ -447,14 +691,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Exporting dataframe as parquet to /data/tables/output/en/43100011.parquet\n"
+      "Exporting dataframe as parquet to /data/tables/output/en/13100102.parquet\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "IOStream.flush timed out\n"
     ]
    }
   ],
@@ -515,12 +752,35 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 40,
   "id": "06fb89ad-77ba-46db-bb88-15f5636d707d",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'process_cube' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mNameError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[40]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocess_cube\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33m43100011\u001b[39m\u001b[33m\"\u001b[39m)\n",
      "\u001b[31mNameError\u001b[39m: name 'process_cube' is not defined"
     ]
    }
   ],
   "source": [
    "process_cube(\"37100216\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cc04f15-006f-4a3a-9610-65736820ba84",
   "metadata": {},
   "outputs": [],
   "source": [
-    "#process_cube(\"43100011\")"
+    "# This one has multiple DECIMAL precision values\n",
    "process_cube(\"43100011)"
   ]
  }
 ],