diff --git a/experiments/statcan_products/process_files_multiprocessing.py b/experiments/statcan_products/process_files_multiprocessing.py
index 73c7cad..1581125 100644
--- a/experiments/statcan_products/process_files_multiprocessing.py
+++ b/experiments/statcan_products/process_files_multiprocessing.py
@@ -178,7 +178,7 @@ def convert_to_lowest_type(df):
for row in dtypes.itertuples():
column = row[0]
the_type = str(row[1])
- if the_type == 'int64':
+ if the_type == 'Int64':
df[column] = pd.to_numeric(df[column], downcast='integer')
return df
@@ -229,6 +229,8 @@ def process_cube(product_id, language="en"):
- productId 43100011 has all with DECIMAL = 1 (float64)
- productId 17100009 has DECIMAL = 0 (int64)
- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)
+ - productId 10100164 has two columns whose names differ only by case: "Value" and "VALUE". It is processed fine by read_csv, and when it is exported as parquet.
+ DuckDB has an issue with these column names, but Pandas and Polars are able to handle both "Value" and "VALUE".
"""
cur.execute("SELECT product_id FROM downloaded WHERE product_id = ?", (product_id,))
result = cur.fetchone()
@@ -268,6 +270,7 @@ def process_cube(product_id, language="en"):
if column in columns:
parameters["dtype"][column] = 'int16'
+ # The remaining columns should be string, with the exception of VALUE
for column in columns:
if column not in columns_always_int_8 and column not in columns_always_int_16 and column != "VALUE":
parameters["dtype"][column] = 'string'
diff --git a/experiments/statcan_products/process_product_test.ipynb b/experiments/statcan_products/process_product_test.ipynb
index ea26878..886bab3 100644
--- a/experiments/statcan_products/process_product_test.ipynb
+++ b/experiments/statcan_products/process_product_test.ipynb
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 1,
"id": "98859cd6-6fa4-4aef-a113-455699524fae",
"metadata": {
"editable": true,
@@ -77,7 +77,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 2,
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
"metadata": {},
"outputs": [],
@@ -99,7 +99,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 3,
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
"metadata": {},
"outputs": [],
@@ -109,7 +109,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 4,
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
"metadata": {
"editable": true,
@@ -150,7 +150,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 5,
"id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
"metadata": {},
"outputs": [],
@@ -163,7 +163,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 6,
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
"metadata": {},
"outputs": [],
@@ -212,7 +212,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 7,
"id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
"metadata": {},
"outputs": [],
@@ -222,7 +222,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 8,
"id": "eddf6501-8428-44cc-8d2d-e245803a3943",
"metadata": {
"editable": true,
@@ -296,7 +296,7 @@
" for row in dtypes.itertuples():\n",
" column = row[0]\n",
" the_type = str(row[1])\n",
- " if the_type == 'int64':\n",
+ " if the_type == 'Int64':\n",
" df[column] = pd.to_numeric(df[column], downcast='integer')\n",
"\n",
" return df"
@@ -304,7 +304,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 10,
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
"metadata": {},
"outputs": [],
@@ -352,7 +352,7 @@
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 40,
"id": "858e405e-7c02-4193-8abe-f23951761b09",
"metadata": {
"editable": true,
@@ -366,15 +366,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Reading /data/tables/scratch/13100102.csv\n",
- "Index(['REF_DATE', 'GEO', 'DGUID',\n",
- " 'North American Industry Classification System (NAICS)',\n",
- " 'Summary statistics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',\n",
- " 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED',\n",
- " 'DECIMALS'],\n",
- " dtype='object')\n",
- "{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'North American Industry Classification System (NAICS)': 'string', 'Summary statistics': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n",
- "Reading /data/tables/scratch/13100102.csv as a Pandas dataframe\n"
+ "Already processed 10100164\n",
+ "Extracting /data/tables/input/en/10100164.zip to /data/tables/scratch\n",
+ "Reading /data/tables/scratch/10100164.csv\n",
+ "Index(['REF_DATE', 'GEO', 'DGUID', 'Value', 'Type of cannabis', 'UOM',\n",
+ " 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE',\n",
+ " 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'],\n",
+ " dtype='object')\n"
]
}
],
@@ -384,9 +382,11 @@
"- productId 43100011 has all with DECIMAL = 1 (float64)\n",
"- productId 17100009 has DECIMAL = 0 (int64)\n",
"- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)\n",
+ "- productId 10100164 has two columns whose names differ only by case: \"Value\" and \"VALUE\". It is processed fine by read_csv, and when it is exported as parquet.\n",
+ "DuckDB has an issue with these column names, but Pandas and Polars are able to handle both \"Value\" and \"VALUE\".\n",
"\"\"\"\n",
"\n",
- "product_id = \"13100102\"\n",
+ "product_id = \"10100164\"\n",
"#def process_cube(product_id, language=\"en\"):\n",
"language = \"en\"\n",
"cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
@@ -394,7 +394,7 @@
"if result:\n",
" print(f\"Already processed {product_id}\")\n",
" #return\n",
- "#extract_zipfile(product_id, language)\n",
+ "extract_zipfile(product_id, language)\n",
"\"\"\"\n",
"The pandas column reader is better than the Polars one\n",
"Here is an example where polars was not reading it right:\n",
@@ -417,8 +417,25 @@
"}\n",
"\n",
"columns = pd.read_csv(product_csv, nrows=0).columns\n",
- "print(columns)\n",
- "\n",
+ "print(columns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "c1f89175-78e5-4e95-8a3e-3b65f0cb4b2d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'Value': 'string', 'Type of cannabis': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n",
+ "Reading /data/tables/scratch/10100164.csv as a Pandas dataframe\n"
+ ]
+ }
+ ],
+ "source": [
"columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n",
"for column in columns_always_int_8:\n",
" if column in columns:\n",
@@ -429,6 +446,7 @@
" if column in columns:\n",
" parameters[\"dtype\"][column] = 'int16'\n",
"\n",
+ "# The remaining columns should be string, with the exception of VALUE\n",
"for column in columns:\n",
" if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n",
" parameters[\"dtype\"][column] = 'string'\n",
@@ -443,7 +461,221 @@
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 31,
+ "id": "87ff5f69-ca1f-40e0-ac73-c73dd8c1bd4d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " REF_DATE | \n",
+ " GEO | \n",
+ " DGUID | \n",
+ " Value | \n",
+ " Type of cannabis | \n",
+ " UOM | \n",
+ " UOM_ID | \n",
+ " SCALAR_FACTOR | \n",
+ " SCALAR_ID | \n",
+ " VECTOR | \n",
+ " COORDINATE | \n",
+ " VALUE | \n",
+ " STATUS | \n",
+ " SYMBOL | \n",
+ " TERMINATED | \n",
+ " DECIMALS | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2021/2022 | \n",
+ " Canada | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Total cannabis products | \n",
+ " Dollars | \n",
+ " 81 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436660 | \n",
+ " 1.1.1 | \n",
+ " 4027928.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2021/2022 | \n",
+ " Canada | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Dried cannabis | \n",
+ " Dollars | \n",
+ " 81 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436647 | \n",
+ " 1.1.2 | \n",
+ " 2861838.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2021/2022 | \n",
+ " Canada | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Inhaled cannabis extracts | \n",
+ " Dollars | \n",
+ " 81 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436648 | \n",
+ " 1.1.3 | \n",
+ " 729178.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2021/2022 | \n",
+ " Canada | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Ingested cannabis extracts | \n",
+ " Dollars | \n",
+ " 81 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436649 | \n",
+ " 1.1.4 | \n",
+ " 158283.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2021/2022 | \n",
+ " Canada | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Solid cannabis edibles | \n",
+ " Dollars | \n",
+ " 81 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436650 | \n",
+ " 1.1.5 | \n",
+ " 166336.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " REF_DATE GEO DGUID Value \\\n",
+ "0 2021/2022 Canada 2021A000011124 Value of sales \n",
+ "1 2021/2022 Canada 2021A000011124 Value of sales \n",
+ "2 2021/2022 Canada 2021A000011124 Value of sales \n",
+ "3 2021/2022 Canada 2021A000011124 Value of sales \n",
+ "4 2021/2022 Canada 2021A000011124 Value of sales \n",
+ "\n",
+ " Type of cannabis UOM UOM_ID SCALAR_FACTOR SCALAR_ID \\\n",
+ "0 Total cannabis products Dollars 81 thousands 3 \n",
+ "1 Dried cannabis Dollars 81 thousands 3 \n",
+ "2 Inhaled cannabis extracts Dollars 81 thousands 3 \n",
+ "3 Ingested cannabis extracts Dollars 81 thousands 3 \n",
+ "4 Solid cannabis edibles Dollars 81 thousands 3 \n",
+ "\n",
+ " VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS \n",
+ "0 v1490436660 1.1.1 4027928.0 0 \n",
+ "1 v1490436647 1.1.2 2861838.0 0 \n",
+ "2 v1490436648 1.1.3 729178.0 0 \n",
+ "3 v1490436649 1.1.4 158283.0 0 \n",
+ "4 v1490436650 1.1.5 166336.0 0 "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "fde0e149-d146-4516-8966-0989e4ccf290",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "REF_DATE string[python]\n",
+ "GEO string[python]\n",
+ "DGUID string[python]\n",
+ "Value string[python]\n",
+ "Type of cannabis string[python]\n",
+ "UOM string[python]\n",
+ "UOM_ID int16\n",
+ "SCALAR_FACTOR string[python]\n",
+ "SCALAR_ID int8\n",
+ "VECTOR string[python]\n",
+ "COORDINATE string[python]\n",
+ "VALUE float64\n",
+ "STATUS string[python]\n",
+ "SYMBOL string[python]\n",
+ "TERMINATED string[python]\n",
+ "DECIMALS int8\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
"id": "7579a135-1dfe-4fc0-991b-4b261d6577e0",
"metadata": {},
"outputs": [
@@ -451,8 +683,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "[1]\n",
- "{'VALUE': 'float64'}\n"
+ "[0]\n",
+ "{'VALUE': 'Int64'}\n"
]
}
],
@@ -469,19 +701,36 @@
" print(convert_dict)\n",
" df = df.astype(convert_dict)\n",
"elif 0 in (unique_decimal_values):\n",
- " if df[\"VALUE\"].dtype != \"int64\":\n",
+ " if df[\"VALUE\"].dtype != \"Int64\":\n",
" # If DECIMALS = [0]\n",
- " convert_dict = {\"VALUE\": \"int64\"}\n",
+ " convert_dict = {\"VALUE\": \"Int64\"}\n",
" print(convert_dict)\n",
- " df = df.astype(convert_dict)\n",
- "\n",
- "df = convert_to_lowest_type(df)\n",
+ " df = df.astype(convert_dict)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "fe87b3b0-4e04-41a8-a704-75cb9829b0a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = convert_to_lowest_type(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "74507546-4080-4962-88fc-58c1e3943d17",
+ "metadata": {},
+ "outputs": [],
+ "source": [
"df = compute_ref_date_bounds(df)"
]
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 39,
"id": "6c2781b3-8eea-4317-a8c0-083d97ee04fc",
"metadata": {},
"outputs": [
@@ -511,8 +760,8 @@
" REF_END_DATE | \n",
" GEO | \n",
" DGUID | \n",
- " North American Industry Classification System (NAICS) | \n",
- " Summary statistics | \n",
+ " Value | \n",
+ " Type of cannabis | \n",
" UOM | \n",
" UOM_ID | \n",
" SCALAR_FACTOR | \n",
@@ -529,144 +778,144 @@
" \n",
" \n",
" | 0 | \n",
- " 2014 | \n",
- " 2014-01-01 | \n",
- " 2014-12-31 | \n",
+ " 2021/2022 | \n",
+ " NaT | \n",
+ " NaT | \n",
" Canada | \n",
- " 2016A000011124 | \n",
- " Nursing and residential care facilities [623] | \n",
- " Operating revenue | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Total cannabis products | \n",
" Dollars | \n",
" 81 | \n",
- " millions | \n",
- " 6 | \n",
- " v114809189 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436660 | \n",
" 1.1.1 | \n",
- " 9310.7 | \n",
+ " 4027928 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
- " 1 | \n",
+ " 0 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 2014 | \n",
- " 2014-01-01 | \n",
- " 2014-12-31 | \n",
+ " 2021/2022 | \n",
+ " NaT | \n",
+ " NaT | \n",
" Canada | \n",
- " 2016A000011124 | \n",
- " Nursing and residential care facilities [623] | \n",
- " Operating expenses | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Dried cannabis | \n",
" Dollars | \n",
" 81 | \n",
- " millions | \n",
- " 6 | \n",
- " v114809190 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436647 | \n",
" 1.1.2 | \n",
- " 8499.5 | \n",
+ " 2861838 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
- " 1 | \n",
+ " 0 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 2014 | \n",
- " 2014-01-01 | \n",
- " 2014-12-31 | \n",
+ " 2021/2022 | \n",
+ " NaT | \n",
+ " NaT | \n",
" Canada | \n",
- " 2016A000011124 | \n",
- " Nursing and residential care facilities [623] | \n",
- " Salaries, wages, commissions and benefits | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Inhaled cannabis extracts | \n",
" Dollars | \n",
" 81 | \n",
- " millions | \n",
- " 6 | \n",
- " v114809191 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436648 | \n",
" 1.1.3 | \n",
- " 4630.3 | \n",
+ " 729178 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
- " 1 | \n",
+ " 0 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 2014 | \n",
- " 2014-01-01 | \n",
- " 2014-12-31 | \n",
+ " 2021/2022 | \n",
+ " NaT | \n",
+ " NaT | \n",
" Canada | \n",
- " 2016A000011124 | \n",
- " Nursing and residential care facilities [623] | \n",
- " Operating profit margin | \n",
- " Percent | \n",
- " 239 | \n",
- " units | \n",
- " 0 | \n",
- " v114809192 | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Ingested cannabis extracts | \n",
+ " Dollars | \n",
+ " 81 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436649 | \n",
" 1.1.4 | \n",
- " 8.7 | \n",
+ " 158283 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
- " 1 | \n",
+ " 0 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 2014 | \n",
- " 2014-01-01 | \n",
- " 2014-12-31 | \n",
- " Newfoundland and Labrador | \n",
- " 2016A000210 | \n",
- " Nursing and residential care facilities [623] | \n",
- " Operating revenue | \n",
+ " 2021/2022 | \n",
+ " NaT | \n",
+ " NaT | \n",
+ " Canada | \n",
+ " 2021A000011124 | \n",
+ " Value of sales | \n",
+ " Solid cannabis edibles | \n",
" Dollars | \n",
" 81 | \n",
- " millions | \n",
- " 6 | \n",
- " v114809193 | \n",
- " 2.1.1 | \n",
- " 97.9 | \n",
+ " thousands | \n",
+ " 3 | \n",
+ " v1490436650 | \n",
+ " 1.1.5 | \n",
+ " 166336 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
- " 1 | \n",
+ " 0 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " REF_DATE REF_START_DATE REF_END_DATE GEO \\\n",
- "0 2014 2014-01-01 2014-12-31 Canada \n",
- "1 2014 2014-01-01 2014-12-31 Canada \n",
- "2 2014 2014-01-01 2014-12-31 Canada \n",
- "3 2014 2014-01-01 2014-12-31 Canada \n",
- "4 2014 2014-01-01 2014-12-31 Newfoundland and Labrador \n",
+ " REF_DATE REF_START_DATE REF_END_DATE GEO DGUID \\\n",
+ "0 2021/2022 NaT NaT Canada 2021A000011124 \n",
+ "1 2021/2022 NaT NaT Canada 2021A000011124 \n",
+ "2 2021/2022 NaT NaT Canada 2021A000011124 \n",
+ "3 2021/2022 NaT NaT Canada 2021A000011124 \n",
+ "4 2021/2022 NaT NaT Canada 2021A000011124 \n",
"\n",
- " DGUID North American Industry Classification System (NAICS) \\\n",
- "0 2016A000011124 Nursing and residential care facilities [623] \n",
- "1 2016A000011124 Nursing and residential care facilities [623] \n",
- "2 2016A000011124 Nursing and residential care facilities [623] \n",
- "3 2016A000011124 Nursing and residential care facilities [623] \n",
- "4 2016A000210 Nursing and residential care facilities [623] \n",
+ " Value Type of cannabis UOM UOM_ID SCALAR_FACTOR \\\n",
+ "0 Value of sales Total cannabis products Dollars 81 thousands \n",
+ "1 Value of sales Dried cannabis Dollars 81 thousands \n",
+ "2 Value of sales Inhaled cannabis extracts Dollars 81 thousands \n",
+ "3 Value of sales Ingested cannabis extracts Dollars 81 thousands \n",
+ "4 Value of sales Solid cannabis edibles Dollars 81 thousands \n",
"\n",
- " Summary statistics UOM UOM_ID SCALAR_FACTOR \\\n",
- "0 Operating revenue Dollars 81 millions \n",
- "1 Operating expenses Dollars 81 millions \n",
- "2 Salaries, wages, commissions and benefits Dollars 81 millions \n",
- "3 Operating profit margin Percent 239 units \n",
- "4 Operating revenue Dollars 81 millions \n",
+ " SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED \\\n",
+ "0 3 v1490436660 1.1.1 4027928 \n",
+ "1 3 v1490436647 1.1.2 2861838 \n",
+ "2 3 v1490436648 1.1.3 729178 \n",
+ "3 3 v1490436649 1.1.4 158283 \n",
+ "4 3 v1490436650 1.1.5 166336 \n",
"\n",
- " SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS \n",
- "0 6 v114809189 1.1.1 9310.7 1 \n",
- "1 6 v114809190 1.1.2 8499.5 1 \n",
- "2 6 v114809191 1.1.3 4630.3 1 \n",
- "3 0 v114809192 1.1.4 8.7 1 \n",
- "4 6 v114809193 2.1.1 97.9 1 "
+ " DECIMALS \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 "
]
},
- "execution_count": 80,
+ "execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
@@ -677,7 +926,7 @@
},
{
"cell_type": "code",
- "execution_count": 81,
+ "execution_count": 17,
"id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9",
"metadata": {
"editable": true,
@@ -691,7 +940,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Exporting dataframe as parquet to /data/tables/output/en/13100102.parquet\n"
+ "Exporting dataframe as parquet to /data/tables/output/en/10100164_test.parquet\n"
]
}
],
@@ -710,7 +959,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": null,
"id": "788bc668-8057-4e06-91a3-b99991e0a410",
"metadata": {
"editable": true,
@@ -719,28 +968,7 @@
},
"tags": []
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Removing scratch files\n",
- "Reading metadata /data/tables/input/metadata/43100011.json\n"
- ]
- },
- {
- "ename": "OperationalError",
- "evalue": "no such column: last_processed",
- "output_type": "error",
- "traceback": [
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
- "\u001b[31mOperationalError\u001b[39m Traceback (most recent call last)",
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[26]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 4\u001b[39m os.remove(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mscratch_folder\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mproduct_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m_MetaData.csv\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 5\u001b[39m update_last_downloaded(product_id)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43mupdate_last_processed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\n",
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 3\u001b[39m, in \u001b[36mupdate_last_processed\u001b[39m\u001b[34m(product_id)\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mupdate_last_processed\u001b[39m(product_id):\n\u001b[32m 2\u001b[39m time_finished_processing = datetime.now().isoformat()\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43mcur\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mUPDATE downloaded SET last_processed = ? WHERE product_id = ?\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtime_finished_processing\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m con.commit()\n",
- "\u001b[31mOperationalError\u001b[39m: no such column: last_processed"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Remove the scratch files\n",
"print(\"Removing scratch files\")\n",
@@ -749,39 +977,6 @@
"update_last_downloaded(product_id)\n",
"update_last_processed(product_id)"
]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "06fb89ad-77ba-46db-bb88-15f5636d707d",
- "metadata": {},
- "outputs": [
- {
- "ename": "NameError",
- "evalue": "name 'process_cube' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
- "\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[40]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocess_cube\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33m43100011\u001b[39m\u001b[33m\"\u001b[39m)\n",
- "\u001b[31mNameError\u001b[39m: name 'process_cube' is not defined"
- ]
- }
- ],
- "source": [
- "process_cube(\"37100216\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9cc04f15-006f-4a3a-9610-65736820ba84",
- "metadata": {},
- "outputs": [],
- "source": [
- "# This one has multiple DECIMAL precision values\n",
- "process_cube(\"43100011)"
- ]
}
],
"metadata": {