Made changes to processing of data tables

This commit is contained in:
Diego Ripley
2025-06-21 18:01:16 +00:00
parent 7c8211cb5f
commit 8875722d10
2 changed files with 376 additions and 178 deletions
@@ -178,7 +178,7 @@ def convert_to_lowest_type(df):
for row in dtypes.itertuples(): for row in dtypes.itertuples():
column = row[0] column = row[0]
the_type = str(row[1]) the_type = str(row[1])
if the_type == 'int64': if the_type == 'Int64':
df[column] = pd.to_numeric(df[column], downcast='integer') df[column] = pd.to_numeric(df[column], downcast='integer')
return df return df
@@ -229,6 +229,8 @@ def process_cube(product_id, language="en"):
- productId 43100011 has all with DECIMAL = 1 (float64) - productId 43100011 has all with DECIMAL = 1 (float64)
- productId 17100009 has DECIMAL = 0 (int64) - productId 17100009 has DECIMAL = 0 (int64)
- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64) - productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)
- productId 10100164 has two columns whose names differ only by case: "Value" and "VALUE". It is processed fine by read_csv, and when it is exported as parquet.
DuckDB has an issue with it, but Pandas and Polars are able to handle both "Value" and "VALUE".
""" """
cur.execute("SELECT product_id FROM downloaded WHERE product_id = ?", (product_id,)) cur.execute("SELECT product_id FROM downloaded WHERE product_id = ?", (product_id,))
result = cur.fetchone() result = cur.fetchone()
@@ -268,6 +270,7 @@ def process_cube(product_id, language="en"):
if column in columns: if column in columns:
parameters["dtype"][column] = 'int16' parameters["dtype"][column] = 'int16'
# The remaining columns should be string, with the exception of VALUE
for column in columns: for column in columns:
if column not in columns_always_int_8 and column not in columns_always_int_16 and column != "VALUE": if column not in columns_always_int_8 and column not in columns_always_int_16 and column != "VALUE":
parameters["dtype"][column] = 'string' parameters["dtype"][column] = 'string'
@@ -25,7 +25,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 1,
"id": "98859cd6-6fa4-4aef-a113-455699524fae", "id": "98859cd6-6fa4-4aef-a113-455699524fae",
"metadata": { "metadata": {
"editable": true, "editable": true,
@@ -77,7 +77,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 2,
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591", "id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -99,7 +99,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 3,
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f", "id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -109,7 +109,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 4,
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a", "id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
"metadata": { "metadata": {
"editable": true, "editable": true,
@@ -150,7 +150,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 5,
"id": "4b7996d2-75ab-4173-a17a-64fb7ab63740", "id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -163,7 +163,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 6,
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b", "id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -212,7 +212,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 7,
"id": "dc5573ef-734b-44d8-a4c4-0df19d655975", "id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -222,7 +222,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 8,
"id": "eddf6501-8428-44cc-8d2d-e245803a3943", "id": "eddf6501-8428-44cc-8d2d-e245803a3943",
"metadata": { "metadata": {
"editable": true, "editable": true,
@@ -296,7 +296,7 @@
" for row in dtypes.itertuples():\n", " for row in dtypes.itertuples():\n",
" column = row[0]\n", " column = row[0]\n",
" the_type = str(row[1])\n", " the_type = str(row[1])\n",
" if the_type == 'int64':\n", " if the_type == 'Int64':\n",
" df[column] = pd.to_numeric(df[column], downcast='integer')\n", " df[column] = pd.to_numeric(df[column], downcast='integer')\n",
"\n", "\n",
" return df" " return df"
@@ -304,7 +304,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 29, "execution_count": 10,
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f", "id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -352,7 +352,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 78, "execution_count": 40,
"id": "858e405e-7c02-4193-8abe-f23951761b09", "id": "858e405e-7c02-4193-8abe-f23951761b09",
"metadata": { "metadata": {
"editable": true, "editable": true,
@@ -366,15 +366,13 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Reading /data/tables/scratch/13100102.csv\n", "Already processed 10100164\n",
"Index(['REF_DATE', 'GEO', 'DGUID',\n", "Extracting /data/tables/input/en/10100164.zip to /data/tables/scratch\n",
" 'North American Industry Classification System (NAICS)',\n", "Reading /data/tables/scratch/10100164.csv\n",
" 'Summary statistics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',\n", "Index(['REF_DATE', 'GEO', 'DGUID', 'Value', 'Type of cannabis', 'UOM',\n",
" 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED',\n", " 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE',\n",
" 'DECIMALS'],\n", " 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'],\n",
" dtype='object')\n", " dtype='object')\n"
"{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'North American Industry Classification System (NAICS)': 'string', 'Summary statistics': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n",
"Reading /data/tables/scratch/13100102.csv as a Pandas dataframe\n"
] ]
} }
], ],
@@ -384,9 +382,11 @@
"- productId 43100011 has all with DECIMAL = 1 (float64)\n", "- productId 43100011 has all with DECIMAL = 1 (float64)\n",
"- productId 17100009 has DECIMAL = 0 (int64)\n", "- productId 17100009 has DECIMAL = 0 (int64)\n",
"- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)\n", "- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)\n",
"- productId 10100164 has two columns named the same \"Value\" and \"VALUE\". It is processed fine with the read_csv, and when it is exported as parquet.\n",
"DuckDB has an issue with it, but Pandas and Polars are able to handle \"Value\" and \"VALUE\"\n",
"\"\"\"\n", "\"\"\"\n",
"\n", "\n",
"product_id = \"13100102\"\n", "product_id = \"10100164\"\n",
"#def process_cube(product_id, language=\"en\"):\n", "#def process_cube(product_id, language=\"en\"):\n",
"language = \"en\"\n", "language = \"en\"\n",
"cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n", "cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
@@ -394,7 +394,7 @@
"if result:\n", "if result:\n",
" print(f\"Already processed {product_id}\")\n", " print(f\"Already processed {product_id}\")\n",
" #return\n", " #return\n",
"#extract_zipfile(product_id, language)\n", "extract_zipfile(product_id, language)\n",
"\"\"\"\n", "\"\"\"\n",
"The pandas column reader is better than the Polars one\n", "The pandas column reader is better than the Polars one\n",
"Here is an example where polars was not reading it right:\n", "Here is an example where polars was not reading it right:\n",
@@ -417,8 +417,25 @@
"}\n", "}\n",
"\n", "\n",
"columns = pd.read_csv(product_csv, nrows=0).columns\n", "columns = pd.read_csv(product_csv, nrows=0).columns\n",
"print(columns)\n", "print(columns)"
"\n", ]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "c1f89175-78e5-4e95-8a3e-3b65f0cb4b2d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'Value': 'string', 'Type of cannabis': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n",
"Reading /data/tables/scratch/10100164.csv as a Pandas dataframe\n"
]
}
],
"source": [
"columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n", "columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n",
"for column in columns_always_int_8:\n", "for column in columns_always_int_8:\n",
" if column in columns:\n", " if column in columns:\n",
@@ -429,6 +446,7 @@
" if column in columns:\n", " if column in columns:\n",
" parameters[\"dtype\"][column] = 'int16'\n", " parameters[\"dtype\"][column] = 'int16'\n",
"\n", "\n",
"# The remaining columns should be string, with the exception of VALUE\n",
"for column in columns:\n", "for column in columns:\n",
" if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n", " if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n",
" parameters[\"dtype\"][column] = 'string'\n", " parameters[\"dtype\"][column] = 'string'\n",
@@ -443,7 +461,221 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 79, "execution_count": 31,
"id": "87ff5f69-ca1f-40e0-ac73-c73dd8c1bd4d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>REF_DATE</th>\n",
" <th>GEO</th>\n",
" <th>DGUID</th>\n",
" <th>Value</th>\n",
" <th>Type of cannabis</th>\n",
" <th>UOM</th>\n",
" <th>UOM_ID</th>\n",
" <th>SCALAR_FACTOR</th>\n",
" <th>SCALAR_ID</th>\n",
" <th>VECTOR</th>\n",
" <th>COORDINATE</th>\n",
" <th>VALUE</th>\n",
" <th>STATUS</th>\n",
" <th>SYMBOL</th>\n",
" <th>TERMINATED</th>\n",
" <th>DECIMALS</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2021/2022</td>\n",
" <td>Canada</td>\n",
" <td>2021A000011124</td>\n",
" <td>Value of sales</td>\n",
" <td>Total cannabis products</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>thousands</td>\n",
" <td>3</td>\n",
" <td>v1490436660</td>\n",
" <td>1.1.1</td>\n",
" <td>4027928.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2021/2022</td>\n",
" <td>Canada</td>\n",
" <td>2021A000011124</td>\n",
" <td>Value of sales</td>\n",
" <td>Dried cannabis</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>thousands</td>\n",
" <td>3</td>\n",
" <td>v1490436647</td>\n",
" <td>1.1.2</td>\n",
" <td>2861838.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2021/2022</td>\n",
" <td>Canada</td>\n",
" <td>2021A000011124</td>\n",
" <td>Value of sales</td>\n",
" <td>Inhaled cannabis extracts</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>thousands</td>\n",
" <td>3</td>\n",
" <td>v1490436648</td>\n",
" <td>1.1.3</td>\n",
" <td>729178.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2021/2022</td>\n",
" <td>Canada</td>\n",
" <td>2021A000011124</td>\n",
" <td>Value of sales</td>\n",
" <td>Ingested cannabis extracts</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>thousands</td>\n",
" <td>3</td>\n",
" <td>v1490436649</td>\n",
" <td>1.1.4</td>\n",
" <td>158283.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2021/2022</td>\n",
" <td>Canada</td>\n",
" <td>2021A000011124</td>\n",
" <td>Value of sales</td>\n",
" <td>Solid cannabis edibles</td>\n",
" <td>Dollars</td>\n",
" <td>81</td>\n",
" <td>thousands</td>\n",
" <td>3</td>\n",
" <td>v1490436650</td>\n",
" <td>1.1.5</td>\n",
" <td>166336.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" REF_DATE GEO DGUID Value \\\n",
"0 2021/2022 Canada 2021A000011124 Value of sales \n",
"1 2021/2022 Canada 2021A000011124 Value of sales \n",
"2 2021/2022 Canada 2021A000011124 Value of sales \n",
"3 2021/2022 Canada 2021A000011124 Value of sales \n",
"4 2021/2022 Canada 2021A000011124 Value of sales \n",
"\n",
" Type of cannabis UOM UOM_ID SCALAR_FACTOR SCALAR_ID \\\n",
"0 Total cannabis products Dollars 81 thousands 3 \n",
"1 Dried cannabis Dollars 81 thousands 3 \n",
"2 Inhaled cannabis extracts Dollars 81 thousands 3 \n",
"3 Ingested cannabis extracts Dollars 81 thousands 3 \n",
"4 Solid cannabis edibles Dollars 81 thousands 3 \n",
"\n",
" VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS \n",
"0 v1490436660 1.1.1 4027928.0 <NA> <NA> <NA> 0 \n",
"1 v1490436647 1.1.2 2861838.0 <NA> <NA> <NA> 0 \n",
"2 v1490436648 1.1.3 729178.0 <NA> <NA> <NA> 0 \n",
"3 v1490436649 1.1.4 158283.0 <NA> <NA> <NA> 0 \n",
"4 v1490436650 1.1.5 166336.0 <NA> <NA> <NA> 0 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "fde0e149-d146-4516-8966-0989e4ccf290",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"REF_DATE string[python]\n",
"GEO string[python]\n",
"DGUID string[python]\n",
"Value string[python]\n",
"Type of cannabis string[python]\n",
"UOM string[python]\n",
"UOM_ID int16\n",
"SCALAR_FACTOR string[python]\n",
"SCALAR_ID int8\n",
"VECTOR string[python]\n",
"COORDINATE string[python]\n",
"VALUE float64\n",
"STATUS string[python]\n",
"SYMBOL string[python]\n",
"TERMINATED string[python]\n",
"DECIMALS int8\n",
"dtype: object"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "7579a135-1dfe-4fc0-991b-4b261d6577e0", "id": "7579a135-1dfe-4fc0-991b-4b261d6577e0",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -451,8 +683,8 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[1]\n", "[0]\n",
"{'VALUE': 'float64'}\n" "{'VALUE': 'Int64'}\n"
] ]
} }
], ],
@@ -469,19 +701,36 @@
" print(convert_dict)\n", " print(convert_dict)\n",
" df = df.astype(convert_dict)\n", " df = df.astype(convert_dict)\n",
"elif 0 in (unique_decimal_values):\n", "elif 0 in (unique_decimal_values):\n",
" if df[\"VALUE\"].dtype != \"int64\":\n", " if df[\"VALUE\"].dtype != \"Int64\":\n",
" # If DECIMALS = [0]\n", " # If DECIMALS = [0]\n",
" convert_dict = {\"VALUE\": \"int64\"}\n", " convert_dict = {\"VALUE\": \"Int64\"}\n",
" print(convert_dict)\n", " print(convert_dict)\n",
" df = df.astype(convert_dict)\n", " df = df.astype(convert_dict)"
"\n", ]
"df = convert_to_lowest_type(df)\n", },
{
"cell_type": "code",
"execution_count": 36,
"id": "fe87b3b0-4e04-41a8-a704-75cb9829b0a5",
"metadata": {},
"outputs": [],
"source": [
"df = convert_to_lowest_type(df)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "74507546-4080-4962-88fc-58c1e3943d17",
"metadata": {},
"outputs": [],
"source": [
"df = compute_ref_date_bounds(df)" "df = compute_ref_date_bounds(df)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 80, "execution_count": 39,
"id": "6c2781b3-8eea-4317-a8c0-083d97ee04fc", "id": "6c2781b3-8eea-4317-a8c0-083d97ee04fc",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -511,8 +760,8 @@
" <th>REF_END_DATE</th>\n", " <th>REF_END_DATE</th>\n",
" <th>GEO</th>\n", " <th>GEO</th>\n",
" <th>DGUID</th>\n", " <th>DGUID</th>\n",
" <th>North American Industry Classification System (NAICS)</th>\n", " <th>Value</th>\n",
" <th>Summary statistics</th>\n", " <th>Type of cannabis</th>\n",
" <th>UOM</th>\n", " <th>UOM</th>\n",
" <th>UOM_ID</th>\n", " <th>UOM_ID</th>\n",
" <th>SCALAR_FACTOR</th>\n", " <th>SCALAR_FACTOR</th>\n",
@@ -529,144 +778,144 @@
" <tbody>\n", " <tbody>\n",
" <tr>\n", " <tr>\n",
" <th>0</th>\n", " <th>0</th>\n",
" <td>2014</td>\n", " <td>2021/2022</td>\n",
" <td>2014-01-01</td>\n", " <td>NaT</td>\n",
" <td>2014-12-31</td>\n", " <td>NaT</td>\n",
" <td>Canada</td>\n", " <td>Canada</td>\n",
" <td>2016A000011124</td>\n", " <td>2021A000011124</td>\n",
" <td>Nursing and residential care facilities [623]</td>\n", " <td>Value of sales</td>\n",
" <td>Operating revenue</td>\n", " <td>Total cannabis products</td>\n",
" <td>Dollars</td>\n", " <td>Dollars</td>\n",
" <td>81</td>\n", " <td>81</td>\n",
" <td>millions</td>\n", " <td>thousands</td>\n",
" <td>6</td>\n", " <td>3</td>\n",
" <td>v114809189</td>\n", " <td>v1490436660</td>\n",
" <td>1.1.1</td>\n", " <td>1.1.1</td>\n",
" <td>9310.7</td>\n", " <td>4027928</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>1</td>\n", " <td>0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
" <td>2014</td>\n", " <td>2021/2022</td>\n",
" <td>2014-01-01</td>\n", " <td>NaT</td>\n",
" <td>2014-12-31</td>\n", " <td>NaT</td>\n",
" <td>Canada</td>\n", " <td>Canada</td>\n",
" <td>2016A000011124</td>\n", " <td>2021A000011124</td>\n",
" <td>Nursing and residential care facilities [623]</td>\n", " <td>Value of sales</td>\n",
" <td>Operating expenses</td>\n", " <td>Dried cannabis</td>\n",
" <td>Dollars</td>\n", " <td>Dollars</td>\n",
" <td>81</td>\n", " <td>81</td>\n",
" <td>millions</td>\n", " <td>thousands</td>\n",
" <td>6</td>\n", " <td>3</td>\n",
" <td>v114809190</td>\n", " <td>v1490436647</td>\n",
" <td>1.1.2</td>\n", " <td>1.1.2</td>\n",
" <td>8499.5</td>\n", " <td>2861838</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>1</td>\n", " <td>0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>2</th>\n", " <th>2</th>\n",
" <td>2014</td>\n", " <td>2021/2022</td>\n",
" <td>2014-01-01</td>\n", " <td>NaT</td>\n",
" <td>2014-12-31</td>\n", " <td>NaT</td>\n",
" <td>Canada</td>\n", " <td>Canada</td>\n",
" <td>2016A000011124</td>\n", " <td>2021A000011124</td>\n",
" <td>Nursing and residential care facilities [623]</td>\n", " <td>Value of sales</td>\n",
" <td>Salaries, wages, commissions and benefits</td>\n", " <td>Inhaled cannabis extracts</td>\n",
" <td>Dollars</td>\n", " <td>Dollars</td>\n",
" <td>81</td>\n", " <td>81</td>\n",
" <td>millions</td>\n", " <td>thousands</td>\n",
" <td>6</td>\n", " <td>3</td>\n",
" <td>v114809191</td>\n", " <td>v1490436648</td>\n",
" <td>1.1.3</td>\n", " <td>1.1.3</td>\n",
" <td>4630.3</td>\n", " <td>729178</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>1</td>\n", " <td>0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>3</th>\n", " <th>3</th>\n",
" <td>2014</td>\n", " <td>2021/2022</td>\n",
" <td>2014-01-01</td>\n", " <td>NaT</td>\n",
" <td>2014-12-31</td>\n", " <td>NaT</td>\n",
" <td>Canada</td>\n", " <td>Canada</td>\n",
" <td>2016A000011124</td>\n", " <td>2021A000011124</td>\n",
" <td>Nursing and residential care facilities [623]</td>\n", " <td>Value of sales</td>\n",
" <td>Operating profit margin</td>\n", " <td>Ingested cannabis extracts</td>\n",
" <td>Percent</td>\n", " <td>Dollars</td>\n",
" <td>239</td>\n", " <td>81</td>\n",
" <td>units</td>\n", " <td>thousands</td>\n",
" <td>0</td>\n", " <td>3</td>\n",
" <td>v114809192</td>\n", " <td>v1490436649</td>\n",
" <td>1.1.4</td>\n", " <td>1.1.4</td>\n",
" <td>8.7</td>\n", " <td>158283</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>1</td>\n", " <td>0</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>4</th>\n", " <th>4</th>\n",
" <td>2014</td>\n", " <td>2021/2022</td>\n",
" <td>2014-01-01</td>\n", " <td>NaT</td>\n",
" <td>2014-12-31</td>\n", " <td>NaT</td>\n",
" <td>Newfoundland and Labrador</td>\n", " <td>Canada</td>\n",
" <td>2016A000210</td>\n", " <td>2021A000011124</td>\n",
" <td>Nursing and residential care facilities [623]</td>\n", " <td>Value of sales</td>\n",
" <td>Operating revenue</td>\n", " <td>Solid cannabis edibles</td>\n",
" <td>Dollars</td>\n", " <td>Dollars</td>\n",
" <td>81</td>\n", " <td>81</td>\n",
" <td>millions</td>\n", " <td>thousands</td>\n",
" <td>6</td>\n", " <td>3</td>\n",
" <td>v114809193</td>\n", " <td>v1490436650</td>\n",
" <td>2.1.1</td>\n", " <td>1.1.5</td>\n",
" <td>97.9</td>\n", " <td>166336</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n", " <td>&lt;NA&gt;</td>\n",
" <td>1</td>\n", " <td>0</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
"</div>" "</div>"
], ],
"text/plain": [ "text/plain": [
" REF_DATE REF_START_DATE REF_END_DATE GEO \\\n", " REF_DATE REF_START_DATE REF_END_DATE GEO DGUID \\\n",
"0 2014 2014-01-01 2014-12-31 Canada \n", "0 2021/2022 NaT NaT Canada 2021A000011124 \n",
"1 2014 2014-01-01 2014-12-31 Canada \n", "1 2021/2022 NaT NaT Canada 2021A000011124 \n",
"2 2014 2014-01-01 2014-12-31 Canada \n", "2 2021/2022 NaT NaT Canada 2021A000011124 \n",
"3 2014 2014-01-01 2014-12-31 Canada \n", "3 2021/2022 NaT NaT Canada 2021A000011124 \n",
"4 2014 2014-01-01 2014-12-31 Newfoundland and Labrador \n", "4 2021/2022 NaT NaT Canada 2021A000011124 \n",
"\n", "\n",
" DGUID North American Industry Classification System (NAICS) \\\n", " Value Type of cannabis UOM UOM_ID SCALAR_FACTOR \\\n",
"0 2016A000011124 Nursing and residential care facilities [623] \n", "0 Value of sales Total cannabis products Dollars 81 thousands \n",
"1 2016A000011124 Nursing and residential care facilities [623] \n", "1 Value of sales Dried cannabis Dollars 81 thousands \n",
"2 2016A000011124 Nursing and residential care facilities [623] \n", "2 Value of sales Inhaled cannabis extracts Dollars 81 thousands \n",
"3 2016A000011124 Nursing and residential care facilities [623] \n", "3 Value of sales Ingested cannabis extracts Dollars 81 thousands \n",
"4 2016A000210 Nursing and residential care facilities [623] \n", "4 Value of sales Solid cannabis edibles Dollars 81 thousands \n",
"\n", "\n",
" Summary statistics UOM UOM_ID SCALAR_FACTOR \\\n", " SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED \\\n",
"0 Operating revenue Dollars 81 millions \n", "0 3 v1490436660 1.1.1 4027928 <NA> <NA> <NA> \n",
"1 Operating expenses Dollars 81 millions \n", "1 3 v1490436647 1.1.2 2861838 <NA> <NA> <NA> \n",
"2 Salaries, wages, commissions and benefits Dollars 81 millions \n", "2 3 v1490436648 1.1.3 729178 <NA> <NA> <NA> \n",
"3 Operating profit margin Percent 239 units \n", "3 3 v1490436649 1.1.4 158283 <NA> <NA> <NA> \n",
"4 Operating revenue Dollars 81 millions \n", "4 3 v1490436650 1.1.5 166336 <NA> <NA> <NA> \n",
"\n", "\n",
" SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS \n", " DECIMALS \n",
"0 6 v114809189 1.1.1 9310.7 <NA> <NA> <NA> 1 \n", "0 0 \n",
"1 6 v114809190 1.1.2 8499.5 <NA> <NA> <NA> 1 \n", "1 0 \n",
"2 6 v114809191 1.1.3 4630.3 <NA> <NA> <NA> 1 \n", "2 0 \n",
"3 0 v114809192 1.1.4 8.7 <NA> <NA> <NA> 1 \n", "3 0 \n",
"4 6 v114809193 2.1.1 97.9 <NA> <NA> <NA> 1 " "4 0 "
] ]
}, },
"execution_count": 80, "execution_count": 39,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@@ -677,7 +926,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 81, "execution_count": 17,
"id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9", "id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9",
"metadata": { "metadata": {
"editable": true, "editable": true,
@@ -691,7 +940,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Exporting dataframe as parquet to /data/tables/output/en/13100102.parquet\n" "Exporting dataframe as parquet to /data/tables/output/en/10100164_test.parquet\n"
] ]
} }
], ],
@@ -710,7 +959,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": null,
"id": "788bc668-8057-4e06-91a3-b99991e0a410", "id": "788bc668-8057-4e06-91a3-b99991e0a410",
"metadata": { "metadata": {
"editable": true, "editable": true,
@@ -719,28 +968,7 @@
}, },
"tags": [] "tags": []
}, },
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Removing scratch files\n",
"Reading metadata /data/tables/input/metadata/43100011.json\n"
]
},
{
"ename": "OperationalError",
"evalue": "no such column: last_processed",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mOperationalError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[26]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 4\u001b[39m os.remove(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mscratch_folder\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mproduct_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m_MetaData.csv\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 5\u001b[39m update_last_downloaded(product_id)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43mupdate_last_processed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 3\u001b[39m, in \u001b[36mupdate_last_processed\u001b[39m\u001b[34m(product_id)\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mupdate_last_processed\u001b[39m(product_id):\n\u001b[32m 2\u001b[39m time_finished_processing = datetime.now().isoformat()\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43mcur\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mUPDATE downloaded SET last_processed = ? WHERE product_id = ?\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtime_finished_processing\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m con.commit()\n",
"\u001b[31mOperationalError\u001b[39m: no such column: last_processed"
]
}
],
"source": [ "source": [
"# Remove the scratch files\n", "# Remove the scratch files\n",
"print(\"Removing scratch files\")\n", "print(\"Removing scratch files\")\n",
@@ -749,39 +977,6 @@
"update_last_downloaded(product_id)\n", "update_last_downloaded(product_id)\n",
"update_last_processed(product_id)" "update_last_processed(product_id)"
] ]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "06fb89ad-77ba-46db-bb88-15f5636d707d",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'process_cube' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[40]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocess_cube\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33m43100011\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[31mNameError\u001b[39m: name 'process_cube' is not defined"
]
}
],
"source": [
"process_cube(\"37100216\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9cc04f15-006f-4a3a-9610-65736820ba84",
"metadata": {},
"outputs": [],
"source": [
"# This one has multiple DECIMAL precision values\n",
"process_cube(\"43100011)"
]
} }
], ],
"metadata": { "metadata": {