mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Had to optimize the code. Leaving it outside of function for now in case I need to continue working on it
This commit is contained in:
@@ -3,20 +3,37 @@
|
|||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "8b767621-b96b-4eaa-a16a-0329bab29c0f",
|
"id": "8b767621-b96b-4eaa-a16a-0329bab29c0f",
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"editable": true,
|
||||||
|
"slideshow": {
|
||||||
|
"slide_type": ""
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# Notes\n",
|
"# Notes\n",
|
||||||
"- Make sure Statistics Canada knows about these issues:\n",
|
"- Make sure Statistics Canada knows about these issues:\n",
|
||||||
" - When downloading the XML for product_id 98100404 it just returns the structure document, not the data document\n",
|
" - When downloading the XML for product_id 98100404 it just returns the structure document, not the data document\n",
|
||||||
" - The releaseTime value is in Eastern Standard Zone for https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata and UTC for https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n",
|
" - The releaseTime value is in Eastern Standard Zone for https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata and UTC for https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite\n",
|
||||||
" - The releaseTime value is different when getting it from https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite and https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata . For example productId 10100007"
|
" - The releaseTime value is different when getting it from https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite and https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata . For example productId 10100007\n",
|
||||||
|
" - Currently having an issue with processing productId 13100442. Expected 18 fields but saw 19 at line 162\n",
|
||||||
|
" - There are cases where the `REF_DATE` is a range, ex. 2023/2024\n",
|
||||||
|
" - For productId 17100022 the period is from July 1 to June 30 (seen in the metadata), so can't just use January 1, 2023 and December 31, 2024\n",
|
||||||
|
" - There can be times when a table's `VALUE` field can have both float and integer data types. For example, productId 11100025 has `DECIMAL` values of 0 and 1, corresponding to integer and float data types\n",
|
||||||
|
" - Need to look into cases where there is no `DGUID`, but the `GEO` value tells you something like `All census metropolitan areas` (productId 11100025). I guess I can do the linking to the CMAs on the API side"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 209,
|
"execution_count": 1,
|
||||||
"id": "98859cd6-6fa4-4aef-a113-455699524fae",
|
"id": "98859cd6-6fa4-4aef-a113-455699524fae",
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"editable": true,
|
||||||
|
"slideshow": {
|
||||||
|
"slide_type": ""
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from datetime import datetime\n",
|
"from datetime import datetime\n",
|
||||||
@@ -29,7 +46,6 @@
|
|||||||
"from zoneinfo import ZoneInfo\n",
|
"from zoneinfo import ZoneInfo\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import polars as pl\n",
|
|
||||||
"import requests\n",
|
"import requests\n",
|
||||||
"from tqdm import tqdm\n",
|
"from tqdm import tqdm\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -38,14 +54,14 @@
|
|||||||
"scratch_folder = f\"{data_folder}/scratch\"\n",
|
"scratch_folder = f\"{data_folder}/scratch\"\n",
|
||||||
"output_folder = f\"{data_folder}/output\"\n",
|
"output_folder = f\"{data_folder}/output\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
|
||||||
"if not os.path.exists(f\"{data_folder}/processing.db\"):\n",
|
"if not os.path.exists(f\"{data_folder}/processing.db\"):\n",
|
||||||
" con = sqlite3.connect(f\"{data_folder}/processing.db\")\n",
|
" con = sqlite3.connect(f\"{data_folder}/processing.db\")\n",
|
||||||
" cur = con.cursor()\n",
|
" cur = con.cursor()\n",
|
||||||
" cur.executescript(\"\"\"\n",
|
" cur.executescript(\"\"\"\n",
|
||||||
" CREATE TABLE IF NOT EXISTS downloaded (\n",
|
" CREATE TABLE IF NOT EXISTS downloaded (\n",
|
||||||
" product_id TEXT PRIMARY KEY,\n",
|
" product_id TEXT PRIMARY KEY,\n",
|
||||||
" last_updated TEXT\n",
|
" last_updated TEXT,\n",
|
||||||
|
" last_processed TEXT\n",
|
||||||
" );\n",
|
" );\n",
|
||||||
"\n",
|
"\n",
|
||||||
" CREATE TABLE IF NOT EXISTS cubes (\n",
|
" CREATE TABLE IF NOT EXISTS cubes (\n",
|
||||||
@@ -61,7 +77,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 210,
|
"execution_count": 2,
|
||||||
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
|
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -83,7 +99,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 211,
|
"execution_count": 3,
|
||||||
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
|
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -93,9 +109,15 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 212,
|
"execution_count": 4,
|
||||||
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
|
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"editable": true,
|
||||||
|
"slideshow": {
|
||||||
|
"slide_type": ""
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def update_last_downloaded(product_id):\n",
|
"def update_last_downloaded(product_id):\n",
|
||||||
@@ -128,7 +150,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 213,
|
"execution_count": 5,
|
||||||
"id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
|
"id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -141,7 +163,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 214,
|
"execution_count": 6,
|
||||||
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
|
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -190,7 +212,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 215,
|
"execution_count": 7,
|
||||||
"id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
|
"id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -200,8 +222,66 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 228,
|
"execution_count": null,
|
||||||
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
|
"id": "eddf6501-8428-44cc-8d2d-e245803a3943",
|
||||||
|
"metadata": {
|
||||||
|
"editable": true,
|
||||||
|
"slideshow": {
|
||||||
|
"slide_type": ""
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def compute_ref_date_bounds(df):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" TODO: There are cases where the REF_DATE is a range, ex. 2023/2024.\n",
|
||||||
|
" For productId 17100022 the period is from July 1 to June 30 (seen in the metadata), so can't just \n",
|
||||||
|
" use January 1, 2023 and December 31, 2024\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" series = df[\"REF_DATE\"]\n",
|
||||||
|
"\n",
|
||||||
|
" # Initialize the two new columns with NaT\n",
|
||||||
|
" df[\"REF_START_DATE\"] = pd.NaT\n",
|
||||||
|
" df[\"REF_END_DATE\"] = pd.NaT\n",
|
||||||
|
"\n",
|
||||||
|
" # Skip rows that contain slashes\n",
|
||||||
|
" valid_mask = ~series.str.contains(\"/\", na=False)\n",
|
||||||
|
"\n",
|
||||||
|
" # Case 1: YYYY-MM-DD\n",
|
||||||
|
" full_mask = valid_mask & series.str.fullmatch(r\"\\d{4}-\\d{2}-\\d{2}\")\n",
|
||||||
|
" parsed_full = pd.to_datetime(series[full_mask], format=\"%Y-%m-%d\", errors=\"coerce\")\n",
|
||||||
|
" df.loc[full_mask, \"REF_START_DATE\"] = parsed_full\n",
|
||||||
|
" df.loc[full_mask, \"REF_END_DATE\"] = parsed_full\n",
|
||||||
|
"\n",
|
||||||
|
" # Case 2: YYYY-MM\n",
|
||||||
|
" month_mask = valid_mask & series.str.fullmatch(r\"\\d{4}-\\d{2}\")\n",
|
||||||
|
" parsed_month = pd.to_datetime(series[month_mask], format=\"%Y-%m\", errors=\"coerce\")\n",
|
||||||
|
" df.loc[month_mask, \"REF_START_DATE\"] = parsed_month\n",
|
||||||
|
" df.loc[month_mask, \"REF_END_DATE\"] = parsed_month + pd.to_timedelta(\n",
|
||||||
|
" parsed_month.dt.days_in_month - 1, unit='D'\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" # Case 3: YYYY\n",
|
||||||
|
" year_mask = valid_mask & series.str.fullmatch(r\"\\d{4}\")\n",
|
||||||
|
" parsed_year = pd.to_datetime(series[year_mask], format=\"%Y\", errors=\"coerce\")\n",
|
||||||
|
" df.loc[year_mask, \"REF_START_DATE\"] = parsed_year\n",
|
||||||
|
" df.loc[year_mask, \"REF_END_DATE\"] = parsed_year + pd.offsets.YearEnd(0)\n",
|
||||||
|
"\n",
|
||||||
|
" # Move columns after REF_DATE\n",
|
||||||
|
" ref_idx = df.columns.get_loc(\"REF_DATE\")\n",
|
||||||
|
" cols = list(df.columns)\n",
|
||||||
|
" cols.remove(\"REF_START_DATE\")\n",
|
||||||
|
" cols.remove(\"REF_END_DATE\")\n",
|
||||||
|
" cols[ref_idx + 1:ref_idx + 1] = [\"REF_START_DATE\", \"REF_END_DATE\"]\n",
|
||||||
|
"\n",
|
||||||
|
" return df[cols]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "e67642d3-cc6c-4c5d-b5a3-2fe18364ad71",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -211,27 +291,24 @@
|
|||||||
" For example, if the column is numerical and has a maximum value of 32,000 \n",
|
" For example, if the column is numerical and has a maximum value of 32,000 \n",
|
||||||
" we can assign it a type of int16\n",
|
" we can assign it a type of int16\n",
|
||||||
" \"\"\"\n",
|
" \"\"\"\n",
|
||||||
" print(\"Converting dataframe to optimal data types\")\n",
|
|
||||||
" params = {\n",
|
|
||||||
" 'convert_string': False,\n",
|
|
||||||
" 'convert_boolean': False\n",
|
|
||||||
" }\n",
|
|
||||||
" df = df.convert_dtypes(**params)\n",
|
|
||||||
"\n",
|
|
||||||
" dtypes = pd.DataFrame(df.dtypes)\n",
|
" dtypes = pd.DataFrame(df.dtypes)\n",
|
||||||
" # Downcast to the smallest numerical dtype\n",
|
" # Downcast to the smallest numerical dtype\n",
|
||||||
" for row in dtypes.itertuples():\n",
|
" for row in dtypes.itertuples():\n",
|
||||||
" column = row[0]\n",
|
" column = row[0]\n",
|
||||||
" the_type = str(row[1])\n",
|
" the_type = str(row[1])\n",
|
||||||
" # Skipping downcasting Float64 as there were issues with decimal places\n",
|
" if the_type == 'int64':\n",
|
||||||
" # For example, instead of a value being 65.4, it turned into 65.4000015258789\n",
|
|
||||||
" if the_type == 'Float64':\n",
|
|
||||||
" continue\n",
|
|
||||||
" elif the_type == 'Int64':\n",
|
|
||||||
" df[column] = pd.to_numeric(df[column], downcast='integer')\n",
|
" df[column] = pd.to_numeric(df[column], downcast='integer')\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return df\n",
|
" return df"
|
||||||
"\n",
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
"def extract_zipfile(product_id, language):\n",
|
"def extract_zipfile(product_id, language):\n",
|
||||||
" \"\"\"\n",
|
" \"\"\"\n",
|
||||||
" It is faster to extract the zip file and read the CSV, than open\n",
|
" It is faster to extract the zip file and read the CSV, than open\n",
|
||||||
@@ -270,74 +347,180 @@
|
|||||||
" if chunk: # filter out keep-alive new chunks\n",
|
" if chunk: # filter out keep-alive new chunks\n",
|
||||||
" handle.write(chunk)\n",
|
" handle.write(chunk)\n",
|
||||||
" progress_bar.update(len(chunk))\n",
|
" progress_bar.update(len(chunk))\n",
|
||||||
" progress_bar.close()\n",
|
" progress_bar.close()"
|
||||||
" \n",
|
|
||||||
"def process_cube(product_id, language=\"en\"):\n",
|
|
||||||
" cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
|
||||||
" result = cur.fetchone()\n",
|
|
||||||
" if result:\n",
|
|
||||||
" print(f\"Already processed {product_id}\")\n",
|
|
||||||
" return\n",
|
|
||||||
" extract_zipfile(product_id, language)\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" The pandas column reader is better than the Polars one\n",
|
|
||||||
" Here is an example where polars was not reading it right:\n",
|
|
||||||
" https://www150.statcan.gc.ca/n1/tbl/csv/98100404-eng.zip\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" # Get metadata\n",
|
|
||||||
" #metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n",
|
|
||||||
" #metadata = get_cube_metadata(product_id)\n",
|
|
||||||
" #print(f\"Writing metadata file {metadata_file}\")\n",
|
|
||||||
" #with open(metadata_file, \"w\") as outfile:\n",
|
|
||||||
" # json.dump(metadata, outfile)\n",
|
|
||||||
" # Read CSV using Pandas\n",
|
|
||||||
" product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
|
|
||||||
" parameters = {\n",
|
|
||||||
" \"engine\": \"c\",\n",
|
|
||||||
" \"low_memory\": True\n",
|
|
||||||
" }\n",
|
|
||||||
" print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
|
|
||||||
" df = pd.read_csv(product_csv, **parameters)\n",
|
|
||||||
" df = convert_to_lowest_type(df)\n",
|
|
||||||
" # Fix mixed types. This happened in productId 11100147\n",
|
|
||||||
" for col in df.select_dtypes(include=[\"object\"]).columns:\n",
|
|
||||||
" types_in_col = df[col].map(type).unique()\n",
|
|
||||||
" # If there's any non-str type in the column\n",
|
|
||||||
" if any(t != str for t in types_in_col):\n",
|
|
||||||
" print(f\"Fixing mixed types in column: {col} — types found: {types_in_col}\")\n",
|
|
||||||
" # Replace non-str values with None\n",
|
|
||||||
" df[col] = df[col].where(df[col].map(type) == str, None).astype(\"string\")\n",
|
|
||||||
" print(\"Import Pandas dataframe as a Polars dataframe\")\n",
|
|
||||||
" df = pl.from_pandas(df)\n",
|
|
||||||
" output_parquet = f\"{output_folder}/{language}/{product_id}.parquet\"\n",
|
|
||||||
" print(f\"Exporting dataframe as parquet to {output_parquet}\")\n",
|
|
||||||
" df.write_parquet(output_parquet,\n",
|
|
||||||
" compression='zstd',\n",
|
|
||||||
" compression_level=22)\n",
|
|
||||||
" # Remove the scratch files\n",
|
|
||||||
" print(\"Removing scratch files\")\n",
|
|
||||||
" os.remove(f\"{scratch_folder}/{product_id}.csv\")\n",
|
|
||||||
" os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\")\n",
|
|
||||||
" update_last_downloaded(product_id)\n",
|
|
||||||
" update_last_processed(product_id)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 229,
|
"execution_count": 12,
|
||||||
"id": "6be0842e-c1e0-45b4-90e5-0be28b963aed",
|
"id": "858e405e-7c02-4193-8abe-f23951761b09",
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"editable": true,
|
||||||
|
"slideshow": {
|
||||||
|
"slide_type": ""
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"product_id = \"43100011\"\n",
|
||||||
|
"#def process_cube(product_id, language=\"en\"):\n",
|
||||||
|
"language = \"en\"\n",
|
||||||
|
"cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
||||||
|
"result = cur.fetchone()\n",
|
||||||
|
"if result:\n",
|
||||||
|
" print(f\"Already processed {product_id}\")\n",
|
||||||
|
" #return\n",
|
||||||
|
"#extract_zipfile(product_id, language)\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"The pandas column reader is better than the Polars one\n",
|
||||||
|
"Here is an example where polars was not reading it right:\n",
|
||||||
|
"https://www150.statcan.gc.ca/n1/tbl/csv/98100404-eng.zip\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"# Get metadata\n",
|
||||||
|
"#metadata_file = f\"{input_folder}/metadata/{product_id}.json\"\n",
|
||||||
|
"#metadata = get_cube_metadata(product_id)\n",
|
||||||
|
"#print(f\"Writing metadata file {metadata_file}\")\n",
|
||||||
|
"#with open(metadata_file, \"w\") as outfile:\n",
|
||||||
|
"# json.dump(metadata, outfile)\n",
|
||||||
|
"# Read CSV using Pandas\n",
|
||||||
|
"product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
|
||||||
|
"parameters = {\n",
|
||||||
|
" \"engine\": \"c\",\n",
|
||||||
|
" \"low_memory\": True,\n",
|
||||||
|
" #\"nrows\": 1000000,\n",
|
||||||
|
" \"dtype\": {}\n",
|
||||||
|
"}\n",
|
||||||
|
"columns = pd.read_csv(product_csv, nrows=0).columns\n",
|
||||||
|
"\n",
|
||||||
|
"columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n",
|
||||||
|
"for column in columns_always_int_8:\n",
|
||||||
|
" if column in columns:\n",
|
||||||
|
" parameters[\"dtype\"][column] = 'int8'\n",
|
||||||
|
"\n",
|
||||||
|
"columns_always_int_16 = [\"UOM_ID\"]\n",
|
||||||
|
"for column in columns_always_int_16:\n",
|
||||||
|
" if column in columns:\n",
|
||||||
|
" parameters[\"dtype\"][column] = 'int16'\n",
|
||||||
|
"\n",
|
||||||
|
"for column in columns:\n",
|
||||||
|
" if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n",
|
||||||
|
" parameters[\"dtype\"][column] = 'string'\n",
|
||||||
|
"\n",
|
||||||
|
"if not parameters[\"dtype\"]:\n",
|
||||||
|
" del parameters[\"dtype\"]\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
|
||||||
|
"df = pd.read_csv(product_csv, **parameters)\n",
|
||||||
|
"\n",
|
||||||
|
"unique_decimal_values = len(df[\"DECIMALS\"].unique())\n",
|
||||||
|
"if unique_decimal_values > 1:\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Example of when you can have a table with DECIMALS = 0 and DECIMALS = 1, productId 11100025\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" # Turn to float if not already\n",
|
||||||
|
" convert_dict = {\"VALUE\": \"float64\"}\n",
|
||||||
|
" df = df.astype(convert_dict)\n",
|
||||||
|
"elif unique_decimal_values == 1:\n",
|
||||||
|
" if df[\"VALUE\"].dtype != \"int64\":\n",
|
||||||
|
" # Turn to int64 if not already\n",
|
||||||
|
" convert_dict = {\"VALUE\": \"int64\"}\n",
|
||||||
|
" df = df.astype(convert_dict)\n",
|
||||||
|
"\n",
|
||||||
|
"df = convert_to_lowest_type(df)\n",
|
||||||
|
"df = compute_ref_date_bounds(df)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9",
|
||||||
|
"metadata": {
|
||||||
|
"editable": true,
|
||||||
|
"slideshow": {
|
||||||
|
"slide_type": ""
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Already processed 11100147\n"
|
"Exporting dataframe as parquet to /data/tables/output/en/43100011.parquet\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"IOStream.flush timed out\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"process_cube(\"11100147\")"
|
"output_parquet = f\"{output_folder}/{language}/{product_id}.parquet\"\n",
|
||||||
|
"print(f\"Exporting dataframe as parquet to {output_parquet}\")\n",
|
||||||
|
"parameters = {\n",
|
||||||
|
" \"path\": output_parquet,\n",
|
||||||
|
" \"engine\": \"pyarrow\",\n",
|
||||||
|
" \"compression\": \"zstd\",\n",
|
||||||
|
" \"index\": False,\n",
|
||||||
|
" \"compression_level\": 22\n",
|
||||||
|
"}\n",
|
||||||
|
"df.to_parquet(**parameters)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"id": "788bc668-8057-4e06-91a3-b99991e0a410",
|
||||||
|
"metadata": {
|
||||||
|
"editable": true,
|
||||||
|
"slideshow": {
|
||||||
|
"slide_type": ""
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Removing scratch files\n",
|
||||||
|
"Reading metadata /data/tables/input/metadata/43100011.json\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ename": "OperationalError",
|
||||||
|
"evalue": "no such column: last_processed",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||||
|
"\u001b[31mOperationalError\u001b[39m Traceback (most recent call last)",
|
||||||
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[26]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 4\u001b[39m os.remove(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mscratch_folder\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mproduct_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m_MetaData.csv\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 5\u001b[39m update_last_downloaded(product_id)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[43mupdate_last_processed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||||
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 3\u001b[39m, in \u001b[36mupdate_last_processed\u001b[39m\u001b[34m(product_id)\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mupdate_last_processed\u001b[39m(product_id):\n\u001b[32m 2\u001b[39m time_finished_processing = datetime.now().isoformat()\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43mcur\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mUPDATE downloaded SET last_processed = ? WHERE product_id = ?\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mtime_finished_processing\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mproduct_id\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m con.commit()\n",
|
||||||
|
"\u001b[31mOperationalError\u001b[39m: no such column: last_processed"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Remove the scratch files\n",
|
||||||
|
"print(\"Removing scratch files\")\n",
|
||||||
|
"os.remove(f\"{scratch_folder}/{product_id}.csv\")\n",
|
||||||
|
"os.remove(f\"{scratch_folder}/{product_id}_MetaData.csv\")\n",
|
||||||
|
"update_last_downloaded(product_id)\n",
|
||||||
|
"update_last_processed(product_id)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"id": "06fb89ad-77ba-46db-bb88-15f5636d707d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#process_cube(\"43100011\")"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
Reference in New Issue
Block a user