mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Made changes
This commit is contained in:
@@ -8,7 +8,6 @@ import zipfile
|
|||||||
from zoneinfo import ZoneInfo
|
from zoneinfo import ZoneInfo
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import polars as pl
|
|
||||||
import requests
|
import requests
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
@@ -17,7 +16,6 @@ input_folder = f"{data_folder}/input"
|
|||||||
scratch_folder = f"{data_folder}/scratch"
|
scratch_folder = f"{data_folder}/scratch"
|
||||||
output_folder = f"{data_folder}/output"
|
output_folder = f"{data_folder}/output"
|
||||||
|
|
||||||
|
|
||||||
if not os.path.exists(f"{data_folder}/processing.db"):
|
if not os.path.exists(f"{data_folder}/processing.db"):
|
||||||
con = sqlite3.connect(f"{data_folder}/processing.db")
|
con = sqlite3.connect(f"{data_folder}/processing.db")
|
||||||
cur = con.cursor()
|
cur = con.cursor()
|
||||||
@@ -42,7 +40,7 @@ def setup():
|
|||||||
"""
|
"""
|
||||||
Makes data folders
|
Makes data folders
|
||||||
"""
|
"""
|
||||||
folders_to_create = [data_folder, input_folder,
|
folders_to_create = [data_folder, input_folder,
|
||||||
scratch_folder, output_folder,
|
scratch_folder, output_folder,
|
||||||
f"{input_folder}/en", f"{output_folder}/en",
|
f"{input_folder}/en", f"{output_folder}/en",
|
||||||
f"{input_folder}/fr", f"{output_folder}/fr",
|
f"{input_folder}/fr", f"{output_folder}/fr",
|
||||||
@@ -125,29 +123,62 @@ def update_tables():
|
|||||||
download_cube(product_id)
|
download_cube(product_id)
|
||||||
process_cube(product_id)
|
process_cube(product_id)
|
||||||
|
|
||||||
|
def compute_ref_date_bounds(df):
|
||||||
|
"""
|
||||||
|
TODO: There are cases where the REF_DATE is a range, ex. 2023/2024.
|
||||||
|
For productId 17100022 the period is from July 1 to June 30 (seen in the metadata), so can't just
|
||||||
|
use January 1, 2023 and December 31, 2024
|
||||||
|
"""
|
||||||
|
series = df["REF_DATE"]
|
||||||
|
|
||||||
|
# Initialize the two new columns with NaT
|
||||||
|
df["REF_START_DATE"] = pd.NaT
|
||||||
|
df["REF_END_DATE"] = pd.NaT
|
||||||
|
|
||||||
|
# Skip rows that contain slashes
|
||||||
|
valid_mask = ~series.str.contains("/", na=False)
|
||||||
|
|
||||||
|
# Case 1: YYYY-MM-DD
|
||||||
|
full_mask = valid_mask & series.str.fullmatch(r"\d{4}-\d{2}-\d{2}")
|
||||||
|
parsed_full = pd.to_datetime(series[full_mask], format="%Y-%m-%d", errors="coerce")
|
||||||
|
df.loc[full_mask, "REF_START_DATE"] = parsed_full
|
||||||
|
df.loc[full_mask, "REF_END_DATE"] = parsed_full
|
||||||
|
|
||||||
|
# Case 2: YYYY-MM
|
||||||
|
month_mask = valid_mask & series.str.fullmatch(r"\d{4}-\d{2}")
|
||||||
|
parsed_month = pd.to_datetime(series[month_mask], format="%Y-%m", errors="coerce")
|
||||||
|
df.loc[month_mask, "REF_START_DATE"] = parsed_month
|
||||||
|
df.loc[month_mask, "REF_END_DATE"] = parsed_month + pd.to_timedelta(
|
||||||
|
parsed_month.dt.days_in_month - 1, unit='D'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Case 3: YYYY
|
||||||
|
year_mask = valid_mask & series.str.fullmatch(r"\d{4}")
|
||||||
|
parsed_year = pd.to_datetime(series[year_mask], format="%Y", errors="coerce")
|
||||||
|
df.loc[year_mask, "REF_START_DATE"] = parsed_year
|
||||||
|
df.loc[year_mask, "REF_END_DATE"] = parsed_year + pd.offsets.YearEnd(0)
|
||||||
|
|
||||||
|
# Move columns after REF_DATE
|
||||||
|
ref_idx = df.columns.get_loc("REF_DATE")
|
||||||
|
cols = list(df.columns)
|
||||||
|
cols.remove("REF_START_DATE")
|
||||||
|
cols.remove("REF_END_DATE")
|
||||||
|
cols[ref_idx + 1:ref_idx + 1] = ["REF_START_DATE", "REF_END_DATE"]
|
||||||
|
|
||||||
|
return df[cols]
|
||||||
|
|
||||||
def convert_to_lowest_type(df):
|
def convert_to_lowest_type(df):
|
||||||
"""
|
"""
|
||||||
Convert columns to the best possible dtypes
|
Convert columns to the best possible dtypes
|
||||||
For example, if the column is numerical and has a maximum value of 32,000
|
For example, if the column is numerical and has a maximum value of 32,000
|
||||||
we can assign it a type of int16
|
we can assign it a type of int16
|
||||||
"""
|
"""
|
||||||
print("Converting dataframe to optimal data types")
|
|
||||||
params = {
|
|
||||||
'convert_string': False,
|
|
||||||
'convert_boolean': False
|
|
||||||
}
|
|
||||||
df = df.convert_dtypes(**params)
|
|
||||||
|
|
||||||
dtypes = pd.DataFrame(df.dtypes)
|
dtypes = pd.DataFrame(df.dtypes)
|
||||||
# Downcast to the smallest numerical dtype
|
# Downcast to the smallest numerical dtype
|
||||||
for row in dtypes.itertuples():
|
for row in dtypes.itertuples():
|
||||||
column = row[0]
|
column = row[0]
|
||||||
the_type = str(row[1])
|
the_type = str(row[1])
|
||||||
# Skipping downcasting Float64 as there were issues with decimal places
|
if the_type == 'int64':
|
||||||
# For example, instead of a value being 65.4, it turned into 65.4000015258789
|
|
||||||
if the_type == 'Float64':
|
|
||||||
continue
|
|
||||||
elif the_type == 'Int64':
|
|
||||||
df[column] = pd.to_numeric(df[column], downcast='integer')
|
df[column] = pd.to_numeric(df[column], downcast='integer')
|
||||||
|
|
||||||
return df
|
return df
|
||||||
@@ -193,6 +224,17 @@ def download_cube(product_id, language="en"):
|
|||||||
progress_bar.close()
|
progress_bar.close()
|
||||||
|
|
||||||
def process_cube(product_id, language="en"):
|
def process_cube(product_id, language="en"):
|
||||||
|
"""
|
||||||
|
Examples:
|
||||||
|
- productId 43100011 has all with DECIMAL = 1 (float64)
|
||||||
|
- productId 17100009 has DECIMAL = 0 (int64)
|
||||||
|
- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)
|
||||||
|
"""
|
||||||
|
cur.execute("SELECT product_id FROM downloaded WHERE product_id = ?", (product_id,))
|
||||||
|
result = cur.fetchone()
|
||||||
|
if result:
|
||||||
|
print(f"Already processed {product_id}")
|
||||||
|
return
|
||||||
extract_zipfile(product_id, language)
|
extract_zipfile(product_id, language)
|
||||||
"""
|
"""
|
||||||
The pandas column reader is better than the Polars one
|
The pandas column reader is better than the Polars one
|
||||||
@@ -207,20 +249,63 @@ def process_cube(product_id, language="en"):
|
|||||||
# json.dump(metadata, outfile)
|
# json.dump(metadata, outfile)
|
||||||
# Read CSV using Pandas
|
# Read CSV using Pandas
|
||||||
product_csv = f"{scratch_folder}/{product_id}.csv"
|
product_csv = f"{scratch_folder}/{product_id}.csv"
|
||||||
|
print(f"Reading {product_csv}")
|
||||||
parameters = {
|
parameters = {
|
||||||
"engine": "c",
|
"engine": "c",
|
||||||
"low_memory": True
|
"low_memory": True,
|
||||||
|
"nrows": 100000,
|
||||||
|
"dtype": {}
|
||||||
}
|
}
|
||||||
|
columns = pd.read_csv(product_csv, nrows=0).columns
|
||||||
|
|
||||||
|
columns_always_int_8 = ["DECIMALS", "SCALAR_ID"]
|
||||||
|
for column in columns_always_int_8:
|
||||||
|
if column in columns:
|
||||||
|
parameters["dtype"][column] = 'int8'
|
||||||
|
|
||||||
|
columns_always_int_16 = ["UOM_ID"]
|
||||||
|
for column in columns_always_int_16:
|
||||||
|
if column in columns:
|
||||||
|
parameters["dtype"][column] = 'int16'
|
||||||
|
|
||||||
|
for column in columns:
|
||||||
|
if column not in columns_always_int_8 and column not in columns_always_int_16 and column != "VALUE":
|
||||||
|
parameters["dtype"][column] = 'string'
|
||||||
|
|
||||||
|
if not parameters["dtype"]:
|
||||||
|
del parameters["dtype"]
|
||||||
|
|
||||||
print(f"Reading {product_csv} as a Pandas dataframe")
|
print(f"Reading {product_csv} as a Pandas dataframe")
|
||||||
df = pd.read_csv(product_csv, **parameters)
|
df = pd.read_csv(product_csv, **parameters)
|
||||||
|
unique_decimal_values = df["DECIMALS"].unique()
|
||||||
|
if any(unique_decimal_values):
|
||||||
|
"""
|
||||||
|
A table can have both float and integer in the VALUE field.
|
||||||
|
productId 11100025 is an example
|
||||||
|
So if we have unique values for DECIMALS to be [0,1], then we convert to float64
|
||||||
|
"""
|
||||||
|
convert_dict = {"VALUE": "float64"}
|
||||||
|
print(convert_dict)
|
||||||
|
df = df.astype(convert_dict)
|
||||||
|
elif 0 in (unique_decimal_values):
|
||||||
|
if df["VALUE"].dtype != "Int64":
|
||||||
|
# If DECIMALS = [0]
|
||||||
|
convert_dict = {"VALUE": "Int64"}
|
||||||
|
print(convert_dict)
|
||||||
|
df = df.astype(convert_dict)
|
||||||
|
|
||||||
df = convert_to_lowest_type(df)
|
df = convert_to_lowest_type(df)
|
||||||
print("Import Pandas dataframe as a Polars dataframe")
|
df = compute_ref_date_bounds(df)
|
||||||
df = pl.from_pandas(df)
|
|
||||||
output_parquet = f"{output_folder}/{language}/{product_id}.parquet"
|
output_parquet = f"{output_folder}/{language}/{product_id}.parquet"
|
||||||
print(f"Exporting dataframe as parquet to {output_parquet}")
|
print(f"Exporting dataframe as parquet to {output_parquet}")
|
||||||
df.write_parquet(output_parquet,
|
parameters = {
|
||||||
compression='zstd',
|
"path": output_parquet,
|
||||||
compression_level=22)
|
"engine": "pyarrow",
|
||||||
|
"compression": "zstd",
|
||||||
|
"index": False,
|
||||||
|
"compression_level": 22
|
||||||
|
}
|
||||||
|
df.to_parquet(**parameters)
|
||||||
# Remove the scratch files
|
# Remove the scratch files
|
||||||
print("Removing scratch files")
|
print("Removing scratch files")
|
||||||
os.remove(f"{scratch_folder}/{product_id}.csv")
|
os.remove(f"{scratch_folder}/{product_id}.csv")
|
||||||
@@ -228,6 +313,7 @@ def process_cube(product_id, language="en"):
|
|||||||
update_last_downloaded(product_id)
|
update_last_downloaded(product_id)
|
||||||
update_last_processed(product_id)
|
update_last_processed(product_id)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
setup()
|
setup()
|
||||||
files_to_process = glob.glob(f"{input_folder}/en/*.zip")
|
files_to_process = glob.glob(f"{input_folder}/en/*.zip")
|
||||||
@@ -235,5 +321,7 @@ if __name__ == '__main__':
|
|||||||
files_to_process = [x.split("/")[-1].split(".zip")[0] for x in files_to_process]
|
files_to_process = [x.split("/")[-1].split(".zip")[0] for x in files_to_process]
|
||||||
to_process = len(files_to_process)
|
to_process = len(files_to_process)
|
||||||
print(f"Processing {to_process}")
|
print(f"Processing {to_process}")
|
||||||
with Pool(4) as p:
|
#for product_id in files_to_process:
|
||||||
p.map(process_cube, files_to_process)
|
# process_cube(product_id)
|
||||||
|
with Pool(processes=16) as p:
|
||||||
|
p.map(process_cube, files_to_process, chunksize=8)
|
||||||
|
|||||||
@@ -25,7 +25,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 20,
|
||||||
"id": "98859cd6-6fa4-4aef-a113-455699524fae",
|
"id": "98859cd6-6fa4-4aef-a113-455699524fae",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"editable": true,
|
"editable": true,
|
||||||
@@ -77,7 +77,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 21,
|
||||||
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
|
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -99,7 +99,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 22,
|
||||||
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
|
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -109,7 +109,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 23,
|
||||||
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
|
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"editable": true,
|
"editable": true,
|
||||||
@@ -136,12 +136,12 @@
|
|||||||
" last_updated = datetime.strptime(last_updated, \"%Y-%m-%dT%H:%M\")\n",
|
" last_updated = datetime.strptime(last_updated, \"%Y-%m-%dT%H:%M\")\n",
|
||||||
" last_updated = last_updated.replace(tzinfo=ZoneInfo(\"America/Toronto\"))\n",
|
" last_updated = last_updated.replace(tzinfo=ZoneInfo(\"America/Toronto\"))\n",
|
||||||
" last_updated = last_updated.astimezone(ZoneInfo(\"UTC\")).isoformat()\n",
|
" last_updated = last_updated.astimezone(ZoneInfo(\"UTC\")).isoformat()\n",
|
||||||
" \n",
|
"\n",
|
||||||
" data = (product_id, last_updated)\n",
|
" data = (product_id, last_updated)\n",
|
||||||
" cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
" cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
||||||
" result = cur.fetchone()\n",
|
" result = cur.fetchone()\n",
|
||||||
" if not result:\n",
|
" if not result:\n",
|
||||||
" cur.execute(\"INSERT INTO downloaded VALUES (?, ?)\", data)\n",
|
" cur.execute(\"INSERT INTO downloaded (product_id, last_updated) VALUES (?, ?)\", data)\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" cur.execute(\"UPDATE downloaded SET last_updated = ? WHERE product_id = ?\", (last_updated, product_id))\n",
|
" cur.execute(\"UPDATE downloaded SET last_updated = ? WHERE product_id = ?\", (last_updated, product_id))\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -150,7 +150,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 24,
|
||||||
"id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
|
"id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -163,7 +163,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 25,
|
||||||
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
|
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -212,7 +212,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 26,
|
||||||
"id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
|
"id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -222,7 +222,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 27,
|
||||||
"id": "eddf6501-8428-44cc-8d2d-e245803a3943",
|
"id": "eddf6501-8428-44cc-8d2d-e245803a3943",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"editable": true,
|
"editable": true,
|
||||||
@@ -280,7 +280,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 28,
|
||||||
"id": "e67642d3-cc6c-4c5d-b5a3-2fe18364ad71",
|
"id": "e67642d3-cc6c-4c5d-b5a3-2fe18364ad71",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -304,7 +304,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 29,
|
||||||
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
|
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -352,7 +352,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 78,
|
||||||
"id": "858e405e-7c02-4193-8abe-f23951761b09",
|
"id": "858e405e-7c02-4193-8abe-f23951761b09",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"editable": true,
|
"editable": true,
|
||||||
@@ -361,9 +361,32 @@
|
|||||||
},
|
},
|
||||||
"tags": []
|
"tags": []
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Reading /data/tables/scratch/13100102.csv\n",
|
||||||
|
"Index(['REF_DATE', 'GEO', 'DGUID',\n",
|
||||||
|
" 'North American Industry Classification System (NAICS)',\n",
|
||||||
|
" 'Summary statistics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',\n",
|
||||||
|
" 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED',\n",
|
||||||
|
" 'DECIMALS'],\n",
|
||||||
|
" dtype='object')\n",
|
||||||
|
"{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'North American Industry Classification System (NAICS)': 'string', 'Summary statistics': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n",
|
||||||
|
"Reading /data/tables/scratch/13100102.csv as a Pandas dataframe\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"product_id = \"43100011\"\n",
|
"\"\"\"\n",
|
||||||
|
"Examples: \n",
|
||||||
|
"- productId 43100011 has all with DECIMAL = 1 (float64)\n",
|
||||||
|
"- productId 17100009 has DECIMAL = 0 (int64)\n",
|
||||||
|
"- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"product_id = \"13100102\"\n",
|
||||||
"#def process_cube(product_id, language=\"en\"):\n",
|
"#def process_cube(product_id, language=\"en\"):\n",
|
||||||
"language = \"en\"\n",
|
"language = \"en\"\n",
|
||||||
"cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
"cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
||||||
@@ -385,13 +408,16 @@
|
|||||||
"# json.dump(metadata, outfile)\n",
|
"# json.dump(metadata, outfile)\n",
|
||||||
"# Read CSV using Pandas\n",
|
"# Read CSV using Pandas\n",
|
||||||
"product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
|
"product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
|
||||||
|
"print(f\"Reading {product_csv}\")\n",
|
||||||
"parameters = {\n",
|
"parameters = {\n",
|
||||||
" \"engine\": \"c\",\n",
|
" \"engine\": \"c\",\n",
|
||||||
" \"low_memory\": True,\n",
|
" \"low_memory\": True,\n",
|
||||||
" #\"nrows\": 1000000,\n",
|
" \"nrows\": 1000000,\n",
|
||||||
" \"dtype\": {}\n",
|
" \"dtype\": {}\n",
|
||||||
"}\n",
|
"}\n",
|
||||||
|
"\n",
|
||||||
"columns = pd.read_csv(product_csv, nrows=0).columns\n",
|
"columns = pd.read_csv(product_csv, nrows=0).columns\n",
|
||||||
|
"print(columns)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n",
|
"columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n",
|
||||||
"for column in columns_always_int_8:\n",
|
"for column in columns_always_int_8:\n",
|
||||||
@@ -407,24 +433,46 @@
|
|||||||
" if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n",
|
" if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n",
|
||||||
" parameters[\"dtype\"][column] = 'string'\n",
|
" parameters[\"dtype\"][column] = 'string'\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"print(parameters)\n",
|
||||||
"if not parameters[\"dtype\"]:\n",
|
"if not parameters[\"dtype\"]:\n",
|
||||||
" del parameters[\"dtype\"]\n",
|
" del parameters[\"dtype\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
|
"print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
|
||||||
"df = pd.read_csv(product_csv, **parameters)\n",
|
"df = pd.read_csv(product_csv, **parameters)"
|
||||||
"\n",
|
]
|
||||||
"unique_decimal_values = len(df[\"DECIMALS\"].unique())\n",
|
},
|
||||||
"if unique_decimal_values > 1:\n",
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 79,
|
||||||
|
"id": "7579a135-1dfe-4fc0-991b-4b261d6577e0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[1]\n",
|
||||||
|
"{'VALUE': 'float64'}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"unique_decimal_values = df[\"DECIMALS\"].unique()\n",
|
||||||
|
"print(unique_decimal_values)\n",
|
||||||
|
"if any(unique_decimal_values):\n",
|
||||||
" \"\"\"\n",
|
" \"\"\"\n",
|
||||||
" Example of when you can have a table with DECIMALS = 0 and DECIMALS = 1, productId 11100025\n",
|
" A table can have both float and integer in the VALUE field. \n",
|
||||||
|
" productId 11100025 is an example\n",
|
||||||
|
" So if we have unique values for DECIMALS to be [0,1], then we convert to float64\n",
|
||||||
" \"\"\"\n",
|
" \"\"\"\n",
|
||||||
" # Turn to float if not already\n",
|
|
||||||
" convert_dict = {\"VALUE\": \"float64\"}\n",
|
" convert_dict = {\"VALUE\": \"float64\"}\n",
|
||||||
|
" print(convert_dict)\n",
|
||||||
" df = df.astype(convert_dict)\n",
|
" df = df.astype(convert_dict)\n",
|
||||||
"elif unique_decimal_values == 1:\n",
|
"elif 0 in (unique_decimal_values):\n",
|
||||||
" if df[\"VALUE\"].dtype != \"int64\":\n",
|
" if df[\"VALUE\"].dtype != \"int64\":\n",
|
||||||
" # Turn to int64 if not already\n",
|
" # If DECIMALS = [0]\n",
|
||||||
" convert_dict = {\"VALUE\": \"int64\"}\n",
|
" convert_dict = {\"VALUE\": \"int64\"}\n",
|
||||||
|
" print(convert_dict)\n",
|
||||||
" df = df.astype(convert_dict)\n",
|
" df = df.astype(convert_dict)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df = convert_to_lowest_type(df)\n",
|
"df = convert_to_lowest_type(df)\n",
|
||||||
@@ -433,7 +481,203 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": 80,
|
||||||
|
"id": "6c2781b3-8eea-4317-a8c0-083d97ee04fc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>REF_DATE</th>\n",
|
||||||
|
" <th>REF_START_DATE</th>\n",
|
||||||
|
" <th>REF_END_DATE</th>\n",
|
||||||
|
" <th>GEO</th>\n",
|
||||||
|
" <th>DGUID</th>\n",
|
||||||
|
" <th>North American Industry Classification System (NAICS)</th>\n",
|
||||||
|
" <th>Summary statistics</th>\n",
|
||||||
|
" <th>UOM</th>\n",
|
||||||
|
" <th>UOM_ID</th>\n",
|
||||||
|
" <th>SCALAR_FACTOR</th>\n",
|
||||||
|
" <th>SCALAR_ID</th>\n",
|
||||||
|
" <th>VECTOR</th>\n",
|
||||||
|
" <th>COORDINATE</th>\n",
|
||||||
|
" <th>VALUE</th>\n",
|
||||||
|
" <th>STATUS</th>\n",
|
||||||
|
" <th>SYMBOL</th>\n",
|
||||||
|
" <th>TERMINATED</th>\n",
|
||||||
|
" <th>DECIMALS</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>2014</td>\n",
|
||||||
|
" <td>2014-01-01</td>\n",
|
||||||
|
" <td>2014-12-31</td>\n",
|
||||||
|
" <td>Canada</td>\n",
|
||||||
|
" <td>2016A000011124</td>\n",
|
||||||
|
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||||
|
" <td>Operating revenue</td>\n",
|
||||||
|
" <td>Dollars</td>\n",
|
||||||
|
" <td>81</td>\n",
|
||||||
|
" <td>millions</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" <td>v114809189</td>\n",
|
||||||
|
" <td>1.1.1</td>\n",
|
||||||
|
" <td>9310.7</td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>2014</td>\n",
|
||||||
|
" <td>2014-01-01</td>\n",
|
||||||
|
" <td>2014-12-31</td>\n",
|
||||||
|
" <td>Canada</td>\n",
|
||||||
|
" <td>2016A000011124</td>\n",
|
||||||
|
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||||
|
" <td>Operating expenses</td>\n",
|
||||||
|
" <td>Dollars</td>\n",
|
||||||
|
" <td>81</td>\n",
|
||||||
|
" <td>millions</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" <td>v114809190</td>\n",
|
||||||
|
" <td>1.1.2</td>\n",
|
||||||
|
" <td>8499.5</td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>2014</td>\n",
|
||||||
|
" <td>2014-01-01</td>\n",
|
||||||
|
" <td>2014-12-31</td>\n",
|
||||||
|
" <td>Canada</td>\n",
|
||||||
|
" <td>2016A000011124</td>\n",
|
||||||
|
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||||
|
" <td>Salaries, wages, commissions and benefits</td>\n",
|
||||||
|
" <td>Dollars</td>\n",
|
||||||
|
" <td>81</td>\n",
|
||||||
|
" <td>millions</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" <td>v114809191</td>\n",
|
||||||
|
" <td>1.1.3</td>\n",
|
||||||
|
" <td>4630.3</td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>2014</td>\n",
|
||||||
|
" <td>2014-01-01</td>\n",
|
||||||
|
" <td>2014-12-31</td>\n",
|
||||||
|
" <td>Canada</td>\n",
|
||||||
|
" <td>2016A000011124</td>\n",
|
||||||
|
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||||
|
" <td>Operating profit margin</td>\n",
|
||||||
|
" <td>Percent</td>\n",
|
||||||
|
" <td>239</td>\n",
|
||||||
|
" <td>units</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>v114809192</td>\n",
|
||||||
|
" <td>1.1.4</td>\n",
|
||||||
|
" <td>8.7</td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>2014</td>\n",
|
||||||
|
" <td>2014-01-01</td>\n",
|
||||||
|
" <td>2014-12-31</td>\n",
|
||||||
|
" <td>Newfoundland and Labrador</td>\n",
|
||||||
|
" <td>2016A000210</td>\n",
|
||||||
|
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||||
|
" <td>Operating revenue</td>\n",
|
||||||
|
" <td>Dollars</td>\n",
|
||||||
|
" <td>81</td>\n",
|
||||||
|
" <td>millions</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" <td>v114809193</td>\n",
|
||||||
|
" <td>2.1.1</td>\n",
|
||||||
|
" <td>97.9</td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td><NA></td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" REF_DATE REF_START_DATE REF_END_DATE GEO \\\n",
|
||||||
|
"0 2014 2014-01-01 2014-12-31 Canada \n",
|
||||||
|
"1 2014 2014-01-01 2014-12-31 Canada \n",
|
||||||
|
"2 2014 2014-01-01 2014-12-31 Canada \n",
|
||||||
|
"3 2014 2014-01-01 2014-12-31 Canada \n",
|
||||||
|
"4 2014 2014-01-01 2014-12-31 Newfoundland and Labrador \n",
|
||||||
|
"\n",
|
||||||
|
" DGUID North American Industry Classification System (NAICS) \\\n",
|
||||||
|
"0 2016A000011124 Nursing and residential care facilities [623] \n",
|
||||||
|
"1 2016A000011124 Nursing and residential care facilities [623] \n",
|
||||||
|
"2 2016A000011124 Nursing and residential care facilities [623] \n",
|
||||||
|
"3 2016A000011124 Nursing and residential care facilities [623] \n",
|
||||||
|
"4 2016A000210 Nursing and residential care facilities [623] \n",
|
||||||
|
"\n",
|
||||||
|
" Summary statistics UOM UOM_ID SCALAR_FACTOR \\\n",
|
||||||
|
"0 Operating revenue Dollars 81 millions \n",
|
||||||
|
"1 Operating expenses Dollars 81 millions \n",
|
||||||
|
"2 Salaries, wages, commissions and benefits Dollars 81 millions \n",
|
||||||
|
"3 Operating profit margin Percent 239 units \n",
|
||||||
|
"4 Operating revenue Dollars 81 millions \n",
|
||||||
|
"\n",
|
||||||
|
" SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS \n",
|
||||||
|
"0 6 v114809189 1.1.1 9310.7 <NA> <NA> <NA> 1 \n",
|
||||||
|
"1 6 v114809190 1.1.2 8499.5 <NA> <NA> <NA> 1 \n",
|
||||||
|
"2 6 v114809191 1.1.3 4630.3 <NA> <NA> <NA> 1 \n",
|
||||||
|
"3 0 v114809192 1.1.4 8.7 <NA> <NA> <NA> 1 \n",
|
||||||
|
"4 6 v114809193 2.1.1 97.9 <NA> <NA> <NA> 1 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 80,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 81,
|
||||||
"id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9",
|
"id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"editable": true,
|
"editable": true,
|
||||||
@@ -447,14 +691,7 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Exporting dataframe as parquet to /data/tables/output/en/43100011.parquet\n"
|
"Exporting dataframe as parquet to /data/tables/output/en/13100102.parquet\n"
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"IOStream.flush timed out\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -515,12 +752,35 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 24,
|
"execution_count": 40,
|
||||||
"id": "06fb89ad-77ba-46db-bb88-15f5636d707d",
|
"id": "06fb89ad-77ba-46db-bb88-15f5636d707d",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "NameError",
|
||||||
|
"evalue": "name 'process_cube' is not defined",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||||
|
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
||||||
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[40]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocess_cube\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33m43100011\u001b[39m\u001b[33m\"\u001b[39m)\n",
|
||||||
|
"\u001b[31mNameError\u001b[39m: name 'process_cube' is not defined"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"process_cube(\"37100216\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "9cc04f15-006f-4a3a-9610-65736820ba84",
|
||||||
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#process_cube(\"43100011\")"
|
"# This one has multiple DECIMAL precision values\n",
|
||||||
|
"process_cube(\"43100011)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
Reference in New Issue
Block a user