mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Made changes
This commit is contained in:
@@ -8,7 +8,6 @@ import zipfile
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
@@ -17,7 +16,6 @@ input_folder = f"{data_folder}/input"
|
||||
scratch_folder = f"{data_folder}/scratch"
|
||||
output_folder = f"{data_folder}/output"
|
||||
|
||||
|
||||
if not os.path.exists(f"{data_folder}/processing.db"):
|
||||
con = sqlite3.connect(f"{data_folder}/processing.db")
|
||||
cur = con.cursor()
|
||||
@@ -42,7 +40,7 @@ def setup():
|
||||
"""
|
||||
Makes data folders
|
||||
"""
|
||||
folders_to_create = [data_folder, input_folder,
|
||||
folders_to_create = [data_folder, input_folder,
|
||||
scratch_folder, output_folder,
|
||||
f"{input_folder}/en", f"{output_folder}/en",
|
||||
f"{input_folder}/fr", f"{output_folder}/fr",
|
||||
@@ -125,29 +123,62 @@ def update_tables():
|
||||
download_cube(product_id)
|
||||
process_cube(product_id)
|
||||
|
||||
def compute_ref_date_bounds(df):
|
||||
"""
|
||||
TODO: There are cases where the REF_DATE is a range, ex. 2023/2024.
|
||||
For productId 17100022 the period is from July 1 to June 30 (seen in the metadata), so can't just
|
||||
use January 1, 2023 and December 31, 2024
|
||||
"""
|
||||
series = df["REF_DATE"]
|
||||
|
||||
# Initialize the two new columns with NaT
|
||||
df["REF_START_DATE"] = pd.NaT
|
||||
df["REF_END_DATE"] = pd.NaT
|
||||
|
||||
# Skip rows that contain slashes
|
||||
valid_mask = ~series.str.contains("/", na=False)
|
||||
|
||||
# Case 1: YYYY-MM-DD
|
||||
full_mask = valid_mask & series.str.fullmatch(r"\d{4}-\d{2}-\d{2}")
|
||||
parsed_full = pd.to_datetime(series[full_mask], format="%Y-%m-%d", errors="coerce")
|
||||
df.loc[full_mask, "REF_START_DATE"] = parsed_full
|
||||
df.loc[full_mask, "REF_END_DATE"] = parsed_full
|
||||
|
||||
# Case 2: YYYY-MM
|
||||
month_mask = valid_mask & series.str.fullmatch(r"\d{4}-\d{2}")
|
||||
parsed_month = pd.to_datetime(series[month_mask], format="%Y-%m", errors="coerce")
|
||||
df.loc[month_mask, "REF_START_DATE"] = parsed_month
|
||||
df.loc[month_mask, "REF_END_DATE"] = parsed_month + pd.to_timedelta(
|
||||
parsed_month.dt.days_in_month - 1, unit='D'
|
||||
)
|
||||
|
||||
# Case 3: YYYY
|
||||
year_mask = valid_mask & series.str.fullmatch(r"\d{4}")
|
||||
parsed_year = pd.to_datetime(series[year_mask], format="%Y", errors="coerce")
|
||||
df.loc[year_mask, "REF_START_DATE"] = parsed_year
|
||||
df.loc[year_mask, "REF_END_DATE"] = parsed_year + pd.offsets.YearEnd(0)
|
||||
|
||||
# Move columns after REF_DATE
|
||||
ref_idx = df.columns.get_loc("REF_DATE")
|
||||
cols = list(df.columns)
|
||||
cols.remove("REF_START_DATE")
|
||||
cols.remove("REF_END_DATE")
|
||||
cols[ref_idx + 1:ref_idx + 1] = ["REF_START_DATE", "REF_END_DATE"]
|
||||
|
||||
return df[cols]
|
||||
|
||||
def convert_to_lowest_type(df):
|
||||
"""
|
||||
Convert columns to the best possible dtypes
|
||||
For example, if the column is numerical and has a maximum value of 32,000
|
||||
For example, if the column is numerical and has a maximum value of 32,000
|
||||
we can assign it a type of int16
|
||||
"""
|
||||
print("Converting dataframe to optimal data types")
|
||||
params = {
|
||||
'convert_string': False,
|
||||
'convert_boolean': False
|
||||
}
|
||||
df = df.convert_dtypes(**params)
|
||||
|
||||
dtypes = pd.DataFrame(df.dtypes)
|
||||
# Downcast to the smallest numerical dtype
|
||||
for row in dtypes.itertuples():
|
||||
column = row[0]
|
||||
the_type = str(row[1])
|
||||
# Skipping downcasting Float64 as there were issues with decimal places
|
||||
# For example, instead of a value being 65.4, it turned into 65.4000015258789
|
||||
if the_type == 'Float64':
|
||||
continue
|
||||
elif the_type == 'Int64':
|
||||
if the_type == 'int64':
|
||||
df[column] = pd.to_numeric(df[column], downcast='integer')
|
||||
|
||||
return df
|
||||
@@ -193,6 +224,17 @@ def download_cube(product_id, language="en"):
|
||||
progress_bar.close()
|
||||
|
||||
def process_cube(product_id, language="en"):
|
||||
"""
|
||||
Examples:
|
||||
- productId 43100011 has all with DECIMAL = 1 (float64)
|
||||
- productId 17100009 has DECIMAL = 0 (int64)
|
||||
- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)
|
||||
"""
|
||||
cur.execute("SELECT product_id FROM downloaded WHERE product_id = ?", (product_id,))
|
||||
result = cur.fetchone()
|
||||
if result:
|
||||
print(f"Already processed {product_id}")
|
||||
return
|
||||
extract_zipfile(product_id, language)
|
||||
"""
|
||||
The pandas column reader is better than the Polars one
|
||||
@@ -207,20 +249,63 @@ def process_cube(product_id, language="en"):
|
||||
# json.dump(metadata, outfile)
|
||||
# Read CSV using Pandas
|
||||
product_csv = f"{scratch_folder}/{product_id}.csv"
|
||||
print(f"Reading {product_csv}")
|
||||
parameters = {
|
||||
"engine": "c",
|
||||
"low_memory": True
|
||||
"low_memory": True,
|
||||
"nrows": 100000,
|
||||
"dtype": {}
|
||||
}
|
||||
columns = pd.read_csv(product_csv, nrows=0).columns
|
||||
|
||||
columns_always_int_8 = ["DECIMALS", "SCALAR_ID"]
|
||||
for column in columns_always_int_8:
|
||||
if column in columns:
|
||||
parameters["dtype"][column] = 'int8'
|
||||
|
||||
columns_always_int_16 = ["UOM_ID"]
|
||||
for column in columns_always_int_16:
|
||||
if column in columns:
|
||||
parameters["dtype"][column] = 'int16'
|
||||
|
||||
for column in columns:
|
||||
if column not in columns_always_int_8 and column not in columns_always_int_16 and column != "VALUE":
|
||||
parameters["dtype"][column] = 'string'
|
||||
|
||||
if not parameters["dtype"]:
|
||||
del parameters["dtype"]
|
||||
|
||||
print(f"Reading {product_csv} as a Pandas dataframe")
|
||||
df = pd.read_csv(product_csv, **parameters)
|
||||
unique_decimal_values = df["DECIMALS"].unique()
|
||||
if any(unique_decimal_values):
|
||||
"""
|
||||
A table can have both float and integer in the VALUE field.
|
||||
productId 11100025 is an example
|
||||
So if we have unique values for DECIMALS to be [0,1], then we convert to float64
|
||||
"""
|
||||
convert_dict = {"VALUE": "float64"}
|
||||
print(convert_dict)
|
||||
df = df.astype(convert_dict)
|
||||
elif 0 in (unique_decimal_values):
|
||||
if df["VALUE"].dtype != "Int64":
|
||||
# If DECIMALS = [0]
|
||||
convert_dict = {"VALUE": "Int64"}
|
||||
print(convert_dict)
|
||||
df = df.astype(convert_dict)
|
||||
|
||||
df = convert_to_lowest_type(df)
|
||||
print("Import Pandas dataframe as a Polars dataframe")
|
||||
df = pl.from_pandas(df)
|
||||
df = compute_ref_date_bounds(df)
|
||||
output_parquet = f"{output_folder}/{language}/{product_id}.parquet"
|
||||
print(f"Exporting dataframe as parquet to {output_parquet}")
|
||||
df.write_parquet(output_parquet,
|
||||
compression='zstd',
|
||||
compression_level=22)
|
||||
parameters = {
|
||||
"path": output_parquet,
|
||||
"engine": "pyarrow",
|
||||
"compression": "zstd",
|
||||
"index": False,
|
||||
"compression_level": 22
|
||||
}
|
||||
df.to_parquet(**parameters)
|
||||
# Remove the scratch files
|
||||
print("Removing scratch files")
|
||||
os.remove(f"{scratch_folder}/{product_id}.csv")
|
||||
@@ -228,6 +313,7 @@ def process_cube(product_id, language="en"):
|
||||
update_last_downloaded(product_id)
|
||||
update_last_processed(product_id)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
setup()
|
||||
files_to_process = glob.glob(f"{input_folder}/en/*.zip")
|
||||
@@ -235,5 +321,7 @@ if __name__ == '__main__':
|
||||
files_to_process = [x.split("/")[-1].split(".zip")[0] for x in files_to_process]
|
||||
to_process = len(files_to_process)
|
||||
print(f"Processing {to_process}")
|
||||
with Pool(4) as p:
|
||||
p.map(process_cube, files_to_process)
|
||||
#for product_id in files_to_process:
|
||||
# process_cube(product_id)
|
||||
with Pool(processes=16) as p:
|
||||
p.map(process_cube, files_to_process, chunksize=8)
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 20,
|
||||
"id": "98859cd6-6fa4-4aef-a113-455699524fae",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
@@ -77,7 +77,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 21,
|
||||
"id": "28ac4c01-c1c5-427f-bb2c-0da99a4c5591",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -99,7 +99,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 22,
|
||||
"id": "9daa94f4-16c9-4d8b-951e-3d5d38eb618f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -109,7 +109,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 23,
|
||||
"id": "0af9a4b3-7b59-460b-b933-504919d4bd2a",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
@@ -136,12 +136,12 @@
|
||||
" last_updated = datetime.strptime(last_updated, \"%Y-%m-%dT%H:%M\")\n",
|
||||
" last_updated = last_updated.replace(tzinfo=ZoneInfo(\"America/Toronto\"))\n",
|
||||
" last_updated = last_updated.astimezone(ZoneInfo(\"UTC\")).isoformat()\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" data = (product_id, last_updated)\n",
|
||||
" cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
||||
" result = cur.fetchone()\n",
|
||||
" if not result:\n",
|
||||
" cur.execute(\"INSERT INTO downloaded VALUES (?, ?)\", data)\n",
|
||||
" cur.execute(\"INSERT INTO downloaded (product_id, last_updated) VALUES (?, ?)\", data)\n",
|
||||
" else:\n",
|
||||
" cur.execute(\"UPDATE downloaded SET last_updated = ? WHERE product_id = ?\", (last_updated, product_id))\n",
|
||||
"\n",
|
||||
@@ -150,7 +150,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 24,
|
||||
"id": "4b7996d2-75ab-4173-a17a-64fb7ab63740",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -163,7 +163,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 25,
|
||||
"id": "23cbabc3-0d4b-4e28-a4df-b2c5e8c7ea8b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -212,7 +212,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 26,
|
||||
"id": "dc5573ef-734b-44d8-a4c4-0df19d655975",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -222,7 +222,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 27,
|
||||
"id": "eddf6501-8428-44cc-8d2d-e245803a3943",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
@@ -280,7 +280,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 28,
|
||||
"id": "e67642d3-cc6c-4c5d-b5a3-2fe18364ad71",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -304,7 +304,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 29,
|
||||
"id": "144e3716-b0e7-4a39-9a25-ededea506f4f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -352,7 +352,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 78,
|
||||
"id": "858e405e-7c02-4193-8abe-f23951761b09",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
@@ -361,9 +361,32 @@
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Reading /data/tables/scratch/13100102.csv\n",
|
||||
"Index(['REF_DATE', 'GEO', 'DGUID',\n",
|
||||
" 'North American Industry Classification System (NAICS)',\n",
|
||||
" 'Summary statistics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',\n",
|
||||
" 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED',\n",
|
||||
" 'DECIMALS'],\n",
|
||||
" dtype='object')\n",
|
||||
"{'engine': 'c', 'low_memory': True, 'nrows': 1000000, 'dtype': {'DECIMALS': 'int8', 'SCALAR_ID': 'int8', 'UOM_ID': 'int16', 'REF_DATE': 'string', 'GEO': 'string', 'DGUID': 'string', 'North American Industry Classification System (NAICS)': 'string', 'Summary statistics': 'string', 'UOM': 'string', 'SCALAR_FACTOR': 'string', 'VECTOR': 'string', 'COORDINATE': 'string', 'STATUS': 'string', 'SYMBOL': 'string', 'TERMINATED': 'string'}}\n",
|
||||
"Reading /data/tables/scratch/13100102.csv as a Pandas dataframe\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"product_id = \"43100011\"\n",
|
||||
"\"\"\"\n",
|
||||
"Examples: \n",
|
||||
"- productId 43100011 has all with DECIMAL = 1 (float64)\n",
|
||||
"- productId 17100009 has DECIMAL = 0 (int64)\n",
|
||||
"- productId 35100076 has multiple DECIMAL precisions [0, 1, 2] (int64, float64, float64)\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"product_id = \"13100102\"\n",
|
||||
"#def process_cube(product_id, language=\"en\"):\n",
|
||||
"language = \"en\"\n",
|
||||
"cur.execute(\"SELECT product_id FROM downloaded WHERE product_id = ?\", (product_id,))\n",
|
||||
@@ -385,13 +408,16 @@
|
||||
"# json.dump(metadata, outfile)\n",
|
||||
"# Read CSV using Pandas\n",
|
||||
"product_csv = f\"{scratch_folder}/{product_id}.csv\"\n",
|
||||
"print(f\"Reading {product_csv}\")\n",
|
||||
"parameters = {\n",
|
||||
" \"engine\": \"c\",\n",
|
||||
" \"low_memory\": True,\n",
|
||||
" #\"nrows\": 1000000,\n",
|
||||
" \"nrows\": 1000000,\n",
|
||||
" \"dtype\": {}\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"columns = pd.read_csv(product_csv, nrows=0).columns\n",
|
||||
"print(columns)\n",
|
||||
"\n",
|
||||
"columns_always_int_8 = [\"DECIMALS\", \"SCALAR_ID\"]\n",
|
||||
"for column in columns_always_int_8:\n",
|
||||
@@ -407,24 +433,46 @@
|
||||
" if column not in columns_always_int_8 and column not in columns_always_int_16 and column != \"VALUE\":\n",
|
||||
" parameters[\"dtype\"][column] = 'string'\n",
|
||||
"\n",
|
||||
"print(parameters)\n",
|
||||
"if not parameters[\"dtype\"]:\n",
|
||||
" del parameters[\"dtype\"]\n",
|
||||
"\n",
|
||||
"print(f\"Reading {product_csv} as a Pandas dataframe\")\n",
|
||||
"df = pd.read_csv(product_csv, **parameters)\n",
|
||||
"\n",
|
||||
"unique_decimal_values = len(df[\"DECIMALS\"].unique())\n",
|
||||
"if unique_decimal_values > 1:\n",
|
||||
"df = pd.read_csv(product_csv, **parameters)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 79,
|
||||
"id": "7579a135-1dfe-4fc0-991b-4b261d6577e0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[1]\n",
|
||||
"{'VALUE': 'float64'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"unique_decimal_values = df[\"DECIMALS\"].unique()\n",
|
||||
"print(unique_decimal_values)\n",
|
||||
"if any(unique_decimal_values):\n",
|
||||
" \"\"\"\n",
|
||||
" Example of when you can have a table with DECIMALS = 0 and DECIMALS = 1, productId 11100025\n",
|
||||
" A table can have both float and integer in the VALUE field. \n",
|
||||
" productId 11100025 is an example\n",
|
||||
" So if we have unique values for DECIMALS to be [0,1], then we convert to float64\n",
|
||||
" \"\"\"\n",
|
||||
" # Turn to float if not already\n",
|
||||
" convert_dict = {\"VALUE\": \"float64\"}\n",
|
||||
" print(convert_dict)\n",
|
||||
" df = df.astype(convert_dict)\n",
|
||||
"elif unique_decimal_values == 1:\n",
|
||||
"elif 0 in (unique_decimal_values):\n",
|
||||
" if df[\"VALUE\"].dtype != \"int64\":\n",
|
||||
" # Turn to int64 if not already\n",
|
||||
" # If DECIMALS = [0]\n",
|
||||
" convert_dict = {\"VALUE\": \"int64\"}\n",
|
||||
" print(convert_dict)\n",
|
||||
" df = df.astype(convert_dict)\n",
|
||||
"\n",
|
||||
"df = convert_to_lowest_type(df)\n",
|
||||
@@ -433,7 +481,203 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 80,
|
||||
"id": "6c2781b3-8eea-4317-a8c0-083d97ee04fc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>REF_DATE</th>\n",
|
||||
" <th>REF_START_DATE</th>\n",
|
||||
" <th>REF_END_DATE</th>\n",
|
||||
" <th>GEO</th>\n",
|
||||
" <th>DGUID</th>\n",
|
||||
" <th>North American Industry Classification System (NAICS)</th>\n",
|
||||
" <th>Summary statistics</th>\n",
|
||||
" <th>UOM</th>\n",
|
||||
" <th>UOM_ID</th>\n",
|
||||
" <th>SCALAR_FACTOR</th>\n",
|
||||
" <th>SCALAR_ID</th>\n",
|
||||
" <th>VECTOR</th>\n",
|
||||
" <th>COORDINATE</th>\n",
|
||||
" <th>VALUE</th>\n",
|
||||
" <th>STATUS</th>\n",
|
||||
" <th>SYMBOL</th>\n",
|
||||
" <th>TERMINATED</th>\n",
|
||||
" <th>DECIMALS</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2014</td>\n",
|
||||
" <td>2014-01-01</td>\n",
|
||||
" <td>2014-12-31</td>\n",
|
||||
" <td>Canada</td>\n",
|
||||
" <td>2016A000011124</td>\n",
|
||||
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||
" <td>Operating revenue</td>\n",
|
||||
" <td>Dollars</td>\n",
|
||||
" <td>81</td>\n",
|
||||
" <td>millions</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>v114809189</td>\n",
|
||||
" <td>1.1.1</td>\n",
|
||||
" <td>9310.7</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2014</td>\n",
|
||||
" <td>2014-01-01</td>\n",
|
||||
" <td>2014-12-31</td>\n",
|
||||
" <td>Canada</td>\n",
|
||||
" <td>2016A000011124</td>\n",
|
||||
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||
" <td>Operating expenses</td>\n",
|
||||
" <td>Dollars</td>\n",
|
||||
" <td>81</td>\n",
|
||||
" <td>millions</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>v114809190</td>\n",
|
||||
" <td>1.1.2</td>\n",
|
||||
" <td>8499.5</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2014</td>\n",
|
||||
" <td>2014-01-01</td>\n",
|
||||
" <td>2014-12-31</td>\n",
|
||||
" <td>Canada</td>\n",
|
||||
" <td>2016A000011124</td>\n",
|
||||
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||
" <td>Salaries, wages, commissions and benefits</td>\n",
|
||||
" <td>Dollars</td>\n",
|
||||
" <td>81</td>\n",
|
||||
" <td>millions</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>v114809191</td>\n",
|
||||
" <td>1.1.3</td>\n",
|
||||
" <td>4630.3</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2014</td>\n",
|
||||
" <td>2014-01-01</td>\n",
|
||||
" <td>2014-12-31</td>\n",
|
||||
" <td>Canada</td>\n",
|
||||
" <td>2016A000011124</td>\n",
|
||||
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||
" <td>Operating profit margin</td>\n",
|
||||
" <td>Percent</td>\n",
|
||||
" <td>239</td>\n",
|
||||
" <td>units</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>v114809192</td>\n",
|
||||
" <td>1.1.4</td>\n",
|
||||
" <td>8.7</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>2014</td>\n",
|
||||
" <td>2014-01-01</td>\n",
|
||||
" <td>2014-12-31</td>\n",
|
||||
" <td>Newfoundland and Labrador</td>\n",
|
||||
" <td>2016A000210</td>\n",
|
||||
" <td>Nursing and residential care facilities [623]</td>\n",
|
||||
" <td>Operating revenue</td>\n",
|
||||
" <td>Dollars</td>\n",
|
||||
" <td>81</td>\n",
|
||||
" <td>millions</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>v114809193</td>\n",
|
||||
" <td>2.1.1</td>\n",
|
||||
" <td>97.9</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" REF_DATE REF_START_DATE REF_END_DATE GEO \\\n",
|
||||
"0 2014 2014-01-01 2014-12-31 Canada \n",
|
||||
"1 2014 2014-01-01 2014-12-31 Canada \n",
|
||||
"2 2014 2014-01-01 2014-12-31 Canada \n",
|
||||
"3 2014 2014-01-01 2014-12-31 Canada \n",
|
||||
"4 2014 2014-01-01 2014-12-31 Newfoundland and Labrador \n",
|
||||
"\n",
|
||||
" DGUID North American Industry Classification System (NAICS) \\\n",
|
||||
"0 2016A000011124 Nursing and residential care facilities [623] \n",
|
||||
"1 2016A000011124 Nursing and residential care facilities [623] \n",
|
||||
"2 2016A000011124 Nursing and residential care facilities [623] \n",
|
||||
"3 2016A000011124 Nursing and residential care facilities [623] \n",
|
||||
"4 2016A000210 Nursing and residential care facilities [623] \n",
|
||||
"\n",
|
||||
" Summary statistics UOM UOM_ID SCALAR_FACTOR \\\n",
|
||||
"0 Operating revenue Dollars 81 millions \n",
|
||||
"1 Operating expenses Dollars 81 millions \n",
|
||||
"2 Salaries, wages, commissions and benefits Dollars 81 millions \n",
|
||||
"3 Operating profit margin Percent 239 units \n",
|
||||
"4 Operating revenue Dollars 81 millions \n",
|
||||
"\n",
|
||||
" SCALAR_ID VECTOR COORDINATE VALUE STATUS SYMBOL TERMINATED DECIMALS \n",
|
||||
"0 6 v114809189 1.1.1 9310.7 <NA> <NA> <NA> 1 \n",
|
||||
"1 6 v114809190 1.1.2 8499.5 <NA> <NA> <NA> 1 \n",
|
||||
"2 6 v114809191 1.1.3 4630.3 <NA> <NA> <NA> 1 \n",
|
||||
"3 0 v114809192 1.1.4 8.7 <NA> <NA> <NA> 1 \n",
|
||||
"4 6 v114809193 2.1.1 97.9 <NA> <NA> <NA> 1 "
|
||||
]
|
||||
},
|
||||
"execution_count": 80,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 81,
|
||||
"id": "49cc1fa3-1ac8-4510-b4fd-7827a041e4a9",
|
||||
"metadata": {
|
||||
"editable": true,
|
||||
@@ -447,14 +691,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Exporting dataframe as parquet to /data/tables/output/en/43100011.parquet\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"IOStream.flush timed out\n"
|
||||
"Exporting dataframe as parquet to /data/tables/output/en/13100102.parquet\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -515,12 +752,35 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 40,
|
||||
"id": "06fb89ad-77ba-46db-bb88-15f5636d707d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'process_cube' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[40]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mprocess_cube\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33m43100011\u001b[39m\u001b[33m\"\u001b[39m)\n",
|
||||
"\u001b[31mNameError\u001b[39m: name 'process_cube' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"process_cube(\"37100216\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9cc04f15-006f-4a3a-9610-65736820ba84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#process_cube(\"43100011\")"
|
||||
"# This one has multiple DECIMAL precision values\n",
|
||||
"process_cube(\"43100011)"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user