mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Convert statcan CSV into parquet
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
import glob
|
||||
from multiprocessing import Pool
|
||||
import zipfile
|
||||
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
|
||||
data_folder = "/data/tables"
|
||||
input_folder = f"{data_folder}/input"
|
||||
scratch_folder = f"{data_folder}/scratch"
|
||||
output_folder = f"{data_folder}/output"
|
||||
|
||||
def convert_to_lowest_type(df):
|
||||
"""
|
||||
Convert columns to the best possible dtypes
|
||||
For example, if the column is numerical and has a maximum value of 32,000
|
||||
we can assign it a type of int16
|
||||
"""
|
||||
print("Converting dataframe to optimal data types")
|
||||
params = {
|
||||
'convert_string': False,
|
||||
'convert_boolean': False
|
||||
}
|
||||
df = df.convert_dtypes(**params)
|
||||
|
||||
dtypes = pd.DataFrame(df.dtypes)
|
||||
|
||||
# Downcast to the smallest numerical dtype
|
||||
for row in dtypes.itertuples():
|
||||
column = row[0]
|
||||
the_type = str(row[1])
|
||||
|
||||
# Skipping downcasting Float64 as there were issues with decimal places
|
||||
# For example, instead of a value being 65.4, it turned into 65.4000015258789
|
||||
if the_type == 'Float64':
|
||||
continue
|
||||
elif the_type == 'Int64':
|
||||
df[column] = pd.to_numeric(df[column], downcast='integer')
|
||||
|
||||
return df
|
||||
|
||||
def extract_zipfile(product_id, language):
|
||||
"""
|
||||
It is faster to extract the zip file and read the CSV, than open
|
||||
via zipfile and then Pandas
|
||||
"""
|
||||
zip_file = f"{input_folder}/{language}/{product_id}.zip"
|
||||
with zipfile.ZipFile(zip_file) as myzip:
|
||||
print(f"Extracting {zip_file} to {scratch_folder}")
|
||||
myzip.extractall(path=scratch_folder)
|
||||
|
||||
|
||||
def process_table(product_id, language="en"):
|
||||
extract_zipfile(product_id, language)
|
||||
"""
|
||||
The pandas column reader is better than the Polars one
|
||||
|
||||
Here is an example where polars was not reading it right:
|
||||
https://www150.statcan.gc.ca/n1/tbl/csv/98100404-eng.zip
|
||||
"""
|
||||
product_csv = f"{scratch_folder}/{product_id}.csv"
|
||||
parameters = {
|
||||
"engine": "pyarrow"
|
||||
}
|
||||
print(f"Reading {product_csv} as a Pandas dataframe")
|
||||
df = pd.read_csv(product_csv, **parameters)
|
||||
df = convert_to_lowest_type(df)
|
||||
print("Import Pandas dataframe as a Polars dataframe")
|
||||
df = pl.from_pandas(df)
|
||||
output_parquet = f"{output_folder}/{language}/{product_id}.parquet"
|
||||
print(f"Exporting dataframe as parquet to {output_parquet}")
|
||||
df.write_parquet(f"{output_folder}/{language}/{product_id}.parquet",
|
||||
compression='zstd',
|
||||
compression_level=22)
|
||||
|
||||
if __name__ == '__main__':
|
||||
files_to_process = glob.glob(f"{input_folder}/en/*.zip")
|
||||
# Get the product_id
|
||||
files_to_process = [x.split("/")[-1].split(".zip")[0] for x in files_to_process]
|
||||
with Pool() as p:
|
||||
p.map(process_table, files_to_process)
|
||||
Reference in New Issue
Block a user