Convert statcan CSV into parquet

2026-06-13 14:10:55 +02:00 · 2025-06-17 20:46:24 +00:00
parent 4d16ef8232
commit ea603f2914
1 changed files with 81 additions and 0 deletions
@@ -0,0 +1,81 @@
+import glob
+from multiprocessing import Pool
+import zipfile
+
+import pandas as pd
+import polars as pl
+
+# Root of the working tree; the three folders below all live under it
+data_folder = "/data/tables"
+# Downloaded archives, laid out as <language>/<product_id>.zip
+input_folder = data_folder + "/input"
+# Where archives are unpacked so the CSV can be read from disk
+scratch_folder = data_folder + "/scratch"
+# Converted tables, laid out as <language>/<product_id>.parquet
+output_folder = data_folder + "/output"
+
+def convert_to_lowest_type(df):
+    """
+    Return ``df`` with its integer columns stored in the narrowest dtype
+
+    The frame is first passed through ``convert_dtypes`` (string and boolean
+    inference switched off), then every column that ended up as ``Int64`` is
+    downcast with ``pd.to_numeric``. For example, a column whose largest
+    value is 32,000 fits in int16.
+    """
+    print("Converting dataframe to optimal data types")
+    df = df.convert_dtypes(convert_string=False, convert_boolean=False)
+
+    # Float64 columns are deliberately left alone: downcasting them caused
+    # issues with decimal places (65.4 turned into 65.4000015258789)
+    integer_columns = [
+        name for name, dtype in df.dtypes.items() if str(dtype) == 'Int64'
+    ]
+    for name in integer_columns:
+        df[name] = pd.to_numeric(df[name], downcast='integer')
+
+    return df
+
+def extract_zipfile(product_id, language):
+    """
+    Unpack the archive of ``product_id`` for ``language`` into the scratch folder
+
+    Extracting to disk and reading the CSV from there is faster than
+    opening the CSV through zipfile and handing it to Pandas
+    """
+    zip_file = f"{input_folder}/{language}/{product_id}.zip"
+    archive = zipfile.ZipFile(zip_file)
+    with archive:
+        print(f"Extracting {zip_file} to {scratch_folder}")
+        archive.extractall(path=scratch_folder)
+
+
+def process_table(product_id, language="en"):
+    """
+    Convert the zipped CSV of one table into a zstd-compressed parquet file
+
+    The archive is extracted to the scratch folder, the CSV is read with
+    Pandas, its integer columns are downcast and the result is written
+    through Polars to ``{output_folder}/{language}/{product_id}.parquet``.
+
+    product_id: name of the archive (without ``.zip``) and of the CSV in it
+    language: sub-folder of the input and output folders, "en" by default
+
+    The pandas column reader is better than the Polars one
+
+    Here is an example where polars was not reading it right:
+    https://www150.statcan.gc.ca/n1/tbl/csv/98100404-eng.zip
+    """
+    extract_zipfile(product_id, language)
+    product_csv = f"{scratch_folder}/{product_id}.csv"
+    parameters = {
+        "engine": "pyarrow"
+    }
+    print(f"Reading {product_csv} as a Pandas dataframe")
+    df = pd.read_csv(product_csv, **parameters)
+    df = convert_to_lowest_type(df)
+    print("Import Pandas dataframe as a Polars dataframe")
+    df = pl.from_pandas(df)
+    output_parquet = f"{output_folder}/{language}/{product_id}.parquet"
+    print(f"Exporting dataframe as parquet to {output_parquet}")
+    # Write to the very path that was just logged; 22 is the highest zstd
+    # level, trading write time for the smallest file
+    df.write_parquet(output_parquet,
+                     compression='zstd',
+                     compression_level=22)
+
+if __name__ == '__main__':
+    # Every English archive in the input folder is one table to convert
+    archives = glob.glob(f"{input_folder}/en/*.zip")
+    # The product_id is the archive's file name up to ".zip"
+    product_ids = []
+    for archive in archives:
+        file_name = archive.split("/")[-1]
+        product_ids.append(file_name.split(".zip")[0])
+    # One table per task, spread over a pool of worker processes
+    with Pool() as workers:
+        workers.map(process_table, product_ids)