diff --git a/experiments/statcan_products/csv_file_size_inside_zip.ipynb b/experiments/statcan_products/csv_file_size_inside_zip.ipynb new file mode 100644 index 0000000..5c2af30 --- /dev/null +++ b/experiments/statcan_products/csv_file_size_inside_zip.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "b410f6f2-2ce8-4eaa-9c25-03e076a9d996", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n" + ] + } + ], + "source": [ + "import glob\n", + "import zipfile\n", + "\n", + "import buckaroo\n", + "import polars as pl" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6e350e77-2e82-4f9c-9f9c-a67297c23126", + "metadata": {}, + "outputs": [], + "source": [ + "zip_files = glob.glob(\"/data/tables/input/en/*.zip\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "0c7532d5-c8dc-4b93-900b-0e43d0718afd", + "metadata": {}, + "outputs": [], + "source": [ + "file_sizes = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "da44a381-759b-443b-9440-c81c6cae3108", + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_file_size(filepath):\n", + " with zipfile.ZipFile(filepath, 'r') as zip:\n", + " for info in zip.infolist():\n", + " if 'MetaData' in info.filename:\n", + " continue\n", + " size_gb = round(info.file_size / (1024 ** 3), 2)\n", + " file_sizes[info.filename] = size_gb" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "16409ae7-e62e-4d32-a1f2-42300bbeb80f", + "metadata": {}, + "outputs": [], + "source": [ + "for filepath in zip_files:\n", + " try:\n", + " calculate_file_size(filepath)\n", + " except Exception:\n", + " print(f\"Failed to process {filepath}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f228535d-7645-447a-ad80-0367d72102a2", + "metadata": {}, + "outputs": [], + "source": [ + "#sorted(file_sizes.items(), key=lambda item: item[1], reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "08d51b9e-08f6-4f4c-b0e9-22142987bdf3", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7e162d24346c48dcbfc550b76cdbdd93", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pl.DataFrame(list(file_sizes.items()), strict=False, orient='row', schema={\"filename\": pl.String, \"filesize\": pl.Float64})\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a358e8d9-a914-4dfa-9992-c99f07205def", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}