mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-13 14:10:55 +02:00
Calculate CSV file size by viewing inside of zip file
This commit is contained in:
@@ -0,0 +1,152 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "b410f6f2-2ce8-4eaa-9c25-03e076a9d996",
|
||||||
|
"metadata": {
|
||||||
|
"editable": true,
|
||||||
|
"slideshow": {
|
||||||
|
"slide_type": ""
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Buckaroo has been enabled as the default DataFrame viewer. To return to default dataframe visualization use `from buckaroo import disable; disable()`\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import glob\n",
|
||||||
|
"import zipfile\n",
|
||||||
|
"\n",
|
||||||
|
"import buckaroo\n",
|
||||||
|
"import polars as pl"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "6e350e77-2e82-4f9c-9f9c-a67297c23126",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"zip_files = glob.glob(\"/data/tables/input/en/*.zip\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"id": "0c7532d5-c8dc-4b93-900b-0e43d0718afd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"file_sizes = {}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 30,
|
||||||
|
"id": "da44a381-759b-443b-9440-c81c6cae3108",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def calculate_file_size(filepath):\n",
|
||||||
|
" with zipfile.ZipFile(filepath, 'r') as zip:\n",
|
||||||
|
" for info in zip.infolist():\n",
|
||||||
|
" if 'MetaData' in info.filename:\n",
|
||||||
|
" continue\n",
|
||||||
|
" size_gb = round(info.file_size / (1024 ** 3), 2)\n",
|
||||||
|
" file_sizes[info.filename] = size_gb"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 31,
|
||||||
|
"id": "16409ae7-e62e-4d32-a1f2-42300bbeb80f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"for filepath in zip_files:\n",
|
||||||
|
" try:\n",
|
||||||
|
" calculate_file_size(filepath)\n",
|
||||||
|
" except Exception:\n",
|
||||||
|
" print(f\"Failed to process {filepath}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 32,
|
||||||
|
"id": "f228535d-7645-447a-ad80-0367d72102a2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#sorted(file_sizes.items(), key=lambda item: item[1], reverse=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"id": "08d51b9e-08f6-4f4c-b0e9-22142987bdf3",
|
||||||
|
"metadata": {
|
||||||
|
"editable": true,
|
||||||
|
"slideshow": {
|
||||||
|
"slide_type": ""
|
||||||
|
},
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "7e162d24346c48dcbfc550b76cdbdd93",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 1
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"PolarsBuckarooWidget(buckaroo_options={'sampled': ['random'], 'auto_clean': ['aggressive', 'conservative'], 'p…"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"pl.DataFrame(list(file_sizes.items()), strict=False, orient='row', schema={\"filename\": pl.String, \"filesize\": pl.Float64})\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a358e8d9-a914-4dfa-9992-c99f07205def",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user