Update Python CLI to call the Cloudflare HTTP ingestor. TODO: For single-part uploads, return the part size and number of parts. More descriptive error logging.

This commit is contained in:
Diego Ripley
2026-03-13 07:18:36 -04:00
parent d4484b665f
commit 422cff5273
5 changed files with 318 additions and 63 deletions
+24 -7
View File
@@ -15,7 +15,7 @@ uv run d4c-http-ingestor \
--auth-token "$D4C_INGESTOR_AUTH_TOKEN" \ --auth-token "$D4C_INGESTOR_AUTH_TOKEN" \
--db ca-qc_government_and_municipalities_of_quebec-2026A000224_d4c-datapkg-orthoimagery_orthorectified_imagery_from_quebec.sqlite \ --db ca-qc_government_and_municipalities_of_quebec-2026A000224_d4c-datapkg-orthoimagery_orthorectified_imagery_from_quebec.sqlite \
--key-prefix dataforcanada/d4c-datapkg-orthoimagery/archive/ca-qc_government_and_municipalities_of_quebec-2026A000224_d4c-datapkg-orthoimagery_orthorectified_imagery_from_quebec \ --key-prefix dataforcanada/d4c-datapkg-orthoimagery/archive/ca-qc_government_and_municipalities_of_quebec-2026A000224_d4c-datapkg-orthoimagery_orthorectified_imagery_from_quebec \
--out parquet/ \ --out ca-qc_government_and_municipalities_of_quebec-2026A000224_d4c-datapkg-orthoimagery_orthorectified_imagery_from_quebec \
--concurrency 12 --concurrency 12
``` ```
@@ -26,7 +26,7 @@ The auth token can also be set via the `D4C_INGESTOR_AUTH_TOKEN` environment var
``` ```
usage: d4c-http-ingestor [-h] --urls URLS --dataset-id DATASET_ID usage: d4c-http-ingestor [-h] --urls URLS --dataset-id DATASET_ID
[--worker-url WORKER_URL] [--auth-token AUTH_TOKEN] [--worker-url WORKER_URL] [--auth-token AUTH_TOKEN]
--db DB [--key-prefix KEY_PREFIX] [--out OUT] --db DB [--key-prefix KEY_PREFIX] --out OUT
[--concurrency CONCURRENCY] [--timeout TIMEOUT] [--concurrency CONCURRENCY] [--timeout TIMEOUT]
[--max-retries MAX_RETRIES] [--max-retries MAX_RETRIES]
[--resume | --no-resume] [--force-refresh] [--resume | --no-resume] [--force-refresh]
@@ -40,7 +40,7 @@ usage: d4c-http-ingestor [-h] --urls URLS --dataset-id DATASET_ID
| `--auth-token` | `$D4C_INGESTOR_AUTH_TOKEN` | Bearer token for the worker | | `--auth-token` | `$D4C_INGESTOR_AUTH_TOKEN` | Bearer token for the worker |
| `--db` | *(required)* | Path to the SQLite database file | | `--db` | *(required)* | Path to the SQLite database file |
| `--key-prefix` | `""` | S3 key prefix passed to the worker | | `--key-prefix` | `""` | S3 key prefix passed to the worker |
| `--out` | `parquet/` | Output directory for the Parquet artifact | | `--out` | *(required)* | Parquet output filename stem (e.g. `my-dataset` → `my-dataset.parquet`) |
| `--concurrency` | `12` | Maximum concurrent worker requests | | `--concurrency` | `12` | Maximum concurrent worker requests |
| `--timeout` | `600` | Per-request timeout in seconds | | `--timeout` | `600` | Per-request timeout in seconds |
| `--max-retries` | `3` | Maximum retry attempts per URL on failure | | `--max-retries` | `3` | Maximum retry attempts per URL on failure |
@@ -61,9 +61,10 @@ usage: d4c-http-ingestor [-h] --urls URLS --dataset-id DATASET_ID
"key_prefix": "<key-prefix>" "key_prefix": "<key-prefix>"
} }
``` ```
6. Persists each result (success/failed) to SQLite with idempotent upsert. 6. Persists each result (success/failed) to SQLite with idempotent upsert, including the ETag, multipart info, and timestamps returned by the worker.
7. Failed URLs are retried with exponential backoff + jitter (up to `--max-retries`). 7. Failed URLs are retried with exponential backoff + jitter (up to `--max-retries`).
8. On completion, exports the full `downloads` table to `parquet/downloads.parquet`. 8. **Every 100 successful downloads**, the full `downloads` table is exported to `{out}.parquet` and uploaded to S3 via the worker's PUT endpoint at the `--key-prefix` location.
9. On completion, a final Parquet export + upload is performed.
Re-runs append new datasets to, or update existing rows in, the Parquet dataset. Re-runs append new datasets to, or update existing rows in, the Parquet dataset.
@@ -77,9 +78,12 @@ CREATE TABLE IF NOT EXISTS downloads (
dataset_id TEXT NOT NULL, dataset_id TEXT NOT NULL,
status TEXT NOT NULL, -- success | failed | skipped status TEXT NOT NULL, -- success | failed | skipped
http_status INTEGER, http_status INTEGER,
etag TEXT,
error TEXT, error TEXT,
started_at TEXT NOT NULL, started_at TEXT NOT NULL,
finished_at TEXT finished_at TEXT,
multipart_part_size INTEGER,
multipart_number_parts INTEGER
); );
CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id); CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id);
CREATE INDEX IF NOT EXISTS ix_downloads_status ON downloads(status); CREATE INDEX IF NOT EXISTS ix_downloads_status ON downloads(status);
@@ -87,7 +91,20 @@ CREATE INDEX IF NOT EXISTS ix_downloads_status ON downloads(status);
### Parquet columns ### Parquet columns
Mirrors the SQLite schema exactly. | Column | Arrow Type | Description |
|--------|-----------|-------------|
| `url` | `string` | Source download URL |
| `dataset_id` | `string` | Logical dataset identifier |
| `status` | `string` | `success`, `failed`, or `skipped` |
| `http_status` | `int32` | HTTP status code from the worker |
| `etag` | `string` | S3 ETag of the uploaded object (quotes stripped) |
| `error` | `string` | Error message (if failed) |
| `started_at` | `timestamp[us, tz=UTC]` | When the worker started processing (from worker response) |
| `finished_at` | `timestamp[us, tz=UTC]` | When the worker finished processing (from worker response) |
| `multipart_part_size` | `int32` | S3 multipart part size in bytes (if multipart was used) |
| `multipart_number_parts` | `int32` | Number of parts uploaded (if multipart was used) |
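> Note: `started_at` and `finished_at` are stored as ISO-8601 text in SQLite but converted to proper Arrow timestamps in the Parquet output. These values normally come from the Cloudflare worker response, not the Python CLI. The exception is a request that fails before any response is received (timeout or transport error): in that case `started_at` is the CLI's own request start time and `finished_at` is left empty.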
## Dependencies ## Dependencies
@@ -10,7 +10,6 @@ import argparse
import asyncio import asyncio
import os import os
import sys import sys
from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
import httpx import httpx
@@ -26,18 +25,19 @@ from rich.progress import (
from d4c_http_ingestor.db import DownloadRow, DownloadsDB from d4c_http_ingestor.db import DownloadRow, DownloadsDB
from d4c_http_ingestor.parquet import export_parquet from d4c_http_ingestor.parquet import export_parquet
from d4c_http_ingestor.worker import call_worker_with_retries from d4c_http_ingestor.worker import call_worker_with_retries, upload_file_to_worker
console = Console() console = Console()
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Helpers # Constants
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
EXPORT_EVERY_N = 100 # Export + upload Parquet every N successful downloads
def _utcnow() -> str: # ---------------------------------------------------------------------------
"""Return the current UTC time as an ISO-8601 string.""" # Helpers
return datetime.now(timezone.utc).isoformat() # ---------------------------------------------------------------------------
def _read_urls(path: str) -> list[str]: def _read_urls(path: str) -> list[str]:
@@ -56,6 +56,50 @@ def _read_urls(path: str) -> list[str]:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
async def _export_and_upload(
    db: DownloadsDB,
    client: httpx.AsyncClient,
    *,
    out_stem: str,
    worker_url: str,
    auth_token: str,
    key_prefix: str,
) -> None:
    """Export the SQLite DB to Parquet and upload it to S3 via the worker.

    Does nothing when the database has no rows.  Otherwise every row is
    written with :func:`export_parquet` and the resulting file is sent with
    :func:`upload_file_to_worker`.  The outcome of the upload is printed to
    the console; a failed upload is reported but not raised.

    Args:
        db: Database whose rows are exported.
        client: Shared HTTP client used for the upload request.
        out_stem: Filename stem handed to :func:`export_parquet`.
        worker_url: URL of the worker endpoint receiving the upload.
        auth_token: Bearer token for the worker.
        key_prefix: S3 key prefix under which the Parquet file is stored;
            trailing slashes are ignored and an empty prefix places the
            file at the bucket root.
    """
    rows = db.all_rows()
    if not rows:
        return

    parquet_path = export_parquet(rows, out_stem)
    parquet_filename = parquet_path.name

    # Normalise the prefix *before* testing it for emptiness: a prefix made
    # only of slashes (e.g. "/") must behave like no prefix at all instead
    # of producing an object key that starts with "/".
    prefix = key_prefix.rstrip("/")
    s3_key = f"{prefix}/{parquet_filename}" if prefix else parquet_filename

    console.print(
        f" [cyan]Uploading[/] {parquet_filename} → s3://…/{s3_key}"
    )

    result = await upload_file_to_worker(
        client,
        worker_url=worker_url,
        auth_token=auth_token,
        file_path=parquet_path,
        s3_key=s3_key,
        content_type="application/vnd.apache.parquet",
    )

    if result.ok:
        console.print(
            f" [green]✓[/] Parquet uploaded ({len(rows)} rows, "
            f"{parquet_path.stat().st_size:,} bytes)"
        )
    else:
        console.print(
            f" [red]✗[/] Parquet upload failed: {result.error}"
        )
async def _process_urls( async def _process_urls(
urls: list[str], urls: list[str],
*, *,
@@ -64,18 +108,45 @@ async def _process_urls(
worker_url: str, worker_url: str,
auth_token: str, auth_token: str,
key_prefix: str, key_prefix: str,
out_stem: str,
concurrency: int, concurrency: int,
timeout: float, timeout: float,
max_retries: int, max_retries: int,
progress: Progress, progress: Progress,
task_id: int, task_id: int,
) -> None: ) -> None:
"""Submit *urls* to the worker with bounded concurrency.""" """Submit *urls* to the worker with bounded concurrency.
sem = asyncio.Semaphore(concurrency)
Uses a fixed-size worker pool so that exactly *concurrency* requests
are in-flight at any time. As soon as one request completes, the
next URL is picked up immediately — no idle slots.
Every :data:`EXPORT_EVERY_N` successful downloads, the SQLite database
is exported to Parquet and uploaded to S3 via the worker's PUT endpoint.
"""
queue: asyncio.Queue[str | None] = asyncio.Queue()
# Seed the queue with every URL to process.
for url in urls:
queue.put_nowait(url)
# Sentinel values — one per worker — so each worker knows when to stop.
for _ in range(concurrency):
queue.put_nowait(None)
# Shared mutable state protected by a lock.
success_count = 0
export_lock = asyncio.Lock()
async def _worker(client: httpx.AsyncClient) -> None:
"""Pull URLs from the queue until a ``None`` sentinel is received."""
nonlocal success_count
while True:
url = await queue.get()
if url is None:
return
async def _handle(client: httpx.AsyncClient, url: str) -> None:
async with sem:
started = _utcnow()
user_agent = f"Data for Canada - {dataset_id}" user_agent = f"Data for Canada - {dataset_id}"
result = await call_worker_with_retries( result = await call_worker_with_retries(
@@ -89,27 +160,59 @@ async def _process_urls(
max_retries=max_retries, max_retries=max_retries,
) )
finished = _utcnow() # Use started_at/finished_at from the worker response
row = DownloadRow( row = DownloadRow(
url=url, url=url,
dataset_id=dataset_id, dataset_id=dataset_id,
status="success" if result.ok else "failed", status="success" if result.ok else "failed",
http_status=result.http_status, http_status=result.http_status,
etag=result.etag,
error=result.error, error=result.error,
started_at=started, started_at=result.started_at or "",
finished_at=finished, finished_at=result.finished_at,
multipart_part_size=result.multipart_part_size,
multipart_number_parts=result.multipart_number_parts,
) )
db.upsert(row) db.upsert(row)
progress.advance(task_id) progress.advance(task_id)
# Use a single shared httpx client with generous limits # Periodic Parquet export + upload every N successes
if result.ok:
async with export_lock:
success_count += 1
if success_count % EXPORT_EVERY_N == 0:
console.print(
f"\n [yellow]Checkpoint[/]: {success_count} "
f"successes — exporting Parquet…"
)
await _export_and_upload(
db,
client,
out_stem=out_stem,
worker_url=worker_url,
auth_token=auth_token,
key_prefix=key_prefix,
)
# Use a single shared httpx client with generous limits.
limits = httpx.Limits( limits = httpx.Limits(
max_connections=concurrency + 4, max_connections=concurrency + 4,
max_keepalive_connections=concurrency, max_keepalive_connections=concurrency,
) )
async with httpx.AsyncClient(limits=limits, follow_redirects=True) as client: async with httpx.AsyncClient(limits=limits, follow_redirects=True) as client:
tasks = [asyncio.create_task(_handle(client, u)) for u in urls] workers = [asyncio.create_task(_worker(client)) for _ in range(concurrency)]
await asyncio.gather(*tasks) await asyncio.gather(*workers)
# Final export + upload after all URLs are processed
console.print("\n [yellow]Final export[/]: exporting Parquet…")
await _export_and_upload(
db,
client,
out_stem=out_stem,
worker_url=worker_url,
auth_token=auth_token,
key_prefix=key_prefix,
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -154,8 +257,12 @@ def build_parser() -> argparse.ArgumentParser:
) )
p.add_argument( p.add_argument(
"--out", "--out",
default="parquet/", required=True,
help="Output directory for the Parquet artifact (default: parquet/).", help=(
"Parquet output filename stem. For example, "
"'--out my-dataset' creates 'my-dataset.parquet' in the "
"current working directory."
),
) )
p.add_argument( p.add_argument(
"--concurrency", "--concurrency",
@@ -252,6 +359,7 @@ def main(argv: list[str] | None = None) -> None:
worker_url=args.worker_url, worker_url=args.worker_url,
auth_token=args.auth_token, auth_token=args.auth_token,
key_prefix=args.key_prefix, key_prefix=args.key_prefix,
out_stem=args.out,
concurrency=args.concurrency, concurrency=args.concurrency,
timeout=args.timeout, timeout=args.timeout,
max_retries=args.max_retries, max_retries=args.max_retries,
@@ -269,7 +377,7 @@ def main(argv: list[str] | None = None) -> None:
) )
console.print(f" [{colour}]{status}[/]: {cnt}") console.print(f" [{colour}]{status}[/]: {cnt}")
# -- Export Parquet -------------------------------------------------- # -- Final local Parquet export (no upload — already done above) -----
rows = db.all_rows() rows = db.all_rows()
if rows: if rows:
dest = export_parquet(rows, args.out) dest = export_parquet(rows, args.out)
@@ -19,9 +19,12 @@ class DownloadRow:
dataset_id: str dataset_id: str
status: str # success | failed | skipped status: str # success | failed | skipped
http_status: int | None = None http_status: int | None = None
etag: str | None = None
error: str | None = None error: str | None = None
started_at: str = "" started_at: str = ""
finished_at: str | None = None finished_at: str | None = None
multipart_part_size: int | None = None
multipart_number_parts: int | None = None
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -34,9 +37,12 @@ CREATE TABLE IF NOT EXISTS downloads (
dataset_id TEXT NOT NULL, dataset_id TEXT NOT NULL,
status TEXT NOT NULL, status TEXT NOT NULL,
http_status INTEGER, http_status INTEGER,
etag TEXT,
error TEXT, error TEXT,
started_at TEXT NOT NULL, started_at TEXT NOT NULL,
finished_at TEXT finished_at TEXT,
multipart_part_size INTEGER,
multipart_number_parts INTEGER
); );
CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id); CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id);
CREATE INDEX IF NOT EXISTS ix_downloads_status ON downloads(status); CREATE INDEX IF NOT EXISTS ix_downloads_status ON downloads(status);
@@ -82,21 +88,39 @@ class DownloadsDB:
) )
return [r["url"] for r in cur.fetchall()] return [r["url"] for r in cur.fetchall()]
def count_successful(self) -> int:
"""Return the number of rows in ``downloads`` whose status is ``success``."""
cur = self._conn.execute(
"SELECT COUNT(*) AS cnt FROM downloads WHERE status = 'success'"
)
# The aggregate is aliased to ``cnt`` and read back from the row by name.
return cur.fetchone()["cnt"]
# -- mutations ----------------------------------------------------------- # -- mutations -----------------------------------------------------------
def upsert(self, row: DownloadRow) -> None: def upsert(self, row: DownloadRow) -> None:
"""Insert or replace a download row.""" """Insert or replace a download row."""
self._conn.execute( self._conn.execute(
"""\ """\
INSERT INTO downloads (url, dataset_id, status, http_status, error, started_at, finished_at) INSERT INTO downloads (
VALUES (:url, :dataset_id, :status, :http_status, :error, :started_at, :finished_at) url, dataset_id, status, http_status, etag, error,
started_at, finished_at,
multipart_part_size, multipart_number_parts
)
VALUES (
:url, :dataset_id, :status, :http_status, :etag, :error,
:started_at, :finished_at,
:multipart_part_size, :multipart_number_parts
)
ON CONFLICT(url) DO UPDATE SET ON CONFLICT(url) DO UPDATE SET
dataset_id = excluded.dataset_id, dataset_id = excluded.dataset_id,
status = excluded.status, status = excluded.status,
http_status = excluded.http_status, http_status = excluded.http_status,
etag = excluded.etag,
error = excluded.error, error = excluded.error,
started_at = excluded.started_at, started_at = excluded.started_at,
finished_at = excluded.finished_at finished_at = excluded.finished_at,
multipart_part_size = excluded.multipart_part_size,
multipart_number_parts = excluded.multipart_number_parts
""", """,
row.__dict__, row.__dict__,
) )
@@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
from datetime import datetime
from pathlib import Path from pathlib import Path
import pyarrow as pa import pyarrow as pa
@@ -9,6 +10,18 @@ import pyarrow.parquet as pq
from d4c_http_ingestor.db import DownloadRow from d4c_http_ingestor.db import DownloadRow
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _parse_iso(s: str | None) -> datetime | None:
    """Parse an ISO-8601 string into a datetime, or return None.

    Args:
        s: ISO-8601 timestamp text.  ``None`` and the empty string both
            mean "no timestamp".

    Returns:
        The parsed :class:`datetime`, or ``None`` when *s* is empty.  The
        result is timezone-aware only when *s* carries an offset or a
        trailing ``Z``.

    Raises:
        ValueError: If *s* is non-empty but not valid ISO-8601.
    """
    if not s:
        return None
    # ``datetime.fromisoformat`` only accepts the "Z" UTC designator from
    # Python 3.11 onwards, yet it is a common way to write UTC timestamps
    # (e.g. JavaScript's ``Date.toISOString()``).  Rewrite it as an explicit
    # offset so parsing works on every supported interpreter.
    if s.endswith(("Z", "z")):
        s = f"{s[:-1]}+00:00"
    return datetime.fromisoformat(s)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Schema mirrors the SQLite ``downloads`` table # Schema mirrors the SQLite ``downloads`` table
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -19,9 +32,12 @@ _ARROW_SCHEMA = pa.schema(
pa.field("dataset_id", pa.string(), nullable=False), pa.field("dataset_id", pa.string(), nullable=False),
pa.field("status", pa.string(), nullable=False), pa.field("status", pa.string(), nullable=False),
pa.field("http_status", pa.int32(), nullable=True), pa.field("http_status", pa.int32(), nullable=True),
pa.field("etag", pa.string(), nullable=True),
pa.field("error", pa.string(), nullable=True), pa.field("error", pa.string(), nullable=True),
pa.field("started_at", pa.string(), nullable=False), pa.field("started_at", pa.timestamp("us", tz="UTC"), nullable=True),
pa.field("finished_at", pa.string(), nullable=True), pa.field("finished_at", pa.timestamp("us", tz="UTC"), nullable=True),
pa.field("multipart_part_size", pa.int32(), nullable=True),
pa.field("multipart_number_parts", pa.int32(), nullable=True),
] ]
) )
@@ -33,25 +49,31 @@ def rows_to_table(rows: list[DownloadRow]) -> pa.Table:
pa.array([r.dataset_id for r in rows], type=pa.string()), pa.array([r.dataset_id for r in rows], type=pa.string()),
pa.array([r.status for r in rows], type=pa.string()), pa.array([r.status for r in rows], type=pa.string()),
pa.array([r.http_status for r in rows], type=pa.int32()), pa.array([r.http_status for r in rows], type=pa.int32()),
pa.array([r.etag for r in rows], type=pa.string()),
pa.array([r.error for r in rows], type=pa.string()), pa.array([r.error for r in rows], type=pa.string()),
pa.array([r.started_at for r in rows], type=pa.string()), pa.array(
pa.array([r.finished_at for r in rows], type=pa.string()), [_parse_iso(r.started_at) for r in rows],
type=pa.timestamp("us", tz="UTC"),
),
pa.array(
[_parse_iso(r.finished_at) for r in rows],
type=pa.timestamp("us", tz="UTC"),
),
pa.array([r.multipart_part_size for r in rows], type=pa.int32()),
pa.array([r.multipart_number_parts for r in rows], type=pa.int32()),
] ]
return pa.table(arrays, schema=_ARROW_SCHEMA) return pa.table(arrays, schema=_ARROW_SCHEMA)
def export_parquet(rows: list[DownloadRow], out_dir: str | Path) -> Path: def export_parquet(rows: list[DownloadRow], out_stem: str) -> Path:
"""Write *rows* as a single Parquet file inside *out_dir*. """Write *rows* as a Parquet file named ``{out_stem}.parquet`` in the CWD.
The file is named ``downloads.parquet`` and is overwritten on each run so The file is overwritten on each call so that re-runs always reflect the
that re-runs always reflect the latest state of the SQLite database. latest state of the SQLite database.
Returns the path to the written file. Returns the path to the written file.
""" """
out_dir = Path(out_dir) dest = Path(f"{out_stem}.parquet")
out_dir.mkdir(parents=True, exist_ok=True)
dest = out_dir / "downloads.parquet"
table = rows_to_table(rows) table = rows_to_table(rows)
pq.write_table(table, dest, compression="zstd") pq.write_table(table, dest, compression="zstd")
return dest return dest
@@ -5,6 +5,8 @@ from __future__ import annotations
import asyncio import asyncio
import random import random
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any from typing import Any
import httpx import httpx
@@ -21,7 +23,12 @@ class WorkerResult:
key: str | None = None key: str | None = None
content_type: str | None = None content_type: str | None = None
size_bytes: int | None = None size_bytes: int | None = None
etag: str | None = None
error: str | None = None error: str | None = None
multipart_part_size: int | None = None
multipart_number_parts: int | None = None
started_at: str | None = None
finished_at: str | None = None
async def call_worker( async def call_worker(
@@ -49,6 +56,8 @@ async def call_worker(
"Content-Type": "application/json", "Content-Type": "application/json",
} }
started = datetime.now(timezone.utc).isoformat()
try: try:
resp = await client.post( resp = await client.post(
worker_url, worker_url,
@@ -67,6 +76,11 @@ async def call_worker(
key=body.get("key"), key=body.get("key"),
content_type=body.get("content_type"), content_type=body.get("content_type"),
size_bytes=body.get("size_bytes"), size_bytes=body.get("size_bytes"),
etag=body.get("etag"),
multipart_part_size=body.get("multipart_part_size"),
multipart_number_parts=body.get("multipart_number_parts"),
started_at=body.get("started_at"),
finished_at=body.get("finished_at"),
) )
else: else:
return WorkerResult( return WorkerResult(
@@ -74,13 +88,15 @@ async def call_worker(
ok=False, ok=False,
http_status=resp.status_code, http_status=resp.status_code,
error=body.get("error", resp.text), error=body.get("error", resp.text),
started_at=body.get("started_at"),
finished_at=body.get("finished_at"),
) )
except httpx.TimeoutException as exc: except httpx.TimeoutException as exc:
return WorkerResult(url=download_url, ok=False, error=f"Timeout: {exc}") return WorkerResult(url=download_url, ok=False, error=f"Timeout: {exc}", started_at=started)
except httpx.HTTPError as exc: except httpx.HTTPError as exc:
return WorkerResult(url=download_url, ok=False, error=f"HTTP error: {exc}") return WorkerResult(url=download_url, ok=False, error=f"HTTP error: {exc}", started_at=started)
except Exception as exc: # noqa: BLE001 except Exception as exc: # noqa: BLE001
return WorkerResult(url=download_url, ok=False, error=str(exc)) return WorkerResult(url=download_url, ok=False, error=str(exc), started_at=started)
async def call_worker_with_retries( async def call_worker_with_retries(
@@ -121,3 +137,71 @@ async def call_worker_with_retries(
assert last_result is not None # noqa: S101 assert last_result is not None # noqa: S101
return last_result return last_result
async def upload_file_to_worker(
    client: httpx.AsyncClient,
    *,
    worker_url: str,
    auth_token: str,
    file_path: Path,
    s3_key: str,
    content_type: str = "application/octet-stream",
    timeout: float = 120.0,
) -> WorkerResult:
    """Upload a local file directly to S3 via the worker PUT endpoint.

    The whole file is read into memory and sent as the body of a PUT request
    to *worker_url*; the ``X-S3-Key`` header names the destination object key.

    Failures are reported through the returned result instead of being
    raised: timeouts, transport errors, unreadable files and unparseable
    responses all yield ``ok=False`` with a descriptive ``error``.

    Args:
        client: Shared HTTP client used to send the request.
        worker_url: URL of the worker endpoint.
        auth_token: Bearer token sent in the ``Authorization`` header.
        file_path: Local file to upload.
        s3_key: Destination S3 object key.
        content_type: Value of the ``Content-Type`` request header.
        timeout: Request timeout in seconds.

    Returns:
        A :class:`WorkerResult` whose ``url`` is ``str(file_path)``.
    """
    source = str(file_path)
    headers = {
        "Authorization": f"Bearer {auth_token}",
        "X-S3-Key": s3_key,
        "Content-Type": content_type,
    }
    # Recorded client-side so failures that never reach the worker still
    # carry a start time, mirroring ``call_worker``.
    started = datetime.now(timezone.utc).isoformat()

    try:
        file_bytes = file_path.read_bytes()
        # Take the length from the bytes actually sent rather than a separate
        # ``stat()`` call, so the header can never disagree with the body if
        # the file changes between the two operations.
        headers["Content-Length"] = str(len(file_bytes))

        resp = await client.put(
            worker_url,
            content=file_bytes,
            headers=headers,
            timeout=timeout,
        )

        # A proxy or gateway error page is not JSON.  Report it with the
        # HTTP status instead of letting the decode error mask what happened.
        try:
            body: Any = resp.json()
        except ValueError:
            body = None
        if not isinstance(body, dict):
            return WorkerResult(
                url=source,
                ok=False,
                http_status=resp.status_code,
                error=(
                    f"HTTP {resp.status_code}: non-JSON response: "
                    f"{resp.text[:500]}"
                ),
                started_at=started,
            )

        if resp.is_success and body.get("ok"):
            return WorkerResult(
                url=source,
                ok=True,
                http_status=resp.status_code,
                bucket=body.get("bucket"),
                key=body.get("key"),
                content_type=body.get("content_type"),
                size_bytes=body.get("size_bytes"),
                etag=body.get("etag"),
                multipart_part_size=body.get("multipart_part_size"),
                multipart_number_parts=body.get("multipart_number_parts"),
                started_at=body.get("started_at"),
                finished_at=body.get("finished_at"),
            )
        return WorkerResult(
            url=source,
            ok=False,
            http_status=resp.status_code,
            error=body.get("error", resp.text),
            started_at=body.get("started_at"),
            finished_at=body.get("finished_at"),
        )
    except httpx.TimeoutException as exc:
        return WorkerResult(
            url=source, ok=False, error=f"Timeout: {exc}", started_at=started
        )
    except httpx.HTTPError as exc:
        return WorkerResult(
            url=source, ok=False, error=f"HTTP error: {exc}", started_at=started
        )
    except Exception as exc:  # noqa: BLE001
        # Best-effort boundary: e.g. the file could not be read.
        return WorkerResult(
            url=source, ok=False, error=str(exc), started_at=started
        )