mirror of
https://github.com/dataforcanada/d4c-infra-distribution.git
synced 2026-06-13 14:10:53 +02:00
Python CLI to call Cloudflare HTTP ingestor. To start running tomorrow morning! https://tenor.com/view/bbnft-yuge-theriouthly-seriously-mike-tyson-gif-5575001300972074272
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
.venv/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.sqlite
|
||||
parquet/
|
||||
@@ -0,0 +1,96 @@
|
||||
# 06 – Call HTTP Ingestor
|
||||
|
||||
Python CLI that orchestrates concurrent calls to the [Cloudflare HTTP ingestor worker](../05_cloudflare_http_ingestor/) and persists results to SQLite + Parquet.
|
||||
|
||||
## Quick start
|
||||
|
||||
```bash
|
||||
cd scripts/06_call_http_ingestor
|
||||
|
||||
# Install & run (uv handles the virtualenv automatically)
|
||||
uv run d4c-http-ingestor \
|
||||
--urls ../05_cloudflare_http_ingestor/ca-qc_government_and_municipalities_of_quebec-2026A000224_d4c-datapkg-orthoimagery_orthorectified_imagery_from_quebec_2026-03-10.txt \
|
||||
--dataset-id ca-qc_government_and_municipalities_of_quebec-2026A000224_d4c-datapkg-orthoimagery_orthorectified_imagery_from_quebec \
|
||||
--worker-url https://cf-data-ingestor.labs.dataforcanada.org/ \
|
||||
--auth-token "$D4C_INGESTOR_AUTH_TOKEN" \
|
||||
--db ca-qc_government_and_municipalities_of_quebec-2026A000224_d4c-datapkg-orthoimagery_orthorectified_imagery_from_quebec.sqlite \
|
||||
--key-prefix dataforcanada/d4c-datapkg-orthoimagery/archive/ca-qc_government_and_municipalities_of_quebec-2026A000224_d4c-datapkg-orthoimagery_orthorectified_imagery_from_quebec \
|
||||
--out parquet/ \
|
||||
--concurrency 12
|
||||
```
|
||||
|
||||
The auth token can also be set via the `D4C_INGESTOR_AUTH_TOKEN` environment variable.
|
||||
|
||||
## CLI reference
|
||||
|
||||
```
|
||||
usage: d4c-http-ingestor [-h] --urls URLS --dataset-id DATASET_ID
|
||||
[--worker-url WORKER_URL] [--auth-token AUTH_TOKEN]
|
||||
--db DB [--key-prefix KEY_PREFIX] [--out OUT]
|
||||
[--concurrency CONCURRENCY] [--timeout TIMEOUT]
|
||||
[--max-retries MAX_RETRIES]
|
||||
[--resume | --no-resume] [--force-refresh]
|
||||
```
|
||||
|
||||
| Flag | Default | Description |
|
||||
|------|---------|-------------|
|
||||
| `--urls` | *(required)* | Path to a newline-delimited file of URLs to ingest |
|
||||
| `--dataset-id` | *(required)* | Logical dataset identifier (used in User-Agent and DB) |
|
||||
| `--worker-url` | `https://cf-data-ingestor.labs.dataforcanada.org/` | Base URL of the Cloudflare ingestor worker |
|
||||
| `--auth-token` | `$D4C_INGESTOR_AUTH_TOKEN` | Bearer token for the worker |
|
||||
| `--db` | *(required)* | Path to the SQLite database file |
|
||||
| `--key-prefix` | `""` | S3 key prefix passed to the worker |
|
||||
| `--out` | `parquet/` | Output directory for the Parquet artifact |
|
||||
| `--concurrency` | `12` | Maximum concurrent worker requests |
|
||||
| `--timeout` | `600` | Per-request timeout in seconds |
|
||||
| `--max-retries` | `3` | Maximum number of attempts per URL (the initial request counts as the first attempt) |
|
||||
| `--resume` | `true` | Skip URLs already recorded as `success` |
|
||||
| `--force-refresh` | `false` | Ignore cached freshness; re-process all URLs |
|
||||
|
||||
## How it works
|
||||
|
||||
1. Reads URLs from the input file.
|
||||
2. Opens (or creates) a SQLite database with the `downloads` table.
|
||||
3. If `--resume` (default), filters out URLs already marked `success`.
|
||||
4. Submits up to `--concurrency` concurrent POST requests to the worker.
|
||||
5. Each request sends:
|
||||
```json
|
||||
{
|
||||
"download_url": "<url from file>",
|
||||
"user_agent": "Data for Canada - <dataset-id>",
|
||||
"key_prefix": "<key-prefix>"
|
||||
}
|
||||
```
|
||||
6. Persists each result (success/failed) to SQLite with idempotent upsert.
|
||||
7. Failed URLs are retried with exponential backoff + jitter (up to `--max-retries`).
|
||||
8. On completion, exports the full `downloads` table to `downloads.parquet` inside the `--out` directory (`parquet/downloads.parquet` by default).
|
||||
|
||||
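On re-runs, rows for new URLs are inserted into SQLite and rows for previously seen URLs are updated in place; `downloads.parquet` is then rewritten from the complete table, so it always reflects the latest state of the database.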
|
||||
|
||||
## Data model
|
||||
|
||||
### SQLite schema (`downloads` table)
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS downloads (
|
||||
url TEXT PRIMARY KEY,
|
||||
dataset_id TEXT NOT NULL,
|
||||
status TEXT NOT NULL, -- success | failed | skipped
|
||||
http_status INTEGER,
|
||||
error TEXT,
|
||||
started_at TEXT NOT NULL,
|
||||
finished_at TEXT
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id);
|
||||
CREATE INDEX IF NOT EXISTS ix_downloads_status ON downloads(status);
|
||||
```
|
||||
|
||||
### Parquet columns
|
||||
|
||||
Mirrors the SQLite schema exactly.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- [httpx](https://www.python-httpx.org/) – async HTTP client
|
||||
- [pyarrow](https://arrow.apache.org/docs/python/) – Parquet I/O
|
||||
- [rich](https://rich.readthedocs.io/) – progress bars and terminal output
|
||||
+234772
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,20 @@
|
||||
[project]
|
||||
name = "d4c-http-ingestor"
|
||||
version = "0.1.0"
|
||||
description = "CLI caller for the Cloudflare HTTP ingestor worker – orchestrates concurrent downloads, persists results to SQLite, and exports Parquet."
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"httpx>=0.27",
|
||||
"pyarrow>=15",
|
||||
"rich>=13",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
d4c-http-ingestor = "d4c_http_ingestor.cli:main"
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/d4c_http_ingestor"]
|
||||
@@ -0,0 +1,3 @@
|
||||
"""d4c-http-ingestor – CLI caller for the Cloudflare HTTP ingestor worker."""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
@@ -0,0 +1,284 @@
|
||||
"""CLI entry-point for d4c-http-ingestor.
|
||||
|
||||
Orchestrates concurrent calls to the Cloudflare HTTP ingestor worker,
|
||||
persists results to SQLite, and exports a Parquet artifact.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from rich.console import Console
|
||||
from rich.progress import (
|
||||
BarColumn,
|
||||
MofNCompleteColumn,
|
||||
Progress,
|
||||
SpinnerColumn,
|
||||
TextColumn,
|
||||
TimeElapsedColumn,
|
||||
)
|
||||
|
||||
from d4c_http_ingestor.db import DownloadRow, DownloadsDB
|
||||
from d4c_http_ingestor.parquet import export_parquet
|
||||
from d4c_http_ingestor.worker import call_worker_with_retries
|
||||
|
||||
console = Console()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _utcnow() -> str:
|
||||
"""Return the current UTC time as an ISO-8601 string."""
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _read_urls(path: str) -> list[str]:
|
||||
"""Read a newline-delimited URL file, stripping blanks and comments."""
|
||||
lines: list[str] = []
|
||||
with open(path) as fh:
|
||||
for raw in fh:
|
||||
line = raw.strip()
|
||||
if line and not line.startswith("#"):
|
||||
lines.append(line)
|
||||
return lines
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Async orchestrator
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _process_urls(
    urls: list[str],
    *,
    db: DownloadsDB,
    dataset_id: str,
    worker_url: str,
    auth_token: str,
    key_prefix: str,
    concurrency: int,
    timeout: float,
    max_retries: int,
    progress: Progress,
    task_id: int,
) -> None:
    """Send every URL in *urls* to the worker, at most *concurrency* at a time.

    One task is created per URL.  Each task waits on a shared semaphore,
    calls the worker (with retries), writes the outcome to *db* as a
    ``success`` or ``failed`` row and then advances the progress bar.
    """
    gate = asyncio.Semaphore(concurrency)
    # The User-Agent only depends on the dataset, so build it once.
    agent = f"Data for Canada - {dataset_id}"

    async def _ingest_one(http: httpx.AsyncClient, target: str) -> None:
        async with gate:
            began = _utcnow()
            outcome = await call_worker_with_retries(
                http,
                worker_url=worker_url,
                auth_token=auth_token,
                download_url=target,
                user_agent=agent,
                key_prefix=key_prefix,
                timeout=timeout,
                max_retries=max_retries,
            )
            ended = _utcnow()

            state = "success" if outcome.ok else "failed"
            db.upsert(
                DownloadRow(
                    url=target,
                    dataset_id=dataset_id,
                    status=state,
                    http_status=outcome.http_status,
                    error=outcome.error,
                    started_at=began,
                    finished_at=ended,
                )
            )
            progress.advance(task_id)

    # One shared client; the pool is sized slightly above the concurrency cap.
    pool_limits = httpx.Limits(
        max_connections=concurrency + 4,
        max_keepalive_connections=concurrency,
    )
    async with httpx.AsyncClient(limits=pool_limits, follow_redirects=True) as http:
        jobs = [asyncio.create_task(_ingest_one(http, target)) for target in urls]
        await asyncio.gather(*jobs)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _positive_int(raw: str) -> int:
    """argparse ``type=`` converter that accepts only integers >= 1."""
    try:
        value = int(raw)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid integer value: {raw!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def build_parser() -> argparse.ArgumentParser:
    """Build the ``d4c-http-ingestor`` command-line parser.

    ``--urls``, ``--dataset-id`` and ``--db`` are required.  ``--auth-token``
    defaults to the value of ``$D4C_INGESTOR_AUTH_TOKEN`` read when the
    parser is built.  ``--concurrency`` and ``--max-retries`` must be
    positive integers: a concurrency of 0 would create a semaphore that can
    never be acquired (the run would hang), and 0 retries would mean no
    request is ever attempted.

    Returns:
        The configured :class:`argparse.ArgumentParser`.
    """
    p = argparse.ArgumentParser(
        prog="d4c-http-ingestor",
        description="Orchestrate concurrent downloads via the Cloudflare HTTP ingestor worker.",
    )
    p.add_argument(
        "--urls",
        required=True,
        help="Path to a newline-delimited file of URLs to ingest.",
    )
    p.add_argument(
        "--dataset-id",
        required=True,
        help="Logical dataset identifier (used in User-Agent and DB).",
    )
    p.add_argument(
        "--worker-url",
        default="https://cf-data-ingestor.labs.dataforcanada.org/",
        help="Base URL of the Cloudflare ingestor worker.",
    )
    p.add_argument(
        "--auth-token",
        default=os.environ.get("D4C_INGESTOR_AUTH_TOKEN", ""),
        help="Bearer token for the worker (default: $D4C_INGESTOR_AUTH_TOKEN).",
    )
    p.add_argument(
        "--db",
        required=True,
        help="Path to the SQLite database file.",
    )
    p.add_argument(
        "--key-prefix",
        default="",
        help="S3 key prefix passed to the worker (e.g. dataforcanada/d4c-datapkg-orthoimagery/archive/).",
    )
    p.add_argument(
        "--out",
        default="parquet/",
        help="Output directory for the Parquet artifact (default: parquet/).",
    )
    p.add_argument(
        "--concurrency",
        type=_positive_int,
        default=12,
        help="Maximum number of concurrent worker requests (default: 12).",
    )
    p.add_argument(
        "--timeout",
        type=float,
        default=600.0,
        help="Per-request timeout in seconds (default: 600).",
    )
    p.add_argument(
        "--max-retries",
        type=_positive_int,
        default=3,
        help="Maximum retry attempts per URL on failure (default: 3).",
    )
    p.add_argument(
        "--resume",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Resume from previous run, skipping successful URLs (default: true).",
    )
    p.add_argument(
        "--force-refresh",
        action="store_true",
        default=False,
        help="Ignore cached freshness; re-process all URLs.",
    )
    return p
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> None:
    """Run the CLI end to end.

    Parses *argv*, loads the URL list, works out which URLs still need
    processing, submits them to the worker, prints a per-status summary and
    exports the ``downloads`` table to Parquet.  Exits with status 1 when no
    auth token is available or the URL file does not exist.
    """
    args = build_parser().parse_args(argv)

    if not args.auth_token:
        console.print(
            "[bold red]Error:[/] --auth-token or $D4C_INGESTOR_AUTH_TOKEN is required."
        )
        sys.exit(1)

    # -- Read URL list -------------------------------------------------------
    url_file = Path(args.urls)
    if not url_file.is_file():
        console.print(f"[bold red]Error:[/] URL file not found: {url_file}")
        sys.exit(1)

    all_urls = _read_urls(str(url_file))
    console.print(f"Loaded [bold]{len(all_urls)}[/] URLs from [cyan]{url_file}[/]")

    # -- Open DB & determine work set ----------------------------------------
    with DownloadsDB(args.db) as db:
        # Everything is processed unless resume mode filters the list below.
        urls_to_process = all_urls
        if args.force_refresh:
            console.print("[yellow]--force-refresh[/]: re-processing all URLs")
        elif args.resume:
            already_done = db.successful_urls()
            urls_to_process = [
                candidate for candidate in all_urls if candidate not in already_done
            ]
            skipped = len(all_urls) - len(urls_to_process)
            if skipped:
                console.print(
                    f"Resuming: skipping [green]{skipped}[/] already-successful URLs"
                )

        if urls_to_process:
            console.print(
                f"Processing [bold]{len(urls_to_process)}[/] URLs "
                f"with concurrency=[cyan]{args.concurrency}[/]"
            )

            bar_columns = (
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                MofNCompleteColumn(),
                TimeElapsedColumn(),
            )
            with Progress(*bar_columns, console=console) as progress:
                task_id = progress.add_task("Ingesting…", total=len(urls_to_process))
                job = _process_urls(
                    urls_to_process,
                    db=db,
                    dataset_id=args.dataset_id,
                    worker_url=args.worker_url,
                    auth_token=args.auth_token,
                    key_prefix=args.key_prefix,
                    concurrency=args.concurrency,
                    timeout=args.timeout,
                    max_retries=args.max_retries,
                    progress=progress,
                    task_id=task_id,
                )
                asyncio.run(job)
        else:
            console.print("[green]Nothing to do – all URLs already succeeded.[/]")

        # -- Summary ---------------------------------------------------------
        counts = db.count_by_status()
        palette = {"success": "green", "failed": "red", "skipped": "yellow"}
        console.print("\n[bold]Summary:[/]")
        for status, cnt in sorted(counts.items()):
            colour = palette.get(status, "white")
            console.print(f" [{colour}]{status}[/]: {cnt}")

        # -- Export Parquet --------------------------------------------------
        rows = db.all_rows()
        if not rows:
            console.print("\n[yellow]No rows to export.[/]")
        else:
            dest = export_parquet(rows, args.out)
            console.print(f"\nParquet written to [cyan]{dest}[/] ({len(rows)} rows)")

        console.print("[bold green]Done.[/]")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,126 @@
|
||||
"""SQLite persistence layer for download tracking."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Sequence
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data model
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class DownloadRow:
    """In-memory representation of one record of the ``downloads`` table.

    Field names and order match the table columns, so an instance can be
    built directly from a fetched row and bound to named SQL parameters.
    """

    # Source URL; primary key of the table.
    url: str
    # Logical dataset the URL belongs to.
    dataset_id: str
    # One of: success | failed | skipped
    status: str
    # HTTP status code of the worker response, when one was received.
    http_status: int | None = None
    # Error description for failed rows.
    error: str | None = None
    # Start timestamp as text.
    started_at: str = ""
    # Finish timestamp as text, if known.
    finished_at: str | None = None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schema
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_SCHEMA_SQL = """\
|
||||
CREATE TABLE IF NOT EXISTS downloads (
|
||||
url TEXT PRIMARY KEY,
|
||||
dataset_id TEXT NOT NULL,
|
||||
status TEXT NOT NULL,
|
||||
http_status INTEGER,
|
||||
error TEXT,
|
||||
started_at TEXT NOT NULL,
|
||||
finished_at TEXT
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id);
|
||||
CREATE INDEX IF NOT EXISTS ix_downloads_status ON downloads(status);
|
||||
"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Database handle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class DownloadsDB:
    """Thin wrapper around a SQLite database for download tracking.

    Opening the database creates the parent directory, switches the journal
    to WAL mode and applies the schema.  Instances are context managers;
    leaving the ``with`` block closes the connection.
    """

    def __init__(self, path: str | Path) -> None:
        """Open (or create) the database at *path* and ensure the schema exists."""
        self.path = Path(path)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(str(self.path))
        try:
            self._conn.row_factory = sqlite3.Row
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.executescript(_SCHEMA_SQL)
        except Exception:
            # Do not leak the connection when initialisation fails.
            self._conn.close()
            raise

    # -- queries -------------------------------------------------------------

    def get(self, url: str) -> DownloadRow | None:
        """Return the row for *url*, or ``None`` if it doesn't exist."""
        cur = self._conn.execute("SELECT * FROM downloads WHERE url = ?", (url,))
        row = cur.fetchone()
        if row is None:
            return None
        return DownloadRow(**dict(row))

    def successful_urls(self) -> set[str]:
        """Return the set of URLs already recorded as ``success``."""
        cur = self._conn.execute(
            "SELECT url FROM downloads WHERE status = 'success'"
        )
        return {r["url"] for r in cur.fetchall()}

    def pending_urls(self) -> list[str]:
        """Return the URLs currently recorded as ``failed``.

        URLs that have never been attempted have no row in the table and
        therefore cannot be returned here; callers must derive those from
        their own input list.
        """
        cur = self._conn.execute(
            "SELECT url FROM downloads WHERE status = 'failed'"
        )
        return [r["url"] for r in cur.fetchall()]

    # -- mutations -----------------------------------------------------------

    def upsert(self, row: DownloadRow) -> None:
        """Insert *row*, or overwrite the existing row with the same URL.

        The write is committed immediately.  If the statement fails, the
        transaction is rolled back and the exception propagates.
        """
        # ``with connection`` commits on success and rolls back on error.
        with self._conn:
            self._conn.execute(
                """\
                INSERT INTO downloads (url, dataset_id, status, http_status, error, started_at, finished_at)
                VALUES (:url, :dataset_id, :status, :http_status, :error, :started_at, :finished_at)
                ON CONFLICT(url) DO UPDATE SET
                    dataset_id = excluded.dataset_id,
                    status = excluded.status,
                    http_status = excluded.http_status,
                    error = excluded.error,
                    started_at = excluded.started_at,
                    finished_at = excluded.finished_at
                """,
                vars(row),
            )

    def all_rows(self) -> list[DownloadRow]:
        """Return every row, ordered by start time (URL breaks ties)."""
        cur = self._conn.execute("SELECT * FROM downloads ORDER BY started_at, url")
        return [DownloadRow(**dict(r)) for r in cur.fetchall()]

    def count_by_status(self) -> dict[str, int]:
        """Return ``{status: count}`` summary."""
        cur = self._conn.execute(
            "SELECT status, COUNT(*) AS cnt FROM downloads GROUP BY status"
        )
        return {r["status"]: r["cnt"] for r in cur.fetchall()}

    # -- lifecycle -----------------------------------------------------------

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()

    def __enter__(self) -> "DownloadsDB":
        return self

    def __exit__(self, *exc: object) -> None:  # noqa: ANN002
        self.close()
|
||||
@@ -0,0 +1,57 @@
|
||||
"""Export the downloads table to a Parquet dataset."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
from d4c_http_ingestor.db import DownloadRow
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schema – mirrors the SQLite ``downloads`` table
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Arrow schema for the exported table: one (name, type, nullable) entry per
# column of the SQLite ``downloads`` table, in the same order.
_ARROW_SCHEMA = pa.schema(
    [
        pa.field(column, dtype, nullable=may_be_null)
        for column, dtype, may_be_null in (
            ("url", pa.string(), False),
            ("dataset_id", pa.string(), False),
            ("status", pa.string(), False),
            ("http_status", pa.int32(), True),
            ("error", pa.string(), True),
            ("started_at", pa.string(), False),
            ("finished_at", pa.string(), True),
        )
    ]
)
|
||||
|
||||
|
||||
def rows_to_table(rows: list[DownloadRow]) -> pa.Table:
    """Build a PyArrow Table with one column per :class:`DownloadRow` field."""
    # (attribute name, Arrow type) for each column, in schema order.
    column_specs = (
        ("url", pa.string()),
        ("dataset_id", pa.string()),
        ("status", pa.string()),
        ("http_status", pa.int32()),
        ("error", pa.string()),
        ("started_at", pa.string()),
        ("finished_at", pa.string()),
    )
    columns = [
        pa.array([getattr(record, attr) for record in rows], type=dtype)
        for attr, dtype in column_specs
    ]
    return pa.table(columns, schema=_ARROW_SCHEMA)
|
||||
|
||||
|
||||
def export_parquet(rows: list[DownloadRow], out_dir: str | Path) -> Path:
    """Write *rows* as a single Parquet file inside *out_dir*.

    The file is named ``downloads.parquet`` and is replaced on each run so
    that re-runs always reflect the latest state of the SQLite database.
    The data is first written to a temporary sibling file and then renamed
    over the destination, so an interrupted export never leaves a truncated
    ``downloads.parquet`` behind (any previous file stays intact).

    Args:
        rows: Records to export.
        out_dir: Directory to write into; created if missing.

    Returns:
        The path to the written file.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    dest = out_dir / "downloads.parquet"
    # Same directory as ``dest`` so the final rename stays on one filesystem.
    scratch = dest.with_name(dest.name + ".tmp")

    table = rows_to_table(rows)
    try:
        pq.write_table(table, scratch, compression="zstd")
        scratch.replace(dest)
    except BaseException:
        # Also covers Ctrl-C: remove the partial file, keep the old artefact.
        scratch.unlink(missing_ok=True)
        raise
    return dest
|
||||
@@ -0,0 +1,123 @@
|
||||
"""HTTP client for the Cloudflare data-ingestor worker."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
@dataclass
class WorkerResult:
    """What came back from one call to the ingestor worker.

    Only ``url`` and ``ok`` are always set; the remaining fields default to
    ``None`` and are filled in from the response where available.
    """

    # The download URL that was submitted.
    url: str
    # True when the worker reported success.
    ok: bool
    # HTTP status of the worker response; None when no response was received.
    http_status: int | None = None
    # Values copied from a successful response body.
    bucket: str | None = None
    key: str | None = None
    content_type: str | None = None
    size_bytes: int | None = None
    # Failure description.
    error: str | None = None
|
||||
|
||||
|
||||
async def call_worker(
    client: httpx.AsyncClient,
    *,
    worker_url: str,
    auth_token: str,
    download_url: str,
    user_agent: str,
    key_prefix: str,
    timeout: float = 600.0,
) -> WorkerResult:
    """POST a single download job to the Cloudflare worker.

    Returns a :class:`WorkerResult` regardless of success/failure so the
    caller never has to catch transport exceptions.

    A call counts as successful only when the HTTP status is 2xx *and* the
    JSON body has a truthy ``ok`` field.  Responses whose body is not a JSON
    object (for example an HTML error page) are reported as failures that
    keep the HTTP status and use the raw response text as the error.

    Args:
        client: Shared HTTP client used to send the request.
        worker_url: URL the job is POSTed to.
        auth_token: Sent as ``Authorization: Bearer <token>``.
        download_url: URL the worker should download.
        user_agent: Forwarded to the worker in the JSON payload.
        key_prefix: Forwarded to the worker in the JSON payload.
        timeout: Request timeout in seconds.
    """
    payload = {
        "download_url": download_url,
        "user_agent": user_agent,
        "key_prefix": key_prefix,
    }
    headers = {
        "Authorization": f"Bearer {auth_token}",
        "Content-Type": "application/json",
    }

    try:
        resp = await client.post(
            worker_url,
            json=payload,
            headers=headers,
            timeout=timeout,
        )

        # Decode defensively: a non-JSON or non-object body must not discard
        # the status code that was already received.
        try:
            parsed = resp.json()
        except ValueError:
            parsed = None
        body: dict[str, Any] = parsed if isinstance(parsed, dict) else {}

        if resp.is_success and body.get("ok"):
            return WorkerResult(
                url=download_url,
                ok=True,
                http_status=resp.status_code,
                bucket=body.get("bucket"),
                key=body.get("key"),
                content_type=body.get("content_type"),
                size_bytes=body.get("size_bytes"),
            )

        detail = body.get("error")
        if detail is None or detail == "":
            detail = resp.text or f"Unexpected response (HTTP {resp.status_code})"
        elif not isinstance(detail, str):
            # Structured error payloads are flattened so the result can be
            # stored as text.
            detail = str(detail)
        return WorkerResult(
            url=download_url,
            ok=False,
            http_status=resp.status_code,
            error=detail,
        )
    except httpx.TimeoutException as exc:
        return WorkerResult(url=download_url, ok=False, error=f"Timeout: {exc}")
    except httpx.HTTPError as exc:
        return WorkerResult(url=download_url, ok=False, error=f"HTTP error: {exc}")
    except Exception as exc:  # noqa: BLE001
        return WorkerResult(url=download_url, ok=False, error=str(exc))
|
||||
|
||||
|
||||
async def call_worker_with_retries(
    client: httpx.AsyncClient,
    *,
    worker_url: str,
    auth_token: str,
    download_url: str,
    user_agent: str,
    key_prefix: str,
    timeout: float = 600.0,
    max_retries: int = 3,
    backoff_base: float = 2.0,
    backoff_max: float = 60.0,
) -> WorkerResult:
    """Call the worker with exponential backoff + jitter on failure.

    Args:
        client: Shared HTTP client.
        worker_url, auth_token, download_url, user_agent, key_prefix, timeout:
            Passed through unchanged to :func:`call_worker`.
        max_retries: Total number of attempts, including the first one.
            Values below 1 are treated as 1 so the worker is always called
            at least once.
        backoff_base: Base of the exponential delay; the wait after attempt
            ``n`` is ``backoff_base ** n`` seconds before jitter.
        backoff_max: Upper bound on that delay (jitter is added on top).

    Returns:
        The first successful result, or the result of the final attempt.
    """
    attempts = max(1, max_retries)

    for attempt in range(1, attempts + 1):
        result = await call_worker(
            client,
            worker_url=worker_url,
            auth_token=auth_token,
            download_url=download_url,
            user_agent=user_agent,
            key_prefix=key_prefix,
            timeout=timeout,
        )
        # Stop on success, or when there is no attempt left to retry with.
        if result.ok or attempt == attempts:
            return result

        delay = min(backoff_base ** attempt, backoff_max)
        # Up to 50% extra random wait, so concurrent tasks do not retry in
        # lock-step.
        jitter = random.uniform(0, delay * 0.5)  # noqa: S311
        await asyncio.sleep(delay + jitter)

    # The loop always returns on its final iteration.
    raise RuntimeError("unreachable: retry loop exited without a result")
|
||||
Generated
+190
@@ -0,0 +1,190 @@
|
||||
version = 1
|
||||
revision = 3
|
||||
requires-python = ">=3.11"
|
||||
|
||||
[[package]]
|
||||
name = "anyio"
|
||||
version = "4.12.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "idna" },
|
||||
{ name = "typing-extensions", marker = "python_full_version < '3.13'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/96/f0/5eb65b2bb0d09ac6776f2eb54adee6abe8228ea05b20a5ad0e4945de8aac/anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703", size = 228685, upload-time = "2026-01-06T11:45:21.246Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2026.2.25"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "d4c-http-ingestor"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "httpx" },
|
||||
{ name = "pyarrow" },
|
||||
{ name = "rich" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "httpx", specifier = ">=0.27" },
|
||||
{ name = "pyarrow", specifier = ">=15" },
|
||||
{ name = "rich", specifier = ">=13" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h11"
|
||||
version = "0.16.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "httpcore"
|
||||
version = "1.0.9"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "certifi" },
|
||||
{ name = "h11" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "httpx"
|
||||
version = "0.28.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "anyio" },
|
||||
{ name = "certifi" },
|
||||
{ name = "httpcore" },
|
||||
{ name = "idna" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.11"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markdown-it-py"
|
||||
version = "4.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "mdurl" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mdurl"
|
||||
version = "0.1.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyarrow"
|
||||
version = "23.0.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pygments"
|
||||
version = "2.19.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rich"
|
||||
version = "14.3.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "markdown-it-py" },
|
||||
{ name = "pygments" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.15.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
|
||||
]
|
||||
Reference in New Issue
Block a user