Made improvements

2026-06-13 14:10:53 +02:00 · 2026-03-14 07:34:18 -04:00
parent 9c35374b04
commit c3f0fb641b
4 changed files with 31840 additions and 31751 deletions
@@ -3,6 +3,7 @@
 from __future__ import annotations

 import sqlite3
+import sys
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Sequence
@@ -48,6 +49,77 @@ CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id);
 CREATE INDEX IF NOT EXISTS ix_downloads_status  ON downloads(status);
 """

+# Columns in the canonical schema (order matters for the migration INSERT).
+_CANONICAL_COLUMNS = [
+    "url", "dataset_id", "status", "http_status", "etag", "error",
+    "started_at", "finished_at", "multipart_part_size", "multipart_number_parts",
+]
+
+
+# ---------------------------------------------------------------------------
+# Migration helpers
+# ---------------------------------------------------------------------------
+
+def _table_has_primary_key(conn: sqlite3.Connection, table: str) -> bool:
+    """Return ``True`` if *table* has at least one column marked as PK."""
+    cur = conn.execute(f"PRAGMA table_info({table})")
+    for row in cur.fetchall():
+        if row["pk"]:  # pk column is non-zero for PK members
+            return True
+    return False
+
+
+def _existing_columns(conn: sqlite3.Connection, table: str) -> list[str]:
+    """Return the column names of *table* in definition order."""
+    cur = conn.execute(f"PRAGMA table_info({table})")
+    return [row["name"] for row in cur.fetchall()]
+
+
+def _migrate_downloads(conn: sqlite3.Connection) -> None:
+    """Recreate the ``downloads`` table with the correct PRIMARY KEY.
+
+    This handles databases that were originally created by an external tool
+    (e.g. pandas / DuckDB) where ``url`` was defined as plain ``VARCHAR``
+    without a PRIMARY KEY constraint.  Existing data is preserved; duplicate
+    ``url`` values (if any) are collapsed by keeping the most recent row
+    (highest ``rowid``).
+    """
+    old_cols = _existing_columns(conn, "downloads")
+    # Only copy columns that exist in both old and new schemas.
+    shared = [c for c in _CANONICAL_COLUMNS if c in old_cols]
+
+    cols_csv = ", ".join(shared)
+
+    conn.executescript(f"""\
+        BEGIN;
+        ALTER TABLE downloads RENAME TO _downloads_old;
+        CREATE TABLE downloads (
+          url                    TEXT PRIMARY KEY,
+          dataset_id             TEXT NOT NULL DEFAULT '',
+          status                 TEXT NOT NULL DEFAULT '',
+          http_status            INTEGER,
+          etag                   TEXT,
+          error                  TEXT,
+          started_at             TEXT NOT NULL DEFAULT '',
+          finished_at            TEXT,
+          multipart_part_size    INTEGER,
+          multipart_number_parts INTEGER
+        );
+        INSERT OR IGNORE INTO downloads ({cols_csv})
+            SELECT {cols_csv}
+            FROM _downloads_old
+            ORDER BY rowid DESC;
+        DROP TABLE _downloads_old;
+        CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id);
+        CREATE INDEX IF NOT EXISTS ix_downloads_status  ON downloads(status);
+        COMMIT;
+    """)
+    print(
+        f"[migrate] Recreated 'downloads' table with PRIMARY KEY(url); "
+        f"copied columns: {cols_csv}",
+        file=sys.stderr,
+    )
+

 # ---------------------------------------------------------------------------
 # Database handle
@@ -62,6 +134,22 @@ class DownloadsDB:
        self._conn = sqlite3.connect(str(self.path))
        self._conn.row_factory = sqlite3.Row
        self._conn.execute("PRAGMA journal_mode=WAL")
+
+        # --- Schema migration ---------------------------------------------------
+        # If the table already exists but was created without a PRIMARY KEY
+        # (e.g. by pandas/DuckDB), recreate it with the correct schema.
+        cur = self._conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='downloads'"
+        )
+        if cur.fetchone() is not None:
+            if not _table_has_primary_key(self._conn, "downloads"):
+                print(
+                    "[migrate] Detected 'downloads' table without PRIMARY KEY — migrating…",
+                    file=sys.stderr,
+                )
+                _migrate_downloads(self._conn)
+        # ------------------------------------------------------------------------
+
        self._conn.executescript(_SCHEMA_SQL)

    # -- queries -------------------------------------------------------------