mirror of
https://github.com/dataforcanada/d4c-infra-distribution.git
synced 2026-06-13 14:10:53 +02:00
Made improvements
This commit is contained in:
+31747
-31747
File diff suppressed because it is too large
Load Diff
@@ -3,6 +3,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Sequence
|
||||
@@ -48,6 +49,77 @@ CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id);
|
||||
CREATE INDEX IF NOT EXISTS ix_downloads_status ON downloads(status);
|
||||
"""
|
||||
|
||||
# Columns in the canonical schema (order matters for the migration INSERT).
|
||||
_CANONICAL_COLUMNS = [
|
||||
"url", "dataset_id", "status", "http_status", "etag", "error",
|
||||
"started_at", "finished_at", "multipart_part_size", "multipart_number_parts",
|
||||
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Migration helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _table_has_primary_key(conn: sqlite3.Connection, table: str) -> bool:
|
||||
"""Return ``True`` if *table* has at least one column marked as PK."""
|
||||
cur = conn.execute(f"PRAGMA table_info({table})")
|
||||
for row in cur.fetchall():
|
||||
if row["pk"]: # pk column is non-zero for PK members
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _existing_columns(conn: sqlite3.Connection, table: str) -> list[str]:
|
||||
"""Return the column names of *table* in definition order."""
|
||||
cur = conn.execute(f"PRAGMA table_info({table})")
|
||||
return [row["name"] for row in cur.fetchall()]
|
||||
|
||||
|
||||
def _migrate_downloads(conn: sqlite3.Connection) -> None:
|
||||
"""Recreate the ``downloads`` table with the correct PRIMARY KEY.
|
||||
|
||||
This handles databases that were originally created by an external tool
|
||||
(e.g. pandas / DuckDB) where ``url`` was defined as plain ``VARCHAR``
|
||||
without a PRIMARY KEY constraint. Existing data is preserved; duplicate
|
||||
``url`` values (if any) are collapsed by keeping the most recent row
|
||||
(highest ``rowid``).
|
||||
"""
|
||||
old_cols = _existing_columns(conn, "downloads")
|
||||
# Only copy columns that exist in both old and new schemas.
|
||||
shared = [c for c in _CANONICAL_COLUMNS if c in old_cols]
|
||||
|
||||
cols_csv = ", ".join(shared)
|
||||
|
||||
conn.executescript(f"""\
|
||||
BEGIN;
|
||||
ALTER TABLE downloads RENAME TO _downloads_old;
|
||||
CREATE TABLE downloads (
|
||||
url TEXT PRIMARY KEY,
|
||||
dataset_id TEXT NOT NULL DEFAULT '',
|
||||
status TEXT NOT NULL DEFAULT '',
|
||||
http_status INTEGER,
|
||||
etag TEXT,
|
||||
error TEXT,
|
||||
started_at TEXT NOT NULL DEFAULT '',
|
||||
finished_at TEXT,
|
||||
multipart_part_size INTEGER,
|
||||
multipart_number_parts INTEGER
|
||||
);
|
||||
INSERT OR IGNORE INTO downloads ({cols_csv})
|
||||
SELECT {cols_csv}
|
||||
FROM _downloads_old
|
||||
ORDER BY rowid DESC;
|
||||
DROP TABLE _downloads_old;
|
||||
CREATE INDEX IF NOT EXISTS ix_downloads_dataset ON downloads(dataset_id);
|
||||
CREATE INDEX IF NOT EXISTS ix_downloads_status ON downloads(status);
|
||||
COMMIT;
|
||||
""")
|
||||
print(
|
||||
f"[migrate] Recreated 'downloads' table with PRIMARY KEY(url); "
|
||||
f"copied columns: {cols_csv}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Database handle
|
||||
@@ -62,6 +134,22 @@ class DownloadsDB:
|
||||
self._conn = sqlite3.connect(str(self.path))
|
||||
self._conn.row_factory = sqlite3.Row
|
||||
self._conn.execute("PRAGMA journal_mode=WAL")
|
||||
|
||||
# --- Schema migration ---------------------------------------------------
|
||||
# If the table already exists but was created without a PRIMARY KEY
|
||||
# (e.g. by pandas/DuckDB), recreate it with the correct schema.
|
||||
cur = self._conn.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='downloads'"
|
||||
)
|
||||
if cur.fetchone() is not None:
|
||||
if not _table_has_primary_key(self._conn, "downloads"):
|
||||
print(
|
||||
"[migrate] Detected 'downloads' table without PRIMARY KEY — migrating…",
|
||||
file=sys.stderr,
|
||||
)
|
||||
_migrate_downloads(self._conn)
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
self._conn.executescript(_SCHEMA_SQL)
|
||||
|
||||
# -- queries -------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user