Download 2013 Vancouver parquet and create text file for each ECW URL

2026-06-13 14:10:53 +02:00 · 2026-06-02 13:50:18 -04:00
parent ca67c9539b
commit 12bebb72b1
1 changed files with 50 additions and 0 deletions
@@ -0,0 +1,50 @@
 #!/bin/bash
 #
 # download.sh — Download Vancouver 2013 orthoimagery index file
 #
 # This script:
 #   1. Downloads the dataset index as a Parquet file from Vancouver Open Data.
 #   2. Extracts image URLs from the Parquet file using DuckDB.
 #
 # Usage:
 #   cd scripts/ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_2013_075mm
 #   bash download.sh
 #
 set -euo pipefail
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 DATASET_ID="ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_2013_075mm"
 INDEX_BASE="ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_index_2013_075mm_2026-06-02"
 PARQUET_FILE="${SCRIPT_DIR}/${INDEX_BASE}.parquet"
 URL_FILE="${SCRIPT_DIR}/${INDEX_BASE}.txt"
 DATA_INPUT_DIR="${SCRIPT_DIR}/../../data/input/${DATASET_ID}"
 PARQUET_URL="https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/orthophoto-imagery-2013/exports/parquet"
 # ---------------------------------------------------------------------------
 # Step 1 — Download the Parquet index file
 # ---------------------------------------------------------------------------
 echo "==> Step 1: Downloading Parquet index file..."
 aria2c \
  "${PARQUET_URL}" \
  -d "${SCRIPT_DIR}" \
  -o "${INDEX_BASE}.parquet"
 echo "    Saved to: ${PARQUET_FILE}"
 # ---------------------------------------------------------------------------
 # Step 2 — Extract image URLs from the Parquet file using DuckDB
 # ---------------------------------------------------------------------------
 echo "==> Step 2: Extracting image URLs with DuckDB..."
 duckdb -noheader -csv -c \
  "SELECT ecw_url FROM '${PARQUET_FILE}';" \
  | tr -d '"' \
  > "${URL_FILE}"
 echo "    URL list written to: ${URL_FILE}"