From 12bebb72b196e2332a7e526c4685db38d72bbf5f Mon Sep 17 00:00:00 2001
From: Diego Ripley
Date: Tue, 2 Jun 2026 13:50:18 -0400
Subject: [PATCH] Download 2013 Vancouver parquet and create text file for each ECW URL

---
 .../download.sh | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100755 scripts/ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_2013_075mm/download.sh

diff --git a/scripts/ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_2013_075mm/download.sh b/scripts/ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_2013_075mm/download.sh
new file mode 100755
index 0000000..a00123f
--- /dev/null
+++ b/scripts/ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_2013_075mm/download.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# download.sh — Download Vancouver 2013 orthoimagery index file
+#
+# This script:
+#   1. Downloads the dataset index as a Parquet file from Vancouver Open Data.
+#   2. Extracts image URLs from the Parquet file using DuckDB.
+#
+# Requires: aria2c and duckdb on PATH.
+#
+# Usage:
+#   cd scripts/ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_2013_075mm
+#   bash download.sh
+#
+
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+# Outputs are written next to this script, independent of the caller's cwd.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DATASET_ID="ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_2013_075mm"
+INDEX_BASE="ca-bc_vancouver-2013A00055915022_d4c-datapkg-orthoimagery_index_2013_075mm_2026-06-02"
+PARQUET_FILE="${SCRIPT_DIR}/${INDEX_BASE}.parquet"
+URL_FILE="${SCRIPT_DIR}/${INDEX_BASE}.txt"
+# Not referenced anywhere in this script. NOTE(review): presumably reserved
+# for a later imagery download step; confirm or drop.
+# shellcheck disable=SC2034
+DATA_INPUT_DIR="${SCRIPT_DIR}/../../data/input/${DATASET_ID}"
+
+PARQUET_URL="https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/orthophoto-imagery-2013/exports/parquet"
+
+# Abort before downloading anything if a required tool is missing.
+for tool in aria2c duckdb; do
+    if ! command -v "${tool}" >/dev/null 2>&1; then
+        echo "error: required command not found: ${tool}" >&2
+        exit 1
+    fi
+done
+
+# ---------------------------------------------------------------------------
+# Step 1 — Download the Parquet index file
+# ---------------------------------------------------------------------------
+echo "==> Step 1: Downloading Parquet index file..."
+# By default aria2c does not overwrite an existing file and instead saves the
+# download under a renamed file (--auto-file-renaming), so on a re-run Step 2
+# would read the stale index. Force the download onto PARQUET_FILE.
+aria2c \
+    --allow-overwrite=true \
+    --auto-file-renaming=false \
+    "${PARQUET_URL}" \
+    -d "${SCRIPT_DIR}" \
+    -o "${INDEX_BASE}.parquet"
+
+echo " Saved to: ${PARQUET_FILE}"
+
+# ---------------------------------------------------------------------------
+# Step 2 — Extract image URLs from the Parquet file using DuckDB
+# ---------------------------------------------------------------------------
+echo "==> Step 2: Extracting image URLs with DuckDB..."
+# PARQUET_FILE is embedded in a single-quoted SQL string literal, so double
+# any single quote in the path to keep the query well-formed.
+SQ="'"
+PARQUET_SQL_PATH="${PARQUET_FILE//${SQ}/${SQ}${SQ}}"
+# tr removes every double quote from the CSV output before it is saved.
+duckdb -noheader -csv -c \
+    "SELECT ecw_url FROM '${PARQUET_SQL_PATH}';" \
+    | tr -d '"' \
+    > "${URL_FILE}"
+
+echo " URL list written to: ${URL_FILE}"