Files
d4c-infra-distribution/scripts/05_cloudflare_http_ingestor/src/index.js
T

318 lines
10 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { AwsClient } from "aws4fetch";
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
const MIN_PART_SIZE = 5 * 1024 * 1024; // 5 MiB S3 minimum for multipart parts
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/** Return a plain-text Response with the given status. */
function textResponse(body, status = 200) {
return new Response(body, {
status,
headers: { "Content-Type": "text/plain; charset=utf-8" },
});
}
/** Return a JSON Response. */
function jsonResponse(obj, status = 200) {
return new Response(JSON.stringify(obj), {
status,
headers: { "Content-Type": "application/json" },
});
}
/** Derive the S3 object key from the download URL (filename only). */
function objectKeyFromUrl(downloadUrl, prefix) {
const pathname = new URL(downloadUrl).pathname;
const filename = pathname.split("/").pop() || "unnamed";
return prefix ? `${prefix.replace(/\/+$/, "")}/${filename}` : filename;
}
/** Build a pre-configured AwsClient for S3 in the target region. */
function makeAwsClient(env) {
return new AwsClient({
accessKeyId: env.AWS_ACCESS_KEY_ID,
secretAccessKey: env.AWS_SECRET_ACCESS_KEY,
region: env.S3_REGION || "us-west-2",
service: "s3",
});
}
/**
* Construct the S3 endpoint URL for a given bucket + key.
* Uses path-style addressing (s3.region.amazonaws.com/bucket/key) which is
* required for bucket names containing dots (e.g. "us-west-2.opendata.source.coop").
* If a custom S3_ENDPOINT is set, it is used as the base instead.
*/
function s3Url(bucket, key, region, endpoint) {
let base;
if (endpoint) {
// Normalise: strip trailing slashes, prepend https:// if no scheme given
base = endpoint.replace(/\/+$/, "");
if (!/^https?:\/\//i.test(base)) {
base = `https://${base}`;
}
} else {
base = `https://s3.${region}.amazonaws.com`;
}
return `${base}/${bucket}/${encodeURI(key)}`;
}
// ---------------------------------------------------------------------------
// S3 Upload Single PUT (streaming, requires known Content-Length)
// ---------------------------------------------------------------------------
async function putObjectStreaming(aws, bucket, region, key, body, contentLength, contentType, endpoint) {
const url = s3Url(bucket, key, region, endpoint);
const headers = {
"Content-Type": contentType || "application/octet-stream",
"Content-Length": String(contentLength),
"x-amz-acl": "bucket-owner-full-control",
};
const resp = await aws.fetch(url, {
method: "PUT",
headers,
body, // ReadableStream streamed directly, no buffering
});
if (!resp.ok) {
const text = await resp.text();
throw new Error(`S3 PUT failed (${resp.status}): ${text}`);
}
return resp;
}
// ---------------------------------------------------------------------------
// S3 Upload Multipart (streaming, for unknown Content-Length)
// ---------------------------------------------------------------------------
async function initiateMultipart(aws, bucket, region, key, contentType, endpoint) {
const url = `${s3Url(bucket, key, region, endpoint)}?uploads`;
const resp = await aws.fetch(url, {
method: "POST",
headers: {
"Content-Type": contentType || "application/octet-stream",
"x-amz-acl": "bucket-owner-full-control",
},
});
if (!resp.ok) {
const text = await resp.text();
throw new Error(`Initiate multipart failed (${resp.status}): ${text}`);
}
const xml = await resp.text();
const match = xml.match(/<UploadId>(.+?)<\/UploadId>/);
if (!match) throw new Error("Could not parse UploadId from response");
return match[1];
}
async function uploadPart(aws, bucket, region, key, uploadId, partNumber, body, length, endpoint) {
const url = `${s3Url(bucket, key, region, endpoint)}?partNumber=${partNumber}&uploadId=${encodeURIComponent(uploadId)}`;
const resp = await aws.fetch(url, {
method: "PUT",
headers: { "Content-Length": String(length) },
body,
});
if (!resp.ok) {
const text = await resp.text();
throw new Error(`Upload part ${partNumber} failed (${resp.status}): ${text}`);
}
const etag = resp.headers.get("ETag");
return etag;
}
async function completeMultipart(aws, bucket, region, key, uploadId, parts, endpoint) {
const partsXml = parts
.map((p) => `<Part><PartNumber>${p.partNumber}</PartNumber><ETag>${p.etag}</ETag></Part>`)
.join("");
const xmlBody = `<CompleteMultipartUpload>${partsXml}</CompleteMultipartUpload>`;
const url = `${s3Url(bucket, key, region, endpoint)}?uploadId=${encodeURIComponent(uploadId)}`;
const resp = await aws.fetch(url, {
method: "POST",
headers: { "Content-Type": "application/xml" },
body: xmlBody,
});
if (!resp.ok) {
const text = await resp.text();
throw new Error(`Complete multipart failed (${resp.status}): ${text}`);
}
return resp;
}
async function abortMultipart(aws, bucket, region, key, uploadId, endpoint) {
const url = `${s3Url(bucket, key, region, endpoint)}?uploadId=${encodeURIComponent(uploadId)}`;
try {
await aws.fetch(url, { method: "DELETE" });
} catch {
// best-effort cleanup
}
}
/**
* Read from a ReadableStream in ≥ MIN_PART_SIZE chunks and upload each as an
* S3 multipart part. Memory usage stays bounded to ~MIN_PART_SIZE at a time.
*/
async function multipartStreamUpload(aws, bucket, region, key, stream, contentType, endpoint) {
const uploadId = await initiateMultipart(aws, bucket, region, key, contentType, endpoint);
const parts = [];
let partNumber = 1;
let buffer = [];
let bufferSize = 0;
const reader = stream.getReader();
try {
while (true) {
const { done, value } = await reader.read();
if (done) break;
buffer.push(value);
bufferSize += value.byteLength;
// Flush when we've accumulated enough for a part
if (bufferSize >= MIN_PART_SIZE) {
const blob = new Blob(buffer);
const etag = await uploadPart(aws, bucket, region, key, uploadId, partNumber, blob, bufferSize, endpoint);
parts.push({ partNumber, etag });
partNumber++;
buffer = [];
bufferSize = 0;
}
}
// Upload remaining bytes as the final part
if (bufferSize > 0) {
const blob = new Blob(buffer);
const etag = await uploadPart(aws, bucket, region, key, uploadId, partNumber, blob, bufferSize, endpoint);
parts.push({ partNumber, etag });
}
await completeMultipart(aws, bucket, region, key, uploadId, parts, endpoint);
} catch (err) {
await abortMultipart(aws, bucket, region, key, uploadId, endpoint);
throw err;
}
}
// ---------------------------------------------------------------------------
// Main handler
// ---------------------------------------------------------------------------
export default {
async fetch(request, env) {
// ---- Method check ----
if (request.method !== "POST") {
return textResponse("Method Not Allowed. Use POST.", 405);
}
// ---- Auth check ----
const authHeader = request.headers.get("Authorization") || "";
const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
if (!token || token !== env.AUTH_TOKEN) {
return textResponse("Unauthorized", 401);
}
// ---- Content-Type check ----
const ct = request.headers.get("Content-Type") || "";
if (!ct.includes("application/json")) {
return textResponse("Content-Type must be application/json", 415);
}
// ---- Parse body ----
let payload;
try {
payload = await request.json();
} catch {
return textResponse("Invalid JSON body", 400);
}
const { download_url, user_agent, key_prefix } = payload;
if (!download_url || !user_agent) {
return jsonResponse(
{ error: "'download_url' and 'user_agent' are required." },
400,
);
}
// Validate the download URL
let parsedUrl;
try {
parsedUrl = new URL(download_url);
if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new Error();
} catch {
return jsonResponse({ error: "Invalid download_url" }, 400);
}
// ---- Fetch the source file (streaming) ----
let sourceResp;
try {
sourceResp = await fetch(download_url, {
headers: { "User-Agent": user_agent },
redirect: "follow",
});
} catch (err) {
return jsonResponse({ error: `Download failed: ${err.message}` }, 502);
}
if (!sourceResp.ok) {
return jsonResponse(
{ error: `Source returned HTTP ${sourceResp.status}` },
502,
);
}
// ---- Prepare S3 parameters ----
const bucket = env.S3_BUCKET;
const region = env.S3_REGION || "us-west-2";
const endpoint = env.S3_ENDPOINT || "";
const key = objectKeyFromUrl(download_url, key_prefix || "");
const sourceContentType =
sourceResp.headers.get("Content-Type") || "application/octet-stream";
const contentLength = sourceResp.headers.get("Content-Length");
const aws = makeAwsClient(env);
// ---- Upload to S3 ----
try {
if (contentLength && Number(contentLength) > 0) {
// Known size → single streaming PUT (zero extra memory)
await putObjectStreaming(
aws,
bucket,
region,
key,
sourceResp.body,
contentLength,
sourceContentType,
endpoint,
);
} else {
// Unknown size → multipart streaming upload (≤ 5 MiB buffer)
await multipartStreamUpload(
aws,
bucket,
region,
key,
sourceResp.body,
sourceContentType,
endpoint,
);
}
} catch (err) {
return jsonResponse({ error: `S3 upload failed: ${err.message}` }, 502);
}
return jsonResponse({
ok: true,
bucket,
key,
content_type: sourceContentType,
...(contentLength ? { size_bytes: Number(contentLength) } : {}),
});
},
};