#!/bin/bash
# =============================================================================
# test-disk-handling-lb.sh
#
# End-to-end disk-handling test suite — Load Balancer variant.
#
# Identical to test-disk-handling.sh but submits tasks directly through the
# OVH Octavia LoadBalancer (http://<LB_ENDPOINT>) instead of a kubectl
# port-forward.  This tests that the LB correctly forwards POST request bodies
# to the Funnel HTTP server.
#
# What it does:
#   0. Records current Cinder volumes (baseline)
#   1. Sets Karpenter consolidateAfter=Never (freeze — prevents mid-test consolidation)
#   2. Submits 3 small Funnel tasks (quick, low resources):
#        A. hello        — basic pull + run (confirms containerd works)
#        B. disk-write   — writes 5GB, confirms Cinder LV is the data disk
#        C. nfs-check    — reads /mnt/shared, confirms NFS keepalive works
#   3. Waits for all tasks to complete
#   4. On the live worker node (via kubectl exec into funnel-disk-setup pod):
#        - Checks /var/lib/containerd is a symlink → /var/funnel-work/containerd
#        - Reports df for /var/funnel-work and /dev/sda1
#        - Checks autoscaler service status
#   5. Sets consolidateAfter=1m to trigger node drain, waits for node removal
#   6. Checks that the node is gone and all Cinder volumes are deleted
#   7. Restores consolidateAfter to 5m
#
# Usage:
#   ./test-disk-handling-lb.sh [--no-restore]
#
# Requirements:
#   - KUBECONFIG set or ~/.kube/ovh-tes.yaml present
#   - openstack CLI available + creds in env.variables
#   - LB_ENDPOINT set in env.variables (or FUNNEL_SERVER in environment)
# =============================================================================
# Strict mode: exit on error, on use of unset variables, and on pipeline failure.
set -euo pipefail

# Resolve all paths relative to this script so it works from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
EXAMPLES_DIR="${SCRIPT_DIR}/funnel_examples"
ENV_FILE="${SCRIPT_DIR}/env.variables"

# ─── Config ─────────────────────────────────────────────────────────────────
KUBECONFIG="${KUBECONFIG:-${HOME}/.kube/ovh-tes.yaml}"
export KUBECONFIG

# FUNNEL_SERVER is derived from LB_ENDPOINT (set in env.variables).
# env.variables is sourced early enough that LB_ENDPOINT is available here;
# if FUNNEL_SERVER is already set in the environment it takes precedence.
# The value is resolved after load_openstack() so that env.variables has been
# read; we do a late-binding default instead.
FUNNEL_SERVER="${FUNNEL_SERVER:-}"   # resolved after load_openstack()
FUNNEL_NAMESPACE="funnel"            # namespace of the funnel-disk-setup DaemonSet
NODEPOOL_NAME="workers"              # Karpenter NodePool patched during the test

TEST_CONSOLIDATE_AFTER="Never"  # freeze consolidation while tasks are running
CLEANUP_CONSOLIDATE_AFTER="1m" # switched to this after tasks complete to trigger Step 6 cleanup
PROD_CONSOLIDATE_AFTER="5m"    # restored at the end (or on trap)

TASK_TIMEOUT_SEC=600           # 10 min max per task
NODE_DRAIN_WAIT_SEC=480        # 8 min: 1m consolidateAfter + drain ~30s + up to 360s Karpenter
                               # Cinder cleanup + OVH delete ~30s + margin
CLEANUP_WAIT_SEC=60            # 1 min extra wait after node gone (volumes deleted before VM now)

# Optional first CLI arg: "--no-restore" skips the consolidateAfter restore in the EXIT trap.
NO_RESTORE="${1:-}"

# ─── Colours ────────────────────────────────────────────────────────────────
# ANSI escape sequences for the log helpers below; NC resets the colour.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m'

# Log helpers — each prints a single tagged, colourised line built from all
# arguments.  printf '%b' expands the escape sequences exactly like `echo -e`.
ok()   { printf '%b\n' "${GREEN}[OK]${NC}    $*"; }
warn() { printf '%b\n' "${YELLOW}[WARN]${NC}  $*"; }
fail() { printf '%b\n' "${RED}[FAIL]${NC}  $*"; }
info() { printf '%b\n' "        $*"; }

# ─── Cleanup trap ────────────────────────────────────────────────────────────
# EXIT trap: put Karpenter's consolidateAfter back to the production value on
# every exit path, unless the caller opted out with --no-restore.
cleanup() {
  [[ "$NO_RESTORE" == "--no-restore" ]] && return 0
  printf '\n'
  echo "Restoring consolidateAfter to ${PROD_CONSOLIDATE_AFTER}..."
  # Best-effort: a failed patch is reported but never aborts the trap.
  kubectl patch nodepool "${NODEPOOL_NAME}" --type=merge \
    -p "{\"spec\":{\"disruption\":{\"consolidateAfter\":\"${PROD_CONSOLIDATE_AFTER}\"}}}" \
    2>/dev/null || warn "Could not restore consolidateAfter — do it manually!"
  ok "consolidateAfter restored to ${PROD_CONSOLIDATE_AFTER}"
}
trap cleanup EXIT

# ─── OpenStack auth ──────────────────────────────────────────────────────────
# Sources env.variables (OS_* credentials + LB_ENDPOINT) and exports the
# OpenStack auth variables.  A missing file is non-fatal: the script degrades
# to skipping all Cinder volume checks.
load_openstack() {
  if [[ ! -f "$ENV_FILE" ]]; then
    warn "env.variables not found — Cinder checks will be skipped"
    return 0
  fi
  # shellcheck source=/dev/null
  source "$ENV_FILE"
  export OS_AUTH_URL OS_TENANT_ID OS_USERNAME OS_PASSWORD OS_REGION_NAME \
         OS_USER_DOMAIN_NAME OS_PROJECT_DOMAIN_NAME
  export OS_IDENTITY_API_VERSION=3
}

# ─── Helper: submit a Funnel TES task, return task ID ───────────────────────
# Prints the new task ID on stdout; all diagnostics go to stderr.
# Returns 1 on missing file, non-200 response, or a response without an ID.
# NOTE: does NOT use curl -f so that HTTP error bodies are visible on failure.
submit_task() {
  local task_file="$1"
  local task_name="$2"
  local resp http_status body task_id
  if [[ ! -f "$task_file" ]]; then
    echo "[submit_task] ERROR: ${task_name} — task file not found: ${task_file}" >&2
    return 1
  fi
  # Use --data-binary (not -d) to preserve newlines and send correct Content-Length.
  # The OVH LB WAF can RST connections for -d @file if the stripped body triggers
  # a rule; --data-binary sends the file verbatim.
  resp=$(curl -s -w "\nHTTP_STATUS:%{http_code}" \
    -X POST "${FUNNEL_SERVER}/v1/tasks" \
    -H "Content-Type: application/json" \
    --data-binary @"${task_file}" 2>&1)
  # Extract the status trailer with sed and apply an explicit default.
  # The previous `grep | cut || echo 000` only produced "000" because pipefail
  # happened to be inherited into the command substitution — fragile.
  http_status=$(printf '%s\n' "$resp" | sed -n 's/^HTTP_STATUS://p')
  http_status="${http_status:-000}"
  # `|| true` guards set -e: grep -v exits non-zero when the body is empty
  # (e.g. a 200 with no payload), which previously aborted the whole script
  # silently instead of reaching the "no task ID" diagnostic below.
  body=$(printf '%s\n' "$resp" | grep -v 'HTTP_STATUS:' || true)
  if [[ "$http_status" != "200" ]]; then
    echo "[submit_task] ERROR: ${task_name} — HTTP ${http_status}" >&2
    echo "[submit_task] Response: ${body:0:500}" >&2
    return 1
  fi
  task_id=$(printf '%s\n' "$body" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])" 2>/dev/null || true)
  if [[ -z "$task_id" ]]; then
    echo "[submit_task] ERROR: ${task_name} — HTTP 200 but no task ID in response" >&2
    echo "[submit_task] Response: ${body:0:500}" >&2
    return 1
  fi
  echo "$task_id"
}

# ─── Helper: poll until task reaches terminal state ─────────────────────────
# Polls the Funnel API every 10 s.  Returns 0 on COMPLETE, 1 on
# EXECUTOR_ERROR / SYSTEM_ERROR / CANCELED, 2 on timeout.
wait_for_task() {
  local task_id="$1" timeout="$2"
  local waited=0 task_state=""
  while (( waited < timeout )); do
    # Unreachable API or unparsable JSON both fold into "UNKNOWN" and we keep polling.
    task_state=$(curl -sf "${FUNNEL_SERVER}/v1/tasks/${task_id}?view=MINIMAL" \
      | python3 -c "import sys,json; print(json.load(sys.stdin)['state'])" 2>/dev/null || echo "UNKNOWN")
    if [[ "$task_state" == "COMPLETE" ]]; then
      return 0
    fi
    if [[ "$task_state" == "EXECUTOR_ERROR" || "$task_state" == "SYSTEM_ERROR" || "$task_state" == "CANCELED" ]]; then
      return 1
    fi
    sleep 10
    waited=$(( waited + 10 ))
  done
  warn "Task ${task_id} timed out after ${timeout}s (state: ${task_state})"
  return 2
}

# ─── Helper: monitor disk-write by watching /var/funnel-work usage ───────────
# Unlike wait_for_task(), this has no fixed deadline — the watchdog resets every
# time disk usage increases, so a slow but progressing write never times out.
# Returns 0=COMPLETE, 1=task error, 2=stalled (no disk growth) or hard-cap hit.
# Side effects: may re-bind the global WORKER_NODE_NAME (if the task lands on a
# different node) and appends to TRACKED_VOLUMES via accumulate_new_volumes().
monitor_disk_write() {
  local task_id="$1"
  local ds_pod="$2"
  local stall_sec=120       # seconds with no growth → declare stuck
  local hard_cap_sec=3600  # absolute ceiling: 1 hour
  local stall_max=$(( stall_sec / 10 ))   # no-growth polls allowed (one poll per 10 s)
  local stall_polls=0
  local last_used=0
  local peak_used=0
  local state=""
  local elapsed=0
  local cur_used
  local _df_raw="" _df_rc=0 _new_pod=""
  local _vol_poll_count=0
  local prev_state=""

  info "  Monitoring /var/funnel-work usage (stuck if no growth for ${stall_sec}s)..."

  while [[ $elapsed -lt $hard_cap_sec ]]; do
    # Unreachable API / unparsable JSON → "UNKNOWN"; treated as non-terminal.
    state=$(curl -sf "${FUNNEL_SERVER}/v1/tasks/${task_id}?view=MINIMAL" \
      | python3 -c "import sys,json; print(json.load(sys.stdin)['state'])" \
      2>/dev/null || echo "UNKNOWN")
    case "$state" in
      COMPLETE)                              return 0 ;;
      EXECUTOR_ERROR|SYSTEM_ERROR|CANCELED)  return 1 ;;
      QUEUED|INITIALIZING)
        # Task hasn't started yet — don't burn the hard cap or the stall counter.
        # Bump hard_cap_sec by the poll interval so waiting-to-schedule time is free.
        hard_cap_sec=$(( hard_cap_sec + 10 ))
        info "  Task still ${state} — waiting for scheduler  (cap extended to ${hard_cap_sec}s)"
        _vol_poll_count=$(( _vol_poll_count + 1 ))
        # Poll OpenStack while waiting — a second node may have been provisioned for
        # this task (e.g. node 1 consolidated while B was QUEUED; Karpenter spawned
        # node 2 whose init disk must be tracked before its cleanup.sh deletes it).
        (( _vol_poll_count % 6 == 0 )) && accumulate_new_volumes
        prev_state="$state"
        sleep 10
        elapsed=$(( elapsed + 10 ))
        continue
        ;;
    esac

    # ── QUEUED/INITIALIZING → RUNNING transition: re-discover worker node ────
    # The task may have been scheduled on a DIFFERENT node than WORKER_NODE_NAME
    # (node 1 may have consolidated while B was QUEUED; Karpenter provisioned node 2).
    # Re-bind ds_pod and WORKER_NODE_NAME so disk monitoring and volume tracking
    # target the correct node for the rest of this monitoring loop.
    if [[ ("$prev_state" == "QUEUED" || "$prev_state" == "INITIALIZING") && "$state" == "RUNNING" ]]; then
      info "  Task transitioned ${prev_state} → RUNNING — re-discovering worker node..."
      local _new_worker
      _new_worker=$(kubectl get nodes -l "karpenter.sh/nodepool=${NODEPOOL_NAME}" \
        --no-headers -o name 2>/dev/null | head -1 || true)
      if [[ -n "$_new_worker" ]]; then
        local _new_worker_name="${_new_worker#node/}"
        if [[ "$_new_worker_name" != "$WORKER_NODE_NAME" ]]; then
          info "  Worker node changed: ${WORKER_NODE_NAME} → ${_new_worker_name}"
          WORKER_NODE_NAME="$_new_worker_name"
        fi
        # Accumulate init disk of whichever node is currently running the task.
        accumulate_new_volumes
        local _trans_pod
        _trans_pod=$(kubectl get pods -n "${FUNNEL_NAMESPACE}" -l app=funnel-disk-setup \
          --field-selector "spec.nodeName=${WORKER_NODE_NAME}" \
          --no-headers -o name 2>/dev/null | head -1 || true)
        if [[ -n "$_trans_pod" ]]; then
          [[ "$_trans_pod" != "$ds_pod" ]] && info "  Updated DS pod: ${_trans_pod} (was ${ds_pod:-none})"
          ds_pod="$_trans_pod"
        else
          info "  No DS pod on ${WORKER_NODE_NAME} yet — setup.sh may still be running"
        fi
      else
        info "  No karpenter worker node found yet — will retry next poll"
      fi
    fi
    prev_state="$state"

    if [[ -n "$ds_pod" ]]; then
      # Capture stdout+stderr together so nsenter errors are visible (not silenced).
      # Use -P (POSIX) so long LVM device names don't cause df to wrap onto a
      # second line, which would put the header at NR==2 and yield 'Used' instead
      # of a number.
      _df_raw=$(kubectl exec -n "${FUNNEL_NAMESPACE}" "${ds_pod}" -- \
        nsenter -t 1 --mount -- df -P /var/funnel-work 2>&1) \
        && _df_rc=0 || _df_rc=$?

      if [[ $_df_rc -eq 0 ]]; then
        # Field 3 of the data row is "Used" in 1K blocks.
        cur_used=$(echo "$_df_raw" | awk 'NR==2{print $3}')
      else
        # Show the actual error, then re-query in case the pod restarted.
        info "  exec/nsenter failed (rc=${_df_rc}): ${_df_raw:0:200}  [${state}]"

        # Check whether the worker node itself is still present.
        local _node_alive
        _node_alive=$(kubectl get node "${WORKER_NODE_NAME}" \
          --no-headers -o name 2>/dev/null || true)
        if [[ -z "$_node_alive" ]]; then
          # Node is gone — Karpenter consolidated it.  Grab volumes NOW while
          # OpenStack may still show the volume as 'in-use' before cleanup.sh
          # deletes it; fast-poll window (60 s) is often too late.
          accumulate_new_volumes
          # Fast-poll task state for up to 60s so Funnel catches up.
          info "  Worker node ${WORKER_NODE_NAME} gone — fast-polling task state..."
          local _fp=0
          while [[ $_fp -lt 60 ]]; do
            state=$(curl -sf "${FUNNEL_SERVER}/v1/tasks/${task_id}?view=MINIMAL" \
              | python3 -c "import sys,json; print(json.load(sys.stdin)['state'])" \
              2>/dev/null || echo "UNKNOWN")
            case "$state" in
              COMPLETE)                             return 0 ;;
              EXECUTOR_ERROR|SYSTEM_ERROR|CANCELED) return 1 ;;
            esac
            sleep 5; _fp=$(( _fp + 5 ))
          done
          warn "Node gone but task still ${state} after 60s — treating as failure"
          return 2
        fi

        # Node alive but exec failed — likely a transient pod restart.
        # Re-query DS pod; don't increment stall counter.
        _new_pod=$(kubectl get pods -n "${FUNNEL_NAMESPACE}" -l app=funnel-disk-setup \
          --field-selector "spec.nodeName=${WORKER_NODE_NAME}" \
          --no-headers -o name 2>/dev/null | head -1 || true)
        if [[ -n "$_new_pod" && "$_new_pod" != "$ds_pod" ]]; then
          info "  DS pod restarted: ${ds_pod} → ${_new_pod} — updating reference"
          ds_pod="$_new_pod"
        fi
        cur_used=""
      fi

      if [[ "$cur_used" =~ ^[0-9]+$ ]]; then
        [[ "$cur_used" -gt "$peak_used" ]] && peak_used="$cur_used"
        if [[ "$cur_used" -gt "$last_used" ]]; then
          info "  /var/funnel-work: ${cur_used}K (+$((cur_used - last_used))K) ↑  [${state}]"
          last_used="$cur_used"
          stall_polls=0
        elif [[ "$peak_used" -gt 0 && "$cur_used" -lt $(( peak_used / 2 )) ]]; then
          # Usage dropped >50% of peak → file removed after write; wait for COMPLETE
          info "  /var/funnel-work: ${cur_used}K ↓ (cleanup after write)  [${state}]"
          stall_polls=0
        else
          stall_polls=$(( stall_polls + 1 ))
          info "  /var/funnel-work: ${cur_used}K (no growth ${stall_polls}/${stall_max})  [${state}]"
          if [[ $stall_polls -ge $stall_max ]]; then
            warn "disk-write stuck: no growth for ${stall_sec}s (peak=${peak_used}K, now=${cur_used}K, state=${state})"
            return 2
          fi
        fi
      elif [[ -n "$cur_used" ]]; then
        # df returned something non-numeric — unexpected output, log it
        info "  unexpected df output: '${cur_used}'  [${state}]"
      fi
      # If cur_used is empty (exec failed), stall_polls is NOT incremented:
      # the task may still be progressing even though we can't measure it.
    fi

    sleep 10
    elapsed=$(( elapsed + 10 ))
    _vol_poll_count=$(( _vol_poll_count + 1 ))
    # Every ~60 s, poll OpenStack to pick up expand volumes (or a second node's init disk).
    (( _vol_poll_count % 6 == 0 )) && accumulate_new_volumes
  done
  warn "disk-write hit hard cap of ${hard_cap_sec}s (state: ${state})"
  return 2
}

# ─── Helper: get Cinder volume IDs (funnel-* only) ──────────────────────────
# Emits one "ID NAME STATUS" line per funnel-managed volume.  Exit status
# follows grep: non-zero when nothing matches (callers append `|| true`).
list_cinder_volumes() {
  openstack volume list --format value -c ID -c Name -c Status 2>/dev/null |
    grep "funnel-" |
    awk '{ print $1, $2, $3 }'
}

# ─── Helper: accumulate new funnel-* Cinder volumes into TRACKED_VOLUMES ─────
# Called at key checkpoints to pick up volumes from ANY node — including nodes
# that provisioned and consolidated before this call (e.g. a first node that ran
# Tasks A/C/D and was consolidated while Task B was still QUEUED/RUNNING, or an
# expand volume added by autoscaler.sh mid-write).
# TRACKED_VOLUMES is a newline-separated list of Cinder UUIDs; all must be gone
# by the end of Step 6 (Karpenter Delete() hook runs cleanup.sh).
# Globals: reads BASELINE_VOLUMES (Step 0 snapshot); appends to TRACKED_VOLUMES.
accumulate_new_volumes() {
  # No openstack CLI → volume tracking is disabled; succeed silently.
  command -v openstack &>/dev/null || return 0
  local _cur _new_ids _vid
  _cur=$(list_cinder_volumes | awk '{print $1}' || true)
  [[ -z "$_cur" ]] && return 0
  # Diff against pre-test baseline so pre-existing orphans aren't counted.
  # comm(1) requires both inputs sorted, hence the explicit sorts.
  _new_ids=$(comm -13 \
    <(echo "$BASELINE_VOLUMES" | awk '{print $1}' | sort) \
    <(echo "$_cur" | sort))
  [[ -z "$_new_ids" ]] && return 0
  while IFS= read -r _vid; do
    [[ -z "$_vid" ]] && continue
    # -x (whole line) + -F (literal) prevent one UUID matching inside another.
    if ! grep -qxF "$_vid" <<< "${TRACKED_VOLUMES}" 2>/dev/null; then
      info "  [volume track] New funnel volume: ${_vid}"
      TRACKED_VOLUMES="${TRACKED_VOLUMES:+${TRACKED_VOLUMES}$'\n'}${_vid}"
    fi
  done <<< "$_new_ids"
}

# =============================================================================
echo ""
echo "================================================================="
echo "  Funnel Disk-Handling Test Suite  [LoadBalancer variant]"
echo "  $(date)"
echo "================================================================="
echo ""

# ─── Resolve FUNNEL_SERVER from env.variables LB_ENDPOINT ───────────────────
# load_openstack() sources env.variables which sets LB_ENDPOINT.
# We call it here early (before Step 0 banner) so FUNNEL_SERVER is ready.
load_openstack
# Precedence: explicit FUNNEL_SERVER env var > http://<LB_ENDPOINT> > abort.
if [[ -z "$FUNNEL_SERVER" ]]; then
  if [[ -n "${LB_ENDPOINT:-}" ]]; then
    FUNNEL_SERVER="http://${LB_ENDPOINT}"
  else
    fail "FUNNEL_SERVER not set and LB_ENDPOINT not found in env.variables."
    exit 1
  fi
fi

# ─── LB connectivity check ───────────────────────────────────────────────────
echo "── LB connectivity: ${FUNNEL_SERVER} ─────────────────────────────"
# Bound each probe with --max-time so a black-holing LB cannot hang a single
# curl attempt indefinitely (the old loop claimed "15s" but an unbounded curl
# could stall one attempt for minutes).  15 bounded attempts, 1 s apart.
LB_WAIT=0
until curl -sf --max-time 5 "${FUNNEL_SERVER}/v1/tasks" >/dev/null 2>&1; do
  LB_WAIT=$((LB_WAIT+1))
  if [[ $LB_WAIT -ge 15 ]]; then
    fail "LoadBalancer at ${FUNNEL_SERVER} not reachable after ${LB_WAIT} attempts."
    exit 1
  fi
  sleep 1
done
# Verify POST body is forwarded correctly by submitting a minimal probe task
# and confirming we get back a task ID (not an empty body or connection reset).
# Uses submit_task() so the same --data-binary path is exercised as real tasks.
_probe_json=$(mktemp /tmp/lb-probe-XXXXXX.json)
echo '{"name":"lb-probe","executors":[{"image":"alpine","command":["true"]}]}' > "$_probe_json"
# Stderr is captured into _probe_id so the diagnostic can be shown on failure.
_probe_id=$(submit_task "$_probe_json" "lb-probe" 2>&1) && _probe_ok=true || _probe_ok=false
rm -f "$_probe_json"
if [[ "$_probe_ok" == "true" && -n "$_probe_id" ]]; then
  # Cancel immediately — connectivity check only, not a real task.
  # Leaving it QUEUED would trigger Karpenter to provision a worker node early,
  # which races with Step 3 and contaminates the Cinder volume baseline.
  curl -s --max-time 10 -X POST "${FUNNEL_SERVER}/v1/tasks/${_probe_id}:cancel" >/dev/null 2>&1 || true
  ok "LB POST test OK — probe task ${_probe_id} created and cancelled."
else
  fail "LB POST body dropped or rejected. LB not forwarding POST bodies correctly."
  fail "submit_task error: ${_probe_id}"
  fail "Use test-disk-handling.sh (port-forward variant) instead."
  exit 1
fi
echo ""


# ─── Step 0: Baseline Cinder volumes ─────────────────────────────────────────
echo "── Step 0: Baseline Cinder volumes ──────────────────────────────"
# (OpenStack creds already loaded above for LB_ENDPOINT resolution)
# BASELINE_VOLUMES snapshots "ID Name Status" lines before the test starts;
# accumulate_new_volumes() diffs against it so leftovers from previous runs
# are never counted as volumes created by THIS run.
BASELINE_VOLUMES=""
if command -v openstack &>/dev/null; then
  BASELINE_VOLUMES=$(list_cinder_volumes || true)
  if [[ -n "$BASELINE_VOLUMES" ]]; then
    BASELINE_COUNT=$(echo "$BASELINE_VOLUMES" | wc -l)
    warn "${BASELINE_COUNT} pre-existing funnel Cinder volume(s) found:"
    echo "$BASELINE_VOLUMES" | while read -r line; do info "  $line"; done
    echo ""
    echo "  These are LEFTOVER volumes from previous runs (cleanup bug)."
    echo "  Test will track NEW volumes created during this run."
  else
    ok "No pre-existing funnel Cinder volumes — clean baseline."
  fi
else
  warn "openstack CLI not available — Cinder tracking skipped"
fi
echo ""

# ─── Step 1: Freeze Karpenter consolidation while tasks run ─────────────────
echo "── Step 1: Set Karpenter consolidateAfter=${TEST_CONSOLIDATE_AFTER} (freeze during tasks) ──"
# Never = Karpenter won't consolidate the node mid-test (e.g. when tasks A/C/D
# finish quickly but B is still QUEUED).  We'll switch to ${CLEANUP_CONSOLIDATE_AFTER}
# after all tasks complete so Step 6 can verify the Cinder cleanup.
# NOTE: no `|| true` here — if this patch fails the whole test is invalid, so
# set -e aborting immediately is the desired behaviour.
kubectl patch nodepool "${NODEPOOL_NAME}" --type=merge \
  -p "{\"spec\":{\"disruption\":{\"consolidateAfter\":\"${TEST_CONSOLIDATE_AFTER}\"}}}"
ok "consolidateAfter set to ${TEST_CONSOLIDATE_AFTER} — consolidation frozen until tasks complete"
echo ""

# ─── Step 2: Submit test tasks ───────────────────────────────────────────────
echo "── Step 2: Submit tasks ─────────────────────────────────────────"

# Task A: Hello world — confirms basic Funnel/nerdctl/containerd path
TASK_A_FILE="${EXAMPLES_DIR}/hello.json"
TASK_A_ID=$(submit_task "$TASK_A_FILE" "hello")
ok "Task A (hello):      ${TASK_A_ID}"

# Task B: Disk write — confirms workdir goes to Cinder LV
TASK_B_FILE="${EXAMPLES_DIR}/disk-write-test.json"
TASK_B_ID=$(submit_task "$TASK_B_FILE" "disk-write")
ok "Task B (disk-write): ${TASK_B_ID}"

# Task C: NFS check — verifies that NFS is actually accessible INSIDE the
# nerdctl container (not just in the worker pod).
#
# IMPORTANT: "ls /mnt/shared" from inside the container is NOT sufficient —
# the hostPath bind creates an empty mountpoint, so ls always succeeds even
# without a real NFS mount.
#
# The correct check is the filesystem type in /proc/mounts.  The hostPath bind
# appears as the underlying disk fs (ext4/xfs) when NFS is absent; it becomes
# nfs/nfs4 once the DaemonSet holder has completed the mount.  We also list
# /mnt/shared top-level contents for informational purposes.
# (Quoted heredoc delimiter: the JSON below is written verbatim, no expansion.)
NFS_CHECK_JSON=$(mktemp /tmp/nfs-check-XXXXXX.json)
cat > "$NFS_CHECK_JSON" << 'NFS_EOF'
{
  "name": "nfs-check",
  "description": "Verifies NFS mount is real inside the nerdctl container (not just the worker pod).",
  "resources": { "cpu_cores": 1, "ram_gb": 0.25, "disk_gb": 1.0 },
  "executors": [{
    "image": "alpine:3.19",
    "command": ["/bin/sh", "-c",
      "set -e\n\necho '=== /proc/mounts type check ==='\nif grep -qE '^[^ ]+ /mnt/shared nfs' /proc/mounts; then\n  echo '[PASS] /mnt/shared is a real NFS mount inside the container:'\n  grep '/mnt/shared' /proc/mounts\nelse\n  echo '[FAIL] /mnt/shared is NOT an NFS mount inside the container!'\n  echo '  fs type seen (should be nfs/nfs4):'\n  grep '/mnt/shared' /proc/mounts || echo '  (no /mnt/shared entry at all)'\n  exit 1\nfi\n\necho ''\necho '=== Top-level contents (informational) ==='\nls /mnt/shared | head -20 || true\n\necho ''\necho '=== NFS check PASSED ==='"],
    "workdir": "/tmp"
  }],
  "volumes": ["/mnt/shared"]
}
NFS_EOF
TASK_C_ID=$(submit_task "$NFS_CHECK_JSON" "nfs-check")
rm -f "$NFS_CHECK_JSON"
ok "Task C (nfs-check):  ${TASK_C_ID}"
echo ""

# Task D: NFS race-condition probe — submitted immediately after A/B/C, before
# the node is up, so if Karpenter provisions a NEW node for this batch we can
# confirm that wait-for-nfs actually blocks until the holder has the NFS mount.
# Uses the same NFS check but also prints the init-container timing via the
# Funnel task logs (stdout shows wall-clock time of when the task started).
NFS_RACE_JSON=$(mktemp /tmp/nfs-race-XXXXXX.json)
cat > "$NFS_RACE_JSON" << 'NFS_EOF'
{
  "name": "nfs-race-probe",
  "description": "Probes NFS availability timing on a potentially fresh node.",
  "resources": { "cpu_cores": 1, "ram_gb": 0.25, "disk_gb": 1.0 },
  "executors": [{
    "image": "alpine:3.19",
    "command": ["/bin/sh", "-c",
      "echo 'Container started at: '$(date -Iseconds)\nif grep -qE '^[^ ]+ /mnt/shared nfs' /proc/mounts; then\n  echo '[PASS] NFS confirmed (nfs/nfs4 type) inside container at container-start time.'\n  grep '/mnt/shared' /proc/mounts\nelse\n  echo '[FAIL] /mnt/shared is NOT an NFS mount inside the container!'\n  echo 'Race condition confirmed: wait-for-nfs did not block long enough.'\n  echo 'fs type seen (should be nfs/nfs4):'\n  grep '/mnt/shared' /proc/mounts || echo '(no /mnt/shared entry)'\n  exit 1\nfi"],
    "workdir": "/tmp"
  }],
  "volumes": ["/mnt/shared"]
}
NFS_EOF
TASK_D_ID=$(submit_task "$NFS_RACE_JSON" "nfs-race-probe")
rm -f "$NFS_RACE_JSON"
ok "Task D (nfs-race):   ${TASK_D_ID}"
echo ""

# ─── Step 3: Wait for node to appear ─────────────────────────────────────────
echo "── Step 3: Wait for worker node to appear ───────────────────────"
# Poll every 10 s for up to 5 minutes for a node in the Karpenter nodepool.
NODE_WAIT=0
WORKER_NODE=""
while [[ $NODE_WAIT -lt 300 ]]; do
  WORKER_NODE=$(kubectl get nodes -l "karpenter.sh/nodepool=${NODEPOOL_NAME}" \
    --no-headers -o name 2>/dev/null | head -1 || true)
  if [[ -n "$WORKER_NODE" ]]; then
    ok "Worker node appeared: ${WORKER_NODE}"
    break
  fi
  sleep 10
  NODE_WAIT=$((NODE_WAIT + 10))
done
if [[ -z "$WORKER_NODE" ]]; then
  fail "No worker node appeared within 5 minutes."
  exit 1
fi

# Wait for the funnel-disk-setup DaemonSet pod to become Ready on this node.
# The pod starts in Init:0/1 (setup.sh running: creates + attaches Cinder volume)
# and only transitions to Running AFTER setup.sh exits 0, which guarantees:
#   • /var/lib/funnel-autoscaler-volumes has been written with the initial volume ID
#   • the autoscaler.sh systemd service has been started on the host
# The previous approach (sleep 15) was a race condition: the Node object can
# appear in Kubernetes minutes before setup.sh finishes, causing the state file
# to appear empty or missing when Step 5 reads it.
WORKER_NODE_NAME="${WORKER_NODE#node/}"   # strip kubectl's "node/" resource prefix
info "Waiting for funnel-disk-setup pod on ${WORKER_NODE_NAME} to become Ready..."
DS_POD=""
DS_PHASE=""
DS_INIT_EXIT=""
DS_WAIT=0
while [[ $DS_WAIT -lt 600 ]]; do
  DS_POD=$(kubectl get pods -n "${FUNNEL_NAMESPACE}" -l app=funnel-disk-setup \
    --field-selector "spec.nodeName=${WORKER_NODE_NAME}" \
    --no-headers -o name 2>/dev/null | head -1 || true)
  if [[ -n "$DS_POD" ]]; then
    DS_PHASE=$(kubectl get -n "${FUNNEL_NAMESPACE}" "${DS_POD}" \
      -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
    DS_INIT_EXIT=$(kubectl get -n "${FUNNEL_NAMESPACE}" "${DS_POD}" \
      -o jsonpath='{.status.initContainerStatuses[0].state.terminated.exitCode}' \
      2>/dev/null || echo "")
    # Either signal is sufficient: the main container is Running, or the init
    # container (setup.sh) has terminated with exit code 0.
    if [[ "$DS_PHASE" == "Running" || "$DS_INIT_EXIT" == "0" ]]; then
      ok "funnel-disk-setup pod Ready: ${DS_POD} (phase=${DS_PHASE})"
      break
    fi
    info "  ${DS_POD}: phase=${DS_PHASE:-Pending} init-exit=${DS_INIT_EXIT:-running} — waiting..."
  else
    info "  no funnel-disk-setup pod on ${WORKER_NODE_NAME} yet — waiting..."
  fi
  sleep 10
  DS_WAIT=$((DS_WAIT + 10))
done
if [[ -z "$DS_POD" ]]; then
  fail "No funnel-disk-setup pod appeared on ${WORKER_NODE_NAME} within 10 min — aborting."
  exit 1
fi
if [[ "$DS_PHASE" != "Running" && "$DS_INIT_EXIT" != "0" ]]; then
  warn "funnel-disk-setup pod not Ready after 10 min — state file may be incomplete."
fi

# Accumulate the init disk created during node provisioning.
# Expand volumes (added by autoscaler.sh during Task B) and any second-node
# disks are picked up every 60 s inside monitor_disk_write(), with a final
# sweep immediately after it returns.
TRACKED_VOLUMES=""
accumulate_new_volumes
if [[ -z "$TRACKED_VOLUMES" ]]; then
  info "OpenStack: no new funnel-* volumes visible yet (volume may still be attaching)."
fi
echo ""

# ─── Step 4a: Wait for Task A (hello) — confirms node is up and working ──────
echo "── Step 4: Wait for tasks to complete ──────────────────────────"
TASK_A_OK=false; TASK_B_OK=false; TASK_C_OK=false; TASK_D_OK=false

# Trailing `|| true` keeps set -e from aborting when wait_for_task returns non-zero.
wait_for_task "$TASK_A_ID" $TASK_TIMEOUT_SEC && TASK_A_OK=true || true
[[ "$TASK_A_OK" == "true" ]] && ok "Task A (hello)       COMPLETE" || fail "Task A (hello)       FAILED"

# ─── Step 5: Inspect the live node via the DaemonSet pod ─────────────────────
# IMPORTANT: Step 5 runs here, BEFORE Task B (disk-write) completes.
# Task B keeps the node alive and prevents consolidation during inspection.
# With consolidateAfter=1m, the window after all tasks complete is too narrow.
echo ""
echo "── Step 5: Inspect node disk layout ───────────────────────────"
# DS_POD was located and confirmed Ready in Step 3 — reuse it.
# Re-verify it is still Running in case of an unexpected pod restart between steps.
if [[ -n "$DS_POD" ]]; then
  _pod_phase=$(kubectl get -n "${FUNNEL_NAMESPACE}" "${DS_POD}" \
    -o jsonpath='{.status.phase}' 2>/dev/null || echo "gone")
  if [[ "$_pod_phase" != "Running" ]]; then
    warn "DS pod ${DS_POD} no longer Running (phase=${_pod_phase}) — re-querying..."
    DS_POD=$(kubectl get pods -n "${FUNNEL_NAMESPACE}" -l app=funnel-disk-setup \
      --field-selector "spec.nodeName=${WORKER_NODE_NAME}" \
      --no-headers -o name 2>/dev/null | head -1 || true)
  fi
fi

if [[ -n "$DS_POD" ]]; then
  info "DaemonSet pod: ${DS_POD}"
  echo ""
  echo "  ── df on host (via nsenter) ──"
  # nsenter -t 1 --mount runs df in the HOST's mount namespace, not the pod's.
  kubectl exec -n "${FUNNEL_NAMESPACE}" "${DS_POD}" -- \
    nsenter -t 1 --mount -- df -h /var/funnel-work /dev/sda1 2>/dev/null \
    | sed 's/^/    /' || warn "df failed"

  echo ""
  echo "  ── Cinder LV block device ──"
  # The device backing /var/funnel-work, read from the host's /proc/mounts.
  LV_DEV=$(kubectl exec -n "${FUNNEL_NAMESPACE}" "${DS_POD}" -- \
    nsenter -t 1 --mount -- bash -c \
    "grep '/var/funnel-work' /proc/mounts | awk '{print \$1}'" 2>/dev/null || true)
  if [[ -n "$LV_DEV" ]]; then
    ok "Cinder LV mounted at /var/funnel-work: ${LV_DEV}"
  else
    fail "No Cinder LV mounted at /var/funnel-work — setup.sh failed"
  fi

  echo ""
  echo "  ── containerd symlink check ──"
  # readlink -f resolves the full symlink chain; expected target is under
  # /var/funnel-work (i.e. containerd's data lives on the Cinder LV).
  CONTAINERD_PATH=$(kubectl exec -n "${FUNNEL_NAMESPACE}" "${DS_POD}" -- \
    nsenter -t 1 --mount -- readlink -f /var/lib/containerd 2>/dev/null || echo "NOT_FOUND")
  if echo "$CONTAINERD_PATH" | grep -q "/var/funnel-work"; then
    ok "containerd is on Cinder LV: ${CONTAINERD_PATH}"
  else
    fail "containerd is NOT on Cinder LV — still at ${CONTAINERD_PATH} (v20 fix not active)"
  fi

  echo ""
  echo "  ── autoscaler + cleanup service status ──"
  # funnel-disk-cleanup is a systemd oneshot service invoked via preStop hook at
  # node shutdown. Seeing cleanup=inactive HERE is EXPECTED and correct — it means
  # the service has not run yet (node is still up).  Only cleanup=failed would
  # indicate a problem from a previous shutdown cycle.
  # `paste - -` joins systemctl's two output lines onto one for the awk summary.
  kubectl exec -n "${FUNNEL_NAMESPACE}" "${DS_POD}" -- \
    nsenter -t 1 --mount --pid -- systemctl is-active funnel-disk-autoscaler funnel-disk-cleanup 2>/dev/null \
    | paste - - | awk '{print "    autoscaler="$1"  cleanup="$2" (inactive=normal for oneshot)"}' \
    || warn "systemctl check failed"

  echo ""
  echo "  ── Volume state file (authoritative Cinder volume list) ──"
  # This is the DEFINITIVE list of volumes managed on this node.
  # setup.sh writes the initial volume ID here; autoscaler.sh appends extras.
  # cleanup.sh reads this to know what to detach+delete on shutdown.
  STATE_FILE_CONTENT=$(kubectl exec -n "${FUNNEL_NAMESPACE}" "${DS_POD}" -- \
    nsenter -t 1 --mount -- cat /var/lib/funnel-autoscaler-volumes 2>/dev/null || true)
  if [[ -n "$STATE_FILE_CONTENT" ]]; then
    VOL_COUNT=$(echo "$STATE_FILE_CONTENT" | grep -c '[a-f0-9-]' || true)
    ok "State file contains ${VOL_COUNT} volume ID(s) on this node (cross-checked against accumulated tracking):"
    _sf_added=0
    while IFS= read -r vid; do
      [[ -z "$vid" ]] && continue
      # Show current OpenStack status for each ID
      if command -v openstack &>/dev/null; then
        VOL_STATUS=$(openstack volume show "$vid" -f value -c status 2>/dev/null || echo "NOT_FOUND")
        info "  ${vid}  [OpenStack status: ${VOL_STATUS}]"
      else
        info "  ${vid}"
      fi
      # Merge any ID not yet in TRACKED_VOLUMES (e.g. volume still 'creating'
      # when the last accumulate_new_volumes call ran).
      if ! grep -qxF "$vid" <<< "${TRACKED_VOLUMES}" 2>/dev/null; then
        TRACKED_VOLUMES="${TRACKED_VOLUMES:+${TRACKED_VOLUMES}$'\n'}${vid}"
        _sf_added=$(( _sf_added + 1 ))
      fi
    done <<< "$STATE_FILE_CONTENT"
    [[ $_sf_added -gt 0 ]] && info "  Merged ${_sf_added} volume(s) from state file into tracking."
  else
    fail "State file /var/lib/funnel-autoscaler-volumes is empty or missing — setup.sh did not complete!"
    # Keep whatever was accumulated via OpenStack polling — do NOT clear.
  fi
else
  warn "No funnel-disk-setup pod found — cannot inspect node"
fi
echo ""

# ─── Step 4b: Wait for remaining tasks (B/C/D) to complete ──────────────────
# Task B: monitor by disk-usage growth rather than a fixed deadline.
# The write is 5GB (needed to trigger the Cinder autoscaler) and may take
# several minutes; the monitor resets its stall timer on every new byte written.
monitor_disk_write "$TASK_B_ID" "$DS_POD" && TASK_B_OK=true || TASK_B_OK=false
# Final volume sweep: picks up any expand volumes not yet seen by the in-loop polls
# (e.g. autoscaler fired in the last 60 s window, or a second node's init disk).
accumulate_new_volumes
# Count the tracked volume IDs.  Must be `|| true`, NOT `|| echo 0`: grep -c
# already prints "0" before exiting non-zero, so the old `|| echo 0` fallback
# emitted a second "0" and garbled the info line whenever nothing was tracked.
TRACKED_VOL_COUNT=$(echo "$TRACKED_VOLUMES" | grep -c '[a-f0-9-]' || true)
info "Tracking ${TRACKED_VOL_COUNT} volume ID(s) (${#TRACKED_VOLUMES} byte(s) of IDs)"
[[ "$TASK_B_OK" == "true" ]] && ok "Task B (disk-write)  COMPLETE" || fail "Task B (disk-write)  FAILED (stalled or errored — see disk usage log above)"

# Trailing `|| true` keeps set -e from aborting on a failed/timed-out task.
wait_for_task "$TASK_C_ID" "$TASK_TIMEOUT_SEC" && TASK_C_OK=true || true
[[ "$TASK_C_OK" == "true" ]] && ok "Task C (nfs-check)   COMPLETE" || fail "Task C (nfs-check)   FAILED (see task logs — NFS not real inside container)"

wait_for_task "$TASK_D_ID" "$TASK_TIMEOUT_SEC" && TASK_D_OK=true || true
[[ "$TASK_D_OK" == "true" ]] && ok "Task D (nfs-race)    COMPLETE" || fail "Task D (nfs-race)    FAILED (wait-for-nfs race condition still present)"

# Print stderr/stdout for C and D to help diagnose NFS issues
for ID_LABEL in "${TASK_C_ID}:C-nfs-check" "${TASK_D_ID}:D-nfs-race"; do
  ID="${ID_LABEL%%:*}"; LABEL="${ID_LABEL##*:}"
  echo ""
  echo "  ── Task ${LABEL} logs ──"
  curl -sf "${FUNNEL_SERVER}/v1/tasks/${ID}?view=FULL" \
    | python3 -c "
import sys, json
t = json.load(sys.stdin)
for log in t.get('logs', []):
  for elog in log.get('logs', []):
    print('  STDOUT:', elog.get('stdout','').strip()[:800] or '(empty)')
    print('  STDERR:', elog.get('stderr','').strip()[:400] or '(empty)')
" 2>/dev/null || warn "Could not fetch logs for task ${ID}"
done
echo ""

# ─── Step 6: Wait for Karpenter to reclaim the node ──────────────────────────
echo "── Step 6: Wait for node drain + Karpenter Cinder cleanup ──────"
# All tasks are finished: shrink the consolidation window so Karpenter drains
# and deletes the worker promptly, letting us verify Cinder cleanup right away.
echo "  Enabling consolidateAfter=${CLEANUP_CONSOLIDATE_AFTER} to trigger node drain..."
kubectl patch nodepool "${NODEPOOL_NAME}" --type=merge \
  -p '{"spec":{"disruption":{"consolidateAfter":"'"${CLEANUP_CONSOLIDATE_AFTER}"'"}}}'
ok "consolidateAfter set to ${CLEANUP_CONSOLIDATE_AFTER} — Karpenter will drain node within ~${CLEANUP_CONSOLIDATE_AFTER}"
info "(${CLEANUP_CONSOLIDATE_AFTER} → drain → cleanupCinderVolumes() in Delete() → OVH delete)"
info "Polling for node removal (max ${NODE_DRAIN_WAIT_SEC}s)..."
# Poll rather than blind-sleep: if the node already consolidated during Step 4b
# (all tasks finished and consolidateAfter=1m fired mid-monitoring), we bail out
# immediately instead of waiting out the full NODE_DRAIN_WAIT_SEC budget.
NODE_DRAIN_ELAPSED=0
REMAINING_NODE=""
until (( NODE_DRAIN_ELAPSED >= NODE_DRAIN_WAIT_SEC )); do
  REMAINING_NODE=$(kubectl get nodes -l "karpenter.sh/nodepool=${NODEPOOL_NAME}" \
    --no-headers -o name 2>/dev/null | head -1 || true)
  if [[ -z "$REMAINING_NODE" ]]; then
    info "  Node gone after ${NODE_DRAIN_ELAPSED}s."
    break
  fi
  sleep 15
  NODE_DRAIN_ELAPSED=$(( NODE_DRAIN_ELAPSED + 15 ))
done

echo ""
echo "  ── Node check ──"
# REMAINING_NODE is empty iff the polling loop above observed zero nodes.
if [[ -n "$REMAINING_NODE" ]]; then
  warn "Worker node still present: ${REMAINING_NODE}"
  info "  Check Karpenter logs: kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter"
else
  ok "Worker node is gone — Karpenter reclaimed it."
fi

echo ""
echo "  ── Cinder volume cleanup check ──"
sleep $CLEANUP_WAIT_SEC
# Verify against TRACKED_VOLUMES (the IDs read from the host state file in
# Step 5). That set is authoritative: exactly those volumes were managed by
# setup.sh/autoscaler.sh and must have been removed by cleanup.sh before the
# node shut down.
ORPHANED=""
CLEANUP_OK=false
if [[ -z "$TRACKED_VOLUMES" ]]; then
  warn "No volumes were tracked (state file was empty/missing in Step 5) — cannot verify cleanup"
elif ! command -v openstack &>/dev/null; then
  warn "openstack CLI unavailable — cannot query volume status; volumes to check:"
  while IFS= read -r vid; do
    [[ -n "$vid" ]] || continue
    info "  ${vid} (status unknown)"
  done <<< "$TRACKED_VOLUMES"
else
  ORPHANED=""
  while IFS= read -r vid; do
    [[ -n "$vid" ]] || continue
    # A failed `show` means the volume no longer exists — treat as deleted.
    VOL_STATUS=$(openstack volume show "$vid" -f value -c status 2>/dev/null || echo "deleted")
    case "$VOL_STATUS" in
      "" | deleted | NOT_FOUND)
        ok "Volume ${vid} deleted — Karpenter cleanupCinderVolumes() succeeded"
        ;;
      *)
        ORPHANED+="${vid} "
        fail "Volume ${vid} still exists (status: ${VOL_STATUS}) — Karpenter Cinder cleanup did NOT delete it!"
        ;;
    esac
  done <<< "$TRACKED_VOLUMES"

  if [[ -n "$ORPHANED" ]]; then
    fail "ORPHANED volumes remain: ${ORPHANED}"
    info "  To delete manually:"
    for vid in $ORPHANED; do
      info "    openstack volume delete ${vid}"
    done
  else
    ok "ALL tracked Cinder volumes cleaned up — disk lifecycle is correct."
    CLEANUP_OK=true
  fi
fi

# ── Broad orphan scan: safety net for any volume that slipped through polling ──
# accumulate_new_volumes() is called at Step 3, every 60 s in monitor_disk_write,
# and immediately after monitor_disk_write returns.  This scan catches anything
# that was still in 'creating' state during all three of those windows.
if command -v openstack &>/dev/null; then
  echo ""
  echo "  ── Broad orphan scan (all funnel-* volumes vs baseline) ──"
  FINAL_VOLUMES=$(list_cinder_volumes || true)
  # IDs present now but absent at baseline (column 1 is the volume ID).
  EXTRA_VOLUMES=$(comm -13 \
    <(echo "$BASELINE_VOLUMES" | awk '{print $1}' | sort) \
    <(echo "$FINAL_VOLUMES" | awk '{print $1}' | sort))
  if [[ -z "$EXTRA_VOLUMES" ]]; then
    ok "Broad scan: zero extra funnel-* volumes vs baseline — fully clean."
  else
    UNTRACKED_COUNT=0
    while IFS= read -r vid; do
      [[ -z "$vid" ]] && continue
      # One API round-trip fetches both fields (`-f value` prints one per line,
      # in -c order: name then status) instead of two `volume show` calls.
      if VOL_INFO=$(openstack volume show "$vid" -f value -c name -c status 2>/dev/null); then
        VOL_NAME=$(sed -n '1p' <<< "$VOL_INFO")
        VOL_STATUS=$(sed -n '2p' <<< "$VOL_INFO")
      else
        VOL_NAME="unknown"
        VOL_STATUS="unknown"
      fi
      if echo "${TRACKED_VOLUMES}" | grep -qF "$vid" 2>/dev/null; then
        warn "Broad scan: tracked volume ${vid} (${VOL_NAME}, ${VOL_STATUS}) still present — already reported above."
      else
        fail "UNTRACKED ORPHAN: ${vid} (${VOL_NAME}, status=${VOL_STATUS})"
        info "  This volume was NOT in the state file — likely an expand volume whose ID was"
        info "  never written (autoscaler.sh aborted before appending to STATE_FILE)."
        info "  Delete manually: openstack volume delete ${vid}"
        UNTRACKED_COUNT=$((UNTRACKED_COUNT + 1))
        ORPHANED="${ORPHANED:-}${vid} "
        CLEANUP_OK=false
      fi
    done <<< "$EXTRA_VOLUMES"
    [[ $UNTRACKED_COUNT -eq 0 ]] && ok "Broad scan: all extra volumes were in the tracked set."
  fi
fi
echo ""

# ─── Summary ────────────────────────────────────────────────────────────────
echo "================================================================="
echo "  SUMMARY"
echo "================================================================="
echo ""
# Per-task verdicts (flags were set in Steps 4/4b).
if [[ "$TASK_A_OK" == "true" ]]; then
  ok "Task A (hello):       PASS"
else
  fail "Task A (hello):       FAIL"
fi
if [[ "$TASK_B_OK" == "true" ]]; then
  ok "Task B (disk-write):  PASS"
else
  fail "Task B (disk-write):  FAIL"
fi
if [[ "$TASK_C_OK" == "true" ]]; then
  ok "Task C (nfs-check):   PASS"
else
  fail "Task C (nfs-check):   FAIL — NFS not an actual nfs mount inside container"
fi
if [[ "$TASK_D_OK" == "true" ]]; then
  ok "Task D (nfs-race):    PASS"
else
  fail "Task D (nfs-race):    FAIL — race condition: NFS not mounted at container start"
fi

# Cleanup verdict: only a definitive FAIL when we actually had tracked volumes
# and a working openstack CLI to query them with.
if [[ -z "$TRACKED_VOLUMES" ]]; then
  warn "Cleanup E (volumes):  UNKNOWN — state file empty, setup.sh may not have completed"
elif [[ "$CLEANUP_OK" == "true" ]]; then
  ok "Cleanup E (volumes):  PASS — all Cinder volumes removed by Karpenter Delete()"
elif command -v openstack &>/dev/null; then
  fail "Cleanup E (volumes):  FAIL — orphaned volumes: ${ORPHANED}"
else
  warn "Cleanup E (volumes):  UNKNOWN — openstack CLI not available"
fi

# True only when every one of the four task flags is "true".
ALL_TASKS_OK=true
for TASK_FLAG in "$TASK_A_OK" "$TASK_B_OK" "$TASK_C_OK" "$TASK_D_OK"; do
  [[ "$TASK_FLAG" == "true" ]] || ALL_TASKS_OK=false
done

echo ""
if [[ "$ALL_TASKS_OK" == "true" && "$CLEANUP_OK" == "true" ]]; then
  echo -e "${GREEN}ALL CHECKS PASSED${NC} — disk handling is working correctly."
  echo "Safe to run real Cromwell workflows."
elif [[ "$ALL_TASKS_OK" == "true" ]]; then
  echo -e "${YELLOW}TASKS PASSED, but cleanup could not be fully verified.${NC}"
  echo "Check Step 6 output above for details."
else
  echo -e "${RED}SOME CHECKS FAILED${NC} — investigate before running real workflows."
fi
echo ""