#!/usr/bin/env bash
# Start Qwen3-VL-32B-Instruct (tensor-parallel across both GPUs) and wait
# until its OpenAI-compatible endpoint answers, or fail with log context.
#
# Environment overrides:
#   VL32B_MODEL_PATH  model name or local path (default: Qwen/Qwen3-VL-32B-Instruct)
#   VL32B_PORT        listen port               (default: 8400)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
LOG_DIR="${PROJECT_ROOT}/llm_service/logs"
mkdir -p "$LOG_DIR"

MODEL_PATH="${VL32B_MODEL_PATH:-Qwen/Qwen3-VL-32B-Instruct}"
PORT="${VL32B_PORT:-8400}"
LOG_FILE="${LOG_DIR}/vl32b-${PORT}.log"
PID_FILE="${LOG_DIR}/vl32b-${PORT}.pid"

# Fail fast with a clear message instead of an obscure nohup/log error.
command -v vllm >/dev/null 2>&1 || { echo "ERROR: 'vllm' not found in PATH" >&2; exit 1; }

echo ">> Starting Qwen3-VL-32B on port ${PORT}..."
nohup vllm serve "$MODEL_PATH" \
  --port "$PORT" \
  --gpu-memory-utilization 0.90 \
  --max-model-len 8192 \
  --tensor-parallel-size 2 \
  > "$LOG_FILE" 2>&1 &

pid=$!
echo "$pid" > "$PID_FILE"
echo "   PID ${pid} — waiting for service..."

# Wait up to 120s (60 polls x 2s) for the service to be ready.
for _ in {1..60}; do
  # Bail out immediately if the server died (bad model path, OOM, etc.)
  # instead of polling the full 120s against a dead process.
  if ! kill -0 "$pid" 2>/dev/null; then
    echo "ERROR: vllm process ${pid} exited early; last log lines:" >&2
    tail -n 20 "$LOG_FILE" >&2 || true
    exit 1
  fi
  if curl -sf "http://localhost:${PORT}/v1/models" > /dev/null 2>&1; then
    echo "   Qwen3-VL-32B ready on :${PORT}"
    exit 0
  fi
  sleep 2
done

echo "ERROR: Qwen3-VL-32B failed to start within 120s; last log lines:" >&2
tail -n 20 "$LOG_FILE" >&2 || true
exit 1