#!/usr/bin/env bash
# Start Qwen3.5-27B-AWQ via vLLM on port 8100 (tp=2, AWQ INT4)

set -euo pipefail

# --- Configuration (constants) -------------------------------------------
readonly MODEL_PATH="/lsinfo/ai/hellotax_ai/llm_service/base_models/Qwen3.5-27B-AWQ"
readonly PORT=8100
readonly LOG_FILE="/lsinfo/ai/hellotax_ai/llm_service/logs/vllm-${PORT}.log"
readonly PID_FILE="/lsinfo/ai/hellotax_ai/llm_service/logs/vllm-${PORT}.pid"

mkdir -p "$(dirname "$LOG_FILE")"

# Refuse to start a second instance. Read the PID file once (the original
# ran `cat` twice, racing against concurrent edits) and clean up a stale
# PID file left behind by a crashed or killed server.
if [[ -f "$PID_FILE" ]]; then
  existing_pid="$(cat "$PID_FILE")"
  if [[ -n "$existing_pid" ]] && kill -0 "$existing_pid" 2>/dev/null; then
    echo "vLLM already running (PID ${existing_pid})"
    exit 0
  fi
  # Process is gone but the file remains — stale; remove it.
  rm -f -- "$PID_FILE"
fi

echo "Starting vLLM — Qwen3.5-27B-AWQ on port ${PORT}..."

# Limit CPU threading and pin the attention kernel backend for this process.
export OMP_NUM_THREADS=4
export VLLM_ATTENTION_BACKEND=FLASH_ATTN

# Collect the server flags in an array so the launch line stays readable
# and each option is quoted exactly once.
server_args=(
  --model "${MODEL_PATH}"
  --host 0.0.0.0
  --port "${PORT}"
  --served-model-name Qwen3.5-27B-AWQ
  --tensor-parallel-size 2
  --dtype auto
  --quantization awq
  --enforce-eager
  --max-model-len 32768
  --gpu-memory-utilization 0.90
  --max-num-seqs 5
  --trust-remote-code
  --api-key sk-local
  --enable-auto-tool-choice
  --tool-call-parser qwen3_coder
)

# Detach the server from this shell; stdout+stderr go to the log file.
nohup /lsinfo/ai/hellotax_ai/llm_service/venv_vllm/bin/python \
  -m vllm.entrypoints.openai.api_server "${server_args[@]}" \
  > "${LOG_FILE}" 2>&1 &

# Record the background PID so later runs (and the stop script) can find it.
echo "$!" > "${PID_FILE}"
echo "PID $(cat "$PID_FILE") — log: ${LOG_FILE}"

# Poll the /health endpoint until the server answers, the process dies,
# or we give up (60 attempts x 5 s = 5 minutes).
echo -n "Waiting for vLLM to be ready"
server_pid="$(cat "$PID_FILE")"
for (( attempt = 0; attempt < 60; attempt++ )); do
  if curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; then
    echo " ready."
    exit 0
  fi
  # Fail fast if the server process already exited (bad flags, OOM, CUDA
  # error) instead of silently polling for the full timeout.
  if ! kill -0 "$server_pid" 2>/dev/null; then
    echo " process died. Check log: ${LOG_FILE}"
    exit 1
  fi
  echo -n "."
  sleep 5
done
echo " timeout. Check log: ${LOG_FILE}"
exit 1
