#!/bin/bash
#
# Start all services on the remote deployment server over SSH.
# Requires: sshpass installed locally; root SSH access to ${SERVER}.

set -euo pipefail

SERVER="124.174.37.152"
# NOTE(security): a hardcoded root password in a script is risky — prefer
# exporting DEPLOY_SSH_PASS in the environment; the old literal value is
# kept as the fallback for backward compatibility.
PASS="${DEPLOY_SSH_PASS:-Lsinfo_123\$#.hs}"
ROOT="/lsinfo/ai/hellotax_ai"

# Intentionally a flat string: the unquoted $SSH expansion below word-splits
# into the sshpass/ssh command line. The password contains no whitespace,
# so splitting is safe here.
# shellcheck disable=SC2089
SSH="sshpass -p ${PASS} ssh -o StrictHostKeyChecking=no root@${SERVER}"

echo "启动远程服务器 ${SERVER} 上的所有服务..."

$SSH bash << 'REMOTE'
# --- everything below runs on the remote host ---
# Only -e here: nvm.sh is not safe under `set -u`.
set -e
# Make nvm-managed node/pnpm available for the frontend dev servers below.
source /root/.nvm/nvm.sh
ROOT="/lsinfo/ai/hellotax_ai"
LOG_DIR="${ROOT}/apm/logs"
# Same directory as LOG_DIR; derived (instead of re-spelled) so the two
# can never drift apart. Used for the LLM-service log/pid files.
LLM_LOG="${LOG_DIR}"

mkdir -p "${LOG_DIR}"

# Steps 1-4: container-based infrastructure (monitoring + databases).
# Step counters fixed to the script-wide x/17 scheme (were [2/4]..[4/4]).
echo "[1/17] 启动 APM 监控服务..."
cd "${ROOT}/apm"
docker-compose up -d
cd langfuse
docker-compose up -d
echo "✓ APM 已启动"

echo "[2/17] 启动 base_platform Docker 服务..."
cd "${ROOT}/base_platform"
docker-compose up -d postgres redis neo4j milvus etcd minio
echo "✓ base_platform Docker 已启动"

echo "[3/17] 启动 training_center postgres..."
cd "${ROOT}/training_center"
docker-compose up -d postgres
echo "✓ training_center postgres 已启动 (5434)"

echo "[4/17] 启动 data_center postgres..."
cd "${ROOT}/data_center"
docker-compose up -d postgres
echo "✓ data_center postgres 已启动 (5435)"

echo "[5/17] 启动 vLLM (8100)..."
# Idempotent start: skip when the recorded PID is still alive
# (kill -0 only probes for process existence, it sends no signal).
# NOTE(review): a stale pid file whose PID was reused by an unrelated
# process would also be treated as "running".
if [[ -f "${LLM_LOG}/vllm-8100.pid" ]] && kill -0 "$(cat "${LLM_LOG}/vllm-8100.pid")" 2>/dev/null; then
  echo "✓ vLLM 已在运行"
else
  export OMP_NUM_THREADS=4
  export VLLM_ATTENTION_BACKEND=FLASH_ATTN
  # Serve the local AWQ-quantized Qwen model via vLLM's OpenAI-compatible
  # API on 0.0.0.0:8100 across 2 GPUs; stdout+stderr go to the log file.
  nohup ${ROOT}/llm_service/venv_vllm/bin/python -m vllm.entrypoints.openai.api_server \
    --model "${ROOT}/llm_service/base_models/Qwen3.5-27B-AWQ" \
    --host 0.0.0.0 --port 8100 \
    --served-model-name Qwen3.5-27B-AWQ \
    --tensor-parallel-size 2 --dtype auto --quantization awq \
    --enforce-eager --max-model-len 32768 --gpu-memory-utilization 0.78 \
    --max-num-seqs 5 --trust-remote-code --api-key sk-7dd698cf18851f832698c36a9998e60b9b0e32163fbee6ea \
    --enable-auto-tool-choice --tool-call-parser qwen3_coder \
    > "${LLM_LOG}/vllm-8100.log" 2>&1 &
  # $! is the backgrounded server's PID; persist it for the check above.
  echo $! > "${LLM_LOG}/vllm-8100.pid"
  echo "✓ vLLM 已启动 (PID $!)"
fi

echo "[6/17] 启动 Embedding (8200)..."
# Idempotent start: skip when the recorded PID is still alive.
if [[ -f "${LLM_LOG}/embedding-8200.pid" ]] && kill -0 "$(cat "${LLM_LOG}/embedding-8200.pid")" 2>/dev/null; then
  echo "✓ Embedding 已在运行"
else
  # The VAR=value prefixes scope these settings to this single command;
  # the bge-m3 embedding server is pinned to GPU 0.
  EMBEDDING_MODEL_PATH="${ROOT}/llm_service/base_models/bge-m3" \
  SERVED_MODEL_NAME=bge-m3 \
  API_KEY=sk-53aebda65025194d62ae357d72be6f1b6003300fc170828d \
  CUDA_VISIBLE_DEVICES=0 \
  nohup ${ROOT}/llm_service/venv_embed/bin/python \
    ${ROOT}/llm_service/servers/embedding_server.py \
    --host 0.0.0.0 --port 8200 \
    > "${LLM_LOG}/embedding-8200.log" 2>&1 &
  # Persist the background PID for the liveness check above.
  echo $! > "${LLM_LOG}/embedding-8200.pid"
  echo "✓ Embedding 已启动 (PID $!)"
fi

echo "[7/17] 启动 Reranker (8300)..."
# Idempotent start: skip when the recorded PID is still alive.
if [[ -f "${LLM_LOG}/reranker-8300.pid" ]] && kill -0 "$(cat "${LLM_LOG}/reranker-8300.pid")" 2>/dev/null; then
  echo "✓ Reranker 已在运行"
else
  # Env-var prefixes apply to this command only; the reranker shares the
  # embedding venv but is pinned to GPU 1 (embedding uses GPU 0).
  RERANK_MODEL_PATH="${ROOT}/llm_service/base_models/bge-reranker-v2-m3" \
  SERVED_MODEL_NAME=bge-reranker-v2-m3 \
  API_KEY=sk-36814eb1e94efa673db7327064def248a074e030f39ffe3c \
  CUDA_VISIBLE_DEVICES=1 \
  nohup ${ROOT}/llm_service/venv_embed/bin/python \
    ${ROOT}/llm_service/servers/rerank_server.py \
    --host 0.0.0.0 --port 8300 \
    > "${LLM_LOG}/reranker-8300.log" 2>&1 &
  # Persist the background PID for the liveness check above.
  echo $! > "${LLM_LOG}/reranker-8300.pid"
  echo "✓ Reranker 已启动 (PID $!)"
fi

echo "[8/17] 启动 OCR (8400)..."
# Idempotent start: skip when the recorded PID is still alive.
if [[ -f "${LLM_LOG}/ocr-8400.pid" ]] && kill -0 "$(cat "${LLM_LOG}/ocr-8400.pid")" 2>/dev/null; then
  echo "✓ OCR 已在运行"
else
  export OMP_NUM_THREADS=4
  # Vision-language model used for OCR, served through the same vLLM
  # OpenAI-compatible entrypoint; low gpu-memory-utilization (0.18) because
  # it shares the 2 GPUs with the main vLLM instance started in step 5.
  nohup ${ROOT}/llm_service/venv_vllm/bin/python -m vllm.entrypoints.openai.api_server \
    --model "${ROOT}/llm_service/base_models/Qwen3-VL-8B-Instruct" \
    --host 0.0.0.0 --port 8400 \
    --served-model-name Qwen3-VL-8B-Instruct \
    --tensor-parallel-size 2 --dtype bfloat16 --enforce-eager \
    --gpu-memory-utilization 0.18 --max-model-len 8192 \
    --max-num-seqs 2 --trust-remote-code --api-key sk-1139956aef2fa5e4af6dc9a78d858c0df648d243396af63a \
    --limit-mm-per-prompt '{"image":5,"video":1}' \
    > "${LLM_LOG}/ocr-8400.log" 2>&1 &
  # Persist the background PID for the liveness check above.
  echo $! > "${LLM_LOG}/ocr-8400.pid"
  echo "✓ OCR 已启动 (PID $!)"
fi

echo "等待 PostgreSQL 就绪..."
# Poll up to 30s for the base_platform postgres container to accept
# connections before running migrations/seeding below.
pg_ready=0
for _ in {1..30}; do
  if docker exec base_platform_postgres pg_isready -U user &>/dev/null; then
    pg_ready=1
    break
  fi
  sleep 1
done
# Previously a timeout was silent; surface it on stderr but keep going,
# matching the original best-effort behavior.
if [[ "${pg_ready}" -ne 1 ]]; then
  echo "警告: PostgreSQL 在 30 秒内未就绪，后续步骤可能失败" >&2
fi

echo "[9/17] 启动 base_platform 后端 (8000)..."
cd "${ROOT}/base_platform"
source venv/bin/activate
# Refresh the shared editable packages from the Tsinghua PyPI mirror.
pip install -e "${ROOT}/shared/common_logging" -e "${ROOT}/shared/common_metrics" -e "${ROOT}/shared/common_langfuse" -q -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
# First-run probe: if the base_platform database cannot be queried yet
# (e.g. not created/initialized), run migrations and the seed scripts.
# NOTE(review): PGPASSWORD is hardcoded here — confirm it matches the
# container's credentials / consider moving it to config.
if ! PGPASSWORD=pass psql -h localhost -U user -d base_platform -c '\dt' &>/dev/null; then
  APP_ENV=production alembic upgrade head
  bash "${ROOT}/scripts/base_platform/init/init_all.sh"
fi
# Env-var prefixes apply only to this uvicorn invocation.
APP_ENV=production SWITCH_MODE_SCRIPT="${ROOT}/scripts/llm/switch_mode_mock.sh" \
  nohup uvicorn app.main:app --host 0.0.0.0 --port 8000 > "${LOG_DIR}/base_platform.log" 2>&1 &
BASE_PID=$!
echo "✓ base_platform 后端已启动 (PID: ${BASE_PID})"

echo "[10/17] 启动 saas_portal 前端 (8888)..."
cd "${ROOT}/saas_portal/frontend"
# Free port 8888 first; -r stops xargs from running `kill` with no PIDs
# when the port is already free. SIGKILL is deliberate: the old dev server
# must be gone before rebinding.
lsof -ti:8888 | xargs -r kill -9 2>/dev/null || true
# NOTE(review): `pnpm dev` is a development server — confirm it is the
# intended way to serve this frontend here.
nohup pnpm dev --host 0.0.0.0 --port 8888 > "${LOG_DIR}/saas_portal_frontend.log" 2>&1 &
SAAS_PID=$!
echo "✓ saas_portal 前端已启动 (PID: ${SAAS_PID})"

echo "[11/17] 启动 training_center 后端 (8001)..."
cd "${ROOT}/training_center/backend"
# Activates this service's venv (replaces the one activated in step 9
# for the rest of the script until the next `source`).
source venv/bin/activate
# Refresh the shared editable packages from the Tsinghua PyPI mirror.
pip install -e "${ROOT}/shared/common_logging" -e "${ROOT}/shared/common_metrics" -e "${ROOT}/shared/common_langfuse" -q -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
APP_ENV=production alembic upgrade head
# Env-var prefix applies only to this uvicorn invocation.
APP_ENV=production \
  nohup uvicorn app.main:app --host 0.0.0.0 --port 8001 > "${LOG_DIR}/training_center_backend.log" 2>&1 &
TC_BACKEND_PID=$!
echo "✓ training_center 后端已启动 (PID: ${TC_BACKEND_PID})"

echo "[12/17] 启动 training_center 前端 (8889)..."
cd "${ROOT}/training_center/frontend"
# Free port 8889 first; -r stops xargs from running `kill` with no PIDs
# when the port is already free.
lsof -ti:8889 | xargs -r kill -9 2>/dev/null || true
nohup pnpm dev --host 0.0.0.0 --port 8889 > "${LOG_DIR}/training_center_frontend.log" 2>&1 &
TC_FRONTEND_PID=$!
echo "✓ training_center 前端已启动 (PID: ${TC_FRONTEND_PID})"

echo "[13/17] 启动 data_center 后端 (8002)..."
cd "${ROOT}/data_center/backend"
# Activates this service's venv for this and the following Celery step.
source venv/bin/activate
# Refresh the shared editable packages from the Tsinghua PyPI mirror.
pip install -e "${ROOT}/shared/common_logging" -e "${ROOT}/shared/common_metrics" -e "${ROOT}/shared/common_langfuse" -q -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
APP_ENV=production alembic upgrade head
# Env-var prefix applies only to this uvicorn invocation.
APP_ENV=production \
  nohup uvicorn app.main:app --host 0.0.0.0 --port 8002 > "${LOG_DIR}/data_center_backend.log" 2>&1 &
DC_BACKEND_PID=$!
echo "✓ data_center 后端已启动 (PID: ${DC_BACKEND_PID})"

echo "[14/17] 启动 data_center Celery worker..."
# Make the working directory explicit instead of silently relying on the
# previous step's `cd` — the worker must resolve app.celery_app from the
# data_center backend directory. (No-op when the cwd is already correct.)
cd "${ROOT}/data_center/backend"
nohup celery -A app.celery_app worker -Q data_center_queue -c 8 --loglevel=info \
  > "${LOG_DIR}/data_center_celery.log" 2>&1 &
DC_CELERY_PID=$!
echo "✓ data_center Celery 已启动 (PID: ${DC_CELERY_PID})"

echo "[15/17] 启动 data_center 前端 (8890)..."
cd "${ROOT}/data_center/frontend"
# Free port 8890 first; -r stops xargs from running `kill` with no PIDs
# when the port is already free.
lsof -ti:8890 | xargs -r kill -9 2>/dev/null || true
nohup pnpm dev --host 0.0.0.0 --port 8890 > "${LOG_DIR}/data_center_frontend.log" 2>&1 &
DC_FRONTEND_PID=$!
echo "✓ data_center 前端已启动 (PID: ${DC_FRONTEND_PID})"

echo "[16/17] 启动 Nginx..."
# Validate the configuration first, then launch.
# NOTE(review): a plain `nginx` exits non-zero if a master process is
# already running, which aborts this script under `set -e` — confirm
# whether a pid-file check or `nginx -s reload` is wanted for re-runs.
/lsinfo/tools/nginx/sbin/nginx -t && /lsinfo/tools/nginx/sbin/nginx
echo "✓ Nginx 已启动"

echo "[17/17] 生成并启动 API 文档服务 (8500)..."
# Regenerate the static API docs page, then serve it over plain HTTP.
python3 "${ROOT}/scripts/generate-api-docs.py" "${ROOT}/docs/api/index.html"
# Bound to loopback only — the public :8500 URL printed in the summary
# presumably goes through Nginx; TODO confirm the proxy config.
# NOTE(review): --directory "${ROOT}" serves the whole project tree
# (including any .env/config files) — confirm that exposure is intended.
nohup python3 -m http.server 8500 --bind 127.0.0.1 --directory "${ROOT}" \
  > "${LOG_DIR}/api_docs.log" 2>&1 &
API_DOCS_PID=$!
echo "✓ API 文档服务已启动 (PID: ${API_DOCS_PID})"

# Persist every service PID so a matching stop script can terminate them.
# This inner here-doc uses an unquoted EOF delimiter, so the ${...} PID
# variables expand remotely at write time (the outer 'REMOTE' delimiter is
# quoted, so nothing here was expanded on the local machine).
cat > "${LOG_DIR}/server-pids.txt" << EOF
BASE_PID=${BASE_PID}
SAAS_PID=${SAAS_PID}
TC_BACKEND_PID=${TC_BACKEND_PID}
TC_FRONTEND_PID=${TC_FRONTEND_PID}
DC_BACKEND_PID=${DC_BACKEND_PID}
DC_CELERY_PID=${DC_CELERY_PID}
DC_FRONTEND_PID=${DC_FRONTEND_PID}
API_DOCS_PID=${API_DOCS_PID}
EOF

# Final summary of every started service and where to reach it.
# A single quoted here-doc replaces the echo chain: same bytes on stdout,
# no per-line quoting to maintain.
cat << 'SUMMARY'

================================
✓ 所有服务启动完成！
================================
业务服务：
  SaaS Portal:       https://ai.leshuiyun.com
  Base Platform API: http://124.174.37.152:8000
  训练中心前端:       https://training.leshuiyun.com
  训练中心 API:      http://124.174.37.152:8001
  数据中心前端:       https://data.leshuiyun.com
  数据中心 API:      http://124.174.37.152:8002
  API 文档:          http://124.174.37.152:8500/docs/api/index.html
大模型服务：
  LLM (vLLM):        http://llm.leshuiyun.com
  Embedding:         http://embedding.leshuiyun.com
  Reranker:          http://rerank.leshuiyun.com
  OCR:               http://ocr.leshuiyun.com
监控：
  Prometheus:        http://124.174.37.152:9090
  Grafana:           http://124.174.37.152:3000
  Langfuse:          http://124.174.37.152:3150
SUMMARY
REMOTE

echo "远程服务器启动完成。"
