o
    Ղi4I                     @  s   U d Z ddlmZ ddlZddlZddlZddlZddlZddlmZm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZmZ ddlZdd	lmZ dd
lmZ eeZed d Zed d ZdZdddZG dd dZdade d< dddZ!dS )z
GPU runtime monitoring service.

Builds a real-time view of:
- GPU device status and memory usage from nvidia-smi
- Running local model processes from ps/proc inspection
- Process health for known local inference services
    )annotationsN)datetimetimezone)StringIO)Path)Lock)AnyOptional)LLM_SERVICE_DIR)
get_loggerbase_modelszbge-m3zbge-reranker-v2-m3)
"vllm.entrypoints.openai.api_serverembedding_server.pyrerank_server.pymlx_lm.serverllamafactorytorchrun	deepspeedaccelerate launch--model_name_or_path--served-model-namereturnstrc                   C  s   t tj S N)r   nowr   utc	isoformat r   r   S/lsinfo/ai/hellotax_ai/base_platform/app/services/monitoring/gpu_runtime_service.py_utc_now_iso.   s   r   c                   @  s  e Zd Zd`daddZdbdd	Zdbd
dZdcddZddddZdeddZdfddZ	dfddZ
dgddZdhd$d%Zdid'd(Zdjd)d*Zdkd.d/Zdld1d2Zdmd8d9Zdld:d;Zdndod?d@ZdpdDdEZdqdHdIZdrdJdKZdsdLdMZdtdOdPZdudQdRZdvdUdVZdwdYdZZd`dxd]d^Zd_S )yGpuRuntimeMonitoringService      @cache_ttl_secondsfloatc                 C  s   || _ t | _d | _d| _d S )Ng        )_cache_ttl_secondsr   _cache_lock_cached_snapshot
_cached_at)selfr"   r   r   r   __init__3   s   
z$GpuRuntimeMonitoringService.__init__r   dict[str, Any]c                 C  st   t  }| j) | jr|| j | jk r| jW  d    S |  }|| _|| _|W  d    S 1 s3w   Y  d S r   )time	monotonicr%   r&   r'   r$   _build_snapshot)r(   r   snapshotr   r   r   get_snapshot9   s   $z(GpuRuntimeMonitoringService.get_snapshotc                 C  s   |   }dd |D }|  }|  D ];}|d }| ||}||}|d u r;| |p0|d }| ||}|||< |d |d  |d  |d 7  < q| ||}	| 	||}
| 
|
|	}||
|	d	S )
Nc                 S  s   i | ]}|d  |qS )uuidr   .0rowr   r   r   
<dictcomp>G   s    z?GpuRuntimeMonitoringService._build_snapshot.<locals>.<dictcomp>pidprocess_name	gpu_uuidsgpu_uuidgpu_memory_mbused_gpu_memory_mb)overviewdevicesmodels)_collect_gpu_rows_collect_runtime_processes_collect_compute_process_rows_find_model_parent_processget_read_process_cmdline_describe_processadd_build_models_build_devices_build_overview)r(   gpu_rowsgpu_by_uuid	processescompute_rowr5   
target_pidprocesscmdliner=   r<   r;   r   r   r   r-   E   s(   
z+GpuRuntimeMonitoringService._build_snapshotr<   list[dict[str, Any]]r=   c                 C  s^   t dd |D }|t dd |D 7 }t dd |D }t|t dd |D ||t dS )Nc                 s  s     | ]}|d  dv rdV  qdS )status>   offlinewarning   Nr   r2   devicer   r   r   	<genexpr>d       z>GpuRuntimeMonitoringService._build_overview.<locals>.<genexpr>c                 s  s     | ]}|d  dkrdV  qdS rQ   failedrT   Nr   r2   modelr   r   r   rW   e   rX   c                 s       | ]}|d  dkrdV  qdS rY   r   r[   r   r   r   rW   g   rX   c                 s  r]   )rQ   rR   rT   Nr   rU   r   r   r   rW   k   rX   )totalGpuCountonlineGpuCountrunningModelCount
alertCount	updatedAt)sumlenr   )r(   r<   r=   alert_countrunning_model_countr   r   r   rH   c   s   z+GpuRuntimeMonitoringService._build_overviewrI   rK   dict[int, dict[str, Any]]c                 C  s  i }|  D ]}|d }|drq|d D ]}||t | qqt }g }|D ]W}	t||	d t }
|	d }|	d }d}|dkrOt	|| d }| j
||	d	 |
|d
}|d|	d  |	d  d|	d  ||t	|d dt	|d d||	d	 |
d	 q,|S )N
model_nameGPU Process r7   r0   memory_total_mbmemory_used_mbr   d   utilization_percent)memory_usage_percentrm   running_model_namesrk   zgpu-indexname #   rT   )	idrq   hostNamerQ   memoryUsedGbmemoryTotalGbmemoryUsagePercentutilizationPercentrunningModelNames)values
startswith
setdefaultsetrE   socketgethostnamesortedrB   round_derive_gpu_statusappend)r(   rI   rK   running_models_by_gpurN   rh   r8   	host_namer<   gpu_rowrunning_modelsrj   rk   rn   rQ   r   r   r   rG   q   sH   
z*GpuRuntimeMonitoringService._build_devicesrJ   dict[str, dict[str, Any]]c           	        s   g }|  D ]S}|d }|drq fddt|d D }ddd |D p*d	}dd
d |D p6d}|d|d  || || |||t|d d d|d d qddddd|jfddd |S )Nrh   ri   c                   s   g | ]
}| v r | qS r   r   )r2   r8   )rJ   r   r   
<listcomp>       z=GpuRuntimeMonitoringService._build_models.<locals>.<listcomp>r7   z, c                 s  s    | ]	}t |d  V  qdS )rp   N)r   r1   r   r   r   rW          z<GpuRuntimeMonitoringService._build_models.<locals>.<genexpr>zN/Ac                 s  s&    | ]}|d   d|d  V  qdS )rq   rr   rp   Nr   r1   r   r   r   rW      s   $ 
Unassignedzpid-r5   r9   rs   rT   
started_at)rt   rq   versionrQ   gpuIdgpuNamerv   	startedAtr         )runningloadingidlerZ   c                   s      | d d| d  | d fS )NrQ   c   rv   rq   )rB   )item)status_orderr   r   <lambda>   s   z;GpuRuntimeMonitoringService._build_models.<locals>.<lambda>)key)	r{   r|   r   joinr   _build_version_label_derive_model_statusr   sort)	r(   rK   rJ   r=   rN   rh   rI   gpu_ids	gpu_namesr   )rJ   r   r   rF      s8   

z)GpuRuntimeMonitoringService._build_modelsc                 C  s   |  g d}|sg S g }| |D ]1}t|dk rq|| |d |d  |d  | |d | |d | |d d	 q|S )
N)
nvidia-smizD--query-gpu=index,name,uuid,memory.used,memory.total,utilization.gpu--format=csv,noheader,nounits   r   rT   r   r         )rp   rq   r0   rk   rj   rm   )_run_command_parse_csv_outputrd   r   	_safe_intstripr(   outputrowsrecordr   r   r   r>      s&   

z-GpuRuntimeMonitoringService._collect_gpu_rowsc              
   C  s   |  g d}|rd|v rg S g }| |D ]%}t|dk rq||d  | |d |d  | |d d qd	d
 |D S )N)r   z>--query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memoryr   zNo running processes foundr   r   rT   r   r   )r8   r5   r6   r:   c                 S  s   g | ]
}|d  dkr|qS )r5   r   r   r1   r   r   r   r     r   zMGpuRuntimeMonitoringService._collect_compute_process_rows.<locals>.<listcomp>)r   r   rd   r   r   r   r   r   r   r   r@      s"   

	z9GpuRuntimeMonitoringService._collect_compute_process_rowsc                 C  s   |  g d}|si S i }| D ]5}| }|sq|d d}t|dkr'q| |d }|d  }|dks=| |s>q| ||||< q|S )N)psz-eoz
pid=,args=rT   r   r   )r   
splitlinesr   splitrd   r   _looks_like_model_processrD   )r(   r   rK   linepartsr5   argsr   r   r   r?     s"   z6GpuRuntimeMonitoringService._collect_runtime_processesr5   intr   r   c                 C  st   |  |}| |}| |d}| |}||||r| |nd | |r)| |nd ||dt | ||||d	S )Nz--portr   )	r5   r   runtimeporthealthyr   r9   r7   rh   )_read_process_env_infer_runtime_extract_flag_value_get_process_started_atr   _check_service_healthr~   _infer_model_name)r(   r5   r   env_mapr   r   r   r   r   r   rD     s   


z-GpuRuntimeMonitoringService._describe_processboolc                   s   |   t fddtD S )Nc                 3  s    | ]	}|   v V  qd S r   lower)r2   hint
normalizedr   r   rW   2  r   zHGpuRuntimeMonitoringService._looks_like_model_process.<locals>.<genexpr>)r   anyMODEL_PROCESS_HINTS)r(   r   r   r   r   r   0  s   z5GpuRuntimeMonitoringService._looks_like_model_processc                 C  sd   |  }d|v r
dS d|v rdS d|v rdS d|v rdS d	|v r"d
S d|v s.d|v s.d|v r0dS dS )Nr   vLLMr   	Embeddingr   Rerankerr   MLXr   zLLaMA-Factoryr   r   r   TrainingRuntimer   )r(   r   r   r   r   r   r   4  s   z*GpuRuntimeMonitoringService._infer_runtimer   dict[str, str]r   c                 C  s   |  |dp
|d}|r|S |  |dp.|  |dp.|dp.|dp.|dp.|d}|r6t|jS |d	krEt|dttjS |d
krTt|dttjS td|}|rdt|	djS d| S )Nr   SERVED_MODEL_NAMEz--modelr   
MODEL_PATHMODEL_NAME_OR_PATHEMBEDDING_MODEL_PATHRERANK_MODEL_PATHr   r   z-(/[^\s]+(?:base_models|trained_models)[^\s]*)rT   ri   )
r   rB   r   rq   r   DEFAULT_EMBEDDING_MODEL_PATHDEFAULT_RERANK_MODEL_PATHresearchgroup)r(   r5   r   r   r   served_model_name
model_path
path_matchr   r   r   r   D  s0   


z-GpuRuntimeMonitoringService._infer_model_namerN   c                 C  s4   |d }|d }|r| d| S | d|d  S )Nr   r   u    · :u    · PID r5   r   )r(   rN   r   r   r   r   r   r   _  s
   z0GpuRuntimeMonitoringService._build_version_labelrn   rm   ro   	list[str]rk   c                 C  s0   |dks|dkr
dS |s|dks|dkrdS dS )NZ   _   rS   r   busyonliner   )r(   rn   rm   ro   rk   r   r   r   r   f  s
   z.GpuRuntimeMonitoringService._derive_gpu_statusc                 C  s^   |d }|d dkpt |d }|du rdS |du r)|d r)| |d	 r'd
S dS |r-dS dS )Nr   r9   r   r7   Tr   Fr   r   r   rZ   r   )r   _started_recently)r(   rN   r   has_gpu_memoryr   r   r   r   s  s   z0GpuRuntimeMonitoringService._derive_model_status,  r   threshold_secondsc                 C  sT   zt |}W n
 ty   Y dS w |jd u r|jtjd}t tj|  |k S )NF)tzinfo)	r   fromisoformat
ValueErrorr   replacer   r   r   total_seconds)r(   r   r   
started_dtr   r   r   r     s   
z-GpuRuntimeMonitoringService._started_recentlyr   Optional[int]Optional[bool]c              	   C  sz   |sd S d| dg}|dv r| d| d |D ] }ztj|dd}|jdk r/|jW   S W q tjy:   Y qw dS )	Nzhttp://127.0.0.1:z/health>   r   r   z
/v1/modelsg?)timeouti  F)r   httpxrB   status_code
is_success	HTTPError)r(   r   r   urlsurlresponser   r   r   r     s   
z1GpuRuntimeMonitoringService._check_service_healthflagOptional[str]c                 C  s8   t t | d}||}|sd S |ddS )Nz!(?:=|\s+)(\"[^\"]+\"|'[^']+'|\S+)rT   z'")r   compileescaper   r   r   )r(   r   r   patternmatchr   r   r   r     s
   
z/GpuRuntimeMonitoringService._extract_flag_valuec                 C  s`   |  ddt|ddg}|st S | }zt|d}|tj	 W S  t
y/   | Y S w )Nr   z-pz-ozlstart=z%a %b %d %H:%M:%S %Y)r   r   r   r   r   strptime
astimezoner   r   r   r   )r(   r5   r   	raw_valuer   r   r   r   r     s   z3GpuRuntimeMonitoringService._get_process_started_atc                 C  sZ   t d| d}z	|jddd}W n
 ty   Y d S w ddd |d	D  p,d S )
N/proc/z/cmdlineutf-8ignoreencodingerrors c                 s  s    | ]}|r|V  qd S r   r   )r2   partr   r   r   rW     s    zDGpuRuntimeMonitoringService._read_process_cmdline.<locals>.<genexpr> )r   	read_textOSErrorr   r   r   )r(   r5   cmdline_pathcontentr   r   r   rC     s   "z1GpuRuntimeMonitoringService._read_process_cmdlineknown_processesc                 C  s   t  }|}|dkr[||vr[||v r|S || z7td| djddd}|d}t|dkrH|d   }t|dkrL| |d }W qW 	 |S W 	 |S  tyZ   Y 	 |S w |S )	z
        Find the parent model process for a given PID.
        Walks up the process tree until a known model process is found.
        rT   r   z/statr  r  r  )r   )	r~   rE   r   r	  r   rd   r   r   r
  )r(   r5   r  visitedcurrentstat_contentr   fieldsr   r   r   rA     s4   

z6GpuRuntimeMonitoringService._find_model_parent_processc                 C  s   t d| d}z| }W n ty   i  Y S w i }|dD ]}|r)d|vr*q!|dd\}}|jddd||jddd< q!|S )	Nr   z/environ       =rT   r  r  )r  )r   
read_bytesr
  r   decode)r(   r5   env_pathrawr   r   r   valuer   r   r   r     s   z-GpuRuntimeMonitoringService._read_process_envr   list[list[str]]c                 C  s   dd t t|D S )Nc                 S  s   g | ]}|r|qS r   r   r1   r   r   r   r     s    zAGpuRuntimeMonitoringService._parse_csv_output.<locals>.<listcomp>)csvreaderr   )r(   r   r   r   r   r     s   z-GpuRuntimeMonitoringService._parse_csv_outputr  r   c              	   C  s,   z	t t| W S  ttfy   Y dS w )Nr   )r   r   r   	TypeErrorr   )r(   r  r   r   r   r     s
   z%GpuRuntimeMonitoringService._safe_intcommandr   c              
   C  s   zt j|ddd|d}W n tt jfy( } ztd| W Y d }~d S d }~ww |jdkr?|j }|r=t	d|j| d S |j
 S )NFT)checkcapture_outputtextr   zGPU runtime command failed: %sr   z#GPU runtime command returned %s: %s)
subprocessrunr
  SubprocessErrorloggerrS   
returncodestderrr   infostdout)r(   r  r   resultexcr'  r   r   r   r     s&   



z(GpuRuntimeMonitoringService._run_commandN)r!   )r"   r#   )r   r*   )r<   rP   r=   rP   r   r*   )rI   rP   rK   rg   r   rP   )rK   rg   rJ   r   r   rP   )r   rP   )r   rg   )r5   r   r   r   r   r*   )r   r   r   r   )r   r   r   r   )
r5   r   r   r   r   r   r   r   r   r   )rN   r*   r   r   )
rn   r   rm   r   ro   r   rk   r   r   r   )r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r5   r   r   r   )r5   r   r   r   )r5   r   r  rg   r   r   )r5   r   r   r   )r   r   r   r  )r  r   r   r   )r  r   r   r#   r   r   )__name__
__module____qualname__r)   r/   r-   rH   rG   rF   r>   r@   r?   rD   r   r   r   r   r   r   r   r   r   r   rC   rA   r   r   r   r   r   r   r   r   r    2   s6    




0
-














	


r    z%Optional[GpuRuntimeMonitoringService]_gpu_runtime_servicec                   C  s   t d u rt a t S r   )r/  r    r   r   r   r   get_gpu_runtime_service  s   r0  )r   r   )r   r    )"__doc__
__future__r   r  r   r   r"  r+   r   r   ior   pathlibr   	threadingr   typingr   r	   r   
app.configr
   app.core.loggingr   r,  r%  r   r   r   r   r    r/  __annotations__r0  r   r   r   r   <module>   s4    
   R