o
    3/i                     @   sp   d Z ddlZddlZddlmZ eeZdada	i a
dd Zdd Zejdejd	ejd
iZdd Zdd ZdS )znvMatmulHeuristics-based config selection for GEMM.

Queries NVIDIA's analytic heuristic library to pick tile/cluster dims based on
problem shape, then selects swap_ab by comparing estimated runtimes for both
orientations.
    N)
GemmConfigc               
   C   st   t durtS zddlm} m} | |jddada W tS  ty9 } ztd|  da daW Y d}~tS d}~ww )	z3Lazily initialize the nvMatmulHeuristics interface.Nr   )NvMatmulHeuristicsInterfaceNvMatmulHeuristicsTargetBSB)backend	precisionTz"nvMatmulHeuristics not available: F)	_nvmmh_available_ifacenvMatmulHeuristicsr   r   CUTLASS3	Exceptionloggerdebug)r   r   e r   b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/nvmmh_heuristic.py
_get_iface   s"   r   c           	   
   C   s   | t v rt |  S zEddlm}m} t }|du rW dS |j|jd}|| }|du r.W dS | }|	|| |j
|jfD ]}||| q>|t | < |W S  tyl } ztd|  dt | < W Y d}~dS d}~ww )z=Get or create a hardware descriptor for the given SM version.r   )NvMatmulHeuristicsNvidiaGpuNvMatmulHeuristicsMatmulLayoutN)	   
   z&Failed to create hardware descriptor: )_hw_descriptorsr
   r   r   r   H100_SXMB200getcreateHardwareDescriptorsetHardwarePredefinedGpuTN_ROW_MAJORTN_COL_MAJORloadInternalDiscoverySetr   r   r   )	device_capacityr   r   ifacegpu_mapgpuhwlayoutr   r   r   r   _get_hw,   s6   
r&   r   HSHSSSc           
      C   st   z/| j }|| _ | j||||d|d}|| _ |sW dS |d d }	|	j|	j|	j|	j|d d fW S  ty9   Y dS w )zXQuery nvMMH for top-1 config. Returns (tile_m, tile_n, cl_m, cl_n, est_runtime) or None.   )mnkmatmulLayoutcounthardware_descriptorNr   kernelruntime)r   get_with_mnk
cta_tile_m
cta_tile_n	cluster_m	cluster_nr   )
r!   r$   r*   r+   r,   r%   r   original_precisionresultscfgr   r   r   _query_top1X   s&    r:   c              
   C   sN  ddl m} t }|du rdS t|}|du rdS t| j}|du r%dS | jdkr/| jd n| jd }| jd }|jd }	t	||||	||j
|}
t	|||	|||j|}|
du r^|du r^dS |
rd|
d ntd}|rn|d ntd}||k r|dur|dd \}}}}d	}n|
dd \}}}}d
}|dko|dk}t||||||d|dS )aU  Use nvMatmulHeuristics to pick a GemmConfig based on problem shape.

    Queries both normal (M,N,K) with row-major output and swapped (N,M,K) with
    col-major output, picks the orientation with lower estimated runtime.

    Returns None if nvMatmulHeuristics is unavailable, letting the caller fall
    back to the hardcoded default.
    r   )r   N      infTFr         )tile_mtile_npingpongr5   r6   swap_abmax_swizzle_sizer    )r
   r   r   r&   _TORCH_DTYPE_TO_NVMMH_PRECISIONr   dtypendimshaper:   r   r   floatr   )ABr    r   r!   r$   r   r*   r,   r+   normalswapped	normal_rt
swapped_rtrB   rC   cl_mcl_nrE   rD   r   r   r   nvmmh_default_confign   sJ   	

rT   )__doc__loggingtorchquack.gemm_configr   	getLogger__name__r   r   r	   r   r   r&   bfloat16float16float32rG   r:   rT   r   r   r   r   <module>   s    
&