o
    ~riGn                     @   sf  d dl mZmZ e rddlZddlmZ ddlmZ d dlmZ d dl	m
Z
mZ eeZg dZed	d
 ZG dd deZG dd deZG dd deZG dd deZdd Zdd ZejdddejdedejfddZejdddejdedejfddZG d d! d!ejZd"d# Zd$d% Z d&d' Z!d(d) Z"d*d+ Z#d,d- Z$d1d.e%e& dB fd/d0Z'dS )2   )is_torch_availablelogging    N)nn)contextmanager)ConversionOps)get_module_from_nameshould_convert_module)g        g      ?g      ?g      ?g       @g      @g      @g      @g       g      g      g      g       g      g      g      c                 c   s    t  redd l}t| |jr| j} n
t| tr|| } t| dd }|dkrA|j|  d V  	 W d    d S 1 s<w   Y  |dkret|dre|j	|  d V  	 W d    d S 1 s`w   Y  d V  d S )Nr   typecudaxpu)
r   torch
isinstanceTensordevicestrgetattrr   hasattrr   )devr   dev_type r   m/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/transformers/integrations/mxfp4.py	on_device1   s&   

  
r   c                   @   sb   e Zd Zdd Z			ddeeejf dejj	dB de
e dB dedB deeejf f
d	d
ZdS )Mxfp4Quantizec                 C   
   || _ d S Nhf_quantizerselfr   r   r   r   __init__H      
zMxfp4Quantize.__init__N
input_dictmodelmissing_keysfull_layer_namereturnc              	   K   s*  t | d \}}t|tr|d n|}t||\}}t|ji t|trt|	ddt
\}	}
t
jjt
jjt
jj}}}t|	|
t
\}	}
d|v rOdnd}||jv rZ|j|= t|||	 t|| d||
|| dd ||  d	|_i W  d    S W d    d S 1 sw   Y  d S )
Nr   gate_up_proj	down_proj_precision_configrhs_dataweight_scaleflex_ctxT)tupleitemsr   listr   r   r   Mxfp4GptOssExpertsquantize_to_mxfp4	transposetriton_kernels_hub
matmul_ogsPrecisionConfigFlexCtx
InFlexDataswizzle_mxfp4_parameterssetattrdiscard_is_hf_initialized)r   r"   r#   r$   r%   kwargs_valuemoduletriton_weight_tensorr/   r9   r:   r;   projr   r   r   convertK   s8   


"zMxfp4Quantize.convertNNN__name__
__module____qualname__r    dictr   r   r   r   Moduler3   rG   r   r   r   r   r   G   s     

r   c                   @   b   e Zd Zdd Z			ddeeejf dejj	dB dedB de
e dB deeejf f
d	d
ZdS )Mxfp4Dequantizec                 C   r   r   r   r   r   r   r   r    x   r!   zMxfp4Dequantize.__init__Nr"   r#   r%   r$   r&   c           	      K   s   i }d|v rdnd}| d|  v r8t|| d tr,|| d d || d< n|| d || d< | d|  v rft|| d trZ|| d d || d< n|| d || d< t|| d || d }||iS )Nr)   r*   _blocksr   _scales)keysr   r3   dequantize_convertops)	r   r"   r#   r%   r$   rA   
param_datarF   dequantizedr   r   r   rG   {   s   zMxfp4Dequantize.convertrH   rI   r   r   r   r   rP   w        

rP   c                   @   st   e Zd Zdd Z			ddeeejf dejj	dB dedB de
e dB deeejf f
d	d
ZedefddZdS )Mxfp4Deserializec                 C   r   r   r   r   r   r   r   r       r!   zMxfp4Deserialize.__init__Nr"   r#   r%   r$   r&   c           
      K   s"  i }d|v rdnd}| d|  v r8t|| d tr,|| d d || d< n|| d || d< | d|  v rft|| d trZ|| d d || d< n|| d || d< t||\}}	t|| d || d |||| d jt ||  d|_i S )Nr)   r*   rQ   r   rR   T)	rS   r   r3   r   swizzle_mxfp4_convertopsr   r7   r?   r@   )
r   r"   r#   r%   r$   rA   rU   rF   rD   rB   r   r   r   rG      s,   zMxfp4Deserialize.convertc                 C   s
   t | jS r   )Mxfp4ReverseDeserializer   )r   r   r   r   
reverse_op   s   
zMxfp4Deserialize.reverse_oprH   )rJ   rK   rL   r    rM   r   r   r   r   rN   r3   rG   propertyr   r[   r   r   r   r   rX      s$    


'rX   c                   @   rO   )rZ   c                 C   r   r   r   r   r   r   r   r       r!   z Mxfp4ReverseDeserialize.__init__Nr"   r#   r%   r$   r&   c                 K   sR  t |jdd}t |jdd}d|v rdnd}|ddd	 }	t||\}
}i }t|
trd
|v rA|dd}	t |
|d ||	< |S d|v rw|
jjj	
|
jjjdd|ddd||	 d< |
jjjj	
|
jjjjdd||	 d< |S |
jjj	
|
jjjdd||dd||	 d< |
jjjj	
|
jjjjdd||	 d< |S )Nnum_local_experts    hidden_sizei@  r)   r*   rB      r   biasrQ    _biasr'   r(   Z      rR   )r   configrsplitr   r   r4   replacer)   storagelayoutunswizzle_datadatar6   reshapegate_up_proj_precision_configr/   r*   down_proj_precision_config)r   r"   r#   r%   r$   rA   r]   r_   rF   namerD   rB   
state_dictr   r   r   rG      s@   


zMxfp4ReverseDeserialize.convertrH   rI   r   r   r   r   rZ      rW   rZ   c                 C   s.   |j jj}|| tjtjdd\} }| |fS )Nr`   )axis)numerics_detailsmxfpdowncast_to_mxfp_torchtor   bfloat16uint8)wr7   ru   w_scaler   r   r   r5      s   
r5   c           
      C   sn   |j j|j j|j j}}}|jj}|jjj}|jdd\}}	||| |d|fi |	} ||||}| |fS )zE
    Changes the layout of the tensors depending on the hardware
    r`   )mx_axisdtype)tensorFP4convert_layoutwrap_torch_tensortensor_detailsrj   StridedLayout"make_default_matmul_mxfp4_w_layout)
ry   rz   r7   r   r   r   rj   r   value_layoutvalue_layout_optsr   r   r   r<      s   

r<   i   r}   rows_per_chunkr}   r   r&   c                C   s  ddl }| tj} |tjd }| jdd |jks,J d| jdd d|jtjt|| jd}| j^ }}}|	|| }	| 
|	|} |
|	d}tj|	|d	 || jd}
td|	|D ]R}t|| |	}| || }||| }|
|| }|d
@ tj}|| |ddddd	f< ~|d? tj}|| |ddddd	f< ~tj|||d ~~~qa|
j
g |||d	 R  jg ||| d	 R  }
|
dd	 S )w
    Convert the mxfp4 weights again, dequantizing and makes them compatible with the forward
    pass of GPT_OSS.
    r   N   r'   zblocks.shape[:-1]=z does not match scales.shape=)r}   r   r`   r         )out)mathrv   r   rx   int32shaper~   
FP4_VALUESr   prodrm   emptyrangeminintldexpviewr6   
contiguous)blocksscalesr}   r   r   lutprefix_shapeGB
rows_totalr   r0r1blkexpsubidx_loidx_hir   r   r   _convert_moe_packed_tensors  s2   44r   c             	   C   sL   z	t | |||dW S  tjy%   | d} |d}t | |||d Y S w )r   r   cpu)r   r   OutOfMemoryErrorrv   )r   r   r}   r   r   r   r   convert_moe_packed_tensorsI  s   

r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )r4   c                    s   t    |j| _|j| _|j| _tjtj	| jd| j | jd dtj
ddd| _tjtj	| jd| j tjddd| _tjtj	| j| j| jd dftj
ddd| _tjtj	| j| jtjddd| _d| _t|dd	| _d | _d | _t|dd	| _d S )
Nr   r^   re   r|   Frequires_gradgZd;?swiglu_limitg      @)superr    r]   num_expertsintermediate_sizer_   r   	Parameterr   zerosrx   r)   float32gate_up_proj_biasr*   down_proj_biasalphar   limitrn   ro   )r   rf   	__class__r   r   r    c  s.   
" zMxfp4GptOssExperts.__init__hidden_statesr&   c                 C   s   t jjt jjt jj}}}t jj}t|j= ||d|d| j| j	fd}	||| j
| jtj||| jd |	d}
||
| j| jtj||| j|jd}W d    |S 1 sWw   Y  |S )Nswiglu)r   r   r   )gather_indxprecision_configgammasfused_activation)scatter_indxr   r   )r7   r8   FnSpecsFusedActivationr   	swiglu_fnr   r   r   r   r)   r   rv   r   r   rn   r*   r   ro   	gate_scal)r   r   routing_data
gather_idxscatter_idxr   r   r8   r   actintermediate_cache1intermediate_cache3r   r   r   forward  s<   

zMxfp4GptOssExperts.forward)rJ   rK   rL   r    r   r   r   __classcell__r   r   r   r   r4   b  s    r4   c                 C   s
  dd l }tjjtjjtjjtjjf\}}}}t| j t	j
 }t|jdd}d}	| jd }
| jd }|| }|| }|d | }|
| }dd }|| |\}}t	j|dd}t	j|dd\}}t	|d|}|d}t	j|||d d	|| }|dt	j}d
}t	||k ||}t	j|ddt	j}t	|t	j}t	||k ||	}t	||k||	}t	||	k|	|}|| }t	|| |	k|	|}|| | d}|| | d}||||}|}W d    n1 sw   Y  ||||||||fS )Nr   
LOCAL_RANK0r'   r`   c                 S   sF   t j|  dddd d d |f }| }t j| |dd}|| fS )Nr`   T)dimstabler   )r   argsortlongtake_along_dimr   )valsktk_indxtk_valr   r   r   topk  s   "z routing_torch_dist.<locals>.topkr   )binsmaxi  T)r   )src_indxdst_indx)osr7   routing
GatherIndxRoutingDataScatterIndxcompute_expt_data_torchr   r   r   distributedget_world_sizer   environgetr   softmaxsortgatherrm   histcr   rv   r   wherer   )logitsn_expts_actr   r   r   r   r   
world_sizerankreplace_valuen_tokensn_expts_totn_local_expertslocal_expert_startlocal_expert_endn_gates_padr   	expt_scal	expt_indxsort_indiceshistvar	topk_indx	gate_indxr   r   r   	expt_datahit_expertsr   r   r   routing_torch_dist  sN   



4r  c           
      C   s   dd l m} | r| rt| drt}ntjj}|jd }|	d| j
j}tj|| j
j| j
j}t|j ||| j
j\}}}W d    n1 sMw   Y  | j||||d}	|		|d| j
j}	|	|fS )Nr   
_is_hookedr'   )r   )torch.distributedr   is_availableis_initializedr   r  r7   r   r   rm   router
hidden_dimr   
functionallinearweightra   r   r   top_kexperts)
r   r   distr   
batch_sizerouter_logitsr   r   r   
routed_outr   r   r   mlp_forward  s   
r  c              
   K   s   ddl m} |d}|d}|d}	|d}
|d}|d}d	D ]W}||v r}|d ur;||||||	|
||}| d
}| d}t| |ddd | t| |r}t| |r}tt| |t| |}t| |tj	
|| t| | t| | q&d S )Nr   shard_and_distribute_moduler#   empty_paramcasting_dtypeto_contiguousr   device_mesh)r)   r*   rQ   rR   .r`   )integrations.tensor_parallelr  r   r>   rg   r   r   r   r   r   r   rv   delattr)rD   
param_nameparam_valuetarget_devicedq_param_namerA   r  r#   r  r  r  r   r  rF   blocks_attrscales_attrrV   r   r   r   
dequantize  s<   










r"  c                 C   s   t | |}tj|S r   )r   r   r   r   )r   r   rV   r   r   r   rT     s   
rT   c              	   K   s  |j j|j j|j j}}}ddlm}	 |d}
|d}|d}|d}|d}|d}d	|v rB|d
d dd }d|v rR|d
d dd }|durb|	|
||||||| nt| |	d
dd t
jj|dd | d}| d}t| |}t| |}|jjdkrB|jjdkrD|d}|dkr||| jd d}n
||d| jd }t|d|dkrtt
drt
j durt
j j}|| }|| }t| t|dd|dd|\}}W d   n1 sw   Y  |dkrt
|| j| jd g|_nt
|| j| jg|_t| || t| | d|||| dd t| | t| | ~dS dS dS )q
    This transforms the weights obtained using `convert_gpt_oss.py` to load them into `Mxfp4GptOssExperts`.
    r   r  r#   r  r  r  r   r  r   r  r'   rQ   r   r   rR   Nr`   Fr   metar)   r
   r   acceleratorr(   r+   r,   r.   )r8   r9   r:   r;   r  r  r   splitr>   rg   r   r   r   r   r   r
   sizerm   r   r   r%  current_acceleratorrv   r   r   r<   r6   Sizer_   r   r  )rD   r  r  r  r7   rA   r9   r:   r;   r  r#   r  r  r  r   r  rF   r   r!  r   r   local_expertsrE   r/   r   r   r   load_and_swizzle_mxfp4#  sl   






$









r+  c                 C   sn  |j j|j j|j j}}}| d}	t|d|dkr-ttdr-tj	 dur-tj	 j
}| | } || }|dkrJ| |	|jd d} n
| |	d|jd } t| t| d	d|d	d|\}
}W d   n1 ssw   Y  |dkrt|	|j|jd g|
_nt|	|j|jg|
_||jv r|j|= t|||
 t|| d
|||| dd dS )r#  r   r
   r   r%  Nr)   r   r'   r(   r+   r,   r.   )r8   r9   r:   r;   r'  r   r   r   r%  r(  r
   rv   r   rm   r   r   r<   r6   r)  r_   r   r=   r>   )r   r   rD   rF   r  r7   r9   r:   r;   r*  rE   r/   r   r   r   rY   j  s>   




rY   modules_to_not_convertc              	   C   s   |j r| S ddlm} |dad}|  D ]H\}}t||sq|jjdkrH|j sHt	d | 
|t| j d}W d   n1 sCw   Y  |jjd	kr]|j s]d
dlm} |t||_q|setd | S )aD  
    Public method that replaces the expert layers of the given model with mxfp4 quantized layers.

    Args:
        model (`torch.nn.Module`):
            The model to convert, can be any `torch.nn.Module` instance.
        quantization_config (`Mxfp4Config`, defaults to `None`):
            The quantization config object that contains the quantization parameters.
        modules_to_not_convert (`list`, *optional*, defaults to `None`):
            A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
            converted.
    r`   )
get_kernelz(kernels-community/gpt-oss-triton-kernelsFGptOssExpertsr$  TN	GptOssMLPr   )
MethodTypezYou are loading your model using mixed-precision FP4 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)r"  hub_kernelsr-  r7   named_modulesr	   r   rJ   r   r   set_submoduler4   rf   typesr0  r  r   loggerwarning)r#   quantization_configr,  r-  has_been_replacedmodule_namerD   r0  r   r   r   replace_with_mxfp4_linear  s,   
r:  )NN)(utilsr   r   r   r   
contextlibr   core_model_loadingr   quantizers.quantizers_utilsr   r	   
get_loggerrJ   r5  r   r   r   rP   rX   rZ   r5   r<   rw   r}   r   r   r   r   rN   r4   r  r  r"  rT   r+  rY   r3   r   r:  r   r   r   r   <module>   sV   

004
:
CD!G1