o
    3/i                     @   s  d Z ddlmZ ddlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZmZmZ G dd	 d	eZd
ZdZdZdZdZeddddededefddZeddddedededejfddZeefddddedededefddZeddddedefdd ZdS )!aq  Rounding mode control and stochastic rounding primitives for GEMM epilogues.

Provides a RoundingMode enum for configuring how epilogues downconvert the
accumulator dtype (typically FP32) to the output dtype before storing to gmem.
Stochastic rounding (RS) uses the hardware cvt.rs.satfinite.bf16x2.f32 PTX
instruction and is only supported on Blackwell (SM100+) GPUs.
    )IntEnumN)Float32Uint32)ir)arithllvmvector)dsl_user_opInt32Tc                   @   s   e Zd ZdZdZdZdS )RoundingModeu   Rounding modes for epilogue dtype downconversion.

    RN — Round to nearest even (default hardware behavior)
    RS — Stochastic rounding (SM100+ only, BF16 output only)
    r      N)__name__
__module____qualname____doc__RNRS r   r   [/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/rounding.pyr      s    r      l   S$ l   W= l   yn< l   .v locipabreturnc          	   	   C   s   t jd}tj|t| j||dt|j||dgddddtjjd}t	
 }ttj||dg||d}ttj||dg||d}||fS )	zuUnsigned 32b x 32b -> 64 wide multiply via PTX `mul.wide.u32`.

    Returns (hi, lo) as a pair of Uint32 values.
    z!llvm.struct<(i32, i32)>r   zL{
  .reg .u64 prod;
  mul.wide.u32 prod, $2, $3;
  mov.b64 {$1, $0}, prod;
}z	=r,=r,r,rFhas_side_effectsis_align_stackasm_dialectr   r   )r   Typeparser   
inline_asmr   ir_value
AsmDialectAD_ATTr   i32cutlassextractvalue)	r   r   r   r   	struct_tyresulti32_tyhilor   r   r   mul_wide_u32&   s    r/   	rand_bitsc                C   sV   t tjt t| j||dt|j||dt|j||dgddddtj	j
dS )zConvert 2 FP32 values to packed BF16x2 using stochastic rounding.

    Uses Blackwell PTX instruction: cvt.rs.satfinite.bf16x2.f32 dst, src_hi, src_lo, rand
    r   z+cvt.rs.satfinite.bf16x2.f32 $0, $2, $1, $3;z=r,f,f,rFr   )r(   r
   r   r#   r   r'   r   r$   r   r%   r&   )r   r   r0   r   r   r   r   r   cvt_f32x2_bf16x2_rs?   s   r1   counterkeyn_roundsc                C   s   t | }t d}t d}t d}t |}	t d}
t t}t t}t t}t t}t|D ].}t||||d\}}t||||d\}}||A |	A }||A |
A }|}|}|	| }	|
| }
q,||||fS )a  Philox 4x32b counter-based random number generator.

    Given a 32b counter and a 32b key, returns four pseudo-random uint32 words
    produced by running n_rounds of the Philox 4x32 bijection. Each round
    performs two wide 32x32->64 multiplies with the Philox constants.
    r   r   )r   PHILOX_ROUND_APHILOX_ROUND_BPHILOX_KEY_APHILOX_KEY_Branger/   )r2   r3   r4   r   r   c0c1c2c3k0k1round_around_bkey_akey_b_hi_blo_bhi_alo_ar   r   r   philox]   s(   
rI   seedtidc             
   C   s  t | j}|jd }|d dksJ d| |d }|d dks)J d| tjj}t jj|g||d}	t jj|gtj|d}
t	j
|
||d}t|D ]{}|d }|d d }tj| tjtj|||d||d	}tj| tjtj|||d||d	}|d }|d }|dkrt|d
> t|B }t|t|}|| }tt|t||||d}t|j||d}tj||tjtj|||d||d	}qNt	j|	|||d}|S )zConvert an MLIR FP32 vector to BF16 with stochastic rounding.

    Processes elements in pairs using Philox PRNG for entropy and the hardware
    cvt.rs.satfinite.bf16x2.f32 instruction.
    r      z&requires even number of elements, got    z>num_pairs must be divisible by 4 for stochastic rounding, got )r   r   r   )positionr   r      )r   
VectorTypetypeshaper(   BFloat16	mlir_typegetr
   r   
mlir_undefr9   r   extractelementr   constantr   rI   r1   r   r$   insertelementbitcast)src_vecrJ   rK   r   r   src_vec_type	num_elems	num_pairsdst_mlir_typedst_vec_typei32_vec_typei32_vecpair_idxlo_idxhi_idxsrc_losrc_hi	group_idx	intra_idxr2   
rand_batchentropy
packed_i32packed_i32_valdst_vecr   r   r   convert_f32_to_bf16_sr   sV   
ro   )r   enumr   r(   r   r   cutlass._mlirr   cutlass._mlir.dialectsr   r   r   cutlass.cutlass_dslr	   r
   r   r   PHILOX_N_ROUNDS_DEFAULTr5   r6   r7   r8   tupler/   r1   intrI   ro   r   r   r   r   <module>   sb    '