o
    4/i}                 <   @   s  d Z 	 ddlZddlmZmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlm  mZ z
ddlmZ dZW n eyK   d	Zd
d ZY nw z
ddlmZ dZW n eyc   d	ZdZY nw dZdZdZdZdZdZdZdZ dZ!dZ"dZ#dZ$dZ%dZ&dZ'dZ(dZ)ee) Z*dZ+dZ,dZ-dede.de.de.fddZ/dede.de.de.fddZ0ej1dej2dej3dej4d ej5e. d!ej5e. d"ej3d#ej3d$ej3d%ej3d&ej3d'ej3d(ej3d)ej3d*ej3d+ej3d,ej5e6 d-ej5e6 d.ej5e6 d/ej5e. d0ej5e. d1ej5e. d2ej5e. d3ej5e. d4ej5e. d5ej5e7 d6ej5e7 d7ej5e7 f6d8d9Z8ej1dej2dej3dej4d ej5e. d!ej5e. d"ej3d#ej3d$ej3d%ej3d&ej3d'ej3d(ej3d)ej3d*ej3d+ej3d,ej5e6 d-ej5e6 d.ej5e6 d/ej5e. d0ej5e. d1ej5e. d2ej5e. d3ej5e. d4ej5e. d5ej5e7 d6ej5e7 d7ej5e7 f6d:d;Z9ej:dej3d"ej3d#ej3d$ej3d%ej3d&ej3d'ej3d(ej3d)ej3d*ej3d+ej3d,ej5e6 d-ej5e6 d.ej5e6 d/ej5e. d0ej5e. d1ej5e. d2ej5e. d3ej5e. d4ej5e. d5ej5e7 d6ej5e7 d7ej5e7 d<ej;f0d=d>Z<ej:dej3d"ej3d#ej3d$ej3d%ej3d&ej3d'ej3d(ej3d)ej3d*ej3d+ej3d,ej5e6 d-ej5e6 d.ej5e6 d/ej5e. d0ej5e. d1ej5e. d2ej5e. d3ej5e. d4ej5e. d5ej5e7 d6ej5e7 d7ej5e7 d<ej;f0d?d@Z=ej>d0e.d1e.d2e.d/e.d3e.d4e.dAej?d.e6d6e7fdBdCZ@ej>d0e.d1e.d2e.d/e.d3e.d4e.dAej?d.e6d6e7fdDdEZAe					dfd%ej3d&ej3d'ej3dFeej3 d"ej3d#ej3d$ej3d(ej3d.ee6 dGeej3 d6e7dHeej3 dIeej3 deej3ej3f fdJdKZBej1dej2dej3dej4d!ej5e. d%ej3d&ej3d'ej3d#ej3d(ej3d"ej3d$ej3d)ej3d*ej3d,ej5e6 d-ej5e6 d.ej5e6 d2ej5e. d/ej5e. d6ej5e7 f&dLdMZCej1dej2dej3dej4d!ej5e. d%ej3d&ej3d'ej3d#ej3d(ej3d"ej3d$ej3d)ej3d*ej3d,ej5e6 d-ej5e6 d.ej5e6 d2ej5e. d/ej5e. d6ej5e7 f&dNdOZDej:d+ej3d%ej3d&ej3d'ej3d#ej3d(ej3d"ej3d$ej3dej3d*ej3d)ej3d,ej5e6 d-ej5e6 d.ej5e6 d0ej5e. d1ej5e. d2ej5e. d/ej5e. d3ej5e. d4ej5e. d5ej5e7 d6ej5e7 d<ej;f.dPdQZEej:d+ej3d%ej3d&ej3d'ej3d#ej3d(ej3d"ej3d$ej3dej3d*ej3d)ej3d,ej5e6 d-ej5e6 d.ej5e6 d0ej5e. d1ej5e. d2ej5e. d/ej5e. d3ej5e. d4ej5e. d5ej5e7 d6ej5e7 d<ej;f.dRdSZFe			dgd%ej3d&ej3d'ej3dFej3d"ej3d#ej3d$ej3d(ej3d.ee6 dGeej3 d6e7deej3ej3f fdTdUZGej1dej3dVej3d ej5e. d!ej5e. dWej5e. d"ej3d#ej3d$ej3d%ej3d&ej3d'ej3d(ej3d)ej3d*ej3d+ej3d,ej5e6 d-ej5e6 d.ej5e6 d/ej5e. d0ej5e. d1ej5e. d2ej5e. d3ej5e. d4ej5e. d5ej5e7 d6ej5e7 d7ej5e7 dXej5e7 dYej5e7 f:dZd[ZHej:dej3dVej3d"ej3d#ej3d$ej3d%ej3d&ej3d'ej3d(ej3d)ej3d*ej3d+ej3d,ej5e6 d-ej5e6 d.ej5e6 d/ej5e. d0ej5e. d1ej5e. d2ej5e. d3ej5e. d4ej5e. dWej5e. d ej5e. d5ej5e7 d6ej5e7 d7ej5e7 dXej5e7 dYej5e7 d<ej;f:d\d]ZIej>d0e.d1e.d2e.d/e.d3e.d4e.d^e.d_e.dXe7dYe7d.e6d6e7dWe.d e.fd`daZJe					dhd%ej3d&ej3d'ej3dHej3dIej3d"ej3d#ej3d$ej3d(ej3d.ee6 dGeej3 dbeej3 dXe7d6e7deej3ej3f fdcddZKdS )ia3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)OptionalTuple)cpasync)from_dlpack   )flashinfer_apiTFc                 C   s   | S )N )funcr   r   b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/flashinfer/gdn_decode.pyr   ;   s   r   )gated_delta_rule             $               
batch_sizeseq_lenreturnc                 C   s   dS )zSelect vec_size for MTP kernel.

    Always use vec_size=4 (32 threads per group = full warp, 4 groups per block).
    Full warp shuffle is more efficient and achieves >= 1.0x speedup vs Triton.
    r   r   r   r   r   r   r
   get_vec_size_mtpu   s   r   c                 C   s4   | dkrdS | dkrdS | dkrdS | dkrdS dS )zSelect optimal TILE_V for MTP kernel based on batch size and sequence length.

    With vec_size=4, num_groups=4, rows_per_group = tile_v / 4.
    Tuned via grid search for optimal performance.
    r   r   r   r   r   @   r   r   r   r   r
   get_tile_v_mtp~   s   r   tiled_copy_load	h0_sourcesmem_layout_stagedvec_sizenum_v_tilesA_logadt_biasqkvbo
h0_indices
cu_seqlenssoftplus_betasoftplus_thresholdscaleHVBTHKVuse_initial_stateuse_qk_l2norm	is_varlenc           a   
   C   s  t j \}}}|d }t j }t j|}t j \}}}|t } |t }!|t }"| | }#| | }$|$||  }%d}&tj	 }'|'
tj|d}(|'
tjt |fd})|'
tjt |fd}*t t j|fddtj}+t t j|fddtj},t t j|fddtj}-t t j|fddtj}.t t j|fddtj}/t t j|fddtj}0|| }1t||$ }2t||#|&|$f }3t||$ }4t||#|&|$f }5t j  || ddf }6t |dttf| ddf}7t |6ttfd	}8| |}9|!|" }:ttd |"};t|:|:|; D ]-}<|<|: t }=|8dd|<f }>|(dd|=f }?|9|>}@|9|?}At | |@|A t j  qt |ddd|f|#|&|%|f}Bt |	ddd|f|#|&|%|f}Ct |B|. t |C|/ t|D ]}Dt|.|D |,|D< t|/|D |+|D< qkt |
ddd|f|#|&|$|f}Et |E|0 t|D ]}Dt|0|D |*|1|D < qt j  d
}Fd
}G|dkr|3|4 }H||H }Id
}J|I|krt j|Idd}Ktd|K }Ltt j|Ldd}Mttd| |M }Jn|H}Jt j|2dd |J }Nddt j|5 dd  }Gt j|Ndd}Ft j|Fd}Ft j|Gd}G|rd
}Od
}Pt|D ]}D|O|,|D |,|D  7 }O|P|+|D |+|D  7 }Pq%dD ]}Q|Ot jj |O|Qddd7 }O|Pt jj |P|Qddd7 }Pq?t j!|Od dd}Rt j!|Pd dd}St|D ]}D|,|D |R |,|D< |+|D |S |+|D< qrt|D ]}D|,|D | |,|D< q|:|" }Tt|:|TD ]}<|<|: t }=t j"d t j  |<|; }U|U|Tk r|U|: t }V|8dd|Uf }W|(dd|Vf }X|9|W}@|9|X}At | |@|A t j  tdtdD ]}Y|d }Zd
}[t |(d|df|Y|Z ||=f}\t |\|- t|D ]}D|-|D |F |-|D< |[|-|D |+|D  7 }[qdD ]}Q|[t jj |[|Qddd7 }[q(|*|<t |Y |Z  |[ }]|]|G }]d
}^t|D ]}D|-|D  |+|D |] 7  < |^|-|D |,|D  7 }^qOt |7dd|dfd|Y|Z ||<f}_t |-|_ dD ]}Q|^t jj |^|Qddd7 }^q|<t |Y |Z }`|dkr|`|k rt|^|)|`< qqt j  ||:t kr||Tt k r|)| ||#|&|$|f< dS dS dS zCEach block uses pipeline to load one batch and vectorized writebackr   r   r   r   r   strideNr   )Nr           Tfastmath      ?r   r   r   r   r      offsetmaskmask_and_clampư>r   )#cutearch
thread_idxwarp_idxmake_warp_uniform	block_idxNUM_BLOCKS_PER_STATEcutlassutilsSmemAllocatorallocate_tensorFloat32BFloat16make_layoutmake_rmem_tensorbarrier
local_tileTILE_VTILE_K	get_slicemin
NUM_STAGESrangepartition_Spartition_Dcopycp_async_commit_groupautovec_copyrange_constexprexplogshuffle_syncshuffle_sync_bflyrsqrtcp_async_wait_group)ar   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   tidx_lane_idrJ   rL   	batch_idxbatch_innernum_v_tiles_per_blocki_ni_hvi_hi_tsmemsDatasOutputsVr_kr_qr_hr_q_bf16r_k_bf16r_v_bf16k_startr_A_logr_a	r_dt_biasr_b
gSrc_batchgDstgSrcthr_copy_loadstart_v_tilesprefetch_countv_tilesstage	gSrc_tilesData_stagethr_gSrc	thr_sDataq_tilek_tileiv_tiler_gr_betaxbeta_x
softplus_x
exp_beta_x	log_input
log_result	r_g_valuesum_qsum_krC   
inv_norm_q
inv_norm_kend_v_tilesnext_v_tiles
next_stage	gSrc_next
sData_nextrow
row_offsetsum_hk
sData_tilev_newsum_hq	gDst_tileo_idxr   r   r
   *gdn_decode_kernel_small_batch_pretranspose   s<   















.r   c           \   
   C   sP  t j \}}}|d }t j }t j|}t j \}}}|| } || }!|!||  }"d}#t||! }$t|| |#|!f }%t||! }&t|| |#|!f }'tj	 }(|(
tj|d})|(
tjt |fd}*|(
tjt |fd}+t t j|fddtj},t t j|fddtj}-t t j|fddtj}.t t j|fddtj}/t t j|fddtj}0t t j|fddtj}1|| }2t j  ||ddf }3t |dttf|ddf}4t |3ttfd	}5| |}6ttd |}7t|7D ]*}8|8t }9|5dd|8f }:|)dd|9f };|6|:}<|6|;}=t | |<|= t j  qt |ddd|f| |#|"|f}>t |	ddd|f| |#|"|f}?t |>|/ t |?|0 t|D ]}@t|/|@ |-|@< t|0|@ |,|@< qUt |
ddd|f| |#|!|f}At |A|1 t|D ]}@t|1|@ |+|2|@ < qt j  d
}Bd
}C|dkr|%|& }D||D }Ed
}F|E|krt j|Edd}Gtd|G }Htt j|Hdd}Ittd| |I }Fn|D}Ft j|$dd |F }Jddt j|' dd  }Ct j|Jdd}Bt j|Bd}Bt j|Cd}C|rpd
}Kd
}Lt|D ]}@|K|-|@ |-|@  7 }K|L|,|@ |,|@  7 }LqdD ]}M|Kt jj|K|Mddd7 }K|Lt jj|L|Mddd7 }Lq)t j |Kd dd}Nt j |Ld dd}Ot|D ]}@|-|@ |N |-|@< |,|@ |O |,|@< q\t|D ]}@|-|@ | |-|@< qut|D ]	}8|8t }9t j!d t j  |8|7 }P|P|k r|Pt }Q|5dd|Pf }R|)dd|Qf }S|6|R}<|6|S}=t | |<|= t j  tdtdD ]}T|d }Ud
}Vt |)d|df|T|U ||9f}Wt |W|. t|D ]}@|.|@ |B |.|@< |V|.|@ |,|@  7 }VqdD ]}M|Vt jj|V|Mddd7 }Vq	|+|8t |T |U  |V }X|X|C }Xd
}Yt|D ]}@|.|@  |,|@ |X 7  < |Y|.|@ |-|@  7 }Yq0t |4dd|dfd|T|U ||8f}Zt |.|Z dD ]}M|Yt jj|Y|Mddd7 }Yqc|8t |T |U }[|dkr|[|k rt|Y|*|[< qϐqt j  ||k r|*| || |#|!|f< dS dS r7   )"rG   rH   rI   rJ   rK   rL   rN   rR   rO   rP   rQ   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   )\r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   rj   rk   rl   rJ   rm   rp   rq   rr   rs   r   r   r   r   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rC   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r
   (gdn_decode_kernel_big_batch_pretranspose  s2   















.
r   streamc           #      C   s   | j jd | j jd | j jd }}}tjtjtjjdtj	dd}tj
ddd	}t
d
}t|||}t|t}td } tj
tttftdtt fd	}!dt t t d|  d|  d }"t|| |!| ||||||||||	|
||||||||||||j|t ddftddg|"|d dS )z>Launch original pipelined kernel for small batch pretranspose.r   r   r   
cache_moder   num_bits_per_copyr   r   r   r   r9   r   r   r   r   gridblockrt   r   N)layoutshaperG   make_copy_atomr   	CopyG2SOpLoadCacheModeGLOBALrN   rR   rT   make_tiled_copy_tvceil_divrX   rY   r\   r   launchrM   NUM_THREADS#r   r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r   r   v_dimk_dim	copy_atomthread_layout
val_layoutr   r    r   r   
smem_bytesr   r   r
   .run_gdn_decode_kernel_small_batch_pretranspose  sr   




$
r   c           #      C   s  | j jd | j jd | j jd }}}tjtjtjjdtj	dd}tj
ddd	}t
d
}t|||}t|t}td } tj
tttftdtt fd	}!dt t t d|  d|  d }"t|| |!| ||||||||||	|
||||||||||||j|ddftddg|"|d d S )Nr   r   r   r   r   r   r   r   r9   r   r   r   r   )r   r   rG   r   r   r   r   r   rN   rR   rT   r   r   rX   rY   r\   r   r   r   r   r   r   r
   ,run_gdn_decode_kernel_big_batch_pretranspose  sr   




$
r   dtypec	           	      C      i S )zECache compiled kernel for given configuration (pretranspose version).r   	r/   r0   r1   r.   r2   r3   r   r-   r5   r   r   r
   _get_compiled_decode_kernel     r   c	           	      C   r   )zECache compiled kernel for given configuration (nontranspose version).r   r   r   r   r
   (_get_compiled_decode_kernel_nontranspose  r   r   stateoutputinitial_stateinitial_state_indicesc           0      C   s  | j \}}}}|j \}}}}|du}||duksJ d|rA|j d }|j ||||fks@J d| d| d| d| d|j  
n%|dusIJ d	|j ||||fksfJ d
| d| d| d| d|j  
|rk|jn|j}to|tjko|dv o|dko|dk}|r| jtjtjfv sJ d| j |jtjksJ d|j |du r|d n|}t|||dd| ||||r|n|||
|d}|	du}|r|	jn| j}|	dur|	| n|}	|	j|kr|		|}	|r|n|}|	|fS |rJ d|dksJ d| |jtjksJ d|j |dksJ d| |dks%J d| |t
 dks6J dt
 d| | jtjtjfv sHJ d| j |jtjksWJ d|j |du r`|d }|	du}|rj|	jn| j}|	du rtj||||ftj| jd}	||| ||}||||||| j||
f	}t| }d|vs|d j| jkrtj|tj| jd|d< tj|d tj| jd|d< |d } |d }!d|vrAttj j}"t|dd }#t|dd }$t|dd }%t|dd }&t| dd }'t|dd }(t|dd })t|dd }*t|	dd }+t| dd },t|!dd }-t}.tj|.|#|$|%|&|'|(|)|*|+|,|-fdd|||||||d!|
d"|"d#d$}/|/|d< n|d }/ttj j}"|/||||| ||||	| |!|" |	j|krh|		|}	| sx|||||| |	|fS )%u  Gated Delta Rule Decode kernel for single-token generation.

    This implements the decode phase of gated delta rule linear attention,
    processing one token at a time and updating the recurrent state.

    Args:
        q (torch.Tensor):
            Current query of shape ``[B, 1, H, K]``. Must be float16/bfloat16.
        k (torch.Tensor):
            Current key of shape ``[B, 1, H, K]``. Must be float16/bfloat16.
        v (torch.Tensor):
            Current value of shape ``[B, 1, HV, V]``. Must be float16/bfloat16.
        state (Optional[torch.Tensor]):
            Current state of shape ``[B, HV, V, K]`` (v-major / K-last layout).
            Float32: legacy kernel (T=1 only).  Bfloat16: gdn_decode_klast_bf16_state backend
            when T in 1..4 and K=V=128. Will be updated in-place.
            Pass ``None`` when using ``initial_state`` / ``initial_state_indices`` instead.
        A_log (torch.Tensor):
            Log decay parameter of shape ``[HV]``. Must be float32.
        a (torch.Tensor):
            Input-dependent decay of shape ``[B, 1, HV]``. Must be float16/bfloat16.
        dt_bias (torch.Tensor):
            Decay bias of shape ``[HV]``. Must be bfloat16 or float32.
        b (torch.Tensor):
            Update gate (beta) input of shape ``[B, 1, HV]``. Must be float16/bfloat16.
        scale (Optional[float]):
            Scale factor for queries. If None, defaults to ``1 / sqrt(K)``.
        output (Optional[torch.Tensor]):
            Pre-allocated output tensor of shape ``[B, 1, HV, V]``.
            If None, will be allocated automatically.
        use_qk_l2norm (bool):
            Whether to apply L2 normalization to q and k. Default: ``True``.
        initial_state (Optional[torch.Tensor]):
            State pool of shape ``[pool_size, HV, V, K]`` (K-last / K-contiguous,
            same layout as the per-batch ``state`` argument).
            When provided, the kernel gathers directly from the pool using
            ``initial_state_indices`` and writes updates back in-place — eliminating
            the caller-side gather/scatter overhead.
            Requires bfloat16 state with T in 1..4 and K=V=128 (bf16 fast path).
        initial_state_indices (Optional[torch.Tensor]):
            Per-batch indices of shape ``[B]`` (int32 or int64) mapping each batch
            entry to its slot in ``initial_state``.  Required when ``initial_state``
            is provided.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            - output: Output tensor of shape ``[B, 1, HV, V]``
            - state or initial_state: Updated state (in-place).

    Note:
        - Requires SM90+ (Hopper, Blackwell, etc.)
        - State is always updated in-place; the pool path writes directly into
          ``initial_state`` memory (no separate scatter step needed)
        - State layout is v-major (K-last): [B, HV, V, K]. When state is bfloat16
          and T in 1..4 with K=V=128, the gdn_decode_klast_bf16_state kernel is used
          (supports both the direct ``state`` path and the pool+indices path).
        - pool+indices (``initial_state``/``initial_state_indices``) only supported
          via the bf16 fast path; float32 state raises an error.
        - Legacy path (float32 state, T=1): K and V must be multiples of 4.
    NzAinitial_state and initial_state_indices must be provided togetherr   (Expected initial_state shape [pool_size=, HV=, V=, K=], got z.Either state or initial_state must be providedExpected state shape [B=)r   r      r   r    q must be float16/bfloat16, got A_log must be float32, got       r>         4@)r!   r"   r#   r+   r,   r$   r%   r&   r'   initial_state_sourcer   use_qk_l2norm_in_kernelr-   zpool+indices (initial_state/initial_state_indices) requires bfloat16 state with T in 1..4 and K=V=128 (the gdn_decode_klast_bf16_state fast path)r    Decode only supports T=1, got T=state must be float32, got K must be at least 128, got K=V must be at least 128, got V=V must be divisible by ( to prevent out-of-bounds access, got V=r   devicer)   r*   compiledr   assumed_alignTF--enable-tvm-ffi)r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r   options)r   r   &_GDN_DECODE_KLAST_BF16_STATE_AVAILABLEtorchbfloat16float16float32-_gated_delta_rule_gdn_decode_klast_bf16_statecopy_torX   zerosr   reshaper   int32cudaCUstreamcurrent_streamcuda_streamr   r   rG   compileis_contiguous)0r$   r%   r&   r   r!   r"   r#   r'   r-   r   r5   r   r   r/   r0   r1   r2   rk   r.   r3   use_pool	pool_sizestate_dtypeuse_gdn_decode_klast_bf16_state	scale_valoutoutput_providedtarget_dtypereturn_stater   	cache_keycacher)   r*   r   h0_source_tensorA_log_tensora_tensordt_bias_tensorq_tensork_tensorv_tensorb_tensoro_tensorh0_indices_tensorcu_seqlens_tensorrun_funcr   r   r   r
   $gated_delta_rule_decode_pretranspose  s  M
"










r  c           i   	   C   s  t j \}}}|d }t j }t j|}t j \}}}d}t| }d| }t| }|t }|t }|t }|| }|| } || }!|!||  }"||  }#|#dkr|| }$|| }%|| }&|&|% }'t	j
 }(|(t	j|d})t jtfdd}*|(t	j|*d}+t jtfdd},t jtfdd}-|(t	j|,d}.|(t	j|-d}/|tk rt	|| d|"|f |.|< t	|| d|"|f |/|< |#| |! }0||0ddf }1t |1ttfd}2| |}3t	td	 |}4t|4D ].}5||5 }6|5t }7|2dd|6f }8|)dd|7f }9|3|8}:|3|9};t | |:|; t j  qt	|	|! }<t	|
|! }=t	|| d|!f }>t	|| d|!f }?d
}@d
}A|dkr|>|= }B||B }Cd
}D|C|krtt j|Cdd}Et	d|E }Ft	t j|Fdd}Gt	t	d| |G }Dn|B}Dt j|<dd |D }Hddt j|? dd  }At j|Hdd}@t j|@d}@t j|Ad}At j  |rsd
}Id
}J|tk r|/| }K|.| }L|K|K }I|L|L }JdD ]}M|It jj|I|Mddd7 }I|Jt jj|J|Mddd7 }Jq|dkr|I|+|< |J|+|d < t j  d
}Nd
}O|dkrId
}Pd
}Q||k r|+| }P|+|d  }QdD ]}M|Pt jj|P|Mddd7 }P|Qt jj|Q|Mddd7 }Qq|dkrIt j|Pd dd|+d< t j|Qd dd|+d	< t j  |+d }N|+d	 }O|tk rm|.| |O |.|< |/| | |N |/|< t j  n|tk r|/| | |/|< t j  t|D ]B}5||5 }6|5t }7t jd t j  |5|4 }R|R|k r||R }S|Rt }T|2dd|Sf }U|)dd|Tf }V|3|U}:|3|V};t | |:|; t j  |6t |' }Wt	|| d|!|Wf }Xd
}Yt|ddD ]}Z|Z| }[|[|$ }\|)|\|'|7f |@ }]|.|\ }^|Y|]|^ 7 }YqdD ]}M|Yt jj|Y|M| ddd7 }Yq|X|Y |A }_t j|_|%}_d
}`t|ddD ]/}Z|Z| }[|[|$ }\|)|\|'|7f |@ }a|.|\ }^|/|\ }b|a|^|_  }c|c|)|\|'|7f< |`|c|b 7 }`q5dD ]}M|`t jj|`|M| ddd7 }`qg|$dkr|6t |' }dt	|`|| d|!|df< t j  t	 |D ]*}Z||Zd  }e|et }f|et }g|ftk r|)|f|g|7f }]|6t |g }h|]||0|f|hf< qt j  qdS dS )zDSmall batch kernel for (N, 1, ...) format with K-major state layout.r   r   r   r   r8   r9   Nr   Nr   r;   Tr<   r>   r?   r@   rA   rB   )r   r   rF   r   unrollr   r   r   )!rG   rH   rI   rJ   rK   rL   TILE_V_SMALL_NT	TILE_K_NTNUM_BLOCKS_PER_STATE_SMALL_NTrN   rO   rP   rQ   rR   rT   rW   rZ   r[   NUM_STAGES_NTr]   r^   r_   r`   ra   rd   re   rf   rV   rg   rh   ri   rS   rc   )ir   r   r   r    r$   r%   r&   r"   r'   r!   r#   r(   r)   r+   r,   r-   r1   r.   r5   rj   rk   in_warp_tidrJ   rL   NUM_WARPS_SMALLV_PER_WARP_SMALLROWS_PER_ITER_SMALLNUM_K_ITERS_SMALLrm   rn   ro   start_v_tilerp   rq   rr   pool_idxk_localv_localv_basev_idxrt   ru   smem_o_layoutsmem_osmem_k_layoutsmem_q_layoutsKsQflat_idxr   r   r   r   v_tile_offsetr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   sum_q_partialsum_k_partialq_valk_valrC   r   r   local_sum_qlocal_sum_knext_v_tile_offsetnext_v_tiler   r   r   v_globalr_vr   k_iterk_basek_idxh_valr_k_valr   r   h_oldr_q_valh_newv_global_outflat_tidk_writev_writev_global_writer   r   r
   *gdn_decode_kernel_small_batch_nontranspose  sr  


























 wrL  c           _   	   C   sJ  t j \}}}|d }t j }t j|}t j \}}}|| }|| }|||  }|| }|dkr|t }|t }|t }|| }tj	 } | 
tj|d}!t jtfdd}"| 
tj|"d}#t jtfdd}$t jtfdd}%| 
tj|$d}&| 
tj|%d}'|tk rt||d||f |&|< t||d||f |'|< || | }(||(ddf })t |)ttfd}*| |}+ttd |},t|,D ]*}-|-t }.|*dd|-f }/|!dd|.f }0|+|/}1|+|0}2t | |1|2 t j  qt|	| }3t|
| }4t||d|f }5t||d|f }6d	}7d	}8|dkrq|5|4 }9||9 }:d	};|:|krRt j|:d
d}<td|< }=tt j|=d
d}>ttd| |> };n|9};t j|3d
d |; }?ddt j|6 d
d  }8t j|?d
d}7t j|7d}7t j|8d}8t j  |rQd	}@d	}A|tk r|'| }B|&| }C|B|B }@|C|C }AdD ]}D|@t jj|@|Dddd7 }@|At jj|A|Dddd7 }Aq|dkr|@|#|< |A|#|d < t j  d	}Ed	}F|dkr'd	}Gd	}H|tk r|#| }G|#|d  }HdD ]}D|Gt jj|G|Dddd7 }G|Ht jj|H|Dddd7 }Hq|dkr't j|Gd d
d|#d< t j|Hd d
d|#d< t j  |#d }E|#d }F|tk rK|&| |F |&|< |'| | |E |'|< t j  n|tk r^|'| | |'|< t j  t|D ]:}-|-t }.t jd t j  |-|, }I|I|k r|It }J|*dd|If }K|!dd|Jf }L|+|K}1|+|L}2t | |1|2 t j  |-t | }Mt||d||Mf }Nd	}Ott ddD ]}P|Pt! }Q|Q| }R|!|R||.f |7 }S|&|R }T|O|S|T 7 }OqdD ]}D|Ot jj|O|Dt ddd7 }Oq|N|O |8 }Ut j|U|}Ud	}Vtt ddD ]/}P|Pt! }Q|Q| }R|!|R||.f |7 }W|&|R }T|'|R }X|W|T|U  }Y|Y|!|R||.f< |V|Y|X 7 }VqdD ]}D|Vt jj|V|Dt ddd7 }Vq=|dkre|-t | }Zt"|V||d||Zf< t j  t#t D ]*}P||Pd  }[|[t }\|[t }]|\tk r|!|\|]|.f }S|-t |] }^|S||(|\|^f< qot j  qgdS dS )zDLarge batch kernel for (N, 1, ...) format with K-major state layout.r   r   r   r8   r9   Nr  r   r;   Tr<   r>   r?   r@   rA   rB   r   r  rF   r  r   )$rG   rH   rI   rJ   rK   rL   V_PER_WARP_NTrN   rO   rP   rQ   rR   rT   	TILE_V_NTr  rW   rZ   r[   r!  r]   r^   r_   r`   ra   rd   re   rf   rV   rg   NUM_WARPS_LARGE_NTrh   ri   NUM_K_ITERS_NTROWS_PER_ITER_NTrS   rc   )_r   r   r   r    r$   r%   r&   r"   r'   r!   r#   r(   r)   r+   r,   r-   r1   r.   r5   rj   rk   r"  rJ   rm   rp   rq   rr   r(  r)  r*  r+  r,  rt   ru   r-  r.  r/  r0  r1  r2  r3  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r5  r6  r7  r8  rC   r   r   r9  r:  r<  r   r   r   r=  r>  r   r?  r@  rA  rB  rC  r   r   rD  rE  rF  rG  rH  rI  rJ  rK  r   r   r
   (gdn_decode_kernel_big_batch_nontranspose  sP  


























 vrR  c           "      C   s  |j j\}}}|	j jd  |}tjtjtjjdtj	dd}t
|t}tjtttftdtt fd}tjddd}td	}t|||} d
t t t d
t  d
t d  d }!t| |||||||||||
|	||||||j|t ddftddg|!|d d S )Nr   r   r   r   r   r9   )r   r   )r   r   r   r   r   r   r   )r   r   rG   r   r   r   r   r   rN   rR   r   r  rT   r  r!  TILE_V_SMALL_PADDED_NTr   rL  r   r   NUM_THREADS_NT)"r*   r$   r%   r&   r"   r'   r!   r#   r   r)   r(   r+   r,   r-   r/   r0   r1   r.   r2   r3   r4   r5   r   batch_hv_dimr   r   r   r   num_v_tiles_smallsmem_layout_smallthread_layout_smallval_layout_smalltiled_copy_load_smallsmem_bytes_smallr   r   r
   .run_gdn_decode_kernel_small_batch_nontransposez  sj   


r\  c           "      C   s   |j j\}}}|	j jd  |}tjtjtjjdtj	dd}t
|t}tjtttftdtt fd}tjddd}td	}t|||} d
t t t d
t  d
t d  d }!t| |||||||||||
|	||||||j|ddftddg|!|d d S )Nr   r   r   r   r   r9   )r   r   )r   r   r   r   r   r   r   )r   r   rG   r   r   r   r   r   rN   rR   r   rN  rT   r  r!  TILE_V_PADDED_NTr   rR  r   NUM_THREADS_LARGE_NT)"r*   r$   r%   r&   r"   r'   r!   r#   r   r)   r(   r+   r,   r-   r/   r0   r1   r.   r2   r3   r4   r5   r   rU  r   r   r   r   r    base_smem_layoutr   r   r   r   r   r   r
   ,run_gdn_decode_kernel_big_batch_nontranspose  sf   


r`  c           )      C   sh  | j \}}}}|dksJ d| |j \}}}}|j ||||fks6J d| d| d| d| d|j  
|dksAJ d	| |dksLJ d
| |t dks\J dt d| | jtjtjfv smJ d| j |jtjks{J d|j |jtjksJ d|j |du r|d }|	du}|r|	jn| j}|	du rtj||||ftj| jd}	|	 }|
|| ||}||||||| j||
f	}t| }d|vs|d j| jkrtj|tj| jd|d< tj|d tj| jd|d< |d }|d }d|vr{ttj j}|tk }|rt}nt}t|dd}t|dd}t|dd}t|dd} t| dd}!t|dd}"t|dd}#t|dd}$t|	dd}%t|dd}&t|dd}'tj||'|!|"|#||$|| ||&|%fdd|||||||d|
|dd}(|(|d< n|d }(ttj j}|(|| |||||||||	| | | kr|| |	j|kr|	|}	|	|fS )a  Gated Delta Rule Decode kernel (K-major layout, no transpose needed).

    This implements the decode phase of gated delta rule linear attention,
    processing one token at a time and updating the recurrent state.
    This version uses K-major state layout [B, HV, K, V] which is more natural
    and doesn't require transposition.

    Args:
        q (torch.Tensor):
            Current query of shape ``[B, 1, H, K]``. Must be float16/bfloat16.
        k (torch.Tensor):
            Current key of shape ``[B, 1, H, K]``. Must be float16/bfloat16.
        v (torch.Tensor):
            Current value of shape ``[B, 1, HV, V]``. Must be float16/bfloat16.
        state (torch.Tensor):
            Current state of shape ``[B, HV, K, V]`` (k-major layout).
            Must be float32. Will be updated in-place.
        A_log (torch.Tensor):
            Log decay parameter of shape ``[HV]``. Must be float32.
        a (torch.Tensor):
            Input-dependent decay of shape ``[B, 1, HV]``. Must be float16/bfloat16.
        dt_bias (torch.Tensor):
            Decay bias of shape ``[HV]``. Must be bfloat16 or float32.
        b (torch.Tensor):
            Update gate (beta) input of shape ``[B, 1, HV]``. Must be float16/bfloat16.
        scale (Optional[float]):
            Scale factor for queries. If None, defaults to ``1 / sqrt(K)``.
        output (Optional[torch.Tensor]):
            Pre-allocated output tensor of shape ``[B, 1, HV, V]``.
            If None, will be allocated automatically.
        use_qk_l2norm (bool):
            Whether to apply L2 normalization to q and k. Default: ``True``.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            - output: Output tensor of shape ``[B, 1, HV, V]``
            - state: Updated state tensor of shape ``[B, HV, K, V]``

    Note:
        - Requires SM90 (Hopper) architecture
        - State is updated in-place
        - K and V must be multiples of 4 for vectorized loads
        - State layout is k-major: [B, HV, K, V] (no transpose needed)
    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Nr   r   r)   r*   r   r   r   r>   r   Tr   )r+   r,   r-   r/   r0   r1   r.   r2   r3   r4   r5   r   r   )r   rN  r   r   r   r   r   r   r   
contiguousviewr   aranger   r   r   r   r   SMALL_BATCH_THRESHOLD_NTr\  r`  r   rG   r   data_ptrr   r   ))r$   r%   r&   r   r!   r"   r#   r'   r-   r   r5   r/   r0   r1   r2   rk   r.   r3   r  r	  state_contiguousr   r  r  r)   r*   r   use_small_batchr  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r
   gated_delta_rule_decode  s   ;"




rh  intermediate_statestile_vdisable_state_updatecache_intermediate_statesc           ]   
   C   sT  t j \}}}|d }t j } t j| } || }!d|! }"d|" }#||! }$||! }%| |" |% }&t j \}'}}|'| }(|'| })|)| }*|)| }+|*||  },||+ }-t||* }.t||* }/tj	 }0|0
tjt j||f|d dfdd}1|0
tjt j||f|d dfdd}2|0
tjt |fd}3|0
tjt |fd}4t t j|fddtj}5t t j|fddtj}6t t j|fddtj}7t t j|fddtj}8t t j|fddtj}9|-dkr|$| }:t|D ]o};t |ddd|f|+|;|,|$f}<t |	ddd|f|+|;|,|$f}=t |<|8 t |=|9 t|D ]}>t|8|> |5|>< t|9|> |6|>< q%t|rd	}?d	}@t|D ]}>|?|5|> |5|>  7 }?|@|6|> |6|>  7 }@qJd
D ]}A|?t jj|?|Addd7 }?|@t jj|@|Addd7 }@qdt j|?d dd| }Bt j|@d dd}Ct|D ]}>|5|> |B |5|>< |6|> |C |6|>< qnt|D ]}>|5|> | |5|>< q||!k rt|D ]}>|5|> |1|;|:|> f< |6|> |2|;|:|> f< qt||+|;|*f }Dt||+|;|*f }E|D|/ }F||F }Gt j|Gdd}Htd| t jtd|H dd }I|G|kr!tdntd	}J|J|I td|J |F  }Kt j|.dd |K }Ltdtdt j|E dd  }Mt j|Ldd}N|dkrc|N|3|;< |M|4|;< qt j  ||# }Ot|OD ]4}P|(| |&|O  |P }Q|Q|k r|-| |* }Rt | dd|f|R|Q|$f}St |S|7 t|D ]};t |1d|f|;|$f}Tt |2d|f|;|$f}Ut |T|5 t |U|6 |3|; }N|4|; }Mt|D ]}>|7|> |N |7|>< qd	}Vt|D ]}>|V|7|> |6|>  7 }Vqd
D ]}A|Vt jj|V|Addd7 }Vqt|
|+|;|*|Qf }W|W|V |M }Xt|D ]}>|7|>  |6|> |X 7  < qt|rQ|+| | |;|  |* }Yt |dd|f|Y|Q|$f}Zt |7|Z d	}[t|D ]}>|[|7|> |5|>  7 }[qXd
D ]}A|[t jj|[|Addd7 }[qh|$dkrt|[||+|;|*|Qf< qt| rt | dd|f|R|Q|$f}\t |7|\ qrdS dS )av  
    Parallel MTP kernel - each block handles one [TILE_V, TILE_K] tile.

    Grid: (B * HV * num_v_tiles, 1, 1)
    Each block:
    - Loads its v_tile of state into registers
    - Processes all T time steps with state in registers
    - Writes output and optionally updates state

    This matches Triton's parallelization strategy for better small-batch performance.
    r   r   r   r   r9   r   r8   r   r;   r?   r@   rA   rB   rF   Tr<   r>   N)rG   rH   rI   rJ   rK   rL   rN   rR   rO   rP   rQ   rT   rU   rS   rc   rW   rb   
const_exprrg   rh   rd   re   rV   )]r   ri  r   r    rj  r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   rk  rl  rj   rk   rl   rJ   threads_per_groupgroups_per_warp
num_groupslane_in_groupgroup_in_warp	group_idxrm   i_vtmprq   rp   rr   	cache_idxr   r   rt   r2  r1  sGsBetary   rx   rz   r{   r|   r~   rs   r   r   r   r   r   rC   inv_norm_q_scaledr   r   r   r   r   r   softplus_valuse_softplusr   r   r   r   rows_per_grouprow_in_groupr,  flat_state_idxh_tilesQ_tilesK_tiler   r>  r   r3  
inter_tiler   
h_tile_outr   r   r
   gdn_verify_kernel_mtp  sH  +









 \r  c           #      C   s   | j jd | j jd | j jd }}}t||} || |  }!d| |d  d| |d   d|  d|  d }"t| ||| |||||||||	|
|||||||||||||||j|!ddftddg|"|d d S )Nr   r   r   r   r   r   r   )r   r   rG   r   r  r   NUM_THREADS_MTP)#r   ri  r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   rj  r   r4   r5   r6   rk  rl  r   rk   r   r   r    	grid_sizer   r   r   r
   run_gdn_verify_kernel_mtp	  sj   
!



r  r  cache_stepsc                 C   r   )z2Cache compiled MTP kernel for given configuration.r   )r/   r0   r1   r.   r2   r3   r  r  rk  rl  r-   r5   rj  r   r   r   r
   _get_compiled_mtp_kernel[	  s   r  intermediate_states_bufferc           0      C   s,  | j \}}}}|j \}}}}|j d }t||}t||}|j ||||fks:J d| d| d| d| d|j  
|dksEJ d| |dksPJ d	| || dks`J d
| d| | jtjtjfv sqJ d| j |jtjksJ d|j |jtjksJ d|j |	du r|d }	|
du}|r|
jn| j}|
du rtj||||ftj| j	d}
|
tj|| ||}|du}|r|j d }|j d }||ksJ d| d| d|
tj|| | || }n|}tjdddtj| j	d}|||||||||||	|||f}t| } d| vs!| d j	| j	kr/tj|d tj| j	d| d< | d }!d| vrttj j}"t|dd}#t|dd}$t|dd}%t|dd}&t|dd}'t| dd}(t|dd})t|dd}*t|dd}+t|
dd},t|dd}-t|!dd}.tjt|#|$|%|&|'|(|)|*|+|,|-|.fi ddddd|	d|d |d!|d"|d#|d$|d%|d&|d'd(d)|d*d+d,|d-|d.|"d/d0}/|/| d< n| d }/ttj j}"|/|||||| ||||
||!|" |s| s|||||| |
j|kr|

|}
|
|fS )1am  
    Gated Delta Rule MTP Kernel (Multiple Token Processing).

    This function processes multiple tokens (T > 1) in sequence, typically used for
    speculative decoding verification. It supports intermediate state caching for
    potential rollback scenarios.

    Args:
        q (torch.Tensor):
            Query tensor of shape ``[B, T, H, K]``.
        k (torch.Tensor):
            Key tensor of shape ``[B, T, H, K]``.
        v (torch.Tensor):
            Value tensor of shape ``[B, T, HV, V]``.
        initial_state (torch.Tensor):
            Initial state tensor of shape ``[pool_size, HV, V, K]`` (K-last layout).
        initial_state_indices (torch.Tensor):
            Indices mapping each batch to its initial state, shape ``[B]``.
        A_log (torch.Tensor):
            Log decay parameter of shape ``[HV]``.
        a (torch.Tensor):
            Input-dependent decay of shape ``[B, T, HV]``.
        dt_bias (torch.Tensor):
            Decay bias of shape ``[HV]``.
        b (torch.Tensor):
            Update gate input of shape ``[B, T, HV]``.
        scale (Optional[float]):
            Scaling factor for queries. If None, uses ``1/sqrt(K)``.
        output (Optional[torch.Tensor]):
            Pre-allocated output tensor of shape ``[B, T, HV, V]``.
        intermediate_states_buffer (Optional[torch.Tensor]):
            Buffer for caching intermediate states, shape ``[pool_size, T, HV, V, K]``.
            If None, intermediate states are not cached.
        disable_state_update (bool):
            If True, the initial state is not updated. Default: ``True``.
        use_qk_l2norm (bool):
            Whether to apply L2 normalization to q and k. Default: ``True``.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            - output: Output tensor of shape ``[B, T, HV, V]``
            - initial_state: Updated state tensor (unchanged if disable_state_update=True)

    Note:
        - Requires SM90 (Hopper) architecture
        - Supports T > 1 (multiple token processing)
        - State layout is K-last: [pool_size, HV, V, K]
        - Optimized for speculative decoding verification scenarios
    r   r   r   r   r   r   r   r   r   r   r   r   z#initial_state must be float32, got r   Nr   r   r   z9intermediate_states_buffer second dimension (cache_steps=z) must be at least T=z" to prevent out-of-bounds indexingr*   r   r   r   r+   r>   r,   r   r-   r.   r/   r0   r1   r2   r3   rj  r   r4   Tr5   r6   Frk  rl  r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   ra  r  r   r   r   r   r   r   rG   r   r  r  r   )0r$   r%   r&   r   r   r!   r"   r#   r'   r-   r   r  rk  r5   r/   r0   r1   r2   rk   r.   r3   r  rj  r   r  r	  r   rl  buffer_sizer  ri  r  r  r*   r   r  intermediate_states_tensorr  r  r  r  r  r  r  r  r  r  r   r   r   r
   gated_delta_rule_mtpp	  s&  C


"







!
r  r8   )NNTNN)NNT)NNNTT)L__doc__	functoolstypingr   r   r   rN   cutlass.cuterG   cutlass.cute.nvgpur   cutlass.cute.runtimer   cuda.bindings.driverbindingsdriverr   api_loggingr   _FLASHINFER_AVAILABLEImportError!gdn_kernels.gdn_decode_bf16_stater   r   r   rX   rY   r\   r   rM   r  rN  r]  r  rS  r!  rT  r   r^  rO  rM  rQ  rP  rd  
TILE_K_MTPr  intr   r   kernel	TiledCopyTensorLayout	Constexprfloatboolr   r   jitr   r   r   r  r   r   r   r  rL  rR  r\  r`  rh  r  r  r  r  r   r   r   r
   <module>   s   			
  	
  	
f	
j		
	
  	
 r	
 ^	
P	
S
	
 L	
  	
Y	
	
