o
    3/i3                     @   s&  d dl mZmZmZmZmZ d dlmZ d dlZd dl	m
Z
 d dlmZmZmZmZ d dlmZmZmZ d dlmZ d dlmZ d dlZd dlmZ d d	lmZ d d
lmZ dZedddddde
jde
j de
j dee
j  de!ddfddZ"edddde
jde
j de
j dededdfddZ#edddde
j de
j fddZ$edddde
jde
j de
j e
j%B de
j fddZ&edddd e
j'd!e
j%de
j de
j fd"d#Z(e	ddddd$eej) d%e*d&e!de
j+fd'd(Z,eddddd)de
j de
j dee
j  d&e!ddf
d*d+Z-	dd$eej) d-e*d%e*d&e!de
jf
d.d/Z.	,	dd$eej) d0e*d-e*d%e*d&e!de
jfd1d2Z/e
j0d3e
j d4ede
j fd5d6Z1d7Z2d8Z3d9e2 Z4e	 	ddddd:e
j d;e*d<e!de
j fd=d>Z5e	 	ddddd:e
j d?ed@ed;e*d<e!de
j fdAdBZ6dCedDe*dEe*dFe*def
dGdHZ7dIe
j8fdJdKZ9dLe
j de
j fdMdNZ:d e
j;j'dLe
j de
j fdOdPZ<d e
j;j'dLe
j de
j fdQdRZ=eddddSej>j?dTeej) de
j+fdUdVZ@		ddWejAe* dXee
j) dYe!dZee* de
j+f
d[d\ZB		ddWejAe* dXee
j) dYe!dZee* de
j+f
d]d^ZC			dd_e
jDd`e
j dedWe*dYe!dZee* deee
je
j f fdadbZE		dd_e
jDd`e
j dedWe*dYe!deee
je
j f fdcddZF	dd_e
jDdee
j%dYe!de
jfdfdgZG		dd_e
jDdee
j%d`ee
j  dedWe*dYe!deee
je
j e
j f fdhdiZH	dd_e
jDdje
j dedWe*deee
je
j f f
dkdlZI		dd_e
jDdje
j dedWe*dme!deee
je
j f fdndoZJeddddpe
j8dqe
j8dre*eB fdsdtZKeddddue
j+de
j8fdvdwZLed,ddddxdye
j8dze
j8d{e
j8d|ed}ee d~e*ddfddZM	dde
j de
j de!defddZNe		ddddde
j+de
jOde
jPde
j de
j de!de!defddZQd+edejRjSfddZTe
j0de
j'de
j dje
j de
j dededefddZUe
j0de
j'de
j dje
j de
j dededefddZVe
j0	,ddue
j+de
j dje
j de
j dede*d~e*defddZWdS )    )OptionalTypeTupleCallableSequence)partialN)Int32Int16Boolean
const_expr)cpasyncwarp	warpgroup)CtaGroup)dsl_user_op)llvm)ir)
cute_nvgpul   } F)predretilelocip
tiled_copysrcdstr   r   returnc          	      K   s   t |jtjr|jtjjksJ t|j|jkr,t	||j}|
| |j |}t|r5| |}tj| ||f|||d| d S )Nr   r   r   )
isinstanceiteratorcutePointermemspaceAddressSpacermemr   element_typemake_rmem_tensor_likestoreloadtor   copy)	r   r   r   r   r   r   r   kwargssrc_cvt r,   ]/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/quack/copy_utils.pycvt_copy   s    
"r.   r   r   seedtidxc                C   s   t |jtjr|jtjjksJ ddlm} ddl	m
} t||j}	| }
||
||||d}|	|||
j|j |	}tj| ||||d dS )zGLike cvt_copy but uses stochastic rounding for FP32 -> BF16 conversion.r   )convert_f32_to_bf16_sr)	TensorSSAr/   N)r   r   r   r    r!   r"   r#   quack.roundingr2   cutlass.cute.tensorr3   r%   r$   r'   r&   shaper)   )r   r   r   r0   r1   r   r   r2   r3   r+   src_vecraw_vecr,   r,   r-   sr_cvt_copy,   s    r9   c                C   s*   t j| | j||d}t j| |||d |S Nr/   )r   r%   r$   autovec_copy)r   r   r   r   r,   r,   r-   load_s2rD   s   r<   	dst_shapec                C   sJ   t t|tj rtj||j||d}n|}tj| || |||d |S r:   )r   r   r   Tensormake_rmem_tensorr$   r)   r   )r   r   r=   r   r   r   r,   r,   r-   load_s2r_retileK   s
   
r@   thr_copyr6   c                C   s>   t |}t j| |j|j||d}t j| ||||d |S r:   )r   make_identity_tensorr?   partition_Dr6   r$   r)   )rA   r6   r   r   r   cDstr   r,   r,   r-   load_t2r]   s   
rE   dtypenum_copy_elemsis_asyncc                C   s:   t td|| j }|rt ntj }tj|| |dS )N   num_bits_per_copy)	r   minwidthr   	CopyG2SOpr   nvgpuCopyUniversalOpmake_copy_atom)rF   rG   rH   r   r   num_copy_bitscopy_opr,   r,   r-   get_copy_atomg   s   rT   )r   rH   r   r   c          	      K   s>   | j d d }t| j||}tj|| |f|||d| d S )Nr   r   )r6   rT   r$   r   r)   )	r   r   r   rH   r   r   r*   rG   	copy_atomr,   r,   r-   r)   p   s   "r)      num_threadsc           	      C   sR   || j  }|rt ntj }tj|| |d}t|}t|}t|||S )NrJ   )	rM   r   rN   r   rO   rP   rQ   make_layoutmake_tiled_copy_tv)	rF   rW   rG   rH   rR   rS   rU   
thr_layout
val_layoutr,   r,   r-   tiled_copy_1d   s   


r\   threads_per_rowc           
      C   sr   || j  }|rt ntj }tj|| |d}|| dks J tj|| |fdd}td|f}	t	|||	S )NrJ   r   )rV   r   )orderrV   )
rM   r   rN   r   rO   rP   rQ   make_ordered_layoutrX   rY   )
rF   r]   rW   rG   rH   rR   rS   rU   rZ   r[   r,   r,   r-   tiled_copy_2d   s   

r`   tAcAlimitc              	   C   s   t t jt j| ddgdt j| dgdt j| dgdft j| dgdddfdt}t|jd D ]!}t|jd D ]}t | d|fd|f d |||d|f< q<q2|S )Nr   rV   mode   stride)	r   r?   rX   sizer
   cutlassrange_constexprr6   	elem_less)ra   rb   tApArest_vrest_kr,   r,   r-   predicate_k   s   .*ro   i   @il            T
ragged_dim	ptr_shiftc                C   sB  t | }|dk r||7 }|rX|dksJ d| jd | tf | j|d d   tf }| j| j| f }d| t f d|| d   }t || j}	t |	t j	||dS |dks`J d| j| }
| jd | tf | j|d d   ttf }| jd | |
f | j|d d   t
|
 |
f }t | jt j	||dS )	Nr      z8ptr_shift ragged tensor only supports up to 4 dimensionsrV   Nrf      z<non-ptr_shift ragged tensor only supports up to 3 dimensions)r   rankr6   BIG_INTMAX_INTrg   domain_offsetr   make_tensorrX   BIG_INT_INV)rp   rq   rr   r   r   rv   	new_shape
new_stride
ptr_offsetnew_ptrstride_rr,   r,   r-   create_ragged_tensor_for_tma   s.   
	, 
,
r   offsetlengthc                C   s   t | }|dk r||7 }t j| |gd}|| }	|r>||d ks#J d| |	f d|| d   }
d|d  || f }n#||d ksFJ d| |	f d|| d   }
d|d  ||| f }t |
| | S )Nr   rc   re   rt   rV   ru   )r   rv   rh   ry   )rp   r   r   rq   rr   r   r   rv   big_int
offset_valoffset_tupleindex_tupler,   r,   r-   offset_ragged_tensor
  s   
r   ptr_intbmsc                 C   s(   d|> d }||| > }| | |@ |? A S )NrV   r,   )r   r   r   r   bit_mskyyy_mskr,   r,   r-   swizzle_int'  s   r   ptrc                 C   s8   | j j}t|  |j|j|j}tj| j	|| j
| jdS )N)assumed_align)typeswizzle_typer   tointnum_bitsnum_base	num_shiftr   make_ptrrF   r!   	alignment)r   swzr   r,   r,   r-   swizzle_ptr-  s   r   tensorc                 C   sh   | j }| jj}| jjj}t|j|j	|j
}t|dt|dtd||}ttj| j| jd|S )N   r   )rF   )layoutr$   rM   r   r   r   r   make_swizzler   r   r   recast_layoutmake_composed_layoutrz   
recast_ptr)r   outerrM   r   inner
new_layoutr,   r,   r-   &as_position_independent_swizzle_tensor3  s   
r   c                 C   $   t t| |j| t|jS rt   )r   rz   r   rC   r   r   r   rA   r   r,   r,   r-    partition_D_position_independentA     r   c                 C   r   rt   )r   rz   r   partition_Sr   r   r   r   r,   r,   r-    partition_S_position_independentJ  r   r   layout_c	elem_ty_cc                C   s`   t |tjjstd| |  }|jdkr$tjt	
|d|||dS tjtj |||dS )a  
    Selects the largest vectorized smem load atom available subject to constraint of gmem layout.

    Parameters:
    -----------
    layout_c : LayoutEnum
        The layout enum of the output tensor D.

    elem_ty_c : Type[Numeric]
        The element type for output tensor D.

    Returns:
    --------
    Either SmemLoadMatrix or SimtSyncCopy, based on the input parameters.
    z%elem_ty_c must be a Numeric, but got    rs   r/   )r   ri   cutlass_dslNumericMeta	TypeErroris_m_major_crM   r   rQ   r   LdMatrix8x8x16bOprO   rP   )r   r   r   r   
is_m_majorr,   r,   r-   sm90_get_smem_load_opS  s   
r   archr$   	transposemajor_mode_sizec                 C   |   t | dk p	|jdkrtjtj ||sdnd|j dS |d u s'|d dkr)dn	|d dkr1dnd}ttj||d	|S 
NZ   r   re   rV   rJ   r   rs   r   )r   num_matrices)r   rM   r   rQ   rO   rP   r   StMatrix8x8x16bOpr   r$   r   r   r   r,   r,   r-   get_smem_store_atomt     r   c                 C   r   r   )r   rM   r   rQ   rO   rP   r   r   r   r,   r,   r-   get_smem_load_atom  r   r   	tiled_mmasCc                    sv   |j }t||||d}t|| |}	t| r!|	| nt|	| ddtjdt	t
 f fdd}
|
|	 fS )N)r   r   dst_idxc                    s<   t |d u r n d d d |f }t| |fddi| d S Nr   T)r   r.   )r   r   
new_kwargs
dst_tensortRS_sCr   r,   r-   copy_fn  s    z!get_smem_store_C.<locals>.copy_fnrt   )r$   r   r   make_tiled_copy_C	get_slicer   rC   r   r>   r   r   )r   r   r1   r   r   position_independentr   rF   rU   rA   r   r,   r   r-   get_smem_store_C  s   	


 
r   c                    s   |j }t|||}t|| |}t| r||nt||t|||}	t|	| |}
|
t	|j
d d j
 ddtt f fdd}||fS )Nre   src_idxc                    s6   t | d u rnd d d | f }t|fd i|S Nr=   )r   r@   )r   r   
src_tensor	tRS_shapetSR_sCr   r,   r-   r     s    z get_smem_load_C.<locals>.copy_fnrt   )r$   r   r   r   r   r   r   r   r   rB   r6   r   r   )r   r   r1   r   r   r   rF   rU   rA   copy_atom_RSthr_copy_RSr   r,   r   r-   get_smem_load_C  s   



r   epi_tilec                 C   s<   t tj||d d dkrdnddtj}t || }|S )NrV   r   r   rs   re   )r   )r   rQ   r   r   ri   Float16make_tiled_copy_C_atom)r   r   r   copy_atom_Ctiled_copy_C_atomr,   r,   r-   epilog_smem_copy_atom  s    r   c                    s   t |d ur	|jntj}t| |}t|||}	t|	||}
d  t |d ur:t | r5|
	| nt
|
| |d urE|jd d n|}|
t|j}t|| jj}dtjdtf fdd}t |d urm|nd |
 |fS )Nre   r   r   c                    s$   t |  d d d |f fi | d S rt   r.   r   r   r   r   r,   r-   r     s   $z#get_smem_store_epi.<locals>.copy_fn)r   r$   ri   r   r   r   r   make_tiled_copy_Sr   rC   r   r6   r   rB   r?   op	acc_dtyper>   r   )r   r   r   r1   r   r   r   rF   r   rU   rA   sC_shapetRS_rC_shapetRS_rCr   r,   r   r-   get_smem_store_epi  s   	



r   sAc           
         s|   |j }| jjtjjk}t|||}t|| 	|}t
| r'|| nt|| dtjdtf fdd}	|	| fS )Nr   r   c                    s(   t |  d d d |f fddi| d S r   r   r   tRS_sAr   r,   r-   r     s   (z!get_smem_store_A.<locals>.copy_fn)r$   r   a_major_moder   OperandMajorModeMNr   r   make_tiled_copy_Ar   r   rC   r   r>   r   )
r   r   r1   r   r   rF   r   rU   rA   r   r,   r   r-   get_smem_store_A  s   



r   with_dst_tensorc                    s   |j }| jjtjjk}t|||}t|| 	|}	t
| r'|	|nt|	|| |jd d  dtf fdd}
dtdtjffdd}|sU|
|	fS ||	fS )Nre   r   c                    s"   t d d d | f fd i|S r   r@   )r   r   r   tSR_sAr   r,   r-   r   (  s   z get_smem_load_A.<locals>.copy_fnr   c                    s    t  d d d | f |fi |S rt   r   )r   r   r   )r   r   r,   r-   copy_fn_w_dst_tensor-  s    z-get_smem_load_A.<locals>.copy_fn_w_dst_tensor)r$   r   r   r   r   r   r   r   r   r   r   r   r   partition_shape_Ar6   r   r>   )r   r   r1   r   r   r   rF   r   rU   rA   r   r   r,   r   r-   get_smem_load_A  s   


r   smem_ptrgmem_ptrstore_bytesc             	   C   sB   | j ||d }tjd |j|t| gddddtjjd d S )Nr/   zJcp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [$0], [$1], $2;zl,r,rTF)has_side_effectsis_align_stackasm_dialect)r   ir_valuer   
inline_asmllvm_ptrr   
AsmDialectAD_ATT)r   r   r   r   r   smem_ptr_i32r,   r,   r-   cpasync_reduce_bulk_add_f323  s   	
r  tma_atomc                C   s2   t j| jj||d}tjd}t j||||dS )aw  
    Get the address of the TMA descriptor embedded in a TMA Copy Atom.

    Extracts the constant memory address of the TMA descriptor for use with
    custom PTX instructions.

    :param tma_atom: TMA Copy Atom from make_tiled_tma_atom
    :return: Pointer to TMA descriptor in constant memory

    Example:
        >>> desc_ptr = get_tma_descriptor_address(tma_atom)
    r/   z@!cute.ptr<!cute_nvgpu.tma_descriptor_tiled, generic, align<128>>)_cute_nvgpu_iratom_make_exec_tma_traitvaluer   r   parseget_tma_desc_addr)r  r   r   	exec_atomtma_desc_ptr_typer,   r,   r-   r
  L  s
   r
  )num_ctamulticast_maskr   r   tma_desc_ptrdst_smem_ptrmbarrier_ptrcol_idxrow_indicesr  c                C   s   t |dkrtdt | t| }	dd |D }
| j||d }|j||d }|j||d}|dkr=|t@ }| }d}|durMt| }|du sUJ dd	| d
}tjd|||	|
d |
d |
d |
d |g|dddtj	j
||d	 dS )a!  
    Perform TMA gather4 load from global memory to shared memory.

    Issues PTX instruction:
    cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes
        [dstMem], [tensorMap, {col_idx, row0, row1, row2, row3}], [smem_bar];

    This loads 4 rows (specified by row_indices) from a 2D tensor at the given
    column index into shared memory, using the TMA descriptor.

    :param tma_desc_ptr: Pointer to TMA descriptor in constant memory (128-byte aligned)
    :type tma_desc_ptr:  Pointer
    :param dst_smem_ptr: Destination address in shared memory
    :type dst_smem_ptr:  Pointer
    :param mbarrier_ptr: Pointer to mbarrier in shared memory for completion tracking
    :type mbarrier_ptr:  Pointer
    :param col_idx:      Column index
    :type col_idx:       Int32
    :param row_indices:  Sequence of exactly 4 row indices
    :type row_indices:   Sequence[Int32]
    :param num_cta:      Number of CTAs participating (default: 1)
    :type num_cta:       int
    :param multicast_mask: Optional multicast mask
    :type multicast_mask: Int16

    Requirements:
        - row_indices must contain exactly 4 elements
        - Compute capability >= SM_100 (Blackwell)
        - TMA descriptor must be properly initialized for 2D tensor

    Example:
        >>> from cutlass.cute.nvgpu import cpasync
        >>> from cutlass.cute import core
        >>>
        >>> # Create TMA descriptor
        >>> tma_atom, tma_tensor = cpasync.make_tiled_tma_atom(...)
        >>> tma_desc_ptr = get_tma_descriptor_address(tma_atom)
        >>>
        >>> # Compute indices (typically from kernel logic)
        >>> col_idx = core.get(...) or 5  # Int32 value
        >>> row_indices = [core.get(...) for _ in range(4)]  # 4 Int32 values
        >>>
        >>> # Gather 4 rows at computed column
        >>> tma_gather4_load(
        ...     tma_desc_ptr=tma_desc_ptr,
        ...     dst_smem_ptr=smem_ptr,
        ...     mbarrier_ptr=barrier_ptr,
        ...     col_idx=col_idx,
        ...     row_indices=row_indices
        ... )
    rs   z,gather4 requires exactly 4 row indices, got c                 S   s   g | ]}t | qS r,   )r   r   ).0row_idxr,   r,   r-   
<listcomp>      z$tma_gather4_load.<locals>.<listcomp>r/   rV   Nzmulticast is not supported yetzacp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::z( [$0], [$1, {$2, $3, $4, $5, $6}], [$7];r   re   ru   zr,l,r,r,r,r,r,rTF)r   r   r   r   r   )len
ValueErrorr   r   r   Sm100MmaPeerBitMaskr	   r   r   r   r  )r  r  r  r  r  r  r  r   r   col_valrow_vals	desc_addrdst_addr	mbar_addrmulticast_mask_valptxr,   r,   r-   tma_gather4_loada  sF   @


r"  r   r   single_stagec                    s   t t| |s	dnd }t t||sdnd }t| d|t|d| dtjf fdd}dtjf fdd}t | rG|S |S )NrV   r   tma_bar_ptrc                    sp   t t j}t j   t j|d | f  d |f fd|i| W d    d S 1 s1w   Y  d S Nmbar_ptrr   rQ   r   CopyBulkG2SOpr$   r   	elect_oner)   )r   r   r$  r   atomr   r*   r   r,   r-   	copy_bulk  s   

"z+cpasync_bulk_get_copy_fn.<locals>.copy_bulkc                    s`   t t j}t j  t j| fd| i| W d    d S 1 s)w   Y  d S r%  r'  )r$  r   r*  r+  r,   r-   copy_bulk_single_stage  s    "z8cpasync_bulk_get_copy_fn.<locals>.copy_bulk_single_stage)r   r   rv   group_modesr    )r   r   r#  r*   group_rank_srcgroup_rank_dstr,  r-  r,   r+  r-   cpasync_bulk_get_copy_fn  s   r1  r*  	cta_coord
cta_layoutfilter_zerosc             
      s*  t t|jtjo|jtjjk}
|
r||fn||f\}}t t||s%dnd }t t||s2dnd }t	j
 ||t|d|t|d|||d\}}t |r\t|}t|}|
rb||fn||f\td d d fdd
}td d d fdd
}t | r|||fS |||fS )NrV   r   r/   c                   s8   t j d | f d |f fi |||d d S r:   r   r)   )r   r   r   r   r   r*  r   r*   r   r,   r-   copy_tma  s   z!tma_get_copy_fn.<locals>.copy_tmac                    s(   t j fi || |d d S r:   r5  )r   r   r   r6  r,   r-   copy_tma_single_stage  s   (z.tma_get_copy_fn.<locals>.copy_tma_single_stage)r   r   r   r   r    r!   r"   smemrv   r   tma_partitionr.  r4  r   )r*  r2  r3  r   r   r4  r#  r   r   r*   src_is_smemsmem_tensorgmem_tensorgroup_rank_smemgroup_rank_gmemr   gr7  r8  r,   r6  r-   tma_get_copy_fn  s2   
	

rA  pipelinec                    s   dt jjf fdd}|S )Nproducer_statec                    s"    d| |j |d| d S )N)r   r   r$  r,   )indexproducer_get_barrier)r   rC  r   r)   rB  r,   r-   r   #  s   
z%tma_producer_copy_fn.<locals>.copy_fn)ri   rB  PipelineState)r)   rB  r   r,   rF  r-   tma_producer_copy_fn"  s   rH  
thr_copy_AmAgsAIdxlimit_mlimit_kc                    s  t j|dgdt j|dgdf
|		jd dksJ t t 	ddd	d 
jd j dkt rAt|d }t 	jd d t 	}

|
d
||d d  }d d  tt jjdgd}tt jjdgd t |ttj|ddD ]}d|df d |k |< qt |ttj|ddD ]}d|df d }	| r||	 |< qd|< qt |d d fdd	tf 	
fd
d}
|
S )Nr   rc   rV   re   )NNr   NTunroll_fullFr   c           
         s
  d }t |r+t t}| d   }tj ddD ]}dd|f d |k ||< qd d | ff }tjd D ]G}ttj	|| d f dddfd }t sZ| rtj
jdgddksgJ d	 d  }	tj
|d |	f 	d |f|f |d
 q;d S )NrV   TrN  r   re   
up_to_rankNNr   rc   )r   r   r   r   )r   r   r?   r
   ri   rangerj   r6   tiled_divideappend_onesrh   r)   )
r   r   r   tApA_klimit_k_curkmA_curr   mA_rowkicols_per_threadelems_per_loadis_even_m_smemrM  mA_km_idxt0AcAra   tApA_mtAsArI  tile_shape_mkr,   r-   r   Y  s&   &z%gather_m_get_copy_fn.<locals>.copy_fnF)r   rh   rC   r6   r.  slice_tiler_mnr   rL   rB   r   r   r?   r
   ri   rT  r   logical_dividebool)rI  rJ  r   rK  rL  rM  cArows_per_threadr   r  r   r,   r]  r-   gather_m_get_copy_fn.  s6    	




*rn  c                    sJ  d\t |jtjjkr|n|jtjjksJ |tj|dgdtj|dgdf
	|tdd
d 	j	d j
 dk}t | rOt|
d }tj
d d }t
}	|	d||d d  }d d  t tjj
dgd}	t tjj
dgd t|	ttj|	ddD ]}
d|
df d |k |
< qt 	j	d j
| }	j}tt||f||fd || d fd f dd
tdttjtjf f 
fdd}		dd
tdttjtjf f 
fdd}		ddttjtjf d
tf	fdd}|t d ur"|fS |fS )N)NNr   rc   rV   ru   re   TrN  Fr   r   c                    s   d }t |r+t t}| d   }tj ddD ]}dd|f d |k ||< qd | f }t t}t D ]&}dd|f d }t | rS|| ||< q<|| r^|| ||< q<d||< q<||fS )NrV   TrN  r   )r   r   r?   r
   ri   rT  r   )r   r   rW  rX  rY  	gAIdx_curk_idxr  )r^  gAIdxrM  rc  ra   rf  r,   r-   prefetch_from_gmem_fn  s    

z3gather_k_get_copy_fn.<locals>.prefetch_from_gmem_fnc                    s   d }t |r+t t}|d   }tj ddD ]}dd|f d |k ||< q| | d |f }t t}	t D ]}dd|f d }
||
 |	|< qAtj	  tj
  | | W d    |	|fS 1 spw   Y  |	|fS )NrV   TrN  r   )r   r   r?   r
   ri   rT  consumer_waitr   r   	sync_warpr)  consumer_release)a_prefetch_pipeliner   r   a_prefetch_consumer_stater   rW  rX  rY  	sAIdx_currq  r  )r^  rM  sAIdxrc  ra   rf  r,   r-   prefetch_from_smem_fn  s&   


z3gather_k_get_copy_fn.<locals>.prefetch_from_smem_fnk_idx_tApA_kc           	   	      s   |\}}d }t |rtj|dd}t jd D ]4}t jd D ])}| rLtjd ||| f d ||f|f t |d u rDd n|d |f d q#qd S )Nre   rP  rV   rS  )r   r   prepend_onesri   rj   r6   r)   )	r   r   r|  r   rq  rW  tApA_k_predrY  r   )ra   tAmArd  re  rI  r,   r-   r     s    z%gather_k_get_copy_fn.<locals>.copy_fnrg  )r   r!   r   r"   gmemr9  rh   rC   r.  ri  r6   rL   rB   r   r   r?   r
   ri   rT  thr_idxrj  flat_dividerk  r   r>   )rI  rJ  r   rK  rL  rM  r`  r_  rl  rm  r   threads_per_colr1   rs  r{  r   r,   )r^  rr  rM  rz  rc  ra   r  rd  re  rI  rf  r-   gather_k_get_copy_fnr  s^   	 



.r  rz  warp_idx	num_warpsc                    s   t j|dgd}t |d | |d dksJ |}t t jt j tddt |t d}	|	|}
|
	|}|
	|t
| t| }tt||ddt jf fd	d
}|S )Nr   rc   rR  rs   rI   rJ   )r  r$  c              	      s   |  }t jtjdgdddD ]3  fddtdD }d  d |f j}tj  |||| W d    n1 s?w   Y  qd S )NrV   rc   TrN  c                    s   g | ]}| f qS r,   r,   )r  v)r   	tSR_rAIdxr,   r-   r    r  z=gather_m_get_tma_copy_fn.<locals>.copy_fn.<locals>.<listcomp>rs   )ri   rT  r   rh   r   r   r)  )r   r   r$  r  r  r   r  r   tile_Ktma_gather4_load_fn)r   r-   r     s   z)gather_m_get_tma_copy_fn.<locals>.copy_fn)r   rh   rY   rQ   rO   rP   r   rX   r   r   r<   r
  r   r"  r    )r  rJ  r   rz  r  r  r  tile_M	cta_groupcopy_AIdx_s2rwarp_copy_AIdx_s2r	tSR_sAIdxr  r   r,   r  r-   gather_m_get_tma_copy_fn  s"   



r  rg  )rV   F)r   F)FN)FFN)FF)rV   )Xtypingr   r   r   r   r   	functoolsr   ri   cutlass.cuter   r   r	   r
   r   cutlass.cute.nvgpur   r   r   cutlass.cute.nvgpu.tcgen05.mmar   cutlass.cutlass_dslr   cutlass.pipelinecutlass._mlir.dialectsr   cutlass._mlirr   r   r  r  	TiledCopyr>   rk  r.   r9   r<   Shaper@   ThrCopyrE   NumericintCopyAtomrT   r)   r\   r`   jitro   rw   rx   r{   r   r   r   r    r   r   corer   r   utils
LayoutEnumr   	Constexprr   r   TiledMmar   r   r   r   r   r   r  r
  r"  r1  CoordLayoutrA  rB  PipelineAsyncrH  rn  r  r  r,   r,   r,   r-   <module>   sP  
	 
		

D!	
	
	#





 

 r
 	0Co