
    )j                        d dl mZmZ d dlmZ d dlmZ ej        d             Z	d Z
 e
            Zdej        dej        dej        dej        d	ej        d
ej        dej        dej        deeef         fdZddZ	 	 	 	 	 ddej        dej        dej        dej        d	ej        d
ej        dej        deej                 deeef         deej                 deej                 dedeej        ej        f         fdZ	 	 	 	 ddej        dej        dej        dej        d	ej        d
ej        dej        deej                 deeef         deej                 deej                 fdZdS )    )OptionalTupleNc                 t    t          j        | |z             } t          j        | |d         |d                   S )Nr      )nnsoftplusmxclip)dtdt_biastime_step_limits      [/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/models/ssm.py
compute_dtr      s3    	R'\	"	"B72q)?1+=>>>    c                      t           j                                        sd S d} t           j                            dg dddg|           S )Na  
        auto n = thread_position_in_grid.z;
        auto h_idx = n % H;
        auto g_idx = n / G;
        constexpr int n_per_t = Ds / 32;

        auto x = X + n * Dh;
        out += n * Dh;
        auto i_state = state_in + n * Dh * Ds;
        auto o_state = state_out + n * Dh * Ds;

        // C and B have shape [batch, group, state_dim]
        // C and B need to be offset by group size
        auto C_ = C + g_idx * Ds;
        auto B_ = B + g_idx * Ds;

        auto ds_idx = thread_position_in_threadgroup.x;
        auto d_idx = thread_position_in_grid.y;

        auto dt_ = static_cast<float>(dt[n]);
        auto A = -fast::exp(static_cast<float>(A_log[h_idx]));
        auto dA = fast::exp(A * dt_);

        float acc = 0.0;
        auto x_ = static_cast<float>(x[d_idx]);

        for (int i = 0; i < n_per_t; ++i) {
            auto s_idx = n_per_t * ds_idx + i;
            auto idx = d_idx * Ds + s_idx;
            auto dB_by_x = x_ * dt_ * static_cast<float>(B_[s_idx]);
            auto state = dA * i_state[idx] + dB_by_x;
            o_state[idx] = static_cast<T>(state);
            acc += state * C_[s_idx];
        }
        acc = simd_sum(acc);
        if (thread_index_in_simdgroup == 0) {
            out[d_idx] = static_cast<T>(acc + x_ * D[h_idx]);
        }
    
ssm_kernel)XA_logBCDr   state_inout	state_out)nameinput_namesoutput_namessource)r	   metalis_availablefastmetal_kernel)r   s    r   make_ssm_kernelr#      s]    8  "" t&FN 7CCC[)	     r   hidden_statesr   r   r   r   r   r   stater   c	           
          | j         \  }	}
}}| j        }|j         dd          \  }}t          |||          }t          | ||||||gd|fd|fd|fd|fd||z  fgd|||	z  fd|	d	||f|j         g||g
          S )NTDhDsHG    )r-      r   r   )inputstemplategridthreadgroupoutput_shapesoutput_dtypes)shapedtyper   _ssm_kernel)r$   r   r   r   r   r   r   r%   r   n_hd
input_typehbdss                   r   ssm_update_kernelr?   B   s     $JAq!Q$JWRSS\FB	B	1	1BuaAr59
#dAYr
S!HsAQSGnU!QU^1a|U[1!:.   r   c                 \   | j         d         }|t          j        |d          }| |z  } t          j        | d         |d          } t          j        | d          } t          j        | d          }|8t          j        |dd d d f         |d         z  |t          d                     }|S )Nr   .Naxisr'   .inf)r5   r	   expand_dimsrepeattrilcumsumwherefloat)xmasklx_segsums       r   segsumrP   [   s    	A~dA&&H
	!I,+++A
2Ay$$$H8dAAAi0(U5\\M
 
 Or   gMbP?g      Y@   rL   rM   lengthsstepreturnc                   
 | j         \  }|j         \  }}t          |||          }z  t          j        |                              |j                   }||                    ddd          z  }|                    |d          | z  }
fd}g }t          d|          D ]~} ||dd||z   f         |dd||z   f         |dd||z   f         |dd||z   f         ||	dn|	d||z   f                   \  }}

z
  
|                    |           t          j	        |d          | |                    ddd          z  z   }||fS )a5  SSD-SSM forward pass.

    Args:
        x: Input of shape (batch_size, seq_len, num_heads, head_dim).
        dt: Time deltas of shape (seq_len, num_heads,).
        A_log: State transition of shape (num_heads,).
        B: Input mixing of shape (batch_size, seq_len, num_groups, n).
        C: Output mixing of shape (batch_size, seq_len, num_groups, n).
        D: Residual connection.
        dt_bias: Bias for time deltas of shape (num_heads,).
        time_step_limit: Minimum and maximum value for time deltas.
        mask: Optional multiplicative mask.
        lengths: Optional lenghts of sequences, assumed to be the full length if unspecified.
        step: Step size for processing x.

    Code modified from
    https://github.com/cartesia-ai/edge/blob/main/cartesia-mlx/cartesia_mlx/layers/ssd/ops.py

    r   rA   c                 :   | j         d         }t          j        |d          }t          j        |dd          |z  }t          j        |d          }t          j        t          |                    dd          |                    }t          j        ||z  d          }	|	|                     dd          z  }
t          j        |
dd          }
Xt          j        t          j	                  dz
  d          }t          j
        |d          }t          j        ||d          }n|d d d d dd d d f         }|                    dd	dd          }t          j        |z  d                              dd	          }| |z  }|                    dd                              dd	          }||z  }|t          j        t          j        |d
                    }||d d dd d d d f         |z  z  }|                    |dd          }|                    df          |z                      d                              dd	          }|
|d         |z  z  }
/|-t          j        t          j
        dk     d          ||          }|
|fS )Nr   )r         r   rX   rC   )rM   r   )r   rX   rY   rA   rY   r'   rB   )r5   r	   	transposeswapaxesrG   exprP   rH   maximumminimumrF   take_along_axisrI   reshapesqueezeflattenrJ   )dtxdtAr   r   r%   rM   sCBdecaysurrogate_attention_matrixyposdtxdecay
next_stateexp_dtA_cumsumy_prevbr;   dhgr:   rS   repeatsrT   s                   r   _stepzssm_attn.<locals>._step   s   IaLLL))[Aq!!A%Yr7+++vcll1a00t<<<==%'WR%Z%;%;"&a););;K1a  *RZ66:A>>C.i00C&uc:::EE!!!QQQQQQ,'E1a++Iaaa(((11!Q77;$$Q**33Aq99\
VBIc$;$;$;<<N.B4)=>FFJ		!Q1a++A1a"a899A=FFrJJRRSTVWXX  	*V33A5#4w{I66z J *}r   r   N.rC   )
r5   r   r	   r\   astyper6   r`   rangeappendconcatenate)rL   r   r   r   r   r   r   r%   r   rM   rS   rT   rN   r9   Ard   rc   rs   ysiri   ro   r;   rp   rq   r:   rr   s             ``         @@@@@@r   ssm_attnr{   j   s   B 'KAq!RJAq!Q	B	1	1B1fG			bh	'	''A
qyyAr""
"C
**Q1a
 
 1
$C) ) ) ) ) ) ) ) ) ) ) )V 
B1a  51q4x< 1q4x< aaaQXoaaaQXoLDDd3AH+<&=
 
5 nG
		!
r"""Q1aA)>)>%>>Ae8Or   c                    | j         d         }|dk    sA|?t          j                    t          j        k    st          j                                        st          | |||||||||	|
          S t          | ||||||||	  	        S )Nr   )rM   rS   )r5   r	   default_devicegpur   r    r{   r?   )r$   r   r   r   r   r   r   r%   r   rM   rS   seq_lens               r   
ssm_updater      s     !!$G!="&((x$$&& ) 
 
 
 	
 !

 

 
	
r   )N)NrQ   NNrR   )NrQ   NN)typingr   r   mlx.corecorer	   mlx.nnr   compiler   r#   r7   arrayrK   r?   rP   intr{   r    r   r   <module>r      s   " " " " " " " "             ? ? ?
/ / /d o88 
x 
x	
 
x 	 X 8 5%<(   2   . !%+9#"&c c	xc8c 
xc 
x	c
 
xc 	c Xc BHc 5%<(c 28
c bhc c 28RXc c c c\ !%+9#"&,
 ,
8,
8,
 
x,
 
x	,

 
x,
 	,
 X,
 BH,
 5%<(,
 28
,
 bh,
 ,
 ,
 ,
 ,
 ,
r   