
    )j                      d   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ ddlmZ dd	lmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ dd
l%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ dZ,dZ-dZ.dZ/dZ0d Z1dZ2dZ3dZ4dZ5dZ6dZ7d Z8d Z9 ej:         ej;                              Z<ej=        dMdej>        deeej?                          fd            Z@e G d d                      ZAd ZBddddddddd ddddejC        dej>        d eDd!eeejC        gejC        f                  d"eeeejC        ejC        gejC        f                           d#eeD         d$ee
         d%eDd&eeD         d'eDd(eDd)eeeDeDgdf                  d*eejC                 d+eeejC        ejC        f         ddf         fd,ZEd-ddddd.ddd d/	dejC        dej>        d0ej>        d1eDd eDd!eeejC        gejC        f                  d"eeeejC        ejC        gejC        f                           d$ee
         d%eDd&eeD         d'eDd(eDd+eeejC        ejC        eFf         ddf         fd2ZG	 	 dNdej>        d3eee(f         deeHejC        eeD         f         d eDd0eej>                 d+eeAddf         fd4ZI	 dOdej>        d3eee(f         deeHeeD         f         d6eFd+eHf
d7ZJdMd8ZKdMd9ZLe G d: d;                      ZMe G d< d=                      ZNe G d> d?                      ZOd@ ZPdA ZQdB ZR G dC dD          ZS	 	 	 	 	 dPdFeeeD                  dGeeee
                           d eeDeeD         f         d6eFdHeFd"eeeejC        ejC        gejC        f                           d+eNfdIZTdJ ZUeVdKk    r eWdL            eU             dS dS )Q    N)	dataclass)partial)AnyCallable	GeneratorListOptionalTupleUnion)tree_reduce)PreTrainedTokenizer   cache)ArraysCacheBatchKVCacheBatchRotatingKVCache	CacheListKVCacheQuantizedKVCacheRotatingKVCacheload_prompt_cache)make_sampler)TokenizerWrapper)#does_model_support_input_embeddingsloadhellod           g      ?z(mlx-community/Llama-3.2-3B-Instruct-4biti  c                 .    |                                  dvS )N)falsef)lower)strings    Y/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/generate.pystr2boolr&   7   s    <<>>//    c                     t          j        d          } |                     dt          dt           dd           |                     dd	d
           |                     dt          d           |                     dt          ddd           |                     ddd           |                     ddt
          d           |                     ddd           |                     ddt          t          d           |                     d t          t          d!           |                     d"t          t          d#           |                     d$t          t          d%           |                     d&t          t          d'           |                     d(t          t          d)           |                     d*t          d+d,           |                     d-t          t          d.           |                     d/t          t          d0           |                     d1d	d2           |                     d3d	d4           |                     d5d6d7           |                     d8t           d9d:           |                     d;t          d<d           |                     d=t          dd>           |                     d?d@d	dA           |                     dBt          dCd           |                     dDt          dEdF           |                     dGdHt          t"          I           |                     dJt          dKd           |                     dLt          dMdN           | S )Oz&Set up and return the argument parser.zLLM inference script)descriptionz--modelz[The path to the local model directory or Hugging Face repo. If no model is specified, then z	 is used.N)typehelpdefaultz--trust-remote-code
store_truez)Enable trusting remote code for tokenizer)actionr+   z--adapter-pathz9Optional path for the trained adapter weights and config.)r*   r+   z--extra-eos-token +z:Add tokens in the list of eos tokens that stop generation.)r*   r,   nargsr+   z--system-promptz.System prompt to be used for the chat template)r,   r+   z--promptz-pz;Message to be processed by the model ('-' reads from stdin)z--prefill-responsez1Prefill response to be used for the chat templatez--max-tokensz-mz$Maximum number of tokens to generate)r*   r,   r+   z--tempzSampling temperaturez--top-pzSampling top-pz--min-pzSampling min-pz--top-kzSampling top-kz--xtc-probabilityz5Probability of XTC sampling to happen each next tokenz--xtc-thresholdr   zDThresold the probs of each next token candidate to be sampled by XTCz--min-tokens-to-keepz*Minimum tokens to keep for min-p sampling.z--seedz	PRNG seedz--ignore-chat-templatez9Use the raw prompt without the tokenizer's chat template.z--use-default-chat-templatezUse the default chat templatez--chat-template-configzAdditional config for `apply_chat_template`. Should be a dictionary of string keys to values represented as a JSON decodable string.)r+   r,   z	--verboseTzTLog verbose output when 'True' or 'T' or only print the response when 'False' or 'F'z--max-kv-sizez$Set the maximum key-value cache sizez--prompt-cache-filez;A file containing saved KV caches to avoid recomputing themz--quantize-activationsz-qazSQuantize activations using the same quantization config as the corresponding layer.z	--kv-bitszFNumber of bits for KV cache quantization. Defaults to no quantization.z--kv-group-sizez%Group size for KV cache quantization.@   z--quantized-kv-startzLWhen --kv-bits is set, start quantizing the KV cache from this step onwards.)r+   r*   r,   z--draft-modelz,A model to be used for speculative decoding.z--num-draft-tokensz:Number of tokens to draft when using speculative decoding.   )argparseArgumentParseradd_argumentstrDEFAULT_MODELDEFAULT_PROMPTintDEFAULT_MAX_TOKENSfloatDEFAULT_TEMPDEFAULT_TOP_PDEFAULT_MIN_PDEFAULT_TOP_KDEFAULT_XTC_PROBABILITYDEFAULT_MIN_TOKENS_TO_KEEPDEFAULT_SEEDr&   DEFAULT_QUANTIZED_KV_START)parsers    r%   setup_arg_parserrF   ;   s   $1GHHHF
G.;G G G      8    
 H    
 I     =    
 J	     @    
 "3     ul9O     };K     };K     ]9I     'D	     S	     *9	     	      H    
 %,    
  I	     c	     3	     J	      b	     U	     4	     "*     ;	     I	     Mr'   modelstreamsc              #   N  K   t           j                                        s	 dV  dS # w xY wt          d | d          }t          j                    d         }|d|z  k    r |dz  }|dz  }t          d| d| d	           t          j        |          }	 dV  ||D ]}t          j        |           nt          j                     t          j        |           dS # ||D ]}t          j        |           nt          j                     t          j        |           w xY w)
a  
    A context manager to temporarily change the wired limit.

    Note, the wired limit should not be changed during an async eval.  If an
    async eval could be running pass in the streams to synchronize with prior
    to exiting the context manager.
    Nc                 N    t          |t          j                  r
| |j        z   n| S N)
isinstancemxarraynbytes)accxs     r%   <lambda>zwired_limit.<locals>.<lambda>   s!    Z28-D-DM3>># r'   r    max_recommended_working_set_sizeg?i   z0[WARNING] Generating with a model that requires z6 MB which is close to the maximum recommended size of z MB. This can be slow. See the documentation for possible work-arounds: https://github.com/ml-explore/mlx-lm/tree/main#large-models)rM   metalis_availabler   device_infoprintset_wired_limitsynchronize)rG   rH   model_bytesmax_rec_sizemodel_mb
max_rec_mb	old_limitss           r%   wired_limitr`      s      8  "" *	EEEDDDDD!MMuVW
 
 ~''(JK|+++"e+H%.JN8 N NEON N N   &|44		*EEE"  & &AN1%%%%&    y))))) "  & &AN1%%%%&    y))))s   ( *C AD$c                       e Zd ZU dZeed<   eed<   ej        ed<   e	ed<   eed<   e
ed<   eed<   e
ed	<   e
ed
<   dZee         ed<   dS )GenerationResponsea  
    The output of :func:`stream_generate`.

    Args:
        text (str): The next segment of decoded text. This can be an empty string.
        token (int): The next token.
        from_draft (bool): Whether the token was generated by the draft model.
        logprobs (mx.array): A vector of log probabilities.
        prompt_tokens (int): The number of tokens in the prompt.
        prompt_tps (float): The prompt processing tokens-per-second.
        generation_tokens (int): The number of generated tokens.
        generation_tps (float): The tokens-per-second for generation.
        peak_memory (float): The peak memory used so far in GB.
        finish_reason (str): The reason the response is being sent: "length", "stop" or `None`
    texttokenlogprobs
from_draftprompt_tokens
prompt_tpsgeneration_tokensgeneration_tpspeak_memoryNfinish_reason)__name__
__module____qualname____doc__r7   __annotations__r:   rM   rN   boolr<   rl   r	   r/   r'   r%   rb   rb   	  s            IIIJJJh#'M8C='''''r'   rb   c                     |d S t          |           D ]:\  }}t          |d          r%|j        |k    r|                    ||          | |<   ;d S )Nto_quantized)
group_sizebits)	enumeratehasattroffsetrt   )prompt_cachequantized_kv_startkv_group_sizekv_bitsecs         r%   maybe_quantize_kv_cacher   '  so    ,'' U U11n%% 	U!(6H*H*HnnGnTTLOU Ur'         r2   )
max_tokenssamplerlogits_processorsmax_kv_sizerz   prefill_step_sizer}   r|   r{   prompt_progress_callbackinput_embeddingspromptr   r   r   r   rz   r   r}   r|   r{   r   r   returnc             #   D  K   |t                    st          d          t          |           dk    rPt          |           t          |          k    r0t          dt          |           dt          |            d          n"t          |           dk    rt          d          dt          j        |          |pd	 }t          j        t          |
|	|
          pd dt          j	        dt          t          j	                 ffdddt          j	        dt          t          j	                 ffd}t          j        t                    5  |t          |          nt          |           }d} |||           ||z
  dk    r||z
  dz
  }t          ||          } | d|         d         ||d|         d         nd                       t          j        d D                        ||z  } |||           | |d         } |
||d         n|}t          j                     ||z
  dk     || |          \  }}ddd           n# 1 swxY w Y   t          j        ||           d}	 ||k    r# ||          \  }}t          j        ||           |dk    r t          j        |            |||           ||k    rdS |                                |fV  |dz  dk    rt          j                     ||}}|dz  })a  
    A generator producing token ids based on the given prompt from the model.

    Args:
        prompt (mx.array): The input prompt.
        model (nn.Module): The model to use for generation.
        max_tokens (int): The maximum number of tokens. Use``-1`` for an infinite
          generator. Default: ``256``.
        sampler (Callable[mx.array, mx.array], optional): A sampler for sampling a
          token from a vector of log probabilities. Default: ``None``.
        logits_processors (List[Callable[[mx.array, mx.array], mx.array]], optional):
          A list of functions that take tokens and logits and return the processed
          logits. Default: ``None``.
        max_kv_size (int, optional): Maximum size of the key-value cache. Old
          entries (except the first 4 tokens) will be overwritten.
        prompt_cache (List[Any], optional): A pre-computed prompt cache. Note, if
          provided, the cache will be updated in place.
        prefill_step_size (int): Step size for processing the prompt.
        kv_bits (int, optional): Number of bits to use for KV cache quantization.
          None implies no cache quantization. Default: ``None``.
        kv_group_size (int): Group size for KV cache quantization. Default: ``64``.
        quantized_kv_start (int): Step to begin using a quantized KV cache.
           when ``kv_bits`` is non-None. Default: ``0``.
        prompt_progress_callback (Callable[[int, int], None]): A call-back which takes the
           prompt tokens processed so far and the total number of prompt tokens.
        input_embeddings (mx.array, optional): Input embeddings to use instead of or in
          conjunction with prompt tokens. Default: ``None``.

    Yields:
        Tuple[mx.array, mx.array]: One token and a vector of log probabilities.
    Nz(Model does not support input embeddings.r   z8When providing input_embeddings, their sequence length (z0) must match the sequence length of the prompt (z), or the prompt must be empty.z=Either input_embeddings or prompt (or both) must be provided.)r   c                      d S rK   r/   _s    r%   rR   zgenerate_step.<locals>.<lambda>u  s    t r'   r{   r|   r}   c                 .    t          j        | d          S NaxisrM   argmaxrQ   s    r%   rR   zgenerate_step.<locals>.<lambda>~      BIab$9$9$9 r'   input_tokensr   c                 >    | | |          S  |           S )N)r   r   r   r/   )r   r   rG   rz   s     r%   _model_callz"generate_step.<locals>._model_call  sA    '5LCS    5\::::r'   c                    t          j        t                    5   | d          ||d          nd           }|d d dd d f         }r>t          |           dk    r+t          j        | g          n| D ]} ||          } 	           |t          j        |d          z
  } 
|          }||                    d          fcd d d            S # 1 swxY w Y   d S )Nr   r   r   r   T)keepdims)rM   streamgeneration_streamlenconcat	logsumexpsqueeze)r   r   logits	processorre   sampledr   r   rz   quantize_cache_fnr   tokenss         r%   _stepzgenerate_step.<locals>._step  sz    Y()) 	0 	0 [)$/.>.J$T**PT  F AAAr111H%F  7S%6%6%:%: ) Iv|4555% 
 "3 7 7I&Yvv66FFl+++Vd C C CCHgh''GH,,Q////	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0s   B2CC!Cr   r   c                     g | ]	}|j         
S r/   state.0r   s     r%   
<listcomp>z!generate_step.<locals>.<listcomp>      333QW333r'   Tr   rK   )r   
ValueErrorr   r   make_prompt_cache	functoolsr   r   rM   rN   r	   r   r   minevalclear_cache
async_evalitem)r   rG   r   r   r   r   rz   r   r}   r|   r{   r   r   r   total_prompt_tokensprompt_processed_tokens	remainingn_to_processyre   nnext_ynext_logprobsr   r   r   s    ` `` `                @@@r%   generate_stepr   /  s     ^ #2599 	GHHH[[1__V4D0E0E!E!E(3O_K`K` ( (ADV( ( (  
 
V		K
 
 	
 F .#
 
 

  8LOO!)-#	   :99G;"( ;hrx>P ; ; ; ; ; ; ;0 0BH 08J 0 0 0 0 0 0 0 0 0 0 08 
$	%	% T T%5%AC !!!s6{{ 	 #$  !8:MNNN!$;;a??,/FF!KI0)<<LK#M\M248 (3 %]l]3D99    l+++G33l333444#|3#$$%<>QRRRLMM*F $/ !//% 
 N+ "$;;a??. eBRSSS8;T T T T T T T T T T T T T T T> M!X	A
??$)E!HH!FMM&-00066GAJJJ$$%8:MNNN
??Effhh    s7a<<Nm8	Qs   #C;I**I.1I.      )	num_draft_tokensr   r   r   rz   r   r}   r|   r{   draft_modelr   c       	      #     K   |                      t          j                  }d|)t          j        |          t          j                  n8|dt          |j                           |t          |j                  d         pd t          j        t          ||
|	          fddfd	fd}fd}fd	}t          j
        t                    5   ||          } |||          }ddd           n# 1 swxY w Y   d
}d
}d
}	 	 t          ||z
  |          } |||          }dj        |j        z
  |z
  dz            t          j        ||g          } |||dz             \  }}t          j        ||           |                                }|                                }d
}||k     r=||         ||         ||         }}}||k    rn|dz  }|dz  }||dfV  ||k    rn||k     =||k     r|dz  }||         ||         dfV  ||k    rnt          j        ||         gt          j                  }|}||k    r;t          j        t          j        |dd         t          j                  |g          }dt%          ||z
  d                     |||           	  |||           dS #  |||           w xY w)au  
    A generator producing token ids based on the given prompt from the model.

    Args:
        prompt (mx.array): The input prompt.
        model (nn.Module): The model to use for generation.
        draft_model (nn.Module): The draft model for speculative decoding.
        num_draft_tokens (int, optional): The number of draft tokens for
          speculative decoding. Default: ``2``.
        max_tokens (int): The maximum number of tokens. Use``-1`` for an infinite
          generator. Default: ``256``.
        sampler (Callable[[mx.array], mx.array], optional): A sampler for sampling a
          token from a vector of log probabilities. Default: ``None``.
        logits_processors (List[Callable[[mx.array, mx.array], mx.array]], optional):
          A list of functions that take tokens and logits and return the processed
          logits. Default: ``None``.
        prompt_cache (List[Any], optional): A pre-computed prompt cache. Note, if
          provided, the cache will be updated in place. The cache must be trimmable.
        prefill_step_size (int): Step size for processing the prompt.
        kv_bits (int, optional): Number of bits to use for KV cache quantization.
          None implies no cache quantization. Default: ``None``.
        kv_group_size (int): Group size for KV cache quantization. Default: ``64``.
        quantized_kv_start (int): Step to begin using a quantized KV cache.
           when ``kv_bits`` is non-None. Default: ``0``.

    Yields:
        Tuple[mx.array, mx.array, bool]: One token, a vector of log probabilities,
          and a bool indicating if the token was generated by the draft model
    Nc                 .    t          j        | d          S r   r   r   s    r%   rR   z+speculative_generate_step.<locals>.<lambda>  r   r'   r   c                 |    rD ]} || |          }|t          j        |dd          z
  } |          }||fS )Nr   Tr   r   )rM   r   )r   r   r   re   r   r   r   s        r%   _process_and_samplez6speculative_generate_step.<locals>._process_and_sample  sa     	3. 3 3	"6622BLb4HHHHGH({r'   r   c           
         t          j        t                    5   | |d          |          }|d d | d d d f         } |           
rg g }}|dk    r|d |dz
            }t          |          D ]b}t          j        |g          n| 	|d d |d d f                   \  }}|                    |           |                    |           ct          j        |d          t          j        |d          fcd d d            S  	d |                    d                    cd d d            S # 1 swxY w Y   d S )Nr   r   r   r   )rM   r   r   rangeconcatenateappendr   )rG   r   r   	n_predictr   out_yout_logprobsire   r   r   prev_tokensr   s            r%   r   z(speculative_generate_step.<locals>._step  s   Y()) 	D 	DU1T7%000FAAA	z{{AAA-.Fe$$$  D&("|q==,i!m,,-Ay)) 2 2A '2 Q'7888  
 #6"5k6!!!QPQPQPQ'?"S"SKAxLLOOO ''1111~e!444bn q7 7 7 '	D 	D 	D 	D 	D 	D 	D 	D. +*41B1BCC/	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	Ds   C(D;D;;D?D?c                     |j         k    rl | |d          d          |            |           t          j        d |D                        |d          }t          j                     |j         k    l|S )Nr   c                     g | ]	}|j         
S r/   r   r   s     r%   r   z?speculative_generate_step.<locals>._prefill.<locals>.<listcomp>=  s    ,,,QW,,,r'   )sizerM   r   r   )rG   r   r   r   r   s      r%   _prefillz+speculative_generate_step.<locals>._prefill9  s    f(((E!&&&'-U;;;;e$$$G,,e,,,---#$$%AN f((( r'   c                     t          j        | |z
             t          j        t          | |z
  dz
  d                     d S )Nr   r   )r   trim_prompt_cachemax)	num_draft
num_acceptdraft_cachemodel_caches     r%   _rewind_cachez0speculative_generate_step.<locals>._rewind_cacheB  sH    Y-CDDDSZ1G!1KQ-O-OPPPPPr'   c                    |dk    rt          j        g t           j                  S g }t          |          D ];} |           \  } }t          j        |            |                    |            <t          j        |          S )Nr   )rM   rN   uint32r   r   r   r   )r   r   ysr   r   r   r   s       r%   _draft_generatez2speculative_generate_step.<locals>._draft_generateF  s    >>8B	***y!! 	 	A5k155DAqM!IIaLLLL~b!!!r'   r   TFr   )r   )astyperM   r   r   r   r   layersr   r   r   r   r   r   r   r   r   tolistrN   r   ) r   rG   r   r   r   r   r   rz   r   r}   r|   r{   r   r   r   r   draft_yntoksr   r   draft_tokensr   re   tndtnlpnr   r   r   r   r   r   s      `  `` `                 @@@@@@r%   speculative_generate_stepr     sP     Z 	bi  AK -e44-k::"#6S%6%6#67"3u|#4#4#6#67:99G!)-#	       D D D D D D D D D4     Q Q Q Q Q Q" " " " " " " 
$	%	% , ,(;Q77HUK++, , , , , , , , , , , , , , , EI	A+$(	(J.0@AAI*?7I>>L&)*UK,<qv,E	,QTU,U*UV<011A$uUKIMJJFHGFL)))'..00L]]__FAi--%ay,q/8A;C99Q
#tm###J&& i-- z!!
Qi!e3333
""&)bi00AG
 I~~.Xl233/;;WE  &)*BSQ-B-B,B*BCM)Q'''Q(	(2 " 	i#####i####s   .DDD'F%K K)	tokenizerc              +   V  K   t          |t                    st          |          }t          |t          j                  sdt          |t                    r;|j        du p|                    |j                   }|                    ||          }t          j        |          }|j        }||d<   |1|	                    dd           t          || fi |}d |D             }n;|	                    dd           |	                    dd           t          || |fi |}t          | t          g          5  t          j                    }	t!          |          D ]\  }
\  }}}|
dk    r3t          j                    |	z
  }|j        |z  }t          j                    }	||j        v r nw|                    |           |
d	z   |k    r nWt)          |j        ||||j        ||
d	z   |
d	z   t          j                    |	z
  z  t          j                    d
z  d
  
        V  |                                 t)          |j        ||||j        ||
d	z   |
d	z   t          j                    |	z
  z  t          j                    d
z  ||j        v rdnd
  
        V  ddd           dS # 1 swxY w Y   dS )a  
    A generator producing text based on the given prompt from the model.

    Args:
        model (nn.Module): The model to use for generation.
        tokenizer (PreTrainedTokenizer): The tokenizer.
        prompt (Union[str, mx.array, List[int]]): The input prompt string or
          integer tokens.
        max_tokens (int): The maximum number of tokens to generate.
          Default: ``256``.
        draft_model (Optional[nn.Module]): An optional draft model. If provided
          then speculative decoding is used. The draft model must use the same
          tokenizer as the main model. Default: ``None``.
        kwargs: The remaining options get passed to :func:`generate_step`.
          See :func:`generate_step` for more details.

    Yields:
        GenerationResponse: An instance containing the generated text segment and
            associated metadata. See :class:`GenerationResponse` for details.
    Nadd_special_tokensr   r   c              3   &   K   | ]\  }}||d fV  dS )FNr/   )r   rd   re   s      r%   	<genexpr>z"stream_generate.<locals>.<genexpr>  s>       
 
)8UHe$
 
 
 
 
 
r'   r   r   r   r       eA)
rc   rd   re   rf   rg   rh   ri   rj   rk   rl   stoplength)rL   r   rM   rN   r7   	bos_token
startswithencodedetokenizerpopr   r   r`   r   timeperf_counterrw   r   eos_token_ids	add_tokenrb   last_segmentget_peak_memoryfinalize)rG   r   r   r   r   kwargsr   r   token_generatorticr   rd   re   rf   prompt_timerh   s                   r%   stream_generater	    s~     8 i!122 0$Y//	fbh'' "fc"" 	U!*!4!< "FDUDU#E E A %%fAS%TTF&!!'K%F<

%t,,,'@@@@
 
<K
 
 
 	

=$'''

-t4443E;
 
*0
 
 
U./	0	0 '
 '
!!09/0J0J 	 	,A,xAvv"/11C7#[;6
'))	///!!%(((A*$$$ -!%$k%"#a% !A$*;*=*=*CD.0036"       	 )! +!!eEd&7&9&9C&?@*,,s2$)Y-D$D$D&&(
 
 
 	
 	
 	
9'
 '
 '
 '
 '
 '
 '
 '
 '
 '
 '
 '
 '
 '
 '
 '
 '
 '
s   5EJJ"%J"Fverbosec                    |rt          d           d}t          | ||fi |D ]%}|rt          |j        dd           ||j        z  }&|rt                       t          d           t          |          dk    rt          d           dS t          d|j         d	|j        d
d           t          d|j         d	|j        d
d           t          d|j        d
d           |S )a  
    Generate a complete response from the model.

    Args:
       model (nn.Module): The language model.
       tokenizer (PreTrainedTokenizer): The tokenizer.
       prompt (Union[str, List[int]]): The input prompt string or integer tokens.
       verbose (bool): If ``True``, print tokens and timing information.
           Default: ``False``.
       kwargs: The remaining options get passed to :func:`stream_generate`.
          See :func:`stream_generate` for more details.
    z
========== T)endflushr   z!No text generated for this promptNzPrompt: 	 tokens, .3f tokens-per-seczGeneration: zPeak memory:  GB)	rW   r	  rc   r   rg   rh   ri   rj   rk   )rG   r   r   r
  r  rc   responses          r%   generater    sP   &  hD#E9fGGGG   	5(-Rt4444 =ht99>>5666F8x- 8 8"78 8 8	
 	
 	
 	<85 < <&;< < <	
 	
 	
 	;h2;;;;<<<Kr'   c                 z    t          d | D                       t          j        fd| D                       S )Nc              3   4   K   | ]}t          |          V  d S rK   r   r   ps     r%   r   z$_left_pad_prompts.<locals>.<genexpr>  (      11AQ111111r'   c                 B    g | ]}d gt          |          z
  z  |z   S r   r  r   r  
max_lengths     r%   r   z%_left_pad_prompts.<locals>.<listcomp>  s0    FFFaSJQ/014FFFr'   r   rM   rN   promptsr  s    `r%   _left_pad_promptsr"    J    1111111
8FFFFgFFFGGGr'   c                 z    t          d | D                       t          j        fd| D                       S )Nc              3   4   K   | ]}t          |          V  d S rK   r  r  s     r%   r   z%_right_pad_prompts.<locals>.<genexpr>  r  r'   c                 B    g | ]}|d gt          |          z
  z  z   S r  r  r  s     r%   r   z&_right_pad_prompts.<locals>.<listcomp>   s0    FFFQ!
SVV 344FFFr'   r  r   s    `r%   _right_pad_promptsr'    r#  r'   c                   v    e Zd ZU dZdZeed<   dZeed<   dZ	eed<   dZ
eed<   dZeed<   dZeed<   dZeed	<   d
S )
BatchStatsa%  
    An data object to hold generation stats.

    Args:
        prompt_tokens (int): The number of prompt tokens processed.
        prompt_tps (float): The prompt processing tokens-per-second.
        prompt_time (float): The time in seconds spent in prompt processing.
        generation_tokens (int): The number of generated tokens.
        generation_tps (float): The tokens-per-second for generation.
        generation_time (float): The time in seconds spent in generation .
        peak_memory (float): The peak memory used so far in GB.
    r   rg   rh   r  ri   rj   generation_timerk   N)rm   rn   ro   rp   rg   r:   rq   rh   r<   r  ri   rj   r*  rk   r/   r'   r%   r)  r)  #  s           M3JKsNEOUKr'   r)  c                   b    e Zd ZU dZee         ed<   eed<   eeee	                           ed<   dS )BatchResponsez
    An data object to hold a batch generation response.

    Args:
        texts: (List[str]): The generated text for each prompt.
        stats (BatchStats): Statistics about the generation.
    textsstatscachesN)
rm   rn   ro   rp   r   r7   rq   r)  r	   r   r/   r'   r%   r,  r,  ;  sS           9T$s)_%%%%%%r'   r,  c                      e Zd ZU ee         ed<   ej        ed<   ej        ed<   ee         ed<   ee         ed<   ee         ed<   ee         ed<   ee         ed<   eej                 ed	<   d
 Z	dee         fdZ
d Zd ZdS )Batchuidsr   re   r   
num_tokensr   samplersr   r   c                 *    t          | j                  S rK   )r   r2  selfs    r%   __len__zBatch.__len__V  s    49~~r'   keep_idxc                      fd|D              _          fd|D              _         fd|D              _         fd|D              _         fd|D              _         fd|D              _         fd|D              _        t          j        |t          j	                  } j
        |          _
         j        D ]}|                    |           d S )Nc                 *    g | ]}j         |         S r/   )r2  r   kr7  s     r%   r   z Batch.filter.<locals>.<listcomp>Z  s    444aTYq\444r'   c                 *    g | ]}j         |         S r/   )re   r<  s     r%   r   z Batch.filter.<locals>.<listcomp>[       <<<aq)<<<r'   c                 *    g | ]}j         |         S r/   )r   r<  s     r%   r   z Batch.filter.<locals>.<listcomp>\       @@@!4?1-@@@r'   c                 *    g | ]}j         |         S r/   )r3  r<  s     r%   r   z Batch.filter.<locals>.<listcomp>]  rA  r'   c                 *    g | ]}j         |         S r/   )r4  r<  s     r%   r   z Batch.filter.<locals>.<listcomp>^  r?  r'   c                 *    g | ]}j         |         S r/   )r   r<  s     r%   r   z Batch.filter.<locals>.<listcomp>_  s!    !N!N!N$"8";!N!N!Nr'   c                 *    g | ]}j         |         S r/   )r   r<  s     r%   r   z Batch.filter.<locals>.<listcomp>`  s    888!t{1~888r'   )r2  re   r   r3  r4  r   r   rM   rN   int32r   r   filter)r7  r9  r   s   `  r%   rG  zBatch.filterY  s   44448444	<<<<8<<<@@@@x@@@@@@@x@@@<<<<8<<<!N!N!N!NX!N!N!N8888x8888Hbh//! 	 	AHHX	 	r'   c                 l   | j                             |j                    t          j        | j        |j        g          | _        | j                            |j                   | j                            |j                   | j                            |j                   | j                            |j                   | j	                            |j	                   | j
                            |j
                   t          | j        |j                  D ]\  }}|                    |           d S rK   )r2  extendrM   r   r   re   r3  r   r4  r   r   zipr   )r7  otherr   os       r%   rI  zBatch.extendf  s   	$$$ 122U^,,,u/000u/000U^,,,%%e&=>>>5<(((
EK00 	 	DAqHHQKKKK	 	r'   c                 *    fd| j         D             S )Nc                 :    g | ]}|                               S r/   extract)r   r   idxs     r%   r   z'Batch.extract_cache.<locals>.<listcomp>s  s#    3331		#333r'   r   )r7  rQ  s    `r%   extract_cachezBatch.extract_cacher  s    3333
3333r'   N)rm   rn   ro   r   r:   rq   rM   rN   r   r8  rG  rI  rR  r/   r'   r%   r1  r1  J  s         
s)OOO	xKKKhS	S	93iCy   N  tCy    
 
 
4 4 4 4 4r'   r1  c                     fdt          | d          r"|                                 }fd|D             S fd| j        D             S fd| j        D             S )z[
    Convert a list of regular caches into their corresponding
    batch-aware caches.
    c                    t          |           t          u rt                    S t          | t                    rt          j                  | _        | S t          | t                    r/| j	        dk    rt          d          t          | j                  S t          | t                    rt          fd| j        D              S t          t          |            d          )Nr   z2RotatingKVCache with keep tokens is not supported.c              3   .   K   | ]} |          V  d S rK   r/   )r   sub_cto_batch_caches     r%   r   z6_make_cache.<locals>.to_batch_cache.<locals>.<genexpr>  s-      KK~~e44KKKKKKr'   z does not yet support batching)r*   r   r   rL   r   rM   rN   left_paddingr   keepr   r   max_sizer   r/  )r   rX  rW  s    r%   rW  z#_make_cache.<locals>.to_batch_cache|  s    77g---;'' 
	IXl33ANH?++ 	Ivzz !UVVV'
LAAA9%% 	IKKKK!(KKKLLQGGGHHHr'   
make_cachec                 &    g | ]} |          S r/   r/   )r   r   rW  s     r%   r   z_make_cache.<locals>.<listcomp>  s#    111aq!!111r'   Nc                 0    g | ]}t                    S r/   )r   )r   r   rX  r   s     r%   r   z_make_cache.<locals>.<listcomp>  s1       DE$[,??  r'   c                 .    g | ]}t                    S r/   )r   )r   r   rX  s     r%   r   z_make_cache.<locals>.<listcomp>  s!    AAAq\**AAAr'   )rx   r[  r   )rG   rX  r   r   rW  s    `` @r%   _make_cacher_  v  s    I I I I I I ul## B  ""111151111"    IN    BAAAELAAAAr'   c                 d   g }t          t          | d                             D ]t          | d                  d          rA|                    | d                                      fd| D                                  _t          t          | d                             d          |S )Nr   mergec                      g | ]
}|         S r/   r/   r   r   r   s     r%   r   z!_merge_caches.<locals>.<listcomp>  s    2H2H2HA1Q42H2H2Hr'   z+ does not yet support batching with history)r   r   rx   r   ra  r   r*   )r/  batch_cacher   s     @r%   _merge_cachesre    s    K3vay>>""  6!9Q<)) 	vay|112H2H2H2H2H2H2HIIJJJJq	!%%RRR   r'   c                      fd| D             S )Nc              3   B   K   | ]}|                               V  d S rK   rO  rc  s     r%   r   z&_lazy_extract_cache.<locals>.<genexpr>  s-      ((QAIIaLL((((((r'   r/   )r   r   s    `r%   _lazy_extract_cacherh    s    ((((%((((r'   c                      e Zd Ze G d d                      Z	 	 	 	 	 	 	 	 	 	 d%ded	ee         d
eee	j
        ge	j
        f                  deeee	j
        e	j
        ge	j
        f                           dedededeeeeeeee         f                  gdf                  deeeeeeef                  gdf                  dee         fdZd Zd Z	 	 	 	 	 d&deee         edf         dedz  dedz  deez  dz  fdZd'dee         defdZed             Zd Zde	j
        dee         dedz  dedz  d ee	j
                 f
d!Zd" Zd# Zd$ ZdS )(BatchGeneratorc                   t    e Zd ZU eed<   eed<   ej        ed<   ee         ed<   e	g e
e         f         ed<   dS )BatchGenerator.Responseuidrd   re   rl   rz   N)rm   rn   ro   r:   rq   rM   rN   r	   r7   r   r   r   r/   r'   r%   Responserl    s[         


(}$$$r49}------r'   rn     N       r   r   stop_tokensr   r   completion_batch_sizeprefill_batch_sizer   prompt_checkpoint_callbackr   r   c                    || _         g | _        || _        |pt                      | _        |pd | _        |pg | _        d| _        || _        || _	        t          ||          | _        |	| _        |
pd | _        t                      | _        d| _        || _        d | _        t&          j                                        r2t'          j        t'          j                    d                   | _        d S d | _        d S )Nc                 .    t          j        | d          S r   r   r   s    r%   rR   z)BatchGenerator.__init__.<locals>.<lambda>  s    RYqr-B-B-B r'   r   c                      d S rK   r/   r   s    r%   rR   z)BatchGenerator.__init__.<locals>.<lambda>  s    PT r'   rS   )rG   unprocessed_promptsr   setrr  r   r   	uid_countr   rt  r   rs  ru  r   r)  _stats_next_countr   active_batchrM   rT   rU   rX   rV   _old_wired_limit)r7  rG   r   rr  r   r   rs  rt  r   ru  r   r   s               r%   __init__zBatchGenerator.__init__  s    ( 
#% $&/#%%C#B#B!2!8b!2"4%()>@R%S%S"*D'(@(U__% ll& 8  "" 	)$&$6  !CD% %D!!! %)D!!!r'   c                     | j         ;t          j        t                     t          j        | j                    d | _         d S d S rK   )r  rM   rY   r   rX   r6  s    r%   closezBatchGenerator.close  sF     ,N,---t4555$(D!!! -,r'   c                 .    |                                   d S rK   )r  r6  s    r%   __del__zBatchGenerator.__del__  s    

r'   r4  prompt_checkpointsc           
         g }|t          |t                    r|p| j        gt          |          z  }|t          |t                    r|pdgt          |          z  }|d gt          |          z  }t	          t          |                    D ]&}||         t          j        | j                  ||<   '|pd gt          |          z  }|p| j        gt          |          z  }t          ||||||          D ]Y\  }	}
}}}}| j
                            | j        |	|
||||f           |                    | j                   | xj        dz  c_        Zt          | j
        d           | _
        |S )Nr   r   c                 l    t          | d                   t          d | d         D                       z   S )Nr   c              3   >   K   | ]}|                                 V  d S rK   )r   r   s     r%   r   z:BatchGenerator.insert.<locals>.<lambda>.<locals>.<genexpr>	  s*      )A)Aq!&&(()A)A)A)A)A)Ar'   r3   )r   r   r   s    r%   rR   z'BatchGenerator.insert.<locals>.<lambda>	  s0    #ad))c)A)AAaD)A)A)A&A&AA r'   )key)rL   r:   r   r   r   r   r   rG   r   rJ  ry  r   r{  sorted)r7  r!  r   r/  r4  r   r  r2  r   r  mr   r_   lppcs                  r%   insertzBatchGenerator.insert  s    J!<!<$783w<<GJ%4F)L)L%"4":!;c'll!J>Vc'll*Fs7||$$ 	@ 	@Aay !3DJ??q	4vG4-X$2H1ICPWLL1X"%Z3DFX#
 #
 	  	 Aq!QB $++T^Q1aR,PQQQKK'''NNaNNN#)$AA$
 $
 $
  r'   Fr2  return_prompt_cachesc                     i }t                    | j        | j        }|r7t          |j                  D ]"\  }}|vr
|                    |          ||<   #fdt          |j                  D             }t          |          dk    r|                    |           nd | _        t          t          t          | j	                                      D ]1}| j	        |         d         v r| j	        
                    |           2|r|S d S )Nc                 "    g | ]\  }}|v	|S r/   r/   )r   r~   rm  r2  s      r%   r   z)BatchGenerator.remove.<locals>.<listcomp>  s"    QQQfaDr'   r   )rz  r~  rw   r2  rR  r   rG  reversedr   ry  r   )	r7  r2  r  r/  batchr~   rm  r9  r   s	    `       r%   removezBatchGenerator.remove  s0   4yy(%E# 9'
33 9 9FAs$ "'"5"5a"8"8F3KKQQQQ	%*(=(=QQQH8}}q  X&&&&$(!%D$< = =>>?? 	0 	0A'*1-55(,,Q/// 	M	 	r'   c                     t          d | j        D                       }| j        &|t          d | j        j        D                       z  }|S )Nc              3   :   K   | ]}|d          D ]}|j         V  dS )r3   NrO   )r   r  r   s      r%   r   z5BatchGenerator.prompt_cache_nbytes.<locals>.<genexpr>&  s5      MM!MM1AHMMMMMMMr'   c              3   $   K   | ]}|j         V  d S rK   r  r   s     r%   r   z5BatchGenerator.prompt_cache_nbytes.<locals>.<genexpr>(  s$      CCaCCCCCCr'   )sumry  r~  r   )r7  totals     r%   prompt_cache_nbytesz"BatchGenerator.prompt_cache_nbytes$  sV    MMd&>MMMMM(SCC4+<+BCCCCCCEr'   c                    t          | \  }}}}}}}d |D             }	t          |	          fd|	D             }
d t          |	|          D             }t          dt          |                    | j        xj        t	          |	          z  c_        d |D             }dt          d |D                       r	t          |          }t          | j        |
| j	                  |j
        d         k    rt          | j        |j
        d         z
            }|                     |d d d |f         	           t          j        d
 D                        |d d |d f         }|z  |                     fdt          ||	          D                        t          j                     |j
        d         k    ʐnct          j        fd|D                       }t%          |          }t'          |          D ]%}|                    fd|	D             |
           &|j
        d         k    rt          | j        |j
        d         z
            }|                     |d d d |f         	           t          j        d D                        |d d |d f         }|z  |                     fdt          ||	          D                        t          j                     |j
        d         k    t          j        d D                        |}D ]}|                                 | j        /|                     fdt/          |          D                        dk    rD|                     |d d d dz
  f         	           t          j        d D                        t          j                     |                     ||||          \  }}t          j        ||           t5          t7          |          ||t7          |          dgt9          |          z  t7          |          t7          |          |	  	        S )Nc                 ,    g | ]}t          |          S r/   r  r  s     r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>6  s    ***a3q66***r'   c                     g | ]}|z
  S r/   r/   )r   lr  s     r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>8  s    333a:>333r'   c                 0    g | ]\  }}|d k    r||z
  n| S r  r/   )r   r  r  s      r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>=  s;     
 
 
,1ArrAvvQVVB3
 
 
r'   r   c                 6    g | ]}t          j        |          S r/   )rM   rN   )r   inps     r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>D  s     222C"(3--222r'   r   c              3   J   K   | ]}|d                                           V  dS )r   N)emptyr   s     r%   r   z2BatchGenerator._process_prompts.<locals>.<genexpr>J  s.      ,,qtzz||,,,,,,r'   )r  r   c                     g | ]	}|j         
S r/   r   r   s     r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>S      777Q777r'   c                      g | ]
\  }}||fS r/   r/   r   rm  r   processed_tokenss      r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>W  5       'C .7  r'   c                 &    g | ]}| d          S rK   r/   )r   r  prompt_checkpoints     r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>d  s(    #K#K#KqA'8&8&9&9$:#K#K#Kr'   c                     g | ]}|z
  S r/   r/   )r   r  r  s     r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>l  s    DDDqQ!22DDDr'   )lengthsright_paddingc                     g | ]	}|j         
S r/   r   r   s     r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>u  r  r'   c                      g | ]
\  }}||fS r/   r/   r  s      r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>y  r  r'   c                     g | ]	}|j         
S r/   r   r   s     r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>  r   r'   c                 <    g | ]\  }}|t          |          fS r/   )rh  )r   r   rm  rz   r  s      r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>  s@       3 +-@q-Q-QR  r'   c                     g | ]	}|j         
S r/   r   r   s     r%   r   z3BatchGenerator._process_prompts.<locals>.<listcomp>  r   r'   )rJ  r   r|  rg   r  allr"  r_  rG   r   shaper   r   rM   r   r   r   rN   r'  re  preparer  ru  rw   r   r   r1  listr   )r7  r!  r2  inputsr   r/  r4  r   r  r  paddingr   r   last_inputsr   r   re   r  r  rz   r  s                    @@@@r%   _process_promptszBatchGenerator._process_prompts+  s_    M	
 +*6***\\
33337333

 
58BT5U5U
 
 
  3'9#:#:;;!!S\\1!!226222
 ,,V,,,,, 7	!&v*EEEF&tz7D<LMML,q/$555"*FLO>O,O    

6!!!]l]"23<
HHH77,777888<== 01 L0 --   +.tW+=+=         ,q/$555, (#K#K#K#KF#K#K#KLLK':FFFF(00L!   		DDDDGDDD")     
 ,q/$555"*FLO>O,O    

6!!!]l]"23<
HHH77,777888<== 01 L0 --   +.tW+=+=         ,q/$555  G33l333444 F 	 	AJJLLLL *6++    "+D//     q  JJvaaa!8#4q#8!889JNNNG33l333444
jjL(,=v
 
8 	a"""JJC#d))ONN"##

 

 
	
r'   r   rz   r   c                    |j         d         }|                     ||          }|d d dd d f         }t          |          rig }t          |          D ]A}	||	|	dz            }
||	         D ]} |||	         |
          }
|                    |
           Bt          j        |d          }|t          j        |dd          z
  }t          |          reg }t          |          D ]<}	||	         p| j        } |||	|	dz                      }|                    |           =t          j        |d          }n|                     |          }|t          |          fS )Nr   r   r   r   r   Tr   )
r  rG   anyr   r   rM   r   r   r   r  )r7  r   rz   r4  r   r   
batch_sizer   processed_logitsr~   sample_logitsr   re   all_samplessample_samplerr   s                   r%   r   zBatchGenerator._step  s    "'*
L==2qqq! !! 	>!:&& 7 7 &q1q5y 1!21!5 H HI$-IfQi$G$GMM ''6666^$41===FBLb4HHHHx== 	-K:&& , ,!)!!<(.!a!e))<==""7++++n[q999GGll8,,GX&&r'   c                     | j         j        | j         j        z  | j         _        | j         j        | j         j        z  | j         _        t          j                    dz  | j         _	        | j         S )Nr   )
r|  rg   r  rh   ri   r*  rj   rM   r  rk   r6  s    r%   r.  zBatchGenerator.stats  sY    !%!:T[=T!TK)DK,GG 	" #%"4"6"6"<{r'   c           
      2   t          j                    }d}| j        }|rt          |          nd}| j        |z
  }|| j        k    r5| j        d | j                 }t          |          dk    r|dk    rnt          |          dk    r	d | _        g S |]|s[t          j        |j	        |j
                   | j        xj        t          j                    |z
  z  c_        t          j                    }|                     |          }| j        | j        d          | _        d}| j        || _        n| j                            |           t          | j                  }|t          |          z  }|| j        k    5| j        }|j	        |j
        }}t          |j                  D ].\  }	}
t          j        |
||	|	dz            f          |j        |	<   /|                     |d d d f         |j        |j        |j        |j                  \  |_	        |_
        t          j        |j	        |j
        |j                   |                                }t          j                    }|r| j        xj        ||z
  z  c_        n| j        xj        ||z
  z  c_        g }g }g }t          t3          ||j        |j        |j                            D ]\  }\  }}}}d }|dz  }||j        |<   || j        v rd}|                    |           n5||k    rd}|                    |           nd }|                    |           ||                    |          }|                    |                      ||||         ||                     t          |          r0t          |          dk    r|!                    |           nd | _        | xj"        dz  c_"        | j"        dz  dk    rt          j#                     | j        xj$        t          |          z  c_$        |S )NFr   Tr   r   r   r   )%r   r   r~  r   rs  rt  ry  rM   r   r   re   r|  r*  r  rI  rw   r   r   r   r   r4  r   r   r   r  rJ  r2  r3  r   rr  r   rR  rn  rG  r}  r   ri   )r7  r  prompt_processingr  
num_active
num_to_addr!  r   re   r   tokstocr9  end_idx	responsesr~   trm  num_tokmax_tokr   rl   s                         r%   _nextzBatchGenerator._next  sF   !!!!#(/SZZZa
/*<
D333./H1H/HIG7||q  Z!^^W""$(!	 ): 000++t/@/B/BS/HH++'))))'22E'+'?'))(D$ !% ($)!!!((///T.//J#e**$J9 D333< !gu~8 .. 	C 	CGAt ndAa!a%iL-ABBELOO"&**aaagJKN#L#
 #
 	egu~u|<<<HHJJ!! 	5K##sSy0###K''394''	-65:u/1ABB.
 .
 	W 	W)A)3 EqLG")EQD$$$ &q!!!!G## (q!!!! $"""(++A..T]]38A;uUUVVVV w<< 	)8}}q  X&&&&$(!Ac!Q&&N%%Y7%%r'   c                     t          j        t                    5  |                                 cd d d            S # 1 swxY w Y   d S rK   )rM   r   r   r  r6  s    r%   nextzBatchGenerator.next/  s    Y()) 	  	 ::<<	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 s   ;??)
ro  NNNrp  rq  r   NNN)NNNNNF)rm   rn   ro   r   rn  r:   r	   rz  r   rM   rN   r   r
   r   r  r  r  r   r  r  rr   r  propertyr  r  r   r.  r  r  r/   r'   r%   rj  rj    s       . . . . . . . Y. %)<@ %'"#!%  %)%+) +) +) c]	+)
 (BH:rx#789+) $28RX.89:
+)  #+)  +) +) %-d5c49!4567=>%
+) #+d5c3/01478#
+)$ c]%+) +) +) +)Z) ) )   37 $)-04% % $s)S$./%
 +%  $;% !3J-% % % %N 49 D    .   Xz
 z
 z
x!'h!' 3i!' +	!'
  $;!' RX!' !' !' !'F  [ [ [z         r'   rj  ro  r!  prompt_cachesr  c                    t          | fdj        i|}	t          |          }
d}|rt          d|
 dd           |	                    ||          }d |D             i |	                                x}r|D ]i}|j        0|r|j        |j        <   |r|d
z  }t          d| d|
 dd           |j        dk    r%|j                 	                    |j
                   j|	                                x}|	                                 |rt          d| d|
            fd|D             }|	                                }|rfd|D             nd	}|r[t          d|j         d|j        dd           t          d|j         d|j        dd           t          d|j        dd           t%          |||          S )a?  
    Generate responses for the given batch of prompts.

    Args:
       model (nn.Module): The language model.
       tokenizer (PreTrainedTokenizer): The tokenizer.
       prompts (List[List[int]]): The input prompts.
       prompt_caches (List[List[Any]], optional): Pre-computed prompt-caches
          for each input prompt. Note, unlike ``generate_step``, the caches
          won't be updated in-place.
       verbose (bool): If ``True``, print tokens and timing information.
          Default: ``False``.
       max_tokens (Union[int, List[int]): Maximum number of output tokens. This
          can be per prompt if a list is provided.
       return_prompt_caches (bool): Return the prompt caches in the batch
          responses. Default: ``False``.
       logits_processors (List[Callable[[mx.array, mx.array], mx.array]], optional):
          A list of functions that take tokens and logits and return the processed logits. Default: ``None``.
       kwargs: The remaining options get passed to :obj:`BatchGenerator`.
          See :obj:`BatchGenerator` for more details.
    rr  r   z'[batch_generate] Finished processing 0/z ...)r  )r/  c                     i | ]}|g S r/   r/   )r   rm  s     r%   
<dictcomp>z"batch_generate.<locals>.<dictcomp>`  s    '''3sB'''r'   Nr   z%[batch_generate] Finished processing /r   c                 F    g | ]}                     |                   S r/   )decode)r   rm  resultsr   s     r%   r   z"batch_generate.<locals>.<listcomp>t  s+    <<<Ygcl++<<<r'   c                      g | ]
}|         S r/   r/   )r   rm  r  s     r%   r   z"batch_generate.<locals>.<listcomp>v  s    111SmC 111r'   z[batch_generate] Prompt: r  r  r  z[batch_generate] Generation: z[batch_generate] Peak memory: r  )rj  r   r   rW   r  r  rl   rz   rm  r   rd   r  r.  rg   rh   ri   rj   rk   r,  )rG   r   r!  r  r   r
  r  r   r  gennum_samplesfinr2  r  rr-  r.  r/  r  s    ` `              @r%   batch_generater  4  s   B  +  C
 g,,K
C UIIIItTTTT::gz-:@@D''$'''GMxxzz
!) / 	/ 	/A*' :+,>M!%( 1HCWWWkWWW     &((%%ag... xxzz
!) / IIKKK KIcIIKIIJJJ =<<<<t<<<EIIKKE5IS1111D1111tF Kk(;kkeFVkkkk	
 	
 	
 	9E,C 9 9#89 9 9	
 	
 	
 	Iu/@IIIIJJJv...r'   c                     t                      } |                                 }|j        $t          j                            |j                   |j        d u}|rt          |j        d          \  }}t          |d         t                    rQ|j	        %|j	        |d         j
        k    rt          d          |j        |d         j        k    rt          d          |si nt          j        |d                   }|j        rdnd |d<   |j        }|r3|	|d         }n(||d         k    rt          d	| d
|d          d          |pt$          }t'          ||j        |d|j        i          \  }}|j        D ]}	|                    |	           i }
|j        t          j        |j                  }
|j                            dd                              dd          }|dk    rt6          j                                        n|}|j        s|j        r|j         d|j         dg}ng }|!                    d|d           |j"        d u}|r|!                    d|j"        d            |j#        |fd|| d|
}|rBd|d         d<   |#                    |d||           }||$                    d          d          }|%                    |d          }n|%                    |          }|j&        7t'          |j&                  \  }}|j'        |j'        k    rt          d          nd }tQ          |j)        |j*        |j+        |j,        |j-        |j.        |j/        |%                    d          ta          |j1                  z             }te          ||||j3        |j4        ||j5        |r|nd |j	        |j        |j6        ||j7                  }|j4        stq          |           d S d S ) NT)return_metadatar   zF--kv-bits does not match the kv cache loaded from --prompt-cache-file.zL--kv-group-size does not match the kv cache loaded from --prompt-cache-file.tokenizer_configtrust_remote_coderG   zProviding a different model (z-) than that used to create the prompt cache (z) is an error.quantize_activations)adapter_pathr  model_configz\n
z\t	-system)rolecontentuser	assistantF)tokenizecontinue_final_messageadd_generation_promptz<query>r   r  r   z5Draft model tokenizer does not match model tokenizer.)top_kxtc_probabilityxtc_thresholdxtc_special_tokens)
r   r
  r   r   rz   r}   r|   r{   r   r   )9rF   
parse_argsseedrM   randomprompt_cache_filer   rL   r   r}   rv   r   r|   ru   jsonloadsr  rG   r8   r   r  r  extra_eos_tokenadd_eos_tokenchat_template_configr   replacesysstdinreadignore_chat_templatehas_chat_templatesystem_promptr   prefill_responseapply_chat_templateindexr   r   
vocab_sizer   temptop_pmin_pmin_tokens_to_keepr  r  r  r  r   r  r   r
  r   r{   r   rW   )rE   argsusing_cacherz   metadatar  
model_pathrG   r   	eos_tokentemplate_kwargsr   messageshas_prefilltest_promptr   draft_tokenizerr   r  s                      r%   mainr    s   FDy
	ty!!! (4K !2" "
 "
 "
h l1o'788 	|'DLLO<P,P,P \   !\!_%??? b   K4:h7I.J#K#K  594J,TDDPT()J !'*JJ8G,,,
  4<W4E    
 ,}J&),d.GH	  E9 ) + +		****O ,*T%>??[  --55eTBBF!'3SY^^FF$ *)D *)!)d6HIIJHHHF;;<<<+47 	UOO[T=RSSTTT..
#.&1/	
 

 
 
  	<&/HRL##77'2*5o	 8  K K--i88::;F!!&U!CC!!&))#'+D,<'='=$_%)===TUUU > 	

j,($++D11D9P4Q4QQ	 	 	G ?$%0:\\d(2.  H < h r'   __main__zCalling `python -m mlx_lm.generate...` directly is deprecated. Use `mlx_lm.generate...` or `python -m mlx_lm generate ...` instead.rK   )r   Nr  )Nro  FFN)Xr4   
contextlibr   r  r  r   dataclassesr   r   typingr   r   r   r   r	   r
   r   mlx.corecorerM   mlx.nnnn	mlx.utilsr   transformersr   modelsr   models.cacher   r   r   r   r   r   r   r   sample_utilsr   tokenizer_utilsr   utilsr   r   r9   r;   r=   r>   r?   r@   rA   DEFAULT_XTC_THRESHOLDrB   rC   r8   rD   r&   rF   
new_streamdefault_devicer   contextmanagerModuleStreamr`   rb   r   rN   r:   r   rr   r   r7   r	  r  r"  r'  r)  r,  r1  r_  re  rh  rj  r  r  rm   rW   r/   r'   r%   <module>r2     s             



  ! ! ! ! ! !                                    ! ! ! ! ! ! , , , , , ,      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ' & & & & & - - - - - - < < < < < < < <    :! 0 0 0_ _ _F "BM"3""3"5"566  $* $*ry $*8DO+D $* $* $* $*N ( ( ( ( ( ( ( (:U U U 8<RV!%"&!!EI+/c c cHc9c 	c
 hz28345c  Xrx.BBH.L%M NOc #c 3-c c c]c c c 'xc
D0@'ABc rx(c uRXrx'($45c c c cV 8<RV"& !n$ n$ n$Hn$9n$ n$
 n$ n$ hz28345n$  Xrx.BBH.L%M NOn$ 3-n$ n$ c]n$ n$ n$ uRXrx-.d:;n$ n$ n$ n$j '+`
 `
9`
(*::;`
 #rxc*+`
 	`

 ")$`
 !4-.`
 `
 `
 `
N 	+ +9+(*::;+ #tCy.!+ 	+ 	+ + + +\H H H HH H H H        . & & & & & & & & (4 (4 (4 (4 (4 (4 (4 (4VB B B@	 	 	) ) )
J  J  J  J  J  J  J  J b 04(+!&RVL/ L/ $s)_L/ DcO,	L/
 c49n%L/ L/ L/  Xrx.BBH.L%M NOL/ L/ L/ L/ L/^z z zz z	E	P   	DFFFFF r'   