
    )ji             
       \   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z( d d
l)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9m:Z: d Z;d Z< G d de!          Z=dee>         de?deee>                  dee@         de=f
dZAde#de#deBfdZCdEdeeD         de"eD         fd ZEd! ZF G d" d#          ZGe G d$ d%                      ZHe G d& d'                      ZIe G d( d)                      ZJe G d* d+                      ZKe G d, d-                      ZLe G d. d/                      ZMe G d0 d1                      ZN G d2 d3          ZO G d4 d5          ZPd6 ZQd7 ZRde$ee@ef                  fd8ZS G d9 d:          ZT G d; d<e          ZUeeUfd=e@d>e>fd?ZVeeUfd=e@d>e>d@ePfdAZWdB ZXeYdCk    r eZdD            eX             dS dS )F    N)deque)	dataclassfield)BaseHTTPRequestHandlerThreadingHTTPServer)Path)Empty)Queue)Thread)
AnyCallableDictListLiteral
NamedTupleOptionalSequenceTupleUnion)scan_cache_dir   )__version__)BatchGeneratorgeneration_streamstream_generate)can_trim_prompt_cachemake_prompt_cachetrim_prompt_cache)make_logits_processorsmake_sampler)loadsharded_loadc                      t          j                    d         } t           dt           j         dt          j                     d|  S )Narchitecture-)mxdevice_infor   platform)gpu_archs    W/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/mlx_lm/server.pyget_system_fingerprintr+   0   sC    ~/HMMBNMMX->-@-@MM8MMM    c                    dddddd}d}| D ]#}|                                 s|dk    s n|dz  }$t          | d |                   }| |d                                                                          }t	          |||         z            S )Ng    .A    eAr   )MGMBGB r   .)isdigitfloatstripupperint)xsizessplitxidigitssizes         r*   
parse_sizer@   5   s    Cs#1==EE  

 	c		E
1VeV9FeffI$$&&Dvd#$$$r,   c                   .    e Zd ZU eed<   eed<   eed<   dS )StopConditionstop_mettrim_lengthtrim_text_lengthN)__name__
__module____qualname__bool__annotations__r9    r,   r*   rB   rB   A   s3         NNNr,   rB   tokenseos_token_idsstop_id_sequences
stop_wordsreturnc                 b   | r| d         |v rt          ddd          S t          ||          D ]o\  }}t          |           t          |          k    rJ| t          |           d         |k    r.t          dt          |          t          |                    c S pt          ddd          S )a  
    Determines whether the token generation should stop based on predefined
    conditions.

    Args:
        tokens (List[int]): The current sequence of generated tokens.
        eos_token_ids (set): The token IDs that represents the
          end-of-sequence. If the last token in ``tokens`` is in the set,
          the generation should stop.
        stop_id_sequences (List[List[[int]]): A list of integer lists, each
          representing a sequence of token IDs. If the end of the `tokens`
          list matches any of these sequences, the generation should stop.
        stop_words (List[str]): The stop words that correspond to the
            ``stop_id_sequences``.

    Returns:
        StopCondition: A named tuple indicating whether the stop condition has
          been met (`stop_met`) and how many tokens should be trimmed from the
          end if it has (`trim_length`) as well as the text that should be
          trimmed.
    Tr   )rC   rD   rE   NF)rB   ziplen)rL   rM   rN   rO   stop_ids	stop_words         r*   stopping_criteriarW   G   s    6  O&*--dANNNN"#4jAA  )v;;#h--''s8}}n&&'833$! #H%(^^      %QKKKKr,   s1s2c                      t          t                     t                              }t           fdt          d|dz             D                       S )z
    Checks if a suffix of s1 has overlap with a prefix of s2

    Args:
        s1 (Sequence): The first sequence
        s2 (Sequence): The second sequence

    Returns:
        bool: If the two sequences have overlap
    c              3   F   K   | ]}| d          d |         k    V  d S NrK   ).0irX   rY   s     r*   	<genexpr>z#sequence_overlap.<locals>.<genexpr>}   s:      DDQr1"##w"RaR& DDDDDDr,   r   )minrT   anyrange)rX   rY   max_overlaps   `` r*   sequence_overlaprd   q   sU     c"ggs2ww''KDDDDD%;?*C*CDDDDDDr,   messagesrole_mappingc                 ,   dddddd}||n|}d}| D ]U}|                     |d         d          }|                     d	d          }|                     d
d          }|| | | z  }V||                     dd          z  }|                                S )Nz}A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.zASSISTANT's RULE: zUSER: zASSISTANT: 
)system_promptsystemuser	assistantstopr3   rolerm   contentrl   )getrstrip)re   rf   default_role_mappingpromptlinerole_prefixrm   ro   s           r*   convert_chatrv      s    O '"	 	 $0#;<<AULF 2 2"&&tF|R88++((9b))[1'14111
l{B///F==??r,   c                    | D ]}|                     dd          }t          |t                    rTd |D             }t          |          t          |          k    rt	          d          d                    |          |d<   n|d|d<   |                     dd          x}rL|D ]I}|                     dd          x}r/|                     d	d          x}rt          j        |          |d	<   JdS )
a  
    Convert message content to a format suitable for `apply_chat_template`.

    The function operates on messages in place. It converts the 'content' field
    to a string instead of a list of text fragments.

    Args:
        message_list (list): A list of dictionaries, where each dictionary may
          have a 'content' key containing a list of dictionaries with 'type' and
          'text' keys.

    Raises:
        ValueError: If the 'content' type is not supported or if 'text' is missing.

    ro   Nc                 6    g | ]}|d          dk    |d         S )typetextrK   )r]   fragments     r*   
<listcomp>z+process_message_content.<locals>.<listcomp>   s4       %-HV<LPV<V<V <V<V<Vr,   z&Only 'text' content type is supported.r3   
tool_callsFfunction	arguments)rp   
isinstancelistrT   
ValueErrorjoinjsonloads)re   messagero   text_fragmentsr}   	tool_callfuncargss           r*   process_message_contentr      s(      = =++i..gt$$ 	$ 18  N >""c'll22 !IJJJ!#!8!8GI_!#GI \5999: 	=' = =	$==U;;;4 =#xxU;;;t =,0Jt,<,<[)= =r,   c                      e Zd Ze G d d                      Z G d d          Ze G d d                      Zdd	ed
efdZd Z	e
d             Zd Zd Zd Zd ZddefdZddddee         dee         fdZd ZdS )LRUPromptCachec                   0    e Zd ZU ee         ed<   eed<   dS )LRUPromptCache.CacheEntryprompt_cachenbytesN)rF   rG   rH   r   r   rJ   r9   rK   r,   r*   
CacheEntryr      s)         3ir,   r   c                   4    e Zd Zd Zd Zd	defdZd Zd ZdS )
LRUPromptCache.CacheOrderc                 R    t                      | _        t                      | _        d S r\   )r   _lru_checkpoints_lruselfs    r*   __init__z"LRUPromptCache.CacheOrder.__init__   s    $)GGD!DIIIr,   c                 T    t          | j                  t          | j                  z   S r\   )rT   r   r   r   s    r*   __len__z!LRUPromptCache.CacheOrder.__len__   s     ty>>C(=$>$>>>r,   F
checkpointc                 T    |r| j         n| j        }|                    ||f           d S r\   )r   r   append)r   modelrL   r   cs        r*   pushzLRUPromptCache.CacheOrder.push   s2    )3B%%AHHeV_%%%%%r,   c                     	 | j                             ||f           d S # t          $ r  | j                            ||f           Y d S w xY wr\   )r   remover   r   )r   r   rL   s      r*   r   z LRUPromptCache.CacheOrder.remove   sf    >	  %11111 > > >%,,eV_======>s     &A
	A
c                     t          | j                  t          | j                  k    r| j                                        S | j                                        S r\   )rT   r   r   popleftr   s    r*   popzLRUPromptCache.CacheOrder.pop   sI    49~~T%:!;!;;;y((***,44666r,   NF)	rF   rG   rH   r   r   rI   r   r   r   rK   r,   r*   
CacheOrderr      sp        	  	  	 	? 	? 	?	& 	&$ 	& 	& 	& 	&	> 	> 	>	7 	7 	7 	7 	7r,   r   c                   f    e Zd ZU eed<   ee         ed<   ee         ed<   ee         ed<   eed<   dS )LRUPromptCache.SearchResultr   exactshorterlongercommon_prefixN)rF   rG   rH   r   rJ   r   r9   rK   r,   r*   SearchResultr      sW         


CycS	r,   r   
               max_size	max_bytesc                 p    || _         || _        i | _        |                                 | _        d| _        d S Nr   )r   r   _cacher   r   _n_bytes)r   r   r   s      r*   r   zLRUPromptCache.__init__   s3     "OO%%	r,   c                 *    t          | j                  S r\   )rT   r   r   s    r*   r   zLRUPromptCache.__len__   s    49~~r,   c                     | j         S r\   )r   r   s    r*   r   zLRUPromptCache.nbytes   s
    }r,   c                    || j         vr|                     |dddd          S | j         |         }d}d}|t          |          k     r@||         |v r6|||                  }d|v r|}|dz  }|t          |          k     r
||         |v 6|t          |          dz
  k    r|                     ||ddd          S d}|dk    r|d|dz            }d}|}|dk    r~d}	|g fg}
|
rh|
                                \  }}d|v r%|	 t          |          t          |	          k     r|}	n&|D ]#}|
                    ||         ||gz   f           $|
h|d|         |	z   }|                     |d|||          S )zASearch the cache for a prompt cache. Return exact or close match.Nr   rR   cacher   )r   r   rT   r   r   )r   r   rL   currentlast_cache_indexindexr   r   r   beststackextratoks                r*   _searchzLRUPromptCache._search   s   ##$$UD$a@@@+e$c&kk!!fUmw&>&>fUm,G'!!#( QJE	 c&kk!!fUmw&>&> s6{{Q..$$UFD$BBB a3/!334G 199Dr]OE D!&g%%|s5zzCII'='=$& D DgclESEM%BCCCC  D FUF^d*F  gv}MMMr,   c                 F    | j         |         }|D ]
}||         }|d         S )Nr   )r   )r   r   rL   r   r   s        r*   _getzLRUPromptCache._get  s3    +e$ 	# 	#CclGGwr,   c                    | j         |         g}|D ]#}|                    |d         |                    $|d         d         j        }| xj        |z  c_        |d         d= t	          t          t          |                              D ]6}||         ||dz            ||         }	}}t          |          dk    r d S ||	= 7d S )NrR   r   r   r   )r   r   r   r   reversedrb   rT   )
r   r   rL   pathr   cache_bytesr^   d_prevdts
             r*   _deletezLRUPromptCache._delete  s    E"# 	' 	'CKKR&&&&2hw'.$HW%F,,-- 	 	A7DQKqAF1vvzzq				 	r,   c                    |                      ||          }|j        ;|                     |j        |j                  }t	          j        |j                  g fS |j        t          |j                  nd}|j	        |j
        |k    r|                     |j        |j	                  }t          |j                  rqt	          j        |j                  }t          t          |          dz
  |j
                  }t          |j	                  |z
  }t          ||           |||d          fS |dk    rC|                     |j        |j                  }t	          j        |j                  ||d          fS d |fS )Nr   r   )r   r   r   r   copydeepcopyr   r   rT   r   r   r   r`   r   )	r   r   rL   resultcache_entryshort_lengthr   prefixnum_to_trims	            r*   fetch_nearest_cachez"LRUPromptCache.fetch_nearest_cache+  sX   eV,,<#))FL&,??K=!9::B>>.4n.Hs6>***a=$)=)L)L))FL&-@@K$[%=>> .k&>??S[[1_f.BCC!&-0069!%555fVWWo--!))FL&.AAK=!9::F<==<QQQV|r,   Fr   c                    t          |          }|| j        vr
i | j        |<   | j        |         }t          |          D ]]\  }}||vri ||<   |rEd|v rA| xj        |d         j        z  c_        |d= | j                            ||d |                    ||         }^d|v r| j                            ||           nBt          d |D                       }	|                     ||	          |d<   | xj        |	z  c_        | j        	                    |||           t          | j                  | j        k    r2| j                                        \  }}|                     ||           | j        | j        k    rtt          | j                  dk    r`| j                                        \  }}|                     ||           | j        | j        k    rt          | j                  dk    Zd S d S d S d S )Nr   c              3   $   K   | ]}|j         V  d S r\   )r   r]   r   s     r*   r_   z.LRUPromptCache.insert_cache.<locals>.<genexpr>S  s$      ==1ah======r,   r   r   )r   r   	enumerater   r   r   r   sumr   r   rT   r   r   r   r   )
r   r   rL   r   r   is_trimmabler   r^   r   r   s
             r*   insert_cachezLRUPromptCache.insert_cacheA  s   ,\::##!#DK+e$'' 	# 	#FAs'!!! 47 2 2!1!88G$	  rr
333clGGgIUF++++=======K#|[IIGGMM[(MM	uf<<<ty>>DM)) IMMOOME6LL'''mdn,,TY!1C1C IMMOOME6LL''' mdn,,TY!1C1C1C1C,,,,1C1Cr,   N)n_sequencesn_bytesr   r   c                   |t          d|          nd}|t          d|          nd}t          | j                  |k    rJ| j                                        \  }}|                     ||           t          | j                  |k    J| j        |k    r?| j                                        \  }}|                     ||           | j        |k    =d S d S )Nr   r   )maxrT   r   r   r   r   )r   r   r   r   rL   s        r*   trim_tozLRUPromptCache.trim_to_  s     .9-Dc![)))'%,%8#a///g$)nn{** IMMOOME6LL''' $)nn{** mg%% IMMOOME6LL''' mg%%%%%%r,   c           	          t          |           | j        }}t          | j        j                  dk    r%t          | j        j        d         d                   nd}t	          j        d| d|dz  dd| d	           d S )
Nr   rR   r   zKV Caches: z seq, r.   z.2fz GB, latest user cache z tokens)rT   r   r   r   logginginfo)r   ncachesr   ntoks       r*   log_cache_statszLRUPromptCache.log_cache_statsl  s    d))T[ 49-..22 	*2.q1222 	
 	]']]]]]PT]]]	
 	
 	
 	
 	
r,   )r   r   r   )rF   rG   rH   r   r   r   r   r9   r   r   propertyr   r   r   r   r   rI   r   r   r   r   rK   r,   r*   r   r      s              Y7 7 7 7 7 7 7 70        Y  c         X'N 'N 'NR         ,( (D ( ( ( (> /3T( ( (&sm(=Ec]( ( ( (	
 	
 	
 	
 	
r,   r   c                   .    e Zd ZU eed<   eed<   eed<   dS )ModelDescriptionr   draftadapterN)rF   rG   rH   strrJ   rK   r,   r*   r   r   x  s+         JJJJJJLLLLLr,   r   c                   L    e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   dS )SamplingArgumentstemperaturetop_ptop_kmin_pxtc_probabilityxtc_thresholdN)rF   rG   rH   r6   rJ   r9   rK   r,   r*   r   r     sQ         LLLJJJLLLr,   r   c                   r    e Zd ZU eeeef                  ed<   eed<   eed<   eed<   eed<   eed<   eed<   dS )	LogitsProcessorArguments
logit_biasrepetition_penaltyrepetition_context_sizepresence_penaltypresence_context_sizefrequency_penaltyfrequency_context_sizeN)rF   rG   rH   r   r   r9   r6   rJ   rK   r,   r*   r   r     st         c5j)****    r,   r   c                       e Zd ZU eed<   eed<   eed<   ee         ed<   e	ed<   e	ed<   e
ed<   e	ed<   ee	         ed	<   eeeef                  ed
<   dS )GenerationArgumentsr   samplinglogitsrO   
max_tokensnum_draft_tokenslogprobstop_logprobsseedchat_template_kwargsN)rF   rG   rH   r   rJ   r   r   r   r   r9   rI   r   r   r   rK   r,   r*   r  r    s         $$$$S	OOONNN
3-"4S>222222r,   r  c                       e Zd ZU ed         ed<   eed<   ee         ed<   eee                  ed<   ee	eef                  ed<   dS )CompletionRequest)chatrz   request_typers   re   toolsrf   N)
rF   rG   rH   r   rJ   r   r   r   r   r   rK   r,   r*   r  r    sj         .))))KKK3iDI4S>******r,   r  c                       e Zd ZU eed<   eed<   eed<   eeegef         ed<   eed<   e	ed<   e	ed<   eed<   e
ed	<   eee	                  ed
<   ee	         ed<   dZe	ed<   dZeed<   d ZdS )GenerationContexthas_tool_callingtool_call_starttool_call_endtool_parserhas_thinkingthink_start_idthink_end_id	think_endrM   stop_token_sequencesrs   rR   prompt_cache_countF_should_stopc                     d| _         d S NT)r  r   s    r*   rm   zGenerationContext.stop  s     r,   N)rF   rG   rH   rI   rJ   r   r   r   r   r9   setr   r  r  rm   rK   r,   r*   r  r    s         3*d*++++NNNtCy/)))I    L$! ! ! ! !r,   r  c                   j    e Zd ZU eed<   eed<   eed<   ee         ed<   ee	ee
f                  ed<   dS )Responserz   tokenlogprobfinish_reason
top_tokensN)rF   rG   rH   r   rJ   r9   r6   r   r   r   r   rK   r,   r*   r"  r"    sV         
IIIJJJNNNC=   d38n%%%%%%r,   r"  c                   "    e Zd ZddZd Zd ZdS )	
TimeBudget      ?   r   c                     t           j                                                                        dk    | _        || _        || _        || _        d | _        d | _	        d| _
        d| _        d S Nr   r   )r&   distributedinitr?   _is_distributed_budget_iterations_sync_frequency_start_current_iterations_loops_time_spent)r   budget
iterationssync_frequencys       r*   r   zTimeBudget.__init__  sd    !~224499;;a?%-#' r,   c                 D    t          j                     | _        d| _        | S r   )timer3  r4  r   s    r*   __iter__zTimeBudget.__iter__  s    ikk#$ r,   c                 :   | j         s4t          j                    | j        z
  | j        k    rt	                      d S | xj        dz  c_        | j        | j        k    r>| xj        dz  c_        | xj        t          j                    | j        z
  z  c_        | j        | j	        z  dk    rt          j        t                    5  t          j                            | j                                                  }d d d            n# 1 swxY w Y   |t          j                                                                        | j	        z  z  }| j        |z  }t%          t'          | j        |z            d          | _        d| _        d| _        t	                      d S r,  )r/  r;  r3  r0  StopIterationr4  r1  r5  r6  r2  r&   streamr   r-  all_sumitemr.  r?   r   round)r   	loop_timeavg_loop_timefactors       r*   __next__zTimeBudget.__next__  s   # 	y{{T[(4<77#oo%4  A%  #d&666KK1KK	dk 99{T11Q66Y011 P P " 6 6t7G H H M M O OIP P P P P P P P P P P P P P P )N''))..0043GG! 5#&uT-=-F'G'G#K#K #$ //! 76s   7DD	D	N)r)  r*  r   )rF   rG   rH   r   r<  rF  rK   r,   r*   r(  r(    sF        	 	 	 	  
" " " " "r,   r(  c                   ,    e Zd Zdej        fdZddZdS )ModelProvidercli_argsc                    || _         d| _        d| _        d| _        d| _        d| _        t          j                                        }|	                                dk    r	|j
        r|nd| _        |	                                dk    r	|j
        s|nd| _        |	                                dk    | _        i | _        | j         j        7d| j        | j         j        <   |                     | j         j        d           dS dS )z@Load models on demand and persist them across the whole process.NFr   default_model)draft_model_path)rI  	model_keyr   	tokenizerdraft_modelis_batchabler&   r-  r.  r?   pipelinepipeline_grouptensor_groupis_distributeddefault_model_mapr!   )r   rI  groups      r*   r   zModelProvider.__init__  s     
!##%%',zz||a'7'7H<M'7eeSWZZ\\A%%h.?%EET 	 $jjllQ. "$=*:ID"4=#67IIdm)OILLLLL +*r,   Nc                 L   | j                             ||          }| j        |||fk    r| j        | j        fS d | _        d | _        d | _        d | _        d| j        j        rdnd i}| j        j        r| j        j        |d<   |dk    ry| j        j        t          d          |p| j        j
        }| j        r)t          | j        j        | j        | j                  \  }n[t          | j        j        ||          \  }n;| j        rt          || j        | j                  \  }nt          |||          \  }| j        j        rj        j        _        |||f| _        || _        | _        fd}|dk    r9| j        j        -t          | j        j                  \  | _        } ||           n*|(|dk    r"t          |          \  | _        } ||           | j        0t%          d t'          | j                  D                       | _        | j        | j        fS )	Ntrust_remote_codeTchat_templaterK  zEA model path has to be given as a CLI argument or in the HTTP request)adapter_pathtokenizer_configc                 T    | j         j         k    rt          j        d           d S d S )NzdDraft model tokenizer does not match model tokenizer. Speculative decoding may not work as expected.)
vocab_sizer   warning)draft_tokenizerrN  s    r*   validate_draft_tokenizerz4ModelProvider.load.<locals>.validate_draft_tokenizerF  sA    )Y-AAAE     BAr,   c              3   6   K   | ]}t          |d           V  dS )mergeN)hasattrr   s     r*   r_   z%ModelProvider.load.<locals>.<genexpr>[  s=       $ $()7##$ $ $ $ $ $r,   )rU  rp   rM  r   rN  rO  rI  rX  rY  r   rZ  rT  r"   rR  rS  r!   use_default_chat_templatedefault_chat_templateallr   rP  )	r   
model_pathrZ  rL  r[  r   r`  r_  rN  s	           @r*   r!   zModelProvider.load  s   +//
JGG
>j,8HIII:t~-- 
  )H!Rd
 =& 	L040K_-((}"* 6   (E4=+EL" 	#/M')<d>O$ $ yy $(M'!-%5$ $ $ yy " 	#/ 3T5F$ $ yy $(!-%5$ $ $ y =2 	J&.*3*I	'$l4DE
"	 	 	 	 	 //)504T]5N0O0O-Do$$_5555).>/.Q.Q045E0F0F-Do$$_555# # $ $->tz-J-J$ $ $ ! !D z4>))r,   )NN)rF   rG   rH   argparse	Namespacer   r!   rK   r,   r*   rH  rH    sQ        M!3 M M M M.R* R* R* R* R* R*r,   rH  c                     t          | j        j        | j        j        | j        j        | j        j        | j        j        | j        j        |j        |	                    d          g          S )Nrh   )r   r   r   r   r   xtc_special_tokens)
r    r  r   r   r   r   r   r   eos_token_idencode)r   rN  s     r*   _make_samplerrn  b  se    !m!m!m!5m1"T""
   r,   c           	          t          | j        j        | j        j        | j        j        | j        j        | j        j        | j        j        | j        j                  S r\   )	r   r  r   r   r   r   r   r   r  )r   s    r*   _make_logits_processorsrp  q  sL    !&+$)%*  r,   c                 ,   |dk    rdS t          j        |  |dz
            }|d|                                         }| |                                         }|                    |          }t	          d t          |||          D                       S )zDReturns info dicts for the top `top_logprobs` tokens from `logprobs`r   rK   r   )kthNc              3   *   K   | ]\  }}}|||d V  dS ))idr#  r$  NrK   )r]   r^   sgs       r*   r_   z'_format_top_logprobs.<locals>.<genexpr>  sF        Aq! 1++     r,   )r&   argpartitiontolistconvert_ids_to_tokenstuplerS   )r  r	  rN  sorted_indicestop_indicestxtss         r*   _format_top_logprobsr~  }  s    qr_hYL14DEEEN ,/6688KK(//11L**;77D  ;l;;     r,   c            
           e Zd ZdedefdZd Zd ZddZd Z	d	 Z
d
 Zd Zd Zd Zd Z	 ddededeeeegdf                  fdZed             ZdS )ResponseGeneratormodel_providerr   c                    || _         || _        t                      | _        t	                      | _        t          j                                        	                                dk    | _
        t          j                                                                        | _        d| _        t          | j                  | _        | j                                         d S )Nr   F)target)r  r   r
   requestsr(  _time_budgetr&   r-  r.  r?   r/  rank_rank_stopr   	_generate_generation_threadstart)r   r  r   s      r*   r   zResponseGenerator.__init__  s    ,(&LL!~224499;;a?^((**//11

"("?"?"?%%'''''r,   c                 F    d| _         | j                                         d S r  )r  r  r   r   s    r*   stop_and_joinzResponseGenerator.stop_and_join  s$    
$$&&&&&r,   c                 8    | j                                          d S r\   )r  r   r   s    r*   r   zResponseGenerator.join  s    $$&&&&&r,   Nc                     d }| j         r| j        dk    rI	 || j                            |          }n| j                                        }n# t
          $ r Y nw xY w|                     |          S )Nr   timeout)r/  r  r  rp   
get_nowait
QueueEmpty_share_request)r   r  requests      r*   _next_requestzResponseGenerator._next_request  s    # 	tzQ&"m///@@GG"m6688G    ""7+++s   7A 
AAc                 x   | j         s|S t          j        t                    5  | j        dk    r|?t          j        t          j                            d                     	 d d d            d S t          j        t          j
        |                    }t          j        t          j                            |j                             t          j        t          j                            |                     |cd d d            S t          j                            d                                          }|dk    r	 d d d            d S t          j        |t          j                  }t          j                            |          }t          j        |          cd d d            S # 1 swxY w Y   d S )Nr   )dtype)r/  r&   r?  r   r  evalr-  r@  arraypickledumpsr?   rA  zerosuint8r   )r   objdatar?   s       r*   _share_objectzResponseGenerator._share_object  s   # 	JY()) 	. 	.zQ;GBN22155666		. 	. 	. 	. 	. 	. 	. 	. 8FL$5$566DGBN2249==>>>GBN22488999	. 	. 	. 	. 	. 	. 	. 	. ~--a005577199	. 	. 	. 	. 	. 	. 	. 	. 8D999D>11$77D!<--#	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	. 	.s&   A F/0BF/8F/AF//F36F3c                     | j         s|S |
|dd          nd }|                     |          }|d S ||d         nt                      }|g|R S r,  )r/  r  r
   )r   r  	shareablerqs       r*   r  z ResponseGenerator._share_request  sj    # 	N#*#6GABBKKD	&&y11	4".WQZZEGG~I~~r,   c                    |j         dk    r|j        }|j        }|j        }|j        rt          |           |r|j        st          j        d           | j	        j
        j        }|j        r.|                                }|                    |j                    |j        |f|ddd|S |                    t#          ||                    S |                    |j                  S )Nr  zReceived tools but model does not support tool calling. If you think this is an error, file an issue here: https://github.com/ml-explore/mlx-lm/issuesT)r  add_generation_prompttokenize)r  re   r  rf   has_chat_templater   r  r   r^  r  rI  chat_template_argsr  r   updateapply_chat_templaterm  rv   rs   )r   rN  r  r   re   r  rf   r  s           r*   	_tokenizezResponseGenerator._tokenize  s    6))'HME"/L* N'111 !; OF   &*%8%A%T", I);)@)@)B)B&&--d.GHHH4y4*.!	 
 )   !''X|(L(LMMM##GN333r,   c           	          |j         dk    rdS |j        d         d         dk    rdS d}|j        rLt          dt	          dt          |                    dz
  d          D ]}||          |j        k    r| dz
  } nd|fS )	Nr  )FrR   rR   rn   rk   r      T)r  re   r  rb   r`   rT   r  )r   rN  r  rs   prompt_checkpointr^   s         r*   _compute_prompt_checkpointz,ResponseGenerator._compute_prompt_checkpoint  s    6))9B'6119
 ! 	1c"c&kk22Q6::  1":!999)*Q%E : &&&r,   c                 4    | j         j        sdS |j        dS dS )NFT)r  rP  r
  )r   r   s     r*   _is_batchablezResponseGenerator._is_batchable   s'    "/ 	59 5tr,   c                     !"# d }d }d }d !d }d}i  g #d #fd	} fd} ! fd} j         rt          j                            t          j        j        d                                       t          j                                                  }	t          j        	                    |	            j
        sd }
|s%|t                     dk    rd nd} ||          }
|
5|
\  }}
}|||j        k    r                     |          r	                      ||
|          }n,# t          $ r}|                    |           Y d }~d }~ww xY wt#          "j        "j        "j        "j        "j        "j        "j        "j        "j        "fd|j        D             |	          }|                    |            j                                          j                            !|          \  }}t          |          t          |          z
  |_        |tA           j!        j                  } "                    "|
|          \  }}|#                    |g|j$        |gtK          |"          gtM          |          g|g
          \  }||d d          |"j'        |d |<   ~ j!        j(        j)        6 j!        j(        j)        }|j*        } j        +                    ||z
             E|	  j!        ,                    |j        j        |j        j-        |j        j.                  \  }"n-# t          $ r }|                    |           Y d }~d }~ww xY w                     |          s /                    ||
|f           |j        }"} j!        j0        !i  tc          |"j         j(        j2         j(        j3         j(        j4        ||          }#5                    ||
|f           Md}#5                    ||
|f           i|]t                     dk    r$|r d }d }d }d !|6                                 d }d}g } j7        D ]o}|8                                }|s nU|D ]O} |j9                 }|d         5                    |j:                   |j;        dk    r |d         <                    |j:                   |d                             t{          |d         j>        |j:        |j?        |j:                                                 |j;        t          |j?        |jA        |                               |j;        J|d                             d             j        B                    !|d         |j                    |j9        = |d         jC        r|5                    |j9                   Qq D                    |          }|rt          jE        t                    5  |G                    |d          }|H                                D ]7\  }}| vr
 |         } j        B                    !|d         |            |= 8	 d d d            n# 1 swxY w Y    j
        d S d S )NFc                 Z    r                                 S                     |           S r\   )r   r  )r  r   unprocessed_requestss    r*   get_next_requestz5ResponseGenerator._generate.<locals>.get_next_request  s1    # 3+//111))'222r,   c                     | D ];\  }}}|v r1|         d                              t          ||          |f           <d S )Nrqueue)putr`   )r   uid	processedtotalbatch_resultss       r*   progress_callbackz6ResponseGenerator._generate.<locals>.progress_callback  s`    )- U U%Y-''!#&x044c)U6K6KU5STTTU Ur,   c                     | D ]Q\  }}}|         }|d         sj                             |d         d |          t          |          d           Rd S )Nr   	cache_keyTr   )r   r   r   )promptsr  
prompt_endr   rsr  current_model_keyr   s        r*   checkpoint_callbackz8ResponseGenerator._generate.<locals>.checkpoint_callback  s    *1 	 	&Z"3',' !..%{OLj[L1KK#	 /    		 	r,   r   g?r  c                 >    g | ]}                     |d           S F)add_special_tokensrm  r]   rV   rN  s     r*   r|   z/ResponseGenerator._generate.<locals>.<listcomp>R  s<     . . . ) &,,Y5,QQ. . .r,   r  r  r  r  r  r  r  r  rM   r  rs   )cachessamplerslogits_processorsprompt_checkpoints)ctxr  r  detokenizerr   )r   )stop_tokenscompletion_batch_sizeprefill_batch_sizeprefill_step_sizeprompt_progress_callbackprompt_checkpoint_callbackTr  rm   r  r  r  )return_prompt_cachesr\   )Ir/  r&   r-  r@  randomstateviewuint64rA  r
  r  rT   r   r  r  	Exceptionr  r  r  r  r  r  r  r  r  r  rM   rO   r   r   r   r  r   r  r  insertr  rn  rp  r  rI  prompt_cache_bytesprompt_cache_nbytesr   r!   r   r   _serve_singlerM  r   decode_concurrencyprompt_concurrencyr  r   closer  nextr  r#  r%  	add_tokenr"  last_segmentr  r~  r	  r   r  r  r?  r   r   items)$r   current_modelcurrent_samplingcurrent_tokenizerbatch_generatordrain_batchr  r  r  r
  r  r  r  r   rs   er  r   restdo_checkpointcheckpoint_positionr  r  activer   uids_to_remove_	responsesrr   r  r   r  r  rN  r  s$   `                               @@@@r*   r  zResponseGenerator._generate  s"     !	3 	3 	3 	3 	3 	3 	3	U 	U 	U 	U 	U

	 
	 
	 
	 
	 
	 
	  	!>))")/!*<==BB29MMRRTTDINN4   * s	3G < (3M8J8JQ8N8N D 
 +*7;;; "(/% $/%33**400 4!!%0A7D!Q!Q$ ! ! !

1 ! ,)2)C(1(A&/&=$-$9%.%;'0'?"+"5%.%;&/&=. . . .-1_. . .  &  C  JJsOOO%55777"&"3"G"G)6# #KE4 .1[[3t99-DC*} 1$2E2K L L 77	7FSS 7M#6 -33 %w"/i"@"@!A+B4+H+H*I,?+@ 4  FS  #%+AAAY"('0'<&3* *M#& *3FR $ 3 < O!0!D)11%&.1III
 %,!+/+>+C+C J,dj.@$*BR, ,(yy % ! ! !

1 !  --d33 !**FGT+BCCC $(JM(1%(,(;(E%$&M&4$-$;.2m.N+/=+K*.-*I1B3F' ' 'O )//$0GHHH
 #'K(//$0GHHH !,}%%**" ,(,+/(,0),0)'--///*.&+!#* 9 9A / 4 4 6 6I$ & 9 9!.qu!5{+2217;;;?f44"=1;;AGDDDx(,,$ &} 5 B ! !
17 3 8 8 : : ! 4$%J0ACT!" !" 
 
 
 ?6"8,00666 -:: 16+3F   !.ae 4!%=5 9*11!%888598 "&!3!3N!C!C! 3#455 3 3!0!7!7* "8 " " 28 3 3-C"-77 (%23%7F -:: 16+3F   !.c 2 23	3 3 3 3 3 3 3 3 3 3 3 3 3 3 3Q * s	3 s	3 s	3 s	3 s	3sC   D* *
E4EE;=L9 9
M#MM#?A$Y00Y47Y4c                    |\  }}fd}	 | j         j        }| j         j        | j         j        }|                     ||          }t          j        j        j        j	        j
        j        j        j        j        fd|j        D             |          }                    |           |j        $t$          j                            |j                   t)          |          }t+          |          }	| j                                         | j                            | j         j        |          \  }
}t5          |          t5          |          z
  |_        |d d          }|
At9          | j         j                  }
| j         j        |
t9          | j         j                  z  }
t;          |||j        ||	|
||j        || j         j!                  D ]}                    tE          |j#        |j$        |j%        |j$                 &                                |j'        tQ          |j%        |j)                                       |*                    |j$                   |j+        r| j,        rt[                       n                    d            | j        .                    | j         j        ||
           d S # t^          $ r }                    |           Y d }~d S d }~ww xY w)Nc                 6                         | |f           d S r\   )r  )tokens_processedtokens_totalr  s     r*   progressz1ResponseGenerator._serve_single.<locals>.progress  s!    JJ(,788888r,   c                 >    g | ]}                     |d           S r  r  r  s     r*   r|   z3ResponseGenerator._serve_single.<locals>.<listcomp>  s<     & & &! $$Y5$II& & &r,   r  )r   rN  rs   r  samplerr  r   rO  r  r  r  )0r  r   rN  rO  r  r  r  r  r  r  r  r  r  r  rM   rO   r  r
  r&   r  rn  rp  r   r   r   rM  rT   r  r   r   r  r  rI  r  r"  rz   r#  r  rA  r%  r~  r	  r   r  r/  NotImplementedErrorr   r  )r   r  r   r  r   rO  rs   r  r  r  r   r  r  genr  r  rN  s                  @@r*   r  zResponseGenerator._serve_single  sL    '	9 	9 	9 	9 	9X	'-E+5I-9K ^^Iw==F $!*!; ) 9'5%1&3(7#-&3'5& & & &%)_& & &   C  JJsOOO y$	ty))) $D)44G 7 = = --///+??#-v KE4 &)[[3t99%<C"qqq	I})$*=*CDD&2>.t/B/NOOOE '#?"3"'!%!6)1"&-"A     

	SY/4466),L$*;Y  
 
 
   +++# + 41333E
 JJt **#-y%      	 	 	JJqMMMMMMMMM	s   J8K	 	
K3K..K3r  generation_argsr  c                     t                      | j                            ||f           fd}                                }t	          |t
                    r|| |            fS )Nc               3      K   	                                  } | d S t          | t                    r| t          | t                    r |   M| V  Qr\   )rp   r   r  rz  )responser  response_queues    r*   _innerz*ResponseGenerator.generate.<locals>._innerM  sv      
)--//#Eh	22 #"Nh.. (4))844
r,   )r
   r  r  rp   r   r  )r   r  r   r  r  r  r  s      `  @r*   generatezResponseGenerator.generateD  s     >7ODEEE	 	 	 	 	 	   ""c9%% 	IFFHH}r,   c                     | j         j        S r\   )r  rI  r   s    r*   rI  zResponseGenerator.cli_args`  s    "++r,   r\   )rF   rG   rH   rH  r   r   r  r   r  r  r  r  r  r  r  r  r  r  r   r   r9   r  r   rI  rK   r,   r*   r  r    sC       
(} 
(N 
( 
( 
( 
(' ' '' ' ', , , ,. . ..
 
 
4 4 4>' ' '$  Y3 Y3 Y3v_ _ _J CG	 " - $Hc3Z-=$>?	   8 , , X, , ,r,   r  c                       e Zd Zdddedee         f fdZd Zd$defd	Z	d$defd
Z
d Zd Zd Z	 	 	 	 	 	 	 	 d%dedeed         df         dee         dee         dee         deee                  deeeeeef                                    deee                  deee                  dee         defdZdedee         fdZ	 	 	 d&dee         dee         dee         fdZdefdZdefd Zd! Zd" Zd# Z xZS )'
APIHandlerN)system_fingerprintresponse_generatorr
  c                    t          t          j                              | _        || _        |pt	                      | _         t                      j        |i | dS )z9
        Create static request specific metadata
        N)r9   r;  createdr  r+   r
  superr   )r   r  r
  r   kwargs	__class__s        r*   r   zAPIHandler.__init__f  sX     49;;''"4"4"P8N8P8P$)&)))))r,   c                     |                      dd           |                      dd           |                      dd           d S )NzAccess-Control-Allow-Origin*zAccess-Control-Allow-MethodszAccess-Control-Allow-Headers)send_headerr   s    r*   _set_cors_headerszAPIHandler._set_cors_headersu  sM    6<<<7===7=====r,      status_codec                     |                      |           |                     dd           |                                  d S )NContent-typezapplication/jsonsend_responser  r  r   r  s     r*   _set_completion_headersz"APIHandler._set_completion_headersz  sE    ;''');<<<     r,   c                     |                      |           |                     dd           |                     dd           |                                  d S )Nr  ztext/event-streamzCache-Controlzno-cacher  r  s     r*   _set_stream_headerszAPIHandler._set_stream_headers  s[    ;''')<===*555     r,   c                 X    |                      d           |                                  d S )N   )r  end_headersr   s    r*   
do_OPTIONSzAPIHandler.do_OPTIONS  s.    $$S)))r,   c                    | j         | j        | j        d}| j        |vrE|                     d           |                                  | j                            d           dS t          | j        d                   }| j	        
                    |          }	 t          j        |                                          | _        n# t          j        $ r}t!          j        d| d|                                            |                     d           |                                  | j                            t          j        d	d
| i                                                     Y d}~dS d}~ww xY wd}t!          j        dt          j        | j        |                      t+          | j        t,                    sJ dt/          | j                               | j                            dd          | _        | j                            dd          | _        | j                            dd          | _        | j                            dd          | _        | j                            d| j        j        j                  | _        | j                            dd          | _         | j                            dd          | _!        | j!        /| j                            d| j        j        j!                  | _!        | j                            d| j        j        j"                  | _#        | j                            d| j        j        j$                  | _$        | j                            d| j        j        j%                  | _%        | j                            d| j        j        j&                  | _&        | j                            dd          | _'        | j                            dd           | _(        | j                            d!d          | _)        | j                            d"d           | _*        | j                            d#d          | _+        | j                            d$d           | _,        | j                            d%d          | _-        | j                            d&d          | _.        | j                            d'd          | _/        | j                            d(d          | _0        | j                            d)d*          | _1        | j                            d+d          | _2        | j                            d,          | _3        | 4                                 | j                            d-          }|pg }t+          |tj                    r|gn|} || j                             }| 6                    ||           dS ).z:
        Respond to a POST request from a client.
        )z/v1/completionsz/v1/chat/completionsz/chat/completions  	   Not FoundNContent-LengthzJSONDecodeError: z - Raw body: i  errorzInvalid JSON in request body: 	zIncoming Request Body: indentz Request should be dict, but got r?  Fstream_optionsr   rK  rO  r  adaptersmax_completion_tokensr  r   r   r   r   r           r      r   r   r   r  r   r   r   r  r	  rR   r
  r  rm   )7handle_text_completionshandle_chat_completionsr   r  r!  wfilewriter9   headersrfilereadr   r   decodebodyJSONDecodeErrorr   r'  r  rm  debugr   dictry   rp   r?  r+  requested_modelrequested_draft_modelr  rI  r  r   r  tempr   r   r   r   r   r   r   r   r   r  r   r   r   r  r	  r
  r  validate_model_parametersr   handle_completion)r   request_factoriescontent_lengthraw_bodyr  r*  rO   r  s           r*   do_POSTzAPIHandler.do_POST  s"   
  $;$($@!%!=
 
 9---((---J\***F T\*:;<<:??>22		
8??#4#455DII# 	 	 	MQaQQhoo>O>OQQRRR((---J
G%Ia%I%IJKKRRTT   FFFFF	 V
49V0T0T0TVVWWWIt
 
 	@ 	@?d49oo??	@ 	@ 

 immHe44"imm,<dCC#y}}WoFF%)Y]]=/%R%R" $	 7 @ Q!
 !
 y}}Z66)--(?FF?""immd5>I DO  9==42;@
 
 Y]]7D,C,L,RSS
Y]]7D,C,L,RSS
Y]]7D,C,L,RSS
"&)--0Dc"J"J'+y}}5NPR'S'S$ $	.@# F F%)Y]]3JB%O%O"!%/BC!H!H&*imm4Lb&Q&Q##y}}->DD!Y]]?C@@)--d;;	j%88 IMM."==IMM&$//	$(IMM2H$I$I!&&((( Y]]6**
%2
%/
C%@%@Pj\\j
 /#DI.00w
33333s   +C E7BE22E7c                 D	   t          | j        t                    st          d          t          | j        t
                    r| j        dk     rt          d          t          | j        t          t
          f          r| j        dk     rt          d          t          | j        t          t
          f          r| j        dk     s| j        dk    rt          d          t          | j	        t
                    r| j	        dk     rt          d          t          | j
        t          t
          f          r| j
        dk     s| j
        dk    rt          d          t          | j        t
                    r| j        dk     rt          d	          t          | j        t          t
          f          r| j        dk     rt          d
          t          | j        t
                    r| j        dk     rt          d          t          | j        t          t
          f          st          d          t          | j        t
                    r| j        dk     rt          d          t          | j        t          t
          f          st          d          t          | j        t
                    r| j        dk     rt          d          t          | j        t                    st          d          | j        dk    r,d| j        cxk     rdk    sn t          d| j        d          | j        pt          | j        t*                    st          d          	 d | j                                        D             | _        n# t          $ r t          d          w xY wt          | j        t                    rd| j        cxk    rdk    sn t          d          t          | j        t                    rd| j        cxk    rdk    sn t          d          t          | j        t4                    st          d          | j        )t          | j        t4                    st          d          | j        )t          | j        t
                    st          d          dS dS )zg
        Validate the model parameters passed in the request for the correct types and values.
        zstream must be a booleanr   z)max_tokens must be a non-negative integerz(temperature must be a non-negative floatr   z%top_p must be a float between 0 and 1z$top_k must be a non-negative integerz%min_p must be a float between 0 and 1z/num_draft_tokens must be a non-negative integerz/repetition_penalty must be a non-negative floatz6repetition_context_size must be a non-negative integerz(Presence penalty must be must be a floatz4presence_context_size must be a non-negative integerz5frequency_context_size must be a non-negative integerzlogprobs must be a booleanrR   r   z.top_logprobs must be between 1 and 10 but got ,Nz)logit_bias must be a dict of int to floatc                 4    i | ]\  }}t          |          |S rK   )r9   )r]   kvs      r*   
<dictcomp>z8APIHandler.validate_model_parameters.<locals>.<dictcomp>  s$    "Q"Q"QA3q661"Q"Q"Qr,   r.        ?z5xtc_probability must be a float between 0.00 and 1.00r)  z2xtc_threshold must be a float between 0.00 and 0.5zmodel must be a stringzadapter must be a stringzseed must be an integer)r   r?  rI   r   r  r9   r   r6   r   r   r   r  r   r   r   r   r   r  r  r	  r   r;  r  r   r   r<  r   r   r
  r   s    r*   r?  z$APIHandler.validate_model_parameters  s    $+t,, 	97888$/3// 	J4?Q3F3FHIII$*UCL99 	IT=MPQ=Q=QGHHH$*ucl33 	FtzA~~VWDEEE$*c** 	Edj1nnCDDD$*ucl33 	FtzA~~VWDEEE$/55 	P9NQR9R9RNOOO 42UCLAA	P&**NOOO47==	W+a//UVVV$/%>> 	IGHHH45s;;	U)A--STTT$05#,?? 	IGHHH46<<	V*Q..TUUU$-.. 	;9:::""A0A,G,G,G,GR,G,G,G,GVARVVV   ?&dot44 N !LMMMN"Q"Q9N9N9P9P"Q"Q"Q N N N !LMMMN t+U33	W,44444444UVVVt)511	T6:d>P6X6X6X6XTX6X6X6X6XRSSS$.44 	75666<#Jt|S,I,I#78889 DIs)C)C 6777 !   s   (M> >Nrz   r%  )lengthrm   prompt_token_countcompletion_token_countr  token_logprobsr&  rL   r}   reasoning_textrP   c                    |pg }|pg }|	pg }	| j         | j        | j        | j        | j        d|dgd}|rdd |D             i|d         d         d<   n-|r+dd t          ||          D             i|d         d         d<   | j        sZt          |t                    rt          |t                    st          d	          ||||z   d
|d<   ||dk    rd|i|d         d<   |d         d         }| j        
                    d          r| j        rdnd}d||
|	d||<   n9| j        dk    r|                    |           nt          d| j                   |S )a  
        Generate a single response packet based on response type (stream or
        not), completion type and parameters.

        Args:
            text (str): Text generated by model
            finish_reason (Union[Literal["length", "stop"], None]): The reason the
              response is being sent: "length", "stop" or `None`.
            prompt_token_count (Optional[int]): The number of tokens in the prompt,
              used to populate the "usage" field (not used when stream).
            completion_token_count (Optional[int]): The number of tokens in the
              response, used to populate the "usage" field (not used when stream).
            prompt_cache_count (Optional[int]): The portion of prompt_token_count
              that was found in the cache when servicing the request.
            token_logprobs (Optional[List[float]]): The log probabilities per token,
              in token order.
            top_tokens (Optional[List[Tuple[Dict[str, Any]]]]): List of outputs from
              _format_top_logprobs, giving info on the top N tokens at each token position.
            tokens (Optional[List[int]]): List of tokens to return with logprobs structure
            tool_calls (Optional[List[str]]): List of tool calls.
            reasoning_text (Optional[str]): The reasoning text generated by the model.

        Returns:
            dict: A dictionary containing the response, in the same format as
              OpenAI's API.
        r   )r   r%  )rt  r
  objectr   r  choicesro   c                 D    g | ]}|rt          |d          |          ni S )r   )r	  r;  )r]   r^   s     r*   r|   z0APIHandler.generate_response.<locals>.<listcomp>g  s@       @A!;D1A....  r,   rS  r  c                 6    g | ]\  }}t          ||           S ))rt  r$  rU  )r]   r^   rv  s      r*   r|   z0APIHandler.generate_response.<locals>.<listcomp>m  s6       .2aDAq)))  r,   z8Response type is complete, but token counts not providedprompt_tokenscompletion_tokenstotal_tokensusageNcached_tokensprompt_tokens_detailschat.completiondeltar   rl   )rn   ro   	reasoningr}   text_completion)rz   zUnsupported response type: )
request_idr
  object_typer<  r  rS   r?  r   r9   r   
startswithr  )r   rz   r%  rM  rN  r  rO  r&  rL   r}   rP  r	  r  choicekey_names                  r*   generate_responsezAPIHandler.generate_response+  s*   N (-2!'R%2
 /"&"9&)| %2 
 
  	  EQ  2HY":..
  	  69&.6Q6Q  2HY":. { 	-s335s;; !N  
 "4%; 25K K! !HW
 "-2D2I2I#%7>!"9: )$Q' &&'899 	O"&+<ww9H#+(	   F8 !222MMtM$$$$M4;KMMNNNr,   r  rO   c                     t          t           j         j         j                  t           j         j         j         j	         j
         j                  t           j         j         j         j         j         j         j                  | j         j         j         j         j         j        
  
        } fd}	  j                            ||          \  }n# t8          $ ru}                     d                                              j                             tC          j"        d| i          #                                           Y d	}~d	S d	}~ww xY w j$        r> %                    d
                                             tM          j'        d           n)                     d
           tM          j'        d           d}d}g }	d}
d fdfd}d}j(        r[tS          tU          j+                  dz
  dd          D ]4}j+        |         j,        k    r nj+        |         j-        k    rd} n5d}g g }g }d}d}d}|D ]D}tM          j'        |j.                   |r|j.        j/        k    rd}nr||j.        z  }ngj0        r|j.        j1        k    rd}d}nK|r5|j.        j2        k    r|	3                    |
           d}
d}n|
|j.        z  }
n||j.        z  }||j.        z  }3                    |j4                   |j        r|3                    |j5                   |j        dk    r|3                    |j6                   to          j8        j9        |          }|j:        r\|rdnd};                                 d	tU                    |j<        z
           |d	tU          |          |j=        z
           }d} nĉ j$        r|st}          fdj9        D                       r|s|	s|r ?                    |d	 ||	          |          } j                             dtC          j"        |           d#                                            j        @                                 d}d}g }	|jA        |jA        }F|r|
r|	3                    |
            j$        rn ?                    || ||	          |          } j                             dtC          j"        |           d#                                            j        @                                  jB         jB        d         r C                    tU          j+                  tU                    jD                  } j                             dtC          j"        |           d#                                            j        @                                  j                             d#                                            j        @                                 d	S  ?                    ||tU          j+                  tU                    jD        ||| ||	          
  
        }tC          j"        |          #                                }d}tM          j'        d tC          j"        ||!                       E                    d"t          tU          |                                                                  j                             |            j        @                                 d	S )#a  
        Generate a response to a prompt and send it to the client in a single batch.

        Args:
            prompt (List[int]): The tokenized prompt.
            stop_words (List[str]): A list of stop words passed to the
                stopping_criteria function
        )r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r  )
r   r  r  rO   r  r  r  r	  r
  r  c                 $   t          j        d|  d|            j        rm	 j                            d|  d| d                                           j                                         d S # t          t          t          f$ r Y d S w xY wd S )NzPrompt processing progress: /z: keepalive 

)
r   r   r?  r2  r3  rm  flushBrokenPipeErrorConnectionResetErrorOSError)processed_tokensrZ  r   s     r*   keepalive_callbackz8APIHandler.handle_completion.<locals>.keepalive_callback  s    LP/?PP,PP   { 	J$$L'7LL,LLLSSUU   J$$&&&&&')=wG   DD	 	s   AA2 2BB)r  r$  r'  Nr  zStarting stream:zStarting completion:Fr3   r   c                     |                      dd           pt          t          j                              }t	          j        | d         d          | d<   | d|d}j        r
|d<   dz  |S )	Nrt  r   F)ensure_asciir~   )r~   ry   rt  r   r   )r   r   uuiduuid4r   r  r?  )r   tool_call_idoutr   tool_idxs      r*   format_tool_callz6APIHandler.handle_completion.<locals>.format_tool_call  s    $==t44IDJLL8I8IL%)Z+&U& & &Ik" &"" C
 { 'GAJr,   c                     | sg S g }| D ]r}                     |j                  }t          |t                    r"|                    fd|D                        T|                     |                     s|S )Nc              3   .   K   | ]} |          V  d S r\   rK   )r]   tcry  s     r*   r_   zDAPIHandler.handle_completion.<locals>.parse_tools.<locals>.<genexpr>  s/      !H!H2"2"22"6"6!H!H!H!H!H!Hr,   )r  r  r   r   extendr   )r}   r   	tool_textparsedr  ry  r  s       r*   parse_toolsz1APIHandler.handle_completion.<locals>.parse_tools  s     	F' < <	GMBBfd++ <MM!H!H!H!H!H!H!HHHHHMM"2"26":":;;;;Mr,   r   rR   TrL  r}   rm   c              3   8   K   | ]}t          |          V  d S r\   )rd   )r]   sequencerL   s     r*   r_   z/APIHandler.handle_completion.<locals>.<genexpr>U  sA        $ )::     r,   )r}   rP  zdata: rk  include_usagezdata: [DONE]

)rO  r&  rL   rP  r}   r(  zOutgoing Response: r)  r&  )Gr  r   r<  r=  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r	  r
  r  r  r  r  r  r!  r2  r3  r   r  rm  r?  r  r   r:  r  rb   rT   rs   r  r  rz   r  r  r  r  r   r#  r$  r&  rW   rM   r  rC   rm   rD   rE   ra   rg  rl  r%  r+  completion_usage_responser  r  r   )r   r  rO   r   rq  r  r  in_tool_callmade_tool_callr}   r~  r  in_reasoningr^   rP  rO  r&  rz   segmentr%  r  stop_conditionresponse_jsonr*  r  ry  rL   rx  s   ``                      @@@@r*   r@  zAPIHandler.handle_completion  sb    #"*0  
 ' ,jjj $ 4"0   ,?#'#:(,(D!%!6&*&@"&"8'+'B   "!2]*!%!:;
 
 
B	 	 	 	 	 
	 3<<"4 =  MC
  	 	 	((---JTZQ&(9::AACCDDDFFFFF		 ; 	2$$S)))M,----((---M0111 
		 	 	 	 	 	 
	 
	 
	 
	 
	 
	 
	  	3sz??Q.B77  :a=C$444EZ]c&888#'LE 9  
  ! G	2 G	2CM#(###  $8s},,#(LL"ch.NN% $#(c6I*I*I!%# 	$8s000%%i000 "I#(LL)II 38# MM#)$$$} 3%%ck222  1$$!!#.111 /!(	 N & 0> JF


 J#f++0J"J JKIc$ii.*IIIJ{ $< $    (+(@    $  $
 $n $#55#.;z#:#:'5	  6    H J$$%Hdj.B.B%H%H%H%O%O%Q%QRRRJ$$&&&%'N G!#J , # 1  	)I 	)i(((; (	--&;z22-	 .  H J@dj&:&:@@@GGIIJJJJ".43F3W.99
OOKK* 
 
  !D$*X*>*>!D!D!D!K!K!M!MNNN
  """J/6688999J--CJF&-%-&;z22 .  H !Jx007799MFMU
8F0S0S0SUUVVV -s3}3E3E/F/FGGGJ]+++Js    C, ,
E+6A*E&&E+c           
      z    | j         | j        d| j        | j        g ||||z   dd}||dk    rd|i|d         d<   |S )Nr^  rW  )rt  r
  rR  r   r  rS  r[  r   r\  r[  r]  )rb  r
  r<  r  )r   rM  rN  r  r  s        r*   r  z$APIHandler.completion_usage_response  sw     /"&"9')|!3%; 25K K 
 
 ).@A.E.E!3:HW56 r,   c           	         | j         }d|v s
J d            dt          j                     | _        | j        rdnd| _        t          dd|d         |                    d          pd	|                    d
                    S )z
        Handle a chat completion request.

        Returns:
            mx.array: A mx.array of the tokenized prompt from the request body
        re   z Request did not contain messagesz	chatcmpl-zchat.completion.chunkr^  r  r3   r  Nrf   )r8  rt  ru  rb  r?  rc  r  rp   )r   r8  s     r*   r1  z"APIHandler.handle_chat_completions  s     yT!!!#E!!! 5djll446:kX22GX HHW%HH^$$
 
 	
r,   c                     dt          j                     | _        d| _        d| j        v s
J d            t          d| j        d         g dd          S )z
        Handle a text completion request.

        Returns:
            mx.array: A mx.array of the tokenized prompt from the request body
        zcmpl-ra  rs   z Request did not contain a promptrz   N)rt  ru  rb  rc  r8  r  r   s    r*   r0  z"APIHandler.handle_text_completions  se     1$*,,00,49$$$&H$$$ Ih
 
 	
r,   c                 .   | j                             d          r|                                  dS | j         dk    r|                                  dS |                     d           |                                  | j                            d           dS )z9
        Respond to a GET request from a client.
        z
/v1/modelsz/healthr$  r%  N)r   rd  handle_models_requesthandle_health_checkr  r!  r2  r3  r   s    r*   do_GETzAPIHandler.do_GET  s     9-- 	+&&(((((Y)##$$&&&&&((---J\*****r,   c                     |                      d           |                                  | j                            d                                           | j                                         dS )z@
        Handle a GET request for the /health endpoint.
        r  z{"status": "ok"}N)r  r!  r2  r3  rm  rl  r   s    r*   r  zAPIHandler.handle_health_check  sg     	$$S)))
+2244555
r,   c                 *   	
                       d                                             g d	 j                            d          }d
t	          |          dk    rd                    |dd                   
	
fdt                      }fd|j        D             } fd|D             } j        j	        j
        rqt           j        j	        j
                  }|                                r?t          |                                          }|                    |d	 j        d
           d|d}t#          j        |                                          } j                            |            j                                         dS )zC
        Handle a GET request for the /v1/models endpoint.
        r  )zconfig.jsonzmodel.safetensors.index.jsonztokenizer_config.jsonrj  N   c                     | j         dk    rdS d| j        vrdS | j        k    rdS d | j        d         j        D             t	          fdD                       S )Nr   Fmainc                 &    h | ]}|j         j        S rK   )	file_pathname)r]   fs     r*   	<setcomp>zLAPIHandler.handle_models_request.<locals>.probably_mlx_lm.<locals>.<setcomp>  s    LLLq!+*LLLr,   c              3       K   | ]}|v V  	d S r\   rK   )r]   r  
file_namess     r*   r_   zLAPIHandler.handle_models_request.<locals>.probably_mlx_lm.<locals>.<genexpr>  s'      661qJ666666r,   )	repo_typerefsrepo_idfilesrf  )repor  r  filter_repo_ids    @r*   probably_mlx_lmz9APIHandler.handle_models_request.<locals>.probably_mlx_lm   s    ~((uTY&&u)dln.L.LuLLDIf4E4KLLLJ6666666666r,   c                 *    g | ]} |          |S rK   rK   )r]   r  r  s     r*   r|   z4APIHandler.handle_models_request.<locals>.<listcomp>  s8     
 
 
OOD4I4I

 
 
r,   c                 0    g | ]}|j         d j        dS )r   rt  rR  r  )r  r  )r]   r  r   s     r*   r|   z4APIHandler.handle_models_request.<locals>.<listcomp>  s?     
 
 
 	 l!< 
 
 
r,   r   r  r   )rR  r  )r  r!  r   r<   rT   r   r   reposr  rI  r   r   existsr   resolver   r  r   r  rm  r2  r3  rl  )r   partshf_cache_infodownloaded_modelsmodelsrg  model_idr  r  r  r  r  s   `        @@@r*   r  z APIHandler.handle_models_request  s    	$$S)))XXX	$$u::>> XXeABBi00N	7 	7 	7 	7 	7 	7 '((
 
 
 
*0
 
 


 
 
 
 *
 
 
 "+1 
	d5>DEEJ  "" z113344&")#'<    %f55
8,,3355
'''
r,   )r  )NNNNNNNN)NNN) rF   rG   rH   r  r   r   r   r  r9   r  r  r"  rD  r?  r   r   r   r6   r   r   r   r;  rg  r  r@  r  r1  r0  r  r  r  __classcell__)r  s   @r*   r	  r	  e  s       
 -1	* * *-* %SM	* * * * * *> > >
! !3 ! ! ! !
! !s ! ! ! !  N4 N4 N4`P8 P8 P8l -104,004<@&**.(,j jj W%56<=j %SM	j
 !)j %SMj !e-j T%S#X"789j c#j T#Y'j !j 
j j j jX@): @S	 @ @ @ @H -104,0	 $SM !) %SM	   2
): 
 
 
 
,
): 
 
 
 
&+ + +  8 8 8 8 8 8 8r,   r	  hostportc                    | |f}t          j        |t           j        t           j        d}t	          t          |                    \  |_        }}}} ||fd          }t          j        d           t          j
        d|  d| d           	 |                                 d S # t          $ r, |                                                                  Y d S w xY w)N)ry   flagsc                  8     g| R dt                      i|S )Nr
  )r+   )r   r  handler_classr  s     r*   <lambda>z"_run_http_server.<locals>.<lambda>;  sB    !
 !
 !
 !
577!
 	!
 !
 r,   z\mlx_lm.server is not recommended for production as it only implements basic security checks.zStarting httpd at z	 on port z...)socketgetaddrinfoSOCK_STREAM
AI_PASSIVEr  iteraddress_familywarningswarnr   r   serve_foreverKeyboardInterruptshutdownr  )	r  r  r  server_classr  server_addressinfosr  httpds	     ` `    r*   _run_http_serverr  -  s(    D\N	f08I  E <@U;L;L8LAq.L	
 	
 	
 	
 	
 E M	4   L>d>>T>>>???+ + + +((******+s   B* *2C C r  c                    t           j                                        }t          |j        j                  }t          ||          }|                                dk    rt          | ||           d S |	                                 d S r   )
r&   r-  r.  r   rI  prompt_cache_sizer  r  r  r   )r  r  r  r  r  rV  r   r  s           r*   runr  N  s~     N!!E!."9"KLLL*><HHzz||qt%788888!!!!!r,   c                     t          j        d          } |                     dt          d           |                     dt          d           |                     dt          d	d
           |                     dt          dd           |                     dt          dd            |                     dt          dd           |                     ddd           |                     dt          dg dd           |                     dt          dd d!"           |                     d#dd$           |                     d%t
          d&d'           |                     d(t
          d)d*           |                     d+t          d,d-           |                     d.t
          d&d/           |                     d0t          d1d2           |                     d3t          j        d4d5           |                     d6t          d7d8           |                     d9t          d:d;           |                     d<t          d=d>           |                     d?t          d@dA           |                     dBt          dC           |                     dDddE           | 	                                }t          j                                        r-t          j                    dF         }t          j        |           t          j        t#          t          |j                                        d           dGH           t)          |j        |j        t/          |                     d S )INzMLX Http Server.)descriptionz--modelz8The path to the MLX model weights, tokenizer, and config)ry   helpz--adapter-pathz9Optional path for the trained adapter weights and config.z--hostz	127.0.0.1z-Host for the HTTP server (default: 127.0.0.1))ry   defaultr  z--porti  z(Port for the HTTP server (default: 8080)z--draft-modelz,A model to be used for speculative decoding.)ry   r  r  z--num-draft-tokensz:Number of tokens to draft when using speculative decoding.r  z--trust-remote-code
store_truez)Enable trusting remote code for tokenizer)actionr  z--log-levelINFO)DEBUGr  WARNINGERRORCRITICALz%Set the logging level (default: INFO))ry   r  rS  r  z--chat-templater3   z)Specify a chat template for the tokenizerF)ry   r  r  requiredz--use-default-chat-templatezUse the default chat templatez--tempr.  z+Default sampling temperature (default: 0.0)z--top-prK  z-Default nucleus sampling top-p (default: 1.0)z--top-kr   z3Default top-k sampling (default: 0, disables top-k)z--min-pz5Default min-p sampling (default: 0.0, disables min-p)z--max-tokensi   z;Default maximum number of tokens to generate (default: 512)z--chat-template-argsznA JSON formatted string of arguments for the tokenizer's apply_chat_template, e.g. '{"enable_thinking":false}'z{}z--decode-concurrency    zFWhen a request is batchable then decode that many requests in parallelz--prompt-concurrency   zFWhen a request is batchable then process that many prompts in parallelz--prefill-step-sizei   z0Step size for prefill processing (default: 2048)z--prompt-cache-sizer   z@Maximum number of distinct KV caches to hold in the prompt cachez--prompt-cache-bytesz&Maximum size in bytes of the KV cachesz
--pipelinez,Use pipelining instead of tensor parallelism max_recommended_working_set_sizez)%(asctime)s - %(levelname)s - %(message)s)levelformat)rh  ArgumentParseradd_argumentr   r9   r6   r   r   r@   
parse_argsr&   metalis_availabler'   set_wired_limitr   basicConfiggetattr	log_levelr8   r  r  r  rH  )parserr   wired_limits      r*   r  r  ^  sr   $1CDDDF
G    
 H    
 <	     7	     ;	     I	     8    
 AAA4     8     %,    
 :	     <	     B	     D	     J	     Z B	     U	     U	     ?	     O	     5    
 ;    
 D	x (n&&'IJ
;'''gt~3355t<<:    	49mD1122222r,   __main__z}Calling `python -m mlx_lm.server...` directly is deprecated. Use `mlx_lm.server...` or `python -m mlx_lm server ...` instead.r\   )[rh  r   heapqr   r   r  r(   r  r;  rt  r  collectionsr   dataclassesr   r   http.serverr   r   pathlibr   queuer	   r  r
   	threadingr   typingr   r   r   r   r   r   r   r   r   r   mlx.corecorer&   huggingface_hubr   _versionr   r  r   r   r   models.cacher   r   r   sample_utilsr   r    utilsr!   r"   r+   r@   rB   r9   r   r   rW   rI   rd   r;  rv   r   r   r   r   r   r  r  r  r"  r(  rH  rn  rp  r~  r  r	  r  r  r  rF   printrK   r,   r*   <module>r     s                    ( ( ( ( ( ( ( ( C C C C C C C C       % % % % % %                                          * * * * * * ! ! ! ! ! ! H H H H H H H H H H         
 ? > > > > > > > % % % % % % % %N N N
	% 	% 	%    J   'LI'L'L DI'L S		'L
 'L 'L 'L 'LTE Ex ED E E E E 4: Xd^    0= = =D{
 {
 {
 {
 {
 {
 {
 {
|                                 3 3 3 3 3 3 3 3 + + + + + + + + ! ! ! ! ! ! ! !( & & & & & & & &%" %" %" %" %" %" %" %"Pj* j* j* j* j* j* j* j*Z  	 	 	uT#s(^?T    W, W, W, W, W, W, W, W,tE E E E E' E E EX %+ +
+
+ + + +J %" "
"
" "" " " " K3 K3 K3\ z	E	L   	DFFFFF r,   