o
    :/iR                     @   sD  d dl Z d dlZd dlmZmZmZmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ erkd dlmZ d dl m!  m"  m#Z$ n	eZede% dZ$ee&Z'ed Z(eddde(f Z)ed Z*eddddde)e*f Z+ed Z,eG dd dZ-dS )    N)TYPE_CHECKINGAnyLiteralget_args)FieldSkipValidationmodel_validator)Self)
LoadConfig)ModelConfig)ParallelConfig)config)init_logger)get_hf_text_config)	safe_hash)
LazyLoaderhas_arctic_inference)PretrainedConfigmodel_executorz'vllm.model_executor.layers.quantization)deepseek_mtpmimo_mtpglm4_moe_mtpglm4_moe_lite_mtpglm_ocr_mtp	ernie_mtpnemotron_h_mtpexaone_moe_mtpqwen3_next_mtpqwen3_5_mtplongcat_flash_mtpmtppangu_ultra_moe_mtpstep3p5_mtpeagleeagle3extract_hidden_states	ngram_gpungrammedusamlp_speculatordraft_modelsuffix)strictprobabilisticc                	   @   s  e Zd ZU dZdZedB ed< 	 edddZe	ed< 	 dZ
edB ed< 	 dZedB ed< 	 edd	d
Ze	dB ed< 	 dZe	dB ed< 	 dZejdB ed< 	 edd	d
Ze	dB ed< 	 dZedB ed< 	 dZedB ed< 	 dZeed< 	 dZeed< 	 edd	d
Ze	dB ed< 	 edd	d
Ze	dB ed< 	 dZedB ed< 	 dZeed< 	 dZee ed< 	 dZee  ed< 	 dZ!ee ed< 	 dZ"ee  ed< 	 dZ#e	ed< 	 dZ$e	ed< 	 d Z%e&ed!< 	 d"Z'e&ed#< 	 dZ(e)dB ed$< 	 d%Z*e+ed&< 	 d'efd(d)Z,e-d*e.d'e.fd+d,Z/d-d. Z0d/d0 Z1e-d1e	dB d2e	d3e	d'e	fd4d5Z2e-de d6e	dB d7e.d'e	fd8d9Z3d:d; Z4e-de d6e	d'e fd<d=Z5e6d>d?d'e7fd@dAZ8dBdC Z9e:d'e	fdDdEZ;d'efdFdGZ<d'efdHdIZ=d'efdJdKZ>d'efdLdMZ?d'efdNdOZ@dS )PSpeculativeConfigz'Configuration for speculative decoding.Nenforce_eagerr   )defaultgtnum_speculative_tokensmodelmethod   )r0   gedraft_tensor_parallel_sizetensor_parallel_sizequantizationmax_model_lenrevisioncode_revisionFdisable_padded_drafter_batchuse_local_argmax_reductionprompt_lookup_maxprompt_lookup_minspeculative_token_treeparallel_draftingtarget_model_configtarget_parallel_configdraft_model_configdraft_parallel_config   suffix_decoding_max_tree_depthi'  #suffix_decoding_max_cached_requestsg      ?suffix_decoding_max_spec_factorg?suffix_decoding_min_token_probdraft_load_configr,   rejection_sample_methodreturnc                 C   sh   g }| j dv }|| |r&| jdur&t| jjdd}|dur&|t| tt| dd	 }|S )a  
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        r$   r%   N eagle_aux_hidden_state_layer_idsF)usedforsecurity)
r4   appendrE   getattr	hf_configtupler   strencode	hexdigest)selffactorsuses_aux_hidden_states	layer_idshash_str r^   d/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/config/speculative.pycompute_hash   s   

zSpeculativeConfig.compute_hashrT   c                 C   s  | j d }| jdv rd| _| jdkr!t| dd }| |dgd | jdv r)d| _| jdkr=t| dd }| |d	gd | j d d
krWd| _t| dd }| d|dgd | j d dkrpd| _t| dd }| |dgd | j d dkrd| _t| dd }| d|dgd | j d dkrd| _t| dd }| d|dgd | jdkrd| _| jdkrt| dd }| |dgd | jdkrt| dr| jdkrd| _| jdkrt| dd}| |dgd | jdkrd| _| jdkrt| dd }| |d gd | jd!krd"| _| jd"kr!t| dd }| |d#gd | jd$v rC| jd%k}d&| _t| d'd }| ||r=d(nd)gd | jd*kr[d+| _t| dd}| |d,gd | jd-krsd.| _t| dd}| |d/gd |d0kr| d1d2gi | S )3Nr   )deepseek_v3deepseek_v32glm_moe_dsar   num_nextn_predict_layersDeepSeekMTPModel)	n_predictarchitecturespangu_ultra_moer!   OpenPanguMTPModelMiMoForCausalLMr   MiMoMTPModel)num_hidden_layersrf   rg   Glm4MoeForCausalLMr   Glm4MoeMTPModelGlm4MoeLiteForCausalLMr   Glm4MoeLiteMTPModelGlmOcrForConditionalGenerationr   GlmOcrMTPModelernie4_5_moer   ErnieMTPModel
nemotron_hr   r5   NemotronHMTPModel
qwen3_nextr   Qwen3NextMTP
exaone_moer   ExaoneMoeMTP)qwen3_5qwen3_5_moer|   r   mtp_num_hidden_layersQwen3_5MoeMTP
Qwen3_5MTPlongcat_flashr   LongCatFlashMTPModelstep3p5r"   
Step3p5MTPMistralLarge3ForCausalLMrg   EagleMistralLarge3ForCausalLM)rg   
model_typerS   updatehasattrrd   )rT   initial_architecturerf   is_moer^   r^   r_   hf_config_override   s   



















z$SpeculativeConfig.hf_config_overridec                 C   s  | j d u r| jdv rd| _ nd| _ | j ttv r'| j dkr'td| j  d| _ | jd u r| jd ur| j dkrW| jd u r?td| jj	j
dkrId| _| jj| _| jsV| jj| _n(| j dv r`d| _n| j d	krid	| _n| j d
krrd
| _n| j dkr{d| _ntd| j dv rd| _ | j dv r| jd u r| jd u rd| _d| _n%| jd u r| jd u rtd| j| _n| jd u r| jd u rtd| j| _| j| jkrtd| j d| j | j| _| j| _| S | j d
kr|   | S | j dkr>ddlm} d| _d| _d| _t| jdr| jj }nt| jtrd| jv r| jd }ni }t| j| _|| jjfi || j_|   | j| _| S d| _d| _| jd urtdAi d| jddd| jjd| jjd| jjd| jj d| jj!d| jj"d| jj#d| j$d | j%d!| jj&d"| jj'd#| jd$| jjd%| jj(d&t)j*d'| jj+| _| j d(v rnrd)| jj, v rd*| _ ned+| jj, v rd+| _ nX| jjj
d,krd,| _ nL| jjj
d-krd-| _ n@| jjj
ttv rd| _ | jd.krtd/ n'| jjj
d0v rd0| _ | jd.krtd1 n| j dkrn	t-d2| j  d3| j d(v rKdd4l.m/} dd5l0m1} t| jj||fr9n|| jj| j d*d6}|| j_|   | jd ur_t| jjd7r_| j| jj_2t3| jjd8d }|d ur| jd u rv|| _n| j|kr| j| dkrtd9| j d:|| j4d u r| jd u rtd;t5d<d= t6| jD | _4nt78| j4}t5t9|d>d? d@| _4t):| j| j;| jj| _;t)<| j'| jj'| jj'| j_'t)=| j| j;| _| S )BN)r'   z[ngram]r'   r*   r    z0method `%s` is deprecated and replaced with mtp.z+target_model_config must be present for mtprb   Tr&   r+   r%   zBnum_speculative_tokens was provided but without speculative model.)r'   r&      z[Either prompt_lookup_max or prompt_lookup_min must be provided when using the ngram method.zprompt_lookup_min=z must be <= prompt_lookup_max=r   )ExtractHiddenStatesConfigrT   r3   runnerdraft	tokenizertokenizer_modetrust_remote_codeallowed_local_media_pathallowed_media_domainsdtypeseedr;   r<   tokenizer_revisionspec_target_max_model_lenr9   r/   max_logprobshf_overridesconfig_format)r#   r$   zeagle-r#   r$   r(   r)   r5   zEnabling num_speculative_tokens > 1 will run multiple times of forward on same MTP layer,which may result in lower acceptance rater   z`LongCat MTP models only have one layer. Might need some code changes to support multiple layers.z!Unsupported speculative method: '')SpeculatorsConfig)EAGLEConfig)r4   r   num_lookahead_tokensrf   znum_speculative_tokens:z  must be divisible by n_predict=zpA speculative model was provided, but neither `speculative_token_tree` nor `num_speculative_tokens` was providedc                 S   s   g | ]}|d  d qS )r5   )r   r^   ).0ir^   r^   r_   
<listcomp>>  s    z3SpeculativeConfig.__post_init__.<locals>.<listcomp>c                 S   s   t | | fS N)len)tr^   r^   r_   <lambda>D  s    z1SpeculativeConfig.__post_init__.<locals>.<lambda>)keyr^   )>r4   r3   r   MTPModelTypesloggerwarningr2   rC   
ValueErrorhf_text_configr   r/   r9   r@   r?   rE   rD   rF   _validate_suffix_decoding5vllm.transformers_utils.configs.extract_hidden_statesr   r   rT   to_dict
isinstancedictcopyupdate_arch_r   r   r   r   r   r   r   r   r;   r<   r   r:   r   r.   r   r   lowerNotImplementedErrorvllm.transformers_utils.configsr   %vllm.transformers_utils.configs.eagler   r   rS   rA   rV   rangeastliteral_evalsorted_verify_and_get_draft_tpr7   #_maybe_override_draft_max_model_lencreate_draft_parallel_config)rY   r   rT   r   r   eagle_configrf   tree_choicesr^   r^   r_   __post_init__R  s  


















 7 
 5 

  	




		zSpeculativeConfig.__post_init__c                 C   s   t  std| jd u r| j| _td| j | jdk r%td| j d| jdk r3td| j d| jdk rAtd	| j dd| j	  krLdksVn td
| j	 dd S )NzdArctic Inference is required for suffix decoding. Install via `pip install arctic-inference==0.1.1`.z;Defaulted num_speculative_tokens to %s for suffix decoding.r5   zsuffix_decoding_max_tree_depth=z must be >= 1r   z$suffix_decoding_max_cached_requests=z must be >= 0z suffix_decoding_max_spec_factor=zsuffix_decoding_min_token_prob=z must be in [0, 1])
r   ImportErrorr2   rH   r   r   r   rI   rJ   rK   rY   r^   r^   r_   r   ^  sF   



z+SpeculativeConfig._validate_suffix_decodingspeculative_max_model_lendraft_max_model_lentarget_max_model_lenc                 C   sN   | dur"| |krt d| d|| |kr t d| d|| S t||S )a  Determine the max sequence len for the draft model. This is usually
        the draft_max_model_len, but may be the target_max_model_len if it is
        less than the draft_max_model_len, or may be speculative_max_model_len
        if it is specified.

        This is necessary so that sequences do not exceed the capacity of the
        draft model or the target model.

        speculative_max_model_len is mainly used for testing that sequences can
        skip speculation.
        Nzspeculative_max_model_len=z+ cannot be larger than draft_max_model_len=z, cannot be larger than target_max_model_len=)r   min)r   r   r   r^   r^   r_   r     s$   z5SpeculativeConfig._maybe_override_draft_max_model_len&speculative_draft_tensor_parallel_sizedraft_hf_configc                 C   s^   |du r|j dkrd}| jdkrtd|j  |S | j}|S |d| jfvr-td|d|S )z
        Verifies and adjusts the tensor parallel size for a draft model
        specified using speculative_draft_tensor_parallel_size.
        Nr)   r5   zV%s cannot currently be run with tp>1; setting speculative_draft_tensor_parallel_size=1z'speculative_draft_tensor_parallel_size=zB cannot be other value than 1 or target model tensor_parallel_size)r   r8   r   r   r   )rD   r   r   r^   r^   r_   r     s(   


z*SpeculativeConfig._verify_and_get_draft_tpc                 C   sL   t | jj| j_| j | j_| jj| jj| j\}}|| j_	|| j_
dS )z
        EagleConfig and ExtractHiddenStatesConfig update architectures, so update all
        architectures-related fields in self.draft_model_config
        N)r   rE   rT   r   get_model_arch_configmodel_arch_configregistryinspect_model_clsrg   _model_info_architecture)rY   
model_infoarchr^   r^   r_   r     s   zSpeculativeConfig.update_arch_c              	   C   s&   t | j|| j| j| j| j| jd}|S )zCreate a parallel config for use by the draft worker.

        This is mostly a copy of the target parallel config, except the tp_size.
        )pipeline_parallel_sizer8   distributed_executor_backendmax_parallel_loading_workersdisable_custom_all_reduceray_workers_use_nsightplacement_group)r   r   r   r   r   r   r   )rD   r   rF   r^   r^   r_   r     s   	
z.SpeculativeConfig.create_draft_parallel_configafter)modec                    s    j d ur	td jd u rtd jdkr td j d jr* j j g d} jdv rQ jrQt fdd	|D sQt j d
| d jj	j
    S )Nz{'tensor_parallel_size' is not a valid argument in the speculative_config. Please pass 'draft_tensor_parallel_size' instead.z}num_speculative_tokens must be provided with speculative model unless the draft model config contains an n_predict parameter.r   z9Expected num_speculative_tokens to be greater than zero (z).)llamaqwenminicpmgpt_oss
hunyuan_vlhunyuan_v1_denseafmoeru   deepseek_v2ra   kimi_k2kimi_k25rO   c                 3   s    | ]
}| j jjv V  qd S r   )rC   r   r   )r   supported_modelr   r^   r_   	<genexpr>  s
    
z1SpeculativeConfig._verify_args.<locals>.<genexpr>z is only supported for z@ models. Got self.target_model_config.hf_text_config.model_type=)r8   r   r2   rE   verify_with_parallel_configrF   r4   rC   anyr   r   &verify_equal_vocab_size_if_draft_model)rY   aux_hidden_states_supportedr^   r   r_   _verify_args  sB   



zSpeculativeConfig._verify_argsc                 C   s`   | j dkr(| jd ur*| jd ur,| j }| j }||kr.td| d| dd S d S d S d S )Nr*   zUTarget and draft model should have the same vocabulary size. Target model vocab_size=z. Draft model vocab_size=zd. Using models with different tokenizers can cause out-of-bounds errors during speculative decoding.)r4   rC   rE   get_vocab_sizer   )rY   target_vocab_sizedraft_vocab_sizer^   r^   r_   r   )  s"   




z8SpeculativeConfig.verify_equal_vocab_size_if_draft_modelc                 C   s(   d}| j r
| jd }|  r|d7 }|S )zs
        Calculate the maximum number of new slots that might be added to the batch
        when drafting.
        r   r5   )rB   r2   uses_draft_model)rY   slots_per_reqr^   r^   r_   max_num_new_slots_for_drafting:  s   
z0SpeculativeConfig.max_num_new_slots_for_draftingc                 C   s
   | j dv S )N)r#   r$   r    r4   r   r^   r^   r_   	use_eagleJ     
zSpeculativeConfig.use_eaglec                 C   
   | j dkS )Nr*   r   r   r^   r^   r_   r   M  r   z"SpeculativeConfig.uses_draft_modelc                 C   r   )Nr%   r   r   r^   r^   r_   uses_extract_hidden_statesP  r   z,SpeculativeConfig.uses_extract_hidden_statesc                 C   r   )Nr&   r   r   r^   r^   r_   use_ngram_gpuS  r   zSpeculativeConfig.use_ngram_gpuc                 C   s8   | j }|dv r	d n| jj}| j}d|d|d|dS )N)r'   r+   r%   zSpeculativeConfig(method=z, model=z, num_spec_tokens=))r4   rE   r3   r2   )rY   r4   r3   num_spec_tokensr^   r^   r_   __repr__V  s   zSpeculativeConfig.__repr__)A__name__
__module____qualname____doc__r/   bool__annotations__r   r2   intr3   rV   r4   SpeculativeMethodr7   r8   r9   me_quantQuantizationMethodsr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   r   r   rD   r   rE   rF   rH   rI   rJ   floatrK   rL   r
   rM   RejectionSampleMethodr`   staticmethodr   r   r   r   r   r   r   r   r   r	   r   r   propertyr   r   r   r   r   r   r^   r^   r^   r_   r.   ?   s   
  |  $%"6r.   ).r   r   typingr   r   r   r   pydanticr   r   r   typing_extensionsr	   vllm.configr
   vllm.config.modelr   vllm.config.parallelr   vllm.config.utilsr   vllm.loggerr   vllm.transformers_utils.configr   vllm.utils.hashingr   vllm.utils.import_utilsr   r   transformersr   'vllm.model_executor.layers.quantizationr   layersr9   r  globalsr   r   r   EagleModelTypesNgramGPUTypesr  r  r.   r^   r^   r^   r_   <module>   sN   	