o
    :/i6:                  
   @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	 ddl
Z
ddlZddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ eeZdedeeef deeeeef f fddZ G dd deZ!dS )z(Tokenizer for Kimi-Audio using TikToken.    N)Sequence)Path)Anyoverload)hf_hub_download)
AddedTokenBatchEncoding)chat_template_utils)ChatCompletionMessageParam)init_logger)TokenizerLike
vocab_filespecial_tokensreturnc           
      C   s   i }t | dd1}|D ]&}| }|sq| }t|dkr1|d }t|d }t|}|||< qW d   n1 s<w   Y  tjt	| d||d}	|	|fS )	z'Load TikToken encoding from vocab file.utf-8encoding   r      Nzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+)namepat_strmergeable_ranksr   )
openstripsplitlenintpybase64	b64decodetiktokenEncodingstr)
r   r   r   flineparts	token_b64ranktoken_bytes	tokenizer r)   g/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/tokenizers/kimi_audio.py_load_tiktoken_encoding   s,   
r+   c                       sp  e Zd ZdZedddddeeB dededB dedB d	d f
d
dZdededed	df fddZ	dTddZ
d	efddZed	ee fddZed	ee fddZed	efddZed	efddZed	efddZed	efdd Zed	efd!d"Zed	efd#d$Zed	efd%d&Zed	efd'd(Zed	eeef fd)d*Zejd+eeef d	dfd,d*Zd	eeef fd-d.Zd	efd/d0Zd	eeef fd1d2Zd3ee d4edB d	ee fd5d6Z 			7dUd8ed9edB d4edB d:ed	ee f
d;d<Z!	dVd=e"e eB d>ed	efd?d@Z#e$d3ed	efdAdBZ%e$d3ee d	ee fdCdBZ%d3eee B d	eee B fdDdBZ%	dVd=e"e d>ed	ee fdEdFZ&d3ee d	efdGdHZ'		7		dWd8eee B dIedB d:ed9ed4edB d	e(fdJdKZ)	dXdLedB dMeeeef  dB d	edB fdNdOZ*				dYdPee+ dB dMeeeef  dB dLedB dQed	eee B f
dRdSZ,  Z-S )ZKimiAudioTokenizerz"TikToken tokenizer for Kimi-Audio.FN)trust_remote_coderevisiondownload_dirpath_or_repo_idr-   r.   r/   r   c                O   s0  |rt d t|}| r|}nm| r#|d }| s"|d }n\t|}	zt|	d||d}
t|
}W n, tya   zt|	d||d}
t|
}W n ty^ } ztd|	 |d }~ww Y nw t	
t t|	d||d W d    n1 szw   Y  | std| d| |t||d	d
dS )Nz6Ignoring extra positional args for KimiAudioTokenizer.ztiktoken.modelztokenizer.model)repo_idfilenamer.   	local_dirz4Could not find tiktoken.model or tokenizer.model in tokenizer_config.jsonztiktoken.model not found at .truncation_sideleft)r   name_or_pathr6   )logger
debug_oncer   is_fileis_dirr!   r   	Exception
ValueError
contextlibsuppressFileNotFoundErrorget)clsr0   r-   r.   r/   argskwargspathr   r1   
vocab_pathexcr)   r)   r*   from_pretrained7   sj   


z"KimiAudioTokenizer.from_pretrainedr   r8   r6   c                   sD  t    || _|| _|| _i }|jd }| rSt|dd,}t	|}|
di }| D ]\}	}
t|	}|

dd}|rC|||< q/W d    n1 sNw   Y  t||\| _| _i | _i | _| jj D ]\}}|jddd}|| j|< || j|< qhi | _|   d	| _d
| _| j| _| j| _tdd | jD dd| _d S )Nr4   r   r   added_tokens_decodercontent replace)errorsi[P i\P c                 s   s    | ]}t |V  qd S Nr   ).0tokr)   r)   r*   	<genexpr>   s    z.KimiAudioTokenizer.__init__.<locals>.<genexpr>
   )default)super__init__r8   _truncation_side_vocab_fileparentr;   r   jsonloadrB   itemsr   r+   
_tokenizer_special_tokens_token_to_id_id_to_token_mergeable_ranksdecode_added_tokens_decoder_add_kimiaudio_special_tokens_bos_token_id_eos_token_id_pad_token_id_unk_token_idmax_max_chars_per_token)selfr   r8   r6   r   tokenizer_configr"   configadded_tokenstoken_id_str
token_infotoken_idrK   r'   	token_str	__class__r)   r*   rW   x   sH   




zKimiAudioTokenizer.__init__c                 C   sp   ddddddd}|  D ](\}}|| jvr5t|dd	dd
| j|< || jvr+|| j|< || jvr5|| j|< qdS )z/Add Kimi-Audio special tokens to the tokenizer.imP ioP irP i]P ivP iwP )<|im_media_begin|><|im_media_end|><|im_kimia_text_blank|><|im_msg_end|><|im_kimia_user_msg_start|> <|im_kimia_assistant_msg_start|>TF)single_word
normalizedspecialN)r]   rd   r   r`   ra   )rl   kimiaudio_special_tokensrs   rr   r)   r)   r*   re      s$   	




z0KimiAudioTokenizer._add_kimiaudio_special_tokensc                 C      dS )Nr   r)   rl   r)   r)   r*   num_special_tokens_to_add      z,KimiAudioTokenizer.num_special_tokens_to_addc                 C      t | j S rO   )listrd   valuesr   r)   r)   r*   all_special_tokens      z%KimiAudioTokenizer.all_special_tokensc                 C   r   rO   )r   rd   keysr   r)   r)   r*   all_special_ids   r   z"KimiAudioTokenizer.all_special_idsc                 C      | j S rO   )rf   r   r)   r)   r*   bos_token_id      zKimiAudioTokenizer.bos_token_idc                 C   r   rO   )rg   r   r)   r)   r*   eos_token_id   r   zKimiAudioTokenizer.eos_token_idc                 C   r   rO   )rh   r   r)   r)   r*   pad_token_id   r   zKimiAudioTokenizer.pad_token_idc                 C   r   )NFr)   r   r)   r)   r*   is_fast   s   zKimiAudioTokenizer.is_fastc                 C      | j jS rO   r^   n_vocabr   r)   r)   r*   
vocab_size      zKimiAudioTokenizer.vocab_sizec                 C   s   | j jd S )Nr   r   r   r)   r)   r*   max_token_id   s   zKimiAudioTokenizer.max_token_idc                 C   r   rO   )rk   r   r)   r)   r*   max_chars_per_token   r   z&KimiAudioTokenizer.max_chars_per_tokenc                 C   r   rO   )rX   r   r)   r)   r*   r6      r   z"KimiAudioTokenizer.truncation_sidec                 C   r   rO   )rd   r   r)   r)   r*   rJ      r   z'KimiAudioTokenizer.added_tokens_decodervaluec                 C   sX   || _ | D ]"\}}t|drt|n|}d|v r|| _qd|v s&d|v r)|| _qdS )z6Set added tokens decoder and update special token IDs.__str__rz   ry   z
<|im_end|>N)rd   r]   hasattrr!   rf   rg   )rl   r   rr   tokenrs   r)   r)   r*   rJ      s   c                 C   s
   t | jS rO   )dictr`   r   r)   r)   r*   	get_vocab  s   
zKimiAudioTokenizer.get_vocabc                 C   r   )z@Return vocab size for compatibility with HF tokenizer interface.r   r   r)   r)   r*   __len__  r   zKimiAudioTokenizer.__len__c                 C   s   dd | j  D S )Nc                 S   s   i | ]	\}}t ||qS r)   )r!   )rQ   rr   r   r)   r)   r*   
<dictcomp>
  s    z6KimiAudioTokenizer.get_added_vocab.<locals>.<dictcomp>)rd   r]   r   r)   r)   r*   get_added_vocab	  s   z"KimiAudioTokenizer.get_added_vocabtokens
max_lengthc                 C   s<   |d u s
t ||kr|S | jdkr|| d  S |d | S )Nr7   )r   r6   )rl   r   r   r)   r)   r*   _maybe_truncate  s
   
z"KimiAudioTokenizer._maybe_truncateTtext
truncationadd_special_tokensc                 K   s*   ~| j j|h dd}|r| ||}|S )N>   ry   rw   rv   rx   rz   r{   )allowed_special)r^   encoder   )rl   r   r   r   r   rE   r   r)   r)   r*   r     s   zKimiAudioTokenizer.encodeidsskip_special_tokensc                    s@   t |tr|g}|rt| j   fdd|D }| j|S )z=Decode token IDs to text, optionally skipping special tokens.c                    s   g | ]}| vr|qS r)   r)   )rQ   rr   special_idsr)   r*   
<listcomp>8  s    z-KimiAudioTokenizer.decode.<locals>.<listcomp>)
isinstancer   setr_   r   r^   rc   )rl   r   r   r)   r   r*   rc   /  s   
zKimiAudioTokenizer.decodec                 C      d S rO   r)   rl   r   r)   r)   r*   convert_tokens_to_ids;  r   z(KimiAudioTokenizer.convert_tokens_to_idsc                 C   r   rO   r)   r   r)   r)   r*   r   >  r   c                    s,   t |tr j| jS  fdd|D S )Nc                    s   g | ]
} j | jqS r)   )r`   rB   ri   )rQ   r   r   r)   r*   r   D  s    z<KimiAudioTokenizer.convert_tokens_to_ids.<locals>.<listcomp>)r   r!   r`   rB   ri   r   r)   r   r*   r   A  s   
c                 C   s6   g }|D ]}|r|| j v rq|| j|d q|S )Nz<|unk|>)rd   appendra   rB   )rl   r   r   r   rr   r)   r)   r*   convert_ids_to_tokensF  s   z(KimiAudioTokenizer.convert_ids_to_tokensc                 C   s   |  |}| j|ddS )NF)r   )r   rc   )rl   r   	token_idsr)   r)   r*   convert_tokens_to_stringP  s   
z+KimiAudioTokenizer.convert_tokens_to_string	text_pairc                    s|   |d urt dt|tr' fdd|D }dd |D }t||dS j| d}	dgt|	 }
t|	|
dS )Nz2text_pair is not supported for KimiAudioTokenizer.c                    s   g | ]}j | d qS )r   r   r   )r   )rQ   itemr   r   rl   r   r)   r*   r   c  s    z/KimiAudioTokenizer.__call__.<locals>.<listcomp>c                 S   s   g | ]	}d gt | qS )r   rP   )rQ   r   r)   r)   r*   r   l  s    )	input_idsattention_maskr   r   )NotImplementedErrorr   r   r   r   r   )rl   r   r   r   r   r   rE   input_ids_batchattention_mask_batchr   r   r)   r   r*   __call__T  s(   	
	zKimiAudioTokenizer.__call__chat_templatetoolsc                 C   s   ~|S rO   r)   )rl   r   r   r)   r)   r*   get_chat_templatez  s   z$KimiAudioTokenizer.get_chat_templatemessagestokenizec                 K   s   |d ur|n| d}|d u rtd| j||d}|d u r"tdtj|f||d|\}}	|r6|d nd}
|rA| j|
dd	S |
S )
Nconversationz5Either 'messages' or 'conversation' must be provided.)r   z?No chat template available. Provide `chat_template` explicitly.)r   r   r   rL   F)r   )rB   r>   r   hf_chat_utilsrender_jinja_templater   )rl   r   r   r   r   rE   r   templaterendered_promptr)   r)   r*   apply_chat_template  s(   	
z&KimiAudioTokenizer.apply_chat_template)r   N)NNT)F)NTFNrO   )NNNF).__name__
__module____qualname____doc__classmethodr!   r   boolrI   rW   re   r   r   propertyr   r   r   r   r   r   r   r   r   r   r6   r   r   rJ   setterr   r   r   r   r   r   rc   r   r   r   r   r   r   r   r
   r   __classcell__r)   r)   rt   r*   r,   4   s   @
6"



"



'


r,   )"r   r?   r[   collections.abcr   pathlibr   typingr   r   r   r   huggingface_hubr   transformersr   r   transformers.utilsr	   r   vllm.entrypoints.chat_utilsr
   vllm.loggerr   vllm.tokenizers.protocolr   r   r9   r   r!   r   tupler+   r,   r)   r)   r)   r*   <module>   s.   

