o
    :/i8                     @   sj  d Z ddlZddlZddlmZmZmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ eeZ dZ!dZ"dZ#dd e$ddD Z%dd e$ddD Z&e!e#e"gZ'e!e#e"dZ(dZ)dZ*dede+dB de+dB de+dB de,e+e
f f
dd Z-d!ede.e
e,e+e/f f fd"d#Z0G d$d% d%eZ1dS )&z&Tokenizer for Grok-2 .tok.json format.    N)
CollectionSequenceSet)Path)AnyLiteraloverload)hf_hub_download)EntryNotFoundErrorHfHubHTTPErrorRepositoryNotFoundErrorRevisionNotFoundError)BatchEncoding)chat_template_utils)ChatCompletionMessageParam)init_logger   )TokenizerLikez<|pad|>z<|eos|>z<|separator|>c                 C      g | ]}d | dqS )z<|reserved_|> .0ir   r   b/lsinfo/ai/hellotax_ai/llm_service/venv_vllm/lib/python3.10/site-packages/vllm/tokenizers/grok2.py
<listcomp>       r         c                 C   r   )z	<|controlr   r   r   r   r   r   r       r   i  )padsepeosa  {% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}zn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
model_pathrepo_idrevisiondownload_dirreturnc             
   C   sh  | d }|  r$|jddd}t|W  d    S 1 sw   Y  |d u r*i S z
t|d||d}W n( tttfyB   i  Y S  ty\ } zt	
d|| i W  Y d }~S d }~ww z"t|jddd}t|W  d    W S 1 sxw   Y  W d S  tjy } zt	
d| i W  Y d }~S d }~w ty } zt	
d| i W  Y d }~S d }~ww )	Nztokenizer_config.jsonrutf-8)encodingr#   filenamer$   	cache_dirzFailed to download tokenizer_config.json from %s. This may be due to a network or authentication issue. The default chat template will be used. Error: %szXFailed to parse tokenizer_config.json. The default chat template will be used. Error: %szWFailed to open tokenizer_config.json. The default chat template will be used. Error: %s)is_fileopenjsonloadr	   r   r   r
   r   loggerwarningr   JSONDecodeErrorOSError)r"   r#   r$   r%   config_pathfconfig_fileexcr   r   r   _maybe_load_tokenizer_config9   sX    

(r9   
vocab_filec              
      s  zdd l  W n ty } ztd|d }~ww | d}t|}W d    n1 s-w   Y  dd |dg D }dd |dg D }|d	d
krRt}n
td|d	|d|}t| |||d}d|v ru|d |d<  j	di |}d }	d|v rdd |d D }	|	pt
 |_t|_t
 dddtdtd tt B dtd tt B dtt f fdd}
t|
||_| jt
t O  _| jt
tt O  _||fS )Nr   z1Grok-2 tokenizer requires the `tiktoken` package.rbc                 S   s   i | ]}t |d  |d qS )bytestoken)r<   r   itemr   r   r   
<dictcomp>z       z+_load_tiktoken_encoding.<locals>.<dictcomp>regular_tokensc                 S   s(   i | ]}t |d  jddd|d qS )r<   r(   replaceerrorsr=   r<   decoder>   r   r   r   r@   ~   s    special_tokens
word_splitV1zUnknown word_split: pat_str)namerK   mergeable_ranksrH   
vocab_sizeexplicit_n_vocabdefault_allowed_specialc                 S   s   h | ]}t |jd ddqS )r(   rC   rD   rF   )r   
bytes_listr   r   r   	<setcomp>   rA   z*_load_tiktoken_encoding.<locals>.<setcomp>allallowed_specialdisallowed_specialtextrU   rV   r&   c                   s*   ~t |tr|| jO } jj| ||ddS )Nr   rT   )
isinstanceset_default_allowed_specialEncodingencode)selfrW   rU   rV   tiktokenr   r   encode_patched   s   

z/_load_tiktoken_encoding.<locals>.encode_patchedr   )r_   ImportErrorr.   r/   r0   get	PAT_STR_B
ValueErrorstrr[   rY   rZ   DEFAULT_CONTROL_TOKENS_control_tokensr   r   r   listint	functoolspartialr\   valuesCONTROL_TOKEN_TEXTSRESERVED_TOKEN_TEXTS)r:   r8   r6   	xtok_dictrM   rH   rK   kwargs	tokenizerrP   r`   r   r^   r   _load_tiktoken_encodingo   sf   


rr   c                       s2  e Zd ZedddddeeB dededB dedB dd f
d	d
ZdddededededB deee	f dB ddf fddZ
defddZedee fddZedee fddZedefddZedefddZedefddZedefdd Zedefd!d"Zedefd#d$Zedefd%d&Zedefd'd(Zdeeef fd)d*Zdeeef fd+d,Zd-ee d.edB dee fd/d0Z			1dMd2ed3edB d.edB d4edee f
d5d6Z	dNd7ee eB d8edefd9d:Zed-edefd;d<Z ed-ee dee fd=d<Z d-eee B deee B fd>d<Z 	dNd7ee d8edee fd?d@Z!d-ee defdAdBZ"		1		dOd2eee B dCedB d4ed3ed.edB de#fdDdEZ$	dPdedB dFeeee	f  dB dedB fdGdHZ%			dQdIee& dFeeee	f  dB dedB dJedeee B f
dKdLZ'  Z(S )RGrok2TokenizerFN)trust_remote_coder$   r%   path_or_repo_idrt   r$   r%   r&   c                O   s   |rt d t|}| r|}|j}	d }
n | r$|d }|}	d }
nttt|d||d}|j}	t|}
| sCtd| dt	|	|
||d}| |t||
dd|
d	|d
S )Nz2Ignoring extra positional args for Grok2Tokenizer.ztokenizer.tok.jsonr*   z tokenizer.tok.json not found at .)r#   r$   r%   truncation_sideleftchat_template)r:   name_or_pathrw   ry   init_kwargs)r1   
debug_oncer   r-   parentis_dirr	   re   FileNotFoundErrorr9   rb   )clsru   rt   r$   r%   argsrp   pathr:   r"   r#   configr   r   r   from_pretrained   sH   


zGrok2Tokenizer.from_pretrained)r{   r:   rz   rw   ry   r{   c          
         s2  t    || _|| _|pi | _|pt| _t|\| _| _	i | _
i | _| jj D ]\}}|jddd}|| j
|< || j|< q)| j	 D ]\}}|| j
|< || j|< qD| j	t}	|	d u rc| j	t}	|	d u rm| j	t}	|	d u rsd}	|	| _| j	t| j| _| j	t| j| _| j| _tdd | j
D | _d S )Nr(   rC   rD   r   c                 s   s    | ]}t |V  qd S Nlen)r   tokr   r   r   	<genexpr>  s    z*Grok2Tokenizer.__init__.<locals>.<genexpr>)super__init__rz   _truncation_sider{   DEFAULT_CHAT_TEMPLATE_chat_templaterr   
_tokenizer_special_tokens_token_to_id_id_to_token_mergeable_ranksitemsrG   rb   SEPPADEOS_bos_token_id_eos_token_id_pad_token_id_unk_token_idmax_max_chars_per_token)
r]   r:   rz   rw   ry   r{   r=   token_id	token_strbos_token_id	__class__r   r   r      s6   
	



zGrok2Tokenizer.__init__c                 C      dS )Nr   r   r]   r   r   r   num_special_tokens_to_add     z(Grok2Tokenizer.num_special_tokens_to_addc                 C      t | j S r   )rh   r   keysr   r   r   r   all_special_tokens     z!Grok2Tokenizer.all_special_tokensc                 C   r   r   )rh   r   rl   r   r   r   r   all_special_ids!  r   zGrok2Tokenizer.all_special_idsc                 C      | j S r   )r   r   r   r   r   r   %     zGrok2Tokenizer.bos_token_idc                 C   r   r   )r   r   r   r   r   eos_token_id)  r   zGrok2Tokenizer.eos_token_idc                 C   r   r   )r   r   r   r   r   pad_token_id-  r   zGrok2Tokenizer.pad_token_idc                 C   r   )NFr   r   r   r   r   is_fast1  s   zGrok2Tokenizer.is_fastc                 C   s   | j jS r   r   n_vocabr   r   r   r   rN   5  s   zGrok2Tokenizer.vocab_sizec                 C   s   | j jd S )Nr   r   r   r   r   r   max_token_id9  s   zGrok2Tokenizer.max_token_idc                 C   r   r   )r   r   r   r   r   max_chars_per_token=  r   z"Grok2Tokenizer.max_chars_per_tokenc                 C   r   r   )r   r   r   r   r   rw   A  r   zGrok2Tokenizer.truncation_sidec                 C   
   t | jS r   )dictr   r   r   r   r   	get_vocabE     
zGrok2Tokenizer.get_vocabc                 C   r   r   )r   r   r   r   r   r   get_added_vocabH  r   zGrok2Tokenizer.get_added_vocabtokens
max_lengthc                 C   s<   |d u s
t ||kr|S | jdkr|| d  S |d | S )Nrx   )r   rw   )r]   r   r   r   r   r   _maybe_truncateK  s
   
zGrok2Tokenizer._maybe_truncateTrW   
truncationadd_special_tokensc                 C   s"   ~| j |}|r| ||}|S r   )r   r\   r   )r]   rW   r   r   r   r   r   r   r   r\   R  s
   zGrok2Tokenizer.encodeidsskip_special_tokensc                    s2   t |tr|g}|r fdd|D } j|S )Nc                    s   g | ]}| j  vr|qS r   )r   rl   )r   r   r   r   r   r   e  s
    z)Grok2Tokenizer.decode.<locals>.<listcomp>)rX   ri   r   rG   )r]   r   r   r   r   r   rG   _  s   

zGrok2Tokenizer.decodec                 C      d S r   r   r]   r   r   r   r   convert_tokens_to_idsl  r   z$Grok2Tokenizer.convert_tokens_to_idsc                 C   r   r   r   r   r   r   r   r   o  r   c                    s,   t |tr j| jS  fdd|D S )Nc                    s   g | ]
} j | jqS r   )r   rb   r   )r   r=   r   r   r   r   u  s    z8Grok2Tokenizer.convert_tokens_to_ids.<locals>.<listcomp>)rX   re   r   rb   r   r   r   r   r   r   r  s   
c                 C   s:   g }|D ]}|r|| j  v rq|| j|d q|S )Nz<|unk|>)r   rl   appendr   rb   )r]   r   r   r   r   r   r   r   convert_ids_to_tokensw  s   z$Grok2Tokenizer.convert_ids_to_tokensc                 C   s   |  |}| j|ddS )NF)r   )r   rG   )r]   r   	token_idsr   r   r   convert_tokens_to_string  s   
z'Grok2Tokenizer.convert_tokens_to_string	text_pairc           
         s|   |d urt dt|tr' fdd|D }dd |D }t||dS j| d}dgt| }	t||	dS )Nz.text_pair is not supported for Grok2Tokenizer.c                    s   g | ]}j | d qS )r   r   r   )r\   r>   r   r   r]   r   r   r   r     s    z+Grok2Tokenizer.__call__.<locals>.<listcomp>c                 S   s   g | ]	}d gt | qS )r   r   )r   r   r   r   r   r     s    )	input_idsattention_maskr   r   )NotImplementedErrorrX   rh   r   r\   r   )
r]   rW   r   r   r   r   input_ids_batchattention_mask_batchr   r   r   r   r   __call__  s$   
	zGrok2Tokenizer.__call__toolsc                 C   s   ~|p| j S r   )r   )r]   ry   r   r   r   r   get_chat_template  s   
z Grok2Tokenizer.get_chat_templatemessagestokenizec                 K   sT   | j ||d}|d u rtdd|d< tjd|||d|}|r(| j|ddS |S )N)r   z?No chat template available. Provide `chat_template` explicitly.Freturn_dict)conversationry   r   )r   r   )r   rd   hf_chat_utilsapply_chat_templater\   )r]   r   r   ry   r   rp   templatepromptr   r   r   r     s    z"Grok2Tokenizer.apply_chat_template)NNT)F)NTFNr   )NNF))__name__
__module____qualname__classmethodre   r   boolr   r   r   r   ri   r   propertyrh   r   r   r   r   r   r   rN   r   r   rw   r   r   r   r\   r   rG   r   r   r   r   r   r   r   r   r   __classcell__r   r   r   r   rs      s   :+"



"



$
	
rs   )2__doc__rj   r/   collections.abcr   r   r   pathlibr   typingr   r   r   huggingface_hubr	   huggingface_hub.utilsr
   r   r   r   transformersr   transformers.utilsr   r   vllm.entrypoints.chat_utilsr   vllm.loggerr   protocolr   r   r1   r   r   r   rangern   rm   DEFAULT_SPECIAL_TOKENSrf   r   rc   re   r   r9   tupleri   rr   rs   r   r   r   r   <module>   sT   


6
K