o
    di~                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlZddlZddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7 ddl*m8Z8 e	rddlm9Z9 e+ddrddl:m;Z; nddl<m;Z; e=e>Z?dZ@dZAdZBdZCee6G d d! d!e7e;ZDdS )"zAClasses handling causal-lm related architectures in ONNX Runtime.    N)Path)TemporaryDirectory)TYPE_CHECKINGAnyDictOptionalSequenceTupleUnion)HUGGINGFACE_HUB_CACHE)update_model_dims)AutoModelForCausalLMGenerationConfig)add_end_docstrings%add_start_docstrings_to_model_forward)CausalLMOutputWithPast)cached_file)InferenceSessionSessionOptions   )"MODEL_TYPES_REQUIRING_POSITION_IDSmain_export)TasksManager)check_model_uses_external_data)is_transformers_version)find_files_matching_pattern)maybe_save_preprocessors   ) DECODER_MERGED_ONNX_FILE_PATTERNDECODER_ONNX_FILE_PATTERN#DECODER_WITH_PAST_ONNX_FILE_PATTERNONNX_FILE_PATTERN)ONNX_MODEL_END_DOCSTRINGORTModel)&prepare_providers_and_provider_options)PretrainedConfig>=z4.25.0)GenerationMixina  
    Args:
        input_ids (`torch.LongTensor`):
            Indices of decoder input sequence tokens in the vocabulary of shape `(batch_size, sequence_length)`.
        attention_mask (`torch.LongTensor`, *optional*):
            Mask to avoid performing attention on padding token indices, of shape
            `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`.
        past_key_values (`tuple(tuple(torch.FloatTensor), *optional*, defaults to `None`)`
            Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding.
            The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
a  
    Args:
        input_ids (`torch.LongTensor`):
            Indices of decoder input sequence tokens in the vocabulary of shape `(batch_size, sequence_length)`.
        attention_mask (`torch.LongTensor`):
            Mask to avoid performing attention on padding token indices, of shape
            `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`.
        past_key_values (`tuple(tuple(torch.FloatTensor), *optional*, defaults to `None`)`
            Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding.
            The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
AutoTokenizera  
    Example of text generation:

    ```python
    >>> from transformers import {processor_class}
    >>> from optimum.onnxruntime import {model_class}
    >>> import torch

    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}")

    >>> inputs = tokenizer("My name is Arthur and I live in", return_tensors="pt")

    >>> gen_tokens = model.generate(**inputs,do_sample=True,temperature=0.9, min_length=20,max_length=20)
    >>> tokenizer.batch_decode(gen_tokens)  # doctest: +IGNORE_RESULT
    ```

    Example using `transformers.pipelines`:

    ```python
    >>> from transformers import {processor_class}, pipeline
    >>> from optimum.onnxruntime import {model_class}

    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}")
    >>> onnx_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

    >>> text = "My name is Arthur and I live in"
    >>> gen = onnx_gen(text)
    ```
c                ,       s  e Zd ZdZeZdZdZdddddddddd	d
ee	 ded dee
eeef  f
 fddZedd Zedd Zeedejed dd 				d=dejdeej deeeej   deej dee	 defddZ fddZ						d>d d!Zedeeej  d"ejdeeej  fd#d$Z e!d%d&ddde"ddd'dddd(ddddfd)e
eef ddd*ed+ed,e	d-e	d.e	d/ed0ee
e	ef  d1ee d2ed3ee#e  d4ee
e#e$ee%f  e$ee%f f  d5ee& de	d6ee	 d
ee	 dee' dee
eeef  dd f(d7d8Z(e!d%d&ddde"dd(fd)e
eef ddd*ed+ed,e	d-e	d.e	d/ed0ee
e	ef  de	dd fd9d:Z)d;d< Z*  Z+S )?ORTModelForCausalLMz
    ONNX model with a causal language modeling head for ONNX Runtime inference. This class officially supports bloom, codegen, falcon, gpt2, gpt-bigcode, gpt_neo, gpt_neox, gptj, llama.
    	input_idsFNconfigsessionuse_io_bindinggeneration_configmodel_save_dirr,   r%   r-   r   r.   r/   r   r0   c          
         s  |rGt d |d }t|dkr|d }t|dkr|d }t|dkr)|d }t|dkr3|d }t|dkr=|d }t|dkrG|d }|d	d d urYt d
 |d	}|rit dd|  d |d u rqtd|d u rytdt j	||||d dd | j
D | _dd | jD | _t| jdkot| jdk| _d| j
v | _|| _| jj}	|	tv rd| j
vrt d|	 d | js| jjrt d | jjdkr| jj| _n| jjdkr| jj| jj d | _n	| jj| jj | _| jjdv r| jj| _n | jjdkr| jjs| jjs| jj| _n	d| _n| jj| _| jdd d uo@| jdd d uo@| jd | jd k| _d S )NzInstantiating an ORTModelForCausalLM with positional arguments is deprecated and will be removed in the next version. Please use the keywords arguments {config, session, use_io_binding, generation_config, model_save_dir, use_cache} instead.r   r   r               modelzxPassing the inference session as `model` argument to an ORTModelForCausalLM is deprecated. Please use `session` instead.znSome keyword arguments were passed to the ORTModelForCausalLM constructor that are not part of its signature: z, ze. These arguments will be ignored in the current version and will raise an error in the next version.zYThe parameter config is required. Please pass a config or use the from_pretrained method.z[The parameter session is required. Please pass a session or use the from_pretrained method.)r,   r-   r.   r0   c                 S       g | ]}d |v sd|v r|qS .keyz.value .0keyr9   r9   r/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/optimum/onnxruntime/modeling_decoder.py
<listcomp>        z0ORTModelForCausalLM.__init__.<locals>.<listcomp>c                 S   r6   r7   r9   r:   r9   r9   r=   r>      r?   use_cache_branchposition_idszcORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although the model type z requires it. for correct batched generation. We strongly encourage to re-export the model with a newer version of Optimum for better performance and more reliable generation. a  `model.generation_config.use_cache=True` but the loaded model does not support using the past key values cache.Please re-export the original model once again with `use_cache=True` to be able to use it during generation. Or set `model.generation_config.use_cache=False` to avoid errors from attempting to use the cache. To re-export your model, simply set `export=True` as in `from_pretrained(..., export=True, use_cache=True)`.gemmagpt_bigcode>   rB   llamaqwen2qwen3granitemistralsmollm3	qwen3_moefalconzpast_key_values.0.keyzpast_key_values.0.value)loggerwarninglengetpopjoinkeys
ValueErrorsuper__init__input_nameskey_value_input_namesoutput_nameskey_value_output_namescan_use_cache	is_mergedr/   r,   
model_typer   	use_cachehead_dimembed_size_per_headhidden_sizenum_attention_headsnum_key_value_headsnew_decoder_architecturemulti_querynum_kv_headsinput_shapesold_bloom_modeling)
selfr,   r-   r.   r/   r0   argskwargs_r\   	__class__r9   r=   rU      s   



zORTModelForCausalLM.__init__c                 C      t d | jS )NaS  The `ORTModelForCausalLM.use_cache` property is deprecated and will be removed in a future version. Please rather use `ORTModelForCausalLM.can_use_cache` to check if a model supports using cache during generation. And use `ORTModelForCausalLM.generation_config.use_cache` to check if the model is configured to use cache during generation.)rL   rM   rZ   rh   r9   r9   r=   r]      s   zORTModelForCausalLM.use_cachec                 C   rn   )NzThe `ORTModelForCausalLM.use_merged` property is deprecated and will be removed in a future version. Please rather use `ORTModelForCausalLM.is_merged` to check if the underlying model is merged or not.)rL   rM   r[   ro   r9   r9   r=   
use_merged   s   zORTModelForCausalLM.use_mergedzbatch_size, sequence_lengthzoptimum/gpt2)processor_classmodel_class
checkpointattention_maskpast_key_valuesrA   r]   returnc              
      s  t |tj}| | |d ur|n| jj}|r-| js-td| d| j d| j d| d	|j\}}	d urZ| jj	dkrP| jj
rHd jd }
nd jd	 }
nd d jd	 }
nd}
|d u rd
| jv r| jj	dkr|d ur|dd }||dkd |d d |
d f }n@td| jj	dkr|d ur|dd }||dkd |d d |
d f }ntdtj|
|
|	 tj|jdd|d}d }| jrtjdd utj| jd}t| jdkrRd u rE| jj	dkr| jj
r|d| jf  n(| jj	dkr| jr|| j | jdf || j d| jfn
|| jd| jf  tj | j| jdtj| j| jdtfdd| jD nt d trRtd||||d}t| jdkrj|t | j d }d }|r| j!r|j\}}	| jj	dkr| jj
rd jd }||
|	 |f  n7| jj	dkr| jrd jd d	 \}}|||
|	 f ||
|	 |fnd jd }|| j|
|	 |f   fdd| j"D }n| j"}| j!r+| j#|||d\| jj$dkr| j%&| j' n| j'(  | j%&| j' | j')  *dd }d +d }|r*tfdd| j"D n-| ,||}| j%-d |}| .||/dd }/d}|rXtfdd| j"D |rs| jj	dkrstfddt0dtd	D t1||dS ) Nz`use_cache=zH` was passed to the model but the loaded model only supports `use_cache=z2`. Please load your current model with `use_cache=z:` or export the original model once again with `use_cache=z` when calling the `from_pretrained` method. To re-export your model, simply set `export=True` in the `from_pretrained` method.rC   r   r   r   rA   optzThe model OPT requires position_ids for batched generation but none were provided. Please provide position_ids or attention_mask (from which position_ids can be inferred).zThe model gpt_bigcode requires position_ids for batched generation but none were provided. Please provide position_ids or attention_mask (from which position_ids can be inferred).)dtypedevice)r   bloomc                 3   s     | ]}d |v r
 nV  qdS )r8   Nr9   r;   name)k_tensorv_tensorr9   r=   	<genexpr>[  s    
z.ORTModelForCausalLM.forward.<locals>.<genexpr>r9   )r*   rA   rt   r@   c                    s   i | ]}|d |v r nqS )r8   r9   r|   )k_shapev_shaper9   r=   
<dictcomp>z  s    z/ORTModelForCausalLM.forward.<locals>.<dictcomp>)outputs_to_not_bindknown_output_shapescpulosslogitsc                 3   s$    | ]}  || V  qd S N)rP   viewr|   )output_buffersoutput_shapesr9   r=   r     s    
c                 3   s    | ]}  |V  qd S r   )rP   r|   )model_outputsr9   r=   r     s    c                 3   s     | ]} ||d   V  qdS )r   Nr9   )r;   i)ru   r9   r=   r     s    )r   r   ru   )2
isinstancetorchTensorraise_on_numpy_input_io_bindingr,   r]   rZ   rS   shaper\   rd   rV   cumsummasked_fill_arangelongrz   	unsqueezeexpandr[   fullboolrN   rW   r_   rg   rb   zerosry   tuplesumupdatezipr.   rY   _prepare_io_bindingtyper-   run_with_iobinding_io_bindingsynchronize_inputssynchronize_outputsrO   r   _prepare_onnx_inputsrun_prepare_onnx_outputsrP   ranger   )rh   r*   rt   ru   rA   r]   rj   	use_torch
batch_sizeseq_lenpkv_seq_lenr@   model_inputsr   r   r_   num_key_value_heads_batch_sizer   r   onnx_inputsonnx_outputsr9   )r   r~   r   r   r   ru   r   r   r=   forward  s   










"zORTModelForCausalLM.forwardc                    s,   t ddr| j|i |S t j|i |S )N<z4.46.0)r   %_prepare_inputs_for_generation_legacyrT   prepare_inputs_for_generation)rh   ri   rj   rl   r9   r=   r     s   
z1ORTModelForCausalLM.prepare_inputs_for_generationc                 K   s   |d urB| j jdkr| j jr|d jd }	n|d jd }	n	|d d jd }	|jd |	kr1|	}
n|jd d }
|d d |
d f }d| jv rm|d u rm|d urm| dd }||dkd |rm|d d df d}|||||||dS )NrC   r   r   r   rA   rx   )r*   rt   ru   cache_positioninputs_embedsrA   r]   )	r,   r\   rd   r   rV   r   r   r   r   )rh   r*   ru   rt   r   r   rA   r]   rj   r   remove_prefix_lengthr9   r9   r=   r     s.   z9ORTModelForCausalLM._prepare_inputs_for_generation_legacybeam_idxc                    s   t | trJt | d trJ| d d j| d d jkr?| d d j\}|jd  jd  t fdd| D S tfdd| D S t | trbt | d tjrbtfdd| D S td|  d)	Nr   r   c              	   3   s|    | ]9}|d    d |d  j   |d   d |d j   fV  qdS )r   r   N)r   index_selecttorz   r;   
layer_pastr   r   r^   	num_heads
seq_lengthr9   r=   r     s    
z5ORTModelForCausalLM._reorder_cache.<locals>.<genexpr>c                 3   s&    | ]}t  fd d|D V  qdS )c                 3   $    | ]}| d  |jV  qdS r   Nr   r   rz   )r;   
past_stater   r9   r=   r        " z?ORTModelForCausalLM._reorder_cache.<locals>.<genexpr>.<genexpr>N)r   r   r   r9   r=   r     s
    
c                 3   r   r   r   r   r   r9   r=   r     r   zUnexpected past_key_values: zP. Expected tuple of tuples (GPT2 style) or tuple of tensors (GPT BigCode style).)r   r   r   r   r   rS   )ru   r   batch_size_times_num_headsr9   r   r=   _reorder_cache  s    

z"ORTModelForCausalLM._reorder_cache mainCPUExecutionProviderTmodel_id	subfolderrevisionforce_downloadlocal_files_onlytrust_remote_code	cache_dirtoken	file_nameprovider	providersprovider_optionssession_optionsrp   c           $   
      s  t |td||	|d}t|dkrtd| t|dkr:|d j}|d j}|
r7|
|kr7td|
 d| |}
nsg }|durMd	d
 |D }t|dk}|du r`|rUtnt  fdd
|D }|rd|
rg|}n	t	d| d |d j}|d j}|
p}d}|D ]}|j|kr|j}|j} nq|}
t|dkrt	dd
tt| d|
 d| d tj|r|}d}t|tr| }t||
|	|||||d}|d u rt|j}zt||
d |	|||||d W n	 ty   Y nw tjt|dd}t|}|rtjt|dd}dd |jjD }dd |jjD }d}|d d dkr3d|d d< d|d d< d}| D ]}d|v rO|| d  d!krOd"|| d < d}q7|rst	d# t|||}tj|t||t|jd dddd$ ~t|d%r}d|_ t|d&rd|_!|d u rzt"j#||	|||||d'}W n t$y   t%d( t"&|}Y nw ||_'||_'t(d)d*r|) } t| dkrt	d+|  d, | * D ]\}!}"t+||!|" t+||!d  qt,|||d-\}}t-||||d.}#| ||#|||d/S )0Nz	**/*.onnx)glob_patternr   r   r   r   z&Could not find any ONNX model file in r   zTrying to load z but only found Fc                 S   s    g | ]}t tt|r|qS r9   )researchr   strr;   pr9   r9   r=   r>   4  r?   z8ORTModelForCausalLM._from_pretrained.<locals>.<listcomp>c                    s    g | ]}t  t|r|qS r9   )r   r   r   r   patternr9   r=   r>   9  r?   zLegacy models found in zm will be loaded. Legacy models will be deprecated in the next version of optimum, please re-export your modelz
model.onnxz(Too many ONNX model files were found in z ,zh. specify which one to load by using the `file_name` and/or the `subfolder` arguments. Loading the file z in the subfolder .r   )filenamer   r   r   r   r   r   _data)load_external_dataTc                 S   &   i | ]}|j d d |jjjjD qS )c                 S      g | ]}|j p	|jqS r9   	dim_value	dim_paramr;   dimr9   r9   r=   r>         CORTModelForCausalLM._from_pretrained.<locals>.<dictcomp>.<listcomp>r}   r   tensor_typer   r   r;   noder9   r9   r=   r         z8ORTModelForCausalLM._from_pretrained.<locals>.<dictcomp>c                 S   r   )c                 S   r   r9   r   r   r9   r9   r=   r>     r   r   r   r   r9   r9   r=   r     r   r*   sequence_lengthr   pastr   z&past_sequence_length + sequence_lengthpast_sequence_lengthaq  The ONNX model was probably exported with an older version of optimum. We are updating the input/output dimensions and overwriting the model file with new dimensions. This is necessary for the model to work correctly with the current version of optimum. If you encounter any issues, please re-export the model with the latest version of optimum for optimal performance.)save_as_external_datalocationall_tensors_to_one_fileconvert_attributesize_threshold
is_decoderis_encoder_decoder)r   r   r   r   r   r   zGGeneration config file not found, creating a new one from model config.r&   z4.45.0zHMoving the following attributes in the config to the generation config: z. You are seeing this warning because you've set generation parameters in the model config, as opposed to in the generation config.)r   r   r   )r   r   sess_optionsr+   ).r   r!   rN   FileNotFoundErrorparentr}   r    r   rL   rM   rQ   mapr   ospathisdirr   r   as_posixr   EnvironmentErroronnxloadr   graphinputoutputrR   r   update_inputs_outputs_dimssavehasattrr   r   r   from_pretrainedOSErrorinfofrom_model_configr]   r   &_get_non_default_generation_parametersitemssetattrr$   r   )$clsr   r,   r   r   r   r   r   r   r   r   r   r   r   r   r]   rp   r.   r/   r0   
onnx_files
_file_namemodel_filesdefaut_file_namefilemodel_cache_path
onnx_modelmodel_uses_external_data
input_dimsoutput_dimsoverride_dims
input_namemisplaced_generation_parameters
param_nameparam_valuer-   r9   r   r=   _from_pretrained  s@  	








	

	

z$ORTModelForCausalLM._from_pretrainedc                 K   s   t j| jd}|
r|d7 }|dd d ur!td| j d| dt }t|j}t	|||ddd||||	|||d t
|||d	 | j||f|
|d
|S )N)rr   z
-with-pasttaskz?The `task` argument is not needed when exporting a model with `z;`. The `task` is automatically inferred from the class as `z`.F)model_name_or_pathr  r'  do_validationno_post_processlegacyr   r   r   r   r   r   r   )src_subfolder)r]   r0   )r   %_infer_task_from_model_or_model_classauto_model_classrO   rS   __name__r   r   r}   r   r   r&  )r  r   r,   r   r   r   r   r   r   r   r]   rj   r'  save_dirsave_dir_pathr9   r9   r=   _export  sH   

zORTModelForCausalLM._exportc                 C   s   | j | | j| dS )z
        Save the model and generation configs to the specified directory.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the model and generation configs will be saved.
        N)r,   save_pretrainedr/   )rh   save_directoryr9   r9   r=   _save_config  s   z ORTModelForCausalLM._save_config)NNNN)NNNNNN),r/  
__module____qualname____doc__r   r.  main_input_name_supports_cache_classr   r   r
   r   r   r   rU   propertyr]   rp   r   CAUSALLM_ONNX_MODEL_DOCSTRINGformatTEXT_GENERATION_EXAMPLE_TOKENIZER_FOR_DOCr   
LongTensorr	   r   r   r   r   r   staticmethodr   classmethodr   r   r   r   r   r   r&  r2  r5  __classcell__r9   r9   rl   r=   r)   z   sB   l

 

-%
	

" d
	
8r)   )Er8  loggingr  r   pathlibr   tempfiler   typingr   r   r   r   r   r	   r
   r  r   huggingface_hub.constantsr   
onnx.toolsr   transformersr   r   transformers.file_utilsr   r   transformers.modeling_outputsr   transformers.utilsr   onnxruntimer   r   exporters.onnxr   r   exporters.tasksr   
onnx.utilsr   utilsr   utils.file_utilsr   utils.save_utilsr   	constantsr   r   r    r!   modeling_ortr"   r#   r$   r%   transformers.generationr'   transformers.generation_utils	getLoggerr/  rL   DECODER_INPUTS_DOCSTRINGr<  r?  r>  r)   r9   r9   r9   r=   <module>   sJ   $

!