"""
ORTModelForXXX classes related to seq2seq, allowing to run ONNX Models with ONNX Runtime using the same API as Transformers.
"""

import re
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import TYPE_CHECKING, Any, Dict, Optional, Sequence, Set, Tuple, Union

import numpy as np
import torch
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoModelForSpeechSeq2Seq,
    AutoModelForVision2Seq,
    GenerationConfig,
    GenerationMixin,
    Pix2StructForConditionalGeneration,
    WhisperForConditionalGeneration,
)
from transformers.file_utils import add_end_docstrings, add_start_docstrings_to_model_forward
from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
from transformers.models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
from transformers.utils import cached_file

from onnxruntime import InferenceSession, SessionOptions

from ..exporters.onnx import main_export
from ..exporters.tasks import TasksManager
from ..utils import NormalizedConfigManager, is_transformers_version
from ..utils.file_utils import find_files_matching_pattern
from ..utils.logging import get_logger, warn_once
from ..utils.save_utils import maybe_save_preprocessors
from .base import ORTParentMixin, ORTSessionMixin
from .constants import (
    DECODER_MERGED_ONNX_FILE_PATTERN,
    DECODER_ONNX_FILE_PATTERN,
    DECODER_WITH_PAST_ONNX_FILE_PATTERN,
    ENCODER_ONNX_FILE_PATTERN,
    ONNX_FILE_PATTERN,
)
from .modeling_ort import ORTModel
from .utils import (
    ONNX_DECODER_NAME,
    ONNX_DECODER_WITH_PAST_NAME,
    ONNX_ENCODER_NAME,
    DummyWhisperModel,
    prepare_providers_and_provider_options,
)


if TYPE_CHECKING:
    from transformers import PretrainedConfig

logger = get_logger(__name__)

ONNX_MODEL_END_DOCSTRING = r"""
    This model inherits from [`~onnxruntime.modeling_ort.ORTModelForConditionalGeneration`], check its documentation for the generic methods the
    library implements for all its models (such as downloading or saving).

    This class should be initialized using the [`onnxruntime.modeling_ort.ORTModelForConditionalGeneration.from_pretrained`] method.
"""

SEQ2SEQ_ENCODER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor`):
            Indices of input sequence tokens in the vocabulary of shape `(batch_size, encoder_sequence_length)`.
        attention_mask (`torch.LongTensor`):
            Mask to avoid performing attention on padding token indices, of shape
            `(batch_size, encoder_sequence_length)`. Mask values selected in `[0, 1]`.
"""

SPEECH_ENCODER_INPUTS_DOCSTRING = r"""
    Args:
        input_features (`torch.FloatTensor`):
            Mel / fbank features extracted from the raw speech waveform, of shape `(batch_size, feature_size, encoder_sequence_length)`.
"""

VISION_ENCODER_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor`):
            Features extracted from an Image. This tensor should be of shape `(batch_size, num_channels, height, width)`.
"""

PIX2STRUCT_INPUTS_DOCSTRING = r"""
    Args:
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values.
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Mask to avoid performing attention on padding pixel values.
"""

DECODER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor`):
            Indices of decoder input sequence tokens in the vocabulary of shape `(batch_size, decoder_sequence_length)`.
        encoder_hidden_states (`torch.FloatTensor`):
            The encoder `last_hidden_state` of shape `(batch_size, encoder_sequence_length, hidden_size)`.
        encoder_attention_mask (`torch.LongTensor`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder `input_ids`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, defaults to `None`):
            Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding.
            The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape
            `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
"""

SEQ2SEQ_ONNX_MODEL_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor`):
            Indices of input sequence tokens in the vocabulary of shape `(batch_size, encoder_sequence_length)`.
        attention_mask (`torch.LongTensor`):
            Mask to avoid performing attention on padding token indices, of shape
            `(batch_size, encoder_sequence_length)`. Mask values selected in `[0, 1]`.
        decoder_input_ids (`torch.LongTensor`):
            Indices of decoder input sequence tokens in the vocabulary of shape `(batch_size, decoder_sequence_length)`.
        encoder_outputs (`torch.FloatTensor`):
            The encoder `last_hidden_state` of shape `(batch_size, encoder_sequence_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, defaults to `None`):
            Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding.
            The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape
            `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
"""

SPEECH_SEQ2SEQ_ONNX_MODEL_DOCSTRING = r"""
    Args:
        input_features (`torch.FloatTensor`):
            Mel features extracted from the raw speech waveform, of shape
            `(batch_size, feature_size, encoder_sequence_length)`.
        decoder_input_ids (`torch.LongTensor`):
            Indices of decoder input sequence tokens in the vocabulary of shape `(batch_size, decoder_sequence_length)`.
        encoder_outputs (`torch.FloatTensor`):
            The encoder `last_hidden_state` of shape `(batch_size, encoder_sequence_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, defaults to `None`):
            Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding.
            The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape
            `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
"""

VISION_ENCODER_DECODER_SEQ2SEQ_ONNX_MODEL_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor`):
            Features extracted from an Image. This tensor should be of shape
            `(batch_size, num_channels, height, width)`.
        decoder_input_ids (`torch.LongTensor`):
            Indices of decoder input sequence tokens in the vocabulary of shape `(batch_size, decoder_sequence_length)`.
        encoder_outputs (`torch.FloatTensor`):
            The encoder `last_hidden_state` of shape `(batch_size, encoder_sequence_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, defaults to `None`):
            Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding.
            The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape
            `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
"""

PIX2STRUCT_ONNX_MODEL_DOCSTRING = r"""
    Args:
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. The `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`
            The process of flattening the pixel patches is done by `Pix2StructProcessor`.
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.
            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, defaults to `None`):
            Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding.
            The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape
            `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
"""

_TOKENIZER_FOR_DOC = "AutoTokenizer"

_PROCESSOR_FOR_DOC = "AutoProcessor"

_IMAGE_PROCESSER_FOR_DOC = "AutoImageProcessor"

TRANSLATION_EXAMPLE = r"""
    Example of text generation:

    ```python
    >>> from transformers import {processor_class}
    >>> from optimum.onnxruntime import {model_class}

    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}")

    >>> inputs = tokenizer("My name is Eustache and I like to", return_tensors="pt")

    >>> gen_tokens = model.generate(**inputs)
    >>> outputs = tokenizer.batch_decode(gen_tokens)
    ```

    Example using `transformers.pipeline`:

    ```python
    >>> from transformers import {processor_class}, pipeline
    >>> from optimum.onnxruntime import {model_class}

    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}")
    >>> onnx_translation = pipeline("translation_en_to_de", model=model, tokenizer=tokenizer)

    >>> text = "My name is Eustache."
    >>> pred = onnx_translation(text)
    ```
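
    Example of batch translation (a minimal sketch under the same assumptions as above; padding
    requires the tokenizer to define a pad token, which the supported checkpoints do):

    ```python
    >>> from transformers import {processor_class}
    >>> from optimum.onnxruntime import {model_class}

    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}")

    >>> texts = ["My name is Eustache.", "I live in Paris."]
    >>> inputs = tokenizer(texts, padding=True, return_tensors="pt")
    >>> gen_tokens = model.generate(**inputs)
    >>> outputs = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
    ```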
"""

AUTOMATIC_SPEECH_RECOGNITION_EXAMPLE = r"""
    Example of text generation:

    ```python
    >>> from transformers import {processor_class}
    >>> from optimum.onnxruntime import {model_class}
    >>> from datasets import load_dataset

    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}")

    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    >>> inputs = processor.feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")

    >>> gen_tokens = model.generate(inputs=inputs.input_features)
    >>> outputs = processor.tokenizer.batch_decode(gen_tokens)
    ```

    Example using `transformers.pipeline`:

    ```python
    >>> from transformers import {processor_class}, pipeline
    >>> from optimum.onnxruntime import {model_class}
    >>> from datasets import load_dataset

    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}")
    >>> speech_recognition = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)

    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    >>> pred = speech_recognition(ds[0]["audio"]["array"])
    ```
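
    Example running the model on GPU with IO binding (a sketch; it assumes `onnxruntime-gpu` is
    installed and a CUDA device is available):

    ```python
    >>> from transformers import {processor_class}
    >>> from optimum.onnxruntime import {model_class}
    >>> from datasets import load_dataset

    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}", provider="CUDAExecutionProvider", use_io_binding=True)

    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    >>> inputs = processor.feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").to("cuda")
    >>> gen_tokens = model.generate(inputs=inputs.input_features)
    >>> outputs = processor.tokenizer.batch_decode(gen_tokens)
    ```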
"""

IMAGE_TO_TEXT_EXAMPLE = r"""
    Example of text generation:

    ```python
    >>> from transformers import {processor_class}, {tokenizer_class}
    >>> from optimum.onnxruntime import {model_class}
    >>> from PIL import Image
    >>> import requests


    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
    >>> tokenizer = {tokenizer_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}", export=True)

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)
    >>> inputs = processor(image, return_tensors="pt")

    >>> gen_tokens = model.generate(**inputs)
    >>> outputs = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)

    ```

    Example using `transformers.pipeline`:

    ```python
    >>> from transformers import {processor_class}, {tokenizer_class}, pipeline
    >>> from optimum.onnxruntime import {model_class}
    >>> from PIL import Image
    >>> import requests


    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
    >>> tokenizer = {tokenizer_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}", export=True)

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> image_to_text = pipeline("image-to-text", model=model, tokenizer=tokenizer, feature_extractor=processor, image_processor=processor)
    >>> pred = image_to_text(image)
    ```
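
    Example of batch captioning (a sketch reusing the processor, tokenizer and image loading shown above):

    ```python
    >>> from transformers import {processor_class}, {tokenizer_class}
    >>> from optimum.onnxruntime import {model_class}
    >>> from PIL import Image
    >>> import requests

    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
    >>> tokenizer = {tokenizer_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}", export=True)

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)
    >>> inputs = processor([image, image], return_tensors="pt")

    >>> gen_tokens = model.generate(**inputs)
    >>> captions = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
    ```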
"""

PIX2STRUCT_EXAMPLE = r"""
    Example of pix2struct:

    ```python
    >>> from transformers import {processor_class}
    >>> from optimum.onnxruntime import {model_class}
    >>> from PIL import Image
    >>> import requests

    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
    >>> model = {model_class}.from_pretrained("{checkpoint}", export=True, use_io_binding=True)

    >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)
    >>> question = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"
    >>> inputs = processor(images=image, text=question, return_tensors="pt")

    >>> gen_tokens = model.generate(**inputs)
    >>> outputs = processor.batch_decode(gen_tokens, skip_special_tokens=True)
    ```
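
    Example with a batch of images (a sketch; it reuses the `processor`, `model`, `image` and
    `question` defined in the example above):

    ```python
    >>> inputs = processor(images=[image, image], text=[question, question], return_tensors="pt")
    >>> gen_tokens = model.generate(**inputs)
    >>> outputs = processor.batch_decode(gen_tokens, skip_special_tokens=True)
    ```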
"""


class ORTEncoder(ORTSessionMixin):
    """
    Encoder of an encoder-decoder model for ONNX Runtime inference.
    """

    main_input_name = "input_ids"

    def __init__(
        self,
        session: InferenceSession,
        parent_model: "ORTModelForConditionalGeneration",
        use_io_binding: Optional[bool] = None,
    ):
        self.initialize_ort_attributes(session, use_io_binding)
        config = parent_model.config.encoder if hasattr(parent_model.config, "encoder") else parent_model.config
        self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)

    def forward(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, **kwargs) -> BaseModelOutput:
        use_torch = isinstance(input_ids, torch.Tensor)
        self.raise_on_numpy_input_io_binding(use_torch)

        model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}

        if self.use_io_binding:
            output_shapes, output_buffers = self._prepare_io_binding(model_inputs)

            if self.device.type == "cpu":
                self.session.run_with_iobinding(self._io_binding)
            else:
                self._io_binding.synchronize_inputs()
                self.session.run_with_iobinding(self._io_binding)
                self._io_binding.synchronize_outputs()

            last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"])
        else:
            onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs)
            onnx_outputs = self.session.run(None, onnx_inputs)
            model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs)
            last_hidden_state = model_outputs["last_hidden_state"]

        return BaseModelOutput(last_hidden_state=last_hidden_state)


class ORTDecoderForSeq2Seq(ORTSessionMixin):
    """
    Decoder of an encoder-decoder model for ONNX Runtime inference.
    """

    main_input_name = "input_ids"

    def __init__(
        self,
        session: InferenceSession,
        parent_model: "ORTModelForConditionalGeneration",
        use_io_binding: Optional[bool] = None,
    ):
        self.initialize_ort_attributes(session, use_io_binding)
        config = parent_model.config.decoder if hasattr(parent_model.config, "decoder") else parent_model.config
        self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
        self.use_merged = parent_model.use_merged
        self.use_cache = parent_model.use_cache

        # Decoders such as GPT-2 inside a vision-encoder-decoder model do not cache the
        # cross-attention key/values, so they only handle 2 past tensors per layer.
        self.no_cross_attention_cache = (
            parent_model.config.model_type == "vision-encoder-decoder"
            and parent_model.config.decoder.model_type == "gpt2"
        )

        self.key_value_input_names = [key for key in self.input_names if ".key" in key or ".value" in key]
        self.key_value_output_names = [key for key in self.output_names if ".key" in key or ".value" in key]

        # Legacy exports name the past key values `past_key_values_{idx}` instead of `*.key` / `*.value`
        if len(self.key_value_input_names) == 0:
            self.key_value_input_names = [key for key in self.input_names if "key_values" in key]
        if len(self.key_value_output_names) == 0:
            self.key_value_output_names = [key for key in self.output_names if "key_values" in key]

        if self.use_cache is True and len(self.key_value_output_names) == 0:
            raise RuntimeError("Could not find the past key values in the provided model.")

        self.use_past_in_outputs = len(self.key_value_output_names) > 0
        self.use_past_in_inputs = len(self.key_value_input_names) > 0

        if (not self.use_past_in_inputs and not self.use_past_in_outputs) or self.no_cross_attention_cache:
            self.num_pkv = 2
        else:
            # A decoder with past caches both the self-attention and the cross-attention
            # key/values, hence 4 tensors per layer.
            self.num_pkv = 4

        self.past_key_values_cross_attention_output_names = set()
        for output_name in self.key_value_output_names:
            if output_name.startswith("present") and "encoder" in output_name:
                self.past_key_values_cross_attention_output_names.add(output_name)

        self.use_legacy_outputs = (
            self.use_merged is False and len(self.past_key_values_cross_attention_output_names) > 0
        )

    def compute_past_key_values_output_shapes(
        self,
        input_ids: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        use_cache_branch: Optional[bool],
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
    ) -> Dict[str, Tuple[int, ...]]:
        batch_size = input_ids.size(0)
        num_attention_heads = self.normalized_config.num_attention_heads
        embed_size_per_head = self.normalized_config.hidden_size // num_attention_heads

        sequence_length = input_ids.size(1)
        encoder_sequence_length = encoder_hidden_states.size(1)
        if past_key_values is not None and use_cache_branch is not False:
            # use_cache_branch is either None (non-merged decoder) or True (merged decoder)
            sequence_length += past_key_values[0].size(2)

        self_attn_shape = (batch_size, num_attention_heads, sequence_length, embed_size_per_head)

        if past_key_values is not None and use_cache_branch is True:
            cross_attn_shape = (0, num_attention_heads, 1, embed_size_per_head)
        else:
            cross_attn_shape = (batch_size, num_attention_heads, encoder_sequence_length, embed_size_per_head)

        past_key_values_shapes = {}
        for idx, name in enumerate(self.key_value_output_names):
            is_self_attn = idx % 4 < 2
            # A decoder running the cache branch does not output cross-attention key/values, as they are constant
            past_key_values_shapes[name] = self_attn_shape if (is_self_attn or self.num_pkv == 2) else cross_attn_shape
        return past_key_values_shapes

    def get_outputs_not_to_bind(self, use_merged_cache: bool) -> Set[str]:
        result = {
            output_name
            for output_name in self.output_names
            if not output_name.startswith("present") and output_name not in {"loss", "logits"}
        }
        if use_merged_cache is True:
            result = result.union(self.past_key_values_cross_attention_output_names)
        return result

    def forward(
        self,
        input_ids: torch.LongTensor,
        encoder_hidden_states: torch.FloatTensor,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Seq2SeqLMOutput:
        use_torch = isinstance(input_ids, torch.Tensor)
        self.raise_on_numpy_input_io_binding(use_torch)

        # Flatten the past_key_values to a plain tuple of tensors
        if past_key_values is not None:
            past_key_values = tuple(
                past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer
            )

        # no-ops if the decoder is not a merged one
        use_merged_no_cache = past_key_values is None and self.use_merged
        use_merged_cache = past_key_values is not None and self.use_merged
        use_cache_branch_tensor, past_key_values, cache_position = self.prepare_inputs_for_merged(
            input_ids, past_key_values, cache_position, use_torch=use_torch
        )

        model_inputs = {
            "input_ids": input_ids,
            "encoder_hidden_states": encoder_hidden_states,
            "decoder_attention_mask": decoder_attention_mask,
            "encoder_attention_mask": encoder_attention_mask,
            "use_cache_branch": use_cache_branch_tensor,
            "cache_position": cache_position,
        }
        if past_key_values is not None:
            model_inputs.update(zip(self.key_value_input_names, past_key_values))

        if self.use_io_binding:
            known_output_shapes = self.compute_past_key_values_output_shapes(
                input_ids,
                encoder_hidden_states,
                use_cache_branch=use_cache_branch_tensor.item() if use_cache_branch_tensor is not None else None,
                past_key_values=past_key_values,
            )
            outputs_to_not_bind = self.get_outputs_not_to_bind(use_merged_cache)

            output_shapes, output_buffers = self._prepare_io_binding(
                model_inputs,
                outputs_to_not_bind=outputs_to_not_bind,
                known_output_shapes=known_output_shapes,
            )

            if self.device.type == "cpu":
                self.session.run_with_iobinding(self._io_binding)
            else:
                self._io_binding.synchronize_inputs()
                self.session.run_with_iobinding(self._io_binding)
                self._io_binding.synchronize_outputs()

            # Set -1 for the sequence length, as it could be larger than the real one at binding time
            for name, shape in output_shapes.items():
                if name in self.key_value_output_names:
                    output_shapes[name] = shape[:2] + (-1,) + shape[3:]

            out_past_key_values = ()
            for name in self.key_value_output_names:
                # The cross-attention key/values of a merged decoder running the cache branch were not bound
                if name in self.past_key_values_cross_attention_output_names and use_merged_cache:
                    continue
                out_past_key_values += (output_buffers[name].view(output_shapes[name]),)

            logits = output_buffers["logits"].view(output_shapes["logits"])

            loss = None
            if "loss" in self.output_names:
                loss = output_buffers["loss"].view(output_shapes["loss"])

            if not self.use_past_in_outputs:
                out_past_key_values = None
            elif self.use_legacy_outputs is True:
                msg = (
                    "For the decoder with past, using ONNX models outputting cross attention past key values"
                    " is deprecated and the support will be removed in optimum 2.0. We recommend exporting again"
                    " the model with optimum>=1.7.3."
                )
                warn_once(logger, msg=msg)
                out_past_key_values = tuple(
                    out_past_key_values[i : i + self.num_pkv]
                    for i in range(0, len(out_past_key_values), self.num_pkv)
                )
            elif self.num_pkv == 2:
                # Group per layer: self-attention key/values from the outputs, cross-attention
                # key/values passed through from the inputs
                out_past_key_values = tuple(
                    out_past_key_values[i : i + self.num_pkv]
                    + past_key_values[2 * i + 2 : 2 * i + 2 + self.num_pkv]
                    for i in range(0, len(out_past_key_values), self.num_pkv)
                )
            elif self.num_pkv == 4:
                # Despite num_pkv being 4, the cache branch does not output cross-attention
                # key/values, as they are constant across decoding steps
                out_past_key_values = tuple(
                    out_past_key_values[i : i + 2] + past_key_values[2 * i + 2 : 2 * i + 4]
                    for i in range(0, len(out_past_key_values), 2)
                )
            else:
                raise ValueError("Unsupported num_pkv")
        else:
            onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs)
            onnx_outputs = self.session.run(None, onnx_inputs)
            model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs)

            out_past_key_values = tuple(model_outputs[output_name] for output_name in self.key_value_output_names)

            logits = model_outputs["logits"]
            loss = model_outputs.get("loss", None)

            if not self.use_past_in_outputs:
                out_past_key_values = None
            elif self.use_legacy_outputs is True:
                msg = (
                    "For the decoder with past, using ONNX models outputting cross attention past key values"
                    " is deprecated and the support will be removed in optimum 2.0. We recommend exporting again"
                    " the model with optimum>=1.7.3."
                )
                warn_once(logger, msg=msg)
                out_past_key_values = tuple(
                    out_past_key_values[i : i + self.num_pkv]
                    for i in range(0, len(out_past_key_values), self.num_pkv)
                )
            elif self.num_pkv == 2:
                out_past_key_values = tuple(
                    out_past_key_values[i : i + self.num_pkv]
                    + past_key_values[2 * i + 2 : 2 * i + 2 + self.num_pkv]
                    for i in range(0, len(out_past_key_values), self.num_pkv)
                )
            elif self.num_pkv == 4:
                out_past_key_values = tuple(
                    out_past_key_values[i : i + 2] + past_key_values[2 * i + 2 : 2 * i + 4]
                    for i in range(0, len(out_past_key_values), 2)
                )
            else:
                raise ValueError("Unsupported num_pkv")

        return Seq2SeqLMOutput(loss=loss, logits=logits, past_key_values=out_past_key_values)

    def prepare_inputs_for_merged(
        self,
        input_ids: Optional[Union[torch.LongTensor, np.ndarray]],
        past_key_values: Optional[Tuple[Union[torch.FloatTensor, np.ndarray]]],
        cache_position: Optional[Union[torch.Tensor, np.ndarray]],
        use_torch: bool,
    ):
        constructor = torch if use_torch is True else np
        float_dtype = getattr(constructor, str(self.dtype).split(".")[-1])

        if self.use_merged:
            # Select the without-past or with-past branch of a merged decoder depending on
            # whether real past key values were passed
            use_cache_branch = constructor.full((1,), past_key_values is not None)
            if use_torch and use_cache_branch is not None:
                use_cache_branch = use_cache_branch.to(self.device)
        else:
            use_cache_branch = None

        # Generate a dummy past for the first forward of a merged decoder
        if self.use_merged and past_key_values is None:
            batch_size = input_ids.shape[0]
            num_attention_heads = self.normalized_config.num_attention_heads
            embed_size_per_head = self.normalized_config.hidden_size // num_attention_heads
            shape = (batch_size, num_attention_heads, 1, embed_size_per_head)
            key_or_value = constructor.zeros(shape, dtype=float_dtype)
            if use_torch is True:
                key_or_value = key_or_value.to(self.device)
            past_key_values = tuple(key_or_value for _ in range(len(self.key_value_input_names)))

        # Generate a dummy cache position for the first forward of a merged decoder
        if self.use_merged and cache_position is None:
            cache_position = constructor.zeros((1,), dtype=constructor.int64)
            if use_torch is True:
                cache_position = cache_position.to(self.device)

        return use_cache_branch, past_key_values, cache_position

class ORTEncoderForSpeech(ORTEncoder):
    """
    Encoder model for ONNX Runtime inference for Whisper model.

    Args:
        session (`InferenceSession`):
            The ONNX Runtime inference session associated to the encoder.
    """

    main_input_name = "input_features"

    @add_start_docstrings_to_model_forward(SPEECH_ENCODER_INPUTS_DOCSTRING)
    def forward(
        self, input_features: torch.FloatTensor, attention_mask: torch.LongTensor, **kwargs
    ) -> BaseModelOutput:
        use_torch = isinstance(input_features, torch.Tensor)
        self.raise_on_numpy_input_io_binding(use_torch)

        model_inputs = {"input_features": input_features, "attention_mask": attention_mask}

        if self.use_io_binding:
            output_shapes, output_buffers = self._prepare_io_binding(model_inputs)

            if self.device.type == "cpu":
                self.session.run_with_iobinding(self._io_binding)
            else:
                self._io_binding.synchronize_inputs()
                self.session.run_with_iobinding(self._io_binding)
                self._io_binding.synchronize_outputs()

            last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"])
        else:
            onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs)
            onnx_outputs = self.session.run(None, onnx_inputs)
            model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs)
            last_hidden_state = model_outputs["last_hidden_state"]

        return BaseModelOutput(last_hidden_state=last_hidden_state)

class ORTEncoderForVisionEncoderDecoder(ORTEncoder):
    """
    Encoder model for ONNX Runtime inference for VisionEncoderDecoder models.

    Args:
        session (`InferenceSession`):
            The ONNX Runtime inference session associated to the encoder.
    """

    main_input_name = "pixel_values"

    @add_start_docstrings_to_model_forward(VISION_ENCODER_INPUTS_DOCSTRING)
    def forward(self, pixel_values: torch.FloatTensor, **kwargs) -> BaseModelOutput:
        use_torch = isinstance(pixel_values, torch.Tensor)
        self.raise_on_numpy_input_io_binding(use_torch)

        model_inputs = {"pixel_values": pixel_values}

        if self.use_io_binding:
            output_shapes, output_buffers = self._prepare_io_binding(model_inputs)

            if self.device.type == "cpu":
                self.session.run_with_iobinding(self._io_binding)
            else:
                self._io_binding.synchronize_inputs()
                self.session.run_with_iobinding(self._io_binding)
                self._io_binding.synchronize_outputs()

            last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"])
        else:
            onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs)
            onnx_outputs = self.session.run(None, onnx_inputs)
            model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs)
            last_hidden_state = model_outputs["last_hidden_state"]

        return BaseModelOutput(last_hidden_state=last_hidden_state)

class ORTEncoderForPix2Struct(ORTEncoder):
    """
    Encoder model for ONNX Runtime inference for Pix2Struct.

    Args:
        session (`InferenceSession`):
            The ONNX Runtime inference session associated to the encoder.
    """

    main_input_name = "flattened_patches"

    @add_start_docstrings_to_model_forward(PIX2STRUCT_INPUTS_DOCSTRING)
    def forward(
        self, flattened_patches: torch.FloatTensor, attention_mask: torch.LongTensor, **kwargs
    ) -> BaseModelOutput:
        use_torch = isinstance(flattened_patches, torch.Tensor)
        self.raise_on_numpy_input_io_binding(use_torch)

        model_inputs = {"flattened_patches": flattened_patches, "attention_mask": attention_mask}

        if self.use_io_binding:
            output_shapes, output_buffers = self._prepare_io_binding(model_inputs)

            if self.device.type == "cpu":
                self.session.run_with_iobinding(self._io_binding)
            else:
                self._io_binding.synchronize_inputs()
                self.session.run_with_iobinding(self._io_binding)
                self._io_binding.synchronize_outputs()

            last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"])
        else:
            onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs)
            onnx_outputs = self.session.run(None, onnx_inputs)
            model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs)
            last_hidden_state = model_outputs["last_hidden_state"]

        return BaseModelOutput(last_hidden_state=last_hidden_state)


class ORTModelForConditionalGeneration(ORTParentMixin, ORTModel):
    """
    Sequence-to-sequence model with a language modeling head for ONNX Runtime inference.

    Important attributes:
        config ([`PretrainedConfig`]):
            Instance of the configuration associated to the model. Initializing with a config file does
            not load the weights associated with the model, only the configuration.
        use_io_binding (`Optional[bool]`, defaults to `None`):
            Whether use IOBinding during inference to avoid memory copy between the host and devices. Defaults to `True`
            if the device is CUDA, otherwise defaults to `False`.
        use_cache (`bool`):
            Whether or not past key/values cache should be used. It is determined by whether an InferenceSession for
            that was provided or not.
        providers (`List[str`]):
            The list of execution providers the model is running on.
        encoder (`ORTEncoder`):
            The encoder model.
        decoder (`ORTDecoderForSeq2Seq`):
            The decoder model.
        decoder_with_past (`Optional[ORTDecoderForSeq2Seq]`):
            The decoder model handling the past key/values if `use_cache=True`, else `None`.

    Other attributes:
        encoder_file_name (`str`, defaults to `optimum.onnxruntime.utils.ONNX_ENCODER_NAME`):
            The name of the ONNX file containing the encoder part of the model.
        decoder_file_name (`str`,  defaults to `optimum.onnxruntime.utils.ONNX_DECODER_NAME`):
            The name of the ONNX file containing the decoder part of the model.
        decoder_file_with_past_name (`str`, defaults to `optimum.onnxruntime.utils.ONNX_DECODER_WITH_PAST_NAME`):
            The name of the ONNX file containing the decoder with past key/values part of the model.
        model_save_dir (`str`, defaults to `""`):
            The directory under which the model exported to ONNX was saved.
    """

    _supports_cache_class = False

    _ort_encoder_class = ORTEncoder
    _ort_decoder_class = ORTDecoderForSeq2Seq

    def __init__(
        self,
        *args,
        config: "PretrainedConfig" = None,
        encoder_session: InferenceSession = None,
        decoder_session: InferenceSession = None,
        decoder_with_past_session: Optional[InferenceSession] = None,
        use_io_binding: Optional[bool] = None,
        generation_config: Optional[GenerationConfig] = None,
        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
        **kwargs,
    ):
        """
        Args:
            config ([`PretrainedConfig`]):
                `config` is an instance of the configuration associated to the model. Initializing with a config file
                does not load the weights associated with the model, only the configuration.
            encoder_session (`InferenceSession`):
                The ONNX Runtime inference session associated to the encoder.
            decoder_session (`InferenceSession`):
                The ONNX Runtime inference session associated to the decoder.
            decoder_with_past_session (`Optional[InferenceSession]`, *optional*, defaults to `None`):
                The ONNX Runtime inference session associated to the decoder with past key values.
            use_io_binding (``Optional[bool]`, *optional*, defaults to `None`):
                Whether use IOBinding during inference to avoid memory copy between the host and devices. Defaults to
                `True` if the device is CUDA, otherwise defaults to `False`.
            generation_config (`Optional[GenerationConfig]`, *optional*, defaults to `None`):
                The generation configuration used by default when calling `generate()`.
                Refer to https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate.
            model_save_dir (``Optional[Union[str, Path, TemporaryDirectory]]`, *optional*, defaults to `None`):
                The directory under which the model exported to ONNX was saved.
        """
        if args:
            logger.warning(
                "Instantiating an ORTModelForConditionalGeneration with positional arguments is deprecated and will "
                "be removed in the next version. Please use the keyword arguments {config, encoder_session, "
                "decoder_session, decoder_with_past_session, use_cache, use_io_binding, model_save_dir} instead."
            )
            config = args[0]
            if len(args) > 1:
                encoder_session = args[1]
            if len(args) > 2:
                decoder_session = args[2]
            if len(args) > 3:
                decoder_with_past_session = args[3]
            if len(args) > 4:
                use_io_binding = args[4]
            if len(args) > 5:
                generation_config = args[5]
            if len(args) > 6:
                model_save_dir = args[6]

        use_cache = kwargs.pop("use_cache", None)

        if kwargs:
            logger.warning(
                f"Some keyword arguments were passed to the ORTModelForConditionalGeneration constructor that are "
                f"not part of its signature: {', '.join(kwargs.keys())}. These arguments will be ignored in the "
                "current version and will raise an error in the next version."
            )

        if config is None:
            raise ValueError("The parameter config is required. Please pass a config or use the from_pretrained method.")
        if encoder_session is None:
            raise ValueError(
                "The parameter encoder_session is required. Please pass an encoder_session or use the from_pretrained method."
            )
        if decoder_session is None:
            raise ValueError(
                "The parameter decoder_session is required. Please pass a decoder_session or use the from_pretrained method."
            )

        # A decoder session exposing a `use_cache_branch` input is a merged decoder
        use_merged = "use_cache_branch" in [inp.name for inp in decoder_session.get_inputs()]

        if use_merged is True and decoder_with_past_session is not None:
            raise ValueError(
                "Detected a merged decoder, but decoder_with_past_session was provided. "
                "Please only set decoder_session, or provide a non-merged decoder_session."
            )

        if use_cache is None:
            use_cache = decoder_with_past_session is not None or use_merged
        elif use_cache is True and decoder_with_past_session is None and use_merged is False:
            raise ValueError(
                "The parameter use_cache was set as True, but neither decoder_with_past_session was passed nor a "
                "use_cache branch can be found in the decoder_session. Please pass a decoder_with_past_session or "
                "set use_cache=False."
            )
        elif use_cache is False and decoder_with_past_session is not None:
            raise ValueError(
                "The parameter decoder_with_past_session was passed, although use_cache is False. "
                "Please pass use_cache=True for decoder_with_past_session to be used."
            )

        self.use_merged = use_merged
        self.use_cache = use_cache

        super().__init__(model=encoder_session, config=config)

        self.encoder = self._ort_encoder_class(encoder_session, self, use_io_binding=use_io_binding)
        self.decoder = self._ort_decoder_class(decoder_session, self, use_io_binding=use_io_binding)
        self.decoder_with_past = None
        if self.use_cache is True and self.use_merged is False:
            self.decoder_with_past = self._ort_decoder_class(
                decoder_with_past_session, self, use_io_binding=use_io_binding
            )

        self.initialize_ort_attributes(
            parts=[part for part in (self.encoder, self.decoder, self.decoder_with_past) if part is not None]
        )

        self.generation_config = generation_config or GenerationConfig.from_model_config(config)
        if is_transformers_version(">=", "4.44.99"):
            misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
            if len(misplaced_generation_parameters) > 0:
                logger.warning(
                    "Moving the following attributes in the config to the generation config: "
                    f"{misplaced_generation_parameters}. You are seeing this warning because you've set generation "
                    "parameters in the model config, as opposed to in the generation config.",
                )
                for param_name, param_value in misplaced_generation_parameters.items():
                    setattr(self.generation_config, param_name, param_value)
                    setattr(self.config, param_name, None)

        self._model_save_dir_tempdirectory_instance = None
        if model_save_dir is None:
            self.model_save_dir = Path(encoder_session._model_path).parent
        elif isinstance(model_save_dir, TemporaryDirectory):
            self._model_save_dir_tempdirectory_instance = model_save_dir
            self.model_save_dir = Path(model_save_dir.name)
        elif isinstance(model_save_dir, str):
            self.model_save_dir = Path(model_save_dir)
        else:
            self.model_save_dir = model_save_dir

        # Registers the ORTModelForXXX classes into the transformers AutoModel classes to avoid
        # warnings when creating a pipeline.
        AutoConfig.register(self.model_type, AutoConfig)
        if hasattr(self.auto_model_class, "register"):
            self.auto_model_class.register(AutoConfig, self.__class__)

    def _save_pretrained(self, save_directory: Union[str, Path]):
        """
        Saves the encoder, decoder and decoder_with_past ONNX files to the save directory.

        Args:
            save_directory (`Union[str, Path]`):
                The directory under which the models will be saved.
        """
        self.encoder.save_session(save_directory)
        self.decoder.save_session(save_directory)
        if self.decoder_with_past is not None:
            self.decoder_with_past.save_session(save_directory)

    def _save_config(self, save_directory):
        """
        Saves the model and generation configs to the save directory.

        Args:
            save_directory (`Union[str, Path]`):
                The directory under which the configs will be saved.
        """
        self.config.save_pretrained(save_directory)
        self.generation_config.save_pretrained(save_directory)

    @classmethod
    def _from_pretrained(
        cls,
        model_id: Union[str, Path],
        config: "PretrainedConfig",
        # hub options
        subfolder: str = "",
        revision: str = "main",
        force_download: bool = False,
        local_files_only: bool = False,
        trust_remote_code: bool = False,
        cache_dir: str = HUGGINGFACE_HUB_CACHE,
        token: Optional[Union[bool, str]] = None,
        # file options
        encoder_file_name: str = ONNX_ENCODER_NAME,
        decoder_file_name: str = ONNX_DECODER_NAME,
        decoder_with_past_file_name: str = ONNX_DECODER_WITH_PAST_NAME,
        # session options
        provider: str = "CPUExecutionProvider",
        providers: Optional[Sequence[str]] = None,
        provider_options: Optional[Union[Sequence[Dict[str, Any]], Dict[str, Any]]] = None,
        session_options: Optional[SessionOptions] = None,
        # inference options
        use_cache: bool = True,
        use_merged: Optional[bool] = None,
        use_io_binding: Optional[bool] = None,
        generation_config: Optional[GenerationConfig] = None,
        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
    ) -> "ORTModelForConditionalGeneration":
        if use_cache is False and use_merged is True:
            raise ValueError(
                "The parameters combination use_cache=False, use_merged=True is not supported. "
                "To use a merged decoder, past key values must be used."
            )
        if use_cache is False:
            use_merged = False

        model_path = Path(model_id)
        onnx_files = find_files_matching_pattern(
            model_id,
            ONNX_FILE_PATTERN,
            glob_pattern="**/*.onnx",
            subfolder=subfolder,
            token=token,
            revision=revision,
        )
        if len(onnx_files) == 0:
            raise FileNotFoundError(f"Could not find any ONNX model file in {model_path}")

        decoder_path = None
        decoder_with_past_path = None

        if use_merged is not False:
            merged_files = [p for p in onnx_files if re.search(DECODER_MERGED_ONNX_FILE_PATTERN, str(p))]
            use_merged = len(merged_files) != 0

        if use_merged:
            candidates = [file for file in merged_files if file.name == decoder_file_name]
            decoder_path = candidates[0] if candidates else merged_files[0]
        else:
            decoder_files = [p for p in onnx_files if re.search(DECODER_ONNX_FILE_PATTERN, str(p))]
            candidates = [file for file in decoder_files if file.name == decoder_file_name]
            decoder_path = candidates[0] if candidates else decoder_files[0]
            if use_cache:
                with_past_files = [p for p in onnx_files if re.search(DECODER_WITH_PAST_ONNX_FILE_PATTERN, str(p))]
                candidates = [file for file in with_past_files if file.name == decoder_with_past_file_name]
                decoder_with_past_path = candidates[0] if candidates else with_past_files[0]

        encoder_files = [p for p in onnx_files if re.search(ENCODER_ONNX_FILE_PATTERN, str(p))]
        candidates = [file for file in encoder_files if file.name == encoder_file_name]
        encoder_path = candidates[0] if candidates else encoder_files[0]

        if model_path.is_dir():
            new_model_save_dir = model_path
        else:
            attribute_name_to_filename = {
                "last_encoder_model_name": encoder_path.name,
                "last_decoder_model_name": decoder_path.name if not use_merged else None,
                "last_decoder_with_past_model_name": (
                    decoder_with_past_path.name if decoder_with_past_path is not None else None
                ),
                "last_decoder_merged_name": decoder_path.name if use_merged else None,
            }
            paths = {}
            for attr_name, filename in attribute_name_to_filename.items():
                if filename is None:
                    continue
                model_cache_path = cached_file(
                    model_id,
                    filename=filename,
                    subfolder=subfolder,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    local_files_only=local_files_only,
                    token=token,
                    revision=revision,
                )
                try:
                    # Download the external data of the model, if any (no-op otherwise)
                    cached_file(
                        model_id,
                        filename=filename + "_data",
                        subfolder=subfolder,
                        cache_dir=cache_dir,
                        force_download=force_download,
                        local_files_only=local_files_only,
                        token=token,
                        revision=revision,
                    )
                except EnvironmentError:
                    pass
                paths[attr_name] = Path(model_cache_path).name
            new_model_save_dir = Path(model_cache_path).parent

            encoder_path = new_model_save_dir / paths["last_encoder_model_name"]
            if use_merged:
                decoder_path = new_model_save_dir / paths["last_decoder_merged_name"]
            else:
                decoder_path = new_model_save_dir / paths["last_decoder_model_name"]
                if use_cache:
                    decoder_with_past_path = new_model_save_dir / paths["last_decoder_with_past_model_name"]

        providers, provider_options = prepare_providers_and_provider_options(
            provider=provider, providers=providers, provider_options=provider_options
        )
        encoder_session = InferenceSession(
            encoder_path,
            providers=providers,
            provider_options=provider_options,
            sess_options=session_options,
        )
        decoder_session = InferenceSession(
            decoder_path,
            providers=providers,
            provider_options=provider_options,
            sess_options=session_options,
        )
        decoder_with_past_session = None
        if use_cache is True and use_merged is False:
            decoder_with_past_session = InferenceSession(
                decoder_with_past_path,
                providers=providers,
                provider_options=provider_options,
                sess_options=session_options,
            )

        if model_save_dir is None:
            model_save_dir = new_model_save_dir

        if generation_config is None:
            try:
                generation_config = GenerationConfig.from_pretrained(
                    model_id,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    local_files_only=local_files_only,
                    token=token,
                    revision=revision,
                    subfolder=subfolder,
                )
            except OSError:
                logger.info(
                    "Generation config file not found, using a generation config created from the model config."
                )

        return cls(
            config=config,
            encoder_session=encoder_session,
            decoder_session=decoder_session,
            decoder_with_past_session=decoder_with_past_session,
            use_io_binding=use_io_binding,
            generation_config=generation_config,
            model_save_dir=model_save_dir,
        )

    @classmethod
    def _export(
        cls,
        model_id: Union[str, Path],
        config: "PretrainedConfig",
        # hub options
        subfolder: str = "",
        revision: str = "main",
        force_download: bool = False,
        local_files_only: bool = False,
        trust_remote_code: bool = False,
        cache_dir: str = HUGGINGFACE_HUB_CACHE,
        token: Optional[Union[bool, str]] = None,
        use_cache: bool = True,
        use_merged: bool = False,
        **kwargs,
    ) -> "ORTModelForConditionalGeneration":
        task = TasksManager._infer_task_from_model_or_model_class(model_class=cls.auto_model_class)
        if use_cache:
            task += "-with-past"

        if kwargs.get("task", None) is not None:
            raise ValueError(
                f"The `task` argument is not needed when exporting a model with `{cls.__name__}`. "
                f"The `task` is automatically inferred from the class as `{task}`."
            )

        if use_cache is False and use_merged is True:
            raise ValueError(
                "The incompatible arguments use_cache=False, use_merged=True were passed to "
                "ORTModelForConditionalGeneration.from_pretrained(). Please pass either use_cache=False, "
                "use_merged=False to disable past key value caching, or use_cache=True, use_merged=False to "
                "disable the merging of the decoder not using / using past key and value."
            )

        model_save_dir = TemporaryDirectory()
        model_save_path = Path(model_save_dir.name)

        main_export(
            model_name_or_path=model_id,
            output=model_save_path,
            task=task,
            do_validation=False,
            no_post_process=not use_merged,
            subfolder=subfolder,
            revision=revision,
            cache_dir=cache_dir,
            token=token,
            local_files_only=local_files_only,
            force_download=force_download,
            trust_remote_code=trust_remote_code,
            library_name=cls._library_name,
        )
        maybe_save_preprocessors(model_id, model_save_path, src_subfolder=subfolder)

        return cls._from_pretrained(
            model_save_path,
            config,
            use_cache=use_cache,
            use_merged=use_merged,
            model_save_dir=model_save_dir,
            **kwargs,
        )


@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForSeq2SeqLM(ORTModelForConditionalGeneration, GenerationMixin):
    """
    Sequence-to-sequence model with a language modeling head for ONNX Runtime inference. This class officially supports bart, blenderbot, blenderbot-small, longt5, m2m_100, marian, mbart, mt5, pegasus, t5.
    """

    auto_model_class = AutoModelForSeq2SeqLM
    main_input_name = "input_ids"

    @add_start_docstrings_to_model_forward(
        SEQ2SEQ_ONNX_MODEL_DOCSTRING
        + TRANSLATION_EXAMPLE.format(
            processor_class=_TOKENIZER_FOR_DOC,
            model_class="ORTModelForSeq2SeqLM",
            checkpoint="optimum/t5-small",
        )
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        **kwargs,
    ) -> Seq2SeqLMOutput:
        if encoder_outputs is None:
            encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        if past_key_values is None or self.use_cache is False or self.use_merged is True:
            decoder = self.decoder
        else:
            decoder = self.decoder_with_past
        decoder_outputs = decoder(
            input_ids=decoder_input_ids,
            past_key_values=past_key_values,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            encoder_attention_mask=attention_mask,
        )

        return Seq2SeqLMOutput(
            loss=decoder_outputs.get("loss", None),
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ) -> Dict:
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1
            input_ids = input_ids[:, remove_prefix_length:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    def get_encoder(self) -> ORTEncoder:
        return self.encoder

    @staticmethod
    def _reorder_cache(past, beam_idx) -> Tuple[Tuple[torch.FloatTensor]]:
        reordered_past = ()
        for layer_past in past:
            # The cached cross-attention states do not have to be reordered; they are always the same
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
            )
        return reordered_past


@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForSpeechSeq2Seq(ORTModelForConditionalGeneration, GenerationMixin):
    """
    Speech Sequence-to-sequence model with a language modeling head for ONNX Runtime inference. This class officially supports whisper, speech_to_text.
    """

    auto_model_class = AutoModelForSpeechSeq2Seq
    main_input_name = "input_features"

    _ort_encoder_class = ORTEncoderForSpeech

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Following a breaking change in transformers that relies directly on the mapping name and
        # not on the greedy model mapping (which can be extended), we hardcode the ORT model in this
        # dictionary so that "automatic-speech-recognition" pipelines keep working.
        MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES["ort_speechseq2seq"] = self.__class__.__name__

    def get_encoder(self) -> ORTEncoder:
        return self.encoder

    @add_start_docstrings_to_model_forward(
        SPEECH_SEQ2SEQ_ONNX_MODEL_DOCSTRING
        + AUTOMATIC_SPEECH_RECOGNITION_EXAMPLE.format(
            processor_class=_PROCESSOR_FOR_DOC,
            model_class="ORTModelForSpeechSeq2Seq",
            checkpoint="optimum/whisper-tiny.en",
        )
    )
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Seq2SeqLMOutput:
        if encoder_outputs is None:
            encoder_outputs = self.encoder(input_features=input_features, attention_mask=attention_mask)

        if past_key_values is None or self.use_cache is False or self.use_merged is True:
            decoder = self.decoder
        else:
            decoder = self.decoder_with_past
        decoder_outputs = decoder(
            input_ids=decoder_input_ids,
            past_key_values=past_key_values,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            cache_position=cache_position,
        )

        return Seq2SeqLMOutput(
            loss=decoder_outputs.get("loss", None),
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ) -> Dict:
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past, beam_idx) -> Tuple[Tuple[torch.FloatTensor]]:
        reordered_past = ()
        for layer_past in past:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
            )
        return reordered_past

    @classmethod
    def _from_pretrained(cls, model_id: Union[str, Path], config: "PretrainedConfig", **kwargs):
        if config.model_type == "whisper":
            return _ORTModelForWhisper._from_pretrained(model_id, config, **kwargs)
        return super()._from_pretrained(model_id, config, **kwargs)


class _ORTModelForWhisper(ORTModelForSpeechSeq2Seq, WhisperForConditionalGeneration):
    """
    Whisper implements its own generate() method.
    """

    auto_model_class = WhisperForConditionalGeneration

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = DummyWhisperModel()

    # Adapted from WhisperForConditionalGeneration, which expects to be called unbound
    def generate(*args, **kwargs):
        return WhisperForConditionalGeneration.generate(*args, **kwargs)

    def prepare_inputs_for_generation(*args, **kwargs):
        return WhisperForConditionalGeneration.prepare_inputs_for_generation(*args, **kwargs)

    @classmethod
    def _from_pretrained(cls, model_id: Union[str, Path], config: "PretrainedConfig", **kwargs):
        return super(ORTModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs)


@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForVision2Seq(ORTModelForConditionalGeneration, GenerationMixin):
    """
    VisionEncoderDecoder Sequence-to-sequence model with a language modeling head for ONNX Runtime inference. This class officially supports trocr and vision-encoder-decoder.
    """

    auto_model_class = AutoModelForVision2Seq
    main_input_name = "pixel_values"

    _ort_encoder_class = ORTEncoderForVisionEncoderDecoder

    @add_start_docstrings_to_model_forward(
        VISION_ENCODER_DECODER_SEQ2SEQ_ONNX_MODEL_DOCSTRING
        + IMAGE_TO_TEXT_EXAMPLE.format(
            processor_class=_IMAGE_PROCESSER_FOR_DOC,
            tokenizer_class=_TOKENIZER_FOR_DOC,
            model_class="ORTModelForVision2Seq",
            checkpoint="nlpconnect/vit-gpt2-image-captioning",
        )
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        **kwargs,
    ) -> Seq2SeqLMOutput:
        if encoder_outputs is None:
            encoder_outputs = self.encoder(pixel_values=pixel_values)

        if past_key_values is None or self.use_cache is False or self.use_merged is True:
            decoder = self.decoder
        else:
            decoder = self.decoder_with_past
        decoder_outputs = decoder(
            input_ids=decoder_input_ids,
            past_key_values=past_key_values,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
        )

        return Seq2SeqLMOutput(
            loss=decoder_outputs.loss,
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ) -> Dict:
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                remove_prefix_length = input_ids.shape[1] - 1
            input_ids = input_ids[:, remove_prefix_length:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    def get_encoder(self) -> ORTEncoder:
        return self.encoder

    @staticmethod
    def _reorder_cache(past, beam_idx) -> Tuple[Tuple[torch.FloatTensor]]:
        reordered_past = ()
        for layer_past in past:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
            )
        return reordered_past


@add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
class ORTModelForPix2Struct(ORTModelForConditionalGeneration, GenerationMixin):
    """
    Pix2struct model with a language modeling head for ONNX Runtime inference. This class officially supports pix2struct.
    """

    auto_model_class = Pix2StructForConditionalGeneration
    main_input_name = "flattened_patches"

    _ort_encoder_class = ORTEncoderForPix2Struct

    @add_start_docstrings_to_model_forward(
        PIX2STRUCT_ONNX_MODEL_DOCSTRING
        + PIX2STRUCT_EXAMPLE.format(
            processor_class=_PROCESSOR_FOR_DOC,
            model_class="ORTModelForPix2Struct",
            checkpoint="google/pix2struct-ai2d-base",
        )
    )
    def forward(
        self,
        flattened_patches: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        **kwargs,
    ) -> Seq2SeqLMOutput:
        if encoder_outputs is None:
            encoder_outputs = self.encoder(flattened_patches=flattened_patches, attention_mask=attention_mask)

        if past_key_values is None or self.use_cache is False or self.use_merged is True:
            decoder = self.decoder
        else:
            decoder = self.decoder_with_past
        decoder_outputs = decoder(
            input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            past_key_values=past_key_values,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            encoder_attention_mask=attention_mask,
        )

        return Seq2SeqLMOutput(
            loss=decoder_outputs.loss,
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        flattened_patches=None,
        attention_mask=None,
        decoder_attention_mask=None,
        past_key_values=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ) -> Dict:
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                remove_prefix_length = input_ids.shape[1] - 1
            input_ids = input_ids[:, remove_prefix_length:]

        if decoder_attention_mask is None:
            decoder_attention_mask = torch.ones_like(input_ids).to(input_ids.device)

        return {
            "flattened_patches": flattened_patches,
            "decoder_input_ids": input_ids,
            "decoder_attention_mask": decoder_attention_mask,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    def get_encoder(self) -> ORTEncoder:
        return self.encoder

    @staticmethod
    def _reorder_cache(past, beam_idx):
        return ORTModelForSeq2SeqLM._reorder_cache(past, beam_idx)