o
    dii[                     @   sj  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlZddlmZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 e	rddl4m5Z5 ddlm6Z6 e7e8Z9G dd deZ:G dd de%Z;dS )z0Classes handling quantization with ONNX Runtime.    N)defaultdict)Path)TYPE_CHECKINGCallableDictListOptionalTupleUnion)Versionparse)
AutoConfig)__version__)CalibrationDataReaderQuantFormatQuantizationMode	QuantType)ONNXQuantizer)QDQQuantizer)requires_backends   )OptimumQuantizer)maybe_save_preprocessors   )ORTQuantizableOperator)CalibrationConfig	ORTConfigQuantizationConfig)ORTModel) ORTModelForConditionalGeneration)QuantizationPreprocessor)Dataset)PretrainedConfigc                   @   s0   e Zd Zg dZddddefddZdd	 Zd
S )ORTCalibrationDataReader)
batch_sizedataset_dataset_iterr   r%   r!   r$   c                 C   sD   |d u rt d|dkrt d| d|| _|| _t| j| _d S )NzProvided dataset is None.r   z)Provided batch_size should be >= 1 (got: ).)
ValueErrorr%   r$   iterr&   )selfr%   r$    r+   n/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/optimum/onnxruntime/quantization.py__init__4   s   z!ORTCalibrationDataReader.__init__c                 C   s   d }z7| j dkrdd t| j D }n#tt}t| j D ]}t| j}| D ]\}}||  |g7  < q)qW n	 tyB   Y nw |d urOt|dkrO|S d S )Nr   c                 S   s   i | ]\}}||gqS r+   r+   ).0keyvaluer+   r+   r,   
<dictcomp>D   s    z5ORTCalibrationDataReader.get_next.<locals>.<dictcomp>r   )	r$   nextr&   itemsr   listrangeStopIterationlen)r*   featurized_samples_samplenamer0   r+   r+   r,   get_next@   s"   

z!ORTCalibrationDataReader.get_nextN)r   )__name__
__module____qualname__	__slots__intr-   r<   r+   r+   r+   r,   r#   1   s    r#   c                       s  e Zd ZdZd8deded f fddZe	d8ded	e	ef d
ee	 dd fddZ
						d9dddedee	ef deee	  dededededee	eeef f fddZ						d9dddedee	ef deee	  dedededefddZdee	eeef f fddZ	 			d:d!ed"ee	ef d#ee	 d$eee	eeef f  ded%ee defd&d'Z	(				)	*		d;d+e	d,ed-ee	 d.ee	 d/ee d0ed1ed2eeee	f  d3eeee	f  ddfd4d5Zd<d6d7Z  ZS )=ORTQuantizerzc
    Handles the ONNX Runtime quantization process for models shared on huggingface.co/models.
    Nonnx_model_pathconfigr"   c              	      sj   t    || _|| _| jdu r0z
t| jj| _W n ttfy/   t	
d| j d Y nw d| _dS )z
        Args:
            onnx_model_path (`Path`):
                Path to the onnx model files you want to quantize.
            config (`Optional[PretrainedConfig]`, defaults to `None`):
                The configuration of the model.
        NzCould not load the config for z automatically, this might make the quantized model harder to use because it will not be able to be loaded by an ORTModel without having to specify the configuration explicitly.)superr-   rC   rD   r   from_pretrainedparentOSErrorr(   LOGGERwarning_calibrator)r*   rC   rD   	__class__r+   r,   r-   Z   s   


zORTQuantizer.__init__model_or_pathr   	file_namereturnc                 C   s   d}t |trt|}d}d}t |trt|t |trJ|du rJt|d}t|dkr5td| t|dkrEt	d| d| |d j
}t |trYt|jj}|j}ntj|rft|| }ntd	| d
| ||dS )a  
        Instantiates a `ORTQuantizer` from an ONNX model file or an `ORTModel`.

        Args:
            model_or_path (`Union[ORTModel, str, Path]`):
                Can be either:
                    - A path to a saved exported ONNX Intermediate Representation (IR) model, e.g., `./my_model_directory/.
                    - Or an `ORTModelForXX` class, e.g., `ORTModelForQuestionAnswering`.
            file_name(`Optional[str]`, defaults to `None`):
                Overwrites the default model file name from `"model.onnx"` to `file_name`.
                This allows you to load different model files from the same repository or directory.
        Returns:
            An instance of `ORTQuantizer`.
        zORTQuantizer does not support multi-file quantization. Please create separate ORTQuantizer instances for each model/file, by passing the argument `file_name` to ORTQuantizer.from_pretrained().Nz*.onnxr   z&Could not find any ONNX model file in r   z#Found too many ONNX model files in z. zUnable to load model from .)rD   )
isinstancestrr   r   NotImplementedErrorr4   globr7   FileNotFoundErrorRuntimeErrorr;   r   model_model_pathrD   ospathisdirr(   )clsrN   rO   ort_quantizer_error_messager[   rD   
onnx_filesr+   r+   r,   rF   p   s.   



zORTQuantizer.from_pretrainedaugmented_model.onnxr   Fr%   r!   calibration_configonnx_augmented_model_nameoperators_to_quantizer$   use_external_data_formatuse_gpuforce_symmetric_rangec	           	   
   C   s<   t d|j d|j d | |||||||| |  S )aC  
        Performs the calibration step and computes the quantization ranges.

        Args:
            dataset (`Dataset`):
                The dataset to use when performing the calibration step.
            calibration_config ([`~CalibrationConfig`]):
                The configuration containing the parameters related to the calibration step.
            onnx_augmented_model_name (`Union[str, Path]`, defaults to `"augmented_model.onnx"`):
                The path used to save the augmented model used to collect the quantization ranges.
            operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
                List of the operators types to quantize.
            batch_size (`int`, defaults to 1):
                The batch size to use when collecting the quantization ranges values.
            use_external_data_format (`bool`, defaults to `False`):
                Whether to use external data format to store model which size is >= 2Gb.
            use_gpu (`bool`, defaults to `False`):
                Whether to use the GPU when collecting the quantization ranges values.
            force_symmetric_range (`bool`, defaults to `False`):
                Whether to make the quantization ranges symmetric.

        Returns:
            The dictionary mapping the nodes name to their quantization ranges.
        z+Using static quantization schema (dataset: z
, method: ))rI   infodataset_namemethodpartial_fitcompute_ranges)	r*   r%   ra   rb   rc   r$   rd   re   rf   r+   r+   r,   fit   s$   $
zORTQuantizer.fitc	           
      C   sx   |j dur td|j  d| d |j| j ||||d| _|r*| jjdgd td t||}	| j	|	 dS )	a  
        Performs the calibration step and collects the quantization ranges without computing them.

        Args:
            dataset (`Dataset`):
                The dataset to use when performing the calibration step.
            calibration_config (`CalibrationConfig`):
                The configuration containing the parameters related to the calibration step.
            onnx_augmented_model_name (`Union[str, Path]`, defaults to `"augmented_model.onnx"`):
                The path used to save the augmented model used to collect the quantization ranges.
            operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
                List of the operators types to quantize.
            batch_size (`int`, defaults to 1):
                The batch size to use when collecting the quantization ranges values.
            use_external_data_format (`bool`, defaults to `False`):
                Whether uto se external data format to store model which size is >= 2Gb.
            use_gpu (`bool`, defaults to `False`):
                Whether to use the GPU when collecting the quantization ranges values.
            force_symmetric_range (`bool`, defaults to `False`):
                Whether to make the quantization ranges symmetric.
        NzCreating calibrator: (rg   )rC   rd   augmented_model_namerc   rf   CUDAExecutionProvider)execution_providersz Collecting tensors statistics...)
rj   rI   rh   create_calibratorrC   as_posixrK   set_execution_providersr#   collect_data)
r*   r%   ra   rb   rc   r$   rd   re   rf   readerr+   r+   r,   rk      s   
!

zORTQuantizer.partial_fitc                 C   s@   | j du r	tdtd tttdkr| j  S | j  S )z
        Computes the quantization ranges.

        Returns:
            The dictionary mapping the nodes name to their quantization ranges.
        Nz^Calibrator is None, please call `partial_fit` or `fit` method at least ones to compute ranges.zComputing calibration ranges1.16.0)	rK   r(   rI   rh   r   ort_versionr   compute_datacompute_range)r*   r+   r+   r,   rl     s   



zORTQuantizer.compute_ranges	quantizedquantization_configsave_dirfile_suffixcalibration_tensors_rangepreprocessorc                 C   s  |j o|jtjk}t|}|jddd |j r|du rtd|j sB|jtj	kr2t
d|j d |jtjkrBt
d|j d t
d|j rJd	nd
 d|  |duryt
d || j\}}	||j |	|j t||_t|	|_d}
tt| j }|jjD ]}|jdv rd}
 nq|
r|j rtdtttdkrtd|rt nt!}||j |j"|j|j#|j||j$|j|jdd |j%D |j&|j'|
|j'o|j&|j(|j)|j*dd}|r|+d tttdkr|+d	 tttdkr|+d|d< |d#i |}t
d |,  |rd| nd}|-| jj. | /d}t
d| d | d! |j01| | t2||d"}|3| | j4durV| j43| t5| jj6| t|S )$a  
        Quantizes a model given the optimization specifications defined in `quantization_config`.

        Args:
            quantization_config (`QuantizationConfig`):
                The configuration containing the parameters related to quantization.
            save_dir (`Union[str, Path]`):
                The directory where the quantized model should be saved.
            file_suffix (`Optional[str]`, defaults to `"quantized"`):
                The file_suffix used to save the quantized model.
            calibration_tensors_range (`Optional[Dict[str, Tuple[float, float]]]`, defaults to `None`):
                The dictionary mapping the nodes name to their quantization ranges, used and required only when applying static quantization.
            use_external_data_format (`bool`, defaults to `False`):
                Whether to use external data format to store model which size is >= 2Gb.
            preprocessor (`Optional[QuantizationPreprocessor]`, defaults to `None`):
                The preprocessor to use to collect the nodes to include or exclude from quantization.

        Returns:
            The path of the resulting quantized model.
        T)parentsexist_okNzRequested static quantization in the QuantizationConfig, but no calibration ranges were provided. Please run calibration first using the quantizer fit method, or use dynamic quantization.zSONNX Runtime dynamic quantization mode should be QuantizationMode.IntegerOps (got: r'   zYONNX Runtime dynamic quantization activations data type should be QuantType.QUInt8 (got: z	Creating staticdynamicz quantizer: z:Preprocessor detected, collecting nodes to include/excludeF)IfLoopScanSequenceMapzIStatic quantization is currently not supported for models with subgraphs.rw   zONNX Runtime version v1.16.0 is not compatible with quantization for models with subgraphs, please downgrade to 1.15.1 or upgrade to a higher version. Reference: https://github.com/microsoft/onnxruntime/pull/17651c                 S   s    g | ]}t |tr|jn|qS r+   )rR   r   r0   )r.   operatorr+   r+   r,   
<listcomp>w  s    z)ORTQuantizer.quantize.<locals>.<listcomp>)WeightSymmetricActivationSymmetricEnableSubgraphForceSymmetricAddQDQPairToWeightDedicatedQDQPair QDQOpTypePerChannelSupportToAxis)rX   r   per_channelmodeweight_qTypeinput_qTypetensors_rangereduce_rangenodes_to_quantizenodes_to_excludeop_types_to_quantizeextra_optionsr   z1.18.0z1.13.0r   activation_qTypezQuantizing model...r9    z.onnxzSaving quantized model at: z (external data format: rg   )quantizationrd   r+   )7	is_staticformatr   QDQr   mkdirr(   r   r   
IntegerOpsrI   rJ   activations_dtyper   QUInt8rh   collectrC   updater   r   r4   onnxloadrs   graphnodeop_typerT   r   rx   r   r   r   r   weights_dtyper   rc   weights_symmetricactivations_symmetricqdq_add_pair_to_weightqdq_dedicated_pair'qdq_op_type_per_channel_support_to_axispopquantize_modeljoinpathstemwith_suffixrX   save_model_to_filer   save_pretrainedrD   r   rG   )r*   r|   r}   r~   r   rd   r   use_qdqr   r   has_subgraphs
onnx_modelr   quantizer_factoryquantizer_kwargs	quantizersuffixquantized_model_path
ort_configr+   r+   r,   quantize  s   








zORTQuantizer.quantized   T  ri   num_samplesdataset_config_namedataset_splitpreprocess_functionpreprocess_batchseeduse_auth_tokentokenc
                 C   s   |durt dt |	durtd|}	|du rtdt| dg ddlm}
 |
||||	d}|durFt|t|}|j	|d	
t|}|durR|j||d
}n|}| |S )a  
        Creates the calibration `datasets.Dataset` to use for the post-training static quantization calibration step.

        Args:
            dataset_name (`str`):
                The dataset repository name on the Hugging Face Hub or path to a local directory containing data files
                to load to use for the calibration step.
            num_samples (`int`, defaults to 100):
                The maximum number of samples composing the calibration dataset.
            dataset_config_name (`Optional[str]`, defaults to `None`):
                The name of the dataset configuration.
            dataset_split (`Optional[str]`, defaults to `None`):
                Which split of the dataset to use to perform the calibration step.
            preprocess_function (`Optional[Callable]`, defaults to `None`):
                Processing function to apply to each example after loading dataset.
            preprocess_batch (`bool`, defaults to `True`):
                Whether the `preprocess_function` should be batched.
            seed (`int`, defaults to 2016):
                The random seed to use when shuffling the calibration dataset.
            use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`):
                Deprecated. Please use the `token` argument instead.
            token (`Optional[Union[bool,str]]`, defaults to `None`):
                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
                when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`).

        Returns:
            The calibration `datasets.Dataset` to use for the post-training static quantization calibration
            step.
        NznThe `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.zLYou cannot use both `use_auth_token` and `token` arguments at the same time.zkORTQuantizer: Static quantization calibration step requires a dataset_name if no calib_dataset is provided.datasetsr   )load_dataset)r;   splitr   )r   )batched)warningswarnFutureWarningr(   r   r   r   minr7   shuffleselectr5   mapclean_calibration_dataset)r*   ri   r   r   r   r   r   r   r   r   r   calib_datasetprocessed_calib_datasetr+   r+   r,   get_calibration_dataset  s6   *
z$ORTQuantizer.get_calibration_datasetc                 C   s:   t | j}dd |jjD }tt|j| }||S )Nc                 S   s   h | ]}|j qS r+   )r;   )r.   inputr+   r+   r,   	<setcomp>  s    z9ORTQuantizer.clean_calibration_dataset.<locals>.<setcomp>)	r   r   rC   r   r   r4   setcolumn_namesremove_columns)r*   r%   rX   model_inputsignored_columnsr+   r+   r,   r     s   
z&ORTQuantizer.clean_calibration_dataset)N)r`   Nr   FFF)r{   NFN)r   NNNTr   NN)r%   r!   rP   r!   )r=   r>   r?   __doc__r   r   r-   classmethodr
   rS   rF   r   r   rA   boolr   r	   floatrm   rk   rl   r   r    r   r   r   r   __classcell__r+   r+   rL   r,   rB   U   s    3

	

:

	
2

 	

OrB   )<r   loggingrZ   r   collectionsr   pathlibr   typingr   r   r   r   r   r	   r
   r   packaging.versionr   r   transformersr   onnxruntimer   rx   onnxruntime.quantizationr   r   r   r   'onnxruntime.quantization.onnx_quantizerr   &onnxruntime.quantization.qdq_quantizerr   optimum.utils.import_utilsr   quantization_baser   utils.save_utilsr   r   r   configurationr   r   r   modeling_ortr   modeling_seq2seqr   preprocessorsr    r   r!   r"   	getLoggerr=   rI   r#   rB   r+   r+   r+   r,   <module>   s8   $
$