o
    di                     @   st  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ddlmZmZmZ ddlmZ ddlm Z! ddl"m#Z# ddl$m%Z%m&Z& ddlm'Z'm(Z(m)Z)m*Z*m+Z+m,Z, e rd dl-m.Z.m/Z/ d dl0m1Z1 e rd dl2m Z3 d dl2m4Z4 d dl5m6Z7 d dl8m9Z9 d dl:m;Z< e rd dl=m4Z4 d dl>m9Z9 d dl?m<Z< d dl@mAZAmBZB d dl@mCZ7 d dlDm ZE eeFZGdd ZHG dd deIeZJG d d! d!eKZLe#ddddddd"d"ddfd#ejMd$eNd%eNd&eeN d'eeN d(ee	 d)ee	 d*eeN d+eeN d,eOd-eOd.ee	eNef  d/eeI fd0d1ZPdS )2    N)Enum)	getLogger)AnyDictListOptionalTupleUnion)version)nn)tqdm)AutoTokenizer)Conv1D)QuantizationMethod   )is_accelerate_availableis_auto_gptq_availableis_gptqmodel_available)recurse_getattr)__version__   )GPTQ_CONFIG)get_datasetprepare_dataset)get_block_name_with_pattern
get_device
get_layersget_preceding_modules
get_seqlennested_move_to)cpu_offload_with_hookload_checkpoint_and_dispatch)remove_hook_from_module)exllama_set_max_input_length)autogptq_post_init)GPTQ)dynamically_import_QuantLinear)hf_select_quant_linear)hf_convert_gptq_v1_to_v2_formathf_convert_gptq_v2_to_v1_format)hf_gptqmodel_post_initc                   C   s   t j ptt dot j S )Nxpu)torchcudais_availablehasattrr+    r0   r0   d/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/optimum/gptq/quantizer.pyhas_device_more_than_cpuF   s   r2   c                   @   s   e Zd ZdZdZdS )ExllamaVersionr   r   N)__name__
__module____qualname__ONETWOr0   r0   r0   r1   r3   J   s    r3   c                ,   @   s  e Zd ZdZ																				dBd	ed
eeee ef  dede	de
de
de
de
dee dee deee  dedee de
deeeef  dee dee
 deeee   dedeeeef  dee f*ddZdCd eeef d!e
fd"d#Zd$d% Zed&eeef fd'd(Zd)ejfd*d+Zd,d- ZdDd/ejd0ee d1efd2d3Ze dEd)ejd4ee fd5d6Zd7d8 Zd)ejd9eeef fd:d;ZdFd)ejd=ed>ed?e
fd@dAZdS )GGPTQQuantizerz,
    A simple API for GPTQ Quantization
    N   皙?FTr   gptqbitsdataset
group_sizedamp_percentdesc_actsymtrue_sequentialuse_cuda_fp16model_seqlenblock_name_to_quantize!module_name_preceding_first_block
batch_sizepad_token_iddisable_exllamaexllama_configmax_input_lengthcache_block_outputsmodules_in_block_to_quantizecheckpoint_formatmetabackendc                 O   s`  || _ || _|| _|| _|| _|| _|| _| | _|| _	|dur%| nd| _
|| _|	| _|
| _|| _|| _|| _|| _|| _|| _tj| _|| _|| _g d| _| j dvr[td| jdkri| jdkritdd| j  k rwdk s|td	 td	| jdu rd
tji| _n d
| jvrtd| jd
 tjtjfvr| jd
 }td| | jd
 | _dS )a  
        Args:
            bits (`int`):
                The number of bits to quantize to, supported numbers are (2, 3, 4, 8).
            dataset (`Union[List[str], str, Any]`, defaults to `None`):
                The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data
                (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...])
                or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new'].
            group_size (int, defaults to 128):
                The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
            damp_percent (`float`, defaults to `0.1`):
                The percent of the average Hessian diagonal to use for dampening, recommended value is 0.1.
            desc_act (`bool`, defaults to `False`):
                Whether to quantize columns in order of decreasing activation size.
                Setting it to False can significantly speed up inference but the perplexity may become slightly worse.
                Also known as act-order.
            sym (`bool`, defaults to `True`):
                Whether to use symetric quantization.
            true_sequential (`bool`, defaults to `True`):
                Whether to perform sequential quantization even within a single Transformer block.
                Instead of quantizing the entire block at once, we perform layer-wise quantization.
                As a result, each layer undergoes quantization using inputs that have passed through the previously quantized layers.
            use_cuda_fp16 (`bool`, defaults to `False`):
                Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
            model_seqlen (`Optional[int]`, defaults to `None`):
                The maximum sequence length that the model can take.
            block_name_to_quantize (`Optional[str]`, defaults to `None`):
                The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. model.layers)
            module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
                The layers that are preceding the first Transformer block.
            batch_size (`int`, defaults to `1`):
                The batch size of the dataset
            pad_token_id (`Optional[int]`, defaults to `None`):
                The pad token id. Needed to prepare the dataset when `batch_size` > 1.
            disable_exllama (`bool`, defaults to `False`):
                Whether to use exllama backend. Only works with `bits` = 4.
            exllama_config (`Dict[str, Any]`, *optional*):
                The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
            max_input_length (`Optional[int]`, defaults to `None`):
                The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
                It is specific to the exllama backend with act-order.
            cache_block_outputs (`bool`, defaults to `True`):
                Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
                (e.g. ChatGLM) but can require more time.
            modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
                List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized.
                The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
                If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
            checkpoint_format (`str`, *optional*, defaults to `gptq`):
                GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only.
            meta (`Dict[str, any]`, *optional*):
                Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta.
                i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
            backend (`str`, *optional*):
                Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
        N)r=   r>   r?   r@   rA   rB   rC   quant_methodrN   rO   rP   )r            z(only support quantize to [2,3,4,8] bits.r   z0group_size must be greater than 0 or equal to -1r   z"damp_percent must between 0 and 1.r
   .`exllama_config` needs to have a `version` keyaOnly supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version )r=   r>   r?   r@   rA   rB   rC   lowerrO   rP   rQ   rD   rE   rF   rG   rH   rI   rJ   rK   rL   r   r%   rR   rM   rN   serialization_keys
ValueErrorr3   r8   r7   exllama_version)selfr=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   argskwargsr
   r0   r0   r1   __init__T   sR   S





zGPTQQuantizer.__init__
device_mappackc                 C   sp   t  rt| j| j| j| j| j| j|| j|d	| _	d S td| j| j| j| j
p*| jtjk| j
p2| jtjkd| _	d S )N)	r=   r?   rA   rB   rO   rP   ra   rQ   rb   F)
use_tritonrA   r?   r=   rJ   disable_exllamav2)r   r'   r=   r?   rA   rB   rO   rP   rQ   quant_linearrJ   r\   r3   r7   r8   )r]   ra   rb   r0   r0   r1   select_quant_linear   s(   z!GPTQQuantizer.select_quant_linearc                 C   s   i }| j D ]	}t| |||< q|ddu ri |d< |d }|ddu rIdt g|d< t r<|d dt  |S t rI|d dt  |S )z2
        Returns the args in dict format.
        rP   N	quantizerzoptimum:z
gptqmodel:z
auto_gptq:)	rZ   getattrgetoptimum_versionr   appendgptqmodel_versionr   autogptq_version)r]   	gptq_dictkeyrP   r0   r0   r1   to_dict   s   
zGPTQQuantizer.to_dictconfig_dictc                 C   s   | di |S )a  
        Instantiates a `GPTQQuantizer` using config_dict as kwargs

        Args:
            config_dict (`Dict[str,Any]`):
                quantization config

        Returns:
            `GPTQQuantizer`:  The quantizer object instantiated from those parameters.
        Nr0   r0   )clsrq   r0   r0   r1   	from_dict  s   zGPTQQuantizer.from_dictmodelc                    s   | j du r
t|| _ | j }t||d}| jdurBt| jg }t| D ] t fdd|D sAt	d  d| j d | = q$| j
|ddd	d
 | || |S )z
        Convert the model to a GPTQ model by getting and replacing the layers.

        Args:
            model (`nn.Module`):
                Model to be converted

        N)prefixc                 3   s    | ]}  |V  qd S N)endswith).0layernamer0   r1   	<genexpr>+      z.GPTQQuantizer.convert_model.<locals>.<genexpr>zQuantization disabled for z$ (only modules_in_block_to_quantize=z are quantized)ra   Fra   rb   )rF   r   r   rN   sumlistkeysanyloggerinforf   ri   _replace_by_quant_layers)r]   rt   r_   
block_namelayers_to_be_replacedlayers_to_keepr0   rz   r1   convert_model  s    
	

zGPTQQuantizer.convert_modelc                 C   s   t || jd jj}|g}|S )z
        Get the modules that should not be split across multiple devices.
        Args:
            model (`nn.Module`):
                The input model
        r   )r   rF   	__class__r4   )r]   rt   block_class_nameno_split_module_classesr0   r0   r1   get_no_split_module_classes7  s   z)GPTQQuantizer.get_no_split_module_classes modulenamesr{   c                 C   s  t || jrdS t|D ]}t||}|dkr|d | n|}||v rt|}t|| t |tjr9|j}|j	}	nt |tj
rF|j}|j}	nt |trW|jjd }|jjd }	|jdu}
t rs| j| j| j| j| j||	|
|jjd}n)| jr{| jdkr| j| j| j||	|
| j|jjd}n| j| j| j||	|
|jjd}||_t|||| q| D ]\}}| |||dkr|d | n| qdS )	a^  
        Replaces linear layers in `module` by `QuantLinear`

        Args:
            module (`nn.Module`):
                Module to quantize
            names (`List[str]`):
                List of names of the module to quantize
            name (`str`, defaults to `""`):
                To keep track of the name of the current module
        Nr   .r   r   )weight_dtyperV   )rD   r   )
isinstancere   dirrh   r   delattrr   Linearin_featuresout_featuresConv2din_channelsout_channelsr   weightshapebiasr   r=   r?   rA   rB   dtyperD   devicesetattrtonamed_childrenr   )r]   r   r   r{   attrry   name1r   r   r   r   	new_layerchildr0   r0   r1   r   C  sj   




$z&GPTQQuantizer._replace_by_quant_layers	tokenizerc           %         sJ	  t  s
t s
tdt rt  rtd t  o%ttjdtdkp(t }|s4t	j
 s4td| js>t s>td| jdkrJt sJtd|  t rY| jdkrYd| _d	}d	}t|d
rld}|jj}d	|j_t|drt|j d}dv rtddv st	dv rt|jdkrtd d}|j D ]\}}	|	dkrt||}
t|
dd t|
|d\}
}qnd	}t|dr|jt	jk| _| j du rt!dt"|| _ t#|}	t$| j%trt$| j%d t&s| j%}td nWt$t&rzt'(W n t)y   td dw | j%du rtdt$| j%t&r-t*| j%| j dd}nt$| j%tr?fdd| j%D }ntd t+| j% d!t,|| j-| j.d"}g g }g | j/du rdt0|| _/| j1du rqt2|| j/| _1t|| j/}t#|d  t s j+dkrd |s }| j1D ]}t||}
|
du rtd#| d$|
3|}
q|d 3||d<  fd%d&}| j4r|d j5|dd'}|D ](}| D ]\}}t6| ||< qz
|dEi | W q ty   Y qw |7  |s|d 3|	 | j1D ]}t||}
|
du rtd#| d$qt	j
8  tt	d(r2t	j9 r2t	j98  i }t:t;|d)| j/ d*d+D ]\}}td,| j/ d-|d  d.t|  | j4s|j5|dd'}|D ](}| D ]\}}t6| ||< qlz
|dEi | W qf ty   Y qfw |7  |rt#|t	dkrt< r|3d}t=|t#|}t s|j+dkrd}t$| j>trt| j>dkr| j?r| j>}nt@| j>g g}n| j?rd/d A D }ntA g}td0|  t;|d	d1d2D ]}fd3d4|D }i g }|D ]*}tB|| |< | jCjD| jE| jdd5 fd6d7}|F|| G|| qtHt|D ],}t6| ||< |  D ]\}}t6||| |< qS|| i |  qB|D ]}|7  qq|D ]K}td)| d8|d  d.t| d9 | jI| jJ| jK| jLd:} | d | d | d; }!}"}#| jC|!|"|#f|| j/ d!| d!| < | M  q{~q | j4rtHt|D ]}|| i | }$|F|$ q|s|3|	||< |g }ng t	j
8  tt	d(rt	j9 rt	j98  qA| jEd<krm|	j+d=ks1|rBtNfd>d?d@D rB| jOsAt sAtdA d| _On+| jLrZ| jOsZ| jPtQjRkrZtdB d| _On| jOsm| jPtQjSkrmtdC d| _O| jT||dD d|_UtVjB|_W|r||j_| X |j_Y| Z|}t	j
8  tt	d(rt	j9 rt	j98  |S )Fa  
        Quantizes the model using the dataset

        Args:
            model (`nn.Module`):
                The model to quantize
            tokenizer (Optional[`Any`], defaults to `None`):
                The tokenizer to use in order to prepare the dataset. You can pass either:
                    - A custom tokenizer object.
                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
        Returns:
            `nn.Module`: The quantized model
        zgptqmodel or auto-gptq is required in order to perform gptq quantzation: `pip install gptqmodel` or `pip install auto-gptq`. Please notice that auto-gptq will be deprecated in the future.zeDetected gptqmodel and auto-gptq, will use gptqmodel. The auto_gptq will be deprecated in the future.z	auto-gptqz0.4.2zmNo cuda gpu or cpu support using Intel/IPEX found. A gpu or cpu with Intel/IPEX is required for quantization.zpAsymmetric sym=False quantization is not supported with auto-gptq. Please use gptqmodel: `pip install gptqmodel`gptq_v2z_gptq_v2 format only supported with gptqmodel. Please install gptqmodel: `pip install gptqmodel`FconfigThf_device_mapdiskz4disk offload is not supported with GPTQ quantizationcpur   zJCpu offload is not recommended. There might be some issues with the memoryN)recurse)prev_module_hookr   i  r   zMGPTQQuantizer dataset appears to be already tokenized. Skipping tokenization.zWe were not able to get the tokenizer using `AutoTokenizer.from_pretrained`
                        with the string that you have passed z. If you have a custom tokenizer, you can pass it as input.
                        For now, we only support quantization for text model. Support for vision, speech and multimodel will come later.z:You need to pass `dataset` in order to quantize your modeltrain)seqlensplitc                    s   g | ]} |d dqS )pt)return_tensorsr0   )rx   data)r   r0   r1   
<listcomp>  s    z0GPTQQuantizer.quantize_model.<locals>.<listcomp>z^You need to pass a list of string, a list of tokenized data or a string for `dataset`. Found: r   )rI   rH   zModule z was not found in modelc                    sv   |d }|d u rd|v rt |d  f}ntd| i }| D ]\}}|dvr3t | ||< q$| t)Nr   hidden_statesz'No input value found in the foward pass)r   )r   r[   rk   items)_inputr^   r_   other_kwargskv)cur_layer_devicelayer_input_kwargslayer_inputsr0   r1   store_input_hook  s   

z6GPTQQuantizer.quantize_model.<locals>.store_input_hook)with_kwargsr+   zQuantizing z blocks )desczStart quantizing block  /c                 S   s   g | ]}|gqS r0   r0   )rx   ro   r0   r0   r1   r   ]  s    zModule to quantize z"Quantizing layers inside the block)leaver   c                       i | ]}| | qS r0   r0   )rx   r{   layersr0   r1   
<dictcomp>b      z0GPTQQuantizer.quantize_model.<locals>.<dictcomp>)r=   rB   
perchannelc                    s    fdd}|S )Nc                    s      |d j|j d S )Nr   )	add_batchr   )r   r   output)r<   r{   r0   r1   tmpk  s   z<GPTQQuantizer.quantize_model.<locals>.add_batch.<locals>.tmpr0   )r{   r   )r<   rz   r1   r   j  s   z/GPTQQuantizer.quantize_model.<locals>.add_batchz
 in block z...)percdampr?   actorderr   rT   r-   c                 3   s    | ]}| v V  qd S rv   r0   rx   d)devicesr0   r1   r|     s    z/GPTQQuantizer.quantize_model.<locals>.<genexpr>r   r   hpuFound modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`zUsing Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights.Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. zUsing Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights.Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. )rt   
quantizersr0   )[r   r   RuntimeErrorr   warningr
   parse	importlibmetadatar,   r-   r.   rB   r[   rO   evalr/   r   	use_cacher   r   valuesr   lenr   r   r   r"   r    r   float16rD   rE   minr   r   r   r>   strr   from_pretrained	Exceptionr   typer   rI   rH   rF   r   rG   r   r   rM   register_forward_pre_hookr   removeempty_cacher+   	enumerater   r2   r   rN   rC   r   r   r%   rg   	configurer=   rk   register_forward_hookrangefasterquantr@   r?   rA   freer   rJ   r\   r3   r7   r8   
pack_modelis_quantizedr   quantization_methodrp   quantization_configpost_init_model)%r]   rt   r   gptq_supports_cpu
has_confighas_device_mapr   hookr{   r   r   r>   layer_outputsblocks	to_devicemodule_namer   handler   r   r   r   iblockblock_devicelayers_name_listsubset_name_listsubset_layershandlesr   jhquant_outputsscalezerog_idxlayer_outputr0   )r   r   r<   r   r   r   r   r1   quantize_model  s  














$&"
&

*


zGPTQQuantizer.quantize_modelc                    s   | j dkr*| js*t jdkst dr*t fdddD r*| js*td d| _G d	d
 d
t}t	 rCt
 | j | j| j| j\ }|  _| j j_t | jd | jrj| jsj| jtjkrj| jdurjt | j  S )z
        Post-initialization that require device information, for example buffers initialization on device.

        Args:
            model (`nn.Module`):
                The input model
        rT   r-   r   c                 3   s    | ]}| j v V  qd S rv   )r   r   rt   r0   r1   r|     r}   z0GPTQQuantizer.post_init_model.<locals>.<genexpr>r   r   Tc                   @   s   e Zd ZdS )z0GPTQQuantizer.post_init_model.<locals>.StoreAttrN)r4   r5   r6   r0   r0   r0   r1   	StoreAttr  s    r  )use_act_orderN)r=   rJ   r   r   r/   r   r   r   objectr   r(   re   rO   rP   quantize_configrA   gptq_post_initr\   r3   r7   rL   r#   )r]   rt   r  r   r0   r  r1   r     s6   

zGPTQQuantizer.post_init_modelr   c           	         s   t d t|  fdd|D  | j|jdd | || t|| jg}|D ]J}t | || \||< }}}|| j}|| d  | d|d|d|df\ |< }}}|| 	 | ||| || | q)t d dS )	a  
        Pack the model by replacing the layers by quantized layers

        Args:
            model (`nn.Module`):
                The model to pack
            quantizers (`Dict[str,Tuple]`):
                A mapping of the layer name and the data needed to pack the layer
        zPacking model...c                    r   r0   r0   )rx   nr   r0   r1   r     r   z,GPTQQuantizer.pack_model.<locals>.<dictcomp>Tr~   r   zModel packed.N)
r   r   r   rf   r   r   re   r   r   rb   )	r]   rt   r   qlayersr{   r	  r
  r  layer_devicer0   r   r1   r     s   


4zGPTQQuantizer.pack_model10GBsave_dirmax_shard_sizesafe_serializationc                 C   s   t  rt|| j| j| j| j| j\}}|rd| _tj|dd |j	|||d t
tj|tddd}tj|  |dd	 W d
   d
S 1 sIw   Y  d
S )a  
        Save model state dict and configs

        Args:
            model (`nn.Module`):
                Model to be saved. The model can be wrapped or unwraped.
            save_dir (`str`):
                Directory to which to save. Will be created if it doesn't exist.
            max_shard_size (`str`, defaults to `"10GB"`):
                The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
                <Tip warning={true}>

                If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
                which will be bigger than `max_shard_size`.

                </Tip>
            safe_serialization (`bool`, defaults to `True`):
                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).

        r<   T)exist_ok)r  r  wutf-8encodingr   )indentN)r   r)   rB   r=   re   rO   rP   osmakedirssave_pretrainedopenpathjoinr   jsondumprp   )r]   rt   r  r  r  	convertedfr0   r0   r1   save  s   "zGPTQQuantizer.save)Nr:   r;   FTTFNNNr   NFNNTNr<   NN)F)r   rv   )r  T) r4   r5   r6   __doc__intr   r	   r   r   floatboolr   r   r   r`   dictrf   rp   classmethodrs   r   Moduler   r   r   r,   no_gradr  r   r   r   r+  r0   r0   r0   r1   r9   O   s    	


 B  B%

""r9   Frt   save_folderquant_config_namestate_dict_namera   
max_memoryr   offload_folderoffload_buffersoffload_state_dictrJ   rK   rL   c              
   C   s  t j st stdt st stdt std|du r-dt j i}t	d |du r7dt
ji}nd|vr?td|d t
jt
jfvrT|d }td	| z5t| d
rgt| jdrg| jj }n!ttj||ddd}t|}W d   n1 sw   Y  W n ty } ztd| d| d|d}~ww t|}|
|_||_|jd |_||_|j| |d} |du r| | }t!| |durtj||n|||||||	d} |"| } d| _#t$j%| _&| '  | S )a
  
    Load quantized weights from the save_folder into the converted model and dispatch the weights according to the device_map.

    Args:
        model (`nn.Module`):
            The model can be enpty or not.
        save_folder (`str`):
            Directory to which to load the weights.
        quant_config_name (`str`, defaults to `GPTQ_CONFIG`):
            Name of the quantization config file
        state_dict_name (`Optional[str]`, defaults to `None`):
            Name of the state dict file
        device_map (`Optional[str]`, defaults to `None`):
            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
            name, once a given module name is inside, every submodule of it will be sent to the same device.
            To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`.
        max_memory (`Optional[Dict]`, defaults to `None`):
            A dictionary device identifier to maximum memory. Will default to the maximum memory available for each GPU
            and the available CPU RAM if unset.
        no_split_module_classes (`Optional[Dict]`, defaults to `None`):
            A list of layer class names that should never be split across device (for instance any layer that has a
            residual connection).
        offload_folder (`Optional[str]`, defaults to `None`):
            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
        offload_buffers (`Optional[str]`, defaults to `None`):
            In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
            well as the parameters.
        offload_state_dict (`bool`, defaults to `False`):
            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
            the weight of the CPU state dict + the biggest shard does not fit. Will default to `True` if the device map
            picked contains `"disk"` values.
        disable_exllama (`Optional[bool]`, defaults to `None`):
            Whether to use exllama backend. Only works with `bits` = 4.
        exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`):
            The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset.
        max_input_length (`Optional[int]`, defaults to `None`):
            The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length.
            It is specific to the exllama backend with act-order.

    Returns:
        `nn.Module`: The quantized model
    zBNo GPU found. A GPU is needed to run quantized model by auto_gptq.zgptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`) is required in order to load quantized weights. Please notice that auto-gptq will be deprecated in the future.zYou need to install accelerate in order to load and dispatch weights toa quantized model. You can do it with `pip install accelerate`Nr   z\The device_map was not initialized.Setting device_map to `{'':torch.cuda.current_device()}`.r
   rW   rX   r   r   rr  r  z(Failed to load quantization config from z (lookup for traceback): z
Tip: If the save directory is saved from a transformers.PreTrainedModel, make sure that `config.json` contains a 'quantization_config' key.)ra   )
checkpointra   r7  r   r8  r9  r:  T)(r,   r-   r.   r   r   r   r   current_devicer   r   r3   r8   r[   r7   r/   r   r   rp   r$  r!  r%  r&  r'  loadr   r9   rs   rJ   rK   r\   rL   r   r   r!   r   r   r   r%   r   r   )rt   r4  r5  r6  ra   r7  r   r8  r9  r:  rJ   rK   rL   r
   quantize_config_dictr*  errrg   r0   r0   r1   load_quantized_model2  sx   9



rA  )Qr   r'  r!  enumr   loggingr   typingr   r   r   r   r   r	   r,   	packagingr
   r   	tqdm.autor   transformersr   transformers.pytorch_utilsr   &transformers.utils.quantization_configr   utilsr   r   r   utils.modeling_utilsr   r   rj   	constantsr   r   r   r   r   r   r   r   r   r   
accelerater    r!   accelerate.hooksr"   	auto_gptqrm   r#   auto_gptq.modeling._utilsr$   r  auto_gptq.quantizationr%   auto_gptq.utils.import_utilsr&   r'   	gptqmodelgptqmodel.quantizationgptqmodel.utils.importergptqmodel.utils.modelr(   r)   r*   gptqmodel.versionrl   r4   r   r2   r-  r3   r  r9   r2  r   r/  rA  r0   r0   r0   r1   <module>   s     
     k	
