
    #j/                        d dl Z d dlZd dlmZ d dlmZ d dlZej        j        j	        ej        j        j
        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        dZ	 d3dej        ded	ed
ej        dz  dej        f
dZ		 d3dej        deded
ej        dz  dej        f
dZ
dej        dedej        fdZdej        dej        fdZdej        dej        fdZdej        dej        fdZd4dej        dedej        fdZd5dej        ded
ej        dz  dej        fdZd5dej        ded
ej        dz  dej        fdZ	 	 	 	 d6dej        dededed
ej        dz  dej        fdZ	 	 	 	 d6dej        dededed
ej        dz  dej        fd Z	 	 	 	 	 d7dej        dededed	ed
ej        dz  dej        fd#Z	 	 d8dej        ded
ej        dz  dej        fd$Z	 d9dej        d&eded
ej        dz  dej        f
d'Zdej        d(ej        dej        fd)Zd:d+Zd, Zd- Z d.Z!ed/             Z"ed0             Z#ed1             Z$ed2             Z%dS );    N)defaultdict)contextmanager)uniform_normal_	constant_ones_zeros_eye_dirac_xavier_uniform_xavier_normal_kaiming_uniform_kaiming_normal_trunc_normal_orthogonal_sparse_              ?tensorab	generatorreturnc                 Z    t          | dd          st          d         | |||          S | S )N_is_hf_initializedFr   )r   r   r   getattrTORCH_INIT_FUNCTIONS)r   r   r   r   s       e/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/transformers/initialization.pyr   r   *   s;     6/77 W#J/!qIVVVVM    meanstdc                 Z    t          | dd          st          d         | |||          S | S )Nr   Fr   )r!   r"   r   r   )r   r!   r"   r   s       r   r   r   2   s<     6/77 `#I.vDcU^____Mr    valc                 V    t          | dd          st          d         | |          S | S )Nr   Fr   )r$   r   )r   r$   s     r   r   r   :   s5    6/77 B#K0SAAAAMr    c                 R    t          | dd          st          d         |           S | S )Nr   Fr   r   r   s    r   r   r   @   s/    6/77 5#G,V444Mr    c                 R    t          | dd          st          d         |           S | S )Nr   Fr	   r   r'   s    r   r	   r	   F   s/    6/77 6#H-f555Mr    c                 R    t          | dd          st          d         |           S | S )Nr   Fr
   r   r'   s    r   r
   r
   L   s/    6/77 4#F+F333Mr       groupsc                 V    t          | dd          st          d         | |          S | S )Nr   Fr   )r+   r   )r   r+   s     r   r   r   R   s5    6/77 E#H-fVDDDDMr    gainc                 X    t          | dd          st          d         | ||          S | S )Nr   Fr   r-   r   r   r   r-   r   s      r   r   r   X   s9    6/77 _#$56vDT]^^^^Mr    c                 X    t          | dd          st          d         | ||          S | S )Nr   Fr   r/   r   r0   s      r   r   r   ^   s9    6/77 ^#$45f4S\]]]]Mr    fan_in
leaky_relumodenonlinearityc                 \    t          | dd          st          d         | ||||          S | S )Nr   Fr   r   r4   r5   r   r   r   r   r4   r5   r   s        r   r   r   d   sH     6/77 
#$67ad
 
 
 	
 Mr    c                 \    t          | dd          st          d         | ||||          S | S )Nr   Fr   r7   r   r8   s        r   r   r   r   sH     6/77 
#$56ad
 
 
 	
 Mr                  @c                 ^    t          | dd          st          d         | |||||          S | S )Nr   Fr   )r!   r"   r   r   r   r   )r   r!   r"   r   r   r   s         r   r   r      sB     6/77 p#O4V$CSTXYenooooMr    c                 X    t          | dd          st          d         | ||          S | S )Nr   Fr   r/   r   r0   s      r   r   r      s:    
 6/77 [#M26PYZZZZMr    {Gz?sparsityc                 Z    t          | dd          st          d         | |||          S | S )Nr   Fr   )r?   r"   r   r   )r   r?   r"   r   s       r   r   r      s<     6/77 h#I.vc]fggggMr    otherc                     t          | dd          s@t          j                    5  |                     |          cd d d            S # 1 swxY w Y   | S )Nr   F)r   torchno_gradcopy_)r   rA   s     r   rE   rE      s    6/77 ']__ 	' 	'<<&&	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	'Ms   AAAnormalc                    t           j        j                            |           \  }}|dk    r|}n|dk    r|}n|dk    r||z   dz  }d|z  }|dk    r(t	          | t          j        |          dz             d S |d	k    r%t          | t          j        |                     d S |d
k    r+t          j        d|z            }t          | | |           d S t          d|           )Nr2   fan_outfan_avg   r   truncated_normalg۶%?)r"   rF   uniform   zinvalid distribution )
rC   nninit_calculate_fan_in_and_fan_outr   mathsqrtr   r   
ValueError)r   r4   distributionr2   rH   denomvariancebounds           r   _variance_scalingrX      s   hmAA&IIOFGx						'!Q&U{H)))f$)H"5"58K"KLLLLLL		!	!DIh//000000		"	"	!h,''%'''''???@@@r    c                 L    t          | dd          st          | dd           | S )Nr   Fr2   rK   r4   rT   r   rX   r'   s    r   lecun_normal_r\      s3    6/77 R&x>PQQQQMr    c                 L    t          | dd          st          | dd           | S )Nr   Fr2   rF   rZ   r[   r'   s    r   default_flax_embed_init_r^      s2    6/77 H&xhGGGGMr    )
ztorch.nn.initztorch.nn.modules.activationztorch.nn.modules.transformerztorch.nn.modules.linearztorch.nn.modules.lossztorch.nn.modules.batchnormztorch.nn.modules.convztorch.nn.modules.normalizationztorch.nn.modules.rnnztorch.nn.modules.sparsec            	   #   ~  K   t          t                    } 	 t          D ]}|t          j        v rzt          j        |         }t
                                          D ]N}t          ||          r<t          ||          | |         |<   t          ||t                      |                    OdV  |                                 D ]0\  }}|                                D ]\  }}t          |||           1dS # |                                 D ]0\  }}|                                D ]\  }}t          |||           1w xY w)a  
    Guard the `torch.nn.init` primitive functions to behave exactly like the functions in this file, i.e. be
    protected against the `_is_hf_initialized` flag to avoid re-init if the param was already loaded.

    Usually, all models are using the init from `transformers` which are already guarded, but just to make extra sure
    and for remote code, we also use this context manager.
    N)r   dictTORCH_MODULES_TO_PATCHsysmodulesr   keyshasattrr   setattrglobalsitems)	originalsmodule_namemodule	func_name	functionsfuncs         r   guard_torch_init_functionsro      sq      D!!I11 	I 	IKck))[1!5!:!:!<!< I IIvy11 I7>vy7Q7Q	&))4	799Y3GHHH "+!2!2 	1 	1FI#,??#4#4 1 1	4	400001	1 	1!2!2 	1 	1FI#,??#4#4 1 1	4	400001	1s   BC5 5AD<c            	   #     K   ddl m}  d }t          t                    }	 t          D ]x}|t
          j        v rht
          j        |         }t                                          D ]<}t          ||          r*t          ||          ||         |<   t          |||           =y| j        }|| _        dV  |                                D ]0\  }}|                                D ]\  }}t          |||           1|| _        dS # |                                D ]0\  }}|                                D ]\  }}t          |||           1|| _        w xY w)ac  
    Disable weight initialization both at the torch-level, and at the transformers-level (`init_weights`).
    This is used to speed-up initializing an empty model with deepspeed, as we do not initialize the model on meta device
    with deepspeed, but we still don't need to run expensive weight initializations as we are loading params afterwards.
    r*   PreTrainedModelc                      d S N argskwargss     r   
empty_funcz#no_init_weights.<locals>.empty_func       r    N)modeling_utilsrr   r   r`   ra   rb   rc   r   rd   re   r   rf   init_weightsrh   )	rr   ry   ri   rj   rk   rl   original_init_weightsrm   rn   s	            r   no_init_weightsr~      s      0/////   D!!I=1 	? 	?Kck))[1!5!:!:!<!< ? ?Ivy11 ?7>vy7Q7Q	&))4	:>>> !0 <'1$ "+!2!2 	1 	1FI#,??#4#4 1 1	4	400001 (=$$$	 "+!2!2 	1 	1FI#,??#4#4 1 1	4	400001 (=$<<<<s   BD AEc               #   h   K   ddl m}  d }	 | j        }|| _        dV  || _        dS # || _        w xY w)a  
    Disable weight tying during loading with `from_pretrained`. This is needed as we want to have access to ALL
    weights in the state_dict during `from_pretrained`, and otherwise tying them would remove them from it, as it's
    called in `post_init` when instantiating.
    r*   rq   c                      d S rt   ru   rv   s     r   ry   z"no_tie_weights.<locals>.empty_func$  rz   r    N)r{   rr   tie_weights)rr   ry   original_tie_weightss      r   no_tie_weightsr     sm       0/////  ;.:&0# ';###&:#::::s   ( 	1c               #      K   t           j        fd} | t           _        	 dV  t           _        dS # t           _        w xY w)a  
    During meta-device model initialisation, ``torch.linspace`` produces meta
    tensors that have no data.  Custom models loaded from the Hub (remote code)
    often call ``.item()`` on these tensors to compute scalar hyperparameters
    (e.g. stochastic-depth / drop-path schedules).  Native transformers models
    already pass ``device="cpu"`` explicitly for such calls (see e.g.
    ``modeling_swin.py``, ``modeling_pvt_v2.py``), but remote-code models
    written before v5 do not.

    This context manager patches ``torch.linspace`` to default to
    ``device="cpu"`` when no explicit device is requested, matching the best
    practice already used throughout transformers.  Calls that supply an
    explicit ``device`` argument (e.g. ``device=self.logits.device``) are left
    untouched.  ``torch.arange`` is intentionally NOT patched because it is
    used in RoPE computations where the device must match model parameters.
    c                  @    |                     dd            | i |S )Ndevicecpu)
setdefault)rw   rx   original_linspaces     r   _safe_linspacez5meta_device_safe_creation_ops.<locals>._safe_linspaceE  s/    (E***  $1&111r    N)rC   linspace)r   r   s    @r   meta_device_safe_creation_opsr   1  s`      $ 2 2 2 2 2 $EN+******s	   4 A)r   r   N)r*   )r   N)r   r2   r3   N)r   r   r:   r;   N)r*   N)r>   N)r2   rF   )&rQ   rb   collectionsr   
contextlibr   rC   rN   rO   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   Tensorfloat	GeneratorintstrrE   rX   r\   r^   ra   ro   r~   r   r   ru   r    r   <module>r      s    



 # # # # # # % % % % % %  &x}$(X] hm"HMhm"x}4hm26x}4X]08=,x}$  & _c L"-2EJ_W[E[
\    dh L %27JO/\`J`
\   el  5<    %, 5<    5< EL     %,     5<  U\     EL  Z^H^ jojv     5< u uY]G] iniu     $(, L  	
 % \     $(, L  	
 % \     (,
 
L


 

 	

 
 %
 \
 
 
 
 (, L
 % \	    cg L$)05IN[_I_
\   %, u|     A A A A,      1 1 14 != != !=H ; ; ;* + + + + +r    