
    vj5                     v   d dl mZmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ  e            Z ej        ej        ej                   G d de                      Z  ej        ej        ej!                   G d de                      Z"dS )    )AnyDictListTupleUnionN)Preprocessors)Preprocessor)PREPROCESSORS)"TextClassificationPreprocessorBase)NLPTokenizerForLSTM#TokenClassificationPreprocessorBase)NLPTokenizer)FieldsModeKeys)get_model_typeparse_label_mapping)
get_logger)module_namec                   r     e Zd ZddZdddddej        dddf	dededeeef         de	dede
d	ef fd
Z xZS )/SpeakerDiarizationDialogueDetectionPreprocessorNc                 `    d|vr| j         t          j        k    rdnd |d<    | j        ||fi |S )Nreturn_tensorspt)moder   	INFERENCEnlp_tokenizer)self	sequence1	sequence2kwargss       p/lsinfo/ai/hellotax_ai/data_center/backend/venv/lib/python3.11/site-packages/modelscope/preprocessors/speaker.py_tokenize_textz>SpeakerDiarizationDialogueDetectionPreprocessor._tokenize_text   sN    6)),0I9K,K,KDDQU  "!t!)YAA&AAA    labelfirst_sequencesecond_sequencelabel2idr   
max_lengthuse_fastc
           	      v   |
                     dd          |
d<   |
                     dd          |
d<   ||n|
                     dd          |
d<   |
                    dd            d }|t          |          }t          ||||
          | _        t                                          |||||||	           d S )N
truncationTpaddingr(   sequence_length   )r)   tokenize_kwargs)getpopr   r   r   super__init__)r   	model_dirr%   r&   r$   r'   r   r(   r)   keep_original_columnsr    
model_type	__class__s               r!   r3   z8SpeakerDiarizationDialogueDetectionPreprocessor.__init__!   s      &zz,==|"JJy,??y*4*@JJfjj!3G( G( 		 	

$d+++
 '	22J)zHfN N NNOU!4)>	@ 	@ 	@ 	@ 	@r#   )N)__name__
__module____qualname__r"   r   r   strr   r   r   intboolr3   __classcell__r7   s   @r!   r   r      s        
B B B B  '+(,+2"&%/#'"&'+@ @!$@ #&@ c4i(	@
  @ @ !@  @ @ @ @ @ @ @ @ @ @r#   r   c                        e Zd Zdddddej        ddddf
dedededed	ed
ef fdZde	ee
e         f         fdZd Zd Zd Zd Zd Z xZS ):SpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessorNtextr$   FTr4   r%   r'   label_all_tokensr   c           
         t                                          |||||||	|
           d }|t          |          }|                    dd          |d<   |                    dd          |d<   ||n|                    dd          |d<   |                    dd            |dk    |d<   t          ||||	          | _        d S )
Nr+   Tr,   r(   r-   r.   lstmadd_special_tokens)r4   r6   r)   r/   )r2   r3   r   r0   r1   r   r   )r   r4   r%   r$   r'   rC   r   r(   r)   r5   return_textr    r6   r7   s                r!   r3   zCSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor.__init__@   s     	NE8)41F$	& 	& 	& 
 '	22J%zz,==|"JJy,??y*4*@JJfjj!3G( G( 		 	

$d+++'1V';#$0!"	$ $ $r#   c                 n   |}| j         t          j        k    rt          |t                    s
J d            | j                            dd          }|r|                    d          }|dk    s| j        rt	          |          }npg }|	                    t	          |d |                              |
                    d           |	                    t	          ||dz   d                               |}|r'| j         t          j        k    r | j        |fi |\  }}n4| j        j        j        r | j        |fi |\  }}n | j        |fi |\  }}d}t!          |d                   D ]\  }	}
|
| j        j        j        k    r|	} n|dk    r1t%          |t'          |d                             D ]}d|d         |<   | j         t          j        k    rH|                                D ]2}t+          j        ||                                       d	          ||<   3n|                    d
d            ||fS )NzsInput needs to be lists in training and evaluating,because the length of the words and the labels need to be equal.is_split_into_wordsFz[SEP]   	input_ids
label_maskr   offset_mapping)r   r   r   
isinstancelistr   get_tokenizer_kwargfindis_lstm_modelextendappend_tokenize_text_by_words	tokenizeris_fast"_tokenize_text_with_fast_tokenizer"_tokenize_text_with_slow_tokenizer	enumeratesep_token_idrangelenkeystorchtensor	unsqueezer1   )r   rB   r    tokensrI   sep_idx
tmp_tokens	encodingsword_idsidxtoken_idikeys                r!   r"   zISpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text`   s   9***fd++ S S .S S S+"0DD!5* * 
	$kk'**G"}} 2}f
!!$vhwh'7"8"8999!!'***!!$vgkll';"<"<===# 	"490B#B#B">$">#" #" #" #"Ixx)1 	""I$"I#" #" #" #"Ixx #J$"I#" #" #" #"Ix &y'=>> 	 	MC4-7DDD E b==7C	,(?$@$@AA 3 3-2	,'**9*** ~~'' K K!&in!=!=!G!G!J!J	#K MM*D111(""r#   c           	         g }g }g }g }t          |          D ]\  }}| j        j                            |d          }	t	          |	          dk    r| j        j        j        g}	|                    |	           |                    dgt	          |	          z             |                    dgdgt	          |	          dz
  z  z              |                    ||dz   fg           |                    d| j                            d                    }
|                    d|                    d| j                            d                              }| j                            d	          rdnd}t	          |          |d
|z  z
  k    r |d |d
|z  z
           }|d |d
|z  z
           }|d t          |                   }|
dk    rdg|z  |z   dg|t	          |          z
  |z
  z  z   }|dg|t	          |          z
  z  z   }| j        j        j
        g|z  |z   | j        j        j        g|z  z   | j        j        j        g|t	          |          z
  d
|z  z
  z  z   }|dg|d
z  z  z   dg|t	          |          z
  d
|z  z
  z  z   }nJdg|z  |z   dg|z  z   }| j        j        j
        g|z  |z   | j        j        j        g|z  z   }|dg|d
z  z  z   }||||d}|d fS )NF)rF   r      Tr,   r(   r-   rF      r   r   )rL   attention_maskrM   rN   )r[   r   rW   encoder^   unk_token_idrT   r0   rQ   sumcls_token_idr\   pad_token_id)r   rc   r    rL   rM   rN   rp   offsettokensubtoken_idsr,   r(   special_tokenrf   s                 r!   rV   zRSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text_by_words   s   	
&v.. 	: 	:MFE-7>>% ? 1 1L<  A%% $ 2 < IJ\***!!1#L(9(9"9:::tfw#l2C2Ca2G'HHIII!!FFQJ#7"89999**Y!/CCINNP PZZJJ()==lKKM MN N
 "/CC " " )'( 	z??Z!m*;;;;#$Ej1}3D&D$EFJ!"CJ]1B$B"CDI'(8Z(89l""=0:=:J7-GHIJ+vhS000/2 2N+5BCmSV__#-:;mKL#-:;zCPYNN?Z]^an]n?nopI ,qc!/# #&'S^!4!44q=7HH&JJNN  =0:=-'(J+5BCmSV__#-:;mKLI+qc]Q5F.GGN #,$,	
 
	 $r#   c                    t          |t                    } | j        |fd|d|}g }|                                }g }t	          t          |                    D ]}||         |                    d            ||         ||dz
           k    r=|                    d           |s%|d         d         |d         |         d         f|d<   r|                    d           |r'|                    ||         ||         dz   f           |                    |d         |                    | j                            d          }	|	d	k    r&|d
gt          |          t          |          z
  z  z   }||d<   ||d<   ||fS )NT)return_offsets_mappingrI   Frm   rJ   r   rN   r,   r(   ro   rM   )rO   rP   r   rg   r]   r^   rU   rQ   )
r   rc   r    rI   rf   rM   rg   rN   rj   r,   s
             r!   rY   z]SpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text_with_fast_tokenizer   s   (66&D&#' 3  	 	
 
%%''s8}}%% 	J 	JA{"!!%((((!Q//!!%(((* M*8*<Q*?*34D*Ea*H*K*MN2& !!$'''& J"))8A;a*HIIII")))4D*Ea*HIIII$88CCl""+vhJ#n"5"55/7 7N&4	"#",	,(""r#   c           	         | j         t          j        k    rt          |t                    s
J d            d } | j        |fddi|}| j                                        }d|z   }t          | |          st          d| d| d| d           t          | |          |          \  }}|
                    d	| j                            d	                    }	|
                    d
| j                            d
                    }
|
                    d| j                            d                    rdnd}t          |          |
d|z  z
  k    r|d |
d|z  z
           }|d t          |                   }|	d
k    r=dg|z  |z   dg|
t          |          z
  |z
  z  z   }|dg|
t          |          z
  z  z   }ndg|z  |z   dg|z  z   }||d<   ||d<   ||fS )NzSlow tokenizer now only support str input in inference mode. If you are training models, please consider using the fast tokenizer.rI   F"get_label_mask_and_offset_mapping_zNo `z` method defined for tokenizer z>, please use a fast tokenizer instead, or try to implement a `z` methodr,   r(   rF   rm   r   rn   ro   rN   rM   )r   r   r   rO   r;   r   get_tokenizer_classhasattrRuntimeErrorgetattrr0   rQ   r^   rs   )r   rc   r    rg   rf   tokenizer_namemethodrM   rN   r,   r(   ry   s               r!   rZ   z]SpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text_with_slow_tokenizer   s   yH...:fc3J3J..8 /.J &D&9 9(-9179 9	+??AA5FtV$$ 	98v 8 8+8 8'-8 8 89 9 9 &;WT6%:%:6%B%B"
N**Y!/CCINNP PZZ$,@@NNP P
#ZZ 22$& &' ' . -. 	 z??Z!m*;;;;#$Ej1}3D&D$EFJ'(8Z(89l""=0:=:J7-GHIJ+vhS000/2 2NN  =0:=-'(J&4	"#",	,(""r#   c                    g }g }| j         j                            |          }d}|D ]}|d d         dk    }|r|                    d           n|dd          }|                    d           |||d                              |          z   }|t          |          z   }	|r|                    ||	f           n|d         d         |	f|d<   |	}||fS )Nr   rn   z##TFrJ   )r   rW   tokenizerU   indexr^   )
r   rB   rM   rN   rc   rv   rw   is_startstartends
             r!   /get_label_mask_and_offset_mapping_BertTokenizerzjSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor.get_label_mask_and_offset_mapping_BertTokenizer  s
   
#-66t<< 	 	Ebqb	T)H )!!$''''abb	!!%(((T&'']00777E#e**$C B%%ucl3333&4R&8&;S%Ar"FF>))r#   c                    g }g }| j         j                            |          }d}d}|D ]}|d         dk    }|r6|dd          }|                    d           t	          |          dk    rd}En|                    d           |||d                              |          z   }	|	t	          |          z   }
|s|r|                    |	|
f           n|d         d         |
f|d<   |
}d}||fS )Nr   F_rm   TrJ   )r   rW   r   rU   r^   r   )r   rB   rM   rN   rc   rv   last_is_blankrw   r   r   r   s              r!   5get_label_mask_and_offset_mapping_XLMRobertaTokenizerzpSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor.get_label_mask_and_offset_mapping_XLMRobertaTokenizer"  s2   
#-66t<< 	" 	"EaCH )abb	!!$'''u::??$(M # !!%(((T&'']00777E#e**$C B B%%ucl3333&4R&8&;S%Ar"F!MM>))r#   )r8   r9   r:   r   r   r;   r   r=   r3   r   r   r"   rV   rY   rZ   r   r   r>   r?   s   @r!   rA   rA   ;   s        #''-%"&*/%/ '+!$ $$!$$ $  	$
 $($ $ $ $ $ $ $@,#5d3i#8 ,# ,# ,# ,#\4 4 4l# # #B%# %# %#N* * *,* * * * * * *r#   rA   )#typingr   r   r   r   r   r`   modelscope.metainfor   modelscope.preprocessorsr	    modelscope.preprocessors.builderr
   =modelscope.preprocessors.nlp.text_classification_preprocessorr   >modelscope.preprocessors.nlp.token_classification_preprocessorr   r   3modelscope.preprocessors.nlp.transformers_tokenizerr   modelscope.utils.constantr   r   modelscope.utils.hubr   r   modelscope.utils.loggerr   loggerregister_moduleaudiosen_cls_tokenizerr   token_cls_tokenizerrA    r#   r!   <module>r      s   1 0 0 0 0 0 0 0 0 0 0 0 0 0  - - - - - - 1 1 1 1 1 1 : : : : : :' ' ' ' ' '> > > > > > > > L L L L L L 6 6 6 6 6 6 6 6 D D D D D D D D . . . . . .	 
Lm=? ? ? @  @  @  @  @* @  @? ? @F 
Lm?A A A}* }* }* }* }*+}* }*A A}* }* }*r#   