o
    ~ri?4                     @  s   d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	 er&ddl
mZ G dd dZG d	d
 d
eZG dd deZG dd deZdS )    )annotationsN)Queue)TYPE_CHECKINGAnycast   )PreTrainedTokenizerBasec                   @  s    e Zd ZdZdd Zdd ZdS )BaseStreamerzG
    Base class from which `.generate()` streamers should inherit.
    c                 C     t  )z;Function that is called by `.generate()` to push new tokensNotImplementedErrorselfvalue r   o/lsinfo/ai/hellotax_ai/llm_service/venv_embed/lib/python3.10/site-packages/transformers/generation/streamers.pyput       zBaseStreamer.putc                 C  r
   )zHFunction that is called by `.generate()` to signal the end of generationr   r   r   r   r   end$   r   zBaseStreamer.endN)__name__
__module____qualname____doc__r   r   r   r   r   r   r	      s    r	   c                   @  s@   e Zd ZdZddd	d
Zdd Zdd ZddddZdd ZdS )TextStreamera)  
    Simple text streamer that prints the token(s) to stdout as soon as entire words are formed.

    <Tip warning={true}>

    The API for the streamer classes is still under development and may change in the future.

    </Tip>

    Parameters:
        tokenizer (`AutoTokenizer`):
            The tokenized used to decode the tokens.
        skip_prompt (`bool`, *optional*, defaults to `False`):
            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
        decode_kwargs (`dict`, *optional*):
            Additional keyword arguments to pass to the tokenizer's `decode` method.

    Examples:

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
        >>> streamer = TextStreamer(tok)

        >>> # Despite returning the usual output, the streamer will also print the generated text to stdout.
        >>> _ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)
        An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
        ```
    F	tokenizerr   skip_promptbooldecode_kwargsr   c                 K  s(   || _ || _|| _g | _d| _d| _d S )Nr   T)r   r   r   token_cache	print_lennext_tokens_are_prompt)r   r   r   r   r   r   r   __init__K   s   
zTextStreamer.__init__c                 C  s  t |jdkr|jd dkrtdt |jdkr|d }| jr(| jr(d| _dS | j|  tt	| j
j| jfi | j}|drR|| jd }g | _d| _n5t |dkrr| t|d rr|| jd }|  jt |7  _n|| j|dd  }|  jt |7  _| | dS )	zm
        Receives tokens, decodes them, and prints them to stdout as soon as they form entire words.
           r   z'TextStreamer only supports batch size 1FN
 )lenshape
ValueErrorr   r!   r   extendtolistr   strr   decoder   endswithr    _is_chinese_charordrfindon_finalized_text)r   r   textprintable_textr   r   r   r   U   s&   
zTextStreamer.putc                 C  sd   t | jdkr$tt| jj| jfi | j}|| jd }g | _d| _nd}d| _| j	|dd dS )z;Flushes any remaining cache and prints a newline to stdout.r   N T)
stream_end)
r'   r   r   r,   r   r-   r   r    r!   r2   )r   r3   r4   r   r   r   r   w   s   zTextStreamer.endr3   r,   r6   c                 C  s   t |d|sdndd dS )zNPrints the new text to stdout. If the stream is ending, also prints a newline.Tr5   N)flushr   )printr   r3   r6   r   r   r   r2      s   zTextStreamer.on_finalized_textc                 C  s   |dkr|dks@|dkr|dks@|dkr|dks@|dkr |dks@|d	kr(|d
ks@|dkr0|dks@|dkr8|dks@|dkrB|dkrBdS dS )z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFr   )r   cpr   r   r   r/      s   zTextStreamer._is_chinese_charNF)r   r   r   r   r   r   r3   r,   r6   r   )	r   r   r   r   r"   r   r   r2   r/   r   r   r   r   r   )   s    !
"r   c                      D   e Zd ZdZ		dd fddZddddZdd Zdd Z  ZS )TextIteratorStreamera  
    Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. This is
    useful for applications that benefit from accessing the generated text in a non-blocking way (e.g. in an interactive
    Gradio demo).

    <Tip warning={true}>

    The API for the streamer classes is still under development and may change in the future.

    </Tip>

    Parameters:
        tokenizer (`AutoTokenizer`):
            The tokenized used to decode the tokens.
        skip_prompt (`bool`, *optional*, defaults to `False`):
            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
        timeout (`float`, *optional*):
            The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
            in `.generate()`, when it is called in a separate thread.
        decode_kwargs (`dict`, *optional*):
            Additional keyword arguments to pass to the tokenizer's `decode` method.

    Examples:

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
        >>> from threading import Thread

        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
        >>> streamer = TextIteratorStreamer(tok)

        >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
        >>> generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
        >>> thread = Thread(target=model.generate, kwargs=generation_kwargs)
        >>> thread.start()
        >>> generated_text = ""
        >>> for new_text in streamer:
        ...     generated_text += new_text
        >>> generated_text
        'An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,'
        ```
    FNr   r   r   r   timeoutfloat | Noner   r   c                   s.   t  j||fi | t | _d | _|| _d S N)superr"   r   
text_queuestop_signalr?   )r   r   r   r?   r   	__class__r   r   r"      s   
zTextIteratorStreamer.__init__r3   r,   r6   c                 C  s2   | j j|| jd |r| j j| j| jd dS dS )\Put the new text in the queue. If the stream is ending, also put a stop signal in the queue.r?   N)rC   r   r?   rD   r9   r   r   r   r2         z&TextIteratorStreamer.on_finalized_textc                 C     | S rA   r   r   r   r   r   __iter__      zTextIteratorStreamer.__iter__c                 C  s$   | j j| jd}|| jkrt |S NrH   )rC   getr?   rD   StopIterationr   r   r   r   __next__   s   
zTextIteratorStreamer.__next__FNr   r   r   r   r?   r@   r   r   r;   r<   )	r   r   r   r   r"   r2   rK   rP   __classcell__r   r   rE   r   r>      s    0r>   c                      r=   )AsyncTextIteratorStreamera'	  
    Streamer that stores print-ready text in a queue, to be used by a downstream application as an async iterator.
    This is useful for applications that benefit from accessing the generated text asynchronously (e.g. in an
    interactive Gradio demo).

    <Tip warning={true}>

    The API for the streamer classes is still under development and may change in the future.

    </Tip>

    Parameters:
        tokenizer (`AutoTokenizer`):
            The tokenized used to decode the tokens.
        skip_prompt (`bool`, *optional*, defaults to `False`):
            Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots.
        timeout (`float`, *optional*):
            The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions
            in `.generate()`, when it is called in a separate thread.
        decode_kwargs (`dict`, *optional*):
            Additional keyword arguments to pass to the tokenizer's `decode` method.

    Raises:
        TimeoutError: If token generation time exceeds timeout value.

    Examples:

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer, AsyncTextIteratorStreamer
        >>> from threading import Thread
        >>> import asyncio

        >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt")

        >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
        >>> async def main():
        ...     # Important: AsyncTextIteratorStreamer must be initialized inside a coroutine!
        ...     streamer = AsyncTextIteratorStreamer(tok)
        ...     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20)
        ...     thread = Thread(target=model.generate, kwargs=generation_kwargs)
        ...     thread.start()
        ...     generated_text = ""
        ...     async for new_text in streamer:
        ...         generated_text += new_text
        >>>     print(generated_text)
        >>> asyncio.run(main())
        An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,
        ```
    FNr   r   r   r   r?   r@   r   r   c                   sp   t  j||fi | t | _d | _|| _t | _t	tdd }t
jdko)t|| _| jr3|| _d S d | _d S )Nr?   )      )rB   r"   asyncior   rC   rD   r?   get_running_looploopgetattrsysversion_infocallablehas_asyncio_timeoutasyncio_timeout)r   r   r   r?   r   timeout_contextrE   r   r   r"   "  s   

z"AsyncTextIteratorStreamer.__init__r3   r,   r6   c                 C  s2   | j | jj| |r| j | jj| j dS dS )rG   N)rY   call_soon_threadsaferC   
put_nowaitrD   r9   r   r   r   r2   2  rI   z+AsyncTextIteratorStreamer.on_finalized_textc                 C  rJ   rA   r   r   r   r   r   	__aiter__8  rL   z#AsyncTextIteratorStreamer.__aiter__c              	     s   zA| j r3| jd ur3| | j4 I d H  | j I d H }W d   I d H  n1 I d H s-w   Y  ntj| j | jdI d H }W n tjyM   t w || jkrVt	 |S rM   )
r^   r_   r?   rC   rN   rW   wait_forTimeoutErrorrD   StopAsyncIterationr   r   r   r   	__anext__;  s   (
z#AsyncTextIteratorStreamer.__anext__rQ   rR   r;   r<   )	r   r   r   r   r"   r2   rc   rg   rS   r   r   rE   r   rT      s    7rT   )
__future__r   rW   r[   queuer   typingr   r   r   tokenization_utils_baser   r	   r   r>   rT   r   r   r   r   <module>   s   yK