
    j O                     H   d dl Z d dlZd dlZd dlmZ d dlmZ d dl	m	Z	m
Z
 d dlmZ d dlZd dlmZ d dlmZmZmZmZmZmZmZmZmZ  G d d	ej                  Z G d
 dej                  Z G d dej                  Z G d d          Z G d dee          Z G d dee          Z  G d dee          Z! G d dee          Z" G d dee          Z# G d de          Z$ G d dee          Z% G d dee          Z& G d  d!ee          Z' G d" d#ee          Z(dS )$    N)abstractmethod)fields)gettextngettext)Path)Json)	
BaseConfigChunkingConfigEmbeddingConfigFileStorageConfigPartitionConfigPermissionsConfigProcessorConfig
ReadConfigRetryStrategyConfigc            	           e Zd ZdZ	 	 ddej        dej        ej                 dej        ej	                 dej        fdZ
dS )	DictdictNvalueparamctxreturnc                     	 t          j        |          S # t           j        $ r< |                     t	          d                              |          ||           Y d S w xY w)Nz"{value} is not a valid json value.r   )jsonloadsJSONDecodeErrorfailr   format)selfr   r   r   s       l/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/ingest/cli/interfaces.pyconvertzDict.convert   s    		:e$$$# 	 	 	II8 &u&%%     	s    AA! A!NN)__name__
__module____qualname__nametAnyOptionalclick	ParameterContextr"        r!   r   r      sv        D
 .2)-	 u z%/* Z&	
 
     r/   r   c            	           e Zd ZdZddefdZ	 	 ddej        dej        e	j
                 dej        e	j                 d	ej        fd
ZdS )
FileOrJsonzfile-or-jsonFallow_raw_strc                     || _         d S N)r2   )r    r2   s     r!   __init__zFileOrJson.__init__1   s    *r/   Nr   r   r   r   c                    t           j                            t           j                            |                    }t           j                            |          r.t          t          |                                                    S t          |t
                    r5	 t          j
        |          S # t          j        $ r | j        r|cY S Y nw xY w|                     t          d                              |          ||           d S )Nz<{value} is not a valid json string nor an existing filepath.r   )ospathabspath
expanduserisfilestrr   resolve
isinstancer   r   r   r2   r   r   r   )r    r   r   r   	full_paths        r!   r"   zFileOrJson.convert4   s    GOOBG$6$6u$=$=>>	7>>)$$ 	2tI..00111eS!! 	!!z%(((' ! ! !% ! LLL! !! 			N f5f!!	
 	
 	
 	
 	
s    B4 4CC)Fr#   )r$   r%   r&   r'   boolr5   r(   r)   r*   r+   r,   r-   r"   r.   r/   r!   r1   r1   .   s        D+ +d + + + + .2)-	
 
u
 z%/*
 Z&	

 

 
 
 
 
 
r/   r1   c            	           e Zd ZdZddedej        ej        e                  fdZ	 	 ddej	        dej        e
j                 d	ej        e
j                 d
ej	        fdZdS )DelimitedStringzdelimited-string,N	delimiterchoicesc                 *    |r|ng | _         || _        d S r4   )rE   rD   )r    rD   rE   s      r!   r5   zDelimitedString.__init__P   s    ")1wwr"r/   r   r   r   r   c           
         t          |t                    rd |D             }n$d |                    | j                  D             }| j        s|S d                    t          t          | j                            }|D ]Y}|| j        vrN|                     t          ddt          | j                                                |||          ||           Z|S )Nc                 P    g | ]#}t          |                                          $S r.   )r<   strip.0vs     r!   
<listcomp>z+DelimitedString.convert.<locals>.<listcomp>\   s&    333SVV\\^^333r/   c                 6    g | ]}|                                 S r.   )rI   rJ   s     r!   rM   z+DelimitedString.convert.<locals>.<listcomp>^   s     DDD1QWWYYDDDr/   z, z{value!r} is not {choice}.z"{value!r} is not one of {choices}.)r   choicerE   )r>   listsplitrD   rE   joinmapreprr   r   lenr   )r    r   r   r   rQ   choices_strss          r!   r"   zDelimitedString.convertT   s     eT"" 	E33U333EEDDDN(C(CDDDE| 	LiiD$, 7 788 
	 
	A$$		4<DL))  f1[+fNN   r/   )rC   Nr#   )r$   r%   r&   r'   r<   r(   r*   Listr5   r)   r+   r,   r-   r"   r.   r/   r!   rB   rB   M   s        D# ## #aj6M # # # # .2)-	 u z%/* Z&	
 
     r/   rB   c                       e Zd Zeedej        ej                 fd                        Z	e
dej        ddfd            Zdej        dej        ej                 fdZdS )CliMixinr   c                      d S r4   r.   r.   r/   r!   get_cli_optionszCliMixin.get_cli_optionsq   s	     	r/   cmdNc                 f    |                                  }t                              ||           d S )N)params)r\   rZ   
add_params)clsr]   options_to_adds      r!   add_cli_optionszCliMixin.add_cli_optionsv   s2    ,,..C77777r/   r_   c                    g }| j         D ]}|                    |j                   |D ]X}|j        D ]N}||v rt          | d| j                   |                    |           | j                             |           OYd S )Nz# is already defined on the command )r_   extendopts
ValueErrorr'   append)r]   r_   existing_optsr   opts        r!   r`   zCliMixin.add_params{   s    Z 	- 	-E  ,,,, 	) 	)Ez ) )-''$%Z%ZPSPX%Z%Z[[[$$S)))
!!%((((	)	) 	)r/   )r$   r%   r&   staticmethodr   r(   rX   r+   Optionr\   classmethodCommandrc   r,   r`   r.   r/   r!   rZ   rZ   p   s        QVEL1    ^ \ 8%- 8D 8 8 8 [8
) 
)qveo/F 
) 
) 
) 
) 
) 
)r/   rZ   c                       e Zd ZdS )	CliConfigN)r$   r%   r&   r.   r/   r!   rp   rp      s        Dr/   rp   c                   p     e Zd Zedej        ej                 fd            Ze	de
f fd            Z xZS )CliRetryStrategyConfigr   c                  ~    t          j        dgd t          d          t          j        dgd t          d          g} | S )Nz--max-retrieszMIf provided, will use this max retry for back off strategy if http calls faildefaulttypehelpz--max-retry-timez_If provided, will attempt retries for this long as part of back off strategy if http calls fail)r+   rl   intfloatoptionss    r!   r\   z&CliRetryStrategyConfig.get_cli_options   s^     L !7	   L#$:	  
  r/   kvsc                     t          t                    r-fdt          |           D             }fd|D             }|sdS  t                      j        ddi|S )zG
        Return None if none of the fields are being populated
        c                 0    h | ]}|j         v |j         S r.   )r'   )rK   fieldr|   s     r!   	<setcomp>z3CliRetryStrategyConfig.from_dict.<locals>.<setcomp>   s(    TTT%%*PSBSBS5:BSBSBSr/   c                 d    g | ],}                     |                               |          -S r.   )get)rK   nr|   s     r!   rM   z4CliRetryStrategyConfig.from_dict.<locals>.<listcomp>   s2    JJJ1swwqzzJCGGAJJJJJr/   Nr|   r.   )r>   r   r   super	from_dict)ra   r|   kwargsfield_namesfield_values	__class__s    `   r!   r   z CliRetryStrategyConfig.from_dict   s    
 c4   	TTTT6#;;TTTKJJJJJJJL t uww 33S3F333r/   r$   r%   r&   rk   r(   rX   r+   rl   r\   rm   r   r   __classcell__r   s   @r!   rr   rr      s~        QVEL1    \& 	4D 	4 	4 	4 	4 	4 [	4 	4 	4 	4 	4r/   rr   c                   J    e Zd Zedej        ej                 fd            ZdS )CliProcessorConfigr   c            
         t          j        dgddd          t          j        dgdd	          t          j        d
gt          t          t          j                    dz  dz  dz  dz                                            dd          t          j        dgddd          t          j        dgddd          t          j        ddgdd          g} | S )Nz--reprocessTFzqReprocess a downloaded file even if the relevant structured output .json file in output directory already exists.is_flagru   rw   z--output-dirzstructured-outputz-Where to place structured output .json files.ru   rw   z
--work-dirz.cacheunstructuredingestpipelinez6Where to place working files when processing each step)rv   ru   show_defaultrw   z--num-processes   z7Number of parallel processes with which to process docs)ru   r   rw   z--raise-on-errorzoIs set, will raise error if any doc in the pipeline fail. Otherwise will log error and continue with other docsz-vz	--verboser   ru   )r+   rl   r<   r   homer=   rz   s    r!   r\   z"CliProcessorConfig.get_cli_options   s    LH	   L +D  
 LY[[8+n<xG*T]]__  "M   L"#!N	   L#$9	   L$,dEJJJG$
J r/   N	r$   r%   r&   rk   r(   rX   r+   rl   r\   r.   r/   r!   r   r      sB        &QVEL1 & & & \& & &r/   r   c                   J    e Zd Zedej        ej                 fd            ZdS )CliReadConfigr   c            
          t          j        dgd          t          j        dgddd          t          j        d	gddd
          t          j        dgddd          t          j        dgd t          d          g} | S )Nz--download-dirzuWhere files are downloaded to, defaults to a location at`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.)rw   z--re-downloadTFzCRe-download files even if they are already present in download dir.r   z--preserve-downloadsz]Preserve downloaded files. Otherwise each file is removed after being processed successfully.z--download-onlyzDownload any files that are not already present in either --download-dir or the default download ~/.cache/... location in case --download-dir is not specified and skip processing them through unstructured.z
--max-docsz@If specified, process at most the specified number of documents.rt   r+   rl   rx   rz   s    r!   r\   zCliReadConfig.get_cli_options   s     L!"P  
 L !Z	   L'(6	   L"#=	   LW	  9"
F r/   Nr   r.   r/   r!   r   r      sB        $QVEL1 $ $ $ \$ $ $r/   r   c                   J    e Zd Zedej        ej                 fd            ZdS )CliPartitionConfigr   c                  6   t          j        dgddd          t          j        dgdd	          t          j        d
gd t          d          d          t          j        dgd d	          t          j        dgt                      d d          t          j        dgt                      d          t          j        dgt                      g dd          t          j        dgddd          t          j        dgg t                      d          t          j        dgg t                      d          t          j        d gddd!          t          j        d"gd#d$	          t          j        d%gd d&	          t          j        d'gd d(	          g} | S ))Nz--pdf-infer-table-structureTFzIPartition will include the table's text_as_html in the response metadata.r   z
--strategyautozsThe method that will be used to process the documents. Default: auto. Other strategies include `fast` and `hi_res`.r   z--ocr-languages+)rD   zA list of language packs to specify which languages to use for OCR, separated by '+' e.g. 'eng+deu' to use the English and German language packs. The appropriate Tesseract language pack needs to be installed.rt   z
--encodingz_Text encoding to use when reading documents. By default the encoding is detected automatically.z--skip-infer-table-typesz;Optional list of document types to skip table extraction on)rv   ru   rw   z--additional-partition-argszEA json string representation of values to pass through to partition()rv   rw   z--fields-include)
element_idtextrv   metadata
embeddingszSComma-delimited list. If set, include the specified top-level fields in an element.z--flatten-metadatazResults in flattened json elements. Specifically, the metadata key values are brought to the top-level of the element, and the `metadata` key itself is removed.z--metadata-includezmComma-delimited list. If set, include the specified metadata fields if they exist and drop all other fields. z--metadata-excludezOComma-delimited list. If set, drop the specified metadata fields if they exist.z--partition-by-apizXUse a remote API to partition the files. Otherwise, use the function from partition.autoz--partition-endpointz.https://api.unstructured.io/general/v0/generalzhIf partitioning via api, use the following host. Default: https://api.unstructured.io/general/v0/generalz	--api-keyzAPI Key for partition endpoint.z--hi-res-model-namezModel name for hi-res strategy.)r+   rl   rB   r   rz   s    r!   r\   z"CliPartitionConfig.get_cli_options  sH    L./c	   LO   L"#$s3337	   L*   L+,$&&R	   L./VV\  
 L#$$&&PPP(	   L%&Z	   L%&$&&C	   L%&$&&(	   L%&C	   L'(HJ   L6  
 L&'6  m[
x r/   Nr   r.   r/   r!   r   r     sI        ]QVEL1 ] ] ] \] ] ]r/   r   c                   V    e Zd ZU eed<   edej        ej	                 fd            Z
dS )CliRecursiveConfig	recursiver   c                  :    t          j        dgddd          g} | S )N--recursiveTFlRecursively download files in their respective folders otherwise stop at the files in provided folder level.r   )r+   rl   rz   s    r!   r\   z"CliRecursiveConfig.get_cli_optionsg  s7     LH	  
 r/   N)r$   r%   r&   r@   __annotations__rk   r(   rX   r+   rl   r\   r.   r/   r!   r   r   d  sM         OOO
QVEL1 
 
 
 \
 
 
r/   r   c                   J    e Zd Zedej        ej                 fd            ZdS )CliFilesStorageConfigr   c            	          t          j        dgdd          t          j        dgt          ddd          t          j        d	gddd
          t          j        dgd t                      d          g} | S )Nz--remote-urlTz4Remote fsspec URL formatted as `protocol://dir/path`)requiredrw   z--uncompressFz^Uncompress any archived files. Currently supporting zip and tar files based on file extension.)rv   ru   r   rw   r   r   r   z--file-globzjA comma-separated list of file globs to limit which types of local files are accepted, e.g. '*.html,*.txt'rt   )r+   rl   r@   rB   rz   s    r!   r\   z%CliFilesStorageConfig.get_cli_optionsv  s     L K  
 L 1   LH	   L$&&@	  +
: r/   Nr   r.   r/   r!   r   r   u  sB        QVEL1    \  r/   r   c                   p     e Zd Zedej        ej                 fd            Ze	de
f fd            Z xZS )CliEmbeddingConfigr   c            	         ddl m}  t          j        dgdt	          |            t          j        t	          |                               t          j        dgdt          d           t          j        d	gd
t          d           g}|S )Nr   )EMBEDDING_PROVIDER_TO_CLASS_MAPz--embedding-providerz7Type of the embedding class to be used. Can be one of: )rw   rv   z--embedding-api-keyzCAPI key for the embedding model, for the case an API key is needed.)rw   rv   ru   z--embedding-model-namezfEmbedding model name, if needed. Chooses a particular LLM between different options, to embed with it.)unstructured.embedr   r+   rl   rP   Choicer<   )r   r{   s     r!   r\   z"CliEmbeddingConfig.get_cli_options  s    FFFFFF L'(;788; ;\$'F"G"GHH	   L&'Z	   L)*X  
* r/   r|   c                 P   t          |t                    rwd |                                D             }t          |                                          dk    rdS |                    dd          sdS  t                      j        |fi |S  t                      j        |fi |S )a  
        Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
        This allows CLI arguments to be prepended with embedding_ during CLI invocation but
        doesn't require that as part of the field names in this class
        c                 n    i | ]2\  }}|                     d           |t          d           d         |3S )
embedding_N
startswithrU   rK   krL   s      r!   
<dictcomp>z0CliEmbeddingConfig.from_dict.<locals>.<dictcomp>  sS       Aq<<--#l##%%&  r/   r   Nprovider)r>   r   itemsrU   keysr   r   r   )ra   r|   r   new_kvsr   s       r!   r   zCliEmbeddingConfig.from_dict  s     c4   
	8 IIKK  G
 7<<>>""a''t;;z400 t$577$W77777 uww /////r/   r   r   s   @r!   r   r     s~        QVEL1    \4 0D 0 0 0 0 0 [0 0 0 0 0r/   r   c                   p     e Zd Zedej        ej                 fd            Ze	de
f fd            Z xZS )CliChunkingConfigr   c            
         t          j        dgdd          t          j        dgdd          t          j        dgt          dd          t          j        d	gt          d
d          t          j        dgt          d
d          g} | S )Nz--chunk-elementsTFr   z--chunk-multipage-sectionsz"--chunk-combine-text-under-n-charsi  )rv   ru   r   z--chunk-new-after-n-charsi  z--chunk-max-charactersr   rz   s    r!   r\   z!CliChunkingConfig.get_cli_options  s     L#$  
 L-.  
 L56!	   L,-!	   L)*!	  /
< r/   r|   c                    t          |t                    r|                                }i }d|v r|                    d          }|sdS ||d<   |                    d |                                D                        t          |                                          dk    rdS  t                      j	        dd|i|S  t                      j	        dd|i|S )a  
        Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
        This allows CLI arguments to be prepended with chunking_ during CLI invocation but
        doesn't require that as part of the field names in this class
        chunk_elementsNc                 n    i | ]2\  }}|                     d           |t          d           d         |3S )chunk_Nr   r   s      r!   r   z/CliChunkingConfig.from_dict.<locals>.<dictcomp>  sO       1||H--c(mmoo&  r/   r   r|   r.   )
r>   r   copypopupdater   rU   r   r   r   )ra   r|   r   r   r   r   s        r!   r   zCliChunkingConfig.from_dict  s     c4   	<((**CG3&&!$)9!:!:%  4,:()NN  #		     7<<>>""a''t$577$;;;F;;; uww 33S3F333r/   r   r   s   @r!   r   r     s        QVEL1    \B 4D 4 4 4 4 4 [4 4 4 4 4r/   r   c                   p     e Zd Zedej        ej                 fd            Ze	de
f fd            Z xZS )CliPermissionsConfigr   c                      t          j        dgt          d          t          j        dgt          d          t          j        dgt          d          g} | S )Nz--permissions-application-idz"Microsoft Graph API application idr   z--permissions-client-credz+Microsoft Graph API application credentialsz--permissions-tenantzJe.g https://contoso.onmicrosoft.com to get permissions data within tenant.)r+   rl   r<   rz   s    r!   r\   z$CliPermissionsConfig.get_cli_options	  sv     L/09  
 L,-B  
 L'(a  
" r/   r|   c                    t          |t                    r|                    d          }|                    d          }|                    d          }|||g}t          |          rt	          |          st          d          d |                                D             }t          |                                          dk    rdS  t                      j
        d	d|i|S  t                      j
        d	d|i|S )
aR  
        Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
        This allows CLI arguments to be prepended with permissions_ during CLI invocation but
        doesn't require that as part of the field names in this class. It also checks if the
        CLI params are provided as intended.
        permissions_application_idpermissions_client_credpermissions_tenantzPlease provide either none or all of the following optional values:
--permissions-application-id
--permissions-client-cred
--permissions-tenantc                 n    i | ]2\  }}|                     d           |t          d           d         |3S )permissions_Nr   r   s      r!   r   z2CliPermissionsConfig.from_dict.<locals>.<dictcomp>8  sS       Aq<<//#n%%''(!  r/   r   Nr|   r.   )r>   r   r   anyallrg   r   rU   r   r   r   )	ra   r|   r   r   r   r   permission_valuesr   r   s	           r!   r   zCliPermissionsConfig.from_dict  s'    c4   	<),1M)N)N&&)gg.G&H&H#!$)=!>!>*'"!
 $%% c2C.D.D  +   IIKK  G
 7<<>>""a''t$577$;;;F;;; uww 33S3F333r/   r   r   s   @r!   r   r     s~        QVEL1    \( !4D !4 !4 !4 !4 !4 [!4 !4 !4 !4 !4r/   r   ))r   os.pathr7   typingr(   abcr   dataclassesr   r   r   pathlibr   r+   dataclasses_json.corer   unstructured.ingest.interfacesr	   r
   r   r   r   r   r   r   r   	ParamTyper   r1   rB   rZ   rp   rr   r   r   r   r   r   r   r   r   r.   r/   r!   <module>r      sy                     % % % % % % % %        & & & & & &
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
    5?   *
 
 
 
 
 
 
 
>         eo      F) ) ) ) ) ) ) )0	 	 	 	 	
H 	 	 	4 4 4 4 40( 4 4 4D( ( ( ( (( ( ( (V& & & & &J & & &R_ _ _ _ _( _ _ _D       "         -x      F.0 .0 .0 .0 .0( .0 .0 .0b<4 <4 <4 <4 <4 <4 <4 <4~84 84 84 84 84,h 84 84 84 84 84r/   