
    jE              '          U d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZmZmZ d dlmZmZmZmZmZmZmZ d dlmZmZmZmZ d dlmZ d dlmZmZ d d	l m!Z! ej"        d
k     rd dl#m$Z$ nd dlm$Z$ d dl%m&Z& d dl'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9m:Z:m;Z;m<Z<m=Z= d dl>m?Z?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZF ddgZGe$eeH                  eId<   dZJeHeId<   deHdee1         fdZKdeHdeeHeHf         fdZLdedee1         fdZM	 d@ded eeH         d!eeH         de2fd"ZNd#eHdeeH         fd$ZO	 d@d%ed&eeH         deeeHeHf                  fd'ZPd( ZQd)ee4e6f         d*ejR        dee1e1f         fd+ZS	 	 dAd eeH         d,eeeeT         ef                  deeeH         ef         fd-ZU e7             e@e?jV                   e&            dddddd.d/d0ddd.dd dd1gd.fd eeH         d,eeeeT         ef                  d2eeH         d3eHd4eeH         d5eWd6eeX         d7eWd8eeH         d!eeH         d9eWd:ee         d;eeX         d<eeH         d=eeeH                  d>eWdee1         f"d?                                    ZYdS )B    N)Message)partial)NamedTemporaryFileSpooledTemporaryFileTemporaryDirectory)IOCallableDictListOptionalTupleUnion)COMMON_ENCODINGSformat_encoding_strread_txt_filevalidate_encoding)logger)convert_to_bytesexactly_one)apply_lang_metadata)      )Final)add_chunking_strategy)clean_extra_whitespacereplace_mime_encodings)extract_datetimetzextract_email_addressextract_ip_addressextract_ip_address_nameextract_mapi_id)ElementElementMetadataImageNarrativeTextTextTitleprocess_metadata)MetaDataReceivedInfo	RecipientSenderSubject)FileTypeadd_metadata_with_filetype)EMAIL_DATETIMETZ_PATTERN_RE)partition_html)partition_text	text/html
text/plainVALID_CONTENT_SOURCESemailDETECTION_ORIGINdatareturnc                    t          |           }t          |           }t          |           }t          |           }g }|r<|r:t	          ||          D ])\  }}|                    t          ||                     *|r*|                    t          d|d                              |r2|                    t          dt          |          |                     |S )Nnametextmapi_idr   received_datetimetz)r<   r=   	datestamp)r    r   r!   r   zipappendr*   str)r8   ip_address_namesip_addressesr>   
datetimetzelementsr<   ips           f/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/partition/email.py_parse_received_datarJ   @   s    .t44%d++Ld##G#D))J H >L >,l;; 	> 	>HD"OOLd<<<==== G)'!*EEEFFF 
*__$  	
 	
 	
 O    c                     t          |           }d}t          j        ||                                           d                                                                         }||d         fS )Nz)<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>r   )r   resplitlowertitlestrip)r8   email_addressPATTERNr<   s       rI   _parse_email_addressrT   W   sZ    )$//M9G8GTZZ\\**1-3355;;==Dq!!!rK   msgc                    g }|                                  D ]3}|d         dk    rFt          |d                   }|                    t          |d         |d                              U|d         dk    rFt          |d                   }|                    t	          |d         |d                              |d         dk    r*|                    t          |d                              |d         dk    r|t          |d                   z  }|                    t          |d         |d                              5|S )	Nr   To   r;   Fromr-   )r=   Received)	raw_itemsrT   rB   r+   r,   r-   rJ   r)   )rU   rG   itemr=   s       rI   partition_email_headerr]   `   s<    H B B7d??'Q00DOOI47aAAABBBB!W'Q00DOOFQd1g>>>????!W	!!OOGa1112222!W
"",T!W555HHOOH$q'Q@@@AAAAOrK   filenamemetadata_last_modifiedc                    t          |                                           }|                    d          }|t          |          }|                    d          }|d |                    d          D             }|                    d          }|d |                    d          D             }t          |||                    d          |p||	          }t          |_        |S )
zKCreates an ElementMetadata object from the header information in the email.DateNrY   c                 6    g | ]}|                                 S  rQ   ).0senders     rI   
<listcomp>z(build_email_metadata.<locals>.<listcomp>   s     GGGV\\^^GGGrK   ,rW   c                 6    g | ]}|                                 S rc   rd   )re   	recipients     rI   rg   z(build_email_metadata.<locals>.<listcomp>   s"    III9??$$IIIrK   r-   )sent_to	sent_fromsubjectlast_modifiedr^   )dictr[   getconvert_to_iso_8601rN   r#   r7   detection_origin)rU   r^   r_   header_dict
email_daterl   rk   element_metadatas           rI   build_email_metadatarv   s   s     s}}''K((J(44
''IGG)//#2F2FGGG	ood##GIIgmmC6H6HIII&	**,:
   )9%rK   timec                 (   t          |           }t          j        |          }|t          j        |  d           dS |                                \  }}|||         }t          j                            |d          }|                                S )z?Converts the datetime from the email output to ISO-8601 format.Nz; did not match RFC-2822 format. Unable to extract the time.z%a, %d %b %Y %H:%M:%S %z)	r   r0   searchr   warningspandatetimestrptime	isoformat)rw   cleaned_timeregex_matchstartend	dt_stringdatetime_objects          rI   rq   rq      s    )$//L-4\BBKPPP	
 	
 	
 t!!##JE3U3Y'I'00<VWWO$$&&&rK   message
output_dirc                    g }|                                  D ]}d|v r|d                             d          }d |D             }i }|D ]}|                                dv r|                    dd          \  }}t          |                    dd                    }t          |                    dd                    }t          |          |t          |          <   |                    d	
          |d<   |                    |           t          |          D ]\  }	}
|rd|
v rR|dz   |
d         z   }t          |d          5 }|	                    |
d                    d d d            n# 1 swxY w Y   ]t          d|d          5 }t          j                            |j                  ||	         d<   |	                    |
d                    d d d            n# 1 swxY w Y   А|S )Nzcontent-disposition;c                 ,    g | ]}t          |          S rc   )r   )re   r\   s     rI   rg   z+extract_attachment_info.<locals>.<listcomp>   s!    DDDd+D11DDDrK   )
attachmentinline=rX   " Tdecodepayloadr^   /wbF)modedirdelete)walkrN   rO   r   replaceget_payloadrB   	enumerateopenwriter   ospathbasenamer<   )r   r   list_attachmentspartcdispattachment_infor\   keyvalueidxr   r^   fs                rI   extract_attachment_infor      s     !; !; D((./55c::EDDeDDDE O  ::<<#;;;!ZZQ//
U,S[[b-A-ABB.u}}S"/E/EFF?U@ @ 6s ; ;<< *.)9)9)9)F)FOI&##O444#,-=#>#> ; ;Z ;!Z//#-#3j6L#L!(D11 ;Q GGJy$9:::; ; ; ; ; ; ; ; ; ; ; ; ; ; ;
 0!% *#(   ; @B@P@PQRQW@X@X,S1*=GGJy$9:::; ; ; ; ; ; ; ; ; ; ; ; ; ; ; s%   E00E47E4A	G##G'*G'c                 ^    t          j        d          }|                    | j                  S )Nz\[image: .+\])rM   compilery   r=   )elementrS   s     rI   has_embedded_imager      s%    j))G>>','''rK   r   indicesc                 R   |                                 |                                }}| j        ||         }t          |                    d          d                   }| j                            d|d d         z   dz   d          | _        t          |d d         d          | fS )	N:rX   z[image: ]r   r6   )r=   rr   )r   r   r=   r   rN   r   r$   )r   r   r   r   image_raw_info
image_infos         rI   find_embedded_imager      s     '++--3E\%),N'(<(<S(A(A!(DEEJ<''
Z_(Ds(JBOOGLj"o@@@'IIrK   filec                    | =t          | d          5 }t          j        |          }d d d            n# 1 swxY w Y   n5|$t          |          }t          j        |          }nt          d          d }|                                pg }|D ]+}|r'|                                rt          |          r|} n,|rt          |          nd }||fS )Nrbz-Either 'filename' or 'file' must be provided.)
r   r6   message_from_binary_filer   message_from_bytes
ValueErrorget_charsetsrQ   r   r   )	r^   r   r   rU   f_bytesencodingcharsetscharsetformatted_encodings	            rI   parse_emailr      s1    (D!! 	4Q033C	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4		"4((&w//HIIIH!!'RH   	w}} 	+<W+E+E 	HE:BL,X666s""s   488Fi  Tautor=   content_sourcer   include_headersmax_partitioninclude_metadatametadata_filenameprocess_attachmentsattachment_partitionermin_partitionchunking_strategy	languagesdetect_language_per_elementc           	      V
   |t           vrt          | dt                      ||                                dk    r|s| sg S t          | ||           d}| At	          |           \  }}|r|}nt          | |          \  }}t          j        |          }nh|At	          |          \  }}|r|}nNt          ||	          \  }}t          j        |          }n%|#t          |          }t          j        |          }|s|}d
}i }|	                                D ]}|
                                |                                }|                    d          rd}|                                dk    rv|                    dd          dk    r\	 |                    d                              |          ||<   # t"          t$          f$ r |                                ||<   Y w xY w|                                ||<   |                    |d          }g }|rt'          j        d           nE|snA|dk    r|                    d          }d                    |          }t/          |d
|dgd          }|D ]}t1          |t2                    rt5          t6          |          }	 |                    |           D# t"          t$          f$ r g }t:          D ],} t=          |           }!|!|k    r|                    |!           -|D ]F}"	 t5          t6          |"          }|                    |            n# t"          t$          f$ r Y Cw xY wY w xY wn|dk    rtA          ||||dgd
d          }tC          |          D ]c\  }#}tE          |          }$t1          |tF          tH          f          r3|$r1tK          ||$          \  }%}&|&||#<   |&                    |#dz   |%           dg }'|rtO          |          }'|'|z   }(tQ          ||p| |	          })|(D ]}tS          j*        |)          |_+        |
rtY                      5 }*t[          ||*           t]          j/        |*          }+|+D ]},t\          j0                            |*|,          }-|t          d           ||-|	||          }.|.D ]=}|,|j+        _1        d|j+        _2        |p| |j+        _3        |(                    |           >	 ddd           n# 1 swxY w Y   ti          tk          |(||                    }|S )a0  Partitions an .eml documents into its constituent elements.
    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the .eml document.
    content_source
        default: "text/html"
        other: "text/plain"
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    max_partition
        The maximum number of characters to include in a partition. If None is passed,
        no maximum is applied. Only applies if processing the text/plain content.
    metadata_filename
        The filename to use for the metadata.
    metadata_last_modified
        The last modified date for the document.
    process_attachments
        If True, partition_email will process email attachments in addition to
        processing the content of the email itself.
    attachment_partitioner
        The partitioning function to use to process attachments.
    min_partition
        The minimum number of characters to include in a partition. Only applies if
        processing the text/plain content.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    zE is not a valid value for content_source. Valid content sources are: Nr   )r^   r   r=   zutf-8)r^   )r^   r   )r   )r   r   F	encryptedTr=   zcontent-transfer-encodingbase64r   zGEncrypted email detected. Partition function will return an empty list.r3   z=
r6   )r=   r   r   r   rr   )r   r4   )r=   r   r   r   r   r   rr   rX   )r^   r_   z@Specify the attachment_partitioner kwarg to process attachments.)r^   r_   r   r   )rG   r   r   )6r5   r   rQ   r   r   r   r6   message_from_stringrC   r   get_content_dispositionget_content_typeendswithget_content_maintyperp   r   r   UnicodeDecodeErrorUnicodeErrorr   rz   rN   joinr1   
isinstancer&   r   r   applyr   r   rB   r2   r   r   r%   r'   r   insertr]   rv   copydeepcopymetadatar   r   r   listdirr   r^   file_directoryattached_to_filenamelistr   )/r^   r   r=   r   r   r   r   r   r   r_   r   r   r   r   r   r   kwargsdetected_encodingextracted_encodingrU   	file_text_textis_encryptedcontent_mapr   content_typecontentrG   list_contentr   _replace_mime_encodingscommon_encodingsx_xencr   r   r   clean_elementheaderall_elementsr   tmpdirattached_filesattached_fileattached_filenameattached_elementss/                                                  rI   partition_emailr      s   v 222 B B*?B B
 
 	

 DJJLLB..t.H.	 48888"-x"@"@"@C 	7 2+8!!, , ,(y +I66CC		"-4"8"8"8C 	7 2+8dX+V+V+V(y+I66CC		YY'.. %$L"$K

 ; ; ''))5,,..
   -- 	 L %%''6114d;;xGG?,0,<,<D,<,I,I,P,PQY,Z,ZL))&5 ? ? ?,0,<,<,>,>L)))? )-(8(8(:(:K%%oonb11G H =
U	
 	
 	
 	
  8
	;	&	& }}U++'',''!"/d$
 
 
   	% 	%G'4(( %*1*%+ + +'%MM"9::::*L9 % % %')$- 8 8033>>,33B777/ 	% 	%%6= 6),7 7 73 $MM*ABBB!E 2LA % % %$H%!%%	%6 
<	'	'!''d"$
 
 
 "(++ 1 1W$W--w 677 	1W 	1(;GW(M(M%J)HSMOOC!GZ000F -',,H$L#".h5  H
   3 3=22 1!! 	1V#C000Z//N!/ 1 1$&GLL$G$G!)1$Z   %;$:.+A"/"/	% % %!  1 1 1G0=G$-6:G$3<M<YQYG$9 ''0000	11	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1* !(C	
 	
 	
 H Os\   ,G  (G+*G+0KAM+L?=M?M	MM	MMB*S??TT)N)NN)Zr   r|   r6   r   rM   sysemail.messager   	functoolsr   tempfiler   r   r   typingr   r	   r
   r   r   r   r    unstructured.file_utils.encodingr   r   r   r   unstructured.loggerr   unstructured.partition.commonr   r   unstructured.partition.langr   version_infotyping_extensionsr   unstructured.chunkingr   unstructured.cleaners.corer   r   unstructured.cleaners.extractr   r   r   r    r!   unstructured.documents.elementsr"   r#   r$   r%   r&   r'   r(   %unstructured.documents.email_elementsr)   r*   r+   r,   r-    unstructured.file_utils.filetyper.   r/   unstructured.nlp.patternsr0   unstructured.partition.htmlr1   unstructured.partition.textr2   r5   rC   __annotations__r7   rJ   rT   r]   rv   rq   r   r   Matchr   bytesr   EMLboolintr   rc   rK   rI   <module>r     s      				 				 



 ! ! ! ! ! !       Q Q Q Q Q Q Q Q Q Q C C C C C C C C C C C C C C C C C C            ' & & & & &        < ; ; ; ; ;f''''''' 7 7 7 7 7 7 U U U U U U U U                                            R Q Q Q Q Q Q Q A A A A A A 6 6 6 6 6 6 6 6 6 6 6 6+6*E uT#Y' E E E #   s tG}    ."s "uS#X " " " " DM    , -1 	sm %SM 	   <'c 'hsm ' ' ' '$ !%) ))) 
$sCx.) ) ) )X( ( (
	J=%'(	JX	J 7G	J 	J 	J 	J #=A# #sm#
5E$889
:# 8C='!"# # # #2 HL))"=A%"!#'!'+,0 %15#$'+&,X(-!m msmm
5E$889
:m 3-m 	m
 smm m C=m m  }m %SMm m %X.m C=m  }m S	"m  "&!m$ 
']%m m m  *) m m mrK   