
    j%                        d dl Z d dlZd dlZd dlZd dlmZmZmZmZ d dl	Z	d dl
Zd dlmZ d dlmZmZ d dlmZmZ d dlmZmZ  e j        d          Z e j                    Zde_         e j        d	          Ze                    e           dd
 ej        D             vre                    e           e                     e j!                   g dZ"	 	 	 	 	 	 d*de#de#deee#                  deee#                  de#dee#         dee$e$e$f         de%ddfdZ&	 	 	 	 d+de#de#deee#                  deee#                  de#de%fdZ'de#fdZ(d Z)d Z*d,de#de#d ej+        d!e#fd"Z,d-d$eej-        ee.         f         d%ee$         fd&Z/d-d$eee.                  d%ee$         fd'Z0d-d$eee.                  d%ee$         fd(Z1d) Z2dS ).    N)ListOptionalTupleUnion)tqdm)$calculate_element_type_percent_matchget_element_type_frequency)calculate_accuracycalculate_percent_missing_text)elements_from_jsonelements_to_textzunstructured.ingestingest_log_handlerz:%(asctime)s %(processName)-10s %(levelname)-8s %(message)sc                     g | ]	}|j         
S  )name).0hs     g/lsinfo/ai/hellotax_ai/base_platform/venv/lib/python3.11/site-packages/unstructured/metrics/evaluate.py
<listcomp>r      s    @@@1@@@    )metricaverage	sample_sdpopulation_sdcountmetrics      r   F
output_dir
source_diroutput_listsource_list
export_dirgroupingweights	visualizereturnc           	      b   |st          |           }|st          |          }|s#t          d           t          j        d           g }t	          |d|           D ]}	|	                    d          d                             d          d         }
|
                    dd	          d         }|
d
z   }t          |	                    d                    d	k    r|	                    d          d         nd}||vr!|
                    dd	          d         }|d
z   }||v rt          t          t          j                            | |	                              }t          t          j                            ||                    }t          t          |||          d          }t          t!          ||          d          }|                    |
||||g           g d}t%          j        ||          }d}|dg                             t*          t,          t.          dg                                          }|dg                             t*          t,          t.          dg                                          }t%          j        ||f                                          }t6          |_        |r|dv r|                    |                              dt*          t,          dgi                              ddd          }|                    |                              dt*          t,          dgi                              ddd          }t?          ||          }d| d}nt          d           tA          || d|           tA          |d|           tC          |           dS )a  
    Loops through the list of structured output from all of `output_dir` or selected files from
    `output_list`, and compare with gold-standard of the same file name under `source_dir` or
    selected files from `source_list`.

    Calculates text accuracy and percent missing. After looped through the whole list, write to tsv.
    Also calculates the aggregated accuracy and percent missing.
    z;No output files to calculate to edit distances for, exitingr   Fleavedisable/.json.r   z.txtN   )filenamedoctype	connectorcct-accuracycct-%missingcolumnszall-docs-cctr5   r   r6   )r3   r4   meanstdev)_mean_stdevzall-z-agg-cctz7No field to group by. Returning a non-group evaluation.z.tsvzaggregate-scores-cct.tsv)"_listdir_recursiveprintsysexitr   splitrsplitlenr   r   ospathjoin
_read_textroundr
   r   appendpd	DataFrameaggr;   r<   _pstdev	transposeconcatreset_indexagg_headersr8   groupbyrename_format_grouping_output_write_to_file_display)r    r!   r"   r#   r$   r%   r&   r'   rowsdocr2   r3   fn_txtr4   fn
output_cct
source_cctaccuracypercent_missingheadersdfexport_filenameaccmissagg_dfgrouped_accgrouped_misss                              r    measure_text_extraction_accuracyrg   $   s   $  5(44 5(44 KLLLD Ku)mDDD S SIIcNN2&--g66q9//#q))"-F"),SYYs^^)<)<q)@)@CIIcNN1%%d	 $$a((+B&[F[  )*<RW\\*VY=Z=Z*[*[\\J#BGLLV$D$DEEJ/
JPPRSTTH#$B:z$Z$Z\]^^OKK7IxQRRRRRRG	dG	,	,	,B$O
n

"
"E67G#D
E
E
O
O
Q
QC~##UFGW$EFFPPRRDYT{##//11F FN M///

8$$nufg&>?@@&G D DEE  

8$$nufg&>?@@&G D DEE 
 )lCCB7X777OOKLLL:/777<<<:96BBBVr   c           	         |st          |           }|st          |          }g }t          |d|           D ]J}|                    d          d                             d          d         }|                    dd          d         }	|dz   }
t	          |                    d                    dk    r|                    d          d         nd	}|
|v rt          t          t          j        	                    | |                              }t          t          t          j        	                    ||
                              }t          t          ||          d
          }|                    ||	||g           Lg d}t          j        ||          }|j        r)t          j        g d                                          }nP|                    dt$          t&          t(          dgi                                          }|                                }t,          |_        t1          |d|           t1          |d|           t3          |           d	S )a  
    Loops through the list of structured output from all of `output_dir` or selected files from
    `output_list`, and compare with gold-standard of the same file name under `source_dir` or
    selected files from `source_list`.

    Calculates element type frequency accuracy and percent missing. After looped through the
    whole list, write to tsv. Also calculates the aggregated accuracy.
    Fr*   r-   r.   r/   r   r0   r   Nr1   )r2   r3   r4   element-type-accuracyr7   )ri   NNNr   ri   r   z#all-docs-element-type-frequency.tsvz!aggregate-scores-element-type.tsv)r=   r   rA   rB   rC   r	   rG   rD   rE   rF   rH   r   rI   rJ   rK   emptyrN   rL   r;   r<   rM   rP   rQ   r8   rU   rV   )r    r!   r"   r#   r$   r'   rW   rX   r2   r3   fn_jsonr4   outputsourcer]   r_   r`   rd   s                     r   measure_element_type_accuracyrn   v   s.      5(44 5(44D Ku)mDDD 
B 
BIIcNN2&--g66q9//#q))"-W$),SYYs^^)<)<q)@)@CIIcNN1%%d	k!!/
27<<
TW;X;X0Y0YZZF/
27<<
T[;\;\0]0]^^FA&&QQSTUUHKK7Ix@AAAKKKG	dG	,	,	,B	x &LLLMMWWYY05&'72STUU__aa##%% FN:DbIII:BFKKKVr   dirc                     g }t          j        |           D ]a\  }}}|D ]X}t           j                            ||           }|dk    r|                    |           >|                    | d|            Yb|S )Nr0   r-   )rD   walkrE   relpathrI   )ro   listdirdirpath_	filenamesr2   relative_paths          r   r=   r=      s    G!# > >I! 	> 	>HGOOGS99M##x((((-<<(<<====	> Nr   c                  R    t          j        | d                                          S )Nr   )axis)rJ   rO   rP   )r`   s    r   rT   rT      s#    9Ra   ,,...r   c                     t                     dk    rd S  j                                        } fd|D             t          j        d                    fdt          |          D                                  t          j        dt                    z  dt          |          dz
  z  z                                               D ]\  }}g |D ]Q}t          |t                    r                    |d           /                    t          |                     Rt          j        d                    fdt          t                              D                                  d S )	Nr   c           
          g | ]<}t          t          |          t          d  |         D                                 =S )c              3   N   K   | ] }t          t          |                    V  !d S N)rC   str)r   items     r   	<genexpr>z&_display.<locals>.<listcomp>.<genexpr>   s.      CCST^^CCCCCCr   )maxrC   )r   headerr`   s     r   r   z_display.<locals>.<listcomp>   sQ       IOCKKCC6
CCCCCDD  r    c              3   T   K   | ]"\  }}|                     |                   V  #d S r}   ljust)r   ir   
col_widthss      r   r   z_display.<locals>.<genexpr>   s7      XX	6Z]33XXXXXXr   -r   z.3fc              3   Z   K   | ]%}|                              |                   V  &d S r}   r   )r   r   r   formatted_rows     r   r   z_display.<locals>.<genexpr>   s:      ^^q]1%++JqM::^^^^^^r   )rC   r8   tolistclickechorF   	enumeratesumiterrows
isinstancefloatrI   r~   range)r`   r_   ru   rowr   r   r   s   `    @@r   rV   rV      s   
2ww!||j!!G   SZ  J 
JsxxXXXXYwEWEWXXXXXYYY	JsS__$sc'llQ.>'??@@@++-- 	
 	
3 	0 	0D$&& 0$$]]3333$$SYY////
HH^^^^^E#mJ\J\D]D]^^^^^	
 	
 	
 	
	
 	
r   wr2   r`   modec                    |dvrt          d          | r3t          j                            |           st          j        |            d|j        v r#|d                             t                    |d<   d|j        v r"d|j        v r|                    ddgd           |	                    t          j        
                    | |          d|d	|d
k               d S )N)r   az/Mode not supported. Mode must be one of [w, a].r   r2   r4   T)byinplace	Fr   )sepr   indexr   )
ValueErrorrD   rE   existsmakedirsr8   astypeintsort_valuesto_csvrF   )ro   r2   r`   r   s       r   rU   rU      s    :JKKK
 27>>#&& 
C"*k((--7RZK2:$=$=
;
3TBBBIIbgll3))t$eUY]`U`Ibbbbbr   r1   scoresroundingc                 |    t          |           dk    rd S t          j        |           }|s|S t          ||          S )Nr   )rC   
statisticsr9   rH   )r   r   r9   s      r   r;   r;      sD    
6{{at?6""D x   r   c                     d | D             } t          |           dk    rd S |st          j        |           S t          t          j        |           |          S )Nc                     g | ]}||S r}   r   r   scores     r   r   z_stdev.<locals>.<listcomp>       ===5+<e+<+<+<r   r   )rC   r   r:   rH   r   r   s     r   r<   r<      s_    =====F
6{{at ('''!&))8444r   c                     d | D             } t          |           dk    rd S |st          j        |           S t          t          j        |           |          S )Nc                     g | ]}||S r}   r   r   s     r   r   z_pstdev.<locals>.<listcomp>   r   r   r   )rC   r   pstdevrH   r   s     r   rM   rM      s_    =====F
6{{at ) ((("6**H555r   c                     t          | d          5 }|                                }d d d            n# 1 swxY w Y   |S )Nignore)errors)openread)rE   ftexts      r   rG   rG      sy    	d8	$	$	$ vvxx              Ks   377)NNr   Nr   F)NNr   F)r   )r1   )3loggingrD   r   r?   typingr   r   r   r   r   pandasrJ   r   !unstructured.metrics.element_typer   r	   $unstructured.metrics.text_extractionr
   r   unstructured.staging.baser   r   	getLoggerloggerStreamHandlerhandlerr   	Formatter	formattersetFormatterhandlers
addHandlersetLevelDEBUGrQ   r~   r   boolrg   rn   r=   rT   rV   rK   rU   Seriesr   r;   r<   rM   rG   r   r   r   <module>r      s    				     



 / / / / / / / / / / / /                   d c c c c c c c J J J J J J J J		0	1	1
'

!
!#GZ[[	   Y    @@@@@@@
g     KJJ (,'+"$-O OOO $s)$O $s)$	O
 O smO 3S=!O O 
O O O Oj (,'+/ /// $s)$/ $s)$	/
 / / / / /d
C 
 
 
 
/ / /
 
 
*	c 	c 	cs 	c 	cC 	c 	c 	c 	c! !%	4;./ !8C= ! ! ! !5 54( 5HSM 5 5 5 56 6D%) 6Xc] 6 6 6 6    r   