
    Xj                         d dl Z d dlZd dlmZ d dlmZ d dlmZ  ee          Z ed          Z	dZ
dZ G d d	          ZdS )
    N)datetime)Path)
get_loggerz/tmp/tax_documents/indexzcheckpoint.db2   c                       e Zd Zddedededz  fdZdee         fdZdeddfd	Z	dd
Z
dedefdZdeddfdZdefdZddZdefdZddZdej        fdZddZdS )CheckpointManagerNtask_idcategory_iddb_dirc                     || _         || _        |pt          | _        | j        t          z  | _        t          j                    | _        g | _	        t                      | _        |                                  d S N)r	   r
   DEFAULT_CHECKPOINT_DIRr   CHECKPOINT_DB_NAMEdb_path	threadingLock_lock_pending_urlsset_completed_urls
_ensure_db)selfr	   r
   r   s       `/lsinfo/ai/hellotax_ai/data_center/backend/app/services/tax_data_processor/checkpoint_manager.py__init__zCheckpointManager.__init__   sf    &6 6{%77^%%
(*),    returnc                 R   |                                  5 }|                    d| j        | j        f                                          }d d d            n# 1 swxY w Y   d |D             | _        t                              dt          | j                   d           | j        S )NzESELECT doc_url FROM checkpoints WHERE task_id = ? AND category_id = ?c                     h | ]
}|d          S )r    ).0rows     r   	<setcomp>z8CheckpointManager.load_completed_urls.<locals>.<setcomp>   s    7773A777r   u
   已恢复     条断点记录)	_connectexecuter	   r
   fetchallr   loggerinfolen)r   connrowss      r   load_completed_urlsz%CheckpointManager.load_completed_urls   s   ]]__ 	V<< gjnjvx|  yI  jJ  K  K  T  T  V  VD	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V77$777LT%9!:!:LLLMMM##   5AAAdoc_urlc                    | j                             |           | j        5  | j                            |           t          | j                  t          k    r|                                  d d d            d S # 1 swxY w Y   d S r   )r   addr   r   appendr)   CHECKPOINT_INTERVAL_flushr   r.   s     r   	mark_donezCheckpointManager.mark_done!   s      )))Z 	 	%%g...4%&&*===	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA;;A?A?c                 n    | j         5  |                                  d d d            d S # 1 swxY w Y   d S r   )r   r3   r   s    r   flushzCheckpointManager.flush(   s{    Z 	 	KKMMM	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   *..c                     || j         v S r   )r   r4   s     r   is_donezCheckpointManager.is_done,   s    $...r   pagec           	          |                                  5 }|                    d| j        | j        |t	          j                                                    f           d d d            d S # 1 swxY w Y   d S )Na  
                INSERT INTO page_cursors (task_id, category_id, last_page, updated_at)
                VALUES (?, ?, ?, ?)
                ON CONFLICT(task_id, category_id)
                DO UPDATE SET last_page = excluded.last_page, updated_at = excluded.updated_at
                )r$   r%   r	   r
   r   utcnow	isoformat)r   r;   r*   s      r   save_page_cursorz"CheckpointManager.save_page_cursor/   s   ]]__ 	FLL  ~  AE  AM  OS  O_  ae  go  gv  gx  gx  gB  gB  gD  gD  @E  F  F  F	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	Fs   AA**A.1A.c                 R   |                                  5 }|                    d| j        | j        f                                          }d d d            n# 1 swxY w Y   |r|d         nd}|dk    r4t
                              d| j         d| j         d|dz    d| d	           |S )	NzHSELECT last_page FROM page_cursors WHERE task_id = ? AND category_id = ?r   [CheckpointManager] task= cat=u    从第    u    页继续（已完成第 1~u    页）)r$   r%   r	   r
   fetchoner'   r(   )r   r*   r!   r;   s       r   load_page_cursorz"CheckpointManager.load_page_cursor3   sG   ]]__ 	X,,ilplxz~  {K  lL  M  M  V  V  X  XC	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X 	X#s1vv!!88KK  UDL  U  UtGW  U  Uaehiai  U  U  HL  U  U  U  V  V  Vr-   c                 n   |                                  5 }|                    d| j        | j        f           |                    d| j        | j        f           d d d            n# 1 swxY w Y   | j                                         t                              d| j         d| j         d           d S )Nz=DELETE FROM checkpoints WHERE task_id = ? AND category_id = ?z>DELETE FROM page_cursors WHERE task_id = ? AND category_id = ?rA   rB   u    检查点已清理)r$   r%   r	   r
   r   clearr'   r(   r   r*   s     r   rG   zCheckpointManager.clear;   s    ]]__ 	}LLX[_[gimiyZz{{{LLY\`\hjnjz[{|||	} 	} 	} 	} 	} 	} 	} 	} 	} 	} 	} 	} 	} 	} 	} 	""$$$hhh4CShhhiiiiis   AA&&A*-A*c                 l    | j         | j        t          | j                  t          | j                  dS )N)r	   r
   completed_countpending_flush)r	   r
   r)   r   r   r7   s    r   statszCheckpointManager.statsB   sP    <8H]`aeau]v]v  JM  NR  N`  Ja  Ja  b  b  	br   c                    | j                             dd           |                                 5 }|                    d           |                    d           |                    d           d d d            d S # 1 swxY w Y   d S )NT)parentsexist_oka  
                CREATE TABLE IF NOT EXISTS checkpoints (
                    id          INTEGER PRIMARY KEY AUTOINCREMENT,
                    task_id     TEXT    NOT NULL,
                    category_id INTEGER NOT NULL,
                    doc_url     TEXT    NOT NULL,
                    created_at  TEXT    NOT NULL DEFAULT (datetime('now')),
                    UNIQUE(task_id, category_id, doc_url)
                )
                z
                CREATE INDEX IF NOT EXISTS idx_cp_task_cat
                ON checkpoints(task_id, category_id)
                af  
                CREATE TABLE IF NOT EXISTS page_cursors (
                    task_id     TEXT    NOT NULL,
                    category_id INTEGER NOT NULL,
                    last_page   INTEGER NOT NULL DEFAULT 0,
                    updated_at  TEXT    NOT NULL,
                    PRIMARY KEY (task_id, category_id)
                )
                )r   mkdirr$   r%   rH   s     r   r   zCheckpointManager._ensure_dbE   s    $666]]__ 	KLL  `  a  a  aLL  `  a  a  aLL  J  K  K  K	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	Ks   A A>>BBc                     t          j        t          | j                  d          }|                    d           |                    d           |S )N
   )timeoutzPRAGMA journal_mode=WALzPRAGMA synchronous=NORMAL)sqlite3connectstrr   r%   rH   s     r   r$   zCheckpointManager._connectL   sK    s4<00"===.///0111r   c                 L     j         sd S t          j                                                     fd j         D             }	                                  5 }|                    d|           d d d            n# 1 swxY w Y   t                              dt           j                    d           n4# t          $ r'}t          
                    d|            Y d }~nd }~ww xY w j                                          d S #  j                                          w xY w)Nc                 0    g | ]}j         j        |fS r   )r	   r
   )r    urlnowr   s     r   
<listcomp>z,CheckpointManager._flush.<locals>.<listcomp>V   s'    YYYst/c:YYYr   z
                    INSERT OR IGNORE INTO checkpoints
                        (task_id, category_id, doc_url, created_at)
                    VALUES (?, ?, ?, ?)
                    u   [CheckpointManager] 写入 r#   u(   [CheckpointManager] 写入断点失败: )r   r   r=   r>   r$   executemanyr'   debugr)   	ExceptionerrorrG   )r   r+   r*   erZ   s   `   @r   r3   zCheckpointManager._flushR   s   ! 	Fo))++YYYYYdFXYYY	' fD    "_  ae  f  f  ff f f f f f f f f f f f f f fLL`s4;M7N7N```aaaa 	I 	I 	ILLGAGGHHHHHHHH	I $$&&&&&D$$&&&&sT   B9 A=1B9 =BB9 B3B9 8D 9
C*C% D %C**D D#r   )r   N)__name__
__module____qualname__rV   intr   r   r   r,   r5   r8   boolr:   r?   rE   rG   dictrL   r   rT   
Connectionr$   r3   r   r   r   r   r      s         # td{    $SX $ $ $ $        /s /t / / / /FS FT F F F F#    j j j jbt b b b bK K K K',    ' ' ' ' ' 'r   r   )rT   r   r   pathlibr   common_loggingr   ra   r'   r   r   r2   r   r   r   r   <module>rj      s                     % % % % % %	H		899 $  P' P' P' P' P' P' P' P' P' P'r   