import re

from common_logging import get_logger

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Bracketed-year document number such as "财税〔2016〕36号": a 2-6 character
# CJK issuer prefix, a 4-digit year in any of the accepted bracket styles, an
# optional "第", the serial number and "号".  The negative lookbehind only lets
# the prefix start at the beginning of the text or right after whitespace or
# one of the listed punctuation / opening-bracket characters.
_DOC_NUM_RE = re.compile('(?<![^\\s，,。、（(《〈\\n])[\\u4e00-\\u9fa5]{2,6}[〔\\[（(]\\d{4}[〕\\]）)]第?\\s*\\d+\\s*号')
# Order numbers that carry no year: one of the three listed order names
# followed by "第<digits>号".
_NO_YEAR_RE = re.compile('(?:国务院令|主席令|国务院关税税则委员会令)第\\s*\\d+\\s*号')
# A title enclosed in 《》 or 〈〉 whose text (2-60 characters, no newline) is
# followed by one of the listed legal-instrument suffixes before the closing mark.
_BOOK_RE = re.compile('[《〈][^》〉\\n]{2,60}(?:法|条例|办法|规定|实施细则|通则|准则|决定|细则|暂行规定|管理规定)[》〉]')
# Union of every reference form recognised by the extractor, tried in order:
#   1. bracketed-year document number with a 2-15 character CJK prefix;
#   2. "<prefix>公告<year>年第<n>号" announcement number;
#   3. year-less order number (same alternatives as _NO_YEAR_RE);
#   4. enclosed title with 5-60 leading characters, a listed suffix, and an
#      optional parenthetical of 1-10 characters before the closing mark.
# The pattern has no capturing groups, so findall() yields whole matches.
_ANY_REF_RE = re.compile('(?:(?:[\\u4e00-\\u9fa5]{2,15}[〔\\[（(]\\d{4}[〕\\]）)]第?\\s*\\d+\\s*号)|(?:[\\u4e00-\\u9fa5]{2,15}公告\\d{4}年第\\s*\\d+\\s*号)|(?:(?:国务院令|主席令|国务院关税税则委员会令)第\\s*\\d+\\s*号)|(?:[《〈][^》〉\\n]{5,60}(?:法|条例|办法|规定|实施细则|通则|准则|决定|细则|暂行规定|管理规定)(?:[（(][^）)\\n]{1,10}[）)])?[》〉]))')
# Individual characters that _clean() treats as not belonging to an issuer
# prefix: when walking backwards from the year marker it stops at the first
# character found in this set.
_VERB_CHARS = set('根据按照依照执行废止失效撤销修订修正修改印颁施实生予同及以')

def _clean(ref: str) -> str:
    """Trim leading non-issuer text from a matched document number.

    The greedy CJK prefix of a reference match can swallow characters that
    precede the issuer name.  This keeps only the trailing run of CJK
    characters that are not in ``_VERB_CHARS`` immediately before the year
    marker.

    Args:
        ref: A matched reference string.

    Returns:
        ``ref`` unchanged when it starts with ``《``/``〈`` or contains no year
        marker; otherwise the trimmed prefix followed by the rest of the
        reference.  When fewer than two prefix characters survive trimming,
        the last two CJK characters before the year marker are used instead
        (or the whole original prefix if there are not two).
    """
    if ref.startswith(('《', '〈')):
        return ref

    # The year marker is either an opening bracket followed by four digits or
    # the "公告<year>年" form.
    marker = re.search('[〔\\[（(]\\d{4}|公告\\d{4}年', ref)
    if marker is None:
        return ref

    head = ref[:marker.start()]
    tail = ref[marker.start():]

    # Walk left from the marker while characters are CJK and not verb-like.
    cut = len(head)
    while cut > 0 and '一' <= head[cut - 1] <= '龥' and head[cut - 1] not in _VERB_CHARS:
        cut -= 1
    issuer = head[cut:]
    if len(issuer) >= 2:
        return issuer + tail

    # Fallback: too little survived, so take the last two CJK characters.
    cjk_chars = re.findall('[\\u4e00-\\u9fa5]', head)
    if len(cjk_chars) >= 2:
        return ''.join(cjk_chars[-2:]) + tail
    return head + tail
# A complete document number: either a bracketed-year number with a 2-15
# character CJK prefix, or a year-less order number ("国务院令第N号" etc.).
_FULL_DOC_NUMBER_RE = re.compile('(?:[\\u4e00-\\u9fa5]{2,15}[〔\\[（(]\\d{4}[〕\\]）)]第?\\s*\\d+\\s*号|(?:国务院令|主席令|国务院关税税则委员会令)第\\s*\\d+\\s*号)')
# Amendment target: "对《title》 ... 作如下/进行 ... 修改/修订/修正".  The title
# (named group "doc") may be wrapped in a Markdown link, in which case the
# http(s) link target is captured in the named group "url".
_AMEND_TARGET_RE = re.compile('对\\s*(?:\\[)?(?P<doc>[《〈][^》〉\\n]{2,80}(?:法|条例|办法|规定|实施细则|通则|准则|决定|细则|暂行规定|管理规定)[》〉])(?:\\]\\((?P<url>https?://[^)]+)\\))?[^。\\n]{0,40}?(?:作如下|进行)[^。\\n]{0,15}?(?:补充)?修[改订正]')

class RelationshipExtractor:
    """Extract document-to-document relationships from Markdown text.

    The text is split into sentences on ``。`` and newlines.  Each sentence is
    classified by keyword as a repeal, an amendment, or a plain citation, and
    the references found in it are recorded accordingly.  A second pass picks
    up numbered list items that follow a "fully repealed" / "partially
    repealed" heading line.
    """

    # Markdown links pointing at fgk.chinatax.gov.cn.
    _FGK_LINK_RE = re.compile('\\[([^\\]]+)\\]\\((https?://fgk\\.chinatax\\.gov\\.cn[^\\)]+)\\)')
    # 《title》（bracketed-year document number）
    _TITLE_WITH_BRACKET_NUM_RE = re.compile('[《〈]([^》〉\\n]{2,60})[》〉][（(]([\\u4e00-\\u9fa5]{2,15}[〔\\[（(]\\d{4}[〕\\]）)]第?\\s*\\d+\\s*号)[）)]')
    # 《title》（...公告<year>年第<n>号 + up to 10 trailing characters）
    _TITLE_WITH_NOTICE_NUM_RE = re.compile('[《〈]([^》〉\\n]{2,60})[》〉][（(]([\\u4e00-\\u9fa5]{2,15}公告\\d{4}年第\\s*\\d+\\s*号)[^）)]{0,10}[）)]')
    _SENTENCE_SPLIT_RE = re.compile('[。\\n]')
    # Sentence rewrites: collapse "《title》（number）" to just the number.
    _TITLED_ORDER_RE = re.compile('[《〈][^》〉\\n]{2,60}[》〉][（(]((?:国务院令|主席令|国务院关税税则委员会令)第\\s*\\d+\\s*号)[）)]')
    _TITLED_NOTICE_RE = re.compile('[《〈][^》〉\\n]{2,60}[》〉][（(]([\\u4e00-\\u9fa5]{2,15}公告\\d{4}年第\\s*\\d+\\s*号)[^）)]{0,10}[）)]')
    # "《<authority>关于...公告》（<year>年第<n>号）", optionally a Markdown link.
    _TITLED_ANNOUNCEMENT_RE = re.compile('(?:\\[)?[《〈]([\\u4e00-\\u9fa5 ]{2,20})关于[^》〉\\n]{2,50}公告[》〉](?:\\]\\([^)]+\\))?[（(](\\d{4})年第\\s*(\\d+)\\s*号[）)]')
    # An article / paragraph / item designator such as "第三条".
    _CLAUSE_RE = re.compile('第[一二三四五六七八九十百\\d]+(?:条|款|项)')
    # Numbered list item: "1. 《title》（text）", title optionally a Markdown link.
    _LIST_ITEM_RE = re.compile('^\\d+[．.、]\\s*(?:\\[)?([《〈][^》〉\\n]{2,60}[》〉])(?:\\]\\(([^)]+)\\))?\\s*[（(]([^）)\\n]{3,80})[）)]')

    _REPEAL_KEYWORDS = ('废止', '失效', '撤销')
    _AMEND_KEYWORDS = ('修订', '修正', '修改')
    _SELF_ABOLISH_MARKERS = ('本文', '本规定', '本办法', '本通知', '全文废止', '宣布失效', '已宣布失效', '予以废止')
    _ANNOUNCE_MARKERS = ('予以公告', '予以发布', '现予公告', '现予发布')
    _FULL_REPEAL_MARKERS = ('全文废止', '全文失效')
    _PARTIAL_REPEAL_MARKERS = ('部分废止', '部分条款废止', '部分失效')

    def extract(self, content_markdown: str) -> dict:
        """Extract relationships from a document's Markdown content.

        Args:
            content_markdown: Markdown text of the document.  ``None`` or an
                empty string yields an empty result.

        Returns:
            A dict with four keys:

            * ``'supersedes'``: list of dicts with ``'doc_number'`` and
              ``'relation'`` (``'supersedes'``, ``'partially_supersedes'`` or
              ``'amends'``), plus ``'source_url'`` when a link was captured
              and ``'title'`` for entries taken from repeal lists.
            * ``'references'``: list of ``{'doc_number': ...}`` dicts for
              plain citations.
            * ``'reference_urls'``: de-duplicated URLs in discovery order.
            * ``'doc_num_to_title'``: space-stripped document number mapped
              to the title it was attached to.
        """
        supersedes: list[dict] = []
        references: list[dict] = []
        reference_urls: list[str] = []
        doc_num_to_title: dict[str, str] = {}
        # The containers are filled in place below, so the dict can be built now.
        result = {'supersedes': supersedes, 'references': references, 'reference_urls': reference_urls, 'doc_num_to_title': doc_num_to_title}
        if not content_markdown:
            return result

        seen_supersedes: set[str] = set()
        seen_references: set[str] = set()
        seen_urls: set[str] = set()

        for link in self._FGK_LINK_RE.finditer(content_markdown):
            url = link.group(2)
            if url not in seen_urls:
                reference_urls.append(url)
                seen_urls.add(url)

        # Bracketed-year numbers are mapped first; announcement numbers second,
        # so the latter win if both produce the same key.
        for pattern in (self._TITLE_WITH_BRACKET_NUM_RE, self._TITLE_WITH_NOTICE_NUM_RE):
            for title_text, doc_num in pattern.findall(content_markdown):
                doc_num_to_title[doc_num.replace(' ', '')] = title_text

        for raw_sentence in self._SENTENCE_SPLIT_RE.split(content_markdown):
            sentence = self._normalize_sentence(raw_sentence)
            if not sentence:
                continue
            if any(k in sentence for k in self._REPEAL_KEYWORDS):
                if self._is_citation_only(sentence):
                    self._add_references(sentence, references, seen_references, seen_supersedes)
                else:
                    self._add_superseded(sentence, supersedes, seen_supersedes)
            elif any(k in sentence for k in self._AMEND_KEYWORDS):
                # Fall back to plain citations only when no explicit
                # amendment target was recognised in the sentence.
                if not self._add_amendments(sentence, supersedes, seen_supersedes):
                    self._add_references(sentence, references, seen_references, seen_supersedes)
            else:
                self._add_references(sentence, references, seen_references, seen_supersedes)

        if any(k in content_markdown for k in self._FULL_REPEAL_MARKERS + self._PARTIAL_REPEAL_MARKERS):
            self._scan_repeal_lists(content_markdown, supersedes, seen_supersedes, reference_urls, seen_urls)
        return result

    @staticmethod
    def _format_announcement(m: re.Match) -> str:
        """Build "<authority>公告<year>年第<n>号" from an announcement-title match."""
        authority = m.group(1).replace(' ', '')
        return f'{authority}公告{m.group(2)}年第{m.group(3)}号'

    def _normalize_sentence(self, sentence: str) -> str:
        """Strip the sentence and collapse "《title》（number）" forms to the number."""
        sentence = sentence.strip()
        if not sentence:
            return sentence
        sentence = self._TITLED_ORDER_RE.sub('\\1', sentence)
        sentence = self._TITLED_NOTICE_RE.sub('\\1', sentence)
        return self._TITLED_ANNOUNCEMENT_RE.sub(self._format_announcement, sentence)

    def _is_citation_only(self, sentence: str) -> bool:
        """Return True when a repeal-keyword sentence merely cites its basis.

        That is the case when the sentence contains "根据" together with a
        self-abolition marker or an announcement marker.
        """
        if '根据' not in sentence:
            return False
        return (any(k in sentence for k in self._SELF_ABOLISH_MARKERS)
                or any(k in sentence for k in self._ANNOUNCE_MARKERS))

    @staticmethod
    def _add_references(sentence: str, references: list, seen_references: set, seen_supersedes: set) -> None:
        """Append every not-yet-seen reference in ``sentence`` to ``references``."""
        for raw_ref in _ANY_REF_RE.findall(sentence):
            ref = _clean(raw_ref)
            if ref in seen_supersedes or ref in seen_references:
                continue
            references.append({'doc_number': ref})
            seen_references.add(ref)

    def _add_superseded(self, sentence: str, supersedes: list, seen_supersedes: set) -> None:
        """Record each reference in a repeal sentence as (partially) superseded.

        A reference is "partially" superseded when a clause designator appears
        in the text after it, looking at most 50 characters ahead and never
        past the start of the next reference.
        """
        matches = list(_ANY_REF_RE.finditer(sentence))
        for idx, m in enumerate(matches):
            ref = _clean(m.group(0))
            if ref in seen_supersedes:
                continue
            end = m.end()
            window_end = end + 50
            if idx + 1 < len(matches):
                window_end = min(matches[idx + 1].start(), window_end)
            after = sentence[end:window_end]
            relation = 'partially_supersedes' if self._CLAUSE_RE.search(after) else 'supersedes'
            supersedes.append({'doc_number': ref, 'relation': relation})
            seen_supersedes.add(ref)

    @staticmethod
    def _add_amendments(sentence: str, supersedes: list, seen_supersedes: set) -> bool:
        """Record explicit amendment targets; return True if any were matched.

        Entries are de-duplicated on the link URL when present, otherwise on
        the matched title.
        """
        targets = list(_AMEND_TARGET_RE.finditer(sentence))
        for match in targets:
            doc_num = match.group('doc')
            url = match.group('url')
            key = url or doc_num
            if key in seen_supersedes:
                continue
            entry = {'doc_number': doc_num, 'relation': 'amends'}
            if url:
                entry['source_url'] = url
            supersedes.append(entry)
            seen_supersedes.add(key)
        return bool(targets)

    def _scan_repeal_lists(self, content_markdown: str, supersedes: list, seen_supersedes: set,
                           reference_urls: list, seen_urls: set) -> None:
        """Pick up numbered list items that follow a repeal heading line.

        A line containing a full-repeal marker switches the current relation
        to ``'supersedes'``; one containing a partial-repeal marker switches it
        to ``'partially_supersedes'``.  Subsequent list items are recorded
        with that relation, and their http links are added to
        ``reference_urls``.
        """
        current_relation = None
        for line in content_markdown.split('\n'):
            line = line.strip()
            if any(marker in line for marker in self._FULL_REPEAL_MARKERS):
                current_relation = 'supersedes'
            elif any(marker in line for marker in self._PARTIAL_REPEAL_MARKERS):
                current_relation = 'partially_supersedes'
            elif current_relation and (m := self._LIST_ITEM_RE.match(line)):
                title_text = m.group(1)
                url = m.group(2)
                doc_num = _clean(m.group(3))
                key = doc_num or title_text
                if key in seen_supersedes:
                    continue
                entry = {'doc_number': doc_num, 'title': title_text.strip('《》〈〉'), 'relation': current_relation}
                has_http_url = bool(url) and url.startswith('http')
                if has_http_url:
                    entry['source_url'] = url
                supersedes.append(entry)
                seen_supersedes.add(key)
                if has_http_url and url not in seen_urls:
                    reference_urls.append(url)
                    seen_urls.add(url)
