# -*- coding: utf-8 -*-
import json
import logging
import os
import re
from typing import List, Dict, Optional

import html2text
import markdown
from bs4 import BeautifulSoup
from langchain_text_splitters import MarkdownHeaderTextSplitter

class BaseMaterial:
    """Container for a material's raw Markdown and the fields derived from it."""

    # Raw Markdown source of the material.
    md_content: Optional[str] = None
    # Material kind; "offline_doc", "offline_sheet" and "offline_pdf" trigger
    # the offline pre-cleaning step in MaterialInterpretation.
    type: Optional[str] = None
    # Board image links; each entry is a dict with the keys "chunk_heading"
    # and "url", as produced by get_img_url.
    img_url_list: Optional[List[Dict]] = None
    # Tables found in the document, each rendered as a Markdown table string.
    table_list: Optional[List[str]] = None
    net_content: Optional[str] = None  # plain text with tables/images replaced by placeholders
    table_content: Optional[str] = None  # heading lines plus the Markdown tables

def md2html(markdown_text):
    """Render Markdown text to HTML with the ``tables`` extension enabled."""
    enabled_extensions = ['tables']
    rendered_html = markdown.markdown(markdown_text, extensions=enabled_extensions)
    return rendered_html

def table2md(table):
    """
    Convert an HTML table element to a Markdown table.

    The first row is treated as the header and a ``---`` separator row is
    inserted right after it.

    :param table: element exposing ``find_all`` (e.g. a BeautifulSoup
        ``<table>`` tag) whose cells expose ``get_text``.
    :return: the Markdown table as a string; ``''`` when the table has no rows.
    """

    def cell_text(cell):
        # Collapse whitespace (including newlines) and escape pipes so that a
        # cell can never spill onto another line or into another column.
        return ' '.join(cell.get_text().split()).replace('|', '\\|')

    rows = [row.find_all(['th', 'td']) for row in table.find_all('tr')]
    if not rows:
        return ''

    lines = [
        '| ' + ' | '.join(cell_text(cell) for cell in cells) + ' |'
        for cells in rows
    ]
    # Size the separator from the real header cell count instead of
    # re-splitting the rendered row, which miscounts when a cell contains '|'.
    column_count = max(len(rows[0]), 1)
    lines.insert(1, '| ' + ' | '.join(['---'] * column_count) + ' |')
    return '\n'.join(lines)



def split_markdown_by_heading(markdown_text: str, max_header_level: int = 3):
    """
    Split a Markdown document into chunks at its headings.

    :param markdown_text: Markdown source text.
    :param max_header_level: deepest heading level to split on; the default
        of 3 splits on ``#``, ``##`` and ``###``.
    :return: list of dicts, one per chunk, with the splitter's metadata under
        ``'heading'`` and the chunk text under ``'content'``. Heading lines
        are kept in the content (``strip_headers=False``).
    """
    headers_to_split_on = [
        ('#' * level, f"header_{level}")
        for level in range(1, max_header_level + 1)
    ]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
    return [
        {'heading': chunk.metadata, 'content': chunk.page_content}
        for chunk in splitter.split_text(markdown_text)
    ]

def get_img_url(text):
    """
    Collect board image links from Markdown text, grouped by heading chunk.

    Only images written as ``![board](http://...)`` or ``![board](https://...)``
    are matched. The text is first split with ``split_markdown_by_heading``.

    :param text: Markdown source text.
    :return: list of dicts ``{"chunk_heading": <chunk heading>, "url": <link>}``
        in document order; empty when nothing matches.
    """
    board_link_pattern = r'!\[board\]\((http[s]?://[^)]+)\)'
    return [
        {"chunk_heading": section['heading'], "url": link}
        for section in split_markdown_by_heading(text)
        for link in re.findall(board_link_pattern, section['content'])
    ]

def html2md(html_content):
    """
    Convert an HTML string to Markdown with html2text.

    Links, images and emphasis are kept, and ``body_width`` is set to 0.

    :param html_content: HTML content (string)
    :return: Markdown content (string)
    """
    converter = html2text.HTML2Text()
    options = {
        "ignore_links": False,     # keep links
        "ignore_images": False,    # keep images
        "ignore_emphasis": False,  # keep bold / italic markers
        "body_width": 0,           # 0 = no limit on output width
    }
    for option, value in options.items():
        setattr(converter, option, value)
    return converter.handle(html_content)

class MaterialInterpretation:
    """
    Material parsing helpers.

    Turns a material's Markdown into placeholder-marked plain text, a list of
    Markdown tables, and a "headings + tables" digest.
    """

    @classmethod
    def material_interpretation(cls, material: "BaseMaterial"):
        """
        Parse ``material.md_content`` and fill the derived fields in place.

        Sets ``img_url_list``, ``net_content``, ``table_list`` and
        ``table_content`` on ``material``. For offline material types,
        ``md_content`` is first replaced by its cleaned version.

        :param material: object carrying ``md_content`` and ``type``.
        :return: tuple ``(material, html_text)`` where ``html_text`` is the
            HTML rendered from the Markdown after image placeholders were
            substituted (tables still present).
        """
        table_holder = "【此处为表格】"
        img_holder = "【此处为画板】"

        # Offline documents get a de-duplication pass first.
        if material.type in ("offline_doc", "offline_sheet", "offline_pdf"):
            logging.info("离线文档预清洗...")
            material.md_content = cls.offline_text_clean(material.md_content)

        # Collect board links before the images are replaced by placeholders.
        material.img_url_list = get_img_url(material.md_content)

        # Replace remote images with the placeholder. Both http and https are
        # accepted, matching what get_img_url extracts, and the alt text may
        # not cross a ']' so one match never swallows text between two images.
        image_pattern = r'!\[[^\]\n]*\]\(https?://[^\)]+\)'
        text = re.sub(image_pattern, img_holder, material.md_content)

        # Turn literal backslash-n sequences into real line breaks so the
        # Markdown parser sees proper lines.
        text = text.replace("\\n", "\n")
        html_text = markdown.markdown(text, extensions=['tables'])
        soup = BeautifulSoup(html_text, 'html.parser')
        tables = soup.find_all('table')

        # Swap each table for a placeholder paragraph in the document tree.
        for table in tables:
            table.replace_with(BeautifulSoup(f'<p>{table_holder}</p>', 'html.parser'))

        # Plain text with placeholders standing in for tables and images.
        material.net_content = html2md(str(soup))
        material.table_list = [table2md(table) for table in tables]

        # Digest: heading lines plus each table at the position of its
        # placeholder. An iterator is used so that a placeholder-looking line
        # with no table left (e.g. literal text in the source) is skipped
        # instead of raising IndexError.
        pending_tables = iter(material.table_list)
        table_content = []
        for line in material.net_content.split('\n'):
            if line.startswith('#'):
                table_content.append(line)
            elif line.strip() == table_holder:
                next_table = next(pending_tables, None)
                if next_table is not None:
                    table_content.append(next_table)
        material.table_content = '\n'.join(table_content)

        return material, html_text

    @classmethod
    def offline_text_clean(cls, text):
        """
        Clean offline-document text by de-duplicating heading blocks.

        1. A line starting with ``# `` or ``## `` opens a block that runs
           until the next such line. Lines before the first heading belong to
           no block and are dropped.
        2. Blocks whose heading line was already seen are discarded (the
           first occurrence wins).
        3. If more than one block remains, every line of the last block that
           also occurs in the first block is removed.

        :param text: raw Markdown text.
        :return: the remaining blocks joined with newlines.
        """
        blocks = []  # (heading line, block text including the heading line)
        current_title = None
        current_lines = []

        # Step 1: cut the text into heading blocks.
        for line in text.splitlines():
            if line.startswith(("# ", "## ")):
                if current_title:
                    blocks.append((current_title, "\n".join(current_lines)))
                current_title = line
                current_lines = [line]
            else:
                current_lines.append(line)
        if current_title:
            blocks.append((current_title, "\n".join(current_lines)))

        # Step 2: keep only the first block for each heading, in order.
        unique_blocks = {}
        for title, body in blocks:
            unique_blocks.setdefault(title, body)
        bodies = list(unique_blocks.values())

        # Step 3: strip from the last block the lines repeated from the first.
        if len(bodies) > 1:
            first_lines = set(bodies[0].splitlines())
            bodies[-1] = "\n".join(
                line for line in bodies[-1].splitlines()
                if line not in first_lines
            )

        return "\n".join(bodies)


# Script entry point: currently does nothing when the module is run directly.
if __name__ == "__main__":
    pass


