from ..tools.logger import Logger
import os
import os.path as osp
import poppler  # pip install python-poppler
import fitz  # pip install PyMuPDF

def pdf2txt(pdf_filename: str, txt_filename: str, with_position: bool = False):
    """Extract the text of a PDF and store it in a UTF-8 text file.

    The per-page texts returned by ``extract_text_from_pdf`` are concatenated
    without a separator. When the result is empty, no file is written and a
    warning is logged instead.

    Args:
        pdf_filename (str): Path of the PDF to read.
        txt_filename (str): Path of the text file to write. Its parent
            directory is created when it does not exist yet.
        with_position (bool, optional): Forwarded unchanged to
            ``extract_text_from_pdf``. Defaults to False.
    """
    content = "".join(extract_text_from_pdf(pdf_filename, with_position))
    if not content:
        Logger.warn(f"no text: {pdf_filename}")
        return

    # A bare file name has no directory part, so there is nothing to create.
    parent_dir = os.path.dirname(txt_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(txt_filename, "w", encoding="utf-8") as out_file:
        out_file.write(content)


def extract_text_from_pdf(pdf_filename: str, with_position: bool = False):
    """Return the text of every page of a PDF as a list of strings.

    Extraction is best effort: any exception is logged as a warning and the
    pages collected up to that point (possibly none) are returned.

    Args:
        pdf_filename (str): Path of the PDF to read.
        with_position (bool, optional): When True the pages are read with
            poppler and their text is returned as is. When False the pages are
            read with PyMuPDF and every ``"-\\n"`` sequence (hyphenated line
            break) is removed. Defaults to False.

    Returns:
        list: One string per successfully processed page.
    """
    pages_text = []
    try:
        if with_position:
            document = poppler.load_from_file(pdf_filename)
            for page_no in range(document.pages):
                pages_text.append(document.create_page(page_no).text())
        else:
            with fitz.open(pdf_filename) as document:
                for pdf_page in document:
                    raw_text = pdf_page.get_text()
                    # Re-join words that were hyphenated across a line break.
                    pages_text.append(raw_text.replace("-\n", ""))
    except Exception as err:
        Logger.warn(err)
    return pages_text


def extract_image_from_pdf(pdf_filename: str, folder: str = None, export_pyside_qpixmap: bool = False):
    """Extract the images embedded in a PDF.

    Args:
        pdf_filename (str): Path of the PDF to read.
        folder (str, optional): Directory that receives one ``NNN.png`` file per
            image, numbered in extraction order. It is created when missing.
            Nothing is written when None. Defaults to None.
        export_pyside_qpixmap (bool, optional): Convert every image to a
            PySide6 ``QPixmap`` and collect it in the returned list.
            Defaults to False.

    Returns:
        list: The ``QPixmap`` objects with a non-zero width when
        ``export_pyside_qpixmap`` is True, otherwise an empty list.
    """
    # An empty string means "current directory": nothing to create then.
    if folder:
        os.makedirs(folder, exist_ok=True)

    cnt = 0
    pix_list = []
    # The context manager closes the document even when an image fails.
    with fitz.Document(pdf_filename) as doc:
        for i in range(len(doc)):
            for img in doc.get_page_images(i):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)

                # CMYK pixmaps cannot be written as PNG; convert them to RGB.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                to_save = pix
                if export_pyside_qpixmap:
                    # Imported lazily so PySide6 is only required when used.
                    from PySide6 import QtGui

                    # QImage below expects 3 colour bytes per pixel (plus an
                    # optional alpha byte), so e.g. grayscale is converted.
                    rgb = pix
                    if rgb.colorspace is not None and rgb.n - rgb.alpha != 3:
                        rgb = fitz.Pixmap(fitz.csRGB, rgb)

                    if rgb.alpha:
                        qt_format = QtGui.QImage.Format_RGBA8888
                    else:
                        qt_format = QtGui.QImage.Format_RGB888
                    image = QtGui.QImage(rgb.samples, rgb.width, rgb.height, rgb.stride, qt_format)
                    # QPixmap.fromImage copies the pixel data, so the result
                    # does not depend on the lifetime of ``rgb.samples``.
                    to_save = QtGui.QPixmap.fromImage(image)
                    if to_save.width() > 0:
                        pix_list.append(to_save)

                if folder is not None:
                    to_save.save(osp.join(folder, f"{cnt:03d}.png"))
                # Counted for every image so file numbers follow extraction order.
                cnt += 1

    return pix_list


def extract_highlights_from_pdf(pdf_filename: str):
    """Extract the highlighted text of a PDF.

    Args:
        pdf_filename (str): Path of the PDF to read.

    Returns:
        dict: Maps a zero-based page index to the list of annotation texts
        found on that page. Pages without annotations are omitted.

    Return example:
    {
        0: ["Human3.6M", "Our method achiev"],
        3: [
            "pretrained using 3D mocap data, has also proved a popular approach for articulated human motion"
        ],
    }
    """
    from pdfannots import process_file  # pip install pdfannots

    annots_dict = dict()
    # Open the file in a context manager so the handle is always released.
    with open(pdf_filename, "rb") as pdf_file:
        document = process_file(pdf_file)
        for page_idx, page in enumerate(document.pages):
            for annot in page.annots:
                text = "".join(annot.text).strip()
                # Remove line breaks, re-joining words hyphenated across lines.
                text = text.replace("-\n", "").replace("\n", " ")
                annots_dict.setdefault(page_idx, []).append(text)
    return annots_dict


def extract_annotations_from_pdf(pdf_filename: str) -> dict:
    """Collect the ``/Contents`` text of the annotations of a PDF.

    Args:
        pdf_filename (str): Path of the PDF to read.

    Returns:
        dict: Maps a zero-based page index to the list of ``/Contents`` values
        of the annotations on that page. Pages that have no annotation with a
        ``/Contents`` entry are omitted.
    """
    import PyPDF2  # pip install PyPDF2

    annotations_dict = dict()
    # Keep the file open while iterating: PyPDF2 resolves objects lazily from
    # the stream. The context manager guarantees the handle is closed after.
    with open(pdf_filename, "rb") as pdf_file:
        # PdfReader replaces the legacy PdfFileReader, which PyPDF2 3.x removed.
        reader = PyPDF2.PdfReader(pdf_file)
        for page_idx, page in enumerate(reader.pages):
            annotations = page.annotations
            if not annotations:
                continue

            results = []
            for annot in annotations:
                annot_obj = annot.get_object()
                if "/Contents" in annot_obj:
                    results.append(annot_obj["/Contents"])
            if results:
                annotations_dict[page_idx] = results
    return annotations_dict
