import time
from typing import List, Dict


# Text markers that precede the citation count (Chinese and English wording).
# Order matters: the Chinese marker is tried first, as in the original code.
_CITED_BY_HEADS = ("被引用次数：", "Cited by")


def _parse_cited_num(text: str) -> int:
    """Return the citation count found in one result chunk, or 0 if none parses.

    Every occurrence of a marker is tried, so a marker that also shows up in
    free text (where it is not followed by a number) no longer aborts parsing.
    """
    for head in _CITED_BY_HEADS:
        for tail in text.split(head)[1:]:
            candidate = tail.split("</a>")[0].strip()
            try:
                return int(candidate)
            except ValueError:
                # Not the "<marker> N</a>" link; try the next occurrence.
                continue
    return 0


def _parse_authors(text: str):
    """Return ``(authors, authors_ids)`` from the author profile links in a chunk.

    Only links of the form ``/citations?user=<id>...&amp;oi=sra">Name</a>`` are
    used; other ``/citations?user=`` links are skipped instead of raising.
    """
    authors, authors_ids = [], []
    for part in text.split("/citations?user=")[1:]:
        pieces = part.split("</a>")[0].split('&amp;oi=sra">')
        if len(pieces) != 2:
            continue
        raw_id, author_name = pieces
        authors.append(author_name)
        # The id is followed by further query parameters; keep only the id.
        authors_ids.append(raw_id.split("&amp;")[0])
    return authors, authors_ids


def _parse_pdf_url(text: str) -> str:
    """Return the first ``<a href="...">`` target after the last "tabindex", or ""."""
    link_parts = text.split("tabindex")[-1].split('<a href="')
    if len(link_parts) < 2:
        return ""
    return link_parts[1].split('"')[0]


def parse_google_scholar_paper(web_text: str) -> List[Dict]:
    """Parse paper information from Google Scholar web source code.

    The page source is cut at every ``data-cid`` marker and each chunk is
    treated as one candidate result. A chunk is used only when it contains
    exactly two ``data-clk-atid`` markers; the text after the second one is
    read as the title anchor. Results are de-duplicated by title, keeping the
    first occurrence.

    Args:
        web_text: Raw HTML source of a Google Scholar result page.

    Returns:
        A list of dicts with the keys ``title``, ``paper_id``, ``cited_num``,
        ``pdf_url``, ``authors``, ``authors_ids`` and ``search_time`` (Unix
        timestamp, in seconds, taken when the entry was parsed). Fields that
        cannot be found default to ``""``, ``0`` or ``[]``; a chunk whose title
        cannot be located is skipped.
    """
    paper_list = []
    title_set = set()
    for text in web_text.split("data-cid"):
        segs = text.split("data-clk-atid")
        if len(segs) != 3:
            continue

        # The title is the anchor text between the first '">' and "</a>".
        title_parts = segs[-1].split("</a>")[0].split('">')
        if len(title_parts) < 2:
            continue
        # NOTE(review): replacing the highlight tags with a space can leave
        # doubled spaces inside the title; kept so output stays unchanged.
        title = title_parts[1].replace("<b>", " ").replace("</b>", " ").strip()

        if title in title_set:
            continue
        title_set.add(title)

        pdf_url = _parse_pdf_url(text)
        cited_num = _parse_cited_num(text)
        authors, authors_ids = _parse_authors(text)

        # Abstract is intentionally not extracted. The earlier attempt below is
        # noted as only supporting use_chinese=true:
        # abstract = text.split('gs_rs">')[1].split(" …</div><div")[0].replace("<br>", "") + " ..."

        # The id in the "cites" link is used as the paper id.
        head = '<a href="/scholar?cites='
        paper_id = ""
        if head in text:
            paper_id = text.split(head)[1].split("&amp;")[0]

        paper_list.append(
            {
                "title": title,
                "paper_id": paper_id,
                "cited_num": cited_num,
                "pdf_url": pdf_url,
                "authors": authors,
                "authors_ids": authors_ids,
                "search_time": int(time.time()),
            }
        )

    return paper_list
