#!/usr/bin/env python3

from os.path import dirname
from tqdm import tqdm
import zd
import re
from opencc import OpenCC
from fire import Fire
import bz2
from gensim.corpora.wikicorpus import extract_pages,filter_wiki

# Shared OpenCC converter built from the 't2s.json' configuration (OpenCC's
# Traditional-to-Simplified Chinese profile); wiki_replace uses it on both the
# page title and the cleaned page text.
opencc = OpenCC('t2s.json')

# Wiki table markup: any run of leading colons, then everything from "{|" up
# to the nearest "|}" (non-greedy, may span several lines).
RE_S = re.compile(r':*{\|[\s\S]*?\|}')

# A bare <gallery>...</gallery> element (opening tag without attributes),
# including everything between the tags.
RE_GALLERY = re.compile(r'<gallery>[\s\S]*?</gallery>')

# A single-line, non-nested {{...}} template that contains at least one "|".
# Group 1 is the one (non-newline) character right before "{{"; group 2 is the
# template body between the braces.
RE_BRACE = re.compile(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}')


def wiki_replace(d):
  """Clean one wiki page and convert its title and text with ``opencc``.

  ``d`` is an indexable record where ``d[0]`` is the page title and ``d[1]``
  is the raw wiki markup (this is how ``main`` passes the items it iterates).

  Returns a ``(title, text)`` tuple; both values have been passed through
  ``opencc.convert`` and stripped of surrounding whitespace.
  """
  body = d[1]

  # Pre-clean markup before handing the text to gensim's filter_wiki: drop
  # tables and galleries, and rewrite {{a|b}} templates as [[a|b]] links
  # (keeping the character that preceded the template).
  for pattern, replacement in (
    (RE_S, ''),
    (RE_GALLERY, ''),
    (RE_BRACE, '\\1[[\\2]]'),
  ):
    body = pattern.sub(replacement, body)

  body = filter_wiki(body)

  # Post-clean the filtered text; the order of these passes matters.
  for pattern, replacement in (
    # Empty bullet items ("*" + optional spaces + newline) and runs of two or
    # more apostrophes (bold/italic quote markup).
    (r'\* *\n|\'{2,}', ''),
    # Collapse consecutive newlines into one.
    ('\n+', '\n'),
    # Strip one leading ":" or ";", or leading spaces, at the start of a line.
    ('\n[:;]|\n +', '\n'),
    # Put an empty line before every line that starts with "==".
    ('\n==', '\n\n=='),
  ):
    body = re.sub(pattern, replacement, body)

  title = opencc.convert(d[0]).strip()
  return title, opencc.convert(body).strip()


# Matches a title that starts with one or more ASCII letters followed by a
# colon (a "Word:" style prefix); main skips pages whose title matches.
RE_EN = re.compile('^[a-zA-Z]+:')


def main(xml_bz2_filepath=None):
  """从维基百科压缩包抽取中文语料

首先，下载维基百科压缩包（ https://dumps.wikimedia.org/zhwiki ）

下载后，用法如下:

wiki_txt /share/wiki/zhwiki-20200701-pages-articles.xml.bz2"""
  # NOTE: the docstring above is printed verbatim as the usage message (see
  # below), so it is runtime output and is intentionally kept unchanged.
  #
  # Purpose: extract article text from a bz2-compressed wiki XML dump.
  #
  # xml_bz2_filepath: path to the dump; when None, the usage text is printed
  #   and the function returns without doing any work.
  #
  # Outputs (both opened through zd.open), named after the input path with its
  # trailing "xml.bz2" replaced:
  #   <base>txt.zd        one record per page: "➜ <title>", the cleaned text,
  #                       then three newlines
  #   <base>title.txt.zd  one converted title per line
  if xml_bz2_filepath is None:
    print(main.__doc__)
    return
  print(
    """从维基百科压缩包抽取中文语料
有问题请到 https://gitee.com/znlp/zword/issues 发帖
联系邮箱：张沈鹏<zsp042@gmail.com>
"""
  )

  # Only strip the suffix when it is really there; a blind [:-7] slice would
  # chop arbitrary characters off a differently named file.
  suffix = "xml.bz2"
  if xml_bz2_filepath.endswith(suffix):
    outpath = xml_bz2_filepath[:-len(suffix)]
  else:
    outpath = xml_bz2_filepath + "."
  outfile = outpath + "txt.zd"
  outtitle = outpath + "title.txt.zd"

  # "➜" marks the start of each record in the text output, so any occurrence
  # inside an article body is replaced with the look-alike "➔".
  prefix = "➜"

  # Open the input first (so a missing dump fails before any output file is
  # created) and inside a `with` so the bz2 handle is always closed.
  with bz2.open(xml_bz2_filepath) as src:
    with zd.open(outtitle, 'w') as ft, zd.open(outfile, 'w') as f:
      w = tqdm(extract_pages(src), desc='已获取0篇文章')
      i = 0
      for d in w:
        raw_title, raw_text = d[0], d[1]
        # Skip pages with an empty title, titles with an ASCII "Word:" prefix,
        # and pages whose text starts with "#" (presumably redirects).
        # The emptiness check runs first so a missing title never reaches the
        # regex.
        if (
          not raw_title
          or RE_EN.match(raw_title)
          or raw_text.startswith("#")
        ):
          continue
        title, txt = wiki_replace(d)
        txt = txt.replace(prefix, "➔")
        ft.write(title + "\n")
        f.write(
          "".join((
            prefix,
            " ",
            title,
            "\n",
            txt,
            '\n\n\n'
          ))
        )
        i += 1
        # Refresh the progress-bar label only every 100 articles.
        if i % 100 == 0:
          w.set_description('已获取%s篇文章' % i)


# Expose main as a command-line interface via python-fire. This runs at module
# level, i.e. whenever the file is executed or imported.
# NOTE(review): there is no `if __name__ == "__main__"` guard — confirm whether
# an installed entry point relies on this import-time call before adding one.
Fire(main)
