Location | Tag | Media  ||  A | P

wikipedia

카테고리 없음 2019. 6. 26. 00:50 |
import glob
import json
import os
import re
import sys

from tqdm import tqdm


def main(argv):
    """Strip category links from extracted wiki dumps and record them per article.

    Every file matching ``wiki_*`` anywhere below the source directory is read
    as one JSON object per line. For each object, anchors whose link text is
    ``*`` or starts with ``분류:`` are removed from ``text``, doubled newlines
    are collapsed once, and the collected names are stored under a new
    ``categories`` key. A ``*`` anchor contributes the article's own ``title``.

    The converted objects of one input file are written as a single JSON
    array to the same relative path below the destination directory.

    Args:
        argv: Sequence whose first item is the source directory and whose
            second item is the destination directory.

    Raises:
        IndexError: If fewer than two items are supplied.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If an object lacks the ``text`` or ``title`` key.
    """
    src_path = argv[0]
    dst_path = argv[1]

    # Loop-invariant, so compile once instead of once per input line.
    # Group 1 is the href; group 2 is the name after "분류:" and is empty
    # when the anchor text is the bare "*" alternative.
    pattern = re.compile('<a href=\\"([^>]+)\\">(?:\\*|분류:([^<]+))<\\/a>')

    # A concrete list (not an iterator) lets tqdm know the total up front.
    file_list = glob.glob('%s/**/wiki_*' % src_path, recursive=True)
    for filename in tqdm(file_list):
        relpath = os.path.relpath(filename, src_path)
        output = []

        # The context manager closes the input even if parsing fails.
        with open(filename, 'r', encoding='utf-8') as source:
            for line in source:
                if not line.strip():
                    # Tolerate blank/trailing empty lines instead of crashing.
                    continue
                data = json.loads(line)

                found = pattern.findall(data['text'])
                stripped = pattern.sub('', data['text'])
                data['text'] = stripped.replace('\n\n', '\n')
                # An empty captured name means the "*" form: use the title.
                data['categories'] = [name or data['title'] for _, name in found]
                output.append(data)

        output_path = os.path.join(dst_path, relpath)
        output_dir = os.path.dirname(output_path)
        if output_dir:
            # os.makedirs('') raises, so skip it for a bare file name.
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as converted:
            json.dump(output, converted, ensure_ascii=False)


if __name__ == "__main__":
    # Drop the script name; main() expects [source_dir, destination_dir].
    cli_args = sys.argv[1:]
    main(cli_args)

 

python D:\tech\wikiextractor\WikiExtractor.py kowiki-20170320-pages-articles-multistream.xml -l -ns 분류 --json -o output

 

 

 

WikiExtractor.py
0.11MB

Posted by Bestend
: