#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET
import sys
import re
import mmap

# Keyword that closes the template header; interpolated verbatim into the
# overview regular expression compiled inside parse().
STRING_WIKI_OVERVIEW = 'Übersicht'
# Language name expected at the start of the template header. The value is
# inserted into a regular expression unescaped, so a pattern such as [^ {]+
# could be substituted (presumably to accept any language -- confirm first).
STRING_WIKI_LANGUAGE = 'Deutsch'

def parse(filename, sanitizer, callback):
    """Scan an XML dump for overview templates and report each one found.

    The file is memory-mapped and searched for ``<page>...</page>`` spans one
    at a time, so only a single page is decoded at once. For every page whose
    ``<text>`` element contains a template of the form
    ``{{<language> <class> <overview keyword> ...}}`` the table entries
    (``key=value`` following a newline and ``|``) are collected and
    ``callback`` is invoked.

    Args:
        filename: Path of the XML file; its pages are decoded as UTF-8.
        sanitizer: Callable mapping a raw table key to its cleaned form. A
            falsy result drops the entry. It is called once per raw key.
        callback: Called as ``callback(language, class_, table, title)``
            where ``table`` maps sanitized keys to raw values (later
            duplicates overwrite earlier ones) and ``title`` is the content
            of the page's ``<title>`` element.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a page is not valid UTF-8.
    """
    RE_XML_PAGE = re.compile(b'<page>.*?</page>', re.DOTALL)
    RE_XML_TEXT = re.compile(r'<text[^>]*>.*</text>', re.DOTALL)
    RE_XML_TITLE = re.compile(r'<title>(.*)</title>', re.DOTALL)
    RE_WIKI_OVERVIEW = re.compile(r'{{(?P<language>%s) (?P<class>[^ ]+) %s(?P<table>.*?)}}'%(STRING_WIKI_LANGUAGE, STRING_WIKI_OVERVIEW), re.DOTALL)
    # Raw string: '\|' in a normal string literal is an invalid escape sequence.
    RE_WIKI_OVERVIEW_KEY_VALUE = re.compile(r'(?<=\n\|)([^=]+)=(.+)')
    with open(filename, 'rb') as file_input:
        # An empty file cannot be memory-mapped (mmap raises ValueError) and
        # holds no pages anyway. seek(0, 2) moves to the end and returns the
        # file size; the mapping below always starts at offset 0 regardless.
        if file_input.seek(0, 2) == 0:
            return
        # https://stackoverflow.com/questions/519633/lazy-method-for-reading-big-file-in-python
        # https://stackoverflow.com/questions/1661986/why-doesnt-pythons-mmap-work-with-large-files
        with mmap.mmap(file_input.fileno(), 0, access=mmap.ACCESS_READ) as mmap_input:
            # https://stackoverflow.com/questions/3862010/is-there-a-generator-version-of-string-split-in-python
            # https://docs.python.org/3.7/library/re.html#re.DOTALL
            for page_match in RE_XML_PAGE.finditer(mmap_input):
                page = page_match.group(0).decode('utf-8')
                text = RE_XML_TEXT.search(page)
                if not text:
                    continue
                overview = RE_WIKI_OVERVIEW.search(text.group(0))
                if not overview:
                    continue
                _table = {}
                for raw_key, value in RE_WIKI_OVERVIEW_KEY_VALUE.findall(overview.group('table')):
                    key = sanitizer(raw_key)
                    if key:
                        _table[key] = value
                _title = RE_XML_TITLE.search(page).group(1)
                callback(overview.group('language'), overview.group('class'), _table, _title)

def sanitizer(key):
    """Clean a raw overview-table key.

    Returns None for keys that begin with 'Bild'. Any other key is returned
    with spaces, asterisks and digits removed from both ends (which may
    leave an empty string).
    """
    rejected = key.startswith('Bild')
    return None if rejected else key.strip(' *1234567890')

def statistics():
    """Pretty-print how often each table key occurs, grouped by word class.

    Runs parse() over the file named by sys.argv[1] and prints a nested
    mapping ``class -> key -> count``, where count is the number of parsed
    tables that contained the key.
    """
    import pprint
    from collections import defaultdict

    # https://stackoverflow.com/questions/5029934/python-defaultdict-of-defaultdict
    occurrences = defaultdict(lambda: defaultdict(int))

    def count_keys(_language, _class, _table, _title):
        # Index by class inside the loop so a class whose tables are all
        # empty never gets an entry.
        for key in _table:
            occurrences[_class][key] += 1

    parse(sys.argv[1], sanitizer, count_keys)
    pprint.pprint(occurrences)


# One output text file per word list, keyed by the list name ('<name>.txt' in
# the current directory, truncated on start). The encoding is fixed to UTF-8
# because the input is decoded as UTF-8; relying on the locale default could
# mis-encode or reject non-ASCII words.
files = {k: open(k + '.txt', 'w', encoding='utf-8') for k in ['Verb', 'Nominativ Plural', 'Dativ Plural', 'Adjektiv']}
# Word classes that interpret() skips entirely.
ignore = ['Adverb', 'Eigenname', 'Nachname', 'Name', 'Pronomen', 'Toponym', 'Vorname', 'adjektivisch']
def writemline(f, line):
    """Write ``line`` followed by a newline to ``f`` if it starts with 'm' or 'M'.

    Args:
        f: Writable text stream (any object with a ``write`` method).
        line: Candidate string. Anything not starting with 'm'/'M',
            including the empty string, is silently skipped.
    """
    # startswith() is safe on an empty string, whereas line[0] would raise
    # IndexError.
    if line.startswith(('m', 'M')):
        f.write(line + '\n')
# Hard-coded page titles that are never written to any output file.
_EXCLUDED_TITLES = frozenset(['Mussel', 'Loschen', 'Chochemer', 'DDR-Wortschatz', 'Goldsuche', 'Jausem', 'Illuminatenorden', 'Staatskunst', 'Spalt breit', 'Lesedauer', 'Erwachsensein', 'eben', 'musikalisch', 'verwandt', 'passé', 'umfangreich', 'furchtbar', 'berühmt', 'festlich', 'lebend', 'fruchtbar', 'gleichlautend'])


def interpret(_language, _class, _table, _title):
    """Route one parsed overview table to the matching output file.

    Intended as the ``callback`` for parse(). Entries are skipped when the
    title contains ':' (presumably namespaced pages -- confirm), is in the
    exclusion list, the table is empty, or the class is listed in ``ignore``.
    Otherwise:

    * 'Verb': the title is written unless it starts with 'mit' (the single
      exception being 'mitteln').
    * 'Substantiv': the 'Nominativ Plural' and 'Dativ Plural' values are
      written to their respective files.
    * 'Adjektiv': the 'Positiv' value is written.

    Forms missing from the table are skipped instead of raising KeyError, so
    one incomplete entry does not abort the whole run. All writes go through
    writemline().

    Args:
        _language: Language captured from the template header (unused).
        _class: Word class captured from the template header.
        _table: Dict of sanitized table keys to their values.
        _title: Page title.
    """
    if ':' in _title or _title in _EXCLUDED_TITLES:
        return
    if not _table or _class in ignore:
        return
    if _class == 'Verb':
        if not _title.startswith('mit') or _title == 'mitteln':
            writemline(files['Verb'], _title)
    elif _class == 'Substantiv':
        for form in ('Nominativ Plural', 'Dativ Plural'):
            value = _table.get(form)
            if value:
                writemline(files[form], value)
    elif _class == 'Adjektiv':
        value = _table.get('Positiv')
        if value:
            writemline(files['Adjektiv'], value)
# Script entry: process the dump named on the command line, then close the
# output files so buffered lines are flushed even if parsing fails.
try:
    if len(sys.argv) < 2:
        # Exit with a readable message instead of a bare IndexError.
        sys.exit('usage: %s <dump.xml>' % sys.argv[0])
    parse(sys.argv[1], sanitizer, interpret)
finally:
    for _output in files.values():
        _output.close()
