from keras_preprocessing.text import Tokenizer

from KoozDawa.dawa.loader import load_kawa


def get_sequence_of_tokens(corpus, tokenizer=None):
    """Fit the tokenizer on the corpus and build n-gram token sequences from it."""
    # TODO Tokenize while keeping accents
    if tokenizer is None:  # created here instead of as a default arg, so calls don't share one instance
        tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1  # +1 because Keras reserves index 0 for padding

    # Build the training data: every prefix of length >= 2 of each tokenized line
    # becomes one n-gram input sequence.
    input_sequences = []

    # FIXME Debug: truncate corpus
    corpus = corpus[:50]
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)

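    # Debug: decode the n-gram sequences back to words as a quick sanity check.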
    texts = tokenizer.sequences_to_texts(input_sequences)
    print("Tokenized:", texts)

    return input_sequences, total_words
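

# A minimal sketch of the usual next step, assuming keras_preprocessing.sequence.pad_sequences
# is available: left-pad the ragged n-gram sequences to a common length, then split each padded
# row into predictors (all tokens but the last) and a label (the last token). The helper name
# below is illustrative, not part of the original pipeline.
def pad_input_sequences(input_sequences):
    from keras_preprocessing.sequence import pad_sequences  # local import: optional helper only

    max_sequence_len = max(len(seq) for seq in input_sequences)
    # Pad on the left ('pre') so the label always sits in the last column.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    predictors, labels = padded[:, :-1], padded[:, -1]
    return predictors, labels, max_sequence_len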


if __name__ == '__main__':
    kawa = load_kawa("../")
    seqs, words = get_sequence_of_tokens(kawa)
    print("%i words." % words)