from keras_preprocessing.text import Tokenizer

from KoozDawa.dawa.loader import load_kawa


def get_sequence_of_tokens(corpus, tokenizer=None):  # TODO: Tokenize while keeping accents
    """Build n-gram prefix sequences of token ids from the corpus.

    Each tokenized line [t1, t2, ..., tn] yields the prefixes
    [t1, t2], [t1, t2, t3], ..., [t1, ..., tn].

    Returns the list of sequences and the vocabulary size (len(word_index) + 1).
    """
    if tokenizer is None:  # Avoid sharing a mutable default Tokenizer across calls
        tokenizer = Tokenizer()

    # Fit the tokenizer on the full corpus to build the word index
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # Convert the data into n-gram sequences of tokens
    input_sequences = []

    # FIXME Debug: truncate corpus
    corpus = corpus[:50]

    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)

    texts = tokenizer.sequences_to_texts(input_sequences)
    print("Tokenized:", texts)

    return input_sequences, total_words


if __name__ == '__main__':
    kawa = load_kawa("../")
    seqs, words = get_sequence_of_tokens(kawa)
    print("%i words." % words)