From cce04fee584071ec9dc64d2f7cd21255e98a4ec0 Mon Sep 17 00:00:00 2001
From: Paul-Louis NECH
Date: Mon, 18 Nov 2019 21:19:55 +0100
Subject: [PATCH] chore(tokens): Better main

---
 KoozDawa/dawa/tokens.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/KoozDawa/dawa/tokens.py b/KoozDawa/dawa/tokens.py
index 3acfb5a..576bf3f 100644
--- a/KoozDawa/dawa/tokens.py
+++ b/KoozDawa/dawa/tokens.py
@@ -3,8 +3,8 @@ from keras_preprocessing.text import Tokenizer
 
 from KoozDawa.dawa.loader import load_kawa
 
 
-def get_sequence_of_tokens(corpus, tokenizer=Tokenizer()):
-    # TODO Tokenize while keeping accents
+def get_sequence_of_tokens(corpus, tokenizer):
+    # TODO Tokenize while keeping apostrophes like j'aime
     tokenizer.fit_on_texts(corpus)
     total_words = len(tokenizer.word_index) + 1
@@ -18,12 +18,14 @@ def get_sequence_of_tokens(corpus, tokenizer=Tokenizer()):
         input_sequences.append(n_gram_sequence)
 
     texts = tokenizer.sequences_to_texts(input_sequences)
-    print("Tokenized:", texts)
+    print("Tokenized:", texts[:5])
 
     return input_sequences, total_words
 
 
 if __name__ == '__main__':
     kawa = load_kawa("../")
-    seqs, words = get_sequence_of_tokens(kawa)
+    tokenizer = Tokenizer()
+    seqs, words = get_sequence_of_tokens(kawa, tokenizer)
+    texts = tokenizer.sequences_to_texts(seqs)
     print("%i words." % words)
-- 
libgit2 0.27.0