From d06fd636229de206632747be095a1ba9ff34360b Mon Sep 17 00:00:00 2001
From: Paul-Louis NECH
Date: Tue, 19 Nov 2019 14:26:03 +0100
Subject: [PATCH] feat(tokens): as class, with punctuation/case/oov_token

---
 KoozDawa/dawa/tokens.py | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/KoozDawa/dawa/tokens.py b/KoozDawa/dawa/tokens.py
index 576bf3f..bd275d2 100644
--- a/KoozDawa/dawa/tokens.py
+++ b/KoozDawa/dawa/tokens.py
@@ -3,29 +3,35 @@ from keras_preprocessing.text import Tokenizer
 from KoozDawa.dawa.loader import load_kawa
 
 
-def get_sequence_of_tokens(corpus, tokenizer):
-    # TODO Tokenize while keeping apostrophes like j'aime
-    tokenizer.fit_on_texts(corpus)
-    total_words = len(tokenizer.word_index) + 1
+class PoemTokenizer(Tokenizer):
+    def __init__(self, **kwargs) -> None:
+        super().__init__(lower=False, filters='"#$%&()*+,-/<=>@[\\]^_`{|}~\t\n', oov_token="😢", **kwargs)
 
-    # convert data to sequence of tokens
-    input_sequences = []
+    def get_sequence_of_tokens(self, corpus):
+        self.fit_on_texts(corpus)
+        total_words = len(self.word_index) + 1
 
-    for line in corpus:
-        token_list = tokenizer.texts_to_sequences([line])[0]
-        for i in range(1, len(token_list)):
-            n_gram_sequence = token_list[:i + 1]
-            input_sequences.append(n_gram_sequence)
+        # convert data to sequence of tokens
+        input_sequences = []
 
-    texts = tokenizer.sequences_to_texts(input_sequences)
-    print("Tokenized:", texts[:5])
+        for line in corpus:
+            token_list = self.texts_to_sequences([line])[0]
+            for i in range(1, len(token_list)):
+                n_gram_sequence = token_list[:i + 1]
+                input_sequences.append(n_gram_sequence)
 
-    return input_sequences, total_words
+        texts = self.sequences_to_texts(input_sequences)
+        print("Tokenized:", texts[:5])
+
+        return input_sequences, total_words
+
+    def get_text(self, sequence):
+        return self.sequences_to_texts(sequence)
 
 
 if __name__ == '__main__':
     kawa = load_kawa("../")
-    tokenizer = Tokenizer()
-    seqs, words = get_sequence_of_tokens(kawa, tokenizer)
-    texts = tokenizer.sequences_to_texts(seqs)
+    tokenizer = PoemTokenizer()
+    seqs, words = tokenizer.get_sequence_of_tokens(kawa)
+    texts = tokenizer.get_text(seqs)
     print("%i words." % words)
-- 
libgit2 0.27.0