From 14c73a0529eeafbbef2ff99604ec96b787330c43 Mon Sep 17 00:00:00 2001
From: Paul-Louis NECH
Date: Sat, 23 Nov 2019 15:38:54 +0100
Subject: [PATCH] refactor: Extract glossolalia from KoozDawa

---
 KoozDawa/dawa.py        |  66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 KoozDawa/dawa/loader.py |  62 --------------------------------------------------------------
 KoozDawa/dawa/lstm.py   | 113 -----------------------------------------------------------------------------------------------------------------
 KoozDawa/dawa/lyrics.py |  17 -----------------
 KoozDawa/dawa/tokens.py |  39 ---------------------------------------
 KoozDawa/lyrics.py      |  18 ++++++++++++++++++
 KoozDawa/tweeper.py     |   2 +-
 glossolalia/loader.py   |  62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 glossolalia/lstm.py     |  58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 glossolalia/tokens.py   |  39 +++++++++++++++++++++++++++++++++++++++
 10 files changed, 244 insertions(+), 232 deletions(-)
 create mode 100644 KoozDawa/dawa.py
 delete mode 100644 KoozDawa/dawa/loader.py
 delete mode 100644 KoozDawa/dawa/lstm.py
 delete mode 100644 KoozDawa/dawa/lyrics.py
 delete mode 100644 KoozDawa/dawa/tokens.py
 create mode 100644 KoozDawa/lyrics.py
 create mode 100644 glossolalia/loader.py
 create mode 100644 glossolalia/lstm.py
 create mode 100644 glossolalia/tokens.py

diff --git a/KoozDawa/dawa.py b/KoozDawa/dawa.py
new file mode 100644
index 0000000..dca40a6
--- /dev/null
+++ b/KoozDawa/dawa.py
@@ -0,0 +1,66 @@
+from keras.callbacks import ModelCheckpoint, EarlyStopping
+
+from glossolalia.loader import load_kawa, clean_text, load_seeds
+from glossolalia.lstm import generate_padded_sequences, create_model, generate_text
+from glossolalia.tokens import PoemTokenizer
+
+
+def main():
+    # should_train = True
+    # model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch
+    nb_words = 20
+    nb_epoch = 100
+    nb_layers = 128
+    dropout = .2
+    tokenizer = PoemTokenizer()
+
+    # if should_train:
+    lines = load_kawa()
+
+    corpus = [clean_text(x) for x in lines]
+    print("Corpus:", corpus[:10])
+
+    inp_sequences, total_words = tokenizer.get_sequence_of_tokens(corpus)
+    predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
+    model = create_model(max_sequence_len, total_words, layers=nb_layers, dropout=dropout)
+    model.summary()
+
+    file_path = "../models/dawa/dawa_lstm%i-d%.1f-{epoch:02d}_%i-{accuracy:.4f}.hdf5" % (nb_layers, dropout, nb_epoch)
+    checkpoint = ModelCheckpoint(file_path, monitor='accuracy', period=10, save_best_only=True)
+    # print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
+    early_stopping = EarlyStopping(monitor='accuracy', patience=5)
+    callbacks_list = [checkpoint, early_stopping]
+
+    for i in range(0, nb_epoch, 10):
+        model.fit(predictors, label, initial_epoch=i, epochs=min(i + 10, nb_epoch), verbose=2, callbacks=callbacks_list)
+        print(generate_text(model, tokenizer, "", nb_words, max_sequence_len))
+
+    # model.save(model_file)
+    # else:  # FIXME: Load and predict, maybe reuse checkpoints?
+    #     model = load_model(model_file)
+
+    for i, seed in enumerate(load_seeds(lines, 5)):
+        output = generate_text(model, tokenizer, seed, nb_words, max_sequence_len)
+        print("%i %s -> %s" % (i, seed, output))
+
+    with open("./output/dawa.txt", "a+") as f:
+        while True:
+            input_text = input("> ")
+            text = generate_text(model, tokenizer, input_text, nb_words, max_sequence_len)
+
+            print(text)
+            f.writelines("%s\n" % text)
+
+
+def debug_unrandomize():
+    from numpy.random import seed
+    from tensorflow_core.python.framework.random_seed import set_random_seed
+
+    # set seeds for reproducibility
+    set_random_seed(2)
+    seed(1)
+
+
+if __name__ == '__main__':
+    debug_unrandomize()
+    main()
diff --git a/KoozDawa/dawa/loader.py b/KoozDawa/dawa/loader.py
deleted file mode 100644
index 8734aa5..0000000
--- a/KoozDawa/dawa/loader.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import os
-import string
-from pprint import pprint
-from random import choice, randint
-
-from numpy.random import seed
-from tensorflow_core.python.framework.random_seed import set_random_seed
-
-
-def load_kawa(root="./"):
-    # set seeds for reproducibility
-    set_random_seed(2)
-    seed(1)
-    data_dir = root + 'data/'
-    all_lines = []
-    files = os.listdir(data_dir)
-    print("%i files in data folder." % len(files))
-    for filename in files:
-        with open(data_dir + filename) as f:
-            content = f.readlines()
-            all_lines.extend(content)
-
-    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]
-                 ]
-    len(all_lines)
-    print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
-    return all_lines
-
-
-def load_seeds(kawa=None, nb_seeds=10):
-    if kawa is None:
-        kawa = load_kawa()
-    seeds = []
-    for i in range(nb_seeds):
-        plain_kawa = filter(lambda k: k != "\n", kawa)
-        chosen = choice(list(plain_kawa))
-        split = chosen.split(" ")
-        nb_words = randint(1, len(split))
-        seeds.append(" ".join(split[:nb_words]))
-    return seeds
-
-
-def clean_text(lines):
-    """
-    In dataset preparation step, we will first perform text cleaning of the data
-    which includes removal of punctuations and lower casing all the words.
- """ - lines = "".join(v for v in lines if v not in string.punctuation) - # lines = lines.encode("utf8").decode("ascii", 'ignore') - return lines - - -def main(): - lines = load_kawa("../") - clean = clean_text(lines) - print(clean) - print("Some seeds:") - pprint(load_seeds(lines)) - - -if __name__ == '__main__': - main() diff --git a/KoozDawa/dawa/lstm.py b/KoozDawa/dawa/lstm.py deleted file mode 100644 index 55bd633..0000000 --- a/KoozDawa/dawa/lstm.py +++ /dev/null @@ -1,113 +0,0 @@ -import warnings - -import numpy as np -from keras import Sequential -from keras.callbacks import ModelCheckpoint, EarlyStopping -from keras.layers import Embedding, LSTM, Dropout, Dense, Bidirectional -from keras.utils import to_categorical -from keras_preprocessing.sequence import pad_sequences -from keras_preprocessing.text import Tokenizer - -from KoozDawa.dawa.loader import load_kawa, clean_text, load_seeds -from KoozDawa.dawa.tokens import PoemTokenizer - -warnings.filterwarnings("ignore") -warnings.simplefilter(action='ignore', category=FutureWarning) - - -# 3.3 Padding the Sequences and obtain Variables : Predictors and Target -def generate_padded_sequences(input_sequences, total_words): - max_sequence_len = max([len(x) for x in input_sequences]) - input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')) - predictors, label = input_sequences[:, :-1], input_sequences[:, -1] - label = to_categorical(label, num_classes=total_words) - return predictors, label, max_sequence_len - - -def create_model(max_sequence_len, total_words, layers=128, dropout=0.3): # TODO finetune layers/dropout - print("Creating model across %i words for %i-long seqs (%i layers, %.2f dropout):" % - (total_words, max_sequence_len, layers, dropout)) - input_len = max_sequence_len - 1 - model = Sequential() - - # Add Input Embedding Layer - model.add(Embedding(total_words, 10, input_length=input_len)) - - # Add Hidden Layer 1 - LSTM Layer - model.add(LSTM(layers)) - # model.add(Bidirectional(LSTM(layers), input_shape=(max_sequence_len, total_words))) - model.add(Dropout(dropout)) - - # Add Output Layer - model.add(Dense(total_words, activation='softmax')) - - model.compile(optimizer='adam', # TODO: Try RMSprop(learning_rate=0.01) - loss='categorical_crossentropy', # TODO: Try sparse_categorical_crossentropy for faster training - metrics=['accuracy']) - - # TODO: Try alternative architectures - # https://medium.com/coinmonks/word-level-lstm-text-generator-creating-automatic-song-lyrics-with-neural-networks-b8a1617104fb#35f4 - return model - - -def generate_text(model: Sequential, tokenizer: Tokenizer, seed_text="", nb_words=5, max_sequence_len=0) -> str: - word_indices = {v: k for k, v in tokenizer.word_index.items()} - output = seed_text - - for _ in range(nb_words): - token_list = tokenizer.texts_to_sequences([output])[0] - token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre') - predicted = model.predict_classes(token_list, verbose=2)[0] - output += " " + word_indices[predicted] - return output.capitalize() - - -def main(): - # should_train = True - # model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch - nb_words = 20 - nb_epoch = 100 - nb_layers = 128 - dropout = .2 - tokenizer = PoemTokenizer() - - # if should_train: - lines = load_kawa() - - corpus = [clean_text(x) for x in lines] - print("Corpus:", corpus[:10]) - - inp_sequences, total_words = tokenizer.get_sequence_of_tokens(corpus) - predictors, label, max_sequence_len = 
-    predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
-    model = create_model(max_sequence_len, total_words, layers=nb_layers, dropout=dropout)
-    model.summary()
-
-    file_path = "../models/dawa/dawa_lstm%i-d%.1f-{epoch:02d}_%i-{accuracy:.4f}.hdf5" % (nb_layers, dropout, nb_epoch)
-    checkpoint = ModelCheckpoint(file_path, monitor='accuracy', period=10, save_best_only=True)
-    # print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
-    early_stopping = EarlyStopping(monitor='accuracy', patience=5)
-    callbacks_list = [checkpoint, early_stopping]
-
-    for i in range(0, nb_epoch, 10):
-        model.fit(predictors, label, initial_epoch=i, epochs=min(i + 10, nb_epoch), verbose=2, callbacks=callbacks_list)
-        print(generate_text(model, tokenizer, "", nb_words, max_sequence_len))
-
-    # model.save(model_file)
-    # else:  # FIXME: Load and predict, maybe reuse checkpoints?
-    #     model = load_model(model_file)
-
-    for i, seed in enumerate(load_seeds(lines, 5)):
-        output = generate_text(model, tokenizer, seed, nb_words, max_sequence_len)
-        print("%i %s -> %s" % (i, seed, output))
-
-    with open("./output/dawa.txt", "a+") as f:
-        while True:
-            input_text = input("> ")
-            text = generate_text(model, tokenizer, input_text, nb_words, max_sequence_len)
-
-            print(text)
-            f.writelines("%s\n" % text)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/KoozDawa/dawa/lyrics.py b/KoozDawa/dawa/lyrics.py
deleted file mode 100644
index 27426d3..0000000
--- a/KoozDawa/dawa/lyrics.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import lyricsgenius
-
-
-def fetch():
-    genius = lyricsgenius.Genius("zUSpjfQ9ELXDqOjx9hGfAlJGYQFrNvHh3rlDV298_QSr5ScKf3qlHZtOO2KsXspQ")
-    response = genius.search_artist("Dooz-kawa")
-
-    for hit in response["hits"]:
-        print(hit)
-
-
-def main():
-    fetch()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/KoozDawa/dawa/tokens.py b/KoozDawa/dawa/tokens.py
deleted file mode 100644
index 1e05230..0000000
--- a/KoozDawa/dawa/tokens.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from keras_preprocessing.text import Tokenizer
-
-from KoozDawa.dawa.loader import load_kawa
-
-
-class PoemTokenizer(Tokenizer):
-    def __init__(self, **kwargs) -> None:
-        super().__init__(lower=True,  # TODO: Better generalization without?
-                         filters='#$%&()*+/<=>@[\\]^_`{|}~\t\n', oov_token="😢",
-                         **kwargs)
-
-    def get_sequence_of_tokens(self, corpus):
-        self.fit_on_texts(corpus)
-        total_words = len(self.word_index) + 1
-
-        # convert data to sequence of tokens
-        input_sequences = []
-
-        for line in corpus:
-            token_list = self.texts_to_sequences([line])[0]
-            for i in range(1, len(token_list)):
-                n_gram_sequence = token_list[:i + 1]
-                input_sequences.append(n_gram_sequence)
-
-        texts = self.sequences_to_texts(input_sequences)
-        print("Tokenized:", texts[:5])
-
-        return input_sequences, total_words
-
-    def get_text(self, sequence):
-        return self.sequences_to_texts(sequence)
-
-
-if __name__ == '__main__':
-    kawa = load_kawa("../")
-    tokenizer = PoemTokenizer()
-    seqs, words = tokenizer.get_sequence_of_tokens(kawa)
-    texts = tokenizer.get_text(seqs)
-    print("%i words." % words)
diff --git a/KoozDawa/lyrics.py b/KoozDawa/lyrics.py
new file mode 100644
index 0000000..1b49ff1
--- /dev/null
+++ b/KoozDawa/lyrics.py
@@ -0,0 +1,18 @@
+import lyricsgenius
+
+
+def fetch():
+    genius = lyricsgenius.Genius("zUSpjfQ9ELXDqOjx9hGfAlJGYQFrNvHh3rlDV298_QSr5ScKf3qlHZtOO2KsXspQ")
+    response = genius.search_artist("Dooz-kawa")
+    print(response)
+
+    for hit in response["hits"]:
+        print(hit)
+
+
+def main():
+    fetch()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/KoozDawa/tweeper.py b/KoozDawa/tweeper.py
index 657b50e..2629baf 100755
--- a/KoozDawa/tweeper.py
+++ b/KoozDawa/tweeper.py
@@ -25,7 +25,7 @@ class Tweeper(object):
 
 
 def main():
-    Tweeper().tweet("un pont de paris sen souvient sur de toi")
+    Tweeper().tweet("les anges se sont fichés")
 
 
 # Authenticate to Twitter
diff --git a/glossolalia/loader.py b/glossolalia/loader.py
new file mode 100644
index 0000000..8734aa5
--- /dev/null
+++ b/glossolalia/loader.py
@@ -0,0 +1,62 @@
+import os
+import string
+from pprint import pprint
+from random import choice, randint
+
+from numpy.random import seed
+from tensorflow_core.python.framework.random_seed import set_random_seed
+
+
+def load_kawa(root="./"):
+    # set seeds for reproducibility
+    set_random_seed(2)
+    seed(1)
+    data_dir = root + 'data/'
+    all_lines = []
+    files = os.listdir(data_dir)
+    print("%i files in data folder." % len(files))
+    for filename in files:
+        with open(data_dir + filename) as f:
+            content = f.readlines()
+            all_lines.extend(content)
+
+    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]
+                 ]
+    len(all_lines)
+    print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
+    return all_lines
+
+
+def load_seeds(kawa=None, nb_seeds=10):
+    if kawa is None:
+        kawa = load_kawa()
+    seeds = []
+    for i in range(nb_seeds):
+        plain_kawa = filter(lambda k: k != "\n", kawa)
+        chosen = choice(list(plain_kawa))
+        split = chosen.split(" ")
+        nb_words = randint(1, len(split))
+        seeds.append(" ".join(split[:nb_words]))
+    return seeds
+
+
+def clean_text(lines):
+    """
+    In dataset preparation step, we will first perform text cleaning of the data
+    which includes removal of punctuations and lower casing all the words.
+ """ + lines = "".join(v for v in lines if v not in string.punctuation) + # lines = lines.encode("utf8").decode("ascii", 'ignore') + return lines + + +def main(): + lines = load_kawa("../") + clean = clean_text(lines) + print(clean) + print("Some seeds:") + pprint(load_seeds(lines)) + + +if __name__ == '__main__': + main() diff --git a/glossolalia/lstm.py b/glossolalia/lstm.py new file mode 100644 index 0000000..973f017 --- /dev/null +++ b/glossolalia/lstm.py @@ -0,0 +1,58 @@ +import warnings + +import numpy as np +from keras import Sequential +from keras.layers import Embedding, LSTM, Dropout, Dense +from keras.utils import to_categorical +from keras_preprocessing.sequence import pad_sequences +from keras_preprocessing.text import Tokenizer + +warnings.filterwarnings("ignore") +warnings.simplefilter(action='ignore', category=FutureWarning) + + +# 3.3 Padding the Sequences and obtain Variables : Predictors and Target +def generate_padded_sequences(input_sequences, total_words): + max_sequence_len = max([len(x) for x in input_sequences]) + input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')) + predictors, label = input_sequences[:, :-1], input_sequences[:, -1] + label = to_categorical(label, num_classes=total_words) + return predictors, label, max_sequence_len + + +def create_model(max_sequence_len, total_words, layers=128, dropout=0.3): # TODO finetune layers/dropout + print("Creating model across %i words for %i-long seqs (%i layers, %.2f dropout):" % + (total_words, max_sequence_len, layers, dropout)) + input_len = max_sequence_len - 1 + model = Sequential() + + # Add Input Embedding Layer + model.add(Embedding(total_words, 10, input_length=input_len)) + + # Add Hidden Layer 1 - LSTM Layer + model.add(LSTM(layers)) + # model.add(Bidirectional(LSTM(layers), input_shape=(max_sequence_len, total_words))) + model.add(Dropout(dropout)) + + # Add Output Layer + model.add(Dense(total_words, activation='softmax')) + + model.compile(optimizer='adam', # TODO: Try RMSprop(learning_rate=0.01) + loss='categorical_crossentropy', # TODO: Try sparse_categorical_crossentropy for faster training + metrics=['accuracy']) + + # TODO: Try alternative architectures + # https://medium.com/coinmonks/word-level-lstm-text-generator-creating-automatic-song-lyrics-with-neural-networks-b8a1617104fb#35f4 + return model + + +def generate_text(model: Sequential, tokenizer: Tokenizer, seed_text="", nb_words=5, max_sequence_len=0) -> str: + word_indices = {v: k for k, v in tokenizer.word_index.items()} + output = seed_text + + for _ in range(nb_words): + token_list = tokenizer.texts_to_sequences([output])[0] + token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre') + predicted = model.predict_classes(token_list, verbose=2)[0] + output += " " + word_indices[predicted] + return output.capitalize() \ No newline at end of file diff --git a/glossolalia/tokens.py b/glossolalia/tokens.py new file mode 100644 index 0000000..88d7c51 --- /dev/null +++ b/glossolalia/tokens.py @@ -0,0 +1,39 @@ +from keras_preprocessing.text import Tokenizer + +from glossolalia.loader import load_kawa + + +class PoemTokenizer(Tokenizer): + def __init__(self, **kwargs) -> None: + super().__init__(lower=True, # TODO: Better generalization without? 
+ filters='#$%&()*+/<=>@[\\]^_`{|}~\t\n', oov_token="😢", + **kwargs) + + def get_sequence_of_tokens(self, corpus): + self.fit_on_texts(corpus) + total_words = len(self.word_index) + 1 + + # convert data to sequence of tokens + input_sequences = [] + + for line in corpus: + token_list = self.texts_to_sequences([line])[0] + for i in range(1, len(token_list)): + n_gram_sequence = token_list[:i + 1] + input_sequences.append(n_gram_sequence) + + texts = self.sequences_to_texts(input_sequences) + print("Tokenized:", texts[:5]) + + return input_sequences, total_words + + def get_text(self, sequence): + return self.sequences_to_texts(sequence) + + +if __name__ == '__main__': + kawa = load_kawa("../") + tokenizer = PoemTokenizer() + seqs, words = tokenizer.get_sequence_of_tokens(kawa) + texts = tokenizer.get_text(seqs) + print("%i words." % words) -- libgit2 0.27.0