diff --git a/KoozDawa/dawa/loader.py b/KoozDawa/dawa/loader.py
new file mode 100644
index 0000000..31c3b88
--- /dev/null
+++ b/KoozDawa/dawa/loader.py
@@ -0,0 +1,43 @@
+import os
+import string
+
+from numpy.random import seed
+from tensorflow_core.python.framework.random_seed import set_random_seed
+
+
+def load_kawa(root="./"):
+    # set seeds for reproducibility
+    set_random_seed(2)
+    seed(1)
+    data_dir = root + 'data/'
+    all_lines = []
+    for filename in os.listdir(data_dir):
+        with open(data_dir + filename) as f:
+            content = f.readlines()
+        all_lines.extend(content)
+
+    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]
+                 ]
+    len(all_lines)
+    print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
+    return all_lines
+
+
+def clean_text(lines):
+    """
+    In dataset preparation step, we will first perform text cleaning of the data
+    which includes removal of punctuations and lower casing all the words.
+    """
+    lines = " ".join(v for v in lines if v not in string.punctuation).lower()
+    lines = lines.encode("utf8").decode("ascii", 'ignore')
+    return lines
+
+
+def main():
+    lines = load_kawa("../")
+    clean = clean_text(lines)
+    print(clean)
+
+
+if __name__ == '__main__':
+    main()
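Note on `clean_text` above: encoding to UTF-8 and then decoding as ASCII with `'ignore'` silently drops every accented character instead of transliterating it, which is presumably what the `# TODO Tokenize while keeping accents` notes elsewhere in the diff refer to. A minimal sketch of that round-trip, not part of the diff, on a made-up lyric line:

```python
# Not part of the diff: shows what the ascii-'ignore' round-trip in clean_text
# does to accented French text. The sample line is hypothetical.
line = "L'étoile décolle"
folded = line.encode("utf8").decode("ascii", "ignore")
print(folded)  # L'toile dcolle -- accented characters are dropped, not transliterated
```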
diff --git a/KoozDawa/lstm.py b/KoozDawa/dawa/lstm.py
similarity index 62%
rename from KoozDawa/lstm.py
rename to KoozDawa/dawa/lstm.py
index c55f141..4c3014f 100644
--- a/KoozDawa/lstm.py
+++ b/KoozDawa/dawa/lstm.py
@@ -1,79 +1,26 @@
-import os
-import string
 import warnings
 
 import numpy as np
 from keras import Sequential
 from keras.engine.saving import load_model
 from keras.layers import Embedding, LSTM, Dropout, Dense
-from keras.preprocessing.text import Tokenizer
 from keras.utils import to_categorical
 from keras_preprocessing.sequence import pad_sequences
-from numpy.random import seed
-from tensorflow_core.python.framework.random_seed import set_random_seed
+from keras_preprocessing.text import Tokenizer
+
+from KoozDawa.dawa.loader import load_kawa, clean_text
+from KoozDawa.dawa.tokens import get_sequence_of_tokens
 
 
 warnings.filterwarnings("ignore")
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
 
-def load():
-    # set seeds for reproducibility
-    set_random_seed(2)
-    seed(1)
-    data_dir = 'data/'
-    all_lines = []
-    for filename in os.listdir(data_dir):
-        with open(data_dir + filename) as f:
-            content = f.readlines()
-        all_lines.extend(content)
-
-    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]
-                 ]
-    len(all_lines)
-    print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
-    return all_lines
-
-
-# 3.1 Dataset cleaning
-# In dataset preparation step, we will first perform text cleaning of the data which includes removal of punctuations
-# and lower casing all the words.
-
-def clean_text(txt):
-    txt = " ".join(v for v in txt if v not in string.punctuation).lower()
-    txt = txt.encode("utf8").decode("ascii", 'ignore')
-    return txt
-
-
-# 3.2 Generating Sequence of N-gram Tokens
-#
-# Language modelling requires a sequence input data, as given a sequence (of words/tokens) the aim is the predict next word/token.
-#
-# The next step is Tokenization. Tokenization is a process of extracting tokens (terms / words) from a corpus. Python’s library Keras has inbuilt model for tokenization which can be used to obtain the tokens and their index in the corpus. After this step, every text document in the dataset is converted into sequence of tokens.
-
-
-tokenizer = Tokenizer()
-
-
-def get_sequence_of_tokens(corpus):
-    # TODO Tokenize while keeping accents
-    tokenizer.fit_on_texts(corpus)
-    total_words = len(tokenizer.word_index) + 1
-
-    # convert data to sequence of tokens
-    input_sequences = []
-    for line in corpus:
-        token_list = tokenizer.texts_to_sequences([line])[0]
-        for i in range(1, len(token_list)):
-            n_gram_sequence = token_list[:i + 1]
-            input_sequences.append(n_gram_sequence)
-    return input_sequences, total_words
-
-
-# 3.3 Padding the Sequences and obtain Variables : Predictors and Target¶
+# 3.3 Padding the Sequences and obtain Variables : Predictors and Target
 def generate_padded_sequences(input_sequences, total_words):
     max_sequence_len = max([len(x) for x in input_sequences])
     input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
+    print("Max len:", max_sequence_len)
     predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
     label = to_categorical(label, num_classes=total_words)
     return predictors, label, max_sequence_len
@@ -98,7 +45,7 @@ def create_model(max_sequence_len, total_words, layers=100, dropout=0.1): # TOD
     return model
 
 
-def generate_text(seed_text, nb_words, model, max_sequence_len):
+def generate_text(model, tokenizer, seed_text="", nb_words=5, max_sequence_len=0):
     for _ in range(nb_words):
         token_list = tokenizer.texts_to_sequences([seed_text])[0]
         token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
@@ -115,17 +62,18 @@ def generate_text(seed_text, nb_words, model, max_sequence_len):
 
 def main():
     should_train = True
-    nb_epoch = 20
+    nb_epoch = 100
     model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch
-    max_sequence_len = 5 # TODO: Test different default
+    max_sequence_len = 5  # TODO: Test different default
+    tokenizer = Tokenizer()
 
     if should_train:
-        lines = load()
+        lines = load_kawa()
 
         corpus = [clean_text(x) for x in lines]
         print(corpus[:10])
 
-        inp_sequences, total_words = get_sequence_of_tokens(corpus)
+        inp_sequences, total_words = get_sequence_of_tokens(corpus, tokenizer)
         print(inp_sequences[:10])
 
         predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
@@ -139,12 +87,12 @@ def main():
     else:
         model = load_model(model_file)
 
-    print(generate_text("", 10, model, max_sequence_len))
-    print(generate_text("L'étoile", 10, model, max_sequence_len))
+    for sample in ["", "L'étoile ", "Elle ", "Les punchlines "]:
+        print(generate_text(model, tokenizer, sample, 100, max_sequence_len))
 
     while True:
         input_text = input("> ")
-        print(generate_text(input_text, 10, model, max_sequence_len))
+        print(generate_text(model, tokenizer, input_text, 100, max_sequence_len))
 
 
 if __name__ == '__main__':
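For context on `generate_padded_sequences`: the n-gram sequences are pre-padded to a common length, then each padded row is split into predictors (every token but the last) and a one-hot label (the last token). A minimal sketch with toy values, not part of the diff; the sequences and the vocabulary size of 5 are made up:

```python
# Not part of the diff: toy illustration of pre-padding and the predictors/label split.
import numpy as np
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences

toy_sequences = [[2, 3], [2, 3, 4]]                 # made-up n-grams; vocabulary size assumed to be 5
padded = np.array(pad_sequences(toy_sequences, maxlen=3, padding='pre'))
# padded == [[0, 2, 3],
#            [2, 3, 4]]
predictors, label = padded[:, :-1], padded[:, -1]   # label = last token of each n-gram
label = to_categorical(label, num_classes=5)        # one-hot targets, shape (2, 5)
print(predictors)
print(label)
```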
print("Tokenized:", texts) + + return input_sequences, total_words + + +if __name__ == '__main__': + kawa = load_kawa("../") + seqs, words = get_sequence_of_tokens(kawa) + print("%i words." % words)