diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..52d1498
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,129 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/KoozDawa/data/savoir.txt b/KoozDawa/data/savoir.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/KoozDawa/data/savoir.txt
diff --git a/KoozDawa/lstm.py b/KoozDawa/lstm.py
new file mode 100644
index 0000000..2d5eccb
--- /dev/null
+++ b/KoozDawa/lstm.py
@@ -0,0 +1,147 @@
+import os
+import string
+import warnings
+
+import numpy as np
+from keras import Sequential
+from keras.layers import Embedding, LSTM, Dropout, Dense
+from keras.preprocessing.text import Tokenizer
+from keras.utils import to_categorical
+from keras_preprocessing.sequence import pad_sequences
+from numpy.random import seed
+import tensorflow as tf
+
+warnings.filterwarnings("ignore")
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+
+def load():
+    # Set seeds for reproducibility.
+    tf.random.set_seed(2)
+    seed(1)
+    data_dir = 'data/'
+    all_lines = []
+    for filename in os.listdir(data_dir):
+        with open(data_dir + filename, encoding="utf-8") as f:
+            content = f.readlines()
+        all_lines.extend(content)
+
+    # Drop annotation lines that start with "[" (e.g. section tags).
+    all_lines = [h for h in all_lines if h[0] != "["]
+    print("Loaded %d lines of data, e.g.:" % len(all_lines), all_lines[0])
+    return all_lines
+
+
+# 3.1 Dataset cleaning
+# In the dataset preparation step, we first clean the text: remove punctuation
+# and lower-case all the words.
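+# For example (hypothetical input line), "Tu sais, l'époque est dure." would
+# become "tu sais lpoque est dure": punctuation is removed and the text is
+# lower-cased, then the ASCII round-trip below drops accents such as "é".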
+
+def clean_text(txt):
+    # Strip punctuation character by character, then lower-case.
+    txt = "".join(v for v in txt if v not in string.punctuation).lower()
+    # Re-encoding to ASCII silently drops any remaining non-ASCII characters.
+    txt = txt.encode("utf8").decode("ascii", 'ignore')
+    return txt
+
+
+# 3.2 Generating a sequence of n-gram tokens
+#
+# Language modelling requires sequence input data: given a sequence of
+# words/tokens, the aim is to predict the next word/token.
+#
+# The next step is tokenization, the process of extracting tokens (terms/words)
+# from a corpus. The Keras library has a built-in Tokenizer which can be used
+# to obtain the tokens and their index in the corpus. After this step, every
+# text document in the dataset is converted into a sequence of tokens.
+
+
+tokenizer = Tokenizer()
+
+
+def get_sequence_of_tokens(corpus):
+    # TODO: Tokenize while keeping accents
+    tokenizer.fit_on_texts(corpus)
+    total_words = len(tokenizer.word_index) + 1
+
+    # Convert each line into all n-gram prefixes of its token sequence,
+    # e.g. [a, b, c] yields [a, b] and [a, b, c].
+    input_sequences = []
+    for line in corpus:
+        token_list = tokenizer.texts_to_sequences([line])[0]
+        for i in range(1, len(token_list)):
+            n_gram_sequence = token_list[:i + 1]
+            input_sequences.append(n_gram_sequence)
+    return input_sequences, total_words
+
+
+# 3.3 Padding the sequences to obtain the variables: predictors and target
+def generate_padded_sequences(input_sequences, total_words):
+    max_sequence_len = max(len(x) for x in input_sequences)
+    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
+
+    # The last token of every padded sequence is the target; the rest are predictors.
+    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
+    label = to_categorical(label, num_classes=total_words)
+    return predictors, label, max_sequence_len
+
+
+def create_model(max_sequence_len, total_words):
+    input_len = max_sequence_len - 1
+    model = Sequential()
+
+    # Add input embedding layer
+    model.add(Embedding(total_words, 10, input_length=input_len))
+
+    # Add hidden layer 1: LSTM
+    model.add(LSTM(100))  # TODO: finetune
+    model.add(Dropout(0.1))  # TODO: finetune
+
+    # Add output layer
+    model.add(Dense(total_words, activation='softmax'))
+
+    model.compile(loss='categorical_crossentropy', optimizer='adam')
+
+    return model
+
+
+def generate_text(seed_text, nb_words, model, max_sequence_len):
+    for _ in range(nb_words):
+        token_list = tokenizer.texts_to_sequences([seed_text])[0]
+        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
+        predicted = model.predict_classes(token_list, verbose=0)[0]
+
+        output_word = ""
+        for word, index in tokenizer.word_index.items():
+            if index == predicted:
+                output_word = word
+                break
+        seed_text += " " + output_word
+    return seed_text.title()
+
+
+def main():
+    lines = load()
+
+    corpus = [clean_text(x) for x in lines]
+    print(corpus[:10])
+
+    inp_sequences, total_words = get_sequence_of_tokens(corpus[:10])  # FIXME: corpus truncated to 10 lines for debugging
+    print(inp_sequences[:10])
+
+    predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
+    print(predictors, label, max_sequence_len)
+
+    model = create_model(max_sequence_len, total_words)
+    model.summary()
+
+    model.fit(predictors, label, epochs=10, verbose=2)
+
+    print(generate_text("", 10, model, max_sequence_len))
+    print(generate_text("L'étoile", 10, model, max_sequence_len))
+
+
+if __name__ == '__main__':
+    main()
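
Note, a possible follow-up sketch (not part of this diff): predict_classes always
picks the arg-max token, so generation is deterministic and tends to repeat the
most frequent words. Below is a minimal temperature-sampling alternative in the
same Keras style; sample_next_word and its temperature parameter are illustrative
names, while model, tokenizer and max_sequence_len are assumed to be the objects
defined in lstm.py above.

import numpy as np
from keras_preprocessing.sequence import pad_sequences


def sample_next_word(model, tokenizer, seed_text, max_sequence_len, temperature=0.8):
    # Predict the full next-word distribution instead of only the arg-max class.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    probas = model.predict(token_list, verbose=0)[0].astype("float64")

    # Temperature < 1.0 sharpens the distribution, > 1.0 flattens it.
    logits = np.log(np.maximum(probas, 1e-10)) / temperature
    probas = np.exp(logits)
    probas /= probas.sum()

    # Draw a token id and map it back to a word; id 0 is the padding token.
    predicted = np.random.choice(len(probas), p=probas)
    index_word = {i: w for w, i in tokenizer.word_index.items()}
    return index_word.get(predicted, "")

Swapping this into generate_text's loop, in place of the predict_classes call and
the word_index scan, would make repeated generations differ from one run to the next.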