diff --git a/CommentTuParles/parles.py b/CommentTuParles/parles.py index f4d3361..310f120 100644 --- a/CommentTuParles/parles.py +++ b/CommentTuParles/parles.py @@ -1,6 +1,9 @@ import json +import operator import os -from collections import defaultdict +from collections import defaultdict, Counter + +import spacy def are_invalid(lyrics: str): @@ -16,8 +19,7 @@ def are_invalid(lyrics: str): def cleanup(lyrics): lines = lyrics.split("\n") filtered = [l for l in lines if - l is not None and - len(l) and + l is not None and len(l) and l[0] not in ["#", "["] and "Paroles de la chanson" not in l and "Paroles de Même" not in l] @@ -27,33 +29,64 @@ def cleanup(lyrics): def analyse(): path = "../data/" - files = os.listdir(path) stats = { "avg_lines": defaultdict(lambda: 0), "avg_newlines": defaultdict(lambda: 0), "missing": defaultdict(lambda: 0) } + print("Go") + files = os.listdir(path) + print("Listed data files.") + nlp = spacy.load("fr_core_news_md") + print("Loaded spaCy.") + for filename in files: with open(path + filename) as f: content = json.load(f) + print("Loaded", filename) artist = content["name"] songs = content["songs"] print("\n## %s ##" % artist) + keywords = [] + occurences = {} for song in songs: - # title = song["title"] if song["lyrics"] is not None: + keyword = "anar" + occurences[song["title"]] = song["lyrics"].lower().count(keyword) + + if any([i > 0 for i in occurences.values()]): + print("%s chez %s: %i" % (keyword, artist, len(occurences))) + for pair in sorted(occurences.items(), key=operator.itemgetter(1), reverse=True): + print(pair) + else: + print("%s ne parle pas de %s" % (artist, keyword)) + + for song in songs: + # title = song["title"] + if song["lyrics"] is None: + stats["missing"][artist] += 1 + else: lyrics: str = song["lyrics"] + if are_invalid(lyrics): stats["missing"][artist] += 1 else: lyrics = cleanup(lyrics) stats["avg_lines"][artist] += len(lyrics) stats["avg_newlines"][artist] += lyrics.count("\n") - else: - 
stats["missing"][artist] += 1 + + doc = nlp(lyrics) + keywords.extend([token.text for token in doc if + token.is_stop != True and + token.is_punct != True and + token.pos_ in ["PRON", "NOUN", "ADJ", "VERB", "INTJ", "X"]]) + + # 20 most common non-stopword, non-punctuation tokens among the selected POS tags (not only nouns) + noun_freq = Counter(keywords) + common_nouns = noun_freq.most_common(20) final_missing = stats["missing"][artist] final_songs = len(songs) - final_missing @@ -66,6 +99,7 @@ def analyse(): stats["avg_lines"][artist], stats["avg_newlines"][artist], final_missing)) + print(common_nouns) if __name__ == '__main__': diff --git a/CommentTuParles/requirements.txt b/CommentTuParles/requirements.txt new file mode 100644 index 0000000..e394bb0 --- /dev/null +++ b/CommentTuParles/requirements.txt @@ -0,0 +1 @@ +spacy==2.2.3