From 5a4f5208a2d951c367e4c577927550d2061bd8d3 Mon Sep 17 00:00:00 2001 From: Paul-Louis NECH Date: Sun, 24 Nov 2019 15:52:08 +0100 Subject: [PATCH] feat(parles): stats words/lines, clean input --- CommentTuParles/parles.py | 50 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/CommentTuParles/parles.py b/CommentTuParles/parles.py index 0bb3c05..f4d3361 100644 --- a/CommentTuParles/parles.py +++ b/CommentTuParles/parles.py @@ -3,11 +3,34 @@ import os from collections import defaultdict +def are_invalid(lyrics: str): + placeholders = ["Please check back once the song has been released", + "Lyrics for this song have yet", + "Tell us that you would like to have the lyrics of this song"] + for placeholder in placeholders: + if lyrics.find(placeholder) != -1: + return True + return False + + +def cleanup(lyrics): + lines = lyrics.split("\n") + filtered = [l for l in lines if + l is not None and + len(l) and + l[0] not in ["#", "["] and + "Paroles de la chanson" not in l and + "Paroles de Même" not in l] + + return "\n".join(filtered) + + def analyse(): path = "../data/" files = os.listdir(path) stats = { - "avg_length": defaultdict(lambda: 0), + "avg_lines": defaultdict(lambda: 0), + "avg_newlines": defaultdict(lambda: 0), "missing": defaultdict(lambda: 0) } @@ -20,16 +43,29 @@ def analyse(): print("\n## %s ##" % artist) for song in songs: - title = song["title"] + # title = song["title"] if song["lyrics"] is not None: - lyrics = song["lyrics"] - stats["avg_length"][artist] += len(lyrics) + lyrics: str = song["lyrics"] + if are_invalid(lyrics): + stats["missing"][artist] += 1 + else: + lyrics = cleanup(lyrics) + stats["avg_lines"][artist] += len(lyrics) + stats["avg_newlines"][artist] += lyrics.count("\n") else: stats["missing"][artist] += 1 - stats["avg_length"][artist] /= len(songs) - print("%s: Average song is %i long (%i missing)." % ( - artist, stats["avg_length"][artist], stats["missing"][artist])) + final_missing = stats["missing"][artist] + final_songs = len(songs) - final_missing + + stats["avg_lines"][artist] /= final_songs + stats["avg_newlines"][artist] /= final_songs + + print("%s: %i songs, on average %i/%i long (%i missing)." % ( + artist, final_songs, + stats["avg_lines"][artist], + stats["avg_newlines"][artist], + final_missing)) if __name__ == '__main__': -- libgit2 0.27.0