mirror of
https://github.com/WallyS02/Song-Lyrics-Generator.git
synced 2025-01-18 08:19:19 +00:00
Added statistical analysis based on Zipf's law, Heaps' law and Self-BLEU evaluation.
This commit is contained in:
parent
daafbb246e
commit
622cf00bd2
21484
Data/polish_mixtape.csv
Normal file
21484
Data/polish_mixtape.csv
Normal file
File diff suppressed because it is too large
Load Diff
1
Models/polish_mixtape.json
Normal file
1
Models/polish_mixtape.json
Normal file
File diff suppressed because one or more lines are too long
30
main.py
30
main.py
@ -2,9 +2,7 @@ import os
|
||||
import random
|
||||
import pandas as pd
|
||||
from scrapper import scrap_data
|
||||
from markov_model import clean_data
|
||||
from markov_model import create_markov_model
|
||||
from markov_model import generate_lyrics
|
||||
from markov_model import clean_data, create_markov_model, generate_lyrics, self_BLEU, zipfs_law, plot_heaps_laws
|
||||
import json
|
||||
|
||||
blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
|
||||
@ -23,15 +21,20 @@ pathData = os.path.join(path, "Data")
|
||||
pathModels = os.path.join(path, "Models")
|
||||
|
||||
|
||||
def create_model():
|
||||
def print_file_list(filepath):
    """Print a numbered menu of the regular files found in a directory.

    Sub-directories are skipped. Each file is printed as ``<index> :  <name>``
    so the caller can ask the user to pick an entry by its index.

    Args:
        filepath: Path of the directory to list.

    Returns:
        The list of file names (not full paths), in the same order they were
        printed, so that a printed index can be used directly on the result.
    """
    filelist = [
        entry
        for entry in os.listdir(filepath)
        if os.path.isfile(os.path.join(filepath, entry))
    ]
    for index, entry in enumerate(filelist):
        print(index, ": ", entry)
    return filelist
|
||||
|
||||
|
||||
def create_model():
|
||||
filelist = print_file_list(pathData)
|
||||
name = filelist[int(input("Select datafile: "))]
|
||||
dataset = clean_data(os.path.join(pathData, name))
|
||||
n_gram = int(input("Select number of words in Markov state: "))
|
||||
@ -42,14 +45,7 @@ def create_model():
|
||||
|
||||
|
||||
def generate_song():
|
||||
filelist = []
|
||||
for file in os.listdir(pathModels):
|
||||
if os.path.isfile(os.path.join(pathModels, file)):
|
||||
filelist.append(file)
|
||||
i = 0
|
||||
for file in filelist:
|
||||
print(i, ": ", file)
|
||||
i += 1
|
||||
filelist = print_file_list(pathModels)
|
||||
model_name = filelist[int(input("Select model: "))]
|
||||
with open(os.path.join(pathModels, model_name), 'r') as model_file:
|
||||
model = json.loads(model_file.read())
|
||||
@ -57,9 +53,13 @@ def generate_song():
|
||||
words_in_verses = int(input("Select number of words in verses: ")) - len(list(model.keys())[0].split(' '))
|
||||
print('\n')
|
||||
rime = None
|
||||
song = []
|
||||
for i in range(number_of_verses):
|
||||
generated_lyrics, rime = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses, True if i % 2 == 1 else False, rime)
|
||||
print(generated_lyrics)
|
||||
for state in generated_lyrics.split():
|
||||
song.append(state.lower())
|
||||
return song
|
||||
|
||||
|
||||
def scraping():
|
||||
@ -111,7 +111,7 @@ def merging():
|
||||
|
||||
|
||||
def main():
|
||||
print("Select option:\n1. Create model based on datafile\n2. Generate lyrics with model.\n3. Scrap "
|
||||
print("Select option:\n1. Create model based on datafile\n2. Generate lyrics with model\n3. Scrap "
|
||||
"data\n4. Merge CSV band's songs\n5. Exit")
|
||||
while True:
|
||||
selection = int(input())
|
||||
|
@ -1,8 +1,13 @@
|
||||
import copy
|
||||
import math
|
||||
import random
|
||||
import re
|
||||
from nltk import SyllableTokenizer
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def clean_data(name):
|
||||
@ -100,3 +105,78 @@ def generate_lyrics(markov_model, start, limit, try_rhyme, rime):
|
||||
current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
|
||||
n += 1
|
||||
return lyrics, current_state
|
||||
|
||||
|
||||
def get_bleu(verse, remaining_verses):
    """Compute a smoothed BLEU score between one verse and each other verse.

    ``sentence_bleu`` expects ``(references, hypothesis)`` where *references*
    is a list of token sequences and *hypothesis* is a single token sequence.
    The verse is therefore wrapped in a one-element reference list; passing it
    bare would make every token (or character) of it a separate reference.

    Args:
        verse: The verse used as the reference, either a list of tokens or a
            whitespace-separated string.
        remaining_verses: Iterable of the other verses, in the same format.

    Returns:
        A list with one BLEU score per entry of ``remaining_verses``, in order.
    """
    def _tokens(text):
        # BLEU works on token sequences; a raw string would otherwise be
        # iterated character by character.
        return text.split() if isinstance(text, str) else text

    smoothing = SmoothingFunction().method1
    references = [_tokens(verse)]
    return [
        sentence_bleu(references, _tokens(other_verse), smoothing_function=smoothing)
        for other_verse in remaining_verses
    ]
|
||||
|
||||
|
||||
def self_BLEU(verses):
    """Return the mean pairwise BLEU score of a collection of verses.

    Every verse is scored against all the other verses with ``get_bleu`` and
    the scores of all ordered pairs are averaged.

    Args:
        verses: List of verses in whatever format ``get_bleu`` accepts.

    Returns:
        The mean of all pairwise scores, or ``nan`` when there are fewer than
        two verses (no pair exists to score).
    """
    if len(verses) < 2:
        # No pairs to compare; return nan explicitly instead of letting
        # np.mean warn about an empty input.
        return float("nan")

    bleu_scores = []
    for index, verse in enumerate(verses):
        # Exclude the current verse by position; slicing builds a new list
        # without deep-copying every verse on every iteration.
        remaining_verses = verses[:index] + verses[index + 1:]
        bleu_scores.append(get_bleu(verse, remaining_verses))
    return np.mean(bleu_scores)
|
||||
|
||||
|
||||
def zipfs_law(dataset, name, firstValues=1000):
    """Plot a rank/frequency histogram and the rank*frequency products.

    Two bar charts are shown one after the other:

    1. the occurrence counts of the most frequent states, sorted descending;
    2. for the same states, ``rank * count`` (rank starting at 1).

    Args:
        dataset: Iterable of hashable states to count.
        name: Label prepended to both plot titles.
        firstValues: Maximum number of most-frequent states to plot.
    """
    histogram = {}
    for state in dataset:
        histogram[state] = histogram.get(state, 0) + 1

    # Only the counts are plotted, so sort them once and keep the top slice
    # instead of rebuilding the value list for every bar.
    top_counts = sorted(histogram.values(), reverse=True)[:max(firstValues, 0)]
    positions = list(range(len(top_counts)))

    plt.bar(positions, top_counts)
    plt.xlabel("states")
    plt.ylabel("occurrences")
    plt.title(name + " state histogram")
    plt.tight_layout()
    plt.show()

    constant_list = [rank * count for rank, count in enumerate(top_counts, start=1)]
    plt.xlabel("states")
    plt.ylabel("constants")
    plt.title(name + " state constants plot")
    plt.tight_layout()
    plt.bar(positions, constant_list)
    plt.show()
|
||||
|
||||
|
||||
def heaps_law(dataset, n_gram):
    """Return the (unique, total) state counts used for the Heaps' law plot.

    With ``u`` distinct states in ``dataset`` the first value is the number of
    ordered arrangements of ``n_gram`` distinct states,
    ``u * (u - 1) * ... * (u - n_gram + 1)``; the second value is
    ``len(dataset) ** n_gram``.

    Args:
        dataset: Sequence of states; assumed hashable, as they are when used
            as histogram keys elsewhere in this module.
        n_gram: Number of states per n-gram.

    Returns:
        A tuple ``(unique, total)`` of exact integers. ``unique`` is 0 when
        ``n_gram`` exceeds the number of distinct states.
    """
    unique_count = len(set(dataset))

    # Falling factorial in pure integer arithmetic: exact for any size (no
    # float rounding) and naturally 0 once a factor reaches zero, instead of
    # raising on the factorial of a negative number.
    arrangements = 1
    for offset in range(n_gram):
        arrangements *= unique_count - offset

    return arrangements, len(dataset) ** n_gram
|
||||
|
||||
|
||||
def plot_heaps_laws(datasets, n_grams):
    """Plot unique-versus-total state counts for several n-gram sizes.

    For every value in ``n_grams`` one curve is drawn, with one point per
    dataset computed by ``heaps_law``. All curves share a single figure.

    Args:
        datasets: Iterable of datasets accepted by ``heaps_law``.
        n_grams: Iterable of n-gram sizes; each gets its own labelled curve.
    """
    plt.xlabel("total number of states")
    plt.ylabel("unique number of states")
    plt.title("Heap's law")
    for n_gram in n_grams:
        totals = []
        uniques = []
        for dataset in datasets:
            unique, total = heaps_law(dataset, n_gram)
            totals.append(total)
            uniques.append(unique)
        # Attach the label to the curve itself; calling legend() with a single
        # label inside the loop would relabel only the first curve each time.
        plt.plot(totals, uniques, linewidth=1.0, label="n_gram: " + str(n_gram))
    plt.legend()
    plt.tight_layout()
    plt.show()
|
||||
|
@ -1,4 +1,7 @@
|
||||
nltk
|
||||
pandas
|
||||
bs4
|
||||
requests
|
||||
beautifulsoup4
|
||||
numpy
|
||||
bs4
|
||||
matplotlib
|
Loading…
x
Reference in New Issue
Block a user