mirror of
https://github.com/WallyS02/Song-Lyrics-Generator.git
synced 2025-01-18 16:29:19 +00:00
Added statistical analysis based on Zipf's law, Heaps' law and Self-BLEU evaluation.
This commit is contained in:
parent
daafbb246e
commit
622cf00bd2
21484
Data/polish_mixtape.csv
Normal file
21484
Data/polish_mixtape.csv
Normal file
File diff suppressed because it is too large
Load Diff
1
Models/polish_mixtape.json
Normal file
1
Models/polish_mixtape.json
Normal file
File diff suppressed because one or more lines are too long
30
main.py
30
main.py
@ -2,9 +2,7 @@ import os
|
|||||||
import random
|
import random
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from scrapper import scrap_data
|
from scrapper import scrap_data
|
||||||
from markov_model import clean_data
|
from markov_model import clean_data, create_markov_model, generate_lyrics, self_BLEU, zipfs_law, plot_heaps_laws
|
||||||
from markov_model import create_markov_model
|
|
||||||
from markov_model import generate_lyrics
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
|
blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
|
||||||
@ -23,15 +21,20 @@ pathData = os.path.join(path, "Data")
|
|||||||
pathModels = os.path.join(path, "Models")
|
pathModels = os.path.join(path, "Models")
|
||||||
|
|
||||||
|
|
||||||
def create_model():
|
def print_file_list(filepath):
|
||||||
filelist = []
|
filelist = []
|
||||||
for file in os.listdir(pathData):
|
for file in os.listdir(filepath):
|
||||||
if os.path.isfile(os.path.join(pathData, file)):
|
if os.path.isfile(os.path.join(filepath, file)):
|
||||||
filelist.append(file)
|
filelist.append(file)
|
||||||
i = 0
|
i = 0
|
||||||
for file in filelist:
|
for file in filelist:
|
||||||
print(i, ": ", file)
|
print(i, ": ", file)
|
||||||
i += 1
|
i += 1
|
||||||
|
return filelist
|
||||||
|
|
||||||
|
|
||||||
|
def create_model():
|
||||||
|
filelist = print_file_list(pathData)
|
||||||
name = filelist[int(input("Select datafile: "))]
|
name = filelist[int(input("Select datafile: "))]
|
||||||
dataset = clean_data(os.path.join(pathData, name))
|
dataset = clean_data(os.path.join(pathData, name))
|
||||||
n_gram = int(input("Select number of words in Markov state: "))
|
n_gram = int(input("Select number of words in Markov state: "))
|
||||||
@ -42,14 +45,7 @@ def create_model():
|
|||||||
|
|
||||||
|
|
||||||
def generate_song():
|
def generate_song():
|
||||||
filelist = []
|
filelist = print_file_list(pathModels)
|
||||||
for file in os.listdir(pathModels):
|
|
||||||
if os.path.isfile(os.path.join(pathModels, file)):
|
|
||||||
filelist.append(file)
|
|
||||||
i = 0
|
|
||||||
for file in filelist:
|
|
||||||
print(i, ": ", file)
|
|
||||||
i += 1
|
|
||||||
model_name = filelist[int(input("Select model: "))]
|
model_name = filelist[int(input("Select model: "))]
|
||||||
with open(os.path.join(pathModels, model_name), 'r') as model_file:
|
with open(os.path.join(pathModels, model_name), 'r') as model_file:
|
||||||
model = json.loads(model_file.read())
|
model = json.loads(model_file.read())
|
||||||
@ -57,9 +53,13 @@ def generate_song():
|
|||||||
words_in_verses = int(input("Select number of words in verses: ")) - len(list(model.keys())[0].split(' '))
|
words_in_verses = int(input("Select number of words in verses: ")) - len(list(model.keys())[0].split(' '))
|
||||||
print('\n')
|
print('\n')
|
||||||
rime = None
|
rime = None
|
||||||
|
song = []
|
||||||
for i in range(number_of_verses):
|
for i in range(number_of_verses):
|
||||||
generated_lyrics, rime = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses, True if i % 2 == 1 else False, rime)
|
generated_lyrics, rime = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses, True if i % 2 == 1 else False, rime)
|
||||||
print(generated_lyrics)
|
print(generated_lyrics)
|
||||||
|
for state in generated_lyrics.split():
|
||||||
|
song.append(state.lower())
|
||||||
|
return song
|
||||||
|
|
||||||
|
|
||||||
def scraping():
|
def scraping():
|
||||||
@ -111,7 +111,7 @@ def merging():
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print("Select option:\n1. Create model based on datafile\n2. Generate lyrics with model.\n3. Scrap "
|
print("Select option:\n1. Create model based on datafile\n2. Generate lyrics with model\n3. Scrap "
|
||||||
"data\n4. Merge CSV band's songs\n5. Exit")
|
"data\n4. Merge CSV band's songs\n5. Exit")
|
||||||
while True:
|
while True:
|
||||||
selection = int(input())
|
selection = int(input())
|
||||||
|
@ -1,8 +1,13 @@
|
|||||||
|
import copy
|
||||||
|
import math
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
from nltk import SyllableTokenizer
|
from nltk import SyllableTokenizer
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
|
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
def clean_data(name):
|
def clean_data(name):
|
||||||
@ -100,3 +105,78 @@ def generate_lyrics(markov_model, start, limit, try_rhyme, rime):
|
|||||||
current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
|
current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
|
||||||
n += 1
|
n += 1
|
||||||
return lyrics, current_state
|
return lyrics, current_state
|
||||||
|
|
||||||
|
|
||||||
|
def get_bleu(verse, remaining_verses):
    """Compute one BLEU score per entry of ``remaining_verses``.

    Every entry is scored with ``sentence_bleu(verse, entry)`` using
    ``SmoothingFunction().method1`` as the smoothing function. The scores
    are returned as a list, in the same order as ``remaining_verses``.

    NOTE(review): ``sentence_bleu`` takes the references as its first
    argument and the hypothesis as its second, and NLTK documents the
    references as a list of token lists. ``verse`` is forwarded untouched,
    so this presumably relies on the caller passing it in that shape --
    confirm against the call site.
    """
    smoothing = SmoothingFunction().method1
    return [
        sentence_bleu(verse, candidate, smoothing_function=smoothing)
        for candidate in remaining_verses
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def self_BLEU(verses):
    """Return the mean pairwise BLEU score across ``verses`` (Self-BLEU).

    Each verse is scored by ``get_bleu`` against all the other verses, and
    the mean over every resulting score is returned.

    Args:
        verses: List of verses; each one is handed to ``get_bleu`` as-is.

    Returns:
        The mean of all scores, or NaN when fewer than two verses are given
        (there is nothing to compare against).
    """
    if len(verses) < 2:
        # No pair exists; return NaN explicitly rather than letting
        # np.mean warn about an empty array.
        return np.nan

    # Exclude the current verse by position. get_bleu only reads its
    # arguments, so no defensive deep copy of the whole list is required.
    bleu_scores = [
        get_bleu(verse, verses[:index] + verses[index + 1:])
        for index, verse in enumerate(verses)
    ]
    return np.mean(bleu_scores)
|
||||||
|
|
||||||
|
|
||||||
|
def zipfs_law(dataset, name, firstValues=1000):
    """Show two bar charts illustrating Zipf's law for ``dataset``.

    The first chart plots the occurrence count of the most frequent states
    in descending order. The second plots ``rank * count`` for the same
    states (rank starting at 1).

    Args:
        dataset: Iterable of hashable states to count.
        name: Text prepended to both chart titles.
        firstValues: Maximum number of top-ranked states to plot; values
            below zero are treated as zero.
    """
    from collections import Counter  # local import: stdlib only

    # Only the counts are plotted, so sort them once and slice once instead
    # of rebuilding the value list for every plotted bar.
    counts = sorted(Counter(dataset).values(), reverse=True)
    shown = counts[:max(firstValues, 0)]
    ranks = list(range(len(shown)))

    plt.bar(ranks, shown)
    plt.xlabel("states")
    plt.ylabel("occurrences")
    plt.title(name + " state histogram")
    plt.tight_layout()
    plt.show()

    # rank * frequency for each plotted state.
    constant_list = [(rank + 1) * count for rank, count in enumerate(shown)]
    plt.bar(ranks, constant_list)
    plt.xlabel("states")
    plt.ylabel("constants")
    plt.title(name + " state constants plot")
    plt.tight_layout()
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
def heaps_law(dataset, n_gram):
    """Return the ``(unique, total)`` pair plotted for Heaps' law.

    Args:
        dataset: Sized iterable of states.
        n_gram: Number of states combined into one Markov state; expected
            to be a non-negative integer.

    Returns:
        A tuple ``(unique, total)`` where ``unique`` is the number of
        ordered selections of ``n_gram`` distinct states out of the ``u``
        unique states in ``dataset`` (``u! / (u - n_gram)!``, which is 0
        when ``n_gram > u``) and ``total`` is ``len(dataset) ** n_gram``.
    """
    try:
        unique_count = len(set(dataset))
    except TypeError:
        # Unhashable states: fall back to an equality-based scan.
        seen = []
        for state in dataset:
            if state not in seen:
                seen.append(state)
        unique_count = len(seen)

    # Falling factorial u * (u - 1) * ... * (u - n_gram + 1), kept in exact
    # integer arithmetic: no float rounding and no huge factorials. Once a
    # factor reaches 0 the product stays 0, covering n_gram > u.
    arrangements = 1
    for offset in range(n_gram):
        arrangements *= unique_count - offset

    return arrangements, len(dataset) ** n_gram
|
||||||
|
|
||||||
|
|
||||||
|
def plot_heaps_laws(datasets, n_grams):
    """Plot one Heaps' law curve per n-gram size and show the figure.

    For every value in ``n_grams`` the ``(unique, total)`` pair returned by
    ``heaps_law`` is collected for each dataset and drawn as one line, with
    ``total`` on the x axis and ``unique`` on the y axis.

    Args:
        datasets: Iterable of datasets, each passed to ``heaps_law``.
            It is traversed once per n-gram size, so it should be
            re-iterable (e.g. a list).
        n_grams: Iterable of n-gram sizes; one line is drawn per size.
    """
    plt.xlabel("total number of states")
    plt.ylabel("unique number of states")
    plt.title("Heap's law")

    drew_line = False
    for n_gram in n_grams:
        totals = []
        uniques = []
        for dataset in datasets:
            unique, total = heaps_law(dataset, n_gram)
            totals.append(total)
            uniques.append(unique)
        # Label each line itself so the legend lists every n-gram size
        # against the line it belongs to.
        plt.plot(totals, uniques, linewidth=1.0,
                 label="n_gram: " + str(n_gram))
        drew_line = True

    if drew_line:
        plt.legend()
    plt.tight_layout()
    plt.show()
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
nltk
|
nltk
|
||||||
pandas
|
pandas
|
||||||
bs4
|
|
||||||
requests
|
requests
|
||||||
|
beautifulsoup4
|
||||||
|
numpy
|
||||||
|
bs4
|
||||||
|
matplotlib
|
Loading…
x
Reference in New Issue
Block a user