Added statistical analysis based on Zipf's law, Heaps' law and Self-BLEU evaluation.

This commit is contained in:
Sebastian Kutny 2023-04-22 00:12:26 +02:00
parent daafbb246e
commit 622cf00bd2
5 changed files with 21584 additions and 16 deletions

21484
Data/polish_mixtape.csv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

30
main.py
View File

@ -2,9 +2,7 @@ import os
import random import random
import pandas as pd import pandas as pd
from scrapper import scrap_data from scrapper import scrap_data
from markov_model import clean_data from markov_model import clean_data, create_markov_model, generate_lyrics, self_BLEU, zipfs_law, plot_heaps_laws
from markov_model import create_markov_model
from markov_model import generate_lyrics
import json import json
blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath", blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
@ -23,15 +21,20 @@ pathData = os.path.join(path, "Data")
pathModels = os.path.join(path, "Models") pathModels = os.path.join(path, "Models")
def create_model(): def print_file_list(filepath):
filelist = [] filelist = []
for file in os.listdir(pathData): for file in os.listdir(filepath):
if os.path.isfile(os.path.join(pathData, file)): if os.path.isfile(os.path.join(filepath, file)):
filelist.append(file) filelist.append(file)
i = 0 i = 0
for file in filelist: for file in filelist:
print(i, ": ", file) print(i, ": ", file)
i += 1 i += 1
return filelist
def create_model():
filelist = print_file_list(pathData)
name = filelist[int(input("Select datafile: "))] name = filelist[int(input("Select datafile: "))]
dataset = clean_data(os.path.join(pathData, name)) dataset = clean_data(os.path.join(pathData, name))
n_gram = int(input("Select number of words in Markov state: ")) n_gram = int(input("Select number of words in Markov state: "))
@ -42,14 +45,7 @@ def create_model():
def generate_song(): def generate_song():
filelist = [] filelist = print_file_list(pathModels)
for file in os.listdir(pathModels):
if os.path.isfile(os.path.join(pathModels, file)):
filelist.append(file)
i = 0
for file in filelist:
print(i, ": ", file)
i += 1
model_name = filelist[int(input("Select model: "))] model_name = filelist[int(input("Select model: "))]
with open(os.path.join(pathModels, model_name), 'r') as model_file: with open(os.path.join(pathModels, model_name), 'r') as model_file:
model = json.loads(model_file.read()) model = json.loads(model_file.read())
@ -57,9 +53,13 @@ def generate_song():
words_in_verses = int(input("Select number of words in verses: ")) - len(list(model.keys())[0].split(' ')) words_in_verses = int(input("Select number of words in verses: ")) - len(list(model.keys())[0].split(' '))
print('\n') print('\n')
rime = None rime = None
song = []
for i in range(number_of_verses): for i in range(number_of_verses):
generated_lyrics, rime = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses, True if i % 2 == 1 else False, rime) generated_lyrics, rime = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses, True if i % 2 == 1 else False, rime)
print(generated_lyrics) print(generated_lyrics)
for state in generated_lyrics.split():
song.append(state.lower())
return song
def scraping(): def scraping():
@ -111,7 +111,7 @@ def merging():
def main(): def main():
print("Select option:\n1. Create model based on datafile\n2. Generate lyrics with model.\n3. Scrap " print("Select option:\n1. Create model based on datafile\n2. Generate lyrics with model\n3. Scrap "
"data\n4. Merge CSV band's songs\n5. Exit") "data\n4. Merge CSV band's songs\n5. Exit")
while True: while True:
selection = int(input()) selection = int(input())

View File

@ -1,8 +1,13 @@
import copy
import math
import random import random
import re import re
from nltk import SyllableTokenizer from nltk import SyllableTokenizer
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def clean_data(name): def clean_data(name):
@ -100,3 +105,78 @@ def generate_lyrics(markov_model, start, limit, try_rhyme, rime):
current_state, lyrics = default_next_state(markov_model, current_state, lyrics) current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
n += 1 n += 1
return lyrics, current_state return lyrics, current_state
def get_bleu(verse, remaining_verses):
    """Score every other verse against ``verse`` with smoothed sentence BLEU.

    Args:
        verse: The verse used as the single BLEU reference.
            NOTE(review): presumably a sequence of word tokens, which is what
            NLTK's ``sentence_bleu`` expects - confirm against the caller.
        remaining_verses: Iterable of verses scored as hypotheses against
            ``verse``; assumed to have the same shape as ``verse``.

    Returns:
        A list with one BLEU score per entry of ``remaining_verses``, in the
        same order.
    """
    smoothing = SmoothingFunction().method1
    # sentence_bleu takes a *list* of references. Passing the bare verse makes
    # NLTK treat every element of it as a separate reference sentence, so the
    # verse is wrapped in a one-element list.
    return [
        sentence_bleu([verse], other_verse, smoothing_function=smoothing)
        for other_verse in remaining_verses
    ]
def self_BLEU(verses):
    """Return the mean pairwise BLEU score between the given verses.

    Every verse is scored (via ``get_bleu``) against all the other verses and
    the mean over all resulting scores is returned.

    Args:
        verses: Sequence of verses in whatever form ``get_bleu`` accepts.

    Returns:
        The mean of all pairwise scores, or NaN when fewer than two verses
        are given (there is nothing to compare a verse with).
    """
    verses = list(verses)
    if len(verses) < 2:
        # Averaging an empty score table would yield NaN anyway, but only
        # after numpy emits a RuntimeWarning; return it explicitly instead.
        return float("nan")
    # Slicing around the current index leaves out exactly one occurrence of
    # the verse without deep-copying the whole collection on every iteration.
    bleu_scores = [
        get_bleu(verse, verses[:index] + verses[index + 1:])
        for index, verse in enumerate(verses)
    ]
    return np.mean(bleu_scores)
def zipfs_law(dataset, name, firstValues=1000):
    """Show two bar charts illustrating Zipf's law for ``dataset``.

    The first chart plots the occurrence count of each distinct state, ordered
    from most to least frequent. The second plots ``rank * count`` for the
    same states (rank starting at 1). Each chart is displayed with
    ``plt.show()``.

    Args:
        dataset: Iterable of hashable states to count.
        name: Text prepended to both chart titles.
        firstValues: Maximum number of most frequent states to plot.

    Returns:
        None.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    # Only the counts are plotted, never the states themselves, so sorting the
    # counts once is enough; the list is no longer rebuilt for every bar.
    top_counts = sorted(Counter(dataset).values(), reverse=True)
    top_counts = top_counts[:max(firstValues, 0)]
    positions = list(range(len(top_counts)))

    plt.bar(positions, top_counts)
    plt.xlabel("states")
    plt.ylabel("occurrences")
    plt.title(name + " state histogram")
    plt.tight_layout()
    plt.show()

    # Under Zipf's law rank * frequency should stay roughly constant.
    constant_list = [
        rank * count for rank, count in enumerate(top_counts, start=1)
    ]
    plt.bar(positions, constant_list)
    plt.xlabel("states")
    plt.ylabel("constants")
    plt.title(name + " state constants plot")
    plt.tight_layout()
    plt.show()
def heaps_law(dataset, n_gram):
    """Return the (unique, total) state counts used for the Heaps' law plot.

    Args:
        dataset: Collection of hashable states.
        n_gram: Number of states per Markov state (non-negative integer).

    Returns:
        A tuple ``(unique, total)`` where ``unique`` is the number of ordered
        arrangements of ``n_gram`` distinct states drawn from the distinct
        states of ``dataset`` (0 when there are fewer than ``n_gram`` distinct
        states) and ``total`` is ``len(dataset) ** n_gram``.
    """
    # A set gives the distinct-state count in linear time instead of scanning
    # a growing list for every element.
    distinct_count = len(set(dataset))
    # math.perm computes n! / (n - k)! in exact integer arithmetic; the float
    # division used before lost precision for large vocabularies and raised
    # ValueError when n_gram exceeded the number of distinct states.
    return math.perm(distinct_count, n_gram), len(dataset) ** n_gram
def plot_heaps_laws(datasets, n_grams):
    """Plot unique versus total state counts for several n-gram sizes.

    For every value in ``n_grams`` one line is drawn whose points are the
    ``(total, unique)`` pairs returned by ``heaps_law`` for each dataset.
    The figure is displayed with ``plt.show()``.

    Args:
        datasets: Iterable of datasets, each passed to ``heaps_law``.
        n_grams: Iterable of n-gram sizes; one plotted line per size.

    Returns:
        None.
    """
    plt.xlabel("total number of states")
    plt.ylabel("unique number of states")
    plt.title("Heap's law")
    has_lines = False
    for n_gram in n_grams:
        points = [heaps_law(dataset, n_gram) for dataset in datasets]
        unique_counts = [unique for unique, _ in points]
        total_counts = [total for _, total in points]
        # Attach the label to the line itself; passing a one-element list to
        # plt.legend labelled only a single line, whatever the number of
        # n-gram sizes plotted.
        plt.plot(total_counts, unique_counts, linewidth=1.0,
                 label="n_gram: " + str(n_gram))
        has_lines = True
    if has_lines:
        # Skip the legend on an empty figure to avoid a "no handles" warning.
        plt.legend()
    plt.tight_layout()
    plt.show()

View File

@ -1,4 +1,7 @@
nltk nltk
pandas pandas
bs4
requests requests
beautifulsoup4
numpy
bs4
matplotlib