Added statistical analysis based on Zipf's law, Heaps' law and Self-BLEU evaluation.

2025-04-24 18:28:02 +00:00 · 2023-04-22 00:12:26 +02:00 · 2023-04-22 00:12:26 +02:00 · 622cf00bd2
commit 622cf00bd2
parent daafbb246e
5 changed files with 21584 additions and 16 deletions
--- a/Data/polish_mixtape.csv
+++ b/Data/polish_mixtape.csv
--- a/Models/polish_mixtape.json
+++ b/Models/polish_mixtape.json
--- a/main.py
+++ b/main.py
@ -2,9 +2,7 @@ import os
 import random
 import pandas as pd
 from scrapper import scrap_data
-from markov_model import clean_data
-from markov_model import create_markov_model
-from markov_model import generate_lyrics
+from markov_model import clean_data, create_markov_model, generate_lyrics, self_BLEU, zipfs_law, plot_heaps_laws
 import json

 blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
@ -23,15 +21,20 @@ pathData = os.path.join(path, "Data")
 pathModels = os.path.join(path, "Models")


-def create_model():
+def print_file_list(filepath):
    filelist = []
-    for file in os.listdir(pathData):
-        if os.path.isfile(os.path.join(pathData, file)):
+    for file in os.listdir(filepath):
+        if os.path.isfile(os.path.join(filepath, file)):
            filelist.append(file)
    i = 0
    for file in filelist:
        print(i, ": ", file)
        i += 1
+    return filelist
+
+
+def create_model():
+    filelist = print_file_list(pathData)
    name = filelist[int(input("Select datafile: "))]
    dataset = clean_data(os.path.join(pathData, name))
    n_gram = int(input("Select number of words in Markov state: "))
@ -42,14 +45,7 @@ def create_model():


 def generate_song():
-    filelist = []
-    for file in os.listdir(pathModels):
-        if os.path.isfile(os.path.join(pathModels, file)):
-            filelist.append(file)
-    i = 0
-    for file in filelist:
-        print(i, ": ", file)
-        i += 1
+    filelist = print_file_list(pathModels)
    model_name = filelist[int(input("Select model: "))]
    with open(os.path.join(pathModels, model_name), 'r') as model_file:
        model = json.loads(model_file.read())
@ -57,9 +53,13 @@ def generate_song():
    words_in_verses = int(input("Select number of words in verses: ")) - len(list(model.keys())[0].split(' '))
    print('\n')
    rime = None
+    song = []
    for i in range(number_of_verses):
        generated_lyrics, rime = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses, True if i % 2 == 1 else False, rime)
        print(generated_lyrics)
+        for state in generated_lyrics.split():
+            song.append(state.lower())
+    return song


 def scraping():
@ -111,7 +111,7 @@ def merging():


 def main():
-    print("Select option:\n1. Create model based on datafile\n2. Generate lyrics with model.\n3. Scrap "
+    print("Select option:\n1. Create model based on datafile\n2. Generate lyrics with model\n3. Scrap "
          "data\n4. Merge CSV band's songs\n5. Exit")
    while True:
        selection = int(input())
--- a/markov_model.py
+++ b/markov_model.py
@ -1,8 +1,13 @@
+import copy
+import math
 import random
 import re
 from nltk import SyllableTokenizer
 from nltk.tokenize import word_tokenize
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
 import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt


 def clean_data(name):
@ -100,3 +105,78 @@ def generate_lyrics(markov_model, start, limit, try_rhyme, rime):
            current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
        n += 1
    return lyrics, current_state
+
+
+def get_bleu(verse, remaining_verses):
+    """Compute the smoothed BLEU score of every other verse against ``verse``.
+
+    ``verse`` acts as the single reference sentence and each element of
+    ``remaining_verses`` as a hypothesis. Scores use NLTK's ``method1``
+    smoothing.
+
+    Args:
+        verse: the reference verse (presumably a list of tokens - verify
+            against the caller).
+        remaining_verses: iterable of verses to score against ``verse``.
+
+    Returns:
+        A list with one BLEU score per element of ``remaining_verses``,
+        in iteration order.
+    """
+    smoothing = SmoothingFunction().method1
+    # sentence_bleu expects a *list of reference sentences* as its first
+    # argument. Passing the bare verse made NLTK treat every token of the
+    # verse as a separate reference, so the verse is wrapped in a list.
+    references = [verse]
+    return [
+        sentence_bleu(references, other_verse, smoothing_function=smoothing)
+        for other_verse in remaining_verses
+    ]
+
+
+def self_BLEU(verses):
+    """Return the mean BLEU score of each verse against all the others.
+
+    For every verse, the first element equal to it is dropped from a copy of
+    ``verses`` and ``get_bleu`` scores the verse against what is left. The
+    result is the ``np.mean`` over all collected score lists.
+
+    Args:
+        verses: list of verses in whatever form ``get_bleu`` accepts.
+
+    Returns:
+        The mean of all pairwise scores, as returned by ``np.mean``.
+    """
+    scores_per_verse = []
+    for current in verses:
+        # A shallow copy is enough: the copy is only read by get_bleu.
+        others = list(verses)
+        others.remove(current)
+        scores_per_verse.append(get_bleu(current, others))
+    return np.mean(scores_per_verse)
+
+
+def zipfs_law(dataset, name, firstValues=1000):
+    """Show two bar charts illustrating Zipf's law for ``dataset``.
+
+    The first chart plots the occurrence count of each state, most frequent
+    first. The second plots ``rank * count`` (rank starting at 1) for the
+    same states. Both charts are limited to the ``firstValues`` most
+    frequent states and are displayed with ``plt.show()``.
+
+    Args:
+        dataset: iterable of hashable states to count.
+        name: label prepended to both chart titles.
+        firstValues: maximum number of ranks to plot.
+    """
+    # Local import keeps this block self-contained without touching the
+    # file-level import section.
+    from collections import Counter
+
+    # Occurrence counts sorted once, most frequent first, then truncated.
+    # Only the counts are plotted, so the order of tied states is irrelevant.
+    counts = sorted(Counter(dataset).values(), reverse=True)[:max(firstValues, 0)]
+    ranks = list(range(len(counts)))
+
+    plt.bar(ranks, counts)
+    plt.xlabel("states")
+    plt.ylabel("occurrences")
+    plt.title(name + " state histogram")
+    plt.tight_layout()
+    plt.show()
+
+    # Zipf's law predicts rank * frequency to be roughly constant.
+    constant_list = [rank * count for rank, count in enumerate(counts, start=1)]
+    plt.xlabel("states")
+    plt.ylabel("constants")
+    plt.title(name + " state constants plot")
+    plt.tight_layout()
+    plt.bar(ranks, constant_list)
+    plt.show()
+
+
+def heaps_law(dataset, n_gram):
+    """Return the (unique, total) state counts used for the Heaps' law plot.
+
+    Args:
+        dataset: sequence of states.
+        n_gram: number of states combined into one n-gram.
+
+    Returns:
+        A tuple ``(unique, total)`` where ``unique`` is the number of ordered
+        selections of ``n_gram`` distinct states out of the unique states in
+        ``dataset`` (``u! / (u - n_gram)!``) and ``total`` is
+        ``len(dataset) ** n_gram``. ``unique`` is 0 when ``n_gram`` exceeds
+        the number of unique states.
+    """
+    try:
+        unique_count = len(set(dataset))
+    except TypeError:
+        # Unhashable states: fall back to equality-based de-duplication.
+        unique_states = []
+        for state in dataset:
+            if state not in unique_states:
+                unique_states.append(state)
+        unique_count = len(unique_states)
+
+    total = len(dataset) ** n_gram
+    if n_gram > unique_count:
+        # factorial() of a negative number raises; no such selection exists.
+        return 0, total
+    # Integer floor division keeps the result exact; true division goes
+    # through a float and loses precision (or overflows) for large counts.
+    unique = math.factorial(unique_count) // math.factorial(unique_count - n_gram)
+    return unique, total
+
+
+def plot_heaps_laws(datasets, n_grams):
+    """Show one unique-vs-total states curve for every n-gram size.
+
+    For each value in ``n_grams``, ``heaps_law`` is evaluated on every
+    dataset and the resulting (total, unique) pairs are drawn as a line and
+    displayed with ``plt.show()``.
+
+    Args:
+        datasets: iterable of datasets accepted by ``heaps_law``.
+        n_grams: iterable of n-gram sizes; one plot is shown per size.
+    """
+    for n_gram in n_grams:
+        totals = []
+        uniques = []
+        for dataset in datasets:
+            unique, total = heaps_law(dataset, n_gram)
+            totals.append(total)
+            uniques.append(unique)
+        # Labels and title are applied per plot: plt.show() is called inside
+        # the loop, and with blocking GUI backends the shown figure is closed
+        # afterwards, so labels set once before the loop only reached the
+        # first figure.
+        plt.xlabel("total number of states")
+        plt.ylabel("unique number of states")
+        plt.title("Heap's law")
+        plt.plot(totals, uniques, linewidth=1.0)
+        plt.legend(["n_gram: " + str(n_gram)])
+        plt.tight_layout()
+        plt.show()
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,7 @@
 nltk
 pandas
-bs4
 requests
+beautifulsoup4
+numpy
+bs4
+matplotlib