Mirror of https://github.com/WallyS02/Song-Lyrics-Generator.git, synced 2025-01-18 16:29:19 +00:00

Commit e334953278 (parent 622cf00bd2)
Added statistical analysis based on Cross-Entropy and Perplexity.
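For reference (standard definitions, not text from the commit): the cross-entropy of a model q against an empirical word distribution p is H(p, q) = -sum_x p(x) * log2 q(x), measured in bits, and perplexity is 2^H. For example, H = 3 bits corresponds to a perplexity of 2^3 = 8, meaning the model is on average as uncertain as a uniform choice among 8 continuations.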
File diff suppressed because one or more lines are too long (shown for three files).
main.py (2 changes)
@@ -2,7 +2,7 @@ import os
 import random
 import pandas as pd
 from scrapper import scrap_data
-from markov_model import clean_data, create_markov_model, generate_lyrics, self_BLEU, zipfs_law, plot_heaps_laws
+from markov_model import clean_data, create_markov_model, generate_lyrics, self_BLEU, zipfs_law, plot_heaps_laws, cross_entropy, perplexity
 import json
 
 blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
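Only the import line of main.py changes in the visible part of the diff; the call sites of the two new functions are not shown. A minimal sketch of how they might be invoked, assuming the model is built from a token list and n_gram is the state length (the variable names and arguments below are illustrative, not the repository's actual code):

# Hypothetical wiring, not taken from the repository's main.py:
markov_model = create_markov_model(dataset, n_gram)
entropy = cross_entropy(markov_model, dataset, n_gram)
print("cross-entropy:", entropy)
print("perplexity:", perplexity(entropy))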
markov_model.py

@@ -36,7 +36,7 @@ def clean_data(name):
 
 def create_markov_model(dataset, n_gram):
     markov_model = {}
-    for i in range(len(dataset) - 1 - 2 * n_gram):
+    for i in range(len(dataset) - n_gram):
         current_state, next_state = "", ""
         for j in range(n_gram):
             current_state += dataset[i + j] + " "
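The only change in this hunk is the loop bound that decides how many start positions the model-building pass visits. A toy illustration of the difference (the token list and n_gram value below are made up, not from the repository):

dataset = ["we", "sold", "our", "souls", "for", "rock", "and", "roll"]  # 8 tokens
n_gram = 2

old_starts = list(range(len(dataset) - 1 - 2 * n_gram))  # [0, 1, 2]
new_starts = list(range(len(dataset) - n_gram))           # [0, 1, 2, 3, 4, 5]

# Each start i yields the current_state tokens dataset[i:i + n_gram],
# so with these toy values the new bound visits twice as many positions.
print([dataset[i:i + n_gram] for i in old_starts])
print([dataset[i:i + n_gram] for i in new_starts])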
@@ -180,3 +180,34 @@ def plot_heaps_laws(datasets, n_grams):
         plt.legend(["n_gram: " + str(n_gram)])
     plt.tight_layout()
     plt.show()
+
+
+def cross_entropy(model, text, k):
+    counts = {}
+    for i in range(len(text) - k):
+        gram = ""
+        for j in range(k):
+            gram += text[i + j] + " "
+        gram = gram[:-1]
+        if gram not in counts:
+            counts[gram] = 0
+        counts[gram] += 1
+
+    total = sum(counts.values())
+    probs = {gram: count / total for gram, count in counts.items()}
+
+    entropy = 0
+    for i in range(len(text) - k):
+        gram = ""
+        for j in range(k):
+            gram += text[i + j] + " "
+        gram = gram[:-1]
+        next_word = text[i + k]
+        if gram in model:
+            prob = model[gram].get(next_word, 0)
+            entropy -= np.log2(prob) * probs[gram]
+    return entropy
+
+
+def perplexity(entropy):
+    return pow(2, entropy)
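The new cross_entropy walks the text once to collect empirical state frequencies (probs), then walks it again and, for every state present in the model, subtracts log2 of the model's probability for the actual next word, weighted by that state's frequency; perplexity(entropy) is the usual 2^H. One practical caveat: model[gram].get(next_word, 0) can return 0, and np.log2(0) is -inf, so a single unseen (state, next word) pair drives the whole estimate to infinity. A minimal, hypothetical variant that floors the probability is sketched below; cross_entropy_floored, the eps constant, and the assumption that model[state] maps next words to probabilities are additions here, not part of the commit:

import numpy as np

def cross_entropy_floored(model, text, k, eps=1e-12):
    # Mirrors the committed cross_entropy, but clamps the predicted
    # probability to at least eps so np.log2 never receives 0.
    counts = {}
    for i in range(len(text) - k):
        gram = " ".join(text[i:i + k])
        counts[gram] = counts.get(gram, 0) + 1
    total = sum(counts.values())
    probs = {gram: c / total for gram, c in counts.items()}

    entropy = 0.0
    for i in range(len(text) - k):
        gram = " ".join(text[i:i + k])
        if gram in model:
            prob = max(model[gram].get(text[i + k], 0), eps)  # floor at eps
            entropy -= np.log2(prob) * probs[gram]
    return entropy

# Usage mirrors the committed functions, e.g.:
# print(perplexity(cross_entropy_floored(markov_model, tokens, n_gram)))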