Major updates in model + gathered some data.

2025-04-24 18:28:02 +00:00 · 2023-04-04 18:01:11 +02:00 · 2023-04-04 18:01:11 +02:00 · fbd287ea76
commit fbd287ea76
parent 307c291862
19 changed files with 108234 additions and 61 deletions
--- a/Data/ac_dc.csv
+++ b/Data/ac_dc.csv
--- a/Data/aerosmith.csv
+++ b/Data/aerosmith.csv
--- a/Data/alice_in_chains.csv
+++ b/Data/alice_in_chains.csv
--- a/Data/arctic_monkeys.csv
+++ b/Data/arctic_monkeys.csv
--- a/Data/depeche_mode.csv
+++ b/Data/depeche_mode.csv
--- a/Data/gorillaz.csv
+++ b/Data/gorillaz.csv
--- a/Data/jimi_hendrix.csv
+++ b/Data/jimi_hendrix.csv
--- a/Data/josh_homme.csv
+++ b/Data/josh_homme.csv
--- a/Data/kult.csv
+++ b/Data/kult.csv
--- a/Data/kyuss.csv
+++ b/Data/kyuss.csv
--- a/Data/led_zeppelin.csv
+++ b/Data/led_zeppelin.csv
--- a/Data/metallica.csv
+++ b/Data/metallica.csv
--- a/Data/queens_of_the_stone_age.csv
+++ b/Data/queens_of_the_stone_age.csv
--- a/Data/red_hot_chili_peppers.csv
+++ b/Data/red_hot_chili_peppers.csv
--- a/Data/the_cult.csv
+++ b/Data/the_cult.csv
--- a/Data/the_doors.csv
+++ b/Data/the_doors.csv
--- a/links.txt
+++ b/links.txt
@@ -2,4 +2,19 @@ https://www.azlyrics.com/p/pinkfloyd.html
 https://www.azlyrics.com/b/blacksabbath.html
 https://www.tekstowo.pl/piosenki_artysty,paktofonika.html
 https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html
-https://www.tekstowo.pl/piosenki_artysty,kuki.html
+https://www.tekstowo.pl/piosenki_artysty,kuki.html
+https://www.tekstowo.pl/piosenki_artysty,queens_of_the_stone_age.html
+https://www.tekstowo.pl/piosenki_artysty,kyuss.html
+https://www.tekstowo.pl/piosenki_artysty,depeche_mode.html
+https://www.tekstowo.pl/piosenki_artysty,ac_dc.html
+https://www.tekstowo.pl/piosenki_artysty,aerosmith.html
+https://www.tekstowo.pl/piosenki_artysty,alice_in_chains.html
+https://www.tekstowo.pl/piosenki_artysty,arctic_monkeys.html
+https://www.tekstowo.pl/piosenki_artysty,the_cult.html
+https://www.tekstowo.pl/piosenki_artysty,the_doors.html
+https://www.tekstowo.pl/piosenki_artysty,gorillaz.html
+https://www.tekstowo.pl/piosenki_artysty,jimi_hendrix.html
+https://www.tekstowo.pl/piosenki_artysty,kult.html
+https://www.tekstowo.pl/piosenki_artysty,led_zeppelin.html
+https://www.tekstowo.pl/piosenki_artysty,metallica.html
+https://www.tekstowo.pl/piosenki_artysty,red_hot_chili_peppers.html
--- a/main.py
+++ b/main.py
@@ -25,18 +25,13 @@ def generate_song(name):
    dataset = clean_data(os.path.join(path, name))
    n_gram = int(input("Select number of words in Markov state: "))
    number_of_verses = int(input("Select number of verses: "))
-    words_in_verses = int((int(input("Select number of words in verses: ")) - 1) / n_gram)
-    # degree_of_chain = int(input("Select degree of chain: "))
+    words_in_verses = int(input("Select number of words in verses: ")) - n_gram
    model = create_markov_model(dataset, n_gram)
    print('\n')
-    last_state = random.choice(list(model.keys()))
    rime = None
    for i in range(number_of_verses):
-        generated_lyrics, last_state = generate_lyrics(model, last_state, words_in_verses, True if i == 0 else False, rime)
+        generated_lyrics, rime = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses, True if i % 2 == 1 else False, rime)
        print(generated_lyrics)
-        rime = last_state
-        last_state = random.choices(list(model[last_state].keys()),
-                                    list(model[last_state].values()))[0]


 def scraping():
--- a/markov_model.py
+++ b/markov_model.py
@@ -1,10 +1,9 @@
+import math
 import random
 import re
 from nltk import SyllableTokenizer
 from nltk.tokenize import word_tokenize
 import pandas as pd
-import numpy as np
-from scipy import sparse


 def clean_data(name):
@@ -12,20 +11,21 @@ def clean_data(name):
    rows = document["Lyrics"].values.tolist()
    dataset = []
    for lyric in rows:
-        lyric = lyric.lower()
-        lyric = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", lyric)
-        lyric = re.sub(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)", "", lyric)
-        lyric = re.sub(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]", "", lyric)
-        lyric = re.sub(r"[A-Za-z0-9]+::", "", lyric)
-        lyric = re.sub(r"[A-Za-z0-9]+:", "", lyric)
-        lyric = re.sub(r"/[A-Za-z0-9]+", "", lyric)
-        lyric = re.sub(r"x[0-9]", "", lyric)
-        forbidden_words = ['chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction', 'verse', 'pre-chorus',
-                           'post-chorus', 'bridge', 'outro', 'ref']
-        tokens = word_tokenize(lyric)
-        words = [word for word in tokens if word.isalpha()]
-        words = [word for word in words if word not in forbidden_words]
-        dataset += words
+        if isinstance(lyric, str):
+            lyric = lyric.lower()
+            lyric = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", lyric)
+            lyric = re.sub(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)", "", lyric)
+            lyric = re.sub(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]", "", lyric)
+            lyric = re.sub(r"[A-Za-z0-9]+::", "", lyric)
+            lyric = re.sub(r"[A-Za-z0-9]+:", "", lyric)
+            lyric = re.sub(r"/[A-Za-z0-9]+", "", lyric)
+            lyric = re.sub(r"x[0-9]", "", lyric)
+            forbidden_words = ['chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction', 'verse', 'pre-chorus',
+                               'post-chorus', 'bridge', 'outro', 'ref']
+            tokens = word_tokenize(lyric)
+            words = [word for word in tokens if word.isalpha()]
+            words = [word for word in words if word not in forbidden_words]
+            dataset += words
    print(name.split('\\')[-1], "number of words in cleaned data: ", len(dataset))
    return dataset

@@ -36,9 +36,8 @@ def create_markov_model(dataset, n_gram):
        current_state, next_state = "", ""
        for j in range(n_gram):
            current_state += dataset[i + j] + " "
-            next_state += dataset[i + j + n_gram] + " "
+        next_state += dataset[i + n_gram]
        current_state = current_state[:-1]
-        next_state = next_state[:-1]
        if current_state not in markov_model:
            markov_model[current_state] = {}
            markov_model[current_state][next_state] = 1
@@ -51,58 +50,54 @@ def create_markov_model(dataset, n_gram):
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[current_state][state] = count / total
-    """matrix = [[0 for _ in range(len(markov_model.items()))] for _ in range(int(len(markov_model.items())))]
-    for current_state, transition in markov_model.items():
-        tempRow = list(markov_model.items())
-        indexRow = [idx for idx, key in enumerate(tempRow) if key[0] == current_state]
-        total = sum(transition.values())
-        for state, count in transition.items():
-            tempCol = list(transition.items())
-            indexCol = [idx for idx, key in enumerate(tempCol) if key[0] == state]
-            markov_model[current_state][state] = count / total
-            matrix[indexRow[0]][indexCol[0]] = markov_model[current_state][state]
-    matrix = np.array(matrix)
-    for i in range(n_step):
-        matrix = matrix.dot(matrix)
-        for current_state, transition in markov_model.items():
-            tempRow = list(markov_model.items())
-            indexRow = [idx for idx, key in enumerate(tempRow) if key[0] == current_state]
-            for state, count in transition.items():
-                tempCol = list(transition.items())
-                indexCol = [idx for idx, key in enumerate(tempCol) if key[0] == state]
-                markov_model[current_state][state] += matrix[indexRow[0]][indexCol[0]]"""
    return markov_model


-def generate_lyrics(markov_model, start, limit, isStartingVerse, rime):
+def default_next_state(markov_model, current_state, lyrics):
+    next_state = random.choices(list(markov_model[current_state].keys()),
+                                list(markov_model[current_state].values()))
+    lyrics += next_state[0] + " "
+    n_gram = len(current_state.split(" "))
+    current_state = ""
+    for i in range(n_gram + 1, 1, -1):
+        current_state += lyrics.split(" ")[-i] + " "
+    current_state = current_state[:-1]
+    return current_state, lyrics
+
+
+def rhyming_next_state(rime_states, current_state, lyrics):
+    next_state = random.choices(list(rime_states.keys()),
+                                list(rime_states.values()))
+    lyrics += next_state[0] + " "
+    n_gram = len(current_state.split(" "))
+    current_state = ""
+    for i in range(n_gram + 1, 1, -1):
+        current_state += lyrics.split(" ")[-i] + " "
+    current_state = current_state[:-1]
+    return current_state, lyrics
+
+
+def generate_lyrics(markov_model, start, limit, try_rhyme, rime):
    n = 0
    current_state = start
    lyrics = ""
    lyrics += current_state + " "
    lyrics = lyrics[0].upper() + lyrics[1:]
    while n < limit:
-        if n == limit - 1 and not isStartingVerse:
+        if n == limit - 1 and try_rhyme is True:
            rime = rime.split(" ")[-1]
            tk = SyllableTokenizer()
            rime_syllab = tk.tokenize(rime)[-1]
            rime_states = {}
            for state, probability in markov_model[current_state].items():
-                word = state.split(" ")[-1]
-                syllab = tk.tokenize(word)[-1]
-                if rime_syllab == syllab and rime != word:
+                syllab = tk.tokenize(state)[-1]
+                if rime_syllab == syllab and rime != state:
                    rime_states.update({state: probability})
            if rime_states:
-                next_state = random.choices(list(rime_states.keys()),
-                                            list(rime_states.values()))
-                current_state = next_state[0]
+                current_state, lyrics = rhyming_next_state(rime_states, current_state, lyrics)
            else:
-                next_state = random.choices(list(markov_model[current_state].keys()),
-                                            list(markov_model[current_state].values()))
-                current_state = next_state[0]
+                current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
        else:
-            next_state = random.choices(list(markov_model[current_state].keys()),
-                                        list(markov_model[current_state].values()))
-            current_state = next_state[0]
-        lyrics += current_state + " "
+            current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
        n += 1
    return lyrics, current_state