2023-03-26 13:22:02 +00:00
|
|
|
import random
|
2023-03-27 22:21:13 +00:00
|
|
|
import re
|
|
|
|
from nltk.tokenize import word_tokenize
|
|
|
|
import pandas as pd
|
2023-03-28 13:08:23 +00:00
|
|
|
import numpy as np
|
|
|
|
from scipy import sparse
|
2023-03-26 13:22:02 +00:00
|
|
|
|
|
|
|
|
2023-03-27 22:21:13 +00:00
|
|
|
def clean_data(name):
    """Read the ``Lyrics`` column of a CSV file and return a flat word list.

    Each lyric is lower-cased, stripped of section/speaker annotations and
    punctuation, tokenized with ``word_tokenize`` and filtered down to purely
    alphabetic tokens that are not song-structure keywords.

    Args:
        name: Path of the CSV file; it must contain a ``Lyrics`` column.

    Returns:
        A single list with the cleaned words of every lyric, in file order.

    Side effects:
        Prints the last backslash-separated component of ``name`` together
        with the number of words kept.
    """
    document = pd.read_csv(name, usecols=["Lyrics"])
    # Empty CSV cells are read as NaN (a float), which has no ``.lower()``;
    # drop them instead of crashing on the first missing lyric.
    rows = document["Lyrics"].dropna().tolist()

    # Annotation patterns must run BEFORE punctuation is stripped: they rely
    # on the brackets, parentheses, colons and slashes still being present.
    annotation_patterns = (
        re.compile(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)"),  # "(x2)", "(chorus)"
        re.compile(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]"),  # "[verse 1: name]"
        re.compile(r"[A-Za-z0-9]+::"),                 # "name::"
        re.compile(r"[A-Za-z0-9]+:"),                  # "name:"
        re.compile(r"/[A-Za-z0-9]+"),                  # "/name"
        re.compile(r"x[0-9]"),                         # repeat markers "x2"
    )
    # Explicit form of the characters the old class removed from lower-cased
    # text. The old "=-\\" was parsed as a range (which happened to cover
    # "[" and "\"), so the hyphen itself was never stripped; it is still left
    # alone so hyphenated tokens keep being rejected by ``isalpha`` below.
    punctuation = re.compile(r"[,.\"'!@#$%^&*(){}?/;`~:<>+=\[\\]")

    # Built once instead of once per lyric; a set gives O(1) membership tests.
    forbidden_words = {
        'chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction',
        'verse', 'pre-chorus', 'post-chorus', 'bridge', 'outro', 'ref',
    }

    dataset = []
    for lyric in rows:
        lyric = str(lyric).lower()
        for pattern in annotation_patterns:
            lyric = pattern.sub("", lyric)
        lyric = punctuation.sub("", lyric)

        dataset.extend(
            word
            for word in word_tokenize(lyric)
            if word.isalpha() and word not in forbidden_words
        )

    print(name.split('\\')[-1], "number of words in cleaned data: ", len(dataset))
    return dataset
|
|
|
|
|
|
|
|
|
2023-03-28 13:08:23 +00:00
|
|
|
def create_markov_model(dataset, n_gram, n_step):
    """Build an n-gram Markov transition table from a sequence of words.

    A state is ``n_gram`` consecutive words joined by single spaces. For every
    position in ``dataset`` the state starting there is linked to the state
    made of the following ``n_gram`` words, and the link counts are then
    normalised into probabilities per source state.

    Args:
        dataset: Sequence of word strings.
        n_gram: Number of words per state; expected to be at least 1.
        n_step: Accepted for backward compatibility; not used by this
            function.

    Returns:
        ``{state: {next_state: probability}}`` where the probabilities of
        each inner dict sum to 1. Empty when ``dataset`` holds fewer than
        ``2 * n_gram`` words.
    """
    transition_counts = {}

    # A window starting at i reads up to dataset[i + 2 * n_gram - 1], so the
    # last valid start is len(dataset) - 2 * n_gram (inclusive). The previous
    # bound stopped two windows early and dropped the final transitions.
    for i in range(len(dataset) - 2 * n_gram + 1):
        current_state = " ".join(dataset[i:i + n_gram])
        next_state = " ".join(dataset[i + n_gram:i + 2 * n_gram])

        followers = transition_counts.setdefault(current_state, {})
        followers[next_state] = followers.get(next_state, 0) + 1

    # Turn raw counts into per-state probability distributions.
    markov_model = {}
    for current_state, followers in transition_counts.items():
        total = sum(followers.values())
        markov_model[current_state] = {
            state: count / total for state, count in followers.items()
        }

    return markov_model
|
|
|
|
|
|
|
|
|
2023-03-27 22:21:13 +00:00
|
|
|
def generate_lyrics(markov_model, start, limit):
    """Generate text by a weighted random walk over a Markov model.

    Args:
        markov_model: Mapping ``{state: {next_state: weight}}``.
        start: State the walk begins with; it is also the first piece of the
            generated text.
        limit: Maximum number of transitions to take.

    Returns:
        A tuple ``(lyrics, current_state)``: the visited states joined by
        spaces (first character upper-cased, with a trailing space) and the
        state the walk ended on.

    Raises:
        KeyError: If ``limit`` is positive and ``start`` is not a state of
            ``markov_model``.
    """
    current_state = start
    visited = [current_state]

    steps_taken = 0
    while steps_taken < limit:
        if steps_taken == 0:
            # An unknown start state is a caller error: keep raising KeyError.
            transitions = markov_model[current_state]
        else:
            transitions = markov_model.get(current_state)
            if not transitions:
                # Dead end (e.g. the last n-gram of the corpus): return what
                # was generated so far instead of failing with KeyError.
                break

        current_state = random.choices(
            list(transitions.keys()), list(transitions.values())
        )[0]
        visited.append(current_state)
        steps_taken += 1

    # Every state, including the last, is followed by one space.
    lyrics = " ".join(visited) + " "
    lyrics = lyrics[0].upper() + lyrics[1:]
    return lyrics, current_state
|