songs-lyrics-generator/markov_model.py

67 lines
2.5 KiB
Python
Raw Normal View History

2023-03-26 13:22:02 +00:00
import random
import re
from nltk.tokenize import word_tokenize
import pandas as pd
2023-03-26 13:22:02 +00:00
def clean_data(name):
    """Load a lyrics CSV and return its cleaned words as one flat list.

    Only the ``Lyrics`` column of the file is read. Every lyric is
    lower-cased, stripped of annotations and punctuation, tokenized with
    ``nltk.tokenize.word_tokenize`` and reduced to purely alphabetic tokens
    that are not song-structure labels (``chorus``, ``verse``, ...).

    Args:
        name: Path of the CSV file; passed unchanged to ``pandas.read_csv``.

    Returns:
        list[str]: The cleaned words of all lyrics, concatenated in row order.

    Side effects:
        Prints the file name (the part after the last backslash) and the
        number of words kept.
    """
    # Annotation patterns run BEFORE the generic punctuation pass: they key
    # on "(", ")", "[", "]", ":" and "/", which that pass deletes, so running
    # them afterwards (as the code originally did) means they never match.
    annotation_patterns = (
        re.compile(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)"),  # parenthesised groups
        re.compile(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]"),  # bracketed groups
        re.compile(r"[A-Za-z0-9]+::"),                 # "label::" prefixes
        re.compile(r"[A-Za-z0-9]+:"),                  # "label:" prefixes
        re.compile(r"/[A-Za-z0-9]+"),                  # "/word" alternatives
        re.compile(r"x[0-9]"),                         # repeat markers ("x2")
    )
    # The hyphen is escaped so it is a literal; unescaped, "=-\\" is a range
    # from "=" to "\" that skips "-" and accidentally matches "[" and "A-Z".
    punctuation = re.compile(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=\-\\]")
    # "prechorus"/"postchorus" are what the hyphenated labels become once the
    # hyphen is stripped; the hyphenated spellings are kept for compatibility.
    forbidden_words = frozenset([
        'chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction', 'verse',
        'pre-chorus', 'post-chorus', 'prechorus', 'postchorus', 'bridge',
        'outro', 'ref',
    ])

    document = pd.read_csv(name, usecols=["Lyrics"])
    # Empty cells are read as NaN (a float), which has no .lower(); skip them.
    rows = document["Lyrics"].dropna().astype(str).values.tolist()

    dataset = []
    for lyric in rows:
        lyric = lyric.lower()
        for pattern in annotation_patterns:
            lyric = pattern.sub("", lyric)
        lyric = punctuation.sub("", lyric)
        dataset.extend(
            token
            for token in word_tokenize(lyric)
            if token.isalpha() and token not in forbidden_words
        )

    print(name.split('\\')[-1], "number of words in cleaned data: ", len(dataset))
    return dataset
def create_markov_model(dataset, n_gram):
    """Build an n-gram Markov transition table from a sequence of words.

    A state is ``n_gram`` consecutive words joined by single spaces. For each
    position ``i`` the state starting at ``i`` transitions to the state made
    of the ``n_gram`` words that immediately follow it.

    Args:
        dataset: Sequence of word strings (for example the output of
            ``clean_data``).
        n_gram: Number of words per state; must be at least 1.

    Returns:
        dict[str, dict[str, float]]: Maps each state to its successor states
        and their relative frequencies, which sum to 1 for every state. The
        dict is empty when ``dataset`` holds fewer than ``2 * n_gram`` words.

    Raises:
        ValueError: If ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be a positive integer")

    markov_model = {}
    # A window at i reads words i .. i + 2*n_gram - 1, so the last valid start
    # is len(dataset) - 2*n_gram. The previous bound (len - n_gram - 1)
    # dropped the final transition for n_gram == 1 and ran past the end of
    # the data (IndexError) for n_gram >= 3.
    for i in range(len(dataset) - 2 * n_gram + 1):
        current_state = " ".join(dataset[i:i + n_gram])
        next_state = " ".join(dataset[i + n_gram:i + 2 * n_gram])
        transitions = markov_model.setdefault(current_state, {})
        transitions[next_state] = transitions.get(next_state, 0) + 1

    # Turn the raw counts into probabilities, one state at a time.
    for transitions in markov_model.values():
        total = sum(transitions.values())
        for state, count in transitions.items():
            transitions[state] = count / total
    return markov_model
def generate_lyrics(markov_model, start, limit):
    """Generate text by randomly walking a Markov transition table.

    Args:
        markov_model: Mapping of state -> {next state: weight}, as returned
            by ``create_markov_model``.
        start: State to begin from; it is emitted first with its first
            character upper-cased.
        limit: Maximum number of states to append after ``start``.

    Returns:
        str: The visited states separated by single spaces, followed by one
        trailing space. Fewer than ``limit`` states are appended when the
        walk reaches a state that has no outgoing transitions.

    Raises:
        KeyError: If ``limit`` is positive and ``start`` is not a state of
            ``markov_model``.
    """
    if limit > 0 and start not in markov_model:
        raise KeyError(f"start state {start!r} is not in the Markov model")

    states = [start]
    current_state = start
    for _ in range(limit):
        transitions = markov_model.get(current_state)
        if not transitions:
            # Dead end: this state only ever occurred as a successor (e.g.
            # the last n-gram of the corpus). Stop instead of raising.
            break
        current_state = random.choices(
            list(transitions.keys()), list(transitions.values())
        )[0]
        states.append(current_state)

    # The trailing space is kept so the output format matches earlier versions.
    lyrics = " ".join(states) + " "
    return lyrics[0].upper() + lyrics[1:]