import random
import re

import pandas as pd
from nltk.tokenize import SyllableTokenizer, word_tokenize


def clean_data(name):
    """Read a lyrics CSV and return a flat list of cleaned, lowercase words."""
    # word_tokenize requires the NLTK 'punkt' models (nltk.download('punkt')).
    document = pd.read_csv(name, usecols=["Lyrics"])
    rows = document["Lyrics"].values.tolist()
    dataset = []
    for lyric in rows:
        if isinstance(lyric, str):
            lyric = lyric.lower()
            # Drop parenthesised/bracketed annotations such as "(chorus)" or
            # "[verse 1]", speaker/section labels ("singer:", "part::"),
            # slash markers ("/guitar") and repetition counts ("x2") before
            # stripping punctuation, otherwise the characters these patterns
            # rely on are already gone and they never match.
            lyric = re.sub(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)", "", lyric)
            lyric = re.sub(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]", "", lyric)
            lyric = re.sub(r"[A-Za-z0-9]+::", "", lyric)
            lyric = re.sub(r"[A-Za-z0-9]+:", "", lyric)
            lyric = re.sub(r"/[A-Za-z0-9]+", "", lyric)
            lyric = re.sub(r"x[0-9]", "", lyric)
            # Strip remaining punctuation (hyphen and backslash escaped so
            # they are literals, not a character range).
            lyric = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=\-\\]", "", lyric)
            forbidden_words = [
                'chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction',
                'verse', 'pre-chorus', 'post-chorus', 'bridge', 'outro', 'ref',
            ]
            tokens = word_tokenize(lyric)
            words = [word for word in tokens if word.isalpha()]
            words = [word for word in words if word not in forbidden_words]
            dataset += words
    print(name.split('\\')[-1], "number of words in cleaned data:", len(dataset))
    return dataset


def create_markov_model(dataset, n_gram):
    """Build an n-gram Markov chain: each n-word state maps to the words that
    follow it in the dataset, weighted by their relative frequency."""
    markov_model = {}
    # Slide a window over the dataset; each position yields an n-word current
    # state and the single word that immediately follows it.
    for i in range(len(dataset) - n_gram):
        current_state, next_state = "", ""
        for j in range(n_gram):
            current_state += dataset[i + j] + " "
        next_state += dataset[i + n_gram]
        current_state = current_state[:-1]
        if current_state not in markov_model:
            markov_model[current_state] = {}
            markov_model[current_state][next_state] = 1
        else:
            if next_state in markov_model[current_state]:
                markov_model[current_state][next_state] += 1
            else:
                markov_model[current_state][next_state] = 1
    # Normalise the transition counts into probabilities.
    for current_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[current_state][state] = count / total
    return markov_model


def default_next_state(markov_model, current_state, lyrics):
    """Sample the next word from the current state's transition distribution
    and shift the state window forward by one word."""
    next_state = random.choices(
        list(markov_model[current_state].keys()),
        list(markov_model[current_state].values()),
    )
    lyrics += next_state[0] + " "
    n_gram = len(current_state.split(" "))
    # Rebuild the state from the last n_gram words; the trailing space leaves
    # an empty string at index -1, so the range stops at -2.
    current_state = ""
    for i in range(n_gram + 1, 1, -1):
        current_state += lyrics.split(" ")[-i] + " "
    current_state = current_state[:-1]
    return current_state, lyrics


def rhyming_next_state(rime_states, current_state, lyrics):
    """Like default_next_state, but sample only from the candidate words whose
    final syllable matches the rhyme target."""
    next_state = random.choices(list(rime_states.keys()), list(rime_states.values()))
    lyrics += next_state[0] + " "
    n_gram = len(current_state.split(" "))
    current_state = ""
    for i in range(n_gram + 1, 1, -1):
        current_state += lyrics.split(" ")[-i] + " "
    current_state = current_state[:-1]
    return current_state, lyrics


def generate_lyrics(markov_model, start, limit, try_rhyme, rime):
    """Generate `limit` words starting from `start`. If `try_rhyme` is True,
    the final word is chosen, when possible, so that its last syllable matches
    the last syllable of `rime`."""
    n = 0
    current_state = start
    lyrics = current_state + " "
    lyrics = lyrics[0].upper() + lyrics[1:]
    while n < limit:
        if n == limit - 1 and try_rhyme is True:
            rime = rime.split(" ")[-1]
            tk = SyllableTokenizer()
            rime_syllab = tk.tokenize(rime)[-1]
            # Keep only the candidate next words whose last syllable matches
            # the rhyme target and that are not the rhyme word itself.
            rime_states = {}
            for state, probability in markov_model[current_state].items():
                syllab = tk.tokenize(state)[-1]
                if rime_syllab == syllab and rime != state:
                    rime_states[state] = probability
            if rime_states:
                current_state, lyrics = rhyming_next_state(rime_states, current_state, lyrics)
            else:
                current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
        else:
            current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
        n += 1
    return lyrics, current_state
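

# A minimal usage sketch, assuming a lyrics CSV at the hypothetical path
# "lyrics.csv" with a "Lyrics" column; the n-gram order, line length, and
# rhyme seed below are illustrative choices, not values fixed by the module.
if __name__ == "__main__":
    dataset = clean_data("lyrics.csv")  # hypothetical input file
    model = create_markov_model(dataset, n_gram=2)

    # First line: no rhyme constraint; keep its text so the second line can
    # try to rhyme with its final word.
    start = random.choice(list(model.keys()))
    line_one, _ = generate_lyrics(model, start, limit=8,
                                  try_rhyme=False, rime="")

    # Second line: try to end on a word whose last syllable matches the last
    # word of the first line.
    start = random.choice(list(model.keys()))
    line_two, _ = generate_lyrics(model, start, limit=8,
                                  try_rhyme=True, rime=line_one.strip())

    print(line_one)
    print(line_two)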