songs-lyrics-generator/markov_model.py

import random
import re
from nltk.tokenize import word_tokenize
import pandas as pd


def clean_data(name):
    """Load lyrics from a CSV file and return them as one flat list of words.

    The CSV must contain a ``Lyrics`` column; rows with a missing value are
    skipped. Each lyric is lower-cased, stripped of annotations and
    punctuation, tokenized with ``word_tokenize`` and filtered down to purely
    alphabetic tokens that are not section names (``chorus``, ``verse``, ...).
    A one-line summary with the resulting word count is printed.

    Args:
        name: Path of the CSV file to read.

    Returns:
        list[str]: The cleaned words of every lyric, concatenated in file
        order.
    """
    # Annotation patterns key on brackets, colons and slashes, so they have to
    # run BEFORE the punctuation strip; applied afterwards they can never
    # match because those characters are already gone.
    annotation_patterns = (
        re.compile(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)"),  # parenthesised notes
        re.compile(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]"),  # bracketed notes
        re.compile(r"[A-Za-z0-9]+::"),                 # "label::" prefixes
        re.compile(r"[A-Za-z0-9]+:"),                  # "label:" prefixes
        re.compile(r"/[A-Za-z0-9]+"),                  # "/word" alternatives
    )
    # Every character is listed literally. The previous class ended in
    # `=-\\`, which the regex engine reads as the range '=' .. '\\' rather
    # than the three characters '=', '-', '\\'. As a result hyphens were never
    # stripped; that effective behaviour is kept here, so hyphenated tokens
    # are discarded by the isalpha() filter below.
    punctuation = re.compile(r"[,.\"'!@#$%^&*(){}\[\]?/;`~:<>+=\\]")
    repeat_marker = re.compile(r"x[0-9]")
    forbidden_words = frozenset([
        'chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction', 'verse',
        'pre-chorus', 'post-chorus', 'bridge', 'outro', 'ref',
    ])

    document = pd.read_csv(name, usecols=["Lyrics"])
    # Missing cells are read as NaN (a float) and would break .lower().
    rows = document["Lyrics"].dropna().astype(str).tolist()

    dataset = []
    for lyric in rows:
        lyric = lyric.lower()
        for pattern in annotation_patterns:
            lyric = pattern.sub("", lyric)
        lyric = punctuation.sub("", lyric)
        lyric = repeat_marker.sub("", lyric)
        dataset.extend(
            word for word in word_tokenize(lyric)
            if word.isalpha() and word not in forbidden_words
        )

    # Accept both Windows and POSIX separators when extracting the file name.
    file_name = re.split(r"[\\/]", str(name))[-1]
    print(file_name, "number of words in cleaned data: ", len(dataset))
    return dataset


def create_markov_model(dataset, n_gram):
    """Build a Markov transition table over n-gram states.

    A state is ``n_gram`` consecutive words joined by single spaces. For the
    state starting at index ``i`` the successor is the state starting at
    index ``i + n_gram`` (i.e. the following, non-overlapping n-gram).

    Args:
        dataset: Sequence of words (strings).
        n_gram: Number of words per state; must be at least 1.

    Returns:
        dict[str, dict[str, float]]: For every observed state, a mapping from
        each observed successor state to its relative frequency. Empty when
        ``dataset`` holds fewer than ``2 * n_gram`` words.

    Raises:
        ValueError: If ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be a positive integer")

    markov_model = {}
    # The successor n-gram ends at index i + 2 * n_gram - 1, so the last valid
    # start is len(dataset) - 2 * n_gram. The previous bound
    # (len - n_gram - 1) dropped the final transition for n_gram == 1 and ran
    # past the end of the data for n_gram >= 3.
    for i in range(len(dataset) - 2 * n_gram + 1):
        current_state = " ".join(dataset[i:i + n_gram])
        next_state = " ".join(dataset[i + n_gram:i + 2 * n_gram])
        transitions = markov_model.setdefault(current_state, {})
        transitions[next_state] = transitions.get(next_state, 0) + 1

    # Turn raw counts into relative frequencies, state by state.
    for transitions in markov_model.values():
        total = sum(transitions.values())
        for state, count in transitions.items():
            transitions[state] = count / total
    return markov_model


def generate_lyrics(markov_model, start, limit):
    """Generate text by a random walk through a Markov transition table.

    The output begins with ``start`` (first character upper-cased) and is
    followed by up to ``limit`` states, each drawn with ``random.choices``
    using the transition weights of the current state. Every state, including
    the last one, is followed by a single space.

    Args:
        markov_model: Mapping of state -> {successor state: weight}.
        start: Initial state; must be a key of ``markov_model`` when
            ``limit`` is greater than 0.
        limit: Maximum number of states to append after ``start``.

    Returns:
        str: The generated text. It is shorter than ``limit`` states when the
        walk reaches a state that has no outgoing transitions.

    Raises:
        KeyError: If ``limit`` is greater than 0 and ``start`` is not a state
            of ``markov_model``.
    """
    current_state = start
    opening = current_state + " "
    pieces = [opening[0].upper() + opening[1:]]

    steps_taken = 0
    while steps_taken < limit:
        if steps_taken == 0:
            # An unknown start state is a caller error: keep raising KeyError.
            transitions = markov_model[current_state]
        else:
            transitions = markov_model.get(current_state)
        if not transitions:
            # Dead end (e.g. the tail of the training text): stop instead of
            # failing part-way through the walk.
            break
        current_state = random.choices(
            list(transitions), weights=list(transitions.values())
        )[0]
        pieces.append(current_state + " ")
        steps_taken += 1
    return "".join(pieces)
Initial commit 2023-03-26 13:22:02 +00:00			`import random`
Interface change and minor improvement in data cleaning. 2023-03-27 22:21:13 +00:00			`import re`
			`from nltk.tokenize import word_tokenize`
			`import pandas as pd`
Initial commit 2023-03-26 13:22:02 +00:00

Interface change and minor improvement in data cleaning. 2023-03-27 22:21:13 +00:00			`def clean_data(name):`
			`document = pd.read_csv(name, usecols=["Lyrics"])`
			`rows = document["Lyrics"].values.tolist()`
			`dataset = []`
			`for lyric in rows:`
			`lyric = lyric.lower()`
			lyric = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", lyric)
			`lyric = re.sub(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)", "", lyric)`
			`lyric = re.sub(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]", "", lyric)`
			`lyric = re.sub(r"[A-Za-z0-9]+::", "", lyric)`
			`lyric = re.sub(r"[A-Za-z0-9]+:", "", lyric)`
			`lyric = re.sub(r"/[A-Za-z0-9]+", "", lyric)`
			`lyric = re.sub(r"x[0-9]", "", lyric)`
			`forbidden_words = ['chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction', 'verse', 'pre-chorus',`
			`'post-chorus', 'bridge', 'outro', 'ref']`
			`tokens = word_tokenize(lyric)`
			`words = [word for word in tokens if word.isalpha()]`
			`words = [word for word in words if word not in forbidden_words]`
			`dataset += words`
			`print(name.split('\\')[-1], "number of words in cleaned data: ", len(dataset))`
			`return dataset`


			`def create_markov_model(dataset, n_gram):`
Initial commit 2023-03-26 13:22:02 +00:00			`markov_model = {}`
			`for i in range(len(dataset) - n_gram - 1):`
			`current_state, next_state = "", ""`
			`for j in range(n_gram):`
			`current_state += dataset[i + j] + " "`
			`next_state += dataset[i + j + n_gram] + " "`
			`current_state = current_state[:-1]`
			`next_state = next_state[:-1]`
			`if current_state not in markov_model:`
			`markov_model[current_state] = {}`
			`markov_model[current_state][next_state] = 1`
			`else:`
			`if next_state in markov_model[current_state]:`
			`markov_model[current_state][next_state] += 1`
			`else:`
			`markov_model[current_state][next_state] = 1`
			`for current_state, transition in markov_model.items():`
			`total = sum(transition.values())`
			`for state, count in transition.items():`
			`markov_model[current_state][state] = count / total`
			`return markov_model`


Interface change and minor improvement in data cleaning. 2023-03-27 22:21:13 +00:00			`def generate_lyrics(markov_model, start, limit):`
Initial commit 2023-03-26 13:22:02 +00:00			`n = 0`
			`current_state = start`
			`lyrics = ""`
			`lyrics += current_state + " "`
Interface change and minor improvement in data cleaning. 2023-03-27 22:21:13 +00:00			`lyrics = lyrics[0].upper() + lyrics[1:]`
Initial commit 2023-03-26 13:22:02 +00:00			`while n < limit:`
			`next_state = random.choices(list(markov_model[current_state].keys()),`
			`list(markov_model[current_state].values()))`
			`current_state = next_state[0]`
			`lyrics += current_state + " "`
			`n += 1`
			`return lyrics`