import random
import re

import pandas as pd
from nltk.tokenize import SyllableTokenizer, word_tokenize


def clean_data(name):
    """Read a lyrics CSV and return a flat list of cleaned, lowercase words."""
    # word_tokenize requires the NLTK 'punkt' models (nltk.download('punkt')).
    document = pd.read_csv(name, usecols=["Lyrics"])
    rows = document["Lyrics"].values.tolist()
    dataset = []
    for lyric in rows:
        if isinstance(lyric, str):
            lyric = lyric.lower()
            # Drop parenthesised/bracketed annotations such as "(chorus)" or
            # "[verse 1]", speaker/section labels ("singer:", "part::"),
            # slash markers ("/guitar") and repetition counts ("x2") before
            # stripping punctuation, otherwise the characters these patterns
            # rely on are already gone and they never match.
            lyric = re.sub(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)", "", lyric)
            lyric = re.sub(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]", "", lyric)
            lyric = re.sub(r"[A-Za-z0-9]+::", "", lyric)
            lyric = re.sub(r"[A-Za-z0-9]+:", "", lyric)
            lyric = re.sub(r"/[A-Za-z0-9]+", "", lyric)
            lyric = re.sub(r"x[0-9]", "", lyric)
            # Strip remaining punctuation (hyphen and backslash escaped so
            # they are literals, not a character range).
            lyric = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=\-\\]", "", lyric)
            forbidden_words = [
                'chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction',
                'verse', 'pre-chorus', 'post-chorus', 'bridge', 'outro', 'ref',
            ]
            tokens = word_tokenize(lyric)
            words = [word for word in tokens if word.isalpha()]
            words = [word for word in words if word not in forbidden_words]
            dataset += words
    print(name.split('\\')[-1], "number of words in cleaned data:", len(dataset))
    return dataset


def create_markov_model(dataset, n_gram):
    """Build an n-gram Markov chain: each n-word state maps to the words that
    follow it in the dataset, weighted by their relative frequency."""
    markov_model = {}
    # Slide a window over the dataset; each position yields an n-word current
    # state and the single word that immediately follows it.
    for i in range(len(dataset) - n_gram):
        current_state, next_state = "", ""
        for j in range(n_gram):
            current_state += dataset[i + j] + " "
        next_state += dataset[i + n_gram]
        current_state = current_state[:-1]
        if current_state not in markov_model:
            markov_model[current_state] = {}
            markov_model[current_state][next_state] = 1
        else:
            if next_state in markov_model[current_state]:
                markov_model[current_state][next_state] += 1
            else:
                markov_model[current_state][next_state] = 1
    # Normalise the transition counts into probabilities.
    for current_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[current_state][state] = count / total
    return markov_model


def default_next_state(markov_model, current_state, lyrics):
    """Sample the next word from the current state's transition distribution
    and shift the state window forward by one word."""
    next_state = random.choices(
        list(markov_model[current_state].keys()),
        list(markov_model[current_state].values()),
    )
    lyrics += next_state[0] + " "
    n_gram = len(current_state.split(" "))
    # Rebuild the state from the last n_gram words; the trailing space leaves
    # an empty string at index -1, so the range stops at -2.
    current_state = ""
    for i in range(n_gram + 1, 1, -1):
        current_state += lyrics.split(" ")[-i] + " "
    current_state = current_state[:-1]
    return current_state, lyrics


def rhyming_next_state(rime_states, current_state, lyrics):
    """Like default_next_state, but sample only from the candidate words whose
    final syllable matches the rhyme target."""
    next_state = random.choices(list(rime_states.keys()), list(rime_states.values()))
    lyrics += next_state[0] + " "
    n_gram = len(current_state.split(" "))
    current_state = ""
    for i in range(n_gram + 1, 1, -1):
        current_state += lyrics.split(" ")[-i] + " "
    current_state = current_state[:-1]
    return current_state, lyrics


def generate_lyrics(markov_model, start, limit, try_rhyme, rime):
    """Generate `limit` words starting from `start`. If `try_rhyme` is True,
    the final word is chosen, when possible, so that its last syllable matches
    the last syllable of `rime`."""
    n = 0
    current_state = start
    lyrics = current_state + " "
    lyrics = lyrics[0].upper() + lyrics[1:]
    while n < limit:
        if n == limit - 1 and try_rhyme is True:
            rime = rime.split(" ")[-1]
            tk = SyllableTokenizer()
            rime_syllab = tk.tokenize(rime)[-1]
            # Keep only the candidate next words whose last syllable matches
            # the rhyme target and that are not the rhyme word itself.
            rime_states = {}
            for state, probability in markov_model[current_state].items():
                syllab = tk.tokenize(state)[-1]
                if rime_syllab == syllab and rime != state:
                    rime_states[state] = probability
            if rime_states:
                current_state, lyrics = rhyming_next_state(rime_states, current_state, lyrics)
            else:
                current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
        else:
            current_state, lyrics = default_next_state(markov_model, current_state, lyrics)
        n += 1
    return lyrics, current_state
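

# A minimal usage sketch, assuming a lyrics CSV at the hypothetical path
# "lyrics.csv" with a "Lyrics" column; the n-gram order, line length, and
# rhyme seed below are illustrative choices, not values fixed by the module.
if __name__ == "__main__":
    dataset = clean_data("lyrics.csv")  # hypothetical input file
    model = create_markov_model(dataset, n_gram=2)

    # First line: no rhyme constraint; keep its text so the second line can
    # try to rhyme with its final word.
    start = random.choice(list(model.keys()))
    line_one, _ = generate_lyrics(model, start, limit=8,
                                  try_rhyme=False, rime="")

    # Second line: try to end on a word whose last syllable matches the last
    # word of the first line.
    start = random.choice(list(model.keys()))
    line_two, _ = generate_lyrics(model, start, limit=8,
                                  try_rhyme=True, rime=line_one.strip())

    print(line_one)
    print(line_two)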