# Mirror of https://github.com/WallyS02/Song-Lyrics-Generator.git
# Synced 2024-11-20 17:38:51 +00:00
# 111 lines, 4.1 KiB, Python
|
import pandas as pd
|
||
|
from keras.preprocessing.text import Tokenizer, one_hot
|
||
|
import numpy as np
|
||
|
from keras.utils import to_categorical
|
||
|
|
||
|
|
||
|
def pad_array(array, length):
    """Return a new list: *array* right-padded with zeros up to *length*.

    Args:
        array: The list to pad. It is not modified.
        length: The desired minimum length of the result.

    Returns:
        A new list that starts with the items of ``array`` followed by as
        many ``0`` values as are needed to reach ``length``. When ``array``
        already has ``length`` or more items, the result is an unpadded copy
        (multiplying a list by a non-positive count yields an empty list).
    """
    missing = length - len(array)
    return array + [0] * missing
|
||
|
|
||
|
|
||
|
class DataProcessor:
    """Load song lyrics from a CSV file and turn them into training arrays.

    The lyrics come from the ``"Lyrics"`` column of the CSV file (missing
    values are dropped). A Keras ``Tokenizer`` is fitted on all of them, and
    the list is split in file order: the first 80 % for training, the rest
    for validation.

    Attributes set by ``__init__``:
        lyrics: Every non-null lyric read from the CSV file.
        tokenizer: The fitted ``Tokenizer`` (word or character level).
        mode: ``"words"`` or ``"chars"``, as passed in.
        seqs_length: Window length for the sliding-window datasets, or None.
        train_lyrics: The first 80 % of ``lyrics``.
        validation_lyrics: The remaining lyrics.
    """

    def __init__(self, csv_filename, seqs_length=None, mode="words"):
        """Read the lyrics, fit the tokenizer and split the data.

        Args:
            csv_filename: Path of a CSV file with a ``"Lyrics"`` column.
            seqs_length: Length of the fixed-size input windows used when
                datasets are built without padding. May be None if only
                padded datasets are requested.
            mode: ``"words"`` for a word-level tokenizer, ``"chars"`` for a
                character-level one.

        Raises:
            ValueError: If ``mode`` is neither ``"words"`` nor ``"chars"``.
        """
        self.lyrics = list(pd.read_csv(csv_filename)["Lyrics"].dropna())
        if mode == "words":
            self.tokenizer = Tokenizer()
        elif mode == "chars":
            self.tokenizer = Tokenizer(char_level=True)
        else:
            # format() rather than "+" so a non-string mode still produces
            # the intended ValueError instead of a TypeError.
            raise ValueError("Unsupported mode: {}".format(mode))
        self.mode = mode
        self.tokenizer.fit_on_texts(self.lyrics)
        self.seqs_length = seqs_length

        # Split by position: the first 80 % train, the remainder validates.
        split_index = int(0.8 * len(self.lyrics))
        self.train_lyrics = self.lyrics[:split_index]
        self.validation_lyrics = self.lyrics[split_index:]

    def texts_to_ints(self, texts):
        """Encode each text as a list of integer token ids via the tokenizer."""
        return self.tokenizer.texts_to_sequences(texts)

    def ints_to_text(self, ints):
        """Decode a single sequence of token ids back into one text string."""
        return self.tokenizer.sequences_to_texts([ints])[0]

    def texts_to_onehots(self, texts):
        """Encode each text with Keras ``one_hot`` using the vocabulary size.

        NOTE(review): per the Keras documentation ``one_hot`` hashes each
        word into an integer index and returns a list of integers, not
        one-hot vectors, and it does not use the fitted tokenizer's
        vocabulary - confirm this is the intended encoding.
        """
        n = self.vocab_size()
        return [one_hot(lyric, n) for lyric in texts]

    def vocab_size(self):
        """Return the number of entries in the tokenizer's word index plus one.

        The extra slot leaves room for id 0, the value ``pad_array`` pads with.
        """
        return len(self.tokenizer.word_index) + 1

    def max_length(self, texts):
        """Return the token count of the longest text, or 0 for no texts."""
        full_sequences = self.texts_to_ints(texts)
        return max((len(seq) for seq in full_sequences), default=0)

    def training_data(self, kind="ints", padded=False):
        """Build ``(X, y)`` arrays from the training lyrics.

        Args:
            kind: ``"ints"`` to encode with ``texts_to_ints`` or ``"onehots"``
                to encode with ``texts_to_onehots``.
            padded: When True, every prefix of every sequence becomes one
                sample, zero-padded to the longest sequence, with the next
                token as the target. When False, fixed windows of
                ``seqs_length`` tokens are used instead. ``padded=True`` takes
                precedence even if ``seqs_length`` is set.

        Returns:
            A tuple ``(X, y)``. ``X`` has shape ``(samples, width, 1)`` and
            ``y`` is the result of ``to_categorical`` over the target tokens
            with ``vocab_size()`` classes.

        Raises:
            ValueError: If ``kind`` is unknown, or if ``padded`` is False and
                no ``seqs_length`` was configured.
        """
        return self._build_dataset(self.train_lyrics, kind, padded)

    def validation_data(self, kind="ints", padded=False):
        """Build ``(X, y)`` arrays from the validation lyrics.

        Accepts the same arguments, returns the same structure and raises the
        same errors as ``training_data``; only the source lyrics differ.
        """
        return self._build_dataset(self.validation_lyrics, kind, padded)

    def _build_dataset(self, lyrics, kind, padded):
        """Encode *lyrics* and cut them into next-token prediction samples."""
        if kind == "ints":
            encode = self.texts_to_ints
        elif kind == "onehots":
            encode = self.texts_to_onehots
        else:
            raise ValueError("Kind must be either ints or onehots")

        # Without padding the window length is mandatory; fail before doing
        # any tokenisation work.
        if not padded and not self.seqs_length:
            raise ValueError(
                "Either padded must be True or seqs_length must be set"
            )

        full_sequences = encode(lyrics)
        X = []
        y = []
        if padded:
            # Every proper prefix (including the empty one) predicts the token
            # that follows it; prefixes are zero-padded to a common width.
            width = max((len(seq) for seq in full_sequences), default=0)
            for full_sequence in full_sequences:
                for i, target in enumerate(full_sequence):
                    X.append(pad_array(full_sequence[:i], width))
                    y.append(target)
        else:
            # Slide a window of seqs_length tokens over each sequence; the
            # token right after the window is the target. Sequences that are
            # not longer than the window contribute no samples.
            width = self.seqs_length
            for full_sequence in full_sequences:
                for i in range(len(full_sequence) - width):
                    X.append(full_sequence[i:i + width])
                    y.append(full_sequence[i + width])

        # Use the known width instead of len(X[0]) so an empty sample list
        # yields an empty array rather than an IndexError.
        X = np.reshape(X, (len(X), width, 1))
        y = to_categorical(y, num_classes=self.vocab_size())
        return X, y
|