mirror of
https://github.com/WallyS02/Song-Lyrics-Generator.git
synced 2024-11-20 09:38:50 +00:00
Interface change and minor improvement in data cleaning.
This commit is contained in:
commit
a2c1694adb
65
main.py
65
main.py
@ -1,7 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
from scrapper import scrap_data
|
from scrapper import scrap_data
|
||||||
from scrapper import clean_data
|
from markov_model import clean_data
|
||||||
from markov_model import create_markov_model
|
from markov_model import create_markov_model
|
||||||
from markov_model import generate_lyrics
|
from markov_model import generate_lyrics
|
||||||
|
|
||||||
@ -16,63 +16,44 @@ pink_floyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of
|
|||||||
"A Momentary Lapse Of Reason", "The Division Bell"]
|
"A Momentary Lapse Of Reason", "The Division Bell"]
|
||||||
|
|
||||||
time_stamp = 3.5
|
time_stamp = 3.5
|
||||||
|
path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
path = os.path.join(path, "Data")
|
||||||
|
|
||||||
|
|
||||||
|
def generate_song(name):
|
||||||
|
dataset = clean_data(os.path.join(path, name))
|
||||||
|
n_gram = int(input("Select number of words in Markov state: "))
|
||||||
|
number_of_verses = int(input("Select number of verses: "))
|
||||||
|
words_in_verses = int(int(input("Select number of words in verses: ")) / n_gram)
|
||||||
|
model = create_markov_model(dataset, n_gram)
|
||||||
|
print('\n')
|
||||||
|
for i in range(number_of_verses):
|
||||||
|
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses)
|
||||||
|
print(generated_lyrics)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print("Select data set to use in generation or other option:\n1. Pink Floyd lyrics generation\n2. Black Sabbath "
|
print("Select data set to use in generation or other option:\n1. Pink Floyd lyrics generation\n2. Black Sabbath "
|
||||||
"lyrics generation\n3. Bracia Figo Fagot\n4. Paktofonika\n5. Fused English (aka Pink Sabbath) lyrics "
|
"lyrics generation\n3. Bracia Figo Fagot\n4. Paktofonika\n5. Fused English (aka Pink Sabbath) lyrics "
|
||||||
"generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Prepare data\n8. Scrap data\n9. Exit")
|
"generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Scrap data\n8. Exit")
|
||||||
pink_floyd_dataset = None
|
|
||||||
black_sabbath_dataset = None
|
|
||||||
pink_sabbath_dataset = None
|
|
||||||
paktofonika_dataset = None
|
|
||||||
bracia_figo_fagot_dataset = None
|
|
||||||
braciofonika_pigo_pagot_dataset = None
|
|
||||||
while True:
|
while True:
|
||||||
selection = int(input())
|
selection = int(input())
|
||||||
match selection:
|
match selection:
|
||||||
case 1:
|
case 1:
|
||||||
model = create_markov_model(pink_floyd_dataset)
|
generate_song("Pink Floyd.csv")
|
||||||
for i in range(5):
|
|
||||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
|
||||||
print(generated_lyrics)
|
|
||||||
case 2:
|
case 2:
|
||||||
model = create_markov_model(black_sabbath_dataset)
|
generate_song("Black Sabbath.csv")
|
||||||
for i in range(5):
|
|
||||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
|
||||||
print(generated_lyrics)
|
|
||||||
case 3:
|
case 3:
|
||||||
model = create_markov_model(bracia_figo_fagot_dataset)
|
generate_song("Bracia Figo Fagot.csv")
|
||||||
for i in range(5):
|
|
||||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
|
||||||
print(generated_lyrics)
|
|
||||||
case 4:
|
case 4:
|
||||||
model = create_markov_model(paktofonika_dataset)
|
generate_song("Paktofonika.csv")
|
||||||
for i in range(5):
|
|
||||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
|
||||||
print(generated_lyrics)
|
|
||||||
case 5:
|
case 5:
|
||||||
model = create_markov_model(pink_sabbath_dataset)
|
generate_song("Pink Sabbath.csv")
|
||||||
for i in range(5):
|
|
||||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
|
||||||
print(generated_lyrics)
|
|
||||||
case 6:
|
case 6:
|
||||||
model = create_markov_model(braciofonika_pigo_pagot_dataset)
|
generate_song("Braciofonika Pigo Pagot.csv")
|
||||||
for i in range(5):
|
|
||||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
|
||||||
print(generated_lyrics)
|
|
||||||
case 7:
|
case 7:
|
||||||
path = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
path = os.path.join(path, "Data")
|
|
||||||
pink_floyd_dataset = clean_data(os.path.join(path, "Pink Floyd.csv"))
|
|
||||||
black_sabbath_dataset = clean_data(os.path.join(path, "Black Sabbath.csv"))
|
|
||||||
pink_sabbath_dataset = clean_data(os.path.join(path, "Pink Sabbath.csv"))
|
|
||||||
paktofonika_dataset = clean_data(os.path.join(path, "Paktofonika.csv"))
|
|
||||||
bracia_figo_fagot_dataset = clean_data(os.path.join(path, "Bracia Figo Fagot.csv"))
|
|
||||||
braciofonika_pigo_pagot_dataset = clean_data(os.path.join(path, "Braciofonika Pigo Pagot.csv"))
|
|
||||||
case 8:
|
|
||||||
scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp)
|
scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp)
|
||||||
case 9:
|
case 8:
|
||||||
break
|
break
|
||||||
print("\nCommand executed")
|
print("\nCommand executed")
|
||||||
|
|
||||||
|
@ -1,7 +1,33 @@
|
|||||||
import random
|
import random
|
||||||
|
import re
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
def create_markov_model(dataset, n_gram=2):
|
def clean_data(name):
|
||||||
|
document = pd.read_csv(name, usecols=["Lyrics"])
|
||||||
|
rows = document["Lyrics"].values.tolist()
|
||||||
|
dataset = []
|
||||||
|
for lyric in rows:
|
||||||
|
lyric = lyric.lower()
|
||||||
|
lyric = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", lyric)
|
||||||
|
lyric = re.sub(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)", "", lyric)
|
||||||
|
lyric = re.sub(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]", "", lyric)
|
||||||
|
lyric = re.sub(r"[A-Za-z0-9]+::", "", lyric)
|
||||||
|
lyric = re.sub(r"[A-Za-z0-9]+:", "", lyric)
|
||||||
|
lyric = re.sub(r"/[A-Za-z0-9]+", "", lyric)
|
||||||
|
lyric = re.sub(r"x[0-9]", "", lyric)
|
||||||
|
forbidden_words = ['chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction', 'verse', 'pre-chorus',
|
||||||
|
'post-chorus', 'bridge', 'outro', 'ref']
|
||||||
|
tokens = word_tokenize(lyric)
|
||||||
|
words = [word for word in tokens if word.isalpha()]
|
||||||
|
words = [word for word in words if word not in forbidden_words]
|
||||||
|
dataset += words
|
||||||
|
print(name.split('\\')[-1], "number of words in cleaned data: ", len(dataset))
|
||||||
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
|
def create_markov_model(dataset, n_gram):
|
||||||
markov_model = {}
|
markov_model = {}
|
||||||
for i in range(len(dataset) - n_gram - 1):
|
for i in range(len(dataset) - n_gram - 1):
|
||||||
current_state, next_state = "", ""
|
current_state, next_state = "", ""
|
||||||
@ -25,11 +51,12 @@ def create_markov_model(dataset, n_gram=2):
|
|||||||
return markov_model
|
return markov_model
|
||||||
|
|
||||||
|
|
||||||
def generate_lyrics(markov_model, start, limit=100):
|
def generate_lyrics(markov_model, start, limit):
|
||||||
n = 0
|
n = 0
|
||||||
current_state = start
|
current_state = start
|
||||||
lyrics = ""
|
lyrics = ""
|
||||||
lyrics += current_state + " "
|
lyrics += current_state + " "
|
||||||
|
lyrics = lyrics[0].upper() + lyrics[1:]
|
||||||
while n < limit:
|
while n < limit:
|
||||||
next_state = random.choices(list(markov_model[current_state].keys()),
|
next_state = random.choices(list(markov_model[current_state].keys()),
|
||||||
list(markov_model[current_state].values()))
|
list(markov_model[current_state].values()))
|
||||||
|
32
scrapper.py
32
scrapper.py
@ -4,26 +4,10 @@ from bs4 import BeautifulSoup
|
|||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import re
|
|
||||||
from nltk.tokenize import word_tokenize
|
|
||||||
from ScrapThread import ScrapThread
|
from ScrapThread import ScrapThread
|
||||||
from proxy_handling import proxies_validation
|
from proxy_handling import proxies_validation
|
||||||
|
|
||||||
|
|
||||||
def clean_data(name):
|
|
||||||
document = pd.read_csv(name, usecols=["Lyrics"])
|
|
||||||
rows = document["Lyrics"].values.tolist()
|
|
||||||
dataset = []
|
|
||||||
for lyric in rows:
|
|
||||||
lyric = lyric.lower()
|
|
||||||
lyric = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", lyric)
|
|
||||||
tokens = word_tokenize(lyric)
|
|
||||||
words = [word for word in tokens if word.isalpha()]
|
|
||||||
dataset += words
|
|
||||||
print(name.split('\\')[-1], ": ", len(dataset))
|
|
||||||
return dataset
|
|
||||||
|
|
||||||
|
|
||||||
def connect(url, proxies_list):
|
def connect(url, proxies_list):
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.30 (KHTML, like Gecko) "
|
'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.30 (KHTML, like Gecko) "
|
||||||
@ -156,19 +140,17 @@ def scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_s
|
|||||||
proxies_list = proxies_validation()
|
proxies_list = proxies_validation()
|
||||||
file = open("links.txt")
|
file = open("links.txt")
|
||||||
path = os.path.dirname(os.path.abspath(__file__))
|
path = os.path.dirname(os.path.abspath(__file__))
|
||||||
path = path + "\\Data\\"
|
path = os.path.join(path, "Data")
|
||||||
file.readline()
|
|
||||||
file.readline()
|
|
||||||
paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list)
|
|
||||||
figofagot = do_threading(file.readline(), [], 0.0, proxies_list)
|
|
||||||
braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True)
|
|
||||||
paktofonika.to_csv((path + "Paktofonika.csv"))
|
|
||||||
figofagot.to_csv((path + "Bracia Figo Fagot.csv"))
|
|
||||||
braciofonika_pigo_pagot.to_csv((path + "Braciofonika Pigo Pagot.csv"))
|
|
||||||
pink_floyd_data_frame = do_threading(file.readline()[0:-1], pink_floyd_selected_albums, time_stamp, proxies_list)
|
pink_floyd_data_frame = do_threading(file.readline()[0:-1], pink_floyd_selected_albums, time_stamp, proxies_list)
|
||||||
black_sabbath_data_frame = do_threading(file.readline(), black_sabbath_selected_albums, time_stamp, proxies_list)
|
black_sabbath_data_frame = do_threading(file.readline(), black_sabbath_selected_albums, time_stamp, proxies_list)
|
||||||
pink_sabbath_data_frame = pd.concat([pink_floyd_data_frame, black_sabbath_data_frame], ignore_index=True)
|
pink_sabbath_data_frame = pd.concat([pink_floyd_data_frame, black_sabbath_data_frame], ignore_index=True)
|
||||||
pink_floyd_data_frame.to_csv((path + "PinkFloyd.csv"))
|
pink_floyd_data_frame.to_csv((path + "PinkFloyd.csv"))
|
||||||
black_sabbath_data_frame.to_csv((path + "BlackSabbath.csv"))
|
black_sabbath_data_frame.to_csv((path + "BlackSabbath.csv"))
|
||||||
pink_sabbath_data_frame.to_csv((path + "PinkSabbath.csv"))
|
pink_sabbath_data_frame.to_csv((path + "PinkSabbath.csv"))
|
||||||
|
paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list)
|
||||||
|
figofagot = do_threading(file.readline(), [], 0.0, proxies_list)
|
||||||
|
braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True)
|
||||||
|
paktofonika.to_csv((path + "Paktofonika.csv"))
|
||||||
|
figofagot.to_csv((path + "Bracia Figo Fagot.csv"))
|
||||||
|
braciofonika_pigo_pagot.to_csv((path + "Braciofonika Pigo Pagot.csv"))
|
||||||
os.remove("valid_proxy_list")
|
os.remove("valid_proxy_list")
|
||||||
|
Loading…
Reference in New Issue
Block a user