mirror of
https://github.com/WallyS02/Song-Lyrics-Generator.git
synced 2025-01-18 08:19:19 +00:00
Interface change and minor improvement in data cleaning.
This commit is contained in:
commit
a2c1694adb
65
main.py
65
main.py
@ -1,7 +1,7 @@
|
||||
import os
|
||||
import random
|
||||
from scrapper import scrap_data
|
||||
from scrapper import clean_data
|
||||
from markov_model import clean_data
|
||||
from markov_model import create_markov_model
|
||||
from markov_model import generate_lyrics
|
||||
|
||||
@ -16,63 +16,44 @@ pink_floyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of
|
||||
"A Momentary Lapse Of Reason", "The Division Bell"]
|
||||
|
||||
time_stamp = 3.5
|
||||
path = os.path.dirname(os.path.abspath(__file__))
|
||||
path = os.path.join(path, "Data")
|
||||
|
||||
|
||||
def generate_song(name):
    """Interactively generate and print verses from one lyrics data set.

    The CSV file called *name* inside the module-level ``path`` directory is
    cleaned with ``clean_data``; the user is then prompted for the Markov
    state size, the number of verses and the number of words per verse.
    One generated verse is printed per requested verse.

    Args:
        name: File name of the CSV data set, relative to ``path``.
    """
    corpus = clean_data(os.path.join(path, name))

    state_size = int(input("Select number of words in Markov state: "))
    verse_count = int(input("Select number of verses: "))
    requested_words = int(input("Select number of words in verses: "))
    # The requested word count is divided by the state size before being
    # passed to generate_lyrics as its limit.
    steps_per_verse = int(requested_words / state_size)

    chain = create_markov_model(corpus, state_size)
    print('\n')
    for _ in range(verse_count):
        # Every verse starts from a randomly chosen state of the model.
        seed_state = random.choice(list(chain.keys()))
        verse = generate_lyrics(chain, seed_state, steps_per_verse)
        print(verse)
|
||||
|
||||
|
||||
def main():
|
||||
print("Select data set to use in generation or other option:\n1. Pink Floyd lyrics generation\n2. Black Sabbath "
|
||||
"lyrics generation\n3. Bracia Figo Fagot\n4. Paktofonika\n5. Fused English (aka Pink Sabbath) lyrics "
|
||||
"generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Prepare data\n8. Scrap data\n9. Exit")
|
||||
pink_floyd_dataset = None
|
||||
black_sabbath_dataset = None
|
||||
pink_sabbath_dataset = None
|
||||
paktofonika_dataset = None
|
||||
bracia_figo_fagot_dataset = None
|
||||
braciofonika_pigo_pagot_dataset = None
|
||||
"generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Scrap data\n8. Exit")
|
||||
while True:
|
||||
selection = int(input())
|
||||
match selection:
|
||||
case 1:
|
||||
model = create_markov_model(pink_floyd_dataset)
|
||||
for i in range(5):
|
||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
||||
print(generated_lyrics)
|
||||
generate_song("Pink Floyd.csv")
|
||||
case 2:
|
||||
model = create_markov_model(black_sabbath_dataset)
|
||||
for i in range(5):
|
||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
||||
print(generated_lyrics)
|
||||
generate_song("Black Sabbath.csv")
|
||||
case 3:
|
||||
model = create_markov_model(bracia_figo_fagot_dataset)
|
||||
for i in range(5):
|
||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
||||
print(generated_lyrics)
|
||||
generate_song("Bracia Figo Fagot.csv")
|
||||
case 4:
|
||||
model = create_markov_model(paktofonika_dataset)
|
||||
for i in range(5):
|
||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
||||
print(generated_lyrics)
|
||||
generate_song("Paktofonika.csv")
|
||||
case 5:
|
||||
model = create_markov_model(pink_sabbath_dataset)
|
||||
for i in range(5):
|
||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
||||
print(generated_lyrics)
|
||||
generate_song("Pink Sabbath.csv")
|
||||
case 6:
|
||||
model = create_markov_model(braciofonika_pigo_pagot_dataset)
|
||||
for i in range(5):
|
||||
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
|
||||
print(generated_lyrics)
|
||||
generate_song("Braciofonika Pigo Pagot.csv")
|
||||
case 7:
|
||||
path = os.path.dirname(os.path.abspath(__file__))
|
||||
path = os.path.join(path, "Data")
|
||||
pink_floyd_dataset = clean_data(os.path.join(path, "Pink Floyd.csv"))
|
||||
black_sabbath_dataset = clean_data(os.path.join(path, "Black Sabbath.csv"))
|
||||
pink_sabbath_dataset = clean_data(os.path.join(path, "Pink Sabbath.csv"))
|
||||
paktofonika_dataset = clean_data(os.path.join(path, "Paktofonika.csv"))
|
||||
bracia_figo_fagot_dataset = clean_data(os.path.join(path, "Bracia Figo Fagot.csv"))
|
||||
braciofonika_pigo_pagot_dataset = clean_data(os.path.join(path, "Braciofonika Pigo Pagot.csv"))
|
||||
case 8:
|
||||
scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp)
|
||||
case 9:
|
||||
case 8:
|
||||
break
|
||||
print("\nCommand executed")
|
||||
|
||||
|
@ -1,7 +1,33 @@
|
||||
import random
|
||||
import re
|
||||
from nltk.tokenize import word_tokenize
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def create_markov_model(dataset, n_gram=2):
|
||||
def clean_data(name):
    """Load the ``Lyrics`` column of a CSV file and return its cleaned words.

    Every lyric is lower-cased, has its annotations removed (parenthesised
    and bracketed asides, ``label:`` / ``label::`` prefixes, ``/word``
    fragments and ``x2``-style repeat markers), is stripped of punctuation,
    tokenised with ``word_tokenize`` and reduced to purely alphabetic tokens
    that are not section markers such as "chorus" or "verse".

    Args:
        name: Path of the CSV file; it must contain a ``Lyrics`` column.

    Returns:
        A flat list of lower-case words from all lyrics, in file order.
    """
    # Section markers that are not part of the lyrics themselves. Built once,
    # as a set, instead of being re-created for every lyric.
    forbidden_words = {'chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction', 'verse', 'pre-chorus',
                       'post-chorus', 'bridge', 'outro', 'ref'}

    # Annotation patterns. They must run BEFORE the punctuation strip below:
    # that strip deletes the "(", ")", "[", ":" and "/" characters these
    # patterns anchor on, so applying it first would leave them unable to match.
    annotation_patterns = (
        r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)",
        r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]",
        r"[A-Za-z0-9]+::",
        r"[A-Za-z0-9]+:",
        r"/[A-Za-z0-9]+",
        r"x[0-9]",
    )
    # NOTE(review): "=-\\" inside this class is a character range ("=" through
    # "\"), so a literal "-" is not removed here. Kept unchanged on purpose;
    # confirm whether hyphens were meant to be stripped.
    punctuation_pattern = r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]"

    document = pd.read_csv(name, usecols=["Lyrics"])
    rows = document["Lyrics"].values.tolist()
    dataset = []
    for lyric in rows:
        # pandas represents empty cells as NaN (a float); skip anything that
        # is not text instead of crashing on .lower().
        if not isinstance(lyric, str):
            continue
        lyric = lyric.lower()
        for pattern in annotation_patterns:
            lyric = re.sub(pattern, "", lyric)
        lyric = re.sub(punctuation_pattern, "", lyric)
        tokens = word_tokenize(lyric)
        dataset.extend(word for word in tokens
                       if word.isalpha() and word not in forbidden_words)

    # Show only the file name, whichever path separator the caller used.
    file_name = re.split(r"[\\/]", name)[-1]
    print(file_name, "number of words in cleaned data: ", len(dataset))
    return dataset
|
||||
|
||||
|
||||
def create_markov_model(dataset, n_gram):
|
||||
markov_model = {}
|
||||
for i in range(len(dataset) - n_gram - 1):
|
||||
current_state, next_state = "", ""
|
||||
@ -25,11 +51,12 @@ def create_markov_model(dataset, n_gram=2):
|
||||
return markov_model
|
||||
|
||||
|
||||
def generate_lyrics(markov_model, start, limit=100):
|
||||
def generate_lyrics(markov_model, start, limit):
|
||||
n = 0
|
||||
current_state = start
|
||||
lyrics = ""
|
||||
lyrics += current_state + " "
|
||||
lyrics = lyrics[0].upper() + lyrics[1:]
|
||||
while n < limit:
|
||||
next_state = random.choices(list(markov_model[current_state].keys()),
|
||||
list(markov_model[current_state].values()))
|
||||
|
32
scrapper.py
32
scrapper.py
@ -4,26 +4,10 @@ from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
from nltk.tokenize import word_tokenize
|
||||
from ScrapThread import ScrapThread
|
||||
from proxy_handling import proxies_validation
|
||||
|
||||
|
||||
def clean_data(name):
    """Read the ``Lyrics`` column of a CSV file and return its alphabetic words.

    Each lyric is lower-cased, passed through the punctuation-stripping
    regex and tokenised with ``word_tokenize``; only tokens for which
    ``str.isalpha()`` is true are kept. The resulting word count is printed
    next to the last backslash-separated component of *name*.

    Args:
        name: Path of the CSV file; it must contain a ``Lyrics`` column.

    Returns:
        A flat list of lower-case words from all lyrics, in file order.
    """
    lyrics_column = pd.read_csv(name, usecols=["Lyrics"])["Lyrics"]
    collected = []
    for text in lyrics_column.values.tolist():
        stripped = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", text.lower())
        collected.extend(token for token in word_tokenize(stripped) if token.isalpha())
    file_label = name.split('\\')[-1]
    print(file_label, ": ", len(collected))
    return collected
|
||||
|
||||
|
||||
def connect(url, proxies_list):
|
||||
headers = {
|
||||
'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.30 (KHTML, like Gecko) "
|
||||
@ -156,19 +140,17 @@ def scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_s
|
||||
proxies_list = proxies_validation()
|
||||
file = open("links.txt")
|
||||
path = os.path.dirname(os.path.abspath(__file__))
|
||||
path = path + "\\Data\\"
|
||||
file.readline()
|
||||
file.readline()
|
||||
paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list)
|
||||
figofagot = do_threading(file.readline(), [], 0.0, proxies_list)
|
||||
braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True)
|
||||
paktofonika.to_csv((path + "Paktofonika.csv"))
|
||||
figofagot.to_csv((path + "Bracia Figo Fagot.csv"))
|
||||
braciofonika_pigo_pagot.to_csv((path + "Braciofonika Pigo Pagot.csv"))
|
||||
path = os.path.join(path, "Data")
|
||||
pink_floyd_data_frame = do_threading(file.readline()[0:-1], pink_floyd_selected_albums, time_stamp, proxies_list)
|
||||
black_sabbath_data_frame = do_threading(file.readline(), black_sabbath_selected_albums, time_stamp, proxies_list)
|
||||
pink_sabbath_data_frame = pd.concat([pink_floyd_data_frame, black_sabbath_data_frame], ignore_index=True)
|
||||
pink_floyd_data_frame.to_csv((path + "PinkFloyd.csv"))
|
||||
black_sabbath_data_frame.to_csv((path + "BlackSabbath.csv"))
|
||||
pink_sabbath_data_frame.to_csv((path + "PinkSabbath.csv"))
|
||||
paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list)
|
||||
figofagot = do_threading(file.readline(), [], 0.0, proxies_list)
|
||||
braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True)
|
||||
paktofonika.to_csv((path + "Paktofonika.csv"))
|
||||
figofagot.to_csv((path + "Bracia Figo Fagot.csv"))
|
||||
braciofonika_pigo_pagot.to_csv((path + "Braciofonika Pigo Pagot.csv"))
|
||||
os.remove("valid_proxy_list")
|
||||
|
Loading…
x
Reference in New Issue
Block a user