Interface change and minor improvement in data cleaning.

This commit is contained in:
Sebastian Kutny 2023-03-28 00:21:13 +02:00
parent 9dd0b4f642
commit 07758a575c
3 changed files with 59 additions and 69 deletions

65
main.py
View File

@ -1,7 +1,7 @@
import os import os
import random import random
from scrapper import scrap_data from scrapper import scrap_data
from scrapper import clean_data from markov_model import clean_data
from markov_model import create_markov_model from markov_model import create_markov_model
from markov_model import generate_lyrics from markov_model import generate_lyrics
@ -16,63 +16,44 @@ pink_floyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of
"A Momentary Lapse Of Reason", "The Division Bell"] "A Momentary Lapse Of Reason", "The Division Bell"]
time_stamp = 3.5 time_stamp = 3.5
path = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(path, "Data")
def generate_song(name):
    """Interactively generate and print lyrics from one CSV data set.

    The file ``name`` is looked up inside the module-level ``path`` directory
    and cleaned with ``clean_data``. The user is then asked for the Markov
    state size, the number of verses and the number of words per verse, and
    one generated verse is printed per line.

    Args:
        name: File name of the data set inside ``path``,
            e.g. ``"Pink Floyd.csv"``.

    Raises:
        ValueError: If an answer is not an integer, or if the requested
            state size is smaller than 1.
    """
    dataset = clean_data(os.path.join(path, name))

    n_gram = int(input("Select number of words in Markov state: "))
    if n_gram < 1:
        # A state size of 0 would otherwise surface as a ZeroDivisionError in
        # the word-count conversion below; fail with a clear message instead.
        raise ValueError("Number of words in Markov state must be at least 1")

    number_of_verses = int(input("Select number of verses: "))
    # The requested word count is converted into a number of generation steps
    # (presumably because every generated state holds n_gram words).
    words_in_verses = int(input("Select number of words in verses: ")) // n_gram

    model = create_markov_model(dataset, n_gram)
    print('\n')

    # The set of states does not change between verses, so list it once.
    states = list(model)
    for _ in range(number_of_verses):
        print(generate_lyrics(model, random.choice(states), words_in_verses))
def main(): def main():
print("Select data set to use in generation or other option:\n1. Pink Floyd lyrics generation\n2. Black Sabbath " print("Select data set to use in generation or other option:\n1. Pink Floyd lyrics generation\n2. Black Sabbath "
"lyrics generation\n3. Bracia Figo Fagot\n4. Paktofonika\n5. Fused English (aka Pink Sabbath) lyrics " "lyrics generation\n3. Bracia Figo Fagot\n4. Paktofonika\n5. Fused English (aka Pink Sabbath) lyrics "
"generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Prepare data\n8. Scrap data\n9. Exit") "generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Scrap data\n8. Exit")
pink_floyd_dataset = None
black_sabbath_dataset = None
pink_sabbath_dataset = None
paktofonika_dataset = None
bracia_figo_fagot_dataset = None
braciofonika_pigo_pagot_dataset = None
while True: while True:
selection = int(input()) selection = int(input())
match selection: match selection:
case 1: case 1:
model = create_markov_model(pink_floyd_dataset) generate_song("Pink Floyd.csv")
for i in range(5):
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
print(generated_lyrics)
case 2: case 2:
model = create_markov_model(black_sabbath_dataset) generate_song("Black Sabbath.csv")
for i in range(5):
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
print(generated_lyrics)
case 3: case 3:
model = create_markov_model(bracia_figo_fagot_dataset) generate_song("Bracia Figo Fagot.csv")
for i in range(5):
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
print(generated_lyrics)
case 4: case 4:
model = create_markov_model(paktofonika_dataset) generate_song("Paktofonika.csv")
for i in range(5):
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
print(generated_lyrics)
case 5: case 5:
model = create_markov_model(pink_sabbath_dataset) generate_song("Pink Sabbath.csv")
for i in range(5):
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
print(generated_lyrics)
case 6: case 6:
model = create_markov_model(braciofonika_pigo_pagot_dataset) generate_song("Braciofonika Pigo Pagot.csv")
for i in range(5):
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10)
print(generated_lyrics)
case 7: case 7:
path = os.path.dirname(os.path.abspath(__file__))
path = path + "\\Data\\"
pink_floyd_dataset = clean_data((path + "Pink Floyd.csv"))
black_sabbath_dataset = clean_data((path + "Black Sabbath.csv"))
pink_sabbath_dataset = clean_data((path + "Pink Sabbath.csv"))
paktofonika_dataset = clean_data((path + "Paktofonika.csv"))
bracia_figo_fagot_dataset = clean_data((path + "Bracia Figo Fagot.csv"))
braciofonika_pigo_pagot_dataset = clean_data((path + "Braciofonika Pigo Pagot.csv"))
case 8:
scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp) scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp)
case 9: case 8:
break break
print("\nCommand executed") print("\nCommand executed")

View File

@ -1,7 +1,33 @@
import random import random
import re
from nltk.tokenize import word_tokenize
import pandas as pd
def clean_data(name):
    """Load the "Lyrics" column of a CSV file and flatten it into a word list.

    Every lyric is lower-cased, lyric-sheet markup (parenthesised or
    bracketed annotations, ``label:`` prefixes, ``/word`` alternatives and
    ``x2``-style repeat markers) and punctuation are removed, and the rest is
    tokenized. Only purely alphabetic tokens that are not section labels
    (chorus, verse, ...) are kept.

    A one-line summary with the file name and the number of kept words is
    printed as a side effect.

    Args:
        name: Path of the CSV file; it must contain a "Lyrics" column.

    Returns:
        A list with the cleaned words of all lyrics, in file order.
    """
    import os  # local import: only needed for the summary line below

    # Section labels that belong to the lyric sheet, not to the lyrics.
    # The hyphen-less spellings are listed as well because hyphens are
    # stripped before the tokens are compared against this set.
    forbidden_words = {'chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction', 'verse', 'pre-chorus',
                       'post-chorus', 'bridge', 'outro', 'ref', 'prechorus', 'postchorus'}

    # These patterns rely on brackets, colons and slashes still being present,
    # so they have to run BEFORE the generic punctuation strip; applied after
    # it they could never match.
    markup_patterns = [
        re.compile(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)"),
        re.compile(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]"),
        re.compile(r"[A-Za-z0-9]+::"),
        re.compile(r"[A-Za-z0-9]+:"),
        re.compile(r"/[A-Za-z0-9]+"),
    ]
    # The hyphen is escaped so that it is a literal; unescaped, "=-\\" formed
    # a character range from "=" to "\" and "-" itself was never removed.
    punctuation = re.compile(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=\-\\]")
    repeat_marker = re.compile(r"x[0-9]")

    document = pd.read_csv(name, usecols=["Lyrics"])
    dataset = []
    # Empty cells are read as NaN (a float) and would break .lower(); skip them.
    for lyric in document["Lyrics"].dropna().astype(str):
        lyric = lyric.lower()
        for pattern in markup_patterns:
            lyric = pattern.sub("", lyric)
        lyric = punctuation.sub("", lyric)
        lyric = repeat_marker.sub("", lyric)
        dataset.extend(
            word for word in word_tokenize(lyric)
            if word.isalpha() and word not in forbidden_words
        )

    # basename copes with whatever separator the running platform uses.
    print(os.path.basename(name), "number of words in cleaned data: ", len(dataset))
    return dataset
def create_markov_model(dataset, n_gram):
markov_model = {} markov_model = {}
for i in range(len(dataset) - n_gram - 1): for i in range(len(dataset) - n_gram - 1):
current_state, next_state = "", "" current_state, next_state = "", ""
@ -25,11 +51,12 @@ def create_markov_model(dataset, n_gram=2):
return markov_model return markov_model
def generate_lyrics(markov_model, start, limit=100): def generate_lyrics(markov_model, start, limit):
n = 0 n = 0
current_state = start current_state = start
lyrics = "" lyrics = ""
lyrics += current_state + " " lyrics += current_state + " "
lyrics = lyrics[0].upper() + lyrics[1:]
while n < limit: while n < limit:
next_state = random.choices(list(markov_model[current_state].keys()), next_state = random.choices(list(markov_model[current_state].keys()),
list(markov_model[current_state].values())) list(markov_model[current_state].values()))

View File

@ -4,26 +4,10 @@ from bs4 import BeautifulSoup
import requests import requests
import os import os
import time import time
import re
from nltk.tokenize import word_tokenize
from ScrapThread import ScrapThread from ScrapThread import ScrapThread
from proxy_handling import proxies_validation from proxy_handling import proxies_validation
def clean_data(name):
    """Load the "Lyrics" column of a CSV file and flatten it into a word list.

    Every lyric is lower-cased, stripped of punctuation and tokenized; only
    purely alphabetic tokens are kept. A one-line summary with the file name
    and the number of kept words is printed as a side effect.

    Args:
        name: Path of the CSV file; it must contain a "Lyrics" column.

    Returns:
        A list with the cleaned words of all lyrics, in file order.
    """
    # The hyphen is escaped so that it is a literal; unescaped, "=-\\" formed
    # a character range from "=" to "\" and "-" itself was never removed.
    punctuation = re.compile(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=\-\\]")

    document = pd.read_csv(name, usecols=["Lyrics"])
    dataset = []
    # Empty cells are read as NaN (a float) and would break .lower(); skip them.
    for lyric in document["Lyrics"].dropna().astype(str):
        cleaned = punctuation.sub("", lyric.lower())
        dataset.extend(word for word in word_tokenize(cleaned) if word.isalpha())

    # basename copes with whatever separator the running platform uses.
    print(os.path.basename(name), ": ", len(dataset))
    return dataset
def connect(url, proxies_list): def connect(url, proxies_list):
headers = { headers = {
'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.30 (KHTML, like Gecko) " 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.30 (KHTML, like Gecko) "
@ -156,19 +140,17 @@ def scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_s
proxies_list = proxies_validation() proxies_list = proxies_validation()
file = open("links.txt") file = open("links.txt")
path = os.path.dirname(os.path.abspath(__file__)) path = os.path.dirname(os.path.abspath(__file__))
path = path + "\\Data\\" path = os.path.join(path, "Data")
file.readline()
file.readline()
paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list)
figofagot = do_threading(file.readline(), [], 0.0, proxies_list)
braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True)
paktofonika.to_csv((path + "Paktofonika.csv"))
figofagot.to_csv((path + "Bracia Figo Fagot.csv"))
braciofonika_pigo_pagot.to_csv((path + "Braciofonika Pigo Pagot.csv"))
pink_floyd_data_frame = do_threading(file.readline()[0:-1], pink_floyd_selected_albums, time_stamp, proxies_list) pink_floyd_data_frame = do_threading(file.readline()[0:-1], pink_floyd_selected_albums, time_stamp, proxies_list)
black_sabbath_data_frame = do_threading(file.readline(), black_sabbath_selected_albums, time_stamp, proxies_list) black_sabbath_data_frame = do_threading(file.readline(), black_sabbath_selected_albums, time_stamp, proxies_list)
pink_sabbath_data_frame = pd.concat([pink_floyd_data_frame, black_sabbath_data_frame], ignore_index=True) pink_sabbath_data_frame = pd.concat([pink_floyd_data_frame, black_sabbath_data_frame], ignore_index=True)
pink_floyd_data_frame.to_csv((path + "PinkFloyd.csv")) pink_floyd_data_frame.to_csv((path + "PinkFloyd.csv"))
black_sabbath_data_frame.to_csv((path + "BlackSabbath.csv")) black_sabbath_data_frame.to_csv((path + "BlackSabbath.csv"))
pink_sabbath_data_frame.to_csv((path + "PinkSabbath.csv")) pink_sabbath_data_frame.to_csv((path + "PinkSabbath.csv"))
paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list)
figofagot = do_threading(file.readline(), [], 0.0, proxies_list)
braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True)
paktofonika.to_csv((path + "Paktofonika.csv"))
figofagot.to_csv((path + "Bracia Figo Fagot.csv"))
braciofonika_pigo_pagot.to_csv((path + "Braciofonika Pigo Pagot.csv"))
os.remove("valid_proxy_list") os.remove("valid_proxy_list")