From 07758a575c69e45564398424ad2359519ed92575 Mon Sep 17 00:00:00 2001 From: Sebastian Kutny Date: Tue, 28 Mar 2023 00:21:13 +0200 Subject: [PATCH] Interface change and minor improvement in data cleaning. --- main.py | 65 +++++++++++++++++-------------------------------- markov_model.py | 31 +++++++++++++++++++++-- scrapper.py | 32 ++++++------------------ 3 files changed, 59 insertions(+), 69 deletions(-) diff --git a/main.py b/main.py index 799c16f..b276e27 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,7 @@ import os import random from scrapper import scrap_data -from scrapper import clean_data +from markov_model import clean_data from markov_model import create_markov_model from markov_model import generate_lyrics @@ -16,63 +16,44 @@ pink_floyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of "A Momentary Lapse Of Reason", "The Division Bell"] time_stamp = 3.5 +path = os.path.dirname(os.path.abspath(__file__)) +path = os.path.join(path, "Data") + + +def generate_song(name): + dataset = clean_data(os.path.join(path, name)) + n_gram = int(input("Select number of words in Markov state: ")) + number_of_verses = int(input("Select number of verses: ")) + words_in_verses = int(int(input("Select number of words in verses: ")) / n_gram) + model = create_markov_model(dataset, n_gram) + print('\n') + for i in range(number_of_verses): + generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses) + print(generated_lyrics) def main(): print("Select data set to use in generation or other option:\n1. Pink Floyd lyrics generation\n2. Black Sabbath " "lyrics generation\n3. Bracia Figo Fagot\n4. Paktofonika\n5. Fused English (aka Pink Sabbath) lyrics " - "generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Prepare data\n8. Scrap data\n9. 
Exit") - pink_floyd_dataset = None - black_sabbath_dataset = None - pink_sabbath_dataset = None - paktofonika_dataset = None - bracia_figo_fagot_dataset = None - braciofonika_pigo_pagot_dataset = None + "generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Scrap data\n8. Exit") while True: selection = int(input()) match selection: case 1: - model = create_markov_model(pink_floyd_dataset) - for i in range(5): - generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10) - print(generated_lyrics) + generate_song("Pink Floyd.csv") case 2: - model = create_markov_model(black_sabbath_dataset) - for i in range(5): - generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10) - print(generated_lyrics) + generate_song("Black Sabbath.csv") case 3: - model = create_markov_model(bracia_figo_fagot_dataset) - for i in range(5): - generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10) - print(generated_lyrics) + generate_song("Bracia Figo Fagot.csv") case 4: - model = create_markov_model(paktofonika_dataset) - for i in range(5): - generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10) - print(generated_lyrics) + generate_song("Paktofonika.csv") case 5: - model = create_markov_model(pink_sabbath_dataset) - for i in range(5): - generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10) - print(generated_lyrics) + generate_song("Pink Sabbath.csv") case 6: - model = create_markov_model(braciofonika_pigo_pagot_dataset) - for i in range(5): - generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), 10) - print(generated_lyrics) + generate_song("Braciofonika Pigo Pagot.csv") case 7: - path = os.path.dirname(os.path.abspath(__file__)) - path = path + "\\Data\\" - pink_floyd_dataset = clean_data((path + "Pink Floyd.csv")) - black_sabbath_dataset = clean_data((path + "Black Sabbath.csv")) - pink_sabbath_dataset = clean_data((path + "Pink 
Sabbath.csv")) - paktofonika_dataset = clean_data((path + "Paktofonika.csv")) - bracia_figo_fagot_dataset = clean_data((path + "Bracia Figo Fagot.csv")) - braciofonika_pigo_pagot_dataset = clean_data((path + "Braciofonika Pigo Pagot.csv")) - case 8: scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp) - case 9: + case 8: break print("\nCommand executed") diff --git a/markov_model.py b/markov_model.py index 785a6c5..904e618 100644 --- a/markov_model.py +++ b/markov_model.py @@ -1,7 +1,33 @@ import random +import re +from nltk.tokenize import word_tokenize +import pandas as pd -def create_markov_model(dataset, n_gram=2): +def clean_data(name): + document = pd.read_csv(name, usecols=["Lyrics"]) + rows = document["Lyrics"].values.tolist() + dataset = [] + for lyric in rows: + lyric = lyric.lower() + lyric = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", lyric) + lyric = re.sub(r"\([A-Za-z0-9:\s\.\?\,\&\*]+\)", "", lyric) + lyric = re.sub(r"\[[A-Za-z0-9:\s\.\?\,\&\*]+\]", "", lyric) + lyric = re.sub(r"[A-Za-z0-9]+::", "", lyric) + lyric = re.sub(r"[A-Za-z0-9]+:", "", lyric) + lyric = re.sub(r"/[A-Za-z0-9]+", "", lyric) + lyric = re.sub(r"x[0-9]", "", lyric) + forbidden_words = ['chorus', 'refrain', 'coda', 'solo', 'intro', 'introduction', 'verse', 'pre-chorus', + 'post-chorus', 'bridge', 'outro', 'ref'] + tokens = word_tokenize(lyric) + words = [word for word in tokens if word.isalpha()] + words = [word for word in words if word not in forbidden_words] + dataset += words + print(name.split('\\')[-1], "number of words in cleaned data: ", len(dataset)) + return dataset + + +def create_markov_model(dataset, n_gram): markov_model = {} for i in range(len(dataset) - n_gram - 1): current_state, next_state = "", "" @@ -25,11 +51,12 @@ def create_markov_model(dataset, n_gram=2): return markov_model -def generate_lyrics(markov_model, start, limit=100): +def generate_lyrics(markov_model, start, limit): n = 0 current_state = start lyrics = "" lyrics 
+= current_state + " " + lyrics = lyrics[0].upper() + lyrics[1:] while n < limit: next_state = random.choices(list(markov_model[current_state].keys()), list(markov_model[current_state].values())) diff --git a/scrapper.py b/scrapper.py index 1b65f26..daa541c 100644 --- a/scrapper.py +++ b/scrapper.py @@ -4,26 +4,10 @@ from bs4 import BeautifulSoup import requests import os import time -import re -from nltk.tokenize import word_tokenize from ScrapThread import ScrapThread from proxy_handling import proxies_validation -def clean_data(name): - document = pd.read_csv(name, usecols=["Lyrics"]) - rows = document["Lyrics"].values.tolist() - dataset = [] - for lyric in rows: - lyric = lyric.lower() - lyric = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", lyric) - tokens = word_tokenize(lyric) - words = [word for word in tokens if word.isalpha()] - dataset += words - print(name.split('\\')[-1], ": ", len(dataset)) - return dataset - - def connect(url, proxies_list): headers = { 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.30 (KHTML, like Gecko) " @@ -156,19 +140,17 @@ def scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_s proxies_list = proxies_validation() file = open("links.txt") path = os.path.dirname(os.path.abspath(__file__)) - path = path + "\\Data\\" - file.readline() - file.readline() - paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list) - figofagot = do_threading(file.readline(), [], 0.0, proxies_list) - braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True) - paktofonika.to_csv((path + "Paktofonika.csv")) - figofagot.to_csv((path + "Bracia Figo Fagot.csv")) - braciofonika_pigo_pagot.to_csv((path + "Braciofonika Pigo Pagot.csv")) + path = os.path.join(path, "Data") pink_floyd_data_frame = do_threading(file.readline()[0:-1], pink_floyd_selected_albums, time_stamp, proxies_list) black_sabbath_data_frame = do_threading(file.readline(), 
black_sabbath_selected_albums, time_stamp, proxies_list)
     pink_sabbath_data_frame = pd.concat([pink_floyd_data_frame, black_sabbath_data_frame], ignore_index=True)
-    pink_floyd_data_frame.to_csv((path + "PinkFloyd.csv"))
-    black_sabbath_data_frame.to_csv((path + "BlackSabbath.csv"))
-    pink_sabbath_data_frame.to_csv((path + "PinkSabbath.csv"))
+    pink_floyd_data_frame.to_csv(os.path.join(path, "Pink Floyd.csv"))
+    black_sabbath_data_frame.to_csv(os.path.join(path, "Black Sabbath.csv"))
+    pink_sabbath_data_frame.to_csv(os.path.join(path, "Pink Sabbath.csv"))
+    paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list)
+    figofagot = do_threading(file.readline(), [], 0.0, proxies_list)
+    braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True)
+    paktofonika.to_csv(os.path.join(path, "Paktofonika.csv"))
+    figofagot.to_csv(os.path.join(path, "Bracia Figo Fagot.csv"))
+    braciofonika_pigo_pagot.to_csv(os.path.join(path, "Braciofonika Pigo Pagot.csv"))
     os.remove("valid_proxy_list")