From afeb9d579bc49f562f76a8fa8c988a859fac2600 Mon Sep 17 00:00:00 2001 From: Sebastian Kutny Date: Tue, 28 Mar 2023 15:08:23 +0200 Subject: [PATCH] Interface improvements. --- main.py | 79 +++++++++++++++++++++++++++++++++++++------------ markov_model.py | 28 ++++++++++++++++-- scrapper.py | 25 ++++++---------- 3 files changed, 94 insertions(+), 38 deletions(-) diff --git a/main.py b/main.py index b276e27..aaf8529 100644 --- a/main.py +++ b/main.py @@ -1,16 +1,19 @@ import os import random + +import pandas as pd + from scrapper import scrap_data from markov_model import clean_data from markov_model import create_markov_model from markov_model import generate_lyrics -black_sabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath", +blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath", "Sabotage", "Technical Ecstasy", "Never Say Die!", "Heaven And Hell", "Mob Rules", "Born Again", "Seventh Star", "The Eternal Idol", "Headless Cross", "Tyr", "Dehumanizer", "Cross Purposes", "Forbidden", "13"] -pink_floyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of Secrets", "Meddle", "More", "Ummagumma", +pinkfloyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of Secrets", "Meddle", "More", "Ummagumma", "Atom Heart Mother", "Obscured By Clouds", "The Dark Side Of The Moon", "Wish You Were Here", "Animals", "The Wall", "The Final Cut", "A Momentary Lapse Of Reason", "The Division Bell"] @@ -24,36 +27,74 @@ def generate_song(name): dataset = clean_data(os.path.join(path, name)) n_gram = int(input("Select number of words in Markov state: ")) number_of_verses = int(input("Select number of verses: ")) - words_in_verses = int(int(input("Select number of words in verses: ")) / n_gram) - model = create_markov_model(dataset, n_gram) + words_in_verses = int((int(input("Select number of words in verses: ")) - 1) / n_gram) + degree_of_chain = int(input("Select degree of chain: ")) + model = create_markov_model(dataset, n_gram, degree_of_chain) print('\n') + last_state = random.choice(list(model.keys())) for i in range(number_of_verses): - generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses) + generated_lyrics, last_state = generate_lyrics(model, last_state, words_in_verses) print(generated_lyrics) + last_state = random.choices(list(model[last_state].keys()), + list(model[last_state].values()))[0] + + +def scraping(): + with open("links.txt", "r") as f: + lines = f.readlines() + for i in range(len(lines)): + if i != (len(lines) - 1): + print(str(i) + ".", lines[i][:-1]) + else: + print(str(i) + ".", lines[i]) + line_index = int(input("Select url to scrap: ")) + url = lines[line_index] + if line_index != (len(lines) - 1): + url = url[:-1] + if url.split('/')[2] == 'www.azlyrics.com': + selected_albums_name = url.split('/')[4][:-5] + "_selected_albums" + if selected_albums_name in globals(): + selected_albums = globals()[selected_albums_name] + scrap_data(url, selected_albums, time_stamp) + else: + print("Define selected albums in global list variable in format: bandname_selected_albums") + return + if url.split('/')[2] == 'www.tekstowo.pl': + scrap_data(url, [], 0.0) + + +def merging(): + name1 = input("Select first band file: ") + if os.path.exists(path + name1): + df1 = pd.read_csv(path + name1) + else: + print("No such file in directory!") + return + name2 = input("Select second band file: ") + if os.path.exists(path + name2): + df2 = pd.read_csv(path + name2) + else: + print("No such file in directory!") + return + dfResult = pd.concat([df1, df2], ignore_index=True) + result_name = input("Select name of result file: ") + dfResult.to_csv(path + result_name) def main(): - print("Select data set to use in generation or other option:\n1. Pink Floyd lyrics generation\n2. Black Sabbath " - "lyrics generation\n3. Bracia Figo Fagot\n4. Paktofonika\n5. Fused English (aka Pink Sabbath) lyrics " - "generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Scrap data\n8. Exit") + print("Select data set to use in generation or other option:\n1. Generate text based on input filename\n2. Scrap " + "data\n3. Merge CSV band's songs\n4. Exit") while True: selection = int(input()) match selection: case 1: - generate_song("Pink Floyd.csv") + name = input("Select name of data file: ") + generate_song(name) case 2: - generate_song("Black Sabbath.csv") + scraping() case 3: - generate_song("Bracia Figo Fagot.csv") + merging() case 4: - generate_song("Paktofonika.csv") - case 5: - generate_song("Pink Sabbath.csv") - case 6: - generate_song("Braciofonika Pigo Pagot.csv") - case 7: - scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp) - case 8: break print("\nCommand executed") diff --git a/markov_model.py b/markov_model.py index 904e618..f426a6c 100644 --- a/markov_model.py +++ b/markov_model.py @@ -2,6 +2,8 @@ import random import re from nltk.tokenize import word_tokenize import pandas as pd +import numpy as np +from scipy import sparse def clean_data(name): @@ -27,9 +29,9 @@ def clean_data(name): return dataset -def create_markov_model(dataset, n_gram): +def create_markov_model(dataset, n_gram, n_step): markov_model = {} - for i in range(len(dataset) - n_gram - 1): + for i in range(len(dataset) - 1 - 2 * n_gram): current_state, next_state = "", "" for j in range(n_gram): current_state += dataset[i + j] + " " @@ -48,6 +50,26 @@ def create_markov_model(dataset, n_gram): total = sum(transition.values()) for state, count in transition.items(): markov_model[current_state][state] = count / total + """matrix = [[0 for _ in range(len(markov_model.items()))] for _ in range(int(len(markov_model.items())))] + for current_state, transition in markov_model.items(): + tempRow = list(markov_model.items()) + indexRow = [idx for idx, key in enumerate(tempRow) if key[0] == current_state] + total = sum(transition.values()) + for state, count in transition.items(): + tempCol = list(transition.items()) + indexCol = [idx for idx, key in enumerate(tempCol) if key[0] == state] + markov_model[current_state][state] = count / total + matrix[indexRow[0]][indexCol[0]] = markov_model[current_state][state] + matrix = np.array(matrix) + for i in range(n_step): + matrix = matrix.dot(matrix) + for current_state, transition in markov_model.items(): + tempRow = list(markov_model.items()) + indexRow = [idx for idx, key in enumerate(tempRow) if key[0] == current_state] + for state, count in transition.items(): + tempCol = list(transition.items()) + indexCol = [idx for idx, key in enumerate(tempCol) if key[0] == state] + markov_model[current_state][state] += matrix[indexRow[0]][indexCol[0]]""" return markov_model @@ -63,4 +85,4 @@ def generate_lyrics(markov_model, start, limit): current_state = next_state[0] lyrics += current_state + " " n += 1 - return lyrics + return lyrics, current_state diff --git a/scrapper.py b/scrapper.py index daa541c..b103280 100644 --- a/scrapper.py +++ b/scrapper.py @@ -6,6 +6,7 @@ import os import time from ScrapThread import ScrapThread from proxy_handling import proxies_validation +from main import path def connect(url, proxies_list): @@ -136,21 +137,13 @@ def do_threading(url, selected_albums, time_stamp, proxies_list): return df -def scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp): +def scrap_data(url, selected_albums, time_stamp): proxies_list = proxies_validation() - file = open("links.txt") - path = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(path, "Data") - pink_floyd_data_frame = do_threading(file.readline()[0:-1], pink_floyd_selected_albums, time_stamp, proxies_list) - black_sabbath_data_frame = do_threading(file.readline(), black_sabbath_selected_albums, time_stamp, proxies_list) - pink_sabbath_data_frame = pd.concat([pink_floyd_data_frame, black_sabbath_data_frame], ignore_index=True) - pink_floyd_data_frame.to_csv((path + "PinkFloyd.csv")) - black_sabbath_data_frame.to_csv((path + "BlackSabbath.csv")) - pink_sabbath_data_frame.to_csv((path + "PinkSabbath.csv")) - paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list) - figofagot = do_threading(file.readline(), [], 0.0, proxies_list) - braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True) - paktofonika.to_csv((path + "Paktofonika.csv")) - figofagot.to_csv((path + "Bracia Figo Fagot.csv")) - braciofonika_pigo_pagot.to_csv((path + "Braciofonika Pigo Pagot.csv")) + df = do_threading(url, selected_albums, time_stamp, proxies_list) + if url.split('/')[2] == 'www.azlyrics.com': + filename = url.split('/')[4][:-5] + df.to_csv((path + filename)) + if url.split('/')[2] == 'www.tekstowo.pl': + filename = url.split(',')[1][:-5] + df.to_csv((path + filename)) os.remove("valid_proxy_list")