Interface improvements.

This commit is contained in:
Sebastian Kutny 2023-03-28 15:08:23 +02:00
parent a2c1694adb
commit afeb9d579b
3 changed files with 94 additions and 38 deletions

79
main.py
View File

@@ -1,16 +1,19 @@
import os import os
import random import random
import pandas as pd
from scrapper import scrap_data from scrapper import scrap_data
from markov_model import clean_data from markov_model import clean_data
from markov_model import create_markov_model from markov_model import create_markov_model
from markov_model import generate_lyrics from markov_model import generate_lyrics
black_sabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath", blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
"Sabotage", "Technical Ecstasy", "Never Say Die!", "Heaven And Hell", "Mob Rules", "Sabotage", "Technical Ecstasy", "Never Say Die!", "Heaven And Hell", "Mob Rules",
"Born Again", "Seventh Star", "The Eternal Idol", "Headless Cross", "Tyr", "Born Again", "Seventh Star", "The Eternal Idol", "Headless Cross", "Tyr",
"Dehumanizer", "Cross Purposes", "Forbidden", "13"] "Dehumanizer", "Cross Purposes", "Forbidden", "13"]
pink_floyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of Secrets", "Meddle", "More", "Ummagumma", pinkfloyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of Secrets", "Meddle", "More", "Ummagumma",
"Atom Heart Mother", "Obscured By Clouds", "The Dark Side Of The Moon", "Atom Heart Mother", "Obscured By Clouds", "The Dark Side Of The Moon",
"Wish You Were Here", "Animals", "The Wall", "The Final Cut", "Wish You Were Here", "Animals", "The Wall", "The Final Cut",
"A Momentary Lapse Of Reason", "The Division Bell"] "A Momentary Lapse Of Reason", "The Division Bell"]
@@ -24,36 +27,74 @@ def generate_song(name):
dataset = clean_data(os.path.join(path, name)) dataset = clean_data(os.path.join(path, name))
n_gram = int(input("Select number of words in Markov state: ")) n_gram = int(input("Select number of words in Markov state: "))
number_of_verses = int(input("Select number of verses: ")) number_of_verses = int(input("Select number of verses: "))
words_in_verses = int(int(input("Select number of words in verses: ")) / n_gram) words_in_verses = int((int(input("Select number of words in verses: ")) - 1) / n_gram)
model = create_markov_model(dataset, n_gram) degree_of_chain = int(input("Select degree of chain: "))
model = create_markov_model(dataset, n_gram, degree_of_chain)
print('\n') print('\n')
last_state = random.choice(list(model.keys()))
for i in range(number_of_verses): for i in range(number_of_verses):
generated_lyrics = generate_lyrics(model, random.choice(list(model.keys())), words_in_verses) generated_lyrics, last_state = generate_lyrics(model, last_state, words_in_verses)
print(generated_lyrics) print(generated_lyrics)
last_state = random.choices(list(model[last_state].keys()),
list(model[last_state].values()))[0]
def scraping():
with open("links.txt", "r") as f:
lines = f.readlines()
for i in range(len(lines)):
if i != (len(lines) - 1):
print(str(i) + ".", lines[i][:-1])
else:
print(str(i) + ".", lines[i])
line_index = int(input("Select url to scrap: "))
url = lines[line_index]
if line_index != (len(lines) - 1):
url = url[:-1]
if url.split('/')[2] == 'www.azlyrics.com':
selected_albums_name = url.split('/')[4][:-5] + "_selected_albums"
if selected_albums_name in globals():
selected_albums = globals()[selected_albums_name]
scrap_data(url, selected_albums, time_stamp)
else:
print("Define selected albums in global list variable in format: bandname_selected_albums")
return
if url.split('/')[2] == 'www.tekstowo.pl':
scrap_data(url, [], 0.0)
def merging():
name1 = input("Select first band file: ")
if os.path.exists(path + name1):
df1 = pd.read_csv(path + name1)
else:
print("No such file in directory!")
return
name2 = input("Select second band file: ")
if os.path.exists(path + name2):
df2 = pd.read_csv(path + name2)
else:
print("No such file in directory!")
return
dfResult = pd.concat([df1, df2], ignore_index=True)
result_name = input("Select name of result file: ")
dfResult.to_csv(path + result_name)
def main(): def main():
print("Select data set to use in generation or other option:\n1. Pink Floyd lyrics generation\n2. Black Sabbath " print("Select data set to use in generation or other option:\n1. Generate text based on input filename\n2. Scrap "
"lyrics generation\n3. Bracia Figo Fagot\n4. Paktofonika\n5. Fused English (aka Pink Sabbath) lyrics " "data\n3. Merge CSV band's songs\n4. Exit")
"generation\n6. Fused Polish (aka Braciofonika Pigo Pagot)\n7. Scrap data\n8. Exit")
while True: while True:
selection = int(input()) selection = int(input())
match selection: match selection:
case 1: case 1:
generate_song("Pink Floyd.csv") name = input("Select name of data file: ")
generate_song(name)
case 2: case 2:
generate_song("Black Sabbath.csv") scraping()
case 3: case 3:
generate_song("Bracia Figo Fagot.csv") merging()
case 4: case 4:
generate_song("Paktofonika.csv")
case 5:
generate_song("Pink Sabbath.csv")
case 6:
generate_song("Braciofonika Pigo Pagot.csv")
case 7:
scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp)
case 8:
break break
print("\nCommand executed") print("\nCommand executed")

View File

@@ -2,6 +2,8 @@ import random
import re import re
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
import pandas as pd import pandas as pd
import numpy as np
from scipy import sparse
def clean_data(name): def clean_data(name):
@@ -27,9 +29,9 @@ def clean_data(name):
return dataset return dataset
def create_markov_model(dataset, n_gram): def create_markov_model(dataset, n_gram, n_step):
markov_model = {} markov_model = {}
for i in range(len(dataset) - n_gram - 1): for i in range(len(dataset) - 1 - 2 * n_gram):
current_state, next_state = "", "" current_state, next_state = "", ""
for j in range(n_gram): for j in range(n_gram):
current_state += dataset[i + j] + " " current_state += dataset[i + j] + " "
@@ -48,6 +50,26 @@ def create_markov_model(dataset, n_gram):
total = sum(transition.values()) total = sum(transition.values())
for state, count in transition.items(): for state, count in transition.items():
markov_model[current_state][state] = count / total markov_model[current_state][state] = count / total
"""matrix = [[0 for _ in range(len(markov_model.items()))] for _ in range(int(len(markov_model.items())))]
for current_state, transition in markov_model.items():
tempRow = list(markov_model.items())
indexRow = [idx for idx, key in enumerate(tempRow) if key[0] == current_state]
total = sum(transition.values())
for state, count in transition.items():
tempCol = list(transition.items())
indexCol = [idx for idx, key in enumerate(tempCol) if key[0] == state]
markov_model[current_state][state] = count / total
matrix[indexRow[0]][indexCol[0]] = markov_model[current_state][state]
matrix = np.array(matrix)
for i in range(n_step):
matrix = matrix.dot(matrix)
for current_state, transition in markov_model.items():
tempRow = list(markov_model.items())
indexRow = [idx for idx, key in enumerate(tempRow) if key[0] == current_state]
for state, count in transition.items():
tempCol = list(transition.items())
indexCol = [idx for idx, key in enumerate(tempCol) if key[0] == state]
markov_model[current_state][state] += matrix[indexRow[0]][indexCol[0]]"""
return markov_model return markov_model
@@ -63,4 +85,4 @@ def generate_lyrics(markov_model, start, limit):
current_state = next_state[0] current_state = next_state[0]
lyrics += current_state + " " lyrics += current_state + " "
n += 1 n += 1
return lyrics return lyrics, current_state

View File

@@ -6,6 +6,7 @@ import os
import time import time
from ScrapThread import ScrapThread from ScrapThread import ScrapThread
from proxy_handling import proxies_validation from proxy_handling import proxies_validation
from main import path
def connect(url, proxies_list): def connect(url, proxies_list):
@@ -136,21 +137,13 @@ def do_threading(url, selected_albums, time_stamp, proxies_list):
return df return df
def scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp): def scrap_data(url, selected_albums, time_stamp):
proxies_list = proxies_validation() proxies_list = proxies_validation()
file = open("links.txt") df = do_threading(url, selected_albums, time_stamp, proxies_list)
path = os.path.dirname(os.path.abspath(__file__)) if url.split('/')[2] == 'www.azlyrics.com':
path = os.path.join(path, "Data") filename = url.split('/')[4][:-5]
pink_floyd_data_frame = do_threading(file.readline()[0:-1], pink_floyd_selected_albums, time_stamp, proxies_list) df.to_csv((path + filename))
black_sabbath_data_frame = do_threading(file.readline(), black_sabbath_selected_albums, time_stamp, proxies_list) if url.split('/')[2] == 'www.tekstowo.pl':
pink_sabbath_data_frame = pd.concat([pink_floyd_data_frame, black_sabbath_data_frame], ignore_index=True) filename = url.split(',')[1][:-5]
pink_floyd_data_frame.to_csv((path + "PinkFloyd.csv")) df.to_csv((path + filename))
black_sabbath_data_frame.to_csv((path + "BlackSabbath.csv"))
pink_sabbath_data_frame.to_csv((path + "PinkSabbath.csv"))
paktofonika = do_threading(file.readline()[0:-1], [], 0.0, proxies_list)
figofagot = do_threading(file.readline(), [], 0.0, proxies_list)
braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True)
paktofonika.to_csv((path + "Paktofonika.csv"))
figofagot.to_csv((path + "Bracia Figo Fagot.csv"))
braciofonika_pigo_pagot.to_csv((path + "Braciofonika Pigo Pagot.csv"))
os.remove("valid_proxy_list") os.remove("valid_proxy_list")