Minor bug fixes.

This commit is contained in:
Sebastian Kutny 2023-03-28 15:30:52 +02:00
parent afeb9d579b
commit a87304e138
3 changed files with 26 additions and 23 deletions

View File

@ -1,4 +1,5 @@
https://www.azlyrics.com/p/pinkfloyd.html https://www.azlyrics.com/p/pinkfloyd.html
https://www.azlyrics.com/b/blacksabbath.html https://www.azlyrics.com/b/blacksabbath.html
https://www.tekstowo.pl/piosenki_artysty,paktofonika.html https://www.tekstowo.pl/piosenki_artysty,paktofonika.html
https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html
https://www.tekstowo.pl/piosenki_artysty,kuki.html

24
main.py
View File

@ -1,22 +1,20 @@
import os import os
import random import random
import pandas as pd import pandas as pd
from scrapper import scrap_data from scrapper import scrap_data
from markov_model import clean_data from markov_model import clean_data
from markov_model import create_markov_model from markov_model import create_markov_model
from markov_model import generate_lyrics from markov_model import generate_lyrics
blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath", blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
"Sabotage", "Technical Ecstasy", "Never Say Die!", "Heaven And Hell", "Mob Rules", "Sabotage", "Technical Ecstasy", "Never Say Die!", "Heaven And Hell", "Mob Rules",
"Born Again", "Seventh Star", "The Eternal Idol", "Headless Cross", "Tyr", "Born Again", "Seventh Star", "The Eternal Idol", "Headless Cross", "Tyr",
"Dehumanizer", "Cross Purposes", "Forbidden", "13"] "Dehumanizer", "Cross Purposes", "Forbidden", "13"]
pinkfloyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of Secrets", "Meddle", "More", "Ummagumma", pinkfloyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of Secrets", "Meddle", "More", "Ummagumma",
"Atom Heart Mother", "Obscured By Clouds", "The Dark Side Of The Moon", "Atom Heart Mother", "Obscured By Clouds", "The Dark Side Of The Moon",
"Wish You Were Here", "Animals", "The Wall", "The Final Cut", "Wish You Were Here", "Animals", "The Wall", "The Final Cut",
"A Momentary Lapse Of Reason", "The Division Bell"] "A Momentary Lapse Of Reason", "The Division Bell"]
time_stamp = 3.5 time_stamp = 3.5
path = os.path.dirname(os.path.abspath(__file__)) path = os.path.dirname(os.path.abspath(__file__))
@ -65,20 +63,20 @@ def scraping():
def merging(): def merging():
name1 = input("Select first band file: ") name1 = input("Select first band file: ")
if os.path.exists(path + name1): if os.path.exists(os.path.join(path, name1)):
df1 = pd.read_csv(path + name1) df1 = pd.read_csv(os.path.join(path, name1))
else: else:
print("No such file in directory!") print("No such file in directory!")
return return
name2 = input("Select second band file: ") name2 = input("Select second band file: ")
if os.path.exists(path + name2): if os.path.exists(os.path.join(path, name2)):
df2 = pd.read_csv(path + name2) df2 = pd.read_csv(os.path.join(path, name2))
else: else:
print("No such file in directory!") print("No such file in directory!")
return return
dfResult = pd.concat([df1, df2], ignore_index=True) dfResult = pd.concat([df1, df2], ignore_index=True)
result_name = input("Select name of result file: ") result_name = input("Select name of result file: ")
dfResult.to_csv(path + result_name) dfResult.to_csv(os.path.join(path, result_name))
def main(): def main():

View File

@ -6,7 +6,6 @@ import os
import time import time
from ScrapThread import ScrapThread from ScrapThread import ScrapThread
from proxy_handling import proxies_validation from proxy_handling import proxies_validation
from main import path
def connect(url, proxies_list): def connect(url, proxies_list):
@ -16,8 +15,8 @@ def connect(url, proxies_list):
main_page = None main_page = None
while True: while True:
try: try:
main_page = requests.get(url, headers=headers, proxies={'http': random.choice(proxies_list), main_page = requests.get(url, headers=headers) #, proxies={'http': random.choice(proxies_list),
'https': random.choice(proxies_list)}, timeout=5.0) # 'https': random.choice(proxies_list)}, timeout=5.0)
break break
except: except:
continue continue
@ -138,12 +137,17 @@ def do_threading(url, selected_albums, time_stamp, proxies_list):
def scrap_data(url, selected_albums, time_stamp): def scrap_data(url, selected_albums, time_stamp):
proxies_list = proxies_validation() # proxies_list = proxies_validation()
proxies_list = []
df = do_threading(url, selected_albums, time_stamp, proxies_list) df = do_threading(url, selected_albums, time_stamp, proxies_list)
path = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(path, "Data")
if url.split('/')[2] == 'www.azlyrics.com': if url.split('/')[2] == 'www.azlyrics.com':
filename = url.split('/')[4][:-5] filename = url.split('/')[4][:-5] + '.csv'
df.to_csv((path + filename)) saving = os.path.join(path, filename)
df.to_csv(saving)
if url.split('/')[2] == 'www.tekstowo.pl': if url.split('/')[2] == 'www.tekstowo.pl':
filename = url.split(',')[1][:-5] filename = url.split(',')[1][:-5] + '.csv'
df.to_csv((path + filename)) saving = os.path.join(path, filename)
os.remove("valid_proxy_list") df.to_csv(saving)
# os.remove("valid_proxy_list")