Repair minor bugs.

This commit is contained in:
Sebastian Kutny 2023-03-28 15:30:52 +02:00
parent afeb9d579b
commit a87304e138
3 changed files with 26 additions and 23 deletions

View File

@ -2,3 +2,4 @@ https://www.azlyrics.com/p/pinkfloyd.html
https://www.azlyrics.com/b/blacksabbath.html https://www.azlyrics.com/b/blacksabbath.html
https://www.tekstowo.pl/piosenki_artysty,paktofonika.html https://www.tekstowo.pl/piosenki_artysty,paktofonika.html
https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html
https://www.tekstowo.pl/piosenki_artysty,kuki.html

12
main.py
View File

@ -1,8 +1,6 @@
import os import os
import random import random
import pandas as pd import pandas as pd
from scrapper import scrap_data from scrapper import scrap_data
from markov_model import clean_data from markov_model import clean_data
from markov_model import create_markov_model from markov_model import create_markov_model
@ -65,20 +63,20 @@ def scraping():
def merging(): def merging():
name1 = input("Select first band file: ") name1 = input("Select first band file: ")
if os.path.exists(path + name1): if os.path.exists(os.path.join(path, name1)):
df1 = pd.read_csv(path + name1) df1 = pd.read_csv(os.path.join(path, name1))
else: else:
print("No such file in directory!") print("No such file in directory!")
return return
name2 = input("Select second band file: ") name2 = input("Select second band file: ")
if os.path.exists(path + name2): if os.path.exists(os.path.join(path, name2)):
df2 = pd.read_csv(path + name2) df2 = pd.read_csv(os.path.join(path, name2))
else: else:
print("No such file in directory!") print("No such file in directory!")
return return
dfResult = pd.concat([df1, df2], ignore_index=True) dfResult = pd.concat([df1, df2], ignore_index=True)
result_name = input("Select name of result file: ") result_name = input("Select name of result file: ")
dfResult.to_csv(path + result_name) dfResult.to_csv(os.path.join(path, result_name))
def main(): def main():

View File

@ -6,7 +6,6 @@ import os
import time import time
from ScrapThread import ScrapThread from ScrapThread import ScrapThread
from proxy_handling import proxies_validation from proxy_handling import proxies_validation
from main import path
def connect(url, proxies_list): def connect(url, proxies_list):
@ -16,8 +15,8 @@ def connect(url, proxies_list):
main_page = None main_page = None
while True: while True:
try: try:
main_page = requests.get(url, headers=headers, proxies={'http': random.choice(proxies_list), main_page = requests.get(url, headers=headers) #, proxies={'http': random.choice(proxies_list),
'https': random.choice(proxies_list)}, timeout=5.0) # 'https': random.choice(proxies_list)}, timeout=5.0)
break break
except: except:
continue continue
@ -138,12 +137,17 @@ def do_threading(url, selected_albums, time_stamp, proxies_list):
def scrap_data(url, selected_albums, time_stamp): def scrap_data(url, selected_albums, time_stamp):
proxies_list = proxies_validation() # proxies_list = proxies_validation()
proxies_list = []
df = do_threading(url, selected_albums, time_stamp, proxies_list) df = do_threading(url, selected_albums, time_stamp, proxies_list)
path = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(path, "Data")
if url.split('/')[2] == 'www.azlyrics.com': if url.split('/')[2] == 'www.azlyrics.com':
filename = url.split('/')[4][:-5] filename = url.split('/')[4][:-5] + '.csv'
df.to_csv((path + filename)) saving = os.path.join(path, filename)
df.to_csv(saving)
if url.split('/')[2] == 'www.tekstowo.pl': if url.split('/')[2] == 'www.tekstowo.pl':
filename = url.split(',')[1][:-5] filename = url.split(',')[1][:-5] + '.csv'
df.to_csv((path + filename)) saving = os.path.join(path, filename)
os.remove("valid_proxy_list") df.to_csv(saving)
# os.remove("valid_proxy_list")