Repair minor bugs.

This commit is contained in:
Sebastian Kutny 2023-03-28 15:30:52 +02:00
parent afeb9d579b
commit a87304e138
3 changed files with 26 additions and 23 deletions

View File

@ -2,3 +2,4 @@ https://www.azlyrics.com/p/pinkfloyd.html
https://www.azlyrics.com/b/blacksabbath.html
https://www.tekstowo.pl/piosenki_artysty,paktofonika.html
https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html
https://www.tekstowo.pl/piosenki_artysty,kuki.html

12
main.py
View File

@ -1,8 +1,6 @@
import os
import random
import pandas as pd
from scrapper import scrap_data
from markov_model import clean_data
from markov_model import create_markov_model
@ -65,20 +63,20 @@ def scraping():
def merging():
name1 = input("Select first band file: ")
if os.path.exists(path + name1):
df1 = pd.read_csv(path + name1)
if os.path.exists(os.path.join(path, name1)):
df1 = pd.read_csv(os.path.join(path, name1))
else:
print("No such file in directory!")
return
name2 = input("Select second band file: ")
if os.path.exists(path + name2):
df2 = pd.read_csv(path + name2)
if os.path.exists(os.path.join(path, name2)):
df2 = pd.read_csv(os.path.join(path, name2))
else:
print("No such file in directory!")
return
dfResult = pd.concat([df1, df2], ignore_index=True)
result_name = input("Select name of result file: ")
dfResult.to_csv(path + result_name)
dfResult.to_csv(os.path.join(path, result_name))
def main():

View File

@ -6,7 +6,6 @@ import os
import time
from ScrapThread import ScrapThread
from proxy_handling import proxies_validation
from main import path
def connect(url, proxies_list):
@ -16,8 +15,8 @@ def connect(url, proxies_list):
main_page = None
while True:
try:
main_page = requests.get(url, headers=headers, proxies={'http': random.choice(proxies_list),
'https': random.choice(proxies_list)}, timeout=5.0)
main_page = requests.get(url, headers=headers) #, proxies={'http': random.choice(proxies_list),
# 'https': random.choice(proxies_list)}, timeout=5.0)
break
except:
continue
@ -138,12 +137,17 @@ def do_threading(url, selected_albums, time_stamp, proxies_list):
def scrap_data(url, selected_albums, time_stamp):
proxies_list = proxies_validation()
# proxies_list = proxies_validation()
proxies_list = []
df = do_threading(url, selected_albums, time_stamp, proxies_list)
path = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(path, "Data")
if url.split('/')[2] == 'www.azlyrics.com':
filename = url.split('/')[4][:-5]
df.to_csv((path + filename))
filename = url.split('/')[4][:-5] + '.csv'
saving = os.path.join(path, filename)
df.to_csv(saving)
if url.split('/')[2] == 'www.tekstowo.pl':
filename = url.split(',')[1][:-5]
df.to_csv((path + filename))
os.remove("valid_proxy_list")
filename = url.split(',')[1][:-5] + '.csv'
saving = os.path.join(path, filename)
df.to_csv(saving)
# os.remove("valid_proxy_list")