From a87304e1387fe3a90cfc2950f54563fee726c7f1 Mon Sep 17 00:00:00 2001
From: Sebastian Kutny
Date: Tue, 28 Mar 2023 15:30:52 +0200
Subject: [PATCH] Fix minor bugs.

Build file paths with os.path.join instead of string concatenation,
write scraped CSV files (now with a .csv extension) into a Data
directory, temporarily disable proxy rotation in the scrapper, and
add one more artist link.
---
 links.txt   |  3 ++-
 main.py     | 24 +++++++++++-------------
 scrapper.py | 22 +++++++++++++---------
 3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/links.txt b/links.txt
index f2e7e27..dafe680 100644
--- a/links.txt
+++ b/links.txt
@@ -1,4 +1,5 @@
 https://www.azlyrics.com/p/pinkfloyd.html
 https://www.azlyrics.com/b/blacksabbath.html
 https://www.tekstowo.pl/piosenki_artysty,paktofonika.html
-https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html
\ No newline at end of file
+https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html
+https://www.tekstowo.pl/piosenki_artysty,kuki.html
\ No newline at end of file
diff --git a/main.py b/main.py
index aaf8529..e93570b 100644
--- a/main.py
+++ b/main.py
@@ -1,22 +1,20 @@
 import os
 import random
-
 import pandas as pd
-
 from scrapper import scrap_data
 from markov_model import clean_data
 from markov_model import create_markov_model
 from markov_model import generate_lyrics
 
 blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
-                                 "Sabotage", "Technical Ecstasy", "Never Say Die!", "Heaven And Hell", "Mob Rules",
-                                 "Born Again", "Seventh Star", "The Eternal Idol", "Headless Cross", "Tyr",
-                                 "Dehumanizer", "Cross Purposes", "Forbidden", "13"]
+                                "Sabotage", "Technical Ecstasy", "Never Say Die!", "Heaven And Hell", "Mob Rules",
+                                "Born Again", "Seventh Star", "The Eternal Idol", "Headless Cross", "Tyr",
+                                "Dehumanizer", "Cross Purposes", "Forbidden", "13"]
 
 pinkfloyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of Secrets", "Meddle", "More", "Ummagumma",
-                              "Atom Heart Mother", "Obscured By Clouds", "The Dark Side Of The Moon",
-                              "Wish You Were Here", "Animals", "The Wall", "The Final Cut",
-                              "A Momentary Lapse Of Reason", "The Division Bell"]
+                             "Atom Heart Mother", "Obscured By Clouds", "The Dark Side Of The Moon",
+                             "Wish You Were Here", "Animals", "The Wall", "The Final Cut",
+                             "A Momentary Lapse Of Reason", "The Division Bell"]
 
 time_stamp = 3.5
 path = os.path.dirname(os.path.abspath(__file__))
@@ -65,20 +63,20 @@ def scraping():
 
 def merging():
     name1 = input("Select first band file: ")
-    if os.path.exists(path + name1):
-        df1 = pd.read_csv(path + name1)
+    if os.path.exists(os.path.join(path, name1)):
+        df1 = pd.read_csv(os.path.join(path, name1))
     else:
         print("No such file in directory!")
         return
     name2 = input("Select second band file: ")
-    if os.path.exists(path + name2):
-        df2 = pd.read_csv(path + name2)
+    if os.path.exists(os.path.join(path, name2)):
+        df2 = pd.read_csv(os.path.join(path, name2))
     else:
         print("No such file in directory!")
         return
     dfResult = pd.concat([df1, df2], ignore_index=True)
     result_name = input("Select name of result file: ")
-    dfResult.to_csv(path + result_name)
+    dfResult.to_csv(os.path.join(path, result_name))
 
 
 def main():
diff --git a/scrapper.py b/scrapper.py
index b103280..e3d21ea 100644
--- a/scrapper.py
+++ b/scrapper.py
@@ -6,7 +6,6 @@ import os
 import time
 from ScrapThread import ScrapThread
 from proxy_handling import proxies_validation
-from main import path
 
 
 def connect(url, proxies_list):
@@ -16,8 +15,8 @@ def connect(url, proxies_list):
     main_page = None
     while True:
         try:
-            main_page = requests.get(url, headers=headers, proxies={'http': random.choice(proxies_list),
-                                                                    'https': random.choice(proxies_list)}, timeout=5.0)
+            main_page = requests.get(url, headers=headers)  # , proxies={'http': random.choice(proxies_list),
+            # 'https': random.choice(proxies_list)}, timeout=5.0)
             break
         except:
             continue
@@ -138,12 +137,17 @@ def do_threading(url, selected_albums, time_stamp, proxies_list):
 
 
 def scrap_data(url, selected_albums, time_stamp):
-    proxies_list = proxies_validation()
+    # proxies_list = proxies_validation()
+    proxies_list = []
     df = do_threading(url, selected_albums, time_stamp, proxies_list)
+    path = os.path.dirname(os.path.abspath(__file__))
+    path = os.path.join(path, "Data")
     if url.split('/')[2] == 'www.azlyrics.com':
-        filename = url.split('/')[4][:-5]
-        df.to_csv((path + filename))
+        filename = url.split('/')[4][:-5] + '.csv'
+        saving = os.path.join(path, filename)
+        df.to_csv(saving)
     if url.split('/')[2] == 'www.tekstowo.pl':
-        filename = url.split(',')[1][:-5]
-        df.to_csv((path + filename))
-        os.remove("valid_proxy_list")
+        filename = url.split(',')[1][:-5] + '.csv'
+        saving = os.path.join(path, filename)
+        df.to_csv(saving)
+    # os.remove("valid_proxy_list")
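
Note: the reworked scrap_data() now writes its CSV output into a Data
subdirectory next to the script, but nothing in this patch creates that
directory, so df.to_csv(saving) will raise FileNotFoundError on a fresh
checkout. A minimal sketch of the same saving step with a directory guard
added; save_csv is a hypothetical helper for illustration, not part of
this patch:

import os

import pandas as pd


def save_csv(df: pd.DataFrame, filename: str) -> str:
    # Write df to <script dir>/Data/<filename>, creating Data/ if missing.
    # Hypothetical helper; mirrors the saving logic added to scrap_data().
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Data")
    os.makedirs(path, exist_ok=True)  # guard the patch omits
    saving = os.path.join(path, filename)
    df.to_csv(saving)
    return saving

Since os.makedirs(..., exist_ok=True) is a no-op when the directory already
exists, the guard costs nothing on subsequent runs.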