Mirror of https://github.com/WallyS02/Song-Lyrics-Generator.git
Synced 2025-01-18 08:19:19 +00:00
Minor bug fixes.
This commit is contained in:
parent afeb9d579b
commit a87304e138
@@ -2,3 +2,4 @@ https://www.azlyrics.com/p/pinkfloyd.html
 https://www.azlyrics.com/b/blacksabbath.html
 https://www.tekstowo.pl/piosenki_artysty,paktofonika.html
 https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html
+https://www.tekstowo.pl/piosenki_artysty,kuki.html
main.py (24 changed lines)
@@ -1,22 +1,20 @@
 import os
 import random

 import pandas as pd

 from scrapper import scrap_data
 from markov_model import clean_data
 from markov_model import create_markov_model
 from markov_model import generate_lyrics

 blacksabbath_selected_albums = ["Black Sabbath", "Paranoid", "Master Of Reality", "Vol 4", "Sabbath Bloody Sabbath",
                                 "Sabotage", "Technical Ecstasy", "Never Say Die!", "Heaven And Hell", "Mob Rules",
                                 "Born Again", "Seventh Star", "The Eternal Idol", "Headless Cross", "Tyr",
                                 "Dehumanizer", "Cross Purposes", "Forbidden", "13"]

 pinkfloyd_selected_albums = ["The Piper At The Gates Of Dawn", "A Saucerful Of Secrets", "Meddle", "More", "Ummagumma",
                              "Atom Heart Mother", "Obscured By Clouds", "The Dark Side Of The Moon",
                              "Wish You Were Here", "Animals", "The Wall", "The Final Cut",
                              "A Momentary Lapse Of Reason", "The Division Bell"]

 time_stamp = 3.5
 path = os.path.dirname(os.path.abspath(__file__))
@@ -65,20 +63,20 @@ def scraping():


 def merging():
     name1 = input("Select first band file: ")
-    if os.path.exists(path + name1):
-        df1 = pd.read_csv(path + name1)
+    if os.path.exists(os.path.join(path, name1)):
+        df1 = pd.read_csv(os.path.join(path, name1))
     else:
         print("No such file in directory!")
         return
     name2 = input("Select second band file: ")
-    if os.path.exists(path + name2):
-        df2 = pd.read_csv(path + name2)
+    if os.path.exists(os.path.join(path, name2)):
+        df2 = pd.read_csv(os.path.join(path, name2))
     else:
         print("No such file in directory!")
         return
     dfResult = pd.concat([df1, df2], ignore_index=True)
     result_name = input("Select name of result file: ")
-    dfResult.to_csv(path + result_name)
+    dfResult.to_csv(os.path.join(path, result_name))


 def main():
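The os.path.join change fixes the actual lookup bug in merging(): path comes from os.path.dirname(), which carries no trailing separator, so path + name1 silently fuses the directory and the file name into a path that never exists. A quick sketch of the difference (the directory and file name values below are illustrative; in main.py, path is derived from __file__ at runtime):

import os

path = "/projects/Song-Lyrics-Generator"
name1 = "blacksabbath.csv"

print(path + name1)               # /projects/Song-Lyrics-Generatorblacksabbath.csv (no separator)
print(os.path.join(path, name1))  # /projects/Song-Lyrics-Generator/blacksabbath.csv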
scrapper.py (22 changed lines)
@@ -6,7 +6,6 @@ import os
 import time
 from ScrapThread import ScrapThread
 from proxy_handling import proxies_validation
-from main import path


 def connect(url, proxies_list):
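Removing `from main import path` breaks a circular import: main.py itself does `from scrapper import scrap_data`, so importing main back from scrapper re-executes main's module body during import and can fail on a partially initialized module. A minimal reproduction sketch, with hypothetical module names a.py and b.py standing in for main.py and scrapper.py:

# a.py -- stands in for main.py
from b import g          # importing b triggers b's import of a...

def f():
    return "a"

# b.py -- stands in for scrapper.py
from a import f          # ...which re-runs a.py as module 'a'; its own
                         # 'from b import g' then sees b only partially
                         # initialized and raises ImportError.

def g():
    return "b"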
@@ -16,8 +15,8 @@ def connect(url, proxies_list):
     main_page = None
     while True:
         try:
-            main_page = requests.get(url, headers=headers, proxies={'http': random.choice(proxies_list),
-                                                                    'https': random.choice(proxies_list)}, timeout=5.0)
+            main_page = requests.get(url, headers=headers) #, proxies={'http': random.choice(proxies_list),
+            # 'https': random.choice(proxies_list)}, timeout=5.0)
             break
         except:
             continue
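With the proxies argument commented out, the request now goes out directly, but the surrounding while True / bare except loop still retries forever if the site is permanently unreachable. A bounded retry would be one way to cap that; this is a suggestion, not part of the commit (the function name and retry count are made up):

import requests

def get_with_retries(url, headers, retries=5, timeout=5.0):
    # Try at most `retries` times, then give up instead of spinning forever.
    for _ in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            continue
    return None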
@@ -138,12 +137,17 @@ def do_threading(url, selected_albums, time_stamp, proxies_list):


 def scrap_data(url, selected_albums, time_stamp):
-    proxies_list = proxies_validation()
+    # proxies_list = proxies_validation()
+    proxies_list = []
     df = do_threading(url, selected_albums, time_stamp, proxies_list)
     path = os.path.dirname(os.path.abspath(__file__))
+    path = os.path.join(path, "Data")
     if url.split('/')[2] == 'www.azlyrics.com':
-        filename = url.split('/')[4][:-5]
-        df.to_csv((path + filename))
+        filename = url.split('/')[4][:-5] + '.csv'
+        saving = os.path.join(path, filename)
+        df.to_csv(saving)
     if url.split('/')[2] == 'www.tekstowo.pl':
-        filename = url.split(',')[1][:-5]
-        df.to_csv((path + filename))
-        os.remove("valid_proxy_list")
+        filename = url.split(',')[1][:-5] + '.csv'
+        saving = os.path.join(path, filename)
+        df.to_csv(saving)
+        # os.remove("valid_proxy_list")
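The new naming logic strips the '.html' suffix with the [:-5] slice, appends '.csv', and joins the result under the Data directory. A quick check against the URLs from the band list above:

az = "https://www.azlyrics.com/b/blacksabbath.html"
tk = "https://www.tekstowo.pl/piosenki_artysty,kuki.html"

print(az.split('/')[2])                # www.azlyrics.com
print(az.split('/')[4][:-5] + '.csv')  # blacksabbath.csv
print(tk.split(',')[1][:-5] + '.csv')  # kuki.csv

Note that pandas' to_csv does not create missing directories, so the Data folder must already exist next to scrapper.py for the save to succeed.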