Repair minor bugs.

2025-04-25 02:38:01 +00:00 · 2023-03-28 15:30:52 +02:00 · 2023-03-28 15:30:52 +02:00 · a87304e138
commit a87304e138
parent afeb9d579b
3 changed files with 26 additions and 23 deletions
--- a/links.txt
+++ b/links.txt
@@ -2,3 +2,4 @@ https://www.azlyrics.com/p/pinkfloyd.html
 https://www.azlyrics.com/b/blacksabbath.html
 https://www.tekstowo.pl/piosenki_artysty,paktofonika.html
 https://www.tekstowo.pl/piosenki_artysty,bracia_figo_fagot.html
 https://www.tekstowo.pl/piosenki_artysty,kuki.html
--- a/main.py
+++ b/main.py
@@ -1,8 +1,6 @@
 import os
 import random
 import pandas as pd
 from scrapper import scrap_data
 from markov_model import clean_data
 from markov_model import create_markov_model
@@ -65,20 +63,20 @@ def scraping():
 def merging():
    name1 = input("Select first band file: ")
-    if os.path.exists(path + name1):
+    if os.path.exists(os.path.join(path, name1)):
-        df1 = pd.read_csv(path + name1)
+        df1 = pd.read_csv(os.path.join(path, name1))
    else:
        print("No such file in directory!")
        return
    name2 = input("Select second band file: ")
-    if os.path.exists(path + name2):
+    if os.path.exists(os.path.join(path, name2)):
-        df2 = pd.read_csv(path + name2)
+        df2 = pd.read_csv(os.path.join(path, name2))
    else:
        print("No such file in directory!")
        return
    dfResult = pd.concat([df1, df2], ignore_index=True)
    result_name = input("Select name of result file: ")
-    dfResult.to_csv(path + result_name)
+    dfResult.to_csv(os.path.join(path, result_name))
 def main():
--- a/scrapper.py
+++ b/scrapper.py
@@ -6,7 +6,6 @@ import os
 import time
 from ScrapThread import ScrapThread
 from proxy_handling import proxies_validation
 from main import path
 def connect(url, proxies_list):
@@ -16,8 +15,8 @@ def connect(url, proxies_list):
    main_page = None
    while True:
        try:
-            main_page = requests.get(url, headers=headers, proxies={'http': random.choice(proxies_list),
+            main_page = requests.get(url, headers=headers) #, proxies={'http': random.choice(proxies_list),
-                                                                    'https': random.choice(proxies_list)}, timeout=5.0)
+                                                           #         'https': random.choice(proxies_list)}, timeout=5.0)
            break
        except:
            continue
@@ -138,12 +137,17 @@ def do_threading(url, selected_albums, time_stamp, proxies_list):
 def scrap_data(url, selected_albums, time_stamp):
-    proxies_list = proxies_validation()
+    # proxies_list = proxies_validation()
    proxies_list = []
    df = do_threading(url, selected_albums, time_stamp, proxies_list)
    path = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(path, "Data")
    if url.split('/')[2] == 'www.azlyrics.com':
-        filename = url.split('/')[4][:-5]
+        filename = url.split('/')[4][:-5] + '.csv'
-        df.to_csv((path + filename))
+        saving = os.path.join(path, filename)
        df.to_csv(saving)
    if url.split('/')[2] == 'www.tekstowo.pl':
-        filename = url.split(',')[1][:-5]
+        filename = url.split(',')[1][:-5] + '.csv'
-        df.to_csv((path + filename))
+        saving = os.path.join(path, filename)
-    os.remove("valid_proxy_list")
+        df.to_csv(saving)
    # os.remove("valid_proxy_list")