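"""Lyrics scraping utilities for the Song-Lyrics-Generator project.

Downloads song lyrics from azlyrics.com and tekstowo.pl through validated
proxies, scraping each album or result page in its own thread, saves the
results to CSV files and cleans them into flat word lists.
"""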
import os
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

from ScrapThread import ScrapThread
from proxy_handling import proxies_validation


def clean_data(name):
    """Load a scraped CSV and return its lyrics as a flat list of lowercase words."""
    document = pd.read_csv(name, usecols=["Lyrics"])
    rows = document["Lyrics"].values.tolist()
    dataset = []
    for lyric in rows:
        lyric = lyric.lower()
        # Strip punctuation; '-' sits at the end of the class so it is matched literally.
        lyric = re.sub(r"[,.\"'!@#$%^&*(){}?/;`~:<>+=\\-]", "", lyric)
        tokens = word_tokenize(lyric)
        words = [word for word in tokens if word.isalpha()]
        dataset += words
    print(os.path.basename(name), ": ", len(dataset))
    return dataset


def connect(url, proxies_list):
    """Fetch a page through a randomly chosen proxy, retrying until a response arrives."""
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.30 (KHTML, like Gecko) "
                      "Chrome/12.0.742.112 Safari/534.30"}
    main_page = None
    while True:
        try:
            main_page = requests.get(url, headers=headers,
                                     proxies={'http': random.choice(proxies_list),
                                              'https': random.choice(proxies_list)},
                                     timeout=5.0)
            break
        except requests.RequestException:
            # Dead or blocked proxy - retry with another one.
            continue
    if main_page.status_code != 200:
        raise Exception("Access Denied!")
    return main_page


def get_lyricsAzlyrics(url, proxies_list):
    """Scrape the lyrics text from a single AZLyrics song page."""
    main_page_text = BeautifulSoup(connect(url, proxies_list).text, "html.parser")
    lyrics = str()
    # On AZLyrics the lyrics live in a <div> without a class attribute.
    for goods in main_page_text.find_all("div", {"class": None}):
        if len(goods.text) == 0:
            continue
        lyrics = lyrics + goods.text
    return lyrics


def scrapAzlyrics(url, selected_albums, time_stamp, proxies_list):
    """Scrape the lyrics of the selected albums from an AZLyrics artist page."""
    main_page_text = BeautifulSoup(connect(url, proxies_list).text, "html.parser")
    albums = main_page_text.find_all(class_="album")
    if not albums:
        raise Exception("Access Denied!")
    scraped_lyrics = pd.DataFrame(columns=['Title', 'Lyrics'])
    for alb in albums:
        album_title = alb.find("b").text
        album_title = album_title[1:-1]  # drop the surrounding quotation marks
        print("Checking album: ", album_title)
        if album_title in selected_albums:
            next_song = alb.find_next(class_="listalbum-item")
            if next_song.find(class_="comment"):
                next_song = next_song.find_next(class_=["listalbum-item", "album"])
            # Walk the song list until the next album header is reached.
            while True:
                if not next_song.find(class_="comment"):
                    title = next_song.find("a").text
                    lyrics = get_lyricsAzlyrics(("https://www.azlyrics.com/" + next_song.find("a")['href']),
                                                proxies_list)
                    df = pd.DataFrame({
                        "Title": [title],
                        "Lyrics": [lyrics]
                    })
                    scraped_lyrics = pd.concat([scraped_lyrics, df], ignore_index=True)
                    print("Downloaded song: ", title)
                    next_song = next_song.find_next(class_=["listalbum-item", "album"])
                    if next_song['class'] != ['listalbum-item']:
                        break
                else:
                    next_song = next_song.find_next(class_=["listalbum-item", "album"])
                    if next_song['class'] != ['listalbum-item']:
                        break
                time.sleep(time_stamp)
    return scraped_lyrics


def get_lyricsTekstowo(url, proxies_list):
    """Scrape the original-language lyrics from a single Tekstowo song page."""
    main_page_text = BeautifulSoup(connect(url, proxies_list).text, "html.parser")
    lyrics = str()
    for goods in main_page_text.find_all("div", {"class": "inner-text"}):
        if len(goods.text) == 0:
            continue
        # Only the block nested under "song-text" holds the lyrics (not the translation).
        if goods.parent.attrs['class'] == ['song-text']:
            lyrics = lyrics + goods.text
    return lyrics


def scrapTekstowo(url, proxies_list):
    """Scrape every song listed on one page of a Tekstowo artist ranking."""
    main_page_text = BeautifulSoup(connect(url, proxies_list).text, "html.parser")
    song_list = main_page_text.find(class_='ranking-lista')
    songs = song_list.find_all(class_="box-przeboje")
    scraped_lyrics = pd.DataFrame(columns=['Title', 'Lyrics'])
    if not songs:
        raise Exception("Access Denied!")
    for song in songs:
        ref = song.find("a")
        # Link text has the form "Artist - Title"; keep the part after the dash.
        title = ref.text.split('-')[1][1:]
        lyrics = get_lyricsTekstowo('https://www.tekstowo.pl' + ref['href'], proxies_list)
        df = pd.DataFrame({
            "Title": [title],
            "Lyrics": [lyrics]
        })
        scraped_lyrics = pd.concat([scraped_lyrics, df], ignore_index=True)
        print("Downloaded song: ", title)
    return scraped_lyrics


def check_pagination(url, proxies_list):
    """Count the result pages of a Tekstowo artist listing."""
    main_page_text = BeautifulSoup(connect(url, proxies_list).text, "html.parser")
    pages = main_page_text.find_all(class_='page-link')
    page_number = 0
    for page in pages:
        # Skip the "next"/"previous" arrows, count only numbered page links.
        if page.text != 'Następna >>' and page.text != '<< Poprzednia':
            page_number = page_number + 1
    # The listing repeats its pagination bar, so every page link was counted twice.
    return page_number / 2


def do_threading(url, selected_albums, time_stamp, proxies_list):
    """Scrape one artist page with one worker thread per album (AZLyrics) or per page (Tekstowo)."""
    threads = []
    thread_number = 0
    if url.split('/')[2] == 'www.azlyrics.com':
        thread_number = len(selected_albums)
    if url.split('/')[2] == 'www.tekstowo.pl':
        thread_number = int(check_pagination(url, proxies_list))
    df = pd.DataFrame(columns=['Title', 'Lyrics'])
    for i in range(thread_number):
        t = None
        if url.split('/')[2] == 'www.azlyrics.com':
            t = ScrapThread(target=scrapAzlyrics, args=(url, [selected_albums[i]], time_stamp, proxies_list))
        if url.split('/')[2] == 'www.tekstowo.pl':
            new_url = url[:-5] + ',strona,' + str(i + 1) + '.html'
            t = ScrapThread(target=scrapTekstowo, args=(new_url, proxies_list))
        t.daemon = True
        threads.append(t)
    for i in range(thread_number):
        threads[i].start()
    for i in range(thread_number):
        # ScrapThread.join() hands back the DataFrame returned by its target.
        df = pd.concat([df, threads[i].join()], ignore_index=True)
    return df


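# Note: ScrapThread (imported above) is assumed to be a threading.Thread
# subclass whose join() returns the value produced by its target, roughly:
#
#     class ScrapThread(threading.Thread):
#         def run(self):
#             self.result = self._target(*self._args)
#
#         def join(self, timeout=None):
#             super().join(timeout)
#             return self.result
#
# This is only an illustrative sketch of the pattern implied by do_threading();
# see the ScrapThread module for the actual implementation.
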
def scrap_data(pink_floyd_selected_albums, black_sabbath_selected_albums, time_stamp):
    """Validate proxies, scrape the artists listed in links.txt and dump the lyrics to CSV files."""
    proxies_list = proxies_validation()
    file = open("links.txt")
    path = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(path, "Data")
    # The first two links in links.txt are skipped.
    file.readline()
    file.readline()
    paktofonika = do_threading(file.readline().strip(), [], 0.0, proxies_list)
    figofagot = do_threading(file.readline().strip(), [], 0.0, proxies_list)
    braciofonika_pigo_pagot = pd.concat([paktofonika, figofagot], ignore_index=True)
    paktofonika.to_csv(os.path.join(path, "Paktofonika.csv"))
    figofagot.to_csv(os.path.join(path, "Bracia Figo Fagot.csv"))
    braciofonika_pigo_pagot.to_csv(os.path.join(path, "Braciofonika Pigo Pagot.csv"))
    pink_floyd_data_frame = do_threading(file.readline().strip(), pink_floyd_selected_albums, time_stamp, proxies_list)
    black_sabbath_data_frame = do_threading(file.readline().strip(), black_sabbath_selected_albums, time_stamp,
                                            proxies_list)
    file.close()
    pink_sabbath_data_frame = pd.concat([pink_floyd_data_frame, black_sabbath_data_frame], ignore_index=True)
    pink_floyd_data_frame.to_csv(os.path.join(path, "PinkFloyd.csv"))
    black_sabbath_data_frame.to_csv(os.path.join(path, "BlackSabbath.csv"))
    pink_sabbath_data_frame.to_csv(os.path.join(path, "PinkSabbath.csv"))
    os.remove("valid_proxy_list")
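

# Example invocation (assumed entry point): the album titles below are only
# examples and must match the album names shown on the AZLyrics artist pages;
# time_stamp is the delay in seconds between downloaded songs.
if __name__ == '__main__':
    scrap_data(pink_floyd_selected_albums=["The Wall", "The Dark Side Of The Moon"],
               black_sabbath_selected_albums=["Paranoid", "Master Of Reality"],
               time_stamp=5.0)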