b9d4d752d7
uploaded api, scraper, as well as KH quotes scraped from websites
199 lines
3.7 KiB
Python
199 lines
3.7 KiB
Python
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
import json  # the selenium imports above drive the browser; json is used to save the scraped quotes
|
|
|
|
# Function to configure and return a WebDriver instance
|
|
def configure_driver():
    """Create a Chrome WebDriver with default settings and return it.

    The caller owns the returned driver and is responsible for calling
    ``quit()`` on it when finished.
    """
    return webdriver.Chrome()
|
|
|
|
# Function to scrape blockquote texts from a given character page
|
|
def scrape_character_page(driver, url):
    """Return the text of every ``<blockquote>`` element found at ``url``.

    Args:
        driver: WebDriver instance used to load the page.
        url: Address of the character page to visit.

    Returns:
        A list holding the ``.text`` of each blockquote element, in the
        order the driver reports them. Empty if the page has none.
    """
    # Load the page before querying it.
    driver.get(url)

    quote_texts = []
    for element in driver.find_elements(By.TAG_NAME, "blockquote"):
        quote_texts.append(element.text)
    return quote_texts
|
|
|
|
def main():
    """Scrape quotes from a fixed set of character pages and save them as JSON.

    Starts one WebDriver, visits each hard-coded character page in turn,
    gathers the blockquote texts from every page into a single flat list,
    and writes that list to ``quotes.json``. The driver is always shut
    down; if scraping raises, the exception propagates and no file is
    written.
    """
    # Character pages to visit, in scrape order.
    character_urls = [
        'https://www.khdatabase.com/Ansem',
        'https://www.khdatabase.com/Ariel',
        'https://www.khdatabase.com/Cloud',
        'https://www.khdatabase.com/Daisy_Duck',
        'https://www.khdatabase.com/Donald_Duck',
        'https://www.khdatabase.com/Goofy',
        'https://www.khdatabase.com/Hades',
        'https://www.khdatabase.com/Hercules',
        'https://www.khdatabase.com/Ice_Titan',
        'https://www.khdatabase.com/Jiminy_Cricket',
        'https://www.khdatabase.com/Kairi',
        'https://www.khdatabase.com/King_Mickey_Mouse',
        'https://www.khdatabase.com/Lava_Titan',
        'https://www.khdatabase.com/Leon',
        'https://www.khdatabase.com/Maleficent',
        'https://www.khdatabase.com/Merlin',
        'https://www.khdatabase.com/Moogle',
        'https://www.khdatabase.com/Philoctetes',
        'https://www.khdatabase.com/Queen_Minnie_Mouse',
        'https://www.khdatabase.com/Riku',
        'https://www.khdatabase.com/Rock_Titan',
        'https://www.khdatabase.com/Simba',
        'https://www.khdatabase.com/Sora',
        'https://www.khdatabase.com/Tornado_Titan',
        'https://www.khdatabase.com/Ursula',
    ]

    browser = configure_driver()
    collected = []
    try:
        for page_url in character_urls:
            # Append this page's quotes to the running list.
            collected += scrape_character_page(browser, page_url)
    finally:
        # Shut the browser down whether or not scraping succeeded.
        browser.quit()

    # Persist everything gathered as a single JSON array.
    with open('quotes.json', 'w') as out_file:
        json.dump(collected, out_file)
|
|
|
|
|
|
# Python's way to check if this script is being run as the main program
|
|
if __name__ == "__main__":
    # Run the scraper only when this file is executed directly, not on import.
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def configure_driver():
|
|
# driver = webdriver.Chrome()
|
|
# return driver
|
|
|
|
|
|
# def get_character_urls(driver, url):
|
|
# driver.get(url)
|
|
# character_links = driver.find_elements(By.CSS_SELECTOR, "div.mw-category-group ul li a")
|
|
# urls = [link.get_attribute('href') for link in character_links]
|
|
|
|
# return urls
|
|
|
|
# def main():
|
|
# characters_list_page = "https://www.khdatabase.com/Category:Kingdom_Hearts_characters"
|
|
|
|
# driver = configure_driver()
|
|
|
|
# try:
|
|
# character_urls = get_character_urls(driver, characters_list_page)
|
|
# for url in character_urls:
|
|
# print(url)
|
|
# finally:
|
|
# driver.quit()
|
|
|
|
# if __name__ == "__main__":
|
|
# main()
|
|
|
|
# #elements = driver.find_elements(By.TAG_NAME, "blockquote")
|
|
|
|
# #for element in elements:
|
|
# # print(element.text)
|
|
|
|
# #driver.quit()
|