Made a scraper
This commit is contained in:
parent a4ae884022
commit 7e407dd1c9
deviantart_scraper.py (new file, 151 lines)
@@ -0,0 +1,151 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time
import os
import re

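# Note: the ActionChains and Keys imports above are currently unused. The driver setup
# below assumes Selenium 4+ (Service and Options passed to webdriver.Chrome as keyword
# arguments) and a ChromeDriver build that matches the installed Brave/Chromium version.
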
def scrape_deviantart(query, max_pages):
    """
    Scrapes text content from DeviantArt based on a query using Selenium.

    Args:
        query (str): The search query to find relevant stories.
        max_pages (int): The number of pages to scrape.

    Returns:
        None: Saves scraped content into text files in the `data` directory.
    """
    output_dir = "data"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Configure Selenium WebDriver for Brave
    options = Options()
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"  # Path to Brave browser executable
    # options.add_argument("--headless")  # Uncomment to enable headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    service = Service("F:\\chromedriver\\chromedriver.exe")  # Path to your ChromeDriver
    driver = webdriver.Chrome(service=service, options=options)

    try:
        base_url = "https://www.deviantart.com/search/?q="
        driver.get(base_url + query)
        scraped_urls = set()  # To avoid duplicates

        for page in range(1, max_pages + 1):
            print(f"Scraping page {page}...")
            time.sleep(3)  # Allow time for dynamic content to load

            # Locate all grid rows containing stories
            grid_rows = driver.find_elements(By.XPATH, "//div[@data-testid='grid-row']")
            print(f"DEBUG: Found {len(grid_rows)} grid rows on the page.")

            if not grid_rows:
                print("No stories found on this page.")
                break

            for row_idx, row in enumerate(grid_rows):
                story_elements = row.find_elements(By.XPATH, ".//a[contains(@class, 'torpedo-thumb-link')] | .//a[contains(@href, '/art/')]")
                print(f"DEBUG: Found {len(story_elements)} story elements in row {row_idx + 1}.")

                for idx, link_element in enumerate(story_elements):
                    try:
                        story_url = link_element.get_attribute("href")

                        # Exclude URLs ending with '#comments'
                        if story_url.endswith("#comments"):
                            print(f"Skipping comment URL: {story_url}")
                            continue

                        if story_url in scraped_urls:
                            print(f"Skipping duplicate URL: {story_url}")
                            continue

                        # Verify the URL format to ensure it's a story link
                        if "/art/" not in story_url:
                            print(f"Skipping non-story URL: {story_url}")
                            continue

                        scraped_urls.add(story_url)
                        print(f"Processing story {idx + 1} in row {row_idx + 1}: {story_url}")

                        # Open the story in a new tab
                        driver.execute_script("window.open('{}', '_blank');".format(story_url))
                        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab

                        # Scrape the content
                        scrape_deviation_content(driver, story_url, output_dir)

                        # Close the tab and return to the main tab
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])

                    except Exception as e:
                        print(f"Error processing a story element: {e}")

            # Navigate to the next page if applicable
            try:
                next_button = driver.find_element(By.XPATH, "//a[contains(@href, 'cursor=') and contains(., 'Next')]")
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(1)  # Allow any animations to complete
                next_button.click()
                time.sleep(2)  # Allow time for the next page to load
            except Exception as e:
                print(f"No more pages available or navigation failed: {e}")
                break

    except Exception as e:
        print(f"An error occurred during scraping: {e}")
    finally:
        print("Browser will remain open for debugging. Close it manually when done.")

def scrape_deviation_content(driver, story_url, output_dir):
    """
    Scrapes the content of a single DeviantArt story page using Selenium.

    Args:
        driver (WebDriver): Selenium WebDriver instance.
        story_url (str): URL of the deviation page.
        output_dir (str): Directory to save the story content.

    Returns:
        None: Saves the story content into a text file.
    """
    time.sleep(2)  # Allow time for the page to load

    try:
        title_element = driver.find_element(By.TAG_NAME, "h1")

        # Locate the main story container by data-editor-viewer attribute
        try:
            story_container = driver.find_element(By.XPATH, "//div[@data-editor-viewer='1']")
        except Exception:
            # Fall back to legacy-journal if data-editor-viewer is not found
            print("DEBUG: Falling back to legacy-journal.")
            story_container = driver.find_element(By.XPATH, "//div[contains(@class, 'legacy-journal')]")

        # Extract the story content and title
        content = story_container.text.strip()
        title = title_element.text.strip()

        print(f"DEBUG: Extracted content length: {len(content)}")  # Debugging log

        if not title or not content:
            print(f"No content found on {story_url}")
            return

        file_name = re.sub(r"[^a-zA-Z0-9_]+", "_", title) + ".txt"
        file_path = os.path.join(output_dir, file_name)

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Saved: {file_path}")

    except Exception as e:
        print(f"Failed to scrape content from {story_url}: {e}")
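

# A minimal usage sketch, assuming the module is run directly; the query string and
# page count below are example values only, not defaults defined in this module.
if __name__ == "__main__":
    scrape_deviantart("short story", max_pages=2)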