from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time
import os
import re


def scrape_deviantart(query, max_pages):
    """
    Scrapes text content from DeviantArt based on a query using Selenium.

    Args:
        query (str): The search query to find relevant stories.
        max_pages (int): The number of pages to scrape.

    Returns:
        None: Saves scraped content into text files in the `data` directory.
    """
    output_dir = "data"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Configure Selenium WebDriver for Brave
    options = Options()
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"  # Path to Brave browser executable
    # options.add_argument("--headless")  # Uncomment to enable headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    service = Service("F:\\chromedriver\\chromedriver.exe")  # Path to your ChromeDriver
    driver = webdriver.Chrome(service=service, options=options)

    try:
        base_url = "https://www.deviantart.com/search/?q="
        driver.get(base_url + query)

        scraped_urls = set()  # To avoid duplicates

        for page in range(1, max_pages + 1):
            print(f"Scraping page {page}...")
            time.sleep(3)  # Allow time for dynamic content to load

            # Locate all grid rows containing stories
            grid_rows = driver.find_elements(By.XPATH, "//div[@data-testid='grid-row']")
            print(f"DEBUG: Found {len(grid_rows)} grid rows on the page.")

            if not grid_rows:
                print("No stories found on this page.")
                break

            for row_idx, row in enumerate(grid_rows):
                story_elements = row.find_elements(
                    By.XPATH,
                    ".//a[contains(@class, 'torpedo-thumb-link')] | .//a[contains(@href, '/art/')]",
                )
                print(f"DEBUG: Found {len(story_elements)} story elements in row {row_idx + 1}.")

                for idx, link_element in enumerate(story_elements):
                    try:
                        story_url = link_element.get_attribute("href")

                        # Skip elements with no href attribute
                        if not story_url:
                            continue

                        # Exclude URLs ending with '#comments'
                        if story_url.endswith("#comments"):
                            print(f"Skipping comment URL: {story_url}")
                            continue

                        if story_url in scraped_urls:
                            print(f"Skipping duplicate URL: {story_url}")
                            continue

                        # Verify the URL format to ensure it's a story link
                        if "/art/" not in story_url:
                            print(f"Skipping non-story URL: {story_url}")
                            continue

                        scraped_urls.add(story_url)
                        print(f"Processing story {idx + 1} in row {row_idx + 1}: {story_url}")

                        # Open the story in a new tab
                        driver.execute_script("window.open('{}', '_blank');".format(story_url))
                        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab

                        # Scrape the content
                        scrape_deviation_content(driver, story_url, output_dir)

                        # Close the tab and return to the main tab
                        driver.close()
                        driver.switch_to.window(driver.window_handles[0])
                    except Exception as e:
                        print(f"Error processing a story element: {e}")

            # Navigate to the next page if applicable
            try:
                next_button = driver.find_element(
                    By.XPATH, "//a[contains(@href, 'cursor=') and contains(., 'Next')]"
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                time.sleep(1)  # Allow any animations to complete
                next_button.click()
                time.sleep(2)  # Allow time for the next page to load
            except Exception as e:
                print(f"No more pages available or navigation failed: {e}")
                break
    except Exception as e:
        print(f"An error occurred during scraping: {e}")
    finally:
        print("Browser will remain open for debugging. Close it manually when done.")
def scrape_deviation_content(driver, story_url, output_dir):
    """
    Scrapes the content of a single DeviantArt story page using Selenium.

    Args:
        driver (WebDriver): Selenium WebDriver instance.
        story_url (str): URL of the deviation page.
        output_dir (str): Directory to save the story content.

    Returns:
        None: Saves the story content into a text file.
    """
    time.sleep(2)  # Allow time for the page to load
    try:
        title_element = driver.find_element(By.TAG_NAME, "h1")

        # Locate the main story container by data-editor-viewer attribute
        try:
            story_container = driver.find_element(By.XPATH, "//div[@data-editor-viewer='1']")
        except Exception:
            # Fall back to legacy-journal if data-editor-viewer is not found
            print("DEBUG: Falling back to legacy-journal.")
            story_container = driver.find_element(By.XPATH, "//div[contains(@class, 'legacy-journal')]")

        # Extract the story content
        content = story_container.text.strip()
        title = title_element.text.strip()
        print(f"DEBUG: Extracted content length: {len(content)}")  # Debugging log

        if not title or not content:
            print(f"No content found on {story_url}")
            return

        file_name = re.sub(r"[^a-zA-Z0-9_]+", "_", title) + ".txt"
        file_path = os.path.join(output_dir, file_name)

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)

        print(f"Saved: {file_path}")
    except Exception as e:
        print(f"Failed to scrape content from {story_url}: {e}")
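

# Minimal usage sketch (assumption: the query string and page count below are
# illustrative placeholders; the Brave and ChromeDriver paths configured above
# must point to valid installations on your machine).
if __name__ == "__main__":
    # Scrape the first two pages of search results for an example query.
    scrape_deviantart("fantasy short story", max_pages=2)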