From 7e407dd1c90724521094cc60ce22abe047be4fa9 Mon Sep 17 00:00:00 2001
From: Dan
Date: Thu, 19 Dec 2024 22:07:48 -0500
Subject: [PATCH] Add a DeviantArt scraper

---
 deviantart_scraper.py | 151 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 deviantart_scraper.py

diff --git a/deviantart_scraper.py b/deviantart_scraper.py
new file mode 100644
index 0000000..d296f70
--- /dev/null
+++ b/deviantart_scraper.py
@@ -0,0 +1,151 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import NoSuchElementException
+import time
+import os
+import re
+
+
+def scrape_deviantart(query, max_pages):
+    """
+    Scrapes text content from DeviantArt based on a query using Selenium.
+
+    Args:
+        query (str): The search query to find relevant stories.
+        max_pages (int): The maximum number of result pages to scrape.
+
+    Returns:
+        None: Saves scraped content into text files in the `data` directory.
+    """
+    output_dir = "data"
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # Configure the Selenium WebDriver to drive Brave via ChromeDriver
+    options = Options()
+    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"  # Path to the Brave browser executable
+    # options.add_argument("--headless")  # Uncomment to enable headless mode
+    options.add_argument("--disable-gpu")
+    options.add_argument("--no-sandbox")
+    service = Service("F:\\chromedriver\\chromedriver.exe")  # Path to your ChromeDriver
+    driver = webdriver.Chrome(service=service, options=options)
+
+    try:
+        base_url = "https://www.deviantart.com/search/?q="
+        driver.get(base_url + query)
+        scraped_urls = set()  # Track visited URLs to avoid duplicates
+
+        for page in range(1, max_pages + 1):
+            print(f"Scraping page {page}...")
+            time.sleep(3)  # Allow time for dynamic content to load
+
+            # Locate all grid rows containing stories
+            grid_rows = driver.find_elements(By.XPATH, "//div[@data-testid='grid-row']")
+            print(f"DEBUG: Found {len(grid_rows)} grid rows on the page.")
+
+            if not grid_rows:
+                print("No stories found on this page.")
+                break
+
+            for row_idx, row in enumerate(grid_rows):
+                story_elements = row.find_elements(By.XPATH, ".//a[contains(@class, 'torpedo-thumb-link')] | .//a[contains(@href, '/art/')]")
+                print(f"DEBUG: Found {len(story_elements)} story elements in row {row_idx + 1}.")
+
+                for idx, link_element in enumerate(story_elements):
+                    try:
+                        story_url = link_element.get_attribute("href")
+
+                        # get_attribute can return None when the href is missing
+                        if not story_url:
+                            continue
+
+                        # Exclude URLs ending with '#comments'
+                        if story_url.endswith("#comments"):
+                            print(f"Skipping comment URL: {story_url}")
+                            continue
+
+                        if story_url in scraped_urls:
+                            print(f"Skipping duplicate URL: {story_url}")
+                            continue
+
+                        # Verify the URL format to ensure it's a story link
+                        if "/art/" not in story_url:
+                            print(f"Skipping non-story URL: {story_url}")
+                            continue
+
+                        scraped_urls.add(story_url)
+                        print(f"Processing story {idx + 1} in row {row_idx + 1}: {story_url}")
+
+                        # Open the story in a new tab; passing the URL as a script
+                        # argument avoids quoting problems with string formatting
+                        driver.execute_script("window.open(arguments[0], '_blank');", story_url)
+                        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab
+
+                        # Scrape the content
+                        scrape_deviation_content(driver, story_url, output_dir)
+
+                        # Close the tab and return to the main tab
+                        driver.close()
+                        driver.switch_to.window(driver.window_handles[0])
+
+                    except Exception as e:
+                        print(f"Error processing a story element: {e}")
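+                        # Defensive cleanup, a suggested safeguard using the same
+                        # window-handle calls as above: if the error occurred after
+                        # the new tab opened, close any extra tabs and return to
+                        # the results tab so the next story starts from a known state.
+                        while len(driver.window_handles) > 1:
+                            driver.switch_to.window(driver.window_handles[-1])
+                            driver.close()
+                        driver.switch_to.window(driver.window_handles[0])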
element: {e}") + + # Navigate to the next page if applicable + try: + next_button = driver.find_element(By.XPATH, "//a[contains(@href, 'cursor=') and contains(., 'Next')]") + driver.execute_script("arguments[0].scrollIntoView(true);", next_button) + time.sleep(1) # Allow any animations to complete + next_button.click() + time.sleep(2) # Allow time for the next page to load + except Exception as e: + print(f"No more pages available or navigation failed: {e}") + break + except Exception as e: + print(f"An error occurred during scraping: {e}") + finally: + print("Browser will remain open for debugging. Close it manually when done.") + + +def scrape_deviation_content(driver, story_url, output_dir): + """ + Scrapes the content of a single DeviantArt story page using Selenium. + + Args: + driver (WebDriver): Selenium WebDriver instance. + story_url (str): URL of the deviation page. + output_dir (str): Directory to save the story content. + + Returns: + None: Saves the story content into a text file. + """ + time.sleep(2) # Allow time for the page to load + + try: + title_element = driver.find_element(By.TAG_NAME, "h1") + + # Locate the main story container by data-editor-viewer attribute + try: + story_container = driver.find_element(By.XPATH, "//div[@data-editor-viewer='1']") + except: + # Fallback to legacy-journal if data-editor-viewer is not found + print("DEBUG: Falling back to legacy-journal.") + story_container = driver.find_element(By.XPATH, "//div[contains(@class, 'legacy-journal')]") + + # Extract the story content + content = story_container.text.strip() + + title = title_element.text.strip() + + print(f"DEBUG: Extracted content length: {len(content)}") # Debugging log + + if not title or not content: + print(f"No content found on {story_url}") + return + + file_name = re.sub(r"[^a-zA-Z0-9_]+", "_", title) + ".txt" + file_path = os.path.join(output_dir, file_name) + + with open(file_path, "w", encoding="utf-8") as f: + f.write(content) + print(f"Saved: {file_path}") + + except Exception as e: + print(f"Failed to scrape content from {story_url}: {e}")