diff --git a/.gitignore b/.gitignore
index c2ea699..e049312 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,4 +160,5 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-/data
\ No newline at end of file
+/data
+/logs/scraped_urls.txt
\ No newline at end of file
diff --git a/deviantart_scraper.py b/deviantart_scraper.py
deleted file mode 100644
index d296f70..0000000
--- a/deviantart_scraper.py
+++ /dev/null
@@ -1,151 +0,0 @@
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.action_chains import ActionChains
-from selenium.webdriver.common.keys import Keys
-import time
-import os
-import re
-
-def scrape_deviantart(query, max_pages):
-    """
-    Scrapes text content from DeviantArt based on a query using Selenium.
-
-    Args:
-        query (str): The search query to find relevant stories.
-        max_pages (int): The number of pages to scrape.
-
-    Returns:
-        None: Saves scraped content into text files in the `data` directory.
-    """
-    output_dir = "data"
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
-    # Configure Selenium WebDriver for Brave
-    options = Options()
-    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"  # Path to Brave browser executable
-    # options.add_argument("--headless")  # Uncomment to enable headless mode
-    options.add_argument("--disable-gpu")
-    options.add_argument("--no-sandbox")
-    service = Service("F:\\chromedriver\\chromedriver.exe")  # Path to your ChromeDriver
-    driver = webdriver.Chrome(service=service, options=options)
-
-    try:
-        base_url = "https://www.deviantart.com/search/?q="
-        driver.get(base_url + query)
-        scraped_urls = set()  # To avoid duplicates
-
-        for page in range(1, max_pages + 1):
-            print(f"Scraping page {page}...")
-            time.sleep(3)  # Allow time for dynamic content to load
-
-            # Locate all grid rows containing stories
-            grid_rows = driver.find_elements(By.XPATH, "//div[@data-testid='grid-row']")
-            print(f"DEBUG: Found {len(grid_rows)} grid rows on the page.")
-
-            if not grid_rows:
-                print("No stories found on this page.")
-                break
-
-            for row_idx, row in enumerate(grid_rows):
-                story_elements = row.find_elements(By.XPATH, ".//a[contains(@class, 'torpedo-thumb-link')] | .//a[contains(@href, '/art/')]")
-                print(f"DEBUG: Found {len(story_elements)} story elements in row {row_idx + 1}.")
-
-                for idx, link_element in enumerate(story_elements):
-                    try:
-                        story_url = link_element.get_attribute("href")
-
-                        # Exclude URLs ending with '#comments'
-                        if story_url.endswith("#comments"):
-                            print(f"Skipping comment URL: {story_url}")
-                            continue
-
-                        if story_url in scraped_urls:
-                            print(f"Skipping duplicate URL: {story_url}")
-                            continue
-
-                        # Verify the URL format to ensure it's a story link
-                        if "/art/" not in story_url:
-                            print(f"Skipping non-story URL: {story_url}")
-                            continue
-
-                        scraped_urls.add(story_url)
-                        print(f"Processing story {idx + 1} in row {row_idx + 1}: {story_url}")
-
-                        # Open the story in a new tab
-                        driver.execute_script("window.open('{}', '_blank');".format(story_url))
-                        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab
-
-                        # Scrape the content
-                        scrape_deviation_content(driver, story_url, output_dir)
-
-                        # Close the tab and return to the main tab
-                        driver.close()
-                        driver.switch_to.window(driver.window_handles[0])
-
-                    except Exception as e:
-                        print(f"Error processing a story element: {e}")
-
-            # Navigate to the next page if applicable
-            try:
-                next_button = driver.find_element(By.XPATH, "//a[contains(@href, 'cursor=') and contains(., 'Next')]")
-                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
-                time.sleep(1)  # Allow any animations to complete
-                next_button.click()
-                time.sleep(2)  # Allow time for the next page to load
-            except Exception as e:
-                print(f"No more pages available or navigation failed: {e}")
-                break
-    except Exception as e:
-        print(f"An error occurred during scraping: {e}")
-    finally:
-        print("Browser will remain open for debugging. Close it manually when done.")
-
-
-def scrape_deviation_content(driver, story_url, output_dir):
-    """
-    Scrapes the content of a single DeviantArt story page using Selenium.
-
-    Args:
-        driver (WebDriver): Selenium WebDriver instance.
-        story_url (str): URL of the deviation page.
-        output_dir (str): Directory to save the story content.
-
-    Returns:
-        None: Saves the story content into a text file.
-    """
-    time.sleep(2)  # Allow time for the page to load
-
-    try:
-        title_element = driver.find_element(By.TAG_NAME, "h1")
-
-        # Locate the main story container by data-editor-viewer attribute
-        try:
-            story_container = driver.find_element(By.XPATH, "//div[@data-editor-viewer='1']")
-        except:
-            # Fallback to legacy-journal if data-editor-viewer is not found
-            print("DEBUG: Falling back to legacy-journal.")
-            story_container = driver.find_element(By.XPATH, "//div[contains(@class, 'legacy-journal')]")
-
-        # Extract the story content
-        content = story_container.text.strip()
-
-        title = title_element.text.strip()
-
-        print(f"DEBUG: Extracted content length: {len(content)}")  # Debugging log
-
-        if not title or not content:
-            print(f"No content found on {story_url}")
-            return
-
-        file_name = re.sub(r"[^a-zA-Z0-9_]+", "_", title) + ".txt"
-        file_path = os.path.join(output_dir, file_name)
-
-        with open(file_path, "w", encoding="utf-8") as f:
-            f.write(content)
-        print(f"Saved: {file_path}")
-
-    except Exception as e:
-        print(f"Failed to scrape content from {story_url}: {e}")