Removed an extra copy of the scraper, cleaned up the .gitignore

2024-12-25 22:56:52 -05:00 · 2024-12-25 22:56:52 -05:00 · 7880e3b55a
commit 7880e3b55a
parent 78e67b4321
2 changed files with 2 additions and 152 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -160,4 +160,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

-/data
+/data
+/logs/scraped_urls.txt
--- a/deviantart_scraper.py
+++ b/deviantart_scraper.py
@@ -1,151 +0,0 @@
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.action_chains import ActionChains
-from selenium.webdriver.common.keys import Keys
-import time
-import os
-import re
-
-def scrape_deviantart(query, max_pages):
-    """
-    Scrapes text content from DeviantArt based on a query using Selenium.
-
-    Args:
-        query (str): The search query to find relevant stories.
-        max_pages (int): The number of pages to scrape.
-
-    Returns:
-        None: Saves scraped content into text files in the `data` directory.
-    """
-    output_dir = "data"
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
-    # Configure Selenium WebDriver for Brave
-    options = Options()
-    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"  # Path to Brave browser executable
-    # options.add_argument("--headless")  # Uncomment to enable headless mode
-    options.add_argument("--disable-gpu")
-    options.add_argument("--no-sandbox")
-    service = Service("F:\\chromedriver\\chromedriver.exe")  # Path to your ChromeDriver
-    driver = webdriver.Chrome(service=service, options=options)
-
-    try:
-        base_url = "https://www.deviantart.com/search/?q="
-        driver.get(base_url + query)
-        scraped_urls = set()  # To avoid duplicates
-
-        for page in range(1, max_pages + 1):
-            print(f"Scraping page {page}...")
-            time.sleep(3)  # Allow time for dynamic content to load
-
-            # Locate all grid rows containing stories
-            grid_rows = driver.find_elements(By.XPATH, "//div[@data-testid='grid-row']")
-            print(f"DEBUG: Found {len(grid_rows)} grid rows on the page.")
-
-            if not grid_rows:
-                print("No stories found on this page.")
-                break
-
-            for row_idx, row in enumerate(grid_rows):
-                story_elements = row.find_elements(By.XPATH, ".//a[contains(@class, 'torpedo-thumb-link')] | .//a[contains(@href, '/art/')]")
-                print(f"DEBUG: Found {len(story_elements)} story elements in row {row_idx + 1}.")
-
-                for idx, link_element in enumerate(story_elements):
-                    try:
-                        story_url = link_element.get_attribute("href")
-
-                        # Exclude URLs ending with '#comments'
-                        if story_url.endswith("#comments"):
-                            print(f"Skipping comment URL: {story_url}")
-                            continue
-
-                        if story_url in scraped_urls:
-                            print(f"Skipping duplicate URL: {story_url}")
-                            continue
-
-                        # Verify the URL format to ensure it's a story link
-                        if "/art/" not in story_url:
-                            print(f"Skipping non-story URL: {story_url}")
-                            continue
-
-                        scraped_urls.add(story_url)
-                        print(f"Processing story {idx + 1} in row {row_idx + 1}: {story_url}")
-
-                        # Open the story in a new tab
-                        driver.execute_script("window.open('{}', '_blank');".format(story_url))
-                        driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab
-
-                        # Scrape the content
-                        scrape_deviation_content(driver, story_url, output_dir)
-
-                        # Close the tab and return to the main tab
-                        driver.close()
-                        driver.switch_to.window(driver.window_handles[0])
-
-                    except Exception as e:
-                        print(f"Error processing a story element: {e}")
-
-            # Navigate to the next page if applicable
-            try:
-                next_button = driver.find_element(By.XPATH, "//a[contains(@href, 'cursor=') and contains(., 'Next')]")
-                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
-                time.sleep(1)  # Allow any animations to complete
-                next_button.click()
-                time.sleep(2)  # Allow time for the next page to load
-            except Exception as e:
-                print(f"No more pages available or navigation failed: {e}")
-                break
-    except Exception as e:
-        print(f"An error occurred during scraping: {e}")
-    finally:
-        print("Browser will remain open for debugging. Close it manually when done.")
-
-
-def scrape_deviation_content(driver, story_url, output_dir):
-    """
-    Scrapes the content of a single DeviantArt story page using Selenium.
-
-    Args:
-        driver (WebDriver): Selenium WebDriver instance.
-        story_url (str): URL of the deviation page.
-        output_dir (str): Directory to save the story content.
-
-    Returns:
-        None: Saves the story content into a text file.
-    """
-    time.sleep(2)  # Allow time for the page to load
-
-    try:
-        title_element = driver.find_element(By.TAG_NAME, "h1")
-
-        # Locate the main story container by data-editor-viewer attribute
-        try:
-            story_container = driver.find_element(By.XPATH, "//div[@data-editor-viewer='1']")
-        except:
-            # Fallback to legacy-journal if data-editor-viewer is not found
-            print("DEBUG: Falling back to legacy-journal.")
-            story_container = driver.find_element(By.XPATH, "//div[contains(@class, 'legacy-journal')]")
-
-        # Extract the story content
-        content = story_container.text.strip()
-
-        title = title_element.text.strip()
-
-        print(f"DEBUG: Extracted content length: {len(content)}")  # Debugging log
-
-        if not title or not content:
-            print(f"No content found on {story_url}")
-            return
-
-        file_name = re.sub(r"[^a-zA-Z0-9_]+", "_", title) + ".txt"
-        file_path = os.path.join(output_dir, file_name)
-
-        with open(file_path, "w", encoding="utf-8") as f:
-            f.write(content)
-        print(f"Saved: {file_path}")
-
-    except Exception as e:
-        print(f"Failed to scrape content from {story_url}: {e}")