import hashlib
import logging
import time
from typing import Dict, Optional

from duckduckgo_search import DDGS
from sqlalchemy.orm import Session

from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image


class DuckDuckGoScraper(BaseScraper):
    """Scraper for DuckDuckGo image search. No API key required."""

    name = "duckduckgo"
    requires_api_key = False

    # Search operators appended to every query to exclude non-photo material.
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    # Rate (images per second) used when no usable rate is configured.
    DEFAULT_RATE_LIMIT_PER_SEC = 0.5

    # Upper bound passed to DDGS.images() as max_results for each query.
    MAX_RESULTS_PER_QUERY = 50

    def _build_queries(self, species: Species) -> list[str]:
        """Return the search queries to run for *species*.

        The scientific name is always queried; a second query for the common
        name is added only when the species has one. Both names are quoted and
        suffixed with ``NEGATIVE_TERMS``.
        """
        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
        if species.common_name:
            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
        return queries

    def _request_pause_seconds(self, api_key: Optional[ApiKey]) -> float:
        """Return the delay, in seconds, to wait after each queued image.

        Uses ``api_key.rate_limit_per_sec`` when it is a positive number and
        falls back to ``DEFAULT_RATE_LIMIT_PER_SEC`` otherwise, so a missing,
        zero, negative or non-numeric setting can never cause a division error.
        """
        rate = self.DEFAULT_RATE_LIMIT_PER_SEC
        if api_key:
            try:
                configured = float(api_key.rate_limit_per_sec)
            except (TypeError, ValueError):
                configured = 0.0
            # NaN compares False here and therefore also falls back.
            if configured > 0:
                rate = configured
        return 1.0 / rate

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search DuckDuckGo for photos of *species* and queue new ones.

        For every result URL not seen earlier in this call and not already
        stored for this source, an ``Image`` row with status ``"pending"`` is
        committed and its id is handed to ``download_and_process_image.delay``.

        Args:
            species: Species whose names drive the search queries.
            db: Session used for the duplicate lookup and for committing rows.
            logger: Optional logger for errors; when omitted, errors are
                printed instead.

        Returns:
            ``{"downloaded": <rows queued>, "rejected": 0}``. ``downloaded``
            counts images queued for processing, not completed downloads;
            ``rejected`` is never incremented by this scraper.

        Errors raised while searching or storing are reported and swallowed,
        so the counts gathered up to that point are still returned.
        """

        def report(message: str) -> None:
            # Same sink selection for every error path: logger if given.
            if logger:
                logger.error(message)
            else:
                print(message)

        api_key = self.get_api_key(db)
        # Loop-invariant, so compute the pause once up front.
        pause_seconds = self._request_pause_seconds(api_key)

        downloaded = 0
        rejected = 0
        seen_urls: set[str] = set()

        try:
            with DDGS() as ddgs:
                for query in self._build_queries(species):
                    results = ddgs.images(
                        keywords=query,
                        type_image="photo",
                        max_results=self.MAX_RESULTS_PER_QUERY,
                    )

                    # Tolerate a None return instead of failing on iteration.
                    for result in results or ():
                        url = result.get("image")
                        if not url or url in seen_urls:
                            continue
                        seen_urls.add(url)

                        # Short stable identifier derived from the URL.
                        source_id = hashlib.md5(url.encode()).hexdigest()[:16]

                        # Skip URLs already stored for this source.
                        existing = db.query(Image).filter(
                            Image.source == self.name,
                            Image.source_id == source_id,
                        ).first()
                        if existing:
                            continue

                        title = result.get("title", "")
                        attribution = f"{title} via DuckDuckGo" if title else "via DuckDuckGo"

                        image = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=source_id,
                            url=url,
                            license="UNKNOWN",
                            attribution=attribution,
                            status="pending",
                        )
                        db.add(image)
                        # Commit first so the row has an id to hand over.
                        db.commit()

                        download_and_process_image.delay(image.id)
                        downloaded += 1

                        time.sleep(pause_seconds)

        except Exception as e:
            report(f"Error scraping DuckDuckGo for {species.scientific_name}: {e}")
            # A failed flush/commit leaves the session unusable until it is
            # rolled back; do that here so the caller can keep using `db`.
            try:
                db.rollback()
            except Exception as rollback_error:
                report(
                    "Error rolling back session after DuckDuckGo failure for "
                    f"{species.scientific_name}: {rollback_error}"
                )

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Run a one-result image search and return a status message.

        *api_key* is accepted but not used by this method. Any error raised by
        the search propagates to the caller.
        """
        with DDGS() as ddgs:
            results = ddgs.images(keywords="Monstera deliciosa plant", max_results=1)
            count = len(list(results or ()))
        return f"DuckDuckGo search working ({count} test result)"