Files
2026-04-12 09:54:27 -05:00

102 lines
3.4 KiB
Python

import hashlib
import time
import logging
from typing import Dict, Optional
from duckduckgo_search import DDGS
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class DuckDuckGoScraper(BaseScraper):
    """Scraper for DuckDuckGo image search. No API key required."""

    name = "duckduckgo"
    requires_api_key = False

    # Exclusion operators appended to every query to steer results away from
    # herbarium sheets, drawings and other non-photographic imagery.
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    # Searches per second used when no usable rate limit is configured.
    DEFAULT_RATE_LIMIT_PER_SEC = 0.5

    def _build_queries(self, species: Species) -> list[str]:
        """Return the search queries to run for *species*.

        The scientific name is always queried; a second, houseplant-oriented
        query is added when the species has a common name.
        """
        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
        if species.common_name:
            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
        return queries

    def _search_interval(self, api_key: Optional[ApiKey]) -> float:
        """Return the pause, in seconds, to take after each search request.

        Falls back to ``DEFAULT_RATE_LIMIT_PER_SEC`` when there is no key
        record or its ``rate_limit_per_sec`` is unset, zero or negative, so
        the division below can never fail.
        """
        rate = api_key.rate_limit_per_sec if api_key else None
        if not rate or rate <= 0:
            rate = self.DEFAULT_RATE_LIMIT_PER_SEC
        return 1.0 / rate

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search DuckDuckGo for photos of *species* and queue new ones.

        Each previously unseen image URL is stored as a ``pending`` ``Image``
        row and handed to ``download_and_process_image`` for processing.

        Args:
            species: Species whose scientific/common names drive the queries.
            db: Session used to look up and persist ``Image`` rows.
            logger: Optional logger for errors; when omitted, errors are
                printed instead.

        Returns:
            A dict with the keys ``"downloaded"`` (number of images queued)
            and ``"rejected"`` (always 0 here; nothing is rejected at this
            stage).

        Errors raised while searching or saving are logged rather than
        propagated; the counts gathered up to that point are returned.
        """
        api_key = self.get_api_key(db)
        pause = self._search_interval(api_key)

        downloaded = 0
        rejected = 0
        seen_urls = set()

        try:
            queries = self._build_queries(species)

            with DDGS() as ddgs:
                for query in queries:
                    results = ddgs.images(
                        keywords=query,
                        type_image="photo",
                        max_results=50,
                    )

                    for result in results:
                        url = result.get("image")
                        # Both queries can return the same picture; handle
                        # each URL once per run.
                        if not url or url in seen_urls:
                            continue
                        seen_urls.add(url)

                        # Short, stable identifier derived from the URL.
                        source_id = hashlib.md5(url.encode()).hexdigest()[:16]

                        # Skip images already stored by an earlier run.
                        existing = db.query(Image).filter(
                            Image.source == self.name,
                            Image.source_id == source_id,
                        ).first()
                        if existing:
                            continue

                        title = result.get("title", "")
                        attribution = f"{title} via DuckDuckGo" if title else "via DuckDuckGo"

                        image = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=source_id,
                            url=url,
                            license="UNKNOWN",
                            attribution=attribution,
                            status="pending",
                        )
                        db.add(image)
                        # Commit before dispatching so the row (and its id)
                        # exists when the task is queued.
                        db.commit()

                        download_and_process_image.delay(image.id)
                        downloaded += 1

                    # Throttle before issuing the next search request.
                    time.sleep(pause)
        except Exception as e:
            # A failed flush/commit leaves the session unusable until it is
            # rolled back; reset it so the caller can keep using ``db``.
            db.rollback()
            if logger:
                logger.error(f"Error scraping DuckDuckGo for {species.scientific_name}: {e}")
            else:
                print(f"Error scraping DuckDuckGo for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Run a one-result sample search and return a status message.

        ``api_key`` is accepted but not used by this method. Any exception
        raised by the search propagates to the caller.
        """
        with DDGS() as ddgs:
            results = ddgs.images(keywords="Monstera deliciosa plant", max_results=1)
            # Count while the client is still open in case results are lazy.
            count = len(list(results))
        return f"DuckDuckGo search working ({count} test result)"