import logging
import time
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image


class INaturalistScraper(BaseScraper):
    """Scraper for iNaturalist observations via their API."""

    name = "inaturalist"
    requires_api_key = False  # Public API, but rate limited

    BASE_URL = "https://api.inaturalist.org/v1"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Commercial-safe licenses (CC0, CC-BY)
    ALLOWED_LICENSES = ["cc0", "cc-by"]

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Scrape images from iNaturalist for a species.

        Queries the /observations endpoint for research-grade observations of
        ``species.scientific_name``, keeps only photos under commercial-safe
        licenses, inserts previously-unseen photos as pending ``Image`` rows,
        and queues each for asynchronous download/processing via Celery.

        Args:
            species: Species to search for (matched by scientific name).
            db: SQLAlchemy session used for dedup lookups and inserts.
            logger: Optional logger; when ``None`` logging is a no-op.

        Returns:
            ``{"downloaded": <photos queued>, "rejected": <license-rejected>}``.
            Errors are logged and swallowed (boundary handler), so this always
            returns the counts accumulated so far.
        """
        api_key = self.get_api_key(db)
        # Fall back to 1 req/sec when no key is configured OR the stored rate
        # is 0/None — otherwise `time.sleep(1.0 / rate_limit)` below would
        # raise ZeroDivisionError/TypeError mid-scrape.
        rate_limit = 1.0
        if api_key and api_key.rate_limit_per_sec:
            rate_limit = api_key.rate_limit_per_sec

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str) -> None:
            # No-op unless a logger was supplied.
            if logger:
                getattr(logger, level)(msg)

        try:
            # Search for observations of this species
            params = {
                "taxon_name": species.scientific_name,
                "quality_grade": "research",  # Only research-grade
                "photos": True,
                "per_page": 200,
                "order_by": "votes",
                "license": ",".join(self.ALLOWED_LICENSES),
            }
            log("debug", f" API request params: {params}")

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/observations",
                    params=params,
                )
                log("debug", f" API response status: {response.status_code}")
                response.raise_for_status()
                data = response.json()

            observations = data.get("results", [])
            total_results = data.get("total_results", 0)
            log("debug", f" Found {len(observations)} observations (total: {total_results})")

            if not observations:
                log("info", f" No observations found for {species.scientific_name}")
                return {"downloaded": 0, "rejected": 0}

            for obs in observations:
                for photo in obs.get("photos", []):
                    # license_code may be missing OR an explicit JSON null;
                    # `or ""` normalizes both before lower-casing.
                    license_code = (photo.get("license_code") or "").lower()
                    if license_code not in self.ALLOWED_LICENSES:
                        log("debug", f" Rejected photo {photo.get('id')}: license={license_code}")
                        rejected += 1
                        continue

                    # Get image URL (medium size for initial download)
                    url = photo.get("url", "")
                    if not url:
                        log("debug", f" Skipped photo {photo.get('id')}: no URL")
                        continue
                    # Convert to larger size (iNaturalist encodes size as a
                    # path segment in the photo URL)
                    url = url.replace("square", "large")

                    # Skip photos already recorded for this source
                    source_id = str(photo.get("id"))
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        log("debug", f" Skipped photo {source_id}: already exists")
                        continue

                    # Create image record; commit immediately so image.id is
                    # populated before it is handed to the Celery task.
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code.upper(),
                        attribution=photo.get("attribution", ""),
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1
                    log("debug", f" Queued photo {source_id} for download")

                    # Rate limiting
                    time.sleep(1.0 / rate_limit)

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code} - {e.response.text}")
        except httpx.RequestError as e:
            log("error", f" Request error for {species.scientific_name}: {e}")
        except Exception as e:
            # Boundary handler: log anything unexpected and return partial counts.
            log("error", f" Error scraping iNaturalist for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test iNaturalist API connection.

        Issues a minimal unauthenticated request and raises
        ``httpx.HTTPStatusError`` on a non-2xx response. ``api_key`` is
        accepted for interface compatibility but unused (public API).
        """
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/observations",
                params={"per_page": 1},
            )
            response.raise_for_status()
        return "iNaturalist API connection successful"