import hashlib
import logging
import time
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image


class EOLScraper(BaseScraper):
    """Scraper for Encyclopedia of Life (EOL) images.

    EOL aggregates biodiversity data from many sources and provides
    a free API with no authentication required.
    """

    name = "eol"
    requires_api_key = False

    BASE_URL = "https://eol.org/api"

    # Requests per second used when no API key row (or no usable rate) exists.
    DEFAULT_RATE_LIMIT = 0.5

    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # Map EOL license URLs to short codes
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "pd": "CC0",  # Public domain
        "public domain": "CC0",
    }

    # Commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA"}

    def _match_license(self, license_url: str) -> Optional[str]:
        """Return the short code of the first LICENSE_MAP key found in *license_url*.

        Matching is by substring against the already lower-cased URL; returns
        ``None`` when no key matches.
        """
        for pattern, code in self.LICENSE_MAP.items():
            if pattern in license_url:
                return code
        return None

    @staticmethod
    def _make_source_id(obj: dict, image_url: str) -> str:
        """Return an identifier for a media object, used for de-duplication.

        Prefers ``dataObjectVersionID``, then ``identifier``. The last resort
        is a SHA-1 digest of the image URL: the built-in ``hash()`` of a str is
        salted per process, so it would yield a different ID on every run and
        the duplicate check would never match.
        """
        fallback = hashlib.sha1(image_url.encode("utf-8")).hexdigest()
        return str(obj.get("dataObjectVersionID") or obj.get("identifier") or fallback)

    @staticmethod
    def _build_attribution(agents, license_code: str) -> str:
        """Build the attribution string from a media object's ``agents`` list.

        The last agent seen with role "photographer" and the last with role
        "owner" / "rights holder" are used; the rights holder is omitted when
        it equals the photographer. Always ends with ``via EOL (<license>)``.
        """
        photographer = None
        rights_holder = None
        for agent in agents or []:
            # Values may be null in the JSON, so normalise before .lower().
            role = (agent.get("role") or "").lower()
            agent_name = agent.get("full_name", "")
            if role == "photographer":
                photographer = agent_name
            elif role in ("owner", "rights holder"):
                rights_holder = agent_name

        parts = []
        if photographer:
            parts.append(f"Photo by {photographer}")
        if rights_holder and rights_holder != photographer:
            parts.append(f"Rights: {rights_holder}")
        parts.append(f"via EOL ({license_code})")
        return " | ".join(parts)

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Scrape images from EOL for a species.

        Searches EOL for ``species.scientific_name``, fetches the first
        result's page with up to 75 images, and for every image with an
        allowed license that is not already stored, inserts a pending
        ``Image`` row and queues ``download_and_process_image``.

        Args:
            species: Species whose ``scientific_name`` is searched and whose
                ``id`` is stored on each new image row.
            db: Session used for the duplicate check and for committing rows.
            logger: Optional logger; when omitted nothing is logged.

        Returns:
            ``{"downloaded": n, "rejected": m}`` where ``downloaded`` counts
            queued images and ``rejected`` counts images skipped for a missing
            URL or a disallowed/unknown license. HTTP and other errors are
            logged and the counts reached so far are returned.
        """
        api_key = self.get_api_key(db)
        rate_limit = api_key.rate_limit_per_sec if api_key else None
        # A missing, zero or negative rate would make the sleep computation
        # below raise (division by zero / bad operand), so use the default.
        if not rate_limit or rate_limit <= 0:
            rate_limit = self.DEFAULT_RATE_LIMIT
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            if logger:
                getattr(logger, level)(msg)

        try:
            # Disable SSL verification - EOL is a trusted source and some Docker
            # environments lack proper CA certificates
            # NOTE(review): verify=False also disables protection against
            # man-in-the-middle tampering; prefer installing CA certificates.
            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
                # Step 1: Search for the species
                search_response = client.get(
                    f"{self.BASE_URL}/search/1.0.json",
                    params={
                        "q": species.scientific_name,
                        "page": 1,
                        "exact": "true",
                    },
                )
                search_response.raise_for_status()
                search_data = search_response.json()

                results = search_data.get("results", [])
                if not results:
                    log("info", f" Species not found in EOL: {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}

                # Get the EOL page ID
                eol_page_id = results[0].get("id")
                if not eol_page_id:
                    return {"downloaded": 0, "rejected": 0}

                time.sleep(delay)

                # Step 2: Get page details with images
                page_response = client.get(
                    f"{self.BASE_URL}/pages/1.0/{eol_page_id}.json",
                    params={
                        "images_per_page": 75,
                        "images_page": 1,
                        "videos_per_page": 0,
                        "sounds_per_page": 0,
                        "maps_per_page": 0,
                        "texts_per_page": 0,
                        "details": "true",
                        # cc-by-nc is requested, but NC items match no
                        # LICENSE_MAP entry and are counted as rejected below.
                        "licenses": "cc-by|cc-by-sa|pd|cc-by-nc",
                    },
                )
                page_response.raise_for_status()
                page_data = page_response.json()

                data_objects = page_data.get("dataObjects") or []
                log("debug", f" Found {len(data_objects)} media objects")

                for obj in data_objects:
                    # Only process images ("stillimage" also contains "image")
                    media_type = (obj.get("dataType") or "").lower()
                    if "image" not in media_type:
                        continue

                    # Get image URL
                    image_url = obj.get("eolMediaURL") or obj.get("mediaURL")
                    if not image_url:
                        rejected += 1
                        continue

                    # Check license; a null license is treated as an empty string
                    license_url = (obj.get("license") or "").lower()
                    license_code = self._match_license(license_url)

                    if not license_code:
                        # Check for NC licenses which we reject
                        if "-nc" in license_url:
                            rejected += 1
                            continue
                        # Unknown license, skip
                        log("debug", f" Rejected: unknown license {license_url}")
                        rejected += 1
                        continue

                    if license_code not in self.ALLOWED_LICENSES:
                        rejected += 1
                        continue

                    # Create unique source ID
                    source_id = self._make_source_id(obj, image_url)

                    # Check if already exists
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Build attribution
                    attribution = self._build_attribution(obj.get("agents"), license_code)

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=image_url,
                        license=license_code,
                        attribution=attribution,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                    time.sleep(delay)

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            # Best-effort boundary: report the failure and return partial counts.
            log("error", f" Error scraping EOL for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test EOL API connection.

        Runs a search for "Rosa" and returns a success message containing the
        reported ``totalResults``. ``api_key`` is accepted but not used.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
            response = client.get(
                f"{self.BASE_URL}/search/1.0.json",
                params={"q": "Rosa", "page": 1},
            )
            response.raise_for_status()
            data = response.json()
            total = data.get("totalResults", 0)
            return f"EOL API connection successful ({total} results for 'Rosa')"