import hashlib
import logging
import time
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image


class GBIFScraper(BaseScraper):
    """Scraper for GBIF (Global Biodiversity Information Facility) images.

    Queries the public GBIF occurrence-search API for still images of a
    species, filters them to commercial-safe licenses, records them as
    ``Image`` rows, and queues each new image for asynchronous download
    and processing.
    """

    name = "gbif"
    requires_api_key = False  # GBIF is free to use

    BASE_URL = "https://api.gbif.org/v1"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Map GBIF license URLs to short codes.  GBIF publishers report licenses
    # with both http/https schemes and with/without the "legalcode" suffix,
    # so all known variants are enumerated explicitly.
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "http://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "https://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
    }

    # Only allow commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY"}

    def scrape_species(
        self, species: Species, db: Session, logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from GBIF for a species.

        Args:
            species: Species row whose ``scientific_name`` is searched.
            db: Active SQLAlchemy session used for dedup checks and inserts.
            logger: Optional logger for error reporting; falls back to the
                module logger when not provided.

        Returns:
            Dict with ``downloaded`` (new images queued) and ``rejected``
            (media skipped for missing URL or disallowed license) counts.
        """
        log = logger or logging.getLogger(__name__)

        # GBIF doesn't require API key, but we still respect rate limits
        api_key = self.get_api_key(db)
        rate_limit = api_key.rate_limit_per_sec if api_key else 1.0

        downloaded = 0
        rejected = 0

        try:
            params = {
                "scientificName": species.scientific_name,
                "mediaType": "StillImage",
                "limit": 100,
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/occurrence/search",
                    params=params,
                )
                response.raise_for_status()
                data = response.json()

            results = data.get("results", [])

            for occurrence in results:
                media_list = occurrence.get("media", [])

                for media in media_list:
                    # Only process still images
                    if media.get("type") != "StillImage":
                        continue

                    url = media.get("identifier")
                    if not url:
                        rejected += 1
                        continue

                    # Check license
                    license_url = media.get("license", "")
                    license_code = self.LICENSE_MAP.get(license_url)

                    if not license_code or license_code not in self.ALLOWED_LICENSES:
                        rejected += 1
                        continue

                    # Create unique source ID from occurrence key and media URL.
                    # NOTE: use a content hash (sha1) rather than builtin hash(),
                    # which is randomized per process and would break the
                    # cross-run duplicate check below.
                    occurrence_key = occurrence.get("key", "")
                    url_hash = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
                    source_id = f"{occurrence_key}_{url_hash}"

                    # Check if already exists
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Build attribution
                    creator = media.get("creator", "")
                    rights_holder = media.get("rightsHolder", "")
                    attribution_parts = []
                    if creator:
                        attribution_parts.append(f"Photo by {creator}")
                    if rights_holder and rights_holder != creator:
                        attribution_parts.append(f"Rights: {rights_holder}")
                    attribution_parts.append(f"via GBIF ({license_code})")
                    attribution = " | ".join(attribution_parts) if attribution_parts else f"GBIF ({license_code})"

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code,
                        attribution=attribution,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                    # Rate limiting
                    time.sleep(1.0 / rate_limit)

        except Exception:
            # Best-effort scrape: log and return partial counts rather than
            # propagating (callers iterate over many species).
            log.exception(
                "Error scraping GBIF for %s", species.scientific_name
            )

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test GBIF API connection.

        GBIF doesn't require authentication, so ``api_key`` is unused;
        this simply hits the search endpoint and reports the total
        occurrence count.

        Raises:
            httpx.HTTPStatusError: If the endpoint returns an error status.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/occurrence/search",
                params={"limit": 1},
            )
            response.raise_for_status()
            data = response.json()
            count = data.get("count", 0)
            return f"GBIF API connection successful ({count:,} total occurrences available)"