Files
PlantGuideScraper/backend/app/scrapers/gbif.py
2026-04-12 09:54:27 -05:00

160 lines
6.2 KiB
Python

import hashlib
import logging
import time
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.models import ApiKey, Image, Species
from app.scrapers.base import BaseScraper
from app.workers.quality_tasks import download_and_process_image
class GBIFScraper(BaseScraper):
    """Scraper for GBIF (Global Biodiversity Information Facility) images.

    GBIF's occurrence-search API is open (no API key required); a configured
    per-second rate limit is still honored when an ApiKey row exists.
    """

    name = "gbif"
    requires_api_key = False  # GBIF is free to use

    BASE_URL = "https://api.gbif.org/v1"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Map GBIF license URLs to short codes. GBIF responses mix http/https
    # and with/without the trailing "legalcode" segment, so all variants
    # are listed explicitly.
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "http://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "https://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
    }

    # Only allow commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY"}

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Scrape images from GBIF for a species.

        Queries the occurrence-search endpoint for still images of
        ``species.scientific_name``, filters them to commercial-safe
        licenses, stores new ``Image`` rows with status "pending", and
        queues each for download via Celery.

        Args:
            species: Species row whose ``scientific_name`` is searched.
            db: Active SQLAlchemy session used for dedupe and inserts.
            logger: Optional logger; falls back to the module logger.

        Returns:
            Dict with "downloaded" (images queued) and "rejected"
            (media entries skipped for missing URL or bad license).
        """
        log = logger or logging.getLogger(__name__)

        # GBIF doesn't require an API key, but we still respect rate limits.
        api_key = self.get_api_key(db)
        # Guard against a configured rate of 0, which would divide by zero
        # in the sleep below.
        rate_limit = (api_key.rate_limit_per_sec if api_key else 1.0) or 1.0

        downloaded = 0
        rejected = 0
        try:
            params = {
                "scientificName": species.scientific_name,
                "mediaType": "StillImage",
                "limit": 100,
            }
            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/occurrence/search",
                    params=params,
                )
                response.raise_for_status()
                data = response.json()

            for occurrence in data.get("results", []):
                for media in occurrence.get("media", []):
                    # Only process still images
                    if media.get("type") != "StillImage":
                        continue

                    url = media.get("identifier")
                    if not url:
                        rejected += 1
                        continue

                    # Reject unknown or non-commercial licenses. A missing
                    # mapping yields None, which is never in the allowed set.
                    license_code = self.LICENSE_MAP.get(media.get("license", ""))
                    if license_code not in self.ALLOWED_LICENSES:
                        rejected += 1
                        continue

                    # Create unique source ID from occurrence key and media URL.
                    # BUG FIX: the previous str(hash(url)) is salted per
                    # process (PYTHONHASHSEED), so source_id changed between
                    # runs and the dedupe query below never matched across
                    # restarts. Use a stable content hash instead.
                    occurrence_key = occurrence.get("key", "")
                    url_hash = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
                    source_id = f"{occurrence_key}_{url_hash}"

                    # Skip media we have already recorded.
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Build attribution string from whatever credit fields
                    # the media entry provides.
                    creator = media.get("creator", "")
                    rights_holder = media.get("rightsHolder", "")
                    attribution_parts = []
                    if creator:
                        attribution_parts.append(f"Photo by {creator}")
                    if rights_holder and rights_holder != creator:
                        attribution_parts.append(f"Rights: {rights_holder}")
                    # Always present, so attribution_parts is never empty
                    # (the old "else" fallback was unreachable).
                    attribution_parts.append(f"via GBIF ({license_code})")
                    attribution = " | ".join(attribution_parts)

                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code,
                        attribution=attribution,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue the actual download/processing for a worker.
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                    # Rate limiting between queued images.
                    time.sleep(1.0 / rate_limit)
        except Exception as e:
            # BUG FIX: errors previously went to stdout via print() and the
            # `logger` parameter was ignored; log with traceback instead.
            log.exception(
                "Error scraping GBIF for %s: %s", species.scientific_name, e
            )
        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test GBIF API connection.

        GBIF doesn't require authentication, so this just hits the search
        endpoint with limit=1 and reports the total occurrence count.

        Raises:
            httpx.HTTPStatusError: If the endpoint returns an error status.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/occurrence/search",
                params={"limit": 1},
            )
            response.raise_for_status()
            data = response.json()
        count = data.get("count", 0)
        return f"GBIF API connection successful ({count:,} total occurrences available)"