Files
PlantGuideScraper/backend/app/scrapers/inaturalist.py
2026-04-12 09:54:27 -05:00

145 lines
5.3 KiB
Python

import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class INaturalistScraper(BaseScraper):
    """Scraper for iNaturalist observations via their public API.

    Fetches research-grade observation photos carrying commercial-safe
    licenses (CC0 / CC-BY), records each as a pending ``Image`` row, and
    queues it for asynchronous download/processing via Celery.
    """

    name = "inaturalist"
    requires_api_key = False  # Public API, but rate limited
    BASE_URL = "https://api.inaturalist.org/v1"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }
    # Commercial-safe licenses (CC0, CC-BY)
    ALLOWED_LICENSES = ["cc0", "cc-by"]

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from iNaturalist for a species.

        Args:
            species: Species row whose ``scientific_name`` is searched.
            db: Active SQLAlchemy session used to dedupe and persist images.
            logger: Optional logger; when omitted nothing is logged.

        Returns:
            Dict with ``downloaded`` (photos queued for download) and
            ``rejected`` (photos skipped for a disallowed license) counts.
            Errors are logged, never raised.
        """
        api_key = self.get_api_key(db)
        # Guard against a zero/None configured rate: 1.0 / rate_limit below
        # would raise ZeroDivisionError. Fall back to 1 per second.
        rate_limit = (
            api_key.rate_limit_per_sec
            if api_key and api_key.rate_limit_per_sec
            else 1.0
        )
        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            # Dispatch by level name, but only when a logger was supplied.
            if logger:
                getattr(logger, level)(msg)

        try:
            # Search for observations of this species
            params = {
                "taxon_name": species.scientific_name,
                "quality_grade": "research",  # Only research-grade
                "photos": True,
                "per_page": 200,
                "order_by": "votes",
                "license": ",".join(self.ALLOWED_LICENSES),
            }
            log("debug", f" API request params: {params}")
            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/observations",
                    params=params,
                )
                log("debug", f" API response status: {response.status_code}")
                response.raise_for_status()
                data = response.json()

            observations = data.get("results", [])
            total_results = data.get("total_results", 0)
            log("debug", f" Found {len(observations)} observations (total: {total_results})")
            if not observations:
                log("info", f" No observations found for {species.scientific_name}")
                return {"downloaded": 0, "rejected": 0}

            for obs in observations:
                for photo in obs.get("photos", []):
                    # "license_code" may be absent OR present-but-None;
                    # the `or ""` idiom covers both with a single lookup.
                    license_code = (photo.get("license_code") or "").lower()
                    if license_code not in self.ALLOWED_LICENSES:
                        log("debug", f" Rejected photo {photo.get('id')}: license={license_code}")
                        rejected += 1
                        continue

                    # Get image URL (medium size for initial download)
                    url = photo.get("url", "")
                    if not url:
                        log("debug", f" Skipped photo {photo.get('id')}: no URL")
                        continue
                    # The API returns thumbnail ("square") URLs; swap in the
                    # larger variant for the actual download.
                    url = url.replace("square", "large")

                    # Skip photos we have already recorded for this source.
                    source_id = str(photo.get("id"))
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        log("debug", f" Skipped photo {source_id}: already exists")
                        continue

                    # Persist a pending record, then hand the real download
                    # and processing off to the Celery worker.
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code.upper(),
                        # "attribution" can be present with a None value;
                        # coerce to "" so the column stays a string.
                        attribution=photo.get("attribution") or "",
                        status="pending",
                    )
                    db.add(image)
                    db.commit()
                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1
                    log("debug", f" Queued photo {source_id} for download")

                    # Throttle per queued photo to stay under the source's
                    # configured rate limit.
                    time.sleep(1.0 / rate_limit)
        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code} - {e.response.text}")
        except httpx.RequestError as e:
            log("error", f" Request error for {species.scientific_name}: {e}")
        except Exception as e:
            # A failed flush/commit leaves the session in an unusable state;
            # roll back so later species on this session still work.
            db.rollback()
            log("error", f" Error scraping iNaturalist for {species.scientific_name}: {e}")
        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test iNaturalist API connection.

        Args:
            api_key: Unused here (public API); kept for interface parity
                with scrapers that require authentication.

        Returns:
            A success message. Raises ``httpx.HTTPStatusError`` on a bad
            response and ``httpx.RequestError`` on transport failure.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/observations",
                params={"per_page": 1},
            )
            response.raise_for_status()
        return "iNaturalist API connection successful"