145 lines
5.3 KiB
Python
145 lines
5.3 KiB
Python
import time
|
|
import logging
|
|
from typing import Dict, Optional
|
|
|
|
import httpx
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.scrapers.base import BaseScraper
|
|
from app.models import Species, Image, ApiKey
|
|
from app.workers.quality_tasks import download_and_process_image
|
|
|
|
|
|
class INaturalistScraper(BaseScraper):
    """Scraper for iNaturalist observations via their API.

    Queries the ``/observations`` endpoint for research-grade observations of
    a species, keeps only photos whose license is listed in
    ``ALLOWED_LICENSES``, stores one ``Image`` row per previously unseen photo
    and queues that row for download/processing.
    """

    name = "inaturalist"
    requires_api_key = False  # Public API, but rate limited

    BASE_URL = "https://api.inaturalist.org/v1"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Commercial-safe licenses (CC0, CC-BY)
    ALLOWED_LICENSES = ["cc0", "cc-by"]

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from iNaturalist for a species.

        Issues a single request (up to 200 observations, ordered by votes)
        and walks every photo of every returned observation.

        Args:
            species: Species whose ``scientific_name`` is searched for and
                whose ``id`` is stored on each created ``Image``.
            db: Session used to look up existing images and to commit each
                new ``Image`` individually.
            logger: Optional logger; when omitted nothing is logged.

        Returns:
            ``{"downloaded": n, "rejected": m}`` where ``n`` is the number of
            photos queued for download and ``m`` the number of photos skipped
            because their license is not in ``ALLOWED_LICENSES``.

        HTTP, network and any other errors raised while scraping are logged
        (if a logger was given) and swallowed; the counts accumulated up to
        that point are returned.
        """
        api_key = self.get_api_key(db)
        rate_limit = api_key.rate_limit_per_sec if api_key else 1.0

        # A missing, zero, negative or non-numeric configured rate would make
        # the per-photo sleep raise and silently abort the scrape after the
        # first queued photo, so fall back to one request per second.
        try:
            rate = float(rate_limit)
        except (TypeError, ValueError):
            rate = 0.0
        delay = 1.0 / rate if rate > 0 else 1.0

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str) -> None:
            # Logging is best-effort: no logger means no output.
            if logger:
                getattr(logger, level)(msg)

        try:
            # Search for observations of this species
            # NOTE(review): "license" appears to filter on the observation's
            # own license, while the loop below checks each photo's license;
            # the API also documents "photo_license" - confirm which is meant.
            params = {
                "taxon_name": species.scientific_name,
                "quality_grade": "research",  # Only research-grade
                "photos": True,
                "per_page": 200,
                "order_by": "votes",
                "license": ",".join(self.ALLOWED_LICENSES),
            }

            log("debug", f" API request params: {params}")

            # The client is only needed for the single search request, so it
            # is closed before the (slow, rate-limited) processing loop.
            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/observations",
                    params=params,
                )
                log("debug", f" API response status: {response.status_code}")
                response.raise_for_status()
                data = response.json()

            # "or []" also covers a key that is present but null.
            observations = data.get("results") or []
            total_results = data.get("total_results", 0)
            log("debug", f" Found {len(observations)} observations (total: {total_results})")

            if not observations:
                log("info", f" No observations found for {species.scientific_name}")
                return {"downloaded": 0, "rejected": 0}

            for obs in observations:
                for photo in obs.get("photos") or []:
                    # Check license (a missing/null code counts as not allowed)
                    license_code = (photo.get("license_code") or "").lower()
                    if license_code not in self.ALLOWED_LICENSES:
                        log("debug", f" Rejected photo {photo.get('id')}: license={license_code}")
                        rejected += 1
                        continue

                    url = photo.get("url", "")
                    if not url:
                        log("debug", f" Skipped photo {photo.get('id')}: no URL")
                        continue

                    # Without an id the photo cannot be de-duplicated: it
                    # would be stored under the literal source_id "None".
                    photo_id = photo.get("id")
                    if photo_id is None:
                        log("debug", " Skipped photo: no id")
                        continue

                    # Swap the "square" size token in the URL for "large"
                    url = url.replace("square", "large")

                    # Check if already exists
                    source_id = str(photo_id)
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()

                    if existing:
                        log("debug", f" Skipped photo {source_id}: already exists")
                        continue

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code.upper(),
                        attribution=photo.get("attribution", ""),
                        status="pending",
                    )
                    db.add(image)
                    try:
                        db.commit()
                    except Exception:
                        # Leave the caller's session usable after a failed
                        # commit; the error is still reported by the outer
                        # handler below.
                        db.rollback()
                        raise

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1
                    log("debug", f" Queued photo {source_id} for download")

                    # Rate limiting
                    time.sleep(delay)

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code} - {e.response.text}")
        except httpx.RequestError as e:
            log("error", f" Request error for {species.scientific_name}: {e}")
        except Exception as e:
            log("error", f" Error scraping iNaturalist for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test iNaturalist API connection.

        Requests a single observation from the API. ``api_key`` is accepted
        but not used by this method.

        Returns:
            A fixed success message when the request succeeds.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
            httpx.RequestError: If the request itself fails (not caught here).
        """
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/observations",
                params={"per_page": 1},
            )
            response.raise_for_status()

        return "iNaturalist API connection successful"
|