Initial commit — PlantGuideScraper project

This commit is contained in:
Trey T
2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions

View File

@@ -0,0 +1,144 @@
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class INaturalistScraper(BaseScraper):
    """Scraper for iNaturalist observations via their API."""

    name = "inaturalist"
    requires_api_key = False  # Public API, but rate limited

    BASE_URL = "https://api.inaturalist.org/v1"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Commercial-safe licenses (CC0, CC-BY)
    ALLOWED_LICENSES = ["cc0", "cc-by"]

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Scrape images from iNaturalist for a species.

        Queries the /observations endpoint for research-grade observations of
        ``species.scientific_name``, then records each commercially-licensed
        photo as a pending ``Image`` row and queues it for asynchronous
        download via Celery.

        Args:
            species: Species row whose ``scientific_name`` drives the search.
            db: SQLAlchemy session used for dedup lookups and inserts.
            logger: Optional logger; when absent all log calls are no-ops.

        Returns:
            Dict with ``downloaded`` (photos queued) and ``rejected``
            (photos skipped for licensing) counts. Errors are logged and
            swallowed, so partial counts may be returned.
        """
        api_key = self.get_api_key(db)
        # Fall back to 1 request/sec when no API-key row configures a rate.
        rate_limit = api_key.rate_limit_per_sec if api_key else 1.0
        downloaded = 0
        rejected = 0

        def log(level: str, msg: str) -> None:
            # Dispatch by level name ("debug"/"info"/"error") when a logger
            # was supplied; silent otherwise.
            if logger:
                getattr(logger, level)(msg)

        try:
            # Search for observations of this species
            params = {
                "taxon_name": species.scientific_name,
                "quality_grade": "research",  # Only research-grade
                "photos": True,
                "per_page": 200,
                "order_by": "votes",
                "license": ",".join(self.ALLOWED_LICENSES),
            }
            log("debug", f" API request params: {params}")

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/observations",
                    params=params,
                )
                log("debug", f" API response status: {response.status_code}")
                response.raise_for_status()
                data = response.json()

            observations = data.get("results", [])
            total_results = data.get("total_results", 0)
            log("debug", f" Found {len(observations)} observations (total: {total_results})")

            if not observations:
                log("info", f" No observations found for {species.scientific_name}")
                return {"downloaded": 0, "rejected": 0}

            for obs in observations:
                for photo in obs.get("photos", []):
                    # Check license. Although the API query already filters by
                    # license, individual photos may carry a different license
                    # than their observation, so re-check per photo.
                    license_code = (photo.get("license_code") or "").lower()
                    if license_code not in self.ALLOWED_LICENSES:
                        log("debug", f" Rejected photo {photo.get('id')}: license={license_code}")
                        rejected += 1
                        continue

                    # Get image URL (medium size for initial download)
                    url = photo.get("url", "")
                    if not url:
                        log("debug", f" Skipped photo {photo.get('id')}: no URL")
                        continue
                    # Convert to larger size
                    url = url.replace("square", "large")

                    # A missing photo id would otherwise stringify to "None"
                    # and poison the (source, source_id) dedup key.
                    photo_id = photo.get("id")
                    if photo_id is None:
                        log("debug", " Skipped photo: no id")
                        continue
                    source_id = str(photo_id)

                    # Check if already exists
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        log("debug", f" Skipped photo {source_id}: already exists")
                        continue

                    # Create image record; commit per photo so a later failure
                    # does not lose earlier inserts.
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code.upper(),
                        # API may return an explicit null attribution
                        # (common for CC0); coerce to empty string.
                        attribution=photo.get("attribution") or "",
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1
                    log("debug", f" Queued photo {source_id} for download")

                    # Rate limiting between queued downloads.
                    time.sleep(1.0 / rate_limit)
        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code} - {e.response.text}")
        except httpx.RequestError as e:
            log("error", f" Request error for {species.scientific_name}: {e}")
        except Exception as e:
            # Roll back so a failed commit does not leave the session in an
            # unusable state for the caller's next species.
            db.rollback()
            log("error", f" Error scraping iNaturalist for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test iNaturalist API connection.

        Performs a minimal one-result query; raises httpx.HTTPStatusError on
        a non-2xx response. ``api_key`` is accepted for interface parity but
        unused — the endpoint is public.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/observations",
                params={"per_page": 1},
            )
            response.raise_for_status()
        return "iNaturalist API connection successful"