Initial commit — PlantGuideScraper project

2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions
--- a/backend/app/scrapers/gbif.py
+++ b/backend/app/scrapers/gbif.py
@@ -0,0 +1,159 @@
+import hashlib
+import logging
+import time
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.models import Species, Image, ApiKey
+from app.scrapers.base import BaseScraper
+from app.workers.quality_tasks import download_and_process_image
+
+
+class GBIFScraper(BaseScraper):
+    """Scraper for GBIF (Global Biodiversity Information Facility) images.
+
+    Searches the GBIF occurrence API for still images of a species, keeps only
+    media whose license maps to one of ``ALLOWED_LICENSES``, stores each new
+    image as a ``pending`` ``Image`` row and queues it for download.
+    """
+
+    name = "gbif"
+    requires_api_key = False  # GBIF is free to use
+
+    BASE_URL = "https://api.gbif.org/v1"
+
+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
+    }
+
+    # Map GBIF license URLs to short codes. Both http/https schemes and both
+    # the deed and "legalcode" URL forms are listed because lookup is exact.
+    LICENSE_MAP = {
+        "http://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
+        "http://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
+        "http://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
+        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+        "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+        "https://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
+        "https://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
+        "https://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
+        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
+        "https://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    }
+
+    # Only allow commercial-safe licenses
+    ALLOWED_LICENSES = {"CC0", "CC-BY"}
+
+    # Requests per second assumed when no ApiKey row supplies a rate limit.
+    DEFAULT_RATE_LIMIT = 1.0
+
+    @staticmethod
+    def _make_source_id(occurrence_key: object, url: str) -> str:
+        """Return a source ID for one media URL that is stable across runs.
+
+        The builtin ``hash()`` is salted per interpreter process for strings,
+        so an ID derived from it cannot be matched against rows written by an
+        earlier run. A SHA-256 digest prefix of the URL is used instead.
+        """
+        url_digest = hashlib.sha256(str(url).encode("utf-8")).hexdigest()[:8]
+        return f"{occurrence_key}_{url_digest}"
+
+    @staticmethod
+    def _build_attribution(media: Dict, license_code: str) -> str:
+        """Build the attribution string from a media record's creator/rights fields."""
+        creator = media.get("creator", "")
+        rights_holder = media.get("rightsHolder", "")
+
+        parts = []
+        if creator:
+            parts.append(f"Photo by {creator}")
+        # Skip the rights holder when it merely repeats the creator.
+        if rights_holder and rights_holder != creator:
+            parts.append(f"Rights: {rights_holder}")
+        parts.append(f"via GBIF ({license_code})")
+        return " | ".join(parts)
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from GBIF for a species.
+
+        Args:
+            species: Species to search for; its ``scientific_name`` is the
+                query and its ``id`` is stored on each created image.
+            db: Session used to look up the rate limit, detect already-stored
+                images and persist new ones (one commit per image).
+            logger: Logger that receives scrape errors. Falls back to this
+                module's logger when omitted.
+
+        Returns:
+            ``{"downloaded": n, "rejected": m}`` where ``downloaded`` counts
+            images queued for download and ``rejected`` counts media skipped
+            for a missing URL or a license outside ``ALLOWED_LICENSES``.
+
+        Errors raised while fetching or processing results are logged and
+        swallowed; the counts accumulated so far are still returned.
+        """
+        log = logger if logger is not None else logging.getLogger(__name__)
+
+        # GBIF doesn't require API key, but we still respect rate limits
+        api_key = self.get_api_key(db)
+        rate_limit = api_key.rate_limit_per_sec if api_key else self.DEFAULT_RATE_LIMIT
+
+        downloaded = 0
+        rejected = 0
+
+        try:
+            params = {
+                "scientificName": species.scientific_name,
+                "mediaType": "StillImage",
+                "limit": 100,
+            }
+
+            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
+                response = client.get(
+                    f"{self.BASE_URL}/occurrence/search",
+                    params=params,
+                )
+                response.raise_for_status()
+                data = response.json()
+
+            # ``or []`` also covers keys that are present but null in the JSON.
+            for occurrence in data.get("results") or []:
+                occurrence_key = occurrence.get("key", "")
+
+                for media in occurrence.get("media") or []:
+                    # Only process still images
+                    if media.get("type") != "StillImage":
+                        continue
+
+                    url = media.get("identifier")
+                    if not url:
+                        rejected += 1
+                        continue
+
+                    # Unknown license URLs map to None and are rejected too.
+                    license_code = self.LICENSE_MAP.get(media.get("license", ""))
+                    if license_code not in self.ALLOWED_LICENSES:
+                        rejected += 1
+                        continue
+
+                    source_id = self._make_source_id(occurrence_key, url)
+
+                    # Skip media already stored by this or an earlier run.
+                    existing = db.query(Image).filter(
+                        Image.source == self.name,
+                        Image.source_id == source_id,
+                    ).first()
+                    if existing:
+                        continue
+
+                    image = Image(
+                        species_id=species.id,
+                        source=self.name,
+                        source_id=source_id,
+                        url=url,
+                        license=license_code,
+                        attribution=self._build_attribution(media, license_code),
+                        status="pending",
+                    )
+                    db.add(image)
+                    try:
+                        # Commit first so the row has an id before it is queued.
+                        db.commit()
+                    except Exception:
+                        # Leave the caller's session usable after a failed commit.
+                        db.rollback()
+                        raise
+
+                    # Queue for download
+                    download_and_process_image.delay(image.id)
+                    downloaded += 1
+
+            # Rate limiting; a missing or non-positive limit means "no delay"
+            # rather than a division error.
+            if rate_limit and rate_limit > 0:
+                time.sleep(1.0 / rate_limit)
+
+        except Exception as e:
+            # Best-effort scrape: report the failure and return partial counts.
+            log.exception(
+                "Error scraping GBIF for %s: %s", species.scientific_name, e
+            )
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test GBIF API connection.
+
+        Args:
+            api_key: Unused here; no authentication is sent with the request.
+
+        Returns:
+            A success message including the ``count`` reported by the API.
+
+        Raises:
+            httpx.HTTPStatusError: If the endpoint answers with an error status.
+        """
+        # GBIF doesn't require authentication, just test the endpoint
+        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
+            response = client.get(
+                f"{self.BASE_URL}/occurrence/search",
+                params={"limit": 1},
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        count = data.get("count", 0)
+        return f"GBIF API connection successful ({count:,} total occurrences available)"