Initial commit — PlantGuideScraper project

2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions
--- a/backend/app/scrapers/eol.py
+++ b/backend/app/scrapers/eol.py
@@ -0,0 +1,226 @@
+import hashlib
+import logging
+import re
+import time
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class EOLScraper(BaseScraper):
+    """Scraper for Encyclopedia of Life (EOL) images.
+
+    EOL aggregates biodiversity data from many sources and provides
+    a free API with no authentication required.
+
+    Only media objects whose license resolves to one of
+    ``ALLOWED_LICENSES`` are stored; everything else is counted as rejected.
+    """
+
+    name = "eol"
+    requires_api_key = False
+
+    BASE_URL = "https://eol.org/api"
+
+    HEADERS = {
+        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
+        "Accept": "application/json",
+    }
+
+    # Requests per second used when no usable rate limit is configured.
+    DEFAULT_RATE_LIMIT_PER_SEC = 0.5
+
+    # TLS certificate verification for all EOL requests. License metadata
+    # fetched here decides what may be used commercially, so it should not be
+    # accepted over an unauthenticated connection. Set to False only as a
+    # deliberate, deployment-specific override (e.g. a TLS-intercepting proxy).
+    VERIFY_TLS = True
+
+    # Licenses requested from the pages API (pipe-delimited). NC licenses are
+    # not requested because they are always rejected and would only use up
+    # part of the per-page image budget.
+    REQUESTED_LICENSES = "cc-by|cc-by-sa|pd"
+
+    # Map EOL license URLs to short codes
+    LICENSE_MAP = {
+        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
+        "http://creativecommons.org/publicdomain/mark/1.0/": "CC0",
+        "http://creativecommons.org/licenses/by/2.0/": "CC-BY",
+        "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+        "http://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
+        "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+        "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
+        "https://creativecommons.org/publicdomain/mark/1.0/": "CC0",
+        "https://creativecommons.org/licenses/by/2.0/": "CC-BY",
+        "https://creativecommons.org/licenses/by/3.0/": "CC-BY",
+        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
+        "https://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
+        "https://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+        "https://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+        "pd": "CC0",  # Public domain
+        "public domain": "CC0",
+    }
+
+    # Commercial-safe licenses
+    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA"}
+
+    def _request_delay(self, api_key: Optional[ApiKey]) -> float:
+        """Return the pause in seconds between EOL requests.
+
+        A missing, zero or negative configured rate falls back to
+        ``DEFAULT_RATE_LIMIT_PER_SEC`` so the division can never fail.
+        """
+        configured = api_key.rate_limit_per_sec if api_key else None
+        if not configured or configured <= 0:
+            configured = self.DEFAULT_RATE_LIMIT_PER_SEC
+        return 1.0 / configured
+
+    @classmethod
+    def _resolve_license(cls, raw_license) -> Optional[str]:
+        """Map a raw EOL license value to a short code, or None if unknown.
+
+        URL patterns are matched as substrings of the license value. Short
+        tokens such as "pd" are matched as whole words only, so that unrelated
+        text that merely contains those letters is not treated as public
+        domain.
+        """
+        if not isinstance(raw_license, str):
+            return None
+        normalized = raw_license.strip().lower()
+        if not normalized:
+            return None
+
+        for pattern, code in cls.LICENSE_MAP.items():
+            if pattern.startswith(("http://", "https://")):
+                if pattern in normalized:
+                    return code
+            elif re.search(rf"\b{re.escape(pattern)}\b", normalized):
+                return code
+        return None
+
+    @staticmethod
+    def _source_id(obj: dict, image_url: str) -> str:
+        """Return an identifier for a media object that is stable across runs.
+
+        Prefers the IDs supplied by EOL. The fallback is a digest of the image
+        URL; the built-in ``hash()`` is not used because string hashes are
+        randomized per process, which would defeat duplicate detection.
+        """
+        eol_id = obj.get("dataObjectVersionID") or obj.get("identifier")
+        if eol_id:
+            return str(eol_id)
+        return hashlib.sha256(str(image_url).encode("utf-8")).hexdigest()[:16]
+
+    @staticmethod
+    def _build_attribution(agents, license_code: str) -> str:
+        """Build the attribution string from an EOL ``agents`` list."""
+        photographer = None
+        rights_holder = None
+
+        for agent in agents or []:
+            role = (agent.get("role") or "").lower()
+            agent_name = agent.get("full_name", "")
+            if role == "photographer":
+                photographer = agent_name
+            elif role in ("owner", "rights holder"):
+                rights_holder = agent_name
+
+        parts = []
+        if photographer:
+            parts.append(f"Photo by {photographer}")
+        if rights_holder and rights_holder != photographer:
+            parts.append(f"Rights: {rights_holder}")
+        parts.append(f"via EOL ({license_code})")
+        return " | ".join(parts)
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from EOL for a species.
+
+        Searches EOL for the species' scientific name, fetches the first
+        matching page, and for every image with an allowed license that is not
+        already stored, commits an ``Image`` row with status "pending" and
+        queues it via ``download_and_process_image.delay``.
+
+        Args:
+            species: Species whose ``scientific_name`` is searched and whose
+                ``id`` is stored on new images.
+            db: Session used for the duplicate check and for committing rows.
+            logger: Optional logger; when omitted, messages go to this
+                module's logger so errors are not silently dropped.
+
+        Returns:
+            ``{"downloaded": n, "rejected": m}`` where ``downloaded`` counts
+            images queued for download and ``rejected`` counts images skipped
+            for a missing URL or a disallowed/unknown license. HTTP and other
+            errors are logged and the counts gathered so far are returned.
+        """
+        active_logger = logger if logger is not None else logging.getLogger(__name__)
+
+        def log(level: str, msg: str):
+            getattr(active_logger, level)(msg)
+
+        api_key = self.get_api_key(db)
+        delay = self._request_delay(api_key)
+
+        downloaded = 0
+        rejected = 0
+
+        try:
+            with httpx.Client(
+                timeout=30, headers=self.HEADERS, verify=self.VERIFY_TLS
+            ) as client:
+                # Step 1: Search for the species
+                search_response = client.get(
+                    f"{self.BASE_URL}/search/1.0.json",
+                    params={
+                        "q": species.scientific_name,
+                        "page": 1,
+                        "exact": "true",
+                    },
+                )
+                search_response.raise_for_status()
+                search_data = search_response.json()
+
+                results = search_data.get("results") or []
+                if not results:
+                    log("info", f"  Species not found in EOL: {species.scientific_name}")
+                    return {"downloaded": 0, "rejected": 0}
+
+                # Get the EOL page ID
+                eol_page_id = results[0].get("id")
+                if not eol_page_id:
+                    return {"downloaded": 0, "rejected": 0}
+
+                time.sleep(delay)
+
+                # Step 2: Get page details with images
+                page_response = client.get(
+                    f"{self.BASE_URL}/pages/1.0/{eol_page_id}.json",
+                    params={
+                        "images_per_page": 75,
+                        "images_page": 1,
+                        "videos_per_page": 0,
+                        "sounds_per_page": 0,
+                        "maps_per_page": 0,
+                        "texts_per_page": 0,
+                        "details": "true",
+                        "licenses": self.REQUESTED_LICENSES,
+                    },
+                )
+                page_response.raise_for_status()
+                page_data = page_response.json()
+
+                data_objects = page_data.get("dataObjects") or []
+                log("debug", f"  Found {len(data_objects)} media objects")
+
+                for obj in data_objects:
+                    # Only process images ("stillimage" also contains "image")
+                    media_type = (obj.get("dataType") or "").lower()
+                    if "image" not in media_type:
+                        continue
+
+                    # Get image URL
+                    image_url = obj.get("eolMediaURL") or obj.get("mediaURL")
+                    if not image_url:
+                        rejected += 1
+                        continue
+
+                    # Check license; anything that does not resolve to an
+                    # allowed code is rejected.
+                    raw_license = obj.get("license")
+                    license_code = self._resolve_license(raw_license)
+                    if license_code not in self.ALLOWED_LICENSES:
+                        shown_license = str(raw_license or "").lower()
+                        # NC licenses are an expected rejection; only
+                        # genuinely unknown licenses are worth logging.
+                        if "-nc" not in shown_license:
+                            log("debug", f"  Rejected: unknown license {shown_license}")
+                        rejected += 1
+                        continue
+
+                    source_id = self._source_id(obj, image_url)
+
+                    # Check if already exists
+                    existing = db.query(Image).filter(
+                        Image.source == self.name,
+                        Image.source_id == source_id,
+                    ).first()
+                    if existing:
+                        continue
+
+                    # Create image record
+                    image = Image(
+                        species_id=species.id,
+                        source=self.name,
+                        source_id=source_id,
+                        url=image_url,
+                        license=license_code,
+                        attribution=self._build_attribution(
+                            obj.get("agents"), license_code
+                        ),
+                        status="pending",
+                    )
+                    db.add(image)
+                    try:
+                        db.commit()
+                    except Exception:
+                        # A failed commit leaves the session unusable until it
+                        # is rolled back; restore it, then let the outer
+                        # handler log the error.
+                        db.rollback()
+                        raise
+
+                    # Queue for download
+                    download_and_process_image.delay(image.id)
+                    downloaded += 1
+
+                time.sleep(delay)
+
+        except httpx.HTTPStatusError as e:
+            log("error", f"  HTTP error for {species.scientific_name}: {e.response.status_code}")
+        except Exception as e:
+            log("error", f"  Error scraping EOL for {species.scientific_name}: {e}")
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test EOL API connection.
+
+        Runs a search for "Rosa" and reports the ``totalResults`` value from
+        the response.
+
+        Args:
+            api_key: Not used by this method; presumably kept so the
+                signature matches the other scrapers (confirm against
+                ``BaseScraper``).
+
+        Returns:
+            A human-readable success message.
+
+        Raises:
+            httpx.HTTPError: If the request fails or returns an error status.
+        """
+        with httpx.Client(
+            timeout=10, headers=self.HEADERS, verify=self.VERIFY_TLS
+        ) as client:
+            response = client.get(
+                f"{self.BASE_URL}/search/1.0.json",
+                params={"q": "Rosa", "page": 1},
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        total = data.get("totalResults", 0)
+        return f"EOL API connection successful ({total} results for 'Rosa')"