Initial commit — PlantGuideScraper project

2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions
--- a/backend/app/scrapers/trefle.py
+++ b/backend/app/scrapers/trefle.py
@@ -0,0 +1,154 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class TrefleScraper(BaseScraper):
+    """Scraper for Trefle.io plant database."""
+
+    name = "trefle"
+    requires_api_key = True
+
+    BASE_URL = "https://trefle.io/api/v1"
+
+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
+    }
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from Trefle for a species."""
+        api_key = self.get_api_key(db)
+        if not api_key:
+            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
+
+        rate_limit = api_key.rate_limit_per_sec
+
+        downloaded = 0
+        rejected = 0
+
+        try:
+            # Search for the species
+            params = {
+                "token": api_key.api_key,
+                "q": species.scientific_name,
+            }
+
+            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
+                response = client.get(
+                    f"{self.BASE_URL}/plants/search",
+                    params=params,
+                )
+                response.raise_for_status()
+                data = response.json()
+
+                plants = data.get("data", [])
+
+                for plant in plants:
+                    # Get plant details for more images
+                    plant_id = plant.get("id")
+                    if not plant_id:
+                        continue
+
+                    detail_response = client.get(
+                        f"{self.BASE_URL}/plants/{plant_id}",
+                        params={"token": api_key.api_key},
+                    )
+
+                    if detail_response.status_code != 200:
+                        continue
+
+                    plant_detail = detail_response.json().get("data", {})
+
+                    # Get main image
+                    main_image = plant_detail.get("image_url")
+                    if main_image:
+                        source_id = f"main_{plant_id}"
+                        existing = db.query(Image).filter(
+                            Image.source == self.name,
+                            Image.source_id == source_id,
+                        ).first()
+
+                        if not existing:
+                            image = Image(
+                                species_id=species.id,
+                                source=self.name,
+                                source_id=source_id,
+                                url=main_image,
+                                license="TREFLE",  # Trefle's own license
+                                attribution="Trefle.io Plant Database",
+                                status="pending",
+                            )
+                            db.add(image)
+                            db.commit()
+                            download_and_process_image.delay(image.id)
+                            downloaded += 1
+
+                    # Get additional images from species detail
+                    images = plant_detail.get("images", {})
+                    for image_type, image_list in images.items():
+                        if not isinstance(image_list, list):
+                            continue
+
+                        for img in image_list:
+                            url = img.get("image_url")
+                            if not url:
+                                continue
+
+                            img_id = img.get("id", url.split("/")[-1])
+                            source_id = f"{image_type}_{img_id}"
+
+                            existing = db.query(Image).filter(
+                                Image.source == self.name,
+                                Image.source_id == source_id,
+                            ).first()
+
+                            if existing:
+                                continue
+
+                            copyright_info = img.get("copyright", "")
+                            image = Image(
+                                species_id=species.id,
+                                source=self.name,
+                                source_id=source_id,
+                                url=url,
+                                license="TREFLE",
+                                attribution=copyright_info or "Trefle.io",
+                                status="pending",
+                            )
+                            db.add(image)
+                            db.commit()
+                            download_and_process_image.delay(image.id)
+                            downloaded += 1
+
+                    # Rate limiting
+                    time.sleep(1.0 / rate_limit)
+
+        except Exception as e:
+            print(f"Error scraping Trefle for {species.scientific_name}: {e}")
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test Trefle API connection."""
+        params = {"token": api_key.api_key}
+
+        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
+            response = client.get(
+                f"{self.BASE_URL}/plants",
+                params=params,
+            )
+            response.raise_for_status()
+
+        return "Trefle API connection successful"