Files
PlantGuideScraper/backend/app/scrapers/trefle.py
2026-04-12 09:54:27 -05:00

155 lines
5.5 KiB
Python

import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class TrefleScraper(BaseScraper):
    """Scraper for Trefle.io plant database."""

    name = "trefle"
    requires_api_key = True
    BASE_URL = "https://trefle.io/api/v1"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Trefle for a species.

        Searches ``/plants/search`` by scientific name, then fetches each
        matching plant's detail record for its main image and any per-type
        image lists. New images are persisted with status="pending" and
        queued for asynchronous download/processing.

        Args:
            species: Target species; its ``scientific_name`` drives the search.
            db: Active SQLAlchemy session used for dedup checks and inserts.
            logger: Optional logger; falls back to a module-level logger.

        Returns:
            Dict with "downloaded" and "rejected" counts; when no API key
            is configured the dict also carries an "error" message.
        """
        log = logger or logging.getLogger(__name__)
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
        rate_limit = api_key.rate_limit_per_sec
        # Guard against a zero/unset configured rate: fall back to 1 req/sec
        # instead of raising ZeroDivisionError mid-scrape.
        delay = 1.0 / rate_limit if rate_limit else 1.0
        downloaded = 0
        rejected = 0
        try:
            # Search for the species by scientific name.
            params = {
                "token": api_key.api_key,
                "q": species.scientific_name,
            }
            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/plants/search",
                    params=params,
                )
                response.raise_for_status()
                plants = response.json().get("data", [])
                for plant in plants:
                    # Fetch the plant detail record for more images.
                    plant_id = plant.get("id")
                    if not plant_id:
                        continue
                    detail_response = client.get(
                        f"{self.BASE_URL}/plants/{plant_id}",
                        params={"token": api_key.api_key},
                    )
                    if detail_response.status_code != 200:
                        # Best-effort: skip plants whose detail fetch fails.
                        continue
                    plant_detail = detail_response.json().get("data", {})
                    # Main image for the plant record.
                    main_image = plant_detail.get("image_url")
                    if main_image and self._save_image(
                        db,
                        species,
                        source_id=f"main_{plant_id}",
                        url=main_image,
                        attribution="Trefle.io Plant Database",
                    ):
                        downloaded += 1
                    # Additional images, grouped by type (flower, leaf, ...).
                    images = plant_detail.get("images", {})
                    for image_type, image_list in images.items():
                        if not isinstance(image_list, list):
                            continue
                        for img in image_list:
                            url = img.get("image_url")
                            if not url:
                                continue
                            # Fall back to the URL's last path segment when
                            # the API provides no image id.
                            img_id = img.get("id", url.split("/")[-1])
                            if self._save_image(
                                db,
                                species,
                                source_id=f"{image_type}_{img_id}",
                                url=url,
                                attribution=img.get("copyright", "") or "Trefle.io",
                            ):
                                downloaded += 1
                    # Respect the configured request rate between plants.
                    time.sleep(delay)
        except Exception as e:
            # Best-effort scrape: log the failure and return partial counts.
            log.error(
                "Error scraping Trefle for %s: %s", species.scientific_name, e
            )
        return {"downloaded": downloaded, "rejected": rejected}

    def _save_image(
        self,
        db: Session,
        species: Species,
        source_id: str,
        url: str,
        attribution: str,
    ) -> bool:
        """Persist one pending Image row (if new) and queue processing.

        Returns True when a new row was created and queued, False when an
        image with this (source, source_id) already exists.
        """
        existing = db.query(Image).filter(
            Image.source == self.name,
            Image.source_id == source_id,
        ).first()
        if existing:
            return False
        image = Image(
            species_id=species.id,
            source=self.name,
            source_id=source_id,
            url=url,
            license="TREFLE",  # Trefle's own license
            attribution=attribution,
            status="pending",
        )
        db.add(image)
        db.commit()
        download_and_process_image.delay(image.id)
        return True

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Trefle API connection.

        Raises:
            httpx.HTTPStatusError: On a non-2xx response from the API.
        """
        params = {"token": api_key.api_key}
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/plants",
                params=params,
            )
            response.raise_for_status()
        return "Trefle API connection successful"