import logging
import time
from typing import Dict, Optional

import httpx
from sqlalchemy.orm import Session

from app.models import ApiKey, Image, Species
from app.scrapers.base import BaseScraper
from app.workers.quality_tasks import download_and_process_image


class TrefleScraper(BaseScraper):
    """Scraper for the Trefle.io plant database.

    Searches Trefle by scientific name, walks each matching plant's detail
    record, and enqueues every previously-unseen image URL for asynchronous
    download and quality processing.
    """

    name = "trefle"
    requires_api_key = True

    BASE_URL = "https://trefle.io/api/v1"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    def _save_image(
        self,
        db: Session,
        species: Species,
        source_id: str,
        url: str,
        attribution: str,
    ) -> bool:
        """Persist one Trefle image row and enqueue it for processing.

        Deduplicates on (source, source_id). Returns True if a new row was
        created (and a download task enqueued), False if it already existed.
        """
        existing = db.query(Image).filter(
            Image.source == self.name,
            Image.source_id == source_id,
        ).first()
        if existing:
            return False

        image = Image(
            species_id=species.id,
            source=self.name,
            source_id=source_id,
            url=url,
            license="TREFLE",  # Trefle's own license
            attribution=attribution,
            status="pending",
        )
        db.add(image)
        # Commit first so the row has an id before the Celery task is enqueued.
        db.commit()
        download_and_process_image.delay(image.id)
        return True

    def scrape_species(
        self, species: Species, db: Session, logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Trefle for a species.

        Args:
            species: The species whose scientific name is used as the query.
            db: Active SQLAlchemy session for dedup checks and inserts.
            logger: Optional logger; falls back to this module's logger.

        Returns:
            Dict with "downloaded" and "rejected" counts; an "error" key is
            included when no API key is configured.
        """
        # Bug fix: the logger parameter was previously ignored (errors went
        # to print()). Fall back to a module logger when none is supplied.
        log = logger or logging.getLogger(__name__)

        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        rate_limit = api_key.rate_limit_per_sec
        downloaded = 0
        rejected = 0

        try:
            # Search for the species by scientific name.
            params = {
                "token": api_key.api_key,
                "q": species.scientific_name,
            }
            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/plants/search",
                    params=params,
                )
                response.raise_for_status()
                plants = response.json().get("data", [])

                for plant in plants:
                    plant_id = plant.get("id")
                    if not plant_id:
                        continue

                    # Fetch the plant detail record for its full image set.
                    detail_response = client.get(
                        f"{self.BASE_URL}/plants/{plant_id}",
                        params={"token": api_key.api_key},
                    )
                    if detail_response.status_code != 200:
                        # Best-effort: skip plants whose detail fetch fails.
                        continue
                    plant_detail = detail_response.json().get("data", {})

                    # Main image.
                    main_image = plant_detail.get("image_url")
                    if main_image and self._save_image(
                        db,
                        species,
                        f"main_{plant_id}",
                        main_image,
                        "Trefle.io Plant Database",
                    ):
                        downloaded += 1

                    # Additional images, grouped by type (e.g. flower, leaf).
                    images = plant_detail.get("images", {})
                    for image_type, image_list in images.items():
                        if not isinstance(image_list, list):
                            continue
                        for img in image_list:
                            url = img.get("image_url")
                            if not url:
                                continue
                            # Fall back to the URL's last path segment when
                            # Trefle provides no image id.
                            img_id = img.get("id", url.split("/")[-1])
                            copyright_info = img.get("copyright", "")
                            if self._save_image(
                                db,
                                species,
                                f"{image_type}_{img_id}",
                                url,
                                copyright_info or "Trefle.io",
                            ):
                                downloaded += 1

                    # Rate limiting between detail requests. Bug fix: guard
                    # against a zero/unset rate (would raise ZeroDivisionError).
                    if rate_limit:
                        time.sleep(1.0 / rate_limit)
        except Exception as e:
            # Best-effort scraper boundary: log and return partial counts
            # rather than propagate (previously used print()).
            log.error(
                "Error scraping Trefle for %s: %s", species.scientific_name, e
            )

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Trefle API connection; raises httpx.HTTPStatusError on failure."""
        params = {"token": api_key.api_key}
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/plants",
                params=params,
            )
            response.raise_for_status()
        return "Trefle API connection successful"