Files
PlantGuideScraper/backend/app/scrapers/trefle.py
2026-04-12 09:54:27 -05:00

155 lines
5.5 KiB
Python

import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class TrefleScraper(BaseScraper):
    """Scraper for Trefle.io plant database."""

    name = "trefle"
    requires_api_key = True
    BASE_URL = "https://trefle.io/api/v1"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Trefle for a species.

        Searches ``/plants/search`` by scientific name, then fetches each
        matching plant's detail record for its main image and any per-type
        image lists. New images are persisted with status="pending" and
        queued for asynchronous download/processing.

        Args:
            species: Target species; its ``scientific_name`` drives the search.
            db: Active SQLAlchemy session used for dedup checks and inserts.
            logger: Optional logger; falls back to a module-level logger.

        Returns:
            Dict with "downloaded" and "rejected" counts; when no API key
            is configured the dict also carries an "error" message.
        """
        log = logger or logging.getLogger(__name__)
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
        rate_limit = api_key.rate_limit_per_sec
        # Guard against a zero/unset configured rate: fall back to 1 req/sec
        # instead of raising ZeroDivisionError mid-scrape.
        delay = 1.0 / rate_limit if rate_limit else 1.0
        downloaded = 0
        rejected = 0
        try:
            # Search for the species by scientific name.
            params = {
                "token": api_key.api_key,
                "q": species.scientific_name,
            }
            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/plants/search",
                    params=params,
                )
                response.raise_for_status()
                plants = response.json().get("data", [])
                for plant in plants:
                    # Fetch the plant detail record for more images.
                    plant_id = plant.get("id")
                    if not plant_id:
                        continue
                    detail_response = client.get(
                        f"{self.BASE_URL}/plants/{plant_id}",
                        params={"token": api_key.api_key},
                    )
                    if detail_response.status_code != 200:
                        # Best-effort: skip plants whose detail fetch fails.
                        continue
                    plant_detail = detail_response.json().get("data", {})
                    # Main image for the plant record.
                    main_image = plant_detail.get("image_url")
                    if main_image and self._save_image(
                        db,
                        species,
                        source_id=f"main_{plant_id}",
                        url=main_image,
                        attribution="Trefle.io Plant Database",
                    ):
                        downloaded += 1
                    # Additional images, grouped by type (flower, leaf, ...).
                    images = plant_detail.get("images", {})
                    for image_type, image_list in images.items():
                        if not isinstance(image_list, list):
                            continue
                        for img in image_list:
                            url = img.get("image_url")
                            if not url:
                                continue
                            # Fall back to the URL's last path segment when
                            # the API provides no image id.
                            img_id = img.get("id", url.split("/")[-1])
                            if self._save_image(
                                db,
                                species,
                                source_id=f"{image_type}_{img_id}",
                                url=url,
                                attribution=img.get("copyright", "") or "Trefle.io",
                            ):
                                downloaded += 1
                    # Respect the configured request rate between plants.
                    time.sleep(delay)
        except Exception as e:
            # Best-effort scrape: log the failure and return partial counts.
            log.error(
                "Error scraping Trefle for %s: %s", species.scientific_name, e
            )
        return {"downloaded": downloaded, "rejected": rejected}

    def _save_image(
        self,
        db: Session,
        species: Species,
        source_id: str,
        url: str,
        attribution: str,
    ) -> bool:
        """Persist one pending Image row (if new) and queue processing.

        Returns True when a new row was created and queued, False when an
        image with this (source, source_id) already exists.
        """
        existing = db.query(Image).filter(
            Image.source == self.name,
            Image.source_id == source_id,
        ).first()
        if existing:
            return False
        image = Image(
            species_id=species.id,
            source=self.name,
            source_id=source_id,
            url=url,
            license="TREFLE",  # Trefle's own license
            attribution=attribution,
            status="pending",
        )
        db.add(image)
        db.commit()
        download_and_process_image.delay(image.id)
        return True

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Trefle API connection.

        Raises:
            httpx.HTTPStatusError: On a non-2xx response from the API.
        """
        params = {"token": api_key.api_key}
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/plants",
                params=params,
            )
            response.raise_for_status()
        return "Trefle API connection successful"