Initial commit — PlantGuideScraper project
This commit is contained in:
154
backend/app/scrapers/trefle.py
Normal file
154
backend/app/scrapers/trefle.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class TrefleScraper(BaseScraper):
    """Scraper for the Trefle.io plant database.

    Searches Trefle for a species by scientific name, records each image URL
    found on the matching plants as a pending ``Image`` row, and hands every
    new row to ``download_and_process_image`` for background processing.
    """

    name = "trefle"
    requires_api_key = True

    BASE_URL = "https://trefle.io/api/v1"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Pause between detail requests when the stored API key has no usable
    # (positive, numeric) rate limit configured.
    DEFAULT_REQUEST_DELAY_SEC = 1.0

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Trefle for a species.

        Args:
            species: Species whose ``scientific_name`` is used as the search
                query and whose ``id`` is stored on every created image.
            db: SQLAlchemy session used for de-duplication and inserts.
            logger: Logger for error reporting; defaults to this module's
                logger when omitted.

        Returns:
            ``{"downloaded": n, "rejected": 0}`` where ``n`` is the number of
            new image rows queued (``rejected`` is never incremented here).
            When no API key is configured the dict also carries an ``"error"``
            message.

        Scraping is best-effort: any exception is logged and the counts
        gathered so far are returned instead of raising.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        log = logger or logging.getLogger(__name__)
        token = api_key.api_key
        delay = self._request_delay(api_key.rate_limit_per_sec)
        # Read once up front so the error path never has to touch the ORM
        # object after a rollback.
        scientific_name = species.scientific_name

        downloaded = 0
        rejected = 0

        try:
            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                # Search for the species
                response = client.get(
                    f"{self.BASE_URL}/plants/search",
                    params={"token": token, "q": scientific_name},
                )
                response.raise_for_status()
                # `or []` also covers an explicit JSON null under "data".
                plants = response.json().get("data") or []

                for plant in plants:
                    plant_id = plant.get("id")
                    if not plant_id:
                        continue

                    # Get plant details for more images
                    detail_response = client.get(
                        f"{self.BASE_URL}/plants/{plant_id}",
                        params={"token": token},
                    )

                    if detail_response.status_code == 200:
                        plant_detail = detail_response.json().get("data") or {}
                        candidates = self._iter_plant_images(plant_id, plant_detail)
                        for source_id, url, attribution in candidates:
                            if self._queue_image(db, species, source_id, url, attribution):
                                downloaded += 1

                    # Rate limiting: pause after every detail request, failed
                    # ones included, so an error response (e.g. 429) is not
                    # followed by an immediate retry burst.
                    time.sleep(delay)

        except Exception as e:
            # httpx error messages embed the request URL, which carries the
            # token as a query parameter; mask it before it reaches the logs.
            message = str(e)
            if token:
                message = message.replace(str(token), "***")
            log.error("Error scraping Trefle for %s: %s", scientific_name, message)
            # Leave the session usable for the caller if a commit failed.
            db.rollback()

        return {"downloaded": downloaded, "rejected": rejected}

    @classmethod
    def _request_delay(cls, rate_limit) -> float:
        """Convert a requests-per-second limit into a sleep interval in seconds."""
        try:
            rate = float(rate_limit)
        except (TypeError, ValueError):
            return cls.DEFAULT_REQUEST_DELAY_SEC
        if rate <= 0:
            return cls.DEFAULT_REQUEST_DELAY_SEC
        return 1.0 / rate

    @staticmethod
    def _iter_plant_images(plant_id, plant_detail):
        """Yield ``(source_id, url, attribution)`` for each image on a plant."""
        # Get main image
        main_image = plant_detail.get("image_url")
        if main_image:
            yield f"main_{plant_id}", main_image, "Trefle.io Plant Database"

        # Get additional images from species detail
        images = plant_detail.get("images") or {}
        if not isinstance(images, dict):
            return

        for image_type, image_list in images.items():
            if not isinstance(image_list, list):
                continue

            for img in image_list:
                if not isinstance(img, dict):
                    continue

                url = img.get("image_url")
                if not url:
                    continue

                # Fall back to the URL's last path segment when the id is
                # missing or null, so distinct images never share a source_id.
                img_id = img.get("id")
                if img_id is None:
                    img_id = url.split("/")[-1]

                yield f"{image_type}_{img_id}", url, img.get("copyright") or "Trefle.io"

    def _queue_image(
        self,
        db: Session,
        species: Species,
        source_id: str,
        url: str,
        attribution: str,
    ) -> bool:
        """Insert a pending image row and queue it; return False if it already exists."""
        existing = db.query(Image).filter(
            Image.source == self.name,
            Image.source_id == source_id,
        ).first()
        if existing:
            return False

        image = Image(
            species_id=species.id,
            source=self.name,
            source_id=source_id,
            url=url,
            license="TREFLE",  # Trefle's own license
            attribution=attribution,
            status="pending",
        )
        db.add(image)
        # Commit before queueing so the row (and its id) exists for the task.
        db.commit()
        download_and_process_image.delay(image.id)
        return True

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Trefle API connection.

        Issues one authenticated request to the ``/plants`` endpoint.

        Args:
            api_key: Stored key whose ``api_key`` value is sent as the token.

        Returns:
            A success message when the API answers with a non-error status.

        Raises:
            httpx.HTTPStatusError: If the API responds with a 4xx/5xx status.
            httpx.HTTPError: If the request itself fails (e.g. a timeout).
        """
        params = {"token": api_key.api_key}

        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/plants",
                params=params,
            )
            response.raise_for_status()

        return "Trefle API connection successful"
|
||||
Reference in New Issue
Block a user