Initial commit — PlantGuideScraper project

This commit is contained in:
Trey T
2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions

1
backend/app/__init__.py Normal file
View File

@@ -0,0 +1 @@
# PlantGuideScraper Backend

View File

@@ -0,0 +1 @@
# API routes

175
backend/app/api/exports.py Normal file
View File

@@ -0,0 +1,175 @@
import json
import os
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from sqlalchemy import func
from app.database import get_db
from app.models import Export, Image, Species
from app.schemas.export import (
ExportCreate,
ExportResponse,
ExportListResponse,
ExportPreview,
)
from app.workers.export_tasks import generate_export
# Router that the export endpoints defined in this module register on.
router = APIRouter()
@router.get("", response_model=ExportListResponse)
def list_exports(
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
):
    """Return the newest exports together with the overall export count.

    ``limit`` caps how many rows are returned (newest first), while ``total``
    counts every export row, not only the returned ones.
    """
    export_count = db.query(Export).count()
    newest_first = db.query(Export).order_by(Export.created_at.desc())
    serialized = []
    for row in newest_first.limit(limit).all():
        serialized.append(ExportResponse.model_validate(row))
    return ExportListResponse(items=serialized, total=export_count)
@router.post("/preview", response_model=ExportPreview)
def preview_export(export: ExportCreate, db: Session = Depends(get_db)):
    """Preview export without creating it.

    Counts downloaded images per species under the requested filters
    (licenses, minimum quality score, species ids), keeps only the species
    that reach ``min_images_per_species``, and reports the resulting species
    and image totals together with a rough size estimate.

    Returns:
        ExportPreview with ``species_count``, ``image_count`` and
        ``estimated_size_mb``.
    """
    criteria = export.filter_criteria
    min_images = criteria.min_images_per_species

    # A single grouped query provides everything the preview needs.
    per_species = db.query(
        Image.species_id,
        func.count(Image.id).label("image_count"),
    ).filter(Image.status == "downloaded")
    if criteria.licenses:
        per_species = per_species.filter(Image.license.in_(criteria.licenses))
    if criteria.min_quality:
        per_species = per_species.filter(
            Image.quality_score >= criteria.min_quality
        )
    if criteria.species_ids:
        per_species = per_species.filter(
            Image.species_id.in_(criteria.species_ids)
        )
    rows = per_species.group_by(Image.species_id).all()

    # Rows are unpacked positionally rather than read through an attribute
    # called "count", which is also the name of a sequence method.
    qualifying_counts = [
        image_count
        for _species_id, image_count in rows
        if image_count >= min_images
    ]
    total_images = sum(qualifying_counts)

    # Estimate file size (rough: 50KB per image)
    estimated_size_mb = (total_images * 50) / 1024

    return ExportPreview(
        species_count=len(qualifying_counts),
        image_count=total_images,
        estimated_size_mb=estimated_size_mb,
    )
@router.post("", response_model=ExportResponse)
def create_export(export: ExportCreate, db: Session = Depends(get_db)):
    """Persist a new export in "pending" state and queue its generation task.

    The filter criteria are stored as their JSON dump. After the task is
    queued, its Celery task id is saved on the row.
    """
    record = Export(
        name=export.name,
        filter_criteria=export.filter_criteria.model_dump_json(),
        train_split=export.train_split,
        status="pending",
    )
    db.add(record)
    db.commit()
    db.refresh(record)

    # The task is queued with the row's id, so the row is committed and
    # refreshed before the task is dispatched.
    queued_task = generate_export.delay(record.id)
    record.celery_task_id = queued_task.id
    db.commit()

    return ExportResponse.model_validate(record)
@router.get("/{export_id}", response_model=ExportResponse)
def get_export(export_id: int, db: Session = Depends(get_db)):
    """Return the stored export with the given id, or respond 404."""
    record = db.query(Export).filter(Export.id == export_id).first()
    if record:
        return ExportResponse.model_validate(record)
    raise HTTPException(status_code=404, detail="Export not found")
@router.get("/{export_id}/progress")
def get_export_progress(export_id: int, db: Session = Depends(get_db)):
    """Get real-time export progress.

    While the Celery task reports the "PROGRESS" state with dict metadata,
    the response carries the live ``current``/``total`` counters and the
    species being processed. In every other case only the status stored on
    the export row is returned.

    Raises:
        HTTPException: 404 when the export does not exist.
    """
    from app.workers.celery_app import celery_app

    export = db.query(Export).filter(Export.id == export_id).first()
    if not export:
        raise HTTPException(status_code=404, detail="Export not found")

    if export.celery_task_id:
        result = celery_app.AsyncResult(export.celery_task_id)
        if result.state == "PROGRESS":
            meta = result.info
            # State and info are read separately, so the task may have
            # finished in between and info may no longer be the progress
            # dict. Only use it when it really is one.
            if isinstance(meta, dict):
                return {
                    "status": "generating",
                    "current": meta.get("current", 0),
                    "total": meta.get("total", 0),
                    "current_species": meta.get("species", ""),
                }

    return {"status": export.status}
@router.get("/{export_id}/download")
def download_export(export_id: int, db: Session = Depends(get_db)):
    """Send the zip archive of a completed export.

    Responds 404 when the export or its file is missing and 400 when the
    export has not reached the "completed" status.
    """
    export = db.query(Export).filter(Export.id == export_id).first()
    if not export:
        raise HTTPException(status_code=404, detail="Export not found")
    if export.status != "completed":
        raise HTTPException(status_code=400, detail="Export not ready")

    archive_path = export.file_path
    archive_present = bool(archive_path) and os.path.exists(archive_path)
    if not archive_present:
        raise HTTPException(status_code=404, detail="Export file not found")

    download_name = f"{export.name}.zip"
    return FileResponse(
        archive_path,
        media_type="application/zip",
        filename=download_name,
    )
@router.delete("/{export_id}")
def delete_export(export_id: int, db: Session = Depends(get_db)):
    """Remove an export row and, when present on disk, its archive file."""
    export = db.query(Export).filter(Export.id == export_id).first()
    if not export:
        raise HTTPException(status_code=404, detail="Export not found")

    archive_path = export.file_path
    # The file is removed before the row so the stored path is still known.
    if archive_path and os.path.exists(archive_path):
        os.remove(archive_path)

    db.delete(export)
    db.commit()
    return {"status": "deleted"}

441
backend/app/api/images.py Normal file
View File

@@ -0,0 +1,441 @@
import os
import shutil
import uuid
from pathlib import Path
from typing import Optional, List
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from sqlalchemy import func
from PIL import Image as PILImage
from app.database import get_db
from app.models import Image, Species
from app.schemas.image import ImageResponse, ImageListResponse
from app.config import get_settings
# Router that the image endpoints defined in this module register on.
router = APIRouter()
# Settings object obtained once when the module is imported; the import
# endpoints below read imports_path and images_path from it.
settings = get_settings()
@router.get("", response_model=ImageListResponse)
def list_images(
    page: int = Query(1, ge=1),
    page_size: int = Query(50, ge=1, le=200),
    species_id: Optional[int] = None,
    source: Optional[str] = None,
    license: Optional[str] = None,
    status: Optional[str] = None,
    min_quality: Optional[float] = None,
    search: Optional[str] = None,
    db: Session = Depends(get_db),
):
    """Return one page of images, newest first, with optional filters.

    Column filters (species, source, license, status, minimum quality) apply
    to the image table directly. ``search`` additionally joins the species
    table and matches scientific or common names case-insensitively.
    """
    # Imported locally, as in the rest of this module's endpoints.
    from sqlalchemy.orm import joinedload

    # Conditions shared by the listing query and the join-free count query.
    column_filters = []
    if species_id:
        column_filters.append(Image.species_id == species_id)
    if source:
        column_filters.append(Image.source == source)
    if license:
        column_filters.append(Image.license == license)
    if status:
        column_filters.append(Image.status == status)
    if min_quality:
        column_filters.append(Image.quality_score >= min_quality)

    # joinedload fetches the related species in the same query.
    listing = db.query(Image).options(joinedload(Image.species))
    for condition in column_filters:
        listing = listing.filter(condition)

    if search:
        pattern = f"%{search}%"
        listing = listing.join(Species).filter(
            (Species.scientific_name.ilike(pattern)) |
            (Species.common_name.ilike(pattern))
        )
        total = listing.count()
    else:
        # Without a name search the count does not need the species join.
        counter = db.query(func.count(Image.id))
        for condition in column_filters:
            counter = counter.filter(condition)
        total = counter.scalar()

    full_pages, remainder = divmod(total, page_size)
    pages = full_pages + (1 if remainder else 0)

    first_row = (page - 1) * page_size
    page_rows = (
        listing.order_by(Image.created_at.desc())
        .offset(first_row)
        .limit(page_size)
        .all()
    )

    def to_response(img):
        # Flatten one ORM row into the response schema.
        return ImageResponse(
            id=img.id,
            species_id=img.species_id,
            species_name=img.species.scientific_name if img.species else None,
            source=img.source,
            source_id=img.source_id,
            url=img.url,
            local_path=img.local_path,
            license=img.license,
            attribution=img.attribution,
            width=img.width,
            height=img.height,
            quality_score=img.quality_score,
            status=img.status,
            created_at=img.created_at,
        )

    return ImageListResponse(
        items=[to_response(img) for img in page_rows],
        total=total,
        page=page,
        page_size=page_size,
        pages=pages,
    )
@router.get("/sources")
def list_sources(db: Session = Depends(get_db)):
    """Return every distinct value of the image ``source`` column."""
    distinct_rows = db.query(Image.source).distinct().all()
    names = []
    for row in distinct_rows:
        names.append(row[0])
    return names
@router.get("/licenses")
def list_licenses(db: Session = Depends(get_db)):
    """Return every distinct value of the image ``license`` column."""
    distinct_rows = db.query(Image.license).distinct().all()
    values = []
    for row in distinct_rows:
        values.append(row[0])
    return values
@router.post("/process-pending")
def process_pending_images(
    source: Optional[str] = None,
    db: Session = Depends(get_db),
):
    """Queue the batch task that handles pending images.

    Returns the number of images in "pending" status at request time
    (optionally restricted to one source) and the id of the queued task.
    """
    from app.workers.quality_tasks import batch_process_pending_images

    pending = db.query(func.count(Image.id)).filter(Image.status == "pending")
    if source:
        pending = pending.filter(Image.source == source)
    pending_count = pending.scalar()

    queued_task = batch_process_pending_images.delay(source=source)

    return {"pending_count": pending_count, "task_id": queued_task.id}
@router.get("/process-pending/status/{task_id}")
def process_pending_status(task_id: str):
    """Report the Celery state of a batch processing task.

    The ``queued`` and ``total`` counters are included when the task is in
    the PROGRESS or SUCCESS state and exposes them as a dict.
    """
    from app.workers.celery_app import celery_app

    result = celery_app.AsyncResult(task_id)
    state = result.state  # PENDING, STARTED, PROGRESS, SUCCESS, FAILURE

    # Progress metadata and the final return value share the same keys.
    counters = None
    if state == "PROGRESS":
        counters = result.info
    elif state == "SUCCESS":
        counters = result.result

    response = {"task_id": task_id, "state": state}
    if isinstance(counters, dict):
        response["queued"] = counters.get("queued", 0)
        response["total"] = counters.get("total", 0)
    return response
@router.get("/{image_id}", response_model=ImageResponse)
def get_image(image_id: int, db: Session = Depends(get_db)):
    """Return one image record, including its species' scientific name."""
    image = db.query(Image).filter(Image.id == image_id).first()
    if not image:
        raise HTTPException(status_code=404, detail="Image not found")

    # Images without a related species report no species name.
    if image.species:
        species_name = image.species.scientific_name
    else:
        species_name = None

    return ImageResponse(
        id=image.id,
        species_id=image.species_id,
        species_name=species_name,
        source=image.source,
        source_id=image.source_id,
        url=image.url,
        local_path=image.local_path,
        license=image.license,
        attribution=image.attribution,
        width=image.width,
        height=image.height,
        quality_score=image.quality_score,
        status=image.status,
        created_at=image.created_at,
    )
@router.get("/{image_id}/file")
def get_image_file(image_id: int, db: Session = Depends(get_db)):
    """Get the actual image file.

    The content type is derived from the file name, because imported images
    may be PNG as well as JPEG. "image/jpeg" remains the fallback when no
    image type can be guessed.

    Raises:
        HTTPException: 404 when the image record does not exist, has no local
            path, or its file is no longer on disk.
    """
    import mimetypes

    image = db.query(Image).filter(Image.id == image_id).first()
    if not image:
        raise HTTPException(status_code=404, detail="Image not found")

    # Check the disk as well as the record, so a missing file is reported
    # as 404 rather than failing while the response is being sent.
    if not image.local_path or not os.path.isfile(image.local_path):
        raise HTTPException(status_code=404, detail="Image file not available")

    guessed_type, _encoding = mimetypes.guess_type(image.local_path)
    if not guessed_type or not guessed_type.startswith("image/"):
        guessed_type = "image/jpeg"

    return FileResponse(image.local_path, media_type=guessed_type)
@router.delete("/{image_id}")
def delete_image(image_id: int, db: Session = Depends(get_db)):
    """Remove an image row and, when present on disk, its local file."""
    image = db.query(Image).filter(Image.id == image_id).first()
    if not image:
        raise HTTPException(status_code=404, detail="Image not found")

    stored_path = image.local_path
    # Only touch the disk when a path is recorded and still exists.
    if stored_path and os.path.exists(stored_path):
        os.remove(stored_path)

    db.delete(image)
    db.commit()
    return {"status": "deleted"}
@router.post("/bulk-delete")
def bulk_delete_images(
    image_ids: List[int],
    db: Session = Depends(get_db),
):
    """Delete every image whose id is listed, including local files.

    Ids that match no row are ignored. The response reports how many rows
    were deleted.
    """
    matching = db.query(Image).filter(Image.id.in_(image_ids)).all()

    for image in matching:
        stored_path = image.local_path
        if stored_path and os.path.exists(stored_path):
            os.remove(stored_path)
        db.delete(image)

    # All row deletions are committed together.
    db.commit()
    return {"deleted": len(matching)}
@router.get("/import/scan")
def scan_imports(db: Session = Depends(get_db)):
    """Report what the imports folder contains without importing anything.

    Expected structure: imports/{source}/{species_name}/*.jpg

    For each source folder the result lists how many species folders matched
    a known species and how many image files (.jpg, .jpeg, .png) were found.
    Folder names that match no species are collected once each in
    ``unmatched_species``.
    """
    imports_path = Path(settings.imports_path)
    if not imports_path.exists():
        return {
            "available": False,
            "message": f"Imports folder not found: {imports_path}",
            "sources": [],
            "total_images": 0,
            "matched_species": 0,
            "unmatched_species": [],
        }

    # Known names in both "genus species" and "genus_species" spellings.
    known_names = set()
    for species in db.query(Species).all():
        known_names.add(species.scientific_name.lower())
        known_names.add(species.scientific_name.replace(" ", "_").lower())

    sources = []
    total_images = 0
    matched_species = 0
    unmatched_species = []
    already_reported = set()

    for source_dir in imports_path.iterdir():
        if not source_dir.is_dir():
            continue

        source_species = 0
        source_images = 0

        for species_dir in source_dir.iterdir():
            if not species_dir.is_dir():
                continue

            folder_name = species_dir.name
            display_name = folder_name.replace("_", " ")

            found = 0
            for pattern in ("*.jpg", "*.jpeg", "*.png"):
                found += len(list(species_dir.glob(pattern)))
            if not found:
                continue

            source_images += found
            total_images += found

            is_known = (
                display_name.lower() in known_names
                or folder_name.lower() in known_names
            )
            if is_known:
                source_species += 1
                matched_species += 1
            elif display_name not in already_reported:
                already_reported.add(display_name)
                unmatched_species.append(display_name)

        # Sources without any image file are left out of the report.
        if source_images > 0:
            sources.append({
                "name": source_dir.name,
                "species_count": source_species,
                "image_count": source_images,
            })

    return {
        "available": True,
        "sources": sources,
        "total_images": total_images,
        "matched_species": matched_species,
        "unmatched_species": unmatched_species,
    }
@router.post("/import/run")
def run_import(
    move_files: bool = Query(False, description="Move files instead of copy"),
    db: Session = Depends(get_db),
):
    """Import image files from the imports folder into the image library.

    Expected structure: imports/{source}/{species_name}/*.jpg
    Each file is copied (or moved, when ``move_files`` is true) into
    images/{scientific name with underscores}/ under the name
    {source}_{original stem}_{random suffix}{extension}, and recorded as a
    "downloaded" image with an "unknown" license.

    Folders whose name matches no species are ignored. Files already recorded
    for the same species, source and file stem count as skipped. Per-file
    failures are collected instead of aborting the run.

    Returns the imported and skipped counts plus at most 20 error messages.
    Responds 400 when the imports folder does not exist.
    """
    imports_root = Path(settings.imports_path)
    images_root = Path(settings.images_path)

    if not imports_root.exists():
        raise HTTPException(status_code=400, detail="Imports folder not found")

    # Index species under both "genus species" and "genus_species" spellings.
    species_by_name = {}
    for known in db.query(Species).all():
        species_by_name[known.scientific_name.lower()] = known
        species_by_name[known.scientific_name.replace(" ", "_").lower()] = known

    transfer = shutil.move if move_files else shutil.copy2

    imported = 0
    skipped = 0
    errors = []

    for source_dir in imports_root.iterdir():
        if not source_dir.is_dir():
            continue
        source_name = source_dir.name

        for species_dir in source_dir.iterdir():
            if not species_dir.is_dir():
                continue

            spaced_key = species_dir.name.replace("_", " ").lower()
            folder_key = species_dir.name.lower()
            species = species_by_name.get(spaced_key) or species_by_name.get(folder_key)
            if not species:
                continue

            target_dir = images_root / species.scientific_name.replace(" ", "_")
            target_dir.mkdir(parents=True, exist_ok=True)

            # Collect the file list up front, before anything is moved.
            candidates = []
            for pattern in ("*.jpg", "*.jpeg", "*.png"):
                candidates.extend(species_dir.glob(pattern))

            for img_file in candidates:
                try:
                    # ".jpeg" is normalised to ".jpg"; the random suffix keeps
                    # target names unique.
                    suffix = img_file.suffix.lower()
                    ext = ".jpg" if suffix == ".jpeg" else suffix
                    unique_tag = uuid.uuid4().hex[:8]
                    new_filename = f"{source_name}_{img_file.stem}_{unique_tag}{ext}"
                    target_path = target_dir / new_filename

                    # A record with the same species, source and stem means
                    # the file was imported before.
                    already_imported = db.query(Image).filter(
                        Image.species_id == species.id,
                        Image.source == source_name,
                        Image.source_id == img_file.stem,
                    ).first()
                    if already_imported:
                        skipped += 1
                        continue

                    # Dimensions are optional; unreadable files still import.
                    try:
                        with PILImage.open(img_file) as pil_img:
                            width, height = pil_img.size
                    except Exception:
                        width = height = None

                    transfer(str(img_file), str(target_path))

                    db.add(Image(
                        species_id=species.id,
                        source=source_name,
                        source_id=img_file.stem,
                        url=f"file://{img_file}",
                        local_path=str(target_path),
                        license="unknown",
                        width=width,
                        height=height,
                        status="downloaded",
                    ))
                    imported += 1
                except Exception as exc:
                    errors.append(f"{img_file}: {str(exc)}")

            # Commit after each species to avoid large transactions
            db.commit()

    return {
        "imported": imported,
        "skipped": skipped,
        "errors": errors[:20],
    }

173
backend/app/api/jobs.py Normal file
View File

@@ -0,0 +1,173 @@
import json
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.orm import Session
from app.database import get_db
from app.models import Job
from app.schemas.job import JobCreate, JobResponse, JobListResponse
from app.workers.scrape_tasks import run_scrape_job
# Router that the job endpoints defined in this module register on.
router = APIRouter()
@router.get("", response_model=JobListResponse)
def list_jobs(
    status: Optional[str] = None,
    source: Optional[str] = None,
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
):
    """Return the newest jobs, optionally filtered by status and source.

    ``total`` counts every job matching the filters, while ``items`` holds at
    most ``limit`` of them, newest first.
    """
    matching = db.query(Job)
    if status:
        matching = matching.filter(Job.status == status)
    if source:
        matching = matching.filter(Job.source == source)

    job_count = matching.count()
    newest = matching.order_by(Job.created_at.desc()).limit(limit).all()

    serialized = []
    for row in newest:
        serialized.append(JobResponse.model_validate(row))
    return JobListResponse(items=serialized, total=job_count)
@router.post("", response_model=JobResponse)
def create_job(job: JobCreate, db: Session = Depends(get_db)):
    """Persist a new scrape job in "pending" state and queue its task.

    After the task is queued, its Celery task id is saved on the row.
    """
    # Selected species ids are stored as a JSON string; no selection is
    # stored as None.
    species_filter = json.dumps(job.species_ids) if job.species_ids else None

    record = Job(
        name=job.name,
        source=job.source,
        species_filter=species_filter,
        only_without_images=job.only_without_images,
        max_images=job.max_images,
        status="pending",
    )
    db.add(record)
    db.commit()
    db.refresh(record)

    # The task is queued with the row's id, so the row is committed and
    # refreshed before the task is dispatched.
    queued_task = run_scrape_job.delay(record.id)
    record.celery_task_id = queued_task.id
    db.commit()

    return JobResponse.model_validate(record)
@router.get("/{job_id}", response_model=JobResponse)
def get_job(job_id: int, db: Session = Depends(get_db)):
    """Return the stored job with the given id, or respond 404."""
    record = db.query(Job).filter(Job.id == job_id).first()
    if record:
        return JobResponse.model_validate(record)
    raise HTTPException(status_code=404, detail="Job not found")
@router.get("/{job_id}/progress")
def get_job_progress(job_id: int, db: Session = Depends(get_db)):
    """Get real-time job progress from Celery.

    While the Celery task reports the "PROGRESS" state with dict metadata,
    the response carries the live counters and the species being processed.
    In every other case the status and counters stored on the job row are
    returned.

    Raises:
        HTTPException: 404 when the job does not exist.
    """
    from app.workers.celery_app import celery_app

    job = db.query(Job).filter(Job.id == job_id).first()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    if job.celery_task_id:
        # Get Celery task state
        result = celery_app.AsyncResult(job.celery_task_id)
        if result.state == "PROGRESS":
            meta = result.info
            # State and info are read separately, so the task may have
            # finished in between and info may no longer be the progress
            # dict. Only use it when it really is one.
            if isinstance(meta, dict):
                return {
                    "status": "running",
                    "progress_current": meta.get("current", 0),
                    "progress_total": meta.get("total", 0),
                    "current_species": meta.get("species", ""),
                }

    # Fall back to what the job row itself records.
    return {
        "status": job.status,
        "progress_current": job.progress_current,
        "progress_total": job.progress_total,
    }
@router.post("/{job_id}/pause")
def pause_job(job_id: int, db: Session = Depends(get_db)):
    """Mark a running job as paused and revoke its Celery task.

    Responds 404 for an unknown job and 400 when the job is not running.
    """
    from app.workers.celery_app import celery_app

    job = db.query(Job).filter(Job.id == job_id).first()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status != "running":
        raise HTTPException(status_code=400, detail="Job is not running")

    task_id = job.celery_task_id
    if task_id:
        # The task is revoked with terminate=True.
        celery_app.control.revoke(task_id, terminate=True)

    job.status = "paused"
    db.commit()
    return {"status": "paused"}
@router.post("/{job_id}/resume")
def resume_job(job_id: int, db: Session = Depends(get_db)):
    """Queue a fresh scrape task for a paused job.

    Responds 404 for an unknown job and 400 when the job is not paused. The
    job goes back to "pending" and records the new task id.
    """
    job = db.query(Job).filter(Job.id == job_id).first()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")
    if job.status != "paused":
        raise HTTPException(status_code=400, detail="Job is not paused")

    # A new Celery task is queued for the same job id.
    queued_task = run_scrape_job.delay(job.id)
    job.celery_task_id = queued_task.id
    job.status = "pending"
    db.commit()
    return {"status": "resumed"}
@router.post("/{job_id}/cancel")
def cancel_job(job_id: int, db: Session = Depends(get_db)):
    """Cancel a job that has not finished yet.

    The Celery task is revoked and the job is stored as "failed" with the
    message "Cancelled by user". Responds 404 for an unknown job and 400 when
    the job is already completed or failed.
    """
    from app.workers.celery_app import celery_app

    job = db.query(Job).filter(Job.id == job_id).first()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    finished_states = ("completed", "failed")
    if job.status in finished_states:
        raise HTTPException(status_code=400, detail="Job already finished")

    task_id = job.celery_task_id
    if task_id:
        celery_app.control.revoke(task_id, terminate=True)

    # The row is stored with status "failed" plus an explanatory message.
    job.status = "failed"
    job.error_message = "Cancelled by user"
    db.commit()
    return {"status": "cancelled"}

198
backend/app/api/sources.py Normal file
View File

@@ -0,0 +1,198 @@
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.orm import Session
from app.database import get_db
from app.models import ApiKey
from app.schemas.api_key import ApiKeyCreate, ApiKeyUpdate, ApiKeyResponse
# Router that the source-configuration endpoints in this module register on.
router = APIRouter()
# Available sources: the catalogue of sources the endpoints below look up by "name".
# label: display label returned by the list/get endpoints
# requires_secret: returned to API clients unchanged; the validation visible in
#   this module relies on auth_type instead
# auth_type: "none" (no auth), "api_key" (single key), "api_key_secret" (key + secret), "oauth" (client_id + client_secret + access_token)
# default_rate: safe default requests per second for each API; reported as the
#   rate limit while a source has no stored configuration
AVAILABLE_SOURCES = [
    {"name": "gbif", "label": "GBIF", "requires_secret": False, "auth_type": "none", "default_rate": 1.0}, # Free, no auth required
    {"name": "inaturalist", "label": "iNaturalist", "requires_secret": True, "auth_type": "api_key_secret", "default_rate": 1.0}, # 60/min limit
    {"name": "flickr", "label": "Flickr", "requires_secret": True, "auth_type": "api_key_secret", "default_rate": 0.5}, # 3600/hr shared limit
    {"name": "wikimedia", "label": "Wikimedia Commons", "requires_secret": True, "auth_type": "oauth", "default_rate": 1.0}, # generous limits
    {"name": "trefle", "label": "Trefle.io", "requires_secret": False, "auth_type": "api_key", "default_rate": 1.0}, # 120/min limit
    {"name": "duckduckgo", "label": "DuckDuckGo", "requires_secret": False, "auth_type": "none", "default_rate": 0.5}, # Web search, no API key
    {"name": "bing", "label": "Bing Image Search", "requires_secret": False, "auth_type": "api_key", "default_rate": 3.0}, # Azure Cognitive Services
]
def mask_api_key(key: str) -> str:
    """Hide all but the last four characters of an API key.

    Empty or missing keys, and keys of four characters or fewer, are
    rendered as a fixed "****" so nothing of them is revealed.
    """
    visible = 4
    if key and len(key) > visible:
        hidden = len(key) - visible
        return f"{'*' * hidden}{key[-visible:]}"
    return "****"
@router.get("")
def list_sources(db: Session = Depends(get_db)):
    """Describe every available source and whether it is configured.

    Stored API keys are never returned in full; only a masked form and
    has-secret / has-token flags are exposed.
    """
    stored_by_source = {}
    for stored in db.query(ApiKey).all():
        stored_by_source[stored.source] = stored

    described = []
    for source in AVAILABLE_SOURCES:
        stored = stored_by_source.get(source["name"])
        default_rate = source.get("default_rate", 1.0)

        entry = {
            "name": source["name"],
            "label": source["label"],
            "requires_secret": source["requires_secret"],
            "auth_type": source.get("auth_type", "api_key"),
            "configured": stored is not None,
        }
        if stored:
            entry.update(
                enabled=stored.enabled,
                api_key_masked=mask_api_key(stored.api_key),
                has_secret=bool(stored.api_secret),
                has_access_token=bool(getattr(stored, 'access_token', None)),
                rate_limit_per_sec=stored.rate_limit_per_sec,
            )
        else:
            # Unconfigured sources report the catalogue's default rate.
            entry.update(
                enabled=False,
                api_key_masked=None,
                has_secret=False,
                has_access_token=False,
                rate_limit_per_sec=default_rate,
            )
        entry["default_rate"] = default_rate
        described.append(entry)

    return described
@router.get("/{source}")
def get_source(source: str, db: Session = Depends(get_db)):
    """Describe one source and its stored configuration, if any.

    Responds 404 when ``source`` is not in the catalogue of available
    sources.
    """
    source_info = None
    for candidate in AVAILABLE_SOURCES:
        if candidate["name"] == source:
            source_info = candidate
            break
    if not source_info:
        raise HTTPException(status_code=404, detail="Unknown source")

    stored = db.query(ApiKey).filter(ApiKey.source == source).first()
    default_rate = source_info.get("default_rate", 1.0)

    # Values that depend on whether a configuration row exists.
    if stored:
        enabled = stored.enabled
        masked_key = mask_api_key(stored.api_key)
        has_secret = bool(stored.api_secret)
        has_access_token = bool(getattr(stored, 'access_token', None))
        rate_limit = stored.rate_limit_per_sec
    else:
        enabled = False
        masked_key = None
        has_secret = False
        has_access_token = False
        rate_limit = default_rate

    return {
        "name": source_info["name"],
        "label": source_info["label"],
        "requires_secret": source_info["requires_secret"],
        "auth_type": source_info.get("auth_type", "api_key"),
        "configured": stored is not None,
        "enabled": enabled,
        "api_key_masked": masked_key,
        "has_secret": has_secret,
        "has_access_token": has_access_token,
        "rate_limit_per_sec": rate_limit,
        "default_rate": default_rate,
    }
@router.put("/{source}")
def update_source(
    source: str,
    config: ApiKeyCreate,
    db: Session = Depends(get_db),
):
    """Store the configuration for a source, creating the row if needed.

    Responds 404 for a source outside the catalogue and 400 when a source
    that needs authentication is submitted without an API key. On update, an
    empty secret or access token leaves the stored value untouched.
    """
    source_info = None
    for candidate in AVAILABLE_SOURCES:
        if candidate["name"] == source:
            source_info = candidate
            break
    if not source_info:
        raise HTTPException(status_code=404, detail="Unknown source")

    auth_type = source_info.get("auth_type", "api_key")
    needs_key = auth_type != "none"
    if needs_key and not config.api_key:
        raise HTTPException(status_code=400, detail="API key is required for this source")

    stored = db.query(ApiKey).filter(ApiKey.source == source).first()

    # Sources without authentication get a placeholder key value.
    key_value = config.api_key or "no-auth"

    if not stored:
        stored = ApiKey(
            source=source,
            api_key=key_value,
            api_secret=config.api_secret,
            access_token=config.access_token,
            rate_limit_per_sec=config.rate_limit_per_sec,
            enabled=config.enabled,
        )
        db.add(stored)
    else:
        stored.api_key = key_value
        if config.api_secret:
            stored.api_secret = config.api_secret
        if config.access_token:
            stored.access_token = config.access_token
        stored.rate_limit_per_sec = config.rate_limit_per_sec
        stored.enabled = config.enabled

    db.commit()
    db.refresh(stored)

    masked_key = mask_api_key(stored.api_key) if needs_key else None
    return {
        "name": source,
        "configured": True,
        "enabled": stored.enabled,
        "api_key_masked": masked_key,
        "has_secret": bool(stored.api_secret),
        "has_access_token": bool(stored.access_token),
        "rate_limit_per_sec": stored.rate_limit_per_sec,
    }
@router.patch("/{source}")
def patch_source(
    source: str,
    config: ApiKeyUpdate,
    db: Session = Depends(get_db),
):
    """Apply only the fields present in the request to a stored source.

    Responds 404 when the source has no stored configuration.
    """
    stored = db.query(ApiKey).filter(ApiKey.source == source).first()
    if not stored:
        raise HTTPException(status_code=404, detail="Source not configured")

    # exclude_unset keeps fields the client did not send out of the update.
    changes = config.model_dump(exclude_unset=True)
    for field_name in changes:
        setattr(stored, field_name, changes[field_name])

    db.commit()
    db.refresh(stored)

    summary = {"name": source, "configured": True}
    summary["enabled"] = stored.enabled
    summary["api_key_masked"] = mask_api_key(stored.api_key)
    summary["has_secret"] = bool(stored.api_secret)
    summary["has_access_token"] = bool(stored.access_token)
    summary["rate_limit_per_sec"] = stored.rate_limit_per_sec
    return summary
@router.delete("/{source}")
def delete_source(source: str, db: Session = Depends(get_db)):
    """Remove the stored configuration of a source, or respond 404."""
    stored = db.query(ApiKey).filter(ApiKey.source == source).first()
    if not stored:
        raise HTTPException(status_code=404, detail="Source not configured")

    db.delete(stored)
    db.commit()
    return {"status": "deleted"}
@router.post("/{source}/test")
def test_source(source: str, db: Session = Depends(get_db)):
    """Run the scraper's connection test with the stored configuration.

    Responds 404 when the source is not configured and 400 when no scraper
    exists for it. A failing connection test is reported in the response
    body with status "error" rather than as an HTTP error.
    """
    stored = db.query(ApiKey).filter(ApiKey.source == source).first()
    if not stored:
        raise HTTPException(status_code=404, detail="Source not configured")

    # Imported here, as in the original endpoint.
    from app.scrapers import get_scraper

    scraper = get_scraper(source)
    if not scraper:
        raise HTTPException(status_code=400, detail="No scraper for this source")

    try:
        outcome = scraper.test_connection(stored)
    except Exception as exc:
        return {"status": "error", "message": str(exc)}
    return {"status": "success", "message": outcome}

366
backend/app/api/species.py Normal file
View File

@@ -0,0 +1,366 @@
import csv
import io
import json
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query, UploadFile, File
from sqlalchemy.orm import Session
from sqlalchemy import func, text
from app.database import get_db
from app.models import Species, Image
from app.schemas.species import (
SpeciesCreate,
SpeciesUpdate,
SpeciesResponse,
SpeciesListResponse,
SpeciesImportResponse,
)
# Router that the species endpoints defined in this module register on.
router = APIRouter()
def get_species_with_count(db: Session, species: Species) -> SpeciesResponse:
    """Build the response for one species, including its image count.

    Only images in "downloaded" status are counted.
    """
    downloaded = db.query(func.count(Image.id)).filter(
        Image.species_id == species.id,
        Image.status == "downloaded",
    )
    image_count = downloaded.scalar()
    if not image_count:
        # A missing count is reported as zero.
        image_count = 0

    return SpeciesResponse(
        id=species.id,
        scientific_name=species.scientific_name,
        common_name=species.common_name,
        genus=species.genus,
        family=species.family,
        created_at=species.created_at,
        image_count=image_count,
    )
@router.get("", response_model=SpeciesListResponse)
def list_species(
    page: int = Query(1, ge=1),
    page_size: int = Query(50, ge=1, le=500),
    search: Optional[str] = None,
    genus: Optional[str] = None,
    has_images: Optional[bool] = None,
    max_images: Optional[int] = Query(None, description="Filter species with less than N images"),
    min_images: Optional[int] = Query(None, description="Filter species with at least N images"),
    db: Session = Depends(get_db),
):
    """Return one page of species ordered by scientific name.

    Filters:
    - search: case-insensitive match on scientific or common name
    - genus: exact genus
    - has_images: True keeps species with downloaded images, False keeps
      species without any; ignored when max_images or min_images is given
    - max_images: species with fewer than N downloaded images
    - min_images: species with at least N downloaded images

    Every returned item carries its count of downloaded images.
    """
    filtering_by_count = max_images is not None or min_images is not None

    if not filtering_by_count:
        query = db.query(Species)
    else:
        # The outer join keeps species without images, counted as zero.
        image_counts = (
            db.query(
                Species.id.label("species_id"),
                func.count(Image.id).label("img_count"),
            )
            .outerjoin(Image, (Image.species_id == Species.id) & (Image.status == "downloaded"))
            .group_by(Species.id)
            .subquery()
        )
        query = db.query(Species).join(
            image_counts, Species.id == image_counts.c.species_id
        )
        if max_images is not None:
            query = query.filter(image_counts.c.img_count < max_images)
        if min_images is not None:
            query = query.filter(image_counts.c.img_count >= min_images)

    if search:
        pattern = f"%{search}%"
        query = query.filter(
            (Species.scientific_name.ilike(pattern)) |
            (Species.common_name.ilike(pattern))
        )

    if genus:
        query = query.filter(Species.genus == genus)

    # has_images only applies when no count-based filter was requested.
    if has_images is not None and not filtering_by_count:
        species_with_images = (
            db.query(Image.species_id)
            .filter(Image.status == "downloaded")
            .distinct()
            .subquery()
        )
        ids_with_images = db.query(species_with_images.c.species_id)
        if has_images:
            query = query.filter(Species.id.in_(ids_with_images))
        else:
            query = query.filter(~Species.id.in_(ids_with_images))

    total = query.count()
    full_pages, remainder = divmod(total, page_size)
    pages = full_pages + (1 if remainder else 0)

    first_row = (page - 1) * page_size
    page_rows = (
        query.order_by(Species.scientific_name)
        .offset(first_row)
        .limit(page_size)
        .all()
    )

    # One grouped query gives the image counts for the whole page.
    count_map = {}
    page_ids = [row.id for row in page_rows]
    if page_ids:
        grouped = db.query(
            Image.species_id,
            func.count(Image.id),
        ).filter(
            Image.species_id.in_(page_ids),
            Image.status == "downloaded",
        ).group_by(Image.species_id).all()
        for counted_id, counted in grouped:
            count_map[counted_id] = counted

    items = []
    for row in page_rows:
        items.append(SpeciesResponse(
            id=row.id,
            scientific_name=row.scientific_name,
            common_name=row.common_name,
            genus=row.genus,
            family=row.family,
            created_at=row.created_at,
            image_count=count_map.get(row.id, 0),
        ))

    return SpeciesListResponse(
        items=items,
        total=total,
        page=page,
        page_size=page_size,
        pages=pages,
    )
@router.post("", response_model=SpeciesResponse)
def create_species(species: SpeciesCreate, db: Session = Depends(get_db)):
    """Add a species and return it together with its image count.

    Responds 400 when a species with the same scientific name exists. When
    no genus is supplied and the scientific name contains a space, the first
    word of the name is used as the genus.
    """
    duplicate = db.query(Species).filter(
        Species.scientific_name == species.scientific_name
    ).first()
    if duplicate:
        raise HTTPException(status_code=400, detail="Species already exists")

    genus = species.genus
    name_has_space = " " in species.scientific_name
    if name_has_space and not genus:
        genus = species.scientific_name.split()[0]

    record = Species(
        scientific_name=species.scientific_name,
        common_name=species.common_name,
        genus=genus,
        family=species.family,
    )
    db.add(record)
    db.commit()
    db.refresh(record)

    return get_species_with_count(db, record)
@router.post("/import", response_model=SpeciesImportResponse)
async def import_species(
    file: UploadFile = File(...),
    db: Session = Depends(get_db),
):
    """Import species from CSV file.

    Expected columns: scientific_name, common_name (optional), genus (optional), family (optional)

    Rows whose scientific name already exists in the database, or was already
    seen earlier in the same file, are counted as skipped. Rows without a
    scientific name are reported in ``errors`` (first 10 only). All accepted
    rows are committed together at the end.

    Raises:
        HTTPException: 400 if the upload is not named ``*.csv`` or is not
            valid UTF-8.
    """
    # filename is optional on an upload, so guard against None.
    if not (file.filename or "").endswith(".csv"):
        raise HTTPException(status_code=400, detail="File must be a CSV")

    content = await file.read()
    try:
        # utf-8-sig drops a leading BOM (common in spreadsheet exports) that
        # would otherwise become part of the first header name.
        text = content.decode("utf-8-sig")
    except UnicodeDecodeError as exc:
        raise HTTPException(
            status_code=400, detail="File must be UTF-8 encoded"
        ) from exc

    reader = csv.DictReader(io.StringIO(text))

    def cell(row, column):
        # DictReader fills cells missing from short rows with None.
        return (row.get(column) or "").strip()

    imported = 0
    skipped = 0
    errors = []
    # The session does not autoflush, so pending rows are invisible to the
    # existence query below; track names queued during this import ourselves.
    queued_names = set()

    for row_num, row in enumerate(reader, start=2):  # row 1 is the header
        scientific_name = cell(row, "scientific_name")
        if not scientific_name:
            errors.append(f"Row {row_num}: Missing scientific_name")
            continue

        if scientific_name in queued_names:
            skipped += 1
            continue

        # Check if already exists
        existing = db.query(Species).filter(
            Species.scientific_name == scientific_name
        ).first()
        if existing:
            skipped += 1
            continue

        # Auto-extract genus if not provided
        genus = cell(row, "genus")
        if not genus and " " in scientific_name:
            genus = scientific_name.split()[0]

        db.add(
            Species(
                scientific_name=scientific_name,
                common_name=cell(row, "common_name") or None,
                genus=genus or None,
                family=cell(row, "family") or None,
            )
        )
        queued_names.add(scientific_name)
        imported += 1

    db.commit()

    return SpeciesImportResponse(
        imported=imported,
        skipped=skipped,
        errors=errors[:10],  # Limit error messages
    )
@router.post("/import-json", response_model=SpeciesImportResponse)
async def import_species_json(
    file: UploadFile = File(...),
    db: Session = Depends(get_db),
):
    """Import species from JSON file.

    Expected format: {"plants": [{"scientific_name": "...", "common_names": [...], "family": "..."}]}

    Entries whose scientific name already exists in the database, or was
    already seen earlier in the same file, are counted as skipped. Entries
    that are not objects or lack a scientific name are reported in ``errors``
    (first 10 only). Accepted entries are committed together at the end.

    Raises:
        HTTPException: 400 if the upload is not named ``*.json``, cannot be
            decoded/parsed, or contains no ``plants`` list.
    """
    # filename is optional on an upload, so guard against None.
    if not (file.filename or "").endswith(".json"):
        raise HTTPException(status_code=400, detail="File must be a JSON")

    content = await file.read()
    try:
        # utf-8-sig tolerates a leading BOM, which json.loads rejects.
        data = json.loads(content.decode("utf-8-sig"))
    except (UnicodeDecodeError, json.JSONDecodeError) as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")

    # The root must be an object holding a non-empty "plants" list.
    plants = data.get("plants") if isinstance(data, dict) else None
    if not plants or not isinstance(plants, list):
        raise HTTPException(status_code=400, detail="No plants found in JSON")

    imported = 0
    skipped = 0
    errors = []
    # The session does not autoflush, so pending rows are invisible to the
    # existence query below; track names queued during this import ourselves.
    queued_names = set()

    for idx, plant in enumerate(plants):
        raw_name = plant.get("scientific_name") if isinstance(plant, dict) else None
        scientific_name = raw_name.strip() if isinstance(raw_name, str) else ""
        if not scientific_name:
            errors.append(f"Plant {idx}: Missing scientific_name")
            continue

        if scientific_name in queued_names:
            skipped += 1
            continue

        # Check if already exists
        existing = db.query(Species).filter(
            Species.scientific_name == scientific_name
        ).first()
        if existing:
            skipped += 1
            continue

        # Auto-extract genus from scientific name
        genus = scientific_name.split()[0] if " " in scientific_name else None

        # Use the first common name of a list; accept a bare string as-is
        # rather than indexing into it (which would yield one character).
        common_names = plant.get("common_names")
        if isinstance(common_names, str):
            common_name = common_names or None
        elif isinstance(common_names, list) and common_names:
            common_name = common_names[0]
        else:
            common_name = None

        db.add(
            Species(
                scientific_name=scientific_name,
                common_name=common_name,
                genus=genus,
                family=plant.get("family"),
            )
        )
        queued_names.add(scientific_name)
        imported += 1

    db.commit()

    return SpeciesImportResponse(
        imported=imported,
        skipped=skipped,
        errors=errors[:10],
    )
@router.get("/{species_id}", response_model=SpeciesResponse)
def get_species(species_id: int, db: Session = Depends(get_db)):
    """Look up one species by primary key.

    The response is built by ``get_species_with_count``.

    Raises:
        HTTPException: 404 when no species has the given ID.
    """
    match = db.query(Species).filter(Species.id == species_id).first()
    if match is None:
        raise HTTPException(status_code=404, detail="Species not found")

    return get_species_with_count(db, match)
@router.put("/{species_id}", response_model=SpeciesResponse)
def update_species(
    species_id: int,
    species_update: SpeciesUpdate,
    db: Session = Depends(get_db),
):
    """Update a species.

    Only fields explicitly present in the request body are written.

    Raises:
        HTTPException: 404 if the species does not exist; 400 if the request
            sets ``scientific_name`` to null or to a name already used by a
            different species.
    """
    species = db.query(Species).filter(Species.id == species_id).first()
    if not species:
        raise HTTPException(status_code=404, detail="Species not found")

    update_data = species_update.model_dump(exclude_unset=True)

    # scientific_name is NOT NULL and UNIQUE on the model; validate here so a
    # bad request yields a 400 instead of an integrity error at commit time.
    if "scientific_name" in update_data:
        new_name = update_data["scientific_name"]
        if new_name is None:
            raise HTTPException(
                status_code=400, detail="scientific_name cannot be null"
            )
        clash = db.query(Species).filter(
            Species.scientific_name == new_name,
            Species.id != species_id,
        ).first()
        if clash:
            raise HTTPException(status_code=400, detail="Species already exists")

    for field, value in update_data.items():
        setattr(species, field, value)

    db.commit()
    db.refresh(species)

    return get_species_with_count(db, species)
@router.delete("/{species_id}")
def delete_species(species_id: int, db: Session = Depends(get_db)):
    """Remove a species together with its images.

    Raises:
        HTTPException: 404 when no species has the given ID.
    """
    target = db.query(Species).filter(Species.id == species_id).first()
    if target is None:
        raise HTTPException(status_code=404, detail="Species not found")

    db.delete(target)
    db.commit()

    outcome = {"status": "deleted"}
    return outcome
@router.get("/genera/list")
def list_genera(db: Session = Depends(get_db)):
    """Return every distinct non-null genus, sorted by name."""
    rows = (
        db.query(Species.genus)
        .filter(Species.genus.isnot(None))
        .distinct()
        .order_by(Species.genus)
        .all()
    )
    # Each row is a one-column result; unwrap it to the bare genus string.
    return [genus for (genus,) in rows]

190
backend/app/api/stats.py Normal file
View File

@@ -0,0 +1,190 @@
import json
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.orm import Session
from sqlalchemy import func, case
from app.database import get_db
from app.models import Species, Image, Job
from app.models.cached_stats import CachedStats
from app.schemas.stats import StatsResponse, SourceStats, LicenseStats, SpeciesStats, JobStats
router = APIRouter()
@router.get("", response_model=StatsResponse)
def get_stats(db: Session = Depends(get_db)):
    """Serve dashboard statistics from the ``dashboard_stats`` cache row.

    The row holds a JSON document; when it is absent, an all-zero response is
    returned instead.
    """
    cache_row = (
        db.query(CachedStats)
        .filter(CachedStats.key == "dashboard_stats")
        .first()
    )

    if cache_row is None:
        # Nothing cached yet (e.g. first startup before the refresh task ran).
        return StatsResponse(
            total_species=0,
            total_images=0,
            images_downloaded=0,
            images_pending=0,
            images_rejected=0,
            disk_usage_mb=0.0,
            sources=[],
            licenses=[],
            jobs=JobStats(running=0, pending=0, completed=0, failed=0),
            top_species=[],
            under_represented=[],
        )

    payload = json.loads(cache_row.value)

    # Rebuild the nested schema objects from their cached dict form.
    source_items = [SourceStats(**entry) for entry in payload["sources"]]
    license_items = [LicenseStats(**entry) for entry in payload["licenses"]]
    job_summary = JobStats(**payload["jobs"])
    top_items = [SpeciesStats(**entry) for entry in payload["top_species"]]
    sparse_items = [SpeciesStats(**entry) for entry in payload["under_represented"]]

    return StatsResponse(
        total_species=payload["total_species"],
        total_images=payload["total_images"],
        images_downloaded=payload["images_downloaded"],
        images_pending=payload["images_pending"],
        images_rejected=payload["images_rejected"],
        disk_usage_mb=payload["disk_usage_mb"],
        sources=source_items,
        licenses=license_items,
        jobs=job_summary,
        top_species=top_items,
        under_represented=sparse_items,
    )
@router.post("/refresh")
def refresh_stats_now(db: Session = Depends(get_db)):
    """Queue an immediate run of the stats refresh task."""
    # Imported at call time rather than at module import.
    from app.workers.stats_tasks import refresh_stats as refresh_task

    refresh_task.delay()

    acknowledgement = {"status": "refresh_queued"}
    return acknowledgement
@router.get("/sources")
def get_source_stats(db: Session = Depends(get_db)):
    """Return image totals per source, split by download status."""

    def status_total(status):
        # Conditional sum: counts only rows whose status matches.
        return func.sum(case((Image.status == status, 1), else_=0)).label(status)

    rows = (
        db.query(
            Image.source,
            func.count(Image.id).label("total"),
            status_total("downloaded"),
            status_total("pending"),
            status_total("rejected"),
        )
        .group_by(Image.source)
        .all()
    )

    breakdown = []
    for row in rows:
        breakdown.append(
            {
                "source": row.source,
                "total": row.total,
                # `or 0` normalises a falsy aggregate to an integer zero.
                "downloaded": row.downloaded or 0,
                "pending": row.pending or 0,
                "rejected": row.rejected or 0,
            }
        )
    return breakdown
@router.get("/species")
def get_species_stats(
    min_count: int = 0,
    max_count: int = None,
    db: Session = Depends(get_db),
):
    """Return the number of downloaded images for each species.

    Species with no downloaded images are included with a count of zero.
    Results are ordered by image count, highest first.

    Args:
        min_count: When greater than zero, keep only species with at least
            this many downloaded images.
        max_count: When given, keep only species with at most this many.
    """
    downloaded_total = func.count(Image.id)
    # Putting the status test in the join condition (not a WHERE clause)
    # keeps species without matching images in the outer-joined result.
    downloaded_for_species = (Image.species_id == Species.id) & (
        Image.status == "downloaded"
    )

    query = (
        db.query(
            Species.id,
            Species.scientific_name,
            Species.common_name,
            Species.genus,
            downloaded_total.label("image_count"),
        )
        .outerjoin(Image, downloaded_for_species)
        .group_by(Species.id)
    )

    if min_count > 0:
        query = query.having(downloaded_total >= min_count)
    if max_count is not None:
        query = query.having(downloaded_total <= max_count)

    rows = query.order_by(downloaded_total.desc()).all()

    fields = ("id", "scientific_name", "common_name", "genus", "image_count")
    return [{field: getattr(row, field) for field in fields} for row in rows]
@router.get("/distribution")
def get_image_distribution(db: Session = Depends(get_db)):
    """Summarise how many downloaded images each species has.

    Buckets species by downloaded-image count and reports how many clear
    several minimum-image thresholds, to help judge whether the dataset is
    large enough for training an image classifier.
    """
    from sqlalchemy import text

    # One pass over per-species counts; every bucket is a conditional sum.
    distribution_sql = text("""
WITH species_counts AS (
SELECT
s.id,
COUNT(i.id) as cnt
FROM species s
LEFT JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
GROUP BY s.id
)
SELECT
COUNT(*) as total_species,
SUM(CASE WHEN cnt = 0 THEN 1 ELSE 0 END) as with_0,
SUM(CASE WHEN cnt >= 1 AND cnt < 10 THEN 1 ELSE 0 END) as with_1_9,
SUM(CASE WHEN cnt >= 10 AND cnt < 25 THEN 1 ELSE 0 END) as with_10_24,
SUM(CASE WHEN cnt >= 25 AND cnt < 50 THEN 1 ELSE 0 END) as with_25_49,
SUM(CASE WHEN cnt >= 50 AND cnt < 100 THEN 1 ELSE 0 END) as with_50_99,
SUM(CASE WHEN cnt >= 100 AND cnt < 200 THEN 1 ELSE 0 END) as with_100_199,
SUM(CASE WHEN cnt >= 200 THEN 1 ELSE 0 END) as with_200_plus,
SUM(CASE WHEN cnt >= 10 THEN 1 ELSE 0 END) as trainable_10,
SUM(CASE WHEN cnt >= 25 THEN 1 ELSE 0 END) as trainable_25,
SUM(CASE WHEN cnt >= 50 THEN 1 ELSE 0 END) as trainable_50,
SUM(CASE WHEN cnt >= 100 THEN 1 ELSE 0 END) as trainable_100,
AVG(cnt) as avg_images,
MAX(cnt) as max_images,
MIN(cnt) as min_images,
SUM(cnt) as total_images
FROM species_counts
""")

    row = db.execute(distribution_sql).fetchone()

    # Columns arrive in SELECT order; a falsy aggregate (e.g. NULL over an
    # empty table) is normalised to 0.
    (
        total_species,
        with_0,
        with_1_9,
        with_10_24,
        with_25_49,
        with_50_99,
        with_100_199,
        with_200_plus,
        trainable_10,
        trainable_25,
        trainable_50,
        trainable_100,
        avg_images,
        max_images,
        min_images,
        total_images,
    ) = (value or 0 for value in row)

    distribution = {
        "0_images": with_0,
        "1_to_9": with_1_9,
        "10_to_24": with_10_24,
        "25_to_49": with_25_49,
        "50_to_99": with_50_99,
        "100_to_199": with_100_199,
        "200_plus": with_200_plus,
    }
    trainable = {
        "min_10_images": trainable_10,
        "min_25_images": trainable_25,
        "min_50_images": trainable_50,
        "min_100_images": trainable_100,
    }
    summary = {
        "avg_images_per_species": round(avg_images, 1),
        "max_images": max_images,
        "min_images": min_images,
        "total_downloaded_images": total_images,
    }
    recommendations = {
        "for_basic_model": f"{trainable_10} species with 10+ images",
        "for_good_model": f"{trainable_50} species with 50+ images",
        "for_excellent_model": f"{trainable_100} species with 100+ images",
    }

    return {
        "total_species": total_species,
        "distribution": distribution,
        "trainable_species": trainable,
        "summary": summary,
        "recommendations": recommendations,
    }

38
backend/app/config.py Normal file
View File

@@ -0,0 +1,38 @@
from pydantic_settings import BaseSettings
from functools import lru_cache
class Settings(BaseSettings):
    """Application configuration with built-in defaults.

    Each attribute below is a setting with its default value. The nested
    ``Config`` points the settings loader at a ``.env`` file and tells it to
    ignore entries that do not correspond to a declared setting.
    """

    # Database
    database_url: str = "sqlite:////data/db/plants.sqlite"

    # Redis
    redis_url: str = "redis://redis:6379/0"

    # Storage paths
    images_path: str = "/data/images"
    exports_path: str = "/data/exports"
    imports_path: str = "/data/imports"
    logs_path: str = "/data/logs"

    # API Keys (empty string means "not configured")
    flickr_api_key: str = ""
    flickr_api_secret: str = ""
    inaturalist_app_id: str = ""
    inaturalist_app_secret: str = ""
    trefle_api_key: str = ""

    # Logging
    log_level: str = "INFO"

    # Celery
    celery_concurrency: int = 4

    class Config:
        env_file = ".env"
        extra = "ignore"
@lru_cache()
def get_settings() -> Settings:
    """Return the application settings.

    The result is memoised by ``lru_cache``, so ``Settings()`` is constructed
    on the first call only and the same instance is returned afterwards.
    """
    return Settings()

44
backend/app/database.py Normal file
View File

@@ -0,0 +1,44 @@
from sqlalchemy import create_engine, event
from sqlalchemy.orm import sessionmaker, declarative_base
from sqlalchemy.pool import StaticPool
from app.config import get_settings
settings = get_settings()

# SQLite-specific configuration: check_same_thread=False lifts sqlite3's
# restriction that a connection may only be used by the thread that made it.
connect_args = {"check_same_thread": False}

# NOTE(review): StaticPool maintains exactly one DBAPI connection and hands it
# to every session. With a file-backed database and multiple request threads
# that means concurrent sessions share one connection (and its transaction
# state) — confirm this is intended rather than the default pool.
engine = create_engine(
    settings.database_url,
    connect_args=connect_args,
    poolclass=StaticPool,
    echo=False,
)
# Enable WAL mode for better concurrent access
@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Apply the SQLite PRAGMAs each time the engine opens a connection.

    Sets write-ahead-log journaling, NORMAL synchronous mode and foreign-key
    enforcement, in that order.
    """
    pragma_statements = (
        "PRAGMA journal_mode=WAL",
        "PRAGMA synchronous=NORMAL",
        "PRAGMA foreign_keys=ON",
    )

    cur = dbapi_connection.cursor()
    for statement in pragma_statements:
        cur.execute(statement)
    cur.close()
# Session factory bound to the engine. Sessions neither autocommit nor
# autoflush: pending objects are not visible to queries until an explicit
# flush or commit.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class that the ORM models inherit from.
Base = declarative_base()
def get_db():
    """Yield a database session and close it when the consumer is finished.

    Written as a generator so it can be used as a FastAPI dependency
    (``Depends(get_db)``); the ``finally`` clause closes the session even if
    the consumer raised.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
def init_db():
    """Create all tables."""
    # Importing the model modules registers their tables on Base.metadata;
    # the names themselves are unused, hence the noqa.
    from app.models import species, image, job, api_key, export, cached_stats  # noqa
    Base.metadata.create_all(bind=engine)

95
backend/app/main.py Normal file
View File

@@ -0,0 +1,95 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.config import get_settings
from app.database import init_db
from app.api import species, images, jobs, exports, stats, sources
settings = get_settings()

# The ASGI application object; title/description/version feed the generated
# API documentation.
app = FastAPI(
    title="PlantGuideScraper API",
    description="Web scraper interface for houseplant image collection",
    version="1.0.0",
)

# CORS middleware
# NOTE(review): wildcard origins are combined with allow_credentials=True.
# Browsers do not accept a literal "*" origin on credentialed requests, and
# allowing credentials from any origin is rarely intended — confirm, and
# consider listing the frontend origin(s) explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers (each API module is mounted under its own /api/... prefix)
app.include_router(species.router, prefix="/api/species", tags=["Species"])
app.include_router(images.router, prefix="/api/images", tags=["Images"])
app.include_router(jobs.router, prefix="/api/jobs", tags=["Jobs"])
app.include_router(exports.router, prefix="/api/exports", tags=["Exports"])
app.include_router(stats.router, prefix="/api/stats", tags=["Stats"])
app.include_router(sources.router, prefix="/api/sources", tags=["Sources"])
@app.on_event("startup")
async def startup_event():
    """Initialize database on startup.

    Calls ``init_db`` once when the application starts.
    """
    init_db()
@app.get("/health")
async def health_check():
    """Liveness probe: always reports the service as healthy.

    Returns a fixed payload and does not touch the database.
    """
    payload = {"status": "healthy", "service": "plant-scraper"}
    return payload
@app.get("/api/debug")
async def debug_check():
    """Run a short series of database checks and report each outcome.

    Checks, in order: opening a session, counting species, counting images.
    Each passing check records its duration in milliseconds (and the count,
    where applicable). The first failing check records its error message,
    sets the overall status to "error" and ends the run.
    """
    import time
    from app.database import SessionLocal
    from app.models import Species, Image

    def elapsed_ms(started):
        return int((time.time() - started) * 1000)

    checks = {}
    results = {"status": "checking", "checks": checks}

    # Check 1: Can we create a session?
    started = time.time()
    try:
        db = SessionLocal()
    except Exception as e:
        checks["session_create"] = {"ok": False, "error": str(e)}
        results["status"] = "error"
        return results
    checks["session_create"] = {"ok": True, "ms": elapsed_ms(started)}

    # Checks 2 and 3: simple row counts; the session is closed on every path.
    count_checks = (("species_count", Species), ("image_count", Image))
    try:
        for check_name, model in count_checks:
            started = time.time()
            try:
                count = db.query(model).count()
            except Exception as e:
                checks[check_name] = {"ok": False, "error": str(e)}
                results["status"] = "error"
                return results
            checks[check_name] = {
                "ok": True,
                "count": count,
                "ms": elapsed_ms(started),
            }
    finally:
        db.close()

    results["status"] = "healthy"
    return results
@app.get("/")
async def root():
    """Landing endpoint: names the API and points at the docs path."""
    greeting = {"message": "PlantGuideScraper API", "docs": "/docs"}
    return greeting

View File

@@ -0,0 +1,8 @@
from app.models.species import Species
from app.models.image import Image
from app.models.job import Job
from app.models.api_key import ApiKey
from app.models.export import Export
from app.models.cached_stats import CachedStats
# Public names of the models package: one ORM class per model module.
__all__ = ["Species", "Image", "Job", "ApiKey", "Export", "CachedStats"]

View File

@@ -0,0 +1,18 @@
from sqlalchemy import Column, Integer, String, Float, Boolean
from app.database import Base
class ApiKey(Base):
    """Credentials and rate-limit settings for one scraper source.

    ``source`` is unique, so there is at most one row per source.
    """

    __tablename__ = "api_keys"

    id = Column(Integer, primary_key=True, index=True)
    source = Column(String, unique=True, nullable=False)  # 'flickr', 'inaturalist', 'wikimedia', 'trefle'
    api_key = Column(String, nullable=False)  # Also used as Client ID for OAuth sources
    api_secret = Column(String, nullable=True)  # Also used as Client Secret for OAuth sources
    access_token = Column(String, nullable=True)  # For OAuth sources like Wikimedia
    rate_limit_per_sec = Column(Float, default=1.0)
    enabled = Column(Boolean, default=True)

    def __repr__(self):
        # Shows id, source and enabled only; key/secret/token are not included.
        return f"<ApiKey(id={self.id}, source='{self.source}', enabled={self.enabled})>"

View File

@@ -0,0 +1,14 @@
from datetime import datetime
from sqlalchemy import Column, Integer, String, Text, DateTime
from app.database import Base
class CachedStats(Base):
    """Stores pre-calculated statistics updated by Celery beat.

    A key/value table: ``key`` is unique and ``value`` holds a JSON document.
    """

    __tablename__ = "cached_stats"

    id = Column(Integer, primary_key=True, index=True)
    key = Column(String(50), unique=True, nullable=False, index=True)
    value = Column(Text, nullable=False)  # JSON-encoded stats
    # Set on insert and refreshed on every update (naive UTC timestamps).
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

View File

@@ -0,0 +1,24 @@
from sqlalchemy import Column, Integer, String, Float, DateTime, Text, func
from app.database import Base
class Export(Base):
    """One dataset export: its filter, progress status and resulting file."""

    __tablename__ = "exports"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    filter_criteria = Column(Text, nullable=True)  # JSON: min_images, licenses, min_quality, species_ids
    train_split = Column(Float, default=0.8)
    status = Column(String, default="pending")  # pending, generating, completed, failed
    # Output details; nullable because they are not set when the row is created.
    file_path = Column(String, nullable=True)
    file_size = Column(Integer, nullable=True)
    species_count = Column(Integer, nullable=True)
    image_count = Column(Integer, nullable=True)
    celery_task_id = Column(String, nullable=True)
    created_at = Column(DateTime, server_default=func.now())
    completed_at = Column(DateTime, nullable=True)
    error_message = Column(Text, nullable=True)

    def __repr__(self):
        return f"<Export(id={self.id}, name='{self.name}', status='{self.status}')>"

View File

@@ -0,0 +1,36 @@
from sqlalchemy import Column, Integer, String, Float, DateTime, ForeignKey, func, UniqueConstraint, Index
from sqlalchemy.orm import relationship
from app.database import Base
class Image(Base):
    """One image record for a species: where it came from and its state.

    A (source, source_id) pair is unique, so the same upstream item is
    recorded at most once per source.
    """

    __tablename__ = "images"

    id = Column(Integer, primary_key=True, index=True)
    species_id = Column(Integer, ForeignKey("species.id"), nullable=False, index=True)
    source = Column(String, nullable=False, index=True)
    source_id = Column(String, nullable=True)
    url = Column(String, nullable=False)
    local_path = Column(String, nullable=True)
    license = Column(String, nullable=False, index=True)
    attribution = Column(String, nullable=True)
    width = Column(Integer, nullable=True)
    height = Column(Integer, nullable=True)
    phash = Column(String, nullable=True, index=True)
    quality_score = Column(Float, nullable=True)
    status = Column(String, default="pending", index=True)  # pending, downloaded, rejected, deleted
    created_at = Column(DateTime, server_default=func.now())

    # Composite indexes for common query patterns
    __table_args__ = (
        UniqueConstraint("source", "source_id", name="uq_source_source_id"),
        Index("ix_images_species_status", "species_id", "status"),  # For counting images per species by status
        Index("ix_images_status_created", "status", "created_at"),  # For listing images by status
    )

    # Relationships
    species = relationship("Species", back_populates="images")

    def __repr__(self):
        return f"<Image(id={self.id}, source='{self.source}', status='{self.status}')>"

27
backend/app/models/job.py Normal file
View File

@@ -0,0 +1,27 @@
from sqlalchemy import Column, Integer, String, DateTime, Text, Boolean, func
from app.database import Base
class Job(Base):
    """A scraping job: its source, species selection, progress and outcome."""

    __tablename__ = "jobs"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    source = Column(String, nullable=False)
    species_filter = Column(Text, nullable=True)  # JSON array of species IDs or NULL for all
    only_without_images = Column(Boolean, default=False)  # If True, only scrape species with 0 images
    max_images = Column(Integer, nullable=True)  # If set, only scrape species with fewer than N images
    status = Column(String, default="pending", index=True)  # pending, running, paused, completed, failed
    # Progress counters, all starting at zero.
    progress_current = Column(Integer, default=0)
    progress_total = Column(Integer, default=0)
    images_downloaded = Column(Integer, default=0)
    images_rejected = Column(Integer, default=0)
    celery_task_id = Column(String, nullable=True)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    error_message = Column(Text, nullable=True)
    created_at = Column(DateTime, server_default=func.now())

    def __repr__(self):
        return f"<Job(id={self.id}, name='{self.name}', status='{self.status}')>"

View File

@@ -0,0 +1,21 @@
from sqlalchemy import Column, Integer, String, DateTime, func
from sqlalchemy.orm import relationship
from app.database import Base
class Species(Base):
    """A plant species, identified by its unique scientific name."""

    __tablename__ = "species"

    id = Column(Integer, primary_key=True, index=True)
    scientific_name = Column(String, unique=True, nullable=False, index=True)
    common_name = Column(String, nullable=True)
    genus = Column(String, nullable=True, index=True)
    family = Column(String, nullable=True)
    created_at = Column(DateTime, server_default=func.now())

    # Relationships
    # The cascade means deleting a species through the ORM also deletes its
    # images, and images removed from this collection are deleted as orphans.
    images = relationship("Image", back_populates="species", cascade="all, delete-orphan")

    def __repr__(self):
        return f"<Species(id={self.id}, scientific_name='{self.scientific_name}')>"

View File

@@ -0,0 +1,15 @@
from app.schemas.species import SpeciesCreate, SpeciesUpdate, SpeciesResponse, SpeciesListResponse
from app.schemas.image import ImageResponse, ImageListResponse, ImageFilter
from app.schemas.job import JobCreate, JobResponse, JobListResponse
from app.schemas.api_key import ApiKeyCreate, ApiKeyUpdate, ApiKeyResponse
from app.schemas.export import ExportCreate, ExportResponse, ExportListResponse
from app.schemas.stats import StatsResponse, SourceStats, SpeciesStats
# Public names of the schemas package, grouped one line per schema module.
__all__ = [
    "SpeciesCreate", "SpeciesUpdate", "SpeciesResponse", "SpeciesListResponse",
    "ImageResponse", "ImageListResponse", "ImageFilter",
    "JobCreate", "JobResponse", "JobListResponse",
    "ApiKeyCreate", "ApiKeyUpdate", "ApiKeyResponse",
    "ExportCreate", "ExportResponse", "ExportListResponse",
    "StatsResponse", "SourceStats", "SpeciesStats",
]

View File

@@ -0,0 +1,36 @@
from pydantic import BaseModel
from typing import Optional
class ApiKeyBase(BaseModel):
    """Fields shared by API-key payloads; only ``source`` is required."""

    source: str
    api_key: Optional[str] = None  # Optional for no-auth sources, used as Client ID for OAuth
    api_secret: Optional[str] = None  # Also used as Client Secret for OAuth sources
    access_token: Optional[str] = None  # For OAuth sources like Wikimedia
    rate_limit_per_sec: float = 1.0
    enabled: bool = True
class ApiKeyCreate(ApiKeyBase):
    """Creation payload; adds nothing to ``ApiKeyBase``."""

    pass
class ApiKeyUpdate(BaseModel):
    """Partial-update payload: every field is optional and defaults to None."""

    api_key: Optional[str] = None
    api_secret: Optional[str] = None
    access_token: Optional[str] = None
    rate_limit_per_sec: Optional[float] = None
    enabled: Optional[bool] = None
class ApiKeyResponse(BaseModel):
    """API-key representation for responses.

    Carries a masked key and presence flags instead of the raw key, secret
    and token fields.
    """

    id: int
    source: str
    api_key_masked: str  # Show only last 4 chars
    has_secret: bool
    has_access_token: bool
    rate_limit_per_sec: float
    enabled: bool

    class Config:
        # Allow population from attribute-bearing objects (e.g. ORM rows).
        from_attributes = True

View File

@@ -0,0 +1,45 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional, List
class ExportFilter(BaseModel):
    """Selection criteria for an export; unset optional fields mean "all"."""

    min_images_per_species: int = 100
    licenses: Optional[List[str]] = None  # None means all
    min_quality: Optional[float] = None
    species_ids: Optional[List[int]] = None  # None means all
class ExportCreate(BaseModel):
    """Request payload describing an export: name, filter and train split."""

    name: str
    filter_criteria: ExportFilter
    train_split: float = 0.8
class ExportResponse(BaseModel):
    """Export record as returned by the API.

    ``filter_criteria`` is a string here (not an ``ExportFilter``).
    """

    id: int
    name: str
    filter_criteria: Optional[str] = None
    train_split: float
    status: str
    file_path: Optional[str] = None
    file_size: Optional[int] = None
    species_count: Optional[int] = None
    image_count: Optional[int] = None
    created_at: datetime
    completed_at: Optional[datetime] = None
    error_message: Optional[str] = None

    class Config:
        # Allow population from attribute-bearing objects (e.g. ORM rows).
        from_attributes = True
class ExportListResponse(BaseModel):
    """A list of exports together with a total count."""

    items: List[ExportResponse]
    total: int
class ExportPreview(BaseModel):
    """Preview figures for an export: species, images and estimated size."""

    species_count: int
    image_count: int
    estimated_size_mb: float

View File

@@ -0,0 +1,47 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional, List
class ImageBase(BaseModel):
    """Minimal required image fields: owning species, source, URL, license."""

    species_id: int
    source: str
    url: str
    license: str
class ImageResponse(BaseModel):
    """Image record as returned by the API."""

    id: int
    species_id: int
    species_name: Optional[str] = None
    source: str
    source_id: Optional[str] = None
    url: str
    local_path: Optional[str] = None
    license: str
    attribution: Optional[str] = None
    width: Optional[int] = None
    height: Optional[int] = None
    quality_score: Optional[float] = None
    status: str
    created_at: datetime

    class Config:
        # Allow population from attribute-bearing objects (e.g. ORM rows).
        from_attributes = True
class ImageListResponse(BaseModel):
    """One page of images plus pagination metadata."""

    items: List[ImageResponse]
    total: int
    page: int
    page_size: int
    pages: int
class ImageFilter(BaseModel):
    """Optional image filter fields; each defaults to None (unset)."""

    species_id: Optional[int] = None
    source: Optional[str] = None
    license: Optional[str] = None
    status: Optional[str] = None
    min_quality: Optional[float] = None
    search: Optional[str] = None

View File

@@ -0,0 +1,35 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional, List
class JobCreate(BaseModel):
    """Request payload for a new scraping job."""

    name: str
    source: str
    species_ids: Optional[List[int]] = None  # None means all species
    only_without_images: bool = False  # If True, only scrape species with 0 images
    max_images: Optional[int] = None  # If set, only scrape species with fewer than N images
class JobResponse(BaseModel):
    """Job record as returned by the API, including progress counters."""

    id: int
    name: str
    source: str
    species_filter: Optional[str] = None
    status: str
    progress_current: int
    progress_total: int
    images_downloaded: int
    images_rejected: int
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    error_message: Optional[str] = None
    created_at: datetime

    class Config:
        # Allow population from attribute-bearing objects (e.g. ORM rows).
        from_attributes = True
class JobListResponse(BaseModel):
    """A list of jobs together with a total count."""

    items: List[JobResponse]
    total: int

View File

@@ -0,0 +1,44 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional, List
class SpeciesBase(BaseModel):
    """Fields shared by species payloads; only ``scientific_name`` is required."""

    scientific_name: str
    common_name: Optional[str] = None
    genus: Optional[str] = None
    family: Optional[str] = None
class SpeciesCreate(SpeciesBase):
    """Creation payload; adds nothing to ``SpeciesBase``."""

    pass
class SpeciesUpdate(BaseModel):
    """Partial-update payload: every field is optional and defaults to None."""

    scientific_name: Optional[str] = None
    common_name: Optional[str] = None
    genus: Optional[str] = None
    family: Optional[str] = None
class SpeciesResponse(SpeciesBase):
    """Species record as returned by the API, with an image count."""

    id: int
    created_at: datetime
    image_count: int = 0

    class Config:
        # Allow population from attribute-bearing objects (e.g. ORM rows).
        from_attributes = True
class SpeciesListResponse(BaseModel):
    """One page of species plus pagination metadata."""

    items: List[SpeciesResponse]
    total: int
    page: int
    page_size: int
    pages: int
class SpeciesImportResponse(BaseModel):
    """Outcome of a bulk import: counts plus per-entry error messages."""

    imported: int
    skipped: int
    errors: List[str]

View File

@@ -0,0 +1,43 @@
from pydantic import BaseModel
from typing import List, Dict
class SourceStats(BaseModel):
    """Image counts for one source, overall and by status."""

    source: str
    image_count: int
    downloaded: int
    pending: int
    rejected: int
class LicenseStats(BaseModel):
    """A count for one license value."""

    license: str
    count: int
class SpeciesStats(BaseModel):
    """Image count for one species."""

    id: int
    scientific_name: str
    common_name: str | None  # no default, so the key must be supplied (may be None)
    image_count: int
class JobStats(BaseModel):
    """Number of jobs in each of four states."""

    running: int
    pending: int
    completed: int
    failed: int
class StatsResponse(BaseModel):
    """Dashboard statistics: global totals plus per-category breakdowns."""

    total_species: int
    total_images: int
    images_downloaded: int
    images_pending: int
    images_rejected: int
    disk_usage_mb: float
    sources: List[SourceStats]
    licenses: List[LicenseStats]
    jobs: JobStats
    top_species: List[SpeciesStats]
    under_represented: List[SpeciesStats]  # Species with < 100 images

View File

@@ -0,0 +1,41 @@
from typing import Optional
from app.scrapers.base import BaseScraper
from app.scrapers.inaturalist import INaturalistScraper
from app.scrapers.flickr import FlickrScraper
from app.scrapers.wikimedia import WikimediaScraper
from app.scrapers.trefle import TrefleScraper
from app.scrapers.gbif import GBIFScraper
from app.scrapers.duckduckgo import DuckDuckGoScraper
from app.scrapers.bing import BingScraper
def get_scraper(source: str) -> Optional[BaseScraper]:
    """Build a scraper for ``source``.

    Returns a new instance of the scraper class registered under that source
    name, or None when the name is not registered.
    """
    registry = {
        "inaturalist": INaturalistScraper,
        "flickr": FlickrScraper,
        "wikimedia": WikimediaScraper,
        "trefle": TrefleScraper,
        "gbif": GBIFScraper,
        "duckduckgo": DuckDuckGoScraper,
        "bing": BingScraper,
    }

    factory = registry.get(source)
    return factory() if factory else None
# Public names of the scrapers package: the factory, the abstract base and
# the concrete scraper classes imported above.
__all__ = [
    "get_scraper",
    "BaseScraper",
    "INaturalistScraper",
    "FlickrScraper",
    "WikimediaScraper",
    "TrefleScraper",
    "GBIFScraper",
    "DuckDuckGoScraper",
    "BingScraper",
]

View File

@@ -0,0 +1,57 @@
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional
import logging
from sqlalchemy.orm import Session
from app.models import Species, ApiKey
class BaseScraper(ABC):
    """Base class for all image scrapers.

    Subclasses set ``name`` (the source identifier used to look up the
    scraper's ``ApiKey`` row) and implement ``scrape_species`` and
    ``test_connection``.
    """

    name: str = "base"
    requires_api_key: bool = True

    @abstractmethod
    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """
        Scrape images for a species.

        Args:
            species: The species to scrape images for
            db: Database session
            logger: Optional logger for debugging

        Returns:
            Dict with 'downloaded' and 'rejected' counts
        """
        pass

    @abstractmethod
    def test_connection(self, api_key: ApiKey) -> str:
        """
        Test API connection.

        Args:
            api_key: The API key configuration

        Returns:
            Success message

        Raises:
            Exception if connection fails
        """
        pass

    def get_api_key(self, db: Session) -> Optional[ApiKey]:
        """Get API key for this scraper.

        Returns the enabled ``ApiKey`` row whose source equals ``self.name``,
        or None when there is no such row.
        """
        # `== True` is intentional: it builds a SQL comparison, not a Python one.
        return db.query(ApiKey).filter(
            ApiKey.source == self.name,
            ApiKey.enabled == True
        ).first()

228
backend/app/scrapers/bhl.py Normal file
View File

@@ -0,0 +1,228 @@
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class BHLScraper(BaseScraper):
"""Scraper for Biodiversity Heritage Library (BHL) images.
BHL provides access to digitized biodiversity literature and illustrations.
Most content is public domain (pre-1927) or CC-licensed.
Note: BHL images are primarily historical botanical illustrations,
which may differ from photographs but are valuable for training.
"""
name = "bhl"
requires_api_key = True # BHL requires free API key
BASE_URL = "https://www.biodiversitylibrary.org/api3"
HEADERS = {
"User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
"Accept": "application/json",
}
# BHL content is mostly public domain
ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA", "PD"}
def scrape_species(
self,
species: Species,
db: Session,
logger: Optional[logging.Logger] = None
) -> Dict[str, int]:
"""Scrape images from BHL for a species."""
api_key = self.get_api_key(db)
if not api_key:
return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
rate_limit = api_key.rate_limit_per_sec if api_key else 0.5
downloaded = 0
rejected = 0
def log(level: str, msg: str):
if logger:
getattr(logger, level)(msg)
try:
# Disable SSL verification - some Docker environments lack proper CA certificates
with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
# Search for name in BHL
search_response = client.get(
f"{self.BASE_URL}",
params={
"op": "NameSearch",
"name": species.scientific_name,
"format": "json",
"apikey": api_key.api_key,
},
)
search_response.raise_for_status()
search_data = search_response.json()
results = search_data.get("Result", [])
if not results:
log("info", f" Species not found in BHL: {species.scientific_name}")
return {"downloaded": 0, "rejected": 0}
time.sleep(1.0 / rate_limit)
# Get pages with illustrations for each name result
for name_result in results[:5]: # Limit to top 5 matches
name_bank_id = name_result.get("NameBankID")
if not name_bank_id:
continue
# Get publications with this name
pub_response = client.get(
f"{self.BASE_URL}",
params={
"op": "NameGetDetail",
"namebankid": name_bank_id,
"format": "json",
"apikey": api_key.api_key,
},
)
pub_response.raise_for_status()
pub_data = pub_response.json()
time.sleep(1.0 / rate_limit)
# Extract titles and get page images
for title in pub_data.get("Result", []):
title_id = title.get("TitleID")
if not title_id:
continue
# Get pages for this title
pages_response = client.get(
f"{self.BASE_URL}",
params={
"op": "GetPageMetadata",
"titleid": title_id,
"format": "json",
"apikey": api_key.api_key,
"ocr": "false",
"names": "false",
},
)
if pages_response.status_code != 200:
continue
pages_data = pages_response.json()
pages = pages_data.get("Result", [])
time.sleep(1.0 / rate_limit)
# Look for pages that are likely illustrations
for page in pages[:100]: # Limit pages per title
page_types = page.get("PageTypes", [])
# Only get illustration/plate pages
is_illustration = any(
pt.get("PageTypeName", "").lower() in ["illustration", "plate", "figure", "map"]
for pt in page_types
) if page_types else False
if not is_illustration and page_types:
continue
page_id = page.get("PageID")
if not page_id:
continue
# Construct image URL
# BHL provides multiple image sizes
image_url = f"https://www.biodiversitylibrary.org/pageimage/{page_id}"
# Check if already exists
source_id = str(page_id)
existing = db.query(Image).filter(
Image.source == self.name,
Image.source_id == source_id,
).first()
if existing:
continue
# Determine license - BHL content is usually public domain
item_url = page.get("ItemUrl", "")
year = None
try:
# Try to extract year from ItemUrl or other fields
if "Year" in page:
year = int(page.get("Year", 0))
except (ValueError, TypeError):
pass
# Content before 1927 is public domain in US
if year and year < 1927:
license_code = "PD"
else:
license_code = "CC0" # BHL default for older works
# Build attribution
title_name = title.get("ShortTitle", title.get("FullTitle", "Unknown"))
attribution = f"From '{title_name}' via Biodiversity Heritage Library ({license_code})"
# Create image record
image = Image(
species_id=species.id,
source=self.name,
source_id=source_id,
url=image_url,
license=license_code,
attribution=attribution,
status="pending",
)
db.add(image)
db.commit()
# Queue for download
download_and_process_image.delay(image.id)
downloaded += 1
# Limit total per species
if downloaded >= 50:
break
if downloaded >= 50:
break
if downloaded >= 50:
break
except httpx.HTTPStatusError as e:
log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
except Exception as e:
log("error", f" Error scraping BHL for {species.scientific_name}: {e}")
return {"downloaded": downloaded, "rejected": rejected}
def test_connection(self, api_key: ApiKey) -> str:
    """Probe the BHL API with a fixed ``NameSearch`` for 'Rosa'.

    Args:
        api_key: Key record whose ``api_key`` value is sent as ``apikey``.

    Returns:
        A status message containing the number of entries in ``Result``.

    Raises:
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    query = {
        "op": "NameSearch",
        "name": "Rosa",
        "format": "json",
        "apikey": api_key.api_key,
    }
    # NOTE: certificate verification is switched off for this client.
    with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
        response = client.get(f"{self.BASE_URL}", params=query)
        response.raise_for_status()
        payload = response.json()
        hits = payload.get("Result", [])
        return f"BHL API connection successful ({len(hits)} results for 'Rosa')"

View File

@@ -0,0 +1,135 @@
import hashlib
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class BingScraper(BaseScraper):
    """Image scraper backed by the Bing Image Search v7 API (Azure Cognitive Services).

    One or two quoted queries are issued per species; every result not seen
    before is stored as a pending ``Image`` row and queued for download.
    """

    name = "bing"
    requires_api_key = True

    BASE_URL = "https://api.bing.microsoft.com/v7.0/images/search"

    # Exclusion terms appended to every query.
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    # Value of a result's "license" field -> licence code stored on the Image.
    LICENSE_MAP = {
        "Public": "CC0",
        "Share": "CC-BY-SA",
        "ShareCommercially": "CC-BY",
        "Modify": "CC-BY-SA",
        "ModifyCommercially": "CC-BY",
    }

    def _build_queries(self, species: Species) -> list[str]:
        """Return the search strings for *species*.

        The scientific name is always searched; a second query is added when
        the species has a common name.
        """
        search_terms = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
        common = species.common_name
        if common:
            search_terms.append(f'"{common}" houseplant photo {self.NEGATIVE_TERMS}')
        return search_terms

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search Bing for photos of *species* and queue every new hit.

        Args:
            species: Species whose names are used to build the queries.
            db: Session used for the duplicate check and for storing rows.
            logger: Receives the error message on failure; when omitted the
                message is printed instead.

        Returns:
            ``{"downloaded": <images queued>, "rejected": 0}``. Both counts
            are 0 when no API key is configured. On any exception the counts
            reached so far are returned.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0}

        rate_limit = api_key.rate_limit_per_sec or 3.0
        queued = 0
        rejected = 0
        visited = set()
        auth_headers = {"Ocp-Apim-Subscription-Key": api_key.api_key}

        try:
            search_terms = self._build_queries(species)
            with httpx.Client(timeout=30, headers=auth_headers) as client:
                for term in search_terms:
                    response = client.get(
                        self.BASE_URL,
                        params={
                            "q": term,
                            "imageType": "Photo",
                            "license": "ShareCommercially",
                            "count": 50,
                        },
                    )
                    response.raise_for_status()

                    for hit in response.json().get("value", []):
                        content_url = hit.get("contentUrl")
                        if not content_url or content_url in visited:
                            continue
                        visited.add(content_url)

                        # Prefer Bing's imageId; otherwise derive a short id
                        # from the URL itself.
                        source_id = hit.get("imageId")
                        if not source_id:
                            source_id = hashlib.md5(content_url.encode()).hexdigest()[:16]

                        already_stored = (
                            db.query(Image)
                            .filter(
                                Image.source == self.name,
                                Image.source_id == source_id,
                            )
                            .first()
                        )
                        if already_stored:
                            continue

                        host = hit.get("hostPageDisplayUrl", "")
                        if host:
                            attribution = f"via Bing ({host})"
                        else:
                            attribution = "via Bing Image Search"

                        image = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=source_id,
                            url=content_url,
                            width=hit.get("width"),
                            height=hit.get("height"),
                            license=self.LICENSE_MAP.get(hit.get("license", ""), "UNKNOWN"),
                            attribution=attribution,
                            status="pending",
                        )
                        db.add(image)
                        db.commit()

                        download_and_process_image.delay(image.id)
                        queued += 1

                    # Pause between search requests.
                    time.sleep(1.0 / rate_limit)
        except Exception as exc:
            message = f"Error scraping Bing for {species.scientific_name}: {exc}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": queued, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Run a one-result search with *api_key* and report the estimated matches.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        probe = {"q": "Monstera deliciosa plant", "count": 1}
        with httpx.Client(
            timeout=10,
            headers={"Ocp-Apim-Subscription-Key": api_key.api_key},
        ) as client:
            response = client.get(self.BASE_URL, params=probe)
            response.raise_for_status()
            estimated = response.json().get("totalEstimatedMatches", 0)
            return f"Bing Image Search working ({estimated:,} estimated matches)"

View File

@@ -0,0 +1,101 @@
import hashlib
import time
import logging
from typing import Dict, Optional
from duckduckgo_search import DDGS
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class DuckDuckGoScraper(BaseScraper):
    """Scraper for DuckDuckGo image search. No API key required.

    Results carry no licence information, so every stored image gets the
    licence code "UNKNOWN".
    """

    name = "duckduckgo"
    requires_api_key = False

    # Exclusion terms appended to every query.
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    def _build_queries(self, species: Species) -> list[str]:
        """Return the search strings for *species* (scientific name, then common name if set)."""
        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
        if species.common_name:
            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
        return queries

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search DuckDuckGo for photos of *species* and queue every new hit.

        Args:
            species: Species whose names are used to build the queries.
            db: Session used for the duplicate check and for storing rows.
            logger: Receives the error message on failure; when omitted the
                message is printed instead.

        Returns:
            ``{"downloaded": <images queued>, "rejected": 0}``. On any
            exception the counts reached so far are returned.
        """
        api_key = self.get_api_key(db)
        # A stored rate of None or 0 would make the division below raise and
        # abort the scrape, so treat it like a missing configuration.
        configured_rate = api_key.rate_limit_per_sec if api_key else None
        rate_limit = configured_rate or 0.5

        downloaded = 0
        rejected = 0
        seen_urls = set()

        try:
            queries = self._build_queries(species)

            with DDGS() as ddgs:
                for query in queries:
                    results = ddgs.images(
                        keywords=query,
                        type_image="photo",
                        max_results=50,
                    )

                    for result in results:
                        url = result.get("image")
                        if not url or url in seen_urls:
                            continue
                        seen_urls.add(url)

                        # No provider id is available; use a short digest of the URL.
                        source_id = hashlib.md5(url.encode()).hexdigest()[:16]

                        # Check if already exists
                        existing = db.query(Image).filter(
                            Image.source == self.name,
                            Image.source_id == source_id,
                        ).first()
                        if existing:
                            continue

                        title = result.get("title", "")
                        attribution = f"{title} via DuckDuckGo" if title else "via DuckDuckGo"

                        image = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=source_id,
                            url=url,
                            license="UNKNOWN",
                            attribution=attribution,
                            status="pending",
                        )
                        db.add(image)
                        db.commit()

                        download_and_process_image.delay(image.id)
                        downloaded += 1

                    # Throttle once per search request.
                    time.sleep(1.0 / rate_limit)

        except Exception as e:
            if logger:
                logger.error(f"Error scraping DuckDuckGo for {species.scientific_name}: {e}")
            else:
                print(f"Error scraping DuckDuckGo for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Run a one-result image search and report how many results came back.

        ``api_key`` is accepted for interface compatibility and is not used.
        """
        with DDGS() as ddgs:
            results = ddgs.images(keywords="Monstera deliciosa plant", max_results=1)
            count = len(list(results))
            return f"DuckDuckGo search working ({count} test result)"

226
backend/app/scrapers/eol.py Normal file
View File

@@ -0,0 +1,226 @@
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class EOLScraper(BaseScraper):
    """Scraper for Encyclopedia of Life (EOL) images.

    EOL aggregates biodiversity data from many sources and provides
    a free API with no authentication required.
    """

    name = "eol"
    requires_api_key = False

    BASE_URL = "https://eol.org/api"

    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # Map EOL license URLs to short codes. Keys are matched as substrings of
    # the lower-cased license value, in insertion order.
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "pd": "CC0",  # Public domain
        "public domain": "CC0",
    }

    # Commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA"}

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from EOL for a species.

        Looks the species up by exact scientific name, fetches the media list
        of the first matching page, and stores/queues every image whose
        license maps to one of ``ALLOWED_LICENSES``.

        Args:
            species: Species to look up by ``scientific_name``.
            db: Session used for the duplicate check and for storing rows.
            logger: Optional logger; without it nothing is logged.

        Returns:
            ``{"downloaded": <images queued>, "rejected": <images skipped for
            a missing URL or an unusable license>}``. Errors are logged and
            the counts reached so far are returned.
        """
        # Local import: used only for the stable source-id fallback below.
        import hashlib

        api_key = self.get_api_key(db)
        # A stored rate of None or 0 would make the sleeps below raise, so
        # treat it like a missing configuration.
        configured_rate = api_key.rate_limit_per_sec if api_key else None
        rate_limit = configured_rate or 0.5

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            if logger:
                getattr(logger, level)(msg)

        try:
            # Disable SSL verification - EOL is a trusted source and some Docker
            # environments lack proper CA certificates
            # NOTE(review): verify=False also removes protection against
            # tampered responses; prefer installing CA certificates.
            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
                # Step 1: Search for the species
                search_response = client.get(
                    f"{self.BASE_URL}/search/1.0.json",
                    params={
                        "q": species.scientific_name,
                        "page": 1,
                        "exact": "true",
                    },
                )
                search_response.raise_for_status()
                search_data = search_response.json()

                results = search_data.get("results", [])
                if not results:
                    log("info", f" Species not found in EOL: {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}

                # Get the EOL page ID
                eol_page_id = results[0].get("id")
                if not eol_page_id:
                    return {"downloaded": 0, "rejected": 0}

                time.sleep(1.0 / rate_limit)

                # Step 2: Get page details with images
                page_response = client.get(
                    f"{self.BASE_URL}/pages/1.0/{eol_page_id}.json",
                    params={
                        "images_per_page": 75,
                        "images_page": 1,
                        "videos_per_page": 0,
                        "sounds_per_page": 0,
                        "maps_per_page": 0,
                        "texts_per_page": 0,
                        "details": "true",
                        "licenses": "cc-by|cc-by-sa|pd|cc-by-nc",
                    },
                )
                page_response.raise_for_status()
                page_data = page_response.json()

                data_objects = page_data.get("dataObjects", [])
                log("debug", f" Found {len(data_objects)} media objects")

                for obj in data_objects:
                    # Only process images ("stillimage" also contains "image")
                    media_type = obj.get("dataType") or ""
                    if "image" not in media_type.lower():
                        continue

                    # Get image URL
                    image_url = obj.get("eolMediaURL") or obj.get("mediaURL")
                    if not image_url:
                        rejected += 1
                        continue

                    # Check license; a null value is treated like an empty one
                    license_url = (obj.get("license") or "").lower()
                    license_code = next(
                        (
                            code
                            for pattern, code in self.LICENSE_MAP.items()
                            if pattern in license_url
                        ),
                        None,
                    )

                    if license_code is None:
                        # NC licenses are rejected silently; anything else
                        # that is unmapped is logged as unknown.
                        if "-nc" not in license_url:
                            log("debug", f" Rejected: unknown license {license_url}")
                        rejected += 1
                        continue

                    if license_code not in self.ALLOWED_LICENSES:
                        rejected += 1
                        continue

                    # Create unique source ID. The fallback must be stable
                    # across processes for the duplicate check to work, so a
                    # digest is used instead of the salted built-in hash().
                    source_id = str(
                        obj.get("dataObjectVersionID")
                        or obj.get("identifier")
                        or hashlib.md5(image_url.encode()).hexdigest()[:16]
                    )

                    # Check if already exists
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Build attribution
                    photographer = None
                    rights_holder = None
                    for agent in obj.get("agents", []):
                        role = agent.get("role", "").lower()
                        agent_name = agent.get("full_name", "")
                        if role == "photographer":
                            photographer = agent_name
                        elif role in ("owner", "rights holder"):
                            rights_holder = agent_name

                    attribution_parts = []
                    if photographer:
                        attribution_parts.append(f"Photo by {photographer}")
                    if rights_holder and rights_holder != photographer:
                        attribution_parts.append(f"Rights: {rights_holder}")
                    attribution_parts.append(f"via EOL ({license_code})")
                    attribution = " | ".join(attribution_parts)

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=image_url,
                        license=license_code,
                        attribution=attribution,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                # Throttle once per API request.
                time.sleep(1.0 / rate_limit)

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            log("error", f" Error scraping EOL for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test EOL API connection.

        Searches for 'Rosa' and reports ``totalResults``. ``api_key`` is not
        used by this request.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
            response = client.get(
                f"{self.BASE_URL}/search/1.0.json",
                params={"q": "Rosa", "page": 1},
            )
            response.raise_for_status()
            data = response.json()
            total = data.get("totalResults", 0)
            return f"EOL API connection successful ({total} results for 'Rosa')"

View File

@@ -0,0 +1,146 @@
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class FlickrScraper(BaseScraper):
    """Scraper for Flickr images via their API."""

    name = "flickr"
    requires_api_key = True

    BASE_URL = "https://api.flickr.com/services/rest/"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Commercial-safe license IDs
    # 4 = CC BY 2.0, 7 = No known copyright, 8 = US Gov, 9 = CC0
    ALLOWED_LICENSES = "4,7,8,9"

    # Flickr license id -> licence code stored on the Image.
    LICENSE_MAP = {
        "4": "CC-BY",
        "7": "NO-KNOWN-COPYRIGHT",
        "8": "US-GOV",
        "9": "CC0",
    }

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Flickr for a species.

        Args:
            species: Species searched by ``scientific_name``.
            db: Session used for the duplicate check and for storing rows.
            logger: Receives the error message on failure; when omitted the
                message is printed instead.

        Returns:
            A dict with "downloaded" and "rejected" counts. When no API key
            is configured or Flickr reports a non-"ok" status, both counts
            are 0 and an "error" entry carries the reason.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        # A stored rate of None or 0 would make the division below raise.
        rate_limit = api_key.rate_limit_per_sec or 1.0

        downloaded = 0
        rejected = 0

        try:
            params = {
                "method": "flickr.photos.search",
                "api_key": api_key.api_key,
                "text": species.scientific_name,
                "license": self.ALLOWED_LICENSES,
                "content_type": 1,  # Photos only
                "media": "photos",
                "extras": "license,url_l,url_o,owner_name",
                "per_page": 100,
                "format": "json",
                "nojsoncallback": 1,
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(self.BASE_URL, params=params)
                response.raise_for_status()
                data = response.json()

                if data.get("stat") != "ok":
                    return {"downloaded": 0, "rejected": 0, "error": data.get("message")}

                photos = data.get("photos", {}).get("photo", [])

                for photo in photos:
                    # Get best URL (original or large)
                    url = photo.get("url_o") or photo.get("url_l")
                    if not url:
                        rejected += 1
                        continue

                    # Get license; ids outside LICENSE_MAP are rejected
                    license_id = str(photo.get("license", ""))
                    license_code = self.LICENSE_MAP.get(license_id, "UNKNOWN")
                    if license_code == "UNKNOWN":
                        rejected += 1
                        continue

                    # Check if already exists
                    source_id = str(photo.get("id"))
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Build attribution
                    owner = photo.get("ownername", "Unknown")
                    attribution = f"Photo by {owner} on Flickr ({license_code})"

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code,
                        attribution=attribution,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                # Rate limiting: throttle once per API request.
                time.sleep(1.0 / rate_limit)

        except Exception as e:
            message = f"Error scraping Flickr for {species.scientific_name}: {e}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Flickr API connection.

        Calls ``flickr.test.echo`` with *api_key*.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
            Exception: If the response body reports a non-"ok" status.
        """
        params = {
            "method": "flickr.test.echo",
            "api_key": api_key.api_key,
            "format": "json",
            "nojsoncallback": 1,
        }

        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(self.BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()

            if data.get("stat") != "ok":
                raise Exception(data.get("message", "API test failed"))

            return "Flickr API connection successful"

View File

@@ -0,0 +1,159 @@
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class GBIFScraper(BaseScraper):
    """Scraper for GBIF (Global Biodiversity Information Facility) images."""

    name = "gbif"
    requires_api_key = False  # GBIF is free to use

    BASE_URL = "https://api.gbif.org/v1"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Map GBIF license URLs to short codes (exact match on the media's
    # "license" value)
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "http://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "https://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
    }

    # Only allow commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY"}

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from GBIF for a species.

        Args:
            species: Species searched by ``scientific_name``.
            db: Session used for the duplicate check and for storing rows.
            logger: Receives the error message on failure; when omitted the
                message is printed instead.

        Returns:
            ``{"downloaded": <images queued>, "rejected": <media skipped for
            a missing URL or a license outside ALLOWED_LICENSES>}``. On any
            exception the counts reached so far are returned.
        """
        # Local import: used only for the stable per-URL digest below.
        import hashlib

        # GBIF doesn't require API key, but we still respect rate limits.
        # A stored rate of None or 0 would make the division below raise.
        api_key = self.get_api_key(db)
        configured_rate = api_key.rate_limit_per_sec if api_key else None
        rate_limit = configured_rate or 1.0

        downloaded = 0
        rejected = 0

        try:
            params = {
                "scientificName": species.scientific_name,
                "mediaType": "StillImage",
                "limit": 100,
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/occurrence/search",
                    params=params,
                )
                response.raise_for_status()
                data = response.json()

                results = data.get("results", [])

                for occurrence in results:
                    media_list = occurrence.get("media", [])

                    for media in media_list:
                        # Only process still images
                        if media.get("type") != "StillImage":
                            continue

                        url = media.get("identifier")
                        if not url:
                            rejected += 1
                            continue

                        # Check license
                        license_url = media.get("license", "")
                        license_code = self.LICENSE_MAP.get(license_url)

                        if not license_code or license_code not in self.ALLOWED_LICENSES:
                            rejected += 1
                            continue

                        # Create unique source ID from occurrence key and media URL.
                        # The URL part must be identical in every process for
                        # the duplicate check to work, so a digest is used
                        # instead of the salted built-in hash().
                        occurrence_key = occurrence.get("key", "")
                        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
                        source_id = f"{occurrence_key}_{url_hash}"

                        # Check if already exists
                        existing = db.query(Image).filter(
                            Image.source == self.name,
                            Image.source_id == source_id,
                        ).first()
                        if existing:
                            continue

                        # Build attribution
                        creator = media.get("creator", "")
                        rights_holder = media.get("rightsHolder", "")
                        attribution_parts = []
                        if creator:
                            attribution_parts.append(f"Photo by {creator}")
                        if rights_holder and rights_holder != creator:
                            attribution_parts.append(f"Rights: {rights_holder}")
                        attribution_parts.append(f"via GBIF ({license_code})")
                        attribution = " | ".join(attribution_parts)

                        # Create image record
                        image = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=source_id,
                            url=url,
                            license=license_code,
                            attribution=attribution,
                            status="pending",
                        )
                        db.add(image)
                        db.commit()

                        # Queue for download
                        download_and_process_image.delay(image.id)
                        downloaded += 1

                # Rate limiting: throttle once per API request.
                time.sleep(1.0 / rate_limit)

        except Exception as e:
            message = f"Error scraping GBIF for {species.scientific_name}: {e}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test GBIF API connection.

        Requests a single occurrence and reports the total ``count``.
        ``api_key`` is not used by this request.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        # GBIF doesn't require authentication, just test the endpoint
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/occurrence/search",
                params={"limit": 1},
            )
            response.raise_for_status()
            data = response.json()
            count = data.get("count", 0)
            return f"GBIF API connection successful ({count:,} total occurrences available)"

View File

@@ -0,0 +1,144 @@
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class INaturalistScraper(BaseScraper):
    """Scraper for iNaturalist observations via their API."""

    name = "inaturalist"
    requires_api_key = False  # Public API, but rate limited

    BASE_URL = "https://api.inaturalist.org/v1"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Commercial-safe licenses (CC0, CC-BY)
    ALLOWED_LICENSES = ["cc0", "cc-by"]

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from iNaturalist for a species.

        Args:
            species: Species searched by ``scientific_name``.
            db: Session used for the duplicate check and for storing rows.
            logger: Optional logger; without it nothing is logged.

        Returns:
            ``{"downloaded": <photos queued>, "rejected": <photos whose
            license is not in ALLOWED_LICENSES>}``. Errors are logged and the
            counts reached so far are returned.
        """
        api_key = self.get_api_key(db)
        # A stored rate of None or 0 would make the division below raise.
        configured_rate = api_key.rate_limit_per_sec if api_key else None
        rate_limit = configured_rate or 1.0

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            if logger:
                getattr(logger, level)(msg)

        try:
            # Search for observations of this species
            params = {
                "taxon_name": species.scientific_name,
                "quality_grade": "research",  # Only research-grade
                "photos": True,
                "per_page": 200,
                "order_by": "votes",
                "license": ",".join(self.ALLOWED_LICENSES),
            }

            log("debug", f" API request params: {params}")

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/observations",
                    params=params,
                )
                log("debug", f" API response status: {response.status_code}")
                response.raise_for_status()
                data = response.json()

                observations = data.get("results", [])
                total_results = data.get("total_results", 0)
                log("debug", f" Found {len(observations)} observations (total: {total_results})")

                if not observations:
                    log("info", f" No observations found for {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}

                for obs in observations:
                    photos = obs.get("photos", [])

                    for photo in photos:
                        # Check license (a missing or null code counts as "")
                        license_code = (photo.get("license_code") or "").lower()
                        if license_code not in self.ALLOWED_LICENSES:
                            log("debug", f" Rejected photo {photo.get('id')}: license={license_code}")
                            rejected += 1
                            continue

                        # Get image URL
                        url = photo.get("url", "")
                        if not url:
                            log("debug", f" Skipped photo {photo.get('id')}: no URL")
                            continue

                        # Convert to larger size
                        url = url.replace("square", "large")

                        # Check if already exists
                        source_id = str(photo.get("id"))
                        existing = db.query(Image).filter(
                            Image.source == self.name,
                            Image.source_id == source_id,
                        ).first()
                        if existing:
                            log("debug", f" Skipped photo {source_id}: already exists")
                            continue

                        # Create image record
                        image = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=source_id,
                            url=url,
                            license=license_code.upper(),
                            attribution=photo.get("attribution", ""),
                            status="pending",
                        )
                        db.add(image)
                        db.commit()

                        # Queue for download
                        download_and_process_image.delay(image.id)
                        downloaded += 1
                        log("debug", f" Queued photo {source_id} for download")

                # Rate limiting: throttle once per API request.
                time.sleep(1.0 / rate_limit)

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code} - {e.response.text}")
        except httpx.RequestError as e:
            log("error", f" Request error for {species.scientific_name}: {e}")
        except Exception as e:
            log("error", f" Error scraping iNaturalist for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test iNaturalist API connection.

        Requests a single observation. ``api_key`` is not used.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/observations",
                params={"per_page": 1},
            )
            response.raise_for_status()
            return "iNaturalist API connection successful"

View File

@@ -0,0 +1,154 @@
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class TrefleScraper(BaseScraper):
    """Scraper for Trefle.io plant database."""

    name = "trefle"
    requires_api_key = True

    BASE_URL = "https://trefle.io/api/v1"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    def _queue_if_new(
        self,
        db: Session,
        species: Species,
        source_id: str,
        url: str,
        attribution: str,
    ) -> bool:
        """Store and queue one image unless this (source, source_id) already exists.

        Returns True when a new pending Image row was created and queued.
        """
        existing = db.query(Image).filter(
            Image.source == self.name,
            Image.source_id == source_id,
        ).first()
        if existing:
            return False

        image = Image(
            species_id=species.id,
            source=self.name,
            source_id=source_id,
            url=url,
            license="TREFLE",  # Trefle's own license
            attribution=attribution,
            status="pending",
        )
        db.add(image)
        db.commit()
        download_and_process_image.delay(image.id)
        return True

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Trefle for a species.

        Searches plants by scientific name, fetches each plant's detail
        record, and queues its main image plus every image listed under
        ``images``.

        Args:
            species: Species searched by ``scientific_name``.
            db: Session used for the duplicate check and for storing rows.
            logger: Receives the error message on failure; when omitted the
                message is printed instead.

        Returns:
            A dict with "downloaded" and "rejected" counts ("rejected" is
            always 0 here). Without a configured API key both are 0 and an
            "error" entry carries the reason.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        # A stored rate of None or 0 would make the division below raise.
        rate_limit = api_key.rate_limit_per_sec or 1.0

        downloaded = 0
        rejected = 0

        try:
            # Search for the species
            params = {
                "token": api_key.api_key,
                "q": species.scientific_name,
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/plants/search",
                    params=params,
                )
                response.raise_for_status()
                data = response.json()

                plants = data.get("data", [])

                for plant in plants:
                    # Get plant details for more images
                    plant_id = plant.get("id")
                    if not plant_id:
                        continue

                    detail_response = client.get(
                        f"{self.BASE_URL}/plants/{plant_id}",
                        params={"token": api_key.api_key},
                    )

                    # Rate limiting: pause after every detail request, also
                    # when it failed, so error responses are not followed by
                    # an immediate burst of further requests.
                    time.sleep(1.0 / rate_limit)

                    if detail_response.status_code != 200:
                        continue

                    plant_detail = detail_response.json().get("data", {})

                    # Get main image
                    main_image = plant_detail.get("image_url")
                    if main_image and self._queue_if_new(
                        db,
                        species,
                        f"main_{plant_id}",
                        main_image,
                        "Trefle.io Plant Database",
                    ):
                        downloaded += 1

                    # Get additional images from species detail; a null
                    # "images" value is treated like an empty mapping.
                    images = plant_detail.get("images") or {}
                    for image_type, image_list in images.items():
                        if not isinstance(image_list, list):
                            continue

                        for img in image_list:
                            url = img.get("image_url")
                            if not url:
                                continue

                            # Fall back to the last URL path segment when the
                            # entry carries no id.
                            img_id = img.get("id", url.split("/")[-1])
                            copyright_info = img.get("copyright", "")

                            if self._queue_if_new(
                                db,
                                species,
                                f"{image_type}_{img_id}",
                                url,
                                copyright_info or "Trefle.io",
                            ):
                                downloaded += 1

        except Exception as e:
            message = f"Error scraping Trefle for {species.scientific_name}: {e}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Trefle API connection.

        Requests the plant list with *api_key* as the token.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        params = {"token": api_key.api_key}

        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/plants",
                params=params,
            )
            response.raise_for_status()
            return "Trefle API connection successful"

View File

@@ -0,0 +1,146 @@
import time
import logging
from typing import Dict, Optional
import httpx
from sqlalchemy.orm import Session
from app.scrapers.base import BaseScraper
from app.models import Species, Image, ApiKey
from app.workers.quality_tasks import download_and_process_image
class WikimediaScraper(BaseScraper):
    """Scraper for Wikimedia Commons images."""

    name = "wikimedia"
    requires_api_key = False

    BASE_URL = "https://commons.wikimedia.org/w/api.php"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Wikimedia Commons for a species.

        Args:
            species: Species searched by ``scientific_name``.
            db: Session used for the duplicate check and for storing rows.
            logger: Receives the error message on failure; when omitted the
                message is printed instead.

        Returns:
            ``{"downloaded": <images queued>, "rejected": <files skipped for
            being under 256px on a side or for their license>}``. On any
            exception the counts reached so far are returned.
        """
        # Local import: only needed to strip markup from the artist field.
        import re

        api_key = self.get_api_key(db)
        # A stored rate of None or 0 would make the division below raise.
        configured_rate = api_key.rate_limit_per_sec if api_key else None
        rate_limit = configured_rate or 1.0

        downloaded = 0
        rejected = 0

        try:
            # Search for images in the species category
            search_term = species.scientific_name

            params = {
                "action": "query",
                "format": "json",
                "generator": "search",
                "gsrsearch": f"filetype:bitmap {search_term}",
                "gsrnamespace": 6,  # File namespace
                "gsrlimit": 50,
                "prop": "imageinfo",
                "iiprop": "url|extmetadata|size",
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(self.BASE_URL, params=params)
                response.raise_for_status()
                data = response.json()

                pages = data.get("query", {}).get("pages", {})

                for page_id, page in pages.items():
                    if int(page_id) < 0:
                        continue

                    # A missing or empty "imageinfo" list is treated as a
                    # page without a usable URL rather than raising and
                    # aborting the remaining pages.
                    imageinfo = (page.get("imageinfo") or [{}])[0]
                    url = imageinfo.get("url", "")
                    if not url:
                        continue

                    # Check size
                    width = imageinfo.get("width", 0)
                    height = imageinfo.get("height", 0)
                    if width < 256 or height < 256:
                        rejected += 1
                        continue

                    # Get license from metadata
                    metadata = imageinfo.get("extmetadata", {})
                    license_info = metadata.get("LicenseShortName", {}).get("value", "")

                    # Filter for commercial-safe licenses. "CC BY" is also a
                    # prefix of the non-commercial variants ("CC BY-NC..."),
                    # so those are excluded explicitly.
                    license_upper = license_info.upper()
                    recognised = (
                        "CC BY" in license_upper
                        or "CC0" in license_upper
                        or "PUBLIC DOMAIN" in license_upper
                    )
                    if not recognised or "-NC" in license_upper:
                        rejected += 1
                        continue
                    license_code = license_info

                    # Check if already exists
                    source_id = str(page_id)
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Get attribution
                    artist = metadata.get("Artist", {}).get("value", "Unknown")
                    # Clean HTML from artist
                    if "<" in artist:
                        artist = re.sub(r"<[^>]+>", "", artist).strip()

                    attribution = f"{artist} via Wikimedia Commons ({license_code})"

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code,
                        attribution=attribution,
                        width=width,
                        height=height,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                # Rate limiting: throttle once per API request.
                time.sleep(1.0 / rate_limit)

        except Exception as e:
            message = f"Error scraping Wikimedia for {species.scientific_name}: {e}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Wikimedia API connection.

        Requests ``meta=siteinfo``. ``api_key`` is not used.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        params = {
            "action": "query",
            "format": "json",
            "meta": "siteinfo",
        }

        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(self.BASE_URL, params=params)
            response.raise_for_status()
            return "Wikimedia Commons API connection successful"

View File

@@ -0,0 +1 @@
# Utility functions

View File

@@ -0,0 +1,80 @@
"""Image deduplication utilities using perceptual hashing."""
from typing import Optional
import imagehash
from PIL import Image as PILImage
def calculate_phash(image_path: str) -> Optional[str]:
    """
    Compute the perceptual hash of the image stored at a path.

    Args:
        image_path: Path to image file

    Returns:
        The hash rendered as a string, or None when anything goes wrong
        while opening or hashing the file
    """
    try:
        with PILImage.open(image_path) as picture:
            digest = str(imagehash.phash(picture))
        return digest
    except Exception:
        return None
def calculate_dhash(image_path: str) -> Optional[str]:
    """
    Compute the difference hash of the image stored at a path.

    Args:
        image_path: Path to image file

    Returns:
        The hash rendered as a string, or None when anything goes wrong
        while opening or hashing the file
    """
    try:
        with PILImage.open(image_path) as picture:
            digest = str(imagehash.dhash(picture))
        return digest
    except Exception:
        return None
def hashes_are_similar(hash1: str, hash2: str, threshold: int = 10) -> bool:
    """
    Decide whether two hex-encoded hashes are close enough to be duplicates.

    Args:
        hash1: First hash string
        hash2: Second hash string
        threshold: Largest difference still counted as similar (default 10)

    Returns:
        True when the difference between the decoded hashes is at most
        ``threshold``; False otherwise, including when either string
        cannot be decoded
    """
    try:
        distance = imagehash.hex_to_hash(hash1) - imagehash.hex_to_hash(hash2)
        return distance <= threshold
    except Exception:
        return False
def hamming_distance(hash1: str, hash2: str) -> int:
    """
    Compute the Hamming distance between two hex-encoded image hashes.

    Args:
        hash1: First hash string
        hash2: Second hash string

    Returns:
        Hamming distance (0 = identical, higher = more different); 64 is
        returned as the fallback when the hashes cannot be decoded or
        compared
    """
    fallback_distance = 64  # Maximum distance
    try:
        decoded = [imagehash.hex_to_hash(value) for value in (hash1, hash2)]
        return int(decoded[0] - decoded[1])
    except Exception:
        return fallback_distance

View File

@@ -0,0 +1,109 @@
"""Image quality assessment utilities."""
import numpy as np
from PIL import Image as PILImage
from scipy import ndimage
def calculate_blur_score(image_path: str) -> float:
    """
    Calculate blur score using Laplacian variance.

    Higher score = sharper image.

    Args:
        image_path: Path to image file

    Returns:
        Variance of Laplacian (higher = sharper), or 0.0 if the image
        could not be read or processed
    """
    try:
        # The context manager releases the underlying file handle promptly
        # instead of waiting for garbage collection.
        with PILImage.open(image_path) as img:
            # Work in float64: ndimage.laplace produces an array of the
            # input dtype, so 8-bit pixels would wrap around on negative
            # filter responses and distort the variance.
            # NOTE(review): thresholds tuned against scores computed on
            # uint8 data may need re-checking.
            pixels = np.asarray(img.convert("L"), dtype=np.float64)
        return float(np.var(ndimage.laplace(pixels)))
    except Exception:
        # Best-effort metric: unreadable images score as maximally blurry.
        return 0.0
def is_too_blurry(image_path: str, threshold: float = 100.0) -> bool:
    """
    Tell whether an image's blur score falls below the acceptable minimum.

    Args:
        image_path: Path to image file
        threshold: Minimum acceptable blur score (default 100)

    Returns:
        True if the score from ``calculate_blur_score`` is strictly less
        than ``threshold``
    """
    return calculate_blur_score(image_path) < threshold
def get_image_dimensions(image_path: str) -> tuple[int, int]:
    """
    Read the pixel dimensions of an image file.

    Args:
        image_path: Path to image file

    Returns:
        Tuple of (width, height), or (0, 0) if the image cannot be opened
    """
    unknown = (0, 0)
    dimensions = unknown
    try:
        with PILImage.open(image_path) as picture:
            dimensions = picture.size
    except Exception:
        dimensions = unknown
    return dimensions
def is_too_small(image_path: str, min_size: int = 256) -> bool:
    """
    Tell whether either side of an image is shorter than ``min_size``.

    Args:
        image_path: Path to image file
        min_size: Minimum dimension size (default 256)

    Returns:
        True if the width or the height is below ``min_size`` (an
        unreadable image reports (0, 0) and therefore counts as too small
        for any positive ``min_size``)
    """
    width, height = get_image_dimensions(image_path)
    both_sides_ok = width >= min_size and height >= min_size
    return not both_sides_ok
def resize_image(
    image_path: str,
    output_path: str = None,
    max_size: int = 512,
    quality: int = 95,
) -> bool:
    """
    Resize image to max dimension while preserving aspect ratio.

    The result is always written as JPEG. Images are only ever shrunk;
    an image that already fits within ``max_size`` is re-encoded as is.

    Args:
        image_path: Path to input image
        output_path: Path for output (defaults to overwriting input)
        max_size: Maximum dimension size (default 512)
        quality: JPEG quality (default 95)

    Returns:
        True if successful, False if the image could not be read,
        converted or written
    """
    # Modes the Pillow JPEG writer accepts as-is; every other mode
    # (RGBA, P, LA, PA, I, F, ...) has to be converted before saving,
    # otherwise the save raises and the function reports failure.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    try:
        output_path = output_path or image_path
        with PILImage.open(image_path) as img:
            # Only resize if larger than max_size
            if max(img.size) > max_size:
                img.thumbnail((max_size, max_size), PILImage.Resampling.LANCZOS)
            # Convert to RGB if necessary (for JPEG)
            if img.mode not in jpeg_writable_modes:
                img = img.convert("RGB")
            img.save(output_path, "JPEG", quality=quality)
        return True
    except Exception:
        # Best-effort helper: callers only get a success flag.
        return False

View File

@@ -0,0 +1,92 @@
import logging
import os
from datetime import datetime
from pathlib import Path
from app.config import get_settings
settings = get_settings()
def setup_logging():
    """Configure root logging to a dated file plus the console.

    Creates the configured logs directory if needed, installs a file
    handler and a stream handler on the root logger via ``basicConfig``,
    and returns the ``plant_scraper`` logger.
    """
    log_dir = Path(settings.logs_path)
    log_dir.mkdir(parents=True, exist_ok=True)

    # One file per calendar day, named after the day this function runs.
    today = datetime.now().strftime('%Y-%m-%d')
    dated_file = log_dir / f"scraper_{today}.log"

    root_handlers = [logging.FileHandler(dated_file), logging.StreamHandler()]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=root_handlers,
    )

    return logging.getLogger("plant_scraper")
def get_logger(name: str = "plant_scraper"):
    """Return the named logger, attaching handlers the first time it is used.

    On first use the logger gets an INFO-level file handler (dated file in
    the configured logs directory) and an INFO-level console handler.
    Later calls for the same name return the logger unchanged.
    """
    log_dir = Path(settings.logs_path)
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured; adding handlers again would duplicate output.
        return logger

    logger.setLevel(logging.INFO)

    # Dated log file. The name is fixed when the handler is created, so a
    # long-lived process keeps writing to the file of the day it started.
    dated_name = f"scraper_{datetime.now().strftime('%Y-%m-%d')}.log"
    to_file = logging.FileHandler(log_dir / dated_name)
    to_file.setLevel(logging.INFO)
    to_file.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )

    # Console output omits the logger name.
    to_console = logging.StreamHandler()
    to_console.setLevel(logging.INFO)
    to_console.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )

    for handler in (to_file, to_console):
        logger.addHandler(handler)

    return logger
def get_job_logger(job_id: int):
    """Get a logger specific to a job, writing to a job-specific file.

    On first use for a given ``job_id`` the logger ``job_<job_id>`` gets two
    file handlers: a DEBUG-level handler on ``job_<job_id>.log`` and an
    INFO-level handler on the dated ``scraper_<YYYY-MM-DD>.log`` file.
    Later calls for the same job return the logger unchanged.

    NOTE(review): the handlers are never closed here, so a long-lived
    process that handles many jobs keeps two open files per job.
    """
    logs_path = Path(settings.logs_path)
    logs_path.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(f"job_{job_id}")

    if not logger.handlers:
        logger.setLevel(logging.DEBUG)

        # Job-specific log file
        job_log_file = logs_path / f"job_{job_id}.log"
        file_handler = logging.FileHandler(job_log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s'
        ))

        # Also log to daily file
        daily_log_file = logs_path / f"scraper_{datetime.now().strftime('%Y-%m-%d')}.log"
        daily_handler = logging.FileHandler(daily_log_file)
        daily_handler.setLevel(logging.INFO)
        # The logger name is already "job_<id>"; prefixing it with another
        # "job_" would print "job_job_<id>" in the daily file.
        daily_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        ))

        logger.addHandler(file_handler)
        logger.addHandler(daily_handler)

    return logger

View File

@@ -0,0 +1 @@
# Celery workers

View File

@@ -0,0 +1,36 @@
from celery import Celery
from app.config import get_settings
settings = get_settings()

# Celery application: Redis serves as both broker and result backend, and the
# listed task modules are imported when a worker starts.
celery_app = Celery(
    "plant_scraper",
    broker=settings.redis_url,
    backend=settings.redis_url,
    include=[
        "app.workers.scrape_tasks",
        "app.workers.quality_tasks",
        "app.workers.export_tasks",
        "app.workers.stats_tasks",
    ],
)

celery_app.conf.update(
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="UTC",
    enable_utc=True,
    task_track_started=True,
    task_time_limit=3600 * 24,  # 24 hour max per task
    worker_prefetch_multiplier=1,
    task_acks_late=True,
    # With the Redis transport, a message that stays unacknowledged longer
    # than the visibility timeout (one hour by default) is redelivered to
    # another worker. Tasks here are acknowledged late and may run for up to
    # 24 hours, so the timeout has to exceed the task time limit; otherwise
    # long-running jobs are executed a second time. The trade-off is that a
    # task lost by a crashed worker is redelivered only after this delay.
    broker_transport_options={"visibility_timeout": 3600 * 25},
    beat_schedule={
        "refresh-stats-every-5min": {
            "task": "app.workers.stats_tasks.refresh_stats",
            "schedule": 300.0,  # Every 5 minutes
        },
    },
    beat_schedule_filename="/tmp/celerybeat-schedule",
)

View File

@@ -0,0 +1,170 @@
import json
import os
import random
import shutil
import zipfile
from datetime import datetime
from pathlib import Path
from app.workers.celery_app import celery_app
from app.database import SessionLocal
from app.models import Export, Image, Species
from app.config import get_settings
settings = get_settings()
def _copy_export_images(images, dest_dir: Path) -> int:
    """Copy every image whose file still exists into ``dest_dir``.

    Files are named ``img_<position><ext>`` after the image's position in
    ``images``. Returns the number of files actually copied.
    """
    dest_dir.mkdir(exist_ok=True)
    copied = 0
    for position, img in enumerate(images):
        if img.local_path and os.path.exists(img.local_path):
            ext = Path(img.local_path).suffix or ".jpg"
            shutil.copy2(img.local_path, dest_dir / f"img_{position:05d}{ext}")
            copied += 1
    return copied


@celery_app.task(bind=True)
def generate_export(self, export_id: int):
    """Generate a zip export for CoreML training.

    Loads the ``Export`` row, selects downloaded images matching its stored
    filter criteria, keeps only species with at least
    ``min_images_per_species`` matching images, splits each species' images
    randomly into ``Training``/``Testing`` folders according to
    ``export.train_split``, zips the result and records the outcome on the
    ``Export`` row.

    Args:
        export_id: Primary key of the ``Export`` row to build.

    Returns:
        A dict with ``status``/``species_count``/``image_count``/``file_size``
        on success, or a dict with an ``error`` key when the export row is
        missing or no species qualifies.

    Raises:
        Any unexpected exception is re-raised after the export row has been
        marked as failed.
    """
    from sqlalchemy import func

    db = SessionLocal()
    # Bound before the try block so the error handler can test them safely
    # even when the very first query fails.
    export = None
    export_dir = None
    try:
        export = db.query(Export).filter(Export.id == export_id).first()
        if not export:
            return {"error": "Export not found"}

        # Update status
        export.status = "generating"
        export.celery_task_id = self.request.id
        db.commit()

        # Parse filter criteria
        criteria = json.loads(export.filter_criteria) if export.filter_criteria else {}
        min_images = criteria.get("min_images_per_species", 100)
        licenses = criteria.get("licenses")
        min_quality = criteria.get("min_quality")
        species_ids = criteria.get("species_ids")

        # Build query for images
        query = db.query(Image).filter(Image.status == "downloaded")
        if licenses:
            query = query.filter(Image.license.in_(licenses))
        if min_quality:
            query = query.filter(Image.quality_score >= min_quality)
        if species_ids:
            query = query.filter(Image.species_id.in_(species_ids))

        # Count matching images per species using the SAME filters as the
        # export itself; counting all downloaded images would let an export
        # whose filters exclude everything finish "successfully" with an
        # empty archive.
        species_counts = (
            query.with_entities(Image.species_id, func.count(Image.id))
            .group_by(Image.species_id)
            .all()
        )
        # Rows are unpacked positionally so no attribute lookup can clash
        # with tuple-style methods on the result row.
        valid_species_ids = [
            species_id
            for species_id, image_count in species_counts
            if image_count >= min_images
        ]

        if not valid_species_ids:
            export.status = "failed"
            export.error_message = "No species meet the criteria"
            export.completed_at = datetime.utcnow()
            db.commit()
            return {"error": "No species meet the criteria"}

        # Create export directory
        export_dir = Path(settings.exports_path) / f"export_{export_id}"
        train_dir = export_dir / "Training"
        test_dir = export_dir / "Testing"
        train_dir.mkdir(parents=True, exist_ok=True)
        test_dir.mkdir(parents=True, exist_ok=True)

        total_images = 0
        species_count = 0

        # Process each valid species
        for i, species_id in enumerate(valid_species_ids):
            species = db.query(Species).filter(Species.id == species_id).first()
            if not species:
                continue

            # `query` already carries the license/quality/species filters.
            images = query.filter(Image.species_id == species_id).all()

            # Re-check: rows may have changed since the counts were taken.
            if len(images) < min_images:
                continue

            species_count += 1

            # One folder per species in each split
            species_name = species.scientific_name.replace(" ", "_")

            # Shuffle and split
            random.shuffle(images)
            split_idx = int(len(images) * export.train_split)

            # Copy images
            total_images += _copy_export_images(images[:split_idx], train_dir / species_name)
            total_images += _copy_export_images(images[split_idx:], test_dir / species_name)

            # Update progress
            self.update_state(
                state="PROGRESS",
                meta={
                    "current": i + 1,
                    "total": len(valid_species_ids),
                    "species": species.scientific_name,
                }
            )

        # Create zip file
        zip_path = Path(settings.exports_path) / f"export_{export_id}.zip"
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(export_dir):
                for file in files:
                    file_path = Path(root) / file
                    arcname = file_path.relative_to(export_dir)
                    zipf.write(file_path, arcname)

        # Clean up directory
        shutil.rmtree(export_dir)

        # Update export record
        export.status = "completed"
        export.file_path = str(zip_path)
        export.file_size = zip_path.stat().st_size
        export.species_count = species_count
        export.image_count = total_images
        export.completed_at = datetime.utcnow()
        db.commit()

        return {
            "status": "completed",
            "species_count": species_count,
            "image_count": total_images,
            "file_size": export.file_size,
        }

    except Exception as e:
        # A failed flush/commit leaves the session unusable until it is
        # rolled back, so do that before recording the failure.
        db.rollback()
        # Drop the staging directory so a later retry starts from scratch.
        if export_dir is not None:
            shutil.rmtree(export_dir, ignore_errors=True)
        if export is not None:
            export.status = "failed"
            export.error_message = str(e)
            export.completed_at = datetime.utcnow()
            db.commit()
        raise
    finally:
        db.close()

View File

@@ -0,0 +1,224 @@
import os
from pathlib import Path
import httpx
from PIL import Image as PILImage
import imagehash
import numpy as np
from scipy import ndimage
from app.workers.celery_app import celery_app
from app.database import SessionLocal
from app.models import Image
from app.config import get_settings
settings = get_settings()
def calculate_blur_score(image_path: str) -> float:
    """Calculate blur score using Laplacian variance. Higher = sharper.

    Returns 0.0 when the image cannot be read or processed.
    """
    try:
        # The context manager releases the file handle promptly.
        with PILImage.open(image_path) as img:
            # Work in float64: ndimage.laplace produces an array of the
            # input dtype, so 8-bit pixels would wrap around on negative
            # filter responses and distort the variance.
            # NOTE(review): thresholds tuned against scores computed on
            # uint8 data may need re-checking.
            pixels = np.asarray(img.convert("L"), dtype=np.float64)
        return float(np.var(ndimage.laplace(pixels)))
    except Exception:
        # Best-effort metric: unreadable images score as maximally blurry.
        return 0.0
def calculate_phash(image_path: str) -> str:
    """Calculate perceptual hash for deduplication.

    Returns the hash as a hex string, or an empty string when the image
    cannot be opened or hashed.
    """
    try:
        # Context manager so the file handle is closed as soon as the hash
        # has been computed.
        with PILImage.open(image_path) as img:
            return str(imagehash.phash(img))
    except Exception:
        return ""
def check_color_distribution(image_path: str) -> tuple[bool, str]:
    """Check if image has healthy color distribution for a plant photo.

    Returns (passed, reason) tuple; ``reason`` is empty when the check passes.

    Rejects:
    - Low color variance (mean channel std < 25): herbarium specimens (brown on white)
    - No green + low variance (green ratio < 5% AND mean std < 40): monochrome illustrations

    Any error while reading or analysing the image counts as a pass.
    """
    try:
        # Context manager so the file handle is closed once the pixel data
        # has been copied into the array.
        with PILImage.open(image_path) as img:
            arr = np.array(img.convert("RGB"), dtype=np.float64)

        # Per-channel standard deviation
        channel_stds = arr.std(axis=(0, 1))  # [R_std, G_std, B_std]
        mean_std = float(channel_stds.mean())

        if mean_std < 25:
            return False, f"Low color variance ({mean_std:.1f})"

        # Share of the green channel in the summed channel means
        channel_means = arr.mean(axis=(0, 1))
        total = channel_means.sum()
        green_ratio = channel_means[1] / total if total > 0 else 0

        if green_ratio < 0.05 and mean_std < 40:
            return False, f"No green ({green_ratio:.2%}) + low variance ({mean_std:.1f})"

        return True, ""
    except Exception:
        return True, ""  # Don't reject on error
def resize_image(image_path: str, target_size: int = 512) -> bool:
    """Resize image in place to fit ``target_size`` while keeping aspect ratio.

    The file is overwritten; the output format follows the file extension.
    When the target is a JPEG file, images in a mode the JPEG writer cannot
    store (RGBA, P, LA, ...) are converted to RGB first instead of failing.

    Returns True on success, False if the image could not be read,
    converted or written.
    """
    jpeg_suffixes = (".jpg", ".jpeg", ".jpe", ".jfif")
    # Modes the Pillow JPEG writer accepts as-is.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    try:
        # Context manager so the source file handle is always released.
        with PILImage.open(image_path) as img:
            img.thumbnail((target_size, target_size), PILImage.Resampling.LANCZOS)
            writes_jpeg = Path(image_path).suffix.lower() in jpeg_suffixes
            if writes_jpeg and img.mode not in jpeg_writable_modes:
                img = img.convert("RGB")
            img.save(image_path, quality=95)
        return True
    except Exception:
        # Best-effort helper: callers only get a success flag.
        return False
def _reject_image(db, image, reason: str, local_path=None) -> dict:
    """Remove any downloaded file, mark ``image`` rejected, return the error payload."""
    if local_path is not None:
        try:
            os.remove(local_path)
        except OSError:
            # Nothing to clean up (never written or already gone); the
            # rejection reason below is what matters to the caller.
            pass
    image.status = "rejected"
    db.commit()
    return {"error": reason}


@celery_app.task
def download_and_process_image(image_id: int):
    """Download image, check quality, dedupe, and resize.

    Steps, in order: download the file into the species directory, reject
    images smaller than 256px on either side, reject exact perceptual-hash
    duplicates of already downloaded images, reject images whose blur score
    is below 100, reject images failing the color-distribution check, then
    resize in place and mark the image as downloaded.

    Args:
        image_id: Primary key of the ``Image`` row to process.

    Returns:
        ``{"status": "success", "path": ..., "quality_score": ...}`` on
        success, otherwise ``{"error": <reason>}``. Rejected images have
        their status set to ``"rejected"`` and their file removed.
    """
    db = SessionLocal()
    # Bound before the try block so the error handler can use them safely.
    image = None
    local_path = None
    try:
        image = db.query(Image).filter(Image.id == image_id).first()
        if not image:
            return {"error": "Image not found"}

        # Create directory for species
        species = image.species
        species_dir = Path(settings.images_path) / species.scientific_name.replace(" ", "_")
        species_dir.mkdir(parents=True, exist_ok=True)

        # Download image
        filename = f"{image.source}_{image.source_id or image.id}.jpg"
        local_path = species_dir / filename

        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
            }
            with httpx.Client(timeout=30, headers=headers, follow_redirects=True) as client:
                response = client.get(image.url)
                response.raise_for_status()

            with open(local_path, "wb") as f:
                f.write(response.content)
        except Exception as e:
            # Also removes a partially written file, if any.
            return _reject_image(db, image, f"Download failed: {e}", local_path)

        # Check minimum size
        try:
            with PILImage.open(local_path) as img:
                width, height = img.size
        except Exception as e:
            return _reject_image(db, image, f"Invalid image: {e}", local_path)

        # The file is closed at this point, so it can be deleted safely.
        if width < 256 or height < 256:
            return _reject_image(db, image, "Image too small", local_path)

        image.width = width
        image.height = height

        # Calculate perceptual hash for deduplication
        phash = calculate_phash(str(local_path))
        if phash:
            # Check for duplicates
            existing = db.query(Image).filter(
                Image.phash == phash,
                Image.id != image.id,
                Image.status == "downloaded"
            ).first()

            # The hash is stored on the row whether or not it is a duplicate.
            image.phash = phash
            if existing:
                return _reject_image(db, image, "Duplicate image", local_path)

        # Calculate blur score
        quality_score = calculate_blur_score(str(local_path))
        image.quality_score = quality_score

        # Reject very blurry images (threshold can be tuned)
        if quality_score < 100:  # Low variance = blurry
            return _reject_image(db, image, "Image too blurry", local_path)

        # Check color distribution (reject herbarium specimens, illustrations)
        color_ok, color_reason = check_color_distribution(str(local_path))
        if not color_ok:
            return _reject_image(db, image, f"Non-photo content: {color_reason}", local_path)

        # Resize to 512x512 max (best effort: a failed resize keeps the
        # original file and does not reject the image)
        resize_image(str(local_path))

        # Update image record
        image.local_path = str(local_path)
        image.status = "downloaded"
        db.commit()

        return {
            "status": "success",
            "path": str(local_path),
            "quality_score": quality_score,
        }

    except Exception as e:
        # A failed flush/commit leaves the session unusable until it is
        # rolled back, so do that before recording the rejection.
        db.rollback()
        if image is None:
            return {"error": str(e)}
        return _reject_image(db, image, str(e), local_path)
    finally:
        db.close()
@celery_app.task(bind=True)
def batch_process_pending_images(self, source: str = None, chunk_size: int = 500):
    """Process ALL pending images in chunks, with progress tracking.

    Queues one ``download_and_process_image`` task per pending image,
    reporting progress through the task state after every chunk.

    Args:
        source: When given, only images from this source are queued.
        chunk_size: Number of image ids fetched per database round trip.

    Returns:
        ``{"queued": <tasks queued>, "total": <pending images counted at start>}``
    """
    db = SessionLocal()
    try:
        # Only the ids are needed, so avoid loading full Image rows.
        pending_ids = db.query(Image.id).filter(Image.status == "pending")
        if source:
            pending_ids = pending_ids.filter(Image.source == source)

        total = pending_ids.count()
        queued = 0
        last_id = None

        # Page by primary key instead of OFFSET: the queued tasks change
        # image statuses while this loop runs, which shrinks the "pending"
        # result set, so an OFFSET would skip rows that have not been
        # queued yet.
        while queued < total:
            page = pending_ids
            if last_id is not None:
                page = page.filter(Image.id > last_id)
            rows = (
                page.order_by(Image.id)
                .limit(min(chunk_size, total - queued))
                .all()
            )
            if not rows:
                break

            for (image_id,) in rows:
                download_and_process_image.delay(image_id)
                queued += 1

            last_id = rows[-1][0]

            self.update_state(
                state="PROGRESS",
                meta={"queued": queued, "total": total},
            )

        return {"queued": queued, "total": total}
    finally:
        db.close()

View File

@@ -0,0 +1,164 @@
import json
from datetime import datetime
from app.workers.celery_app import celery_app
from app.database import SessionLocal
from app.models import Job, Species, Image
from app.utils.logging import get_job_logger
@celery_app.task(bind=True)
def run_scrape_job(self, job_id: int):
    """Main scrape task that dispatches to source-specific scrapers.

    Loads the job, marks it running, selects the species to scrape
    (optionally restricted by ``species_filter`` and by the number of
    already downloaded images), runs the job's scraper for each species
    and accumulates the downloaded/rejected counters on the job.

    Args:
        job_id: Primary key of the ``Job`` row to execute.

    Returns:
        ``{"status": "completed", "downloaded": ..., "rejected": ...}`` on
        success, or ``{"error": ...}`` when the job or its source is unknown.

    Raises:
        Any unexpected exception outside the per-species loop is re-raised
        after the job has been marked as failed.
    """
    logger = get_job_logger(job_id)
    logger.info(f"Starting scrape job {job_id}")

    db = SessionLocal()
    job = None

    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not job:
            logger.error(f"Job {job_id} not found")
            return {"error": "Job not found"}

        logger.info(f"Job: {job.name}, Source: {job.source}")

        # Update job status
        job.status = "running"
        job.started_at = datetime.utcnow()
        job.celery_task_id = self.request.id
        db.commit()

        # Get species to scrape
        if job.species_filter:
            species_ids = json.loads(job.species_filter)
            query = db.query(Species).filter(Species.id.in_(species_ids))
            logger.info(f"Filtered to species IDs: {species_ids}")
        else:
            query = db.query(Species)
            logger.info("Scraping all species")

        # Filter by image count if requested
        if job.only_without_images or job.max_images:
            from sqlalchemy import func

            # Subquery to count downloaded images per species
            image_count_subquery = (
                db.query(Image.species_id, func.count(Image.id).label("count"))
                .filter(Image.status == "downloaded")
                .group_by(Image.species_id)
                .subquery()
            )

            # Left join with the count subquery; species without any
            # downloaded image get a NULL count.
            query = query.outerjoin(
                image_count_subquery,
                Species.id == image_count_subquery.c.species_id
            )
            downloaded_count = image_count_subquery.c.count

            if job.only_without_images:
                # Filter where count is NULL or 0
                query = query.filter(
                    downloaded_count.is_(None) | (downloaded_count == 0)
                )
                logger.info("Filtering to species without images")
            elif job.max_images:
                # Filter where count is NULL or less than max_images
                query = query.filter(
                    downloaded_count.is_(None) | (downloaded_count < job.max_images)
                )
                logger.info(f"Filtering to species with fewer than {job.max_images} images")

        species_list = query.all()
        logger.info(f"Total species to scrape: {len(species_list)}")

        job.progress_total = len(species_list)
        db.commit()

        # Import scraper based on source
        from app.scrapers import get_scraper
        scraper = get_scraper(job.source)

        if not scraper:
            error_msg = f"Unknown source: {job.source}"
            logger.error(error_msg)
            job.status = "failed"
            job.error_message = error_msg
            job.completed_at = datetime.utcnow()
            db.commit()
            return {"error": error_msg}

        logger.info(f"Using scraper: {scraper.name}")

        # Scrape each species
        for i, species in enumerate(species_list):
            try:
                # Update progress
                job.progress_current = i + 1
                db.commit()

                logger.info(f"[{i+1}/{len(species_list)}] Scraping: {species.scientific_name}")

                # Update task state for real-time monitoring
                self.update_state(
                    state="PROGRESS",
                    meta={
                        "current": i + 1,
                        "total": len(species_list),
                        "species": species.scientific_name,
                    }
                )

                # Run scraper for this species
                results = scraper.scrape_species(species, db, logger)
                downloaded = results.get("downloaded", 0)
                rejected = results.get("rejected", 0)

                job.images_downloaded += downloaded
                job.images_rejected += rejected
                db.commit()

                logger.info(f"  -> Downloaded: {downloaded}, Rejected: {rejected}")

            except Exception as e:
                # Roll back first: after a failed flush/commit the session
                # refuses further work until rolled back, which would make
                # every remaining species fail as well.
                db.rollback()
                # Log error but continue with other species
                logger.error(f"Error scraping {species.scientific_name}: {e}", exc_info=True)
                continue

        # Mark job complete
        job.status = "completed"
        job.completed_at = datetime.utcnow()
        db.commit()

        logger.info(f"Job {job_id} completed. Total downloaded: {job.images_downloaded}, rejected: {job.images_rejected}")

        return {
            "status": "completed",
            "downloaded": job.images_downloaded,
            "rejected": job.images_rejected,
        }

    except Exception as e:
        logger.error(f"Job {job_id} failed with error: {e}", exc_info=True)
        # Clear any failed transaction so the failure can be recorded.
        db.rollback()
        if job:
            job.status = "failed"
            job.error_message = str(e)
            job.completed_at = datetime.utcnow()
            db.commit()
        raise
    finally:
        db.close()
@celery_app.task
def pause_scrape_job(job_id: int):
    """Mark a running scrape job as paused and revoke its Celery task.

    Jobs that do not exist or are not in the ``running`` state are left
    untouched. The same ``{"status": "paused"}`` payload is returned in
    every case.
    """
    db = SessionLocal()
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not (job and job.status == "running"):
            return {"status": "paused"}

        job.status = "paused"
        db.commit()

        # Stop the worker process that is executing the job, if known.
        task_id = job.celery_task_id
        if task_id:
            celery_app.control.revoke(task_id, terminate=True)

        return {"status": "paused"}
    finally:
        db.close()

View File

@@ -0,0 +1,193 @@
import json
import os
from datetime import datetime
from pathlib import Path
from sqlalchemy import func, case, text
from app.workers.celery_app import celery_app
from app.database import SessionLocal
from app.models import Species, Image, Job
from app.models.cached_stats import CachedStats
from app.config import get_settings
def get_directory_size_fast(path: str) -> int:
    """Return the total size in bytes of the regular files below ``path``.

    Walks the tree with ``os.scandir`` without following symlinks.
    Entries or directories that cannot be read are skipped, so a missing
    or unreadable ``path`` yields 0.
    """
    size_bytes = 0
    try:
        with os.scandir(path) as entries:
            for item in entries:
                try:
                    if item.is_file(follow_symlinks=False):
                        size_bytes += item.stat(follow_symlinks=False).st_size
                        continue
                    if item.is_dir(follow_symlinks=False):
                        size_bytes += get_directory_size_fast(item.path)
                except OSError:
                    # PermissionError is an OSError subclass; skip the entry.
                    continue
    except OSError:
        # Directory missing or unreadable: report what was counted so far.
        return size_bytes
    return size_bytes
@celery_app.task
def refresh_stats():
    """Recompute the dashboard statistics and cache them in the database.

    Runs a handful of aggregate SQL queries (overall counts, per-source,
    per-license, job states, top and under-represented species), measures
    the size of the images directory, and stores everything as JSON in the
    ``CachedStats`` row keyed ``dashboard_stats``.

    Returns:
        ``{"status": "success", "total_species": ..., "total_images": ...}``

    Raises:
        Any exception is printed and re-raised.
    """
    print("=== STATS TASK: Starting refresh ===", flush=True)

    session = SessionLocal()
    try:
        def species_entries(rows):
            # Shape (id, scientific_name, common_name, image_count) rows.
            return [
                {
                    "id": species_id,
                    "scientific_name": scientific_name,
                    "common_name": common_name,
                    "image_count": image_count,
                }
                for species_id, scientific_name, common_name, image_count in rows
            ]

        # Use raw SQL for maximum performance on SQLite
        # All counts in a single query
        overall = session.execute(text(
            "\n"
            "SELECT\n"
            "(SELECT COUNT(*) FROM species) as total_species,\n"
            "(SELECT COUNT(*) FROM images) as total_images,\n"
            "(SELECT COUNT(*) FROM images WHERE status = 'downloaded') as images_downloaded,\n"
            "(SELECT COUNT(*) FROM images WHERE status = 'pending') as images_pending,\n"
            "(SELECT COUNT(*) FROM images WHERE status = 'rejected') as images_rejected\n"
        )).fetchone()
        total_species = overall[0] or 0
        total_images = overall[1] or 0
        images_downloaded = overall[2] or 0
        images_pending = overall[3] or 0
        images_rejected = overall[4] or 0

        # Per-source stats - single query with GROUP BY
        per_source_rows = session.execute(text(
            "\n"
            "SELECT\n"
            "source,\n"
            "COUNT(*) as total,\n"
            "SUM(CASE WHEN status = 'downloaded' THEN 1 ELSE 0 END) as downloaded,\n"
            "SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,\n"
            "SUM(CASE WHEN status = 'rejected' THEN 1 ELSE 0 END) as rejected\n"
            "FROM images\n"
            "GROUP BY source\n"
        )).fetchall()
        sources = []
        for source_name, image_count, downloaded, pending, rejected in per_source_rows:
            sources.append({
                "source": source_name,
                "image_count": image_count,
                "downloaded": downloaded or 0,
                "pending": pending or 0,
                "rejected": rejected or 0,
            })

        # Per-license stats - single indexed query
        per_license_rows = session.execute(text(
            "\n"
            "SELECT license, COUNT(*) as count\n"
            "FROM images\n"
            "WHERE status = 'downloaded'\n"
            "GROUP BY license\n"
        )).fetchall()
        licenses = [
            {"license": license_name, "count": image_count}
            for license_name, image_count in per_license_rows
        ]

        # Job stats - single query
        job_row = session.execute(text(
            "\n"
            "SELECT\n"
            "SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) as running,\n"
            "SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,\n"
            "SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,\n"
            "SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed\n"
            "FROM jobs\n"
        )).fetchone()
        # SUM over an empty table yields NULL, hence the "or 0" fallbacks.
        jobs = {
            "running": job_row[0] or 0,
            "pending": job_row[1] or 0,
            "completed": job_row[2] or 0,
            "failed": job_row[3] or 0,
        }

        # Top species by image count - optimized with index
        top_species = species_entries(session.execute(text(
            "\n"
            "SELECT s.id, s.scientific_name, s.common_name, COUNT(i.id) as image_count\n"
            "FROM species s\n"
            "INNER JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'\n"
            "GROUP BY s.id\n"
            "ORDER BY image_count DESC\n"
            "LIMIT 10\n"
        )).fetchall())

        # Under-represented species - use pre-computed counts
        under_represented = species_entries(session.execute(text(
            "\n"
            "SELECT s.id, s.scientific_name, s.common_name, COALESCE(img_counts.cnt, 0) as image_count\n"
            "FROM species s\n"
            "LEFT JOIN (\n"
            "SELECT species_id, COUNT(*) as cnt\n"
            "FROM images\n"
            "WHERE status = 'downloaded'\n"
            "GROUP BY species_id\n"
            ") img_counts ON img_counts.species_id = s.id\n"
            "WHERE COALESCE(img_counts.cnt, 0) < 100\n"
            "ORDER BY image_count ASC\n"
            "LIMIT 10\n"
        )).fetchall())

        # Calculate disk usage (fast recursive scan)
        settings = get_settings()
        disk_usage_bytes = get_directory_size_fast(settings.images_path)
        disk_usage_mb = round(disk_usage_bytes / (1024 * 1024), 2)

        # Build the stats object (key order is kept stable for the JSON blob)
        stats = {
            "total_species": total_species,
            "total_images": total_images,
            "images_downloaded": images_downloaded,
            "images_pending": images_pending,
            "images_rejected": images_rejected,
            "disk_usage_mb": disk_usage_mb,
            "sources": sources,
            "licenses": licenses,
            "jobs": jobs,
            "top_species": top_species,
            "under_represented": under_represented,
        }
        payload = json.dumps(stats)

        # Store in database: insert the row on first run, update it afterwards
        cached = session.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()
        if not cached:
            cached = CachedStats(key="dashboard_stats", value=payload)
            session.add(cached)
        else:
            cached.value = payload
            cached.updated_at = datetime.utcnow()

        session.commit()
        print(f"=== STATS TASK: Refreshed (species={total_species}, images={total_images}) ===", flush=True)

        return {"status": "success", "total_species": total_species, "total_images": total_images}

    except Exception as e:
        print(f"=== STATS TASK ERROR: {e} ===", flush=True)
        raise
    finally:
        session.close()