Initial commit — PlantGuideScraper project

2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions
@@ -0,0 +1 @@
+# PlantGuideScraper Backend
@@ -0,0 +1 @@
+# API routes
@@ -0,0 +1,175 @@
+import json
+import os
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi.responses import FileResponse
+from sqlalchemy.orm import Session
+from sqlalchemy import func
+
+from app.database import get_db
+from app.models import Export, Image, Species
+from app.schemas.export import (
+    ExportCreate,
+    ExportResponse,
+    ExportListResponse,
+    ExportPreview,
+)
+from app.workers.export_tasks import generate_export
+
+# Router on which the export endpoints defined in this module are registered.
+router = APIRouter()
+
+
+@router.get("", response_model=ExportListResponse)
+def list_exports(
+    limit: int = Query(50, ge=1, le=200),
+    db: Session = Depends(get_db),
+):
+    """List all exports."""
+    total = db.query(Export).count()
+    exports = db.query(Export).order_by(Export.created_at.desc()).limit(limit).all()
+
+    return ExportListResponse(
+        items=[ExportResponse.model_validate(e) for e in exports],
+        total=total,
+    )
+
+
+@router.post("/preview", response_model=ExportPreview)
+def preview_export(export: ExportCreate, db: Session = Depends(get_db)):
+    """Preview export without creating it."""
+    criteria = export.filter_criteria
+    min_images = criteria.min_images_per_species
+
+    # Build query
+    query = db.query(Image).filter(Image.status == "downloaded")
+
+    if criteria.licenses:
+        query = query.filter(Image.license.in_(criteria.licenses))
+
+    if criteria.min_quality:
+        query = query.filter(Image.quality_score >= criteria.min_quality)
+
+    if criteria.species_ids:
+        query = query.filter(Image.species_id.in_(criteria.species_ids))
+
+    # Count images per species
+    species_counts = db.query(
+        Image.species_id,
+        func.count(Image.id).label("count")
+    ).filter(Image.status == "downloaded")
+
+    if criteria.licenses:
+        species_counts = species_counts.filter(Image.license.in_(criteria.licenses))
+    if criteria.min_quality:
+        species_counts = species_counts.filter(Image.quality_score >= criteria.min_quality)
+    if criteria.species_ids:
+        species_counts = species_counts.filter(Image.species_id.in_(criteria.species_ids))
+
+    species_counts = species_counts.group_by(Image.species_id).all()
+
+    valid_species = [s for s in species_counts if s.count >= min_images]
+    total_images = sum(s.count for s in valid_species)
+
+    # Estimate file size (rough: 50KB per image)
+    estimated_size_mb = (total_images * 50) / 1024
+
+    return ExportPreview(
+        species_count=len(valid_species),
+        image_count=total_images,
+        estimated_size_mb=estimated_size_mb,
+    )
+
+
+@router.post("", response_model=ExportResponse)
+def create_export(export: ExportCreate, db: Session = Depends(get_db)):
+    """Create and start a new export job."""
+    db_export = Export(
+        name=export.name,
+        filter_criteria=export.filter_criteria.model_dump_json(),
+        train_split=export.train_split,
+        status="pending",
+    )
+    db.add(db_export)
+    db.commit()
+    db.refresh(db_export)
+
+    # Start Celery task
+    task = generate_export.delay(db_export.id)
+    db_export.celery_task_id = task.id
+    db.commit()
+
+    return ExportResponse.model_validate(db_export)
+
+
+@router.get("/{export_id}", response_model=ExportResponse)
+def get_export(export_id: int, db: Session = Depends(get_db)):
+    """Get export status."""
+    export = db.query(Export).filter(Export.id == export_id).first()
+    if not export:
+        raise HTTPException(status_code=404, detail="Export not found")
+
+    return ExportResponse.model_validate(export)
+
+
+@router.get("/{export_id}/progress")
+def get_export_progress(export_id: int, db: Session = Depends(get_db)):
+    """Get real-time export progress."""
+    from app.workers.celery_app import celery_app
+
+    export = db.query(Export).filter(Export.id == export_id).first()
+    if not export:
+        raise HTTPException(status_code=404, detail="Export not found")
+
+    if not export.celery_task_id:
+        return {"status": export.status}
+
+    result = celery_app.AsyncResult(export.celery_task_id)
+
+    if result.state == "PROGRESS":
+        meta = result.info
+        return {
+            "status": "generating",
+            "current": meta.get("current", 0),
+            "total": meta.get("total", 0),
+            "current_species": meta.get("species", ""),
+        }
+
+    return {"status": export.status}
+
+
+@router.get("/{export_id}/download")
+def download_export(export_id: int, db: Session = Depends(get_db)):
+    """Download export zip file."""
+    export = db.query(Export).filter(Export.id == export_id).first()
+    if not export:
+        raise HTTPException(status_code=404, detail="Export not found")
+
+    if export.status != "completed":
+        raise HTTPException(status_code=400, detail="Export not ready")
+
+    if not export.file_path or not os.path.exists(export.file_path):
+        raise HTTPException(status_code=404, detail="Export file not found")
+
+    return FileResponse(
+        export.file_path,
+        media_type="application/zip",
+        filename=f"{export.name}.zip",
+    )
+
+
+@router.delete("/{export_id}")
+def delete_export(export_id: int, db: Session = Depends(get_db)):
+    """Delete an export and its file."""
+    export = db.query(Export).filter(Export.id == export_id).first()
+    if not export:
+        raise HTTPException(status_code=404, detail="Export not found")
+
+    # Delete file if exists
+    if export.file_path and os.path.exists(export.file_path):
+        os.remove(export.file_path)
+
+    db.delete(export)
+    db.commit()
+
+    return {"status": "deleted"}
@@ -0,0 +1,441 @@
+import os
+import shutil
+import uuid
+from pathlib import Path
+from typing import Optional, List
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi.responses import FileResponse
+from sqlalchemy.orm import Session
+from sqlalchemy import func
+from PIL import Image as PILImage
+
+from app.database import get_db
+from app.models import Image, Species
+from app.schemas.image import ImageResponse, ImageListResponse
+from app.config import get_settings
+
+# Router on which the image endpoints defined in this module are registered.
+router = APIRouter()
+# Application settings; this module reads imports_path and images_path from it.
+settings = get_settings()
+
+
+@router.get("", response_model=ImageListResponse)
+def list_images(
+    page: int = Query(1, ge=1),
+    page_size: int = Query(50, ge=1, le=200),
+    species_id: Optional[int] = None,
+    source: Optional[str] = None,
+    license: Optional[str] = None,
+    status: Optional[str] = None,
+    min_quality: Optional[float] = None,
+    search: Optional[str] = None,
+    db: Session = Depends(get_db),
+):
+    """List images with pagination and filters."""
+    # Use joinedload to fetch species in single query
+    from sqlalchemy.orm import joinedload
+    query = db.query(Image).options(joinedload(Image.species))
+
+    if species_id:
+        query = query.filter(Image.species_id == species_id)
+
+    if source:
+        query = query.filter(Image.source == source)
+
+    if license:
+        query = query.filter(Image.license == license)
+
+    if status:
+        query = query.filter(Image.status == status)
+
+    if min_quality:
+        query = query.filter(Image.quality_score >= min_quality)
+
+    if search:
+        search_term = f"%{search}%"
+        query = query.join(Species).filter(
+            (Species.scientific_name.ilike(search_term)) |
+            (Species.common_name.ilike(search_term))
+        )
+
+    # Use faster count for simple queries
+    if not search:
+        # Build count query without join for better performance
+        count_query = db.query(func.count(Image.id))
+        if species_id:
+            count_query = count_query.filter(Image.species_id == species_id)
+        if source:
+            count_query = count_query.filter(Image.source == source)
+        if license:
+            count_query = count_query.filter(Image.license == license)
+        if status:
+            count_query = count_query.filter(Image.status == status)
+        if min_quality:
+            count_query = count_query.filter(Image.quality_score >= min_quality)
+        total = count_query.scalar()
+    else:
+        total = query.count()
+
+    pages = (total + page_size - 1) // page_size
+
+    images = query.order_by(Image.created_at.desc()).offset(
+        (page - 1) * page_size
+    ).limit(page_size).all()
+
+    items = [
+        ImageResponse(
+            id=img.id,
+            species_id=img.species_id,
+            species_name=img.species.scientific_name if img.species else None,
+            source=img.source,
+            source_id=img.source_id,
+            url=img.url,
+            local_path=img.local_path,
+            license=img.license,
+            attribution=img.attribution,
+            width=img.width,
+            height=img.height,
+            quality_score=img.quality_score,
+            status=img.status,
+            created_at=img.created_at,
+        )
+        for img in images
+    ]
+
+    return ImageListResponse(
+        items=items,
+        total=total,
+        page=page,
+        page_size=page_size,
+        pages=pages,
+    )
+
+
+@router.get("/sources")
+def list_sources(db: Session = Depends(get_db)):
+    """List all unique image sources."""
+    sources = db.query(Image.source).distinct().all()
+    return [s[0] for s in sources]
+
+
+@router.get("/licenses")
+def list_licenses(db: Session = Depends(get_db)):
+    """List all unique licenses."""
+    licenses = db.query(Image.license).distinct().all()
+    return [l[0] for l in licenses]
+
+
+@router.post("/process-pending")
+def process_pending_images(
+    source: Optional[str] = None,
+    db: Session = Depends(get_db),
+):
+    """Queue all pending images for download and processing."""
+    from app.workers.quality_tasks import batch_process_pending_images
+
+    query = db.query(func.count(Image.id)).filter(Image.status == "pending")
+    if source:
+        query = query.filter(Image.source == source)
+    pending_count = query.scalar()
+
+    task = batch_process_pending_images.delay(source=source)
+
+    return {
+        "pending_count": pending_count,
+        "task_id": task.id,
+    }
+
+
+@router.get("/process-pending/status/{task_id}")
+def process_pending_status(task_id: str):
+    """Check status of a batch processing task."""
+    from app.workers.celery_app import celery_app
+
+    result = celery_app.AsyncResult(task_id)
+    state = result.state  # PENDING, STARTED, PROGRESS, SUCCESS, FAILURE
+
+    response = {"task_id": task_id, "state": state}
+
+    if state == "PROGRESS" and isinstance(result.info, dict):
+        response["queued"] = result.info.get("queued", 0)
+        response["total"] = result.info.get("total", 0)
+    elif state == "SUCCESS" and isinstance(result.result, dict):
+        response["queued"] = result.result.get("queued", 0)
+        response["total"] = result.result.get("total", 0)
+
+    return response
+
+
+@router.get("/{image_id}", response_model=ImageResponse)
+def get_image(image_id: int, db: Session = Depends(get_db)):
+    """Get an image by ID."""
+    image = db.query(Image).filter(Image.id == image_id).first()
+    if not image:
+        raise HTTPException(status_code=404, detail="Image not found")
+
+    return ImageResponse(
+        id=image.id,
+        species_id=image.species_id,
+        species_name=image.species.scientific_name if image.species else None,
+        source=image.source,
+        source_id=image.source_id,
+        url=image.url,
+        local_path=image.local_path,
+        license=image.license,
+        attribution=image.attribution,
+        width=image.width,
+        height=image.height,
+        quality_score=image.quality_score,
+        status=image.status,
+        created_at=image.created_at,
+    )
+
+
+@router.get("/{image_id}/file")
+def get_image_file(image_id: int, db: Session = Depends(get_db)):
+    """Get the actual image file."""
+    image = db.query(Image).filter(Image.id == image_id).first()
+    if not image:
+        raise HTTPException(status_code=404, detail="Image not found")
+
+    if not image.local_path:
+        raise HTTPException(status_code=404, detail="Image file not available")
+
+    return FileResponse(image.local_path, media_type="image/jpeg")
+
+
+@router.delete("/{image_id}")
+def delete_image(image_id: int, db: Session = Depends(get_db)):
+    """Delete an image."""
+    image = db.query(Image).filter(Image.id == image_id).first()
+    if not image:
+        raise HTTPException(status_code=404, detail="Image not found")
+
+    # Delete file if exists
+    if image.local_path:
+        import os
+        if os.path.exists(image.local_path):
+            os.remove(image.local_path)
+
+    db.delete(image)
+    db.commit()
+
+    return {"status": "deleted"}
+
+
+@router.post("/bulk-delete")
+def bulk_delete_images(
+    image_ids: List[int],
+    db: Session = Depends(get_db),
+):
+    """Delete multiple images."""
+    import os
+
+    images = db.query(Image).filter(Image.id.in_(image_ids)).all()
+
+    deleted = 0
+    for image in images:
+        if image.local_path and os.path.exists(image.local_path):
+            os.remove(image.local_path)
+        db.delete(image)
+        deleted += 1
+
+    db.commit()
+
+    return {"deleted": deleted}
+
+
+@router.get("/import/scan")
+def scan_imports(db: Session = Depends(get_db)):
+    """Scan the imports folder and return what can be imported.
+
+    Expected structure: imports/{source}/{species_name}/*.jpg
+    """
+    imports_path = Path(settings.imports_path)
+
+    if not imports_path.exists():
+        return {
+            "available": False,
+            "message": f"Imports folder not found: {imports_path}",
+            "sources": [],
+            "total_images": 0,
+            "matched_species": 0,
+            "unmatched_species": [],
+        }
+
+    results = {
+        "available": True,
+        "sources": [],
+        "total_images": 0,
+        "matched_species": 0,
+        "unmatched_species": [],
+    }
+
+    # Get all species for matching
+    species_map = {}
+    for species in db.query(Species).all():
+        # Map by scientific name with underscores and spaces
+        species_map[species.scientific_name.lower()] = species
+        species_map[species.scientific_name.replace(" ", "_").lower()] = species
+
+    seen_unmatched = set()
+
+    # Scan source folders
+    for source_dir in imports_path.iterdir():
+        if not source_dir.is_dir():
+            continue
+
+        source_name = source_dir.name
+        source_info = {
+            "name": source_name,
+            "species_count": 0,
+            "image_count": 0,
+        }
+
+        # Scan species folders within source
+        for species_dir in source_dir.iterdir():
+            if not species_dir.is_dir():
+                continue
+
+            species_name = species_dir.name.replace("_", " ")
+            species_key = species_name.lower()
+
+            # Count images
+            image_files = list(species_dir.glob("*.jpg")) + \
+                         list(species_dir.glob("*.jpeg")) + \
+                         list(species_dir.glob("*.png"))
+
+            if not image_files:
+                continue
+
+            source_info["image_count"] += len(image_files)
+            results["total_images"] += len(image_files)
+
+            if species_key in species_map or species_dir.name.lower() in species_map:
+                source_info["species_count"] += 1
+                results["matched_species"] += 1
+            else:
+                if species_name not in seen_unmatched:
+                    seen_unmatched.add(species_name)
+                    results["unmatched_species"].append(species_name)
+
+        if source_info["image_count"] > 0:
+            results["sources"].append(source_info)
+
+    return results
+
+
+@router.post("/import/run")
+def run_import(
+    move_files: bool = Query(False, description="Move files instead of copy"),
+    db: Session = Depends(get_db),
+):
+    """Import images from the imports folder.
+
+    Expected structure: imports/{source}/{species_name}/*.jpg
+    Images are copied/moved to: images/{species_name}/{source}_{filename}
+
+    Species folders that match no known scientific name are passed over
+    without being counted. A file is skipped when an image row with the same
+    species, source and file stem already exists. Per-file failures are
+    collected rather than raised.
+
+    Returns:
+        A dict with the ``imported`` and ``skipped`` counts and at most the
+        first 20 collected ``errors``.
+
+    Raises:
+        HTTPException: 400 when the imports folder does not exist.
+    """
+    imports_path = Path(settings.imports_path)
+    images_path = Path(settings.images_path)
+
+    if not imports_path.exists():
+        raise HTTPException(status_code=400, detail="Imports folder not found")
+
+    # Get all species for matching
+    # Keys are lower-cased scientific names in both "a b" and "a_b" spellings.
+    species_map = {}
+    for species in db.query(Species).all():
+        species_map[species.scientific_name.lower()] = species
+        species_map[species.scientific_name.replace(" ", "_").lower()] = species
+
+    imported = 0
+    skipped = 0
+    errors = []
+
+    # Scan source folders
+    for source_dir in imports_path.iterdir():
+        if not source_dir.is_dir():
+            continue
+
+        source_name = source_dir.name
+
+        # Scan species folders within source
+        for species_dir in source_dir.iterdir():
+            if not species_dir.is_dir():
+                continue
+
+            species_name = species_dir.name.replace("_", " ")
+            species_key = species_name.lower()
+
+            # Find matching species
+            species = species_map.get(species_key) or species_map.get(species_dir.name.lower())
+            if not species:
+                # Unmatched folders are neither counted as skipped nor reported as errors.
+                continue
+
+            # Create target directory
+            target_dir = images_path / species.scientific_name.replace(" ", "_")
+            target_dir.mkdir(parents=True, exist_ok=True)
+
+            # Process images
+            image_files = list(species_dir.glob("*.jpg")) + \
+                         list(species_dir.glob("*.jpeg")) + \
+                         list(species_dir.glob("*.png"))
+
+            for img_file in image_files:
+                try:
+                    # Generate unique filename
+                    # (.jpeg is normalised to .jpg; 8 random hex characters are appended)
+                    ext = img_file.suffix.lower()
+                    if ext == ".jpeg":
+                        ext = ".jpg"
+                    new_filename = f"{source_name}_{img_file.stem}_{uuid.uuid4().hex[:8]}{ext}"
+                    target_path = target_dir / new_filename
+
+                    # Check if already imported (by original filename pattern)
+                    # The extension is not part of this key, only the file stem.
+                    existing = db.query(Image).filter(
+                        Image.species_id == species.id,
+                        Image.source == source_name,
+                        Image.source_id == img_file.stem,
+                    ).first()
+
+                    if existing:
+                        skipped += 1
+                        continue
+
+                    # Get image dimensions
+                    # An unreadable file is still imported, just without dimensions.
+                    try:
+                        with PILImage.open(img_file) as pil_img:
+                            width, height = pil_img.size
+                    except Exception:
+                        width, height = None, None
+
+                    # Copy or move file
+                    if move_files:
+                        shutil.move(str(img_file), str(target_path))
+                    else:
+                        shutil.copy2(str(img_file), str(target_path))
+
+                    # Create database record
+                    # url keeps the original location inside the imports folder.
+                    image = Image(
+                        species_id=species.id,
+                        source=source_name,
+                        source_id=img_file.stem,
+                        url=f"file://{img_file}",
+                        local_path=str(target_path),
+                        license="unknown",
+                        width=width,
+                        height=height,
+                        status="downloaded",
+                    )
+                    db.add(image)
+                    imported += 1
+
+                except Exception as e:
+                    # Record the failure and carry on with the next file.
+                    errors.append(f"{img_file}: {str(e)}")
+
+            # Commit after each species to avoid large transactions
+            db.commit()
+
+    return {
+        "imported": imported,
+        "skipped": skipped,
+        "errors": errors[:20],
+    }
@@ -0,0 +1,173 @@
+import json
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from sqlalchemy.orm import Session
+
+from app.database import get_db
+from app.models import Job
+from app.schemas.job import JobCreate, JobResponse, JobListResponse
+from app.workers.scrape_tasks import run_scrape_job
+
+# Router on which the scrape-job endpoints defined in this module are registered.
+router = APIRouter()
+
+
+@router.get("", response_model=JobListResponse)
+def list_jobs(
+    status: Optional[str] = None,
+    source: Optional[str] = None,
+    limit: int = Query(50, ge=1, le=200),
+    db: Session = Depends(get_db),
+):
+    """List all jobs."""
+    query = db.query(Job)
+
+    if status:
+        query = query.filter(Job.status == status)
+
+    if source:
+        query = query.filter(Job.source == source)
+
+    total = query.count()
+    jobs = query.order_by(Job.created_at.desc()).limit(limit).all()
+
+    return JobListResponse(
+        items=[JobResponse.model_validate(j) for j in jobs],
+        total=total,
+    )
+
+
+@router.post("", response_model=JobResponse)
+def create_job(job: JobCreate, db: Session = Depends(get_db)):
+    """Create and start a new scrape job."""
+    species_filter = None
+    if job.species_ids:
+        species_filter = json.dumps(job.species_ids)
+
+    db_job = Job(
+        name=job.name,
+        source=job.source,
+        species_filter=species_filter,
+        only_without_images=job.only_without_images,
+        max_images=job.max_images,
+        status="pending",
+    )
+    db.add(db_job)
+    db.commit()
+    db.refresh(db_job)
+
+    # Start the Celery task
+    task = run_scrape_job.delay(db_job.id)
+    db_job.celery_task_id = task.id
+    db.commit()
+
+    return JobResponse.model_validate(db_job)
+
+
+@router.get("/{job_id}", response_model=JobResponse)
+def get_job(job_id: int, db: Session = Depends(get_db)):
+    """Get job status."""
+    job = db.query(Job).filter(Job.id == job_id).first()
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    return JobResponse.model_validate(job)
+
+
+@router.get("/{job_id}/progress")
+def get_job_progress(job_id: int, db: Session = Depends(get_db)):
+    """Get real-time job progress from Celery."""
+    from app.workers.celery_app import celery_app
+
+    job = db.query(Job).filter(Job.id == job_id).first()
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    if not job.celery_task_id:
+        return {
+            "status": job.status,
+            "progress_current": job.progress_current,
+            "progress_total": job.progress_total,
+        }
+
+    # Get Celery task state
+    result = celery_app.AsyncResult(job.celery_task_id)
+
+    if result.state == "PROGRESS":
+        meta = result.info
+        return {
+            "status": "running",
+            "progress_current": meta.get("current", 0),
+            "progress_total": meta.get("total", 0),
+            "current_species": meta.get("species", ""),
+        }
+
+    return {
+        "status": job.status,
+        "progress_current": job.progress_current,
+        "progress_total": job.progress_total,
+    }
+
+
+@router.post("/{job_id}/pause")
+def pause_job(job_id: int, db: Session = Depends(get_db)):
+    """Pause a running job."""
+    from app.workers.celery_app import celery_app
+
+    job = db.query(Job).filter(Job.id == job_id).first()
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    if job.status != "running":
+        raise HTTPException(status_code=400, detail="Job is not running")
+
+    # Revoke Celery task
+    if job.celery_task_id:
+        celery_app.control.revoke(job.celery_task_id, terminate=True)
+
+    job.status = "paused"
+    db.commit()
+
+    return {"status": "paused"}
+
+
+@router.post("/{job_id}/resume")
+def resume_job(job_id: int, db: Session = Depends(get_db)):
+    """Resume a paused job."""
+    job = db.query(Job).filter(Job.id == job_id).first()
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    if job.status != "paused":
+        raise HTTPException(status_code=400, detail="Job is not paused")
+
+    # Start new Celery task
+    task = run_scrape_job.delay(job.id)
+    job.celery_task_id = task.id
+    job.status = "pending"
+    db.commit()
+
+    return {"status": "resumed"}
+
+
+@router.post("/{job_id}/cancel")
+def cancel_job(job_id: int, db: Session = Depends(get_db)):
+    """Cancel a job."""
+    from app.workers.celery_app import celery_app
+
+    job = db.query(Job).filter(Job.id == job_id).first()
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    if job.status in ["completed", "failed"]:
+        raise HTTPException(status_code=400, detail="Job already finished")
+
+    # Revoke Celery task
+    if job.celery_task_id:
+        celery_app.control.revoke(job.celery_task_id, terminate=True)
+
+    job.status = "failed"
+    job.error_message = "Cancelled by user"
+    db.commit()
+
+    return {"status": "cancelled"}
@@ -0,0 +1,198 @@
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.orm import Session
+
+from app.database import get_db
+from app.models import ApiKey
+from app.schemas.api_key import ApiKeyCreate, ApiKeyUpdate, ApiKeyResponse
+
+# Router on which the source-configuration endpoints defined in this module are registered.
+router = APIRouter()
+
+# Available sources
+# Each entry describes one source the endpoints below accept; a source name
+# that is not listed here is answered with 404 "Unknown source" by get/put.
+# name: identifier used in URLs and matched against ApiKey.source
+# label: display name
+# auth_type: "none" (no auth), "api_key" (single key), "api_key_secret" (key + secret), "oauth" (client_id + client_secret + access_token)
+# default_rate: safe default requests per second for each API
+AVAILABLE_SOURCES = [
+    {"name": "gbif", "label": "GBIF", "requires_secret": False, "auth_type": "none", "default_rate": 1.0},  # Free, no auth required
+    {"name": "inaturalist", "label": "iNaturalist", "requires_secret": True, "auth_type": "api_key_secret", "default_rate": 1.0},  # 60/min limit
+    {"name": "flickr", "label": "Flickr", "requires_secret": True, "auth_type": "api_key_secret", "default_rate": 0.5},  # 3600/hr shared limit
+    {"name": "wikimedia", "label": "Wikimedia Commons", "requires_secret": True, "auth_type": "oauth", "default_rate": 1.0},  # generous limits
+    {"name": "trefle", "label": "Trefle.io", "requires_secret": False, "auth_type": "api_key", "default_rate": 1.0},  # 120/min limit
+    {"name": "duckduckgo", "label": "DuckDuckGo", "requires_secret": False, "auth_type": "none", "default_rate": 0.5},  # Web search, no API key
+    {"name": "bing", "label": "Bing Image Search", "requires_secret": False, "auth_type": "api_key", "default_rate": 3.0},  # Azure Cognitive Services
+]
+
+
+def mask_api_key(key: str) -> str:
+    """Mask API key, showing only last 4 characters."""
+    if not key or len(key) <= 4:
+        return "****"
+    return "*" * (len(key) - 4) + key[-4:]
+
+
+@router.get("")
+def list_sources(db: Session = Depends(get_db)):
+    """List all available sources with their configuration status."""
+    api_keys = {k.source: k for k in db.query(ApiKey).all()}
+
+    result = []
+    for source in AVAILABLE_SOURCES:
+        api_key = api_keys.get(source["name"])
+        default_rate = source.get("default_rate", 1.0)
+        result.append({
+            "name": source["name"],
+            "label": source["label"],
+            "requires_secret": source["requires_secret"],
+            "auth_type": source.get("auth_type", "api_key"),
+            "configured": api_key is not None,
+            "enabled": api_key.enabled if api_key else False,
+            "api_key_masked": mask_api_key(api_key.api_key) if api_key else None,
+            "has_secret": bool(api_key.api_secret) if api_key else False,
+            "has_access_token": bool(getattr(api_key, 'access_token', None)) if api_key else False,
+            "rate_limit_per_sec": api_key.rate_limit_per_sec if api_key else default_rate,
+            "default_rate": default_rate,
+        })
+
+    return result
+
+
+@router.get("/{source}")
+def get_source(source: str, db: Session = Depends(get_db)):
+    """Get source configuration."""
+    source_info = next((s for s in AVAILABLE_SOURCES if s["name"] == source), None)
+    if not source_info:
+        raise HTTPException(status_code=404, detail="Unknown source")
+
+    api_key = db.query(ApiKey).filter(ApiKey.source == source).first()
+    default_rate = source_info.get("default_rate", 1.0)
+
+    return {
+        "name": source_info["name"],
+        "label": source_info["label"],
+        "requires_secret": source_info["requires_secret"],
+        "auth_type": source_info.get("auth_type", "api_key"),
+        "configured": api_key is not None,
+        "enabled": api_key.enabled if api_key else False,
+        "api_key_masked": mask_api_key(api_key.api_key) if api_key else None,
+        "has_secret": bool(api_key.api_secret) if api_key else False,
+        "has_access_token": bool(getattr(api_key, 'access_token', None)) if api_key else False,
+        "rate_limit_per_sec": api_key.rate_limit_per_sec if api_key else default_rate,
+        "default_rate": default_rate,
+    }
+
+
+@router.put("/{source}")
+def update_source(
+    source: str,
+    config: ApiKeyCreate,
+    db: Session = Depends(get_db),
+):
+    """Create or update source configuration."""
+    source_info = next((s for s in AVAILABLE_SOURCES if s["name"] == source), None)
+    if not source_info:
+        raise HTTPException(status_code=404, detail="Unknown source")
+
+    # For sources that require auth, validate api_key is provided
+    auth_type = source_info.get("auth_type", "api_key")
+    if auth_type != "none" and not config.api_key:
+        raise HTTPException(status_code=400, detail="API key is required for this source")
+
+    api_key = db.query(ApiKey).filter(ApiKey.source == source).first()
+
+    # Use placeholder for no-auth sources
+    api_key_value = config.api_key or "no-auth"
+
+    if api_key:
+        # Update existing
+        api_key.api_key = api_key_value
+        if config.api_secret:
+            api_key.api_secret = config.api_secret
+        if config.access_token:
+            api_key.access_token = config.access_token
+        api_key.rate_limit_per_sec = config.rate_limit_per_sec
+        api_key.enabled = config.enabled
+    else:
+        # Create new
+        api_key = ApiKey(
+            source=source,
+            api_key=api_key_value,
+            api_secret=config.api_secret,
+            access_token=config.access_token,
+            rate_limit_per_sec=config.rate_limit_per_sec,
+            enabled=config.enabled,
+        )
+        db.add(api_key)
+
+    db.commit()
+    db.refresh(api_key)
+
+    return {
+        "name": source,
+        "configured": True,
+        "enabled": api_key.enabled,
+        "api_key_masked": mask_api_key(api_key.api_key) if auth_type != "none" else None,
+        "has_secret": bool(api_key.api_secret),
+        "has_access_token": bool(api_key.access_token),
+        "rate_limit_per_sec": api_key.rate_limit_per_sec,
+    }
+
+
+@router.patch("/{source}")
+def patch_source(
+    source: str,
+    config: ApiKeyUpdate,
+    db: Session = Depends(get_db),
+):
+    """Partially update source configuration."""
+    api_key = db.query(ApiKey).filter(ApiKey.source == source).first()
+    if not api_key:
+        raise HTTPException(status_code=404, detail="Source not configured")
+
+    update_data = config.model_dump(exclude_unset=True)
+    for field, value in update_data.items():
+        setattr(api_key, field, value)
+
+    db.commit()
+    db.refresh(api_key)
+
+    return {
+        "name": source,
+        "configured": True,
+        "enabled": api_key.enabled,
+        "api_key_masked": mask_api_key(api_key.api_key),
+        "has_secret": bool(api_key.api_secret),
+        "has_access_token": bool(api_key.access_token),
+        "rate_limit_per_sec": api_key.rate_limit_per_sec,
+    }
+
+
+@router.delete("/{source}")
+def delete_source(source: str, db: Session = Depends(get_db)):
+    """Delete source configuration."""
+    api_key = db.query(ApiKey).filter(ApiKey.source == source).first()
+    if not api_key:
+        raise HTTPException(status_code=404, detail="Source not configured")
+
+    db.delete(api_key)
+    db.commit()
+
+    return {"status": "deleted"}
+
+
+@router.post("/{source}/test")
+def test_source(source: str, db: Session = Depends(get_db)):
+    """Test source API connection."""
+    api_key = db.query(ApiKey).filter(ApiKey.source == source).first()
+    if not api_key:
+        raise HTTPException(status_code=404, detail="Source not configured")
+
+    # Import and test the scraper
+    from app.scrapers import get_scraper
+
+    scraper = get_scraper(source)
+    if not scraper:
+        raise HTTPException(status_code=400, detail="No scraper for this source")
+
+    try:
+        result = scraper.test_connection(api_key)
+        return {"status": "success", "message": result}
+    except Exception as e:
+        return {"status": "error", "message": str(e)}
@@ -0,0 +1,366 @@
+import csv
+import io
+import json
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query, UploadFile, File
+from sqlalchemy.orm import Session
+from sqlalchemy import func, text
+
+from app.database import get_db
+from app.models import Species, Image
+from app.schemas.species import (
+    SpeciesCreate,
+    SpeciesUpdate,
+    SpeciesResponse,
+    SpeciesListResponse,
+    SpeciesImportResponse,
+)
+
+router = APIRouter()
+
+
+def get_species_with_count(db: Session, species: Species) -> SpeciesResponse:
+    """Get species response with image count."""
+    image_count = db.query(func.count(Image.id)).filter(
+        Image.species_id == species.id,
+        Image.status == "downloaded"
+    ).scalar()
+
+    return SpeciesResponse(
+        id=species.id,
+        scientific_name=species.scientific_name,
+        common_name=species.common_name,
+        genus=species.genus,
+        family=species.family,
+        created_at=species.created_at,
+        image_count=image_count or 0,
+    )
+
+
+@router.get("", response_model=SpeciesListResponse)
+def list_species(
+    page: int = Query(1, ge=1),
+    page_size: int = Query(50, ge=1, le=500),
+    search: Optional[str] = None,
+    genus: Optional[str] = None,
+    has_images: Optional[bool] = None,
+    max_images: Optional[int] = Query(None, description="Filter species with less than N images"),
+    min_images: Optional[int] = Query(None, description="Filter species with at least N images"),
+    db: Session = Depends(get_db),
+):
+    """List species with pagination and filters.
+
+    Filters:
+    - search: Search by scientific or common name
+    - genus: Filter by genus
+    - has_images: True for species with images, False for species without
+    - max_images: Filter species with fewer than N downloaded images
+    - min_images: Filter species with at least N downloaded images
+    """
+    # If filtering by image count, we need to use a subquery approach
+    if max_images is not None or min_images is not None:
+        # Build a subquery with image counts per species
+        image_counts = (
+            db.query(
+                Species.id.label("species_id"),
+                func.count(Image.id).label("img_count")
+            )
+            .outerjoin(Image, (Image.species_id == Species.id) & (Image.status == "downloaded"))
+            .group_by(Species.id)
+            .subquery()
+        )
+
+        # Join species with their counts
+        query = db.query(Species).join(
+            image_counts, Species.id == image_counts.c.species_id
+        )
+
+        if max_images is not None:
+            query = query.filter(image_counts.c.img_count < max_images)
+
+        if min_images is not None:
+            query = query.filter(image_counts.c.img_count >= min_images)
+    else:
+        query = db.query(Species)
+
+    if search:
+        search_term = f"%{search}%"
+        query = query.filter(
+            (Species.scientific_name.ilike(search_term)) |
+            (Species.common_name.ilike(search_term))
+        )
+
+    if genus:
+        query = query.filter(Species.genus == genus)
+
+    # Filter by whether species has downloaded images (only if not using min/max filters)
+    if has_images is not None and max_images is None and min_images is None:
+        # Get IDs of species that have at least one downloaded image
+        species_with_images = (
+            db.query(Image.species_id)
+            .filter(Image.status == "downloaded")
+            .distinct()
+            .subquery()
+        )
+        if has_images:
+            query = query.filter(Species.id.in_(db.query(species_with_images.c.species_id)))
+        else:
+            query = query.filter(~Species.id.in_(db.query(species_with_images.c.species_id)))
+
+    total = query.count()
+    pages = (total + page_size - 1) // page_size
+
+    species_list = query.order_by(Species.scientific_name).offset(
+        (page - 1) * page_size
+    ).limit(page_size).all()
+
+    # Fetch image counts in bulk for all species on this page
+    species_ids = [s.id for s in species_list]
+    if species_ids:
+        count_query = db.query(
+            Image.species_id,
+            func.count(Image.id)
+        ).filter(
+            Image.species_id.in_(species_ids),
+            Image.status == "downloaded"
+        ).group_by(Image.species_id).all()
+        count_map = {species_id: count for species_id, count in count_query}
+    else:
+        count_map = {}
+
+    items = [
+        SpeciesResponse(
+            id=s.id,
+            scientific_name=s.scientific_name,
+            common_name=s.common_name,
+            genus=s.genus,
+            family=s.family,
+            created_at=s.created_at,
+            image_count=count_map.get(s.id, 0),
+        )
+        for s in species_list
+    ]
+
+    return SpeciesListResponse(
+        items=items,
+        total=total,
+        page=page,
+        page_size=page_size,
+        pages=pages,
+    )
+
+
+@router.post("", response_model=SpeciesResponse)
+def create_species(species: SpeciesCreate, db: Session = Depends(get_db)):
+    """Create a new species."""
+    existing = db.query(Species).filter(
+        Species.scientific_name == species.scientific_name
+    ).first()
+
+    if existing:
+        raise HTTPException(status_code=400, detail="Species already exists")
+
+    # Auto-extract genus from scientific name if not provided
+    genus = species.genus
+    if not genus and " " in species.scientific_name:
+        genus = species.scientific_name.split()[0]
+
+    db_species = Species(
+        scientific_name=species.scientific_name,
+        common_name=species.common_name,
+        genus=genus,
+        family=species.family,
+    )
+    db.add(db_species)
+    db.commit()
+    db.refresh(db_species)
+
+    return get_species_with_count(db, db_species)
+
+
+@router.post("/import", response_model=SpeciesImportResponse)
+async def import_species(
+    file: UploadFile = File(...),
+    db: Session = Depends(get_db),
+):
+    """Import species from CSV file.
+
+    Expected columns: scientific_name, common_name (optional), genus (optional), family (optional)
+    """
+    if not file.filename.endswith(".csv"):
+        raise HTTPException(status_code=400, detail="File must be a CSV")
+
+    content = await file.read()
+    text = content.decode("utf-8")
+
+    reader = csv.DictReader(io.StringIO(text))
+
+    imported = 0
+    skipped = 0
+    errors = []
+
+    for row_num, row in enumerate(reader, start=2):
+        scientific_name = row.get("scientific_name", "").strip()
+        if not scientific_name:
+            errors.append(f"Row {row_num}: Missing scientific_name")
+            continue
+
+        # Check if already exists
+        existing = db.query(Species).filter(
+            Species.scientific_name == scientific_name
+        ).first()
+
+        if existing:
+            skipped += 1
+            continue
+
+        # Auto-extract genus if not provided
+        genus = row.get("genus", "").strip()
+        if not genus and " " in scientific_name:
+            genus = scientific_name.split()[0]
+
+        try:
+            species = Species(
+                scientific_name=scientific_name,
+                common_name=row.get("common_name", "").strip() or None,
+                genus=genus or None,
+                family=row.get("family", "").strip() or None,
+            )
+            db.add(species)
+            imported += 1
+        except Exception as e:
+            errors.append(f"Row {row_num}: {str(e)}")
+
+    db.commit()
+
+    return SpeciesImportResponse(
+        imported=imported,
+        skipped=skipped,
+        errors=errors[:10],  # Limit error messages
+    )
+
+
+@router.post("/import-json", response_model=SpeciesImportResponse)
+async def import_species_json(
+    file: UploadFile = File(...),
+    db: Session = Depends(get_db),
+):
+    """Import species from JSON file.
+
+    Expected format: {"plants": [{"scientific_name": "...", "common_names": [...], "family": "..."}]}
+    """
+    if not file.filename.endswith(".json"):
+        raise HTTPException(status_code=400, detail="File must be a JSON")
+
+    content = await file.read()
+    try:
+        data = json.loads(content.decode("utf-8"))
+    except json.JSONDecodeError as e:
+        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")
+
+    plants = data.get("plants", [])
+    if not plants:
+        raise HTTPException(status_code=400, detail="No plants found in JSON")
+
+    imported = 0
+    skipped = 0
+    errors = []
+
+    for idx, plant in enumerate(plants):
+        scientific_name = plant.get("scientific_name", "").strip()
+        if not scientific_name:
+            errors.append(f"Plant {idx}: Missing scientific_name")
+            continue
+
+        # Check if already exists
+        existing = db.query(Species).filter(
+            Species.scientific_name == scientific_name
+        ).first()
+
+        if existing:
+            skipped += 1
+            continue
+
+        # Auto-extract genus from scientific name
+        genus = None
+        if " " in scientific_name:
+            genus = scientific_name.split()[0]
+
+        # Get first common name if array provided
+        common_names = plant.get("common_names", [])
+        common_name = common_names[0] if common_names else None
+
+        try:
+            species = Species(
+                scientific_name=scientific_name,
+                common_name=common_name,
+                genus=genus,
+                family=plant.get("family"),
+            )
+            db.add(species)
+            imported += 1
+        except Exception as e:
+            errors.append(f"Plant {idx}: {str(e)}")
+
+    db.commit()
+
+    return SpeciesImportResponse(
+        imported=imported,
+        skipped=skipped,
+        errors=errors[:10],
+    )
+
+
+@router.get("/{species_id}", response_model=SpeciesResponse)
+def get_species(species_id: int, db: Session = Depends(get_db)):
+    """Get a species by ID."""
+    species = db.query(Species).filter(Species.id == species_id).first()
+    if not species:
+        raise HTTPException(status_code=404, detail="Species not found")
+
+    return get_species_with_count(db, species)
+
+
+@router.put("/{species_id}", response_model=SpeciesResponse)
+def update_species(
+    species_id: int,
+    species_update: SpeciesUpdate,
+    db: Session = Depends(get_db),
+):
+    """Update a species."""
+    species = db.query(Species).filter(Species.id == species_id).first()
+    if not species:
+        raise HTTPException(status_code=404, detail="Species not found")
+
+    update_data = species_update.model_dump(exclude_unset=True)
+    for field, value in update_data.items():
+        setattr(species, field, value)
+
+    db.commit()
+    db.refresh(species)
+
+    return get_species_with_count(db, species)
+
+
+@router.delete("/{species_id}")
+def delete_species(species_id: int, db: Session = Depends(get_db)):
+    """Delete a species and all its images."""
+    species = db.query(Species).filter(Species.id == species_id).first()
+    if not species:
+        raise HTTPException(status_code=404, detail="Species not found")
+
+    db.delete(species)
+    db.commit()
+
+    return {"status": "deleted"}
+
+
+@router.get("/genera/list")
+def list_genera(db: Session = Depends(get_db)):
+    """List all unique genera."""
+    genera = db.query(Species.genus).filter(
+        Species.genus.isnot(None)
+    ).distinct().order_by(Species.genus).all()
+
+    return [g[0] for g in genera]
@@ -0,0 +1,190 @@
+import json
+
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.orm import Session
+from sqlalchemy import func, case
+
+from app.database import get_db
+from app.models import Species, Image, Job
+from app.models.cached_stats import CachedStats
+from app.schemas.stats import StatsResponse, SourceStats, LicenseStats, SpeciesStats, JobStats
+
+router = APIRouter()
+
+
+@router.get("", response_model=StatsResponse)
+def get_stats(db: Session = Depends(get_db)):
+    """Get dashboard statistics from cache (updated every 60s by Celery)."""
+    # Try to get cached stats
+    cached = db.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()
+
+    if cached:
+        data = json.loads(cached.value)
+        return StatsResponse(
+            total_species=data["total_species"],
+            total_images=data["total_images"],
+            images_downloaded=data["images_downloaded"],
+            images_pending=data["images_pending"],
+            images_rejected=data["images_rejected"],
+            disk_usage_mb=data["disk_usage_mb"],
+            sources=[SourceStats(**s) for s in data["sources"]],
+            licenses=[LicenseStats(**l) for l in data["licenses"]],
+            jobs=JobStats(**data["jobs"]),
+            top_species=[SpeciesStats(**s) for s in data["top_species"]],
+            under_represented=[SpeciesStats(**s) for s in data["under_represented"]],
+        )
+
+    # No cache yet - return empty stats (Celery will populate soon)
+    # This only happens on first startup before Celery runs
+    return StatsResponse(
+        total_species=0,
+        total_images=0,
+        images_downloaded=0,
+        images_pending=0,
+        images_rejected=0,
+        disk_usage_mb=0.0,
+        sources=[],
+        licenses=[],
+        jobs=JobStats(running=0, pending=0, completed=0, failed=0),
+        top_species=[],
+        under_represented=[],
+    )
+
+
+@router.post("/refresh")
+def refresh_stats_now(db: Session = Depends(get_db)):
+    """Manually trigger a stats refresh."""
+    from app.workers.stats_tasks import refresh_stats
+    refresh_stats.delay()
+    return {"status": "refresh_queued"}
+
+
+@router.get("/sources")
+def get_source_stats(db: Session = Depends(get_db)):
+    """Get per-source breakdown."""
+    stats = db.query(
+        Image.source,
+        func.count(Image.id).label("total"),
+        func.sum(case((Image.status == "downloaded", 1), else_=0)).label("downloaded"),
+        func.sum(case((Image.status == "pending", 1), else_=0)).label("pending"),
+        func.sum(case((Image.status == "rejected", 1), else_=0)).label("rejected"),
+    ).group_by(Image.source).all()
+
+    return [
+        {
+            "source": s.source,
+            "total": s.total,
+            "downloaded": s.downloaded or 0,
+            "pending": s.pending or 0,
+            "rejected": s.rejected or 0,
+        }
+        for s in stats
+    ]
+
+
+@router.get("/species")
+def get_species_stats(
+    min_count: int = 0,
+    max_count: int = None,
+    db: Session = Depends(get_db),
+):
+    """Get per-species image counts."""
+    query = db.query(
+        Species.id,
+        Species.scientific_name,
+        Species.common_name,
+        Species.genus,
+        func.count(Image.id).label("image_count")
+    ).outerjoin(Image, (Image.species_id == Species.id) & (Image.status == "downloaded")
+    ).group_by(Species.id)
+
+    if min_count > 0:
+        query = query.having(func.count(Image.id) >= min_count)
+
+    if max_count is not None:
+        query = query.having(func.count(Image.id) <= max_count)
+
+    stats = query.order_by(func.count(Image.id).desc()).all()
+
+    return [
+        {
+            "id": s.id,
+            "scientific_name": s.scientific_name,
+            "common_name": s.common_name,
+            "genus": s.genus,
+            "image_count": s.image_count,
+        }
+        for s in stats
+    ]
+
+
+@router.get("/distribution")
+def get_image_distribution(db: Session = Depends(get_db)):
+    """Get distribution of images per species for ML training assessment.
+
+    Returns counts of species at various image thresholds to help
+    determine dataset quality for training image classifiers.
+    """
+    from sqlalchemy import text
+
+    # Get image counts per species using optimized raw SQL
+    distribution_sql = text("""
+        WITH species_counts AS (
+            SELECT
+                s.id,
+                COUNT(i.id) as cnt
+            FROM species s
+            LEFT JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
+            GROUP BY s.id
+        )
+        SELECT
+            COUNT(*) as total_species,
+            SUM(CASE WHEN cnt = 0 THEN 1 ELSE 0 END) as with_0,
+            SUM(CASE WHEN cnt >= 1 AND cnt < 10 THEN 1 ELSE 0 END) as with_1_9,
+            SUM(CASE WHEN cnt >= 10 AND cnt < 25 THEN 1 ELSE 0 END) as with_10_24,
+            SUM(CASE WHEN cnt >= 25 AND cnt < 50 THEN 1 ELSE 0 END) as with_25_49,
+            SUM(CASE WHEN cnt >= 50 AND cnt < 100 THEN 1 ELSE 0 END) as with_50_99,
+            SUM(CASE WHEN cnt >= 100 AND cnt < 200 THEN 1 ELSE 0 END) as with_100_199,
+            SUM(CASE WHEN cnt >= 200 THEN 1 ELSE 0 END) as with_200_plus,
+            SUM(CASE WHEN cnt >= 10 THEN 1 ELSE 0 END) as trainable_10,
+            SUM(CASE WHEN cnt >= 25 THEN 1 ELSE 0 END) as trainable_25,
+            SUM(CASE WHEN cnt >= 50 THEN 1 ELSE 0 END) as trainable_50,
+            SUM(CASE WHEN cnt >= 100 THEN 1 ELSE 0 END) as trainable_100,
+            AVG(cnt) as avg_images,
+            MAX(cnt) as max_images,
+            MIN(cnt) as min_images,
+            SUM(cnt) as total_images
+        FROM species_counts
+    """)
+
+    result = db.execute(distribution_sql).fetchone()
+
+    return {
+        "total_species": result[0] or 0,
+        "distribution": {
+            "0_images": result[1] or 0,
+            "1_to_9": result[2] or 0,
+            "10_to_24": result[3] or 0,
+            "25_to_49": result[4] or 0,
+            "50_to_99": result[5] or 0,
+            "100_to_199": result[6] or 0,
+            "200_plus": result[7] or 0,
+        },
+        "trainable_species": {
+            "min_10_images": result[8] or 0,
+            "min_25_images": result[9] or 0,
+            "min_50_images": result[10] or 0,
+            "min_100_images": result[11] or 0,
+        },
+        "summary": {
+            "avg_images_per_species": round(result[12] or 0, 1),
+            "max_images": result[13] or 0,
+            "min_images": result[14] or 0,
+            "total_downloaded_images": result[15] or 0,
+        },
+        "recommendations": {
+            "for_basic_model": f"{result[8] or 0} species with 10+ images",
+            "for_good_model": f"{result[10] or 0} species with 50+ images",
+            "for_excellent_model": f"{result[11] or 0} species with 100+ images",
+        }
+    }
@@ -0,0 +1,38 @@
+from pydantic_settings import BaseSettings
+from functools import lru_cache
+
+
+class Settings(BaseSettings):
+    # Database
+    database_url: str = "sqlite:////data/db/plants.sqlite"
+
+    # Redis
+    redis_url: str = "redis://redis:6379/0"
+
+    # Storage paths
+    images_path: str = "/data/images"
+    exports_path: str = "/data/exports"
+    imports_path: str = "/data/imports"
+    logs_path: str = "/data/logs"
+
+    # API Keys
+    flickr_api_key: str = ""
+    flickr_api_secret: str = ""
+    inaturalist_app_id: str = ""
+    inaturalist_app_secret: str = ""
+    trefle_api_key: str = ""
+
+    # Logging
+    log_level: str = "INFO"
+
+    # Celery
+    celery_concurrency: int = 4
+
+    class Config:
+        env_file = ".env"
+        extra = "ignore"
+
+
+@lru_cache()
+def get_settings() -> Settings:
+    return Settings()
@@ -0,0 +1,44 @@
+from sqlalchemy import create_engine, event
+from sqlalchemy.orm import sessionmaker, declarative_base
+from sqlalchemy.pool import StaticPool
+
+from app.config import get_settings
+
+# Settings are resolved once, when this module is imported.
+settings = get_settings()
+
+# SQLite-specific configuration
+# check_same_thread=False lets the connection be used from threads other than
+# the one that created it.
+connect_args = {"check_same_thread": False}
+
+# NOTE(review): StaticPool keeps exactly one connection and hands it to every
+# caller. SQLAlchemy describes it mainly for in-memory SQLite; with a file
+# database and concurrent requests all sessions share that connection -
+# confirm this is intended.
+engine = create_engine(
+    settings.database_url,
+    connect_args=connect_args,
+    poolclass=StaticPool,
+    echo=False,
+)
+# Enable WAL mode for better concurrent access
+@event.listens_for(engine, "connect")
+def set_sqlite_pragma(dbapi_connection, connection_record):
+    cursor = dbapi_connection.cursor()
+    cursor.execute("PRAGMA journal_mode=WAL")
+    cursor.execute("PRAGMA synchronous=NORMAL")
+    cursor.execute("PRAGMA foreign_keys=ON")
+    cursor.close()
+
+# Session factory bound to the engine. Sessions neither autocommit nor
+# autoflush, so callers commit explicitly and pending objects are not written
+# before queries run.
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+# Declarative base class that the ORM models inherit from.
+Base = declarative_base()
+
+
+def get_db():
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+def init_db():
+    """Create all tables."""
+    from app.models import species, image, job, api_key, export, cached_stats  # noqa
+    Base.metadata.create_all(bind=engine)
@@ -0,0 +1,95 @@
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+
+from app.config import get_settings
+from app.database import init_db
+from app.api import species, images, jobs, exports, stats, sources
+
+# Loaded at import time; not referenced again in the visible part of this module.
+settings = get_settings()
+
+# The FastAPI application object; title, description and version feed the
+# generated API documentation.
+app = FastAPI(
+    title="PlantGuideScraper API",
+    description="Web scraper interface for houseplant image collection",
+    version="1.0.0",
+)
+
+# CORS middleware
+# NOTE(review): every origin, method and header is allowed together with
+# credentials. That is convenient for development; confirm it is acceptable
+# for any deployment reachable from untrusted sites.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Include routers
+# Each feature router is mounted under its own /api/<name> prefix.
+app.include_router(species.router, prefix="/api/species", tags=["Species"])
+app.include_router(images.router, prefix="/api/images", tags=["Images"])
+app.include_router(jobs.router, prefix="/api/jobs", tags=["Jobs"])
+app.include_router(exports.router, prefix="/api/exports", tags=["Exports"])
+app.include_router(stats.router, prefix="/api/stats", tags=["Stats"])
+app.include_router(sources.router, prefix="/api/sources", tags=["Sources"])
+
+
+@app.on_event("startup")
+async def startup_event():
+    """Initialize database on startup."""
+    init_db()
+
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint."""
+    return {"status": "healthy", "service": "plant-scraper"}
+
+
+@app.get("/api/debug")
+async def debug_check():
+    """Debug endpoint - checks database connection."""
+    import time
+    from app.database import SessionLocal
+    from app.models import Species, Image
+
+    results = {"status": "checking", "checks": {}}
+
+    # Check 1: Can we create a session?
+    try:
+        start = time.time()
+        db = SessionLocal()
+        results["checks"]["session_create"] = {"ok": True, "ms": int((time.time() - start) * 1000)}
+    except Exception as e:
+        results["checks"]["session_create"] = {"ok": False, "error": str(e)}
+        results["status"] = "error"
+        return results
+
+    # Check 2: Simple query - count species
+    try:
+        start = time.time()
+        count = db.query(Species).count()
+        results["checks"]["species_count"] = {"ok": True, "count": count, "ms": int((time.time() - start) * 1000)}
+    except Exception as e:
+        results["checks"]["species_count"] = {"ok": False, "error": str(e)}
+        results["status"] = "error"
+        db.close()
+        return results
+
+    # Check 3: Count images
+    try:
+        start = time.time()
+        count = db.query(Image).count()
+        results["checks"]["image_count"] = {"ok": True, "count": count, "ms": int((time.time() - start) * 1000)}
+    except Exception as e:
+        results["checks"]["image_count"] = {"ok": False, "error": str(e)}
+        results["status"] = "error"
+        db.close()
+        return results
+
+    db.close()
+    results["status"] = "healthy"
+    return results
+
+
+@app.get("/")
+async def root():
+    """Root endpoint."""
+    return {"message": "PlantGuideScraper API", "docs": "/docs"}
@@ -0,0 +1,8 @@
+from app.models.species import Species
+from app.models.image import Image
+from app.models.job import Job
+from app.models.api_key import ApiKey
+from app.models.export import Export
+from app.models.cached_stats import CachedStats
+
+# Public names of the models package: one ORM class per model module above.
+__all__ = ["Species", "Image", "Job", "ApiKey", "Export", "CachedStats"]
@@ -0,0 +1,18 @@
+from sqlalchemy import Column, Integer, String, Float, Boolean
+
+from app.database import Base
+
+
+class ApiKey(Base):
+    """Stored credentials and settings for one image source.
+
+    ``source`` is unique, so there is at most one row per source.
+    """
+    __tablename__ = "api_keys"
+
+    id = Column(Integer, primary_key=True, index=True)
+    source = Column(String, unique=True, nullable=False)  # 'flickr', 'inaturalist', 'wikimedia', 'trefle'
+    api_key = Column(String, nullable=False)  # Also used as Client ID for OAuth sources
+    api_secret = Column(String, nullable=True)  # Also used as Client Secret for OAuth sources
+    access_token = Column(String, nullable=True)  # For OAuth sources like Wikimedia
+    rate_limit_per_sec = Column(Float, default=1.0)  # presumably max requests per second - confirm with the scrapers
+    enabled = Column(Boolean, default=True)
+
+    def __repr__(self):
+        # Deliberately leaves out the key, secret and token values
+        return f"<ApiKey(id={self.id}, source='{self.source}', enabled={self.enabled})>"
@@ -0,0 +1,14 @@
+from datetime import datetime
+from sqlalchemy import Column, Integer, String, Text, DateTime
+
+from app.database import Base
+
+
+class CachedStats(Base):
+    """Stores pre-calculated statistics updated by Celery beat.
+
+    Each row is a key/value pair; ``key`` is unique and ``value`` holds a
+    JSON-encoded payload.
+    """
+    __tablename__ = "cached_stats"
+
+    id = Column(Integer, primary_key=True, index=True)
+    key = Column(String(50), unique=True, nullable=False, index=True)
+    value = Column(Text, nullable=False)  # JSON-encoded stats
+    # Set on insert and refreshed on every update. datetime.utcnow returns a
+    # naive UTC timestamp (and is deprecated as of Python 3.12).
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
@@ -0,0 +1,24 @@
+from sqlalchemy import Column, Integer, String, Float, DateTime, Text, func
+
+from app.database import Base
+
+
+class Export(Base):
+    """A requested dataset export and the outcome of generating it."""
+    __tablename__ = "exports"
+
+    id = Column(Integer, primary_key=True, index=True)
+    name = Column(String, nullable=False)
+    filter_criteria = Column(Text, nullable=True)  # JSON: min_images, licenses, min_quality, species_ids
+    train_split = Column(Float, default=0.8)  # presumably the fraction of images assigned to training - confirm
+    status = Column(String, default="pending")  # pending, generating, completed, failed
+    # The fields below are nullable and start out empty
+    file_path = Column(String, nullable=True)
+    file_size = Column(Integer, nullable=True)
+    species_count = Column(Integer, nullable=True)
+    image_count = Column(Integer, nullable=True)
+    celery_task_id = Column(String, nullable=True)
+    created_at = Column(DateTime, server_default=func.now())  # timestamp assigned by the database
+    completed_at = Column(DateTime, nullable=True)
+    error_message = Column(Text, nullable=True)
+
+    def __repr__(self):
+        return f"<Export(id={self.id}, name='{self.name}', status='{self.status}')>"
@@ -0,0 +1,36 @@
+from sqlalchemy import Column, Integer, String, Float, DateTime, ForeignKey, func, UniqueConstraint, Index
+from sqlalchemy.orm import relationship
+
+from app.database import Base
+
+
+class Image(Base):
+    """One image record belonging to a species.
+
+    The pair (source, source_id) is unique, so the same source item cannot be
+    stored twice.
+    """
+    __tablename__ = "images"
+
+    id = Column(Integer, primary_key=True, index=True)
+    species_id = Column(Integer, ForeignKey("species.id"), nullable=False, index=True)
+    source = Column(String, nullable=False, index=True)
+    source_id = Column(String, nullable=True)  # identifier of the item at the source, when known
+    url = Column(String, nullable=False)
+    local_path = Column(String, nullable=True)
+    license = Column(String, nullable=False, index=True)
+    attribution = Column(String, nullable=True)
+    width = Column(Integer, nullable=True)
+    height = Column(Integer, nullable=True)
+    phash = Column(String, nullable=True, index=True)  # presumably a perceptual hash used for duplicate detection - confirm
+    quality_score = Column(Float, nullable=True)
+    status = Column(String, default="pending", index=True)  # pending, downloaded, rejected, deleted
+    created_at = Column(DateTime, server_default=func.now())  # timestamp assigned by the database
+
+    # Composite indexes for common query patterns
+    __table_args__ = (
+        UniqueConstraint("source", "source_id", name="uq_source_source_id"),
+        Index("ix_images_species_status", "species_id", "status"),  # For counting images per species by status
+        Index("ix_images_status_created", "status", "created_at"),  # For listing images by status
+    )
+
+    # Relationships
+    # Counterpart of Species.images
+    species = relationship("Species", back_populates="images")
+
+    def __repr__(self):
+        return f"<Image(id={self.id}, source='{self.source}', status='{self.status}')>"
@@ -0,0 +1,27 @@
+from sqlalchemy import Column, Integer, String, DateTime, Text, Boolean, func
+
+from app.database import Base
+
+
+class Job(Base):
+    """A scraping job for one source, with its progress counters and outcome."""
+    __tablename__ = "jobs"
+
+    id = Column(Integer, primary_key=True, index=True)
+    name = Column(String, nullable=False)
+    source = Column(String, nullable=False)
+    species_filter = Column(Text, nullable=True)  # JSON array of species IDs or NULL for all
+    only_without_images = Column(Boolean, default=False)  # If True, only scrape species with 0 images
+    max_images = Column(Integer, nullable=True)  # If set, only scrape species with fewer than N images
+    status = Column(String, default="pending", index=True)  # pending, running, paused, completed, failed
+    # Progress counters all start at 0
+    progress_current = Column(Integer, default=0)
+    progress_total = Column(Integer, default=0)
+    images_downloaded = Column(Integer, default=0)
+    images_rejected = Column(Integer, default=0)
+    celery_task_id = Column(String, nullable=True)
+    started_at = Column(DateTime, nullable=True)
+    completed_at = Column(DateTime, nullable=True)
+    error_message = Column(Text, nullable=True)
+    created_at = Column(DateTime, server_default=func.now())  # timestamp assigned by the database
+
+    def __repr__(self):
+        return f"<Job(id={self.id}, name='{self.name}', status='{self.status}')>"
@@ -0,0 +1,21 @@
+from sqlalchemy import Column, Integer, String, DateTime, func
+from sqlalchemy.orm import relationship
+
+from app.database import Base
+
+
+class Species(Base):
+    """A plant species, identified by its unique scientific name."""
+    __tablename__ = "species"
+
+    id = Column(Integer, primary_key=True, index=True)
+    scientific_name = Column(String, unique=True, nullable=False, index=True)
+    common_name = Column(String, nullable=True)
+    genus = Column(String, nullable=True, index=True)
+    family = Column(String, nullable=True)
+    created_at = Column(DateTime, server_default=func.now())  # timestamp assigned by the database
+
+    # Relationships
+    # "all, delete-orphan": deleting a species through the ORM also deletes
+    # its images, as does removing an image from this collection.
+    images = relationship("Image", back_populates="species", cascade="all, delete-orphan")
+
+    def __repr__(self):
+        return f"<Species(id={self.id}, scientific_name='{self.scientific_name}')>"
@@ -0,0 +1,15 @@
+from app.schemas.species import SpeciesCreate, SpeciesUpdate, SpeciesResponse, SpeciesListResponse
+from app.schemas.image import ImageResponse, ImageListResponse, ImageFilter
+from app.schemas.job import JobCreate, JobResponse, JobListResponse
+from app.schemas.api_key import ApiKeyCreate, ApiKeyUpdate, ApiKeyResponse
+from app.schemas.export import ExportCreate, ExportResponse, ExportListResponse
+from app.schemas.stats import StatsResponse, SourceStats, SpeciesStats
+
+# Public names of the schemas package, one line per schema module, matching
+# the imports above.
+__all__ = [
+    "SpeciesCreate", "SpeciesUpdate", "SpeciesResponse", "SpeciesListResponse",
+    "ImageResponse", "ImageListResponse", "ImageFilter",
+    "JobCreate", "JobResponse", "JobListResponse",
+    "ApiKeyCreate", "ApiKeyUpdate", "ApiKeyResponse",
+    "ExportCreate", "ExportResponse", "ExportListResponse",
+    "StatsResponse", "SourceStats", "SpeciesStats",
+]
@@ -0,0 +1,36 @@
+from pydantic import BaseModel
+from typing import Optional
+
+
+# Fields shared by the API-key schemas: the source name plus its credentials
+# and settings. Only ``source`` is required.
+class ApiKeyBase(BaseModel):
+    source: str
+    api_key: Optional[str] = None  # Optional for no-auth sources, used as Client ID for OAuth
+    api_secret: Optional[str] = None  # Also used as Client Secret for OAuth sources
+    access_token: Optional[str] = None  # For OAuth sources like Wikimedia
+    rate_limit_per_sec: float = 1.0
+    enabled: bool = True
+
+
+# Creation payload; adds nothing to ApiKeyBase.
+class ApiKeyCreate(ApiKeyBase):
+    pass
+
+
+# Partial-update payload: every field is optional and defaults to None, so a
+# request may carry any subset of them. The source name is not part of it.
+class ApiKeyUpdate(BaseModel):
+    api_key: Optional[str] = None
+    api_secret: Optional[str] = None
+    access_token: Optional[str] = None
+    rate_limit_per_sec: Optional[float] = None
+    enabled: Optional[bool] = None
+
+
+class ApiKeyResponse(BaseModel):
+    id: int
+    source: str
+    api_key_masked: str  # Show only last 4 chars
+    has_secret: bool
+    has_access_token: bool
+    rate_limit_per_sec: float
+    enabled: bool
+
+    class Config:
+        from_attributes = True
@@ -0,0 +1,45 @@
+from pydantic import BaseModel
+from datetime import datetime
+from typing import Optional, List
+
+
+class ExportFilter(BaseModel):
+    """Criteria used to select which images are included in an export."""
+
+    min_images_per_species: int = 100
+    licenses: Optional[List[str]] = None  # None means all
+    min_quality: Optional[float] = None  # None means no quality filter
+    species_ids: Optional[List[int]] = None  # None means all
+
+
+class ExportCreate(BaseModel):
+    """Request body describing an export to create or preview."""
+
+    name: str
+    filter_criteria: ExportFilter
+    # NOTE(review): presumably the training fraction of a train/validation split - confirm in the export task
+    train_split: float = 0.8
+
+
+class ExportResponse(BaseModel):
+    id: int
+    name: str
+    filter_criteria: Optional[str] = None
+    train_split: float
+    status: str
+    file_path: Optional[str] = None
+    file_size: Optional[int] = None
+    species_count: Optional[int] = None
+    image_count: Optional[int] = None
+    created_at: datetime
+    completed_at: Optional[datetime] = None
+    error_message: Optional[str] = None
+
+    class Config:
+        from_attributes = True
+
+
+class ExportListResponse(BaseModel):
+    """A list of exports together with a total count."""
+
+    items: List[ExportResponse]
+    total: int
+
+
+class ExportPreview(BaseModel):
+    """Summary returned when previewing an export without creating it."""
+
+    species_count: int
+    image_count: int
+    estimated_size_mb: float
@@ -0,0 +1,47 @@
+from pydantic import BaseModel
+from datetime import datetime
+from typing import Optional, List
+
+
+class ImageBase(BaseModel):
+    """Minimal set of fields identifying an image and where it came from."""
+
+    species_id: int
+    source: str
+    url: str
+    license: str
+
+
+class ImageResponse(BaseModel):
+    id: int
+    species_id: int
+    species_name: Optional[str] = None
+    source: str
+    source_id: Optional[str] = None
+    url: str
+    local_path: Optional[str] = None
+    license: str
+    attribution: Optional[str] = None
+    width: Optional[int] = None
+    height: Optional[int] = None
+    quality_score: Optional[float] = None
+    status: str
+    created_at: datetime
+
+    class Config:
+        from_attributes = True
+
+
+class ImageListResponse(BaseModel):
+    """One page of images plus pagination metadata."""
+
+    items: List[ImageResponse]
+    total: int
+    page: int
+    page_size: int
+    pages: int
+
+
+class ImageFilter(BaseModel):
+    """Optional filters for listing images; every field defaults to None."""
+
+    species_id: Optional[int] = None
+    source: Optional[str] = None
+    license: Optional[str] = None
+    status: Optional[str] = None
+    min_quality: Optional[float] = None
+    search: Optional[str] = None
@@ -0,0 +1,35 @@
+from pydantic import BaseModel
+from datetime import datetime
+from typing import Optional, List
+
+
+class JobCreate(BaseModel):
+    """Request body for creating a scrape job against one source."""
+
+    name: str
+    source: str
+    species_ids: Optional[List[int]] = None  # None means all species
+    only_without_images: bool = False  # If True, only scrape species with 0 images
+    max_images: Optional[int] = None  # If set, only scrape species with fewer than N images
+
+
+class JobResponse(BaseModel):
+    id: int
+    name: str
+    source: str
+    species_filter: Optional[str] = None
+    status: str
+    progress_current: int
+    progress_total: int
+    images_downloaded: int
+    images_rejected: int
+    started_at: Optional[datetime] = None
+    completed_at: Optional[datetime] = None
+    error_message: Optional[str] = None
+    created_at: datetime
+
+    class Config:
+        from_attributes = True
+
+
+class JobListResponse(BaseModel):
+    """A list of jobs together with a total count."""
+
+    items: List[JobResponse]
+    total: int
@@ -0,0 +1,44 @@
+from pydantic import BaseModel
+from datetime import datetime
+from typing import Optional, List
+
+
+class SpeciesBase(BaseModel):
+    """Naming fields shared by species payloads; only scientific_name is required."""
+
+    scientific_name: str
+    common_name: Optional[str] = None
+    genus: Optional[str] = None
+    family: Optional[str] = None
+
+
+class SpeciesCreate(SpeciesBase):
+    pass
+
+
+class SpeciesUpdate(BaseModel):
+    """Payload for updating a species; every field is optional."""
+
+    # NOTE(review): presumably fields left as None are not modified - confirm in the route handler
+    scientific_name: Optional[str] = None
+    common_name: Optional[str] = None
+    genus: Optional[str] = None
+    family: Optional[str] = None
+
+
+class SpeciesResponse(SpeciesBase):
+    id: int
+    created_at: datetime
+    image_count: int = 0
+
+    class Config:
+        from_attributes = True
+
+
+class SpeciesListResponse(BaseModel):
+    """One page of species plus pagination metadata."""
+
+    items: List[SpeciesResponse]
+    total: int
+    page: int
+    page_size: int
+    pages: int
+
+
+class SpeciesImportResponse(BaseModel):
+    """Outcome of a species import: counts plus a list of error messages."""
+
+    imported: int
+    skipped: int
+    errors: List[str]
@@ -0,0 +1,43 @@
+from pydantic import BaseModel
+from typing import List, Dict
+
+
+class SourceStats(BaseModel):
+    """Image counts for a single source, broken down by status."""
+
+    source: str
+    image_count: int
+    downloaded: int
+    pending: int
+    rejected: int
+
+
+class LicenseStats(BaseModel):
+    """Count associated with a single license code."""
+
+    license: str
+    count: int
+
+
+class SpeciesStats(BaseModel):
+    """Image count for a single species."""
+
+    id: int
+    scientific_name: str
+    # Nullable but declared without a default, so a value (possibly None) must be supplied
+    common_name: str | None
+    image_count: int
+
+
+class JobStats(BaseModel):
+    """Number of jobs in each status."""
+
+    running: int
+    pending: int
+    completed: int
+    failed: int
+
+
+class StatsResponse(BaseModel):
+    """Aggregate statistics: totals plus per-source, per-license, job and species breakdowns."""
+
+    total_species: int
+    total_images: int
+    images_downloaded: int
+    images_pending: int
+    images_rejected: int
+    disk_usage_mb: float
+    sources: List[SourceStats]
+    licenses: List[LicenseStats]
+    jobs: JobStats
+    top_species: List[SpeciesStats]
+    under_represented: List[SpeciesStats]  # Species with < 100 images
@@ -0,0 +1,41 @@
+from typing import Optional
+
+from app.scrapers.base import BaseScraper
+from app.scrapers.inaturalist import INaturalistScraper
+from app.scrapers.flickr import FlickrScraper
+from app.scrapers.wikimedia import WikimediaScraper
+from app.scrapers.trefle import TrefleScraper
+from app.scrapers.gbif import GBIFScraper
+from app.scrapers.duckduckgo import DuckDuckGoScraper
+from app.scrapers.bing import BingScraper
+
+
+def get_scraper(source: str) -> Optional[BaseScraper]:
+    """Get scraper instance for a source."""
+    scrapers = {
+        "inaturalist": INaturalistScraper,
+        "flickr": FlickrScraper,
+        "wikimedia": WikimediaScraper,
+        "trefle": TrefleScraper,
+        "gbif": GBIFScraper,
+        "duckduckgo": DuckDuckGoScraper,
+        "bing": BingScraper,
+    }
+
+    scraper_class = scrapers.get(source)
+    if scraper_class:
+        return scraper_class()
+    return None
+
+
+# Public API of this package: the factory function plus the scraper classes
+# imported above.
+__all__ = [
+    "get_scraper",
+    "BaseScraper",
+    "INaturalistScraper",
+    "FlickrScraper",
+    "WikimediaScraper",
+    "TrefleScraper",
+    "GBIFScraper",
+    "DuckDuckGoScraper",
+    "BingScraper",
+]
@@ -0,0 +1,57 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional
+import logging
+
+from sqlalchemy.orm import Session
+
+from app.models import Species, ApiKey
+
+
+class BaseScraper(ABC):
+    """Base class for all image scrapers."""
+
+    name: str = "base"
+    requires_api_key: bool = True
+
+    @abstractmethod
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """
+        Scrape images for a species.
+
+        Args:
+            species: The species to scrape images for
+            db: Database session
+            logger: Optional logger for debugging
+
+        Returns:
+            Dict with 'downloaded' and 'rejected' counts
+        """
+        pass
+
+    @abstractmethod
+    def test_connection(self, api_key: ApiKey) -> str:
+        """
+        Test API connection.
+
+        Args:
+            api_key: The API key configuration
+
+        Returns:
+            Success message
+
+        Raises:
+            Exception if connection fails
+        """
+        pass
+
+    def get_api_key(self, db: Session) -> ApiKey:
+        """Get API key for this scraper."""
+        return db.query(ApiKey).filter(
+            ApiKey.source == self.name,
+            ApiKey.enabled == True
+        ).first()
@@ -0,0 +1,228 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class BHLScraper(BaseScraper):
+    """Scraper for Biodiversity Heritage Library (BHL) images.
+
+    BHL provides access to digitized biodiversity literature and illustrations.
+    Most content is public domain (pre-1927) or CC-licensed.
+
+    Note: BHL images are primarily historical botanical illustrations,
+    which may differ from photographs but are valuable for training.
+    """
+
+    name = "bhl"
+    requires_api_key = True  # BHL requires free API key
+
+    BASE_URL = "https://www.biodiversitylibrary.org/api3"
+
+    HEADERS = {
+        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
+        "Accept": "application/json",
+    }
+
+    # BHL content is mostly public domain
+    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA", "PD"}
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from BHL for a species."""
+        api_key = self.get_api_key(db)
+        if not api_key:
+            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
+
+        rate_limit = api_key.rate_limit_per_sec if api_key else 0.5
+
+        downloaded = 0
+        rejected = 0
+
+        def log(level: str, msg: str):
+            if logger:
+                getattr(logger, level)(msg)
+
+        try:
+            # Disable SSL verification - some Docker environments lack proper CA certificates
+            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
+                # Search for name in BHL
+                search_response = client.get(
+                    f"{self.BASE_URL}",
+                    params={
+                        "op": "NameSearch",
+                        "name": species.scientific_name,
+                        "format": "json",
+                        "apikey": api_key.api_key,
+                    },
+                )
+                search_response.raise_for_status()
+                search_data = search_response.json()
+
+                results = search_data.get("Result", [])
+                if not results:
+                    log("info", f"  Species not found in BHL: {species.scientific_name}")
+                    return {"downloaded": 0, "rejected": 0}
+
+                time.sleep(1.0 / rate_limit)
+
+                # Get pages with illustrations for each name result
+                for name_result in results[:5]:  # Limit to top 5 matches
+                    name_bank_id = name_result.get("NameBankID")
+                    if not name_bank_id:
+                        continue
+
+                    # Get publications with this name
+                    pub_response = client.get(
+                        f"{self.BASE_URL}",
+                        params={
+                            "op": "NameGetDetail",
+                            "namebankid": name_bank_id,
+                            "format": "json",
+                            "apikey": api_key.api_key,
+                        },
+                    )
+                    pub_response.raise_for_status()
+                    pub_data = pub_response.json()
+
+                    time.sleep(1.0 / rate_limit)
+
+                    # Extract titles and get page images
+                    for title in pub_data.get("Result", []):
+                        title_id = title.get("TitleID")
+                        if not title_id:
+                            continue
+
+                        # Get pages for this title
+                        pages_response = client.get(
+                            f"{self.BASE_URL}",
+                            params={
+                                "op": "GetPageMetadata",
+                                "titleid": title_id,
+                                "format": "json",
+                                "apikey": api_key.api_key,
+                                "ocr": "false",
+                                "names": "false",
+                            },
+                        )
+
+                        if pages_response.status_code != 200:
+                            continue
+
+                        pages_data = pages_response.json()
+                        pages = pages_data.get("Result", [])
+
+                        time.sleep(1.0 / rate_limit)
+
+                        # Look for pages that are likely illustrations
+                        for page in pages[:100]:  # Limit pages per title
+                            page_types = page.get("PageTypes", [])
+
+                            # Only get illustration/plate pages
+                            is_illustration = any(
+                                pt.get("PageTypeName", "").lower() in ["illustration", "plate", "figure", "map"]
+                                for pt in page_types
+                            ) if page_types else False
+
+                            if not is_illustration and page_types:
+                                continue
+
+                            page_id = page.get("PageID")
+                            if not page_id:
+                                continue
+
+                            # Construct image URL
+                            # BHL provides multiple image sizes
+                            image_url = f"https://www.biodiversitylibrary.org/pageimage/{page_id}"
+
+                            # Check if already exists
+                            source_id = str(page_id)
+                            existing = db.query(Image).filter(
+                                Image.source == self.name,
+                                Image.source_id == source_id,
+                            ).first()
+
+                            if existing:
+                                continue
+
+                            # Determine license - BHL content is usually public domain
+                            item_url = page.get("ItemUrl", "")
+                            year = None
+                            try:
+                                # Try to extract year from ItemUrl or other fields
+                                if "Year" in page:
+                                    year = int(page.get("Year", 0))
+                            except (ValueError, TypeError):
+                                pass
+
+                            # Content before 1927 is public domain in US
+                            if year and year < 1927:
+                                license_code = "PD"
+                            else:
+                                license_code = "CC0"  # BHL default for older works
+
+                            # Build attribution
+                            title_name = title.get("ShortTitle", title.get("FullTitle", "Unknown"))
+                            attribution = f"From '{title_name}' via Biodiversity Heritage Library ({license_code})"
+
+                            # Create image record
+                            image = Image(
+                                species_id=species.id,
+                                source=self.name,
+                                source_id=source_id,
+                                url=image_url,
+                                license=license_code,
+                                attribution=attribution,
+                                status="pending",
+                            )
+                            db.add(image)
+                            db.commit()
+
+                            # Queue for download
+                            download_and_process_image.delay(image.id)
+                            downloaded += 1
+
+                            # Limit total per species
+                            if downloaded >= 50:
+                                break
+
+                        if downloaded >= 50:
+                            break
+
+                    if downloaded >= 50:
+                        break
+
+        except httpx.HTTPStatusError as e:
+            log("error", f"  HTTP error for {species.scientific_name}: {e.response.status_code}")
+        except Exception as e:
+            log("error", f"  Error scraping BHL for {species.scientific_name}: {e}")
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test BHL API connection."""
+        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
+            response = client.get(
+                f"{self.BASE_URL}",
+                params={
+                    "op": "NameSearch",
+                    "name": "Rosa",
+                    "format": "json",
+                    "apikey": api_key.api_key,
+                },
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        results = data.get("Result", [])
+        return f"BHL API connection successful ({len(results)} results for 'Rosa')"
@@ -0,0 +1,135 @@
+import hashlib
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class BingScraper(BaseScraper):
+    """Scraper for Bing Image Search v7 API (Azure Cognitive Services)."""
+
+    name = "bing"
+    requires_api_key = True
+
+    BASE_URL = "https://api.bing.microsoft.com/v7.0/images/search"
+
+    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"
+
+    LICENSE_MAP = {
+        "Public": "CC0",
+        "Share": "CC-BY-SA",
+        "ShareCommercially": "CC-BY",
+        "Modify": "CC-BY-SA",
+        "ModifyCommercially": "CC-BY",
+    }
+
+    def _build_queries(self, species: Species) -> list[str]:
+        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
+        if species.common_name:
+            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
+        return queries
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None,
+    ) -> Dict[str, int]:
+        api_key = self.get_api_key(db)
+        if not api_key:
+            return {"downloaded": 0, "rejected": 0}
+
+        rate_limit = api_key.rate_limit_per_sec or 3.0
+        downloaded = 0
+        rejected = 0
+        seen_urls = set()
+
+        headers = {
+            "Ocp-Apim-Subscription-Key": api_key.api_key,
+        }
+
+        try:
+            queries = self._build_queries(species)
+
+            with httpx.Client(timeout=30, headers=headers) as client:
+                for query in queries:
+                    params = {
+                        "q": query,
+                        "imageType": "Photo",
+                        "license": "ShareCommercially",
+                        "count": 50,
+                    }
+
+                    response = client.get(self.BASE_URL, params=params)
+                    response.raise_for_status()
+                    data = response.json()
+
+                    for result in data.get("value", []):
+                        url = result.get("contentUrl")
+                        if not url or url in seen_urls:
+                            continue
+                        seen_urls.add(url)
+
+                        # Use Bing's imageId, fall back to md5 hash
+                        source_id = result.get("imageId") or hashlib.md5(url.encode()).hexdigest()[:16]
+
+                        existing = db.query(Image).filter(
+                            Image.source == self.name,
+                            Image.source_id == source_id,
+                        ).first()
+
+                        if existing:
+                            continue
+
+                        # Map license
+                        bing_license = result.get("license", "")
+                        license_code = self.LICENSE_MAP.get(bing_license, "UNKNOWN")
+
+                        host = result.get("hostPageDisplayUrl", "")
+                        attribution = f"via Bing ({host})" if host else "via Bing Image Search"
+
+                        image = Image(
+                            species_id=species.id,
+                            source=self.name,
+                            source_id=source_id,
+                            url=url,
+                            width=result.get("width"),
+                            height=result.get("height"),
+                            license=license_code,
+                            attribution=attribution,
+                            status="pending",
+                        )
+                        db.add(image)
+                        db.commit()
+
+                        download_and_process_image.delay(image.id)
+                        downloaded += 1
+
+                    time.sleep(1.0 / rate_limit)
+
+        except Exception as e:
+            if logger:
+                logger.error(f"Error scraping Bing for {species.scientific_name}: {e}")
+            else:
+                print(f"Error scraping Bing for {species.scientific_name}: {e}")
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        headers = {"Ocp-Apim-Subscription-Key": api_key.api_key}
+        with httpx.Client(timeout=10, headers=headers) as client:
+            response = client.get(
+                self.BASE_URL,
+                params={"q": "Monstera deliciosa plant", "count": 1},
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        count = data.get("totalEstimatedMatches", 0)
+        return f"Bing Image Search working ({count:,} estimated matches)"
@@ -0,0 +1,101 @@
+import hashlib
+import time
+import logging
+from typing import Dict, Optional
+
+from duckduckgo_search import DDGS
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class DuckDuckGoScraper(BaseScraper):
+    """Scraper for DuckDuckGo image search. No API key required."""
+
+    name = "duckduckgo"
+    requires_api_key = False
+
+    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"
+
+    def _build_queries(self, species: Species) -> list[str]:
+        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
+        if species.common_name:
+            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
+        return queries
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None,
+    ) -> Dict[str, int]:
+        api_key = self.get_api_key(db)
+        rate_limit = api_key.rate_limit_per_sec if api_key else 0.5
+
+        downloaded = 0
+        rejected = 0
+        seen_urls = set()
+
+        try:
+            queries = self._build_queries(species)
+
+            with DDGS() as ddgs:
+                for query in queries:
+                    results = ddgs.images(
+                        keywords=query,
+                        type_image="photo",
+                        max_results=50,
+                    )
+
+                    for result in results:
+                        url = result.get("image")
+                        if not url or url in seen_urls:
+                            continue
+                        seen_urls.add(url)
+
+                        source_id = hashlib.md5(url.encode()).hexdigest()[:16]
+
+                        # Check if already exists
+                        existing = db.query(Image).filter(
+                            Image.source == self.name,
+                            Image.source_id == source_id,
+                        ).first()
+
+                        if existing:
+                            continue
+
+                        title = result.get("title", "")
+                        attribution = f"{title} via DuckDuckGo" if title else "via DuckDuckGo"
+
+                        image = Image(
+                            species_id=species.id,
+                            source=self.name,
+                            source_id=source_id,
+                            url=url,
+                            license="UNKNOWN",
+                            attribution=attribution,
+                            status="pending",
+                        )
+                        db.add(image)
+                        db.commit()
+
+                        download_and_process_image.delay(image.id)
+                        downloaded += 1
+
+                    time.sleep(1.0 / rate_limit)
+
+        except Exception as e:
+            if logger:
+                logger.error(f"Error scraping DuckDuckGo for {species.scientific_name}: {e}")
+            else:
+                print(f"Error scraping DuckDuckGo for {species.scientific_name}: {e}")
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        with DDGS() as ddgs:
+            results = ddgs.images(keywords="Monstera deliciosa plant", max_results=1)
+            count = len(list(results))
+        return f"DuckDuckGo search working ({count} test result)"
@@ -0,0 +1,226 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class EOLScraper(BaseScraper):
+    """Scraper for Encyclopedia of Life (EOL) images.
+
+    EOL aggregates biodiversity data from many sources and provides
+    a free API with no authentication required.
+    """
+
+    name = "eol"
+    requires_api_key = False
+
+    BASE_URL = "https://eol.org/api"
+
+    HEADERS = {
+        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
+        "Accept": "application/json",
+    }
+
+    # Map EOL license URLs to short codes
+    LICENSE_MAP = {
+        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
+        "http://creativecommons.org/publicdomain/mark/1.0/": "CC0",
+        "http://creativecommons.org/licenses/by/2.0/": "CC-BY",
+        "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+        "http://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
+        "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+        "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
+        "https://creativecommons.org/publicdomain/mark/1.0/": "CC0",
+        "https://creativecommons.org/licenses/by/2.0/": "CC-BY",
+        "https://creativecommons.org/licenses/by/3.0/": "CC-BY",
+        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
+        "https://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
+        "https://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+        "https://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+        "pd": "CC0",  # Public domain
+        "public domain": "CC0",
+    }
+
+    # Commercial-safe licenses
+    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA"}
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from EOL for a species."""
+        api_key = self.get_api_key(db)
+        rate_limit = api_key.rate_limit_per_sec if api_key else 0.5
+
+        downloaded = 0
+        rejected = 0
+
+        def log(level: str, msg: str):
+            if logger:
+                getattr(logger, level)(msg)
+
+        try:
+            # Disable SSL verification - EOL is a trusted source and some Docker
+            # environments lack proper CA certificates
+            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
+                # Step 1: Search for the species
+                search_response = client.get(
+                    f"{self.BASE_URL}/search/1.0.json",
+                    params={
+                        "q": species.scientific_name,
+                        "page": 1,
+                        "exact": "true",
+                    },
+                )
+                search_response.raise_for_status()
+                search_data = search_response.json()
+
+                results = search_data.get("results", [])
+                if not results:
+                    log("info", f"  Species not found in EOL: {species.scientific_name}")
+                    return {"downloaded": 0, "rejected": 0}
+
+                # Get the EOL page ID
+                eol_page_id = results[0].get("id")
+                if not eol_page_id:
+                    return {"downloaded": 0, "rejected": 0}
+
+                time.sleep(1.0 / rate_limit)
+
+                # Step 2: Get page details with images
+                page_response = client.get(
+                    f"{self.BASE_URL}/pages/1.0/{eol_page_id}.json",
+                    params={
+                        "images_per_page": 75,
+                        "images_page": 1,
+                        "videos_per_page": 0,
+                        "sounds_per_page": 0,
+                        "maps_per_page": 0,
+                        "texts_per_page": 0,
+                        "details": "true",
+                        "licenses": "cc-by|cc-by-sa|pd|cc-by-nc",
+                    },
+                )
+                page_response.raise_for_status()
+                page_data = page_response.json()
+
+                data_objects = page_data.get("dataObjects", [])
+                log("debug", f"  Found {len(data_objects)} media objects")
+
+                for obj in data_objects:
+                    # Only process images
+                    media_type = obj.get("dataType", "")
+                    if "image" not in media_type.lower() and "stillimage" not in media_type.lower():
+                        continue
+
+                    # Get image URL
+                    image_url = obj.get("eolMediaURL") or obj.get("mediaURL")
+                    if not image_url:
+                        rejected += 1
+                        continue
+
+                    # Check license
+                    license_url = obj.get("license", "").lower()
+                    license_code = None
+
+                    # Try to match license URL
+                    for pattern, code in self.LICENSE_MAP.items():
+                        if pattern in license_url:
+                            license_code = code
+                            break
+
+                    if not license_code:
+                        # Check for NC licenses which we reject
+                        if "-nc" in license_url:
+                            rejected += 1
+                            continue
+                        # Unknown license, skip
+                        log("debug", f"  Rejected: unknown license {license_url}")
+                        rejected += 1
+                        continue
+
+                    if license_code not in self.ALLOWED_LICENSES:
+                        rejected += 1
+                        continue
+
+                    # Create unique source ID
+                    source_id = str(obj.get("dataObjectVersionID") or obj.get("identifier") or hash(image_url))
+
+                    # Check if already exists
+                    existing = db.query(Image).filter(
+                        Image.source == self.name,
+                        Image.source_id == source_id,
+                    ).first()
+
+                    if existing:
+                        continue
+
+                    # Build attribution
+                    agents = obj.get("agents", [])
+                    photographer = None
+                    rights_holder = None
+
+                    for agent in agents:
+                        role = agent.get("role", "").lower()
+                        name = agent.get("full_name", "")
+                        if role == "photographer":
+                            photographer = name
+                        elif role == "owner" or role == "rights holder":
+                            rights_holder = name
+
+                    attribution_parts = []
+                    if photographer:
+                        attribution_parts.append(f"Photo by {photographer}")
+                    if rights_holder and rights_holder != photographer:
+                        attribution_parts.append(f"Rights: {rights_holder}")
+                    attribution_parts.append(f"via EOL ({license_code})")
+                    attribution = " | ".join(attribution_parts)
+
+                    # Create image record
+                    image = Image(
+                        species_id=species.id,
+                        source=self.name,
+                        source_id=source_id,
+                        url=image_url,
+                        license=license_code,
+                        attribution=attribution,
+                        status="pending",
+                    )
+                    db.add(image)
+                    db.commit()
+
+                    # Queue for download
+                    download_and_process_image.delay(image.id)
+                    downloaded += 1
+
+                time.sleep(1.0 / rate_limit)
+
+        except httpx.HTTPStatusError as e:
+            log("error", f"  HTTP error for {species.scientific_name}: {e.response.status_code}")
+        except Exception as e:
+            log("error", f"  Error scraping EOL for {species.scientific_name}: {e}")
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test EOL API connection."""
+        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
+            response = client.get(
+                f"{self.BASE_URL}/search/1.0.json",
+                params={"q": "Rosa", "page": 1},
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        total = data.get("totalResults", 0)
+        return f"EOL API connection successful ({total} results for 'Rosa')"
@@ -0,0 +1,146 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class FlickrScraper(BaseScraper):
+    """Scraper for Flickr images via their API.
+
+    Runs one ``flickr.photos.search`` query per species, keeps photos whose
+    license id appears in ``LICENSE_MAP``, stores every new photo as a pending
+    ``Image`` row and queues it for download.
+    """
+
+    name = "flickr"
+    requires_api_key = True
+
+    BASE_URL = "https://api.flickr.com/services/rest/"
+
+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
+    }
+
+    # Commercial-safe license IDs
+    # 4 = CC BY 2.0, 7 = No known copyright, 8 = US Gov, 9 = CC0
+    ALLOWED_LICENSES = "4,7,8,9"
+
+    # Flickr license id -> short code stored on the Image row
+    LICENSE_MAP = {
+        "4": "CC-BY",
+        "7": "NO-KNOWN-COPYRIGHT",
+        "8": "US-GOV",
+        "9": "CC0",
+    }
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from Flickr for a species.
+
+        Args:
+            species: Species whose ``scientific_name`` is used as search text.
+            db: Session used for the duplicate check and to store new images.
+            logger: Optional logger; the module logger is used when omitted.
+
+        Returns:
+            Dict with ``downloaded`` (images queued) and ``rejected`` counts.
+            An ``error`` entry is added when no API key is configured or
+            Flickr reports a non-ok ``stat``.
+        """
+        log = logger if logger is not None else logging.getLogger(__name__)
+
+        api_key = self.get_api_key(db)
+        if not api_key:
+            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
+
+        # A missing or non-positive rate would make 1.0 / rate fail; fall back
+        # to one request per second in that case.
+        rate_limit = api_key.rate_limit_per_sec
+        delay = 1.0 / rate_limit if rate_limit and rate_limit > 0 else 1.0
+
+        downloaded = 0
+        rejected = 0
+
+        try:
+            params = {
+                "method": "flickr.photos.search",
+                "api_key": api_key.api_key,
+                "text": species.scientific_name,
+                "license": self.ALLOWED_LICENSES,
+                "content_type": 1,  # Photos only
+                "media": "photos",
+                "extras": "license,url_l,url_o,owner_name",
+                "per_page": 100,
+                "format": "json",
+                "nojsoncallback": 1,
+            }
+
+            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
+                response = client.get(self.BASE_URL, params=params)
+                response.raise_for_status()
+                data = response.json()
+
+            if data.get("stat") != "ok":
+                return {"downloaded": 0, "rejected": 0, "error": data.get("message")}
+
+            photos = data.get("photos", {}).get("photo", [])
+
+            for photo in photos:
+                # Get best URL (original or large)
+                url = photo.get("url_o") or photo.get("url_l")
+                if not url:
+                    rejected += 1
+                    continue
+
+                # Photos with a license id outside LICENSE_MAP are rejected
+                license_code = self.LICENSE_MAP.get(str(photo.get("license", "")))
+                if license_code is None:
+                    rejected += 1
+                    continue
+
+                # Without an id the record could never be de-duplicated
+                photo_id = photo.get("id")
+                if photo_id is None:
+                    rejected += 1
+                    continue
+                source_id = str(photo_id)
+
+                # Check if already exists
+                existing = db.query(Image).filter(
+                    Image.source == self.name,
+                    Image.source_id == source_id,
+                ).first()
+
+                if existing:
+                    continue
+
+                # Build attribution
+                owner = photo.get("ownername", "Unknown")
+                attribution = f"Photo by {owner} on Flickr ({license_code})"
+
+                # Create image record
+                image = Image(
+                    species_id=species.id,
+                    source=self.name,
+                    source_id=source_id,
+                    url=url,
+                    license=license_code,
+                    attribution=attribution,
+                    status="pending",
+                )
+                db.add(image)
+                db.commit()
+
+                # Queue for download
+                download_and_process_image.delay(image.id)
+                downloaded += 1
+
+            # Rate limiting
+            time.sleep(delay)
+
+        except httpx.HTTPStatusError as e:
+            # str(e) embeds the request URL, which carries the API key as a
+            # query parameter, so only the status code is logged.
+            db.rollback()
+            log.error(
+                "HTTP error scraping Flickr for %s: %s",
+                species.scientific_name,
+                e.response.status_code,
+            )
+        except Exception as e:
+            # Roll back so a failed commit does not leave the session unusable
+            db.rollback()
+            log.error("Error scraping Flickr for %s: %s", species.scientific_name, e)
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test Flickr API connection.
+
+        Calls ``flickr.test.echo`` with the given key.
+
+        Args:
+            api_key: Key record whose ``api_key`` value is sent to Flickr.
+
+        Returns:
+            A success message.
+
+        Raises:
+            httpx.HTTPStatusError: If Flickr answers with an error status.
+            Exception: If the response ``stat`` is not ``ok``.
+        """
+        params = {
+            "method": "flickr.test.echo",
+            "api_key": api_key.api_key,
+            "format": "json",
+            "nojsoncallback": 1,
+        }
+
+        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
+            response = client.get(self.BASE_URL, params=params)
+            response.raise_for_status()
+            data = response.json()
+
+        if data.get("stat") != "ok":
+            raise Exception(data.get("message", "API test failed"))
+
+        return "Flickr API connection successful"
@@ -0,0 +1,159 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class GBIFScraper(BaseScraper):
+    """Scraper for GBIF (Global Biodiversity Information Facility) images.
+
+    Queries the occurrence search for a species, keeps still images whose
+    license maps to an entry of ``ALLOWED_LICENSES``, stores each new one as a
+    pending ``Image`` row and queues it for download.
+    """
+
+    name = "gbif"
+    requires_api_key = False  # GBIF is free to use
+
+    BASE_URL = "https://api.gbif.org/v1"
+
+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
+    }
+
+    # Map GBIF license URLs to short codes
+    LICENSE_MAP = {
+        "http://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
+        "http://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
+        "http://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
+        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+        "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+        "https://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
+        "https://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
+        "https://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
+        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
+        "https://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    }
+
+    # Only allow commercial-safe licenses
+    ALLOWED_LICENSES = {"CC0", "CC-BY"}
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from GBIF for a species.
+
+        Args:
+            species: Species whose ``scientific_name`` is searched.
+            db: Session used for the duplicate check and to store new images.
+            logger: Optional logger; the module logger is used when omitted.
+
+        Returns:
+            Dict with ``downloaded`` (images queued) and ``rejected`` counts.
+        """
+        # Imported here so the method does not rely on a module-level import.
+        import hashlib
+
+        log = logger if logger is not None else logging.getLogger(__name__)
+
+        # GBIF doesn't require API key, but we still respect rate limits
+        api_key = self.get_api_key(db)
+        rate_limit = api_key.rate_limit_per_sec if api_key else 1.0
+        # A missing or non-positive rate would make 1.0 / rate fail; fall back
+        # to one request per second in that case.
+        delay = 1.0 / rate_limit if rate_limit and rate_limit > 0 else 1.0
+
+        downloaded = 0
+        rejected = 0
+
+        try:
+            params = {
+                "scientificName": species.scientific_name,
+                "mediaType": "StillImage",
+                "limit": 100,
+            }
+
+            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
+                response = client.get(
+                    f"{self.BASE_URL}/occurrence/search",
+                    params=params,
+                )
+                response.raise_for_status()
+                data = response.json()
+
+            for occurrence in data.get("results", []):
+                occurrence_key = occurrence.get("key", "")
+
+                for media in occurrence.get("media", []):
+                    # Only process still images
+                    if media.get("type") != "StillImage":
+                        continue
+
+                    url = media.get("identifier")
+                    if not url:
+                        rejected += 1
+                        continue
+
+                    # Unmapped licenses yield None, which is never allowed
+                    license_code = self.LICENSE_MAP.get(media.get("license", ""))
+                    if license_code not in self.ALLOWED_LICENSES:
+                        rejected += 1
+                        continue
+
+                    # Build the source ID from the occurrence key and a digest
+                    # of the media URL. The built-in hash() is salted per
+                    # process for strings, so it cannot be used here: the ID
+                    # must be identical on every run for the duplicate check
+                    # below to work.
+                    url_hash = hashlib.sha256(url.encode("utf-8")).hexdigest()[:8]
+                    source_id = f"{occurrence_key}_{url_hash}"
+
+                    # Check if already exists
+                    existing = db.query(Image).filter(
+                        Image.source == self.name,
+                        Image.source_id == source_id,
+                    ).first()
+
+                    if existing:
+                        continue
+
+                    # Build attribution
+                    creator = media.get("creator", "")
+                    rights_holder = media.get("rightsHolder", "")
+                    attribution_parts = []
+                    if creator:
+                        attribution_parts.append(f"Photo by {creator}")
+                    if rights_holder and rights_holder != creator:
+                        attribution_parts.append(f"Rights: {rights_holder}")
+                    attribution_parts.append(f"via GBIF ({license_code})")
+                    attribution = " | ".join(attribution_parts)
+
+                    # Create image record
+                    image = Image(
+                        species_id=species.id,
+                        source=self.name,
+                        source_id=source_id,
+                        url=url,
+                        license=license_code,
+                        attribution=attribution,
+                        status="pending",
+                    )
+                    db.add(image)
+                    db.commit()
+
+                    # Queue for download
+                    download_and_process_image.delay(image.id)
+                    downloaded += 1
+
+            # Rate limiting
+            time.sleep(delay)
+
+        except Exception as e:
+            # Roll back so a failed commit does not leave the session unusable
+            db.rollback()
+            log.error("Error scraping GBIF for %s: %s", species.scientific_name, e)
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test GBIF API connection.
+
+        Args:
+            api_key: Not used; the request is sent without credentials.
+
+        Returns:
+            A success message containing the ``count`` field of the response
+            (0 when absent).
+
+        Raises:
+            httpx.HTTPStatusError: If GBIF answers with an error status.
+        """
+        # GBIF doesn't require authentication, just test the endpoint
+        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
+            response = client.get(
+                f"{self.BASE_URL}/occurrence/search",
+                params={"limit": 1},
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        count = data.get("count", 0)
+        return f"GBIF API connection successful ({count:,} total occurrences available)"
@@ -0,0 +1,144 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class INaturalistScraper(BaseScraper):
+    """Scraper for iNaturalist observations via their API.
+
+    Fetches research-grade observations for a species, keeps photos whose
+    license is in ``ALLOWED_LICENSES``, stores each new photo as a pending
+    ``Image`` row and queues it for download.
+    """
+
+    name = "inaturalist"
+    requires_api_key = False  # Public API, but rate limited
+
+    BASE_URL = "https://api.inaturalist.org/v1"
+
+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
+    }
+
+    # Commercial-safe licenses (CC0, CC-BY)
+    ALLOWED_LICENSES = ["cc0", "cc-by"]
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from iNaturalist for a species.
+
+        Args:
+            species: Species whose ``scientific_name`` is sent as ``taxon_name``.
+            db: Session used for the duplicate check and to store new images.
+            logger: Optional logger; the module logger is used when omitted so
+                that errors are never dropped silently.
+
+        Returns:
+            Dict with ``downloaded`` (images queued) and ``rejected`` counts.
+        """
+        log = logger if logger is not None else logging.getLogger(__name__)
+
+        api_key = self.get_api_key(db)
+        rate_limit = api_key.rate_limit_per_sec if api_key else 1.0
+        # A missing or non-positive rate would make 1.0 / rate fail; fall back
+        # to one request per second in that case.
+        delay = 1.0 / rate_limit if rate_limit and rate_limit > 0 else 1.0
+
+        downloaded = 0
+        rejected = 0
+
+        try:
+            # Search for observations of this species
+            params = {
+                "taxon_name": species.scientific_name,
+                "quality_grade": "research",  # Only research-grade
+                "photos": True,
+                "per_page": 200,
+                "order_by": "votes",
+                "license": ",".join(self.ALLOWED_LICENSES),
+            }
+
+            log.debug(f"  API request params: {params}")
+
+            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
+                response = client.get(
+                    f"{self.BASE_URL}/observations",
+                    params=params,
+                )
+                log.debug(f"  API response status: {response.status_code}")
+                response.raise_for_status()
+                data = response.json()
+
+            observations = data.get("results", [])
+            total_results = data.get("total_results", 0)
+            log.debug(f"  Found {len(observations)} observations (total: {total_results})")
+
+            if not observations:
+                log.info(f"  No observations found for {species.scientific_name}")
+                return {"downloaded": 0, "rejected": 0}
+
+            for obs in observations:
+                for photo in obs.get("photos", []):
+                    # Check license (a missing or null code becomes "")
+                    license_code = (photo.get("license_code") or "").lower()
+                    if license_code not in self.ALLOWED_LICENSES:
+                        log.debug(f"  Rejected photo {photo.get('id')}: license={license_code}")
+                        rejected += 1
+                        continue
+
+                    url = photo.get("url", "")
+                    if not url:
+                        log.debug(f"  Skipped photo {photo.get('id')}: no URL")
+                        continue
+
+                    # Ask for the "large" rendition instead of the "square" one
+                    url = url.replace("square", "large")
+
+                    # Check if already exists
+                    source_id = str(photo.get("id"))
+                    existing = db.query(Image).filter(
+                        Image.source == self.name,
+                        Image.source_id == source_id,
+                    ).first()
+
+                    if existing:
+                        log.debug(f"  Skipped photo {source_id}: already exists")
+                        continue
+
+                    # Create image record
+                    image = Image(
+                        species_id=species.id,
+                        source=self.name,
+                        source_id=source_id,
+                        url=url,
+                        license=license_code.upper(),
+                        attribution=photo.get("attribution", ""),
+                        status="pending",
+                    )
+                    db.add(image)
+                    db.commit()
+
+                    # Queue for download
+                    download_and_process_image.delay(image.id)
+                    downloaded += 1
+                    log.debug(f"  Queued photo {source_id} for download")
+
+                # Rate limiting
+                # NOTE(review): this pause runs once per observation although
+                # only one API request is made above - confirm it is meant to
+                # throttle the download queue rather than the API call.
+                time.sleep(delay)
+
+        except httpx.HTTPStatusError as e:
+            log.error(f"  HTTP error for {species.scientific_name}: {e.response.status_code} - {e.response.text}")
+        except httpx.RequestError as e:
+            log.error(f"  Request error for {species.scientific_name}: {e}")
+        except Exception as e:
+            # Roll back so a failed commit does not leave the session unusable
+            db.rollback()
+            log.error(f"  Error scraping iNaturalist for {species.scientific_name}: {e}")
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test iNaturalist API connection.
+
+        Args:
+            api_key: Not used; the request is sent without credentials.
+
+        Returns:
+            A success message.
+
+        Raises:
+            httpx.HTTPStatusError: If the API answers with an error status.
+        """
+        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
+            response = client.get(
+                f"{self.BASE_URL}/observations",
+                params={"per_page": 1},
+            )
+            response.raise_for_status()
+
+        return "iNaturalist API connection successful"
@@ -0,0 +1,154 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class TrefleScraper(BaseScraper):
+    """Scraper for Trefle.io plant database.
+
+    Searches plants by scientific name, fetches the detail record of every
+    match and stores its main image plus the images listed under ``images``
+    as pending ``Image`` rows queued for download.
+    """
+
+    name = "trefle"
+    requires_api_key = True
+
+    BASE_URL = "https://trefle.io/api/v1"
+
+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
+    }
+
+    def _queue_image(
+        self,
+        db: Session,
+        species: Species,
+        source_id: str,
+        url: str,
+        attribution: str,
+    ) -> bool:
+        """Store ``url`` as a pending Trefle image and queue its download.
+
+        Returns:
+            True when a new record was created, False when a Trefle image with
+            the same ``source_id`` already exists.
+        """
+        existing = db.query(Image).filter(
+            Image.source == self.name,
+            Image.source_id == source_id,
+        ).first()
+        if existing:
+            return False
+
+        image = Image(
+            species_id=species.id,
+            source=self.name,
+            source_id=source_id,
+            url=url,
+            license="TREFLE",  # Trefle's own license
+            attribution=attribution,
+            status="pending",
+        )
+        db.add(image)
+        db.commit()
+        download_and_process_image.delay(image.id)
+        return True
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from Trefle for a species.
+
+        Args:
+            species: Species whose ``scientific_name`` is searched.
+            db: Session used for the duplicate check and to store new images.
+            logger: Optional logger; the module logger is used when omitted.
+
+        Returns:
+            Dict with ``downloaded`` (images queued) and ``rejected`` counts,
+            plus an ``error`` entry when no API key is configured.
+        """
+        log = logger if logger is not None else logging.getLogger(__name__)
+
+        api_key = self.get_api_key(db)
+        if not api_key:
+            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}
+
+        # A missing or non-positive rate would make 1.0 / rate fail; fall back
+        # to one request per second in that case.
+        rate_limit = api_key.rate_limit_per_sec
+        delay = 1.0 / rate_limit if rate_limit and rate_limit > 0 else 1.0
+
+        downloaded = 0
+        rejected = 0
+
+        try:
+            # Search for the species
+            params = {
+                "token": api_key.api_key,
+                "q": species.scientific_name,
+            }
+
+            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
+                response = client.get(
+                    f"{self.BASE_URL}/plants/search",
+                    params=params,
+                )
+                response.raise_for_status()
+                data = response.json()
+
+                for plant in data.get("data") or []:
+                    # Get plant details for more images
+                    plant_id = plant.get("id")
+                    if not plant_id:
+                        continue
+
+                    detail_response = client.get(
+                        f"{self.BASE_URL}/plants/{plant_id}",
+                        params={"token": api_key.api_key},
+                    )
+
+                    # Rate limiting: pause after every detail request,
+                    # including failed ones, so error responses are not
+                    # followed by an immediate burst of further requests.
+                    time.sleep(delay)
+
+                    if detail_response.status_code != 200:
+                        log.warning(
+                            "Trefle detail request for plant %s returned %s",
+                            plant_id,
+                            detail_response.status_code,
+                        )
+                        continue
+
+                    plant_detail = detail_response.json().get("data") or {}
+
+                    # Get main image
+                    main_image = plant_detail.get("image_url")
+                    if main_image and self._queue_image(
+                        db,
+                        species,
+                        f"main_{plant_id}",
+                        main_image,
+                        "Trefle.io Plant Database",
+                    ):
+                        downloaded += 1
+
+                    # Get additional images from species detail
+                    images = plant_detail.get("images") or {}
+                    for image_type, image_list in images.items():
+                        if not isinstance(image_list, list):
+                            continue
+
+                        for img in image_list:
+                            if not isinstance(img, dict):
+                                continue
+
+                            url = img.get("image_url")
+                            if not url:
+                                continue
+
+                            # Fall back to the last URL segment when no id is given
+                            img_id = img.get("id", url.split("/")[-1])
+                            attribution = img.get("copyright", "") or "Trefle.io"
+
+                            if self._queue_image(
+                                db,
+                                species,
+                                f"{image_type}_{img_id}",
+                                url,
+                                attribution,
+                            ):
+                                downloaded += 1
+
+        except httpx.HTTPStatusError as e:
+            # str(e) embeds the request URL, which carries the API token as a
+            # query parameter, so only the status code is logged.
+            db.rollback()
+            log.error(
+                "HTTP error scraping Trefle for %s: %s",
+                species.scientific_name,
+                e.response.status_code,
+            )
+        except Exception as e:
+            # Roll back so a failed commit does not leave the session unusable
+            db.rollback()
+            log.error("Error scraping Trefle for %s: %s", species.scientific_name, e)
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test Trefle API connection.
+
+        Args:
+            api_key: Key record whose ``api_key`` value is sent as ``token``.
+
+        Returns:
+            A success message.
+
+        Raises:
+            httpx.HTTPStatusError: If Trefle answers with an error status.
+        """
+        params = {"token": api_key.api_key}
+
+        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
+            response = client.get(
+                f"{self.BASE_URL}/plants",
+                params=params,
+            )
+            response.raise_for_status()
+
+        return "Trefle API connection successful"
@@ -0,0 +1,146 @@
+import time
+import logging
+from typing import Dict, Optional
+
+import httpx
+from sqlalchemy.orm import Session
+
+from app.scrapers.base import BaseScraper
+from app.models import Species, Image, ApiKey
+from app.workers.quality_tasks import download_and_process_image
+
+
+class WikimediaScraper(BaseScraper):
+    """Scraper for Wikimedia Commons images.
+
+    Runs a file search for a species, keeps images of at least 256x256 pixels
+    whose license passes ``_is_commercial_safe``, stores each new one as a
+    pending ``Image`` row and queues it for download.
+    """
+
+    name = "wikimedia"
+    requires_api_key = False
+
+    BASE_URL = "https://commons.wikimedia.org/w/api.php"
+
+    HEADERS = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
+    }
+
+    @staticmethod
+    def _is_commercial_safe(license_name: str) -> bool:
+        """Return True if a ``LicenseShortName`` value permits commercial reuse.
+
+        Accepts public-domain, CC0 and CC BY licenses, but not the CC BY
+        variants carrying a NonCommercial (``-NC``) or NoDerivatives (``-ND``)
+        clause, which also contain the text "CC BY".
+        """
+        upper = license_name.upper()
+        if "CC0" in upper or "PUBLIC DOMAIN" in upper:
+            return True
+        if "CC BY" not in upper:
+            return False
+        return "-NC" not in upper and "-ND" not in upper
+
+    def scrape_species(
+        self,
+        species: Species,
+        db: Session,
+        logger: Optional[logging.Logger] = None
+    ) -> Dict[str, int]:
+        """Scrape images from Wikimedia Commons for a species.
+
+        Args:
+            species: Species whose ``scientific_name`` is used as search term.
+            db: Session used for the duplicate check and to store new images.
+            logger: Optional logger; the module logger is used when omitted.
+
+        Returns:
+            Dict with ``downloaded`` (images queued) and ``rejected`` counts.
+        """
+        # Imported once per call instead of once per image.
+        import re
+
+        log = logger if logger is not None else logging.getLogger(__name__)
+
+        api_key = self.get_api_key(db)
+        rate_limit = api_key.rate_limit_per_sec if api_key else 1.0
+        # A missing or non-positive rate would make 1.0 / rate fail; fall back
+        # to one request per second in that case.
+        delay = 1.0 / rate_limit if rate_limit and rate_limit > 0 else 1.0
+
+        downloaded = 0
+        rejected = 0
+
+        try:
+            search_term = species.scientific_name
+
+            params = {
+                "action": "query",
+                "format": "json",
+                "generator": "search",
+                "gsrsearch": f"filetype:bitmap {search_term}",
+                "gsrnamespace": 6,  # File namespace
+                "gsrlimit": 50,
+                "prop": "imageinfo",
+                "iiprop": "url|extmetadata|size",
+            }
+
+            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
+                response = client.get(self.BASE_URL, params=params)
+                response.raise_for_status()
+                data = response.json()
+
+            pages = data.get("query", {}).get("pages", {})
+
+            for page_id, page in pages.items():
+                if int(page_id) < 0:
+                    continue
+
+                # A missing key or an empty list are both treated as "no info"
+                imageinfo = (page.get("imageinfo") or [{}])[0]
+                url = imageinfo.get("url", "")
+                if not url:
+                    continue
+
+                # Check size
+                width = imageinfo.get("width", 0)
+                height = imageinfo.get("height", 0)
+                if width < 256 or height < 256:
+                    rejected += 1
+                    continue
+
+                # Get license from metadata
+                metadata = imageinfo.get("extmetadata", {})
+                license_code = metadata.get("LicenseShortName", {}).get("value", "")
+
+                # Filter for commercial-safe licenses
+                if not self._is_commercial_safe(license_code):
+                    rejected += 1
+                    continue
+
+                # Check if already exists
+                source_id = str(page_id)
+                existing = db.query(Image).filter(
+                    Image.source == self.name,
+                    Image.source_id == source_id,
+                ).first()
+
+                if existing:
+                    continue
+
+                # Get attribution; the artist field may contain HTML markup
+                artist = metadata.get("Artist", {}).get("value", "Unknown")
+                if "<" in artist:
+                    artist = re.sub(r"<[^>]+>", "", artist).strip() or "Unknown"
+
+                attribution = f"{artist} via Wikimedia Commons ({license_code})"
+
+                # Create image record
+                image = Image(
+                    species_id=species.id,
+                    source=self.name,
+                    source_id=source_id,
+                    url=url,
+                    license=license_code,
+                    attribution=attribution,
+                    width=width,
+                    height=height,
+                    status="pending",
+                )
+                db.add(image)
+                db.commit()
+
+                # Queue for download
+                download_and_process_image.delay(image.id)
+                downloaded += 1
+
+            # Rate limiting
+            time.sleep(delay)
+
+        except Exception as e:
+            # Roll back so a failed commit does not leave the session unusable
+            db.rollback()
+            log.error("Error scraping Wikimedia for %s: %s", species.scientific_name, e)
+
+        return {"downloaded": downloaded, "rejected": rejected}
+
+    def test_connection(self, api_key: ApiKey) -> str:
+        """Test Wikimedia API connection.
+
+        Args:
+            api_key: Not used; the request is sent without credentials.
+
+        Returns:
+            A success message.
+
+        Raises:
+            httpx.HTTPStatusError: If the API answers with an error status.
+        """
+        params = {
+            "action": "query",
+            "format": "json",
+            "meta": "siteinfo",
+        }
+
+        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
+            response = client.get(self.BASE_URL, params=params)
+            response.raise_for_status()
+
+        return "Wikimedia Commons API connection successful"
@@ -0,0 +1 @@
+# Utility functions
@@ -0,0 +1,80 @@
+"""Image deduplication utilities using perceptual hashing."""
+
+from typing import Optional
+
+import imagehash
+from PIL import Image as PILImage
+
+
+def calculate_phash(image_path: str) -> Optional[str]:
+    """
+    Compute the perceptual hash (pHash) of the image stored at a path.
+
+    Args:
+        image_path: Location of the image file to hash
+
+    Returns:
+        The hash rendered as a string, or None when the image could not be
+        opened or hashed
+    """
+    digest = None
+    try:
+        with PILImage.open(image_path) as picture:
+            digest = str(imagehash.phash(picture))
+    except Exception:
+        # Any failure is reported to the caller as "no hash"
+        return None
+    return digest
+
+
+def calculate_dhash(image_path: str) -> Optional[str]:
+    """
+    Compute the difference hash (dHash) of the image stored at a path.
+    Cheaper to compute than the perceptual hash, at the cost of accuracy.
+
+    Args:
+        image_path: Location of the image file to hash
+
+    Returns:
+        The hash rendered as a string, or None when the image could not be
+        opened or hashed
+    """
+    digest = None
+    try:
+        with PILImage.open(image_path) as picture:
+            digest = str(imagehash.dhash(picture))
+    except Exception:
+        # Any failure is reported to the caller as "no hash"
+        return None
+    return digest
+
+
+def hashes_are_similar(hash1: str, hash2: str, threshold: int = 10) -> bool:
+    """
+    Decide whether two hash strings are close enough to be likely duplicates.
+
+    Args:
+        hash1: Hex string of the first hash
+        hash2: Hex string of the second hash
+        threshold: Largest Hamming distance still counted as similar
+            (default 10)
+
+    Returns:
+        True when the distance is at most ``threshold``; False otherwise,
+        including when either string cannot be parsed
+    """
+    try:
+        distance = imagehash.hex_to_hash(hash1) - imagehash.hex_to_hash(hash2)
+        is_close = distance <= threshold
+    except Exception:
+        # Unparseable input is treated as "not similar"
+        return False
+    return is_close
+
+
+def hamming_distance(hash1: str, hash2: str) -> int:
+    """
+    Return the Hamming distance between two hash strings.
+
+    Args:
+        hash1: Hex string of the first hash
+        hash2: Hex string of the second hash
+
+    Returns:
+        Number of differing bits (0 = identical); 64 when either string
+        cannot be parsed
+    """
+    fallback = 64  # Maximum distance
+    try:
+        first, second = (imagehash.hex_to_hash(text) for text in (hash1, hash2))
+        return int(first - second)
+    except Exception:
+        return fallback
@@ -0,0 +1,109 @@
+"""Image quality assessment utilities."""
+
+import numpy as np
+from PIL import Image as PILImage
+from scipy import ndimage
+
+
+def calculate_blur_score(image_path: str) -> float:
+    """
+    Calculate blur score using Laplacian variance.
+    Higher score = sharper image.
+
+    Args:
+        image_path: Path to image file
+
+    Returns:
+        Variance of the Laplacian of the grayscale image (higher = sharper),
+        or 0.0 if the image cannot be read or processed
+    """
+    try:
+        # The context manager closes the underlying file once pixels are read
+        with PILImage.open(image_path) as img:
+            # Convert to float before filtering: ndimage.laplace writes its
+            # result in the input dtype, so on the uint8 array PIL yields the
+            # negative and >255 responses would wrap around.
+            pixels = np.asarray(img.convert("L"), dtype=np.float64)
+        laplacian = ndimage.laplace(pixels)
+        return float(np.var(laplacian))
+    except Exception:
+        # Unreadable images score as maximally blurry
+        return 0.0
+
+
+def is_too_blurry(image_path: str, threshold: float = 100.0) -> bool:
+    """
+    Tell whether an image falls below the sharpness needed for training.
+
+    Args:
+        image_path: Location of the image file
+        threshold: Lowest blur score still considered sharp (default 100)
+
+    Returns:
+        True when the image's blur score is below ``threshold``
+    """
+    sharpness = calculate_blur_score(image_path)
+    blurry = sharpness < threshold
+    return blurry
+
+
+def get_image_dimensions(image_path: str) -> tuple[int, int]:
+    """
+    Read the pixel dimensions of an image file.
+
+    Args:
+        image_path: Location of the image file
+
+    Returns:
+        (width, height), or (0, 0) when the file cannot be opened
+    """
+    unknown = (0, 0)
+    size = unknown
+    try:
+        with PILImage.open(image_path) as picture:
+            size = picture.size
+    except Exception:
+        size = unknown
+    return size
+
+
+def is_too_small(image_path: str, min_size: int = 256) -> bool:
+    """
+    Tell whether an image is smaller than the size needed for training.
+
+    Args:
+        image_path: Location of the image file
+        min_size: Smallest acceptable length of either side (default 256)
+
+    Returns:
+        True when the width or the height is below ``min_size``
+    """
+    # The shorter side decides: if it is too small, the image is too small
+    shortest_side = min(get_image_dimensions(image_path))
+    return shortest_side < min_size
+
+
+def resize_image(
+    image_path: str,
+    output_path: str = None,
+    max_size: int = 512,
+    quality: int = 95,
+) -> bool:
+    """
+    Resize image to max dimension while preserving aspect ratio.
+
+    The result is always written as JPEG. Images already within ``max_size``
+    are re-encoded without resizing.
+
+    Args:
+        image_path: Path to input image
+        output_path: Path for output (defaults to overwriting input)
+        max_size: Maximum dimension size (default 512)
+        quality: JPEG quality (default 95)
+
+    Returns:
+        True if successful, False if the image could not be read, converted
+        or written
+    """
+    # Modes Pillow's JPEG writer accepts as-is; anything else (RGBA, P, LA,
+    # PA, I, F, ...) has to be converted first or the save fails.
+    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
+
+    try:
+        output_path = output_path or image_path
+
+        with PILImage.open(image_path) as img:
+            # Only resize if larger than max_size
+            if max(img.size) > max_size:
+                img.thumbnail((max_size, max_size), PILImage.Resampling.LANCZOS)
+
+            # Convert to RGB if necessary (for JPEG)
+            if img.mode not in jpeg_modes:
+                img = img.convert("RGB")
+
+            img.save(output_path, "JPEG", quality=quality)
+
+        return True
+    except Exception:
+        return False
@@ -0,0 +1,92 @@
+import logging
+import os
+from datetime import datetime
+from pathlib import Path
+
+from app.config import get_settings
+
+settings = get_settings()
+
+
+def setup_logging():
+    """Configure the root logger with a dated file handler and a console handler."""
+    log_dir = Path(settings.logs_path)
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    # The file name carries the date at the moment this function is called
+    today = datetime.now().strftime('%Y-%m-%d')
+    dated_file = log_dir / f"scraper_{today}.log"
+
+    root_handlers = [
+        logging.FileHandler(dated_file),
+        logging.StreamHandler(),
+    ]
+
+    # Root logger: INFO and above, same format for file and console
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=root_handlers,
+    )
+
+    return logging.getLogger("plant_scraper")
+
+
+def get_logger(name: str = "plant_scraper"):
+    """Get a logger that writes to the dated scraper log file and to the console.
+
+    Handlers are attached only the first time a given name is requested, so
+    repeated calls return the same logger without stacking handlers.
+
+    Args:
+        name: Logger name (default "plant_scraper")
+
+    Returns:
+        The configured logging.Logger instance
+    """
+    logs_path = Path(settings.logs_path)
+    logs_path.mkdir(parents=True, exist_ok=True)
+
+    logger = logging.getLogger(name)
+
+    if not logger.handlers:
+        logger.setLevel(logging.INFO)
+        # This logger has its own file and console handlers. If records also
+        # propagated to a configured root logger (e.g. after setup_logging()),
+        # every message would be written twice.
+        logger.propagate = False
+
+        # Dated log file. The date is fixed when the handler is created, so a
+        # long-running process keeps writing to the file it started with.
+        log_file = logs_path / f"scraper_{datetime.now().strftime('%Y-%m-%d')}.log"
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setLevel(logging.INFO)
+        file_handler.setFormatter(logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        ))
+
+        # Console handler (shorter format, no logger name)
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(logging.INFO)
+        console_handler.setFormatter(logging.Formatter(
+            '%(asctime)s - %(levelname)s - %(message)s'
+        ))
+
+        logger.addHandler(file_handler)
+        logger.addHandler(console_handler)
+
+    return logger
+
+
+def get_job_logger(job_id: int):
+    """Get a logger specific to a job, writing to a job-specific file.
+
+    The logger is named "job_<job_id>". It writes DEBUG and above to
+    job_<job_id>.log and INFO and above to the dated scraper log file.
+    Handlers are attached only on the first call for a given job id.
+
+    Args:
+        job_id: Identifier of the job the logger belongs to
+
+    Returns:
+        The configured logging.Logger instance
+    """
+    logs_path = Path(settings.logs_path)
+    logs_path.mkdir(parents=True, exist_ok=True)
+
+    logger = logging.getLogger(f"job_{job_id}")
+
+    if not logger.handlers:
+        logger.setLevel(logging.DEBUG)
+
+        # Job-specific log file
+        job_log_file = logs_path / f"job_{job_id}.log"
+        file_handler = logging.FileHandler(job_log_file)
+        file_handler.setLevel(logging.DEBUG)
+        file_handler.setFormatter(logging.Formatter(
+            '%(asctime)s - %(levelname)s - %(message)s'
+        ))
+
+        # Also log to daily file. The logger name already is "job_<id>", so
+        # the format must not add another "job_" prefix ("job_job_<id>").
+        daily_log_file = logs_path / f"scraper_{datetime.now().strftime('%Y-%m-%d')}.log"
+        daily_handler = logging.FileHandler(daily_log_file)
+        daily_handler.setLevel(logging.INFO)
+        daily_handler.setFormatter(logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        ))
+
+        logger.addHandler(file_handler)
+        logger.addHandler(daily_handler)
+
+    return logger
@@ -0,0 +1 @@
+# Celery workers
@@ -0,0 +1,36 @@
+from celery import Celery
+
+from app.config import get_settings
+
+settings = get_settings()
+
+# Shared Celery application for the project. The same Redis URL from settings
+# is used both as the message broker and as the result backend.
+celery_app = Celery(
+    "plant_scraper",
+    broker=settings.redis_url,
+    backend=settings.redis_url,
+    # Task modules listed here are passed to Celery via `include`
+    include=[
+        "app.workers.scrape_tasks",
+        "app.workers.quality_tasks",
+        "app.workers.export_tasks",
+        "app.workers.stats_tasks",
+    ],
+)
+
+# Runtime configuration. Exact semantics of each key are defined by Celery's
+# configuration reference; the notes below only restate what is set here.
+celery_app.conf.update(
+    # JSON for task payloads, accepted content and results
+    task_serializer="json",
+    accept_content=["json"],
+    result_serializer="json",
+    timezone="UTC",
+    enable_utc=True,
+    task_track_started=True,
+    task_time_limit=3600 * 24,  # 24 hour max per task
+    # NOTE(review): prefetch of 1 together with late acks presumably targets
+    # long-running tasks - confirm against the Celery docs before changing
+    worker_prefetch_multiplier=1,
+    task_acks_late=True,
+    # Periodic schedule: one entry that runs the stats refresh task
+    beat_schedule={
+        "refresh-stats-every-5min": {
+            "task": "app.workers.stats_tasks.refresh_stats",
+            "schedule": 300.0,  # Every 5 minutes
+        },
+    },
+    # Beat schedule state file is placed under /tmp
+    beat_schedule_filename="/tmp/celerybeat-schedule",
+)
@@ -0,0 +1,170 @@
+import json
+import os
+import random
+import shutil
+import zipfile
+from datetime import datetime
+from pathlib import Path
+
+from app.workers.celery_app import celery_app
+from app.database import SessionLocal
+from app.models import Export, Image, Species
+from app.config import get_settings
+
+settings = get_settings()
+
+
+def _copy_split_images(images, dest_dir):
+    """Copy every image whose local file exists into dest_dir; return the number copied."""
+    copied = 0
+    for j, img in enumerate(images):
+        if img.local_path and os.path.exists(img.local_path):
+            ext = Path(img.local_path).suffix or ".jpg"
+            shutil.copy2(img.local_path, dest_dir / f"img_{j:05d}{ext}")
+            copied += 1
+    return copied
+
+
+@celery_app.task(bind=True)
+def generate_export(self, export_id: int):
+    """Generate a zip export for CoreML training.
+
+    Loads the Export row, selects downloaded images matching its JSON filter
+    criteria, splits each qualifying species into Training/Testing folders
+    according to export.train_split, zips the result and records file path,
+    size and counts on the Export row.
+
+    Args:
+        export_id: Primary key of the Export row to generate
+
+    Returns:
+        A dict with "status", "species_count", "image_count" and "file_size"
+        on success, or a dict with "error" when the export does not exist or
+        no species meets the criteria.
+
+    Raises:
+        Exception: any unexpected error is re-raised after the export has
+            been marked "failed".
+    """
+    db = SessionLocal()
+    # Bound before the try block so the except handler can test them safely
+    export = None
+    export_dir = None
+    try:
+        export = db.query(Export).filter(Export.id == export_id).first()
+        if not export:
+            return {"error": "Export not found"}
+
+        # Update status
+        export.status = "generating"
+        export.celery_task_id = self.request.id
+        db.commit()
+
+        def fail_no_species():
+            # Shared failure path for "nothing to export"
+            export.status = "failed"
+            export.error_message = "No species meet the criteria"
+            export.completed_at = datetime.utcnow()
+            db.commit()
+            return {"error": "No species meet the criteria"}
+
+        # Parse filter criteria
+        criteria = json.loads(export.filter_criteria) if export.filter_criteria else {}
+        min_images = criteria.get("min_images_per_species", 100)
+        licenses = criteria.get("licenses")
+        min_quality = criteria.get("min_quality")
+        species_ids = criteria.get("species_ids")
+
+        # Build query for images (all optional filters are applied once, here)
+        query = db.query(Image).filter(Image.status == "downloaded")
+
+        if licenses:
+            query = query.filter(Image.license.in_(licenses))
+
+        if min_quality:
+            query = query.filter(Image.quality_score >= min_quality)
+
+        if species_ids:
+            query = query.filter(Image.species_id.in_(species_ids))
+
+        # Cheap pre-filter: species with enough downloaded images overall.
+        # The per-species check in the loop re-validates with all filters.
+        from sqlalchemy import func
+        species_counts = db.query(
+            Image.species_id,
+            func.count(Image.id).label("count")
+        ).filter(Image.status == "downloaded").group_by(Image.species_id).all()
+
+        valid_species_ids = [s.species_id for s in species_counts if s.count >= min_images]
+
+        if species_ids:
+            wanted = set(species_ids)
+            valid_species_ids = [s for s in valid_species_ids if s in wanted]
+
+        if not valid_species_ids:
+            return fail_no_species()
+
+        # Create export directory
+        export_dir = Path(settings.exports_path) / f"export_{export_id}"
+        train_dir = export_dir / "Training"
+        test_dir = export_dir / "Testing"
+        train_dir.mkdir(parents=True, exist_ok=True)
+        test_dir.mkdir(parents=True, exist_ok=True)
+
+        total_images = 0
+        species_count = 0
+
+        # Process each valid species
+        for i, species_id in enumerate(valid_species_ids):
+            species = db.query(Species).filter(Species.id == species_id).first()
+            if not species:
+                continue
+
+            # Get images for this species (license/quality filters are already in query)
+            images = query.filter(Image.species_id == species_id).all()
+            if len(images) < min_images:
+                continue
+
+            species_count += 1
+
+            # Create species folders
+            species_name = species.scientific_name.replace(" ", "_")
+            (train_dir / species_name).mkdir(exist_ok=True)
+            (test_dir / species_name).mkdir(exist_ok=True)
+
+            # Shuffle and split
+            random.shuffle(images)
+            split_idx = int(len(images) * export.train_split)
+
+            # Copy images
+            total_images += _copy_split_images(images[:split_idx], train_dir / species_name)
+            total_images += _copy_split_images(images[split_idx:], test_dir / species_name)
+
+            # Update progress
+            self.update_state(
+                state="PROGRESS",
+                meta={
+                    "current": i + 1,
+                    "total": len(valid_species_ids),
+                    "species": species.scientific_name,
+                }
+            )
+
+        # The filtered per-species check may have rejected every candidate;
+        # treat that like the earlier "no species" case instead of shipping
+        # an empty archive marked as completed.
+        if species_count == 0:
+            shutil.rmtree(export_dir, ignore_errors=True)
+            return fail_no_species()
+
+        # Create zip file
+        zip_path = Path(settings.exports_path) / f"export_{export_id}.zip"
+        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
+            for root, _dirs, files in os.walk(export_dir):
+                for file in files:
+                    file_path = Path(root) / file
+                    arcname = file_path.relative_to(export_dir)
+                    zipf.write(file_path, arcname)
+
+        # Clean up directory
+        shutil.rmtree(export_dir)
+
+        # Update export record
+        export.status = "completed"
+        export.file_path = str(zip_path)
+        export.file_size = zip_path.stat().st_size
+        export.species_count = species_count
+        export.image_count = total_images
+        export.completed_at = datetime.utcnow()
+        db.commit()
+
+        return {
+            "status": "completed",
+            "species_count": species_count,
+            "image_count": total_images,
+            "file_size": export.file_size,
+        }
+
+    except Exception as e:
+        # A failed flush/commit leaves the session unusable until rolled back
+        db.rollback()
+        # Do not leave a half-built staging directory behind
+        if export_dir is not None:
+            shutil.rmtree(export_dir, ignore_errors=True)
+        if export:
+            export.status = "failed"
+            export.error_message = str(e)
+            export.completed_at = datetime.utcnow()
+            db.commit()
+        raise
+    finally:
+        db.close()
@@ -0,0 +1,224 @@
+import os
+from pathlib import Path
+
+import httpx
+from PIL import Image as PILImage
+import imagehash
+import numpy as np
+from scipy import ndimage
+
+from app.workers.celery_app import celery_app
+from app.database import SessionLocal
+from app.models import Image
+from app.config import get_settings
+
+settings = get_settings()
+
+
+def calculate_blur_score(image_path: str) -> float:
+    """Calculate blur score using Laplacian variance. Higher = sharper.
+
+    Args:
+        image_path: Path to image file
+
+    Returns:
+        Variance of the Laplacian of the grayscale image, or 0.0 if the
+        image cannot be read.
+    """
+    try:
+        with PILImage.open(image_path) as img:
+            # Work in float: ndimage.laplace returns the input dtype by default,
+            # and on uint8 pixels the negative responses wrap around to ~255,
+            # which inflates the variance and hides blur.
+            img_array = np.asarray(img.convert("L"), dtype=np.float64)
+        laplacian = ndimage.laplace(img_array)
+        return float(np.var(laplacian))
+    except Exception:
+        # Unreadable images score 0.0
+        return 0.0
+
+
+def calculate_phash(image_path: str) -> str:
+    """Calculate perceptual hash for deduplication.
+
+    Args:
+        image_path: Path to image file
+
+    Returns:
+        String form of the perceptual hash, or "" if it cannot be computed.
+    """
+    try:
+        # Context manager closes the underlying file even if hashing fails
+        with PILImage.open(image_path) as img:
+            return str(imagehash.phash(img))
+    except Exception:
+        return ""
+
+
+def check_color_distribution(image_path: str) -> tuple[bool, str]:
+    """Check if image has healthy color distribution for a plant photo.
+
+    Returns (passed, reason) tuple; reason is "" when the image passes.
+    Rejects:
+    - Low color variance (mean channel std < 25): herbarium specimens (brown on white)
+    - No green + low variance (green ratio < 5% AND mean std < 40): monochrome illustrations
+
+    Any error while reading the image counts as a pass.
+    """
+    try:
+        # Context manager closes the underlying file once pixels are copied out
+        with PILImage.open(image_path) as img:
+            arr = np.array(img.convert("RGB"), dtype=np.float64)
+
+        # Per-channel standard deviation
+        channel_stds = arr.std(axis=(0, 1))  # [R_std, G_std, B_std]
+        mean_std = float(channel_stds.mean())
+
+        if mean_std < 25:
+            return False, f"Low color variance ({mean_std:.1f})"
+
+        # Check green ratio: share of the green channel in the summed channel means
+        channel_means = arr.mean(axis=(0, 1))
+        total = channel_means.sum()
+        green_ratio = channel_means[1] / total if total > 0 else 0
+
+        if green_ratio < 0.05 and mean_std < 40:
+            return False, f"No green ({green_ratio:.2%}) + low variance ({mean_std:.1f})"
+
+        return True, ""
+    except Exception:
+        return True, ""  # Don't reject on error
+
+
+def resize_image(image_path: str, target_size: int = 512) -> bool:
+    """Resize image to target size while maintaining aspect ratio.
+
+    The file is overwritten in place and the output format follows the file
+    extension. For .jpg/.jpeg targets, images in a mode the JPEG writer
+    cannot encode (e.g. RGBA or P) are converted to RGB first; without the
+    conversion the save fails and the image silently keeps its original size.
+
+    Args:
+        image_path: Path of the image to resize in place
+        target_size: Maximum width/height in pixels (default 512)
+
+    Returns:
+        True if the image was written, False on any error.
+    """
+    # Modes Pillow's JPEG encoder accepts as-is
+    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
+
+    try:
+        with PILImage.open(image_path) as img:
+            img.thumbnail((target_size, target_size), PILImage.Resampling.LANCZOS)
+
+            saves_as_jpeg = Path(image_path).suffix.lower() in (".jpg", ".jpeg")
+            if saves_as_jpeg and img.mode not in jpeg_writable_modes:
+                img = img.convert("RGB")
+
+            img.save(image_path, quality=95)
+        return True
+    except Exception:
+        return False
+
+
+def _reject_image(db, image, local_path, reason):
+    """Delete any downloaded file, mark the image rejected and return the error result."""
+    if local_path is not None:
+        local_path.unlink(missing_ok=True)
+    image.status = "rejected"
+    db.commit()
+    return {"error": reason}
+
+
+@celery_app.task
+def download_and_process_image(image_id: int):
+    """Download image, check quality, dedupe, and resize.
+
+    Steps: download to the species directory, reject images smaller than
+    256px on either side, reject perceptual-hash duplicates of an already
+    downloaded image, reject blurry images (score < 100), reject images
+    failing the color-distribution check, then resize and mark "downloaded".
+
+    Args:
+        image_id: Primary key of the Image row to process
+
+    Returns:
+        {"status": "success", "path": ..., "quality_score": ...} on success,
+        otherwise {"error": <reason>}. Errors are reported in the result,
+        not raised.
+    """
+    db = SessionLocal()
+    # Bound before the try block so the except handler can test them safely
+    image = None
+    local_path = None
+    try:
+        image = db.query(Image).filter(Image.id == image_id).first()
+        if not image:
+            return {"error": "Image not found"}
+
+        # Create directory for species
+        species = image.species
+        species_dir = Path(settings.images_path) / species.scientific_name.replace(" ", "_")
+        species_dir.mkdir(parents=True, exist_ok=True)
+
+        # Download image
+        filename = f"{image.source}_{image.source_id or image.id}.jpg"
+        local_path = species_dir / filename
+
+        try:
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
+            }
+            with httpx.Client(timeout=30, headers=headers, follow_redirects=True) as client:
+                response = client.get(image.url)
+                response.raise_for_status()
+
+                with open(local_path, "wb") as f:
+                    f.write(response.content)
+        except Exception as e:
+            return _reject_image(db, image, local_path, f"Download failed: {e}")
+
+        # Check minimum size
+        try:
+            with PILImage.open(local_path) as img:
+                width, height = img.size
+        except Exception as e:
+            return _reject_image(db, image, local_path, f"Invalid image: {e}")
+
+        if width < 256 or height < 256:
+            return _reject_image(db, image, local_path, "Image too small")
+        image.width = width
+        image.height = height
+
+        # Calculate perceptual hash for deduplication
+        phash = calculate_phash(str(local_path))
+        if phash:
+            image.phash = phash
+
+            # Check for duplicates among images that were already accepted
+            existing = db.query(Image).filter(
+                Image.phash == phash,
+                Image.id != image.id,
+                Image.status == "downloaded"
+            ).first()
+
+            if existing:
+                return _reject_image(db, image, local_path, "Duplicate image")
+
+        # Calculate blur score
+        quality_score = calculate_blur_score(str(local_path))
+        image.quality_score = quality_score
+
+        # Reject very blurry images (threshold can be tuned)
+        if quality_score < 100:  # Low variance = blurry
+            return _reject_image(db, image, local_path, "Image too blurry")
+
+        # Check color distribution (reject herbarium specimens, illustrations)
+        color_ok, color_reason = check_color_distribution(str(local_path))
+        if not color_ok:
+            return _reject_image(db, image, local_path, f"Non-photo content: {color_reason}")
+
+        # Resize to 512x512 max (best effort: a failed resize keeps the original file)
+        resize_image(str(local_path))
+
+        # Update image record
+        image.local_path = str(local_path)
+        image.status = "downloaded"
+        db.commit()
+
+        return {
+            "status": "success",
+            "path": str(local_path),
+            "quality_score": quality_score,
+        }
+
+    except Exception as e:
+        # A failed flush/commit leaves the session unusable until rolled back
+        db.rollback()
+        # The row ends up rejected, so do not keep an orphaned file on disk
+        if local_path is not None:
+            try:
+                local_path.unlink(missing_ok=True)
+            except OSError:
+                pass
+        if image:
+            image.status = "rejected"
+            db.commit()
+        return {"error": str(e)}
+    finally:
+        db.close()
+
+
+@celery_app.task(bind=True)
+def batch_process_pending_images(self, source: str = None, chunk_size: int = 500):
+    """Process ALL pending images in chunks, with progress tracking.
+
+    Queues one download_and_process_image task per pending image. Pages are
+    fetched by "id greater than the last id seen" rather than by OFFSET:
+    queued tasks change rows away from "pending" while this loop is still
+    paging, which shrinks the filtered result set and would make an OFFSET
+    skip rows that were never queued.
+
+    Args:
+        source: Optional source name; when given only that source is queued
+        chunk_size: Number of images fetched per page (default 500)
+
+    Returns:
+        {"queued": <tasks queued>, "total": <pending count at start>}
+    """
+    db = SessionLocal()
+    try:
+        query = db.query(Image).filter(Image.status == "pending")
+        if source:
+            query = query.filter(Image.source == source)
+
+        total = query.count()
+        queued = 0
+        last_id = None
+
+        while queued < total:
+            page = query
+            if last_id is not None:
+                # Keyset pagination: continue after the last id already queued
+                page = page.filter(Image.id > last_id)
+            chunk = page.order_by(Image.id).limit(chunk_size).all()
+            if not chunk:
+                break
+
+            for image in chunk:
+                download_and_process_image.delay(image.id)
+                queued += 1
+
+            last_id = chunk[-1].id
+
+            self.update_state(
+                state="PROGRESS",
+                meta={"queued": queued, "total": total},
+            )
+
+        return {"queued": queued, "total": total}
+    finally:
+        db.close()
@@ -0,0 +1,164 @@
+import json
+from datetime import datetime
+
+from app.workers.celery_app import celery_app
+from app.database import SessionLocal
+from app.models import Job, Species, Image
+from app.utils.logging import get_job_logger
+
+
+@celery_app.task(bind=True)
+def run_scrape_job(self, job_id: int):
+    """Main scrape task that dispatches to source-specific scrapers.
+
+    Loads the Job row, selects the species to scrape (optionally restricted
+    by the job's species filter and by downloaded-image counts), runs the
+    scraper for job.source on each species and accumulates the downloaded /
+    rejected counters on the job.
+
+    Args:
+        job_id: Primary key of the Job row to run
+
+    Returns:
+        {"status": "completed", "downloaded": ..., "rejected": ...} on
+        success, or {"error": ...} when the job or its source is unknown.
+
+    Raises:
+        Exception: any unexpected error outside the per-species loop is
+            re-raised after the job has been marked "failed".
+    """
+    logger = get_job_logger(job_id)
+    logger.info(f"Starting scrape job {job_id}")
+
+    db = SessionLocal()
+    job = None
+    try:
+        job = db.query(Job).filter(Job.id == job_id).first()
+        if not job:
+            logger.error(f"Job {job_id} not found")
+            return {"error": "Job not found"}
+
+        logger.info(f"Job: {job.name}, Source: {job.source}")
+
+        # Update job status
+        job.status = "running"
+        job.started_at = datetime.utcnow()
+        job.celery_task_id = self.request.id
+        db.commit()
+
+        # Get species to scrape
+        if job.species_filter:
+            species_ids = json.loads(job.species_filter)
+            query = db.query(Species).filter(Species.id.in_(species_ids))
+            logger.info(f"Filtered to species IDs: {species_ids}")
+        else:
+            query = db.query(Species)
+            logger.info("Scraping all species")
+
+        # Filter by image count if requested
+        if job.only_without_images or job.max_images:
+            from sqlalchemy import func
+            # Subquery to count downloaded images per species
+            image_count_subquery = (
+                db.query(Image.species_id, func.count(Image.id).label("count"))
+                .filter(Image.status == "downloaded")
+                .group_by(Image.species_id)
+                .subquery()
+            )
+            # Left join with the count subquery so species without any
+            # downloaded image are kept (their count is NULL)
+            query = query.outerjoin(
+                image_count_subquery,
+                Species.id == image_count_subquery.c.species_id
+            )
+            downloaded_count = image_count_subquery.c.count
+
+            if job.only_without_images:
+                # Filter where count is NULL or 0
+                query = query.filter(
+                    downloaded_count.is_(None) | (downloaded_count == 0)
+                )
+                logger.info("Filtering to species without images")
+            elif job.max_images:
+                # Filter where count is NULL or less than max_images
+                query = query.filter(
+                    downloaded_count.is_(None) | (downloaded_count < job.max_images)
+                )
+                logger.info(f"Filtering to species with fewer than {job.max_images} images")
+
+        species_list = query.all()
+        species_total = len(species_list)
+        logger.info(f"Total species to scrape: {species_total}")
+
+        job.progress_total = species_total
+        db.commit()
+
+        # Import scraper based on source
+        from app.scrapers import get_scraper
+        scraper = get_scraper(job.source)
+
+        if not scraper:
+            error_msg = f"Unknown source: {job.source}"
+            logger.error(error_msg)
+            job.status = "failed"
+            job.error_message = error_msg
+            job.completed_at = datetime.utcnow()
+            db.commit()
+            return {"error": error_msg}
+
+        logger.info(f"Using scraper: {scraper.name}")
+
+        # Scrape each species
+        for i, species in enumerate(species_list):
+            try:
+                # Update progress
+                job.progress_current = i + 1
+                db.commit()
+
+                logger.info(f"[{i+1}/{species_total}] Scraping: {species.scientific_name}")
+
+                # Update task state for real-time monitoring
+                self.update_state(
+                    state="PROGRESS",
+                    meta={
+                        "current": i + 1,
+                        "total": species_total,
+                        "species": species.scientific_name,
+                    }
+                )
+
+                # Run scraper for this species
+                results = scraper.scrape_species(species, db, logger)
+                downloaded = results.get("downloaded", 0)
+                rejected = results.get("rejected", 0)
+                job.images_downloaded += downloaded
+                job.images_rejected += rejected
+                db.commit()
+
+                logger.info(f"  -> Downloaded: {downloaded}, Rejected: {rejected}")
+
+            except Exception as e:
+                # Reset the session first: after a failed flush/commit every
+                # later commit would fail too, defeating "continue with the
+                # other species".
+                db.rollback()
+                # Log error but continue with other species
+                logger.error(f"Error scraping {species.scientific_name}: {e}", exc_info=True)
+                continue
+
+        # Mark job complete
+        job.status = "completed"
+        job.completed_at = datetime.utcnow()
+        db.commit()
+
+        logger.info(f"Job {job_id} completed. Total downloaded: {job.images_downloaded}, rejected: {job.images_rejected}")
+
+        return {
+            "status": "completed",
+            "downloaded": job.images_downloaded,
+            "rejected": job.images_rejected,
+        }
+
+    except Exception as e:
+        logger.error(f"Job {job_id} failed with error: {e}", exc_info=True)
+        # The session may be in a failed state; roll back so the status
+        # update below can actually be committed.
+        db.rollback()
+        if job:
+            job.status = "failed"
+            job.error_message = str(e)
+            job.completed_at = datetime.utcnow()
+            db.commit()
+        raise
+    finally:
+        db.close()
+
+
+@celery_app.task
+def pause_scrape_job(job_id: int):
+    """Mark a running scrape job as paused and revoke its Celery task.
+
+    The result is {"status": "paused"} whether or not a running job was found.
+    """
+    db = SessionLocal()
+    try:
+        job = db.query(Job).filter(Job.id == job_id).first()
+
+        # Nothing to do unless the job exists and is currently running
+        if not job or job.status != "running":
+            return {"status": "paused"}
+
+        job.status = "paused"
+        db.commit()
+
+        # Revoke the Celery task, terminating it if it is executing
+        task_id = job.celery_task_id
+        if task_id:
+            celery_app.control.revoke(task_id, terminate=True)
+
+        return {"status": "paused"}
+    finally:
+        db.close()
@@ -0,0 +1,193 @@
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+
+from sqlalchemy import func, case, text
+
+from app.workers.celery_app import celery_app
+from app.database import SessionLocal
+from app.models import Species, Image, Job
+from app.models.cached_stats import CachedStats
+from app.config import get_settings
+
+
+def get_directory_size_fast(path: str) -> int:
+    """Sum the sizes in bytes of all regular files below path using os.scandir.
+
+    Symlinks are not followed. Entries or directories that cannot be read
+    are skipped, so an unreadable or missing path yields 0.
+    """
+    size_bytes = 0
+    pending_dirs = [path]
+
+    # Explicit work list instead of recursion; the total is order-independent
+    while pending_dirs:
+        current_dir = pending_dirs.pop()
+        try:
+            with os.scandir(current_dir) as entries:
+                for item in entries:
+                    try:
+                        if item.is_file(follow_symlinks=False):
+                            size_bytes += item.stat(follow_symlinks=False).st_size
+                        elif item.is_dir(follow_symlinks=False):
+                            pending_dirs.append(item.path)
+                    except OSError:
+                        # PermissionError is an OSError; skip this entry
+                        continue
+        except OSError:
+            # Directory vanished or is not readable; it contributes nothing
+            continue
+
+    return size_bytes
+
+
+@celery_app.task
+def refresh_stats():
+    """Recompute the dashboard statistics and store them as one cached JSON row.
+
+    Runs a handful of aggregate SQL queries, measures the disk usage of the
+    images directory and writes the combined result to the CachedStats row
+    with key "dashboard_stats" (created if missing).
+    """
+    print("=== STATS TASK: Starting refresh ===", flush=True)
+
+    def species_rows_to_dicts(rows):
+        # Both species lists share the (id, scientific, common, count) row shape
+        return [
+            {
+                "id": row[0],
+                "scientific_name": row[1],
+                "common_name": row[2],
+                "image_count": row[3],
+            }
+            for row in rows
+        ]
+
+    db = SessionLocal()
+    try:
+        # Global counts, fetched with raw SQL in a single round trip
+        count_row = db.execute(text("""
+            SELECT
+                (SELECT COUNT(*) FROM species) as total_species,
+                (SELECT COUNT(*) FROM images) as total_images,
+                (SELECT COUNT(*) FROM images WHERE status = 'downloaded') as images_downloaded,
+                (SELECT COUNT(*) FROM images WHERE status = 'pending') as images_pending,
+                (SELECT COUNT(*) FROM images WHERE status = 'rejected') as images_rejected
+        """)).fetchone()
+        (
+            total_species,
+            total_images,
+            images_downloaded,
+            images_pending,
+            images_rejected,
+        ) = [count_row[idx] or 0 for idx in range(5)]
+
+        # Per-source breakdown (one GROUP BY query)
+        source_rows = db.execute(text("""
+            SELECT
+                source,
+                COUNT(*) as total,
+                SUM(CASE WHEN status = 'downloaded' THEN 1 ELSE 0 END) as downloaded,
+                SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,
+                SUM(CASE WHEN status = 'rejected' THEN 1 ELSE 0 END) as rejected
+            FROM images
+            GROUP BY source
+        """)).fetchall()
+        sources = []
+        for row in source_rows:
+            sources.append({
+                "source": row[0],
+                "image_count": row[1],
+                "downloaded": row[2] or 0,
+                "pending": row[3] or 0,
+                "rejected": row[4] or 0,
+            })
+
+        # Per-license breakdown of downloaded images
+        license_rows = db.execute(text("""
+            SELECT license, COUNT(*) as count
+            FROM images
+            WHERE status = 'downloaded'
+            GROUP BY license
+        """)).fetchall()
+        licenses = [
+            {"license": row[0], "count": row[1]}
+            for row in license_rows
+        ]
+
+        # Job counts per status
+        job_row = db.execute(text("""
+            SELECT
+                SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) as running,
+                SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,
+                SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
+                SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed
+            FROM jobs
+        """)).fetchone()
+        job_states = ("running", "pending", "completed", "failed")
+        jobs = {
+            state: job_row[idx] or 0
+            for idx, state in enumerate(job_states)
+        }
+
+        # Ten species with the most downloaded images
+        top_species = species_rows_to_dicts(db.execute(text("""
+            SELECT s.id, s.scientific_name, s.common_name, COUNT(i.id) as image_count
+            FROM species s
+            INNER JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
+            GROUP BY s.id
+            ORDER BY image_count DESC
+            LIMIT 10
+        """)).fetchall())
+
+        # Ten species with the fewest downloaded images (below 100)
+        under_represented = species_rows_to_dicts(db.execute(text("""
+            SELECT s.id, s.scientific_name, s.common_name, COALESCE(img_counts.cnt, 0) as image_count
+            FROM species s
+            LEFT JOIN (
+                SELECT species_id, COUNT(*) as cnt
+                FROM images
+                WHERE status = 'downloaded'
+                GROUP BY species_id
+            ) img_counts ON img_counts.species_id = s.id
+            WHERE COALESCE(img_counts.cnt, 0) < 100
+            ORDER BY image_count ASC
+            LIMIT 10
+        """)).fetchall())
+
+        # Disk usage of the images directory, reported in MiB with two decimals
+        images_root = get_settings().images_path
+        disk_usage_mb = round(get_directory_size_fast(images_root) / (1024 * 1024), 2)
+
+        # Assemble the payload (key order is kept as-is for the JSON output)
+        stats = {
+            "total_species": total_species,
+            "total_images": total_images,
+            "images_downloaded": images_downloaded,
+            "images_pending": images_pending,
+            "images_rejected": images_rejected,
+            "disk_usage_mb": disk_usage_mb,
+            "sources": sources,
+            "licenses": licenses,
+            "jobs": jobs,
+            "top_species": top_species,
+            "under_represented": under_represented,
+        }
+
+        # Upsert the cached row
+        cached = db.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()
+        if not cached:
+            db.add(CachedStats(key="dashboard_stats", value=json.dumps(stats)))
+        else:
+            cached.value = json.dumps(stats)
+            cached.updated_at = datetime.utcnow()
+
+        db.commit()
+        print(f"=== STATS TASK: Refreshed (species={total_species}, images={total_images}) ===", flush=True)
+
+        return {"status": "success", "total_species": total_species, "total_images": total_images}
+
+    except Exception as e:
+        print(f"=== STATS TASK ERROR: {e} ===", flush=True)
+        raise
+    finally:
+        db.close()