Files
PlantGuideScraper/backend/app/api/images.py
2026-04-12 09:54:27 -05:00

442 lines
14 KiB
Python

import os
import shutil
import uuid
from pathlib import Path
from typing import Optional, List
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from sqlalchemy import func
from PIL import Image as PILImage
from app.database import get_db
from app.models import Image, Species
from app.schemas.image import ImageResponse, ImageListResponse
from app.config import get_settings
# Shared router for all /images endpoints; mounted by the app factory.
router = APIRouter()
# Settings are resolved once at import time (get_settings is expected to cache).
settings = get_settings()
def _apply_image_filters(query, species_id, source, license, status, min_quality):
    """Apply the shared Image column filters to *query* and return it.

    ``species_id`` and ``min_quality`` are compared with ``is not None`` so
    that 0 / 0.0 are honored as real filter values (a plain truthiness test
    silently dropped ``min_quality=0.0``).  The string filters keep the
    truthiness test so that an empty string still means "no filter".
    """
    if species_id is not None:
        query = query.filter(Image.species_id == species_id)
    if source:
        query = query.filter(Image.source == source)
    if license:
        query = query.filter(Image.license == license)
    if status:
        query = query.filter(Image.status == status)
    if min_quality is not None:
        query = query.filter(Image.quality_score >= min_quality)
    return query


@router.get("", response_model=ImageListResponse)
def list_images(
    page: int = Query(1, ge=1),
    page_size: int = Query(50, ge=1, le=200),
    species_id: Optional[int] = None,
    source: Optional[str] = None,
    license: Optional[str] = None,
    status: Optional[str] = None,
    min_quality: Optional[float] = None,
    search: Optional[str] = None,
    db: Session = Depends(get_db),
):
    """List images with pagination and filters.

    Filters on Image columns (species, source, license, status, minimum
    quality score) plus an optional case-insensitive ``search`` against the
    related Species scientific/common names.  Returns a page of
    ``ImageResponse`` items ordered by newest first, with total/page counts.
    """
    # joinedload fetches each row's Species in the same query, avoiding one
    # extra SELECT per image when building species_name below.
    from sqlalchemy.orm import joinedload

    query = _apply_image_filters(
        db.query(Image).options(joinedload(Image.species)),
        species_id, source, license, status, min_quality,
    )
    if search:
        search_term = f"%{search}%"
        query = query.join(Species).filter(
            (Species.scientific_name.ilike(search_term)) |
            (Species.common_name.ilike(search_term))
        )
        total = query.count()
    else:
        # No search: count with a plain COUNT(*) (no join / eager-load) for
        # better performance, applying the same column filters.
        total = _apply_image_filters(
            db.query(func.count(Image.id)),
            species_id, source, license, status, min_quality,
        ).scalar()
    # Ceiling division; 0 rows -> 0 pages.
    pages = (total + page_size - 1) // page_size
    images = (
        query.order_by(Image.created_at.desc())
        .offset((page - 1) * page_size)
        .limit(page_size)
        .all()
    )
    items = [
        ImageResponse(
            id=img.id,
            species_id=img.species_id,
            species_name=img.species.scientific_name if img.species else None,
            source=img.source,
            source_id=img.source_id,
            url=img.url,
            local_path=img.local_path,
            license=img.license,
            attribution=img.attribution,
            width=img.width,
            height=img.height,
            quality_score=img.quality_score,
            status=img.status,
            created_at=img.created_at,
        )
        for img in images
    ]
    return ImageListResponse(
        items=items,
        total=total,
        page=page,
        page_size=page_size,
        pages=pages,
    )
@router.get("/sources")
def list_sources(db: Session = Depends(get_db)):
    """Return every distinct image source recorded in the database."""
    rows = db.query(Image.source).distinct().all()
    # Each row is a one-column tuple; unpack to plain values.
    return [value for (value,) in rows]
@router.get("/licenses")
def list_licenses(db: Session = Depends(get_db)):
    """Return every distinct license string recorded in the database."""
    rows = db.query(Image.license).distinct().all()
    # Each row is a one-column tuple; unpack to plain values.
    return [value for (value,) in rows]
@router.post("/process-pending")
def process_pending_images(
    source: Optional[str] = None,
    db: Session = Depends(get_db),
):
    """Queue all pending images for download and processing.

    Counts rows with status ``pending`` (optionally narrowed to one source),
    then dispatches a single Celery batch task covering them.
    """
    from app.workers.quality_tasks import batch_process_pending_images

    count_query = db.query(func.count(Image.id)).filter(Image.status == "pending")
    if source:
        count_query = count_query.filter(Image.source == source)
    # Snapshot the count before dispatching, so the reported number reflects
    # what was pending at queue time.
    pending = count_query.scalar()
    task = batch_process_pending_images.delay(source=source)
    return {"pending_count": pending, "task_id": task.id}
@router.get("/process-pending/status/{task_id}")
def process_pending_status(task_id: str):
    """Report the state of a batch processing task, with progress if known."""
    from app.workers.celery_app import celery_app

    async_result = celery_app.AsyncResult(task_id)
    # Celery states: PENDING, STARTED, PROGRESS, SUCCESS, FAILURE
    current_state = async_result.state
    payload = {"task_id": task_id, "state": current_state}
    # Progress metadata lives in .info while running and .result on success.
    if current_state == "PROGRESS":
        meta = async_result.info
    elif current_state == "SUCCESS":
        meta = async_result.result
    else:
        meta = None
    if isinstance(meta, dict):
        payload["queued"] = meta.get("queued", 0)
        payload["total"] = meta.get("total", 0)
    return payload
@router.get("/{image_id}", response_model=ImageResponse)
def get_image(image_id: int, db: Session = Depends(get_db)):
    """Fetch a single image record by primary key (404 if absent)."""
    record = db.query(Image).filter(Image.id == image_id).first()
    if record is None:
        raise HTTPException(status_code=404, detail="Image not found")
    # species_name is derived from the relationship; every other field is
    # copied straight off the ORM row.
    copied_fields = (
        "id", "species_id", "source", "source_id", "url", "local_path",
        "license", "attribution", "width", "height", "quality_score",
        "status", "created_at",
    )
    data = {name: getattr(record, name) for name in copied_fields}
    species_name = record.species.scientific_name if record.species else None
    return ImageResponse(species_name=species_name, **data)
@router.get("/{image_id}/file")
def get_image_file(image_id: int, db: Session = Depends(get_db)):
    """Serve the stored image file for an image record.

    Raises 404 when the record does not exist, has no local file, or the
    file is missing on disk (previously a missing file crashed FileResponse
    with a 500).
    """
    import mimetypes

    image = db.query(Image).filter(Image.id == image_id).first()
    if not image:
        raise HTTPException(status_code=404, detail="Image not found")
    if not image.local_path or not os.path.exists(image.local_path):
        raise HTTPException(status_code=404, detail="Image file not available")
    # The importer accepts .jpg/.jpeg/.png, so guess the media type from the
    # file name instead of hard-coding image/jpeg; fall back to jpeg.
    media_type = mimetypes.guess_type(image.local_path)[0] or "image/jpeg"
    return FileResponse(image.local_path, media_type=media_type)
@router.delete("/{image_id}")
def delete_image(image_id: int, db: Session = Depends(get_db)):
    """Delete an image record and its on-disk file, if one exists.

    Raises 404 when no record matches ``image_id``.
    """
    image = db.query(Image).filter(Image.id == image_id).first()
    if not image:
        raise HTTPException(status_code=404, detail="Image not found")
    # Remove the stored file before the DB row (os is imported at module
    # level; the previous function-local import was redundant).
    if image.local_path and os.path.exists(image.local_path):
        os.remove(image.local_path)
    db.delete(image)
    db.commit()
    return {"status": "deleted"}
@router.post("/bulk-delete")
def bulk_delete_images(
    image_ids: List[int],
    db: Session = Depends(get_db),
):
    """Delete multiple image records and their on-disk files.

    Unknown ids are ignored; returns the number actually deleted.
    """
    # Short-circuit an empty request instead of issuing an IN () query.
    if not image_ids:
        return {"deleted": 0}
    images = db.query(Image).filter(Image.id.in_(image_ids)).all()
    deleted = 0
    for image in images:
        # os is imported at module level; the previous function-local import
        # was redundant.
        if image.local_path and os.path.exists(image.local_path):
            os.remove(image.local_path)
        db.delete(image)
        deleted += 1
    db.commit()
    return {"deleted": deleted}
@router.get("/import/scan")
def scan_imports(db: Session = Depends(get_db)):
    """Scan the imports folder and return what can be imported.

    Expected structure: imports/{source}/{species_name}/*.jpg
    """
    root = Path(settings.imports_path)
    if not root.exists():
        return {
            "available": False,
            "message": f"Imports folder not found: {root}",
            "sources": [],
            "total_images": 0,
            "matched_species": 0,
            "unmatched_species": [],
        }
    report = {
        "available": True,
        "sources": [],
        "total_images": 0,
        "matched_species": 0,
        "unmatched_species": [],
    }
    # Index species by scientific name, keyed both with spaces and with
    # underscores so folder names match either form.
    lookup = {}
    for sp in db.query(Species).all():
        lowered = sp.scientific_name.lower()
        lookup[lowered] = sp
        lookup[sp.scientific_name.replace(" ", "_").lower()] = sp
    already_reported = set()
    for src_dir in root.iterdir():
        if not src_dir.is_dir():
            continue
        entry = {
            "name": src_dir.name,
            "species_count": 0,
            "image_count": 0,
        }
        for sp_dir in src_dir.iterdir():
            if not sp_dir.is_dir():
                continue
            display_name = sp_dir.name.replace("_", " ")
            # Collect candidate images (lowercase extensions only).
            found = [
                f
                for pattern in ("*.jpg", "*.jpeg", "*.png")
                for f in sp_dir.glob(pattern)
            ]
            if not found:
                continue
            entry["image_count"] += len(found)
            report["total_images"] += len(found)
            if display_name.lower() in lookup or sp_dir.name.lower() in lookup:
                entry["species_count"] += 1
                report["matched_species"] += 1
            elif display_name not in already_reported:
                # De-duplicate unmatched names across sources, keeping
                # first-seen order in the list.
                already_reported.add(display_name)
                report["unmatched_species"].append(display_name)
        if entry["image_count"] > 0:
            report["sources"].append(entry)
    return report
@router.post("/import/run")
def run_import(
    move_files: bool = Query(False, description="Move files instead of copy"),
    db: Session = Depends(get_db),
):
    """Import images from the imports folder into the managed images tree.

    Expected structure: imports/{source}/{species_name}/*.jpg
    Each file is copied (or moved when ``move_files`` is true) to
    images/{species_name}/{source}_{original_stem}_{8-hex-uuid}{ext}
    and an Image row with status "downloaded" is created for it.

    Returns counts of imported and skipped files plus at most 20 error
    strings.  Raises HTTP 400 when the imports folder does not exist.
    """
    imports_path = Path(settings.imports_path)
    images_path = Path(settings.images_path)
    if not imports_path.exists():
        raise HTTPException(status_code=400, detail="Imports folder not found")
    # Index species by scientific name, keyed both with spaces and with
    # underscores so folder names match either form.
    species_map = {}
    for species in db.query(Species).all():
        species_map[species.scientific_name.lower()] = species
        species_map[species.scientific_name.replace(" ", "_").lower()] = species
    imported = 0
    skipped = 0
    errors = []
    # Scan source folders
    for source_dir in imports_path.iterdir():
        if not source_dir.is_dir():
            continue
        source_name = source_dir.name
        # Scan species folders within source
        for species_dir in source_dir.iterdir():
            if not species_dir.is_dir():
                continue
            species_name = species_dir.name.replace("_", " ")
            species_key = species_name.lower()
            # Find matching species; folders with no match are skipped
            # silently (use /import/scan to list unmatched folders).
            species = species_map.get(species_key) or species_map.get(species_dir.name.lower())
            if not species:
                continue
            # Create target directory
            target_dir = images_path / species.scientific_name.replace(" ", "_")
            target_dir.mkdir(parents=True, exist_ok=True)
            # Process images (lowercase extensions only — glob is case-sensitive)
            image_files = list(species_dir.glob("*.jpg")) + \
                list(species_dir.glob("*.jpeg")) + \
                list(species_dir.glob("*.png"))
            for img_file in image_files:
                try:
                    # Generate unique filename; normalize .jpeg to .jpg
                    ext = img_file.suffix.lower()
                    if ext == ".jpeg":
                        ext = ".jpg"
                    new_filename = f"{source_name}_{img_file.stem}_{uuid.uuid4().hex[:8]}{ext}"
                    target_path = target_dir / new_filename
                    # Skip files already imported; the dedup key is
                    # (species, source, original file stem), not the
                    # uuid-suffixed target name.
                    existing = db.query(Image).filter(
                        Image.species_id == species.id,
                        Image.source == source_name,
                        Image.source_id == img_file.stem,
                    ).first()
                    if existing:
                        skipped += 1
                        continue
                    # Read dimensions; an unreadable image still imports,
                    # just with unknown width/height.
                    try:
                        with PILImage.open(img_file) as pil_img:
                            width, height = pil_img.size
                    except Exception:
                        width, height = None, None
                    # Copy or move file
                    if move_files:
                        shutil.move(str(img_file), str(target_path))
                    else:
                        shutil.copy2(str(img_file), str(target_path))
                    # Create database record; url records the original
                    # source location as a file:// URI.
                    image = Image(
                        species_id=species.id,
                        source=source_name,
                        source_id=img_file.stem,
                        url=f"file://{img_file}",
                        local_path=str(target_path),
                        license="unknown",
                        width=width,
                        height=height,
                        status="downloaded",
                    )
                    db.add(image)
                    imported += 1
                except Exception as e:
                    # Record the failure and continue with the next file.
                    errors.append(f"{img_file}: {str(e)}")
            # Commit after each species to avoid large transactions
            db.commit()
    return {
        "imported": imported,
        "skipped": skipped,
        "errors": errors[:20],
    }