"""Dashboard statistics API routes.

The expensive aggregate queries are precomputed by a Celery task and
stored in ``CachedStats``; the main ``GET /`` endpoint only reads that
cache, while the remaining endpoints run live queries.
"""
import json
from typing import Optional

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import case, func
from sqlalchemy.orm import Session

from app.database import get_db
from app.models import Image, Job, Species
from app.models.cached_stats import CachedStats
from app.schemas.stats import (
    JobStats,
    LicenseStats,
    SourceStats,
    SpeciesStats,
    StatsResponse,
)

router = APIRouter()


@router.get("", response_model=StatsResponse)
def get_stats(db: Session = Depends(get_db)):
    """Get dashboard statistics from cache (updated every 60s by Celery)."""
    cached = (
        db.query(CachedStats)
        .filter(CachedStats.key == "dashboard_stats")
        .first()
    )
    if cached:
        data = json.loads(cached.value)
        return StatsResponse(
            total_species=data["total_species"],
            total_images=data["total_images"],
            images_downloaded=data["images_downloaded"],
            images_pending=data["images_pending"],
            images_rejected=data["images_rejected"],
            disk_usage_mb=data["disk_usage_mb"],
            sources=[SourceStats(**s) for s in data["sources"]],
            licenses=[LicenseStats(**lic) for lic in data["licenses"]],
            jobs=JobStats(**data["jobs"]),
            top_species=[SpeciesStats(**s) for s in data["top_species"]],
            under_represented=[SpeciesStats(**s) for s in data["under_represented"]],
        )

    # No cache yet - return empty stats (Celery will populate soon).
    # This only happens on first startup before Celery runs.
    return StatsResponse(
        total_species=0,
        total_images=0,
        images_downloaded=0,
        images_pending=0,
        images_rejected=0,
        disk_usage_mb=0.0,
        sources=[],
        licenses=[],
        jobs=JobStats(running=0, pending=0, completed=0, failed=0),
        top_species=[],
        under_represented=[],
    )


@router.post("/refresh")
def refresh_stats_now(db: Session = Depends(get_db)):
    """Manually trigger a stats refresh."""
    # Imported lazily -- presumably to avoid a circular import between
    # the API and worker packages; TODO confirm.
    from app.workers.stats_tasks import refresh_stats

    refresh_stats.delay()
    return {"status": "refresh_queued"}


@router.get("/sources")
def get_source_stats(db: Session = Depends(get_db)):
    """Get per-source breakdown of image counts by status."""
    rows = (
        db.query(
            Image.source,
            func.count(Image.id).label("total"),
            func.sum(case((Image.status == "downloaded", 1), else_=0)).label("downloaded"),
            func.sum(case((Image.status == "pending", 1), else_=0)).label("pending"),
            func.sum(case((Image.status == "rejected", 1), else_=0)).label("rejected"),
        )
        .group_by(Image.source)
        .all()
    )
    return [
        {
            "source": row.source,
            "total": row.total,
            # SUM yields NULL when no row matches the CASE; coerce to 0.
            "downloaded": row.downloaded or 0,
            "pending": row.pending or 0,
            "rejected": row.rejected or 0,
        }
        for row in rows
    ]


@router.get("/species")
def get_species_stats(
    min_count: int = 0,
    max_count: Optional[int] = None,
    db: Session = Depends(get_db),
):
    """Get per-species counts of downloaded images.

    Args:
        min_count: Only include species with at least this many images.
        max_count: If given, only include species with at most this many.
    """
    # Single aggregate expression reused for the label, both HAVING
    # filters, and the ORDER BY.
    image_count = func.count(Image.id)
    query = (
        db.query(
            Species.id,
            Species.scientific_name,
            Species.common_name,
            Species.genus,
            image_count.label("image_count"),
        )
        # Outer join so species with zero downloaded images still appear.
        .outerjoin(
            Image,
            (Image.species_id == Species.id) & (Image.status == "downloaded"),
        )
        .group_by(Species.id)
    )
    if min_count > 0:
        query = query.having(image_count >= min_count)
    if max_count is not None:
        query = query.having(image_count <= max_count)

    return [
        {
            "id": s.id,
            "scientific_name": s.scientific_name,
            "common_name": s.common_name,
            "genus": s.genus,
            "image_count": s.image_count,
        }
        for s in query.order_by(image_count.desc()).all()
    ]


@router.get("/distribution")
def get_image_distribution(db: Session = Depends(get_db)):
    """Get distribution of images per species for ML training assessment.

    Returns counts of species at various image thresholds to help
    determine dataset quality for training image classifiers.
    """
    from sqlalchemy import text

    # One pass over a CTE of per-species downloaded-image counts; each
    # SUM(CASE ...) buckets species by how many images they have.
    distribution_sql = text("""
        WITH species_counts AS (
            SELECT s.id, COUNT(i.id) as cnt
            FROM species s
            LEFT JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
            GROUP BY s.id
        )
        SELECT
            COUNT(*) as total_species,
            SUM(CASE WHEN cnt = 0 THEN 1 ELSE 0 END) as with_0,
            SUM(CASE WHEN cnt >= 1 AND cnt < 10 THEN 1 ELSE 0 END) as with_1_9,
            SUM(CASE WHEN cnt >= 10 AND cnt < 25 THEN 1 ELSE 0 END) as with_10_24,
            SUM(CASE WHEN cnt >= 25 AND cnt < 50 THEN 1 ELSE 0 END) as with_25_49,
            SUM(CASE WHEN cnt >= 50 AND cnt < 100 THEN 1 ELSE 0 END) as with_50_99,
            SUM(CASE WHEN cnt >= 100 AND cnt < 200 THEN 1 ELSE 0 END) as with_100_199,
            SUM(CASE WHEN cnt >= 200 THEN 1 ELSE 0 END) as with_200_plus,
            SUM(CASE WHEN cnt >= 10 THEN 1 ELSE 0 END) as trainable_10,
            SUM(CASE WHEN cnt >= 25 THEN 1 ELSE 0 END) as trainable_25,
            SUM(CASE WHEN cnt >= 50 THEN 1 ELSE 0 END) as trainable_50,
            SUM(CASE WHEN cnt >= 100 THEN 1 ELSE 0 END) as trainable_100,
            AVG(cnt) as avg_images,
            MAX(cnt) as max_images,
            MIN(cnt) as min_images,
            SUM(cnt) as total_images
        FROM species_counts
    """)

    # Unpack the 16 columns by name instead of indexing result[0..15],
    # so a column reorder fails loudly rather than scrambling the API.
    (
        total_species,
        with_0,
        with_1_9,
        with_10_24,
        with_25_49,
        with_50_99,
        with_100_199,
        with_200_plus,
        trainable_10,
        trainable_25,
        trainable_50,
        trainable_100,
        avg_images,
        max_images,
        min_images,
        total_images,
    ) = db.execute(distribution_sql).fetchone()

    return {
        "total_species": total_species or 0,
        "distribution": {
            "0_images": with_0 or 0,
            "1_to_9": with_1_9 or 0,
            "10_to_24": with_10_24 or 0,
            "25_to_49": with_25_49 or 0,
            "50_to_99": with_50_99 or 0,
            "100_to_199": with_100_199 or 0,
            "200_plus": with_200_plus or 0,
        },
        "trainable_species": {
            "min_10_images": trainable_10 or 0,
            "min_25_images": trainable_25 or 0,
            "min_50_images": trainable_50 or 0,
            "min_100_images": trainable_100 or 0,
        },
        "summary": {
            "avg_images_per_species": round(avg_images or 0, 1),
            "max_images": max_images or 0,
            "min_images": min_images or 0,
            "total_downloaded_images": total_images or 0,
        },
        "recommendations": {
            "for_basic_model": f"{trainable_10 or 0} species with 10+ images",
            "for_good_model": f"{trainable_50 or 0} species with 50+ images",
            "for_excellent_model": f"{trainable_100 or 0} species with 100+ images",
        },
    }