191 lines
6.6 KiB
Python
191 lines
6.6 KiB
Python
import json
from typing import Optional

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import func, case
from sqlalchemy.orm import Session

from app.database import get_db
from app.models import Species, Image, Job
from app.models.cached_stats import CachedStats
from app.schemas.stats import StatsResponse, SourceStats, LicenseStats, SpeciesStats, JobStats
|
|
|
|
router = APIRouter()
|
|
|
|
|
|
@router.get("", response_model=StatsResponse)
def get_stats(db: Session = Depends(get_db)):
    """Get dashboard statistics from cache (updated every 60s by Celery).

    Returns:
        StatsResponse built from the ``dashboard_stats`` cache row, or an
        all-zero response when no cache row exists yet (first startup,
        before the Celery refresh task has run).
    """
    # Try to get cached stats
    cached = db.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()

    if cached:
        data = json.loads(cached.value)
        # Use .get() with the same zero defaults as the no-cache branch so
        # a cache row written by an older worker version (missing newer
        # keys) degrades gracefully instead of raising KeyError -> HTTP 500.
        return StatsResponse(
            total_species=data.get("total_species", 0),
            total_images=data.get("total_images", 0),
            images_downloaded=data.get("images_downloaded", 0),
            images_pending=data.get("images_pending", 0),
            images_rejected=data.get("images_rejected", 0),
            disk_usage_mb=data.get("disk_usage_mb", 0.0),
            sources=[SourceStats(**s) for s in data.get("sources", [])],
            licenses=[LicenseStats(**l) for l in data.get("licenses", [])],
            jobs=JobStats(
                **data.get("jobs", {"running": 0, "pending": 0, "completed": 0, "failed": 0})
            ),
            top_species=[SpeciesStats(**s) for s in data.get("top_species", [])],
            under_represented=[SpeciesStats(**s) for s in data.get("under_represented", [])],
        )

    # No cache yet - return empty stats (Celery will populate soon)
    # This only happens on first startup before Celery runs
    return StatsResponse(
        total_species=0,
        total_images=0,
        images_downloaded=0,
        images_pending=0,
        images_rejected=0,
        disk_usage_mb=0.0,
        sources=[],
        licenses=[],
        jobs=JobStats(running=0, pending=0, completed=0, failed=0),
        top_species=[],
        under_represented=[],
    )
|
|
|
|
|
|
@router.post("/refresh")
def refresh_stats_now(db: Session = Depends(get_db)):
    """Queue an immediate, manual stats refresh on the Celery worker.

    The ``db`` dependency is kept for interface stability even though this
    handler does not query it directly.
    """
    # Lazy import: the worker module is only needed when this endpoint fires.
    from app.workers import stats_tasks

    stats_tasks.refresh_stats.delay()
    return {"status": "refresh_queued"}
|
|
|
|
|
|
@router.get("/sources")
def get_source_stats(db: Session = Depends(get_db)):
    """Get per-source breakdown."""

    def status_count(status):
        # SUM over a CASE expression: counts rows of the group whose
        # status matches, 0 otherwise.
        return func.sum(case((Image.status == status, 1), else_=0))

    rows = (
        db.query(
            Image.source,
            func.count(Image.id).label("total"),
            status_count("downloaded").label("downloaded"),
            status_count("pending").label("pending"),
            status_count("rejected").label("rejected"),
        )
        .group_by(Image.source)
        .all()
    )

    breakdown = []
    for row in rows:
        # SUM returns NULL for empty groups on some backends; coerce to 0.
        breakdown.append(
            {
                "source": row.source,
                "total": row.total,
                "downloaded": row.downloaded or 0,
                "pending": row.pending or 0,
                "rejected": row.rejected or 0,
            }
        )
    return breakdown
|
|
|
|
|
|
@router.get("/species")
def get_species_stats(
    min_count: int = 0,
    max_count: Optional[int] = None,
    db: Session = Depends(get_db),
):
    """Get per-species counts of downloaded images.

    Args:
        min_count: Only include species with at least this many downloaded
            images (0 disables the filter).
        max_count: If given, only include species with at most this many
            downloaded images. (Annotated ``Optional[int]`` — the original
            ``int = None`` annotation was incorrect, since ``None`` is not
            an ``int``; FastAPI/pydantic need the Optional for correct
            optional-query-parameter handling.)

    Returns:
        List of dicts sorted by image count, descending.
    """
    # Hoist the shared aggregate so the SELECT, both HAVING clauses, and
    # the ORDER BY all use the identical expression.
    image_count = func.count(Image.id)

    query = (
        db.query(
            Species.id,
            Species.scientific_name,
            Species.common_name,
            Species.genus,
            image_count.label("image_count"),
        )
        # Outer join keeps species with zero downloaded images in the result.
        .outerjoin(Image, (Image.species_id == Species.id) & (Image.status == "downloaded"))
        .group_by(Species.id)
    )

    if min_count > 0:
        query = query.having(image_count >= min_count)

    if max_count is not None:
        query = query.having(image_count <= max_count)

    stats = query.order_by(image_count.desc()).all()

    return [
        {
            "id": s.id,
            "scientific_name": s.scientific_name,
            "common_name": s.common_name,
            "genus": s.genus,
            "image_count": s.image_count,
        }
        for s in stats
    ]
|
|
|
|
|
|
@router.get("/distribution")
def get_image_distribution(db: Session = Depends(get_db)):
    """Get distribution of images per species for ML training assessment.

    Returns counts of species at various image thresholds to help
    determine dataset quality for training image classifiers.
    """
    from sqlalchemy import text

    # Single round-trip: a CTE computes downloaded-image counts per species,
    # then one aggregate row bins those counts into histogram buckets and
    # trainability thresholds.
    distribution_sql = text("""
        WITH species_counts AS (
            SELECT
                s.id,
                COUNT(i.id) as cnt
            FROM species s
            LEFT JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
            GROUP BY s.id
        )
        SELECT
            COUNT(*) as total_species,
            SUM(CASE WHEN cnt = 0 THEN 1 ELSE 0 END) as with_0,
            SUM(CASE WHEN cnt >= 1 AND cnt < 10 THEN 1 ELSE 0 END) as with_1_9,
            SUM(CASE WHEN cnt >= 10 AND cnt < 25 THEN 1 ELSE 0 END) as with_10_24,
            SUM(CASE WHEN cnt >= 25 AND cnt < 50 THEN 1 ELSE 0 END) as with_25_49,
            SUM(CASE WHEN cnt >= 50 AND cnt < 100 THEN 1 ELSE 0 END) as with_50_99,
            SUM(CASE WHEN cnt >= 100 AND cnt < 200 THEN 1 ELSE 0 END) as with_100_199,
            SUM(CASE WHEN cnt >= 200 THEN 1 ELSE 0 END) as with_200_plus,
            SUM(CASE WHEN cnt >= 10 THEN 1 ELSE 0 END) as trainable_10,
            SUM(CASE WHEN cnt >= 25 THEN 1 ELSE 0 END) as trainable_25,
            SUM(CASE WHEN cnt >= 50 THEN 1 ELSE 0 END) as trainable_50,
            SUM(CASE WHEN cnt >= 100 THEN 1 ELSE 0 END) as trainable_100,
            AVG(cnt) as avg_images,
            MAX(cnt) as max_images,
            MIN(cnt) as min_images,
            SUM(cnt) as total_images
        FROM species_counts
    """)

    row = db.execute(distribution_sql).fetchone()

    # Unpack the single aggregate row into named locals once, instead of
    # indexing row[0]..row[15] positionally throughout the response.
    (
        total_species,
        with_0,
        with_1_9,
        with_10_24,
        with_25_49,
        with_50_99,
        with_100_199,
        with_200_plus,
        trainable_10,
        trainable_25,
        trainable_50,
        trainable_100,
        avg_images,
        max_images,
        min_images,
        total_images,
    ) = row

    # SUM/AVG/MAX/MIN return NULL when there are no species; coerce to 0.
    return {
        "total_species": total_species or 0,
        "distribution": {
            "0_images": with_0 or 0,
            "1_to_9": with_1_9 or 0,
            "10_to_24": with_10_24 or 0,
            "25_to_49": with_25_49 or 0,
            "50_to_99": with_50_99 or 0,
            "100_to_199": with_100_199 or 0,
            "200_plus": with_200_plus or 0,
        },
        "trainable_species": {
            "min_10_images": trainable_10 or 0,
            "min_25_images": trainable_25 or 0,
            "min_50_images": trainable_50 or 0,
            "min_100_images": trainable_100 or 0,
        },
        "summary": {
            "avg_images_per_species": round(avg_images or 0, 1),
            "max_images": max_images or 0,
            "min_images": min_images or 0,
            "total_downloaded_images": total_images or 0,
        },
        "recommendations": {
            "for_basic_model": f"{trainable_10 or 0} species with 10+ images",
            "for_good_model": f"{trainable_50 or 0} species with 50+ images",
            "for_excellent_model": f"{trainable_100 or 0} species with 100+ images",
        },
    }
|