Initial commit — PlantGuideScraper project
This commit is contained in:
190
backend/app/api/stats.py
Normal file
190
backend/app/api/stats.py
Normal file
@@ -0,0 +1,190 @@
|
||||
import json
from typing import Optional

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import func, case
from sqlalchemy.orm import Session

from app.database import get_db
from app.models import Species, Image, Job
from app.models.cached_stats import CachedStats
from app.schemas.stats import StatsResponse, SourceStats, LicenseStats, SpeciesStats, JobStats
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("", response_model=StatsResponse)
def get_stats(db: Session = Depends(get_db)):
    """Get dashboard statistics from cache (updated every 60s by Celery)."""
    # Fetch the pre-computed stats blob maintained by the Celery beat task.
    cached = (
        db.query(CachedStats)
        .filter(CachedStats.key == "dashboard_stats")
        .first()
    )

    if not cached:
        # No cache row yet - return empty stats (Celery will populate soon).
        # This only happens on first startup before the task has run once.
        return StatsResponse(
            total_species=0,
            total_images=0,
            images_downloaded=0,
            images_pending=0,
            images_rejected=0,
            disk_usage_mb=0.0,
            sources=[],
            licenses=[],
            jobs=JobStats(running=0, pending=0, completed=0, failed=0),
            top_species=[],
            under_represented=[],
        )

    # Cached value is a JSON document mirroring the StatsResponse schema.
    payload = json.loads(cached.value)
    return StatsResponse(
        total_species=payload["total_species"],
        total_images=payload["total_images"],
        images_downloaded=payload["images_downloaded"],
        images_pending=payload["images_pending"],
        images_rejected=payload["images_rejected"],
        disk_usage_mb=payload["disk_usage_mb"],
        sources=[SourceStats(**entry) for entry in payload["sources"]],
        licenses=[LicenseStats(**entry) for entry in payload["licenses"]],
        jobs=JobStats(**payload["jobs"]),
        top_species=[SpeciesStats(**entry) for entry in payload["top_species"]],
        under_represented=[SpeciesStats(**entry) for entry in payload["under_represented"]],
    )
|
||||
|
||||
|
||||
@router.post("/refresh")
def refresh_stats_now(db: Session = Depends(get_db)):
    """Manually trigger a stats refresh."""
    # Imported inside the handler rather than at module top —
    # presumably to avoid importing Celery machinery at startup; confirm.
    from app.workers.stats_tasks import refresh_stats

    # Fire-and-forget: queue the task and report that it was enqueued.
    refresh_stats.delay()
    return {"status": "refresh_queued"}
|
||||
|
||||
|
||||
@router.get("/sources")
def get_source_stats(db: Session = Depends(get_db)):
    """Get per-source breakdown."""

    def count_with_status(status):
        # Conditional sum: 1 for images in the given status, 0 otherwise.
        return func.sum(case((Image.status == status, 1), else_=0))

    rows = (
        db.query(
            Image.source,
            func.count(Image.id).label("total"),
            count_with_status("downloaded").label("downloaded"),
            count_with_status("pending").label("pending"),
            count_with_status("rejected").label("rejected"),
        )
        .group_by(Image.source)
        .all()
    )

    breakdown = []
    for row in rows:
        # SUM over an empty/NULL set yields NULL, hence the `or 0` guards.
        breakdown.append(
            {
                "source": row.source,
                "total": row.total,
                "downloaded": row.downloaded or 0,
                "pending": row.pending or 0,
                "rejected": row.rejected or 0,
            }
        )
    return breakdown
|
||||
|
||||
|
||||
@router.get("/species")
def get_species_stats(
    min_count: int = 0,
    # Fixed annotation: was `int = None`, which is an invalid hint and is
    # rejected by modern FastAPI/Pydantic for a non-optional type.
    max_count: Optional[int] = None,
    db: Session = Depends(get_db),
):
    """Get per-species image counts.

    Counts only images with status "downloaded". Species with zero such
    images are included (outer join) unless filtered out by `min_count`.

    Args:
        min_count: Include only species with at least this many downloaded
            images (0 disables the lower bound).
        max_count: If given, include only species with at most this many.
        db: Database session (injected).

    Returns:
        List of dicts (id, scientific_name, common_name, genus,
        image_count), ordered by image count descending.
    """
    # Single COUNT expression reused for SELECT, HAVING, and ORDER BY so
    # the three clauses cannot drift apart.
    image_count = func.count(Image.id)

    query = (
        db.query(
            Species.id,
            Species.scientific_name,
            Species.common_name,
            Species.genus,
            image_count.label("image_count"),
        )
        # Outer join keeps species that have no downloaded images.
        .outerjoin(Image, (Image.species_id == Species.id) & (Image.status == "downloaded"))
        .group_by(Species.id)
    )

    if min_count > 0:
        query = query.having(image_count >= min_count)

    if max_count is not None:
        query = query.having(image_count <= max_count)

    stats = query.order_by(image_count.desc()).all()

    return [
        {
            "id": s.id,
            "scientific_name": s.scientific_name,
            "common_name": s.common_name,
            "genus": s.genus,
            "image_count": s.image_count,
        }
        for s in stats
    ]
|
||||
|
||||
|
||||
@router.get("/distribution")
def get_image_distribution(db: Session = Depends(get_db)):
    """Get distribution of images per species for ML training assessment.

    Returns counts of species at various image thresholds to help
    determine dataset quality for training image classifiers.
    """
    from sqlalchemy import text

    # Get image counts per species using optimized raw SQL.
    # NOTE(review): table names 'species'/'images' are hard-coded here and
    # must match the ORM models' __tablename__ values — confirm if either
    # model is renamed.
    # The CTE counts only images with status 'downloaded' per species; the
    # outer SELECT buckets those counts into fixed histogram ranges.
    distribution_sql = text("""
        WITH species_counts AS (
            SELECT
                s.id,
                COUNT(i.id) as cnt
            FROM species s
            LEFT JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
            GROUP BY s.id
        )
        SELECT
            COUNT(*) as total_species,
            SUM(CASE WHEN cnt = 0 THEN 1 ELSE 0 END) as with_0,
            SUM(CASE WHEN cnt >= 1 AND cnt < 10 THEN 1 ELSE 0 END) as with_1_9,
            SUM(CASE WHEN cnt >= 10 AND cnt < 25 THEN 1 ELSE 0 END) as with_10_24,
            SUM(CASE WHEN cnt >= 25 AND cnt < 50 THEN 1 ELSE 0 END) as with_25_49,
            SUM(CASE WHEN cnt >= 50 AND cnt < 100 THEN 1 ELSE 0 END) as with_50_99,
            SUM(CASE WHEN cnt >= 100 AND cnt < 200 THEN 1 ELSE 0 END) as with_100_199,
            SUM(CASE WHEN cnt >= 200 THEN 1 ELSE 0 END) as with_200_plus,
            SUM(CASE WHEN cnt >= 10 THEN 1 ELSE 0 END) as trainable_10,
            SUM(CASE WHEN cnt >= 25 THEN 1 ELSE 0 END) as trainable_25,
            SUM(CASE WHEN cnt >= 50 THEN 1 ELSE 0 END) as trainable_50,
            SUM(CASE WHEN cnt >= 100 THEN 1 ELSE 0 END) as trainable_100,
            AVG(cnt) as avg_images,
            MAX(cnt) as max_images,
            MIN(cnt) as min_images,
            SUM(cnt) as total_images
        FROM species_counts
    """)

    # Single-row result; one aggregate per SELECT column above.
    result = db.execute(distribution_sql).fetchone()

    # IMPORTANT: the positional indices result[0]..result[15] must stay in
    # sync with the SELECT column order in the SQL above
    # (0=total_species, 1=with_0, ... 11=trainable_100, 12=avg_images,
    # 13=max_images, 14=min_images, 15=total_images).
    # The `or 0` guards cover NULL aggregates when the species table is empty.
    return {
        "total_species": result[0] or 0,
        "distribution": {
            "0_images": result[1] or 0,
            "1_to_9": result[2] or 0,
            "10_to_24": result[3] or 0,
            "25_to_49": result[4] or 0,
            "50_to_99": result[5] or 0,
            "100_to_199": result[6] or 0,
            "200_plus": result[7] or 0,
        },
        "trainable_species": {
            "min_10_images": result[8] or 0,
            "min_25_images": result[9] or 0,
            "min_50_images": result[10] or 0,
            "min_100_images": result[11] or 0,
        },
        "summary": {
            "avg_images_per_species": round(result[12] or 0, 1),
            "max_images": result[13] or 0,
            "min_images": result[14] or 0,
            "total_downloaded_images": result[15] or 0,
        },
        # Human-readable hints derived from the trainable_* aggregates above.
        "recommendations": {
            "for_basic_model": f"{result[8] or 0} species with 10+ images",
            "for_good_model": f"{result[10] or 0} species with 50+ images",
            "for_excellent_model": f"{result[11] or 0} species with 100+ images",
        }
    }
|
||||
Reference in New Issue
Block a user