Initial commit — PlantGuideScraper project
This commit is contained in:
24
backend/Dockerfile
Normal file
24
backend/Dockerfile
Normal file
@@ -0,0 +1,24 @@
|
||||
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies (C/C++ compilers and libffi headers).
# --no-install-recommends keeps optional "recommended" packages out of the
# image; the apt package lists are removed in the same layer to save space.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    libffi-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies.
# requirements.txt is copied on its own first so this layer stays cached
# until the requirements themselves change.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create data directories
RUN mkdir -p /data/db /data/images /data/exports

EXPOSE 8000

# Serve the ASGI app on all interfaces, on the exposed port.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
19
backend/add_indexes.py
Normal file
19
backend/add_indexes.py
Normal file
@@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
"""Add missing database indexes."""
|
||||
from sqlalchemy import text
|
||||
from app.database import engine
|
||||
|
||||
# Every statement uses IF NOT EXISTS, so running the script again is harmless.
INDEX_STATEMENTS = (
    # Single column indexes
    'CREATE INDEX IF NOT EXISTS ix_images_license ON images(license)',
    'CREATE INDEX IF NOT EXISTS ix_images_status ON images(status)',
    'CREATE INDEX IF NOT EXISTS ix_images_source ON images(source)',
    'CREATE INDEX IF NOT EXISTS ix_images_species_id ON images(species_id)',
    'CREATE INDEX IF NOT EXISTS ix_images_phash ON images(phash)',
    # Composite indexes for common query patterns
    'CREATE INDEX IF NOT EXISTS ix_images_species_status ON images(species_id, status)',
    'CREATE INDEX IF NOT EXISTS ix_images_status_created ON images(status, created_at)',
)

with engine.connect() as conn:
    for statement in INDEX_STATEMENTS:
        conn.execute(text(statement))

    # Persist all index creations in one commit.
    conn.commit()
    print('All indexes created successfully')
|
||||
42
backend/alembic.ini
Normal file
42
backend/alembic.ini
Normal file
@@ -0,0 +1,42 @@
|
||||
[alembic]
|
||||
script_location = alembic
|
||||
prepend_sys_path = .
|
||||
version_path_separator = os
|
||||
|
||||
sqlalchemy.url = sqlite:////data/db/plants.sqlite
|
||||
|
||||
[post_write_hooks]
|
||||
|
||||
[loggers]
|
||||
keys = root,sqlalchemy,alembic
|
||||
|
||||
[handlers]
|
||||
keys = console
|
||||
|
||||
[formatters]
|
||||
keys = generic
|
||||
|
||||
[logger_root]
|
||||
level = WARN
|
||||
handlers = console
|
||||
qualname =
|
||||
|
||||
[logger_sqlalchemy]
|
||||
level = WARN
|
||||
handlers =
|
||||
qualname = sqlalchemy.engine
|
||||
|
||||
[logger_alembic]
|
||||
level = INFO
|
||||
handlers =
|
||||
qualname = alembic
|
||||
|
||||
[handler_console]
|
||||
class = StreamHandler
|
||||
args = (sys.stderr,)
|
||||
level = NOTSET
|
||||
formatter = generic
|
||||
|
||||
[formatter_generic]
|
||||
format = %(levelname)-5.5s [%(name)s] %(message)s
|
||||
datefmt = %H:%M:%S
|
||||
54
backend/alembic/env.py
Normal file
54
backend/alembic/env.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from logging.config import fileConfig
|
||||
|
||||
from sqlalchemy import engine_from_config
|
||||
from sqlalchemy import pool
|
||||
|
||||
from alembic import context
|
||||
|
||||
# Import models for autogenerate
|
||||
from app.database import Base
|
||||
from app.models import Species, Image, Job, ApiKey, Export
|
||||
|
||||
# Metadata of the application's models; handed to Alembic in both run modes.
target_metadata = Base.metadata

# Alembic configuration object for the current invocation.
config = context.config

# Configure Python logging from the ini file, when one was supplied.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)
|
||||
|
||||
|
||||
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    The context is configured from the ``sqlalchemy.url`` ini option alone;
    no Engine or connection is created here.
    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
|
||||
|
||||
|
||||
def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    An Engine is built from the ``sqlalchemy.``-prefixed options of the ini
    section in use (with ``NullPool``), and the migrations run on a
    connection obtained from it.
    """
    ini_section = config.get_section(config.config_ini_section, {})
    engine = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=target_metadata,
        )

        with context.begin_transaction():
            context.run_migrations()
|
||||
|
||||
|
||||
# Choose the runner that matches the mode Alembic was invoked in.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
|
||||
26
backend/alembic/script.py.mako
Normal file
26
backend/alembic/script.py.mako
Normal file
@@ -0,0 +1,26 @@
|
||||
"""${message}
|
||||
|
||||
Revision ID: ${up_revision}
|
||||
Revises: ${down_revision | comma,n}
|
||||
Create Date: ${create_date}
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
${imports if imports else ""}
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = ${repr(up_revision)}
|
||||
down_revision: Union[str, None] = ${repr(down_revision)}
|
||||
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
|
||||
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
${upgrades if upgrades else "pass"}
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
${downgrades if downgrades else "pass"}
|
||||
112
backend/alembic/versions/001_initial.py
Normal file
112
backend/alembic/versions/001_initial.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Initial migration
|
||||
|
||||
Revision ID: 001
|
||||
Revises:
|
||||
Create Date: 2024-01-01
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
revision: str = '001'
|
||||
down_revision: Union[str, None] = None
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the initial schema: species, api_keys, images, jobs, exports.

    The ``(source, source_id)`` uniqueness rule on ``images`` is declared
    inside ``create_table`` rather than added afterwards with
    ``op.create_unique_constraint``. The configured database is SQLite,
    which cannot add a constraint to an existing table through ALTER TABLE,
    so the separate call fails there outside of batch mode.

    NOTE: ``default=...`` on the columns below is a Python-side default used
    by SQLAlchemy at INSERT time; it is not emitted as a DEFAULT clause in
    the generated DDL.
    """
    # Species table
    op.create_table(
        'species',
        sa.Column('id', sa.Integer(), primary_key=True),
        sa.Column('scientific_name', sa.String(), nullable=False, unique=True),
        sa.Column('common_name', sa.String(), nullable=True),
        sa.Column('genus', sa.String(), nullable=True),
        sa.Column('family', sa.String(), nullable=True),
        sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
    )
    op.create_index('ix_species_scientific_name', 'species', ['scientific_name'])
    op.create_index('ix_species_genus', 'species', ['genus'])

    # API Keys table
    op.create_table(
        'api_keys',
        sa.Column('id', sa.Integer(), primary_key=True),
        sa.Column('source', sa.String(), nullable=False, unique=True),
        sa.Column('api_key', sa.String(), nullable=False),
        sa.Column('api_secret', sa.String(), nullable=True),
        sa.Column('rate_limit_per_sec', sa.Float(), default=1.0),
        sa.Column('enabled', sa.Boolean(), default=True),
    )

    # Images table
    op.create_table(
        'images',
        sa.Column('id', sa.Integer(), primary_key=True),
        sa.Column('species_id', sa.Integer(), sa.ForeignKey('species.id'), nullable=False),
        sa.Column('source', sa.String(), nullable=False),
        sa.Column('source_id', sa.String(), nullable=True),
        sa.Column('url', sa.String(), nullable=False),
        sa.Column('local_path', sa.String(), nullable=True),
        sa.Column('license', sa.String(), nullable=False),
        sa.Column('attribution', sa.String(), nullable=True),
        sa.Column('width', sa.Integer(), nullable=True),
        sa.Column('height', sa.Integer(), nullable=True),
        sa.Column('phash', sa.String(), nullable=True),
        sa.Column('quality_score', sa.Float(), nullable=True),
        sa.Column('status', sa.String(), default='pending'),
        sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
        # Declared inline so it is part of CREATE TABLE (see docstring).
        sa.UniqueConstraint('source', 'source_id', name='uq_source_source_id'),
    )
    op.create_index('ix_images_species_id', 'images', ['species_id'])
    op.create_index('ix_images_source', 'images', ['source'])
    op.create_index('ix_images_status', 'images', ['status'])
    op.create_index('ix_images_phash', 'images', ['phash'])

    # Jobs table
    op.create_table(
        'jobs',
        sa.Column('id', sa.Integer(), primary_key=True),
        sa.Column('name', sa.String(), nullable=False),
        sa.Column('source', sa.String(), nullable=False),
        sa.Column('species_filter', sa.Text(), nullable=True),
        sa.Column('status', sa.String(), default='pending'),
        sa.Column('progress_current', sa.Integer(), default=0),
        sa.Column('progress_total', sa.Integer(), default=0),
        sa.Column('images_downloaded', sa.Integer(), default=0),
        sa.Column('images_rejected', sa.Integer(), default=0),
        sa.Column('celery_task_id', sa.String(), nullable=True),
        sa.Column('started_at', sa.DateTime(), nullable=True),
        sa.Column('completed_at', sa.DateTime(), nullable=True),
        sa.Column('error_message', sa.Text(), nullable=True),
        sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
    )
    op.create_index('ix_jobs_status', 'jobs', ['status'])

    # Exports table
    op.create_table(
        'exports',
        sa.Column('id', sa.Integer(), primary_key=True),
        sa.Column('name', sa.String(), nullable=False),
        sa.Column('filter_criteria', sa.Text(), nullable=True),
        sa.Column('train_split', sa.Float(), default=0.8),
        sa.Column('status', sa.String(), default='pending'),
        sa.Column('file_path', sa.String(), nullable=True),
        sa.Column('file_size', sa.Integer(), nullable=True),
        sa.Column('species_count', sa.Integer(), nullable=True),
        sa.Column('image_count', sa.Integer(), nullable=True),
        sa.Column('celery_task_id', sa.String(), nullable=True),
        sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
        sa.Column('completed_at', sa.DateTime(), nullable=True),
        sa.Column('error_message', sa.Text(), nullable=True),
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop every table created by ``upgrade``.

    ``images`` (which references ``species``) is dropped before ``species``.
    """
    for table_name in ('exports', 'jobs', 'images', 'api_keys', 'species'):
        op.drop_table(table_name)
|
||||
53
backend/alembic/versions/002_add_cached_stats_and_indexes.py
Normal file
53
backend/alembic/versions/002_add_cached_stats_and_indexes.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""Add cached_stats table and license index
|
||||
|
||||
Revision ID: 002
|
||||
Revises: 001
|
||||
Create Date: 2025-01-25
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
revision: str = '002'
|
||||
down_revision: Union[str, None] = '001'
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the cached_stats table, the images.license index and a jobs column.

    The index and column additions are best-effort, because either object
    may already exist in the database. A failure no longer passes silently:
    it is logged as a warning so that a genuine error is visible in the
    migration output.
    """
    import logging

    log = logging.getLogger("alembic.runtime.migration")

    # Cached stats table for pre-calculated dashboard statistics
    op.create_table(
        'cached_stats',
        sa.Column('id', sa.Integer(), primary_key=True),
        sa.Column('key', sa.String(50), nullable=False, unique=True),
        sa.Column('value', sa.Text(), nullable=False),
        sa.Column('updated_at', sa.DateTime(), server_default=sa.func.now()),
    )
    op.create_index('ix_cached_stats_key', 'cached_stats', ['key'])

    # Add license index to images table (it may already exist).
    try:
        op.create_index('ix_images_license', 'images', ['license'])
    except Exception as exc:
        log.warning("Skipped creating index ix_images_license: %s", exc)

    # Add only_without_images column to jobs (it may already exist).
    try:
        op.add_column('jobs', sa.Column('only_without_images', sa.Boolean(), default=False))
    except Exception as exc:
        log.warning("Skipped adding column jobs.only_without_images: %s", exc)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Undo revision 002: drop the license index, the jobs column, cached_stats.

    The index and column removals stay best-effort, mirroring ``upgrade``,
    but a failure is now logged as a warning instead of being discarded.
    """
    import logging

    log = logging.getLogger("alembic.runtime.migration")

    try:
        op.drop_index('ix_images_license', 'images')
    except Exception as exc:
        log.warning("Skipped dropping index ix_images_license: %s", exc)

    try:
        op.drop_column('jobs', 'only_without_images')
    except Exception as exc:
        log.warning("Skipped dropping column jobs.only_without_images: %s", exc)

    op.drop_table('cached_stats')
|
||||
31
backend/alembic/versions/003_add_job_max_images.py
Normal file
31
backend/alembic/versions/003_add_job_max_images.py
Normal file
@@ -0,0 +1,31 @@
|
||||
"""Add max_images column to jobs table
|
||||
|
||||
Revision ID: 003
|
||||
Revises: 002
|
||||
Create Date: 2025-01-25
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
revision: str = '003'
|
||||
down_revision: Union[str, None] = '002'
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the nullable ``max_images`` column to the jobs table.

    The addition is best-effort because the column may already exist; a
    failure is logged as a warning rather than silently ignored.
    """
    import logging

    log = logging.getLogger("alembic.runtime.migration")

    try:
        op.add_column('jobs', sa.Column('max_images', sa.Integer(), nullable=True))
    except Exception as exc:
        log.warning("Skipped adding column jobs.max_images: %s", exc)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Remove the ``max_images`` column from the jobs table.

    The removal is best-effort; a failure is logged as a warning rather than
    silently ignored.
    """
    import logging

    log = logging.getLogger("alembic.runtime.migration")

    try:
        op.drop_column('jobs', 'max_images')
    except Exception as exc:
        log.warning("Skipped dropping column jobs.max_images: %s", exc)
|
||||
1
backend/app/__init__.py
Normal file
1
backend/app/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# PlantGuideScraper Backend
|
||||
1
backend/app/api/__init__.py
Normal file
1
backend/app/api/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# API routes
|
||||
175
backend/app/api/exports.py
Normal file
175
backend/app/api/exports.py
Normal file
@@ -0,0 +1,175 @@
|
||||
import json
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from fastapi.responses import FileResponse
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func
|
||||
|
||||
from app.database import get_db
|
||||
from app.models import Export, Image, Species
|
||||
from app.schemas.export import (
|
||||
ExportCreate,
|
||||
ExportResponse,
|
||||
ExportListResponse,
|
||||
ExportPreview,
|
||||
)
|
||||
from app.workers.export_tasks import generate_export
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("", response_model=ExportListResponse)
def list_exports(
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
):
    """Return the newest exports (at most ``limit``) plus the total export count."""
    export_total = db.query(Export).count()

    newest_first = db.query(Export).order_by(Export.created_at.desc())
    rows = newest_first.limit(limit).all()

    serialized = [ExportResponse.model_validate(row) for row in rows]
    return ExportListResponse(items=serialized, total=export_total)
|
||||
|
||||
|
||||
@router.post("/preview", response_model=ExportPreview)
def preview_export(export: ExportCreate, db: Session = Depends(get_db)):
    """Preview export without creating it.

    Counts downloaded images per species under the requested filters
    (licenses, minimum quality, species ids), keeps only species with at
    least ``min_images_per_species`` images, and returns the species count,
    image count and a rough size estimate.
    """
    criteria = export.filter_criteria
    min_images = criteria.min_images_per_species

    # Count images per species
    per_species = db.query(
        Image.species_id,
        func.count(Image.id).label("image_count"),
    ).filter(Image.status == "downloaded")

    if criteria.licenses:
        per_species = per_species.filter(Image.license.in_(criteria.licenses))
    if criteria.min_quality:
        per_species = per_species.filter(Image.quality_score >= criteria.min_quality)
    if criteria.species_ids:
        per_species = per_species.filter(Image.species_id.in_(criteria.species_ids))

    rows = per_species.group_by(Image.species_id).all()

    # Unpack rows positionally. Result rows expose the tuple-style ``count()``
    # method, so reading a column labelled "count" as an attribute yields that
    # method instead of the value.
    qualifying_counts = [
        image_count for _species_id, image_count in rows if image_count >= min_images
    ]
    total_images = sum(qualifying_counts)

    # Estimate file size (rough: 50KB per image)
    estimated_size_mb = (total_images * 50) / 1024

    return ExportPreview(
        species_count=len(qualifying_counts),
        image_count=total_images,
        estimated_size_mb=estimated_size_mb,
    )
|
||||
|
||||
|
||||
@router.post("", response_model=ExportResponse)
def create_export(export: ExportCreate, db: Session = Depends(get_db)):
    """Persist a new export in ``pending`` state and queue its Celery task."""
    record_fields = {
        "name": export.name,
        "filter_criteria": export.filter_criteria.model_dump_json(),
        "train_split": export.train_split,
        "status": "pending",
    }
    record = Export(**record_fields)

    # Commit first so the row has an id to hand to the worker.
    db.add(record)
    db.commit()
    db.refresh(record)

    # Queue the task and remember its id on the row.
    queued_task = generate_export.delay(record.id)
    record.celery_task_id = queued_task.id
    db.commit()

    return ExportResponse.model_validate(record)
|
||||
|
||||
|
||||
@router.get("/{export_id}", response_model=ExportResponse)
def get_export(export_id: int, db: Session = Depends(get_db)):
    """Return one export by id, or respond 404 when no such export exists."""
    record = db.query(Export).filter(Export.id == export_id).first()
    if record:
        return ExportResponse.model_validate(record)

    raise HTTPException(status_code=404, detail="Export not found")
|
||||
|
||||
|
||||
@router.get("/{export_id}/progress")
def get_export_progress(export_id: int, db: Session = Depends(get_db)):
    """Get real-time export progress.

    While the Celery task reports the ``PROGRESS`` state with a dict payload,
    the response carries the current/total counters and the species being
    processed. In every other case only the status stored on the export row
    is returned.

    Raises:
        HTTPException: 404 when the export does not exist.
    """
    from app.workers.celery_app import celery_app

    export = db.query(Export).filter(Export.id == export_id).first()
    if not export:
        raise HTTPException(status_code=404, detail="Export not found")

    if not export.celery_task_id:
        return {"status": export.status}

    result = celery_app.AsyncResult(export.celery_task_id)

    if result.state == "PROGRESS":
        # Read the payload once and make sure it really is a dict before
        # calling .get() on it; otherwise fall through to the stored status.
        meta = result.info
        if isinstance(meta, dict):
            return {
                "status": "generating",
                "current": meta.get("current", 0),
                "total": meta.get("total", 0),
                "current_species": meta.get("species", ""),
            }

    return {"status": export.status}
|
||||
|
||||
|
||||
@router.get("/{export_id}/download")
def download_export(export_id: int, db: Session = Depends(get_db)):
    """Send the zip archive of a completed export.

    Responds 404 for an unknown export, 400 while the export is not
    completed, and 404 when the archive path is unset or missing on disk.
    """
    record = db.query(Export).filter(Export.id == export_id).first()
    if not record:
        raise HTTPException(status_code=404, detail="Export not found")

    if record.status != "completed":
        raise HTTPException(status_code=400, detail="Export not ready")

    archive_path = record.file_path
    archive_present = bool(archive_path) and os.path.exists(archive_path)
    if not archive_present:
        raise HTTPException(status_code=404, detail="Export file not found")

    download_name = f"{record.name}.zip"
    return FileResponse(
        archive_path,
        media_type="application/zip",
        filename=download_name,
    )
|
||||
|
||||
|
||||
@router.delete("/{export_id}")
def delete_export(export_id: int, db: Session = Depends(get_db)):
    """Remove an export row and, when present on disk, its archive file."""
    record = db.query(Export).filter(Export.id == export_id).first()
    if not record:
        raise HTTPException(status_code=404, detail="Export not found")

    # Remove the archive first, if there is one on disk.
    archive_path = record.file_path
    if archive_path:
        if os.path.exists(archive_path):
            os.remove(archive_path)

    db.delete(record)
    db.commit()

    return {"status": "deleted"}
|
||||
441
backend/app/api/images.py
Normal file
441
backend/app/api/images.py
Normal file
@@ -0,0 +1,441 @@
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Optional, List
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from fastapi.responses import FileResponse
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func
|
||||
from PIL import Image as PILImage
|
||||
|
||||
from app.database import get_db
|
||||
from app.models import Image, Species
|
||||
from app.schemas.image import ImageResponse, ImageListResponse
|
||||
from app.config import get_settings
|
||||
|
||||
router = APIRouter()
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
@router.get("", response_model=ImageListResponse)
def list_images(
    page: int = Query(1, ge=1),
    page_size: int = Query(50, ge=1, le=200),
    species_id: Optional[int] = None,
    source: Optional[str] = None,
    license: Optional[str] = None,
    status: Optional[str] = None,
    min_quality: Optional[float] = None,
    search: Optional[str] = None,
    db: Session = Depends(get_db),
):
    """Return one page of images, newest first, narrowed by optional filters.

    ``search`` matches case-insensitively against the species' scientific or
    common name. The response also carries the total match count and the
    resulting number of pages.
    """
    from sqlalchemy.orm import joinedload

    # Column filters shared by the page query and the count query.
    column_filters = []
    if species_id:
        column_filters.append(Image.species_id == species_id)
    if source:
        column_filters.append(Image.source == source)
    if license:
        column_filters.append(Image.license == license)
    if status:
        column_filters.append(Image.status == status)
    if min_quality:
        column_filters.append(Image.quality_score >= min_quality)

    # joinedload fetches each image's species in the same query.
    page_query = db.query(Image).options(joinedload(Image.species))
    for condition in column_filters:
        page_query = page_query.filter(condition)

    if search:
        pattern = f"%{search}%"
        page_query = page_query.join(Species).filter(
            (Species.scientific_name.ilike(pattern)) |
            (Species.common_name.ilike(pattern))
        )
        total = page_query.count()
    else:
        # Without a search term, count on the images table alone (no join).
        count_query = db.query(func.count(Image.id))
        for condition in column_filters:
            count_query = count_query.filter(condition)
        total = count_query.scalar()

    # Ceiling division: number of pages needed for `total` rows.
    pages = -(-total // page_size)

    first_row = (page - 1) * page_size
    rows = (
        page_query.order_by(Image.created_at.desc())
        .offset(first_row)
        .limit(page_size)
        .all()
    )

    items = []
    for row in rows:
        items.append(
            ImageResponse(
                id=row.id,
                species_id=row.species_id,
                species_name=row.species.scientific_name if row.species else None,
                source=row.source,
                source_id=row.source_id,
                url=row.url,
                local_path=row.local_path,
                license=row.license,
                attribution=row.attribution,
                width=row.width,
                height=row.height,
                quality_score=row.quality_score,
                status=row.status,
                created_at=row.created_at,
            )
        )

    return ImageListResponse(
        items=items,
        total=total,
        page=page,
        page_size=page_size,
        pages=pages,
    )
|
||||
|
||||
|
||||
@router.get("/sources")
def list_sources(db: Session = Depends(get_db)):
    """Return the distinct ``source`` values found on images."""
    rows = db.query(Image.source).distinct().all()
    source_names = []
    for row in rows:
        source_names.append(row[0])
    return source_names
|
||||
|
||||
|
||||
@router.get("/licenses")
def list_licenses(db: Session = Depends(get_db)):
    """Return the distinct ``license`` values found on images."""
    rows = db.query(Image.license).distinct().all()
    license_names = []
    for row in rows:
        license_names.append(row[0])
    return license_names
|
||||
|
||||
|
||||
@router.post("/process-pending")
def process_pending_images(
    source: Optional[str] = None,
    db: Session = Depends(get_db),
):
    """Queue the batch task for pending images and report how many are pending.

    When ``source`` is given, both the count and the queued task are limited
    to that source.
    """
    from app.workers.quality_tasks import batch_process_pending_images

    pending_query = db.query(func.count(Image.id)).filter(Image.status == "pending")
    if source:
        pending_query = pending_query.filter(Image.source == source)
    pending_count = pending_query.scalar()

    queued_task = batch_process_pending_images.delay(source=source)

    return {"pending_count": pending_count, "task_id": queued_task.id}
|
||||
|
||||
|
||||
@router.get("/process-pending/status/{task_id}")
def process_pending_status(task_id: str):
    """Report the Celery state of a batch task, with counters when available.

    ``queued`` and ``total`` are included only for the PROGRESS and SUCCESS
    states, and only when the task's payload is a dict.
    """
    from app.workers.celery_app import celery_app

    result = celery_app.AsyncResult(task_id)
    state = result.state  # PENDING, STARTED, PROGRESS, SUCCESS, FAILURE

    response = {"task_id": task_id, "state": state}

    # Pick the payload that belongs to the current state, if any.
    payload = None
    if state == "PROGRESS":
        payload = result.info
    elif state == "SUCCESS":
        payload = result.result

    if isinstance(payload, dict):
        response["queued"] = payload.get("queued", 0)
        response["total"] = payload.get("total", 0)

    return response
|
||||
|
||||
|
||||
@router.get("/{image_id}", response_model=ImageResponse)
def get_image(image_id: int, db: Session = Depends(get_db)):
    """Return a single image by id, or respond 404 when it does not exist."""
    row = db.query(Image).filter(Image.id == image_id).first()
    if not row:
        raise HTTPException(status_code=404, detail="Image not found")

    # The species name is optional: an image without a loaded species gets None.
    if row.species:
        species_name = row.species.scientific_name
    else:
        species_name = None

    return ImageResponse(
        id=row.id,
        species_id=row.species_id,
        species_name=species_name,
        source=row.source,
        source_id=row.source_id,
        url=row.url,
        local_path=row.local_path,
        license=row.license,
        attribution=row.attribution,
        width=row.width,
        height=row.height,
        quality_score=row.quality_score,
        status=row.status,
        created_at=row.created_at,
    )
|
||||
|
||||
|
||||
@router.get("/{image_id}/file")
def get_image_file(image_id: int, db: Session = Depends(get_db)):
    """Get the actual image file.

    Raises:
        HTTPException: 404 when the image row does not exist, when it has no
            ``local_path``, or when that path is not a file on disk.
    """
    image = db.query(Image).filter(Image.id == image_id).first()
    if not image:
        raise HTTPException(status_code=404, detail="Image not found")

    # Check the file on disk as well as the stored path, as download_export
    # does, so a stale path is reported as 404 instead of failing while the
    # response is being sent.
    if not image.local_path or not os.path.isfile(image.local_path):
        raise HTTPException(status_code=404, detail="Image file not available")

    return FileResponse(image.local_path, media_type="image/jpeg")
|
||||
|
||||
|
||||
@router.delete("/{image_id}")
def delete_image(image_id: int, db: Session = Depends(get_db)):
    """Remove an image row and, when present on disk, its local file."""
    row = db.query(Image).filter(Image.id == image_id).first()
    if not row:
        raise HTTPException(status_code=404, detail="Image not found")

    # Remove the stored file first, if there is one on disk.
    stored_path = row.local_path
    if stored_path and os.path.exists(stored_path):
        os.remove(stored_path)

    db.delete(row)
    db.commit()

    return {"status": "deleted"}
|
||||
|
||||
|
||||
@router.post("/bulk-delete")
def bulk_delete_images(
    image_ids: List[int],
    db: Session = Depends(get_db),
):
    """Delete every image whose id is listed, together with its local file.

    Returns the number of rows deleted; ids that match no image are ignored.
    """
    rows = db.query(Image).filter(Image.id.in_(image_ids)).all()

    for row in rows:
        stored_path = row.local_path
        if stored_path and os.path.exists(stored_path):
            os.remove(stored_path)
        db.delete(row)

    # All deletions are committed together.
    db.commit()

    return {"deleted": len(rows)}
|
||||
|
||||
|
||||
@router.get("/import/scan")
def scan_imports(db: Session = Depends(get_db)):
    """Inspect the imports folder and summarise what it contains.

    Expected structure: imports/{source}/{species_name}/*.jpg

    For each source folder the summary gives the number of image files
    (.jpg, .jpeg, .png) and of species folders that match a known species.
    Folder names that match no species are listed once in
    ``unmatched_species``. Nothing is imported or modified.
    """
    root = Path(settings.imports_path)

    if not root.exists():
        return {
            "available": False,
            "message": f"Imports folder not found: {root}",
            "sources": [],
            "total_images": 0,
            "matched_species": 0,
            "unmatched_species": [],
        }

    summary = {
        "available": True,
        "sources": [],
        "total_images": 0,
        "matched_species": 0,
        "unmatched_species": [],
    }

    # Index species by lower-cased scientific name, in both the spaced and
    # the underscored spelling.
    known_species = {}
    for record in db.query(Species).all():
        known_species[record.scientific_name.lower()] = record
        known_species[record.scientific_name.replace(" ", "_").lower()] = record

    reported_unmatched = set()
    image_patterns = ("*.jpg", "*.jpeg", "*.png")

    for source_folder in root.iterdir():
        if not source_folder.is_dir():
            continue

        source_summary = {
            "name": source_folder.name,
            "species_count": 0,
            "image_count": 0,
        }

        for species_folder in source_folder.iterdir():
            if not species_folder.is_dir():
                continue

            display_name = species_folder.name.replace("_", " ")

            found_files = [
                path
                for pattern in image_patterns
                for path in species_folder.glob(pattern)
            ]
            if not found_files:
                continue

            file_count = len(found_files)
            source_summary["image_count"] += file_count
            summary["total_images"] += file_count

            is_known = (
                display_name.lower() in known_species
                or species_folder.name.lower() in known_species
            )
            if is_known:
                source_summary["species_count"] += 1
                summary["matched_species"] += 1
            elif display_name not in reported_unmatched:
                reported_unmatched.add(display_name)
                summary["unmatched_species"].append(display_name)

        # Only sources that actually contain images are reported.
        if source_summary["image_count"] > 0:
            summary["sources"].append(source_summary)

    return summary
|
||||
|
||||
|
||||
@router.post("/import/run")
def run_import(
    move_files: bool = Query(False, description="Move files instead of copy"),
    db: Session = Depends(get_db),
):
    """Ingest image files from the imports folder into the image library.

    The imports folder is walked as ``imports/{source}/{species_name}/``.
    Files matching ``*.jpg``, ``*.jpeg`` and ``*.png`` inside a species folder
    whose name matches a known species are placed at
    ``images/{Scientific_name}/{source}_{stem}_{8 hex chars}{ext}`` and
    recorded as ``Image`` rows with status ``"downloaded"``.

    Args:
        move_files: When true the originals are moved, otherwise copied.
        db: Database session injected by FastAPI.

    Returns:
        A dict with the number of ``imported`` and ``skipped`` files and at
        most 20 per-file ``errors`` messages.

    Raises:
        HTTPException: 400 when the imports folder does not exist.
    """
    imports_root = Path(settings.imports_path)
    images_root = Path(settings.images_path)

    if not imports_root.exists():
        raise HTTPException(status_code=400, detail="Imports folder not found")

    # Index every species under both its spaced and its underscored
    # lowercase scientific name so either folder spelling resolves.
    lookup = {}
    for record in db.query(Species).all():
        lookup[record.scientific_name.lower()] = record
        lookup[record.scientific_name.replace(" ", "_").lower()] = record

    imported_total = 0
    skipped_total = 0
    failures = []
    patterns = ("*.jpg", "*.jpeg", "*.png")

    for source_dir in imports_root.iterdir():
        if not source_dir.is_dir():
            continue
        source_name = source_dir.name

        for species_dir in source_dir.iterdir():
            if not species_dir.is_dir():
                continue

            folder_key = species_dir.name.replace("_", " ").lower()
            matched = lookup.get(folder_key) or lookup.get(species_dir.name.lower())
            if not matched:
                # Folders that match no known species are ignored.
                continue

            destination = images_root / matched.scientific_name.replace(" ", "_")
            destination.mkdir(parents=True, exist_ok=True)

            candidates = [
                path for pattern in patterns for path in species_dir.glob(pattern)
            ]

            for img_file in candidates:
                try:
                    # Target name: source prefix, original stem, random
                    # suffix; ".jpeg" is normalised to ".jpg".
                    suffix = img_file.suffix.lower()
                    if suffix == ".jpeg":
                        suffix = ".jpg"
                    unique_name = (
                        f"{source_name}_{img_file.stem}_{uuid.uuid4().hex[:8]}{suffix}"
                    )
                    target_path = destination / unique_name

                    # A row with the same species/source/stem means this
                    # file was imported before.
                    already_there = db.query(Image).filter(
                        Image.species_id == matched.id,
                        Image.source == source_name,
                        Image.source_id == img_file.stem,
                    ).first()
                    if already_there:
                        skipped_total += 1
                        continue

                    # Dimensions are best-effort; unreadable files are still
                    # imported with unknown size.
                    try:
                        with PILImage.open(img_file) as pil_img:
                            width, height = pil_img.size
                    except Exception:
                        width, height = None, None

                    transfer = shutil.move if move_files else shutil.copy2
                    transfer(str(img_file), str(target_path))

                    db.add(
                        Image(
                            species_id=matched.id,
                            source=source_name,
                            source_id=img_file.stem,
                            url=f"file://{img_file}",
                            local_path=str(target_path),
                            license="unknown",
                            width=width,
                            height=height,
                            status="downloaded",
                        )
                    )
                    imported_total += 1
                except Exception as e:
                    failures.append(f"{img_file}: {str(e)}")

            # One commit per species folder keeps transactions small.
            db.commit()

    return {
        "imported": imported_total,
        "skipped": skipped_total,
        "errors": failures[:20],
    }
|
||||
173
backend/app/api/jobs.py
Normal file
173
backend/app/api/jobs.py
Normal file
@@ -0,0 +1,173 @@
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.database import get_db
|
||||
from app.models import Job
|
||||
from app.schemas.job import JobCreate, JobResponse, JobListResponse
|
||||
from app.workers.scrape_tasks import run_scrape_job
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("", response_model=JobListResponse)
def list_jobs(
    status: Optional[str] = None,
    source: Optional[str] = None,
    limit: int = Query(50, ge=1, le=200),
    db: Session = Depends(get_db),
):
    """Return the newest jobs, optionally narrowed by status and/or source.

    Args:
        status: Keep only jobs with this status when given.
        source: Keep only jobs for this source when given.
        limit: Maximum number of jobs returned (1-200).
        db: Database session injected by FastAPI.

    Returns:
        A ``JobListResponse`` holding the selected jobs, newest first, and
        the total number of jobs matching the filters (before ``limit``).
    """
    jobs_query = db.query(Job)

    for column, wanted in ((Job.status, status), (Job.source, source)):
        if wanted:
            jobs_query = jobs_query.filter(column == wanted)

    total = jobs_query.count()
    newest_first = jobs_query.order_by(Job.created_at.desc()).limit(limit).all()

    return JobListResponse(
        items=[JobResponse.model_validate(row) for row in newest_first],
        total=total,
    )
|
||||
|
||||
|
||||
@router.post("", response_model=JobResponse)
def create_job(job: JobCreate, db: Session = Depends(get_db)):
    """Persist a new scrape job and hand it to Celery.

    The job row is committed first with status ``"pending"``; the Celery
    task is then queued and its id stored on the row in a second commit.

    Args:
        job: Requested job parameters.
        db: Database session injected by FastAPI.

    Returns:
        The stored job as a ``JobResponse``.
    """
    # The species id list is stored as a JSON string; empty/absent -> NULL.
    encoded_filter = json.dumps(job.species_ids) if job.species_ids else None

    new_job = Job(
        name=job.name,
        source=job.source,
        species_filter=encoded_filter,
        only_without_images=job.only_without_images,
        max_images=job.max_images,
        status="pending",
    )
    db.add(new_job)
    db.commit()
    db.refresh(new_job)

    # Queue the worker task and remember its id so it can be tracked/revoked.
    queued = run_scrape_job.delay(new_job.id)
    new_job.celery_task_id = queued.id
    db.commit()

    return JobResponse.model_validate(new_job)
|
||||
|
||||
|
||||
@router.get("/{job_id}", response_model=JobResponse)
def get_job(job_id: int, db: Session = Depends(get_db)):
    """Look up a single job by id.

    Raises:
        HTTPException: 404 when no job has the given id.
    """
    found = db.query(Job).filter(Job.id == job_id).first()
    if found:
        return JobResponse.model_validate(found)
    raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
|
||||
@router.get("/{job_id}/progress")
def get_job_progress(job_id: int, db: Session = Depends(get_db)):
    """Report a job's progress, preferring live Celery state when available.

    When the job has a Celery task in the ``PROGRESS`` state, the task's
    metadata supplies the counters and current species. Otherwise the
    values stored on the job row are returned.

    Raises:
        HTTPException: 404 when no job has the given id.
    """
    from app.workers.celery_app import celery_app

    job = db.query(Job).filter(Job.id == job_id).first()
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    if job.celery_task_id:
        task_result = celery_app.AsyncResult(job.celery_task_id)
        if task_result.state == "PROGRESS":
            live = task_result.info
            return {
                "status": "running",
                "progress_current": live.get("current", 0),
                "progress_total": live.get("total", 0),
                "current_species": live.get("species", ""),
            }

    # No task id, or the task is not reporting progress: use stored values.
    return {
        "status": job.status,
        "progress_current": job.progress_current,
        "progress_total": job.progress_total,
    }
|
||||
|
||||
|
||||
@router.post("/{job_id}/pause")
def pause_job(job_id: int, db: Session = Depends(get_db)):
    """Pause a running job by revoking its Celery task.

    Raises:
        HTTPException: 404 when the job does not exist, 400 when it is not
            currently in the ``"running"`` state.
    """
    from app.workers.celery_app import celery_app

    target = db.query(Job).filter(Job.id == job_id).first()
    if not target:
        raise HTTPException(status_code=404, detail="Job not found")
    if target.status != "running":
        raise HTTPException(status_code=400, detail="Job is not running")

    task_id = target.celery_task_id
    if task_id:
        # terminate=True asks the worker to stop the task if it is executing.
        celery_app.control.revoke(task_id, terminate=True)

    target.status = "paused"
    db.commit()

    return {"status": "paused"}
|
||||
|
||||
|
||||
@router.post("/{job_id}/resume")
def resume_job(job_id: int, db: Session = Depends(get_db)):
    """Resume a paused job by queueing a fresh Celery task for it.

    The job goes back to ``"pending"`` and records the new task id.

    Raises:
        HTTPException: 404 when the job does not exist, 400 when it is not
            in the ``"paused"`` state.
    """
    target = db.query(Job).filter(Job.id == job_id).first()
    if not target:
        raise HTTPException(status_code=404, detail="Job not found")
    if target.status != "paused":
        raise HTTPException(status_code=400, detail="Job is not paused")

    # The previous task was revoked on pause, so a new one is started.
    requeued = run_scrape_job.delay(target.id)
    target.celery_task_id = requeued.id
    target.status = "pending"
    db.commit()

    return {"status": "resumed"}
|
||||
|
||||
|
||||
@router.post("/{job_id}/cancel")
def cancel_job(job_id: int, db: Session = Depends(get_db)):
    """Cancel a job that has not finished yet.

    The Celery task (if any) is revoked and the job is stored as
    ``"failed"`` with the error message ``"Cancelled by user"``.

    Raises:
        HTTPException: 404 when the job does not exist, 400 when it is
            already ``"completed"`` or ``"failed"``.
    """
    from app.workers.celery_app import celery_app

    target = db.query(Job).filter(Job.id == job_id).first()
    if not target:
        raise HTTPException(status_code=404, detail="Job not found")

    finished_states = ["completed", "failed"]
    if target.status in finished_states:
        raise HTTPException(status_code=400, detail="Job already finished")

    task_id = target.celery_task_id
    if task_id:
        celery_app.control.revoke(task_id, terminate=True)

    # Cancellation is recorded as a failure with an explanatory message.
    target.status = "failed"
    target.error_message = "Cancelled by user"
    db.commit()

    return {"status": "cancelled"}
|
||||
198
backend/app/api/sources.py
Normal file
198
backend/app/api/sources.py
Normal file
@@ -0,0 +1,198 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.database import get_db
|
||||
from app.models import ApiKey
|
||||
from app.schemas.api_key import ApiKeyCreate, ApiKeyUpdate, ApiKeyResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Catalogue of image sources the scraper knows about.
#
# Fields of each entry:
#   name            - identifier used in URLs and stored on ApiKey rows
#   label           - human-readable name
#   requires_secret - whether a secret is needed in addition to the key
#   auth_type       - "none" (no auth), "api_key" (single key),
#                     "api_key_secret" (key + secret),
#                     "oauth" (client_id + client_secret + access_token)
#   default_rate    - safe default requests per second for the API
AVAILABLE_SOURCES = [
    {
        # Free, no auth required
        "name": "gbif",
        "label": "GBIF",
        "requires_secret": False,
        "auth_type": "none",
        "default_rate": 1.0,
    },
    {
        # 60/min limit
        "name": "inaturalist",
        "label": "iNaturalist",
        "requires_secret": True,
        "auth_type": "api_key_secret",
        "default_rate": 1.0,
    },
    {
        # 3600/hr shared limit
        "name": "flickr",
        "label": "Flickr",
        "requires_secret": True,
        "auth_type": "api_key_secret",
        "default_rate": 0.5,
    },
    {
        # generous limits
        "name": "wikimedia",
        "label": "Wikimedia Commons",
        "requires_secret": True,
        "auth_type": "oauth",
        "default_rate": 1.0,
    },
    {
        # 120/min limit
        "name": "trefle",
        "label": "Trefle.io",
        "requires_secret": False,
        "auth_type": "api_key",
        "default_rate": 1.0,
    },
    {
        # Web search, no API key
        "name": "duckduckgo",
        "label": "DuckDuckGo",
        "requires_secret": False,
        "auth_type": "none",
        "default_rate": 0.5,
    },
    {
        # Azure Cognitive Services
        "name": "bing",
        "label": "Bing Image Search",
        "requires_secret": False,
        "auth_type": "api_key",
        "default_rate": 3.0,
    },
]
|
||||
|
||||
|
||||
def mask_api_key(key: str) -> str:
    """Hide all but the last four characters of an API key.

    Keys that are empty, ``None`` or at most four characters long are
    replaced entirely by ``"****"`` so nothing of a short key is revealed.
    """
    visible = 4
    if key and len(key) > visible:
        # Left-pad the visible tail with "*" up to the key's full length.
        return key[-visible:].rjust(len(key), "*")
    return "****"
|
||||
|
||||
|
||||
@router.get("")
def list_sources(db: Session = Depends(get_db)):
    """List every available source together with its configuration status.

    For each entry of ``AVAILABLE_SOURCES`` the stored ``ApiKey`` row (if
    any) is merged in. Sources without a row report ``configured=False``,
    ``enabled=False`` and the catalogue's default rate.

    ``api_key_masked`` is ``None`` for sources whose ``auth_type`` is
    ``"none"``: those rows only hold the ``"no-auth"`` placeholder written
    by ``update_source``, and showing it masked would suggest a real key
    exists. This matches what ``update_source`` itself returns.

    Returns:
        A list of dicts, one per available source, in catalogue order.
    """
    configured = {row.source: row for row in db.query(ApiKey).all()}

    result = []
    for source in AVAILABLE_SOURCES:
        api_key = configured.get(source["name"])
        auth_type = source.get("auth_type", "api_key")
        default_rate = source.get("default_rate", 1.0)
        # Only expose a masked key when a real key can exist.
        has_real_key = api_key is not None and auth_type != "none"

        result.append({
            "name": source["name"],
            "label": source["label"],
            "requires_secret": source["requires_secret"],
            "auth_type": auth_type,
            "configured": api_key is not None,
            "enabled": api_key.enabled if api_key else False,
            "api_key_masked": mask_api_key(api_key.api_key) if has_real_key else None,
            "has_secret": bool(api_key.api_secret) if api_key else False,
            "has_access_token": bool(getattr(api_key, 'access_token', None)) if api_key else False,
            "rate_limit_per_sec": api_key.rate_limit_per_sec if api_key else default_rate,
            "default_rate": default_rate,
        })

    return result
|
||||
|
||||
|
||||
@router.get("/{source}")
def get_source(source: str, db: Session = Depends(get_db)):
    """Return the catalogue entry and stored configuration for one source.

    ``api_key_masked`` is ``None`` when the source's ``auth_type`` is
    ``"none"``: such rows only hold the ``"no-auth"`` placeholder written by
    ``update_source``, which must not be presented as a masked key. This
    matches what ``update_source`` itself returns.

    Args:
        source: Source name as listed in ``AVAILABLE_SOURCES``.
        db: Database session injected by FastAPI.

    Raises:
        HTTPException: 404 when ``source`` is not an available source.
    """
    source_info = next((s for s in AVAILABLE_SOURCES if s["name"] == source), None)
    if not source_info:
        raise HTTPException(status_code=404, detail="Unknown source")

    api_key = db.query(ApiKey).filter(ApiKey.source == source).first()
    auth_type = source_info.get("auth_type", "api_key")
    default_rate = source_info.get("default_rate", 1.0)
    # Only expose a masked key when a real key can exist.
    has_real_key = api_key is not None and auth_type != "none"

    return {
        "name": source_info["name"],
        "label": source_info["label"],
        "requires_secret": source_info["requires_secret"],
        "auth_type": auth_type,
        "configured": api_key is not None,
        "enabled": api_key.enabled if api_key else False,
        "api_key_masked": mask_api_key(api_key.api_key) if has_real_key else None,
        "has_secret": bool(api_key.api_secret) if api_key else False,
        "has_access_token": bool(getattr(api_key, 'access_token', None)) if api_key else False,
        "rate_limit_per_sec": api_key.rate_limit_per_sec if api_key else default_rate,
        "default_rate": default_rate,
    }
|
||||
|
||||
|
||||
@router.put("/{source}")
def update_source(
    source: str,
    config: ApiKeyCreate,
    db: Session = Depends(get_db),
):
    """Create the stored configuration for a source, or overwrite it.

    Sources whose ``auth_type`` is ``"none"`` may omit the key; the
    placeholder ``"no-auth"`` is stored instead. On update, the secret and
    access token are only replaced when a non-empty value is supplied.

    Args:
        source: Source name as listed in ``AVAILABLE_SOURCES``.
        config: New configuration values.
        db: Database session injected by FastAPI.

    Raises:
        HTTPException: 404 for an unknown source, 400 when the source needs
            an API key and none was given.
    """
    candidates = [entry for entry in AVAILABLE_SOURCES if entry["name"] == source]
    if not candidates:
        raise HTTPException(status_code=404, detail="Unknown source")
    source_info = candidates[0]

    auth_type = source_info.get("auth_type", "api_key")
    needs_key = auth_type != "none"
    if needs_key and not config.api_key:
        raise HTTPException(status_code=400, detail="API key is required for this source")

    record = db.query(ApiKey).filter(ApiKey.source == source).first()

    # No-auth sources get a placeholder so the row still has a key value.
    stored_key = config.api_key or "no-auth"

    if not record:
        record = ApiKey(
            source=source,
            api_key=stored_key,
            api_secret=config.api_secret,
            access_token=config.access_token,
            rate_limit_per_sec=config.rate_limit_per_sec,
            enabled=config.enabled,
        )
        db.add(record)
    else:
        record.api_key = stored_key
        # Empty secret/token means "leave the stored value untouched".
        if config.api_secret:
            record.api_secret = config.api_secret
        if config.access_token:
            record.access_token = config.access_token
        record.rate_limit_per_sec = config.rate_limit_per_sec
        record.enabled = config.enabled

    db.commit()
    db.refresh(record)

    masked = mask_api_key(record.api_key) if needs_key else None
    return {
        "name": source,
        "configured": True,
        "enabled": record.enabled,
        "api_key_masked": masked,
        "has_secret": bool(record.api_secret),
        "has_access_token": bool(record.access_token),
        "rate_limit_per_sec": record.rate_limit_per_sec,
    }
|
||||
|
||||
|
||||
@router.patch("/{source}")
def patch_source(
    source: str,
    config: ApiKeyUpdate,
    db: Session = Depends(get_db),
):
    """Apply a partial update to an already configured source.

    Only fields explicitly present in the request body are written.

    ``api_key_masked`` is ``None`` when the source's ``auth_type`` is
    ``"none"``: such rows only hold the ``"no-auth"`` placeholder written by
    ``update_source``, which must not be presented as a masked key. This
    matches what ``update_source`` itself returns. Sources missing from
    ``AVAILABLE_SOURCES`` are treated as key-based.

    Args:
        source: Name of the configured source.
        config: Fields to change.
        db: Database session injected by FastAPI.

    Raises:
        HTTPException: 404 when the source has no stored configuration.
    """
    api_key = db.query(ApiKey).filter(ApiKey.source == source).first()
    if not api_key:
        raise HTTPException(status_code=404, detail="Source not configured")

    for field, value in config.model_dump(exclude_unset=True).items():
        setattr(api_key, field, value)

    db.commit()
    db.refresh(api_key)

    source_info = next((s for s in AVAILABLE_SOURCES if s["name"] == source), None)
    auth_type = source_info.get("auth_type", "api_key") if source_info else "api_key"

    return {
        "name": source,
        "configured": True,
        "enabled": api_key.enabled,
        "api_key_masked": mask_api_key(api_key.api_key) if auth_type != "none" else None,
        "has_secret": bool(api_key.api_secret),
        "has_access_token": bool(api_key.access_token),
        "rate_limit_per_sec": api_key.rate_limit_per_sec,
    }
|
||||
|
||||
|
||||
@router.delete("/{source}")
def delete_source(source: str, db: Session = Depends(get_db)):
    """Remove the stored configuration of a source.

    Raises:
        HTTPException: 404 when the source has no stored configuration.
    """
    record = db.query(ApiKey).filter(ApiKey.source == source).first()
    if record:
        db.delete(record)
        db.commit()
        return {"status": "deleted"}
    raise HTTPException(status_code=404, detail="Source not configured")
|
||||
|
||||
|
||||
@router.post("/{source}/test")
def test_source(source: str, db: Session = Depends(get_db)):
    """Check that the configured credentials of a source work.

    The source's scraper is asked to test its connection. Any exception it
    raises is reported in the response body rather than as an HTTP error.

    Returns:
        ``{"status": "success", "message": ...}`` with the scraper's result,
        or ``{"status": "error", "message": ...}`` with the exception text.

    Raises:
        HTTPException: 404 when the source is not configured, 400 when no
            scraper exists for it.
    """
    credentials = db.query(ApiKey).filter(ApiKey.source == source).first()
    if not credentials:
        raise HTTPException(status_code=404, detail="Source not configured")

    # Imported lazily, after the configuration lookup has succeeded.
    from app.scrapers import get_scraper

    scraper = get_scraper(source)
    if not scraper:
        raise HTTPException(status_code=400, detail="No scraper for this source")

    try:
        outcome = scraper.test_connection(credentials)
    except Exception as e:
        return {"status": "error", "message": str(e)}
    return {"status": "success", "message": outcome}
|
||||
366
backend/app/api/species.py
Normal file
366
backend/app/api/species.py
Normal file
@@ -0,0 +1,366 @@
|
||||
import csv
|
||||
import io
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, UploadFile, File
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func, text
|
||||
|
||||
from app.database import get_db
|
||||
from app.models import Species, Image
|
||||
from app.schemas.species import (
|
||||
SpeciesCreate,
|
||||
SpeciesUpdate,
|
||||
SpeciesResponse,
|
||||
SpeciesListResponse,
|
||||
SpeciesImportResponse,
|
||||
)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def get_species_with_count(db: Session, species: Species) -> SpeciesResponse:
    """Build a ``SpeciesResponse`` for ``species`` including its image count.

    Only images with status ``"downloaded"`` are counted; a missing count
    is reported as 0.
    """
    downloaded = (
        db.query(func.count(Image.id))
        .filter(Image.species_id == species.id, Image.status == "downloaded")
        .scalar()
    )

    return SpeciesResponse(
        id=species.id,
        scientific_name=species.scientific_name,
        common_name=species.common_name,
        genus=species.genus,
        family=species.family,
        created_at=species.created_at,
        image_count=downloaded or 0,
    )
|
||||
|
||||
|
||||
@router.get("", response_model=SpeciesListResponse)
def list_species(
    page: int = Query(1, ge=1),
    page_size: int = Query(50, ge=1, le=500),
    search: Optional[str] = None,
    genus: Optional[str] = None,
    has_images: Optional[bool] = None,
    max_images: Optional[int] = Query(None, description="Filter species with less than N images"),
    min_images: Optional[int] = Query(None, description="Filter species with at least N images"),
    db: Session = Depends(get_db),
):
    """Return one page of species, ordered by scientific name.

    Filters:
    - search: substring match (case-insensitive) on scientific or common name
    - genus: exact genus match
    - has_images: True keeps species with downloaded images, False keeps
      species without; ignored when max_images or min_images is given
    - max_images: keep species with fewer than N downloaded images
    - min_images: keep species with at least N downloaded images

    Each returned item carries its count of downloaded images.
    """
    filter_by_count = max_images is not None or min_images is not None

    if filter_by_count:
        # Per-species count of downloaded images. The outer join keeps
        # species that have no images at all (count 0).
        per_species = (
            db.query(
                Species.id.label("species_id"),
                func.count(Image.id).label("img_count"),
            )
            .outerjoin(Image, (Image.species_id == Species.id) & (Image.status == "downloaded"))
            .group_by(Species.id)
            .subquery()
        )

        query = db.query(Species).join(
            per_species, Species.id == per_species.c.species_id
        )
        if max_images is not None:
            query = query.filter(per_species.c.img_count < max_images)
        if min_images is not None:
            query = query.filter(per_species.c.img_count >= min_images)
    else:
        query = db.query(Species)

    if search:
        pattern = f"%{search}%"
        name_matches = (Species.scientific_name.ilike(pattern)) | (
            Species.common_name.ilike(pattern)
        )
        query = query.filter(name_matches)

    if genus:
        query = query.filter(Species.genus == genus)

    # The has_images filter only applies when no count filter is active.
    if has_images is not None and not filter_by_count:
        with_images = (
            db.query(Image.species_id)
            .filter(Image.status == "downloaded")
            .distinct()
            .subquery()
        )
        is_member = Species.id.in_(db.query(with_images.c.species_id))
        query = query.filter(is_member if has_images else ~is_member)

    total = query.count()
    # Ceiling division: a partial last page still counts as a page.
    full_pages, remainder = divmod(total, page_size)
    pages = full_pages + (1 if remainder else 0)

    start = (page - 1) * page_size
    species_list = (
        query.order_by(Species.scientific_name).offset(start).limit(page_size).all()
    )

    # One grouped query fetches the counts for the whole page.
    page_ids = [entry.id for entry in species_list]
    count_map = {}
    if page_ids:
        rows = (
            db.query(Image.species_id, func.count(Image.id))
            .filter(Image.species_id.in_(page_ids), Image.status == "downloaded")
            .group_by(Image.species_id)
            .all()
        )
        count_map = {sid: n for sid, n in rows}

    items = []
    for entry in species_list:
        items.append(
            SpeciesResponse(
                id=entry.id,
                scientific_name=entry.scientific_name,
                common_name=entry.common_name,
                genus=entry.genus,
                family=entry.family,
                created_at=entry.created_at,
                image_count=count_map.get(entry.id, 0),
            )
        )

    return SpeciesListResponse(
        items=items,
        total=total,
        page=page,
        page_size=page_size,
        pages=pages,
    )
|
||||
|
||||
|
||||
@router.post("", response_model=SpeciesResponse)
def create_species(species: SpeciesCreate, db: Session = Depends(get_db)):
    """Store a new species.

    When no genus is supplied and the scientific name contains a space, the
    first word of the scientific name is used as the genus.

    Raises:
        HTTPException: 400 when a species with the same scientific name
            already exists.
    """
    duplicate = (
        db.query(Species)
        .filter(Species.scientific_name == species.scientific_name)
        .first()
    )
    if duplicate:
        raise HTTPException(status_code=400, detail="Species already exists")

    derived_genus = species.genus
    if not derived_genus and " " in species.scientific_name:
        derived_genus = species.scientific_name.split()[0]

    record = Species(
        scientific_name=species.scientific_name,
        common_name=species.common_name,
        genus=derived_genus,
        family=species.family,
    )
    db.add(record)
    db.commit()
    db.refresh(record)

    return get_species_with_count(db, record)
|
||||
|
||||
|
||||
@router.post("/import", response_model=SpeciesImportResponse)
async def import_species(
    file: UploadFile = File(...),
    db: Session = Depends(get_db),
):
    """Import species from an uploaded CSV file.

    Expected columns: scientific_name, common_name (optional),
    genus (optional), family (optional).

    Rows without a scientific name are reported as errors. Rows whose
    scientific name already exists in the database, or was already seen
    earlier in the same file, are counted as skipped. When the genus column
    is empty and the scientific name contains a space, the first word of
    the scientific name is used as the genus.

    Args:
        file: Uploaded CSV file; must be UTF-8 (a leading BOM is accepted).
        db: Database session injected by FastAPI.

    Returns:
        A ``SpeciesImportResponse`` with imported/skipped counts and at
        most 10 error messages.

    Raises:
        HTTPException: 400 when the file name does not end in ``.csv`` or
            the content is not valid UTF-8.
    """
    # The filename can be missing; treat that like a wrong extension.
    if not (file.filename or "").lower().endswith(".csv"):
        raise HTTPException(status_code=400, detail="File must be a CSV")

    content = await file.read()
    try:
        # "utf-8-sig" drops a leading BOM (common in spreadsheet exports),
        # which would otherwise become part of the first column name.
        csv_text = content.decode("utf-8-sig")
    except UnicodeDecodeError as exc:
        raise HTTPException(
            status_code=400, detail="File must be UTF-8 encoded"
        ) from exc

    reader = csv.DictReader(io.StringIO(csv_text))

    def cell(row, column):
        """Return the stripped cell text, or "" when the cell is absent."""
        # DictReader fills missing trailing cells of short rows with None.
        value = row.get(column)
        return value.strip() if isinstance(value, str) else ""

    imported = 0
    skipped = 0
    errors = []
    # Names already queued from this file; the database lookup alone cannot
    # be relied on to see rows that have not been flushed yet.
    seen_names = set()

    # Data rows start on line 2; line 1 is the header.
    for row_num, row in enumerate(reader, start=2):
        scientific_name = cell(row, "scientific_name")
        if not scientific_name:
            errors.append(f"Row {row_num}: Missing scientific_name")
            continue

        if scientific_name in seen_names:
            skipped += 1
            continue

        existing = db.query(Species).filter(
            Species.scientific_name == scientific_name
        ).first()
        if existing:
            skipped += 1
            continue

        genus = cell(row, "genus")
        if not genus and " " in scientific_name:
            genus = scientific_name.split()[0]

        try:
            species = Species(
                scientific_name=scientific_name,
                common_name=cell(row, "common_name") or None,
                genus=genus or None,
                family=cell(row, "family") or None,
            )
            db.add(species)
            seen_names.add(scientific_name)
            imported += 1
        except Exception as e:
            # Best effort: a bad row is reported, the import continues.
            errors.append(f"Row {row_num}: {str(e)}")

    db.commit()

    return SpeciesImportResponse(
        imported=imported,
        skipped=skipped,
        errors=errors[:10],  # Limit error messages
    )
|
||||
|
||||
|
||||
@router.post("/import-json", response_model=SpeciesImportResponse)
async def import_species_json(
    file: UploadFile = File(...),
    db: Session = Depends(get_db),
):
    """Import species from an uploaded JSON file.

    Expected format:
    {"plants": [{"scientific_name": "...", "common_names": [...], "family": "..."}]}

    Entries without a usable scientific name are reported as errors.
    Entries whose scientific name already exists in the database, or was
    already seen earlier in the same file, are counted as skipped. The genus
    is the first word of the scientific name when it contains a space. The
    first entry of ``common_names`` is stored as the common name; a plain
    string is accepted as well.

    Args:
        file: Uploaded JSON file; must be UTF-8 (a leading BOM is accepted).
        db: Database session injected by FastAPI.

    Returns:
        A ``SpeciesImportResponse`` with imported/skipped counts and at
        most 10 error messages.

    Raises:
        HTTPException: 400 when the file name does not end in ``.json``,
            the content is not valid UTF-8 JSON, or it contains no
            ``plants`` list.
    """
    # The filename can be missing; treat that like a wrong extension.
    if not (file.filename or "").lower().endswith(".json"):
        raise HTTPException(status_code=400, detail="File must be a JSON")

    content = await file.read()
    try:
        # "utf-8-sig" tolerates a leading BOM, which json.loads rejects.
        data = json.loads(content.decode("utf-8-sig"))
    except (UnicodeDecodeError, json.JSONDecodeError) as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")

    # The top level must be an object holding a non-empty "plants" list.
    plants = data.get("plants") if isinstance(data, dict) else None
    if not plants or not isinstance(plants, list):
        raise HTTPException(status_code=400, detail="No plants found in JSON")

    imported = 0
    skipped = 0
    errors = []
    # Names already queued from this file; the database lookup alone cannot
    # be relied on to see rows that have not been flushed yet.
    seen_names = set()

    for idx, plant in enumerate(plants):
        if not isinstance(plant, dict):
            errors.append(f"Plant {idx}: Expected an object")
            continue

        raw_name = plant.get("scientific_name")
        scientific_name = raw_name.strip() if isinstance(raw_name, str) else ""
        if not scientific_name:
            errors.append(f"Plant {idx}: Missing scientific_name")
            continue

        if scientific_name in seen_names:
            skipped += 1
            continue

        existing = db.query(Species).filter(
            Species.scientific_name == scientific_name
        ).first()
        if existing:
            skipped += 1
            continue

        genus = None
        if " " in scientific_name:
            genus = scientific_name.split()[0]

        # Accept a list (first entry wins) or a single string; indexing a
        # string would otherwise yield just its first character.
        common_names = plant.get("common_names")
        if isinstance(common_names, str):
            common_name = common_names.strip() or None
        elif isinstance(common_names, list) and common_names:
            common_name = common_names[0]
        else:
            common_name = None

        try:
            species = Species(
                scientific_name=scientific_name,
                common_name=common_name,
                genus=genus,
                family=plant.get("family"),
            )
            db.add(species)
            seen_names.add(scientific_name)
            imported += 1
        except Exception as e:
            # Best effort: a bad entry is reported, the import continues.
            errors.append(f"Plant {idx}: {str(e)}")

    db.commit()

    return SpeciesImportResponse(
        imported=imported,
        skipped=skipped,
        errors=errors[:10],
    )
|
||||
|
||||
|
||||
@router.get("/{species_id}", response_model=SpeciesResponse)
def get_species(species_id: int, db: Session = Depends(get_db)):
    """Fetch one species, including its downloaded-image count.

    Raises:
        HTTPException: 404 when no species has the given id.
    """
    found = db.query(Species).filter(Species.id == species_id).first()
    if found:
        return get_species_with_count(db, found)
    raise HTTPException(status_code=404, detail="Species not found")
|
||||
|
||||
|
||||
@router.put("/{species_id}", response_model=SpeciesResponse)
def update_species(
    species_id: int,
    species_update: SpeciesUpdate,
    db: Session = Depends(get_db),
):
    """Modify an existing species.

    Only fields explicitly present in the request body are written.

    Raises:
        HTTPException: 404 when no species has the given id.
    """
    record = db.query(Species).filter(Species.id == species_id).first()
    if not record:
        raise HTTPException(status_code=404, detail="Species not found")

    changes = species_update.model_dump(exclude_unset=True)
    for field in changes:
        setattr(record, field, changes[field])

    db.commit()
    db.refresh(record)

    return get_species_with_count(db, record)
|
||||
|
||||
|
||||
@router.delete("/{species_id}")
def delete_species(species_id: int, db: Session = Depends(get_db)):
    """Delete a species and all its images.

    NOTE(review): removal of the images presumably relies on a cascade
    configured on the model — confirm there.

    Raises:
        HTTPException: 404 when no species has the given id.
    """
    record = db.query(Species).filter(Species.id == species_id).first()
    if record:
        db.delete(record)
        db.commit()
        return {"status": "deleted"}
    raise HTTPException(status_code=404, detail="Species not found")
|
||||
|
||||
|
||||
@router.get("/genera/list")
def list_genera(db: Session = Depends(get_db)):
    """Return the distinct, non-null genus names in ascending order."""
    rows = (
        db.query(Species.genus)
        .filter(Species.genus.isnot(None))
        .distinct()
        .order_by(Species.genus)
        .all()
    )
    # Each row is a one-column result; unwrap it to the bare name.
    return [name for (name,) in rows]
|
||||
190
backend/app/api/stats.py
Normal file
190
backend/app/api/stats.py
Normal file
@@ -0,0 +1,190 @@
|
||||
import json
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func, case
|
||||
|
||||
from app.database import get_db
|
||||
from app.models import Species, Image, Job
|
||||
from app.models.cached_stats import CachedStats
|
||||
from app.schemas.stats import StatsResponse, SourceStats, LicenseStats, SpeciesStats, JobStats
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("", response_model=StatsResponse)
def get_stats(db: Session = Depends(get_db)):
    """Serve the dashboard statistics from the cache row.

    The ``dashboard_stats`` cache entry (refreshed every 60s by Celery) is
    decoded from JSON and returned. When no entry exists yet, an all-zero
    response is returned instead.
    """
    cached = db.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()

    if not cached:
        # Nothing cached yet (first startup before Celery has run):
        # answer with empty statistics; Celery will populate them soon.
        return StatsResponse(
            total_species=0,
            total_images=0,
            images_downloaded=0,
            images_pending=0,
            images_rejected=0,
            disk_usage_mb=0.0,
            sources=[],
            licenses=[],
            jobs=JobStats(running=0, pending=0, completed=0, failed=0),
            top_species=[],
            under_represented=[],
        )

    data = json.loads(cached.value)

    # Plain values are copied through; nested entries are rebuilt as schemas.
    scalar_fields = (
        "total_species",
        "total_images",
        "images_downloaded",
        "images_pending",
        "images_rejected",
        "disk_usage_mb",
    )
    return StatsResponse(
        **{field: data[field] for field in scalar_fields},
        sources=[SourceStats(**entry) for entry in data["sources"]],
        licenses=[LicenseStats(**entry) for entry in data["licenses"]],
        jobs=JobStats(**data["jobs"]),
        top_species=[SpeciesStats(**entry) for entry in data["top_species"]],
        under_represented=[SpeciesStats(**entry) for entry in data["under_represented"]],
    )
|
||||
|
||||
|
||||
@router.post("/refresh")
def refresh_stats_now(db: Session = Depends(get_db)):
    """Queue an immediate statistics refresh on the Celery worker.

    The refresh runs asynchronously; the response only confirms queueing.
    """
    from app.workers.stats_tasks import refresh_stats as refresh_task

    refresh_task.delay()
    return {"status": "refresh_queued"}
|
||||
|
||||
|
||||
@router.get("/sources")
def get_source_stats(db: Session = Depends(get_db)):
    """Break image counts down by source.

    Returns:
        One dict per source with the total number of images and how many
        of them are downloaded, pending and rejected.
    """

    def tally(state):
        # Conditional sum: 1 for every image in ``state``, 0 otherwise.
        return func.sum(case((Image.status == state, 1), else_=0)).label(state)

    rows = (
        db.query(
            Image.source,
            func.count(Image.id).label("total"),
            tally("downloaded"),
            tally("pending"),
            tally("rejected"),
        )
        .group_by(Image.source)
        .all()
    )

    breakdown = []
    for row in rows:
        breakdown.append({
            "source": row.source,
            "total": row.total,
            "downloaded": row.downloaded or 0,
            "pending": row.pending or 0,
            "rejected": row.rejected or 0,
        })
    return breakdown
|
||||
|
||||
|
||||
@router.get("/species")
def get_species_stats(
    min_count: int = 0,
    max_count: int | None = None,
    db: Session = Depends(get_db),
):
    """Get per-species image counts."""
    # Parameters:
    #   min_count: when > 0, keep only species with at least this many
    #       downloaded images.
    #   max_count: when given, keep only species with at most this many
    #       downloaded images (None disables the upper bound).
    # Returns a list of dicts (id, scientific_name, common_name, genus,
    # image_count) ordered by image_count descending.

    # Single count expression reused for the column, HAVING and ORDER BY.
    downloaded_count = func.count(Image.id)

    # The outer join keeps species that have no downloaded images (count 0);
    # the status test lives in the join condition so it does not drop them.
    query = (
        db.query(
            Species.id,
            Species.scientific_name,
            Species.common_name,
            Species.genus,
            downloaded_count.label("image_count"),
        )
        .outerjoin(
            Image,
            (Image.species_id == Species.id) & (Image.status == "downloaded"),
        )
        .group_by(Species.id)
    )

    if min_count > 0:
        query = query.having(downloaded_count >= min_count)

    if max_count is not None:
        query = query.having(downloaded_count <= max_count)

    stats = query.order_by(downloaded_count.desc()).all()

    return [
        {
            "id": s.id,
            "scientific_name": s.scientific_name,
            "common_name": s.common_name,
            "genus": s.genus,
            "image_count": s.image_count,
        }
        for s in stats
    ]
|
||||
|
||||
|
||||
@router.get("/distribution")
def get_image_distribution(db: Session = Depends(get_db)):
    """Get distribution of images per species for ML training assessment.

    Returns counts of species at various image thresholds to help
    determine dataset quality for training image classifiers.
    """
    from sqlalchemy import text

    # Get image counts per species using optimized raw SQL
    distribution_sql = text("""
        WITH species_counts AS (
            SELECT
                s.id,
                COUNT(i.id) as cnt
            FROM species s
            LEFT JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
            GROUP BY s.id
        )
        SELECT
            COUNT(*) as total_species,
            SUM(CASE WHEN cnt = 0 THEN 1 ELSE 0 END) as with_0,
            SUM(CASE WHEN cnt >= 1 AND cnt < 10 THEN 1 ELSE 0 END) as with_1_9,
            SUM(CASE WHEN cnt >= 10 AND cnt < 25 THEN 1 ELSE 0 END) as with_10_24,
            SUM(CASE WHEN cnt >= 25 AND cnt < 50 THEN 1 ELSE 0 END) as with_25_49,
            SUM(CASE WHEN cnt >= 50 AND cnt < 100 THEN 1 ELSE 0 END) as with_50_99,
            SUM(CASE WHEN cnt >= 100 AND cnt < 200 THEN 1 ELSE 0 END) as with_100_199,
            SUM(CASE WHEN cnt >= 200 THEN 1 ELSE 0 END) as with_200_plus,
            SUM(CASE WHEN cnt >= 10 THEN 1 ELSE 0 END) as trainable_10,
            SUM(CASE WHEN cnt >= 25 THEN 1 ELSE 0 END) as trainable_25,
            SUM(CASE WHEN cnt >= 50 THEN 1 ELSE 0 END) as trainable_50,
            SUM(CASE WHEN cnt >= 100 THEN 1 ELSE 0 END) as trainable_100,
            AVG(cnt) as avg_images,
            MAX(cnt) as max_images,
            MIN(cnt) as min_images,
            SUM(cnt) as total_images
        FROM species_counts
    """)

    row = db.execute(distribution_sql).fetchone()

    # Unpack the 16 selected columns in SELECT order. NULL aggregates (for
    # example when the species table is empty) are normalised to 0.
    (
        total_species,
        with_0,
        with_1_9,
        with_10_24,
        with_25_49,
        with_50_99,
        with_100_199,
        with_200_plus,
        trainable_10,
        trainable_25,
        trainable_50,
        trainable_100,
        avg_images,
        max_images,
        min_images,
        total_images,
    ) = (value or 0 for value in row)

    distribution = {
        "0_images": with_0,
        "1_to_9": with_1_9,
        "10_to_24": with_10_24,
        "25_to_49": with_25_49,
        "50_to_99": with_50_99,
        "100_to_199": with_100_199,
        "200_plus": with_200_plus,
    }
    trainable_species = {
        "min_10_images": trainable_10,
        "min_25_images": trainable_25,
        "min_50_images": trainable_50,
        "min_100_images": trainable_100,
    }
    summary = {
        "avg_images_per_species": round(avg_images, 1),
        "max_images": max_images,
        "min_images": min_images,
        "total_downloaded_images": total_images,
    }
    recommendations = {
        "for_basic_model": f"{trainable_10} species with 10+ images",
        "for_good_model": f"{trainable_50} species with 50+ images",
        "for_excellent_model": f"{trainable_100} species with 100+ images",
    }

    return {
        "total_species": total_species,
        "distribution": distribution,
        "trainable_species": trainable_species,
        "summary": summary,
        "recommendations": recommendations,
    }
|
||||
38
backend/app/config.py
Normal file
38
backend/app/config.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
from functools import lru_cache
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    # Application configuration. Each field below carries a default; the
    # nested Config points pydantic-settings at a ".env" file for overrides.

    # Database
    database_url: str = "sqlite:////data/db/plants.sqlite"

    # Redis
    redis_url: str = "redis://redis:6379/0"

    # Storage paths
    images_path: str = "/data/images"
    exports_path: str = "/data/exports"
    imports_path: str = "/data/imports"
    logs_path: str = "/data/logs"

    # API Keys
    # Empty string means "not configured" as far as these defaults go.
    flickr_api_key: str = ""
    flickr_api_secret: str = ""
    inaturalist_app_id: str = ""
    inaturalist_app_secret: str = ""
    trefle_api_key: str = ""

    # Logging
    log_level: str = "INFO"

    # Celery
    celery_concurrency: int = 4

    class Config:
        # Read overrides from a local .env file.
        env_file = ".env"
        # Unknown keys are ignored rather than rejected.
        extra = "ignore"
|
||||
|
||||
|
||||
@lru_cache
def get_settings() -> Settings:
    """Return the application settings, constructing them on first call.

    The result is memoised, so later calls reuse the same ``Settings``
    instance.
    """
    return Settings()
|
||||
44
backend/app/database.py
Normal file
44
backend/app/database.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from sqlalchemy import create_engine, event
|
||||
from sqlalchemy.orm import sessionmaker, declarative_base
|
||||
from sqlalchemy.pool import StaticPool
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
# Settings are resolved once, when this module is imported.
settings = get_settings()

# SQLite-specific configuration
# check_same_thread=False lets the sqlite3 connection be used from threads
# other than the one that created it.
connect_args = {"check_same_thread": False}

engine = create_engine(
    settings.database_url,
    connect_args=connect_args,
    # NOTE(review): StaticPool keeps exactly one DBAPI connection and hands
    # it to every checkout, so all sessions (and threads) share a single
    # SQLite connection -- confirm this is intended for a file-backed
    # database with concurrent web and worker access.
    poolclass=StaticPool,
    echo=False,
)
|
||||
|
||||
# Enable WAL mode for better concurrent access
@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Apply SQLite PRAGMAs to each newly opened DBAPI connection.

    Registered as a ``connect`` listener on ``engine``.

    Args:
        dbapi_connection: The raw DBAPI connection that was just opened.
        connection_record: Pool bookkeeping record for the connection
            (unused here).
    """
    cursor = dbapi_connection.cursor()
    try:
        cursor.execute("PRAGMA journal_mode=WAL")
        cursor.execute("PRAGMA synchronous=NORMAL")
        # SQLite leaves foreign-key enforcement off unless it is requested
        # on each connection.
        cursor.execute("PRAGMA foreign_keys=ON")
    finally:
        # Release the cursor even if one of the PRAGMA statements raises.
        cursor.close()
|
||||
|
||||
# Session factory bound to the engine above. Autocommit and autoflush are
# both disabled, so callers commit (and flush) explicitly.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class that the ORM models inherit from.
Base = declarative_base()
|
||||
|
||||
|
||||
def get_db():
    """Yield a database session and close it once the consumer is done.

    Written as a generator so it can be used as a dependency: the session
    is created, handed to the caller, and closed afterwards even when the
    caller raises.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
|
||||
|
||||
def init_db():
    """Create all tables."""
    # The model modules are imported only for their side effect: defining
    # the model classes attaches their tables to Base.metadata.
    from app.models import (  # noqa
        species,
        image,
        job,
        api_key,
        export,
        cached_stats,
    )

    Base.metadata.create_all(bind=engine)
|
||||
95
backend/app/main.py
Normal file
95
backend/app/main.py
Normal file
@@ -0,0 +1,95 @@
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from app.config import get_settings
|
||||
from app.database import init_db
|
||||
from app.api import species, images, jobs, exports, stats, sources
|
||||
|
||||
# Settings are resolved once, when this module is imported.
settings = get_settings()

# The FastAPI application object; routers and middleware are attached below.
app = FastAPI(
    title="PlantGuideScraper API",
    description="Web scraper interface for houseplant image collection",
    version="1.0.0",
)

# CORS middleware
# NOTE(review): a wildcard origin is combined with allow_credentials=True.
# Browsers do not accept a literal "*" origin on credentialed requests, and
# an allow-everything policy with credentials is risky -- confirm whether
# credentials are needed, or list the frontend origin(s) explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers
# Each API module exposes a `router`; the URL prefix is assigned here.
app.include_router(species.router, prefix="/api/species", tags=["Species"])
app.include_router(images.router, prefix="/api/images", tags=["Images"])
app.include_router(jobs.router, prefix="/api/jobs", tags=["Jobs"])
app.include_router(exports.router, prefix="/api/exports", tags=["Exports"])
app.include_router(stats.router, prefix="/api/stats", tags=["Stats"])
app.include_router(sources.router, prefix="/api/sources", tags=["Sources"])
|
||||
|
||||
|
||||
@app.on_event("startup")
async def startup_event():
    """Initialize database on startup."""
    # Runs once when the application starts; init_db() issues create_all
    # for the registered models.
    init_db()
|
||||
|
||||
|
||||
@app.get("/health")
async def health_check():
    """Health check endpoint."""
    # Static payload: nothing (database, broker, ...) is probed here.
    status_payload = {"status": "healthy", "service": "plant-scraper"}
    return status_payload
|
||||
|
||||
|
||||
@app.get("/api/debug")
async def debug_check():
    """Debug endpoint - checks database connection."""
    # Runs three checks in order (session creation, species count, image
    # count) and stops at the first failure. The result has the shape
    #   {"status": "healthy" | "error", "checks": {<name>: {...}}}
    # where each check holds "ok" plus either timing/count data or "error".
    import time
    from app.database import SessionLocal
    from app.models import Species, Image

    def elapsed_ms(started):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        return int((time.perf_counter() - started) * 1000)

    results = {"status": "checking", "checks": {}}

    # Check 1: Can we create a session?
    try:
        start = time.perf_counter()
        db = SessionLocal()
        results["checks"]["session_create"] = {"ok": True, "ms": elapsed_ms(start)}
    except Exception as e:
        results["checks"]["session_create"] = {"ok": False, "error": str(e)}
        results["status"] = "error"
        return results

    # Checks 2 and 3: simple COUNT queries against each table. The session
    # is closed in one place, whichever path leaves this block.
    try:
        for check_name, model in (("species_count", Species), ("image_count", Image)):
            try:
                start = time.perf_counter()
                count = db.query(model).count()
                results["checks"][check_name] = {
                    "ok": True,
                    "count": count,
                    "ms": elapsed_ms(start),
                }
            except Exception as e:
                results["checks"][check_name] = {"ok": False, "error": str(e)}
                results["status"] = "error"
                return results
    finally:
        db.close()

    results["status"] = "healthy"
    return results
|
||||
|
||||
|
||||
@app.get("/")
async def root():
    """Root endpoint."""
    # Points callers at the interactive documentation path.
    landing = {"message": "PlantGuideScraper API", "docs": "/docs"}
    return landing
|
||||
8
backend/app/models/__init__.py
Normal file
8
backend/app/models/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from app.models.species import Species
|
||||
from app.models.image import Image
|
||||
from app.models.job import Job
|
||||
from app.models.api_key import ApiKey
|
||||
from app.models.export import Export
|
||||
from app.models.cached_stats import CachedStats
|
||||
|
||||
# Public names re-exported by the models package.
__all__ = ["Species", "Image", "Job", "ApiKey", "Export", "CachedStats"]
|
||||
18
backend/app/models/api_key.py
Normal file
18
backend/app/models/api_key.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from sqlalchemy import Column, Integer, String, Float, Boolean
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class ApiKey(Base):
    """ORM model for the ``api_keys`` table: one credential row per source."""

    __tablename__ = "api_keys"

    id = Column(Integer, primary_key=True, index=True)
    source = Column(String, unique=True, nullable=False)  # 'flickr', 'inaturalist', 'wikimedia', 'trefle'
    # NOTE(review): the ApiKeyBase schema allows api_key to be omitted for
    # no-auth sources, but this column is NOT NULL -- confirm what gets
    # stored for those sources.
    api_key = Column(String, nullable=False)  # Also used as Client ID for OAuth sources
    api_secret = Column(String, nullable=True)  # Also used as Client Secret for OAuth sources
    access_token = Column(String, nullable=True)  # For OAuth sources like Wikimedia
    rate_limit_per_sec = Column(Float, default=1.0)
    enabled = Column(Boolean, default=True)

    def __repr__(self):
        # Deliberately omits the key, secret and token values.
        return f"<ApiKey(id={self.id}, source='{self.source}', enabled={self.enabled})>"
|
||||
14
backend/app/models/cached_stats.py
Normal file
14
backend/app/models/cached_stats.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from datetime import datetime
|
||||
from sqlalchemy import Column, Integer, String, Text, DateTime
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class CachedStats(Base):
    """Stores pre-calculated statistics updated by Celery beat."""

    __tablename__ = "cached_stats"

    id = Column(Integer, primary_key=True, index=True)
    # Lookup key for the cached document (unique, indexed).
    key = Column(String(50), unique=True, nullable=False, index=True)
    value = Column(Text, nullable=False)  # JSON-encoded stats
    # Set on insert and refreshed on every update, using naive UTC timestamps.
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
24
backend/app/models/export.py
Normal file
24
backend/app/models/export.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from sqlalchemy import Column, Integer, String, Float, DateTime, Text, func
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class Export(Base):
    """ORM model for the ``exports`` table: one row per dataset export."""

    __tablename__ = "exports"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    filter_criteria = Column(Text, nullable=True)  # JSON: min_images, licenses, min_quality, species_ids
    train_split = Column(Float, default=0.8)
    status = Column(String, default="pending")  # pending, generating, completed, failed
    # Output details; nullable, so they may be unset on a row.
    file_path = Column(String, nullable=True)
    file_size = Column(Integer, nullable=True)
    species_count = Column(Integer, nullable=True)
    image_count = Column(Integer, nullable=True)
    celery_task_id = Column(String, nullable=True)
    # Timestamp assigned by the database at insert time.
    created_at = Column(DateTime, server_default=func.now())
    completed_at = Column(DateTime, nullable=True)
    error_message = Column(Text, nullable=True)

    def __repr__(self):
        return f"<Export(id={self.id}, name='{self.name}', status='{self.status}')>"
|
||||
36
backend/app/models/image.py
Normal file
36
backend/app/models/image.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from sqlalchemy import Column, Integer, String, Float, DateTime, ForeignKey, func, UniqueConstraint, Index
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class Image(Base):
    """ORM model for the ``images`` table: one row per image found for a species."""

    __tablename__ = "images"

    id = Column(Integer, primary_key=True, index=True)
    species_id = Column(Integer, ForeignKey("species.id"), nullable=False, index=True)
    source = Column(String, nullable=False, index=True)
    # Identifier of the image at its source; unique per source (see
    # uq_source_source_id below).
    source_id = Column(String, nullable=True)
    url = Column(String, nullable=False)
    local_path = Column(String, nullable=True)
    license = Column(String, nullable=False, index=True)
    attribution = Column(String, nullable=True)
    width = Column(Integer, nullable=True)
    height = Column(Integer, nullable=True)
    phash = Column(String, nullable=True, index=True)
    quality_score = Column(Float, nullable=True)
    status = Column(String, default="pending", index=True)  # pending, downloaded, rejected, deleted
    # Timestamp assigned by the database at insert time.
    created_at = Column(DateTime, server_default=func.now())

    # Composite indexes for common query patterns
    __table_args__ = (
        UniqueConstraint("source", "source_id", name="uq_source_source_id"),
        Index("ix_images_species_status", "species_id", "status"),  # For counting images per species by status
        Index("ix_images_status_created", "status", "created_at"),  # For listing images by status
    )

    # Relationships
    species = relationship("Species", back_populates="images")

    def __repr__(self):
        return f"<Image(id={self.id}, source='{self.source}', status='{self.status}')>"
|
||||
27
backend/app/models/job.py
Normal file
27
backend/app/models/job.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from sqlalchemy import Column, Integer, String, DateTime, Text, Boolean, func
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class Job(Base):
    """ORM model for the ``jobs`` table: one row per scraping job."""

    __tablename__ = "jobs"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    source = Column(String, nullable=False)
    species_filter = Column(Text, nullable=True)  # JSON array of species IDs or NULL for all
    only_without_images = Column(Boolean, default=False)  # If True, only scrape species with 0 images
    max_images = Column(Integer, nullable=True)  # If set, only scrape species with fewer than N images
    status = Column(String, default="pending", index=True)  # pending, running, paused, completed, failed
    # Progress counters, all starting at 0.
    progress_current = Column(Integer, default=0)
    progress_total = Column(Integer, default=0)
    images_downloaded = Column(Integer, default=0)
    images_rejected = Column(Integer, default=0)
    celery_task_id = Column(String, nullable=True)
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    error_message = Column(Text, nullable=True)
    # Timestamp assigned by the database at insert time.
    created_at = Column(DateTime, server_default=func.now())

    def __repr__(self):
        return f"<Job(id={self.id}, name='{self.name}', status='{self.status}')>"
|
||||
21
backend/app/models/species.py
Normal file
21
backend/app/models/species.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from sqlalchemy import Column, Integer, String, DateTime, func
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class Species(Base):
    """ORM model for the ``species`` table: one row per plant species."""

    __tablename__ = "species"

    id = Column(Integer, primary_key=True, index=True)
    # The scientific name is the natural key: unique and indexed.
    scientific_name = Column(String, unique=True, nullable=False, index=True)
    common_name = Column(String, nullable=True)
    genus = Column(String, nullable=True, index=True)
    family = Column(String, nullable=True)
    # Timestamp assigned by the database at insert time.
    created_at = Column(DateTime, server_default=func.now())

    # Relationships
    # The cascade removes a species' images together with the species.
    images = relationship("Image", back_populates="species", cascade="all, delete-orphan")

    def __repr__(self):
        return f"<Species(id={self.id}, scientific_name='{self.scientific_name}')>"
|
||||
15
backend/app/schemas/__init__.py
Normal file
15
backend/app/schemas/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from app.schemas.species import SpeciesCreate, SpeciesUpdate, SpeciesResponse, SpeciesListResponse
|
||||
from app.schemas.image import ImageResponse, ImageListResponse, ImageFilter
|
||||
from app.schemas.job import JobCreate, JobResponse, JobListResponse
|
||||
from app.schemas.api_key import ApiKeyCreate, ApiKeyUpdate, ApiKeyResponse
|
||||
from app.schemas.export import ExportCreate, ExportResponse, ExportListResponse
|
||||
from app.schemas.stats import StatsResponse, SourceStats, SpeciesStats
|
||||
|
||||
# Public names re-exported by the schemas package, one line per schema module.
__all__ = [
    "SpeciesCreate", "SpeciesUpdate", "SpeciesResponse", "SpeciesListResponse",
    "ImageResponse", "ImageListResponse", "ImageFilter",
    "JobCreate", "JobResponse", "JobListResponse",
    "ApiKeyCreate", "ApiKeyUpdate", "ApiKeyResponse",
    "ExportCreate", "ExportResponse", "ExportListResponse",
    "StatsResponse", "SourceStats", "SpeciesStats",
]
|
||||
36
backend/app/schemas/api_key.py
Normal file
36
backend/app/schemas/api_key.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class ApiKeyBase(BaseModel):
    # Fields shared by the API-key payloads; only `source` is required.
    source: str
    api_key: Optional[str] = None  # Optional for no-auth sources, used as Client ID for OAuth
    api_secret: Optional[str] = None  # Also used as Client Secret for OAuth sources
    access_token: Optional[str] = None  # For OAuth sources like Wikimedia
    rate_limit_per_sec: float = 1.0
    enabled: bool = True
|
||||
|
||||
|
||||
class ApiKeyCreate(ApiKeyBase):
    # Creation payload; adds nothing beyond ApiKeyBase.
    pass
|
||||
|
||||
|
||||
class ApiKeyUpdate(BaseModel):
    # Partial-update payload: every field defaults to None.
    # `source` is not part of this schema.
    api_key: Optional[str] = None
    api_secret: Optional[str] = None
    access_token: Optional[str] = None
    rate_limit_per_sec: Optional[float] = None
    enabled: Optional[bool] = None
|
||||
|
||||
|
||||
class ApiKeyResponse(BaseModel):
    # Outbound view of a stored key. Secrets are not included: the key is
    # masked and the secret/token are reduced to presence flags.
    id: int
    source: str
    api_key_masked: str  # Show only last 4 chars
    has_secret: bool
    has_access_token: bool
    rate_limit_per_sec: float
    enabled: bool

    class Config:
        # Allow validation from attribute-bearing objects, not just dicts.
        from_attributes = True
|
||||
45
backend/app/schemas/export.py
Normal file
45
backend/app/schemas/export.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
from typing import Optional, List
|
||||
|
||||
|
||||
class ExportFilter(BaseModel):
    # Selection criteria for an export.
    min_images_per_species: int = 100
    licenses: Optional[List[str]] = None  # None means all
    min_quality: Optional[float] = None
    species_ids: Optional[List[int]] = None  # None means all
|
||||
|
||||
|
||||
class ExportCreate(BaseModel):
    # Request payload for creating an export.
    name: str
    filter_criteria: ExportFilter
    # NOTE(review): no range validation here -- presumably a fraction in
    # [0, 1]; confirm against the export worker.
    train_split: float = 0.8
|
||||
|
||||
|
||||
class ExportResponse(BaseModel):
    # Outbound view of an export; field names match the Export model columns.
    id: int
    name: str
    # Typed as a string here (the model column comment describes it as JSON).
    filter_criteria: Optional[str] = None
    train_split: float
    status: str
    file_path: Optional[str] = None
    file_size: Optional[int] = None
    species_count: Optional[int] = None
    image_count: Optional[int] = None
    created_at: datetime
    completed_at: Optional[datetime] = None
    error_message: Optional[str] = None

    class Config:
        # Allow validation from attribute-bearing objects, not just dicts.
        from_attributes = True
|
||||
|
||||
|
||||
class ExportListResponse(BaseModel):
    # List envelope: the exports plus a total count.
    items: List[ExportResponse]
    total: int
|
||||
|
||||
|
||||
class ExportPreview(BaseModel):
    # Size preview for an export: counts and an estimated size in megabytes.
    species_count: int
    image_count: int
    estimated_size_mb: float
|
||||
47
backend/app/schemas/image.py
Normal file
47
backend/app/schemas/image.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
from typing import Optional, List
|
||||
|
||||
|
||||
class ImageBase(BaseModel):
    # Minimal required image fields.
    species_id: int
    source: str
    url: str
    license: str
|
||||
|
||||
|
||||
class ImageResponse(BaseModel):
    # Outbound view of an image record.
    id: int
    species_id: int
    # Not a column on the Image model; defaults to None unless supplied.
    species_name: Optional[str] = None
    source: str
    source_id: Optional[str] = None
    url: str
    local_path: Optional[str] = None
    license: str
    attribution: Optional[str] = None
    width: Optional[int] = None
    height: Optional[int] = None
    quality_score: Optional[float] = None
    status: str
    created_at: datetime

    class Config:
        # Allow validation from attribute-bearing objects, not just dicts.
        from_attributes = True
|
||||
|
||||
|
||||
class ImageListResponse(BaseModel):
    # Paginated list envelope for images.
    items: List[ImageResponse]
    total: int
    page: int
    page_size: int
    pages: int
|
||||
|
||||
|
||||
class ImageFilter(BaseModel):
    # Optional image filters; every field defaults to None.
    species_id: Optional[int] = None
    source: Optional[str] = None
    license: Optional[str] = None
    status: Optional[str] = None
    min_quality: Optional[float] = None
    search: Optional[str] = None
|
||||
35
backend/app/schemas/job.py
Normal file
35
backend/app/schemas/job.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
from typing import Optional, List
|
||||
|
||||
|
||||
class JobCreate(BaseModel):
    # Request payload for creating a scraping job.
    name: str
    source: str
    species_ids: Optional[List[int]] = None  # None means all species
    only_without_images: bool = False  # If True, only scrape species with 0 images
    max_images: Optional[int] = None  # If set, only scrape species with fewer than N images
|
||||
|
||||
|
||||
class JobResponse(BaseModel):
    # Outbound view of a job; field names match the Job model columns.
    id: int
    name: str
    source: str
    # Typed as a string here (the model column comment describes it as a
    # JSON array of species IDs).
    species_filter: Optional[str] = None
    status: str
    progress_current: int
    progress_total: int
    images_downloaded: int
    images_rejected: int
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    error_message: Optional[str] = None
    created_at: datetime

    class Config:
        # Allow validation from attribute-bearing objects, not just dicts.
        from_attributes = True
|
||||
|
||||
|
||||
class JobListResponse(BaseModel):
    # List envelope: the jobs plus a total count.
    items: List[JobResponse]
    total: int
|
||||
44
backend/app/schemas/species.py
Normal file
44
backend/app/schemas/species.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
from typing import Optional, List
|
||||
|
||||
|
||||
class SpeciesBase(BaseModel):
    # Fields shared by the species payloads; only the scientific name is required.
    scientific_name: str
    common_name: Optional[str] = None
    genus: Optional[str] = None
    family: Optional[str] = None
|
||||
|
||||
|
||||
class SpeciesCreate(SpeciesBase):
    # Creation payload; adds nothing beyond SpeciesBase.
    pass
|
||||
|
||||
|
||||
class SpeciesUpdate(BaseModel):
    # Partial-update payload: every field defaults to None.
    scientific_name: Optional[str] = None
    common_name: Optional[str] = None
    genus: Optional[str] = None
    family: Optional[str] = None
|
||||
|
||||
|
||||
class SpeciesResponse(SpeciesBase):
    # Outbound view of a species: the base fields plus id, creation time
    # and an image count.
    id: int
    created_at: datetime
    # Not a column on the Species model; defaults to 0 unless supplied.
    image_count: int = 0

    class Config:
        # Allow validation from attribute-bearing objects, not just dicts.
        from_attributes = True
|
||||
|
||||
|
||||
class SpeciesListResponse(BaseModel):
    # Paginated list envelope for species.
    items: List[SpeciesResponse]
    total: int
    page: int
    page_size: int
    pages: int
|
||||
|
||||
|
||||
class SpeciesImportResponse(BaseModel):
    # Result summary of a species import: counts plus collected error messages.
    imported: int
    skipped: int
    errors: List[str]
|
||||
43
backend/app/schemas/stats.py
Normal file
43
backend/app/schemas/stats.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
class SourceStats(BaseModel):
    # Per-source image counts, overall and by status.
    source: str
    image_count: int
    downloaded: int
    pending: int
    rejected: int
|
||||
|
||||
|
||||
class LicenseStats(BaseModel):
    # Count of images carrying one license value.
    license: str
    count: int
|
||||
|
||||
|
||||
class SpeciesStats(BaseModel):
    # Image count for one species.
    id: int
    scientific_name: str
    # May be None, but has no default, so it must always be supplied.
    common_name: str | None
    image_count: int
|
||||
|
||||
|
||||
class JobStats(BaseModel):
    # Number of jobs in each of four states.
    running: int
    pending: int
    completed: int
    failed: int
|
||||
|
||||
|
||||
class StatsResponse(BaseModel):
    # Dashboard statistics payload: overall totals, breakdowns by source and
    # license, job counts, and two species lists.
    total_species: int
    total_images: int
    images_downloaded: int
    images_pending: int
    images_rejected: int
    disk_usage_mb: float
    sources: List[SourceStats]
    licenses: List[LicenseStats]
    jobs: JobStats
    top_species: List[SpeciesStats]
    under_represented: List[SpeciesStats]  # Species with < 100 images
|
||||
41
backend/app/scrapers/__init__.py
Normal file
41
backend/app/scrapers/__init__.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from typing import Optional
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.scrapers.inaturalist import INaturalistScraper
|
||||
from app.scrapers.flickr import FlickrScraper
|
||||
from app.scrapers.wikimedia import WikimediaScraper
|
||||
from app.scrapers.trefle import TrefleScraper
|
||||
from app.scrapers.gbif import GBIFScraper
|
||||
from app.scrapers.duckduckgo import DuckDuckGoScraper
|
||||
from app.scrapers.bing import BingScraper
|
||||
|
||||
|
||||
def get_scraper(source: str) -> Optional[BaseScraper]:
    """Return a new scraper instance for ``source``, or None if unknown.

    Args:
        source: Source identifier such as ``"flickr"`` or ``"gbif"``.

    Returns:
        A freshly constructed scraper for that source, or ``None`` when the
        identifier is not in the registry.
    """
    registry = {
        "inaturalist": INaturalistScraper,
        "flickr": FlickrScraper,
        "wikimedia": WikimediaScraper,
        "trefle": TrefleScraper,
        "gbif": GBIFScraper,
        "duckduckgo": DuckDuckGoScraper,
        "bing": BingScraper,
    }

    try:
        scraper_cls = registry[source]
    except KeyError:
        return None
    # Instantiated outside the try so errors raised by a constructor are
    # not mistaken for an unknown source.
    return scraper_cls()
|
||||
|
||||
|
||||
# Public names re-exported by the scrapers package.
__all__ = [
    "get_scraper",
    "BaseScraper",
    "INaturalistScraper",
    "FlickrScraper",
    "WikimediaScraper",
    "TrefleScraper",
    "GBIFScraper",
    "DuckDuckGoScraper",
    "BingScraper",
]
|
||||
57
backend/app/scrapers/base.py
Normal file
57
backend/app/scrapers/base.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any, Optional
|
||||
import logging
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models import Species, ApiKey
|
||||
|
||||
|
||||
class BaseScraper(ABC):
    """Base class for all image scrapers.

    Subclasses set ``name`` to their source identifier and implement
    ``scrape_species`` and ``test_connection``.
    """

    # Source identifier; also the value matched against ApiKey.source in
    # get_api_key().
    name: str = "base"
    # Whether the source needs credentials; subclasses may override.
    requires_api_key: bool = True

    @abstractmethod
    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """
        Scrape images for a species.

        Args:
            species: The species to scrape images for
            db: Database session
            logger: Optional logger for debugging

        Returns:
            Dict with 'downloaded' and 'rejected' counts
        """
        pass

    @abstractmethod
    def test_connection(self, api_key: ApiKey) -> str:
        """
        Test API connection.

        Args:
            api_key: The API key configuration

        Returns:
            Success message

        Raises:
            Exception if connection fails
        """
        pass

    def get_api_key(self, db: Session) -> Optional[ApiKey]:
        """Get the enabled API key row for this scraper, if one exists.

        Args:
            db: Database session

        Returns:
            The first enabled ``ApiKey`` whose source equals ``self.name``,
            or ``None`` when no such row exists. Callers must handle the
            ``None`` case.
        """
        return db.query(ApiKey).filter(
            ApiKey.source == self.name,
            # SQL expression, not a Python truth test: "== True" is intended.
            ApiKey.enabled == True  # noqa: E712
        ).first()
|
||||
228
backend/app/scrapers/bhl.py
Normal file
228
backend/app/scrapers/bhl.py
Normal file
@@ -0,0 +1,228 @@
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class BHLScraper(BaseScraper):
    """Scraper for Biodiversity Heritage Library (BHL) images.

    BHL provides access to digitized biodiversity literature and illustrations.
    Most content is public domain (pre-1927) or CC-licensed.

    Note: BHL images are primarily historical botanical illustrations,
    which may differ from photographs but are valuable for training.
    """

    name = "bhl"
    requires_api_key = True  # BHL requires free API key

    BASE_URL = "https://www.biodiversitylibrary.org/api3"

    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # BHL content is mostly public domain
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA", "PD"}

    # Requests per second assumed when the stored key has no usable rate limit.
    DEFAULT_RATE_LIMIT = 0.5

    # Bounds on the amount of work done for a single species.
    MAX_NAME_MATCHES = 5
    MAX_PAGES_PER_TITLE = 100
    MAX_IMAGES_PER_SPECIES = 50

    # Lower-cased page type names that are treated as illustrations.
    ILLUSTRATION_PAGE_TYPES = frozenset({"illustration", "plate", "figure", "map"})

    # Pages whose "Year" is earlier than this are labelled "PD".
    PUBLIC_DOMAIN_BEFORE_YEAR = 1927

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from BHL for a species.

        Searches BHL for the species' scientific name, walks the matching
        names, titles and pages, and stores one pending ``Image`` row per new
        candidate page before queueing it for download. Stops after
        ``MAX_IMAGES_PER_SPECIES`` new images.

        Args:
            species: Species whose ``scientific_name`` is searched.
            db: Database session used for duplicate checks and inserts.
            logger: Optional logger; when omitted nothing is logged.

        Returns:
            ``{"downloaded": n, "rejected": 0}``; an additional ``"error"``
            entry is present when no API key is configured. HTTP and other
            errors are logged and end the scrape early, returning the
            counts reached so far.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        # A missing, zero or negative stored rate would make the delay
        # computation raise (and the broad handler below would hide it),
        # so fall back to the default in that case.
        configured_rate = api_key.rate_limit_per_sec
        if configured_rate and configured_rate > 0:
            rate_limit = configured_rate
        else:
            rate_limit = self.DEFAULT_RATE_LIMIT
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0  # never incremented here; kept so the result has both keys

        def log(level: str, msg: str):
            if logger:
                getattr(logger, level)(msg)

        try:
            # Disable SSL verification - some Docker environments lack proper CA certificates
            # NOTE(review): verify=False turns off TLS certificate checks for
            # every BHL request; installing CA certificates in the image
            # would be the safer fix.
            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
                candidates = self._iter_candidate_pages(client, api_key, species, delay, log)
                for title, page in candidates:
                    page_id = page.get("PageID")
                    if not page_id:
                        continue

                    # The page scan URL is derived from the page id.
                    image_url = f"https://www.biodiversitylibrary.org/pageimage/{page_id}"

                    # Skip pages that were stored by an earlier run.
                    source_id = str(page_id)
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    license_code = self._license_for_page(page)

                    title_name = title.get("ShortTitle", title.get("FullTitle", "Unknown"))
                    attribution = f"From '{title_name}' via Biodiversity Heritage Library ({license_code})"

                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=image_url,
                        license=license_code,
                        attribution=attribution,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                    # Limit total per species; leaving the loop also stops
                    # the generator from issuing further API requests.
                    if downloaded >= self.MAX_IMAGES_PER_SPECIES:
                        break

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            log("error", f" Error scraping BHL for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def _iter_candidate_pages(self, client, api_key, species, delay, log):
        """Yield ``(title, page)`` pairs for pages that may hold illustrations.

        Performs the name search, then for each of the first
        ``MAX_NAME_MATCHES`` names fetches its titles, and for each title its
        pages (at most ``MAX_PAGES_PER_TITLE`` are inspected). Sleeps
        ``delay`` seconds after every successful API request. Yields nothing
        when the name search has no results.

        NOTE(review): the ``NameGetDetail`` op and the ``titleid`` parameter
        of ``GetPageMetadata`` should be confirmed against the BHL API v3
        reference; they are kept exactly as the original code sent them.
        """
        search_response = client.get(
            self.BASE_URL,
            params={
                "op": "NameSearch",
                "name": species.scientific_name,
                "format": "json",
                "apikey": api_key.api_key,
            },
        )
        search_response.raise_for_status()

        results = search_response.json().get("Result", [])
        if not results:
            log("info", f" Species not found in BHL: {species.scientific_name}")
            return

        time.sleep(delay)

        for name_result in results[: self.MAX_NAME_MATCHES]:
            name_bank_id = name_result.get("NameBankID")
            if not name_bank_id:
                continue

            # Get publications with this name
            pub_response = client.get(
                self.BASE_URL,
                params={
                    "op": "NameGetDetail",
                    "namebankid": name_bank_id,
                    "format": "json",
                    "apikey": api_key.api_key,
                },
            )
            pub_response.raise_for_status()
            pub_data = pub_response.json()

            time.sleep(delay)

            for title in pub_data.get("Result") or []:
                title_id = title.get("TitleID")
                if not title_id:
                    continue

                # Get pages for this title
                pages_response = client.get(
                    self.BASE_URL,
                    params={
                        "op": "GetPageMetadata",
                        "titleid": title_id,
                        "format": "json",
                        "apikey": api_key.api_key,
                        "ocr": "false",
                        "names": "false",
                    },
                )

                # A failed page lookup only skips this title.
                if pages_response.status_code != 200:
                    continue

                pages = pages_response.json().get("Result") or []

                time.sleep(delay)

                for page in pages[: self.MAX_PAGES_PER_TITLE]:
                    if self._is_candidate_page(page):
                        yield title, page

    def _is_candidate_page(self, page) -> bool:
        """Return True when ``page`` should be considered for download.

        A page with no type metadata is kept, because it cannot be ruled
        out; a page with type metadata is kept only when one of its types is
        in ``ILLUSTRATION_PAGE_TYPES``.
        """
        page_types = page.get("PageTypes", [])
        if not page_types:
            return True
        return any(
            (pt.get("PageTypeName") or "").lower() in self.ILLUSTRATION_PAGE_TYPES
            for pt in page_types
        )

    def _license_for_page(self, page) -> str:
        """Return the license code stored for ``page``.

        "PD" when the page carries a non-zero "Year" earlier than
        ``PUBLIC_DOMAIN_BEFORE_YEAR``; "CC0" otherwise, including when the
        year is missing or unparsable.

        NOTE(review): labelling undated or later works "CC0" is the original
        code's default and is an assumption about BHL content, not something
        the API response confirms.
        """
        year = None
        try:
            if "Year" in page:
                year = int(page.get("Year", 0))
        except (ValueError, TypeError):
            pass

        if year and year < self.PUBLIC_DOMAIN_BEFORE_YEAR:
            return "PD"
        return "CC0"

    def test_connection(self, api_key: ApiKey) -> str:
        """Test BHL API connection.

        Runs a ``NameSearch`` for "Rosa" with the given key and returns a
        human-readable summary.

        Raises:
            httpx.HTTPStatusError: If BHL answers with an error status.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
            response = client.get(
                self.BASE_URL,
                params={
                    "op": "NameSearch",
                    "name": "Rosa",
                    "format": "json",
                    "apikey": api_key.api_key,
                },
            )
            response.raise_for_status()
            data = response.json()

        results = data.get("Result", [])
        return f"BHL API connection successful ({len(results)} results for 'Rosa')"
|
||||
135
backend/app/scrapers/bing.py
Normal file
135
backend/app/scrapers/bing.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import hashlib
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class BingScraper(BaseScraper):
    """Scraper for Bing Image Search v7 API (Azure Cognitive Services).

    Each species is searched by scientific name and, when it has one, by
    common name. Every result not seen before is stored as a pending
    ``Image`` row and handed to the download task.
    """

    name = "bing"
    requires_api_key = True

    BASE_URL = "https://api.bing.microsoft.com/v7.0/images/search"

    # Appended to every query to exclude non-photographic material.
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    # Bing license names -> short license codes stored on Image rows.
    LICENSE_MAP = {
        "Public": "CC0",
        "Share": "CC-BY-SA",
        "ShareCommercially": "CC-BY",
        "Modify": "CC-BY-SA",
        "ModifyCommercially": "CC-BY",
    }

    def _build_queries(self, species: Species) -> list[str]:
        """Return the search strings to run for ``species``.

        The scientific-name query is always present; the common-name query
        is added only when ``species.common_name`` is set.
        """
        subjects = [(species.scientific_name, "plant")]
        if species.common_name:
            subjects.append((species.common_name, "houseplant"))
        return [
            f'"{subject}" {qualifier} photo {self.NEGATIVE_TERMS}'
            for subject, qualifier in subjects
        ]

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search Bing for photos of ``species`` and queue new ones.

        Args:
            species: Species to search for.
            db: Database session used for duplicate checks and inserts.
            logger: Optional logger for errors; without it errors are printed.

        Returns:
            ``{"downloaded": n, "rejected": 0}``. Both counts are zero when
            no API key is configured. Any exception ends the scrape early
            and is reported, not raised.
        """
        key_record = self.get_api_key(db)
        if not key_record:
            return {"downloaded": 0, "rejected": 0}

        requests_per_sec = key_record.rate_limit_per_sec or 3.0
        queued = 0
        skipped = 0  # never incremented; kept so the result has both keys
        urls_this_run = set()

        auth_headers = {
            "Ocp-Apim-Subscription-Key": key_record.api_key,
        }

        try:
            search_strings = self._build_queries(species)

            with httpx.Client(timeout=30, headers=auth_headers) as http:
                for search_string in search_strings:
                    reply = http.get(
                        self.BASE_URL,
                        params={
                            "q": search_string,
                            "imageType": "Photo",
                            "license": "ShareCommercially",
                            "count": 50,
                        },
                    )
                    reply.raise_for_status()
                    payload = reply.json()

                    for hit in payload.get("value", []):
                        content_url = hit.get("contentUrl")
                        if not content_url or content_url in urls_this_run:
                            continue
                        urls_this_run.add(content_url)

                        # Prefer Bing's own image id; otherwise derive a
                        # short id from the URL.
                        bing_id = hit.get("imageId")
                        if bing_id:
                            record_id = bing_id
                        else:
                            record_id = hashlib.md5(content_url.encode()).hexdigest()[:16]

                        already_stored = (
                            db.query(Image)
                            .filter(
                                Image.source == self.name,
                                Image.source_id == record_id,
                            )
                            .first()
                        )
                        if already_stored:
                            continue

                        mapped_license = self.LICENSE_MAP.get(hit.get("license", ""), "UNKNOWN")

                        display_host = hit.get("hostPageDisplayUrl", "")
                        if display_host:
                            credit = f"via Bing ({display_host})"
                        else:
                            credit = "via Bing Image Search"

                        row = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=record_id,
                            url=content_url,
                            width=hit.get("width"),
                            height=hit.get("height"),
                            license=mapped_license,
                            attribution=credit,
                            status="pending",
                        )
                        db.add(row)
                        db.commit()

                        download_and_process_image.delay(row.id)
                        queued += 1

                    # Pause between searches to respect the key's rate limit.
                    time.sleep(1.0 / requests_per_sec)

        except Exception as exc:
            message = f"Error scraping Bing for {species.scientific_name}: {exc}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": queued, "rejected": skipped}

    def test_connection(self, api_key: ApiKey) -> str:
        """Run a one-result search with ``api_key`` and summarise the reply.

        Raises:
            httpx.HTTPStatusError: If Bing answers with an error status.
        """
        auth_headers = {"Ocp-Apim-Subscription-Key": api_key.api_key}
        probe = {"q": "Monstera deliciosa plant", "count": 1}

        with httpx.Client(timeout=10, headers=auth_headers) as http:
            reply = http.get(self.BASE_URL, params=probe)
            reply.raise_for_status()
            payload = reply.json()

        estimated = payload.get("totalEstimatedMatches", 0)
        return f"Bing Image Search working ({estimated:,} estimated matches)"
|
||||
101
backend/app/scrapers/duckduckgo.py
Normal file
101
backend/app/scrapers/duckduckgo.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import hashlib
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
from duckduckgo_search import DDGS
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class DuckDuckGoScraper(BaseScraper):
    """Scraper for DuckDuckGo image search. No API key required."""

    name = "duckduckgo"
    requires_api_key = False

    # Appended to every query to exclude non-photographic material.
    NEGATIVE_TERMS = "-herbarium -specimen -illustration -drawing -diagram -dried -pressed"

    # Searches per second used when no usable rate limit is configured.
    DEFAULT_RATE_LIMIT = 0.5

    def _build_queries(self, species: Species) -> list[str]:
        """Return the search strings to run for ``species``.

        The scientific-name query is always present; the common-name query
        is added only when ``species.common_name`` is set.
        """
        queries = [f'"{species.scientific_name}" plant photo {self.NEGATIVE_TERMS}']
        if species.common_name:
            queries.append(f'"{species.common_name}" houseplant photo {self.NEGATIVE_TERMS}')
        return queries

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None,
    ) -> Dict[str, int]:
        """Search DuckDuckGo for photos of ``species`` and queue new ones.

        Args:
            species: Species to search for.
            db: Database session used for duplicate checks and inserts.
            logger: Optional logger for errors; without it errors are printed.

        Returns:
            ``{"downloaded": n, "rejected": 0}``. Any exception ends the
            scrape early and is reported, not raised.
        """
        # An ApiKey row is optional here and only supplies the rate limit.
        # A row with a missing, zero or negative rate would otherwise make
        # the delay computation raise, so fall back to the default.
        api_key = self.get_api_key(db)
        configured_rate = api_key.rate_limit_per_sec if api_key else None
        if configured_rate and configured_rate > 0:
            rate_limit = configured_rate
        else:
            rate_limit = self.DEFAULT_RATE_LIMIT

        downloaded = 0
        rejected = 0  # never incremented; kept so the result has both keys
        seen_urls = set()

        try:
            queries = self._build_queries(species)

            with DDGS() as ddgs:
                for query in queries:
                    results = ddgs.images(
                        keywords=query,
                        type_image="photo",
                        max_results=50,
                    )

                    for result in results:
                        url = result.get("image")
                        if not url or url in seen_urls:
                            continue
                        seen_urls.add(url)

                        # DuckDuckGo gives no image id, so derive a short,
                        # stable one from the URL.
                        source_id = hashlib.md5(url.encode()).hexdigest()[:16]

                        # Check if already exists
                        existing = db.query(Image).filter(
                            Image.source == self.name,
                            Image.source_id == source_id,
                        ).first()
                        if existing:
                            continue

                        title = result.get("title", "")
                        attribution = f"{title} via DuckDuckGo" if title else "via DuckDuckGo"

                        image = Image(
                            species_id=species.id,
                            source=self.name,
                            source_id=source_id,
                            url=url,
                            license="UNKNOWN",
                            attribution=attribution,
                            status="pending",
                        )
                        db.add(image)
                        db.commit()

                        download_and_process_image.delay(image.id)
                        downloaded += 1

                    # Pause between searches to respect the rate limit.
                    time.sleep(1.0 / rate_limit)

        except Exception as e:
            if logger:
                logger.error(f"Error scraping DuckDuckGo for {species.scientific_name}: {e}")
            else:
                print(f"Error scraping DuckDuckGo for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Run a one-result image search and report how many results came back.

        ``api_key`` is accepted for interface parity with the other scrapers
        and is not used.
        """
        with DDGS() as ddgs:
            results = ddgs.images(keywords="Monstera deliciosa plant", max_results=1)
            count = len(list(results))
        return f"DuckDuckGo search working ({count} test result)"
|
||||
226
backend/app/scrapers/eol.py
Normal file
226
backend/app/scrapers/eol.py
Normal file
@@ -0,0 +1,226 @@
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class EOLScraper(BaseScraper):
    """Scraper for Encyclopedia of Life (EOL) images.

    EOL aggregates biodiversity data from many sources and provides
    a free API with no authentication required.
    """

    name = "eol"
    requires_api_key = False

    BASE_URL = "https://eol.org/api"

    HEADERS = {
        "User-Agent": "PlantGuideScraper/1.0 (Plant image collection for ML training)",
        "Accept": "application/json",
    }

    # Map EOL license URLs to short codes
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/publicdomain/mark/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/2.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/3.0/": "CC-BY",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-sa/2.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
        "https://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
        "pd": "CC0",  # Public domain
        "public domain": "CC0",
    }

    # Commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY", "CC-BY-SA"}

    # Requests per second used when no usable rate limit is configured.
    DEFAULT_RATE_LIMIT = 0.5

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from EOL for a species.

        Looks the species up by exact scientific name, loads the first
        matching EOL page with its media objects, and stores one pending
        ``Image`` row per new, acceptably licensed image before queueing it
        for download.

        Args:
            species: Species whose ``scientific_name`` is searched.
            db: Database session used for duplicate checks and inserts.
            logger: Optional logger; when omitted nothing is logged.

        Returns:
            ``{"downloaded": n, "rejected": m}``. ``rejected`` counts image
            objects without a URL or without an accepted license. HTTP and
            other errors are logged and end the scrape early.
        """
        # An ApiKey row is optional here and only supplies the rate limit.
        # A row with a missing, zero or negative rate would otherwise make
        # the delay computation raise, so fall back to the default.
        api_key = self.get_api_key(db)
        configured_rate = api_key.rate_limit_per_sec if api_key else None
        if configured_rate and configured_rate > 0:
            rate_limit = configured_rate
        else:
            rate_limit = self.DEFAULT_RATE_LIMIT

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            if logger:
                getattr(logger, level)(msg)

        try:
            # Disable SSL verification - EOL is a trusted source and some Docker
            # environments lack proper CA certificates
            # NOTE(review): verify=False turns off TLS certificate checks;
            # installing CA certificates in the image would be the safer fix.
            with httpx.Client(timeout=30, headers=self.HEADERS, verify=False) as client:
                # Step 1: Search for the species
                search_response = client.get(
                    f"{self.BASE_URL}/search/1.0.json",
                    params={
                        "q": species.scientific_name,
                        "page": 1,
                        "exact": "true",
                    },
                )
                search_response.raise_for_status()
                search_data = search_response.json()

                results = search_data.get("results", [])
                if not results:
                    log("info", f" Species not found in EOL: {species.scientific_name}")
                    return {"downloaded": 0, "rejected": 0}

                # Get the EOL page ID
                eol_page_id = results[0].get("id")
                if not eol_page_id:
                    return {"downloaded": 0, "rejected": 0}

                time.sleep(1.0 / rate_limit)

                # Step 2: Get page details with images
                # NOTE(review): "cc-by-nc" is requested although NC-licensed
                # objects are rejected below; dropping it from the filter
                # would leave more of the 75 slots for usable images.
                page_response = client.get(
                    f"{self.BASE_URL}/pages/1.0/{eol_page_id}.json",
                    params={
                        "images_per_page": 75,
                        "images_page": 1,
                        "videos_per_page": 0,
                        "sounds_per_page": 0,
                        "maps_per_page": 0,
                        "texts_per_page": 0,
                        "details": "true",
                        "licenses": "cc-by|cc-by-sa|pd|cc-by-nc",
                    },
                )
                page_response.raise_for_status()
                page_data = page_response.json()

                data_objects = page_data.get("dataObjects", [])
                log("debug", f" Found {len(data_objects)} media objects")

                for obj in data_objects:
                    # Only process images ("stillimage" also contains "image").
                    media_type = (obj.get("dataType") or "").lower()
                    if "image" not in media_type:
                        continue

                    # Get image URL
                    image_url = obj.get("eolMediaURL") or obj.get("mediaURL")
                    if not image_url:
                        rejected += 1
                        continue

                    # Check license; a null license is treated like an empty one.
                    license_url = (obj.get("license") or "").lower()
                    license_code = self._match_license(license_url)

                    if not license_code:
                        # NC licenses are an expected rejection; anything
                        # else unrecognised gets a debug line.
                        if "-nc" not in license_url:
                            log("debug", f" Rejected: unknown license {license_url}")
                        rejected += 1
                        continue

                    if license_code not in self.ALLOWED_LICENSES:
                        rejected += 1
                        continue

                    # Create unique source ID. The last fallback must be
                    # stable across processes so that the duplicate check
                    # below keeps working on later runs.
                    source_id = str(
                        obj.get("dataObjectVersionID")
                        or obj.get("identifier")
                        or self._url_digest(image_url)
                    )

                    # Check if already exists
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    attribution = self._build_attribution(obj.get("agents") or [], license_code)

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=image_url,
                        license=license_code,
                        attribution=attribution,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

                # Pause after the page request to respect the rate limit.
                time.sleep(1.0 / rate_limit)

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code}")
        except Exception as e:
            log("error", f" Error scraping EOL for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def _match_license(self, license_url: str) -> Optional[str]:
        """Return the short code of the first ``LICENSE_MAP`` key found in ``license_url``.

        ``license_url`` is expected to be lower-cased already. Returns
        ``None`` when no key is a substring of it.
        """
        for pattern, code in self.LICENSE_MAP.items():
            if pattern in license_url:
                return code
        return None

    @staticmethod
    def _url_digest(url: str) -> str:
        """Return a short identifier for ``url`` that is stable across processes.

        The builtin ``hash()`` is salted per interpreter process for
        strings, so it cannot serve as a persistent identifier.
        """
        import hashlib  # local import keeps this block self-contained

        return hashlib.md5(url.encode()).hexdigest()[:16]

    @staticmethod
    def _build_attribution(agents, license_code: str) -> str:
        """Build the attribution line from an object's ``agents`` list.

        The last agent with role "photographer" and the last with role
        "owner" or "rights holder" are named; the rights holder is omitted
        when it equals the photographer.
        """
        photographer = None
        rights_holder = None

        for agent in agents:
            role = (agent.get("role") or "").lower()
            full_name = agent.get("full_name", "")
            if role == "photographer":
                photographer = full_name
            elif role in ("owner", "rights holder"):
                rights_holder = full_name

        attribution_parts = []
        if photographer:
            attribution_parts.append(f"Photo by {photographer}")
        if rights_holder and rights_holder != photographer:
            attribution_parts.append(f"Rights: {rights_holder}")
        attribution_parts.append(f"via EOL ({license_code})")
        return " | ".join(attribution_parts)

    def test_connection(self, api_key: ApiKey) -> str:
        """Test EOL API connection.

        Runs a search for "Rosa" and returns a human-readable summary.
        ``api_key`` is not used for the request.

        Raises:
            httpx.HTTPStatusError: If EOL answers with an error status.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS, verify=False) as client:
            response = client.get(
                f"{self.BASE_URL}/search/1.0.json",
                params={"q": "Rosa", "page": 1},
            )
            response.raise_for_status()
            data = response.json()

        total = data.get("totalResults", 0)
        return f"EOL API connection successful ({total} results for 'Rosa')"
|
||||
146
backend/app/scrapers/flickr.py
Normal file
146
backend/app/scrapers/flickr.py
Normal file
@@ -0,0 +1,146 @@
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class FlickrScraper(BaseScraper):
    """Scraper for Flickr images via their API."""

    name = "flickr"
    requires_api_key = True

    BASE_URL = "https://api.flickr.com/services/rest/"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Commercial-safe license IDs
    # 4 = CC BY 2.0, 7 = No known copyright, 8 = US Gov, 9 = CC0
    ALLOWED_LICENSES = "4,7,8,9"

    LICENSE_MAP = {
        "4": "CC-BY",
        "7": "NO-KNOWN-COPYRIGHT",
        "8": "US-GOV",
        "9": "CC0",
    }

    # Requests per second used when the stored key has no usable rate limit.
    DEFAULT_RATE_LIMIT = 1.0

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Flickr for a species.

        Runs one ``flickr.photos.search`` for the scientific name, restricted
        to ``ALLOWED_LICENSES``, and stores one pending ``Image`` row per new
        photo before queueing it for download.

        Args:
            species: Species whose ``scientific_name`` is searched.
            db: Database session used for duplicate checks and inserts.
            logger: Optional logger for errors; without it errors are printed.

        Returns:
            ``{"downloaded": n, "rejected": m}``. ``rejected`` counts photos
            without a usable URL or with a license outside ``LICENSE_MAP``.
            An ``"error"`` entry is added when no API key is configured or
            Flickr reports a non-"ok" status. Any exception ends the scrape
            early and is reported, not raised.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        # A missing, zero or negative stored rate would make the delay
        # computation raise, so fall back to the default in that case.
        configured_rate = api_key.rate_limit_per_sec
        if configured_rate and configured_rate > 0:
            rate_limit = configured_rate
        else:
            rate_limit = self.DEFAULT_RATE_LIMIT

        downloaded = 0
        rejected = 0

        try:
            params = {
                "method": "flickr.photos.search",
                "api_key": api_key.api_key,
                "text": species.scientific_name,
                "license": self.ALLOWED_LICENSES,
                "content_type": 1,  # Photos only
                "media": "photos",
                "extras": "license,url_l,url_o,owner_name",
                "per_page": 100,
                "format": "json",
                "nojsoncallback": 1,
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(self.BASE_URL, params=params)
                response.raise_for_status()
                data = response.json()

            # Flickr reports API-level failures in the body, not the status code.
            if data.get("stat") != "ok":
                return {"downloaded": 0, "rejected": 0, "error": data.get("message")}

            photos = data.get("photos", {}).get("photo", [])

            for photo in photos:
                # Get best URL (original or large)
                url = photo.get("url_o") or photo.get("url_l")
                if not url:
                    rejected += 1
                    continue

                # Get license
                license_id = str(photo.get("license", ""))
                license_code = self.LICENSE_MAP.get(license_id, "UNKNOWN")
                if license_code == "UNKNOWN":
                    rejected += 1
                    continue

                # Check if already exists
                source_id = str(photo.get("id"))
                existing = db.query(Image).filter(
                    Image.source == self.name,
                    Image.source_id == source_id,
                ).first()
                if existing:
                    continue

                # Build attribution
                owner = photo.get("ownername", "Unknown")
                attribution = f"Photo by {owner} on Flickr ({license_code})"

                # Create image record
                image = Image(
                    species_id=species.id,
                    source=self.name,
                    source_id=source_id,
                    url=url,
                    license=license_code,
                    attribution=attribution,
                    status="pending",
                )
                db.add(image)
                db.commit()

                # Queue for download
                download_and_process_image.delay(image.id)
                downloaded += 1

            # Rate limiting: pause after the search request.
            time.sleep(1.0 / rate_limit)

        except Exception as e:
            # Honour the caller's logger; keep the printed message as the
            # fallback so nothing is lost when no logger is passed.
            message = f"Error scraping Flickr for {species.scientific_name}: {e}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Flickr API connection.

        Calls ``flickr.test.echo`` with the given key.

        Raises:
            httpx.HTTPStatusError: If Flickr answers with an error status.
            Exception: If the response body reports a non-"ok" status.
        """
        params = {
            "method": "flickr.test.echo",
            "api_key": api_key.api_key,
            "format": "json",
            "nojsoncallback": 1,
        }

        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(self.BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()

        if data.get("stat") != "ok":
            raise Exception(data.get("message", "API test failed"))

        return "Flickr API connection successful"
|
||||
159
backend/app/scrapers/gbif.py
Normal file
159
backend/app/scrapers/gbif.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class GBIFScraper(BaseScraper):
    """Scraper for GBIF (Global Biodiversity Information Facility) images."""

    name = "gbif"
    requires_api_key = False  # GBIF is free to use

    BASE_URL = "https://api.gbif.org/v1"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Map GBIF license URLs to short codes
    LICENSE_MAP = {
        "http://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "http://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "http://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/legalcode": "CC0",
        "https://creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
        "https://creativecommons.org/publicdomain/zero/1.0/": "CC0",
        "https://creativecommons.org/licenses/by/4.0/": "CC-BY",
        "https://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
    }

    # Only allow commercial-safe licenses
    ALLOWED_LICENSES = {"CC0", "CC-BY"}

    # Requests per second used when no usable rate limit is configured.
    DEFAULT_RATE_LIMIT = 1.0

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from GBIF for a species.

        Runs one occurrence search for the scientific name restricted to
        still images, and stores one pending ``Image`` row per new media item
        with an allowed license before queueing it for download.

        Args:
            species: Species whose ``scientific_name`` is searched.
            db: Database session used for duplicate checks and inserts.
            logger: Optional logger for errors; without it errors are printed.

        Returns:
            ``{"downloaded": n, "rejected": m}``. ``rejected`` counts still
            images without a URL or without an allowed license. Any
            exception ends the scrape early and is reported, not raised.
        """
        # GBIF doesn't require API key, but we still respect rate limits.
        # A row with a missing, zero or negative rate would otherwise make
        # the delay computation raise, so fall back to the default.
        api_key = self.get_api_key(db)
        configured_rate = api_key.rate_limit_per_sec if api_key else None
        if configured_rate and configured_rate > 0:
            rate_limit = configured_rate
        else:
            rate_limit = self.DEFAULT_RATE_LIMIT

        downloaded = 0
        rejected = 0

        try:
            params = {
                "scientificName": species.scientific_name,
                "mediaType": "StillImage",
                "limit": 100,
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/occurrence/search",
                    params=params,
                )
                response.raise_for_status()
                data = response.json()

            results = data.get("results", [])

            for occurrence in results:
                media_list = occurrence.get("media", [])

                for media in media_list:
                    # Only process still images
                    if media.get("type") != "StillImage":
                        continue

                    url = media.get("identifier")
                    if not url:
                        rejected += 1
                        continue

                    # Check license (exact match against the known URLs)
                    license_url = media.get("license", "")
                    license_code = self.LICENSE_MAP.get(license_url)

                    if not license_code or license_code not in self.ALLOWED_LICENSES:
                        rejected += 1
                        continue

                    # Create unique source ID from occurrence key and media
                    # URL. The URL part must be stable across processes so
                    # that the duplicate check below keeps working on later
                    # runs.
                    occurrence_key = occurrence.get("key", "")
                    source_id = f"{occurrence_key}_{self._url_digest(url)}"

                    # Check if already exists
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()
                    if existing:
                        continue

                    # Build attribution
                    creator = media.get("creator", "")
                    rights_holder = media.get("rightsHolder", "")
                    attribution_parts = []
                    if creator:
                        attribution_parts.append(f"Photo by {creator}")
                    if rights_holder and rights_holder != creator:
                        attribution_parts.append(f"Rights: {rights_holder}")
                    attribution_parts.append(f"via GBIF ({license_code})")
                    attribution = " | ".join(attribution_parts)

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code,
                        attribution=attribution,
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1

            # Rate limiting: pause after the search request.
            time.sleep(1.0 / rate_limit)

        except Exception as e:
            # Honour the caller's logger; keep the printed message as the
            # fallback so nothing is lost when no logger is passed.
            message = f"Error scraping GBIF for {species.scientific_name}: {e}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": downloaded, "rejected": rejected}

    @staticmethod
    def _url_digest(url: str) -> str:
        """Return an 8-character identifier for ``url`` that is stable across processes.

        The builtin ``hash()`` is salted per interpreter process for
        strings, so it cannot serve as a persistent identifier.
        """
        import hashlib  # local import keeps this block self-contained

        return hashlib.md5(url.encode()).hexdigest()[:8]

    def test_connection(self, api_key: ApiKey) -> str:
        """Test GBIF API connection.

        Requests a single occurrence and returns a human-readable summary.
        ``api_key`` is not used for the request.

        Raises:
            httpx.HTTPStatusError: If GBIF answers with an error status.
        """
        # GBIF doesn't require authentication, just test the endpoint
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/occurrence/search",
                params={"limit": 1},
            )
            response.raise_for_status()
            data = response.json()

        count = data.get("count", 0)
        return f"GBIF API connection successful ({count:,} total occurrences available)"
|
||||
144
backend/app/scrapers/inaturalist.py
Normal file
144
backend/app/scrapers/inaturalist.py
Normal file
@@ -0,0 +1,144 @@
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class INaturalistScraper(BaseScraper):
    """Scraper for iNaturalist observations via their API.

    One request fetches up to 200 research-grade observations for a species.
    Every photo whose license is listed in ``ALLOWED_LICENSES`` becomes a
    pending ``Image`` row and is handed to ``download_and_process_image``.
    """

    name = "inaturalist"
    requires_api_key = False  # Public API, but rate limited

    BASE_URL = "https://api.inaturalist.org/v1"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    # Commercial-safe licenses (CC0, CC-BY)
    ALLOWED_LICENSES = ["cc0", "cc-by"]

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from iNaturalist for a species.

        Args:
            species: Species whose ``scientific_name`` is used as the taxon query.
            db: Session used to look up existing images and insert new ones.
            logger: Optional logger; when omitted, progress and errors are
                not reported anywhere.

        Returns:
            ``{"downloaded": n, "rejected": m}`` where ``downloaded`` counts
            photos queued for download and ``rejected`` counts photos skipped
            for their license. Errors are logged, not raised; counts gathered
            before the error are still returned.
        """
        api_key = self.get_api_key(db)
        rate_limit = api_key.rate_limit_per_sec if api_key else 1.0
        # A missing, zero or negative configured rate would make the division
        # below raise and abort the scrape; fall back to one step per second.
        if not rate_limit or rate_limit <= 0:
            rate_limit = 1.0
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0

        def log(level: str, msg: str):
            if logger:
                getattr(logger, level)(msg)

        try:
            # Search for observations of this species
            params = {
                "taxon_name": species.scientific_name,
                "quality_grade": "research",  # Only research-grade
                "photos": True,
                "per_page": 200,
                "order_by": "votes",
                "license": ",".join(self.ALLOWED_LICENSES),
            }

            log("debug", f" API request params: {params}")

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/observations",
                    params=params,
                )
                log("debug", f" API response status: {response.status_code}")
                response.raise_for_status()
                data = response.json()

            observations = data.get("results", [])
            total_results = data.get("total_results", 0)
            log("debug", f" Found {len(observations)} observations (total: {total_results})")

            if not observations:
                log("info", f" No observations found for {species.scientific_name}")
                return {"downloaded": 0, "rejected": 0}

            for obs in observations:
                for photo in obs.get("photos", []):
                    # Check license (the API may return null for unlicensed photos)
                    license_code = (photo.get("license_code") or "").lower()
                    if license_code not in self.ALLOWED_LICENSES:
                        log("debug", f" Rejected photo {photo.get('id')}: license={license_code}")
                        rejected += 1
                        continue

                    url = photo.get("url", "")
                    if not url:
                        log("debug", f" Skipped photo {photo.get('id')}: no URL")
                        continue

                    # Swap the "square" size segment of the URL for "large"
                    url = url.replace("square", "large")

                    # Check if already exists
                    source_id = str(photo.get("id"))
                    existing = db.query(Image).filter(
                        Image.source == self.name,
                        Image.source_id == source_id,
                    ).first()

                    if existing:
                        log("debug", f" Skipped photo {source_id}: already exists")
                        continue

                    # Create image record
                    image = Image(
                        species_id=species.id,
                        source=self.name,
                        source_id=source_id,
                        url=url,
                        license=license_code.upper(),
                        attribution=photo.get("attribution", ""),
                        status="pending",
                    )
                    db.add(image)
                    db.commit()

                    # Queue for download
                    download_and_process_image.delay(image.id)
                    downloaded += 1
                    log("debug", f" Queued photo {source_id} for download")

                # Rate limiting
                time.sleep(delay)

        except httpx.HTTPStatusError as e:
            log("error", f" HTTP error for {species.scientific_name}: {e.response.status_code} - {e.response.text}")
        except httpx.RequestError as e:
            log("error", f" Request error for {species.scientific_name}: {e}")
        except Exception as e:
            # The session belongs to the caller; after a failed flush/commit it
            # must be rolled back before it can be used again.
            db.rollback()
            log("error", f" Error scraping iNaturalist for {species.scientific_name}: {e}")

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test iNaturalist API connection.

        ``api_key`` is not read. Raises ``httpx.HTTPStatusError`` (from
        ``raise_for_status()``) when the endpoint answers with an error status.
        """
        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/observations",
                params={"per_page": 1},
            )
            response.raise_for_status()

        return "iNaturalist API connection successful"
|
||||
154
backend/app/scrapers/trefle.py
Normal file
154
backend/app/scrapers/trefle.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class TrefleScraper(BaseScraper):
    """Scraper for Trefle.io plant database.

    Searches plants by scientific name, fetches each hit's detail record and
    queues its main image plus every image listed under ``images``.
    """

    name = "trefle"
    requires_api_key = True

    BASE_URL = "https://trefle.io/api/v1"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    def _queue_image(
        self,
        db: Session,
        species: Species,
        source_id: str,
        url: str,
        attribution: str,
    ) -> bool:
        """Insert a pending Image row and enqueue its download.

        Returns False without touching the database when an image with the
        same source and ``source_id`` is already stored, True otherwise.
        """
        existing = db.query(Image).filter(
            Image.source == self.name,
            Image.source_id == source_id,
        ).first()
        if existing:
            return False

        image = Image(
            species_id=species.id,
            source=self.name,
            source_id=source_id,
            url=url,
            license="TREFLE",  # Trefle's own license
            attribution=attribution,
            status="pending",
        )
        db.add(image)
        db.commit()
        download_and_process_image.delay(image.id)
        return True

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Trefle for a species.

        Args:
            species: Species whose ``scientific_name`` is used as search query.
            db: Session used to look up existing images and insert new ones.
            logger: Optional logger for error reporting; when omitted the
                error is printed to stdout.

        Returns:
            ``{"downloaded": n, "rejected": 0}``; when no API key is
            configured the dict additionally carries an ``"error"`` message.
            Errors during scraping are reported, not raised.
        """
        api_key = self.get_api_key(db)
        if not api_key:
            return {"downloaded": 0, "rejected": 0, "error": "No API key configured"}

        rate_limit = api_key.rate_limit_per_sec
        # Guard against an unset/zero rate, which would make the division raise.
        if not rate_limit or rate_limit <= 0:
            rate_limit = 1.0
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0

        try:
            # Search for the species
            params = {
                "token": api_key.api_key,
                "q": species.scientific_name,
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(
                    f"{self.BASE_URL}/plants/search",
                    params=params,
                )
                response.raise_for_status()
                # `or []` also covers an explicit null in the payload
                plants = response.json().get("data") or []

                for plant in plants:
                    # Get plant details for more images
                    plant_id = plant.get("id")
                    if not plant_id:
                        continue

                    detail_response = client.get(
                        f"{self.BASE_URL}/plants/{plant_id}",
                        params={"token": api_key.api_key},
                    )
                    if detail_response.status_code != 200:
                        continue

                    plant_detail = detail_response.json().get("data") or {}

                    # Get main image
                    main_image = plant_detail.get("image_url")
                    if main_image and self._queue_image(
                        db,
                        species,
                        f"main_{plant_id}",
                        main_image,
                        "Trefle.io Plant Database",
                    ):
                        downloaded += 1

                    # Get additional images from species detail
                    images = plant_detail.get("images") or {}
                    if not isinstance(images, dict):
                        images = {}

                    for image_type, image_list in images.items():
                        if not isinstance(image_list, list):
                            continue

                        for img in image_list:
                            url = img.get("image_url")
                            if not url:
                                continue

                            # Fall back to the last URL segment when no id is given
                            img_id = img.get("id", url.split("/")[-1])
                            source_id = f"{image_type}_{img_id}"
                            attribution = img.get("copyright", "") or "Trefle.io"

                            if self._queue_image(db, species, source_id, url, attribution):
                                downloaded += 1

                    # Rate limiting
                    time.sleep(delay)

        except Exception as e:
            # The caller's session must be rolled back after a failed commit
            # before it can be reused.
            db.rollback()
            message = f"Error scraping Trefle for {species.scientific_name}: {e}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Trefle API connection using the token stored in ``api_key``.

        Raises ``httpx.HTTPStatusError`` (from ``raise_for_status()``) when the
        endpoint answers with an error status.
        """
        params = {"token": api_key.api_key}

        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(
                f"{self.BASE_URL}/plants",
                params=params,
            )
            response.raise_for_status()

        return "Trefle API connection successful"
|
||||
146
backend/app/scrapers/wikimedia.py
Normal file
146
backend/app/scrapers/wikimedia.py
Normal file
@@ -0,0 +1,146 @@
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.scrapers.base import BaseScraper
|
||||
from app.models import Species, Image, ApiKey
|
||||
from app.workers.quality_tasks import download_and_process_image
|
||||
|
||||
|
||||
class WikimediaScraper(BaseScraper):
    """Scraper for Wikimedia Commons images.

    Runs one file-namespace search per species and queues every sufficiently
    large, commercially usable result for download.
    """

    name = "wikimedia"
    requires_api_key = False

    BASE_URL = "https://commons.wikimedia.org/w/api.php"

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
    }

    @staticmethod
    def _is_commercial_safe(license_name: str) -> bool:
        """Return True for CC BY*, CC0 and public-domain license names.

        Names carrying an ``NC`` (NonCommercial) element are refused even
        though they contain "CC BY", because they are not commercial-safe.
        """
        upper = license_name.upper()
        if "NC" in upper.replace("-", " ").split():
            return False
        return "CC BY" in upper or "CC0" in upper or "PUBLIC DOMAIN" in upper

    def scrape_species(
        self,
        species: Species,
        db: Session,
        logger: Optional[logging.Logger] = None
    ) -> Dict[str, int]:
        """Scrape images from Wikimedia Commons for a species.

        Args:
            species: Species whose ``scientific_name`` is the search term.
            db: Session used to look up existing images and insert new ones.
            logger: Optional logger for error reporting; when omitted the
                error is printed to stdout.

        Returns:
            ``{"downloaded": n, "rejected": m}`` where ``rejected`` counts
            files refused for size (< 256 px on a side) or license. Errors
            are reported, not raised.
        """
        import re  # only needed to strip HTML from the artist field

        api_key = self.get_api_key(db)
        rate_limit = api_key.rate_limit_per_sec if api_key else 1.0
        # Guard against an unset/zero rate, which would make the division raise.
        if not rate_limit or rate_limit <= 0:
            rate_limit = 1.0
        delay = 1.0 / rate_limit

        downloaded = 0
        rejected = 0

        try:
            # Search for images in the species category
            search_term = species.scientific_name

            params = {
                "action": "query",
                "format": "json",
                "generator": "search",
                "gsrsearch": f"filetype:bitmap {search_term}",
                "gsrnamespace": 6,  # File namespace
                "gsrlimit": 50,
                "prop": "imageinfo",
                "iiprop": "url|extmetadata|size",
            }

            with httpx.Client(timeout=30, headers=self.HEADERS) as client:
                response = client.get(self.BASE_URL, params=params)
                response.raise_for_status()
                data = response.json()

            pages = data.get("query", {}).get("pages", {})

            for page_id, page in pages.items():
                if int(page_id) < 0:
                    continue

                # An empty or missing imageinfo list must skip this page,
                # not abort the whole scrape with an IndexError.
                info_list = page.get("imageinfo") or [{}]
                imageinfo = info_list[0]
                url = imageinfo.get("url", "")
                if not url:
                    continue

                # Check size
                width = imageinfo.get("width", 0) or 0
                height = imageinfo.get("height", 0) or 0
                if width < 256 or height < 256:
                    rejected += 1
                    continue

                # Get license from metadata
                metadata = imageinfo.get("extmetadata", {})
                license_info = metadata.get("LicenseShortName", {}).get("value", "")

                # Filter for commercial-safe licenses
                if not self._is_commercial_safe(license_info):
                    rejected += 1
                    continue
                license_code = license_info

                # Check if already exists
                source_id = str(page_id)
                existing = db.query(Image).filter(
                    Image.source == self.name,
                    Image.source_id == source_id,
                ).first()

                if existing:
                    continue

                # Get attribution
                artist = metadata.get("Artist", {}).get("value", "Unknown")
                # Clean HTML from artist
                if "<" in artist:
                    artist = re.sub(r"<[^>]+>", "", artist).strip()

                attribution = f"{artist} via Wikimedia Commons ({license_code})"

                # Create image record
                image = Image(
                    species_id=species.id,
                    source=self.name,
                    source_id=source_id,
                    url=url,
                    license=license_code,
                    attribution=attribution,
                    width=width,
                    height=height,
                    status="pending",
                )
                db.add(image)
                db.commit()

                # Queue for download
                download_and_process_image.delay(image.id)
                downloaded += 1

                # Rate limiting
                time.sleep(delay)

        except Exception as e:
            # The caller's session must be rolled back after a failed commit
            # before it can be reused.
            db.rollback()
            message = f"Error scraping Wikimedia for {species.scientific_name}: {e}"
            if logger:
                logger.error(message)
            else:
                print(message)

        return {"downloaded": downloaded, "rejected": rejected}

    def test_connection(self, api_key: ApiKey) -> str:
        """Test Wikimedia API connection with a ``siteinfo`` meta query.

        ``api_key`` is not read. Raises ``httpx.HTTPStatusError`` (from
        ``raise_for_status()``) when the endpoint answers with an error status.
        """
        params = {
            "action": "query",
            "format": "json",
            "meta": "siteinfo",
        }

        with httpx.Client(timeout=10, headers=self.HEADERS) as client:
            response = client.get(self.BASE_URL, params=params)
            response.raise_for_status()

        return "Wikimedia Commons API connection successful"
|
||||
1
backend/app/utils/__init__.py
Normal file
1
backend/app/utils/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Utility functions
|
||||
80
backend/app/utils/dedup.py
Normal file
80
backend/app/utils/dedup.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Image deduplication utilities using perceptual hashing."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import imagehash
|
||||
from PIL import Image as PILImage
|
||||
|
||||
|
||||
def calculate_phash(image_path: str) -> Optional[str]:
    """
    Compute the perceptual hash of the image stored at ``image_path``.

    Args:
        image_path: Path to image file

    Returns:
        The hash rendered as a string, or None when the image cannot be
        opened or hashed (any exception is swallowed).
    """
    digest: Optional[str] = None
    try:
        with PILImage.open(image_path) as picture:
            digest = str(imagehash.phash(picture))
    except Exception:
        digest = None
    return digest
|
||||
|
||||
|
||||
def calculate_dhash(image_path: str) -> Optional[str]:
    """
    Compute the difference hash of the image stored at ``image_path``.

    Described by the original author as faster but less accurate than the
    perceptual hash.

    Args:
        image_path: Path to image file

    Returns:
        The hash rendered as a string, or None when the image cannot be
        opened or hashed (any exception is swallowed).
    """
    digest: Optional[str] = None
    try:
        with PILImage.open(image_path) as picture:
            digest = str(imagehash.dhash(picture))
    except Exception:
        digest = None
    return digest
|
||||
|
||||
|
||||
def hashes_are_similar(hash1: str, hash2: str, threshold: int = 10) -> bool:
    """
    Decide whether two hash strings are close enough to be likely duplicates.

    Args:
        hash1: First hash string
        hash2: Second hash string
        threshold: Largest distance between the hashes still counted as
            similar (default 10)

    Returns:
        True when the distance is at most ``threshold``; False otherwise,
        including when either string cannot be parsed or compared.
    """
    try:
        first, second = (imagehash.hex_to_hash(text) for text in (hash1, hash2))
        return (first - second) <= threshold
    except Exception:
        return False
|
||||
|
||||
|
||||
def hamming_distance(hash1: str, hash2: str) -> int:
    """
    Return the distance between two hash strings.

    Args:
        hash1: First hash string
        hash2: Second hash string

    Returns:
        The difference of the two parsed hashes as an int (0 = identical,
        higher = more different). If parsing or subtraction fails, 64 is
        returned, which the original author treats as the maximum distance.
    """
    try:
        first, second = (imagehash.hex_to_hash(text) for text in (hash1, hash2))
        return int(first - second)
    except Exception:
        return 64  # Maximum distance
|
||||
109
backend/app/utils/image_quality.py
Normal file
109
backend/app/utils/image_quality.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""Image quality assessment utilities."""
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image as PILImage
|
||||
from scipy import ndimage
|
||||
|
||||
|
||||
def calculate_blur_score(image_path: str) -> float:
    """
    Calculate blur score using Laplacian variance.
    Higher score = sharper image.

    Args:
        image_path: Path to image file

    Returns:
        Variance of the Laplacian of the grayscale image (higher = sharper),
        or 0.0 when the image cannot be read or processed.
    """
    try:
        with PILImage.open(image_path) as img:
            # Work in float64: ndimage.laplace returns the input dtype, so on
            # a uint8 array negative responses would wrap around modulo 256
            # and inflate the variance.
            gray = np.asarray(img.convert("L"), dtype=np.float64)
        laplacian = ndimage.laplace(gray)
        return float(np.var(laplacian))
    except Exception:
        return 0.0
|
||||
|
||||
|
||||
def is_too_blurry(image_path: str, threshold: float = 100.0) -> bool:
    """
    Tell whether an image falls below the sharpness required for training.

    Args:
        image_path: Path to image file
        threshold: Lowest blur score still considered acceptable (default 100)

    Returns:
        True when ``calculate_blur_score(image_path)`` is strictly below
        ``threshold``.
    """
    return calculate_blur_score(image_path) < threshold
|
||||
|
||||
|
||||
def get_image_dimensions(image_path: str) -> tuple[int, int]:
    """
    Read the pixel dimensions of an image file.

    Args:
        image_path: Path to image file

    Returns:
        ``(width, height)`` as reported by the opened image, or ``(0, 0)``
        when the file cannot be opened.
    """
    dimensions = (0, 0)
    try:
        with PILImage.open(image_path) as picture:
            dimensions = picture.size
    except Exception:
        dimensions = (0, 0)
    return dimensions
|
||||
|
||||
|
||||
def is_too_small(image_path: str, min_size: int = 256) -> bool:
    """
    Tell whether an image is below the minimum size required for training.

    Args:
        image_path: Path to image file
        min_size: Smallest acceptable length for either side (default 256)

    Returns:
        True when the width or the height is below ``min_size``. An
        unreadable file is reported by ``get_image_dimensions`` as (0, 0)
        and therefore counts as too small for any positive ``min_size``.
    """
    dimensions = get_image_dimensions(image_path)
    return min(dimensions[0], dimensions[1]) < min_size
|
||||
|
||||
|
||||
def resize_image(
    image_path: str,
    output_path: str = None,
    max_size: int = 512,
    quality: int = 95,
) -> bool:
    """
    Resize image to max dimension while preserving aspect ratio.

    The result is always written as JPEG, even when no resizing was needed.

    Args:
        image_path: Path to input image
        output_path: Path for output (None or empty overwrites the input)
        max_size: Maximum dimension size (default 512)
        quality: JPEG quality (default 95)

    Returns:
        True if successful, False if the image could not be read or written
    """
    # Modes the JPEG writer accepts as-is; anything else (RGBA, P, LA, PA,
    # I, F, ...) has to be converted or the save fails.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    try:
        output_path = output_path or image_path

        with PILImage.open(image_path) as img:
            # Only resize if larger than max_size
            if max(img.size) > max_size:
                img.thumbnail((max_size, max_size), PILImage.Resampling.LANCZOS)

            # Convert to RGB if necessary (for JPEG)
            if img.mode not in jpeg_modes:
                img = img.convert("RGB")

            img.save(output_path, "JPEG", quality=quality)

        return True
    except Exception:
        return False
|
||||
92
backend/app/utils/logging.py
Normal file
92
backend/app/utils/logging.py
Normal file
@@ -0,0 +1,92 @@
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
def setup_logging():
    """Configure the root logger to write to a dated file and the console.

    Ensures the directory named by ``settings.logs_path`` exists, then calls
    ``logging.basicConfig`` at INFO level with a file handler for
    ``scraper_<YYYY-MM-DD>.log`` and a stream handler.

    Returns:
        The ``plant_scraper`` logger.
    """
    log_dir = Path(settings.logs_path)
    log_dir.mkdir(parents=True, exist_ok=True)

    # The date in the file name is fixed at the moment of this call
    today = datetime.now().strftime('%Y-%m-%d')
    daily_file = log_dir / f"scraper_{today}.log"

    root_handlers = [
        logging.FileHandler(daily_file),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=root_handlers,
    )

    return logging.getLogger("plant_scraper")
|
||||
|
||||
|
||||
def get_logger(name: str = "plant_scraper"):
    """Return the named logger, attaching file and console handlers once.

    Handlers are added only when the logger has none yet, so repeated calls
    with the same name return the already-configured logger unchanged.
    """
    log_dir = Path(settings.logs_path)
    log_dir.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(name)
    if logger.handlers:
        return logger

    logger.setLevel(logging.INFO)

    # Dated file handler. The file name is fixed when the handler is created;
    # this is a plain FileHandler, not a rotating one.
    today = datetime.now().strftime('%Y-%m-%d')
    to_file = logging.FileHandler(log_dir / f"scraper_{today}.log")
    to_file.setLevel(logging.INFO)
    to_file.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    ))

    # Console handler (no logger name in its format)
    to_console = logging.StreamHandler()
    to_console.setLevel(logging.INFO)
    to_console.setFormatter(logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s'
    ))

    for handler in (to_file, to_console):
        logger.addHandler(handler)

    return logger
|
||||
|
||||
|
||||
def get_job_logger(job_id: int):
    """Get a logger specific to a job, writing to a job-specific file.

    The logger is named ``job_<job_id>``. On first use it receives two
    handlers: a DEBUG-level handler for ``job_<job_id>.log`` and an
    INFO-level handler for the dated ``scraper_<YYYY-MM-DD>.log`` file.
    Later calls for the same job return the already-configured logger.

    Args:
        job_id: Identifier of the job; used in the logger and file names.

    Returns:
        The configured ``logging.Logger``.
    """
    logs_path = Path(settings.logs_path)
    logs_path.mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger(f"job_{job_id}")

    if not logger.handlers:
        logger.setLevel(logging.DEBUG)

        # Job-specific log file
        job_log_file = logs_path / f"job_{job_id}.log"
        file_handler = logging.FileHandler(job_log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s'
        ))

        # Also log to daily file
        daily_log_file = logs_path / f"scraper_{datetime.now().strftime('%Y-%m-%d')}.log"
        daily_handler = logging.FileHandler(daily_log_file)
        daily_handler.setLevel(logging.INFO)
        # The logger name already is "job_<id>"; prefixing it with "job_"
        # again would print "job_job_<id>".
        daily_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        ))

        logger.addHandler(file_handler)
        logger.addHandler(daily_handler)

    return logger
|
||||
1
backend/app/workers/__init__.py
Normal file
1
backend/app/workers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Celery workers
|
||||
36
backend/app/workers/celery_app.py
Normal file
36
backend/app/workers/celery_app.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from celery import Celery
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
# Celery application for the project. The same settings.redis_url is used
# for both the broker and the result backend.
celery_app = Celery(
    "plant_scraper",
    broker=settings.redis_url,
    backend=settings.redis_url,
    # Task modules registered with this app
    include=[
        "app.workers.scrape_tasks",
        "app.workers.quality_tasks",
        "app.workers.export_tasks",
        "app.workers.stats_tasks",
    ],
)

celery_app.conf.update(
    # Task payloads and results are exchanged as JSON only
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="UTC",
    enable_utc=True,
    task_track_started=True,
    task_time_limit=3600 * 24,  # 24 hour max per task
    worker_prefetch_multiplier=1,
    task_acks_late=True,
    # Periodic tasks run by celery beat
    beat_schedule={
        "refresh-stats-every-5min": {
            "task": "app.workers.stats_tasks.refresh_stats",
            "schedule": 300.0,  # Every 5 minutes
        },
    },
    # NOTE(review): presumably /tmp is used because it is writable inside the
    # container; the schedule state would not survive a /tmp wipe. Confirm.
    beat_schedule_filename="/tmp/celerybeat-schedule",
)
|
||||
170
backend/app/workers/export_tasks.py
Normal file
170
backend/app/workers/export_tasks.py
Normal file
@@ -0,0 +1,170 @@
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import zipfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
from app.database import SessionLocal
|
||||
from app.models import Export, Image, Species
|
||||
from app.config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
@celery_app.task(bind=True)
def generate_export(self, export_id: int):
    """Generate a zip export for CoreML training.

    Images with status "downloaded" that match the export's filter criteria
    are copied into ``Training/<species>/`` and ``Testing/<species>/``
    folders (split by ``export.train_split``), zipped into
    ``export_<id>.zip`` under ``settings.exports_path``, and the ``Export``
    row is updated with the outcome.

    Args:
        export_id: Primary key of the ``Export`` row to build.

    Returns:
        A summary dict on success, or ``{"error": ...}`` when the export row
        does not exist or no species has enough matching images.

    Raises:
        Exception: any failure is recorded on the export row (status
            "failed") and then re-raised.
    """
    from sqlalchemy import func

    db = SessionLocal()
    # Bound up front so the failure handler can always refer to them.
    export = None
    export_dir = None

    def fail_no_species():
        """Mark the export as failed because nothing qualifies."""
        export.status = "failed"
        export.error_message = "No species meet the criteria"
        export.completed_at = datetime.utcnow()
        db.commit()
        return {"error": "No species meet the criteria"}

    try:
        export = db.query(Export).filter(Export.id == export_id).first()
        if not export:
            return {"error": "Export not found"}

        # Update status
        export.status = "generating"
        export.celery_task_id = self.request.id
        db.commit()

        # Parse filter criteria
        criteria = json.loads(export.filter_criteria) if export.filter_criteria else {}
        min_images = criteria.get("min_images_per_species", 100)
        licenses = criteria.get("licenses")
        min_quality = criteria.get("min_quality")
        species_ids = criteria.get("species_ids")

        # Build query for images
        query = db.query(Image).filter(Image.status == "downloaded")

        if licenses:
            query = query.filter(Image.license.in_(licenses))

        if min_quality:
            query = query.filter(Image.quality_score >= min_quality)

        if species_ids:
            query = query.filter(Image.species_id.in_(species_ids))

        # Group by species and filter by min count. Rows are unpacked
        # positionally: attribute access via a "count" label would resolve to
        # the tuple-style Row.count method instead of the column value.
        species_counts = db.query(
            Image.species_id,
            func.count(Image.id).label("image_count")
        ).filter(Image.status == "downloaded").group_by(Image.species_id).all()

        valid_species_ids = [
            species_id
            for species_id, image_count in species_counts
            if image_count >= min_images
        ]

        if species_ids:
            valid_species_ids = [s for s in valid_species_ids if s in species_ids]

        if not valid_species_ids:
            return fail_no_species()

        # Create export directory
        export_dir = Path(settings.exports_path) / f"export_{export_id}"
        train_dir = export_dir / "Training"
        test_dir = export_dir / "Testing"
        train_dir.mkdir(parents=True, exist_ok=True)
        test_dir.mkdir(parents=True, exist_ok=True)

        total_images = 0
        species_count = 0

        # Process each valid species
        for i, species_id in enumerate(valid_species_ids):
            species = db.query(Species).filter(Species.id == species_id).first()
            if not species:
                continue

            # `query` already carries the license / quality / species filters,
            # so the count here is the filtered one.
            images = query.filter(Image.species_id == species_id).all()
            if len(images) < min_images:
                continue

            species_count += 1

            # Create species folders
            species_name = species.scientific_name.replace(" ", "_")
            (train_dir / species_name).mkdir(exist_ok=True)
            (test_dir / species_name).mkdir(exist_ok=True)

            # Shuffle and split
            random.shuffle(images)
            split_idx = int(len(images) * export.train_split)
            subsets = (
                (train_dir, images[:split_idx]),
                (test_dir, images[split_idx:]),
            )

            # Copy images; rows whose file is missing on disk are skipped
            for target_dir, subset in subsets:
                for j, img in enumerate(subset):
                    if img.local_path and os.path.exists(img.local_path):
                        ext = Path(img.local_path).suffix or ".jpg"
                        dest = target_dir / species_name / f"img_{j:05d}{ext}"
                        shutil.copy2(img.local_path, dest)
                        total_images += 1

            # Update progress
            self.update_state(
                state="PROGRESS",
                meta={
                    "current": i + 1,
                    "total": len(valid_species_ids),
                    "species": species.scientific_name,
                }
            )

        # The first pass counted images without the license/quality filters,
        # so every species may still have been skipped above. Report that as a
        # failure instead of producing an empty archive.
        if species_count == 0:
            shutil.rmtree(export_dir, ignore_errors=True)
            return fail_no_species()

        # Create zip file
        zip_path = Path(settings.exports_path) / f"export_{export_id}.zip"
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(export_dir):
                for file in files:
                    file_path = Path(root) / file
                    arcname = file_path.relative_to(export_dir)
                    zipf.write(file_path, arcname)

        # Clean up directory
        shutil.rmtree(export_dir)

        # Update export record
        export.status = "completed"
        export.file_path = str(zip_path)
        export.file_size = zip_path.stat().st_size
        export.species_count = species_count
        export.image_count = total_images
        export.completed_at = datetime.utcnow()
        db.commit()

        return {
            "status": "completed",
            "species_count": species_count,
            "image_count": total_images,
            "file_size": export.file_size,
        }

    except Exception as e:
        # A session whose flush/commit failed must be rolled back before the
        # failure status can be written.
        db.rollback()
        # Do not leave a half-built staging directory behind.
        if export_dir is not None:
            shutil.rmtree(export_dir, ignore_errors=True)
        if export:
            export.status = "failed"
            export.error_message = str(e)
            export.completed_at = datetime.utcnow()
            db.commit()
        raise
    finally:
        db.close()
|
||||
224
backend/app/workers/quality_tasks.py
Normal file
224
backend/app/workers/quality_tasks.py
Normal file
@@ -0,0 +1,224 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from PIL import Image as PILImage
|
||||
import imagehash
|
||||
import numpy as np
|
||||
from scipy import ndimage
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
from app.database import SessionLocal
|
||||
from app.models import Image
|
||||
from app.config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
def calculate_blur_score(image_path: str) -> float:
    """Calculate blur score using Laplacian variance. Higher = sharper.

    Returns 0.0 when the image cannot be read or processed.
    """
    try:
        with PILImage.open(image_path) as img:
            # Work in float64: ndimage.laplace returns the input dtype, so on
            # a uint8 array negative responses would wrap around modulo 256
            # and inflate the variance.
            gray = np.asarray(img.convert("L"), dtype=np.float64)
        laplacian = ndimage.laplace(gray)
        return float(np.var(laplacian))
    except Exception:
        return 0.0
|
||||
|
||||
|
||||
def calculate_phash(image_path: str) -> str:
    """Calculate perceptual hash for deduplication.

    Returns the hash as a string, or "" when the image cannot be opened or
    hashed.
    """
    try:
        # The context manager closes the file once hashing is done instead of
        # leaving that to garbage collection.
        with PILImage.open(image_path) as img:
            return str(imagehash.phash(img))
    except Exception:
        return ""
|
||||
|
||||
|
||||
def check_color_distribution(image_path: str) -> tuple[bool, str]:
    """Check if image has healthy color distribution for a plant photo.

    Returns (passed, reason) tuple; ``reason`` is empty when the check passes.
    Rejects:
    - Low color variance (mean channel std < 25): herbarium specimens (brown on white)
    - No green + low variance (green ratio < 5% AND mean std < 40): monochrome illustrations

    An image that cannot be read is reported as passed, so an error in this
    check never rejects an image on its own.
    """
    try:
        # Close the file as soon as the pixel data has been copied out.
        with PILImage.open(image_path) as img:
            arr = np.array(img.convert("RGB"), dtype=np.float64)

        # Per-channel standard deviation
        channel_stds = arr.std(axis=(0, 1))  # [R_std, G_std, B_std]
        mean_std = float(channel_stds.mean())

        if mean_std < 25:
            return False, f"Low color variance ({mean_std:.1f})"

        # Check green ratio: share of the green mean in the sum of channel means
        channel_means = arr.mean(axis=(0, 1))
        total = channel_means.sum()
        green_ratio = channel_means[1] / total if total > 0 else 0

        if green_ratio < 0.05 and mean_std < 40:
            return False, f"No green ({green_ratio:.2%}) + low variance ({mean_std:.1f})"

        return True, ""
    except Exception:
        return True, ""  # Don't reject on error
|
||||
|
||||
|
||||
def resize_image(image_path: str, target_size: int = 512) -> bool:
    """Resize image to target size while maintaining aspect ratio.

    The file is rewritten in place (format chosen from the file extension),
    even when it is already small enough.

    Args:
        image_path: Path of the image to shrink and overwrite.
        target_size: Maximum length of either side after resizing.

    Returns:
        True on success, False if the image could not be read or written.
    """
    # Modes the JPEG writer accepts as-is; anything else (RGBA, P, LA, ...)
    # must be converted or saving to a .jpg path fails.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    try:
        with PILImage.open(image_path) as img:
            img.thumbnail((target_size, target_size), PILImage.Resampling.LANCZOS)

            targets_jpeg = Path(image_path).suffix.lower() in (".jpg", ".jpeg")
            if targets_jpeg and img.mode not in jpeg_modes:
                img = img.convert("RGB")

            img.save(image_path, quality=95)
        return True
    except Exception:
        return False
|
||||
|
||||
|
||||
def _reject_image(db, image, reason: str, local_path=None) -> dict:
    """Mark ``image`` as rejected, delete its local file if present, and commit.

    Returns the ``{"error": reason}`` payload the task hands back to Celery.
    """
    if local_path is not None and local_path.exists():
        os.remove(local_path)
    image.status = "rejected"
    db.commit()
    return {"error": reason}


@celery_app.task
def download_and_process_image(image_id: int):
    """Download image, check quality, dedupe, and resize.

    Pipeline: fetch the image URL, require at least 256x256 pixels, reject
    perceptual-hash duplicates of already downloaded images, reject blurry
    images (blur score < 100), reject non-photo color distributions, then
    resize and mark the record as ``"downloaded"``.

    Args:
        image_id: Primary key of the ``Image`` row to process.

    Returns:
        On success ``{"status": "success", "path": ..., "quality_score": ...}``;
        otherwise ``{"error": <reason>}``. Every rejection sets the row's
        status to ``"rejected"``.
    """
    db = SessionLocal()
    # Bound before the try block so the error handler can test it even when
    # the initial query itself raises.
    image = None
    try:
        image = db.query(Image).filter(Image.id == image_id).first()
        if not image:
            return {"error": "Image not found"}

        # Create directory for species
        species = image.species
        species_dir = Path(settings.images_path) / species.scientific_name.replace(" ", "_")
        species_dir.mkdir(parents=True, exist_ok=True)

        # Download image
        filename = f"{image.source}_{image.source_id or image.id}.jpg"
        local_path = species_dir / filename

        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
            }
            with httpx.Client(timeout=30, headers=headers, follow_redirects=True) as client:
                response = client.get(image.url)
                response.raise_for_status()

            with open(local_path, "wb") as f:
                f.write(response.content)
        except Exception as e:
            # Passing local_path also removes a partially written file.
            return _reject_image(db, image, f"Download failed: {e}", local_path)

        # Check minimum size
        try:
            with PILImage.open(local_path) as img:
                width, height = img.size
        except Exception as e:
            return _reject_image(db, image, f"Invalid image: {e}", local_path)

        # Evaluated after the file is closed so the delete never targets an
        # open file.
        if width < 256 or height < 256:
            return _reject_image(db, image, "Image too small", local_path)
        image.width = width
        image.height = height

        # Calculate perceptual hash for deduplication
        phash = calculate_phash(str(local_path))
        if phash:
            # Check for duplicates among images that were already accepted
            existing = db.query(Image).filter(
                Image.phash == phash,
                Image.id != image.id,
                Image.status == "downloaded",
            ).first()

            # The hash is stored on the row whether or not it is a duplicate.
            image.phash = phash
            if existing:
                return _reject_image(db, image, "Duplicate image", local_path)

        # Calculate blur score
        quality_score = calculate_blur_score(str(local_path))
        image.quality_score = quality_score

        # Reject very blurry images (threshold can be tuned)
        if quality_score < 100:  # Low variance = blurry
            return _reject_image(db, image, "Image too blurry", local_path)

        # Check color distribution (reject herbarium specimens, illustrations)
        color_ok, color_reason = check_color_distribution(str(local_path))
        if not color_ok:
            return _reject_image(db, image, f"Non-photo content: {color_reason}", local_path)

        # Resize to 512x512 max. The result is intentionally not checked: a
        # failed resize keeps the original-size file.
        resize_image(str(local_path))

        # Update image record
        image.local_path = str(local_path)
        image.status = "downloaded"
        db.commit()

        return {
            "status": "success",
            "path": str(local_path),
            "quality_score": quality_score,
        }

    except Exception as e:
        # Discard any half-applied changes before touching the session again;
        # a failed flush otherwise makes the commit below fail as well.
        db.rollback()
        if image is not None:
            image.status = "rejected"
            db.commit()
        return {"error": str(e)}
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task(bind=True)
def batch_process_pending_images(self, source: str | None = None, chunk_size: int = 500):
    """Process ALL pending images in chunks, with progress tracking.

    Queues one ``download_and_process_image`` task per pending image and
    reports progress through the Celery ``PROGRESS`` state.

    Pagination is keyed on ``Image.id`` rather than OFFSET: the queued tasks
    change image statuses while this loop is still running, which shrinks the
    pending set and would make OFFSET-based paging skip rows.

    Args:
        source: If given, only images from this source are queued.
        chunk_size: Number of image ids fetched per query.

    Returns:
        ``{"queued": <tasks queued>, "total": <pending images>}``. ``total``
        is counted once up front and raised to ``queued`` if more images
        turned up during the run.
    """
    db = SessionLocal()
    try:
        # Only the ids are needed to enqueue work, so skip loading full rows.
        query = db.query(Image.id).filter(Image.status == "pending")
        if source:
            query = query.filter(Image.source == source)

        total = query.count()
        queued = 0
        last_id = None

        while True:
            page = query
            if last_id is not None:
                page = page.filter(Image.id > last_id)
            chunk = page.order_by(Image.id).limit(chunk_size).all()
            if not chunk:
                break

            for (image_id,) in chunk:
                download_and_process_image.delay(image_id)
                queued += 1

            last_id = chunk[-1][0]
            # Keep the reported total from falling behind the queued count.
            total = max(total, queued)

            self.update_state(
                state="PROGRESS",
                meta={"queued": queued, "total": total},
            )

        return {"queued": queued, "total": total}
    finally:
        db.close()
|
||||
164
backend/app/workers/scrape_tasks.py
Normal file
164
backend/app/workers/scrape_tasks.py
Normal file
@@ -0,0 +1,164 @@
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
from app.database import SessionLocal
|
||||
from app.models import Job, Species, Image
|
||||
from app.utils.logging import get_job_logger
|
||||
|
||||
|
||||
@celery_app.task(bind=True)
def run_scrape_job(self, job_id: int):
    """Main scrape task that dispatches to source-specific scrapers.

    Loads the job, selects the species to scrape (optionally restricted by
    the job's species filter and by downloaded-image count), then runs the
    scraper for the job's source on each species while recording progress on
    the job row and in the Celery task state.

    A failure on one species is logged and skipped. Any other failure marks
    the job as ``"failed"`` and is re-raised.

    Args:
        job_id: Primary key of the ``Job`` row to run.

    Returns:
        ``{"status": "completed", "downloaded": ..., "rejected": ...}`` on
        success, or ``{"error": <message>}`` when the job or its scraper
        cannot be found.
    """
    logger = get_job_logger(job_id)
    logger.info(f"Starting scrape job {job_id}")

    db = SessionLocal()
    job = None
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not job:
            logger.error(f"Job {job_id} not found")
            return {"error": "Job not found"}

        logger.info(f"Job: {job.name}, Source: {job.source}")

        # Update job status
        job.status = "running"
        job.started_at = datetime.utcnow()
        job.celery_task_id = self.request.id
        db.commit()

        # Get species to scrape
        if job.species_filter:
            species_ids = json.loads(job.species_filter)
            query = db.query(Species).filter(Species.id.in_(species_ids))
            logger.info(f"Filtered to species IDs: {species_ids}")
        else:
            query = db.query(Species)
            logger.info("Scraping all species")

        # Filter by image count if requested
        if job.only_without_images or job.max_images:
            from sqlalchemy import func

            # Subquery to count downloaded images per species
            image_count_subquery = (
                db.query(Image.species_id, func.count(Image.id).label("count"))
                .filter(Image.status == "downloaded")
                .group_by(Image.species_id)
                .subquery()
            )
            # Left join so species without any downloaded image are kept;
            # their count column is NULL.
            query = query.outerjoin(
                image_count_subquery,
                Species.id == image_count_subquery.c.species_id,
            )
            downloaded_count = image_count_subquery.c.count

            if job.only_without_images:
                # Filter where count is NULL or 0
                query = query.filter(
                    (downloaded_count.is_(None)) | (downloaded_count == 0)
                )
                logger.info("Filtering to species without images")
            elif job.max_images:
                # Filter where count is NULL or less than max_images
                query = query.filter(
                    (downloaded_count.is_(None)) | (downloaded_count < job.max_images)
                )
                logger.info(f"Filtering to species with fewer than {job.max_images} images")

        species_list = query.all()
        species_total = len(species_list)
        logger.info(f"Total species to scrape: {species_total}")

        job.progress_total = species_total
        db.commit()

        # Import scraper based on source
        from app.scrapers import get_scraper
        scraper = get_scraper(job.source)

        if not scraper:
            error_msg = f"Unknown source: {job.source}"
            logger.error(error_msg)
            job.status = "failed"
            job.error_message = error_msg
            job.completed_at = datetime.utcnow()
            db.commit()
            return {"error": error_msg}

        logger.info(f"Using scraper: {scraper.name}")

        # Scrape each species
        for position, species in enumerate(species_list, start=1):
            # Cached so the error handler can log the name without touching
            # the session, which may be in a failed state at that point.
            species_name = f"species #{position}"
            try:
                # Update progress
                job.progress_current = position
                db.commit()

                species_name = species.scientific_name
                logger.info(f"[{position}/{species_total}] Scraping: {species_name}")

                # Update task state for real-time monitoring
                self.update_state(
                    state="PROGRESS",
                    meta={
                        "current": position,
                        "total": species_total,
                        "species": species_name,
                    },
                )

                # Run scraper for this species
                results = scraper.scrape_species(species, db, logger)
                downloaded = results.get("downloaded", 0)
                rejected = results.get("rejected", 0)
                # "or 0" tolerates counters that were never initialised.
                job.images_downloaded = (job.images_downloaded or 0) + downloaded
                job.images_rejected = (job.images_rejected or 0) + rejected
                db.commit()

                logger.info(f"  -> Downloaded: {downloaded}, Rejected: {rejected}")

            except Exception as e:
                # Log error but continue with other species
                logger.error(f"Error scraping {species_name}: {e}", exc_info=True)
                # Reset the session: after a failed flush every later commit
                # would fail too and abort the whole job.
                db.rollback()

        # Mark job complete
        job.status = "completed"
        job.completed_at = datetime.utcnow()
        db.commit()

        logger.info(f"Job {job_id} completed. Total downloaded: {job.images_downloaded}, rejected: {job.images_rejected}")

        return {
            "status": "completed",
            "downloaded": job.images_downloaded,
            "rejected": job.images_rejected,
        }

    except Exception as e:
        logger.error(f"Job {job_id} failed with error: {e}", exc_info=True)
        if job:
            try:
                # Drop whatever the failed transaction left behind before
                # writing the failure status.
                db.rollback()
                job.status = "failed"
                job.error_message = str(e)
                job.completed_at = datetime.utcnow()
                db.commit()
            except Exception:
                # Never let bookkeeping hide the original failure.
                logger.error(f"Could not record failure for job {job_id}", exc_info=True)
        raise
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task
def pause_scrape_job(job_id: int):
    """Pause a running scrape job.

    When the job exists and is currently ``"running"``, its status is set to
    ``"paused"`` and committed, after which the Celery task recorded on the
    job (if any) is revoked with ``terminate=True``.

    Args:
        job_id: Primary key of the ``Job`` row to pause.

    Returns:
        ``{"status": "paused"}``.
    """
    session = SessionLocal()
    try:
        target = session.query(Job).filter(Job.id == job_id).first()

        # Nothing to do for unknown jobs or jobs that are not running.
        if not target or target.status != "running":
            return {"status": "paused"}

        target.status = "paused"
        session.commit()

        # Revoke the Celery task
        if target.celery_task_id:
            celery_app.control.revoke(target.celery_task_id, terminate=True)

        return {"status": "paused"}
    finally:
        session.close()
|
||||
193
backend/app/workers/stats_tasks.py
Normal file
193
backend/app/workers/stats_tasks.py
Normal file
@@ -0,0 +1,193 @@
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import func, case, text
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
from app.database import SessionLocal
|
||||
from app.models import Species, Image, Job
|
||||
from app.models.cached_stats import CachedStats
|
||||
from app.config import get_settings
|
||||
|
||||
|
||||
def get_directory_size_fast(path: str) -> int:
    """Get the total size in bytes of all regular files under ``path``.

    Walks the tree with ``os.scandir`` using an explicit stack, so directory
    depth is not bounded by the interpreter's recursion limit. Symbolic links
    are neither followed nor counted. Entries and directories that cannot be
    read are skipped, so the result is a best-effort total.

    Args:
        path: Directory to measure.

    Returns:
        Sum of the file sizes found; 0 if ``path`` is missing or unreadable.
    """
    total = 0
    pending = [path]
    while pending:
        current = pending.pop()
        try:
            with os.scandir(current) as entries:
                for entry in entries:
                    try:
                        if entry.is_file(follow_symlinks=False):
                            total += entry.stat(follow_symlinks=False).st_size
                        elif entry.is_dir(follow_symlinks=False):
                            pending.append(entry.path)
                    except OSError:
                        # Entry vanished or is not accessible: skip it.
                        # (PermissionError is a subclass of OSError.)
                        continue
        except OSError:
            # Directory missing or unreadable: count what was seen so far.
            continue
    return total
|
||||
|
||||
|
||||
@celery_app.task
def refresh_stats():
    """Calculate and cache dashboard statistics.

    Runs a handful of aggregate SQL queries (overall counts, per-source and
    per-license breakdowns, job counts, top and under-represented species),
    measures the image directory's disk usage, and stores the result as JSON
    in the ``CachedStats`` row keyed ``"dashboard_stats"``.

    Returns:
        ``{"status": "success", "total_species": ..., "total_images": ...}``.

    Raises:
        Any exception from the queries or the commit, after printing it.
    """
    print("=== STATS TASK: Starting refresh ===", flush=True)

    def species_entries(rows):
        # Rows are (id, scientific_name, common_name, image_count).
        fields = ("id", "scientific_name", "common_name", "image_count")
        return [dict(zip(fields, row)) for row in rows]

    db = SessionLocal()
    try:
        # Use raw SQL for maximum performance on SQLite
        # All counts in a single query
        counts_row = db.execute(text("""
            SELECT
                (SELECT COUNT(*) FROM species) as total_species,
                (SELECT COUNT(*) FROM images) as total_images,
                (SELECT COUNT(*) FROM images WHERE status = 'downloaded') as images_downloaded,
                (SELECT COUNT(*) FROM images WHERE status = 'pending') as images_pending,
                (SELECT COUNT(*) FROM images WHERE status = 'rejected') as images_rejected
        """)).fetchone()
        total_species = counts_row[0] or 0
        total_images = counts_row[1] or 0
        images_downloaded = counts_row[2] or 0
        images_pending = counts_row[3] or 0
        images_rejected = counts_row[4] or 0

        # Per-source stats - single query with GROUP BY
        source_rows = db.execute(text("""
            SELECT
                source,
                COUNT(*) as total,
                SUM(CASE WHEN status = 'downloaded' THEN 1 ELSE 0 END) as downloaded,
                SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,
                SUM(CASE WHEN status = 'rejected' THEN 1 ELSE 0 END) as rejected
            FROM images
            GROUP BY source
        """)).fetchall()
        sources = []
        for row in source_rows:
            sources.append({
                "source": row[0],
                "image_count": row[1],
                "downloaded": row[2] or 0,
                "pending": row[3] or 0,
                "rejected": row[4] or 0,
            })

        # Per-license stats - single indexed query
        license_rows = db.execute(text("""
            SELECT license, COUNT(*) as count
            FROM images
            WHERE status = 'downloaded'
            GROUP BY license
        """)).fetchall()
        licenses = [
            {"license": row[0], "count": row[1]}
            for row in license_rows
        ]

        # Job stats - single query
        job_row = db.execute(text("""
            SELECT
                SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) as running,
                SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,
                SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
                SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed
            FROM jobs
        """)).fetchone()
        jobs = {
            "running": job_row[0] or 0,
            "pending": job_row[1] or 0,
            "completed": job_row[2] or 0,
            "failed": job_row[3] or 0,
        }

        # Top species by image count - optimized with index
        top_species = species_entries(db.execute(text("""
            SELECT s.id, s.scientific_name, s.common_name, COUNT(i.id) as image_count
            FROM species s
            INNER JOIN images i ON i.species_id = s.id AND i.status = 'downloaded'
            GROUP BY s.id
            ORDER BY image_count DESC
            LIMIT 10
        """)).fetchall())

        # Under-represented species - use pre-computed counts
        under_represented = species_entries(db.execute(text("""
            SELECT s.id, s.scientific_name, s.common_name, COALESCE(img_counts.cnt, 0) as image_count
            FROM species s
            LEFT JOIN (
                SELECT species_id, COUNT(*) as cnt
                FROM images
                WHERE status = 'downloaded'
                GROUP BY species_id
            ) img_counts ON img_counts.species_id = s.id
            WHERE COALESCE(img_counts.cnt, 0) < 100
            ORDER BY image_count ASC
            LIMIT 10
        """)).fetchall())

        # Calculate disk usage (fast recursive scan)
        images_root = get_settings().images_path
        disk_usage_mb = round(get_directory_size_fast(images_root) / (1024 * 1024), 2)

        # Build the stats object (key order is part of the serialized output)
        stats = {
            "total_species": total_species,
            "total_images": total_images,
            "images_downloaded": images_downloaded,
            "images_pending": images_pending,
            "images_rejected": images_rejected,
            "disk_usage_mb": disk_usage_mb,
            "sources": sources,
            "licenses": licenses,
            "jobs": jobs,
            "top_species": top_species,
            "under_represented": under_represented,
        }
        payload = json.dumps(stats)

        # Store in database: insert the cache row on first run, update after
        cached = db.query(CachedStats).filter(CachedStats.key == "dashboard_stats").first()
        if not cached:
            db.add(CachedStats(key="dashboard_stats", value=payload))
        else:
            cached.value = payload
            cached.updated_at = datetime.utcnow()

        db.commit()
        print(f"=== STATS TASK: Refreshed (species={total_species}, images={total_images}) ===", flush=True)

        return {"status": "success", "total_species": total_species, "total_images": total_images}

    except Exception as e:
        print(f"=== STATS TASK ERROR: {e} ===", flush=True)
        raise
    finally:
        db.close()
|
||||
34
backend/requirements.txt
Normal file
34
backend/requirements.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
# Web framework
|
||||
fastapi==0.109.0
|
||||
uvicorn[standard]==0.27.0
|
||||
python-multipart==0.0.6
|
||||
|
||||
# Database
|
||||
sqlalchemy==2.0.25
|
||||
alembic==1.13.1
|
||||
aiosqlite==0.19.0
|
||||
|
||||
# Task queue
|
||||
celery==5.3.6
|
||||
redis==5.0.1
|
||||
|
||||
# Image processing
|
||||
Pillow==10.2.0
|
||||
imagehash==4.3.1
|
||||
imagededup==0.3.3.post2
|
||||
|
||||
# HTTP clients
|
||||
httpx==0.26.0
|
||||
aiohttp==3.9.3
|
||||
|
||||
# Search
|
||||
duckduckgo-search
|
||||
|
||||
# Utilities
|
||||
python-dotenv==1.0.0
|
||||
pydantic==2.5.3
|
||||
pydantic-settings==2.1.0
|
||||
|
||||
# Testing
|
||||
pytest==7.4.4
|
||||
pytest-asyncio==0.23.3
|
||||
1
backend/tests/__init__.py
Normal file
1
backend/tests/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Tests
|
||||
Reference in New Issue
Block a user