Initial commit — PlantGuideScraper project

This commit is contained in:
Trey T
2026-04-12 09:54:27 -05:00
commit 6926f502c5
87 changed files with 29120 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
from app.schemas.species import SpeciesCreate, SpeciesUpdate, SpeciesResponse, SpeciesListResponse
from app.schemas.image import ImageResponse, ImageListResponse, ImageFilter
from app.schemas.job import JobCreate, JobResponse, JobListResponse
from app.schemas.api_key import ApiKeyCreate, ApiKeyUpdate, ApiKeyResponse
from app.schemas.export import ExportCreate, ExportResponse, ExportListResponse
from app.schemas.stats import StatsResponse, SourceStats, SpeciesStats
# Public names re-exported by this package, grouped by the module they come from.
__all__ = [
    # app.schemas.species
    "SpeciesCreate",
    "SpeciesUpdate",
    "SpeciesResponse",
    "SpeciesListResponse",
    # app.schemas.image
    "ImageResponse",
    "ImageListResponse",
    "ImageFilter",
    # app.schemas.job
    "JobCreate",
    "JobResponse",
    "JobListResponse",
    # app.schemas.api_key
    "ApiKeyCreate",
    "ApiKeyUpdate",
    "ApiKeyResponse",
    # app.schemas.export
    "ExportCreate",
    "ExportResponse",
    "ExportListResponse",
    # app.schemas.stats
    "StatsResponse",
    "SourceStats",
    "SpeciesStats",
]

View File

@@ -0,0 +1,36 @@
from pydantic import BaseModel
from typing import Optional
class ApiKeyBase(BaseModel):
    """Credential and rate-limit fields shared by API key schemas.

    Only ``source`` is required; every credential field defaults to ``None``.
    """

    source: str

    # May be omitted for sources that need no auth; doubles as the Client ID
    # for OAuth sources.
    api_key: Optional[str] = None

    # Doubles as the Client Secret for OAuth sources.
    api_secret: Optional[str] = None

    # For OAuth sources such as Wikimedia.
    access_token: Optional[str] = None

    rate_limit_per_sec: float = 1.0
    enabled: bool = True
class ApiKeyCreate(ApiKeyBase):
    """Payload for creating an API key entry; adds nothing to ``ApiKeyBase``."""
class ApiKeyUpdate(BaseModel):
    """Payload for updating an API key entry.

    Every field defaults to ``None``. ``source`` is not part of this schema.
    NOTE(review): presumably ``None`` means "leave unchanged" -- confirm
    against the code that applies the update.
    """

    # Credentials
    api_key: Optional[str] = None
    api_secret: Optional[str] = None
    access_token: Optional[str] = None

    # Settings
    rate_limit_per_sec: Optional[float] = None
    enabled: Optional[bool] = None
class ApiKeyResponse(BaseModel):
    """API key entry in response form.

    Carries no raw credential fields: the key appears only as
    ``api_key_masked``, and the secret and access token are reported as
    presence flags.
    """

    # Pydantic v2 configuration. Replaces the nested ``class Config``, which is
    # deprecated in v2 and emits a deprecation warning. ``from_attributes``
    # lets the model be validated from an object's attributes.
    model_config = {"from_attributes": True}

    id: int
    source: str
    api_key_masked: str  # Show only last 4 chars
    has_secret: bool
    has_access_token: bool
    rate_limit_per_sec: float
    enabled: bool

View File

@@ -0,0 +1,45 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional, List
class ExportFilter(BaseModel):
    """Selection criteria for an export.

    The list-valued criteria use ``None`` to mean "all".
    """

    min_images_per_species: int = 100

    # None means all
    licenses: Optional[List[str]] = None

    min_quality: Optional[float] = None

    # None means all
    species_ids: Optional[List[int]] = None
class ExportCreate(BaseModel):
    """Payload for creating an export.

    ``name`` and ``filter_criteria`` are required; ``train_split`` defaults
    to 0.8.
    """

    name: str
    filter_criteria: ExportFilter
    train_split: float = 0.8
class ExportResponse(BaseModel):
    """Export record in response form.

    ``id``, ``name``, ``train_split``, ``status`` and ``created_at`` are
    required; every other field defaults to ``None``.
    """

    # Pydantic v2 configuration. Replaces the nested ``class Config``, which is
    # deprecated in v2 and emits a deprecation warning. ``from_attributes``
    # lets the model be validated from an object's attributes.
    model_config = {"from_attributes": True}

    id: int
    name: str
    # A string here, not an ExportFilter.
    # NOTE(review): presumably the serialized filter -- confirm the format.
    filter_criteria: Optional[str] = None
    train_split: float
    status: str
    file_path: Optional[str] = None
    file_size: Optional[int] = None
    species_count: Optional[int] = None
    image_count: Optional[int] = None
    created_at: datetime
    completed_at: Optional[datetime] = None
    error_message: Optional[str] = None
class ExportListResponse(BaseModel):
    """A list of exports together with a ``total`` count."""

    items: List[ExportResponse]
    total: int
class ExportPreview(BaseModel):
    """Species count, image count and estimated size (MB) for an export."""

    species_count: int
    image_count: int
    estimated_size_mb: float

View File

@@ -0,0 +1,47 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional, List
class ImageBase(BaseModel):
    """Minimal image fields: owning species, source, URL and license.

    All four fields are required.
    """

    species_id: int
    source: str
    url: str
    license: str
class ImageResponse(BaseModel):
    """Image record in response form.

    ``id``, ``species_id``, ``source``, ``url``, ``license``, ``status`` and
    ``created_at`` are required; every other field defaults to ``None``.
    """

    # Pydantic v2 configuration. Replaces the nested ``class Config``, which is
    # deprecated in v2 and emits a deprecation warning. ``from_attributes``
    # lets the model be validated from an object's attributes.
    model_config = {"from_attributes": True}

    id: int
    species_id: int
    species_name: Optional[str] = None
    source: str
    source_id: Optional[str] = None
    url: str
    local_path: Optional[str] = None
    license: str
    attribution: Optional[str] = None
    width: Optional[int] = None
    height: Optional[int] = None
    quality_score: Optional[float] = None
    status: str
    created_at: datetime
class ImageListResponse(BaseModel):
    """A page of images plus the paging counters that accompany it."""

    items: List[ImageResponse]

    # Paging counters
    total: int
    page: int
    page_size: int
    pages: int
class ImageFilter(BaseModel):
    """Optional criteria for filtering images; every field defaults to ``None``.

    NOTE(review): presumably a ``None`` criterion is not applied -- confirm
    against the query code that consumes this schema.
    """

    species_id: Optional[int] = None
    source: Optional[str] = None
    license: Optional[str] = None
    status: Optional[str] = None
    min_quality: Optional[float] = None
    search: Optional[str] = None

View File

@@ -0,0 +1,35 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional, List
class JobCreate(BaseModel):
    """Payload for creating a scrape job.

    ``name`` and ``source`` are required; the remaining fields narrow which
    species the job covers.
    """

    name: str
    source: str

    # None means all species
    species_ids: Optional[List[int]] = None

    # If True, only scrape species with 0 images
    only_without_images: bool = False

    # If set, only scrape species with fewer than N images
    max_images: Optional[int] = None
class JobResponse(BaseModel):
    """Job record in response form.

    The progress and image counters are required integers.
    ``species_filter``, the start/completion timestamps and
    ``error_message`` default to ``None``.
    """

    # Pydantic v2 configuration. Replaces the nested ``class Config``, which is
    # deprecated in v2 and emits a deprecation warning. ``from_attributes``
    # lets the model be validated from an object's attributes.
    model_config = {"from_attributes": True}

    id: int
    name: str
    source: str
    species_filter: Optional[str] = None
    status: str
    progress_current: int
    progress_total: int
    images_downloaded: int
    images_rejected: int
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    error_message: Optional[str] = None
    created_at: datetime
class JobListResponse(BaseModel):
    """A list of jobs together with a ``total`` count."""

    items: List[JobResponse]
    total: int

View File

@@ -0,0 +1,44 @@
from pydantic import BaseModel
from datetime import datetime
from typing import Optional, List
class SpeciesBase(BaseModel):
    """Naming and taxonomy fields shared by species schemas.

    Only ``scientific_name`` is required.
    """

    scientific_name: str

    # Optional descriptive fields
    common_name: Optional[str] = None
    genus: Optional[str] = None
    family: Optional[str] = None
class SpeciesCreate(SpeciesBase):
    """Payload for creating a species; adds nothing to ``SpeciesBase``."""
class SpeciesUpdate(BaseModel):
    """Payload for updating a species.

    Every field, including ``scientific_name``, defaults to ``None``.
    NOTE(review): presumably ``None`` means "leave unchanged" -- confirm
    against the code that applies the update.
    """

    scientific_name: Optional[str] = None
    common_name: Optional[str] = None
    genus: Optional[str] = None
    family: Optional[str] = None
class SpeciesResponse(SpeciesBase):
    """Species record in response form.

    Extends ``SpeciesBase`` with ``id``, ``created_at`` and ``image_count``,
    which defaults to 0.
    """

    # Pydantic v2 configuration. Replaces the nested ``class Config``, which is
    # deprecated in v2 and emits a deprecation warning. ``from_attributes``
    # lets the model be validated from an object's attributes.
    model_config = {"from_attributes": True}

    id: int
    created_at: datetime
    image_count: int = 0
class SpeciesListResponse(BaseModel):
    """A page of species plus the paging counters that accompany it."""

    items: List[SpeciesResponse]

    # Paging counters
    total: int
    page: int
    page_size: int
    pages: int
class SpeciesImportResponse(BaseModel):
    """Result of a species import: imported and skipped counts plus error strings."""

    imported: int
    skipped: int
    errors: List[str]

View File

@@ -0,0 +1,43 @@
from pydantic import BaseModel
from typing import List, Dict
class SourceStats(BaseModel):
    """Image counts for a single source: overall, downloaded, pending, rejected."""

    source: str
    image_count: int

    # Counts by status
    downloaded: int
    pending: int
    rejected: int
class LicenseStats(BaseModel):
    """A license identifier paired with a count."""

    license: str
    count: int
class SpeciesStats(BaseModel):
    """One species' identifying fields together with its image count."""

    id: int
    scientific_name: str
    # Nullable but has no default, so a value (possibly None) must be supplied.
    common_name: str | None
    image_count: int
class JobStats(BaseModel):
    """Job counts: running, pending, completed and failed."""

    running: int
    pending: int
    completed: int
    failed: int
class StatsResponse(BaseModel):
    """Aggregate statistics: totals, disk usage and per-category breakdowns.

    All fields are required.
    """

    # Totals
    total_species: int
    total_images: int
    images_downloaded: int
    images_pending: int
    images_rejected: int
    disk_usage_mb: float

    # Breakdowns
    sources: List[SourceStats]
    licenses: List[LicenseStats]
    jobs: JobStats
    top_species: List[SpeciesStats]

    # Species with < 100 images
    under_represented: List[SpeciesStats]