Initial commit — PlantGuideScraper project
This commit is contained in:
224
backend/app/workers/quality_tasks.py
Normal file
224
backend/app/workers/quality_tasks.py
Normal file
@@ -0,0 +1,224 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from PIL import Image as PILImage
|
||||
import imagehash
|
||||
import numpy as np
|
||||
from scipy import ndimage
|
||||
|
||||
from app.workers.celery_app import celery_app
|
||||
from app.database import SessionLocal
|
||||
from app.models import Image
|
||||
from app.config import get_settings
|
||||
|
||||
# Application settings, resolved once at import time; `settings.images_path`
# is read below as the root directory for downloaded images.
settings = get_settings()
|
||||
|
||||
|
||||
def calculate_blur_score(image_path: str) -> float:
    """Calculate a sharpness score as the variance of the image's Laplacian.

    The image is converted to grayscale and promoted to float64 before the
    Laplacian filter is applied. Higher scores mean sharper images.

    Args:
        image_path: Path of the image file to score.

    Returns:
        The variance of the Laplacian response, or 0.0 (the lowest possible
        score) if the image cannot be opened or processed.
    """
    try:
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the array, instead of waiting for GC.
        with PILImage.open(image_path) as img:
            # float64 matters: ndimage.laplace returns an array of the input
            # dtype, so filtering a uint8 array would wrap negative responses
            # around (e.g. -1 -> 255) and corrupt the variance.
            gray = np.asarray(img.convert("L"), dtype=np.float64)
        laplacian = ndimage.laplace(gray)
        return float(np.var(laplacian))
    except Exception:
        # Best-effort scoring: any failure is reported as the minimum score.
        return 0.0
|
||||
|
||||
|
||||
def calculate_phash(image_path: str) -> str:
    """Calculate the perceptual hash of an image, for deduplication.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        The string form of ``imagehash.phash`` for the image, or an empty
        string if the image cannot be opened or hashed.
    """
    try:
        # Close the file handle deterministically once hashing is done.
        with PILImage.open(image_path) as img:
            return str(imagehash.phash(img))
    except Exception:
        # Best-effort: callers treat "" as "no hash available".
        return ""
|
||||
|
||||
|
||||
def check_color_distribution(
    image_path: str,
    *,
    min_std: float = 25,
    min_green_ratio: float = 0.05,
    low_green_std: float = 40,
) -> tuple[bool, str]:
    """Check if image has healthy color distribution for a plant photo.

    Rejects:
    - Low color variance (mean channel std < ``min_std``): herbarium
      specimens (brown on white)
    - No green + low variance (green ratio < ``min_green_ratio`` AND mean
      std < ``low_green_std``): monochrome illustrations

    Args:
        image_path: Path of the image file to inspect.
        min_std: Minimum mean per-channel standard deviation to pass.
        min_green_ratio: Green share of the summed channel means below which
            the image counts as having no green.
        low_green_std: Mean-std ceiling that, combined with a low green
            ratio, triggers rejection.

    Returns:
        ``(passed, reason)``; ``reason`` is empty when the image passes.
        Images that cannot be read pass, so an I/O error never rejects one.
    """
    try:
        # Copy pixels into the array, then release the file handle.
        with PILImage.open(image_path) as img:
            arr = np.array(img.convert("RGB"), dtype=np.float64)

        # Per-channel standard deviation: [R_std, G_std, B_std]
        channel_stds = arr.std(axis=(0, 1))
        mean_std = float(channel_stds.mean())

        if mean_std < min_std:
            return False, f"Low color variance ({mean_std:.1f})"

        # Green share of the total mean intensity across the three channels.
        channel_means = arr.mean(axis=(0, 1))
        total = channel_means.sum()
        green_ratio = channel_means[1] / total if total > 0 else 0

        if green_ratio < min_green_ratio and mean_std < low_green_std:
            return False, f"No green ({green_ratio:.2%}) + low variance ({mean_std:.1f})"

        return True, ""
    except Exception:
        return True, ""  # Don't reject on error
|
||||
|
||||
|
||||
def resize_image(image_path: str, target_size: int = 512) -> bool:
    """Shrink an image in place so it fits within a square bounding box.

    The aspect ratio is kept, and ``thumbnail`` only ever reduces size, so
    images already within the box keep their dimensions. The result
    overwrites the original file.

    Args:
        image_path: Path of the image file to resize and overwrite.
        target_size: Maximum width and height, in pixels.

    Returns:
        True if the file was rewritten, False if opening, resizing or
        saving failed.
    """
    try:
        # Pillow picks the output format from the extension. JPEG cannot
        # store palette or alpha modes, so those must be converted first.
        saves_as_jpeg = Path(image_path).suffix.lower() in {".jpg", ".jpeg"}

        with PILImage.open(image_path) as src:
            src.thumbnail((target_size, target_size), PILImage.Resampling.LANCZOS)
            if saves_as_jpeg and src.mode not in ("L", "RGB", "CMYK"):
                resized = src.convert("RGB")
            else:
                # Detach from the source so it can be closed before writing.
                resized = src.copy()

        # Save after the source handle is closed, so the file is never
        # overwritten while it is still open for reading.
        resized.save(image_path, quality=95)
        return True
    except Exception:
        # Best-effort: failure is reported through the return value.
        return False
|
||||
|
||||
|
||||
# Smallest accepted width/height in pixels, and the minimum Laplacian-variance
# sharpness score (threshold can be tuned).
_MIN_DIMENSION = 256
_MIN_BLUR_SCORE = 100


def _discard_file(path) -> None:
    """Best-effort removal of a downloaded file; a missing file is fine."""
    if path is None:
        return
    try:
        Path(path).unlink(missing_ok=True)
    except OSError:
        # Cleanup must never mask the rejection that triggered it.
        pass


def _reject_image(db, image, reason: str, local_path=None) -> dict:
    """Delete the download (if given), mark the record rejected, return the error payload."""
    _discard_file(local_path)
    image.status = "rejected"
    db.commit()
    return {"error": reason}


@celery_app.task
def download_and_process_image(image_id: int):
    """Download image, check quality, dedupe, and resize.

    Steps, in order: fetch the image URL into the species directory, reject
    images smaller than 256px on either side, reject exact perceptual-hash
    duplicates of already-downloaded images, reject blurry images, reject
    images failing the color-distribution check, then resize the file and
    mark the record as "downloaded".

    Args:
        image_id: Primary key of the ``Image`` row to process.

    Returns:
        ``{"status": "success", "path": ..., "quality_score": ...}`` on
        success, otherwise ``{"error": <reason>}``. Every failure after the
        row is loaded sets the record's status to "rejected".
    """
    db = SessionLocal()
    # Bound up front so the outer error handler can reference them even when
    # the failure happens before they are assigned.
    image = None
    local_path = None
    try:
        image = db.query(Image).filter(Image.id == image_id).first()
        if not image:
            return {"error": "Image not found"}

        # Create directory for species
        species = image.species
        species_dir = Path(settings.images_path) / species.scientific_name.replace(" ", "_")
        species_dir.mkdir(parents=True, exist_ok=True)

        # Download image
        filename = f"{image.source}_{image.source_id or image.id}.jpg"
        local_path = species_dir / filename

        try:
            # Browser-like User-Agent (presumably because some hosts refuse
            # the default client UA; confirm before changing).
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
            }
            with httpx.Client(timeout=30, headers=headers, follow_redirects=True) as client:
                response = client.get(image.url)
                response.raise_for_status()

            with open(local_path, "wb") as f:
                f.write(response.content)
        except Exception as e:
            # The file on disk is left untouched here, as before.
            return _reject_image(db, image, f"Download failed: {e}")

        # Check minimum size. Dimensions are read inside the context manager
        # and the file is deleted only after it has been closed.
        try:
            with PILImage.open(local_path) as img:
                width, height = img.size
        except Exception as e:
            return _reject_image(db, image, f"Invalid image: {e}", local_path)

        if width < _MIN_DIMENSION or height < _MIN_DIMENSION:
            return _reject_image(db, image, "Image too small", local_path)
        image.width = width
        image.height = height

        # Calculate perceptual hash for deduplication
        phash = calculate_phash(str(local_path))
        if phash:
            # Check for duplicates among images that were already accepted.
            existing = db.query(Image).filter(
                Image.phash == phash,
                Image.id != image.id,
                Image.status == "downloaded",
            ).first()

            # The hash is stored on the record whether or not it is rejected.
            image.phash = phash
            if existing:
                return _reject_image(db, image, "Duplicate image", local_path)

        # Calculate blur score
        quality_score = calculate_blur_score(str(local_path))
        image.quality_score = quality_score

        # Reject very blurry images: low variance = blurry
        if quality_score < _MIN_BLUR_SCORE:
            return _reject_image(db, image, "Image too blurry", local_path)

        # Check color distribution (reject herbarium specimens, illustrations)
        color_ok, color_reason = check_color_distribution(str(local_path))
        if not color_ok:
            return _reject_image(db, image, f"Non-photo content: {color_reason}", local_path)

        # Resize to 512x512 max. The result is intentionally not checked: a
        # failed resize keeps the original-size file.
        resize_image(str(local_path))

        # Update image record
        image.local_path = str(local_path)
        image.status = "downloaded"
        db.commit()

        return {
            "status": "success",
            "path": str(local_path),
            "quality_score": quality_score,
        }

    except Exception as e:
        # Reset the session first: if the failure came from the database, the
        # commit below would otherwise raise again from inside this handler.
        db.rollback()
        # A rejected record should not leave its download behind.
        _discard_file(local_path)
        if image is not None:
            image.status = "rejected"
            db.commit()
        return {"error": str(e)}
    finally:
        db.close()
|
||||
|
||||
|
||||
@celery_app.task(bind=True)
def batch_process_pending_images(self, source: str = None, chunk_size: int = 500):
    """Process ALL pending images in chunks, with progress tracking.

    Queues one ``download_and_process_image`` task per pending image and
    reports progress through the task state after every chunk.

    Pages are fetched by keyset (``Image.id`` greater than the last id seen)
    rather than by OFFSET. Queued tasks move rows out of "pending" while this
    loop is still running, so an OFFSET into the shrinking result set would
    skip rows.

    Args:
        source: If given, only images with this ``source`` are queued.
        chunk_size: Maximum number of rows fetched per query.

    Returns:
        ``{"queued": <tasks queued>, "total": <pending count at start>}``.
    """
    db = SessionLocal()
    try:
        pending = db.query(Image).filter(Image.status == "pending")
        if source:
            pending = pending.filter(Image.source == source)

        total = pending.count()
        queued = 0
        last_id = None  # Highest Image.id queued so far (keyset cursor).

        while queued < total:
            page = pending if last_id is None else pending.filter(Image.id > last_id)
            # Cap the page so "queued" never exceeds the initial "total".
            page_size = min(chunk_size, total - queued)
            chunk = page.order_by(Image.id).limit(page_size).all()
            if not chunk:
                # Fewer rows than counted: some left "pending" in the meantime.
                break

            for image in chunk:
                download_and_process_image.delay(image.id)

            queued += len(chunk)
            last_id = chunk[-1].id

            self.update_state(
                state="PROGRESS",
                meta={"queued": queued, "total": total},
            )

        return {"queued": queued, "total": total}
    finally:
        db.close()
|
||||
Reference in New Issue
Block a user