Initial commit — PlantGuideScraper project
This commit is contained in:
80
backend/app/utils/dedup.py
Normal file
80
backend/app/utils/dedup.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Image deduplication utilities using perceptual hashing."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import imagehash
|
||||
from PIL import Image as PILImage
|
||||
|
||||
|
||||
def calculate_phash(image_path: str) -> Optional[str]:
    """Compute the perceptual hash (pHash) of the image stored at a path.

    Args:
        image_path: Location of the image file to hash.

    Returns:
        The hash rendered as a hex string, or None when opening or
        hashing the image raised any exception.
    """
    digest: Optional[str] = None
    try:
        with PILImage.open(image_path) as picture:
            digest = str(imagehash.phash(picture))
    except Exception:
        # Best-effort contract: every failure is reported as None.
        digest = None
    return digest
|
||||
|
||||
|
||||
def calculate_dhash(image_path: str) -> Optional[str]:
    """Compute the difference hash (dHash) of the image stored at a path.

    Described by this module as faster but less accurate than the
    perceptual hash.

    Args:
        image_path: Location of the image file to hash.

    Returns:
        The hash rendered as a hex string, or None when opening or
        hashing the image raised any exception.
    """
    try:
        with PILImage.open(image_path) as picture:
            hex_digest = str(imagehash.dhash(picture))
    except Exception:
        # Best-effort contract: every failure is reported as None.
        return None
    else:
        return hex_digest
|
||||
|
||||
|
||||
def hashes_are_similar(hash1: str, hash2: str, threshold: int = 10) -> bool:
    """Report whether two hex-encoded hashes are close enough to be duplicates.

    Args:
        hash1: First hash, as a hex string.
        hash2: Second hash, as a hex string.
        threshold: Largest Hamming distance still counted as similar
            (default 10).

    Returns:
        True when the distance between the decoded hashes is at most
        ``threshold``; False otherwise, including when decoding or
        comparing the hashes raised any exception.
    """
    try:
        # Decode in argument order so a bad first hash fails before the
        # second one is touched.
        decoded = [imagehash.hex_to_hash(value) for value in (hash1, hash2)]
        distance = decoded[0] - decoded[1]
        within_threshold = distance <= threshold
    except Exception:
        return False
    return within_threshold
|
||||
|
||||
|
||||
def hamming_distance(hash1: str, hash2: str) -> int:
    """Return the Hamming distance between two hex-encoded hashes.

    Args:
        hash1: First hash, as a hex string.
        hash2: Second hash, as a hex string.

    Returns:
        The distance as an int (0 = identical, higher = more different).
        If decoding or subtracting the hashes raises any exception, 64 is
        returned instead.
    """
    try:
        left = imagehash.hex_to_hash(hash1)
        right = imagehash.hex_to_hash(hash2)
        distance = int(left - right)
    except Exception:
        # Fallback this module treats as the maximum distance; presumably
        # sized for 64-bit hashes -- confirm if other hash sizes are used.
        distance = 64
    return distance
|
||||
Reference in New Issue
Block a user