81 lines
1.9 KiB
Python
81 lines
1.9 KiB
Python
"""Image deduplication utilities using perceptual hashing."""
|
|
|
|
from typing import Optional
|
|
|
|
import imagehash
|
|
from PIL import Image as PILImage
|
|
|
|
|
|
def calculate_phash(image_path: str) -> Optional[str]:
    """
    Compute the perceptual hash (pHash) of the image stored at a path.

    This is a best-effort helper: any exception raised while opening or
    hashing the image is swallowed and reported to the caller as ``None``.

    Args:
        image_path: Path to image file

    Returns:
        The hash rendered via ``str()`` (a hex string), or None if the
        image could not be opened or hashed
    """
    digest: Optional[str] = None
    try:
        # The context manager closes the image file once hashing is done.
        with PILImage.open(image_path) as picture:
            digest = str(imagehash.phash(picture))
    except Exception:
        # Intentionally broad: unreadable, missing or corrupt files all
        # map to the same "no hash available" result.
        digest = None
    return digest
|
|
|
|
|
|
def calculate_dhash(image_path: str) -> Optional[str]:
    """
    Compute the difference hash (dHash) of the image stored at a path.

    Faster but less accurate than phash. Like the other hashing helpers in
    this module it never raises: every failure is reported as ``None``.

    Args:
        image_path: Path to image file

    Returns:
        The hash rendered via ``str()`` (a hex string), or None if the
        image could not be opened or hashed
    """
    try:
        with PILImage.open(image_path) as source_image:
            fingerprint = imagehash.dhash(source_image)
            # Stringify while the image is still open so that any error
            # raised on close is also caught by the handler below.
            return str(fingerprint)
    except Exception:
        # Intentionally broad: callers only need to know hashing failed.
        return None
|
|
|
|
|
|
def hashes_are_similar(hash1: str, hash2: str, threshold: int = 10) -> bool:
    """
    Decide whether two hex-encoded hashes likely describe duplicate images.

    Both strings are decoded with ``imagehash.hex_to_hash`` and the
    difference between the decoded hashes (their Hamming distance) is
    compared against ``threshold``.

    Args:
        hash1: First hash string
        hash2: Second hash string
        threshold: Maximum Hamming distance (default 10), inclusive

    Returns:
        True if the distance is at most ``threshold``; False if it is
        larger or if decoding/comparing the hashes raised any exception
    """
    similar = False
    try:
        distance = imagehash.hex_to_hash(hash1) - imagehash.hex_to_hash(hash2)
        # The comparison stays inside the try so that an unusable
        # threshold is treated as "not similar" rather than raising.
        similar = distance <= threshold
    except Exception:
        # Malformed or mismatched hashes are never considered duplicates.
        similar = False
    return similar
|
|
|
|
|
|
def hamming_distance(hash1: str, hash2: str) -> int:
    """
    Return the Hamming distance between two hex-encoded hashes.

    Both strings are decoded with ``imagehash.hex_to_hash``; subtracting
    the decoded hashes yields the distance, which is converted to ``int``.

    Args:
        hash1: First hash string
        hash2: Second hash string

    Returns:
        Hamming distance (0 = identical, higher = more different). If the
        hashes cannot be decoded or compared, 64 is returned as the
        maximum distance.
    """
    try:
        decoded = [imagehash.hex_to_hash(text) for text in (hash1, hash2)]
        return int(decoded[0] - decoded[1])
    except Exception:
        # Treat undecodable or incomparable hashes as maximally different.
        return 64  # Maximum distance
|