"""Image deduplication utilities using perceptual hashing.""" from typing import Optional import imagehash from PIL import Image as PILImage def calculate_phash(image_path: str) -> Optional[str]: """ Calculate perceptual hash for an image. Args: image_path: Path to image file Returns: Hex string of perceptual hash, or None if failed """ try: with PILImage.open(image_path) as img: return str(imagehash.phash(img)) except Exception: return None def calculate_dhash(image_path: str) -> Optional[str]: """ Calculate difference hash for an image. Faster but less accurate than phash. Args: image_path: Path to image file Returns: Hex string of difference hash, or None if failed """ try: with PILImage.open(image_path) as img: return str(imagehash.dhash(img)) except Exception: return None def hashes_are_similar(hash1: str, hash2: str, threshold: int = 10) -> bool: """ Check if two hashes are similar (potential duplicates). Args: hash1: First hash string hash2: Second hash string threshold: Maximum Hamming distance (default 10) Returns: True if hashes are similar """ try: h1 = imagehash.hex_to_hash(hash1) h2 = imagehash.hex_to_hash(hash2) return (h1 - h2) <= threshold except Exception: return False def hamming_distance(hash1: str, hash2: str) -> int: """ Calculate Hamming distance between two hashes. Args: hash1: First hash string hash2: Second hash string Returns: Hamming distance (0 = identical, higher = more different) """ try: h1 = imagehash.hex_to_hash(hash1) h2 = imagehash.hex_to_hash(hash2) return int(h1 - h2) except Exception: return 64 # Maximum distance