forum scraper: dedup downloads by content hash, not just filename

Mirror of a627388 but for the forum image path. The same image is often
re-uploaded under different filenames across pages/posts, so an existsSync
check on the target filename can't catch content duplicates. After fetching the
buffer, hash the first 64KB and compare against existing same-size files
in the target folder (same md5+size signature as gallery's duplicate
scanner). Confirmed against a known dani-speegle-2 pair:

  skip IMG_79695f8914f20ce38b07.jpg — same content as
       72759c89-7e53-4976-839a-7d952c444579.jpg

The size index is built once per job (via buildSizeIndex in runForumScrape)
and threaded through scrapeForumPage → downloadImage; the hash cache is
amortized across all pages in the job.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-05-29 10:14:23 -05:00
parent a627388a4a
commit 4ba88d96f4
2 changed files with 66 additions and 7 deletions
+4 -3
View File
@@ -2,7 +2,7 @@ import { Router } from 'express';
import { mkdirSync } from 'fs';
import { join } from 'path';
import * as cheerio from 'cheerio';
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
import { scrapeForumPage, getPageUrl, detectMaxPage, buildSizeIndex, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js';
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
@@ -94,6 +94,7 @@ async function runForumScrape(job) {
mkdirSync(outputDir, { recursive: true });
const downloadedSet = new Set();
const sizeIndex = buildSizeIndex(outputDir);
let totalImages = 0;
// When a siteId is in play, page HTML must be fetched through FlareSolverr —
@@ -140,7 +141,7 @@ async function runForumScrape(job) {
let count;
try {
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex);
} catch (err) {
if (err instanceof CookieExpiredError && siteId) {
addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
@@ -151,7 +152,7 @@ async function runForumScrape(job) {
job.config.cookies = cookies;
job.config.userAgent = userAgent;
addLog(job, 'Cookies refreshed successfully — retrying page...');
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex);
} catch (refreshErr) {
addLog(job, `Cookie refresh failed: ${refreshErr.message}`);
addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually');