forum scraper: dedup downloads by content hash, not just filename

Mirror of a627388 but for the forum image path. The same image is often re-uploaded under different filenames across pages/posts, so existsSync on the target name can't catch content duplicates. After fetching the buffer, hash the first 64KB and compare against existing same-size files in the target folder (same md5+size signature as gallery's duplicate scanner). Confirmed against a known dani-speegle-2 pair: "skip IMG_79695f8914f20ce38b07.jpg — same content as 72759c89-7e53-4976-839a-7d952c444579.jpg". The size index is built once per job by buildSizeIndex in runForumScrape and threaded through scrapeForumPage → downloadImage; the hash cache amortizes across all pages in the job. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 10:14:23 -05:00
parent a627388a4a
commit 4ba88d96f4
2 changed files with 66 additions and 7 deletions
@@ -2,7 +2,7 @@ import { Router } from 'express';
 import { mkdirSync } from 'fs';
 import { join } from 'path';
 import * as cheerio from 'cheerio';
-import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
+import { scrapeForumPage, getPageUrl, detectMaxPage, buildSizeIndex, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
 import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js';
 import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
 import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
@@ -94,6 +94,7 @@ async function runForumScrape(job) {
  mkdirSync(outputDir, { recursive: true });

  const downloadedSet = new Set();
+  const sizeIndex = buildSizeIndex(outputDir);
  let totalImages = 0;

  // When a siteId is in play, page HTML must be fetched through FlareSolverr —
@@ -140,7 +141,7 @@ async function runForumScrape(job) {

      let count;
      try {
-        count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
+        count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex);
      } catch (err) {
        if (err instanceof CookieExpiredError && siteId) {
          addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
@@ -151,7 +152,7 @@ async function runForumScrape(job) {
            job.config.cookies = cookies;
            job.config.userAgent = userAgent;
            addLog(job, 'Cookies refreshed successfully — retrying page...');
-            count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
+            count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex);
          } catch (refreshErr) {
            addLog(job, `Cookie refresh failed: ${refreshErr.message}`);
            addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually');