forum scraper: dedup downloads by content hash, not just filename
Mirrors a627388, but for the forum image path. The same image is often
re-uploaded under different filenames across pages/posts, so an existsSync
check on the target name can't catch content duplicates. After fetching the
buffer, hash the first 64KB and compare against existing same-size files
in the target folder (same md5+size signature as gallery's duplicate
scanner). Confirmed against a known dani-speegle-2 pair:
skip IMG_79695f8914f20ce38b07.jpg — same content as
72759c89-7e53-4976-839a-7d952c444579.jpg
The size index is built once per job by buildSizeIndex in runForumScrape and
threaded through scrapeForumPage → downloadImage, so the hash cache is
amortized across all pages in the job.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+4
-3
@@ -2,7 +2,7 @@ import { Router } from 'express';
|
||||
import { mkdirSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
|
||||
import { scrapeForumPage, getPageUrl, detectMaxPage, buildSizeIndex, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
|
||||
import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js';
|
||||
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
|
||||
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
|
||||
@@ -94,6 +94,7 @@ async function runForumScrape(job) {
|
||||
mkdirSync(outputDir, { recursive: true });
|
||||
|
||||
const downloadedSet = new Set();
|
||||
const sizeIndex = buildSizeIndex(outputDir);
|
||||
let totalImages = 0;
|
||||
|
||||
// When a siteId is in play, page HTML must be fetched through FlareSolverr —
|
||||
@@ -140,7 +141,7 @@ async function runForumScrape(job) {
|
||||
|
||||
let count;
|
||||
try {
|
||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex);
|
||||
} catch (err) {
|
||||
if (err instanceof CookieExpiredError && siteId) {
|
||||
addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
|
||||
@@ -151,7 +152,7 @@ async function runForumScrape(job) {
|
||||
job.config.cookies = cookies;
|
||||
job.config.userAgent = userAgent;
|
||||
addLog(job, 'Cookies refreshed successfully — retrying page...');
|
||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex);
|
||||
} catch (refreshErr) {
|
||||
addLog(job, `Cookie refresh failed: ${refreshErr.message}`);
|
||||
addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually');
|
||||
|
||||
Reference in New Issue
Block a user