forum scraper: dedup downloads by content hash, not just filename
Mirror of a627388 but for the forum image path. The same image is often
re-uploaded under different filenames across pages/posts, so existsSync
on the target name can't catch content-duplicates. After fetching the
buffer, hash the first 64KB and compare against existing same-size files
in the target folder (same md5+size signature as gallery's duplicate
scanner). Confirmed against a known dani-speegle-2 pair:
skip IMG_79695f8914f20ce38b07.jpg — same content as
72759c89-7e53-4976-839a-7d952c444579.jpg
The size index is built once per job by buildSizeIndex in runForumScrape and
threaded through scrapeForumPage → downloadImage, so the lazily computed hash
cache is amortized across all pages in the job.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+4
-3
@@ -2,7 +2,7 @@ import { Router } from 'express';
|
|||||||
import { mkdirSync } from 'fs';
|
import { mkdirSync } from 'fs';
|
||||||
import { join } from 'path';
|
import { join } from 'path';
|
||||||
import * as cheerio from 'cheerio';
|
import * as cheerio from 'cheerio';
|
||||||
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
|
import { scrapeForumPage, getPageUrl, detectMaxPage, buildSizeIndex, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
|
||||||
import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js';
|
import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js';
|
||||||
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
|
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
|
||||||
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
|
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
|
||||||
@@ -94,6 +94,7 @@ async function runForumScrape(job) {
|
|||||||
mkdirSync(outputDir, { recursive: true });
|
mkdirSync(outputDir, { recursive: true });
|
||||||
|
|
||||||
const downloadedSet = new Set();
|
const downloadedSet = new Set();
|
||||||
|
const sizeIndex = buildSizeIndex(outputDir);
|
||||||
let totalImages = 0;
|
let totalImages = 0;
|
||||||
|
|
||||||
// When a siteId is in play, page HTML must be fetched through FlareSolverr —
|
// When a siteId is in play, page HTML must be fetched through FlareSolverr —
|
||||||
@@ -140,7 +141,7 @@ async function runForumScrape(job) {
|
|||||||
|
|
||||||
let count;
|
let count;
|
||||||
try {
|
try {
|
||||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (err instanceof CookieExpiredError && siteId) {
|
if (err instanceof CookieExpiredError && siteId) {
|
||||||
addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
|
addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
|
||||||
@@ -151,7 +152,7 @@ async function runForumScrape(job) {
|
|||||||
job.config.cookies = cookies;
|
job.config.cookies = cookies;
|
||||||
job.config.userAgent = userAgent;
|
job.config.userAgent = userAgent;
|
||||||
addLog(job, 'Cookies refreshed successfully — retrying page...');
|
addLog(job, 'Cookies refreshed successfully — retrying page...');
|
||||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex);
|
||||||
} catch (refreshErr) {
|
} catch (refreshErr) {
|
||||||
addLog(job, `Cookie refresh failed: ${refreshErr.message}`);
|
addLog(job, `Cookie refresh failed: ${refreshErr.message}`);
|
||||||
addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually');
|
addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually');
|
||||||
|
|||||||
@@ -1,15 +1,54 @@
|
|||||||
import * as cheerio from 'cheerio';
|
import * as cheerio from 'cheerio';
|
||||||
import { createWriteStream, existsSync, mkdirSync, statSync, writeFileSync } from 'fs';
|
import { createReadStream, createWriteStream, existsSync, mkdirSync, readdirSync, statSync, writeFileSync } from 'fs';
|
||||||
import { basename, join, extname } from 'path';
|
import { basename, join, extname } from 'path';
|
||||||
import { pipeline } from 'stream/promises';
|
import { pipeline } from 'stream/promises';
|
||||||
import { execFile } from 'child_process';
|
import { execFile } from 'child_process';
|
||||||
import { promisify } from 'util';
|
import { promisify } from 'util';
|
||||||
|
import { createHash } from 'crypto';
|
||||||
import { upsertMediaFile } from '../db.js';
|
import { upsertMediaFile } from '../db.js';
|
||||||
import { fsGet } from '../flaresolverr.js';
|
import { fsGet } from '../flaresolverr.js';
|
||||||
import { isTurboUrl, downloadTurbo } from './turbo.js';
|
import { isTurboUrl, downloadTurbo } from './turbo.js';
|
||||||
|
|
||||||
const execFileAsync = promisify(execFile);
|
const execFileAsync = promisify(execFile);
|
||||||
|
|
||||||
|
// Same signature as the duplicate scanner in gallery.js and turbo.js:
// md5 over the first 64KB of the file, paired with the exact byte size.
const HASH_BYTES = 65536;

/**
 * Compute the hex md5 of the leading HASH_BYTES bytes of a file.
 *
 * Despite the "Sync" suffix this is asynchronous: callers must await it.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<string>} Hex digest of at most the first HASH_BYTES bytes.
 *   Rejects if the file cannot be opened or read.
 */
async function hashFirst64kSync(filePath) {
  const digest = createHash('md5');
  // `end` is inclusive, so this reads exactly bytes [0, HASH_BYTES).
  const head = createReadStream(filePath, { start: 0, end: HASH_BYTES - 1 });
  for await (const chunk of head) {
    digest.update(chunk);
  }
  return digest.digest('hex');
}
|
||||||
|
|
||||||
|
/**
 * Index the regular files directly inside a folder by their byte size.
 *
 * Each entry starts with `hash: null`; hashes are filled in lazily, only when
 * a size collision makes a content comparison necessary.
 *
 * @param {string} folderPath - Directory to scan (not recursive).
 * @returns {Map<number, Array<{filename: string, path: string, hash: (string|null)}>>}
 *   Size -> entries. Empty when the folder cannot be listed.
 */
export function buildSizeIndex(folderPath) {
  const bySize = new Map();

  let names;
  try {
    names = readdirSync(folderPath);
  } catch {
    // Unreadable or missing folder: nothing to dedup against.
    return bySize;
  }

  const visible = names.filter((entryName) => !entryName.startsWith('.'));
  for (const filename of visible) {
    const path = join(folderPath, filename);
    try {
      const info = statSync(path);
      if (info.isFile()) {
        const bucket = bySize.get(info.size);
        const entry = { filename, path, hash: null };
        if (bucket) {
          bucket.push(entry);
        } else {
          bySize.set(info.size, [entry]);
        }
      }
    } catch {
      // Best-effort: an entry that cannot be stat'ed is left out of the index.
    }
  }

  return bySize;
}
|
||||||
|
|
||||||
|
/**
 * Return the candidate's content hash, computing and caching it on first use.
 *
 * The result is stored on `c.hash`, so each candidate is hashed at most once.
 * If hashing fails, the empty string is cached instead; it is non-null, so no
 * further attempt is made for that candidate.
 *
 * @param {{path: string, hash: (string|null)}} c - Index entry; mutated in place.
 * @returns {Promise<string>} The cached hash, or '' if the file could not be hashed.
 */
async function ensureCandidateHash(c) {
  if (c.hash == null) {
    let digest = '';
    try {
      digest = await hashFirst64kSync(c.path);
    } catch {
      // Best-effort: leave digest as '' to mark this candidate as unhashable.
    }
    c.hash = digest;
  }
  return c.hash;
}
|
||||||
|
|
||||||
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
|
||||||
const SERVER_IP = '47.185.183.191';
|
const SERVER_IP = '47.185.183.191';
|
||||||
|
|
||||||
@@ -150,7 +189,7 @@ function tryFullSizeUrl(thumbUrl) {
|
|||||||
return candidates;
|
return candidates;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent) {
|
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent, sizeIndex) {
|
||||||
if (downloadedSet.has(url)) return false;
|
if (downloadedSet.has(url)) return false;
|
||||||
if (!isImageUrl(url)) return false;
|
if (!isImageUrl(url)) return false;
|
||||||
const lower = url.toLowerCase();
|
const lower = url.toLowerCase();
|
||||||
@@ -183,11 +222,30 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, user
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Content-hash dedup: same image often re-uploaded under different names.
|
||||||
|
let newHash = null;
|
||||||
|
if (sizeIndex && sizeIndex.has(buf.length)) {
|
||||||
|
const head = buf.subarray(0, Math.min(buf.length, HASH_BYTES));
|
||||||
|
newHash = createHash('md5').update(head).digest('hex');
|
||||||
|
for (const cand of sizeIndex.get(buf.length)) {
|
||||||
|
const ch = await ensureCandidateHash(cand);
|
||||||
|
if (ch && ch === newHash) {
|
||||||
|
logFn(`skip ${filename} — same content as ${cand.filename}`);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
writeFileSync(filepath, buf);
|
writeFileSync(filepath, buf);
|
||||||
const savedName = basename(filepath);
|
const savedName = basename(filepath);
|
||||||
const folderName = basename(outputDir);
|
const folderName = basename(outputDir);
|
||||||
try { upsertMediaFile(folderName, savedName, 'image', buf.length, Date.now(), null); } catch {}
|
try { upsertMediaFile(folderName, savedName, 'image', buf.length, Date.now(), null); } catch {}
|
||||||
|
|
||||||
|
if (sizeIndex) {
|
||||||
|
if (!sizeIndex.has(buf.length)) sizeIndex.set(buf.length, []);
|
||||||
|
sizeIndex.get(buf.length).push({ filename: savedName, path: filepath, hash: newHash });
|
||||||
|
}
|
||||||
|
|
||||||
const sizeKb = (buf.length / 1024).toFixed(1);
|
const sizeKb = (buf.length / 1024).toFixed(1);
|
||||||
logFn(`Downloaded: ${savedName} (${sizeKb} KB)`);
|
logFn(`Downloaded: ${savedName} (${sizeKb} KB)`);
|
||||||
return true;
|
return true;
|
||||||
@@ -273,7 +331,7 @@ async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn, us
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession) {
|
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession, sizeIndex) {
|
||||||
logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`);
|
logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`);
|
||||||
|
|
||||||
let html;
|
let html;
|
||||||
@@ -403,7 +461,7 @@ export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn,
|
|||||||
|
|
||||||
// Download images
|
// Download images
|
||||||
for (const imgUrl of imageUrls) {
|
for (const imgUrl of imageUrls) {
|
||||||
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent)) {
|
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent, sizeIndex)) {
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user