diff --git a/server/scrape.js b/server/scrape.js index 20f70ad..bf93a88 100644 --- a/server/scrape.js +++ b/server/scrape.js @@ -2,7 +2,7 @@ import { Router } from 'express'; import { mkdirSync } from 'fs'; import { join } from 'path'; import * as cheerio from 'cheerio'; -import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js'; +import { scrapeForumPage, getPageUrl, detectMaxPage, buildSizeIndex, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js'; import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js'; import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js'; import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js'; @@ -94,6 +94,7 @@ async function runForumScrape(job) { mkdirSync(outputDir, { recursive: true }); const downloadedSet = new Set(); + const sizeIndex = buildSizeIndex(outputDir); let totalImages = 0; // When a siteId is in play, page HTML must be fetched through FlareSolverr — @@ -140,7 +141,7 @@ async function runForumScrape(job) { let count; try { - count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession); + count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex); } catch (err) { if (err instanceof CookieExpiredError && siteId) { addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`); @@ -151,7 +152,7 @@ async function runForumScrape(job) { job.config.cookies = cookies; job.config.userAgent = userAgent; addLog(job, 'Cookies refreshed successfully — retrying page...'); - count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession); + count = await scrapeForumPage(pageUrl, 
outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession, sizeIndex); } catch (refreshErr) { addLog(job, `Cookie refresh failed: ${refreshErr.message}`); addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually'); diff --git a/server/scrapers/forum.js b/server/scrapers/forum.js index df8d69f..6c06091 100644 --- a/server/scrapers/forum.js +++ b/server/scrapers/forum.js @@ -1,15 +1,54 @@ import * as cheerio from 'cheerio'; -import { createWriteStream, existsSync, mkdirSync, statSync, writeFileSync } from 'fs'; +import { createReadStream, createWriteStream, existsSync, mkdirSync, readdirSync, statSync, writeFileSync } from 'fs'; import { basename, join, extname } from 'path'; import { pipeline } from 'stream/promises'; import { execFile } from 'child_process'; import { promisify } from 'util'; +import { createHash } from 'crypto'; import { upsertMediaFile } from '../db.js'; import { fsGet } from '../flaresolverr.js'; import { isTurboUrl, downloadTurbo } from './turbo.js'; const execFileAsync = promisify(execFile); +// Match the duplicate scanner in gallery.js and turbo.js — md5 of first 64KB + exact size. +const HASH_BYTES = 65536; + +function hashFirst64kSync(filePath) { /* NOTE: asynchronous despite the "Sync" suffix. Returns a Promise that resolves to the hex MD5 digest of the bytes read from filePath and rejects when the read stream emits 'error'. */ + return new Promise((resolve, reject) => { + const hash = createHash('md5'); + const s = createReadStream(filePath, { start: 0, end: HASH_BYTES - 1 }); /* the end offset is inclusive, so at most HASH_BYTES bytes are hashed; shorter files are hashed in full */ + s.on('data', (c) => hash.update(c)); + s.on('end', () => resolve(hash.digest('hex'))); + s.on('error', reject); + }); +} + +// Build size -> [{filename, path, hash:null}] index for the folder. Hashes are +// computed lazily only when a size collision is found. 
+export function buildSizeIndex(folderPath) { /* Lists only the direct entries of folderPath (no recursion) and returns a Map keyed by file size in bytes; each value is an array of { filename, path, hash } records with hash initialised to null. */ + const idx = new Map(); + let entries; + try { entries = readdirSync(folderPath); } catch { return idx; } /* a folder that cannot be listed yields an empty index instead of throwing */ + for (const name of entries) { + if (name.startsWith('.')) continue; /* dot-prefixed names are never indexed */ + const p = join(folderPath, name); + try { + const st = statSync(p); + if (!st.isFile()) continue; /* skip anything statSync does not report as a file */ + if (!idx.has(st.size)) idx.set(st.size, []); + idx.get(st.size).push({ filename: name, path: p, hash: null }); + } catch {} /* best effort: an entry whose stat throws is simply left out of the index */ + } + return idx; +} + +async function ensureCandidateHash(c) { /* Memoises the first-64KB hash on the index record c: computes it on first use, stores it in c.hash, and returns the stored value on later calls. */ + if (c.hash != null) return c.hash; /* loose != null: only null or undefined trigger hashing, so a cached '' is returned as-is */ + try { c.hash = await hashFirst64kSync(c.path); } catch { c.hash = ''; } /* '' records a failed read so the file is not re-read; the dedup check in downloadImage treats this falsy value as no match */ + return c.hash; +} + const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'; const SERVER_IP = '47.185.183.191'; @@ -150,7 +189,7 @@ function tryFullSizeUrl(thumbUrl) { return candidates; } -async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent) { +async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent, sizeIndex) { if (downloadedSet.has(url)) return false; if (!isImageUrl(url)) return false; const lower = url.toLowerCase(); @@ -183,11 +222,30 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, user return false; } + // Content-hash dedup: same image often re-uploaded under different names. 
+ let newHash = null; + if (sizeIndex && sizeIndex.has(buf.length)) { + const head = buf.subarray(0, Math.min(buf.length, HASH_BYTES)); + newHash = createHash('md5').update(head).digest('hex'); + for (const cand of sizeIndex.get(buf.length)) { + const ch = await ensureCandidateHash(cand); + if (ch && ch === newHash) { + logFn(`skip ${filename} — same content as ${cand.filename}`); + return false; + } + } + } + writeFileSync(filepath, buf); const savedName = basename(filepath); const folderName = basename(outputDir); try { upsertMediaFile(folderName, savedName, 'image', buf.length, Date.now(), null); } catch {} + if (sizeIndex) { + if (!sizeIndex.has(buf.length)) sizeIndex.set(buf.length, []); + sizeIndex.get(buf.length).push({ filename: savedName, path: filepath, hash: newHash }); + } + const sizeKb = (buf.length / 1024).toFixed(1); logFn(`Downloaded: ${savedName} (${sizeKb} KB)`); return true; @@ -273,7 +331,7 @@ async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn, us } } -export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession) { +export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession, sizeIndex) { logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`); let html; @@ -403,7 +461,7 @@ export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, // Download images for (const imgUrl of imageUrls) { - if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent)) { + if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent, sizeIndex)) { count++; } }