Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so a direct Node fetch returns 403 even with valid cookies. Page HTML for any forum_site with stored cookies is now fetched via a FlareSolverr browser session that is opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those cookies seed undetected_chromedriver, Turnstile auto-solves in the real browser, the login form is submitted, and the final cookies + browser UA are persisted to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as a proper CSRF POST to /search/search followed by a results-page parse, replacing the broken ?q=... GET that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering the page via FlareSolverr and grabbing the signed mp4 from the resolved <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+64
-25
@@ -5,6 +5,8 @@ import { pipeline } from 'stream/promises';
|
||||
import { execFile } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { upsertMediaFile } from '../db.js';
|
||||
import { fsGet } from '../flaresolverr.js';
|
||||
import { isTurboUrl, downloadTurbo } from './turbo.js';
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
@@ -20,11 +22,13 @@ export class CookieExpiredError extends Error {
|
||||
}
|
||||
|
||||
// Replace DDoS-Guard __ddg9_ cookie IP with server's IP so cookies work from any browser
|
||||
function fixCookieIp(cookies) {
|
||||
export function fixCookieIp(cookies) {
|
||||
if (!cookies) return cookies;
|
||||
return cookies.replace(/__ddg9_=[^;]+/, `__ddg9_=${SERVER_IP}`);
|
||||
}
|
||||
|
||||
export const FORUM_UA = UA;
|
||||
|
||||
const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']);
|
||||
const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v', '.wmv', '.flv', '.ts']);
|
||||
const SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star', 'dc_thumbnails'];
|
||||
@@ -69,13 +73,20 @@ export function getPageUrl(baseUrl, pageNum) {
|
||||
return url.split('#')[0];
|
||||
}
|
||||
|
||||
export async function detectMaxPage(baseUrl, logFn, cookies) {
|
||||
export async function detectMaxPage(baseUrl, logFn, cookies, userAgent, fsSession) {
|
||||
try {
|
||||
const headers = { 'User-Agent': UA };
|
||||
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
|
||||
const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) });
|
||||
if (!resp.ok) return null;
|
||||
const html = await resp.text();
|
||||
let html;
|
||||
if (fsSession) {
|
||||
const r = await fsGet(fsSession, baseUrl, cookies);
|
||||
if (r.status !== 200) return null;
|
||||
html = r.html;
|
||||
} else {
|
||||
const headers = { 'User-Agent': userAgent || UA };
|
||||
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
|
||||
const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) });
|
||||
if (!resp.ok) return null;
|
||||
html = await resp.text();
|
||||
}
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
let maxPage = 1;
|
||||
@@ -91,6 +102,17 @@ export async function detectMaxPage(baseUrl, logFn, cookies) {
|
||||
if (n > maxPage && n < 10000) maxPage = n;
|
||||
}
|
||||
});
|
||||
// Final fallback: scan raw HTML for any page-N references (XenForo's
|
||||
// serialized pagination sometimes only appears in href attributes that
|
||||
// cheerio's class-based selectors miss).
|
||||
if (maxPage === 1) {
|
||||
const re = /page-(\d+)/g;
|
||||
let m;
|
||||
while ((m = re.exec(html)) !== null) {
|
||||
const n = parseInt(m[1], 10);
|
||||
if (n > maxPage && n < 10000) maxPage = n;
|
||||
}
|
||||
}
|
||||
|
||||
if (maxPage > 1) {
|
||||
logFn(`Detected ${maxPage} pages`);
|
||||
@@ -123,7 +145,7 @@ function tryFullSizeUrl(thumbUrl) {
|
||||
return candidates;
|
||||
}
|
||||
|
||||
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
|
||||
async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent) {
|
||||
if (downloadedSet.has(url)) return false;
|
||||
if (!isImageUrl(url)) return false;
|
||||
const lower = url.toLowerCase();
|
||||
@@ -142,7 +164,7 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
|
||||
}
|
||||
|
||||
try {
|
||||
const dlHeaders = { 'User-Agent': UA };
|
||||
const dlHeaders = { 'User-Agent': userAgent || UA };
|
||||
if (cookies) dlHeaders['Cookie'] = fixCookieIp(cookies);
|
||||
const resp = await fetch(url, { headers: dlHeaders, signal: AbortSignal.timeout(30000) });
|
||||
if (!resp.ok) {
|
||||
@@ -171,10 +193,16 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) {
|
||||
}
|
||||
|
||||
// Use gallery-dl to download from external hosts (bunkr, saint, cyberdrop, etc.)
|
||||
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) {
|
||||
async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn, userAgent, fsSession) {
|
||||
if (downloadedSet.has(url)) return 0;
|
||||
downloadedSet.add(url);
|
||||
|
||||
// turbo.cr uses an obfuscated WASM player — gallery-dl can't extract the
|
||||
// signed mp4 URL. Resolve via FlareSolverr (renders JS) instead.
|
||||
if (isTurboUrl(url)) {
|
||||
return await downloadTurbo(url, outputDir, logFn, userAgent, fsSession);
|
||||
}
|
||||
|
||||
logFn(`Resolving via gallery-dl: ${url}`);
|
||||
|
||||
try {
|
||||
@@ -240,23 +268,34 @@ async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) {
|
||||
}
|
||||
}
|
||||
|
||||
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies) {
|
||||
logFn(`Fetching page: ${pageUrl}`);
|
||||
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession) {
|
||||
logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`);
|
||||
|
||||
let html;
|
||||
try {
|
||||
const headers = { 'User-Agent': UA };
|
||||
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
|
||||
const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) });
|
||||
if (!resp.ok) {
|
||||
// SimpCity returns 404 for expired sessions, 403 for blocked
|
||||
if (cookies && (resp.status === 404 || resp.status === 403)) {
|
||||
throw new CookieExpiredError(resp.status);
|
||||
if (fsSession) {
|
||||
const r = await fsGet(fsSession, pageUrl, cookies);
|
||||
if (r.status !== 200) {
|
||||
if (cookies && (r.status === 404 || r.status === 403)) {
|
||||
throw new CookieExpiredError(r.status);
|
||||
}
|
||||
logFn(`Failed to fetch page (${r.status})`);
|
||||
return 0;
|
||||
}
|
||||
logFn(`Failed to fetch page (${resp.status})`);
|
||||
return 0;
|
||||
html = r.html;
|
||||
} else {
|
||||
const headers = { 'User-Agent': userAgent || UA };
|
||||
if (cookies) headers['Cookie'] = fixCookieIp(cookies);
|
||||
const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) });
|
||||
if (!resp.ok) {
|
||||
if (cookies && (resp.status === 404 || resp.status === 403)) {
|
||||
throw new CookieExpiredError(resp.status);
|
||||
}
|
||||
logFn(`Failed to fetch page (${resp.status})`);
|
||||
return 0;
|
||||
}
|
||||
html = await resp.text();
|
||||
}
|
||||
html = await resp.text();
|
||||
} catch (err) {
|
||||
if (err instanceof CookieExpiredError) throw err;
|
||||
logFn(`Failed to fetch page: ${err.message}`);
|
||||
@@ -359,14 +398,14 @@ export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn,
|
||||
|
||||
// Download images
|
||||
for (const imgUrl of imageUrls) {
|
||||
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies)) {
|
||||
if (await downloadImage(imgUrl, outputDir, downloadedSet, logFn, cookies, userAgent)) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
// Download from external hosts via gallery-dl
|
||||
// Download from external hosts (turbo.cr handled via FlareSolverr; rest via gallery-dl)
|
||||
for (const extUrl of externalUrls) {
|
||||
const dlCount = await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn);
|
||||
const dlCount = await downloadFromExternalHost(extUrl, outputDir, downloadedSet, logFn, userAgent, fsSession);
|
||||
count += dlCount;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user