Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so a direct Node fetch returns 403 even with valid cookies. Page HTML for any forum_site with stored cookies is now fetched via a FlareSolverr browser session that is opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those cookies seed undetected_chromedriver, Turnstile auto-solves in the real browser, the login form is submitted, and the final cookies + browser UA are persisted to forum_sites.
- Per-site user_agent column so that subsequent scraper requests match the UA the cookies were issued for (DDoS-Guard rejects UA mismatches).
- XenForo search rewritten as a proper CSRF POST to /search/search followed by a parse of the results page, replacing the broken ?q=... GET that only returned the search form.
- Pagination regex fallback in detectMaxPage catches XenForo pages that cheerio's class-based selectors miss.
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering the page via FlareSolverr and grabbing the signed mp4 from the resolved <video src> attribute (gallery-dl can't extract these because of the obfuscated WASM).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+144
-6
@@ -1,8 +1,9 @@
|
||||
import { Router } from 'express';
|
||||
import { mkdirSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError } from './scrapers/forum.js';
|
||||
import { refreshForumCookies } from './flaresolverr.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js';
|
||||
import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js';
|
||||
import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js';
|
||||
import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js';
|
||||
import { parseMegaUrl, listAllFiles, downloadMegaFiles } from './scrapers/mega.js';
|
||||
@@ -75,14 +76,17 @@ function jobToJson(job) {
|
||||
async function runForumScrape(job) {
|
||||
let { url, startPage, endPage, delay, folderName, siteId, lastPageOnly } = job.config;
|
||||
let { cookies } = job.config;
|
||||
let userAgent = job.config.userAgent || '';
|
||||
|
||||
// Load cookies from forum site record if siteId provided and no cookies passed
|
||||
if (!cookies && siteId) {
|
||||
const site = getForumSiteById(siteId);
|
||||
if (site && site.cookies) {
|
||||
cookies = site.cookies;
|
||||
userAgent = site.user_agent || userAgent;
|
||||
job.config.cookies = cookies;
|
||||
addLog(job, `Loaded cookies from forum site: ${site.name}`);
|
||||
job.config.userAgent = userAgent;
|
||||
addLog(job, `Loaded cookies from forum site: ${site.name}${userAgent ? ` (UA pinned)` : ''}`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,10 +96,24 @@ async function runForumScrape(job) {
|
||||
const downloadedSet = new Set();
|
||||
let totalImages = 0;
|
||||
|
||||
// When a siteId is in play, page HTML must be fetched through FlareSolverr —
|
||||
// direct fetch hits DDoS-Guard's browser-fingerprint check and gets 403.
|
||||
let fsSession = null;
|
||||
if (siteId && cookies) {
|
||||
try {
|
||||
const baseHost = new URL(url).origin;
|
||||
addLog(job, `Opening FlareSolverr session for ${baseHost}...`);
|
||||
fsSession = await fsCreateSession(baseHost + '/');
|
||||
addLog(job, `FlareSolverr session ready (${fsSession.slice(0, 8)}...)`);
|
||||
} catch (e) {
|
||||
addLog(job, `FlareSolverr session failed (${e.message}) — falling back to direct fetch`);
|
||||
}
|
||||
}
|
||||
|
||||
// If lastPageOnly, detect the last page and only scrape that
|
||||
if (lastPageOnly) {
|
||||
addLog(job, 'Detecting last page...');
|
||||
const maxPage = await detectMaxPage(url, (msg) => addLog(job, msg), cookies);
|
||||
const maxPage = await detectMaxPage(url, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
||||
if (maxPage) {
|
||||
startPage = maxPage;
|
||||
endPage = maxPage;
|
||||
@@ -122,15 +140,18 @@ async function runForumScrape(job) {
|
||||
|
||||
let count;
|
||||
try {
|
||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies);
|
||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
||||
} catch (err) {
|
||||
if (err instanceof CookieExpiredError && siteId) {
|
||||
addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`);
|
||||
try {
|
||||
cookies = await refreshForumCookies(siteId);
|
||||
const refreshed = getForumSiteById(siteId);
|
||||
userAgent = refreshed?.user_agent || userAgent;
|
||||
job.config.cookies = cookies;
|
||||
job.config.userAgent = userAgent;
|
||||
addLog(job, 'Cookies refreshed successfully — retrying page...');
|
||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies);
|
||||
count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession);
|
||||
} catch (refreshErr) {
|
||||
addLog(job, `Cookie refresh failed: ${refreshErr.message}`);
|
||||
addLog(job, 'Stopping scrape — fix credentials or refresh cookies manually');
|
||||
@@ -156,6 +177,10 @@ async function runForumScrape(job) {
|
||||
addLog(job, `Error: ${err.message}`);
|
||||
job.progress.errors++;
|
||||
} finally {
|
||||
if (fsSession) {
|
||||
await fsDestroySession(fsSession);
|
||||
addLog(job, `FlareSolverr session closed`);
|
||||
}
|
||||
job.running = false;
|
||||
job.completedAt = new Date().toISOString();
|
||||
addLog(job, `Done! ${totalImages} files saved to ${folderName}/`);
|
||||
@@ -653,6 +678,119 @@ router.post('/api/scrape/forum/detect-pages', async (req, res) => {
|
||||
res.json({ maxPage, logs });
|
||||
});
|
||||
|
||||
// Search a forum site for threads matching a query, return preview images per thread
|
||||
// Substrings that mark an image URL as forum chrome (avatars, smilies, rank
// badges, ...) rather than thread content. Matching is a plain, lowercase
// `includes` test against the absolute URL. Frozen so that no request handler
// can accidentally mutate the shared list.
const SEARCH_SKIP_PATTERNS = Object.freeze([
  'avatar',
  'smilie',
  'emoji',
  'icon',
  'logo',
  'button',
  'sprite',
  'badge',
  'rank',
  'star',
]);

// Recognises direct image URLs by file extension. The extension must be
// followed by a query string, a fragment, or the end of the URL, so that
// "photo.jpg?x=1" and "photo.jpg#top" match while "photo.jpgx" does not.
const PREVIEW_IMG_EXTS = /\.(jpg|jpeg|png|webp|gif)(\?|#|$)/i;
|
||||
|
||||
// Upper bounds for the client-supplied limits. Every thread costs at least one
// extra FlareSolverr page load, so unbounded values would tie up the browser.
const SEARCH_THREADS_CAP = 50;
const SEARCH_PREVIEWS_CAP = 100;

/**
 * Coerce a client-supplied count into a whole number within [0, max].
 *
 * @param {*} value - Raw value from the request body.
 * @param {number} fallback - Used when the value is missing or not numeric.
 * @param {number} max - Inclusive upper bound.
 * @returns {number} A non-negative integer no larger than `max`.
 */
function clampSearchLimit(value, fallback, max) {
  if (value == null || value === '') return fallback;
  const n = Math.floor(Number(value));
  if (!Number.isFinite(n)) return fallback;
  return Math.min(Math.max(n, 0), max);
}

/**
 * Extract the unique thread links from a XenForo search-results page.
 *
 * @param {string} html - Markup of the results page.
 * @param {string} baseUrl - Site origin without a trailing slash.
 * @returns {{threadUrl: string, title: string}[]} Threads in page order.
 */
function parseForumSearchThreads(html, baseUrl) {
  const $ = cheerio.load(html);
  const seen = new Set();
  const threads = [];

  // XenForo renders each hit as <h3 class="contentRow-title"><a href="/threads/...">.
  $('h3.contentRow-title a[href*="/threads/"]').each((_, el) => {
    const $a = $(el);
    let href;
    try {
      href = new URL($a.attr('href'), baseUrl).href;
    } catch {
      return; // unparsable href: skip this hit
    }

    // Reduce post/page permalinks to the thread root ("slug.123/").
    const m = href.match(/\/threads\/([^\/]+\.\d+)\//);
    if (!m) return;
    const threadRoot = `${baseUrl}/threads/${m[1]}/`;
    if (seen.has(threadRoot)) return;
    seen.add(threadRoot);

    const title = $a.text().replace(/\s+/g, ' ').trim();
    if (!title || title.length < 3) return;
    threads.push({ threadUrl: threadRoot, title });
  });

  return threads;
}

/**
 * Collect de-duplicated candidate preview image URLs from a thread page.
 *
 * @param {string} html - Markup of the thread page.
 * @param {string} pageUrl - URL the markup was fetched from; base for relative links.
 * @returns {string[]} Absolute image URLs in document order.
 */
function collectThreadPreviewUrls(html, pageUrl) {
  const $p = cheerio.load(html);
  const imgUrls = [];

  $p('.message-body img, .bbWrapper img').each((_, el) => {
    const $img = $p(el);
    const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url');
    if (!src) return;

    let absSrc;
    try {
      absSrc = new URL(src, pageUrl).href;
    } catch {
      return; // unparsable src: skip this image
    }

    const lower = absSrc.toLowerCase();
    if (SEARCH_SKIP_PATTERNS.some((p) => lower.includes(p))) return;

    // A thumbnail wrapped in a link to an image file: prefer the link target.
    const $parentA = $img.closest('a');
    if ($parentA.length && $parentA.attr('href')) {
      try {
        const aHref = new URL($parentA.attr('href'), pageUrl).href;
        if (PREVIEW_IMG_EXTS.test(aHref)) {
          imgUrls.push(aHref);
          return;
        }
      } catch {
        // Unparsable link target: fall through and use the <img> source.
      }
    }

    // Drop the ".th." / ".md." thumbnail infixes to reach the larger variant.
    const upgraded = absSrc.replace('.th.', '.').replace('.md.', '.');
    if (PREVIEW_IMG_EXTS.test(upgraded) || /\/data\/attachments|proxy\.php/.test(upgraded)) {
      imgUrls.push(upgraded);
    }
  });

  return [...new Set(imgUrls)];
}

/**
 * POST /api/scrape/forum/search
 *
 * Searches a stored forum site for threads matching `query` and returns, for
 * the first `maxThreads` hits, up to `previewsPerThread` image URLs taken from
 * each thread's last page. All page loads go through one FlareSolverr session
 * that is destroyed before the handler finishes.
 *
 * Body: { query, siteId = 2, maxThreads = 5, previewsPerThread = 4, titleOnly = true }
 *   NOTE(review): the default siteId of 2 is hard-coded; confirm it is the intended site row.
 *
 * Responses:
 *   200 { query, results: [{ threadUrl, title, lastPageUrl, lastPageNum, previews, error? }] }
 *   400 missing query / site has no cookies, 404 unknown site,
 *   503 no CSRF token on the search form, 500 unexpected failure,
 *   upstream 4xx/5xx (else 502) when the search form cannot be fetched.
 */
router.post('/api/scrape/forum/search', async (req, res) => {
  // req.body is undefined when no body was parsed; destructuring it would throw
  // outside the try block and leave the request hanging.
  const { query, siteId = 2, maxThreads = 5, previewsPerThread = 4, titleOnly = true } = req.body ?? {};

  const keywords = typeof query === 'string' || typeof query === 'number' ? String(query).trim() : '';
  if (!keywords) return res.status(400).json({ error: 'query is required' });

  const threadLimit = clampSearchLimit(maxThreads, 5, SEARCH_THREADS_CAP);
  const previewLimit = clampSearchLimit(previewsPerThread, 4, SEARCH_PREVIEWS_CAP);

  const site = getForumSiteById(siteId);
  if (!site) return res.status(404).json({ error: `Forum site ${siteId} not found` });
  if (!site.cookies) return res.status(400).json({ error: 'Forum site has no cookies — refresh first' });

  const baseUrl = (site.base_url || 'https://simpcity.cr').replace(/\/$/, '');
  const cookies = site.cookies;
  // Use the UA stored with the cookies, as the forum scrape job does.
  const userAgent = site.user_agent || '';

  let fsSession = null;
  try {
    fsSession = await fsCreateSession(baseUrl + '/');

    // Step 1: GET search form to grab the XenForo CSRF token
    const formRes = await fsGet(fsSession, baseUrl + '/search/', cookies);
    if (formRes.status !== 200) {
      // Only forward a genuine error status; anything else is reported as a bad gateway.
      const upstream = Number(formRes.status);
      const status = Number.isInteger(upstream) && upstream >= 400 && upstream <= 599 ? upstream : 502;
      return res.status(status).json({ error: `Search form fetch failed: HTTP ${formRes.status}` });
    }
    const xfMatch = String(formRes.html ?? '').match(/name="_xfToken"\s+value="([^"]+)"/);
    if (!xfMatch) {
      return res.status(503).json({ error: 'No _xfToken on search form — cookies likely expired. Refresh via /api/flaresolverr/refresh/' + siteId });
    }
    const xfToken = xfMatch[1];

    // Step 2: POST the search; XenForo redirects to /search/<id>/ with results
    const postBody = new URLSearchParams({
      keywords,
      'c[title_only]': titleOnly ? '1' : '',
      'c[users]': '',
      _xfToken: xfToken,
    });
    const postRes = await fsPost(fsSession, baseUrl + '/search/search', cookies, postBody.toString());

    const threads = parseForumSearchThreads(postRes.html, baseUrl);
    if (threads.length === 0) {
      return res.json({ query, results: [] });
    }

    // For top N threads, fetch last page and pull preview image URLs
    const results = [];
    for (const t of threads.slice(0, threadLimit)) {
      try {
        const maxPage = await detectMaxPage(t.threadUrl, () => {}, cookies, userAgent, fsSession);
        const lastPageUrl = maxPage && maxPage > 1 ? `${t.threadUrl}page-${maxPage}` : t.threadUrl;
        const pageRes = await fsGet(fsSession, lastPageUrl, cookies);
        if (pageRes.status !== 200) {
          results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews: [], error: `HTTP ${pageRes.status}` });
          continue;
        }

        const unique = collectThreadPreviewUrls(pageRes.html, lastPageUrl);
        // slice(-0) is slice(0) and would return everything, so handle 0 explicitly.
        const previews = previewLimit > 0 ? unique.slice(-previewLimit) : [];
        results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews });
      } catch (err) {
        // One failing thread must not sink the whole search.
        results.push({ ...t, previews: [], error: err.message });
      }
    }

    res.json({ query, results });
  } catch (err) {
    console.error('[scrape/forum/search]', err);
    if (!res.headersSent) {
      res.status(500).json({ error: err.message });
    }
  } finally {
    if (fsSession) {
      try {
        await fsDestroySession(fsSession);
      } catch (closeErr) {
        // The response has already been sent; a failed cleanup is only logged.
        console.error('[scrape/forum/search] failed to close FlareSolverr session', closeErr);
      }
    }
  }
});
|
||||
|
||||
// --- Forum Sites CRUD ---
|
||||
|
||||
router.get('/api/scrape/forum-sites', (_req, res) => {
|
||||
|
||||
Reference in New Issue
Block a user