Files
OFApp/server/scrapers/turbo.js
T
Trey T aa4f1157d1 Route SimpCity forum scraping through FlareSolverr + add turbo.cr resolver
DDoS-Guard now binds session cookies to the issuing browser's fingerprint, so
direct Node fetch returns 403 even with valid cookies. Page HTML for any
forum_site with stored cookies is now fetched via a FlareSolverr browser
session opened once per scrape job.

- Hybrid cookie refresh: FlareSolverr clears the DDoS-Guard captcha, those
  cookies seed undetected_chromedriver, Turnstile auto-solves in the real
  browser, login form submits, final cookies + browser UA persist to forum_sites
- Per-site user_agent column so subsequent scraper requests match the UA the
  cookies were issued for (DDoS-Guard rejects UA mismatches)
- XenForo search rewritten as proper CSRF POST /search/search → results page
  parse, replacing the broken ?q=... GET that only returned the search form
- Pagination regex fallback in detectMaxPage catches XenForo pages that
  cheerio's class-based selectors miss
- New scrapers/turbo.js handles turbo.cr /embed/ and /a/ URLs by rendering
  the page via FlareSolverr and grabbing the signed mp4 from the resolved
  <video src> attribute (gallery-dl can't extract these — obfuscated WASM)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-29 19:33:54 -05:00

139 lines
4.6 KiB
JavaScript

import { createWriteStream, existsSync, renameSync, rmSync, writeFileSync } from 'fs';
import { join, basename } from 'path';
import { Readable } from 'stream';
import { pipeline } from 'stream/promises';
import { fsCreateSession, fsDestroySession, fsGet } from '../flaresolverr.js';
import { upsertMediaFile } from '../db.js';
// Matches http(s) URLs whose host is `turbo.<anything>` (optionally prefixed
// with `www.`), case-insensitively. A `/` must follow the host, so a bare
// origin without a path does not match.
const TURBO_HOST_RE = /^https?:\/\/(?:www\.)?turbo\.\w+\//i;
// Origin used in this module to build embed URLs, as the Referer for media
// downloads, and as the URL handed to fsCreateSession.
const TURBO_BASE = 'https://turbo.cr';
// User-Agent sent with media downloads when the caller does not supply one.
const DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
/**
 * Report whether a URL points at a turbo host.
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} `true` when `url` satisfies `TURBO_HOST_RE`.
 */
export function isTurboUrl(url) {
  const hostMatch = TURBO_HOST_RE.exec(url);
  return hostMatch !== null;
}
/**
 * Decode the handful of HTML entities that show up in attribute values
 * scraped from page markup.
 *
 * Decoding happens in a single pass so that decoded output is never scanned
 * again: `&amp;lt;` becomes the literal text `&lt;`, not `<`. (Replacing
 * `&amp;` first and the other entities afterwards would double-decode it.)
 *
 * @param {string} s - Raw text taken from HTML source.
 * @returns {string} `s` with `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`,
 *   `&#39;` and `&#x27;` replaced by the characters they stand for. Any other
 *   entity is left untouched.
 */
function unescapeHtml(s) {
  const decoded = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    '#39': "'",
    '#x27': "'",
  };
  return s.replace(/&(amp|lt|gt|quot|apos|#39|#x27);/g, (_entity, name) => decoded[name]);
}
/**
 * Pull the mp4 URL out of rendered embed-page HTML.
 *
 * Two probes are tried in order and the first hit wins:
 *   1. the `src` attribute of a `<video>` tag (Plyr writes the resolved URL
 *      there once the page's WASM has run);
 *   2. any bare URL containing `turbocdn` and `.mp4` anywhere in the markup.
 *
 * @param {string} html - Page markup.
 * @returns {string|null} The entity-decoded mp4 URL, or `null` if neither
 *   probe matches.
 */
function extractMp4FromHtml(html) {
  // Each probe is [pattern, index of the match group holding the URL].
  const probes = [
    [/<video[^>]+\bsrc=["']([^"']+\.mp4[^"']*)["']/i, 1],
    [/https?:\/\/[^"'\s<>]*turbocdn[^"'\s<>]*\.mp4[^"'\s<>]*/i, 0],
  ];
  for (const [pattern, group] of probes) {
    const hit = html.match(pattern);
    if (hit) {
      return unescapeHtml(hit[group]);
    }
  }
  return null;
}
/**
 * Choose the on-disk file name for a resolved mp4 URL.
 *
 * Preference order: the URL's `fn` query parameter, then the last segment of
 * the URL path, then `<fallbackId>.mp4` (or `turbo.mp4` when no id is given).
 *
 * The URL comes from scraped page content, so the chosen name is reduced to
 * its final path component before being returned. Without that, a value such
 * as `fn=../../x` would let the caller's `join(outputDir, name)` write
 * outside the output directory.
 *
 * @param {string} mp4Url - Resolved media URL.
 * @param {string} [fallbackId] - Identifier used to build the fallback name.
 * @returns {string} A bare file name containing no directory components.
 */
function turboFilename(mp4Url, fallbackId) {
  const fallbackName = (fallbackId || 'turbo') + '.mp4';

  let candidate;
  try {
    const parsed = new URL(mp4Url);
    candidate = parsed.searchParams.get('fn') || basename(parsed.pathname);
  } catch {
    // Not a parseable URL: nothing to derive a name from.
    return fallbackName;
  }

  // Normalise backslashes first so Windows-style separators are stripped on
  // every platform, then keep only the last path component.
  const safeName = basename(candidate.replace(/\\/g, '/'));
  if (!safeName || safeName === '.' || safeName === '..') {
    return fallbackName;
  }
  return safeName;
}
/**
 * Download a media URL to disk.
 *
 * The response body is streamed to `<dest>.part` and renamed to `dest` only
 * after the transfer has completed and passed the minimum-size check. This
 * keeps memory use flat for large videos and guarantees that `dest` never
 * holds a truncated file (an interrupted run would otherwise leave a partial
 * file that looks like a finished download).
 *
 * This function never throws; every outcome is reported in the result object.
 *
 * @param {string} url - Media URL to fetch.
 * @param {string} dest - Final path of the downloaded file.
 * @param {string} ua - User-Agent header value to send.
 * @returns {Promise<{ok: true, size: number} |
 *   {ok: false, status?: number, reason?: string, error?: string}>}
 *   `status` is set for a non-2xx response, `reason` when the body is under
 *   10000 bytes, `error` when the request or the write failed.
 */
async function downloadVideo(url, dest, ua) {
  const partPath = `${dest}.part`;
  const discardPartial = () => {
    try {
      rmSync(partPath, { force: true });
    } catch {
      // Best-effort cleanup: a leftover .part file is overwritten by the next attempt.
    }
  };

  try {
    const response = await fetch(url, {
      headers: { 'User-Agent': ua, 'Referer': TURBO_BASE + '/' },
      signal: AbortSignal.timeout(600000), // 10 min for big videos
    });
    if (!response.ok) return { ok: false, status: response.status };
    if (!response.body) return { ok: false, reason: 'too small 0' };

    const sink = createWriteStream(partPath);
    await pipeline(Readable.fromWeb(response.body), sink);

    const size = sink.bytesWritten;
    if (size < 10000) {
      // Bodies this small are treated as a failed download, not a video.
      discardPartial();
      return { ok: false, reason: 'too small ' + size };
    }

    renameSync(partPath, dest);
    return { ok: true, size };
  } catch (e) {
    discardPartial();
    return { ok: false, error: e.message };
  }
}
/**
 * Fetch an embed page through an existing FlareSolverr session and extract
 * the mp4 URL from the returned HTML.
 *
 * @param {string} sessionId - FlareSolverr session to fetch with.
 * @param {string} embedUrl - Embed page URL.
 * @returns {Promise<string|null>} The mp4 URL, or `null` when the fetch
 *   status is not 200 or no mp4 URL is found in the page.
 */
async function resolveEmbed(sessionId, embedUrl) {
  const { status, html } = await fsGet(sessionId, embedUrl, '');
  return status === 200 ? extractMp4FromHtml(html) : null;
}
/**
 * Fetch an album page through an existing FlareSolverr session and collect
 * the video ids it references.
 *
 * Ids are gathered from `data-id="..."` attributes (values of six or more
 * id characters) and then from any `turbo.<tld>/embed/<id>` links in the
 * markup. Duplicates are dropped and first-seen order is kept.
 *
 * @param {string} sessionId - FlareSolverr session to fetch with.
 * @param {string} albumUrl - Album page URL.
 * @returns {Promise<string[]>} Unique ids; empty when the fetch status is
 *   not 200 or nothing matches.
 */
async function resolveAlbumIds(sessionId, albumUrl) {
  const page = await fsGet(sessionId, albumUrl, '');
  if (page.status !== 200) {
    return [];
  }

  const markup = String(page.html);
  const captureGroup = (hit) => hit[1];
  // Video tiles carry their id in a data-id attribute.
  const tileIds = Array.from(markup.matchAll(/data-id=["']([A-Za-z0-9_-]{6,})["']/g), captureGroup);
  // Direct embed links, when present, carry the id in the path.
  const linkIds = Array.from(markup.matchAll(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/g), captureGroup);

  return Array.from(new Set([...tileIds, ...linkIds]));
}
/**
 * Resolve a turbo.cr URL (embed or album) and download all videos found.
 *
 * Pass an existing FlareSolverr sessionId to reuse it across many calls;
 * otherwise one is created and destroyed per call. The URL is classified
 * before any session is opened, so an unrecognized URL costs no browser
 * session.
 *
 * Failures for an individual video (unresolvable embed, failed download,
 * failure to index the file) are logged and do not abort the remaining
 * videos. A failure to close a session this call created is logged rather
 * than thrown, so it cannot hide the result.
 *
 * @param {string} url - A `turbo.<tld>/embed/<id>` URL or a URL containing `/a/`.
 * @param {string} outputDir - Directory to write into (not created here). Its
 *   last path component is passed to `upsertMediaFile` as the folder name.
 * @param {(message: string) => void} logFn - Receives progress and error lines.
 * @param {string} [userAgent] - User-Agent for media downloads; defaults to `DEFAULT_UA`.
 * @param {string} [fsSession] - Existing FlareSolverr session id to reuse.
 * @returns {Promise<number>} The count of videos successfully downloaded.
 */
export async function downloadTurbo(url, outputDir, logFn, userAgent, fsSession) {
  const log = typeof logFn === 'function' ? logFn : () => {};
  const ua = userAgent || DEFAULT_UA;
  const folderName = basename(outputDir);

  // Classify first: nothing below is needed for a URL we cannot handle.
  const embedMatch = url.match(/turbo\.[a-z]+\/embed\/([A-Za-z0-9_-]+)/i);
  const isAlbum = !embedMatch && /\/a\//i.test(url);
  if (!embedMatch && !isAlbum) {
    log(`turbo: unrecognized URL ${url}`);
    return 0;
  }

  let ownSession = false;
  let sessionId = fsSession;
  try {
    if (!sessionId) {
      sessionId = await fsCreateSession(TURBO_BASE + '/');
      ownSession = true;
    }

    let embedIds;
    if (embedMatch) {
      embedIds = [embedMatch[1]];
    } else {
      log(`turbo: resolving album ${url}`);
      embedIds = await resolveAlbumIds(sessionId, url);
      log(`turbo: album has ${embedIds.length} video(s)`);
    }

    let count = 0;
    for (const id of embedIds) {
      // Embed pages are always requested from the canonical origin,
      // whichever turbo domain the input URL used.
      const embedUrl = `${TURBO_BASE}/embed/${id}`;
      try {
        const mp4 = await resolveEmbed(sessionId, embedUrl);
        if (!mp4) {
          log(`turbo: could not resolve mp4 for ${id}`);
          continue;
        }
        const filename = turboFilename(mp4, id);
        const dest = join(outputDir, filename);
        if (existsSync(dest)) {
          log(`turbo: already have ${filename}`);
          continue;
        }
        const dl = await downloadVideo(mp4, dest, ua);
        if (!dl.ok) {
          log(`turbo: download failed ${filename} - ${dl.status || dl.error || dl.reason}`);
          continue;
        }
        try {
          upsertMediaFile(folderName, filename, 'video', dl.size, Date.now(), null);
        } catch (e) {
          // Indexing is best-effort: the file is on disk, so it still counts.
          log(`turbo: failed to index ${filename}: ${e.message}`);
        }
        log(`Downloaded: ${filename} (${(dl.size / (1024 * 1024)).toFixed(1)} MB) [video]`);
        count++;
      } catch (e) {
        log(`turbo: error for ${id}: ${e.message}`);
      }
    }
    return count;
  } finally {
    // Only tear down a session this call created; a shared one belongs to the caller.
    if (ownSession && sessionId) {
      try {
        await fsDestroySession(sessionId);
      } catch (e) {
        log(`turbo: failed to close FlareSolverr session: ${e.message}`);
      }
    }
  }
}