/**
 * Send one command to the FlareSolverr `/v1` endpoint and return its reply.
 *
 * @param {object} payload - Command body; serialized as JSON and POSTed as-is.
 * @param {number} [timeoutMs=130000] - Abort the HTTP request after this many milliseconds.
 * @returns {Promise<object>} The decoded reply; its `status` field is always `'ok'`.
 * @throws {Error} If the reply is not JSON or FlareSolverr reports a non-ok status.
 */
async function fsCall(payload, timeoutMs = 130000) {
  const resp = await fetch(`${FLARESOLVERR_URL}/v1`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(timeoutMs),
  });

  // Read the body as text first: a non-JSON reply (proxy error page, crash
  // output) should surface with its HTTP status instead of a bare SyntaxError.
  const raw = await resp.text();
  let data;
  try {
    data = JSON.parse(raw);
  } catch (err) {
    throw new Error(
      `FlareSolverr error: HTTP ${resp.status} with non-JSON body: ${raw.slice(0, 200)}`,
      { cause: err },
    );
  }

  if (data?.status !== 'ok') {
    throw new Error(`FlareSolverr error: ${data?.message || JSON.stringify(data)}`);
  }
  return data;
}

/**
 * Render an array of `{ name, value }` cookie objects as a `Cookie` header string.
 *
 * @param {Array<{name: string, value: string}>} [cookies] - Cookie objects; a missing array yields ''.
 * @returns {string} `name=value` pairs joined with '; '.
 */
function cookieArrayToString(cookies) {
  return (cookies ?? []).map(({ name, value }) => `${name}=${value}`).join('; ');
}
/**
 * Create a FlareSolverr browser session and optionally warm it up so
 * DDoS-Guard / Cloudflare cookies are seeded for the target host.
 *
 * A failed warm-up is logged and otherwise ignored: the session is still
 * returned so the caller can try its real requests.
 *
 * @param {string} [warmUpUrl] - URL to load once inside the new session; skipped when falsy.
 * @returns {Promise<string>} Session id to pass to fsGet()/fsPost() and finally fsDestroySession().
 * @throws {Error} If the session cannot be created or the reply carries no session id.
 */
export async function fsCreateSession(warmUpUrl) {
  const created = await fsCall({ cmd: 'sessions.create' }, 60000);
  const sessionId = created.session;
  if (!sessionId) {
    // Fail here rather than handing `undefined` to callers, where it would
    // only blow up later and far from the cause.
    throw new Error('FlareSolverr error: sessions.create returned no session id');
  }

  if (warmUpUrl) {
    try {
      await fsCall({ cmd: 'request.get', url: warmUpUrl, session: sessionId, maxTimeout: 90000 });
    } catch (e) {
      console.warn(`[flaresolverr] Warm-up GET ${warmUpUrl} failed: ${e.message}`);
    }
  }
  return sessionId;
}

/**
 * Destroy a FlareSolverr session. Best-effort: failures are logged, never
 * thrown, so this is safe to call from a `finally` block.
 *
 * @param {string} [sessionId] - Session to destroy; a falsy id is a no-op.
 * @returns {Promise<void>}
 */
export async function fsDestroySession(sessionId) {
  if (!sessionId) return;
  try {
    await fsCall({ cmd: 'sessions.destroy', session: sessionId }, 30000);
  } catch (e) {
    console.warn(`[flaresolverr] Failed to destroy session ${sessionId}: ${e.message}`);
  }
}
// Pause between fsGet attempts, in milliseconds.
const FS_RETRY_DELAY_MS = 3000;

/**
 * Split a `Cookie`-header style string into the `{ name, value }` objects sent
 * in the `cookies` field of a FlareSolverr request.
 *
 * Cookies with an empty name or value are dropped, as are `__ddg*` and
 * `ddg_last_challenge` (presumably so the session's own DDoS-Guard cookies
 * are not overwritten by stale stored ones; confirm against the refresh flow).
 *
 * @param {string} [cookieStr] - e.g. "a=1; b=2"; falsy input yields [].
 * @returns {Array<{name: string, value: string}>}
 */
function parseSessionCookies(cookieStr) {
  return (cookieStr || '')
    .split(';')
    .map((pair) => {
      // Only the first '=' separates name from value; values may contain '='.
      const [name, ...rest] = pair.trim().split('=');
      return { name: name.trim(), value: rest.join('=') };
    })
    .filter(({ name, value }) => name && value && !name.startsWith('__ddg') && name !== 'ddg_last_challenge');
}

/**
 * GET a URL through a FlareSolverr session, sending the site's auth cookies
 * (given as a cookie string) along with the request.
 *
 * @param {string} sessionId - Session id from fsCreateSession().
 * @param {string} url - Page to load.
 * @param {string} [cookieStr] - Auth cookies as "name=value; name2=value2".
 * @param {number} [retries=1] - Extra attempts after the first one.
 * @returns {Promise<{status: number, html: string, finalUrl: string}>}
 *   The solved page, or `{ status: 0, html: '', finalUrl: '' }` when no
 *   attempt produced a solution with a status.
 * @throws {Error} The fsCall error, if the last attempt throws.
 */
export async function fsGet(sessionId, url, cookieStr, retries = 1) {
  const cookies = parseSessionCookies(cookieStr);

  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      const r = await fsCall({
        cmd: 'request.get',
        url,
        session: sessionId,
        cookies,
        maxTimeout: 90000,
      });
      if (r.solution?.status) {
        return { status: r.solution.status, html: r.solution.response, finalUrl: r.solution.url };
      }
    } catch (e) {
      if (attempt >= retries) throw e;
    }
    // Only wait when another attempt follows; sleeping after the last one
    // would just delay the empty result.
    if (attempt < retries) {
      await new Promise((resolve) => setTimeout(resolve, FS_RETRY_DELAY_MS));
    }
  }
  return { status: 0, html: '', finalUrl: '' };
}

/**
 * POST form-encoded data through a FlareSolverr session.
 *
 * @param {string} sessionId - Session id from fsCreateSession().
 * @param {string} url - Form action URL.
 * @param {string} [cookieStr] - Auth cookies as "name=value; name2=value2".
 * @param {string} postData - Body, already `application/x-www-form-urlencoded`.
 * @returns {Promise<{status: number, html: string, finalUrl: string}>}
 * @throws {Error} If the call fails or the reply has no `solution`.
 */
export async function fsPost(sessionId, url, cookieStr, postData) {
  const r = await fsCall({
    cmd: 'request.post',
    url,
    session: sessionId,
    cookies: parseSessionCookies(cookieStr),
    postData,
    maxTimeout: 90000,
  });
  if (!r.solution) {
    throw new Error(`FlareSolverr error: no solution returned for POST ${url}`);
  }
  return { status: r.solution.status, html: r.solution.response, finalUrl: r.solution.url };
}
+ * + * This is the only flow we've found that handles both DDoS-Guard captcha and + * Cloudflare Turnstile without external paid services. */ export async function refreshForumCookies(siteId) { const site = getForumSiteById(siteId); @@ -25,66 +128,79 @@ export async function refreshForumCookies(siteId) { throw new Error('Forum site has no saved credentials — set username and password first'); } - const baseUrl = site.base_url || 'https://simpcity.su'; + const baseUrl = (site.base_url || 'https://simpcity.cr').replace(/\/$/, ''); const loginUrl = `${baseUrl}/login/`; - const helperPath = path.join(__dirname, 'login_helper.py'); console.log(`[flaresolverr] Refreshing cookies for site ${siteId} (${site.name})`); - console.log(`[flaresolverr] Login URL: ${loginUrl}`); - // Run the Python helper with xvfb-run for virtual display + // Step 1: get DDoS-Guard cookies via FlareSolverr (no login attempt yet) + let ddgCookies = []; + let fsSessionId = null; + try { + const sess = await fsCall({ cmd: 'sessions.create' }, 60000); + fsSessionId = sess.session; + const getRes = await fsCall({ + cmd: 'request.get', + url: baseUrl + '/', + session: fsSessionId, + maxTimeout: 120000, + }); + ddgCookies = getRes.solution.cookies || []; + console.log(`[flaresolverr] DDoS-Guard cleared, got ${ddgCookies.length} cookies`); + } finally { + if (fsSessionId) { + try { await fsCall({ cmd: 'sessions.destroy', session: fsSessionId }, 30000); } catch {} + } + } + + if (ddgCookies.length === 0) { + throw new Error('FlareSolverr returned no cookies — DDoS-Guard not bypassed'); + } + + // Step 2: launch chromedriver with pre-loaded cookies + perform login + const helperPath = path.join(__dirname, 'login_helper.py'); + const cookiesJson = JSON.stringify(ddgCookies); + // Escape arguments for shell safety - const escapedUrl = loginUrl.replace(/'/g, "'\\''"); - const escapedUser = site.username.replace(/'/g, "'\\''"); - const escapedPass = site.password.replace(/'/g, "'\\''"); - - const cmd = 
`xvfb-run --auto-servernum --server-args='-screen 0 1920x1080x24' python3 '${helperPath}' '${escapedUrl}' '${escapedUser}' '${escapedPass}'`; + const esc = (s) => s.replace(/'/g, "'\\''"); + const cmd = `xvfb-run --auto-servernum --server-args='-screen 0 1920x1080x24' python3 -u '${helperPath}' '${esc(loginUrl)}' '${esc(site.username)}' '${esc(site.password)}' '${esc(cookiesJson)}'`; try { const { stdout, stderr } = await execAsync(cmd, { - timeout: 120000, // 2 minutes + timeout: 180000, maxBuffer: 10 * 1024 * 1024, env: { ...process.env, CHROMIUM_PATH }, }); - // Log stderr (debug output from login_helper.py) if (stderr) { for (const line of stderr.split('\n').filter(Boolean)) { console.log(`[flaresolverr] ${line}`); } } - // Parse JSON from stdout const result = JSON.parse(stdout.trim()); + if (!result.ok) throw new Error(result.error || 'Login failed'); - if (!result.ok) { - throw new Error(result.error || 'Login failed'); - } - - // Update DB with new cookies const expiresAt = new Date(Date.now() + 25 * 24 * 60 * 60 * 1000).toISOString(); updateForumSite(siteId, { cookies: result.cookies, cookie_expires_at: expiresAt, + user_agent: result.user_agent || '', }); - console.log(`[flaresolverr] Cookie refresh successful for site ${siteId}`); + console.log(`[flaresolverr] Cookie refresh successful for site ${siteId} (UA: ${result.user_agent || 'default'})`); return result.cookies; } catch (err) { - // If execAsync fails, the error might have stderr info if (err.stderr) { for (const line of err.stderr.split('\n').filter(Boolean)) { console.error(`[flaresolverr] ${line}`); } } - // Try to parse stdout for a structured error if (err.stdout) { try { const result = JSON.parse(err.stdout.trim()); if (result.error) throw new Error(result.error); - } catch (parseErr) { - // Not JSON, use original error - } + } catch {} } throw new Error(`Cookie refresh failed: ${err.message}`); } diff --git a/server/login_helper.py b/server/login_helper.py index 3a09996..5f3c6d6 100644 --- 
a/server/login_helper.py +++ b/server/login_helper.py @@ -1,10 +1,14 @@ #!/usr/bin/env python3 """ -Login helper using undetected_chromedriver to bypass Cloudflare Turnstile. -Runs Chrome in headed mode with Xvfb (virtual display) so Turnstile sees a real browser. +Login helper using undetected_chromedriver. The 4th argument is a JSON array of +pre-seeded cookies (from FlareSolverr) that satisfy DDoS-Guard, so Chrome lands +directly on the login form without facing the captcha. Turnstile auto-solves in +the real browser context. Usage: - xvfb-run python3 login_helper.py + xvfb-run python3 login_helper.py + +cookies_json: JSON array like [{"name":"__ddg9_","value":"...","domain":"..."}] Outputs JSON to stdout: {"ok": true, "cookies": "name=val; name2=val2", "url": ""} @@ -19,12 +23,19 @@ import shutil def main(): if len(sys.argv) < 4: - print(json.dumps({"ok": False, "error": "Usage: login_helper.py "})) + print(json.dumps({"ok": False, "error": "Usage: login_helper.py [cookies_json]"})) sys.exit(1) login_url = sys.argv[1] username = sys.argv[2] password = sys.argv[3] + seed_cookies = [] + if len(sys.argv) >= 5 and sys.argv[4]: + try: + seed_cookies = json.loads(sys.argv[4]) + log(f"Got {len(seed_cookies)} seed cookies from FlareSolverr") + except Exception as e: + log(f"Could not parse seed cookies: {e}") try: import undetected_chromedriver as uc @@ -76,6 +87,9 @@ def main(): options.add_argument('--disable-dev-shm-usage') options.add_argument('--disable-gpu') options.add_argument('--window-size=1920,1080') + # Don't pin a fake UA — Turnstile detects spoofed UAs and refuses to + # auto-solve. The natural Chromium UA must be matched by all scraper + # requests instead (forum.js reads it from the forum_sites row). 
log(f"Chromium: {chromium_path}") log(f"Chromedriver: {chromedriver_path}") @@ -90,23 +104,68 @@ def main(): ) driver.set_window_size(1920, 1080) + # Pre-seed DDoS-Guard cookies so Chrome skips the captcha challenge entirely + from urllib.parse import urlparse + parsed = urlparse(login_url) + base_url = f"{parsed.scheme}://{parsed.netloc}" + + if seed_cookies: + # Must visit the domain first before add_cookie works + log(f"Visiting {base_url}/.well-known/ddos-guard/check.js to set cookie domain context...") + try: + # Use a static asset path that DDoS-Guard whitelists (returns 200 fast) + driver.get(base_url + "/favicon.ico") + except Exception: + driver.get("about:blank") + driver.execute_script(f"document.location = '{base_url}/favicon.ico';") + time.sleep(2) + + log(f"Injecting {len(seed_cookies)} seed cookies...") + for c in seed_cookies: + cookie_dict = { + "name": c["name"], + "value": c["value"], + "path": c.get("path", "/"), + } + # Selenium add_cookie wants domain (without leading dot) and rejects mismatches + if c.get("domain"): + cookie_dict["domain"] = c["domain"] + if c.get("secure"): + cookie_dict["secure"] = c["secure"] + try: + driver.add_cookie(cookie_dict) + except Exception as e: + log(f" skip {c['name']}: {e}") + log(f"Navigating to {login_url}...") driver.get(login_url) - # Wait for DDoS-Guard to solve and login form to appear - log("Waiting for login form (DDoS-Guard solving)...") - WebDriverWait(driver, 60).until( - EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]')) - ) - log("Login form found") + # Wait for login form to appear + log("Waiting for login form...") + try: + WebDriverWait(driver, 30).until( + EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="login"]')) + ) + log("Login form found") + except Exception: + try: + log(f"Login form timeout. 
Title: '{driver.title}', url: {driver.current_url}") + log(f"Page source snippet: {driver.page_source[:500]}") + driver.save_screenshot('/tmp/login_timeout.png') + log("Screenshot saved to /tmp/login_timeout.png") + except Exception: + pass + raise # Wait for Turnstile to auto-solve (should work in undetected headed mode) log("Waiting for Turnstile to solve...") turnstile_token = "" - for i in range(45): + for i in range(60): try: - el = driver.find_element(By.CSS_SELECTOR, 'input[name="cf-turnstile-response"]') - val = el.get_attribute("value") + # Use JS .value (property), not get_attribute (which reads HTML attribute) + val = driver.execute_script( + 'var el = document.querySelector(\'input[name="cf-turnstile-response"]\'); return el ? el.value : null;' + ) if val: turnstile_token = val break @@ -178,22 +237,40 @@ def main(): has_user_cookie = any(c['name'] in ('xf_user', 'ogaddgmetaprof_user') for c in cookies) if not has_user_cookie: - # Check for error message + # Check for error message — XenForo shows multiple variants error_msg = "Login failed — no user cookie returned" + for sel in ['.blockMessage--error', '.errorOverlay', '.formRow--error', '.alert--error', '.blockMessage']: + try: + el = driver.find_element(By.CSS_SELECTOR, sel) + txt = el.text.strip() + if txt: + error_msg = txt[:300] + log(f"Found error in {sel}: {error_msg}") + break + except Exception: + pass + # Also try dumping page title and any visible "Invalid"/"failed"/"incorrect" text try: - error_el = driver.find_element(By.CSS_SELECTOR, '.blockMessage--error') - error_msg = error_el.text.strip() + page_title = driver.title or '' + log(f"Page title after submit: '{page_title}'") + snippet = driver.page_source[:1500] + log(f"Page source snippet: {snippet}") + driver.save_screenshot('/tmp/login_failed.png') + log("Screenshot saved to /tmp/login_failed.png") except Exception: pass - # Also dump all cookie names for debugging cookie_names = [c['name'] for c in cookies] log(f"Cookie names: 
{cookie_names}") - log(f"Error: {error_msg}") print(json.dumps({"ok": False, "error": error_msg, "url": final_url})) sys.exit(1) - log(f"Login successful — {len(cookies)} cookies") - print(json.dumps({"ok": True, "cookies": cookie_str, "url": final_url})) + # Capture the browser's real UA so subsequent scraper requests can match it + try: + real_ua = driver.execute_script('return navigator.userAgent;') + except Exception: + real_ua = '' + log(f"Login successful — {len(cookies)} cookies, UA: {real_ua}") + print(json.dumps({"ok": True, "cookies": cookie_str, "user_agent": real_ua, "url": final_url})) except Exception as e: log(f"Fatal error: {e}") diff --git a/server/scrape.js b/server/scrape.js index c7e2c74..20f70ad 100644 --- a/server/scrape.js +++ b/server/scrape.js @@ -1,8 +1,9 @@ import { Router } from 'express'; import { mkdirSync } from 'fs'; import { join } from 'path'; -import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError } from './scrapers/forum.js'; -import { refreshForumCookies } from './flaresolverr.js'; +import * as cheerio from 'cheerio'; +import { scrapeForumPage, getPageUrl, detectMaxPage, CookieExpiredError, fixCookieIp, FORUM_UA } from './scrapers/forum.js'; +import { refreshForumCookies, fsCreateSession, fsDestroySession, fsGet, fsPost } from './flaresolverr.js'; import { parseUserUrl, fetchAllPosts, fetchSearchPosts, downloadFiles } from './scrapers/coomer.js'; import { parseMediaUrl, fetchAllMedia, fetchAllMediaFromHtml, downloadMedia } from './scrapers/medialink.js'; import { parseMegaUrl, listAllFiles, downloadMegaFiles } from './scrapers/mega.js'; @@ -75,14 +76,17 @@ function jobToJson(job) { async function runForumScrape(job) { let { url, startPage, endPage, delay, folderName, siteId, lastPageOnly } = job.config; let { cookies } = job.config; + let userAgent = job.config.userAgent || ''; // Load cookies from forum site record if siteId provided and no cookies passed if (!cookies && siteId) { const site = 
getForumSiteById(siteId); if (site && site.cookies) { cookies = site.cookies; + userAgent = site.user_agent || userAgent; job.config.cookies = cookies; - addLog(job, `Loaded cookies from forum site: ${site.name}`); + job.config.userAgent = userAgent; + addLog(job, `Loaded cookies from forum site: ${site.name}${userAgent ? ` (UA pinned)` : ''}`); } } @@ -92,10 +96,24 @@ async function runForumScrape(job) { const downloadedSet = new Set(); let totalImages = 0; + // When a siteId is in play, page HTML must be fetched through FlareSolverr — + // direct fetch hits DDoS-Guard's browser-fingerprint check and gets 403. + let fsSession = null; + if (siteId && cookies) { + try { + const baseHost = new URL(url).origin; + addLog(job, `Opening FlareSolverr session for ${baseHost}...`); + fsSession = await fsCreateSession(baseHost + '/'); + addLog(job, `FlareSolverr session ready (${fsSession.slice(0, 8)}...)`); + } catch (e) { + addLog(job, `FlareSolverr session failed (${e.message}) — falling back to direct fetch`); + } + } + // If lastPageOnly, detect the last page and only scrape that if (lastPageOnly) { addLog(job, 'Detecting last page...'); - const maxPage = await detectMaxPage(url, (msg) => addLog(job, msg), cookies); + const maxPage = await detectMaxPage(url, (msg) => addLog(job, msg), cookies, userAgent, fsSession); if (maxPage) { startPage = maxPage; endPage = maxPage; @@ -122,15 +140,18 @@ async function runForumScrape(job) { let count; try { - count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies); + count = await scrapeForumPage(pageUrl, outputDir, downloadedSet, (msg) => addLog(job, msg), cookies, userAgent, fsSession); } catch (err) { if (err instanceof CookieExpiredError && siteId) { addLog(job, `Cookie expired (HTTP ${err.statusCode}) — attempting auto-refresh via FlareSolverr...`); try { cookies = await refreshForumCookies(siteId); + const refreshed = getForumSiteById(siteId); + userAgent = refreshed?.user_agent || 
// Search a forum site for threads matching a query, return preview images per thread.

// Substrings that mark an image URL as forum chrome rather than post content.
const SEARCH_SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star'];
// URLs ending in a still-image extension (optionally followed by a query string).
const PREVIEW_IMG_EXTS = /\.(jpg|jpeg|png|webp|gif)(\?|$)/i;

// Coerce a request-supplied count to a non-negative integer, else use the fallback.
function searchClampCount(value, fallback) {
  const n = Number.parseInt(value, 10);
  return Number.isInteger(n) && n >= 0 ? n : fallback;
}

// Map an upstream page status to one that is valid to send as an error
// response: real 4xx/5xx codes pass through, anything else (0, 3xx, ...) becomes 502.
function searchUpstreamStatus(status) {
  return status >= 400 && status <= 599 ? status : 502;
}

/**
 * POST /api/scrape/forum/search
 *
 * Body: { query, siteId?, maxThreads?, previewsPerThread?, titleOnly? }.
 * Runs the forum's search through a FlareSolverr session, then loads the last
 * page of the top `maxThreads` hits and collects up to `previewsPerThread`
 * image URLs from each. Responds with `{ query, results }`; per-thread
 * failures are reported in that thread's `error` field instead of failing the
 * whole request.
 */
router.post('/api/scrape/forum/search', async (req, res) => {
  const body = req.body ?? {};
  // NOTE(review): the default siteId of 2 is deployment-specific — confirm it is intended.
  const { query, siteId = 2, titleOnly = true } = body;
  if (!query) return res.status(400).json({ error: 'query is required' });

  const maxThreads = searchClampCount(body.maxThreads, 5);
  const previewsPerThread = searchClampCount(body.previewsPerThread, 4);

  const site = getForumSiteById(siteId);
  if (!site) return res.status(404).json({ error: `Forum site ${siteId} not found` });
  if (!site.cookies) return res.status(400).json({ error: 'Forum site has no cookies — refresh first' });

  const baseUrl = (site.base_url || 'https://simpcity.cr').replace(/\/$/, '');
  const cookies = site.cookies;

  let fsSession = null;
  try {
    fsSession = await fsCreateSession(baseUrl + '/');

    // Step 1: GET search form to grab the XenForo CSRF token
    const formRes = await fsGet(fsSession, baseUrl + '/search/', cookies);
    if (formRes.status !== 200) {
      // formRes.status may be 0 (no solution) — never pass that to res.status().
      return res
        .status(searchUpstreamStatus(formRes.status))
        .json({ error: `Search form fetch failed: HTTP ${formRes.status}` });
    }
    const xfMatch = formRes.html.match(/name="_xfToken"\s+value="([^"]+)"/);
    if (!xfMatch) {
      return res.status(503).json({ error: 'No _xfToken on search form — cookies likely expired. Refresh via /api/flaresolverr/refresh/' + siteId });
    }
    const xfToken = xfMatch[1];

    // Step 2: POST the search; XenForo redirects to the results page
    const postBody = new URLSearchParams({
      keywords: query,
      'c[title_only]': titleOnly ? '1' : '',
      'c[users]': '',
      _xfToken: xfToken,
    });
    const postRes = await fsPost(fsSession, baseUrl + '/search/search', cookies, postBody.toString());
    if (postRes.status >= 400) {
      // An upstream failure must not be reported as "no results".
      return res
        .status(searchUpstreamStatus(postRes.status))
        .json({ error: `Search request failed: HTTP ${postRes.status}` });
    }
    const html = postRes.html;

    // Parse thread results from contentRow-title anchors (XenForo result layout)
    const $ = cheerio.load(html);
    const seen = new Set();
    const threads = [];
    $('h3.contentRow-title a[href*="/threads/"]').each((_, el) => {
      const $a = $(el);
      let href;
      try { href = new URL($a.attr('href'), baseUrl).href; } catch { return; }
      const m = href.match(/\/threads\/([^\/]+\.\d+)\//);
      if (!m) return;
      // Normalize post/page links to the thread root so each thread appears once.
      const threadRoot = `${baseUrl}/threads/${m[1]}/`;
      if (seen.has(threadRoot)) return;
      seen.add(threadRoot);
      const title = $a.text().replace(/\s+/g, ' ').trim();
      if (!title || title.length < 3) return;
      threads.push({ threadUrl: threadRoot, title });
    });

    if (threads.length === 0) {
      return res.json({ query, results: [] });
    }

    // For top N threads, fetch last page and pull preview image URLs
    const topThreads = threads.slice(0, maxThreads);
    const results = [];
    for (const t of topThreads) {
      try {
        const maxPage = await detectMaxPage(t.threadUrl, () => {}, cookies, '', fsSession);
        const lastPageUrl = maxPage && maxPage > 1 ? `${t.threadUrl}page-${maxPage}` : t.threadUrl;
        const pageRes = await fsGet(fsSession, lastPageUrl, cookies);
        if (pageRes.status !== 200) {
          results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews: [], error: `HTTP ${pageRes.status}` });
          continue;
        }
        const $p = cheerio.load(pageRes.html);
        const imgUrls = [];
        $p('.message-body img, .bbWrapper img').each((_, el) => {
          const $img = $p(el);
          const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url');
          if (!src) return;
          let absSrc;
          try { absSrc = new URL(src, lastPageUrl).href; } catch { return; }
          const lower = absSrc.toLowerCase();
          if (SEARCH_SKIP_PATTERNS.some((p) => lower.includes(p))) return;
          // Prefer the image the thumbnail links to, when the link is itself an image.
          const $parentA = $img.closest('a');
          if ($parentA.length && $parentA.attr('href')) {
            try {
              const aHref = new URL($parentA.attr('href'), lastPageUrl).href;
              if (PREVIEW_IMG_EXTS.test(aHref)) { imgUrls.push(aHref); return; }
            } catch {
              // Unparseable link target: fall through to the <img> source.
            }
          }
          // Strip thumbnail markers (.th. / .md.) to reach the full-size file.
          const upgraded = absSrc.replace('.th.', '.').replace('.md.', '.');
          if (PREVIEW_IMG_EXTS.test(upgraded) || /\/data\/attachments|proxy\.php/.test(upgraded)) {
            imgUrls.push(upgraded);
          }
        });
        const unique = [...new Set(imgUrls)];
        // slice(-0) would return everything, so handle a zero limit explicitly.
        const previews = previewsPerThread > 0 ? unique.slice(-previewsPerThread) : [];
        results.push({ ...t, lastPageUrl, lastPageNum: maxPage || 1, previews });
      } catch (err) {
        results.push({ ...t, previews: [], error: err.message });
      }
    }

    res.json({ query, results });
  } catch (err) {
    console.error('[scrape/forum/search]', err);
    res.status(500).json({ error: err.message });
  } finally {
    if (fsSession) await fsDestroySession(fsSession);
  }
});
'util'; import { upsertMediaFile } from '../db.js'; +import { fsGet } from '../flaresolverr.js'; +import { isTurboUrl, downloadTurbo } from './turbo.js'; const execFileAsync = promisify(execFile); @@ -20,11 +22,13 @@ export class CookieExpiredError extends Error { } // Replace DDoS-Guard __ddg9_ cookie IP with server's IP so cookies work from any browser -function fixCookieIp(cookies) { +export function fixCookieIp(cookies) { if (!cookies) return cookies; return cookies.replace(/__ddg9_=[^;]+/, `__ddg9_=${SERVER_IP}`); } +export const FORUM_UA = UA; + const IMAGE_EXTS = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']); const VIDEO_EXTS = new Set(['.mp4', '.mov', '.avi', '.webm', '.mkv', '.m4v', '.wmv', '.flv', '.ts']); const SKIP_PATTERNS = ['avatar', 'smilie', 'emoji', 'icon', 'logo', 'button', 'sprite', 'badge', 'rank', 'star', 'dc_thumbnails']; @@ -69,13 +73,20 @@ export function getPageUrl(baseUrl, pageNum) { return url.split('#')[0]; } -export async function detectMaxPage(baseUrl, logFn, cookies) { +export async function detectMaxPage(baseUrl, logFn, cookies, userAgent, fsSession) { try { - const headers = { 'User-Agent': UA }; - if (cookies) headers['Cookie'] = fixCookieIp(cookies); - const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) }); - if (!resp.ok) return null; - const html = await resp.text(); + let html; + if (fsSession) { + const r = await fsGet(fsSession, baseUrl, cookies); + if (r.status !== 200) return null; + html = r.html; + } else { + const headers = { 'User-Agent': userAgent || UA }; + if (cookies) headers['Cookie'] = fixCookieIp(cookies); + const resp = await fetch(baseUrl, { headers, signal: AbortSignal.timeout(15000) }); + if (!resp.ok) return null; + html = await resp.text(); + } const $ = cheerio.load(html); let maxPage = 1; @@ -91,6 +102,17 @@ export async function detectMaxPage(baseUrl, logFn, cookies) { if (n > maxPage && n < 10000) maxPage = n; } }); + // Final fallback: scan raw HTML 
for any page-N references (XenForo's + // serialized pagination sometimes only appears in href attributes that + // cheerio's class-based selectors miss). + if (maxPage === 1) { + const re = /page-(\d+)/g; + let m; + while ((m = re.exec(html)) !== null) { + const n = parseInt(m[1], 10); + if (n > maxPage && n < 10000) maxPage = n; + } + } if (maxPage > 1) { logFn(`Detected ${maxPage} pages`); @@ -123,7 +145,7 @@ function tryFullSizeUrl(thumbUrl) { return candidates; } -async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) { +async function downloadImage(url, outputDir, downloadedSet, logFn, cookies, userAgent) { if (downloadedSet.has(url)) return false; if (!isImageUrl(url)) return false; const lower = url.toLowerCase(); @@ -142,7 +164,7 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) { } try { - const dlHeaders = { 'User-Agent': UA }; + const dlHeaders = { 'User-Agent': userAgent || UA }; if (cookies) dlHeaders['Cookie'] = fixCookieIp(cookies); const resp = await fetch(url, { headers: dlHeaders, signal: AbortSignal.timeout(30000) }); if (!resp.ok) { @@ -171,10 +193,16 @@ async function downloadImage(url, outputDir, downloadedSet, logFn, cookies) { } // Use gallery-dl to download from external hosts (bunkr, saint, cyberdrop, etc.) -async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) { +async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn, userAgent, fsSession) { if (downloadedSet.has(url)) return 0; downloadedSet.add(url); + // turbo.cr uses an obfuscated WASM player — gallery-dl can't extract the + // signed mp4 URL. Resolve via FlareSolverr (renders JS) instead. 
+ if (isTurboUrl(url)) { + return await downloadTurbo(url, outputDir, logFn, userAgent, fsSession); + } + logFn(`Resolving via gallery-dl: ${url}`); try { @@ -240,23 +268,34 @@ async function downloadFromExternalHost(url, outputDir, downloadedSet, logFn) { } } -export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies) { - logFn(`Fetching page: ${pageUrl}`); +export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, cookies, userAgent, fsSession) { + logFn(`Fetching page: ${pageUrl}${fsSession ? ' [via FlareSolverr]' : ''}`); let html; try { - const headers = { 'User-Agent': UA }; - if (cookies) headers['Cookie'] = fixCookieIp(cookies); - const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) }); - if (!resp.ok) { - // SimpCity returns 404 for expired sessions, 403 for blocked - if (cookies && (resp.status === 404 || resp.status === 403)) { - throw new CookieExpiredError(resp.status); + if (fsSession) { + const r = await fsGet(fsSession, pageUrl, cookies); + if (r.status !== 200) { + if (cookies && (r.status === 404 || r.status === 403)) { + throw new CookieExpiredError(r.status); + } + logFn(`Failed to fetch page (${r.status})`); + return 0; } - logFn(`Failed to fetch page (${resp.status})`); - return 0; + html = r.html; + } else { + const headers = { 'User-Agent': userAgent || UA }; + if (cookies) headers['Cookie'] = fixCookieIp(cookies); + const resp = await fetch(pageUrl, { headers, signal: AbortSignal.timeout(15000) }); + if (!resp.ok) { + if (cookies && (resp.status === 404 || resp.status === 403)) { + throw new CookieExpiredError(resp.status); + } + logFn(`Failed to fetch page (${resp.status})`); + return 0; + } + html = await resp.text(); } - html = await resp.text(); } catch (err) { if (err instanceof CookieExpiredError) throw err; logFn(`Failed to fetch page: ${err.message}`); @@ -359,14 +398,14 @@ export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn, // 
// Matches http(s) URLs on a turbo.<tld> host (optionally www.), requiring a path.
const TURBO_HOST_RE = /^https?:\/\/(?:www\.)?turbo\.\w+\//i;
const TURBO_BASE = 'https://turbo.cr';
const DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// Named entities handled by unescapeHtml, keyed by entity name.
const HTML_ENTITY_CHARS = { amp: '&', lt: '<', gt: '>', quot: '"' };

/**
 * Tell whether a URL points at a turbo.* host.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} True when the URL matches TURBO_HOST_RE.
 */
export function isTurboUrl(url) {
  return TURBO_HOST_RE.test(url);
}

/**
 * Decode the amp, lt, gt and quot named HTML entities in a string.
 *
 * Decoding happens in a single pass, so an escaped entity (an ampersand
 * entity followed by "lt;") is decoded exactly once rather than twice.
 *
 * @param {string} s - Text taken from HTML source, e.g. an attribute value.
 * @returns {string} The text with those four entities replaced by their characters.
 */
function unescapeHtml(s) {
  return s.replace(/&(amp|lt|gt|quot);/g, (_, name) => HTML_ENTITY_CHARS[name]);
}