// OFApp/server/scrapers/forum.js

import * as cheerio from 'cheerio';
import { createWriteStream, existsSync, mkdirSync, statSync, writeFileSync } from 'fs';
import { basename, join, extname } from 'path';
import { pipeline } from 'stream/promises';
import { upsertMediaFile } from '../db.js';

// User-Agent header value sent by detectMaxPage(), downloadImage() and scrapeForumPage().
const UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// Lower-case file extensions (leading dot included) that isImageUrl() accepts.
const IMAGE_EXTS = new Set([
  '.jpg',
  '.jpeg',
  '.png',
  '.gif',
  '.webp',
  '.bmp',
  '.tiff',
]);

// downloadImage() skips any URL whose lower-cased text contains one of these
// substrings (they look like names of UI decorations such as avatars and icons).
const SKIP_PATTERNS = [
  'avatar',
  'smilie',
  'emoji',
  'icon',
  'logo',
  'button',
  'sprite',
  'badge',
  'rank',
  'star',
];

/**
 * Tell whether an absolute URL's path ends with one of the IMAGE_EXTS extensions.
 * The comparison is case-insensitive and ignores the query string and fragment,
 * because only the parsed `pathname` is inspected.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} true for an image-looking path; false otherwise, including
 *   when `url` cannot be parsed as an absolute URL.
 */
function isImageUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname.toLowerCase();
  } catch {
    // Not an absolute URL.
    return false;
  }

  for (const ext of IMAGE_EXTS) {
    if (pathname.endsWith(ext)) {
      return true;
    }
  }
  return false;
}

/**
 * Build the URL of a given page of a thread.
 *
 * The first `page-<digits>` occurrence in `baseUrl` is rewritten to
 * `page-<pageNum>`, then everything from the first `#` onwards is dropped.
 * A `baseUrl` without a `page-<digits>` segment is returned unchanged apart
 * from the fragment removal.
 *
 * @param {string} baseUrl - Thread URL, normally containing `page-N`.
 * @param {number|string} pageNum - Page number to substitute.
 * @returns {string} The rewritten URL without fragment.
 */
export function getPageUrl(baseUrl, pageNum) {
  const [withoutFragment] = baseUrl
    .replace(/page-\d+/, `page-${pageNum}`)
    .split('#');
  return withoutFragment;
}

/**
 * Fetch `baseUrl` and guess how many pages the thread has.
 *
 * Two passes run over the parsed HTML:
 *  1. pagination links (XenForo-style selectors) whose href contains `page-<n>`;
 *  2. every `<a>` whose trimmed text is purely digits, accepted when the number
 *     is below 10000 and larger than the best value so far.
 *
 * @param {string} baseUrl - URL of a thread page.
 * @param {(message: string) => void} logFn - Receives progress/error messages.
 * @returns {Promise<number|null>} The highest page number found when it is
 *   greater than 1; otherwise null (also on a non-OK response or any error).
 */
export async function detectMaxPage(baseUrl, logFn) {
  try {
    const response = await fetch(baseUrl, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(15000),
    });
    if (!response.ok) {
      return null;
    }

    const $ = cheerio.load(await response.text());
    let highest = 1;

    // XenForo-style
    const navLinks = $('a.pageNav-page, .pageNav a[href*="page-"], .pagination a[href*="page-"]').toArray();
    for (const link of navLinks) {
      const found = ($(link).attr('href') || '').match(/page-(\d+)/);
      if (found) {
        highest = Math.max(highest, parseInt(found[1], 10));
      }
    }

    // Generic pagination text
    for (const anchor of $('a').toArray()) {
      const label = $(anchor).text().trim();
      if (!/^\d+$/.test(label)) continue;

      const value = parseInt(label, 10);
      if (value > highest && value < 10000) {
        highest = value;
      }
    }

    if (highest <= 1) {
      return null;
    }
    logFn(`Detected ${highest} pages`);
    return highest;
  } catch (err) {
    logFn(`Page detection failed: ${err.message}`);
    return null;
  }
}

/**
 * Guess full-size image URLs from a thumbnail URL using common naming schemes.
 *
 * Rules are applied independently and in this order; each one that matches
 * appends its guess(es):
 *  - `.th.`      -> `.`                     (first occurrence)
 *  - `_thumb.`   -> `.`                     (case-insensitive, first occurrence)
 *  - `/thumbs/`  -> `/images/` and `/full/`
 *  - file name starting with `thumb_` -> same name without that prefix
 *  - URL with a query string -> the URL cut at the first `?`
 * The last two rules need the URL to parse as absolute; if it does not, only
 * the guesses from the earlier rules are returned.
 *
 * @param {string} thumbUrl - Thumbnail URL.
 * @returns {string[]} Candidate URLs (possibly empty). Nothing is fetched here.
 */
function tryFullSizeUrl(thumbUrl) {
  const guesses = [];

  if (thumbUrl.includes('.th.')) {
    guesses.push(thumbUrl.replace('.th.', '.'));
  }

  const thumbSuffix = /_thumb\./i;
  if (thumbSuffix.test(thumbUrl)) {
    guesses.push(thumbUrl.replace(thumbSuffix, '.'));
  }

  if (thumbUrl.includes('/thumbs/')) {
    for (const dir of ['/images/', '/full/']) {
      guesses.push(thumbUrl.replace('/thumbs/', dir));
    }
  }

  let parsed;
  try {
    parsed = new URL(thumbUrl);
  } catch {
    // Not an absolute URL: the remaining rules cannot be applied.
    return guesses;
  }

  const fileName = basename(parsed.pathname);
  if (fileName.startsWith('thumb_')) {
    const stripped = fileName.slice('thumb_'.length);
    guesses.push(thumbUrl.replace(`/${fileName}`, `/${stripped}`));
  }
  if (parsed.search) {
    guesses.push(thumbUrl.split('?')[0]);
  }

  return guesses;
}

/**
 * Pick a path inside `outputDir` for `filename` that does not exist yet.
 * On a collision `_1`, `_2`, ... is inserted before the extension; a name
 * without an extension (per `extname`) gets the suffix appended to the whole name.
 */
function pickFreePath(outputDir, filename) {
  let filepath = join(outputDir, filename);
  if (!existsSync(filepath)) return filepath;

  const ext = extname(filename);
  // slice(0, -0) would return '', so only strip when there is an extension.
  const stem = ext ? filename.slice(0, -ext.length) : filename;
  for (let i = 1; existsSync(filepath); i++) {
    filepath = join(outputDir, `${stem}_${i}${ext}`);
  }
  return filepath;
}

/**
 * Download one image into `outputDir` and register it via `upsertMediaFile`.
 *
 * The URL is skipped (returns false, nothing fetched) when it is already in
 * `downloadedSet`, does not look like an image (`isImageUrl`), or contains one
 * of the `SKIP_PATTERNS` substrings.
 *
 * @param {string} url - Absolute image URL.
 * @param {string} outputDir - Existing directory to write into; its base name
 *   is passed to `upsertMediaFile` as the folder name.
 * @param {Set<string>} downloadedSet - URLs already attempted. Mutated: `url`
 *   is added before the fetch and removed again only when the body is too small.
 * @param {(message: string) => void} logFn - Receives progress/error messages.
 * @returns {Promise<boolean>} true when a file was written, false otherwise.
 */
async function downloadImage(url, outputDir, downloadedSet, logFn) {
  if (downloadedSet.has(url)) return false;
  if (!isImageUrl(url)) return false;
  const lower = url.toLowerCase();
  if (SKIP_PATTERNS.some((pattern) => lower.includes(pattern))) return false;

  // Recorded before the fetch: a URL that fails below stays in the set and is
  // therefore not attempted again.
  downloadedSet.add(url);

  let filename;
  try {
    filename = basename(new URL(url).pathname);
  } catch {
    return false;
  }
  if (!filename) return false;

  // Save "name.th.jpg" thumbnails under the plain "name.jpg" name.
  filename = filename.replace('.th.', '.');

  // Name used in the failure log; updated once the final path is known.
  let targetName = filename;

  try {
    const resp = await fetch(url, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(30000),
    });
    if (!resp.ok) {
      logFn(`FAILED (${resp.status}): ${url}`);
      return false;
    }

    // The whole body is buffered so its size can be checked before writing.
    const buf = Buffer.from(await resp.arrayBuffer());
    if (buf.length < 1000) {
      // Too small to keep; forget the URL so it is not counted as handled.
      downloadedSet.delete(url);
      return false;
    }

    // Choose the path only now, with no await before the synchronous write,
    // so overlapping downloads of equally named files cannot pick the same path.
    const filepath = pickFreePath(outputDir, filename);
    targetName = basename(filepath);
    writeFileSync(filepath, buf);

    const folderName = basename(outputDir);
    try {
      upsertMediaFile(folderName, targetName, 'image', buf.length, Date.now(), null);
    } catch {
      /* best effort: a failed DB record must not fail the download */
    }

    const sizeKb = (buf.length / 1024).toFixed(1);
    logFn(`Downloaded: ${targetName} (${sizeKb} KB)`);
    return true;
  } catch (err) {
    logFn(`FAILED: ${targetName} - ${err.message}`);
    return false;
  }
}

/**
 * Fetch one forum page, collect the image URLs found in its post content and
 * download them with `downloadImage`.
 *
 * URLs are collected per image as an ordered list of alternatives:
 *  - an `<img>` wrapped in a link that points at an image: the link target only;
 *  - any other `<img>`: the `tryFullSizeUrl` guesses first, then the image's
 *    own URL as a fallback, so the image is still saved when no guess exists
 *    on the server (e.g. a URL that only works with its query string);
 *  - each extra `data-*` URL on an `<img>` and each plain link to an image
 *    file: a single-entry list.
 * For every list, alternatives are tried in order and the first successful
 * download wins. If an alternative is already in `downloadedSet`, the image
 * was handled on an earlier encounter and the rest of the list is skipped, so
 * thumbnails are not fetched for images already dealt with.
 *
 * @param {string} pageUrl - URL of the page; also the base for relative URLs.
 * @param {string} outputDir - Directory passed through to `downloadImage`.
 * @param {Set<string>} downloadedSet - Shared set of attempted URLs, passed
 *   through to (and mutated by) `downloadImage`.
 * @param {(message: string) => void} logFn - Receives progress/error messages.
 * @returns {Promise<number>} Number of images downloaded from this page;
 *   0 when the page itself could not be fetched.
 */
export async function scrapeForumPage(pageUrl, outputDir, downloadedSet, logFn) {
  logFn(`Fetching page: ${pageUrl}`);

  let html;
  try {
    const resp = await fetch(pageUrl, {
      headers: { 'User-Agent': UA },
      signal: AbortSignal.timeout(15000),
    });
    if (!resp.ok) {
      logFn(`Failed to fetch page (${resp.status})`);
      return 0;
    }
    html = await resp.text();
  } catch (err) {
    logFn(`Failed to fetch page: ${err.message}`);
    return 0;
  }

  const $ = cheerio.load(html);

  // Try known content selectors, fall back to whole page
  const selectors = '.message-body, .post-body, .post_body, .postcontent, .messageContent, .bbWrapper, article, .entry-content, .post_message, .post-content, #posts, .threadBody';
  let contentAreas = $(selectors).toArray();
  if (contentAreas.length === 0) {
    contentAreas = [$.root().get(0)];
  }

  // One entry per image: alternative URLs, most preferred first.
  const imageGroups = [];

  for (const area of contentAreas) {
    const $area = $(area);

    // Pass 1: <img> tags
    $area.find('img').each((_, el) => {
      const $img = $(el);
      const src = $img.attr('src') || $img.attr('data-src') || $img.attr('data-url') || '';
      if (!src) return;

      let absSrc;
      try {
        absSrc = new URL(src, pageUrl).href;
      } catch {
        return;
      }

      // A surrounding link that points straight at an image wins over the <img> itself.
      const $parentA = $img.closest('a');
      if ($parentA.length && $parentA.attr('href')) {
        try {
          const aHref = new URL($parentA.attr('href'), pageUrl).href;
          if (isImageUrl(aHref)) {
            imageGroups.push([aHref]);
            return;
          }
        } catch {
          /* unparseable href: treat the <img> as unlinked */
        }
      }

      // Full-size guesses first, the URL actually present in the page last.
      // The Set drops duplicates so a repeated guess cannot end the group early.
      imageGroups.push([...new Set([...tryFullSizeUrl(absSrc), absSrc])]);

      // Also check data attributes
      for (const attr of ['data-src', 'data-url', 'data-orig', 'data-original', 'data-full-url', 'data-zoom-src']) {
        const val = $img.attr(attr);
        if (val && val !== src) {
          try {
            imageGroups.push([new URL(val, pageUrl).href]);
          } catch {
            /* unparseable attribute value: skip it */
          }
        }
      }
    });

    // Pass 2: <a href> pointing directly to images (no child <img>)
    $area.find('a[href]').each((_, el) => {
      const $a = $(el);
      if ($a.find('img').length) return;
      try {
        const href = new URL($a.attr('href'), pageUrl).href;
        if (isImageUrl(href)) imageGroups.push([href]);
      } catch {
        /* unparseable href: skip it */
      }
    });
  }

  const candidateCount = imageGroups.reduce((total, group) => total + group.length, 0);
  logFn(`Found ${candidateCount} candidate URLs`);

  let count = 0;
  for (const group of imageGroups) {
    for (const candidate of group) {
      // Already attempted earlier: this image is handled, do not fall back further.
      if (downloadedSet.has(candidate)) break;
      if (await downloadImage(candidate, outputDir, downloadedSet, logFn)) {
        count++;
        break;
      }
    }
  }

  logFn(`${count} images from this page`);
  return count;
}