diff --git a/server/scrapers/forum.js b/server/scrapers/forum.js index a78301d..df8d69f 100644 --- a/server/scrapers/forum.js +++ b/server/scrapers/forum.js @@ -89,29 +89,34 @@ export async function detectMaxPage(baseUrl, logFn, cookies, userAgent, fsSessio } const $ = cheerio.load(html); + // Derive the thread's URL prefix so we only count pagination that belongs + // to THIS thread. XenForo sidebars/widgets contain page-N references for + // unrelated threads, and there are also bare numeric anchors (online count, + // trending widgets) that look like page numbers but aren't. + let threadPrefix = ''; + try { + const u = new URL(baseUrl); + const m = u.pathname.match(/^(\/threads\/[^\/]+)/); + if (m) threadPrefix = m[1]; + } catch {} + let maxPage = 1; - $('a.pageNav-page, .pageNav a[href*="page-"], .pagination a[href*="page-"]').each((_, el) => { - const href = $(el).attr('href') || ''; - const m = href.match(/page-(\d+)/); - if (m) maxPage = Math.max(maxPage, parseInt(m[1], 10)); - }); - $('a').each((_, el) => { - const text = $(el).text().trim(); - if (/^\d+$/.test(text)) { - const n = parseInt(text, 10); - if (n > maxPage && n < 10000) maxPage = n; - } - }); - // Final fallback: scan raw HTML for any page-N references (XenForo's - // serialized pagination sometimes only appears in href attributes that - // cheerio's class-based selectors miss). - if (maxPage === 1) { - const re = /page-(\d+)/g; + if (threadPrefix) { + const escaped = threadPrefix.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + const ownRe = new RegExp(escaped + '/page-(\\d+)', 'g'); let m; - while ((m = re.exec(html)) !== null) { + while ((m = ownRe.exec(html)) !== null) { const n = parseInt(m[1], 10); if (n > maxPage && n < 10000) maxPage = n; } + } else { + // No thread prefix (caller passed a non-thread URL) — fall back to the + // narrow class-based selectors only, NOT the bare numeric-anchor scan. + $('a.pageNav-page, .pageNav a[href*="page-"], .pagination a[href*="page-"]').each((_, el) => { + const href = $(el).attr('href') || ''; + const m = href.match(/page-(\d+)/); + if (m) maxPage = Math.max(maxPage, parseInt(m[1], 10)); + }); } if (maxPage > 1) {