Make detectMaxPage thread-scoped to avoid sidebar false positives
The previous fallback scanned for any anchor whose text was a number. That also matched sidebar widget elements (online count, trending threads, etc.) and inflated the page count: a sidebar anchor showing "26" caused detectMaxPage to report 26 pages on threads that were actually 12 and 8 pages long. Now we derive the thread's URL prefix from the input baseUrl and only count page-N references in hrefs that match that thread, ignoring sidebar references to unrelated threads. The bare numeric-anchor scan is dropped.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+23
-18
@@ -89,29 +89,34 @@ export async function detectMaxPage(baseUrl, logFn, cookies, userAgent, fsSessio
|
|||||||
}
|
}
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
|
|
||||||
|
// Derive the thread's URL prefix so we only count pagination that belongs
|
||||||
|
// to THIS thread. XenForo sidebars/widgets contain page-N references for
|
||||||
|
// unrelated threads, and there are also bare numeric anchors (online count,
|
||||||
|
// trending widgets) that look like page numbers but aren't.
|
||||||
|
let threadPrefix = '';
|
||||||
|
try {
|
||||||
|
const u = new URL(baseUrl);
|
||||||
|
const m = u.pathname.match(/^(\/threads\/[^\/]+)/);
|
||||||
|
if (m) threadPrefix = m[1];
|
||||||
|
} catch {}
|
||||||
|
|
||||||
let maxPage = 1;
|
let maxPage = 1;
|
||||||
$('a.pageNav-page, .pageNav a[href*="page-"], .pagination a[href*="page-"]').each((_, el) => {
|
if (threadPrefix) {
|
||||||
const href = $(el).attr('href') || '';
|
const escaped = threadPrefix.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||||||
const m = href.match(/page-(\d+)/);
|
const ownRe = new RegExp(escaped + '/page-(\\d+)', 'g');
|
||||||
if (m) maxPage = Math.max(maxPage, parseInt(m[1], 10));
|
|
||||||
});
|
|
||||||
$('a').each((_, el) => {
|
|
||||||
const text = $(el).text().trim();
|
|
||||||
if (/^\d+$/.test(text)) {
|
|
||||||
const n = parseInt(text, 10);
|
|
||||||
if (n > maxPage && n < 10000) maxPage = n;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
// Final fallback: scan raw HTML for any page-N references (XenForo's
|
|
||||||
// serialized pagination sometimes only appears in href attributes that
|
|
||||||
// cheerio's class-based selectors miss).
|
|
||||||
if (maxPage === 1) {
|
|
||||||
const re = /page-(\d+)/g;
|
|
||||||
let m;
|
let m;
|
||||||
while ((m = re.exec(html)) !== null) {
|
while ((m = ownRe.exec(html)) !== null) {
|
||||||
const n = parseInt(m[1], 10);
|
const n = parseInt(m[1], 10);
|
||||||
if (n > maxPage && n < 10000) maxPage = n;
|
if (n > maxPage && n < 10000) maxPage = n;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// No thread prefix (caller passed a non-thread URL) — fall back to the
|
||||||
|
// narrow class-based selectors only, NOT the bare numeric-anchor scan.
|
||||||
|
$('a.pageNav-page, .pageNav a[href*="page-"], .pagination a[href*="page-"]').each((_, el) => {
|
||||||
|
const href = $(el).attr('href') || '';
|
||||||
|
const m = href.match(/page-(\d+)/);
|
||||||
|
if (m) maxPage = Math.max(maxPage, parseInt(m[1], 10));
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (maxPage > 1) {
|
if (maxPage > 1) {
|
||||||
|
|||||||
Reference in New Issue
Block a user