|
1 | 1 | import { expect, test } from '@playwright/test' |
2 | 2 |
|
/**
 * Phrases that identify a "soft 404": a page served with a 2xx status whose
 * body is really the site's not-found template. They are matched
 * case-insensitively, on word boundaries, against the page's visible text.
 */
const NOT_FOUND_PATTERNS = [
  'Page Not Found',
  'The page you\'re looking for doesn\'t exist',
  '404',
]

/** Upper bound on fetched URLs so a link cycle or a very large site cannot stall the suite. */
const MAX_PAGES = 250

/** One broken page found by the crawl. `status` is set only for non-2xx responses. */
interface CrawlError {
  url: string
  status?: number
  reason: string
}

/** The handful of HTML entities that routinely appear in hrefs and in short phrases. */
const BASIC_ENTITIES: Record<string, string> = {
  '&amp;': '&',
  '&quot;': '"',
  '&apos;': '\'',
  '&#39;': '\'',
  '&#x27;': '\'',
  '&nbsp;': ' ',
}

/** Decodes the entities listed in BASIC_ENTITIES in a single pass (no double decoding). */
function decodeBasicEntities(text: string): string {
  return text.replace(
    /&(?:amp|quot|apos|nbsp|#39|#x27);/gi,
    entity => BASIC_ENTITIES[entity.toLowerCase()] ?? entity,
  )
}

/** Escapes regular-expression metacharacters so `text` is matched literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}

// Built once: word boundaries stop '404' from matching inside '#404040',
// '14040', asset hashes and similar incidental digit runs.
const NOT_FOUND_MATCHERS = NOT_FOUND_PATTERNS.map(
  pattern => new RegExp(`\\b${escapeRegExp(pattern)}\\b`, 'i'),
)

/**
 * Reduces an HTML document to the text a reader would see: script/style
 * blocks, comments and tags are dropped, basic entities are decoded, curly
 * apostrophes are normalised and runs of whitespace are collapsed.
 */
function visibleText(html: string): string {
  const withoutMarkup = html
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, ' ')
    .replace(/<!--[\s\S]*?-->/g, ' ')
    .replace(/<[^>]+>/g, ' ')
  return decodeBasicEntities(withoutMarkup)
    .replace(/[\u2018\u2019]/g, '\'')
    .replace(/\s+/g, ' ')
}

/** Returns true when the visible text of `html` contains any not-found phrase. */
function looksLikeNotFoundPage(html: string): boolean {
  const text = visibleText(html)
  return NOT_FOUND_MATCHERS.some(matcher => matcher.test(text))
}

/** Returns the raw value of every double- or single-quoted `href` attribute in `html`. */
function extractHrefs(html: string): string[] {
  const hrefAttribute = /\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')/gi
  return Array.from(html.matchAll(hrefAttribute), match => match[1] ?? match[2] ?? '')
}

/**
 * Resolves `rawHref` (as found on `pageUrl`) to an absolute, fragment-free URL
 * when it points inside the site under test.
 *
 * @param rawHref Attribute value exactly as it appears in the markup.
 * @param pageUrl URL of the page containing the link; base for relative hrefs.
 * @param root    Crawl root built from the configured baseURL.
 * @returns The normalised URL, or `undefined` when the link must not be
 *          followed: empty, fragment-only, unparseable, non-http(s)
 *          (`mailto:`, `tel:`, `javascript:` ...), or pointing at another origin.
 */
function resolveInternalLink(rawHref: string, pageUrl: string, root: URL): string | undefined {
  const href = decodeBasicEntities(rawHref).trim()
  if (href === '' || href.startsWith('#'))
    return undefined

  let target: URL
  try {
    target = new URL(href, pageUrl)
  }
  catch {
    // An unparseable href cannot be requested at all.
    return undefined
  }

  if (target.protocol !== 'http:' && target.protocol !== 'https:')
    return undefined
  // Comparing origins (not string prefixes) rejects look-alike hosts such as
  // "https://site.example.evil.com" and protocol-relative links to other hosts.
  if (target.origin !== root.origin)
    return undefined

  // Explicit absolute links must additionally stay below the base path, matched
  // on a path-segment boundary so "/repo-other" is not mistaken for "/repo".
  // Relative links are deliberately not path-scoped: a root-relative link that
  // escapes the base path is usually a broken link worth fetching.
  if (/^https?:/i.test(href)) {
    const scopePath = root.pathname.replace(/\/$/, '')
    const inScope = target.pathname === scopePath || target.pathname.startsWith(`${scopePath}/`)
    if (!inScope)
      return undefined
  }

  // "/page#a" and "/page#b" are the same document; fetch it once.
  target.hash = ''
  return target.toString()
}

/**
 * Breadth-first crawl starting at the project's baseURL. Every internal URL
 * reachable through `href` attributes must answer with a 2xx status, and HTML
 * pages must not render the not-found template. All failures are collected
 * and reported together at the end. At most MAX_PAGES URLs are fetched.
 */
test('crawl internal links: fail on non-2xx and not-found content', async ({ request, baseURL }) => {
  if (!baseURL)
    throw new Error('project baseURL must be defined')

  const root = new URL(baseURL.endsWith('/') ? baseURL : `${baseURL}/`)
  root.hash = ''

  const queue: string[] = [root.toString()]
  // Everything ever enqueued, so a URL is fetched at most once.
  const seen = new Set<string>(queue)
  const errors: CrawlError[] = []
  let fetched = 0

  while (fetched < MAX_PAGES) {
    const url = queue.shift()
    if (url === undefined)
      break
    fetched++

    // Network-level failures (DNS, refused connection, timeout) reject; record
    // them as broken pages instead of aborting the whole crawl.
    const res = await request.get(url).catch(
      (error: unknown) => (error instanceof Error ? error : new Error(String(error))),
    )
    if (res instanceof Error) {
      errors.push({ url, reason: `request failed: ${res.message}` })
      continue
    }
    if (!res.ok()) {
      errors.push({ url, status: res.status(), reason: 'non-2xx response' })
      continue
    }

    // Stylesheets, icons, PDFs etc. only need the status check above.
    const contentType = res.headers()['content-type'] ?? ''
    if (!/html/i.test(contentType))
      continue

    const html = await res.text()
    if (looksLikeNotFoundPage(html)) {
      errors.push({ url, reason: 'not-found content detected' })
      continue
    }

    // Resolve against the final URL so relative links stay correct after a redirect.
    const pageUrl = res.url()
    for (const href of extractHrefs(html)) {
      const target = resolveInternalLink(href, pageUrl, root)
      if (target === undefined || seen.has(target))
        continue
      seen.add(target)
      queue.push(target)
    }
  }

  if (queue.length > 0)
    console.warn(`Crawl stopped at ${MAX_PAGES} pages; ${queue.length} queued URL(s) were not checked.`)

  // Asserting on the list itself puts every broken URL into the failure message.
  expect(errors, 'Broken pages detected').toEqual([])
})
0 commit comments