Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
c048079
Initial plan
Copilot Feb 23, 2026
4594181
feat: translate data-translate spans before writing article HTML files
Copilot Feb 23, 2026
7b0dcf3
feat: fix Unknown author/party sentinel, dedup motions, content-based…
Copilot Feb 23, 2026
c62a491
feat: add MCP fail-fast (--require-mcp) and article quality validatio…
Copilot Feb 23, 2026
95867b9
fix: address all six reviewer comments on PR #459
Copilot Feb 23, 2026
dd7cd78
fix: resolve merge conflicts for 4 files – take origin/main base + la…
Copilot Feb 23, 2026
cd339b2
merge: resolve conflicts with origin/main — keep branch additions + i…
Copilot Feb 23, 2026
d202fa2
merge: resolve conflicts with origin/main — keep translation post-pro…
Copilot Feb 23, 2026
977a8fd
fix: remove unused qualityScores field from GenerationStats interface
Copilot Feb 23, 2026
aea5413
merge: sync with origin/main — add chamber-debates articles and updat…
Copilot Feb 23, 2026
e4600ba
merge: fix remaining 5 generated files to exactly match origin/main
Copilot Feb 23, 2026
a68b459
merge: sync with origin/main (evening-analysis PR + updated indexes/s…
Copilot Feb 23, 2026
745075c
Merge branch 'main' into copilot/eliminate-untranslated-swedish-content
pethers Feb 23, 2026
1948dc0
Merge branch 'main' into copilot/eliminate-untranslated-swedish-content
pethers Feb 23, 2026
79613d2
fix: address reviewer comments on translateSwedishContent and writeSi…
Copilot Feb 23, 2026
176c78b
Merge branch 'main' into copilot/eliminate-untranslated-swedish-content
pethers Feb 23, 2026
759e803
Update sitemap.xml
pethers Feb 23, 2026
c50cc91
Merge branch 'main' into copilot/eliminate-untranslated-swedish-content
pethers Feb 23, 2026
3690f0f
refactor: extract TRANSLATABLE_SV_SPAN_REGEX to named constant + fix …
Copilot Feb 23, 2026
aa7ade2
Merge branch 'main' into copilot/eliminate-untranslated-swedish-content
pethers Feb 23, 2026
86d96db
refactor: use TRANSLATABLE_SV_SPAN_REGEX directly in html.replace (no…
Copilot Feb 23, 2026
ae2af5d
Merge branch 'main' into copilot/eliminate-untranslated-swedish-content
pethers Feb 24, 2026
e5dab26
fix: push qualityScore before writeArticle in writeSingleArticle
Copilot Feb 24, 2026
77d4f10
Add Swedish→14-language translation post-processing to eliminate untr…
Copilot Feb 24, 2026
4428eee
Update print statement from 'Hello' to 'Goodbye'
pethers Feb 24, 2026
e0a36e2
Merge branch 'main' into copilot/eliminate-untranslated-swedish-content
pethers Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
356 changes: 327 additions & 29 deletions scripts/data-transformers.ts

Large diffs are not rendered by default.

213 changes: 198 additions & 15 deletions scripts/generate-news-enhanced.ts
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ import { generateMonthAhead } from './news-types/month-ahead.js';
import { generateWeeklyReview } from './news-types/weekly-review.js';
import { generateMonthlyReview } from './news-types/monthly-review.js';
import { generateBreakingNews } from './news-types/breaking-news.js';
import { translateTerm, translatePhrase } from './translation-dictionary.js';
Comment thread
github-code-quality[bot] marked this conversation as resolved.
Fixed
import type { Language } from './types/language.js';
import type { ArticleCategory } from './types/article.js';
import type {
Expand Down Expand Up @@ -245,6 +246,19 @@ const batchSizeArg: string | undefined = args.find(arg => arg.startsWith('--batc
const skipExistingArg: boolean = args.includes('--skip-existing');
const batchSize: number = batchSizeArg ? parseInt(batchSizeArg.split('=')[1] ?? '0', 10) : 0;

// --require-mcp flag: when true (default), abort if the MCP server is unreachable after all retries.
// Pass --require-mcp=false for local development/testing without a live MCP server.
// Only the bare flag or the `--require-mcp=<value>` form is recognised, so an unrelated
// flag that merely shares the prefix can never switch fail-fast off by accident.
const requireMcpArg: string | undefined = args.find(
  arg => arg === '--require-mcp' || arg.startsWith('--require-mcp=')
);
// Anything other than the literal value "false" (bare flag, no flag, other values) keeps fail-fast on.
const requireMcp: boolean = requireMcpArg?.split('=')[1] !== 'false';

// --quality-threshold=<n>: minimum quality score (0-100) an article must achieve.
// The run exits with code 2 if ALL generated articles score below this threshold.
const qualityThresholdArg: string | undefined = args.find(arg => arg.startsWith('--quality-threshold='));
export const DEFAULT_QUALITY_THRESHOLD = 40;
// Parse explicitly instead of using `parseInt(...) || default`: 0 is a legitimate
// threshold (it disables the gate) and must not be replaced by the default.
const parsedQualityThreshold: number = Number.parseInt(
  qualityThresholdArg?.slice('--quality-threshold='.length) ?? '',
  10
);
// Missing or non-numeric values fall back to the default; numeric values are
// clamped to the documented 0-100 range so the gate can always be satisfied.
export const QUALITY_THRESHOLD: number = Number.isNaN(parsedQualityThreshold)
  ? DEFAULT_QUALITY_THRESHOLD
  : Math.min(100, Math.max(0, parsedQualityThreshold));

// ---------------------------------------------------------------------------
// Valid article types
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -380,7 +394,12 @@ async function getSharedClient(): Promise<MCPClient> {
console.log(` 📊 Last sync: ${status['last_sync'] as string}`);
}
} catch (error: unknown) {
console.warn(`⚠️ MCP warm-up failed: ${(error as Error).message}`);
const message = (error as Error).message;
if (requireMcp) {
sharedClient = null;
throw new Error(`MCP server unavailable: ${message}`, { cause: error });
}
console.warn(`⚠️ MCP warm-up failed: ${message}`);
console.warn(' Continuing anyway — individual requests will retry with backoff');
}

Expand All @@ -400,11 +419,12 @@ if (!fs.existsSync(METADATA_DIR)) {
// Generation statistics
// ---------------------------------------------------------------------------

const stats: { generated: number; errors: number; articles: string[]; timestamp: string } = {
const stats: { generated: number; errors: number; articles: string[]; timestamp: string; qualityScores: number[] } = {
generated: 0,
errors: 0,
articles: [],
timestamp: new Date().toISOString()
timestamp: new Date().toISOString(),
qualityScores: []
};

// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -450,12 +470,161 @@ async function writeArticle(html: string, filename: string): Promise<boolean> {
return true;
}

// ---------------------------------------------------------------------------
// Translation post-processing
// ---------------------------------------------------------------------------

/**
 * Resolve every `<span data-translate="true" lang="sv">…</span>` marker left
 * in an article so that no marker reaches the file written to disk.
 *
 * - Target `sv`: the text stays wrapped in `<span lang="sv">`, only the
 *   `data-translate` marker is dropped.
 * - Any other target: the whole span is replaced by whatever
 *   `translatePhrase()` returns for the span's inner text.
 *
 * Both attribute orders (`data-translate` before or after `lang`) are matched.
 * Per the original author's note, span content is expected to be HTML-escaped
 * upstream, so the non-greedy match is not meant to handle nested tags.
 *
 * @param html - Full article HTML
 * @param targetLang - Target language code (e.g. 'de', 'sv')
 * @returns The HTML with every matching span rewritten
 */
export function translateSwedishContent(html: string, targetLang: string): string {
  const keepSwedish = targetLang === 'sv';

  // The two lookaheads require both attributes to be present, in either order.
  const markedSpanPattern = /<span\s+(?=[^>]*data-translate="true")(?=[^>]*lang="sv")[^>]*>([\s\S]*?)<\/span>/g;

  const rewriteSpan = (_wholeSpan: string, swedishText: string): string =>
    keepSwedish
      ? `<span lang="sv">${swedishText}</span>`
      : translatePhrase(swedishText, targetLang as Language);

  return html.replace(markedSpanPattern, rewriteSpan);
}

// ---------------------------------------------------------------------------
// Article quality validation
// ---------------------------------------------------------------------------

/**
 * Result shape returned by validateArticleQuality.
 */
export interface ArticleQualityReport {
  /** Sum of the word, section and translation sub-scores. */
  qualityScore: number;
  /** Number of whitespace-separated tokens longer than one character after tags are stripped. */
  wordCount: number;
  /** Number of `<h2>` opening tags found in the article HTML. */
  analyticalSections: number;
  /** Number of spans still carrying `data-translate="true"`; always 0 for `sv` articles. */
  untranslatedSpans: number;
  /** True when `qualityScore` is lower than `QUALITY_THRESHOLD`. */
  belowThreshold: boolean;
}

/**
 * Score an article on a 0–100 scale across three metrics and log a one-line
 * summary to the console:
 *
 * | Metric                  | Max pts | Target                                                  |
 * |-------------------------|---------|---------------------------------------------------------|
 * | Word count              | 50      | ≥ 1 000 words = full score                              |
 * | Analytical h2 sections  | 30      | ≥ 3 h2 tags = full score                                |
 * | Translation completeness| 20      | 0 spans = full score (2 pts deducted per span, capped)  |
 *
 * Swedish (`sv`) articles are exempt from the translation penalty since
 * `data-translate="true"` is intentional in the Swedish edition.
 *
 * The contents of `<script>` and `<style>` elements (JSON-LD, inline CSS, …)
 * are ignored for all three metrics: they are never rendered as article text
 * and would otherwise inflate the word count.
 *
 * @param html - Full article HTML
 * @param lang - Article language (e.g. 'de', 'sv')
 * @param articleType - Article type slug (e.g. 'motions', 'propositions'); used for logging only
 * @param hint - Optional filename hint for console output
 * @returns The individual metrics, the combined score, and whether the score
 *          is below `QUALITY_THRESHOLD`
 */
export function validateArticleQuality(
  html: string,
  lang: string,
  articleType: string,
  hint = ''
): ArticleQualityReport {
  // Remove non-rendered element bodies first. This is a counting aid, not a
  // sanitizer: the result is only measured, never written anywhere.
  const visibleHtml = html.replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\b[^>]*>/gi, ' ');

  // Strip the remaining tags and collapse whitespace so words can be split on single spaces.
  const textOnly = visibleHtml.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
  // Tokens of length 1 (stray punctuation, bullets) are not counted as words.
  const wordCount = textOnly.split(' ').filter(w => w.length > 1).length;

  // Word score: proportional, capped at 50 pts (target: 1000 words)
  const wordScore = Math.min(50, Math.round((wordCount / 1000) * 50));

  // Section score: h2 heading count, proportional, capped at 30 pts (target: 3)
  const h2Count = (visibleHtml.match(/<h2[\s>]/gi) ?? []).length;
  const sectionScore = Math.min(30, Math.round((h2Count / 3) * 30));

  // Translation penalty: 2 pts per remaining span, capped at 20 pts.
  // sv articles are exempt — data-translate is intentional there.
  const untranslatedSpans = lang === 'sv'
    ? 0
    : (visibleHtml.match(/<span[^>]*data-translate="true"[^>]*>/gi) ?? []).length;
  const translationScore = Math.max(0, 20 - untranslatedSpans * 2);

  const qualityScore = wordScore + sectionScore + translationScore;
  const belowThreshold = qualityScore < QUALITY_THRESHOLD;

  const label = belowThreshold ? '⚠️ LOW' : '✅ OK';
  const fileHint = hint ? ` ${hint}` : '';
  console.log(
    `📊 Quality [${articleType}/${lang}]${fileHint}` +
    ` Words: ${wordCount} | h2 sections: ${h2Count}` +
    ` | Untranslated spans: ${untranslatedSpans} | Score: ${qualityScore}/100 ${label}`
  );

  return { qualityScore, wordCount, analyticalSections: h2Count, untranslatedSpans, belowThreshold };
}

// ---------------------------------------------------------------------------
// Content-based title extraction (#456)
// ---------------------------------------------------------------------------

/** Boilerplate prefixes to strip from Swedish document titles. */
const TOPIC_STRIP_RE = /^(?:Regeringens proposition\s+\d{4}\/\d{2}:\d+\s*|med anledning av prop\.?\s+\d{4}\/\d{2}:\d+\s*|om förslag till\s*|med anledning av\s*|angående\s*)/i;

/**
 * Extract a human-readable topic string from an array of documents.
 *
 * - Each document contributes `titel` (or, failing that, `title`), trimmed;
 *   documents without a non-blank title are ignored.
 * - A title that is NOT a "med anledning av prop." reference is preferred;
 *   if every title is such a reference, the first non-blank title is used.
 * - One leading Swedish boilerplate prefix is stripped; if stripping would
 *   leave nothing, the unstripped title is kept.
 * - Titles longer than `maxLength` are cut to `maxLength` UTF-16 code units
 *   (never in the middle of a surrogate pair) and an ellipsis is appended, so
 *   the returned string may be one character longer than `maxLength`.
 *
 * @param docs - Array of raw documents (any shape with `titel` / `title`)
 * @param maxLength - Max characters before truncation (default: 80)
 * @returns The cleaned topic, or '' when no document has a usable title
 */
export function extractTopicFromDocs(
  docs: Array<{ titel?: string; title?: string }>,
  maxLength = 80
): string {
  // Trim before selecting so blank or space-padded titles cannot defeat the
  // reference check below or shadow a later document that has a real title.
  const titles = docs
    .map(doc => (doc.titel || doc.title || '').trim())
    .filter(title => title.length > 0);

  // Prefer a title that is NOT a "med anledning av prop." reference.
  const raw = titles.find(title => !/^med anledning av prop\./i.test(title)) ?? titles[0];
  if (raw === undefined) return '';

  const cleaned = raw.replace(TOPIC_STRIP_RE, '').trim() || raw;
  if (cleaned.length <= maxLength) return cleaned;

  // Back off one code unit when the cut would separate a high surrogate from
  // its low surrogate, which would leave an unpaired (invalid) character.
  let end = Math.max(0, Math.trunc(maxLength));
  if (end > 0) {
    const lastUnit = cleaned.charCodeAt(end - 1);
    const nextUnit = cleaned.charCodeAt(end);
    const splitsPair =
      lastUnit >= 0xd800 && lastUnit <= 0xdbff && nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
    if (splitsPair) end -= 1;
  }
  return cleaned.slice(0, end).trimEnd() + '…';
}

/**
* Write article in specified language, applying translation post-processing
* and quality validation before writing to disk.
*/
async function writeSingleArticle(html: string, slug: string, lang: Language): Promise<string> {
const filename: string = `${slug}-${lang}.html`;
await writeArticle(html, filename);
const processedHtml = translateSwedishContent(html, lang);
const report = validateArticleQuality(processedHtml, lang, slug.replace(/^\d{4}-\d{2}-\d{2}-/, ''), filename);
await writeArticle(processedHtml, filename);
stats.qualityScores.push(report.qualityScore);
stats.generated += 1;
stats.articles.push(filename);
return filename;
Expand Down Expand Up @@ -491,25 +660,22 @@ async function generateWeekAhead(): Promise<GenerationResult> {
console.log(` 📊 Found ${events.length} events`);

// 2. Fetch upcoming/recent documents
const rawDocs = await Promise.resolve()
.then(() => client.searchDocuments({ from_date: dateRange.start, to_date: dateRange.end, limit: 30 }))
.catch(() => [] as unknown[]);
const rawDocs = await client.searchDocuments({ from_date: dateRange.start, to_date: dateRange.end, limit: 30 })
.catch((e: unknown) => { if (requireMcp) throw e; return [] as unknown[]; });
const documents: RawDocument[] = Array.isArray(rawDocs) ? rawDocs as RawDocument[] : [];
console.log(` 📊 Found ${documents.length} upcoming documents`);

// 3. Fetch parliamentary questions (fragor)
console.log(' 🔄 Fetching parliamentary questions...');
const rawQuestions = await Promise.resolve()
.then(() => client.fetchWrittenQuestions({ limit: 20 }))
.catch(() => [] as unknown[]);
const rawQuestions = await client.fetchWrittenQuestions({ limit: 20 })
.catch((e: unknown) => { if (requireMcp) throw e; return [] as unknown[]; });
const questions: unknown[] = Array.isArray(rawQuestions) ? rawQuestions : [];
console.log(` 📊 Found ${questions.length} written questions`);

// 4. Fetch interpellations (interpellationer)
console.log(' 🔄 Fetching interpellations...');
const rawInterpellations = await Promise.resolve()
.then(() => client.fetchInterpellations({ limit: 15 }))
.catch(() => [] as unknown[]);
const rawInterpellations = await client.fetchInterpellations({ limit: 15 })
.catch((e: unknown) => { if (requireMcp) throw e; return [] as unknown[]; });
const interpellations: unknown[] = Array.isArray(rawInterpellations) ? rawInterpellations : [];
console.log(` 📊 Found ${interpellations.length} interpellations`);

Expand Down Expand Up @@ -994,6 +1160,13 @@ async function generateNews(): Promise<typeof stats> {
stats.articles.forEach(article => console.log(` - ${article}`));
}

// Quality summary
if (stats.qualityScores.length > 0) {
const avg = Math.round(stats.qualityScores.reduce((a, b) => a + b, 0) / stats.qualityScores.length);
const below = stats.qualityScores.filter(s => s < QUALITY_THRESHOLD).length;
console.log(`\n📊 Quality summary: avg ${avg}/100 | ${below}/${stats.qualityScores.length} articles below threshold (${QUALITY_THRESHOLD})`);
}

if (remainingLangs.length > 0) {
console.log(`\n📦 Batch progress: ${completedLangs.length}/${allRequestedLanguages.length} languages done`);
console.log(` Remaining: ${remainingLangs.join(', ')}`);
Expand All @@ -1012,6 +1185,14 @@ async function generateNews(): Promise<typeof stats> {
if (import.meta.url === `file://${process.argv[1]}`) {
generateNews()
.then(result => {
// Exit 2 when every generated article scored below the quality threshold
const allBelowThreshold =
result.qualityScores.length > 0 &&
result.qualityScores.every(s => s < QUALITY_THRESHOLD);
if (allBelowThreshold) {
console.error(`❌ All articles scored below quality threshold (${QUALITY_THRESHOLD}). Exiting with code 2.`);
process.exit(2);
}
process.exit(result.errors > 0 ? 1 : 0);
})
.catch((error: unknown) => {
Expand All @@ -1036,5 +1217,7 @@ export {
ALL_LANGUAGES,
LANGUAGE_PRESETS,
formatDateForSlug,
getWeekAheadDateRange
getWeekAheadDateRange,
requireMcp,
requireMcpArg
};
Loading
Loading