Skip to content

Commit d5c5c15

Browse files
committed
feat: enhance content extraction settings with scraper selection
- Introduce content scraper options (auto, defuddle, readability) for improved article parsing.
- Update content extraction logic to respect user preferences for scraper selection.
- Ensure backward compatibility by defaulting to "auto" if no scraper is set.
1 parent 10b27dc commit d5c5c15

File tree

8 files changed

+198
-37
lines changed

8 files changed

+198
-37
lines changed

src/contents/index.ts

Lines changed: 55 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -312,41 +312,63 @@ browser.runtime.onMessage.addListener(
312312
}
313313
}
314314

315-
// Parse with defuddle first (better GitHub support, more tokens)
316-
console.log("[Content Script] Parsing article with defuddle...")
315+
// Parse content based on user's scraper preference
316+
const scraper = effectiveConfig.contentScraper || "auto"
317+
console.log(`[Content Script] Using scraper: ${scraper}`)
318+
317319
let readableText = ""
318320
let pageTitle = ""
319321
let defuddleResult: ReturnType<Defuddle["parse"]> | null = null
320322

321-
try {
322-
const defuddle = new Defuddle(document, {
323-
markdown: true,
324-
separateMarkdown: false,
325-
removeExactSelectors: true // Remove ads and social buttons
326-
})
327-
defuddleResult = defuddle.parse()
323+
// Try Defuddle if user selected "auto" or "defuddle"
324+
if (scraper === "auto" || scraper === "defuddle") {
325+
console.log("[Content Script] Parsing article with Defuddle...")
326+
try {
327+
const defuddle = new Defuddle(document, {
328+
markdown: true,
329+
separateMarkdown: false,
330+
removeExactSelectors: true // Remove ads and social buttons
331+
})
332+
defuddleResult = defuddle.parse()
328333

329-
// Prefer markdown if available, otherwise use HTML content
330-
readableText =
331-
defuddleResult?.contentMarkdown || defuddleResult?.content || ""
332-
readableText = normalizeWhitespaceForLLM(readableText)
333-
pageTitle = defuddleResult?.title || ""
334+
// Prefer markdown if available, otherwise use HTML content
335+
readableText =
336+
defuddleResult?.contentMarkdown || defuddleResult?.content || ""
337+
readableText = normalizeWhitespaceForLLM(readableText)
338+
pageTitle = defuddleResult?.title || ""
334339

335-
console.log(
336-
`[Content Script] defuddle extracted ${readableText.length} chars (${defuddleResult?.contentMarkdown ? "markdown" : "HTML"})`
337-
)
338-
} catch (error) {
339-
console.warn(
340-
"[Content Script] defuddle failed, falling back to Readability:",
341-
error
342-
)
340+
console.log(
341+
`[Content Script] Defuddle extracted ${readableText.length} chars (${defuddleResult?.contentMarkdown ? "markdown" : "HTML"})`
342+
)
343+
} catch (error) {
344+
console.warn("[Content Script] Defuddle failed:", error)
345+
// If user selected "defuddle" only, don't fallback
346+
if (scraper === "defuddle") {
347+
console.error(
348+
"[Content Script] Defuddle-only mode failed, no fallback available"
349+
)
350+
}
351+
}
343352
}
344353

345-
// Fallback to Readability if defuddle failed or returned minimal content
346-
if (!readableText || readableText.trim().length < 100) {
347-
console.log(
348-
"[Content Script] defuddle returned minimal content, trying Readability..."
349-
)
354+
// Try Readability if:
355+
// 1. User selected "readability" only
356+
// 2. User selected "auto" and defuddle failed or returned minimal content
357+
if (
358+
scraper === "readability" ||
359+
(scraper === "auto" &&
360+
(!readableText || readableText.trim().length < 100))
361+
) {
362+
if (scraper === "auto") {
363+
console.log(
364+
"[Content Script] Defuddle returned minimal content, trying Readability..."
365+
)
366+
} else {
367+
console.log(
368+
"[Content Script] Parsing article with Readability..."
369+
)
370+
}
371+
350372
try {
351373
const article = new Readability(
352374
document.cloneNode(true) as Document
@@ -356,8 +378,11 @@ browser.runtime.onMessage.addListener(
356378
const normalizedReadability =
357379
normalizeWhitespaceForLLM(readabilityText)
358380

359-
// Use Readability result if it's better than defuddle
381+
// Use Readability result if:
382+
// - User selected "readability" only, OR
383+
// - Auto mode and Readability is better than defuddle
360384
if (
385+
scraper === "readability" ||
361386
normalizedReadability.length > readableText.length ||
362387
readableText.trim().length < 50
363388
) {
@@ -367,15 +392,12 @@ browser.runtime.onMessage.addListener(
367392
)
368393
}
369394

370-
// Use Readability title if defuddle didn't provide one
395+
// Use Readability title if not already set
371396
if (!pageTitle && article?.title) {
372397
pageTitle = article.title
373398
}
374399
} catch (error) {
375-
console.error(
376-
"[Content Script] Readability fallback failed:",
377-
error
378-
)
400+
console.error("[Content Script] Readability failed:", error)
379401
}
380402
}
381403

src/features/model/components/content-extraction-settings.tsx

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,13 @@ const useContentExtractionConfig = () => {
3030
[setConfig]
3131
)
3232

33+
// Ensure contentScraper is set (for backward compatibility)
34+
useEffect(() => {
35+
if (!config.contentScraper) {
36+
updateConfig({ contentScraper: "auto" })
37+
}
38+
}, [config.contentScraper, updateConfig])
39+
3340
return [config, updateConfig] as const
3441
}
3542

src/features/model/components/global-settings.tsx

Lines changed: 101 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,11 +19,24 @@ import { Separator } from "@/components/ui/separator"
1919
import { Slider } from "@/components/ui/slider"
2020
import { Switch } from "@/components/ui/switch"
2121
import {
22+
CONTENT_SCRAPER_OPTIONS,
2223
SCROLL_STRATEGY_DESCRIPTIONS,
2324
SCROLL_STRATEGY_OPTIONS
2425
} from "@/lib/constants-ui"
25-
import { Settings, Target, Zap } from "@/lib/lucide-icon"
26-
import type { ContentExtractionConfig, ScrollStrategy } from "@/types"
26+
import {
27+
BookOpen,
28+
Code,
29+
FileText,
30+
Settings,
31+
Sparkles,
32+
Target,
33+
Zap
34+
} from "@/lib/lucide-icon"
35+
import type {
36+
ContentExtractionConfig,
37+
ContentScraper,
38+
ScrollStrategy
39+
} from "@/types"
2740
import { TIMEOUT_FIELDS } from "./content-extraction-constants"
2841

2942
interface GlobalSettingsProps {
@@ -60,6 +73,85 @@ export const GlobalSettings = ({ config, onUpdate }: GlobalSettingsProps) => {
6073
</div>
6174
)
6275

76+
// Get icon for scraper type
77+
const getScraperIcon = (scraper: ContentScraper) => {
78+
switch (scraper) {
79+
case "auto":
80+
return Sparkles
81+
case "defuddle":
82+
return Code
83+
case "readability":
84+
return BookOpen
85+
}
86+
}
87+
88+
// Render content scraper selection
89+
const renderContentScraperSelect = (
90+
value: ContentScraper,
91+
onValueChange: (value: ContentScraper) => void
92+
) => (
93+
<div className="space-y-3">
94+
<Label className="flex items-center gap-2 text-sm font-medium">
95+
<FileText className="h-4 w-4" />
96+
Content Scraper
97+
</Label>
98+
<div className="grid gap-3">
99+
{CONTENT_SCRAPER_OPTIONS.map((option) => {
100+
const Icon = getScraperIcon(option.value)
101+
const isSelected = value === option.value
102+
return (
103+
<button
104+
key={option.value}
105+
type="button"
106+
onClick={() => onValueChange(option.value)}
107+
className={`
108+
group relative flex items-start gap-3 rounded-lg border p-4 text-left transition-all
109+
hover:bg-accent/50 hover:border-accent-foreground/20
110+
${
111+
isSelected
112+
? "border-primary bg-accent/30 ring-1 ring-primary shadow-sm"
113+
: "border-border"
114+
}
115+
`}>
116+
<div
117+
className={`
118+
flex h-10 w-10 shrink-0 items-center justify-center rounded-md transition-colors
119+
${isSelected ? "bg-primary text-primary-foreground" : "bg-muted group-hover:bg-muted/80"}
120+
`}>
121+
<Icon className="h-5 w-5" />
122+
</div>
123+
<div className="flex-1 space-y-1.5 min-w-0">
124+
<div className="flex items-center gap-2 flex-wrap">
125+
<span className="font-semibold text-sm">{option.label}</span>
126+
{option.recommended && (
127+
<Badge
128+
variant="default"
129+
className="text-[10px] h-5 px-1.5 font-medium">
130+
Recommended
131+
</Badge>
132+
)}
133+
{isSelected && (
134+
<Badge
135+
variant="secondary"
136+
className="text-[10px] h-5 px-1.5">
137+
Active
138+
</Badge>
139+
)}
140+
</div>
141+
<p className="text-xs font-medium text-foreground/80">
142+
{option.description}
143+
</p>
144+
<p className="text-xs text-muted-foreground leading-relaxed">
145+
{option.detail}
146+
</p>
147+
</div>
148+
</button>
149+
)
150+
})}
151+
</div>
152+
</div>
153+
)
154+
63155
// Render scroll strategy select
64156
const renderScrollStrategySelect = (
65157
value: ScrollStrategy,
@@ -162,6 +254,13 @@ export const GlobalSettings = ({ config, onUpdate }: GlobalSettingsProps) => {
162254

163255
<Separator />
164256

257+
{/* Content Scraper Selection */}
258+
{renderContentScraperSelect(config.contentScraper, (value) =>
259+
onUpdate({ contentScraper: value })
260+
)}
261+
262+
<Separator />
263+
165264
{/* Scroll Strategy */}
166265
{renderScrollStrategySelect(config.scrollStrategy, (value) =>
167266
onUpdate({ scrollStrategy: value })

src/lib/constants-ui.ts

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {
1010
type LucideIcon,
1111
Twitter
1212
} from "@/lib/lucide-icon"
13-
import type { ScrollStrategy } from "@/types"
13+
import type { ContentScraper, ScrollStrategy } from "@/types"
1414

1515
export interface SocialLink {
1616
label: string
@@ -79,6 +79,34 @@ export const SCROLL_STRATEGY_DESCRIPTIONS: Record<ScrollStrategy, string> = {
7979
"Intelligent scrolling with content detection, best for dynamic content"
8080
}
8181

82+
// Content scraper options
83+
export const CONTENT_SCRAPER_OPTIONS = [
84+
{
85+
value: "auto" as ContentScraper,
86+
label: "Auto",
87+
description: "Smart fallback: Defuddle → Readability",
88+
detail:
89+
"Best for most websites. Tries Defuddle first for better markdown and code formatting, falls back to Readability if needed.",
90+
recommended: true
91+
},
92+
{
93+
value: "defuddle" as ContentScraper,
94+
label: "Defuddle",
95+
description: "Technical & code-heavy content",
96+
detail:
97+
"Best for GitHub, documentation, Stack Overflow, and developer blogs. Preserves code blocks and markdown structure.",
98+
recommended: false
99+
},
100+
{
101+
value: "readability" as ContentScraper,
102+
label: "Readability",
103+
description: "Articles & blog posts",
104+
detail:
105+
"Best for news articles, Medium posts, and traditional blog content. Mozilla's proven algorithm for clean text extraction.",
106+
recommended: false
107+
}
108+
]
109+
82110
export const GUIDES = [
83111
{
84112
label: "Ollama Client Setup Guide",

src/lib/constants.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -164,6 +164,7 @@ export const DEFAULT_MODEL_CONFIG: ModelConfig = {
164164

165165
export const DEFAULT_CONTENT_EXTRACTION_CONFIG: ContentExtractionConfig = {
166166
enabled: true,
167+
contentScraper: "auto", // Try defuddle first, then readability
167168
excludedUrlPatterns: DEFAULT_EXCLUDE_URLS,
168169
scrollStrategy: "smart",
169170
scrollDepth: 0.8, // 80% of page

src/lib/lucide-icon.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ import ChevronsUpDown from "lucide-react/dist/esm/icons/chevrons-up-down"
1515
import Circle from "lucide-react/dist/esm/icons/circle"
1616
import CircleCheck from "lucide-react/dist/esm/icons/circle-check"
1717
import Clock from "lucide-react/dist/esm/icons/clock"
18+
import Code from "lucide-react/dist/esm/icons/code"
1819
import Copy from "lucide-react/dist/esm/icons/copy"
1920
import CopyCheck from "lucide-react/dist/esm/icons/copy-check"
2021
import Cpu from "lucide-react/dist/esm/icons/cpu"
@@ -134,6 +135,7 @@ export {
134135
Sun,
135136
Copy,
136137
Check,
138+
Code,
137139
ChevronDown,
138140
RefreshCcw,
139141
Circle,

src/options/components/ollama-options.tsx

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ export const OllamaOptions = () => {
3636
content: <ModelPullPanel />
3737
},
3838
contentExtraction: {
39-
label: "Content Extraction",
39+
label: "Extraction",
4040
content: <ContentExtractionSettings />
4141
},
4242
voices: {

src/types/index.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -317,9 +317,11 @@ export interface ChatInput {
317317
}
318318

319319
export type ScrollStrategy = "none" | "gradual" | "instant" | "smart"
320+
export type ContentScraper = "auto" | "defuddle" | "readability"
320321

321322
export interface ContentExtractionConfig {
322323
enabled: boolean
324+
contentScraper: ContentScraper // Which scraper to use: auto (try defuddle then readability), defuddle, or readability
323325
excludedUrlPatterns: string[] // URL patterns to exclude from extraction
324326
scrollStrategy: ScrollStrategy
325327
scrollDepth: number // 0-1 (percentage of page)

0 commit comments

Comments
 (0)