1+ //
2+ // Cornucopia – (C) Dr. Lauer Information Technology
3+ //
4+ import Foundation
5+
6+ #if canImport(AppKit)
7+ import AppKit
8+ #elseif canImport(UIKit)
9+ import UIKit
10+ #endif
11+
public extension String {

    /// Returns a new string with HTML entities decoded.
    ///
    /// On platforms where `ObjectiveC` can be imported, the receiver is encoded as UTF-8,
    /// parsed as an HTML document by `NSAttributedString`, and the plain text of the result
    /// is returned. Because the input is treated as a full HTML document, markup in the
    /// string is interpreted as well, not only character references.
    /// If that import fails, and on all other platforms (e.g. Linux), a built-in decoder
    /// handling numeric references and a fixed table of common named entities is used.
    ///
    /// - Note: Apple's documentation states that the HTML importer should not be called
    ///   from a background thread. Callers on Apple platforms should take that into account.
    var CC_htmlDecoded: String {
        #if canImport(ObjectiveC)
        // Use NSAttributedString with HTML document type on Apple platforms.
        guard let data = self.data(using: .utf8) else { return self }

        let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
            .documentType: NSAttributedString.DocumentType.html,
            .characterEncoding: String.Encoding.utf8.rawValue
        ]

        if let attributed = try? NSAttributedString(data: data, options: options, documentAttributes: nil) {
            return attributed.string
        }

        // Fall back to manual decoding if NSAttributedString fails.
        return manualDecode()
        #else
        // Use manual decoding on non-Apple platforms (e.g., Linux).
        return manualDecode()
        #endif
    }

    /// Decodes character references of the form `&#123;`, `&#x7B;` / `&#X7B;` and `&name;`
    /// in a single left-to-right pass.
    ///
    /// - Every reference is decoded exactly once; text produced by a replacement is never
    ///   scanned again, so `&amp;lt;` yields `&lt;` and not `<`.
    /// - Numeric references that do not denote a valid Unicode scalar value (surrogates,
    ///   values above U+10FFFF) are replaced by U+FFFD.
    /// - Names missing from the table, numbers too large for `Int`, and anything that is not
    ///   a complete `&…;` reference are copied to the output unchanged.
    ///
    /// The implementation only uses the Swift standard library, so it behaves the same on
    /// every platform.
    ///
    /// - Returns: The decoded string.
    private func manualDecode() -> String {
        let namedEntities: [String: String] = [
            "amp": "&",
            "lt": "<",
            "gt": ">",
            "quot": "\"",
            "apos": "'",
            "nbsp": "\u{00A0}",
            "copy": "©",
            "reg": "®",
            "trade": "™",
            "euro": "€",
            "pound": "£",
            "yen": "¥",
            "cent": "¢",
            "sect": "§",
            "para": "¶",
            "middot": "·",
            "bull": "•",
            "hellip": "…",
            "prime": "′",
            "Prime": "″",
            "lsquo": "\u{2018}",
            "rsquo": "\u{2019}",
            "ldquo": "\u{201C}",
            "rdquo": "\u{201D}",
            "ndash": "–",
            "mdash": "—"
        ]

        // Scalars that may appear between `&` and `;` in a reference.
        func isReferenceScalar(_ scalar: Unicode.Scalar) -> Bool {
            switch scalar {
            case "0"..."9", "a"..."z", "A"..."Z", "#":
                return true
            default:
                return false
            }
        }

        // Maps the text between `&` and `;` to its replacement, or `nil` if it is not a
        // reference this decoder understands. `body` only contains reference scalars.
        func expansion(of body: Substring) -> String? {
            guard body.first == "#" else {
                return namedEntities[String(body)]
            }
            var digits = body.dropFirst()
            var radix = 10
            if let marker = digits.first, marker == "x" || marker == "X" {
                digits = digits.dropFirst()
                radix = 16
            }
            // `Int(_:radix:)` rejects empty input, stray `#`, letters that are not digits
            // of the radix, and values that overflow.
            guard let code = Int(digits, radix: radix) else { return nil }
            let scalar = Unicode.Scalar(code) ?? "\u{FFFD}"
            return String(Character(scalar))
        }

        // Work on the scalar view so that a combining mark following `;` cannot hide the
        // terminator inside a larger grapheme cluster.
        let source = unicodeScalars
        var output = String.UnicodeScalarView()
        var cursor = source.startIndex

        while let ampersand = source[cursor...].firstIndex(of: "&") {
            output.append(contentsOf: source[cursor..<ampersand])
            let bodyStart = source.index(after: ampersand)

            if let terminator = source[bodyStart...].firstIndex(where: { !isReferenceScalar($0) }),
               source[terminator] == ";",
               let replacement = expansion(of: self[bodyStart..<terminator]) {
                output.append(contentsOf: replacement.unicodeScalars)
                cursor = source.index(after: terminator)
            } else {
                // Not a decodable reference: keep the ampersand and resume right after it,
                // so a later `&` inside the skipped text is still examined.
                output.append("&")
                cursor = bodyStart
            }
        }

        output.append(contentsOf: source[cursor...])
        return String(output)
    }
}
0 commit comments