Skip to content

Commit 7a888b2

Browse files
committed
Add String+HTMLEntities
1 parent 93b1b78 commit 7a888b2

File tree

2 files changed

+183
-0
lines changed

2 files changed

+183
-0
lines changed
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
//
2+
// Cornucopia – (C) Dr. Lauer Information Technology
3+
//
4+
import Foundation
5+
6+
#if canImport(AppKit)
7+
import AppKit
8+
#elseif canImport(UIKit)
9+
import UIKit
10+
#endif
11+
12+
public extension String {

    /// Returns a new string with HTML entities decoded.
    ///
    /// When the Objective-C runtime is available, the string is parsed as an HTML
    /// document through `NSAttributedString` and the resulting plain text is returned.
    /// If that parse fails, or on platforms without the Objective-C runtime
    /// (e.g., Linux), a built-in single-pass decoder is used instead. The built-in
    /// decoder understands decimal (`&#64;`) and hexadecimal (`&#x40;`) character
    /// references plus a small table of common named entities; everything else is
    /// left untouched.
    ///
    /// - Note: NOTE(review): Apple's documentation states that the HTML importer of
    ///   `NSAttributedString` should not be called from a background thread, and it
    ///   interprets markup as well as entities — confirm that callers are fine
    ///   with both.
    var CC_htmlDecoded: String {
        #if canImport(ObjectiveC)
        // Use NSAttributedString with HTML document type on Apple platforms
        guard let data = self.data(using: .utf8) else { return self }

        let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
            .documentType: NSAttributedString.DocumentType.html,
            .characterEncoding: String.Encoding.utf8.rawValue
        ]

        if let attributed = try? NSAttributedString(data: data, options: options, documentAttributes: nil) {
            return attributed.string
        }

        // Fallback to manual decoding if NSAttributedString fails
        return manualDecode()
        #else
        // Use manual decoding on non-Apple platforms (e.g., Linux)
        return manualDecode()
        #endif
    }

    /// Named entities understood by `manualDecode()`.
    ///
    /// Non-ASCII values are written as Unicode escapes so that the table survives
    /// any re-encoding of the source file.
    private static let htmlNamedEntities: [String: String] = [
        "amp": "&",
        "lt": "<",
        "gt": ">",
        "quot": "\"",
        "apos": "'",
        "nbsp": "\u{00A0}",
        "copy": "\u{00A9}",
        "reg": "\u{00AE}",
        "trade": "\u{2122}",
        "euro": "\u{20AC}",
        "pound": "\u{00A3}",
        "yen": "\u{00A5}",
        "cent": "\u{00A2}",
        "sect": "\u{00A7}",
        "para": "\u{00B6}",
        "middot": "\u{00B7}",
        "bull": "\u{2022}",
        "hellip": "\u{2026}",
        "prime": "\u{2032}",
        "Prime": "\u{2033}",
        "lsquo": "\u{2018}",
        "rsquo": "\u{2019}",
        "ldquo": "\u{201C}",
        "rdquo": "\u{201D}",
        "ndash": "\u{2013}",
        "mdash": "\u{2014}"
    ]

    /// Returns `true` for the scalars that may appear between `&` and `;` in an
    /// entity: ASCII letters, ASCII digits, and `#`.
    private static func isHTMLEntityBodyScalar(_ scalar: Unicode.Scalar) -> Bool {
        switch scalar {
        case "a"..."z", "A"..."Z", "0"..."9", "#":
            return true
        default:
            return false
        }
    }

    /// Decodes the text between `&` and `;` of a single entity.
    ///
    /// - Parameter body: The entity body without the surrounding `&` and `;`,
    ///   e.g. `amp`, `#64` or `#x40`.
    /// - Returns: The decoded text, or `nil` if `body` is not a well-formed numeric
    ///   reference and not a known named entity. A well-formed numeric reference
    ///   that does not denote a valid Unicode scalar (surrogate, out of range,
    ///   overflow) decodes to U+FFFD.
    private static func decodeHTMLEntityBody(_ body: String) -> String? {
        guard body.hasPrefix("#") else {
            return htmlNamedEntities[body]
        }

        var digits = body.dropFirst()
        var radix = 10
        if let marker = digits.first, marker == "x" || marker == "X" {
            digits = digits.dropFirst()
            radix = 16
        }

        // Reject malformed references such as `&#;`, `&#x;` or `&#12ab;` so that
        // they are kept verbatim instead of being turned into U+FFFD.
        let isWellFormed = !digits.isEmpty && digits.allSatisfy {
            $0.isASCII && (radix == 16 ? $0.isHexDigit : $0.isNumber)
        }
        guard isWellFormed else { return nil }

        let replacementCharacter: Unicode.Scalar = "\u{FFFD}"
        guard let codePoint = UInt32(digits, radix: radix),
              let scalar = Unicode.Scalar(codePoint) else {
            return String(replacementCharacter)
        }
        return String(scalar)
    }

    /// Decodes HTML entities in a single left-to-right pass without any
    /// platform-specific API.
    ///
    /// Each entity is decoded exactly once: the output of a replacement is never
    /// re-scanned, so `&amp;lt;` yields `&lt;` rather than `<`. Text that merely
    /// looks similar to an entity (a bare `#123`, an `&` without a terminating
    /// `;`, an unknown name) is copied through unchanged.
    private func manualDecode() -> String {
        let scalars = unicodeScalars
        let ampersand: Unicode.Scalar = "&"
        let semicolon: Unicode.Scalar = ";"

        // Fast path: nothing to decode.
        guard scalars.contains(ampersand) else { return self }

        var decoded = String.UnicodeScalarView()
        var cursor = scalars.startIndex

        while let ampersandIndex = scalars[cursor...].firstIndex(of: ampersand) {
            // Copy everything before the ampersand verbatim.
            decoded.append(contentsOf: scalars[cursor..<ampersandIndex])

            // Collect the candidate entity body following the ampersand.
            let bodyStart = scalars.index(after: ampersandIndex)
            var bodyEnd = bodyStart
            while bodyEnd < scalars.endIndex, String.isHTMLEntityBodyScalar(scalars[bodyEnd]) {
                scalars.formIndex(after: &bodyEnd)
            }

            if bodyEnd < scalars.endIndex,
               scalars[bodyEnd] == semicolon,
               let replacement = String.decodeHTMLEntityBody(String(scalars[bodyStart..<bodyEnd])) {
                decoded.append(contentsOf: replacement.unicodeScalars)
                cursor = scalars.index(after: bodyEnd)
            } else {
                // Not an entity: keep the ampersand and continue right after it.
                decoded.append(ampersand)
                cursor = bodyStart
            }
        }

        decoded.append(contentsOf: scalars[cursor...])
        return String(decoded)
    }
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
//
2+
// Cornucopia – (C) Dr. Lauer Information Technology
3+
//
4+
import XCTest
5+
@testable import CornucopiaCore
6+
7+
/// Unit tests for `String.CC_htmlDecoded`.
final class StringHTMLEntitiesTests: XCTestCase {

    /// The five predefined XML entities decode to their literal characters.
    func testBasicHTMLEntities() {
        XCTAssertEqual("&amp;".CC_htmlDecoded, "&")
        XCTAssertEqual("&lt;".CC_htmlDecoded, "<")
        XCTAssertEqual("&gt;".CC_htmlDecoded, ">")
        XCTAssertEqual("&quot;".CC_htmlDecoded, "\"")
        XCTAssertEqual("&apos;".CC_htmlDecoded, "'")
    }

    /// Decimal and hexadecimal character references decode to the same scalar.
    func testNumericHTMLEntities() {
        XCTAssertEqual("&#64;".CC_htmlDecoded, "@")
        XCTAssertEqual("&#x40;".CC_htmlDecoded, "@")
        // U+20AC EURO SIGN, written as an escape so the expectation cannot be
        // lost to a source re-encoding.
        XCTAssertEqual("&#8364;".CC_htmlDecoded, "\u{20AC}")
        XCTAssertEqual("&#x20AC;".CC_htmlDecoded, "\u{20AC}")
    }

    /// Entities embedded in ordinary text are decoded in place.
    func testMixedContent() {
        let input = "Hello &amp; welcome to &lt;Swift&gt;"
        let expected = "Hello & welcome to <Swift>"
        XCTAssertEqual(input.CC_htmlDecoded, expected)
    }

    /// A sample of named entities outside the predefined XML set.
    func testNamedEntities() {
        XCTAssertEqual("&nbsp;".CC_htmlDecoded, "\u{00A0}")
        XCTAssertEqual("&copy;".CC_htmlDecoded, "©")
        XCTAssertEqual("&reg;".CC_htmlDecoded, "®")
        XCTAssertEqual("&euro;".CC_htmlDecoded, "\u{20AC}")
    }

    /// Text without entities is returned unchanged.
    func testNoEntities() {
        let plain = "This is plain text with no entities"
        XCTAssertEqual(plain.CC_htmlDecoded, plain)
    }

    /// The empty string decodes to the empty string.
    func testEmptyString() {
        XCTAssertEqual("".CC_htmlDecoded, "")
    }

    /// Adjacent entities are each decoded without affecting their neighbours.
    func testMultipleEntitiesInSequence() {
        let input = "&lt;&lt;&lt;&amp;&amp;&amp;&gt;&gt;&gt;"
        let expected = "<<<&&&>>>"
        XCTAssertEqual(input.CC_htmlDecoded, expected)
    }
}

0 commit comments

Comments
 (0)