Skip to content

Commit 44821b4

Browse files
authored
Fix scanning of valid surrogate pairs (#2032)
1 parent a99b7f2 commit 44821b4

File tree

9 files changed

+235
-0
lines changed

9 files changed

+235
-0
lines changed

_packages/api/test/api.test.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
cast,
99
isImportDeclaration,
1010
isNamedImports,
11+
isStringLiteral,
1112
isTemplateHead,
1213
isTemplateMiddle,
1314
isTemplateTail,
@@ -113,6 +114,31 @@ describe("SourceFile", () => {
113114
});
114115
});
115116

117+
// Checks that a string literal's `text` is the decoded character both when the
// emoji is written literally and when it is spelled as a \uXXXX surrogate-pair
// escape sequence.
test("unicode escapes", () => {
    const srcFiles = {
        "/src/1.ts": `"😃"`,
        "/src/2.ts": `"\\ud83d\\ude03"`, // this is "😃"
    };

    const api = spawnAPI({
        "/tsconfig.json": "{}",
        ...srcFiles,
    });
    const project = api.loadProject("/tsconfig.json");

    for (const file of Object.keys(srcFiles)) {
        const sourceFile = project.getSourceFile(file);
        assert.ok(sourceFile, `expected ${file} to be part of the project`);

        // Count the literals we actually reach: the value assertion below lives
        // inside an `if`, so without this the test would pass even if the walk
        // never visited a string literal.
        let stringLiteralCount = 0;
        sourceFile.forEachChild(function visit(node) {
            if (isStringLiteral(node)) {
                stringLiteralCount++;
                assert.equal(node.text, "😃");
            }
            node.forEachChild(visit);
        });
        assert.ok(stringLiteralCount > 0, `expected at least one string literal in ${file}`);
    }
});
141+
116142
test("Object equality", () => {
117143
const api = spawnAPI();
118144
const project = api.loadProject("/tsconfig.json");

internal/api/encoder/encoder_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,24 @@ func TestEncodeSourceFile(t *testing.T) {
3535
})
3636
}
3737

38+
func TestEncodeSourceFileWithUnicodeEscapes(t *testing.T) {
39+
t.Parallel()
40+
sourceFile := parser.ParseSourceFile(ast.SourceFileParseOptions{
41+
FileName: "/test.ts",
42+
Path: "/test.ts",
43+
}, `let a = "😃"; let b = "\ud83d\ude03"; let c = "\udc00\ud83d\ude03"; let d = "\ud83d\ud83d\ude03"`, core.ScriptKindTS)
44+
t.Run("baseline", func(t *testing.T) {
45+
t.Parallel()
46+
buf, err := encoder.EncodeSourceFile(sourceFile, "")
47+
assert.NilError(t, err)
48+
49+
str := formatEncodedSourceFile(buf)
50+
baseline.Run(t, "encodeSourceFileWithUnicodeEscapes.txt", str, baseline.Options{
51+
Subfolder: "api",
52+
})
53+
})
54+
}
55+
3856
func BenchmarkEncodeSourceFile(b *testing.B) {
3957
repo.SkipIfNoTypeScriptSubmodule(b)
4058
filePath := filepath.Join(repo.TypeScriptSubmodulePath, "src/compiler/checker.ts")

internal/scanner/scanner.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,6 +1629,13 @@ func (s *Scanner) scanEscapeSequence(flags EscapeSequenceScanningFlags) string {
16291629
codePoint := s.scanUnicodeEscape(flags&EscapeSequenceScanningFlagsReportInvalidEscapeErrors != 0)
16301630
if codePoint < 0 {
16311631
return s.text[start:s.pos]
1632+
} else if codePointIsHighSurrogate(codePoint) && s.char() == '\\' && s.charAt(1) == 'u' {
1633+
savedPos := s.pos
1634+
nextCodePoint := s.scanUnicodeEscape(flags&EscapeSequenceScanningFlagsReportInvalidEscapeErrors != 0)
1635+
if codePointIsLowSurrogate(nextCodePoint) {
1636+
return string(surrogatePairToCodepoint(codePoint, nextCodePoint))
1637+
}
1638+
s.pos = savedPos // restore position because we do not consume nextCodePoint
16321639
}
16331640
return string(codePoint)
16341641
case 'x':

internal/scanner/utilities.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,25 @@ import (
88
"github.com/microsoft/typescript-go/internal/core"
99
)
1010

11+
// Boundaries used to classify and combine UTF-16 surrogate code units.
const (
	surr1    = 0xd800  // inclusive lower bound of the high-surrogate range
	surr2    = 0xdc00  // exclusive upper bound of the high range, inclusive lower bound of the low range
	surr3    = 0xe000  // exclusive upper bound of the low-surrogate range
	surrSelf = 0x10000 // offset added when a high/low pair is combined
)

// codePointIsHighSurrogate reports whether r lies in [surr1, surr2).
func codePointIsHighSurrogate(r rune) bool {
	return r >= surr1 && r < surr2
}

// codePointIsLowSurrogate reports whether r lies in [surr2, surr3).
func codePointIsLowSurrogate(r rune) bool {
	return r >= surr2 && r < surr3
}

// surrogatePairToCodepoint combines a high surrogate r1 and a low surrogate r2
// into a single code point. It performs no validation of its arguments.
func surrogatePairToCodepoint(r1, r2 rune) rune {
	highBits := (r1 - surr1) << 10
	lowBits := r2 - surr2
	// The OR is grouped before the addition, matching how Go parses the
	// unparenthesised form `a<<10 | b + c`.
	return (highBits | lowBits) + surrSelf
}
29+
1130
// tokenIsIdentifierOrKeyword reports whether token compares greater than or
// equal to ast.KindIdentifier.
// NOTE(review): presumably all keyword kinds are declared after KindIdentifier
// in ast.Kind — confirm against the enum definition.
func tokenIsIdentifierOrKeyword(token ast.Kind) bool {
	return ast.KindIdentifier <= token
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
KindSourceFile [0, 98), i=1, next=0
2+
NodeList [0, 98), i=2, next=27
3+
KindVariableStatement [0, 15), i=3, next=9
4+
KindVariableDeclarationList [0, 14), i=4, next=0
5+
NodeList [3, 14), i=5, next=0
6+
KindVariableDeclaration [3, 14), i=6, next=0
7+
KindIdentifier "a" [3, 5), i=7, next=8
8+
KindStringLiteral "😃" [7, 14), i=8, next=0
9+
KindVariableStatement [15, 39), i=9, next=15
10+
KindVariableDeclarationList [15, 38), i=10, next=0
11+
NodeList [19, 38), i=11, next=0
12+
KindVariableDeclaration [19, 38), i=12, next=0
13+
KindIdentifier "b" [19, 21), i=13, next=14
14+
KindStringLiteral "😃" [23, 38), i=14, next=0
15+
KindVariableStatement [39, 69), i=15, next=21
16+
KindVariableDeclarationList [39, 68), i=16, next=0
17+
NodeList [43, 68), i=17, next=0
18+
KindVariableDeclaration [43, 68), i=18, next=0
19+
KindIdentifier "c" [43, 45), i=19, next=20
20+
KindStringLiteral "�😃" [47, 68), i=20, next=0
21+
KindVariableStatement [69, 98), i=21, next=0
22+
KindVariableDeclarationList [69, 98), i=22, next=0
23+
NodeList [73, 98), i=23, next=0
24+
KindVariableDeclaration [73, 98), i=24, next=0
25+
KindIdentifier "d" [73, 75), i=25, next=26
26+
KindStringLiteral "�😃" [77, 98), i=26, next=0
27+
KindEndOfFile [98, 98), i=27, next=0
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//// [tests/cases/compiler/unicodeSurrogatesInStringLiterals.ts] ////
2+
3+
//// [unicodeSurrogatesInStringLiterals.ts]
4+
// low-high surrogate pair - the "correct" case
5+
export const highLow = "\ud83d\ude03" as const;
6+
7+
// high surrogate
8+
export const high = "\ud83d" as const;
9+
10+
// low surrogate
11+
export const low = "\ude03" as const;
12+
13+
// two high surrogates
14+
export const highHigh = "\ud83d\ud83d" as const;
15+
16+
// two low surrogates
17+
export const lowLow = "\ude03\ude03" as const;
18+
19+
// swapped expected order of surrogates
20+
export const lowHigh = "\ude03\ud83d" as const;
21+
22+
23+
//// [unicodeSurrogatesInStringLiterals.js]
24+
"use strict";
25+
Object.defineProperty(exports, "__esModule", { value: true });
26+
exports.lowHigh = exports.lowLow = exports.highHigh = exports.low = exports.high = exports.highLow = void 0;
27+
// low-high surrogate pair - the "correct" case
28+
exports.highLow = "\ud83d\ude03";
29+
// high surrogate
30+
exports.high = "\ud83d";
31+
// low surrogate
32+
exports.low = "\ude03";
33+
// two high surrogates
34+
exports.highHigh = "\ud83d\ud83d";
35+
// two low surrogates
36+
exports.lowLow = "\ude03\ude03";
37+
// swapped expected order of surrogates
38+
exports.lowHigh = "\ude03\ud83d";
39+
40+
41+
//// [unicodeSurrogatesInStringLiterals.d.ts]
42+
export declare const highLow: "😃";
43+
export declare const high: "�";
44+
export declare const low: "�";
45+
export declare const highHigh: "��";
46+
export declare const lowLow: "��";
47+
export declare const lowHigh: "��";
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
//// [tests/cases/compiler/unicodeSurrogatesInStringLiterals.ts] ////
2+
3+
=== unicodeSurrogatesInStringLiterals.ts ===
4+
// low-high surrogate pair - the "correct" case
5+
export const highLow = "\ud83d\ude03" as const;
6+
>highLow : Symbol(highLow, Decl(unicodeSurrogatesInStringLiterals.ts, 1, 12))
7+
>const : Symbol(const)
8+
9+
// high surrogate
10+
export const high = "\ud83d" as const;
11+
>high : Symbol(high, Decl(unicodeSurrogatesInStringLiterals.ts, 4, 12))
12+
>const : Symbol(const)
13+
14+
// low surrogate
15+
export const low = "\ude03" as const;
16+
>low : Symbol(low, Decl(unicodeSurrogatesInStringLiterals.ts, 7, 12))
17+
>const : Symbol(const)
18+
19+
// two high surrogates
20+
export const highHigh = "\ud83d\ud83d" as const;
21+
>highHigh : Symbol(highHigh, Decl(unicodeSurrogatesInStringLiterals.ts, 10, 12))
22+
>const : Symbol(const)
23+
24+
// two low surrogates
25+
export const lowLow = "\ude03\ude03" as const;
26+
>lowLow : Symbol(lowLow, Decl(unicodeSurrogatesInStringLiterals.ts, 13, 12))
27+
>const : Symbol(const)
28+
29+
// swapped expected order of surrogates
30+
export const lowHigh = "\ude03\ud83d" as const;
31+
>lowHigh : Symbol(lowHigh, Decl(unicodeSurrogatesInStringLiterals.ts, 16, 12))
32+
>const : Symbol(const)
33+
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
//// [tests/cases/compiler/unicodeSurrogatesInStringLiterals.ts] ////
2+
3+
=== unicodeSurrogatesInStringLiterals.ts ===
4+
// low-high surrogate pair - the "correct" case
5+
export const highLow = "\ud83d\ude03" as const;
6+
>highLow : "😃"
7+
>"\ud83d\ude03" as const : "😃"
8+
>"\ud83d\ude03" : "😃"
9+
10+
// high surrogate
11+
export const high = "\ud83d" as const;
12+
>high : "�"
13+
>"\ud83d" as const : "�"
14+
>"\ud83d" : "�"
15+
16+
// low surrogate
17+
export const low = "\ude03" as const;
18+
>low : "�"
19+
>"\ude03" as const : "�"
20+
>"\ude03" : "�"
21+
22+
// two high surrogates
23+
export const highHigh = "\ud83d\ud83d" as const;
24+
>highHigh : "��"
25+
>"\ud83d\ud83d" as const : "��"
26+
>"\ud83d\ud83d" : "��"
27+
28+
// two low surrogates
29+
export const lowLow = "\ude03\ude03" as const;
30+
>lowLow : "��"
31+
>"\ude03\ude03" as const : "��"
32+
>"\ude03\ude03" : "��"
33+
34+
// swapped expected order of surrogates
35+
export const lowHigh = "\ude03\ud83d" as const;
36+
>lowHigh : "��"
37+
>"\ude03\ud83d" as const : "��"
38+
>"\ude03\ud83d" : "��"
39+
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// @declaration: true
2+
3+
// low-high surrogate pair - the "correct" case
4+
export const highLow = "\ud83d\ude03" as const;
5+
6+
// high surrogate
7+
export const high = "\ud83d" as const;
8+
9+
// low surrogate
10+
export const low = "\ude03" as const;
11+
12+
// two high surrogates
13+
export const highHigh = "\ud83d\ud83d" as const;
14+
15+
// two low surrogates
16+
export const lowLow = "\ude03\ude03" as const;
17+
18+
// swapped expected order of surrogates
19+
export const lowHigh = "\ude03\ud83d" as const;

0 commit comments

Comments
 (0)