You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: README.md
+47-38Lines changed: 47 additions & 38 deletions
Original file line number
Diff line number
Diff line change
@@ -13,15 +13,15 @@ to be able to tokenise a string into separate code points before handling them w
13
13
14
14
Currently language APIs provide two ways to access entire code points:
15
15
16
-
1.`codePointAt` allows to retrieve a code point at a known position. The issue is that position is usually unknown in advance if you're just iterating over the string, and you need to manually
17
-
calculate it on each iteration with a manual `for(;;)` loop and a magically looking expression like
18
-
`pos += currentCodePoint <= 0xFFFF ? 1 : 2`.
19
-
1.`String.prototype[Symbol.iterator]` which allows a hassle-free iteration over string codepoints,
20
-
but yields their string values, which are inefficient to work with in performance-critical lexers.
16
+
1. `codePointAt` allows retrieving a code point at a known position. The issue is that the position is usually unknown in advance if you're just iterating over the string, and you need to manually
17
+
calculate it on each iteration with a manual `for(;;)` loop and a magic-looking expression like
18
+
`pos += currentCodePoint <= 0xFFFF ? 1 : 2`.
19
+
1. `String.prototype[Symbol.iterator]` allows hassle-free iteration over a string's code points,
20
+
but yields their string values, which are inefficient to work with in performance-critical lexers, and still lack position information.
21
21
22
22
## Proposed solution
23
23
24
-
We propose the addition of a `codePoints()` method functionally similar to the `[@@iterator]`, but yielding numerical values of code points instead of string ones, this way combining the benefits of both approaches presented above while avoiding the related pitfalls in consumer code.
24
+
We propose adding a `codePoints()` method that is functionally similar to `[@@iterator]` but yields the positions and numerical values of code points instead of just their string values, thereby combining the benefits of both approaches presented above while avoiding the related pitfalls in consumer code.
25
25
26
26
## Naming
27
27
@@ -36,11 +36,11 @@ function isIdent(input) {
36
36
let codePoints =input.codePoints();
37
37
let first =codePoints.next();
38
38
39
-
if (first.done||!isIdentifierStart(first.value)) {
39
+
if (first.done||!isIdentifierStart(first.value.codePoint)) {
40
40
return false;
41
41
}
42
42
43
-
for (letcpof codePoints) {
43
+
for (let { codePoint } of codePoints) {
44
44
if (!isIdentifierContinue(codePoint)) {
45
45
return false;
46
46
}
@@ -50,41 +50,54 @@ function isIdent(input) {
50
50
}
51
51
```
52
52
53
-
### Tokenise a string with a state machine
53
+
### Full-blown tokeniser
54
54
55
55
```javascript
56
56
function toDigit(cp) {
57
57
return cp -/* '0' */48;
58
58
}
59
59
60
-
function*tokenise(input) {
61
-
let token = {};
62
-
63
-
for (let cp of input) {
64
-
let pos =/* see open question #1, we still need to know a pos somehow */;
65
-
66
-
if (token.type==='Identifier') {
67
-
if (isIdentifierContinue(cp)) {
68
-
continue;
69
-
}
70
-
token.end= pos;
71
-
token.name=input.slice(token.start, token.end);
72
-
yield token;
73
-
} elseif (token.type==='Number') {
74
-
if (isDigit(cp)) {
75
-
token.value=token.value*10+toDigit(cp);
76
-
continue;
77
-
}
78
-
token.end= pos;
79
-
yield token;
60
+
// Generic helper
61
+
class LookaheadIterator {
62
+
constructor(inner) {
63
+
this[Symbol.iterator] =this;
64
+
this.inner= inner;
65
+
this.next();
66
+
}
67
+
68
+
next() {
69
+
let next =this.lookahead;
70
+
this.lookahead=this.inner.next();
71
+
return next;
72
+
}
73
+
74
+
skipWhile(cond) {
75
+
while (!this.lookahead.done&&cond(this.lookahead.value)) {
thrownewSyntaxError(`Expected an identifier or digit at ${tokenStart}`);
100
+
throw new SyntaxError(`Expected an identifier or digit at ${start}`);
88
101
}
89
102
}
90
103
}
@@ -93,7 +106,3 @@ function *tokenise(input) {
93
106
## Specification
94
107
95
108
You can view the rendered spec [here](https://rreverser.github.io/string-prototype-codepoints/).
96
-
97
-
## Open questions
98
-
99
-
1.[Should the API yield `[position, codePoint]` pairs like `entries` API of standard collections?](https://github.com/RReverser/string-prototype-codepoints/issues/1)
0 commit comments