Skip to content

Commit 87a0dce

Browse files
Guy Bedford, JakeChampion
Guy Bedford
authored and JakeChampion committed
feat: support unicode patterns via precompilation
1 parent 598466f commit 87a0dce

File tree

4 files changed

+95
-4
lines changed

4 files changed

+95
-4
lines changed

integration-tests/js-compute/fixtures/regex/bin/index.js

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2111,10 +2111,18 @@ function long_regex_input() {
21112111
regex.test("example.com/test/".padEnd(520, "x"));
21122112
}
21132113

2114+
function unicode_regex() {
2115+
const regex1 = /^\p{Emoji_Presentation}+$/u;
2116+
const regex2 = /^\p{L}+$/u;
2117+
if (!regex1.test('🐱') || !regex2.test('woah'))
2118+
throw new Error('bad');
2119+
}
2120+
21142121
addEventListener("fetch", (req) => {
21152122
const useragent = duration(() =>
21162123
useragent_parser("should-not-match-any-case")
21172124
);
21182125
const long_regex = duration(long_regex_input);
2119-
req.respondWith(new Response(JSON.stringify({ useragent, long_regex })));
2126+
const unicode = duration(unicode_regex);
2127+
req.respondWith(new Response(JSON.stringify({ useragent, long_regex, unicode })));
21202128
});

package.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@
4545
"@bytecodealliance/jco": "^0.4.1",
4646
"@bytecodealliance/wizer": "^1.6.1-beta.4",
4747
"esbuild": "^0.15.16",
48+
"regexpu-core": "^5.3.1",
4849
"tree-sitter": "^0.20.1",
4950
"tree-sitter-javascript": "^0.19.0"
5051
}

src/precompile.js

Lines changed: 21 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import Parser, { Query } from "tree-sitter";
22
import JavaScript from "tree-sitter-javascript";
3+
import regexpuc from 'regexpu-core';
34

45
function findRegexLiterals(source) {
56
const parser = new Parser();
@@ -12,9 +13,18 @@ function findRegexLiterals(source) {
1213
);
1314
const regexLiterals = [];
1415
for (const m of query.matches(tree.rootNode)) {
16+
const pattern = m.captures[0].node.text;
17+
const flags = m.captures[1]?.node.text || "";
18+
// transpile unicode property escapes
19+
const patternTranspiled = regexpuc(pattern, flags, { unicodePropertyEscapes: 'transform' });
1520
regexLiterals.push({
16-
pattern: m.captures[0].node.text,
17-
flags: m.captures[1]?.node.text || "",
21+
patternStart: m.captures[0].node.startIndex,
22+
patternEnd: m.captures[0].node.endIndex,
23+
pattern,
24+
patternTranspiled,
25+
flags,
26+
flagsStart: m.captures[1]?.node.startIndex,
27+
flagsEnd: m.captures[1]?.node.endIndex,
1828
});
1929
}
2030
return regexLiterals;
@@ -39,11 +49,19 @@ export function precompile(inputApplication) {
3949
return inputApplication;
4050
}
4151

52+
let offset = 0;
53+
for (const lit of lits) {
54+
if (lit.pattern === lit.patternTranspiled)
55+
continue;
56+
inputApplication = inputApplication.slice(0, lit.patternStart + offset) + lit.patternTranspiled + inputApplication.slice(lit.patternEnd + offset);
57+
offset += lit.patternTranspiled.length - lit.pattern.length;
58+
}
59+
4260
return (
4361
PREAMBLE +
4462
lits
4563
.map((regex) => {
46-
return `precompile(/${regex.pattern}/${regex.flags});`;
64+
return `precompile(/${regex.patternTranspiled}/${regex.flags});`;
4765
})
4866
.join("\n") +
4967
POSTAMBLE + inputApplication

yarn.lock

Lines changed: 64 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,11 @@
2323
chalk "^2.0.0"
2424
js-tokens "^4.0.0"
2525

26+
"@babel/regjsgen@^0.8.0":
27+
version "0.8.0"
28+
resolved "https://registry.yarnpkg.com/@babel/regjsgen/-/regjsgen-0.8.0.tgz#f0ba69b075e1f05fb2825b7fad991e7adbb18310"
29+
integrity sha512-x/rqGMdzj+fWZvCOYForTghzbtqPDZ5gPwaoNGHdgDfF2QA/XZbCBp4Moo5scrkAMPhB7z26XM/AaHuIJdgauA==
30+
2631
"@bcoe/v8-coverage@^0.2.3":
2732
version "0.2.3"
2833
resolved "https://registry.npmjs.org/@bcoe/v8-coverage/-/v8-coverage-0.2.3.tgz"
@@ -1455,6 +1460,11 @@ js-yaml@^4.1.0:
14551460
dependencies:
14561461
argparse "^2.0.1"
14571462

1463+
jsesc@~0.5.0:
1464+
version "0.5.0"
1465+
resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-0.5.0.tgz#e7dee66e35d6fc16f710fe91d5cf69f70f08911d"
1466+
integrity sha512-uZz5UnB7u4T9LvwmFqXii7pZSouaRPorGs5who1Ip7VO0wxanFvBL7GkM6dTHlgX+jhBApRetaWpnDabOeTcnA==
1467+
14581468
json-parse-even-better-errors@^2.3.0:
14591469
version "2.3.1"
14601470
resolved "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz"
@@ -1930,6 +1940,18 @@ redent@^3.0.0:
19301940
indent-string "^4.0.0"
19311941
strip-indent "^3.0.0"
19321942

1943+
regenerate-unicode-properties@^10.1.0:
1944+
version "10.1.0"
1945+
resolved "https://registry.yarnpkg.com/regenerate-unicode-properties/-/regenerate-unicode-properties-10.1.0.tgz#7c3192cab6dd24e21cb4461e5ddd7dd24fa8374c"
1946+
integrity sha512-d1VudCLoIGitcU/hEg2QqvyGZQmdC0Lf8BqdOMXGFSvJP4bNV1+XqbPQeHHLD51Jh4QJJ225dlIFvY4Ly6MXmQ==
1947+
dependencies:
1948+
regenerate "^1.4.2"
1949+
1950+
regenerate@^1.4.2:
1951+
version "1.4.2"
1952+
resolved "https://registry.yarnpkg.com/regenerate/-/regenerate-1.4.2.tgz#b9346d8827e8f5a32f7ba29637d398b69014848a"
1953+
integrity sha512-zrceR/XhGYU/d/opr2EKO7aRHUeiBI8qjtfHqADTwZd6Szfy16la6kqD0MIUs5z5hx6AaKa+PixpPrR289+I0A==
1954+
19331955
regexp.prototype.flags@^1.4.3:
19341956
version "1.4.3"
19351957
resolved "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.4.3.tgz"
@@ -1944,6 +1966,25 @@ regexpp@^3.2.0:
19441966
resolved "https://registry.npmjs.org/regexpp/-/regexpp-3.2.0.tgz"
19451967
integrity sha512-pq2bWo9mVD43nbts2wGv17XLiNLya+GklZ8kaDLV2Z08gDCsGpnKn9BFMepvWuHCbyVvY7J5o5+BVvoQbmlJLg==
19461968

1969+
regexpu-core@^5.3.1:
1970+
version "5.3.1"
1971+
resolved "https://registry.yarnpkg.com/regexpu-core/-/regexpu-core-5.3.1.tgz#66900860f88def39a5cb79ebd9490e84f17bcdfb"
1972+
integrity sha512-nCOzW2V/X15XpLsK2rlgdwrysrBq+AauCn+omItIz4R1pIcmeot5zvjdmOBRLzEH/CkC6IxMJVmxDe3QcMuNVQ==
1973+
dependencies:
1974+
"@babel/regjsgen" "^0.8.0"
1975+
regenerate "^1.4.2"
1976+
regenerate-unicode-properties "^10.1.0"
1977+
regjsparser "^0.9.1"
1978+
unicode-match-property-ecmascript "^2.0.0"
1979+
unicode-match-property-value-ecmascript "^2.1.0"
1980+
1981+
regjsparser@^0.9.1:
1982+
version "0.9.1"
1983+
resolved "https://registry.yarnpkg.com/regjsparser/-/regjsparser-0.9.1.tgz#272d05aa10c7c1f67095b1ff0addae8442fc5709"
1984+
integrity sha512-dQUtn90WanSNl+7mQKcXAgZxvUe7Z0SqXlgzv0za4LwiUhyzBC58yQO3liFoUgu8GiJVInAhJjkj1N0EtQ5nkQ==
1985+
dependencies:
1986+
jsesc "~0.5.0"
1987+
19471988
require-directory@^2.1.1:
19481989
version "2.1.1"
19491990
resolved "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz"
@@ -2310,6 +2351,29 @@ typescript@^4.9:
23102351
resolved "https://registry.npmjs.org/typescript/-/typescript-4.9.5.tgz"
23112352
integrity sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==
23122353

2354+
unicode-canonical-property-names-ecmascript@^2.0.0:
2355+
version "2.0.0"
2356+
resolved "https://registry.yarnpkg.com/unicode-canonical-property-names-ecmascript/-/unicode-canonical-property-names-ecmascript-2.0.0.tgz#301acdc525631670d39f6146e0e77ff6bbdebddc"
2357+
integrity sha512-yY5PpDlfVIU5+y/BSCxAJRBIS1Zc2dDG3Ujq+sR0U+JjUevW2JhocOF+soROYDSaAezOzOKuyyixhD6mBknSmQ==
2358+
2359+
unicode-match-property-ecmascript@^2.0.0:
2360+
version "2.0.0"
2361+
resolved "https://registry.yarnpkg.com/unicode-match-property-ecmascript/-/unicode-match-property-ecmascript-2.0.0.tgz#54fd16e0ecb167cf04cf1f756bdcc92eba7976c3"
2362+
integrity sha512-5kaZCrbp5mmbz5ulBkDkbY0SsPOjKqVS35VpL9ulMPfSl0J0Xsm+9Evphv9CoIZFwre7aJoa94AY6seMKGVN5Q==
2363+
dependencies:
2364+
unicode-canonical-property-names-ecmascript "^2.0.0"
2365+
unicode-property-aliases-ecmascript "^2.0.0"
2366+
2367+
unicode-match-property-value-ecmascript@^2.1.0:
2368+
version "2.1.0"
2369+
resolved "https://registry.yarnpkg.com/unicode-match-property-value-ecmascript/-/unicode-match-property-value-ecmascript-2.1.0.tgz#cb5fffdcd16a05124f5a4b0bf7c3770208acbbe0"
2370+
integrity sha512-qxkjQt6qjg/mYscYMC0XKRn3Rh0wFPlfxB0xkt9CfyTvpX1Ra0+rAmdX2QyAobptSEvuy4RtpPRui6XkV+8wjA==
2371+
2372+
unicode-property-aliases-ecmascript@^2.0.0:
2373+
version "2.1.0"
2374+
resolved "https://registry.yarnpkg.com/unicode-property-aliases-ecmascript/-/unicode-property-aliases-ecmascript-2.1.0.tgz#43d41e3be698bd493ef911077c9b131f827e8ccd"
2375+
integrity sha512-6t3foTQI9qne+OZoVQB/8x8rk2k1eVy1gRXhV3oFQ5T6R1dqQ1xtin3XqSlx3+ATBkliTaR/hHyJBm+LVPNM8w==
2376+
23132377
uri-js@^4.2.2:
23142378
version "4.4.1"
23152379
resolved "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz"

0 commit comments

Comments
 (0)