Skip to content

Commit ccc3658

Browse files
committed
ENH: Prebuild Lunr.js search index
1 parent 79cfcda commit ccc3658

File tree

5 files changed

+183
-8
lines changed

5 files changed

+183
-8
lines changed

pdoc/build-index.js

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import vm from 'vm';
2+
3+
// CDN location of the Lunr.js build that is downloaded and evaluated to construct the index.
const LUNR_SCRIPT = 'https://cdnjs.cloudflare.com/ajax/libs/lunr.js/2.3.9/lunr.min.js';

// Short aliases for the process's standard streams.
const stdin = process.stdin;
const stdout = process.stdout;

// Chunks read from stdin are collected here and joined once the stream ends.
const buffer = [];
7+
8+
/**
 * Download a script and return its source text.
 *
 * @param {string} url - Location of the script to fetch.
 * @returns {Promise<string>} The response body as text.
 * @throws {Error} If the server answers with a non-2xx status, so that an
 *     HTTP error page is never handed on as if it were script source.
 */
async function loadScript(url) {
    const response = await fetch(url);
    if (!response.ok) {
        throw new Error(
            `Failed to load script ${url}: HTTP ${response.status} ${response.statusText}`);
    }
    return await response.text();
}
12+
/**
 * Evaluate script source inside a fresh `vm` context.
 *
 * The context's global object starts out with empty `window` and `self`
 * properties. Whatever globals the script defines end up on that object,
 * which is what gets returned.
 *
 * @param {string} script - JavaScript source text to run.
 * @returns {Promise<object>} The context's global object after the script ran.
 */
async function executeScript(script) {
    // vm.createContext() contextifies the object it is given and returns that same object.
    const context = vm.createContext({ window: {}, self: {} });
    vm.runInContext(script, context);
    return context;
}
17+
18+
/**
 * Shrink a serialized Lunr index in place.
 *
 * Two properties of `index` are overwritten:
 *   - `invertedIndex` becomes an array ordered by each token's `_index`, with
 *     entries shaped `[token, matchesForField0, matchesForField1, ...]`. Every
 *     matches array is a flat list of `fieldVectorPosition, metadata` pairs,
 *     where the position points into `index.fieldVectors`.
 *   - `fieldVectors` keeps its `[ref, vector]` entries, but inside each vector
 *     an even-positioned element that is exactly one greater than the previous
 *     even-positioned element is replaced by `null`.
 *
 * Format described at https://john-millikin.com/compacting-lunr-search-indices
 *
 * @param {object} index - Object with `fields`, `fieldVectors` and
 *     `invertedIndex` properties (as produced by `toJSON()` on a Lunr index).
 * @throws {Error} If a posting refers to a `field/docRef` pair that has no
 *     entry in `index.fieldVectors`.
 */
function compact(index) {
    function compactInvIndex(index) {
        const fields = index["fields"];
        // Position of every "field/docRef" key within fieldVectors.
        const fieldVectorIdxs = new Map(index["fieldVectors"].map(([ref], idx) => [ref, idx]));
        const itemsByTokenIndex = new Map();
        for (const [token, props] of index["invertedIndex"]) {
            const newItem = [token];
            for (const field of fields) {
                const matches = [];
                for (const [docRef, metadata] of Object.entries(props[field])) {
                    const vectorRef = `${field}/${docRef}`;
                    const fieldVectorIdx = fieldVectorIdxs.get(vectorRef);
                    if (fieldVectorIdx === undefined) {
                        throw new Error(
                            `Cannot compact Lunr index: token ${JSON.stringify(token)} ` +
                            `refers to "${vectorRef}", which has no entry in fieldVectors`);
                    }
                    matches.push(fieldVectorIdx, metadata);
                }
                newItem.push(matches);
            }
            itemsByTokenIndex.set(props["_index"], newItem);
        }
        // Emit entries ordered by `_index`, so the array position stands in for it.
        return Array.from(itemsByTokenIndex.keys())
            .sort((a, b) => a - b)
            .map(tokenIndex => itemsByTokenIndex.get(tokenIndex));
    }

    function compactVectors(index) {
        return index["fieldVectors"].map(([id, vector]) => {
            let prev = null;
            const compacted = vector.map((value, position) => {
                // Only even positions are candidates for elision; odd positions pass through.
                if (position % 2 === 0) {
                    if (prev !== null && value === prev + 1) {
                        prev += 1;
                        return null;
                    }
                    prev = value;
                }
                return value;
            });
            return [id, compacted];
        });
    }

    // The inverted index must be compacted first: it reads the original fieldVectors.
    index.invertedIndex = compactInvIndex(index);
    index.fieldVectors = compactVectors(index);
}
67+
68+
// Download Lunr.js, evaluate it in a sandbox, and pick up the `lunr` global it defines.
const lunr = (await executeScript(await loadScript(LUNR_SCRIPT)))['lunr'];

stdin.resume();
stdin.setEncoding('utf8');

// Collect the input piecewise; it is parsed only once stdin has ended.
stdin.on('data', (chunk) => {
    buffer.push(chunk);
});

stdin.on('end', () => {
    const documents = JSON.parse(buffer.join(''));

    // The builder callback stays a regular function because it is configured through `this`.
    const builtIndex = lunr(function () {
        this.ref('i');
        this.field('name', {boost: 10});
        this.field('ref', {boost: 5});
        this.field('doc');
        this.metadataWhitelist = ['position'];

        documents.forEach((doc, i) => {
            // `name` is the last dot-separated component of `ref`; `i` is the array position.
            doc['name'] = doc.ref.split('.').pop();
            doc['i'] = i;
            this.add(doc);
        });
    });

    const serialized = builtIndex.toJSON();
    compact(serialized);
    stdout.write(JSON.stringify(serialized));
});

pdoc/cli.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@
99
import os.path as path
1010
import json
1111
import re
12+
import subprocess
1213
import sys
1314
import warnings
1415
from contextlib import contextmanager
1516
from functools import lru_cache
1617
from http.server import BaseHTTPRequestHandler, HTTPServer
18+
from pathlib import Path
1719
from typing import Dict, List, Sequence
1820
from warnings import warn
1921

@@ -397,6 +399,7 @@ def recursive_add_to_index(dobj):
397399
info['doc'] = trim_docstring(dobj.docstring)
398400
if isinstance(dobj, pdoc.Function):
399401
info['func'] = 1
402+
nonlocal index
400403
index.append(info)
401404
for member_dobj in getattr(dobj, 'doc', {}).values():
402405
recursive_add_to_index(member_dobj)
@@ -414,12 +417,27 @@ def to_url_id(module):
414417
recursive_add_to_index(top_module)
415418
urls = sorted(url_cache.keys(), key=url_cache.__getitem__)
416419

420+
json_values = [dict(obj, url=urls[obj['url']]) for obj in index]
421+
cmd = ['node', str(Path(__file__).with_name('build-index.js'))]
422+
proc = subprocess.Popen(cmd, text=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
417423
main_path = args.output_dir
418-
with _open_write_file(path.join(main_path, 'index.js')) as f:
419-
f.write("URLS=")
420-
json.dump(urls, f, indent=0, separators=(',', ':'))
421-
f.write(";\nINDEX=")
422-
json.dump(index, f, indent=0, separators=(',', ':'))
424+
if proc.poll() is None:
425+
stdout, stderr = proc.communicate(json.dumps(json_values))
426+
assert proc.poll() == 0, proc.poll()
427+
if proc.returncode == 0:
428+
stdout = 'INDEX=' + stdout
429+
else:
430+
warn(f'Prebuilding Lunr index with command `{" ".join(cmd)}` failed: '
431+
f'{proc.stderr and proc.stderr.read() or ""!r}. '
432+
f'The search feature will still work, '
433+
f'but may be slower (with the index rebuilt just before use). '
434+
f'To prebuild an index in advance, ensure `node` is executable in the '
435+
f'pdoc environment.', category=RuntimeWarning)
436+
stdout = ('URLS=' + json.dumps(urls, indent=0, separators=(',', ':')) +
437+
';\nINDEX=' + json.dumps(index, indent=0, separators=(',', ':')))
438+
index_path = Path(main_path).joinpath('index.js')
439+
index_path.write_text(stdout)
440+
print(str(index_path))
423441

424442
# Generate search.html
425443
with _open_write_file(path.join(main_path, 'doc-search.html')) as f:

pdoc/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"type": "module"}

pdoc/templates/search.mako

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,10 @@
5252
}
5353
5454
async function build_index() {
55-
return lunr(function () {
55+
try {
56+
return lunr.Index.load(_expand(INDEX)); // Prebuilt index
57+
} catch {
58+
return lunr(function () {
5659
this.ref('i');
5760
this.field('name', {boost: 10});
5861
this.field('ref', {boost: 5});
@@ -67,6 +70,60 @@
6770
this.add(doc);
6871
}, this);
6972
});
73+
}
74+
}
75+
76+
function _expand(compact) {
    // Rebuilds the regular serialized index layout from the compacted one.
    // Format: https://john-millikin.com/compacting-lunr-search-indices
    const fields = compact["fields"];

    // In each vector, a null at an even position stands for "previous even-position value + 1".
    const fieldVectors = compact["fieldVectors"].map(([vectorRef, packed]) => {
        let lastEven = null;
        const unpacked = packed.map((value, position) => {
            if (position % 2 !== 0) {
                return value;
            }
            lastEven = value === null ? lastEven + 1 : value;
            return lastEven;
        });
        return [vectorRef, unpacked];
    });

    // Each compact entry is [token, pairsForField0, pairsForField1, ...];
    // its position in the array is used as the token's "_index".
    const invertedIndex = compact["invertedIndex"].map((entry, position) => {
        const postings = {"_index": position};
        fields.forEach((fieldName, fieldPosition) => {
            const prefixLength = (fieldName + '/').length;
            const byDocRef = {};
            let currentRef = null;
            entry[fieldPosition + 1].forEach((value, k) => {
                if (k % 2 === 0) {
                    // Even slot: a position in fieldVectors whose key is "field/docRef".
                    currentRef = fieldVectors[value][0].slice(prefixLength);
                } else {
                    // Odd slot: the value stored for the doc ref just read.
                    byDocRef[currentRef] = value;
                }
            });
            postings[fieldName] = byDocRef;
        });
        return [entry[0], postings];
    });

    // Order the entries by token.
    invertedIndex.sort(([left], [right]) => (left < right ? -1 : (left > right ? 1 : 0)));

    return {
        "version": compact["version"],
        "fields": fields,
        "fieldVectors": fieldVectors,
        "invertedIndex": invertedIndex,
        "pipeline": compact["pipeline"],
    };
}
71128
72129
function search(query) {

pdoc/test/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,13 @@ def test_lunr_search(self):
272272
files = self.PUBLIC_FILES + ["doc-search.html", "index.js"]
273273
self._basic_html_assertions(expected_files=files)
274274
self._check_files(exclude_patterns=['class="gcse-search"'])
275-
self._check_files(include_patterns=['URLS=[\n"example_pkg/index.html",\n"example_pkg/'],
276-
file_pattern='index.js')
275+
if shutil.which('node'):
276+
self._check_files(include_patterns=['INDEX={"version"'],
277+
file_pattern='index.js')
278+
else:
279+
self._check_files(
280+
include_patterns=['URLS=[\n"example_pkg/index.html",\n"example_pkg/'],
281+
file_pattern='index.js')
277282
self._check_files(include_patterns=["'../doc-search.html#'"],
278283
file_pattern='example_pkg/index.html')
279284
self._check_files(include_patterns=["'../doc-search.html#'"],

0 commit comments

Comments
 (0)