10000 feat: add persistent worker for sphinxdocs (#2938) · benjaminp/rules_python@0498664 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0498664

Browse files
kaycebasquesKayce Basquesrickeylev
authored
feat: add persistent worker for sphinxdocs (bazel-contrib#2938)
This implements a simple, serialized persistent worker for Sphinxdocs with several optimizations. It is enabled by default. * The worker computes what inputs have changed, allowing Sphinx to only rebuild what is necessary. * Doctrees are written to a separate directory so they are retained between builds. * The worker tells Sphinx to write output to an internal directory, then copies it to the expected Bazel output directory afterwards. This allows Sphinx to only write output files that need to be updated. This works by having the worker compute what files have changed and having a Sphinx extension use the `env-get-outdated` event to tell Sphinx which files have changed. The extension is based on https://pwrev.dev/294057, but re-implemented to be in-memory as part of the worker instead of a separate extension projects must configure. For rules_python's doc building, this reduces incremental building from about 8 seconds to about 0.8 seconds. From what I can tell, about half the time is spent generating doctrees, and the other half generating the output files. Worker mode is enabled by default and can be disabled on the target or by adjusting the Bazel flags controlling execution strategy. Docs added to explain how. Because `--doctree-dir` is now always specified and outside the output dir, non-worker invocations can benefit, too, if run without sandboxing. Docs added to explain how to do this. Along the way: * Remove `--write-all` and `--fresh-env` from run args. This lets direct invocations benefit from the normal caching Sphinx does. * Change the args formatting to `--foo=bar` so they are a single element; just a bit nicer to see when debugging. Work towards bazel-contrib#2878, bazel-contrib#2879 --------- Co-authored-by: Kayce Basques <kayce@google.com> Co-authored-by: Richard Levasseur <rlevasseur@google.com>
1 parent 9429ae6 commit 0498664

File tree

5 files changed

+302
-13
lines changed

5 files changed

+302
-13
lines changed

sphinxdocs/docs/index.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,29 @@ documentation. It comes with:
1111
While it is primarily oriented towards docgen for Starlark code, the core of it
1212
is agnostic as to what is being documented.
1313

14+
### Optimization
15+
16+
Normally, Sphinx keeps various cache files to improve incremental building.
17+
Unfortunately, programs performing their own caching don't interact well
18+
with Bazel's model of precisely declaring and strictly enforcing what are
19+
inputs, what are outputs, and what files are available when running a program.
20+
The net effect is that programs don't have a prior invocation's cache files
21+
available.
22+
23+
There are two mechanisms available to make some cache available to Sphinx under
24+
Bazel:
25+
26+
* Disable sandboxing, which allows some files from prior invocations to be
27+
visible to subsequent invocations. This can be done multiple ways:
28+
* Set `tags = ["no-sandbox"]` on the `sphinx_docs` target
29+
* `--modify_execution_info=SphinxBuildDocs=+no-sandbox` (Bazel flag)
30+
* `--strategy=SphinxBuildDocs=local` (Bazel flag)
31+
* Use persistent workers (enabled by default) by setting
32+
`allow_persistent_workers=True` on the `sphinx_docs` target. Note that other
33+
Bazel flags can disable using workers even if an action supports it. Setting
34+
`--strategy=SphinxBuildDocs=dynamic,worker,local,sandbox` should tell Bazel
35+
to use workers if possible, otherwise fall back to non-worker invocations.
36+
1437

1538
```{toctree}
1639
:hidden:

sphinxdocs/private/sphinx.bzl

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def sphinx_docs(
103103
strip_prefix = "",
104104
extra_opts = [],
105105
tools = [],
106+
allow_persistent_workers = True,
106107
**kwargs):
107108
"""Generate docs using Sphinx.
108109
@@ -142,6 +143,9 @@ def sphinx_docs(
142143
tools: {type}`list[label]` Additional tools that are used by Sphinx and its plugins.
143144
This just makes the tools available during Sphinx execution. To locate
144145
them, use {obj}`extra_opts` and `$(location)`.
146+
allow_persistent_workers: {type}`bool` (experimental) If true, allow
147+
using persistent workers for running Sphinx, if Bazel decides to do so.
148+
This can improve incremental building of docs.
145149
**kwargs: {type}`dict` Common attributes to pass onto rules.
146150
"""
147151
add_tag(kwargs, "@rules_python//sphinxdocs:sphinx_docs")
@@ -165,6 +169,7 @@ def sphinx_docs(
165169
source_tree = internal_name + "/_sources",
166170
extra_opts = extra_opts,
167171
tools = tools,
172+
allow_persistent_workers = allow_persistent_workers,
168173
**kwargs
169174
)
170175

@@ -209,6 +214,7 @@ def _sphinx_docs_impl(ctx):
209214
source_path = source_dir_path,
210215
output_prefix = paths.join(ctx.label.name, "_build"),
211216
inputs = inputs,
217+
allow_persistent_workers = ctx.attr.allow_persistent_workers,
212218
)
213219
outputs[format] = output_dir
214220
per_format_args[format] = args_env
@@ -229,6 +235,10 @@ def _sphinx_docs_impl(ctx):
229235
_sphinx_docs = rule(
230236
implementation = _sphinx_docs_impl,
231237
attrs = {
238+
"allow_persistent_workers": attr.bool(
239+
doc = "(experimental) Whether to invoke Sphinx as a persistent worker.",
240+
default = False,
241+
),
232242
"extra_opts": attr.string_list(
233243
doc = "Additional options to pass onto Sphinx. These are added after " +
234244
"other options, but before the source/output args.",
@@ -254,28 +264,45 @@ _sphinx_docs = rule(
254264
},
255265
)
256266

257-
def _run_sphinx(ctx, format, source_path, inputs, output_prefix):
267+
def _run_sphinx(ctx, format, source_path, inputs, output_prefix, allow_persistent_workers):
258268
output_dir = ctx.actions.declare_directory(paths.join(output_prefix, format))
259269

260270
run_args = [] # Copy of the args to forward along to debug runner
261271
args = ctx.actions.args() # Args passed to the action
262272

273+
# An args file is required for persistent workers, but we don't know if
274+
# the action will use worker mode or not (settings we can't see may
275+
# force non-worker mode). For consistency, always use a params file.
276+
args.use_param_file("@%s", use_always = True)
277+
args.set_param_file_format("multiline")
278+
279+
# NOTE: sphinx_build.py relies on the first two args being the srcdir and
280+
# outputdir, in that order.
281+
args.add(source_path)
282+
args.add(output_dir.path)
283+
263284
args.add("--show-traceback") # Full tracebacks on error
264285
run_args.append("--show-traceback")
265-
args.add("--builder", format)
266-
run_args.extend(("--builder", format))
286+
args.add(format, format = "--builder=%s")
287+
run_args.append("--builder={}".format(format))
267288

268289
if ctx.attr._quiet_flag[BuildSettingInfo].value:
269290
# Not added to run_args because run_args is for debugging
270291
args.add("--quiet") # Suppress stdout informational text
271292

272293
# Build in parallel, if possible
273294
# Don't add to run_args: parallel building breaks interactive debugging
274-
args.add("--jobs", "auto")
275-
args.add("--fresh-env") # Don't try to use cache files. Bazel can't make use of them.
276-
run_args.append("--fresh-env")
277-
args.add("--write-all") # Write all files; don't try to detect "changed" files
278-
run_args.append("--write-all")
295+
args.add("--jobs=auto")
296+
297+
# Put the doctree dir outside of the output directory.
298+
# This allows it to be reused between invocations when possible; Bazel
299+
# clears the output directory every action invocation.
300+
# * For workers, they can fully re-use it.
301+
# * For non-workers, it can be reused when sandboxing is disabled via
302+
# the `no-sandbox` tag or execution requirement.
303+
#
304+
# We also use a non-dot prefixed name so it shows up more visibly.
305+
args.add(paths.join(output_dir.path + "_doctrees"), format = "--doctree-dir=%s")
279306

280307
for opt in ctx.attr.extra_opts:
281308
expanded = ctx.expand_location(opt)
@@ -287,9 +314,6 @@ def _run_sphinx(ctx, format, source_path, inputs, output_prefix):
287314
for define in extra_defines:
288315
run_args.extend(("--define", define))
289316

290-
args.add(source_path)
291-
args.add(output_dir.path)
292-
293317
env = dict([
294318
v.split("=", 1)
295319
for v in ctx.attr._extra_env_flag[_FlagInfo].value
@@ -299,6 +323,14 @@ def _run_sphinx(ctx, format, source_path, inputs, output_prefix):
299323
for tool in ctx.attr.tools:
300324
tools.append(tool[DefaultInfo].files_to_run)
301325

326+
# NOTE: Command line flags or RBE capabilities may override the execution
327+
# requirements and disable workers. Thus, we can't assume that these
328+
# exec requirements will actually be respected.
329+
execution_requirements = {}
330+
if allow_persistent_workers:
331+
execution_requirements["supports-workers"] = "1"
332+
execution_requirements["requires-worker-protocol"] = "json"
333+
302334
ctx.actions.run(
303335
executable = ctx.executable.sphinx,
304336
arguments = [args],
@@ -308,6 +340,7 @@ def _run_sphinx(ctx, format, source_path, inputs, output_prefix):
308340
mnemonic = "SphinxBuildDocs",
309341
progress_message = "Sphinx building {} for %{{label}}".format(format),
310342
env = env,
343+
execution_requirements = execution_requirements,
311344
)
312345
return output_dir, struct(args = run_args, env = env)
313346

sphinxdocs/private/sphinx_build.py

Lines changed: 229 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,235 @@
1+
import contextlib
2+
import io
3+
import json
4+
import logging
15
import os
2-
import pathlib
6+
import shutil
37
import sys
8+
import traceback
9+
import typing
410

11+
import sphinx.application
512
from sphinx.cmd.build import main
613

14+
WorkRequest = object
15+
WorkResponse = object
16+
17+
logger = logging.getLogger("sphinxdocs_build")
18+
19+
_WORKER_SPHINX_EXT_MODULE_NAME = "bazel_worker_sphinx_ext"
20+
21+
# Config value name for getting the path to the request info file
22+
_REQUEST_INFO_CONFIG_NAME = "bazel_worker_request_info_path"
23+
24+
25+
class Worker:
26+
27+
def __init__(
28+
self, instream: "typing.TextIO", outstream: "typing.TextIO", exec_root: str
29+
):
30+
# NOTE: Sphinx performs its own logging re-configuration, so any
31+
# logging config we do isn't respected by Sphinx. Controlling where
32+
# stdout and stderr goes are the main mechanisms. Recall that
33+
# Bazel sends worker stderr to the worker log file.
34+
# outputBase=$(bazel info output_base)
35+
# find $outputBase/bazel-workers/ -type f -printf '%T@ %p\n' | sort -n | tail -1 | awk '{print $2}'
36+
logging.basicConfig(level=logging.WARN)
37+
logger.info("Initializing worker")
38+
39+
# The directory that paths are relative to.
40+
self._exec_root = exec_root
41+
# Where requests are read from.
42+
self._instream = instream
43+
# Where responses are written to.
44+
self._outstream = outstream
45+
46+
# dict[str srcdir, dict[str path, str digest]]
47+
self._digests = {}
48+
49+
# Internal output directories the worker gives to Sphinx that need
50+
# to be cleaned up upon exit.
51+
# set[str path]
52+
self._worker_outdirs = set()
53+
self._extension = BazelWorkerExtension()
54+
55+
sys.modules[_WORKER_SPHINX_EXT_MODULE_NAME] = self._extension
56+
sphinx.application.builtin_extensions += (_WORKER_SPHINX_EXT_MODULE_NAME,)
57+
58+
def __enter__(self):
59+
return self
60+
61+
def __exit__(self):
62+
for worker_outdir in self._worker_outdirs:
63+
shutil.rmtree(worker_outdir, ignore_errors=True)
64+
65+
def run(self) -> None:
66+
logger.info("Worker started")
67+
try:
68+
while True:
69+
request = None
70+
try:
71+
request = self._get_next_request()
72+
if request is None:
73+
logger.info("Empty request: exiting")
74+
break
75+
response = self._process_request(request)
76+
if response:
77+
self._send_response(response)
78+
except Exception:
79+
logger.exception("Unhandled error: request=%s", request)
80+
output = (
81+
f"Unhandled error:\nRequest id: {request.get('id')}\n"
82+
+ traceback.format_exc()
83+
)
84+
request_id = 0 if not request else request.get("requestId", 0)
85+
self._send_response(
86+
{
87+
"exitCode": 3,
88+
"output": output,
89+
"requestId": request_id,
90+
}
91+
)
92+
finally:
93+
logger.info("Worker shutting down")
94+
95+
def _get_next_request(self) -> "object | None":
96+
line = self._instream.readline()
97+
if not line:
98+
return None
99+
return json.loads(line)
100+
101+
def _send_response(self, response: "WorkResponse") -> None:
102+
self._outstream.write(json.dumps(response) + "\n")
103+
self._outstream.flush()
104+
105+
def _prepare_sphinx(self, request):
106+
sphinx_args = request["arguments"]
107+
srcdir = sphinx_args[0]
108+
109+
incoming_digests = {}
110+
current_digests = self._digests.setdefault(srcdir, {})
111+
changed_paths = []
112+
request_info = {"exec_root": self._exec_root, "inputs": request["inputs"]}
113+
for entry in request["inputs"]:
114+
path = entry["path"]
115+
digest = entry["digest"]
116+
# Make the path srcdir-relative so Sphinx understands it.
117+
path = path.removeprefix(srcdir + "/")
118+
incoming_digests[path] = digest
119+
120+
if path not in current_digests:
121+
logger.info("path %s new", path)
122+
changed_paths.append(path)
123+
elif current_digests[path] != digest:
124+
logger.info("path %s changed", path)
125+
changed_paths.append(path)
126+
127+
self._digests[srcdir] = incoming_digests
128+
self._extension.changed_paths = changed_paths
129+
request_info["changed_sources"] = changed_paths
130+
131+
bazel_outdir = sphinx_args[1]
132+
worker_outdir = bazel_outdir + ".worker-out.d"
133+
self._worker_outdirs.add(worker_outdir)
134+
sphinx_args[1] = worker_outdir
135+
136+
request_info_path = os.path.join(srcdir, "_bazel_worker_request_info.json")
137+
with open(request_info_path, "w") as fp:
138+
json.dump(request_info, fp)
139+
sphinx_args.append(f"--define={_REQUEST_INFO_CONFIG_NAME}={request_info_path}")
140+
141+
return worker_outdir, bazel_outdir, sphinx_args
142+
143+
@contextlib.contextmanager
144+
def _redirect_streams(self):
145+
out = io.StringIO()
146+
orig_stdout = sys.stdout
147+
try:
148+
sys.stdout = out
149+
yield out
150+
finally:
151+
sys.stdout = orig_stdout
152+
153+
def _process_request(self, request: "WorkRequest") -> "WorkResponse | None":
154+
logger.info("Request: %s", json.dumps(request, sort_keys=True, indent=2))
155+
if request.get("cancel"):
156+
return None
157+
158+
worker_outdir, bazel_outdir, sphinx_args = self._prepare_sphinx(request)
159+
160+
# Prevent anything from going to stdout because it breaks the worker
161+
# protocol. We have limited control over where Sphinx sends output.
162+
with self._redirect_streams() as stdout:
163+
logger.info("main args: %s", sphinx_args)
164+
exit_code = main(sphinx_args)
165+
166+
if exit_code:
167+
raise Exception(
168+
"Sphinx main() returned failure: "
169+
+ f" exit code: {exit_code}\n"
170+
+ "========== STDOUT START ==========\n"
171+
+ stdout.getvalue().rstrip("\n")
172+
+ "\n"
173+
+ "========== STDOUT END ==========\n"
174+
)
175+
176+
# Copying is unfortunately necessary because Bazel doesn't know to
177+
# implicitly bring along what the symlinks point to.
178+
shutil.copytree(worker_outdir, bazel_outdir, dirs_exist_ok=True)
179+
180+
response = {
181+
"requestId": request.get("requestId", 0),
182+
"output": stdout.getvalue(),
183+
"exitCode": 0,
184+
}
185+
return response
186+
187+
188+
class BazelWorkerExtension:
189+
"""A Sphinx extension implemented as a class acting like a module."""
190+
191+
def __init__(self):
192+
# Make it look like a Module object
193+
self.__name__ = _WORKER_SPHINX_EXT_MODULE_NAME
194+
# set[str] of src-dir relative path names
195+
self.changed_paths = set()
196+
197+
def setup(self, app):
198+
app.add_config_value(_REQUEST_INFO_CONFIG_NAME, "", "")
199+
app.connect("env-get-outdated", self._handle_env_get_outdated)
200+
return {"parallel_read_safe": True, "parallel_write_safe": True}
201+
202+
def _handle_env_get_outdated(self, app, env, added, changed, removed):
203+
changed = {
204+
# NOTE: path2doc returns None if it's not a doc path
205+
env.path2doc(p)
206+
for p in self.changed_paths
207+
}
208+
209+
logger.info("changed docs: %s", changed)
210+
return changed
211+
212+
213+
def _worker_main(stdin, stdout, exec_root):
214+
with Worker(stdin, stdout, exec_root) as worker:
215+
return worker.run()
216+
217+
218+
def _non_worker_main():
219+
args = []
220+
for arg in sys.argv:
221+
if arg.startswith("@"):
222+
with open(arg.removeprefix("@")) as fp:
223+
lines = [line.strip() for line in fp if line.strip()]
224+
args.extend(lines)
225+
else:
226+
args.append(arg)
227+
sys.argv[:] = args
228+
return main()
229+
230+
7231
if __name__ == "__main__":
8-
sys.exit(main())
232+
if "--persistent_worker" in sys.argv:
233+
sys.exit(_worker_main(sys.stdin, sys.stdout, os.getcwd()))
234+
else:
235+
sys.exit(_non_worker_main())

0 commit comments

Comments
 (0)
0