Cache kpsewhich results persistently · matplotlib/matplotlib@1ef6c98 · GitHub
[go: up one dir, main page]

Skip to content

Commit 1ef6c98

Browse files
committed
Cache kpsewhich results persistently
And allow batching them. This commit does not yet use the batching but makes it possible.
1 parent 2bd942c commit 1ef6c98

File tree

4 files changed

+347
-18
lines changed

4 files changed

+347
-18
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
dviread changes
2+
---------------
3+
4+
The ``format`` keyword argument to ``dviread.find_tex_file`` has been
5+
deprecated. The function without the ``format`` argument, as well as
6+
the new ``dviread.find_tex_files`` function, cache their results in
7+
``texsupport.N.db`` in the cache directory to speed up dvi file
8+
processing.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
TeX support cache
2+
-----------------
3+
4+
The `usetex` feature sends snippets of TeX code to LaTeX and related
5+
external tools for processing. This causes a nontrivial number of
6+
helper processes to be spawned, which can be slow on some platforms.
7+
A new cache database helps reduce the need to spawn these helper
8+
processes, which should improve `usetex` processing speed.
9+
10+
The new cache files
11+
~~~~~~~~~~~~~~~~~~~
12+
13+
The cache database is stored in a file named `texsupport.N.db` in the
14+
standard cache directory (traditionally `$HOME/.matplotlib` but
15+
possibly `$HOME/.cache/matplotlib`), where `N` stands for a version
16+
number. The version number is incremented when new kinds of items are
17+
added to the caching code, in order to avoid version clashes when
18+
using multiple different versions of Matplotlib. The auxiliary files
19+
`texsupport.N.db-wal` and `texsupport.N.db-shm` help coordinate usage
20+
of the cache between concurrently running instances. All of these
21+
cache files may be deleted when Matplotlib is not running, and
22+
subsequent calls to the `usetex` code will recompute the TeX results.

lib/matplotlib/dviread.py

Lines changed: 233 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,11 @@
2424
import os
2525
import re
2626
import struct
27+
import sqlite3
2728
import sys
2829
import textwrap
2930

30-
from matplotlib import cbook, rcParams
31+
from matplotlib import cbook, get_cachedir, rcParams
3132
from matplotlib.compat import subprocess
3233

3334
_log = logging.getLogger(__name__)
@@ -978,45 +979,259 @@ def _parse(self, file):
978979
return re.findall(br'/([^][{}<>\s]+)', data)
979980

980981

981-
def find_tex_file(filename, format=None):
982+
class TeXSupportCacheError(Exception):
    """Error raised when the TeX support cache database is unusable.

    Raised by `TeXSupportCache` when an existing database file carries a
    schema version other than the one this code expects.
    """
984+
985+
986+
class TeXSupportCache:
    """A persistent cache of data about the support files needed by dvi
    files produced by TeX. Currently holds results from :program:`kpsewhich`,
    in future versions could hold pre-parsed font data etc.

    Usage::

        # create or get the singleton instance
        cache = TeXSupportCache.get_cache()
        with cache.connection as transaction:
            cache.update_pathnames(
                {"pdftex.map": "/usr/local/pdftex.map",
                 "cmsy10.pfb": "/usr/local/fonts/cmsy10.pfb"},
                transaction)
        pathnames = cache.get_pathnames(["pdftex.map", "cmr10.pfb"])
        # now pathnames = {"pdftex.map": "/usr/local/pdftex.map"}

        # optional after inserting new data, may improve query performance:
        cache.optimize()

    Parameters
    ----------

    filename : str, optional
        File in which to store the cache. Defaults to `texsupport.N.db` in
        the standard cache directory where N is the current schema version.

    Attributes
    ----------

    connection
        This database connection object has a context manager to set up
        a transaction. Transactions are passed into methods that write to
        the database.

    Raises
    ------

    TeXSupportCacheError
        If the database file already exists with a schema version other
        than `schema_version`.
    """

    # The trailing comma matters: ('connection') is just a parenthesised
    # string, not a one-element tuple.
    __slots__ = ('connection',)
    schema_version = 1  # written to PRAGMA user_version by _create
    instance = None

    @classmethod
    def get_cache(cls):
        "Return the singleton instance of the cache, at the default location"
        if cls.instance is None:
            cls.instance = cls()
        return cls.instance

    def __init__(self, filename=None):
        if filename is None:
            filename = os.path.join(get_cachedir(), 'texsupport.%d.db'
                                    % self.schema_version)

        self.connection = sqlite3.connect(
            filename, isolation_level="DEFERRED")
        with self.connection as conn:
            conn.execute("PRAGMA journal_mode=WAL;")
            version, = conn.execute("PRAGMA user_version;").fetchone()

        # A freshly created database file reports user_version 0.
        if version == 0:
            self._create()
        elif version != self.schema_version:
            raise TeXSupportCacheError(
                "support database %s has version %d, expected %d"
                % (filename, version, self.schema_version))

    def _create(self):
        """Create the database tables and stamp the schema version."""
        with self.connection as conn:
            # PRAGMA statements cannot take bound parameters, so the
            # (integer) schema version is formatted into the script; this
            # keeps user_version and schema_version from drifting apart.
            conn.executescript(
                """
                PRAGMA page_size=4096;
                CREATE TABLE file_path(
                    filename TEXT PRIMARY KEY NOT NULL,
                    pathname TEXT
                ) WITHOUT ROWID;
                PRAGMA user_version=%d;
                """ % self.schema_version)

    def optimize(self):
        """Optional optimization phase after updating data.
        Executes sqlite's `PRAGMA optimize` statement, which can call
        `ANALYZE` or other functions that can improve future query performance
        by spending some time up-front."""
        with self.connection as conn:
            conn.execute("PRAGMA optimize;")

    def get_pathnames(self, filenames):
        """Query the cache for pathnames related to `filenames`.

        Parameters
        ----------
        filenames : iterable of str
            May be any iterable, including a one-shot iterator.

        Returns
        -------
        mapping from str to (str or None)
            For those filenames that exist in the cache, the mapping
            includes either the related pathname or None to indicate that
            the named file does not exist.
        """
        # Materialize first: the names are needed twice (once to count the
        # placeholders, once as the bound parameters), which would silently
        # exhaust a generator.
        filenames = list(filenames)
        if not filenames:
            return {}
        placeholders = ','.join('?' * len(filenames))
        rows = self.connection.execute(
            "SELECT filename, pathname FROM file_path WHERE filename IN "
            "(%s)" % placeholders,
            filenames).fetchall()
        return dict(rows)

    def update_pathnames(self, mapping, transaction):
        """Update the cache with the given filename-to-pathname mapping

        Parameters
        ----------
        mapping : mapping from str to (str or None)
            Mapping from filenames to the corresponding full pathnames
            or None to indicate that the named file does not exist.
        transaction : obtained via the context manager of self.connection
        """
        transaction.executemany(
            "INSERT OR REPLACE INTO file_path (filename, pathname) "
            "VALUES (?, ?)",
            mapping.items())
1107+
1108+
1109+
def find_tex_files(filenames, cache=None):
    """Find multiple files in the texmf tree. This can be more efficient
    than `find_tex_file` because it makes only one call to `kpsewhich`.

    Calls :program:`kpsewhich` which is an interface to the kpathsea
    library [1]_. Most existing TeX distributions on Unix-like systems use
    kpathsea. It is also available as part of MikTeX, a popular
    distribution on Windows.

    The results are cached into the TeX support database. In case of
    mistaken results, deleting the database resets the cache.

    Parameters
    ----------
    filenames : iterable of (string or bytestring)
        Names of the files to look up. Bytestrings are decoded as UTF-8.
    cache : TeXSupportCache, optional
        Cache instance to use, defaults to the singleton instance of the class.

    Returns
    -------
    mapping from str to (str or None)
        Maps each (decoded) filename to its full pathname, or to None if
        the file was not found.

    References
    ----------

    .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_
        The library that :program:`kpsewhich` is part of.

    """

    # we expect these to always be ascii encoded, but use utf-8
    # out of caution
    filenames = [f.decode('utf-8', errors='replace')
                 if isinstance(f, bytes) else f
                 for f in filenames]
    if cache is None:
        cache = TeXSupportCache.get_cache()
    result = cache.get_pathnames(filenames)

    # Only names the cache knows nothing about need a kpsewhich lookup;
    # drop duplicates (keeping order) so each name is searched only once.
    uncached = []
    seen = set()
    for f in filenames:
        if f not in result and f not in seen:
            seen.add(f)
            uncached.append(f)
    if not uncached:
        return result

    cmd = ['kpsewhich'] + uncached
    _log.debug('find_tex_files: %s', cmd)
    pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    output = pipe.communicate()[0].decode('ascii').splitlines()
    _log.debug('find_tex_files result: %s', output)
    # kpsewhich prints one line per file it found, in argument order, and
    # nothing for files it did not find; _match pairs them back up.
    mapping = _match(uncached, output)
    with cache.connection as transaction:
        cache.update_pathnames(mapping, transaction)
    result.update(mapping)

    return result
1159+
1160+
1161+
def _match(filenames, pathnames):
1162+
"""
1163+
Match filenames to pathnames in lists that are in matching order,
1164+
except that some filenames may lack pathnames.
1165+
"""
1166+
result = {f: None for f in filenames}
1167+
filenames, pathnames = iter(filenames), iter(pathnames)
1168+
try:
1169+
filename, pathname = next(filenames), next(pathnames)
1170+
while True:
1171+
if pathname.endswith(os.path.sep + filename):
1172+
result[filename] = pathname
1173+
pathname = next(pathnames)
1174+
filename = next(filenames)
1175+
except StopIteration:
1176+
return result
1177+
1178+
1179+
def find_tex_file(filename, format=None, cache=None):
    """
    Find a file in the texmf tree.

    Calls :program:`kpsewhich` which is an interface to the kpathsea
    library [1]_. Most existing TeX distributions on Unix-like systems use
    kpathsea. It is also available as part of MikTeX, a popular
    distribution on Windows.

    Unless the deprecated `format` argument is given, the results are
    cached into a database whose location defaults to
    :file:`texsupport.N.db` in the Matplotlib cache directory, where N is
    the cache schema version. In case of mistaken results, deleting this
    file resets the cache.

    Parameters
    ----------
    filename : string or bytestring
    format : string or bytestring, DEPRECATED
        Used as the value of the `--format` option to :program:`kpsewhich`.
        Could be e.g. 'tfm' or 'vf' to limit the search to that type of files.
        Deprecated to allow batching multiple filenames into one kpsewhich
        call, since any format option would apply to all filenames at once.
    cache : TeXSupportCache, optional
        Cache instance to use, defaults to the singleton instance of the class.

    Returns
    -------
    str or None
        The full pathname of the file. If the file is not found, the
        result is None, except in the deprecated `format` code path,
        which bypasses the cache and returns the (possibly empty) output
        of :program:`kpsewhich` as a string.

    References
    ----------

    .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_
        The library that :program:`kpsewhich` is part of.
    """

    if format is not None:
        cbook.warn_deprecated(
            "3.0",
            "The format option to find_tex_file is deprecated "
            "to allow batching multiple filenames into one call. "
            "Omitting the option should not change the result, as "
            "kpsewhich uses the filename extension to choose the path.")
        # we expect these to always be ascii encoded, but use utf-8
        # out of caution
        if isinstance(filename, bytes):
            filename = filename.decode('utf-8', errors='replace')
        if isinstance(format, bytes):
            format = format.decode('utf-8', errors='replace')

        # The legacy path runs kpsewhich directly: a --format restriction
        # cannot be represented in the filename-keyed cache.
        cmd = ['kpsewhich']
        if format is not None:
            cmd += ['--format=' + format]
        cmd += [filename]
        _log.debug('find_tex_file(%s): %s', filename, cmd)
        pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        result = pipe.communicate()[0].rstrip()
        _log.debug('find_tex_file result: %s', result)
        return result.decode('ascii')

    # Take the single value rather than indexing by `filename`: the key in
    # the returned mapping is the decoded name, which differs from a
    # bytestring argument.
    return list(find_tex_files([filename], cache).values())[0]
10201235

10211236

10221237
# With multiple text objects per figure (e.g., tick labels) we may end

0 commit comments

Comments
 (0)
0