Type-1 subsetting · jkseppan/matplotlib@6546417 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6546417

Browse files
committed
Type-1 subsetting
With this change I can produce smaller PDF files with usetex in some small tests, but it obviously needs more extensive testing, so I am marking it as a draft. Builds on matplotlib#20634 and matplotlib#20715. Closes matplotlib#127.
1 parent e35728b commit 6546417

File tree

3 files changed

+259
-16
lines changed

3 files changed

+259
-16
lines changed

lib/matplotlib/backends/_backend_pdf_ps.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,25 @@ class CharacterTracker:
2727
def __init__(self):
    """Start with no characters recorded for any font."""
    # Maps a font name to the set of character codes typeset with that font.
    self.used = dict()
2929

30-
def track(self, font, s):
31-
"""Record that string *s* is being typeset using font *font*."""
30+
@staticmethod
31+
def _get_name(font):
3232
if isinstance(font, str):
3333
# Unused, can be removed after removal of track_characters.
3434
fname = font
35-
else:
35+
elif hasattr(font, 'fname'):
3636
fname = font.fname
37-
self.used.setdefault(fname, set()).update(map(ord, s))
37+
elif hasattr(font, 'name'):
38+
fname = font.name
39+
if isinstance(fname, bytes):
40+
fname = fname.decode('ascii', 'error')
41+
return fname
42+
43+
def get_used(self, font, default=None):
    """
    Return the character codes recorded for *font*.

    *default* is returned when nothing has been tracked for the font.
    """
    key = self._get_name(font)
    return self.used.get(key, default)
45+
46+
def track(self, font, s):
    """Record that string *s* is being typeset using font *font*."""
    key = self._get_name(font)
    # One set of code points per font name, created on first use.
    codepoints = self.used.setdefault(key, set())
    codepoints.update(ord(char) for char in s)
3849

3950
# Not public, can be removed when pdf/ps merge_used_characters is removed.
4051
def merge(self, other):

lib/matplotlib/backends/backend_pdf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,8 @@ def _embedTeXFont(self, fontinfo):
963963
t1font = type1font.Type1Font(fontinfo.fontfile)
964964
if fontinfo.effects:
965965
t1font = t1font.transform(fontinfo.effects)
966+
chars = self._character_tracker.get_used(fontinfo.pdfname)
967+
t1font = t1font.subset(chars)
966968
fontdict['BaseFont'] = Name(t1font.prop['FontName'])
967969

968970
# Font descriptors may be shared between differently encoded
@@ -2227,6 +2229,7 @@ def draw_tex(self, gc, x, y, s, prop, angle, *, mtext=None):
22272229
seq += [['font', pdfname, dvifont.size]]
22282230
oldfont = dvifont
22292231
seq += [['text', x1, y1, [bytes([glyph])], x1+width]]
2232+
self.file._character_tracker.track(pdfname, chr(glyph))
22302233

22312234
# Find consecutive text strings with constant y coordinate and
22322235
# combine into a sequence of strings and kerns, or just one

lib/matplotlib/type1font.py

Lines changed: 241 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
v1.1, 1993. ISBN 0-201-57044-0.
2222
"""
2323

24+
import base64
2425
import binascii
2526
import logging
2627
import re
@@ -35,7 +36,36 @@
3536
_log = logging.getLogger(__name__)
3637

3738

39+
def _make_tag(set):
    """
    Hash set into a six-character tag made of uppercase letters

    Useful for adding a tag into subsetted fonts while keeping the code
    reproducible. The function always returns the same value for the
    same set, independently of the Python process (the built-in `hash`
    is randomized per process for strings, so it is not used here), but
    is not guaranteed to not have collisions.

    Parameters
    ----------
    set : iterable
        The set of glyphs present in a font subset. Duplicates and order
        are ignored.

    Returns
    -------
    bytes
        Six uppercase ASCII letters
    """
    # Local import: the module-level import block is not touched here.
    import hashlib

    # The parameter name shadows the builtin, so only frozenset is used.
    # Canonicalize: deduplicate, then sort by repr so that the result
    # does not depend on iteration order or on the element type.
    canonical = '\0'.join(sorted(repr(item) for item in frozenset(set)))
    digest = hashlib.sha256(canonical.encode('utf-8')).digest()
    # turn the bytes into characters with b32encode, which uses uppercase
    # letters and numbers from 2 to 7 - remap those arbitrarily
    trans = bytes.maketrans(b'234567', b'MTPLIB')
    return base64.b32encode(digest[:8]).translate(trans, delete=b'=')[:6]
65+
66+
3867
class _Token:
68+
3969
"""
4070
A token in a PostScript stream
4171
@@ -485,6 +515,15 @@ def convert(x): return x.decode('ascii', 'replace')
485515
except StopIteration:
486516
break
487517

518+
# there are some standard abbreviations whose names vary
519+
# so detect them
520+
if value == b'{noaccess def}':
521+
self._abbr['ND'] = key.encode('ascii')
522+
elif value == b'{noaccess put}':
523+
self._abbr['NP'] = key.encode('ascii')
524+
elif value == b'{string currentfile exch readstring pop}':
525+
self._abbr['RD'] = key.encode('ascii')
526+
488527
# sometimes noaccess def and readonly def are abbreviated
489528
if kw.is_name(b'def', self._abbr['ND'], self._abbr['NP']):
490529
prop[key] = value
@@ -556,13 +595,16 @@ def _parse_subrs(self, tokens, _data):
556595
"Token preceding subr must be RD or equivalent, "
557596
f"was {token}"
558597
)
598+
if not token.is_name(self._abbr['RD']):
599+
raise RuntimeError(
600+
f"Token preceding subr must be RD or equivalent, was {token}"
601+
)
559602
binary_token = tokens.send(1+nbytes_token.numeric_value())
560603
array[index_token.numeric_value()] = binary_token.value[1:]
561604

562605
return array, next(tokens).endpos()
563606

564-
@staticmethod
565-
def _parse_charstrings(tokens, _data):
607+
def _parse_charstrings(self, tokens, _data):
566608
count_token = next(tokens)
567609
if not count_token.is_number():
568610
raise RuntimeError(
@@ -587,7 +629,11 @@ def _parse_charstrings(tokens, _data):
587629
f"Token following /{glyphname} in CharStrings definition "
588630
f"must be a number, was {nbytes_token}"
589631
)
590-
token = next(tokens) # usually RD or |-
632+
token = next(tokens)
633+
if not token.is_name(self._abbr['RD']):
634+
raise RuntimeError(
635+
f"Token preceding charstring must be RD or equivalent, was {token}"
636+
)
591637
binary_token = tokens.send(1+nbytes_token.numeric_value())
592638
charstrings[glyphname] = binary_token.value[1:]
593639

@@ -620,16 +666,15 @@ def _parse_encoding(tokens, _data):
620666
encoding[index_token.numeric_value()] = \
621667
name_token.value[1:].decode('ascii', 'replace')
622668

623-
@staticmethod
624-
def _parse_othersubrs(tokens, data):
669+
def _parse_othersubrs(self, tokens, data):
    """
    Consume tokens up to the ``def`` (or its ND abbreviation) that ends
    the definition.

    Returns the slice of *data* from the first consumed token to the end
    of the terminating token, and the end position of that token.
    """
    start = None
    while True:
        tok = next(tokens)
        if start is None:
            # Remember where the first token of the value begins.
            start = tok.pos
        if tok.is_delim():
            # Skip over a nested bracketed expression as a whole.
            _expression(tok, tokens, data)
            continue
        if tok.value in (b'def', self._abbr['ND']):
            end = tok.endpos()
            return data[start:end], end
634679

635680
def transform(self, effects):
@@ -684,7 +729,7 @@ def transform(self, effects):
684729
fontmatrix = (
685730
'[%s]' % ' '.join(_format_approx(x, 6) for x in array)
686731
).encode('ascii')
687-
replacements = (
732+
newparts = self._replace(
688733
[(x, b'/FontName/%s def' % fontname)
689734
for x in self._pos['FontName']]
690735
+ [(x, b'/ItalicAngle %a def' % italicangle)
@@ -694,6 +739,9 @@ def transform(self, effects):
694739
+ [(x, b'') for x in self._pos.get('UniqueID', [])]
695740
)
696741

742+
return Type1Font((newparts[0], self._encrypt(newparts[1], 'eexec'), self.parts[2]))
743+
744+
def _replace(self, replacements):
697745
data = bytearray(self.parts[0])
698746
data.extend(self.decrypted)
699747
len0 = len(self.parts[0])
@@ -708,11 +756,192 @@ def transform(self, effects):
708756
len0 += len(value) - pos1 + pos0
709757

710758
data = bytes(data)
711-
return Type1Font((
712-
data[:len0],
713-
self._encrypt(data[len0:], 'eexec'),
714-
self.parts[2]
715-
))
759+
return data[:len0], data[len0:]
760+
761+
def subset(self, characters):
    """
    Return a new font that only defines the given characters.

    The glyphs for the requested codes are kept together with every glyph
    and subroutine their charstrings refer to; ``.notdef`` is always
    kept.  The font name gets a six-letter tag prefix and any UniqueID
    definition is removed.

    Parameters
    ----------
    characters : iterable of int
        The character codes to include; they are matched against the keys
        of the font's ``Encoding`` property.

    Returns
    -------
    `Type1Font`
    """
    characters = set(characters)
    encoding = {code: glyph
                for code, glyph in self.prop['Encoding'].items()
                if code in characters}
    encoding[0] = '.notdef'

    # Transitive closure over glyph names, using a worklist so that each
    # glyph is simulated exactly once.
    glyphs = set(encoding.values())
    pending = list(glyphs)
    # Subrs 0-3 are always kept; presumably because the Type-1 format
    # reserves them for Flex and hint replacement -- TODO confirm.
    seen_subrs = {0, 1, 2, 3}
    while pending:
        glyph = pending.pop()
        called_glyphs, called_subrs, _, _ = self._simulate(glyph, [], [])
        new_glyphs = set(called_glyphs) - glyphs
        glyphs.update(new_glyphs)
        pending.extend(new_glyphs)
        seen_subrs.update(called_subrs)

    fontname = (
        _make_tag(glyphs) + b'+' + self.prop['FontName'].encode('ascii')
    )
    charstrings = self._subset_charstrings(glyphs)
    subrs = self._subset_subrs(seen_subrs)
    newparts = self._replace(
        [(x, b'/FontName/%s def' % fontname)
         for x in self._pos['FontName']]
        + [
            (self._pos['CharStrings'][0], charstrings),
            (self._pos['Subrs'][0], subrs),
            (self._pos['Encoding'][0], self._subset_encoding(encoding)),
        ]
        + [(x, b'') for x in self._pos.get('UniqueID', [])]
    )
    # Only the second part is re-encrypted; the trailer is reused as is.
    return Type1Font((
        newparts[0],
        self._encrypt(newparts[1], 'eexec'),
        self.parts[2],
    ))
802+
803+
@staticmethod
804+
def _charstring_tokens(data):
805+
data = iter(data)
806+
for byte in data:
807+
if 32 <= byte <= 246:
808+
yield byte - 139
809+
elif 247 <= byte <= 250:
810+
byte2 = next(data)
811+
yield (byte-247) * 256 + byte2 + 108
812+
elif 251 <= byte <= 254:
813+
byte2 = next(data)
814+
yield -(byte-251)*256 - byte2 - 108
815+
elif byte == 255:
816+
bs = itertools.islice(data, 4)
817+
yield struct.unpack('>i', bs)[0]
818+
elif byte == 12:
819+
byte1 = next(data)
820+
yield {
821+
0: 'dotsection',
822+
1: 'vstem3',
823+
2: 'hstem3',
824+
6: 'seac',
825+
7: 'sbw',
826+
12: 'div',
827+
16: 'callothersubr',
828+
17: 'pop',
829+
33: 'setcurrentpoint'
830+
}[byte1]
831+
else:
832+
yield {
833+
1: 'hstem',
834+
3: 'vstem',
835+
4: 'vmoveto',
836+
5: 'rlineto',
837+
6: 'hlineto',
838+
7: 'vlineto',
839+
8: 'rrcurveto',
840+
9: 'closepath',
841+
10: 'callsubr',
842+
11: 'return',
843+
13: 'hsbw',
844+
14: 'endchar',
845+
21: 'rmoveto',
846+
22: 'hmoveto',
847+
30: 'vhcurveto',
848+
31: 'hvcurveto'
849+
}[byte]
850+
851+
def _step(self, buildchar_stack, postscript_stack, opcode):
    """
    Simulate one charstring token.

    Parameters
    ----------
    buildchar_stack : list
        The operand stack before the token.
    postscript_stack : list
        The stack used to pass values between ``callothersubr`` and
        ``pop``.
    opcode : int or str
        A token as produced by `_charstring_tokens`.

    Returns
    -------
    glyphs : set of str
        Glyph names referenced by this token.
    subrs : set of int
        Subroutine indices called by this token.
    buildchar_stack, postscript_stack : list
        The stacks after the token.  The input lists are not modified.

    Raises
    ------
    RuntimeError
        If *opcode* is not handled.
    """
    if isinstance(opcode, int):
        return set(), set(), buildchar_stack + [opcode], postscript_stack
    elif opcode in {'hsbw', 'sbw', 'closepath', 'hlineto', 'hmoveto',
                    'hcurveto', 'hvcurveto', 'rlineto', 'rmoveto',
                    'rrcurveto', 'vhcurveto', 'vlineto', 'vmoveto',
                    'dotsection', 'hstem', 'hstem3', 'vstem', 'vstem3',
                    'setcurrentpoint'}:
        # These commands only consume operands.
        return set(), set(), [], postscript_stack
    elif opcode == 'seac':
        # NOTE(review): the codes are looked up in the font's own
        # Encoding; the Type-1 spec appears to define them relative to
        # StandardEncoding -- confirm.
        codes = buildchar_stack[3:5]
        glyphs = [self.prop['Encoding'][x] for x in codes]
        return set(glyphs), set(), [], postscript_stack
    elif opcode == 'div':
        num1, num2 = buildchar_stack[-2:]
        # Replace the two operands by their quotient, keeping everything
        # below them on the stack.
        return (set(), set(), buildchar_stack[:-2] + [num1/num2],
                postscript_stack)
    elif opcode == 'callothersubr':
        othersubr = buildchar_stack[-1]
        n = buildchar_stack[-2]
        args = buildchar_stack[-2-n:-2]
        if othersubr == 3:  # Section 8.1 in Type-1 spec
            pushed = [args[0]]
        else:
            pushed = args[::-1]
        return (set(), set(), buildchar_stack[:-n-2],
                postscript_stack + pushed)
    elif opcode == 'callsubr':
        subr = buildchar_stack[-1]
        glyphs, subrs, new_bc_stack, new_ps_stack = \
            self._simulate(subr, buildchar_stack[:-1], postscript_stack)
        # Pass on glyphs referenced inside the subroutine as well.
        return glyphs, subrs | {subr}, new_bc_stack, new_ps_stack
    elif opcode == 'pop':
        return (set(), set(), buildchar_stack + [postscript_stack[-1]],
                postscript_stack[:-1])
    else:
        raise RuntimeError(f'opcode {opcode}')
883+
884+
def _simulate(self, glyph_or_subr, buildchar_stack, postscript_stack):
    """
    Run a glyph or subroutine program and collect what it refers to.

    Parameters
    ----------
    glyph_or_subr : str or int
        A glyph name (looked up in ``CharStrings``) or a subroutine index
        (looked up in ``Subrs``).
    buildchar_stack, postscript_stack : list
        The stacks at the start of the program.

    Returns
    -------
    glyphs : set of str
        Glyph names referenced, including *glyph_or_subr* if it is a glyph.
    subrs : set of int
        Subroutine indices referenced, including *glyph_or_subr* if it is
        a subroutine.
    buildchar_stack, postscript_stack : list
        The stacks when the program stopped.
    """
    if isinstance(glyph_or_subr, str):
        program = self.prop['CharStrings'][glyph_or_subr]
        glyphs, subrs = {glyph_or_subr}, set()
    else:
        program = self.prop['Subrs'][glyph_or_subr]
        glyphs, subrs = set(), {glyph_or_subr}
    for opcode in self._charstring_tokens(program):
        if opcode in ('return', 'endchar'):
            break
        newglyphs, newsubrs, buildchar_stack, postscript_stack = \
            self._step(buildchar_stack, postscript_stack, opcode)
        glyphs.update(newglyphs)
        subrs.update(newsubrs)
    # Also reached when the program has no terminating return/endchar, so
    # that callers always get a tuple to unpack instead of None.
    return glyphs, subrs, buildchar_stack, postscript_stack
900+
901+
def _subset_encoding(self, encoding):
    """
    Build the PostScript code that defines the given encoding.

    *encoding* maps character codes to glyph names.  Every slot is first
    set to ``.notdef``, so entries mapping to ``.notdef`` are not written
    again; the remaining ones are written in order of their code.
    """
    header = (b'/Encoding 256 array\n'
              b'0 1 255 { 1 index exch /.notdef put } for\n')
    entries = [
        f'dup {code} /{name} put\n'.encode('ascii')
        for code, name in sorted(encoding.items())
        if name != '.notdef'
    ]
    return b''.join([header, *entries, b'readonly def\n'])
909+
910+
def _subset_charstrings(self, glyphs):
    """
    Build the PostScript code that defines the CharStrings dictionary.

    Parameters
    ----------
    glyphs : collection of str
        The names of the glyphs to include; each must be a key of the
        font's ``CharStrings`` property.

    Returns
    -------
    bytes
        The dictionary definition, with each charstring re-encrypted and
        written using the font's own RD/ND abbreviations.
    """
    len_iv = self.prop.get('lenIV', 4)
    rd = self._abbr["RD"]
    nd = self._abbr["ND"]
    data = bytearray(
        f'/CharStrings {len(glyphs)} dict dup begin\n'.encode('ascii'))
    # Sorted so that the output bytes do not depend on set iteration
    # order, which varies between runs for strings.
    for glyph in sorted(glyphs):
        enc = self._encrypt(
            self.prop['CharStrings'][glyph], 'charstring', len_iv)
        data.extend(f'/{glyph} {len(enc)} '.encode('ascii'))
        data.extend(rd)
        data.extend(b' ')
        data.extend(enc)
        data.extend(b' ')
        data.extend(nd)
        data.extend(b'\n')
    data.extend(b'end\n')
    return bytes(data)
923+
924+
def _subset_subrs(self, indices):
    """
    Build the PostScript code that defines the Subrs array.

    The array keeps its original length.  Subroutines whose index is in
    *indices* are written out again; all others are replaced by a stub
    consisting of the single byte 11.
    """
    # we can't remove subroutines, we just replace unused ones with a stub
    subrs = self.prop['Subrs']
    count = len(subrs)
    stub = bytes([11])
    pieces = [f'/Subrs {count} array\n'.encode('ascii')]
    for index in range(count):
        body = subrs[index] if index in indices else stub
        encrypted = self._encrypt(
            body, 'charstring', self.prop.get('lenIV', 4))
        pieces += [
            f'dup {index} {len(encrypted)} '.encode('ascii'),
            self._abbr['RD'],
            b' ',
            encrypted,
            b' ',
            self._abbr['NP'],
            b'\n',
        ]
    pieces += [self._abbr['ND'], b'\n']
    return b''.join(pieces)
716945

717946

718947
StandardEncoding = {

0 commit comments

Comments
 (0)
0