Skip to content

Commit b66579d

Browse files
committed
Type-1 subsetting
With this change, usetex output produces smaller PDF files in a few small tests, but it clearly needs more extensive testing, hence the draft status. Builds on matplotlib#20634 and matplotlib#20715. Closes matplotlib#127.
1 parent 90b5889 commit b66579d

3 files changed

Lines changed: 259 additions & 16 deletions

File tree

lib/matplotlib/backends/_backend_pdf_ps.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,25 @@ class CharacterTracker:
2727
def __init__(self):
2828
self.used = {}
2929

30-
def track(self, font, s):
31-
"""Record that string *s* is being typeset using font *font*."""
30+
@staticmethod
31+
def _get_name(font):
3232
if isinstance(font, str):
3333
# Unused, can be removed after removal of track_characters.
3434
fname = font
35-
else:
35+
elif hasattr(font, 'fname'):
3636
fname = font.fname
37-
self.used.setdefault(fname, set()).update(map(ord, s))
37+
elif hasattr(font, 'name'):
38+
fname = font.name
39+
if isinstance(fname, bytes):
40+
fname = fname.decode('ascii', 'error')
41+
return fname
42+
43+
def get_used(self, font, default=None):
    """
    Return the set of character codes recorded for *font*.

    *default* is returned when nothing has been tracked for that font.
    """
    key = self._get_name(font)
    return self.used.get(key, default)
45+
46+
def track(self, font, s):
    """Record that string *s* is being typeset using font *font*."""
    key = self._get_name(font)
    codes = self.used.setdefault(key, set())
    codes.update(map(ord, s))
3849

3950
# Not public, can be removed when pdf/ps merge_used_characters is removed.
4051
def merge(self, other):

lib/matplotlib/backends/backend_pdf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,8 @@ def _embedTeXFont(self, fontinfo):
963963
t1font = type1font.Type1Font(fontinfo.fontfile)
964964
if fontinfo.effects:
965965
t1font = t1font.transform(fontinfo.effects)
966+
chars = self._character_tracker.get_used(fontinfo.pdfname)
967+
t1font = t1font.subset(chars)
966968
fontdict['BaseFont'] = Name(t1font.prop['FontName'])
967969

968970
# Font descriptors may be shared between differently encoded
@@ -2227,6 +2229,7 @@ def draw_tex(self, gc, x, y, s, prop, angle, *, mtext=None):
22272229
seq += [['font', pdfname, dvifont.size]]
22282230
oldfont = dvifont
22292231
seq += [['text', x1, y1, [bytes([glyph])], x1+width]]
2232+
self.file._character_tracker.track(pdfname, chr(glyph))
22302233

22312234
# Find consecutive text strings with constant y coordinate and
22322235
# combine into a sequence of strings and kerns, or just one

lib/matplotlib/type1font.py

Lines changed: 241 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
v1.1, 1993. ISBN 0-201-57044-0.
2222
"""
2323

24+
import base64
2425
import binascii
2526
import logging
2627
import re
@@ -34,7 +35,36 @@
3435
_log = logging.getLogger(__name__)
3536

3637

38+
def _make_tag(set):
39+
"""
40+
Hash set into a six-character tag make of uppercase letters
41+
42+
Useful for adding a tag into subsetted fonts while keeping the code
43+
reproducible. The function always returns the same value for the
44+
same set on the same exact Python version but is not guaranteed to
45+
not have collisions.
46+
47+
Parameters
48+
----------
49+
set : iterable
50+
The set of glyphs present in a font subset
51+
52+
Returns
53+
-------
54+
bytes
55+
Six uppercase ASCII letters
56+
"""
57+
58+
# freeze the set to make it hashable, interpret the hash as bytes
59+
array = struct.pack("@q", hash(frozenset(set)))
60+
# turn the bytes into characters with b32encode, which uses uppercase
61+
# letters and numbers from 2 to 7 - remap those arbitrarily
62+
trans = bytes.maketrans(b'234567', b'MTPLIB')
63+
return base64.b32encode(array).translate(trans, delete=b'=')[:6]
64+
65+
3766
class _Token:
67+
3868
"""
3969
A token in a PostScript stream
4070
@@ -489,6 +519,15 @@ def convert(x): return x.decode('ascii', 'replace')
489519
except StopIteration:
490520
break
491521

522+
# there are some standard abbreviations whose names vary
523+
# so detect them
524+
if value == b'{noaccess def}':
525+
self._abbr['ND'] = key.encode('ascii')
526+
elif value == b'{noaccess put}':
527+
self._abbr['NP'] = key.encode('ascii')
528+
elif value == b'{string currentfile exch readstring pop}':
529+
self._abbr['RD'] = key.encode('ascii')
530+
492531
# sometimes noaccess def and readonly def are abbreviated
493532
if kw.is_name(b'def', self._abbr['ND'], self._abbr['NP']):
494533
prop[key] = value
@@ -560,13 +599,16 @@ def _parse_subrs(self, tokens, _data):
560599
"Token preceding subr must be RD or equivalent, "
561600
f"was {token}"
562601
)
602+
if not token.is_name(self._abbr['RD']):
603+
raise RuntimeError(
604+
f"Token preceding subr must be RD or equivalent, was {token}"
605+
)
563606
binary_token = tokens.send(1+nbytes_token.numeric_value())
564607
array[index_token.numeric_value()] = binary_token.value[1:]
565608

566609
return array, next(tokens).endpos()
567610

568-
@staticmethod
569-
def _parse_charstrings(tokens, _data):
611+
def _parse_charstrings(self, tokens, _data):
570612
count_token = next(tokens)
571613
if not count_token.is_number():
572614
raise RuntimeError(
@@ -591,7 +633,11 @@ def _parse_charstrings(tokens, _data):
591633
f"Token following /{glyphname} in CharStrings definition "
592634
f"must be a number, was {nbytes_token}"
593635
)
594-
token = next(tokens) # usually RD or |-
636+
token = next(tokens)
637+
if not token.is_name(self._abbr['RD']):
638+
raise RuntimeError(
639+
f"Token preceding charstring must be RD or equivalent, was {token}"
640+
)
595641
binary_token = tokens.send(1+nbytes_token.numeric_value())
596642
charstrings[glyphname] = binary_token.value[1:]
597643

@@ -624,16 +670,15 @@ def _parse_encoding(tokens, _data):
624670
encoding[index_token.numeric_value()] = \
625671
name_token.value[1:].decode('ascii', 'replace')
626672

627-
@staticmethod
628-
def _parse_othersubrs(tokens, data):
673+
def _parse_othersubrs(self, tokens, data):
629674
init_pos = None
630675
while True:
631676
token = next(tokens)
632677
if init_pos is None:
633678
init_pos = token.pos
634679
if token.is_delim():
635680
_expression(token, tokens, data)
636-
elif token.value in (b'def', b'ND', b'|-'):
681+
elif token.value in (b'def', self._abbr['ND']):
637682
return data[init_pos:token.endpos()], token.endpos()
638683

639684
def transform(self, effects):
@@ -688,7 +733,7 @@ def transform(self, effects):
688733
fontmatrix = (
689734
'[%s]' % ' '.join(_format_approx(x, 6) for x in array)
690735
).encode('ascii')
691-
replacements = (
736+
newparts = self._replace(
692737
[(x, b'/FontName/%s def' % fontname)
693738
for x in self._pos['FontName']]
694739
+ [(x, b'/ItalicAngle %a def' % italicangle)
@@ -698,6 +743,9 @@ def transform(self, effects):
698743
+ [(x, b'') for x in self._pos.get('UniqueID', [])]
699744
)
700745

746+
return Type1Font((newparts[0], self._encrypt(newparts[1], 'eexec'), self.parts[2]))
747+
748+
def _replace(self, replacements):
701749
data = bytearray(self.parts[0])
702750
data.extend(self.decrypted)
703751
len0 = len(self.parts[0])
@@ -712,11 +760,192 @@ def transform(self, effects):
712760
len0 += len(value) - pos1 + pos0
713761

714762
data = bytes(data)
715-
return Type1Font((
716-
data[:len0],
717-
self._encrypt(data[len0:], 'eexec'),
718-
self.parts[2]
719-
))
763+
return data[:len0], data[len0:]
764+
765+
def subset(self, characters):
    """
    Return a new font that only defines the given characters.

    Parameters
    ----------
    characters : iterable of int, or None
        Character codes (keys of the font's ``Encoding``) to keep.
        ``None`` is treated as an empty selection, which leaves only
        ``.notdef``.

    Returns
    -------
    `Type1Font`
    """
    wanted = set() if characters is None else set(characters)
    encoding = {code: glyph
                for code, glyph in self.prop['Encoding'].items()
                if code in wanted}
    encoding[0] = '.notdef'

    # Worklist closure over glyph names: every glyph is simulated exactly
    # once, and glyphs it pulls in are queued in turn.
    glyphs = set(encoding.values())
    pending = list(glyphs)
    # Subroutines 0-3 are always kept, whether or not a call is seen.
    seen_subrs = {0, 1, 2, 3}
    while pending:
        glyph = pending.pop()
        called_glyphs, called_subrs, _, _ = self._simulate(glyph, [], [])
        new_glyphs = set(called_glyphs) - glyphs
        glyphs |= new_glyphs
        pending.extend(new_glyphs)
        seen_subrs.update(called_subrs)

    fontname = (_make_tag(glyphs) + b'+'
                + self.prop['FontName'].encode('ascii'))
    # Sorted so the emitted bytes do not depend on set iteration order.
    charstrings = self._subset_charstrings(sorted(glyphs))
    subrs = self._subset_subrs(seen_subrs)
    newparts = self._replace(
        [(x, b'/FontName/%s def' % fontname)
         for x in self._pos['FontName']]
        + [
            (self._pos['CharStrings'][0], charstrings),
            (self._pos['Subrs'][0], subrs),
            (self._pos['Encoding'][0], self._subset_encoding(encoding)),
        ]
        + [(x, b'') for x in self._pos.get('UniqueID', [])]
    )
    return Type1Font((
        newparts[0],
        self._encrypt(newparts[1], 'eexec'),
        self.parts[2],
    ))
806+
807+
@staticmethod
808+
def _charstring_tokens(data):
809+
data = iter(data)
810+
for byte in data:
811+
if 32 <= byte <= 246:
812+
yield byte - 139
813+
elif 247 <= byte <= 250:
814+
byte2 = next(data)
815+
yield (byte-247) * 256 + byte2 + 108
816+
elif 251 <= byte <= 254:
817+
byte2 = next(data)
818+
yield -(byte-251)*256 - byte2 - 108
819+
elif byte == 255:
820+
bs = itertools.islice(data, 4)
821+
yield struct.unpack('>i', bs)[0]
822+
elif byte == 12:
823+
byte1 = next(data)
824+
yield {
825+
0: 'dotsection',
826+
1: 'vstem3',
827+
2: 'hstem3',
828+
6: 'seac',
829+
7: 'sbw',
830+
12: 'div',
831+
16: 'callothersubr',
832+
17: 'pop',
833+
33: 'setcurrentpoint'
834+
}[byte1]
835+
else:
836+
yield {
837+
1: 'hstem',
838+
3: 'vstem',
839+
4: 'vmoveto',
840+
5: 'rlineto',
841+
6: 'hlineto',
842+
7: 'vlineto',
843+
8: 'rrcurveto',
844+
9: 'closepath',
845+
10: 'callsubr',
846+
11: 'return',
847+
13: 'hsbw',
848+
14: 'endchar',
849+
21: 'rmoveto',
850+
22: 'hmoveto',
851+
30: 'vhcurveto',
852+
31: 'hvcurveto'
853+
}[byte]
854+
855+
def _step(self, buildchar_stack, postscript_stack, opcode):
856+
if isinstance(opcode, int):
857+
return set(), set(), buildchar_stack + [opcode], postscript_stack
858+
elif opcode in {'hsbw', 'sbw', 'closepath', 'hlineto', 'hmoveto', 'hcurveto', 'hvcurveto',
859+
'rlineto', 'rmoveto', 'rrcurveto', 'vhcurveto', 'vlineto', 'vmoveto',
860+
'dotsection', 'hstem', 'hstem3', 'vstem', 'vstem3', 'setcurrentpoint'}:
861+
return set(), set(), [], postscript_stack
862+
elif opcode == 'seac':
863+
codes = buildchar_stack[3:5]
864+
glyphs = [self.prop['Encoding'][x] for x in codes]
865+
return set(glyphs), set(), [], postscript_stack
866+
elif opcode == 'div':
867+
num1, num2 = buildchar_stack[-2:]
868+
return set(), set(), buildchar_stack[-2:] + [num1/num2], postscript_stack
869+
elif opcode == 'callothersubr':
870+
othersubr = buildchar_stack[-1]
871+
n = buildchar_stack[-2]
872+
args = buildchar_stack[-2-n:-2]
873+
if othersubr == 3: # Section 8.1 in Type-1 spec
874+
postscript_stack.append(args[0])
875+
else:
876+
postscript_stack.extend(args[::-1])
877+
return set(), set(), buildchar_stack[:-n-2], postscript_stack
878+
elif opcode == 'callsubr':
879+
subr = buildchar_stack[-1]
880+
glyphs, subrs, new_bc_stack, new_ps_stack = \
881+
self._simulate(subr, buildchar_stack[:-1], postscript_stack)
882+
return set(), subrs | {subr}, new_bc_stack, new_ps_stack
883+
elif opcode == 'pop':
884+
return set(), set(), buildchar_stack + [postscript_stack[-1]], postscript_stack[:-1]
885+
else:
886+
raise RuntimeError(f'opcode {opcode}')
887+
888+
def _simulate(self, glyph_or_subr, buildchar_stack, postscript_stack):
889+
if isinstance(glyph_or_subr, str):
890+
program = self.prop['CharStrings'][glyph_or_subr]
891+
glyphs = {glyph_or_subr}
892+
subrs = set()
893+
else:
894+
program = self.prop['Subrs'][glyph_or_subr]
895+
glyphs = set()
896+
subrs = {glyph_or_subr}
897+
for opcode in self._charstring_tokens(program):
898+
if opcode in ('return', 'endchar'):
899+
return glyphs, subrs, buildchar_stack, postscript_stack
900+
newglyphs, newsubrs, buildchar_stack, postscript_stack = \
901+
self._step(buildchar_stack, postscript_stack, opcode)
902+
glyphs.update(newglyphs)
903+
subrs.update(newsubrs)
904+
905+
def _subset_encoding(self, encoding):
906+
data = bytearray(b'/Encoding 256 array\n0 1 255 { 1 index exch /.notdef put } for\n')
907+
for i, glyph in sorted(encoding.items()):
908+
if glyph == '.notdef':
909+
continue
910+
data.extend(f'dup {i} /{glyph} put\n'.encode('ascii'))
911+
data.extend(b'readonly def\n')
912+
return bytes(data)
913+
914+
def _subset_charstrings(self, glyphs):
915+
data = bytearray(f'/CharStrings {len(glyphs)} dict dup begin\n'.encode('ascii'))
916+
for glyph in glyphs:
917+
enc = self._encrypt(self.prop['CharStrings'][glyph], 'charstring', self.prop.get('lenIV', 4))
918+
data.extend(f'/{glyph} {len(enc)} '.encode('ascii'))
919+
data.extend(self._abbr["RD"])
920+
data.extend(b' ')
921+
data.extend(enc)
922+
data.extend(b' ')
923+
data.extend(self._abbr["ND"])
924+
data.extend(b'\n')
925+
data.extend(b'end\n')
926+
return bytes(data)
927+
928+
def _subset_subrs(self, indices):
929+
# we can't remove subroutines, we just replace unused ones with a stub
930+
n_subrs = len(self.prop['Subrs'])
931+
data = bytearray(f'/Subrs {n_subrs} array\n'.encode('ascii'))
932+
for i in range(n_subrs):
933+
if i in indices:
934+
sub = self.prop['Subrs'][i]
935+
else:
936+
sub = bytes([11])
937+
enc = self._encrypt(sub, 'charstring', self.prop.get('lenIV', 4))
938+
data.extend(f'dup {i} {len(enc)} '.encode('ascii'))
939+
data.extend(self._abbr['RD'])
940+
data.extend(b' ')
941+
data.extend(enc)
942+
data.extend(b' ')
943+
data.extend(self._abbr['NP'])
944+
data.extend(b'\n')
945+
946+
data.extend(self._abbr['ND'])
947+
data.extend(b'\n')
948+
return bytes(data)
720949

721950

722951
StandardEncoding = {

0 commit comments

Comments
 (0)