Type-1 subsetting

jkseppan · jkseppan · commit b66579d62aa4 · 2021-07-22T16:33:40.000+03:00
With this I can produce smaller pdf files with usetex in some small tests, but this obviously needs more extensive testing, thus marking as draft. On top of matplotlib#20634 and matplotlib#20715. Closes matplotlib#127.
diff --git a/lib/matplotlib/backends/_backend_pdf_ps.py b/lib/matplotlib/backends/_backend_pdf_ps.py
@@ -27,14 +27,25 @@ class CharacterTracker:
     def __init__(self):
         self.used = {}
 
-    def track(self, font, s):
-        """Record that string *s* is being typeset using font *font*."""
+    @staticmethod
+    def _get_name(font):
+        """
+        Return the `str` key under which *font* is stored in ``self.used``.
+
+        *font* may be a `str` (used as is), an object with an ``fname``
+        attribute, or an object with a ``name`` attribute.  A `bytes`
+        name is decoded as ASCII so that `track` and `get_used` always
+        agree on the key.
+
+        Raises
+        ------
+        TypeError
+            If *font* is none of the supported kinds.
+        """
         if isinstance(font, str):
             # Unused, can be removed after removal of track_characters.
             fname = font
-        else:
+        elif hasattr(font, 'fname'):
             fname = font.fname
-        self.used.setdefault(fname, set()).update(map(ord, s))
+        elif hasattr(font, 'name'):
+            fname = font.name
+        else:
+            # Without this branch *fname* would be unbound below.
+            raise TypeError(
+                f"Cannot determine a font name from {type(font).__name__}")
+        if isinstance(fname, bytes):
+            # 'strict' is the handler that raises on non-ASCII input;
+            # 'error' is not a registered handler name.
+            fname = fname.decode('ascii', 'strict')
+        return fname
+
+    def get_used(self, font, default=None):
+        """
+        Return the set of character codes tracked for *font*.
+
+        *default* is returned when nothing has been tracked for *font*.
+        """
+        return self.used.get(self._get_name(font), default)
+
+    def track(self, font, s):
+        """Record that string *s* is being typeset using font *font*."""
+        self.used.setdefault(self._get_name(font), set()).update(map(ord, s))
 
     # Not public, can be removed when pdf/ps merge_used_characters is removed.
     def merge(self, other):
diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py
@@ -963,6 +963,8 @@ def _embedTeXFont(self, fontinfo):
         t1font = type1font.Type1Font(fontinfo.fontfile)
         if fontinfo.effects:
             t1font = t1font.transform(fontinfo.effects)
+        chars = self._character_tracker.get_used(fontinfo.pdfname)
+        t1font = t1font.subset(chars)
         fontdict['BaseFont'] = Name(t1font.prop['FontName'])
 
         # Font descriptors may be shared between differently encoded
@@ -2227,6 +2229,7 @@ def draw_tex(self, gc, x, y, s, prop, angle, *, mtext=None):
                 seq += [['font', pdfname, dvifont.size]]
                 oldfont = dvifont
             seq += [['text', x1, y1, [bytes([glyph])], x1+width]]
+            self.file._character_tracker.track(pdfname, chr(glyph))
 
         # Find consecutive text strings with constant y coordinate and
         # combine into a sequence of strings and kerns, or just one
diff --git a/lib/matplotlib/type1font.py b/lib/matplotlib/type1font.py
@@ -21,6 +21,7 @@
   v1.1, 1993. ISBN 0-201-57044-0.
 """
 
+import base64
 import binascii
 import logging
 import re
@@ -34,7 +35,36 @@
 _log = logging.getLogger(__name__)
 
 
+def _make_tag(set):
+    """
+    Hash set into a six-character tag made of uppercase letters
+
+    Useful for adding a tag into subsetted fonts while keeping the output
+    reproducible. The tag is derived from a SHA-256 digest of the sorted
+    string forms of the items, so it is the same for the same items in
+    every process and on every Python version (the builtin ``hash`` of
+    strings is salted per process and cannot provide that). Collisions
+    are unlikely but not impossible.
+
+    Parameters
+    ----------
+    set : iterable
+        The set of glyphs present in a font subset
+
+    Returns
+    -------
+    bytes
+        Six uppercase ASCII letters
+    """
+    # Local import: only this function needs hashlib.
+    import hashlib
+
+    # Sort to get a canonical order that is independent of the iteration
+    # order of the input; NUL separates items so that {'ab'} and
+    # {'a', 'b'} do not produce the same text.
+    canonical = '\0'.join(sorted(str(item) for item in set))
+    digest = hashlib.sha256(canonical.encode('utf-8', 'replace')).digest()
+    # turn the bytes into characters with b32encode, which uses uppercase
+    # letters and numbers from 2 to 7 - remap those arbitrarily
+    trans = bytes.maketrans(b'234567', b'MTPLIB')
+    return base64.b32encode(digest[:8]).translate(trans, delete=b'=')[:6]
+
+
 class _Token:
+
     """
     A token in a PostScript stream
 
@@ -489,6 +519,15 @@ def convert(x): return x.decode('ascii', 'replace')
             except StopIteration:
                 break
 
+            # there are some standard abbreviations whose names vary
+            # so detect them
+            if value == b'{noaccess def}':
+                self._abbr['ND'] = key.encode('ascii')
+            elif value == b'{noaccess put}':
+                self._abbr['NP'] = key.encode('ascii')
+            elif value == b'{string currentfile exch readstring pop}':
+                self._abbr['RD'] = key.encode('ascii')
+
             # sometimes noaccess def and readonly def are abbreviated
             if kw.is_name(b'def', self._abbr['ND'], self._abbr['NP']):
                 prop[key] = value
@@ -560,13 +599,16 @@ def _parse_subrs(self, tokens, _data):
                     "Token preceding subr must be RD or equivalent, "
                     f"was {token}"
                 )
+            if not token.is_name(self._abbr['RD']):
+                raise RuntimeError(
+                    f"Token preceding subr must be RD or equivalent, was {token}"
+                )
             binary_token = tokens.send(1+nbytes_token.numeric_value())
             array[index_token.numeric_value()] = binary_token.value[1:]
 
         return array, next(tokens).endpos()
 
-    @staticmethod
-    def _parse_charstrings(tokens, _data):
+    def _parse_charstrings(self, tokens, _data):
         count_token = next(tokens)
         if not count_token.is_number():
             raise RuntimeError(
@@ -591,7 +633,11 @@ def _parse_charstrings(tokens, _data):
                     f"Token following /{glyphname} in CharStrings definition "
                     f"must be a number, was {nbytes_token}"
                 )
-            token = next(tokens)  # usually RD or |-
+            token = next(tokens)
+            if not token.is_name(self._abbr['RD']):
+                raise RuntimeError(
+                    f"Token preceding charstring must be RD or equivalent, was {token}"
+                )
             binary_token = tokens.send(1+nbytes_token.numeric_value())
             charstrings[glyphname] = binary_token.value[1:]
 
@@ -624,16 +670,15 @@ def _parse_encoding(tokens, _data):
             encoding[index_token.numeric_value()] = \
                 name_token.value[1:].decode('ascii', 'replace')
 
-    @staticmethod
-    def _parse_othersubrs(tokens, data):
+    def _parse_othersubrs(self, tokens, data):
+        """
+        Read tokens up to the end of the OtherSubrs definition.
+
+        Returns a tuple of the raw slice of *data* from the position of
+        the first token read to the end of the terminating token, and
+        that end position.  The terminator is ``def`` or whatever name
+        this font uses for the ND abbreviation (``self._abbr['ND']``).
+        """
         init_pos = None
         while True:
             token = next(tokens)
             if init_pos is None:
                 init_pos = token.pos
             if token.is_delim():
+                # NOTE(review): presumably consumes the whole delimited
+                # group so that a terminator nested inside it is not
+                # mistaken for the end of the definition - confirm
+                # against _expression.
                 _expression(token, tokens, data)
-            elif token.value in (b'def', b'ND', b'|-'):
+            elif token.value in (b'def', self._abbr['ND']):
                 return data[init_pos:token.endpos()], token.endpos()
 
     def transform(self, effects):
@@ -688,7 +733,7 @@ def transform(self, effects):
         fontmatrix = (
             '[%s]' % ' '.join(_format_approx(x, 6) for x in array)
         ).encode('ascii')
-        replacements = (
+        newparts = self._replace(
             [(x, b'/FontName/%s def' % fontname)
              for x in self._pos['FontName']]
             + [(x, b'/ItalicAngle %a def' % italicangle)
@@ -698,6 +743,9 @@ def transform(self, effects):
             + [(x, b'') for x in self._pos.get('UniqueID', [])]
         )
 
+        return Type1Font((newparts[0], self._encrypt(newparts[1], 'eexec'), self.parts[2]))
+
+    def _replace(self, replacements):
         data = bytearray(self.parts[0])
         data.extend(self.decrypted)
         len0 = len(self.parts[0])
@@ -712,11 +760,192 @@ def transform(self, effects):
                 len0 += len(value) - pos1 + pos0
 
         data = bytes(data)
-        return Type1Font((
-            data[:len0],
-            self._encrypt(data[len0:], 'eexec'),
-            self.parts[2]
-        ))
+        return data[:len0], data[len0:]
+
+    def subset(self, characters):
+        """
+        Return a new font that only defines the given characters.
+
+        Parameters
+        ----------
+        characters : iterable of int
+            The character codes (keys of ``self.prop['Encoding']``) to
+            keep in the subset.
+
+        Returns
+        -------
+        `Type1Font`
+        """
+
+        characters = set(characters)
+        encoding = {code: glyph
+                    for code, glyph in self.prop['Encoding'].items()
+                    if code in characters}
+        # .notdef is always kept, but it is added to the glyph set rather
+        # than written into slot 0 of the encoding: some fonts have a real
+        # glyph at code 0, and overwriting the slot would drop that glyph
+        # whenever code 0 is requested.  Slots not listed in *encoding*
+        # default to .notdef in the emitted encoding vector anyway.
+        pending = ['.notdef', *encoding.values()]
+        # done holds strings (glyph names)
+        done = set()
+        # Subrs 0-3 are always kept, as in the original implementation.
+        seen_subrs = {0, 1, 2, 3}
+        # Work-list traversal: each glyph is simulated once, and glyphs it
+        # pulls in (e.g. via seac) are queued in turn.
+        while pending:
+            glyph = pending.pop()
+            if glyph in done:
+                continue
+            done.add(glyph)
+            called_glyphs, called_subrs, _, _ = self._simulate(glyph, [], [])
+            seen_subrs.update(called_subrs)
+            pending.extend(called_glyphs - done)
+
+        fontname = (_make_tag(done) + b'+'
+                    + self.prop['FontName'].encode('ascii'))
+        charstrings = self._subset_charstrings(done)
+        subrs = self._subset_subrs(seen_subrs)
+        newparts = self._replace(
+            [(x, b'/FontName/%s def' % fontname)
+             for x in self._pos['FontName']]
+            + [(self._pos['CharStrings'][0], charstrings),
+               (self._pos['Subrs'][0], subrs),
+               (self._pos['Encoding'][0], self._subset_encoding(encoding))]
+            + [(x, b'') for x in self._pos.get('UniqueID', [])]
+        )
+        return Type1Font((
+            newparts[0],
+            self._encrypt(newparts[1], 'eexec'),
+            self.parts[2],
+        ))
+
+    @staticmethod
+    def _charstring_tokens(data):
+        """
+        Decode a decrypted charstring into numbers and operator names.
+
+        Parameters
+        ----------
+        data : bytes
+            The charstring program, already decrypted.
+
+        Yields
+        ------
+        int or str
+            An `int` for each encoded number, or the name of each
+            operator.
+
+        Raises
+        ------
+        KeyError
+            If an unknown operator byte is encountered.
+        IndexError, struct.error
+            If *data* ends in the middle of a multi-byte token.
+        """
+        # Built once per call instead of once per token.
+        two_byte_ops = {
+            0: 'dotsection',
+            1: 'vstem3',
+            2: 'hstem3',
+            6: 'seac',
+            7: 'sbw',
+            12: 'div',
+            16: 'callothersubr',
+            17: 'pop',
+            33: 'setcurrentpoint'
+        }
+        one_byte_ops = {
+            1: 'hstem',
+            3: 'vstem',
+            4: 'vmoveto',
+            5: 'rlineto',
+            6: 'hlineto',
+            7: 'vlineto',
+            8: 'rrcurveto',
+            9: 'closepath',
+            10: 'callsubr',
+            11: 'return',
+            13: 'hsbw',
+            14: 'endchar',
+            21: 'rmoveto',
+            22: 'hmoveto',
+            30: 'vhcurveto',
+            31: 'hvcurveto'
+        }
+        # Index into a bytes object rather than consuming an iterator:
+        # struct needs a buffer, not an iterator, for the 4-byte case.
+        data = bytes(data)
+        pos = 0
+        end = len(data)
+        while pos < end:
+            byte = data[pos]
+            pos += 1
+            if 32 <= byte <= 246:
+                yield byte - 139
+            elif 247 <= byte <= 250:
+                yield (byte-247) * 256 + data[pos] + 108
+                pos += 1
+            elif 251 <= byte <= 254:
+                yield -(byte-251)*256 - data[pos] - 108
+                pos += 1
+            elif byte == 255:
+                # four-byte big-endian signed integer
+                yield struct.unpack_from('>i', data, pos)[0]
+                pos += 4
+            elif byte == 12:
+                # escape: the operator is selected by the next byte
+                yield two_byte_ops[data[pos]]
+                pos += 1
+            else:
+                yield one_byte_ops[byte]
+
+    def _step(self, buildchar_stack, postscript_stack, opcode):
+        """
+        Run one charstring token and report its effect.
+
+        Parameters
+        ----------
+        buildchar_stack, postscript_stack : list
+            The two operand stacks before the token; neither list is
+            modified in place.
+        opcode : int or str
+            A token as produced by `_charstring_tokens`.
+
+        Returns
+        -------
+        glyphs : set of str
+            Names of glyphs this token refers to.
+        subrs : set of int
+            Indices of subroutines this token calls, directly or not.
+        buildchar_stack, postscript_stack : list
+            The stacks after the token.
+
+        Raises
+        ------
+        RuntimeError
+            If *opcode* is an operator name that is not handled.
+        """
+        if isinstance(opcode, int):
+            return set(), set(), buildchar_stack + [opcode], postscript_stack
+        elif opcode in {'hsbw', 'sbw', 'closepath', 'hlineto', 'hmoveto',
+                        'hvcurveto', 'rlineto', 'rmoveto', 'rrcurveto',
+                        'vhcurveto', 'vlineto', 'vmoveto', 'dotsection',
+                        'hstem', 'hstem3', 'vstem', 'vstem3',
+                        'setcurrentpoint'}:
+            # These only consume their operands.
+            return set(), set(), [], postscript_stack
+        elif opcode == 'seac':
+            # Operands are asb adx ady bchar achar.  The Type 1 format
+            # defines bchar/achar as codes in StandardEncoding, not in
+            # the font's own encoding vector.
+            codes = buildchar_stack[3:5]
+            glyphs = {StandardEncoding[int(x)] for x in codes}
+            return glyphs, set(), [], postscript_stack
+        elif opcode == 'div':
+            # Replace the two operands with their quotient.
+            num1, num2 = buildchar_stack[-2:]
+            return (set(), set(),
+                    buildchar_stack[:-2] + [num1/num2], postscript_stack)
+        elif opcode == 'callothersubr':
+            othersubr = buildchar_stack[-1]
+            n = buildchar_stack[-2]
+            args = buildchar_stack[-2-n:-2]
+            if othersubr == 3:  # Section 8.1 in Type-1 spec
+                new_ps_stack = postscript_stack + [args[0]]
+            else:
+                new_ps_stack = postscript_stack + args[::-1]
+            return set(), set(), buildchar_stack[:-n-2], new_ps_stack
+        elif opcode == 'callsubr':
+            subr = buildchar_stack[-1]
+            glyphs, subrs, new_bc_stack, new_ps_stack = \
+                self._simulate(subr, buildchar_stack[:-1], postscript_stack)
+            # Pass on glyphs referenced inside the subroutine as well.
+            return glyphs, subrs | {subr}, new_bc_stack, new_ps_stack
+        elif opcode == 'pop':
+            return (set(), set(),
+                    buildchar_stack + [postscript_stack[-1]],
+                    postscript_stack[:-1])
+        else:
+            raise RuntimeError(f'opcode {opcode}')
+
+    def _simulate(self, glyph_or_subr, buildchar_stack, postscript_stack):
+        """
+        Run a glyph or subroutine program and collect what it refers to.
+
+        Parameters
+        ----------
+        glyph_or_subr : str or int
+            A glyph name (looked up in ``self.prop['CharStrings']``) or
+            a subroutine index (looked up in ``self.prop['Subrs']``).
+        buildchar_stack, postscript_stack : list
+            The operand stacks on entry.
+
+        Returns
+        -------
+        glyphs : set of str
+        subrs : set of int
+            Glyphs and subroutines reached, including *glyph_or_subr*.
+        buildchar_stack, postscript_stack : list
+            The operand stacks on exit.
+        """
+        if isinstance(glyph_or_subr, str):
+            program = self.prop['CharStrings'][glyph_or_subr]
+            glyphs = {glyph_or_subr}
+            subrs = set()
+        else:
+            program = self.prop['Subrs'][glyph_or_subr]
+            glyphs = set()
+            subrs = {glyph_or_subr}
+        for opcode in self._charstring_tokens(program):
+            if opcode in ('return', 'endchar'):
+                break
+            newglyphs, newsubrs, buildchar_stack, postscript_stack = \
+                self._step(buildchar_stack, postscript_stack, opcode)
+            glyphs.update(newglyphs)
+            subrs.update(newsubrs)
+        # Also reached when the program ends without 'return'/'endchar';
+        # report the state collected so far instead of returning None,
+        # which callers cannot unpack.
+        return glyphs, subrs, buildchar_stack, postscript_stack
+
+    def _subset_encoding(self, encoding):
+        """
+        Build the PostScript text of an /Encoding array definition.
+
+        The array is first filled with /.notdef, then one ``put`` is
+        emitted, in sorted order, for each entry of *encoding* (a mapping
+        of character code to glyph name) that is not '.notdef'.
+
+        Returns
+        -------
+        bytes
+        """
+        header = (b'/Encoding 256 array\n'
+                  b'0 1 255 { 1 index exch /.notdef put } for\n')
+        entries = [
+            f'dup {code} /{name} put\n'.encode('ascii')
+            for code, name in sorted(encoding.items())
+            if name != '.notdef'
+        ]
+        return b''.join([header, *entries, b'readonly def\n'])
+
+    def _subset_charstrings(self, glyphs):
+        """
+        Build the PostScript text of a /CharStrings dictionary.
+
+        Parameters
+        ----------
+        glyphs : iterable of str
+            Names of the glyphs to include; each must be a key of
+            ``self.prop['CharStrings']``.
+
+        Returns
+        -------
+        bytes
+        """
+        # Emit glyphs in sorted order: iteration order of a set of
+        # strings differs between runs, which would make the embedded
+        # font (and so the output file) differ from run to run.
+        names = sorted(glyphs)
+        lenIV = self.prop.get('lenIV', 4)
+        data = bytearray(
+            f'/CharStrings {len(names)} dict dup begin\n'.encode('ascii'))
+        for glyph in names:
+            enc = self._encrypt(
+                self.prop['CharStrings'][glyph], 'charstring', lenIV)
+            data.extend(f'/{glyph} {len(enc)} '.encode('ascii'))
+            data.extend(self._abbr["RD"])
+            data.extend(b' ')
+            data.extend(enc)
+            data.extend(b' ')
+            data.extend(self._abbr["ND"])
+            data.extend(b'\n')
+        data.extend(b'end\n')
+        return bytes(data)
+
+    def _subset_subrs(self, indices):
+        """
+        Build the PostScript text of the /Subrs array.
+
+        The array keeps its original length, so subroutine numbers stay
+        valid; entries whose index is not in *indices* are replaced by a
+        stub consisting of the single byte 11 (the 'return' operator).
+
+        Returns
+        -------
+        bytes
+        """
+        count = len(self.prop['Subrs'])
+        stub = bytes([11])
+        chunks = [f'/Subrs {count} array\n'.encode('ascii')]
+        for index in range(count):
+            body = self.prop['Subrs'][index] if index in indices else stub
+            encrypted = self._encrypt(
+                body, 'charstring', self.prop.get('lenIV', 4))
+            chunks += [
+                f'dup {index} {len(encrypted)} '.encode('ascii'),
+                self._abbr['RD'],
+                b' ',
+                encrypted,
+                b' ',
+                self._abbr['NP'],
+                b'\n',
+            ]
+
+        chunks += [self._abbr['ND'], b'\n']
+        return b''.join(chunks)
 
 
 StandardEncoding = {