Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 7c49d52

Browse files
authored Jun 6, 2025
Merge pull request matplotlib#20716 from jkseppan/type1-subset
Type-1 font subsetting
2 parents 01e919a + d5ab3b0 commit 7c49d52

File tree

11 files changed

+548
-61
lines changed

11 files changed

+548
-61
lines changed
 

‎.github/workflows/tests.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,10 @@ jobs:
6464
python-version: '3.12'
6565
# https://github.com/matplotlib/matplotlib/issues/29844
6666
pygobject-ver: '<3.52.0'
67-
- os: ubuntu-22.04
67+
- name-suffix: "(Extra TeX packages)"
68+
os: ubuntu-22.04
6869
python-version: '3.13'
70+
extra-packages: 'texlive-fonts-extra texlive-lang-cyrillic'
6971
# https://github.com/matplotlib/matplotlib/issues/29844
7072
pygobject-ver: '<3.52.0'
7173
- name-suffix: "Free-threaded"
@@ -142,7 +144,8 @@ jobs:
142144
texlive-latex-recommended \
143145
texlive-luatex \
144146
texlive-pictures \
145-
texlive-xetex
147+
texlive-xetex \
148+
${{ matrix.extra-packages }}
146149
if [[ "${{ matrix.name-suffix }}" != '(Minimum Versions)' ]]; then
147150
sudo apt-get install -yy --no-install-recommends ffmpeg poppler-utils
148151
fi
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
PDF files created with usetex now embed subsets of Type 1 fonts
2+
---------------------------------------------------------------
3+
4+
When using the PDF backend with the usetex feature,
5+
Matplotlib calls TeX to render the text and formulas in the figure.
6+
The fonts that get used are usually "Type 1" fonts.
7+
They used to be embedded in full
8+
but are now limited to the glyphs that are actually used in the figure.
9+
This reduces the size of the resulting PDF files.

‎lib/matplotlib/_type1font.py

Lines changed: 332 additions & 13 deletions
Large diffs are not rendered by default.

‎lib/matplotlib/backends/backend_pdf.py

Lines changed: 66 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -722,8 +722,6 @@ def __init__(self, filename, metadata=None):
722722
self._internal_font_seq = (Name(f'F{i}') for i in itertools.count(1))
723723
self._fontNames = {} # maps filenames to internal font names
724724
self._dviFontInfo = {} # maps dvi font names to embedding information
725-
# differently encoded Type-1 fonts may share the same descriptor
726-
self._type1Descriptors = {}
727725
self._character_tracker = _backend_pdf_ps.CharacterTracker()
728726

729727
self.alphaStates = {} # maps alpha values to graphics state objects
@@ -767,8 +765,7 @@ def __init__(self, filename, metadata=None):
767765

768766
fontNames = _api.deprecated("3.11")(property(lambda self: self._fontNames))
769767
dviFontInfo = _api.deprecated("3.11")(property(lambda self: self._dviFontInfo))
770-
type1Descriptors = _api.deprecated("3.11")(
771-
property(lambda self: self._type1Descriptors))
768+
type1Descriptors = _api.deprecated("3.11")(property(lambda _: {}))
772769

773770
def newPage(self, width, height):
774771
self.endStream()
@@ -808,7 +805,14 @@ def newTextnote(self, text, positionRect=[-100, -100, 0, 0]):
808805
}
809806
self.pageAnnotations.append(theNote)
810807

811-
def _get_subsetted_psname(self, ps_name, charmap):
808+
@staticmethod
809+
def _get_subset_prefix(charset):
810+
"""
811+
Get a prefix for a subsetted font name.
812+
813+
The prefix is six uppercase letters followed by a plus sign;
814+
see PDF reference section 5.5.3 Font Subsets.
815+
"""
812816
def toStr(n, base):
813817
if n < base:
814818
return string.ascii_uppercase[n]
@@ -818,11 +822,15 @@ def toStr(n, base):
818822
)
819823

820824
# encode to string using base 26
821-
hashed = hash(frozenset(charmap.keys())) % ((sys.maxsize + 1) * 2)
825+
hashed = hash(charset) % ((sys.maxsize + 1) * 2)
822826
prefix = toStr(hashed, 26)
823827

824828
# get first 6 characters from prefix
825-
return prefix[:6] + "+" + ps_name
829+
return prefix[:6] + "+"
830+
831+
@staticmethod
def _get_subsetted_psname(ps_name, charmap):
    """
    Return *ps_name* with a subset prefix derived from the keys of *charmap*.

    The prefix is computed by `_get_subset_prefix` from the frozen set of
    the character map's keys and is prepended to the PostScript name.
    """
    used_keys = frozenset(charmap.keys())
    subset_tag = PdfFile._get_subset_prefix(used_keys)
    return subset_tag + ps_name
826834

827835
def finalize(self):
828836
"""Write out the various deferred objects and the pdf end matter."""
@@ -994,53 +1002,60 @@ def _embedTeXFont(self, fontinfo):
9941002
_log.debug('Embedding TeX font %s - fontinfo=%s',
9951003
fontinfo.dvifont.texname, fontinfo.__dict__)
9961004

997-
# Widths
998-
widthsObject = self.reserveObject('font widths')
999-
tfm = fontinfo.dvifont._tfm
1000-
# convert from TeX's 12.20 representation to 1/1000 text space units.
1001-
widths = [(1000 * metrics.tex_width) >> 20
1002-
if (metrics := tfm.get_metrics(char)) else 0
1003-
for char in range(max(tfm._glyph_metrics, default=-1) + 1)]
1004-
self.writeObject(widthsObject, widths)
1005-
1006-
# Font dictionary
1005+
# The font dictionary is the top-level object describing a font
10071006
fontdictObject = self.reserveObject('font dictionary')
10081007
fontdict = {
10091008
'Type': Name('Font'),
10101009
'Subtype': Name('Type1'),
1011-
'FirstChar': 0,
1012-
'LastChar': len(widths) - 1,
1013-
'Widths': widthsObject,
1014-
}
1015-
1016-
# Encoding (if needed)
1017-
if fontinfo.encodingfile is not None:
1018-
fontdict['Encoding'] = {
1019-
'Type': Name('Encoding'),
1020-
'Differences': [
1021-
0, *map(Name, dviread._parse_enc(fontinfo.encodingfile))],
1022-
}
1010+
}
10231011

1024-
# We have a font file to embed - read it in and apply any effects
1012+
# Read the font file and apply any encoding changes and effects
10251013
t1font = _type1font.Type1Font(fontinfo.fontfile)
1014+
if fontinfo.encodingfile is not None:
1015+
t1font = t1font.with_encoding(
1016+
{i: c for i, c in enumerate(dviread._parse_enc(fontinfo.encodingfile))}
1017+
)
10261018
if fontinfo.effects:
10271019
t1font = t1font.transform(fontinfo.effects)
1028-
fontdict['BaseFont'] = Name(t1font.prop['FontName'])
10291020

1030-
# Font descriptors may be shared between differently encoded
1031-
# Type-1 fonts, so only create a new descriptor if there is no
1032-
# existing descriptor for this font.
1033-
effects = (fontinfo.effects.get('slant', 0.0),
1034-
fontinfo.effects.get('extend', 1.0))
1035-
fontdesc = self._type1Descriptors.get((fontinfo.fontfile, effects))
1036-
if fontdesc is None:
1037-
fontdesc = self.createType1Descriptor(t1font)
1038-
self._type1Descriptors[(fontinfo.fontfile, effects)] = fontdesc
1039-
fontdict['FontDescriptor'] = fontdesc
1021+
# Reduce the font to only the glyphs used in the document, get the encoding
1022+
# for that subset, and compute various properties based on the encoding.
1023+
chars = frozenset(self._character_tracker.used[fontinfo.dvifont.fname])
1024+
t1font = t1font.subset(chars, self._get_subset_prefix(chars))
1025+
fontdict['BaseFont'] = Name(t1font.prop['FontName'])
1026+
# createType1Descriptor writes the font data as a side effect
1027+
fontdict['FontDescriptor'] = self.createType1Descriptor(t1font)
1028+
encoding = t1font.prop['Encoding']
1029+
fontdict['Encoding'] = self._generate_encoding(encoding)
1030+
fc = fontdict['FirstChar'] = min(encoding.keys(), default=0)
1031+
lc = fontdict['LastChar'] = max(encoding.keys(), default=255)
1032+
1033+
# Convert glyph widths from TeX 12.20 fixed point to 1/1000 text space units
1034+
tfm = fontinfo.dvifont._tfm
1035+
widths = [(1000 * metrics.tex_width) >> 20
1036+
if (metrics := tfm.get_metrics(char)) else 0
1037+
for char in range(fc, lc + 1)]
1038+
fontdict['Widths'] = widthsObject = self.reserveObject('glyph widths')
1039+
self.writeObject(widthsObject, widths)
10401040

10411041
self.writeObject(fontdictObject, fontdict)
10421042
return fontdictObject
10431043

1044+
1045+
def _generate_encoding(self, encoding):
    """
    Build an ``Encoding`` dictionary from a font encoding.

    Parameters
    ----------
    encoding : dict
        Mapping of character codes to glyph names.

    Returns
    -------
    dict
        A dictionary whose ``Type`` is ``Name('Encoding')`` and whose
        ``Differences`` list holds, for each run of consecutive codes, the
        first code of the run followed by the glyph names of that run.
    """
    differences = []
    # The code that would extend the current run.  Starting at -1 means the
    # first real code opens a new run (presumably codes are non-negative).
    expected = -1
    for code, glyph in sorted(encoding.items()):
        if code != expected:
            differences.append(code)
        differences.append(Name(glyph))
        expected = code + 1
    return {'Type': Name('Encoding'), 'Differences': differences}
1057+
1058+
10441059
@_api.delete_parameter("3.11", "fontfile")
10451060
def createType1Descriptor(self, t1font, fontfile=None):
10461061
# Create and write the font descriptor and the font file
@@ -1077,6 +1092,14 @@ def createType1Descriptor(self, t1font, fontfile=None):
10771092
if 0:
10781093
flags |= 1 << 18
10791094

1095+
encoding = t1font.prop['Encoding']
1096+
charset = ''.join(
1097+
sorted(
1098+
f'/{c}' for c in encoding.values()
1099+
if c != '.notdef'
1100+
)
1101+
)
1102+
10801103
descriptor = {
10811104
'Type': Name('FontDescriptor'),
10821105
'FontName': Name(t1font.prop['FontName']),
@@ -1090,6 +1113,7 @@ def createType1Descriptor(self, t1font, fontfile=None):
10901113
'FontFile': fontfileObject,
10911114
'FontFamily': t1font.prop['FamilyName'],
10921115
'StemV': 50, # TODO
1116+
'CharSet': charset,
10931117
# (see also revision 3874; but not all TeX distros have AFM files!)
10941118
# 'FontWeight': a number where 400 = Regular, 700 = Bold
10951119
}
@@ -2269,6 +2293,7 @@ def draw_tex(self, gc, x, y, s, prop, angle, *, mtext=None):
22692293
seq += [['font', pdfname, dvifont.size]]
22702294
oldfont = dvifont
22712295
seq += [['text', x1, y1, [bytes([glyph])], x1+width]]
2296+
self.file._character_tracker.track(dvifont, chr(glyph))
22722297

22732298
# Find consecutive text strings with constant y coordinate and
22742299
# combine into a sequence of strings and kerns, or just one

‎lib/matplotlib/dviread.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,17 @@
1717
...
1818
"""
1919

20-
from collections import namedtuple
2120
import dataclasses
2221
import enum
23-
from functools import cache, lru_cache, partial, wraps
2422
import logging
2523
import os
26-
from pathlib import Path
2724
import re
2825
import struct
2926
import subprocess
3027
import sys
28+
from collections import namedtuple
29+
from functools import cache, lru_cache, partial, wraps
30+
from pathlib import Path
3131

3232
import numpy as np
3333

@@ -583,6 +583,9 @@ class DviFont:
583583
Attributes
584584
----------
585585
texname : bytes
586+
fname : str
587+
Compatibility shim so that DviFont can be used with
588+
``_backend_pdf_ps.CharacterTracker``; not a real filename.
586589
size : float
587590
Size of the font in Adobe points, converted from the slightly
588591
smaller TeX points.
@@ -602,6 +605,18 @@ def __init__(self, scale, tfm, texname, vf):
602605
(1000 * self._tfm.width.get(char, 0)) >> 20
603606
for char in range(max(self._tfm.width, default=-1) + 1)]))
604607

608+
@property
def fname(self):
    """A stand-in filename: the TeX font name decoded as Latin-1."""
    tex_name = self.texname
    return tex_name.decode('latin-1')
612+
613+
def _get_fontmap(self, string):
614+
"""Get the mapping from characters to the font that includes them.
615+
616+
Each value maps to self; there is no fallback mechanism for DviFont.
617+
"""
618+
return {char: self for char in string}
619+
605620
def __eq__(self, other):
606621
return (type(self) is type(other)
607622
and self.texname == other.texname and self.size == other.size)
@@ -1161,8 +1176,8 @@ def _fontfile(cls, suffix, texname):
11611176

11621177

11631178
if __name__ == '__main__':
1164-
from argparse import ArgumentParser
11651179
import itertools
1180+
from argparse import ArgumentParser
11661181

11671182
import fontTools.agl
11681183

‎lib/matplotlib/dviread.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ class DviFont:
6666
def __ne__(self, other: object) -> bool: ...
6767
@property
6868
def widths(self) -> list[int]: ...
69+
@property
70+
def fname(self) -> str: ...
6971

7072
class Vf(Dvi):
7173
def __init__(self, filename: str | os.PathLike) -> None: ...
Binary file not shown.
Binary file not shown.
Binary file not shown.

‎lib/matplotlib/tests/test_backend_pdf.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from matplotlib.backends._backend_pdf_ps import get_glyphs_subset, font_as_file
1717
from matplotlib.backends.backend_pdf import PdfPages
1818
from matplotlib.patches import Rectangle
19-
from matplotlib.testing import _gen_multi_font_text
19+
from matplotlib.testing import _gen_multi_font_text, _has_tex_package
2020
from matplotlib.testing.decorators import check_figures_equal, image_comparison
2121
from matplotlib.testing._markers import needs_usetex
2222

@@ -428,3 +428,53 @@ def test_truetype_conversion(recwarn):
428428
font=Path(__file__).parent / "data/mpltest.ttf", fontsize=80)
429429
ax.set_xticks([])
430430
ax.set_yticks([])
431+
432+
433+
@pytest.mark.skipif(not _has_tex_package("heuristica"),
                    reason="LaTeX lacks heuristica package")
@image_comparison(["font-heuristica.pdf"])
def test_font_heuristica():
    """Compare usetex PDF output typeset with the heuristica package."""
    # Heuristica uses the callothersubr operator for some glyphs
    preamble_lines = [
        r'\usepackage{heuristica}',
        r'\usepackage[T1]{fontenc}',
        r'\usepackage[utf8]{inputenc}',
    ]
    mpl.rcParams['text.latex.preamble'] = '\n'.join(preamble_lines)

    _, axes = plt.subplots()
    axes.text(0.1, 0.1, r"BHTem fi ffl 1234", usetex=True, fontsize=50)
    # Hide the ticks so that only the sample text is drawn
    axes.set_xticks([])
    axes.set_yticks([])
447+
448+
449+
@pytest.mark.skipif(not _has_tex_package("DejaVuSans"),
                    reason="LaTeX lacks DejaVuSans package")
@image_comparison(["font-dejavusans.pdf"])
def test_font_dejavusans():
    """Compare usetex PDF output typeset with the DejaVuSans package."""
    # DejaVuSans uses the seac operator to compose characters with diacritics
    preamble_lines = [
        r'\usepackage{DejaVuSans}',
        r'\usepackage[T1]{fontenc}',
        r'\usepackage[utf8]{inputenc}',
    ]
    mpl.rcParams['text.latex.preamble'] = '\n'.join(preamble_lines)

    _, axes = plt.subplots()
    axes.text(0.1, 0.1, r"\textsf{ñäö ABCDabcd}", usetex=True, fontsize=50)
    axes.text(0.1, 0.3, r"\textsf{fi ffl 1234}", usetex=True, fontsize=50)
    # Hide the ticks so that only the sample text is drawn
    axes.set_xticks([])
    axes.set_yticks([])
465+
466+
467+
@pytest.mark.skipif(not _has_tex_package("charter"),
                    reason="LaTeX lacks charter package")
@image_comparison(["font-bitstream-charter.pdf"])
def test_font_bitstream_charter():
    """Compare usetex PDF output typeset with the charter package."""
    preamble_lines = [
        r'\usepackage{charter}',
        r'\usepackage[T1]{fontenc}',
        r'\usepackage[utf8]{inputenc}',
    ]
    mpl.rcParams['text.latex.preamble'] = '\n'.join(preamble_lines)

    _, axes = plt.subplots()
    axes.text(0.1, 0.1, r"åüš ABCDabcd", usetex=True, fontsize=50)
    axes.text(0.1, 0.3, r"fi ffl 1234", usetex=True, fontsize=50)
    # Hide the ticks so that only the sample text is drawn
    axes.set_xticks([])
    axes.set_yticks([])

‎lib/matplotlib/tests/test_usetex.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
from tempfile import TemporaryFile
23

34
import numpy as np
@@ -156,6 +157,69 @@ def test_missing_psfont(fmt, monkeypatch):
156157
fig.savefig(tmpfile, format=fmt)
157158

158159

160+
def test_pdf_type1_font_subsetting():
    """
    Test that fonts in PDF output are properly subset.

    Saves a usetex figure as PDF, then checks that every font on the first
    page is an embedded Type 1 font whose name carries a subset tag and
    whose embedded font stream is small.  Skipped when pikepdf is missing.
    """
    pikepdf = pytest.importorskip("pikepdf")

    mpl.rcParams["text.usetex"] = True
    mpl.rcParams["text.latex.preamble"] = r"\usepackage{amssymb}"
    fig, ax = plt.subplots()
    ax.text(0.2, 0.7, r"$\int_{-\infty}^{\aleph}\sqrt{\alpha\beta\gamma}\mathrm{d}x$")
    ax.text(0.2, 0.5, r"$\mathfrak{x}\circledcirc\mathfrak{y}\in\mathbb{R}$")

    # All PDF inspection happens inside the ``with`` block so that the
    # temporary file is still open while the PDF object is in use.
    with TemporaryFile() as tmpfile:
        fig.savefig(tmpfile, format="pdf")
        tmpfile.seek(0)
        pdf = pikepdf.Pdf.open(tmpfile)

        # Maps each font's name (without the subset tag) to the size in
        # bytes of its embedded font stream.
        stream_lengths = {}
        page = pdf.pages[0]
        for font_name, font in page.Resources.Font.items():
            assert font.Subtype == "/Type1", (
                f"Font {font_name}={font} is not a Type 1 font"
            )

            # Subsetted font names have a 6-character tag followed by a '+'
            base_font = str(font["/BaseFont"]).removeprefix("/")
            assert re.match(r"^[A-Z]{6}\+", base_font), (
                f"Font {font_name}={base_font} lacks a subset indicator tag"
            )
            assert "/FontFile" in font.FontDescriptor, (
                f"Type 1 font {font_name}={base_font} is not embedded"
            )
            _, original_name = base_font.split("+", 1)
            font_file = font["/FontDescriptor"]["/FontFile"]
            stream_lengths[original_name] = len(bytes(font_file))

        print("Embedded font stream lengths:", stream_lengths)
        # We should have several fonts, each much smaller than the original.
        # I get under 10kB on my system for each font, but allow 15kB in case
        # of differences in the font files.
        assert {
            'CMEX10',
            'CMMI12',
            'CMR12',
            'CMSY10',
            'CMSY8',
            'EUFM10',
            'MSAM10',
            'MSBM10',
        }.issubset(stream_lengths), "Missing expected fonts in the PDF"
        # Use names distinct from the dict so the loop does not rebind the
        # mapping it is iterating over.
        for font_name, stream_length in stream_lengths.items():
            assert stream_length < 15_000, (
                f"Font {font_name}={stream_length} is larger than expected"
            )

        # For comparison, lengths without subsetting on my system:
        #  'CMEX10': 29686
        #  'CMMI12': 36176
        #  'CMR12': 32157
        #  'CMSY10': 32004
        #  'CMSY8': 32061
        #  'EUFM10': 20546
        #  'MSAM10': 31199
        #  'MSBM10': 34129
221+
222+
159223
try:
160224
_old_gs_version = mpl._get_executable_info('gs').version < parse_version('9.55')
161225
except mpl.ExecutableNotFoundError:

0 commit comments

Comments
 (0)
Please sign in to comment.