| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139 |
- """
- A module for reading dvi files output by TeX. Several limitations make
- this not (currently) useful as a general-purpose dvi preprocessor, but
- it is currently used by the pdf backend for processing usetex text.
- Interface::
- with Dvi(filename, 72) as dvi:
- # iterate over pages:
- for page in dvi:
- w, h, d = page.width, page.height, page.descent
- for x, y, font, glyph, width in page.text:
- fontname = font.texname
- pointsize = font.size
- ...
- for x, y, height, width in page.boxes:
- ...
- """
- from collections import namedtuple
- import enum
- from functools import lru_cache, partial, wraps
- import logging
- import os
- from pathlib import Path
- import re
- import struct
- import subprocess
- import sys
- import numpy as np
- from matplotlib import _api, cbook
- _log = logging.getLogger(__name__)
- # Many dvi related files are looked for by external processes, require
- # additional parsing, and are used many times per rendering, which is why they
- # are cached using lru_cache().
- # Dvi is a bytecode format documented in
- # https://ctan.org/pkg/dvitype
- # https://texdoc.org/serve/dvitype.pdf/0
- #
- # The file consists of a preamble, some number of pages, a postamble,
- # and a finale. Different opcodes are allowed in different contexts,
- # so the Dvi object has a parser state:
- #
- # pre: expecting the preamble
- # outer: between pages (followed by a page or the postamble,
- # also e.g. font definitions are allowed)
- # inpage: processing a page
- # post_post: state after the postamble (our current implementation
- # just stops reading)
- # finale: the finale (unimplemented in our current implementation)
# Parser states of a Dvi reader; members are numbered in the order listed.
_dvistate = enum.Enum('DviState', names='pre outer inpage post_post finale')

# A page is made of text marks and box marks, plus its overall dimensions.
Page = namedtuple(typename='Page',
                  field_names='text boxes height width descent')
Box = namedtuple(typename='Box', field_names='x y height width')
# Kept as a namedtuple subclass so that tuple-style consumers keep working.
class Text(namedtuple('Text', 'x y font glyph width')):
    """
    One glyph placed on a dvi page.

    *x* and *y* give the position of the glyph.  *font*, *glyph* and *width*
    remain public for back-compatibility; code that wants to draw the glyph
    itself should rather load the font given by `font_path` at `font_size`,
    apply the transformations described by `font_effects`, and pick the
    glyph designated by `glyph_name_or_index`.
    """

    def _get_pdftexmap_entry(self):
        """Return the :file:`pdftex.map` entry for this glyph's font."""
        fontmap = PsfontsMap(find_tex_file("pdftex.map"))
        return fontmap[self.font.texname]

    @property
    def font_path(self):
        """The `~pathlib.Path` to the font for this glyph."""
        entry = self._get_pdftexmap_entry()
        if entry.filename is not None:
            return Path(entry.filename)
        # The map knows the font but names no file for it.
        raise ValueError("No usable font file found for {} ({}); "
                         "the font may lack a Type-1 version"
                         .format(entry.psname.decode("ascii"),
                                 entry.texname.decode("ascii")))

    @property
    def font_size(self):
        """The font size."""
        return self.font.size

    @property
    def font_effects(self):
        """
        The "font effects" dict for this glyph.

        It holds the SlantFont and ExtendFont values (if any) that
        :file:`pdftex.map` lists for this glyph's font.
        """
        entry = self._get_pdftexmap_entry()
        return entry.effects

    @property
    def glyph_name_or_index(self):
        """
        Either the glyph name or the native charmap glyph index.

        When :file:`pdftex.map` gives an encoding for this glyph's font, the
        encoding maps glyph indices to Adobe glyph names and is used here to
        turn the dvi index into a glyph name.  Callers can then translate the
        name into a glyph index (FT_Get_Name_Index/get_name_index) and load
        the glyph with FT_Load_Glyph/load_glyph.

        When no encoding is given, the index refers directly to the font's
        "native" charmap; the glyph should be loaded with
        FT_Load_Char/load_char after selecting that charmap.
        """
        entry = self._get_pdftexmap_entry()
        if entry.encoding is None:
            return self.glyph
        return _parse_enc(entry.encoding)[self.glyph]
# Opcode argument parsing
#
# Every entry below is called with a Dvi object and *delta*, the distance
# between the opcode and the lowest opcode sharing its meaning.  Dvi opcodes
# frequently encode the byte count of their argument in that distance.
_arg_mapping = {
    # raw: hand back delta unchanged.
    'raw': lambda dvi, delta: delta,
    # u1: one byte, unsigned.
    'u1': lambda dvi, delta: dvi._read_arg(1, signed=False),
    # u4: four bytes, unsigned.
    'u4': lambda dvi, delta: dvi._read_arg(4, signed=False),
    # s4: four bytes, signed.
    's4': lambda dvi, delta: dvi._read_arg(4, signed=True),
    # slen: delta bytes, signed; a zero delta reads nothing and gives None.
    'slen': lambda dvi, delta: (
        dvi._read_arg(delta, signed=True) if delta else None),
    # slen1: (delta + 1) bytes, signed.
    'slen1': lambda dvi, delta: dvi._read_arg(delta + 1, signed=True),
    # ulen1: (delta + 1) bytes, unsigned.
    'ulen1': lambda dvi, delta: dvi._read_arg(delta + 1, signed=False),
    # olen1: (delta + 1) bytes; unsigned below four bytes, signed at four.
    'olen1': lambda dvi, delta: dvi._read_arg(delta + 1, signed=(delta == 3)),
}
def _dispatch(table, min, max=None, state=None, args=('raw',)):
    """
    Decorator registering a method as the handler of a range of opcodes.

    The decorated method is stored in *table* for every opcode from *min* to
    *max*.  The stored wrapper first checks that the Dvi state equals *state*
    (unless *state* is None), then reads the arguments described by *args*
    and passes them on to the method.

    Parameters
    ----------
    table : dict[int, callable]
        The dispatch table to be filled in.
    min, max : int
        Range of opcodes that calls the registered function; *max* defaults to
        *min*.
    state : _dvistate, optional
        State of the Dvi object in which these opcodes are allowed.
    args : list[str], default: ['raw']
        Sequence of argument specifications:

        - 'raw': opcode minus minimum
        - 'u1': read one unsigned byte
        - 'u4': read four bytes, treat as an unsigned number
        - 's4': read four bytes, treat as a signed number
        - 'slen': read (opcode - minimum) bytes, treat as signed
        - 'slen1': read (opcode - minimum + 1) bytes, treat as signed
        - 'ulen1': read (opcode - minimum + 1) bytes, treat as unsigned
        - 'olen1': read (opcode - minimum + 1) bytes, treat as unsigned
          if under four bytes, signed if four bytes
    """
    def decorate(method):
        # Resolve the argument readers once, at decoration time.
        readers = [_arg_mapping[spec] for spec in args]

        @wraps(method)
        def wrapper(self, byte):
            if state is not None and self.state != state:
                raise ValueError("state precondition failed")
            delta = byte - min
            parsed = [read(self, delta) for read in readers]
            return method(self, *parsed)

        if max is not None:
            for opcode in range(min, max + 1):
                # Each slot of a range must not have been claimed yet.
                assert table[opcode] is None
                table[opcode] = wrapper
        else:
            table[min] = wrapper
        return wrapper

    return decorate
class Dvi:
    """
    Sequential reader for a dvi ("device-independent") file written by TeX.

    Pages can only be visited in file order, by iterating over the instance,
    and no attempt is made to verify the postamble.

    Instances are context managers that close the underlying file on exit.
    A deliberately naive way of dumping the text of a file, without trying
    to detect whitespace::

        >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi:
        ...     for page in dvi:
        ...         print(''.join(chr(t.glyph) for t in page.text))
    """

    # Opcode -> handler table, filled in by the @_dispatch decorators below.
    _dtable = [None] * 256
    _dispatch = partial(_dispatch, _dtable)

    def __init__(self, filename, dpi):
        """
        Open the file named *filename*; TeX's internal units are converted
        to units of *dpi* per inch.

        *dpi* only selects the output units, it does not limit resolution.
        Pass None to get TeX's internal units unchanged.
        """
        _log.debug('Dvi: %s', filename)
        self.file = open(filename, 'rb')
        self.dpi = dpi
        self.fonts = {}
        self.state = _dvistate.pre
        self._missing_font = None

    def __enter__(self):
        """Context manager enter method, does nothing."""
        return self

    def __exit__(self, etype, evalue, etrace):
        """
        Context manager exit method, closes the underlying file if it is open.
        """
        self.close()

    def __iter__(self):
        """
        Iterate through the pages of the file.

        Yields
        ------
        Page
            Details of all the text and box objects on the page.
            The Page tuple contains lists of Text and Box tuples and
            the page dimensions, and the Text and Box tuples contain
            coordinates transformed into a standard Cartesian
            coordinate system at the dpi value given when initializing.
            The coordinates are floating point numbers, but otherwise
            precision is not lost and coordinate values are not clipped to
            integers.
        """
        while self._read():
            yield self._output()

    def close(self):
        """Close the underlying file if it is open."""
        if self.file.closed:
            return
        self.file.close()

    def _output(self):
        """
        Build the Page for the most recently read page.

        The page extents are the bounding box of all glyphs and boxes; the
        baseline recorded by `_read` (when there is one) replaces the lowest
        reference point as the split between height and descent.
        """
        left = top = np.inf
        right = bottom = -np.inf
        baseline = -np.inf
        for mark in self.text + self.boxes:
            if isinstance(mark, Box):
                x, y, height, width = mark
                depth = 0  # boxes have no depth
            else:  # a glyph
                x, y, font, glyph, width = mark
                height, depth = font._height_depth_of(glyph)
            left = min(left, x)
            top = min(top, y - height)
            right = max(right, x + width)
            bottom = max(bottom, y + depth)
            baseline = max(baseline, y)
        if self._baseline_v is not None:
            baseline = self._baseline_v  # This should normally be the case.
            self._baseline_v = None

        if not (self.text or self.boxes):
            # Nothing on the page: avoid infs/nans from inf+/-inf.
            return Page(text=[], boxes=[], width=0, height=0, descent=0)

        if self.dpi is None:
            # Debugging aid: hand out the raw dvi coordinates.
            return Page(text=self.text, boxes=self.boxes,
                        width=right-left, height=baseline-top,
                        descent=bottom-baseline)

        # Scale TeX's "scaled points" to dpi units.
        d = self.dpi / (72.27 * 2**16)
        descent = (bottom - baseline) * d

        text = [
            Text((x-left)*d, (bottom-y)*d - descent, f, g, w*d)
            for (x, y, f, g, w) in self.text
        ]
        boxes = [
            Box((x-left)*d, (bottom-y)*d - descent, h*d, w*d)
            for (x, y, h, w) in self.boxes
        ]

        return Page(text=text, boxes=boxes, width=(right-left)*d,
                    height=(baseline-top)*d, descent=descent)

    def _read(self):
        """
        Read one page from the file. Return True if successful,
        False if there were no more pages.
        """
        # Pages appear to start with the sequence
        #   bop (begin of page)
        #   xxx comment
        #   <push, ..., pop>  # if using chemformula
        #   down
        #   push
        #     down
        #     <push, push, xxx, right, xxx, pop, pop>  # if using xcolor
        #     down
        #     push
        #       down (possibly multiple)
        #       push  <=  here, v is the baseline position.
        #         etc.
        # (dviasm is useful to explore this structure.)
        # The baseline is therefore taken to be the vertical position the
        # first time the stack depth is 3 while the "down" counter of the
        # current level is at least 4.  Counters are pushed and popped with
        # the stack, so "downs" inside a popped group (the chemformula
        # preamble) are forgotten; counting "downs" at all is what makes
        # xcolor output work.
        downs = [0]
        self._baseline_v = None
        while True:
            byte = self.file.read(1)[0]
            handler = self._dtable[byte]
            handler(self, byte)
            if self._missing_font:
                raise self._missing_font.to_exception()
            opname = handler.__name__
            if opname == "_push":
                downs.append(downs[-1])
            elif opname == "_pop":
                downs.pop()
            elif opname == "_down":
                downs[-1] += 1
            if (self._baseline_v is None
                    and len(getattr(self, "stack", [])) == 3
                    and downs[-1] >= 4):
                self._baseline_v = self.v
            if byte == 140:  # end of page
                return True
            if self.state is _dvistate.post_post:  # end of file
                self.close()
                return False

    def _read_arg(self, nbytes, signed=False):
        """
        Read and return a big-endian integer *nbytes* long.
        Signedness is determined by the *signed* keyword.
        """
        raw = self.file.read(nbytes)
        return int.from_bytes(raw, "big", signed=signed)

    @_dispatch(min=0, max=127, state=_dvistate.inpage)
    def _set_char_immediate(self, char):
        """Typeset *char* (the opcode itself) and advance by its width."""
        self._put_char_real(char)
        font = self.fonts[self.f]
        if not isinstance(font, cbook._ExceptionInfo):
            self.h += font._width_of(char)

    @_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',))
    def _set_char(self, char):
        """Typeset *char* (read from the file) and advance by its width."""
        self._put_char_real(char)
        font = self.fonts[self.f]
        if not isinstance(font, cbook._ExceptionInfo):
            self.h += font._width_of(char)

    @_dispatch(132, state=_dvistate.inpage, args=('s4', 's4'))
    def _set_rule(self, a, b):
        """Draw a rule of height *a* and width *b*, then advance by *b*."""
        self._put_rule_real(a, b)
        self.h += b

    @_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',))
    def _put_char(self, char):
        """Typeset *char* without moving the current position."""
        self._put_char_real(char)

    def _put_char_real(self, char):
        """
        Record glyph *char* of the current font at the current position.

        A missing font is only remembered in ``_missing_font``; a virtual
        font is expanded into the glyphs and boxes of its packet.
        """
        font = self.fonts[self.f]
        if isinstance(font, cbook._ExceptionInfo):
            self._missing_font = font
            return
        if font._vf is None:
            self.text.append(
                Text(self.h, self.v, font, char, font._width_of(char)))
            return
        # Virtual font: replay the packet, scaled by the font's scale.
        scale = font._scale
        for x, y, f, g, w in font._vf[char].text:
            scaled_font = DviFont(scale=_mul2012(scale, f._scale),
                                  tfm=f._tfm, texname=f.texname, vf=f._vf)
            self.text.append(Text(self.h + _mul2012(x, scale),
                                  self.v + _mul2012(y, scale),
                                  scaled_font, g, scaled_font._width_of(g)))
        self.boxes.extend([
            Box(self.h + _mul2012(x, scale),
                self.v + _mul2012(y, scale),
                _mul2012(a, scale), _mul2012(b, scale))
            for x, y, a, b in font._vf[char].boxes
        ])

    @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4'))
    def _put_rule(self, a, b):
        """Draw a rule of height *a* and width *b* without moving."""
        self._put_rule_real(a, b)

    def _put_rule_real(self, a, b):
        """Record a box at the current position if *a* and *b* are positive."""
        if a > 0 and b > 0:
            self.boxes.append(Box(self.h, self.v, a, b))

    @_dispatch(138)
    def _nop(self, _):
        """Do nothing."""
        pass

    @_dispatch(139, state=_dvistate.outer, args=('s4',)*11)
    def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p):
        """Begin a page: reset the registers, the stack and the marks."""
        self.state = _dvistate.inpage
        self.h = self.v = self.w = self.x = self.y = self.z = 0
        self.stack = []
        self.text = []   # list of Text objects
        self.boxes = []  # list of Box objects

    @_dispatch(140, state=_dvistate.inpage)
    def _eop(self, _):
        """End the page and drop the per-page registers and stack."""
        self.state = _dvistate.outer
        del self.h, self.v, self.w, self.x, self.y, self.z, self.stack

    @_dispatch(141, state=_dvistate.inpage)
    def _push(self, _):
        """Save the six position registers on the stack."""
        registers = (self.h, self.v, self.w, self.x, self.y, self.z)
        self.stack.append(registers)

    @_dispatch(142, state=_dvistate.inpage)
    def _pop(self, _):
        """Restore the six position registers from the stack."""
        self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop()

    @_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',))
    def _right(self, b):
        """Move right by *b*."""
        self.h += b

    @_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',))
    def _right_w(self, new_w):
        """Move right by ``w``, first setting ``w`` if *new_w* is given."""
        if new_w is not None:
            self.w = new_w
        self.h += self.w

    @_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',))
    def _right_x(self, new_x):
        """Move right by ``x``, first setting ``x`` if *new_x* is given."""
        if new_x is not None:
            self.x = new_x
        self.h += self.x

    @_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',))
    def _down(self, a):
        """Move down by *a*."""
        self.v += a

    @_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',))
    def _down_y(self, new_y):
        """Move down by ``y``, first setting ``y`` if *new_y* is given."""
        if new_y is not None:
            self.y = new_y
        self.v += self.y

    @_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',))
    def _down_z(self, new_z):
        """Move down by ``z``, first setting ``z`` if *new_z* is given."""
        if new_z is not None:
            self.z = new_z
        self.v += self.z

    @_dispatch(min=171, max=234, state=_dvistate.inpage)
    def _fnt_num_immediate(self, k):
        """Select font number *k* (encoded in the opcode)."""
        self.f = k

    @_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',))
    def _fnt_num(self, new_f):
        """Select font number *new_f* (read from the file)."""
        self.f = new_f

    @_dispatch(min=239, max=242, args=('ulen1',))
    def _xxx(self, datalen):
        """Skip a special of *datalen* bytes, logging its contents."""
        special = self.file.read(datalen)
        # Non-printable bytes are shown as <hex>.
        printable = ''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch
                             for ch in special])
        _log.debug('Dvi._xxx: encountered special: %s', printable)

    @_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1'))
    def _fnt_def(self, k, c, s, d, a, l):
        """Define font number *k*; see `_fnt_def_real`."""
        self._fnt_def_real(k, c, s, d, a, l)

    def _fnt_def_real(self, k, c, s, d, a, l):
        """
        Read a font name of ``a + l`` bytes and register the font under *k*.

        *c* is compared with the tfm checksum and *s* becomes the scale of
        the resulting `DviFont`; *d* is not used.
        """
        raw_name = self.file.read(a + l)
        fontname = raw_name[-l:].decode('ascii')
        try:
            tfm = _tfmfile(fontname)
        except FileNotFoundError as exc:
            # Explicitly allow defining missing fonts for Vf support; we only
            # register an error when trying to load a glyph from a missing font
            # and throw that error in Dvi._read.  For Vf, _finalize_packet
            # checks whether a missing glyph has been used, and in that case
            # skips the glyph definition.
            self.fonts[k] = cbook._ExceptionInfo.from_exception(exc)
            return
        if c != 0 and tfm.checksum != 0 and c != tfm.checksum:
            raise ValueError(f'tfm checksum mismatch: {raw_name}')
        try:
            vf = _vffile(fontname)
        except FileNotFoundError:
            vf = None  # not a virtual font
        self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=raw_name, vf=vf)

    @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1'))
    def _pre(self, i, num, den, mag, k):
        """Check the preamble and switch to the between-pages state."""
        self.file.read(k)  # comment in the dvi file
        if i != 2:
            raise ValueError(f"Unknown dvi format {i}")
        if num != 25400000 or den != 7227 * 2**16:
            raise ValueError("Nonstandard units in dvi file")
            # meaning: TeX always uses those exact values, so it
            # should be enough for us to support those
            # (There are 72.27 pt to an inch so 7227 pt =
            # 7227 * 2**16 sp to 100 in. The numerator is multiplied
            # by 10^5 to get units of 10**-7 meters.)
        if mag != 1000:
            raise ValueError("Nonstandard magnification in dvi file")
            # meaning: LaTeX seems to frown on setting \mag, so
            # I think we can assume this is constant
        self.state = _dvistate.outer

    @_dispatch(248, state=_dvistate.outer)
    def _post(self, _):
        """Enter the post_post state, which makes `_read` stop."""
        self.state = _dvistate.post_post
        # TODO: actually read the postamble and finale?
        # currently post_post just triggers closing the file

    @_dispatch(249)
    def _post_post(self, _):
        """Not implemented."""
        raise NotImplementedError

    @_dispatch(min=250, max=255)
    def _malformed(self, offset):
        """Reject the opcodes 250-255."""
        raise ValueError(f"unknown command: byte {250 + offset}")
class DviFont:
    """
    A font that a DVI file can refer to.

    Instances carry the font's texname and size, compare equal when both of
    these match, and know the glyph widths in the same units as the AFM file.
    The remaining, private attributes (for use by dviread.py) take *no* part
    in comparisons.

    The size is in Adobe points (converted from TeX points).

    Parameters
    ----------
    scale : float
        Factor by which the font is scaled from its natural size.
    tfm : Tfm
        TeX font metrics for this font
    texname : bytes
        Name of the font as used internally by TeX and friends, as an ASCII
        bytestring.  This is usually very different from any external font
        names; `PsfontsMap` can be used to find the external name of the font.
    vf : Vf
        A TeX "virtual font" file, or None if this font is not virtual.

    Attributes
    ----------
    texname : bytes
    size : float
        Size of the font in Adobe points, converted from the slightly
        smaller TeX points.
    widths : list
        Widths of glyphs in glyph-space units, typically 1/1000ths of
        the point size.
    """

    __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm')

    def __init__(self, scale, tfm, texname, vf):
        _api.check_isinstance(bytes, texname=texname)
        self._scale = scale
        self._tfm = tfm
        self.texname = texname
        self._vf = vf
        self.size = scale * (72.0 / (72.27 * 2**16))
        # One entry per character code up to the largest one in the tfm;
        # codes without metrics get a width of 0.
        nchars = max(tfm.width, default=-1) + 1
        self.widths = [(1000*tfm.width.get(code, 0)) >> 20
                       for code in range(nchars)]

    def __eq__(self, other):
        if type(self) is not type(other):
            return False
        return self.texname == other.texname and self.size == other.size

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        cls_name = type(self).__name__
        return f"<{cls_name}: {self.texname}>"

    def _width_of(self, char):
        """Width of char in dvi units."""
        raw_width = self._tfm.width.get(char, None)
        if raw_width is None:
            _log.debug('No width for char %d in font %s.', char, self.texname)
            return 0
        return _mul2012(raw_width, self._scale)

    def _height_depth_of(self, char):
        """Height and depth of char in dvi units."""
        metrics = []
        for table, label in [(self._tfm.height, "height"),
                             (self._tfm.depth, "depth")]:
            raw = table.get(char, None)
            if raw is not None:
                metrics.append(_mul2012(raw, self._scale))
            else:
                _log.debug('No %s for char %d in font %s',
                           label, char, self.texname)
                metrics.append(0)
        # cmsyXX (symbols font) glyph 0 ("minus") has a nonzero descent
        # so that TeX aligns equations properly
        # (https://tex.stackexchange.com/q/526103/)
        # but we actually care about the rasterization depth to align
        # the dvipng-generated images.
        if re.match(br'^cmsy\d+$', self.texname) and char == 0:
            metrics[-1] = 0
        return metrics
class Vf(Dvi):
    r"""
    A virtual font (\*.vf file) containing subroutines for dvi files.

    Parameters
    ----------
    filename : str or path-like

    Notes
    -----
    The virtual font format is a derivative of dvi:
    http://mirrors.ctan.org/info/knuth/virtual-fonts
    This class reuses some of the machinery of `Dvi`
    but replaces the `_read` loop and dispatch mechanism.

    Examples
    --------
    ::

        vf = Vf(filename)
        glyph = vf[code]
        glyph.text, glyph.boxes, glyph.width
    """

    def __init__(self, filename):
        # The dpi value is irrelevant here: packets are stored in raw units.
        super().__init__(filename, 0)
        try:
            self._first_font = None
            self._chars = {}
            self._read()
        finally:
            # The whole file is parsed eagerly, so it never has to stay open.
            self.close()

    def __getitem__(self, code):
        """Return the `Page` holding the packet of character *code*."""
        return self._chars[code]

    def _read(self):
        """
        Parse the vf file up to its postamble, storing every character
        packet in ``self._chars``.

        Raises
        ------
        ValueError
            If a packet overruns its declared length, if a packet contains
            an opcode that is not allowed inside a packet, or if an unknown
            opcode appears outside of a packet.
        """
        packet_char = packet_ends = None
        packet_len = packet_width = None
        while True:
            byte = self.file.read(1)[0]
            # If we are in a packet, execute the dvi instructions
            if self.state is _dvistate.inpage:
                byte_at = self.file.tell()-1
                if byte_at == packet_ends:
                    self._finalize_packet(packet_char, packet_width)
                    packet_len = packet_char = packet_width = None
                    # fall through to out-of-packet code
                elif byte_at > packet_ends:
                    raise ValueError("Packet length mismatch in vf file")
                else:
                    if byte in (139, 140) or byte >= 243:
                        raise ValueError(f"Inappropriate opcode {byte} in vf file")
                    Dvi._dtable[byte](self, byte)
                    continue

            # We are outside a packet
            if byte < 242:          # a short packet (length given by byte)
                packet_len = byte
                packet_char = self._read_arg(1)
                packet_width = self._read_arg(3)
                packet_ends = self._init_packet(byte)
                self.state = _dvistate.inpage
            elif byte == 242:       # a long packet
                packet_len = self._read_arg(4)
                packet_char = self._read_arg(4)
                packet_width = self._read_arg(4)
                # As for short packets: remember where the packet ends and
                # switch to in-packet mode, so that the following bytes are
                # executed as dvi instructions instead of being parsed as
                # top-level vf opcodes.
                packet_ends = self._init_packet(packet_len)
                self.state = _dvistate.inpage
            elif 243 <= byte <= 246:
                k = self._read_arg(byte - 242, byte == 246)
                c = self._read_arg(4)
                s = self._read_arg(4)
                d = self._read_arg(4)
                a = self._read_arg(1)
                l = self._read_arg(1)
                self._fnt_def_real(k, c, s, d, a, l)
                # The first font defined is the one packets start with.
                if self._first_font is None:
                    self._first_font = k
            elif byte == 247:       # preamble
                i = self._read_arg(1)
                k = self._read_arg(1)
                x = self.file.read(k)
                cs = self._read_arg(4)
                ds = self._read_arg(4)
                self._pre(i, x, cs, ds)
            elif byte == 248:       # postamble (just some number of 248s)
                break
            else:
                raise ValueError(f"Unknown vf opcode {byte}")

    def _init_packet(self, pl):
        """
        Reset the interpreter state for a packet of *pl* bytes starting at
        the current file position; return the offset at which it ends.
        """
        if self.state != _dvistate.outer:
            raise ValueError("Misplaced packet in vf file")
        self.h = self.v = self.w = self.x = self.y = self.z = 0
        self.stack = []
        self.text = []
        self.boxes = []
        self.f = self._first_font
        self._missing_font = None
        return self.file.tell() + pl

    def _finalize_packet(self, packet_char, packet_width):
        """
        Store the marks collected for *packet_char* (unless the packet used
        a missing font) and return to the between-packets state.
        """
        if not self._missing_font:  # Otherwise we don't have full glyph definition.
            self._chars[packet_char] = Page(
                text=self.text, boxes=self.boxes, width=packet_width,
                height=None, descent=None)
        self.state = _dvistate.outer

    def _pre(self, i, x, cs, ds):
        """
        Handle the vf preamble: *i* is the format identifier (must be 202)
        and *x* the comment; *cs* (checksum) and *ds* (design size) are not
        used.
        """
        if self.state is not _dvistate.pre:
            raise ValueError("pre command in middle of vf file")
        if i != 202:
            raise ValueError(f"Unknown vf format {i}")
        if len(x):
            _log.debug('vf file comment: %s', x)
        self.state = _dvistate.outer
        # cs = checksum, ds = design size
def _mul2012(num1, num2):
    """Multiply two numbers in 20.12 fixed point format."""
    # Kept as a helper so callers need not worry about the precedence of >>.
    product = num1 * num2
    return product >> 20
class Tfm:
    """
    A TeX Font Metric file.

    Only the small part of the format that the Dvi class needs is read.

    Parameters
    ----------
    filename : str or path-like

    Attributes
    ----------
    checksum : int
        Used for verifying against the dvi file.
    design_size : int
        Design size of the font (unknown units)
    width, height, depth : dict
        Dimensions of each character, need to be scaled by the factor
        specified in the dvi file. These are dicts because indexing may
        not start from 0.
    """

    __slots__ = ('checksum', 'design_size', 'width', 'height', 'depth')

    def __init__(self, filename):
        _log.debug('opening tfm file %s', filename)
        with open(filename, 'rb') as file:
            header1 = file.read(24)
            # Six big-endian 16-bit counts, starting at byte 2 of the header.
            lh, bc, ec, nw, nh, nd = struct.unpack('!6H', header1[2:14])
            _log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d',
                       lh, bc, ec, nw, nh, nd)
            header2 = file.read(4*lh)
            self.checksum, self.design_size = struct.unpack('!2I', header2[:8])
            # there is also encoding information etc.
            char_info = file.read(4*(ec-bc+1))
            widths = struct.unpack(f'!{nw}i', file.read(4*nw))
            heights = struct.unpack(f'!{nh}i', file.read(4*nh))
            depths = struct.unpack(f'!{nd}i', file.read(4*nd))
        self.width = {}
        self.height = {}
        self.depth = {}
        for char in range(bc, ec+1):
            # Four char_info bytes per character: the first indexes the
            # width table; the second holds the height index in its high
            # nibble and the depth index in its low nibble.
            base = 4*(char - bc)
            width_index = char_info[base]
            packed = char_info[base+1]
            self.width[char] = widths[width_index]
            self.height[char] = heights[packed >> 4]
            self.depth[char] = depths[packed & 0xf]
# One font-map entry, with the fields in the order listed below.
PsFont = namedtuple(typename='PsFont',
                    field_names='texname psname effects encoding filename')
- class PsfontsMap:
- """
- A psfonts.map formatted file, mapping TeX fonts to PS fonts.
- Parameters
- ----------
- filename : str or path-like
- Notes
- -----
- For historical reasons, TeX knows many Type-1 fonts by different
- names than the outside world. (For one thing, the names have to
- fit in eight characters.) Also, TeX's native fonts are not Type-1
- but Metafont, which is nontrivial to convert to PostScript except
- as a bitmap. While high-quality conversions to Type-1 format exist
- and are shipped with modern TeX distributions, we need to know
- which Type-1 fonts are the counterparts of which native fonts. For
- these reasons a mapping is needed from internal font names to font
- file names.
- A texmf tree typically includes mapping files called e.g.
- :file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`.
- The file :file:`psfonts.map` is used by :program:`dvips`,
- :file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map`
- by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding
- the 35 PostScript fonts (i.e., have no filename for them, as in
- the Times-Bold example above), while the pdf-related files perhaps
- only avoid the "Base 14" pdf fonts. But the user may have
- configured these files differently.
- Examples
- --------
- >>> map = PsfontsMap(find_tex_file('pdftex.map'))
- >>> entry = map[b'ptmbo8r']
- >>> entry.texname
- b'ptmbo8r'
- >>> entry.psname
- b'Times-Bold'
- >>> entry.encoding
- '/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc'
- >>> entry.effects
- {'slant': 0.16700000000000001}
- >>> entry.filename
- """
- __slots__ = ('_filename', '_unparsed', '_parsed')
- # Create a filename -> PsfontsMap cache, so that calling
- # `PsfontsMap(filename)` with the same filename a second time immediately
- # returns the same object.
@lru_cache
def __new__(cls, filename):
    """
    Create the map for *filename*, indexing the file's lines by first word.

    Because of the cache, asking again for the same *filename* returns the
    object built the first time.
    """
    self = object.__new__(cls)
    self._filename = os.fsdecode(filename)
    # Some TeX distributions ship enormous pdftex.map files that would take
    # hundreds of milliseconds to parse.  It is cheap, however, to file the
    # raw lines under their first word (the texname) and parse them only
    # when an entry is actually requested.
    lines_by_name = {}
    with open(filename, 'rb') as file:
        for line in file:
            tfmname = line.partition(b' ')[0]
            lines_by_name.setdefault(tfmname, []).append(line)
    self._unparsed = lines_by_name
    self._parsed = {}
    return self
def __getitem__(self, texname):
    """
    Return the parsed entry for *texname* (a bytes object).

    The raw lines stored for *texname* are parsed on first access, stopping
    at the first line that `_parse_and_cache_line` reports as parsed.
    LookupError is raised when no entry ends up being available.
    """
    assert isinstance(texname, bytes)
    # Lines are removed from the unparsed store as soon as they are tried.
    for line in self._unparsed.pop(texname, ()):
        if self._parse_and_cache_line(line):
            break
    try:
        entry = self._parsed[texname]
    except KeyError:
        raise LookupError(
            f"An associated PostScript font (required by Matplotlib) "
            f"could not be found for TeX font {texname.decode('ascii')!r} "
            f"in {self._filename!r}; this problem can often be solved by "
            f"installing a suitable PostScript font package in your TeX "
            f"package manager") from None
    return entry
def _parse_and_cache_line(self, line):
    """
    Parse a line in the font mapping file and cache the resulting entry.

    The format is (partially) documented at
    http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf
    https://tug.org/texinfohtml/dvips.html#psfonts_002emap

    Each line can have the following fields:

    - tfmname (first, only required field),
    - psname (defaults to tfmname, must come immediately after tfmname if
      present),
    - fontflags (integer, must come immediately after psname if present,
      ignored by us),
    - special (SlantFont and ExtendFont, only field that is double-quoted),
    - fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always
      precedes a font, <[ always precedes an encoding, < can precede either
      but then an encoding file must have extension .enc; < and << also
      request different font subsetting behaviors but we ignore that; < can
      be separated from the filename by whitespace).

    special, fontfile, and encodingfile can appear in any order.

    Parameters
    ----------
    line : bytes
        One raw line of the map file.

    Returns
    -------
    bool or None
        True if an entry was stored in ``self._parsed``.  None if the line
        is empty, a comment, malformed (no tfmname, a ``<`` with no file
        name after it, or a SlantFont/ExtendFont keyword without a numeric
        value), or fails one of the validity checks below.
    """
    if not line or line.startswith((b" ", b"%", b"*", b";", b"#")):
        return
    tfmname = basename = special = encodingfile = fontfile = None
    is_subsetted = is_t1 = is_truetype = False
    # If the map file specifies multiple encodings for a font, we
    # follow pdfTeX in choosing the last one specified.  Such
    # entries are probably mistakes but they have occurred.
    # https://tex.stackexchange.com/q/10826/
    matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line)
    for match in matches:
        quoted, unquoted = match.groups()
        if unquoted:
            if unquoted.startswith(b"<<"):  # font
                fontfile = unquoted[2:]
            elif unquoted.startswith(b"<["):  # encoding
                encodingfile = unquoted[2:]
            elif unquoted.startswith(b"<"):  # font or encoding
                # <foo => foo
                word = unquoted[1:]
                if not word:
                    # < by itself => read the next word
                    following = next(matches, None)
                    if following is not None:
                        word = following.group(1) or following.group(2)
                    if not word:
                        # Dangling "<": ignore the line instead of leaking
                        # StopIteration to the caller.
                        return
                if word.endswith(b".enc"):
                    encodingfile = word
                else:
                    fontfile = word
                    is_subsetted = True
            elif tfmname is None:
                tfmname = unquoted
            elif basename is None:
                basename = unquoted
        elif quoted:
            special = quoted
    if tfmname is None:
        # Without a tfmname there is nothing to key the entry on.
        return
    effects = {}
    if special:
        # The value precedes its keyword ("0.167 SlantFont"), hence the
        # right-to-left scan.
        words = reversed(special.split())
        try:
            for word in words:
                if word == b"SlantFont":
                    effects["slant"] = float(next(words))
                elif word == b"ExtendFont":
                    effects["extend"] = float(next(words))
        except (StopIteration, ValueError):
            # Keyword without a (numeric) value: ignore the line.
            return

    # Verify some properties of the line that would cause it to be ignored
    # otherwise.
    if fontfile is not None:
        if fontfile.endswith((b".ttf", b".ttc")):
            is_truetype = True
        elif not fontfile.endswith(b".otf"):
            is_t1 = True
    elif basename is not None:
        is_t1 = True
    if is_truetype and is_subsetted and encodingfile is None:
        return
    if not is_t1 and ("slant" in effects or "extend" in effects):
        return
    if abs(effects.get("slant", 0)) > 1:
        return
    if abs(effects.get("extend", 0)) > 2:
        return

    if basename is None:
        basename = tfmname
    if encodingfile is not None:
        encodingfile = find_tex_file(encodingfile)
    if fontfile is not None:
        fontfile = find_tex_file(fontfile)
    self._parsed[tfmname] = PsFont(
        texname=tfmname, psname=basename, effects=effects,
        encoding=encodingfile, filename=fontfile)
    return True
def _parse_enc(path):
    r"""
    Parse a \*.enc file referenced from a psfonts.map style file.

    The format supported by this function is a tiny subset of PostScript.

    Parameters
    ----------
    path : `os.PathLike`

    Returns
    -------
    list
        The nth entry of the list is the PostScript glyph name of the nth
        glyph.

    Raises
    ------
    ValueError
        If the file contains no ``[...]`` array, or if the array contains
        a token that does not start with ``/``.
    """
    no_comments = re.sub("%.*", "", Path(path).read_text(encoding="ascii"))
    array = re.search(r"(?s)\[(.*)\]", no_comments)
    # A file without any bracketed array is as unparsable as one with bad
    # tokens; report both the same way instead of failing on ``None``.
    names = array.group(1).split() if array is not None else None
    if names is None or not all(name.startswith("/") for name in names):
        raise ValueError(f"Failed to parse {path} as Postscript encoding")
    return [name[1:] for name in names]
class _LuatexKpsewhich:
    """
    Line-based client for a ``luatex --luaonly kpsewhich.lua`` subprocess.

    Construction is cached, so every call to ``_LuatexKpsewhich()`` returns
    the same instance and reuses the same subprocess.
    """

    @lru_cache  # A singleton.
    def __new__(cls):
        instance = object.__new__(cls)
        instance._proc = instance._new_proc()
        return instance

    def _new_proc(self):
        """Start the luatex helper with piped stdin and stdout."""
        command = [
            "luatex",
            "--luaonly",
            str(cbook._get_data_path("kpsewhich.lua")),
        ]
        return subprocess.Popen(
            command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    def search(self, filename):
        """
        Ask the helper process for *filename*.

        One line holding the file name is written to the helper and one
        line is read back.  Returns the decoded reply, or None if the reply
        is ``nil``.
        """
        if self._proc.poll() is not None:  # Dead, restart it.
            self._proc = self._new_proc()
        request = os.fsencode(filename) + b"\n"
        pipe_in = self._proc.stdin
        pipe_in.write(request)
        pipe_in.flush()
        reply = self._proc.stdout.readline().rstrip()
        if reply == b"nil":
            return None
        return os.fsdecode(reply)
@lru_cache
def find_tex_file(filename):
    """
    Find a file in the texmf tree using kpathsea_.

    The kpathsea library, provided by most existing TeX distributions, both
    on Unix-like systems and on Windows (MikTeX), is invoked via a long-lived
    luatex process if luatex is installed, or via kpsewhich otherwise.

    .. _kpathsea: https://www.tug.org/kpathsea/

    Parameters
    ----------
    filename : str or path-like
        Name of the file to look up; bytes are decoded as utf-8 first.

    Returns
    -------
    str
        The path reported by the lookup.

    Raises
    ------
    FileNotFoundError
        If the file is not found.
    """
    # we expect these to always be ascii encoded, but use utf-8
    # out of caution
    if isinstance(filename, bytes):
        filename = filename.decode('utf-8', errors='replace')

    try:
        searcher = _LuatexKpsewhich()
    except FileNotFoundError:
        searcher = None  # Fallback to directly calling kpsewhich, as below.

    if searcher:
        found = searcher.search(filename)
    else:
        if sys.platform == 'win32':
            # On Windows only, kpathsea can use utf-8 for cmd args and output.
            # The `command_line_encoding` environment variable is set to force
            # it to always use utf-8 encoding.  See Matplotlib issue #11848.
            run_kwargs = dict(
                env={**os.environ, 'command_line_encoding': 'utf-8'},
                encoding='utf-8')
        else:  # On POSIX, run through the equivalent of os.fsdecode().
            run_kwargs = dict(
                encoding=sys.getfilesystemencoding(),
                errors='surrogateescape')
        try:
            output = cbook._check_and_log_subprocess(
                ['kpsewhich', filename], _log, **run_kwargs)
        except (FileNotFoundError, RuntimeError):
            found = None
        else:
            found = output.rstrip('\n')

    if not found:
        raise FileNotFoundError(
            f"Matplotlib's TeX implementation searched for a file named "
            f"{filename!r} in your texmf tree, but could not find it")
    return found
@lru_cache
def _fontfile(cls, suffix, texname):
    """
    Locate ``texname + suffix`` with `find_tex_file` and load it with *cls*.

    Results are cached per ``(cls, suffix, texname)``.  A failed lookup
    propagates the `FileNotFoundError` raised by `find_tex_file`.
    """
    located = find_tex_file(texname + suffix)
    return cls(located)


# Cached loaders with the class and file suffix pre-bound; both take a
# single texname argument.
_tfmfile = partial(_fontfile, Tfm, ".tfm")
_vffile = partial(_fontfile, Vf, ".vf")
if __name__ == '__main__':
    # Debugging aid: dump the text runs and boxes of each page of a DVI file.
    from argparse import ArgumentParser
    import itertools

    cli = ArgumentParser()
    cli.add_argument("filename")
    cli.add_argument("dpi", nargs="?", type=float, default=None)
    options = cli.parse_args()

    with Dvi(options.filename, options.dpi) as dvi:
        # NOTE(review): `fontmap` is not read anywhere below; the call is
        # kept because it still requires pdftex.map to be locatable.
        fontmap = PsfontsMap(find_tex_file('pdftex.map'))
        for page in dvi:
            print(f"=== new page === "
                  f"(w: {page.width}, h: {page.height}, d: {page.descent})")
            # Consecutive glyphs set in the same font are reported together.
            runs = itertools.groupby(page.text, lambda item: item.font)
            for font, run in runs:
                print(f"font: {font.texname.decode('latin-1')!r}\t"
                      f"scale: {font._scale / 2 ** 20}")
                print("x", "y", "glyph", "chr", "w", "(glyphs)", sep="\t")
                for item in run:
                    shown = chr(item.glyph)
                    if not shown.isprintable():
                        shown = "."
                    print(item.x, item.y, item.glyph, shown, item.width,
                          sep="\t")
            if page.boxes:
                print("x", "y", "h", "w", "", "(boxes)", sep="\t")
                for box in page.boxes:
                    print(box.x, box.y, box.height, box.width, sep="\t")
|