webpdf.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. """Export to PDF via a headless browser"""
  2. # Copyright (c) IPython Development Team.
  3. # Distributed under the terms of the Modified BSD License.
  4. import asyncio
  5. import concurrent.futures
  6. import os
  7. import subprocess
  8. import sys
  9. import tempfile
  10. from importlib import util as importlib_util
  11. from traitlets import Bool, Int, List, Unicode, default
  12. from .html import HTMLExporter
  13. PLAYWRIGHT_INSTALLED = importlib_util.find_spec("playwright") is not None
  14. IS_WINDOWS = os.name == "nt"
  15. class WebPDFExporter(HTMLExporter):
  16. """Writer designed to write to PDF files.
  17. This inherits from :class:`HTMLExporter`. It creates the HTML using the
  18. template machinery, and then run playwright to create a pdf.
  19. """
  20. export_from_notebook = "PDF via HTML"
  21. allow_chromium_download = Bool(
  22. False,
  23. help="Whether to allow downloading Chromium if no suitable version is found on the system.",
  24. ).tag(config=True)
  25. paginate = Bool(
  26. True,
  27. help="""
  28. Split generated notebook into multiple pages.
  29. If False, a PDF with one long page will be generated.
  30. Set to True to match behavior of LaTeX based PDF generator
  31. """,
  32. ).tag(config=True)
  33. page_render_timeout = Int(
  34. 100,
  35. help="""
  36. Time to wait for the page to render before converting to PDF, in milliseconds.
  37. Increase this value if your notebook has a lot of complex JavaScript
  38. output that needs more time to load.
  39. """,
  40. ).tag(config=True)
  41. @default("file_extension")
  42. def _file_extension_default(self):
  43. return ".pdf"
  44. @default("template_extension")
  45. def _template_extension_default(self):
  46. # NOTE: we use .html.j2 so that the HTMLExporter can find the template
  47. return ".html.j2"
  48. @default("template_name")
  49. def _template_name_default(self):
  50. return "webpdf"
  51. disable_sandbox = Bool(
  52. False,
  53. help="""
  54. Disable chromium security sandbox when converting to PDF.
  55. WARNING: This could cause arbitrary code execution in specific circumstances,
  56. where JS in your notebook can execute serverside code! Please use with
  57. caution.
  58. ``https://github.com/puppeteer/puppeteer/blob/main@%7B2020-12-14T17:22:24Z%7D/docs/troubleshooting.md#setting-up-chrome-linux-sandbox``
  59. has more information.
  60. This is required for webpdf to work inside most container environments.
  61. """,
  62. ).tag(config=True)
  63. browser_args = List(
  64. Unicode(),
  65. help="""
  66. Additional arguments to pass to the browser rendering to PDF.
  67. These arguments will be passed directly to the browser launch method
  68. and can be used to customize browser behavior beyond the default settings.
  69. """,
  70. ).tag(config=True)
  71. def run_playwright(self, html):
  72. """Run playwright."""
  73. async def main(temp_file):
  74. """Run main playwright script."""
  75. try:
  76. from playwright.async_api import ( # type: ignore[import-not-found] # noqa: PLC0415,
  77. async_playwright,
  78. )
  79. except ModuleNotFoundError as e:
  80. msg = (
  81. "Playwright is not installed to support Web PDF conversion. "
  82. "Please install `nbconvert[webpdf]` to enable."
  83. )
  84. raise RuntimeError(msg) from e
  85. if self.allow_chromium_download:
  86. cmd = [sys.executable, "-m", "playwright", "install", "chromium"]
  87. subprocess.check_call(cmd) # noqa: S603
  88. playwright = await async_playwright().start()
  89. chromium = playwright.chromium
  90. args = self.browser_args
  91. if self.disable_sandbox:
  92. args.append("--no-sandbox")
  93. try:
  94. browser = await chromium.launch(
  95. handle_sigint=False, handle_sigterm=False, handle_sighup=False, args=args
  96. )
  97. except Exception as e:
  98. msg = (
  99. "No suitable chromium executable found on the system. "
  100. "Please use '--allow-chromium-download' to allow downloading one,"
  101. "or install it using `playwright install chromium`."
  102. )
  103. await playwright.stop()
  104. raise RuntimeError(msg) from e
  105. page = await browser.new_page()
  106. await page.emulate_media(media="print")
  107. await page.wait_for_timeout(100)
  108. await page.goto(f"file://{temp_file.name}", wait_until="networkidle")
  109. await page.wait_for_timeout(self.page_render_timeout)
  110. pdf_params = {"print_background": True}
  111. if not self.paginate:
  112. # Floating point precision errors cause the printed
  113. # PDF from spilling over a new page by a pixel fraction.
  114. dimensions = await page.evaluate(
  115. """() => {
  116. const rect = document.body.getBoundingClientRect();
  117. return {
  118. width: Math.ceil(rect.width) + 1,
  119. height: Math.ceil(rect.height) + 1,
  120. }
  121. }"""
  122. )
  123. width = dimensions["width"]
  124. height = dimensions["height"]
  125. # 200 inches is the maximum size for Adobe Acrobat Reader.
  126. pdf_params.update(
  127. {
  128. "width": min(width, 200 * 72),
  129. "height": min(height, 200 * 72),
  130. }
  131. )
  132. pdf_data = await page.pdf(**pdf_params)
  133. await browser.close()
  134. await playwright.stop()
  135. return pdf_data
  136. pool = concurrent.futures.ThreadPoolExecutor()
  137. # Create a temporary file to pass the HTML code to Chromium:
  138. # Unfortunately, tempfile on Windows does not allow for an already open
  139. # file to be opened by a separate process. So we must close it first
  140. # before calling Chromium. We also specify delete=False to ensure the
  141. # file is not deleted after closing (the default behavior).
  142. temp_file = tempfile.NamedTemporaryFile( # noqa: SIM115
  143. suffix=".html", delete=False
  144. )
  145. with temp_file:
  146. temp_file.write(html.encode("utf-8"))
  147. try:
  148. pdf_data = pool.submit(asyncio.run, main(temp_file)).result()
  149. finally:
  150. # Ensure the file is deleted even if playwright raises an exception
  151. os.unlink(temp_file.name)
  152. return pdf_data
  153. def from_notebook_node(self, nb, resources=None, **kw):
  154. """Convert from a notebook node."""
  155. html, resources = super().from_notebook_node(nb, resources=resources, **kw)
  156. self.log.info("Building PDF")
  157. pdf_data = self.run_playwright(html)
  158. self.log.info("PDF successfully created")
  159. return pdf_data, resources