fileio.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. """
  2. Utilities for file-based Contents/Checkpoints managers.
  3. """
  4. # Copyright (c) Jupyter Development Team.
  5. # Distributed under the terms of the Modified BSD License.
  6. from __future__ import annotations
  7. import errno
  8. import hashlib
  9. import os
  10. import shutil
  11. from base64 import decodebytes, encodebytes
  12. from contextlib import contextmanager
  13. from functools import partial
  14. import nbformat
  15. from anyio.to_thread import run_sync
  16. from tornado.web import HTTPError
  17. from traitlets import Bool, Enum
  18. from traitlets.config import Configurable
  19. from traitlets.config.configurable import LoggingConfigurable
  20. from jupyter_server.utils import ApiPath, to_api_path, to_os_path
  21. def replace_file(src, dst):
  22. """replace dst with src"""
  23. os.replace(src, dst)
  24. async def async_replace_file(src, dst):
  25. """replace dst with src asynchronously"""
  26. await run_sync(os.replace, src, dst)
  27. def copy2_safe(src, dst, log=None):
  28. """copy src to dst
  29. like shutil.copy2, but log errors in copystat instead of raising
  30. """
  31. # if src file is not writable, avoid creating a back-up
  32. if not os.access(src, os.W_OK):
  33. if log:
  34. log.debug("Source file, %s, is not writable", src, exc_info=True)
  35. raise PermissionError(errno.EACCES, f"File is not writable: {src}")
  36. shutil.copyfile(src, dst)
  37. try:
  38. shutil.copystat(src, dst)
  39. except OSError:
  40. if log:
  41. log.debug("copystat on %s failed", dst, exc_info=True)
  42. async def async_copy2_safe(src, dst, log=None):
  43. """copy src to dst asynchronously
  44. like shutil.copy2, but log errors in copystat instead of raising
  45. """
  46. if not os.access(src, os.W_OK):
  47. if log:
  48. log.debug("Source file, %s, is not writable", src, exc_info=True)
  49. raise PermissionError(errno.EACCES, f"File is not writable: {src}")
  50. await run_sync(shutil.copyfile, src, dst)
  51. try:
  52. await run_sync(shutil.copystat, src, dst)
  53. except OSError:
  54. if log:
  55. log.debug("copystat on %s failed", dst, exc_info=True)
  56. def path_to_intermediate(path):
  57. """Name of the intermediate file used in atomic writes.
  58. The .~ prefix will make Dropbox ignore the temporary file."""
  59. dirname, basename = os.path.split(path)
  60. return os.path.join(dirname, ".~" + basename)
  61. def path_to_invalid(path):
  62. """Name of invalid file after a failed atomic write and subsequent read."""
  63. dirname, basename = os.path.split(path)
  64. return os.path.join(dirname, basename + ".invalid")
  65. @contextmanager
  66. def atomic_writing(path, text=True, encoding="utf-8", log=None, **kwargs):
  67. """Context manager to write to a file only if the entire write is successful.
  68. This works by copying the previous file contents to a temporary file in the
  69. same directory, and renaming that file back to the target if the context
  70. exits with an error. If the context is successful, the new data is synced to
  71. disk and the temporary file is removed.
  72. Parameters
  73. ----------
  74. path : str
  75. The target file to write to.
  76. text : bool, optional
  77. Whether to open the file in text mode (i.e. to write unicode). Default is
  78. True.
  79. encoding : str, optional
  80. The encoding to use for files opened in text mode. Default is UTF-8.
  81. **kwargs
  82. Passed to :func:`io.open`.
  83. """
  84. # realpath doesn't work on Windows: https://bugs.python.org/issue9949
  85. # Luckily, we only need to resolve the file itself being a symlink, not
  86. # any of its directories, so this will suffice:
  87. if os.path.islink(path):
  88. path = os.path.join(os.path.dirname(path), os.readlink(path))
  89. # Fall back to direct write for existing file in a non-writable dir
  90. dirpath = os.path.dirname(path) or os.getcwd()
  91. if os.path.isfile(path) and not os.access(dirpath, os.W_OK) and os.access(path, os.W_OK):
  92. mode = "w" if text else "wb"
  93. # direct open on the target file
  94. if text:
  95. fileobj = open(path, mode, encoding=encoding, **kwargs) # noqa: SIM115
  96. else:
  97. fileobj = open(path, mode, **kwargs) # noqa: SIM115
  98. try:
  99. yield fileobj
  100. finally:
  101. fileobj.close()
  102. return
  103. tmp_path = path_to_intermediate(path)
  104. if os.path.isfile(path):
  105. copy2_safe(path, tmp_path, log=log)
  106. if text:
  107. # Make sure that text files have Unix linefeeds by default
  108. kwargs.setdefault("newline", "\n")
  109. fileobj = open(path, "w", encoding=encoding, **kwargs) # noqa: SIM115
  110. else:
  111. fileobj = open(path, "wb", **kwargs) # noqa: SIM115
  112. try:
  113. yield fileobj
  114. except BaseException:
  115. # Failed! Move the backup file back to the real path to avoid corruption
  116. fileobj.close()
  117. replace_file(tmp_path, path)
  118. raise
  119. # Flush to disk
  120. fileobj.flush()
  121. os.fsync(fileobj.fileno())
  122. fileobj.close()
  123. # Written successfully, now remove the backup copy
  124. if os.path.isfile(tmp_path):
  125. os.remove(tmp_path)
  126. @contextmanager
  127. def _simple_writing(path, text=True, encoding="utf-8", log=None, **kwargs):
  128. """Context manager to write file without doing atomic writing
  129. (for weird filesystem eg: nfs).
  130. Parameters
  131. ----------
  132. path : str
  133. The target file to write to.
  134. text : bool, optional
  135. Whether to open the file in text mode (i.e. to write unicode). Default is
  136. True.
  137. encoding : str, optional
  138. The encoding to use for files opened in text mode. Default is UTF-8.
  139. **kwargs
  140. Passed to :func:`io.open`.
  141. """
  142. # realpath doesn't work on Windows: https://bugs.python.org/issue9949
  143. # Luckily, we only need to resolve the file itself being a symlink, not
  144. # any of its directories, so this will suffice:
  145. if os.path.islink(path):
  146. path = os.path.join(os.path.dirname(path), os.readlink(path))
  147. if text:
  148. # Make sure that text files have Unix linefeeds by default
  149. kwargs.setdefault("newline", "\n")
  150. fileobj = open(path, "w", encoding=encoding, **kwargs) # noqa: SIM115
  151. else:
  152. fileobj = open(path, "wb", **kwargs) # noqa: SIM115
  153. try:
  154. yield fileobj
  155. except BaseException:
  156. fileobj.close()
  157. raise
  158. fileobj.close()
  159. class FileManagerMixin(LoggingConfigurable, Configurable):
  160. """
  161. Mixin for ContentsAPI classes that interact with the filesystem.
  162. Provides facilities for reading, writing, and copying files.
  163. Shared by FileContentsManager and FileCheckpoints.
  164. Note
  165. ----
  166. Classes using this mixin must provide the following attributes:
  167. root_dir : unicode
  168. A directory against against which API-style paths are to be resolved.
  169. log : logging.Logger
  170. """
  171. use_atomic_writing = Bool(
  172. True,
  173. config=True,
  174. help="""By default notebooks are saved on disk on a temporary file and then if successfully written, it replaces the old ones.
  175. This procedure, namely 'atomic_writing', causes some bugs on file system without operation order enforcement (like some networked fs).
  176. If set to False, the new notebook is written directly on the old one which could fail (eg: full filesystem or quota )""",
  177. )
  178. hash_algorithm = Enum( # type: ignore[call-overload]
  179. hashlib.algorithms_available,
  180. default_value="sha256",
  181. config=True,
  182. help="Hash algorithm to use for file content, support by hashlib",
  183. )
  184. @contextmanager
  185. def open(self, os_path, *args, **kwargs):
  186. """wrapper around io.open that turns permission errors into 403"""
  187. with self.perm_to_403(os_path), open(os_path, *args, **kwargs) as f:
  188. yield f
  189. @contextmanager
  190. def atomic_writing(self, os_path, *args, **kwargs):
  191. """wrapper around atomic_writing that turns permission errors to 403.
  192. Depending on flag 'use_atomic_writing', the wrapper perform an actual atomic writing or
  193. simply writes the file (whatever an old exists or not)"""
  194. with self.perm_to_403(os_path):
  195. kwargs["log"] = self.log
  196. if self.use_atomic_writing:
  197. with atomic_writing(os_path, *args, **kwargs) as f:
  198. yield f
  199. else:
  200. with _simple_writing(os_path, *args, **kwargs) as f:
  201. yield f
  202. @contextmanager
  203. def perm_to_403(self, os_path=""):
  204. """context manager for turning permission errors into 403."""
  205. try:
  206. yield
  207. except OSError as e:
  208. if e.errno in {errno.EPERM, errno.EACCES}:
  209. # make 403 error message without root prefix
  210. # this may not work perfectly on unicode paths on Python 2,
  211. # but nobody should be doing that anyway.
  212. if not os_path:
  213. os_path = e.filename or "unknown file"
  214. path = to_api_path(os_path, root=self.root_dir) # type:ignore[attr-defined]
  215. raise HTTPError(403, "Permission denied: %s" % path) from e
  216. else:
  217. raise
  218. def _copy(self, src, dest):
  219. """copy src to dest
  220. like shutil.copy2, but log errors in copystat
  221. """
  222. copy2_safe(src, dest, log=self.log)
  223. def _get_os_path(self, path):
  224. """Given an API path, return its file system path.
  225. Parameters
  226. ----------
  227. path : str
  228. The relative API path to the named file.
  229. Returns
  230. -------
  231. path : str
  232. Native, absolute OS path to for a file.
  233. Raises
  234. ------
  235. 404: if path is outside root
  236. """
  237. # This statement can cause excessive logging, uncomment if necessary when troubleshooting.
  238. # self.log.debug("Reading path from disk: %s", path)
  239. root = os.path.abspath(self.root_dir) # type:ignore[attr-defined]
  240. # to_os_path is not safe if path starts with a drive, since os.path.join discards first part
  241. if os.path.splitdrive(path)[0]:
  242. raise HTTPError(404, "%s is not a relative API path" % path)
  243. os_path = to_os_path(ApiPath(path), root)
  244. # validate os path
  245. # e.g. "foo\0" raises ValueError: embedded null byte
  246. try:
  247. os.lstat(os_path)
  248. except OSError:
  249. # OSError could be FileNotFound, PermissionError, etc.
  250. # those should raise (or not) elsewhere
  251. pass
  252. except ValueError:
  253. raise HTTPError(404, f"{path} is not a valid path") from None
  254. if not (os.path.abspath(os_path) + os.path.sep).startswith(root):
  255. raise HTTPError(404, "%s is outside root contents directory" % path)
  256. return os_path
  257. def _read_notebook(
  258. self, os_path, as_version=4, capture_validation_error=None, raw: bool = False
  259. ):
  260. """Read a notebook from an os path."""
  261. answer = self._read_file(os_path, "text", raw=raw)
  262. try:
  263. nb = nbformat.reads(
  264. answer[0],
  265. as_version=as_version,
  266. capture_validation_error=capture_validation_error,
  267. )
  268. return (nb, answer[2]) if raw else nb # type:ignore[misc]
  269. except Exception as e:
  270. e_orig = e
  271. # If use_atomic_writing is enabled, we'll guess that it was also
  272. # enabled when this notebook was written and look for a valid
  273. # atomic intermediate.
  274. tmp_path = path_to_intermediate(os_path)
  275. if not self.use_atomic_writing or not os.path.exists(tmp_path):
  276. raise HTTPError(
  277. 400,
  278. f"Unreadable Notebook: {os_path} {e_orig!r}",
  279. )
  280. # Move the bad file aside, restore the intermediate, and try again.
  281. invalid_file = path_to_invalid(os_path)
  282. replace_file(os_path, invalid_file)
  283. replace_file(tmp_path, os_path)
  284. return self._read_notebook(
  285. os_path, as_version, capture_validation_error=capture_validation_error, raw=raw
  286. )
  287. def _save_notebook(self, os_path, nb, capture_validation_error=None):
  288. """Save a notebook to an os_path."""
  289. with self.atomic_writing(os_path, encoding="utf-8") as f:
  290. nbformat.write(
  291. nb,
  292. f,
  293. version=nbformat.NO_CONVERT,
  294. capture_validation_error=capture_validation_error,
  295. )
  296. def _get_hash(self, byte_content: bytes) -> dict[str, str]:
  297. """Compute the hash hexdigest for the provided bytes.
  298. The hash algorithm is provided by the `hash_algorithm` attribute.
  299. Parameters
  300. ----------
  301. byte_content : bytes
  302. The bytes to hash
  303. Returns
  304. -------
  305. A dictionary to be appended to a model {"hash": str, "hash_algorithm": str}.
  306. """
  307. algorithm = self.hash_algorithm
  308. h = hashlib.new(algorithm)
  309. h.update(byte_content)
  310. return {"hash": h.hexdigest(), "hash_algorithm": algorithm}
  311. def _read_file(
  312. self, os_path: str, format: str | None, raw: bool = False
  313. ) -> tuple[str | bytes, str] | tuple[str | bytes, str, bytes]:
  314. """Read a non-notebook file.
  315. Parameters
  316. ----------
  317. os_path: str
  318. The path to be read.
  319. format: str
  320. If 'text', the contents will be decoded as UTF-8.
  321. If 'base64', the raw bytes contents will be encoded as base64.
  322. If 'byte', the raw bytes contents will be returned.
  323. If not specified, try to decode as UTF-8, and fall back to base64
  324. raw: bool
  325. [Optional] If True, will return as third argument the raw bytes content
  326. Returns
  327. -------
  328. (content, format, byte_content) It returns the content in the given format
  329. as well as the raw byte content.
  330. """
  331. if not os.path.isfile(os_path):
  332. raise HTTPError(400, "Cannot read non-file %s" % os_path)
  333. with self.open(os_path, "rb") as f:
  334. bcontent = f.read()
  335. if format == "byte":
  336. # Not for http response but internal use
  337. return (bcontent, "byte", bcontent) if raw else (bcontent, "byte")
  338. if format is None or format == "text":
  339. # Try to interpret as unicode if format is unknown or if unicode
  340. # was explicitly requested.
  341. try:
  342. return (
  343. (bcontent.decode("utf8"), "text", bcontent)
  344. if raw
  345. else (
  346. bcontent.decode("utf8"),
  347. "text",
  348. )
  349. )
  350. except UnicodeError as e:
  351. if format == "text":
  352. raise HTTPError(
  353. 400,
  354. "%s is not UTF-8 encoded" % os_path,
  355. reason="bad format",
  356. ) from e
  357. return (
  358. (encodebytes(bcontent).decode("ascii"), "base64", bcontent)
  359. if raw
  360. else (
  361. encodebytes(bcontent).decode("ascii"),
  362. "base64",
  363. )
  364. )
  365. def _save_file(self, os_path, content, format):
  366. """Save content of a generic file."""
  367. if format not in {"text", "base64"}:
  368. raise HTTPError(
  369. 400,
  370. "Must specify format of file contents as 'text' or 'base64'",
  371. )
  372. try:
  373. if format == "text":
  374. bcontent = content.encode("utf8")
  375. else:
  376. b64_bytes = content.encode("ascii")
  377. bcontent = decodebytes(b64_bytes)
  378. except Exception as e:
  379. raise HTTPError(400, f"Encoding error saving {os_path}: {e}") from e
  380. with self.atomic_writing(os_path, text=False) as f:
  381. f.write(bcontent)
  382. class AsyncFileManagerMixin(FileManagerMixin):
  383. """
  384. Mixin for ContentsAPI classes that interact with the filesystem asynchronously.
  385. """
  386. async def _copy(self, src, dest):
  387. """copy src to dest
  388. like shutil.copy2, but log errors in copystat
  389. """
  390. await async_copy2_safe(src, dest, log=self.log)
  391. async def _read_notebook(
  392. self, os_path, as_version=4, capture_validation_error=None, raw: bool = False
  393. ):
  394. """Read a notebook from an os path."""
  395. answer = await self._read_file(os_path, "text", raw)
  396. try:
  397. nb = await run_sync(
  398. partial(
  399. nbformat.reads,
  400. as_version=as_version,
  401. capture_validation_error=capture_validation_error,
  402. ),
  403. answer[0],
  404. )
  405. return (nb, answer[2]) if raw else nb # type:ignore[misc]
  406. except Exception as e:
  407. e_orig = e
  408. # If use_atomic_writing is enabled, we'll guess that it was also
  409. # enabled when this notebook was written and look for a valid
  410. # atomic intermediate.
  411. tmp_path = path_to_intermediate(os_path)
  412. if not self.use_atomic_writing or not os.path.exists(tmp_path):
  413. raise HTTPError(
  414. 400,
  415. f"Unreadable Notebook: {os_path} {e_orig!r}",
  416. )
  417. # Move the bad file aside, restore the intermediate, and try again.
  418. invalid_file = path_to_invalid(os_path)
  419. await async_replace_file(os_path, invalid_file)
  420. await async_replace_file(tmp_path, os_path)
  421. answer = await self._read_notebook(
  422. os_path, as_version, capture_validation_error=capture_validation_error, raw=raw
  423. )
  424. return answer
  425. async def _save_notebook(self, os_path, nb, capture_validation_error=None):
  426. """Save a notebook to an os_path."""
  427. with self.atomic_writing(os_path, encoding="utf-8") as f:
  428. await run_sync(
  429. partial(
  430. nbformat.write,
  431. version=nbformat.NO_CONVERT,
  432. capture_validation_error=capture_validation_error,
  433. ),
  434. nb,
  435. f,
  436. )
  437. async def _read_file( # type: ignore[override]
  438. self, os_path: str, format: str | None, raw: bool = False
  439. ) -> tuple[str | bytes, str] | tuple[str | bytes, str, bytes]:
  440. """Read a non-notebook file.
  441. Parameters
  442. ----------
  443. os_path: str
  444. The path to be read.
  445. format: str
  446. If 'text', the contents will be decoded as UTF-8.
  447. If 'base64', the raw bytes contents will be encoded as base64.
  448. If 'byte', the raw bytes contents will be returned.
  449. If not specified, try to decode as UTF-8, and fall back to base64
  450. raw: bool
  451. [Optional] If True, will return as third argument the raw bytes content
  452. Returns
  453. -------
  454. (content, format, byte_content) It returns the content in the given format
  455. as well as the raw byte content.
  456. """
  457. if not os.path.isfile(os_path):
  458. raise HTTPError(400, "Cannot read non-file %s" % os_path)
  459. with self.open(os_path, "rb") as f:
  460. bcontent = await run_sync(f.read)
  461. if format == "byte":
  462. # Not for http response but internal use
  463. return (bcontent, "byte", bcontent) if raw else (bcontent, "byte")
  464. if format is None or format == "text":
  465. # Try to interpret as unicode if format is unknown or if unicode
  466. # was explicitly requested.
  467. try:
  468. return (
  469. (bcontent.decode("utf8"), "text", bcontent)
  470. if raw
  471. else (
  472. bcontent.decode("utf8"),
  473. "text",
  474. )
  475. )
  476. except UnicodeError as e:
  477. if format == "text":
  478. raise HTTPError(
  479. 400,
  480. "%s is not UTF-8 encoded" % os_path,
  481. reason="bad format",
  482. ) from e
  483. return (
  484. (encodebytes(bcontent).decode("ascii"), "base64", bcontent)
  485. if raw
  486. else (encodebytes(bcontent).decode("ascii"), "base64")
  487. )
  488. async def _save_file(self, os_path, content, format):
  489. """Save content of a generic file."""
  490. if format not in {"text", "base64"}:
  491. raise HTTPError(
  492. 400,
  493. "Must specify format of file contents as 'text' or 'base64'",
  494. )
  495. try:
  496. if format == "text":
  497. bcontent = content.encode("utf8")
  498. else:
  499. b64_bytes = content.encode("ascii")
  500. bcontent = decodebytes(b64_bytes)
  501. except Exception as e:
  502. raise HTTPError(400, f"Encoding error saving {os_path}: {e}") from e
  503. with self.atomic_writing(os_path, text=False) as f:
  504. await run_sync(f.write, bcontent)