lazy_wheel.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. """Lazy ZIP over HTTP"""
  2. from __future__ import annotations
  3. __all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url"]
  4. from bisect import bisect_left, bisect_right
  5. from collections.abc import Generator
  6. from contextlib import contextmanager
  7. from tempfile import NamedTemporaryFile
  8. from typing import Any
  9. from zipfile import BadZipFile, ZipFile
  10. from pip._vendor.packaging.utils import NormalizedName
  11. from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response
  12. from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution
  13. from pip._internal.network.session import PipSession
  14. from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
  15. class HTTPRangeRequestUnsupported(Exception):
  16. pass
  17. def dist_from_wheel_url(
  18. name: NormalizedName, url: str, session: PipSession
  19. ) -> BaseDistribution:
  20. """Return a distribution object from the given wheel URL.
  21. This uses HTTP range requests to only fetch the portion of the wheel
  22. containing metadata, just enough for the object to be constructed.
  23. If such requests are not supported, HTTPRangeRequestUnsupported
  24. is raised.
  25. """
  26. with LazyZipOverHTTP(url, session) as zf:
  27. # For read-only ZIP files, ZipFile only needs methods read,
  28. # seek, seekable and tell, not the whole IO protocol.
  29. wheel = MemoryWheel(zf.name, zf) # type: ignore
  30. # After context manager exit, wheel.name
  31. # is an invalid file by intention.
  32. return get_wheel_distribution(wheel, name)
  33. class LazyZipOverHTTP:
  34. """File-like object mapped to a ZIP file over HTTP.
  35. This uses HTTP range requests to lazily fetch the file's content,
  36. which is supposed to be fed to ZipFile. If such requests are not
  37. supported by the server, raise HTTPRangeRequestUnsupported
  38. during initialization.
  39. """
  40. def __init__(
  41. self, url: str, session: PipSession, chunk_size: int = CONTENT_CHUNK_SIZE
  42. ) -> None:
  43. head = session.head(url, headers=HEADERS)
  44. raise_for_status(head)
  45. assert head.status_code == 200
  46. self._session, self._url, self._chunk_size = session, url, chunk_size
  47. self._length = int(head.headers["Content-Length"])
  48. self._file = NamedTemporaryFile()
  49. self.truncate(self._length)
  50. self._left: list[int] = []
  51. self._right: list[int] = []
  52. if "bytes" not in head.headers.get("Accept-Ranges", "none"):
  53. raise HTTPRangeRequestUnsupported("range request is not supported")
  54. self._check_zip()
  55. @property
  56. def mode(self) -> str:
  57. """Opening mode, which is always rb."""
  58. return "rb"
  59. @property
  60. def name(self) -> str:
  61. """Path to the underlying file."""
  62. return self._file.name
  63. def seekable(self) -> bool:
  64. """Return whether random access is supported, which is True."""
  65. return True
  66. def close(self) -> None:
  67. """Close the file."""
  68. self._file.close()
  69. @property
  70. def closed(self) -> bool:
  71. """Whether the file is closed."""
  72. return self._file.closed
  73. def read(self, size: int = -1) -> bytes:
  74. """Read up to size bytes from the object and return them.
  75. As a convenience, if size is unspecified or -1,
  76. all bytes until EOF are returned. Fewer than
  77. size bytes may be returned if EOF is reached.
  78. """
  79. download_size = max(size, self._chunk_size)
  80. start, length = self.tell(), self._length
  81. stop = length if size < 0 else min(start + download_size, length)
  82. start = max(0, stop - download_size)
  83. self._download(start, stop - 1)
  84. return self._file.read(size)
  85. def readable(self) -> bool:
  86. """Return whether the file is readable, which is True."""
  87. return True
  88. def seek(self, offset: int, whence: int = 0) -> int:
  89. """Change stream position and return the new absolute position.
  90. Seek to offset relative position indicated by whence:
  91. * 0: Start of stream (the default). pos should be >= 0;
  92. * 1: Current position - pos may be negative;
  93. * 2: End of stream - pos usually negative.
  94. """
  95. return self._file.seek(offset, whence)
  96. def tell(self) -> int:
  97. """Return the current position."""
  98. return self._file.tell()
  99. def truncate(self, size: int | None = None) -> int:
  100. """Resize the stream to the given size in bytes.
  101. If size is unspecified resize to the current position.
  102. The current stream position isn't changed.
  103. Return the new file size.
  104. """
  105. return self._file.truncate(size)
  106. def writable(self) -> bool:
  107. """Return False."""
  108. return False
  109. def __enter__(self) -> LazyZipOverHTTP:
  110. self._file.__enter__()
  111. return self
  112. def __exit__(self, *exc: Any) -> None:
  113. self._file.__exit__(*exc)
  114. @contextmanager
  115. def _stay(self) -> Generator[None, None, None]:
  116. """Return a context manager keeping the position.
  117. At the end of the block, seek back to original position.
  118. """
  119. pos = self.tell()
  120. try:
  121. yield
  122. finally:
  123. self.seek(pos)
  124. def _check_zip(self) -> None:
  125. """Check and download until the file is a valid ZIP."""
  126. end = self._length - 1
  127. for start in reversed(range(0, end, self._chunk_size)):
  128. self._download(start, end)
  129. with self._stay():
  130. try:
  131. # For read-only ZIP files, ZipFile only needs
  132. # methods read, seek, seekable and tell.
  133. ZipFile(self)
  134. except BadZipFile:
  135. pass
  136. else:
  137. break
  138. def _stream_response(
  139. self, start: int, end: int, base_headers: dict[str, str] = HEADERS
  140. ) -> Response:
  141. """Return HTTP response to a range request from start to end."""
  142. headers = base_headers.copy()
  143. headers["Range"] = f"bytes={start}-{end}"
  144. # TODO: Get range requests to be correctly cached
  145. headers["Cache-Control"] = "no-cache"
  146. return self._session.get(self._url, headers=headers, stream=True)
  147. def _merge(
  148. self, start: int, end: int, left: int, right: int
  149. ) -> Generator[tuple[int, int], None, None]:
  150. """Return a generator of intervals to be fetched.
  151. Args:
  152. start (int): Start of needed interval
  153. end (int): End of needed interval
  154. left (int): Index of first overlapping downloaded data
  155. right (int): Index after last overlapping downloaded data
  156. """
  157. lslice, rslice = self._left[left:right], self._right[left:right]
  158. i = start = min([start] + lslice[:1])
  159. end = max([end] + rslice[-1:])
  160. for j, k in zip(lslice, rslice):
  161. if j > i:
  162. yield i, j - 1
  163. i = k + 1
  164. if i <= end:
  165. yield i, end
  166. self._left[left:right], self._right[left:right] = [start], [end]
  167. def _download(self, start: int, end: int) -> None:
  168. """Download bytes from start to end inclusively."""
  169. with self._stay():
  170. left = bisect_left(self._right, start)
  171. right = bisect_right(self._left, end)
  172. for start, end in self._merge(start, end, left, right):
  173. response = self._stream_response(start, end)
  174. response.raise_for_status()
  175. self.seek(start)
  176. for chunk in response_chunks(response, self._chunk_size):
  177. self._file.write(chunk)