# github.py
  1. import base64
  2. import re
  3. import requests
  4. from ..spec import AbstractFileSystem
  5. from ..utils import infer_storage_options
  6. from .memory import MemoryFile
  7. class GithubFileSystem(AbstractFileSystem):
  8. """Interface to files in github
  9. An instance of this class provides the files residing within a remote github
  10. repository. You may specify a point in the repos history, by SHA, branch
  11. or tag (default is current master).
  12. For files less than 1 MB in size, file content is returned directly in a
  13. MemoryFile. For larger files, or for files tracked by git-lfs, file content
  14. is returned as an HTTPFile wrapping the ``download_url`` provided by the
  15. GitHub API.
  16. When using fsspec.open, allows URIs of the form:
  17. - "github://path/file", in which case you must specify org, repo and
  18. may specify sha in the extra args
  19. - 'github://org:repo@/precip/catalog.yml', where the org and repo are
  20. part of the URI
  21. - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
  22. ``sha`` can be the full or abbreviated hex of the commit you want to fetch
  23. from, or a branch or tag name (so long as it doesn't contain special characters
  24. like "/", "?", which would have to be HTTP-encoded).
  25. For authorised access, you must provide username and token, which can be made
  26. at https://github.com/settings/tokens
  27. """
  28. url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
  29. content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
  30. protocol = "github"
  31. timeout = (60, 60) # connect, read timeouts
  32. def __init__(
  33. self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
  34. ):
  35. super().__init__(**kwargs)
  36. self.org = org
  37. self.repo = repo
  38. if (username is None) ^ (token is None):
  39. raise ValueError("Auth required both username and token")
  40. self.username = username
  41. self.token = token
  42. if timeout is not None:
  43. self.timeout = timeout
  44. if sha is None:
  45. # look up default branch (not necessarily "master")
  46. u = "https://api.github.com/repos/{org}/{repo}"
  47. r = requests.get(
  48. u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
  49. )
  50. r.raise_for_status()
  51. sha = r.json()["default_branch"]
  52. self.root = sha
  53. self.ls("")
  54. try:
  55. from .http import HTTPFileSystem
  56. self.http_fs = HTTPFileSystem(**kwargs)
  57. except ImportError:
  58. self.http_fs = None
  59. @property
  60. def kw(self):
  61. if self.username:
  62. return {"auth": (self.username, self.token)}
  63. return {}
  64. @classmethod
  65. def repos(cls, org_or_user, is_org=True):
  66. """List repo names for given org or user
  67. This may become the top level of the FS
  68. Parameters
  69. ----------
  70. org_or_user: str
  71. Name of the github org or user to query
  72. is_org: bool (default True)
  73. Whether the name is an organisation (True) or user (False)
  74. Returns
  75. -------
  76. List of string
  77. """
  78. r = requests.get(
  79. f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
  80. timeout=cls.timeout,
  81. )
  82. r.raise_for_status()
  83. return [repo["name"] for repo in r.json()]
  84. @property
  85. def tags(self):
  86. """Names of tags in the repo"""
  87. r = requests.get(
  88. f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
  89. timeout=self.timeout,
  90. **self.kw,
  91. )
  92. r.raise_for_status()
  93. return [t["name"] for t in r.json()]
  94. @property
  95. def branches(self):
  96. """Names of branches in the repo"""
  97. r = requests.get(
  98. f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
  99. timeout=self.timeout,
  100. **self.kw,
  101. )
  102. r.raise_for_status()
  103. return [t["name"] for t in r.json()]
  104. @property
  105. def refs(self):
  106. """Named references, tags and branches"""
  107. return {"tags": self.tags, "branches": self.branches}
  108. def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
  109. """List files at given path
  110. Parameters
  111. ----------
  112. path: str
  113. Location to list, relative to repo root
  114. detail: bool
  115. If True, returns list of dicts, one per file; if False, returns
  116. list of full filenames only
  117. sha: str (optional)
  118. List at the given point in the repo history, branch or tag name or commit
  119. SHA
  120. _sha: str (optional)
  121. List this specific tree object (used internally to descend into trees)
  122. """
  123. path = self._strip_protocol(path)
  124. if path == "":
  125. _sha = sha or self.root
  126. if _sha is None:
  127. parts = path.rstrip("/").split("/")
  128. so_far = ""
  129. _sha = sha or self.root
  130. for part in parts:
  131. out = self.ls(so_far, True, sha=sha, _sha=_sha)
  132. so_far += "/" + part if so_far else part
  133. out = [o for o in out if o["name"] == so_far]
  134. if not out:
  135. raise FileNotFoundError(path)
  136. out = out[0]
  137. if out["type"] == "file":
  138. if detail:
  139. return [out]
  140. else:
  141. return path
  142. _sha = out["sha"]
  143. if path not in self.dircache or sha not in [self.root, None]:
  144. r = requests.get(
  145. self.url.format(org=self.org, repo=self.repo, sha=_sha),
  146. timeout=self.timeout,
  147. **self.kw,
  148. )
  149. if r.status_code == 404:
  150. raise FileNotFoundError(path)
  151. r.raise_for_status()
  152. types = {"blob": "file", "tree": "directory"}
  153. out = [
  154. {
  155. "name": path + "/" + f["path"] if path else f["path"],
  156. "mode": f["mode"],
  157. "type": types[f["type"]],
  158. "size": f.get("size", 0),
  159. "sha": f["sha"],
  160. }
  161. for f in r.json()["tree"]
  162. if f["type"] in types
  163. ]
  164. if sha in [self.root, None]:
  165. self.dircache[path] = out
  166. else:
  167. out = self.dircache[path]
  168. if detail:
  169. return out
  170. else:
  171. return sorted([f["name"] for f in out])
  172. def invalidate_cache(self, path=None):
  173. self.dircache.clear()
  174. @classmethod
  175. def _strip_protocol(cls, path):
  176. opts = infer_storage_options(path)
  177. if "username" not in opts:
  178. return super()._strip_protocol(path)
  179. return opts["path"].lstrip("/")
  180. @staticmethod
  181. def _get_kwargs_from_urls(path):
  182. opts = infer_storage_options(path)
  183. if "username" not in opts:
  184. return {}
  185. out = {"org": opts["username"], "repo": opts["password"]}
  186. if opts["host"]:
  187. out["sha"] = opts["host"]
  188. return out
  189. def _open(
  190. self,
  191. path,
  192. mode="rb",
  193. block_size=None,
  194. cache_options=None,
  195. sha=None,
  196. **kwargs,
  197. ):
  198. if mode != "rb":
  199. raise NotImplementedError
  200. # construct a url to hit the GitHub API's repo contents API
  201. url = self.content_url.format(
  202. org=self.org, repo=self.repo, path=path, sha=sha or self.root
  203. )
  204. # make a request to this API, and parse the response as JSON
  205. r = requests.get(url, timeout=self.timeout, **self.kw)
  206. if r.status_code == 404:
  207. raise FileNotFoundError(path)
  208. r.raise_for_status()
  209. content_json = r.json()
  210. # if the response's content key is not empty, try to parse it as base64
  211. if content_json["content"]:
  212. content = base64.b64decode(content_json["content"])
  213. # as long as the content does not start with the string
  214. # "version https://git-lfs.github.com/"
  215. # then it is probably not a git-lfs pointer and we can just return
  216. # the content directly
  217. if not content.startswith(b"version https://git-lfs.github.com/"):
  218. return MemoryFile(None, None, content)
  219. # we land here if the content was not present in the first response
  220. # (regular file over 1MB or git-lfs tracked file)
  221. # in this case, we get let the HTTPFileSystem handle the download
  222. if self.http_fs is None:
  223. raise ImportError(
  224. "Please install fsspec[http] to access github files >1 MB "
  225. "or git-lfs tracked files."
  226. )
  227. return self.http_fs.open(
  228. content_json["download_url"],
  229. mode=mode,
  230. block_size=block_size,
  231. cache_options=cache_options,
  232. **kwargs,
  233. )
  234. def rm(self, path, recursive=False, maxdepth=None, message=None):
  235. path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
  236. for p in reversed(path):
  237. self.rm_file(p, message=message)
  238. def rm_file(self, path, message=None, **kwargs):
  239. """
  240. Remove a file from a specified branch using a given commit message.
  241. Since Github DELETE operation requires a branch name, and we can't reliably
  242. determine whether the provided SHA refers to a branch, tag, or commit, we
  243. assume it's a branch. If it's not, the user will encounter an error when
  244. attempting to retrieve the file SHA or delete the file.
  245. Parameters
  246. ----------
  247. path: str
  248. The file's location relative to the repository root.
  249. message: str, optional
  250. The commit message for the deletion.
  251. """
  252. if not self.username:
  253. raise ValueError("Authentication required")
  254. path = self._strip_protocol(path)
  255. # Attempt to get SHA from cache or Github API
  256. sha = self._get_sha_from_cache(path)
  257. if not sha:
  258. url = self.content_url.format(
  259. org=self.org, repo=self.repo, path=path.lstrip("/"), sha=self.root
  260. )
  261. r = requests.get(url, timeout=self.timeout, **self.kw)
  262. if r.status_code == 404:
  263. raise FileNotFoundError(path)
  264. r.raise_for_status()
  265. sha = r.json()["sha"]
  266. # Delete the file
  267. delete_url = self.content_url.format(
  268. org=self.org, repo=self.repo, path=path, sha=self.root
  269. )
  270. branch = self.root
  271. data = {
  272. "message": message or f"Delete {path}",
  273. "sha": sha,
  274. **({"branch": branch} if branch else {}),
  275. }
  276. r = requests.delete(delete_url, json=data, timeout=self.timeout, **self.kw)
  277. error_message = r.json().get("message", "")
  278. if re.search(r"Branch .+ not found", error_message):
  279. error = "Remove only works when the filesystem is initialised from a branch or default (None)"
  280. raise ValueError(error)
  281. r.raise_for_status()
  282. self.invalidate_cache(path)
  283. def _get_sha_from_cache(self, path):
  284. for entries in self.dircache.values():
  285. for entry in entries:
  286. entry_path = entry.get("name")
  287. if entry_path and entry_path == path and "sha" in entry:
  288. return entry["sha"]
  289. return None