"""Support for parsing GitHub URLs (which might be user provided) into constituent parts.""" from __future__ import annotations import re from dataclasses import dataclass from enum import IntEnum from wandb.sdk.launch.errors import LaunchError PREFIX_HTTPS = "https://" PREFIX_SSH = "git@" SUFFIX_GIT = ".git" GIT_COMMIT_REGEX = re.compile(r"[0-9a-f]{40}") class ReferenceType(IntEnum): BRANCH = 1 COMMIT = 2 def _parse_netloc(netloc: str) -> tuple[str | None, str | None, str]: """Parse netloc into username, password, and host. github.com => None, None, "@github.com" username@github.com => "username", None, "github.com" username:password@github.com => "username", "password", "github.com" """ parts = netloc.split("@", 1) if len(parts) == 1: return None, None, parts[0] auth, host = parts parts = auth.split(":", 1) if len(parts) == 1: return parts[0], None, host return parts[0], parts[1], host @dataclass class GitReference: def __init__(self, remote: str, ref: str | None = None) -> None: """Initialize a reference from a remote and ref. Arguments: remote: A remote URL or URI. ref: A branch, tag, or commit hash. """ self.uri = remote self.ref = ref @property def url(self) -> str | None: return self.uri def fetch(self, dst_dir: str) -> None: """Fetch the repo into dst_dir and refine githubref based on what we learn.""" # We defer importing git until the last moment, because the import requires that the git # executable is available on the PATH, so we only want to fail if we actually need it. import git # type: ignore repo = git.Repo.init(dst_dir) self.path = repo.working_dir origin = repo.create_remote("origin", self.uri or "") try: # We fetch the origin so that we have branch and tag references origin.fetch() except git.exc.GitCommandError as e: raise LaunchError( f"Unable to fetch from git remote repository {self.url}:\n{e}" ) ref: git.RemoteReference | str if self.ref: if self.ref in origin.refs: ref = origin.refs[self.ref] else: ref = self.ref head = repo.create_head(self.ref, ref) head.checkout() self.commit_hash = head.commit.hexsha else: # TODO: Is there a better way to do this? default_branch = None for ref in repo.references: if hasattr(ref, "tag"): # Skip tag references continue refname = ref.name if refname.startswith("origin/"): # Trim off "origin/" refname = refname[7:] if refname == "main": default_branch = "main" break if refname == "master": default_branch = "master" # Keep looking in case we also have a main, which we let take precedence # (While the references appear to be sorted, not clear if that's guaranteed.) if not default_branch: raise LaunchError( f"Unable to determine branch or commit to checkout from {self.url}" ) self.default_branch = default_branch self.ref = default_branch head = repo.create_head(default_branch, origin.refs[default_branch]) head.checkout() self.commit_hash = head.commit.hexsha repo.submodule_update(init=True, recursive=True)