kaniko_builder.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. from __future__ import annotations
  2. import asyncio
  3. import base64
  4. import copy
  5. import json
  6. import logging
  7. import os
  8. import shutil
  9. import tarfile
  10. import tempfile
  11. import time
  12. import traceback
  13. from typing import Any
  14. import wandb
  15. from wandb.sdk.launch.agent.job_status_tracker import JobAndRunStatusTracker
  16. from wandb.sdk.launch.builder.abstract import AbstractBuilder, registry_from_uri
  17. from wandb.sdk.launch.environment.abstract import AbstractEnvironment
  18. from wandb.sdk.launch.environment.azure_environment import AzureEnvironment
  19. from wandb.sdk.launch.registry.abstract import AbstractRegistry
  20. from wandb.sdk.launch.registry.azure_container_registry import AzureContainerRegistry
  21. from wandb.sdk.launch.registry.elastic_container_registry import (
  22. ElasticContainerRegistry,
  23. )
  24. from wandb.sdk.launch.registry.google_artifact_registry import GoogleArtifactRegistry
  25. from wandb.util import get_module
  26. from .._project_spec import EntryPoint, LaunchProject
  27. from ..errors import LaunchError
  28. from ..utils import (
  29. LOG_PREFIX,
  30. get_kube_context_and_api_client,
  31. warn_failed_packages_from_build_logs,
  32. )
  33. from .build import _WANDB_DOCKERFILE_NAME
  34. from .context_manager import BuildContextManager
  35. get_module(
  36. "kubernetes_asyncio",
  37. required="Kaniko builder requires the kubernetes_asyncio package. Please install it with `pip install wandb[launch]`.",
  38. )
  39. import kubernetes_asyncio as kubernetes # type: ignore # noqa: E402
  40. from kubernetes_asyncio import client # noqa: E402
  41. _logger = logging.getLogger(__name__)
  42. _DEFAULT_BUILD_TIMEOUT_SECS = 1800 # 30 minute build timeout
  43. SERVICE_ACCOUNT_NAME = os.environ.get("WANDB_LAUNCH_SERVICE_ACCOUNT_NAME", "default")
  44. PVC_NAME = os.environ.get("WANDB_LAUNCH_KANIKO_PVC_NAME")
  45. PVC_MOUNT_PATH = (
  46. os.environ.get("WANDB_LAUNCH_KANIKO_PVC_MOUNT_PATH", "/kaniko").rstrip("/")
  47. if PVC_NAME
  48. else None
  49. )
  50. DOCKER_CONFIG_SECRET = os.environ.get("WANDB_LAUNCH_KANIKO_AUTH_SECRET")
  51. if os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
  52. with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
  53. NAMESPACE = f.read().strip()
  54. else:
  55. NAMESPACE = "wandb"
  56. def get_pod_name_safe(job: client.V1Job):
  57. try:
  58. return job.spec.template.metadata.name
  59. except AttributeError:
  60. return None
  61. async def _wait_for_completion(
  62. batch_client: client.BatchV1Api, job_name: str, deadline_secs: int | None = None
  63. ) -> bool:
  64. start_time = time.time()
  65. while True:
  66. job = await batch_client.read_namespaced_job_status(job_name, NAMESPACE)
  67. if job.status.succeeded is not None and job.status.succeeded >= 1:
  68. return True
  69. elif job.status.failed is not None and job.status.failed >= 1:
  70. wandb.termerror(f"{LOG_PREFIX}Build job {job.status.failed} failed {job}")
  71. return False
  72. wandb.termlog(f"{LOG_PREFIX}Waiting for build job to complete...")
  73. if deadline_secs is not None and time.time() - start_time > deadline_secs:
  74. return False
  75. await asyncio.sleep(5)
class KanikoBuilder(AbstractBuilder):
    """Builds a docker image for a project using Kaniko."""

    # Builder type identifier.
    type = "kaniko"

    # Prefix of the Kubernetes build job name; build_image appends the run id.
    build_job_name: str
    # URI prefix that build context archives are uploaded under.
    build_context_store: str
    # Name and key of the Kubernetes secret mounted for registry credentials.
    secret_name: str | None
    secret_key: str | None
    # Default container image for the build container.
    image: str
  84. def __init__(
  85. self,
  86. environment: AbstractEnvironment,
  87. registry: AbstractRegistry,
  88. build_job_name: str = "wandb-launch-container-build",
  89. build_context_store: str = "",
  90. secret_name: str = "",
  91. secret_key: str = "",
  92. image: str = "gcr.io/kaniko-project/executor:v1.11.0",
  93. config: dict | None = None,
  94. ):
  95. """Initialize a KanikoBuilder.
  96. Arguments:
  97. environment (AbstractEnvironment): The environment to use.
  98. registry (AbstractRegistry): The registry to use.
  99. build_job_name (str, optional): The name of the build job.
  100. build_context_store (str, optional): The name of the build context store.
  101. secret_name (str, optional): The name of the secret to use for the registry.
  102. secret_key (str, optional): The key of the secret to use for the registry.
  103. verify (bool, optional): Whether to verify the functionality of the builder.
  104. Defaults to True.
  105. """
  106. self.environment = environment
  107. self.registry = registry
  108. self.build_job_name = build_job_name
  109. self.build_context_store = build_context_store.rstrip("/")
  110. self.secret_name = secret_name
  111. self.secret_key = secret_key
  112. self.image = image
  113. self.kaniko_config = config or {}
  114. @classmethod
  115. def from_config(
  116. cls,
  117. config: dict,
  118. environment: AbstractEnvironment,
  119. registry: AbstractRegistry,
  120. verify: bool = True,
  121. login: bool = True,
  122. ) -> AbstractBuilder:
  123. """Create a KanikoBuilder from a config dict.
  124. Arguments:
  125. config: A dict containing the builder config. Must contain a "type" key
  126. with value "kaniko".
  127. environment: The environment to use for the build.
  128. registry: The registry to use for the build.
  129. verify: Whether to verify the builder config.
  130. Returns:
  131. A KanikoBuilder instance.
  132. """
  133. if config.get("type") != "kaniko":
  134. raise LaunchError(
  135. "Builder config must include 'type':'kaniko' to create a KanikoBuilder."
  136. )
  137. build_context_store = config.get("build-context-store", "")
  138. if build_context_store is None and not PVC_MOUNT_PATH:
  139. raise LaunchError(
  140. "You must specify a build context store for kaniko builds. "
  141. "You can set builder.build-context-store in your agent config "
  142. "to a valid s3, gcs, or azure blog storage URI. Or, configure "
  143. "a persistent volume claim through the agent helm chart: "
  144. "https://github.com/wandb/helm-charts/tree/main/charts/launch-agent"
  145. )
  146. build_job_name = config.get("build-job-name", "wandb-launch-container-build")
  147. secret_name = config.get("secret-name", "")
  148. secret_key = config.get("secret-key", "")
  149. kaniko_image = config.get(
  150. "kaniko-image", "gcr.io/kaniko-project/executor:v1.11.0"
  151. )
  152. image_uri = config.get("destination")
  153. if image_uri is not None:
  154. registry = registry_from_uri(image_uri)
  155. kaniko_config = config.get("kaniko-config", {})
  156. return cls(
  157. environment,
  158. registry,
  159. build_context_store=build_context_store,
  160. build_job_name=build_job_name,
  161. secret_name=secret_name,
  162. secret_key=secret_key,
  163. image=kaniko_image,
  164. config=kaniko_config,
  165. )
  166. async def verify(self) -> None:
  167. """Verify that the builder config is valid.
  168. Raises:
  169. LaunchError: If the builder config is invalid.
  170. """
  171. if self.build_context_store:
  172. await self.environment.verify_storage_uri(self.build_context_store)
  173. def login(self) -> None:
  174. """Login to the registry."""
  175. async def _create_docker_ecr_config_map(
  176. self, job_name: str, corev1_client: client.CoreV1Api, repository: str
  177. ) -> None:
  178. username, password = await self.registry.get_username_password()
  179. encoded = base64.b64encode(f"{username}:{password}".encode()).decode("utf-8")
  180. ecr_config_map = client.V1ConfigMap(
  181. api_version="v1",
  182. kind="ConfigMap",
  183. metadata=client.V1ObjectMeta(
  184. name=f"docker-config-{job_name}",
  185. namespace=NAMESPACE,
  186. ),
  187. data={
  188. "config.json": json.dumps(
  189. {
  190. "auths": {
  191. f"{await self.registry.get_repo_uri()}": {"auth": encoded}
  192. }
  193. }
  194. )
  195. },
  196. immutable=True,
  197. )
  198. await corev1_client.create_namespaced_config_map(NAMESPACE, ecr_config_map)
  199. async def _delete_docker_ecr_config_map(
  200. self, job_name: str, client: client.CoreV1Api
  201. ) -> None:
  202. if self.secret_name:
  203. await client.delete_namespaced_config_map(
  204. f"docker-config-{job_name}", NAMESPACE
  205. )
  206. async def _upload_build_context(self, run_id: str, context_path: str) -> str:
  207. # creat a tar archive of the build context and upload it to s3
  208. context_file = tempfile.NamedTemporaryFile(delete=False)
  209. with tarfile.TarFile.open(fileobj=context_file, mode="w:gz") as context_tgz:
  210. context_tgz.add(context_path, arcname=".")
  211. context_file.close()
  212. if PVC_MOUNT_PATH is None:
  213. destination = f"{self.build_context_store}/{run_id}.tgz"
  214. if self.environment is None:
  215. raise LaunchError("No environment specified for Kaniko build.")
  216. await self.environment.upload_file(context_file.name, destination)
  217. return destination
  218. else:
  219. destination = f"{PVC_MOUNT_PATH}/{run_id}.tgz"
  220. try:
  221. shutil.copy(context_file.name, destination)
  222. except Exception as e:
  223. raise LaunchError(
  224. f"Error copying build context to PVC mounted at {PVC_MOUNT_PATH}: {e}"
  225. ) from e
  226. return f"tar:///context/{run_id}.tgz"
  227. async def build_image(
  228. self,
  229. launch_project: LaunchProject,
  230. entrypoint: EntryPoint,
  231. job_tracker: JobAndRunStatusTracker | None = None,
  232. ) -> str:
  233. await self.verify()
  234. build_contex_manager = BuildContextManager(launch_project=launch_project)
  235. context_path, image_tag = build_contex_manager.create_build_context("kaniko")
  236. run_id = launch_project.run_id
  237. repo_uri = await self.registry.get_repo_uri()
  238. image_uri = repo_uri + ":" + image_tag
  239. # The DOCKER_CONFIG_SECRET option is mutually exclusive with the
  240. # registry classes, so we must skip the check for image existence in
  241. # that case.
  242. if not launch_project.build_required():
  243. if DOCKER_CONFIG_SECRET:
  244. wandb.termlog(
  245. f"Skipping check for existing image {image_uri} due to custom dockerconfig."
  246. )
  247. else:
  248. if await self.registry.check_image_exists(image_uri):
  249. return image_uri
  250. _logger.info(f"Building image {image_uri}...")
  251. _, api_client = await get_kube_context_and_api_client(
  252. kubernetes, launch_project.resource_args
  253. )
  254. # TODO: use same client as kubernetes_runner.py
  255. batch_v1 = client.BatchV1Api(api_client)
  256. core_v1 = client.CoreV1Api(api_client)
  257. build_job_name = f"{self.build_job_name}-{run_id}"
  258. build_context = await self._upload_build_context(run_id, context_path)
  259. build_job = await self._create_kaniko_job(
  260. build_job_name, repo_uri, image_uri, build_context, core_v1, api_client
  261. )
  262. wandb.termlog(f"{LOG_PREFIX}Created kaniko job {build_job_name}")
  263. try:
  264. # DOCKER_CONFIG_SECRET is a user provided dockerconfigjson. Skip our
  265. # dockerconfig handling if it's set.
  266. if (
  267. isinstance(self.registry, AzureContainerRegistry)
  268. and not DOCKER_CONFIG_SECRET
  269. ):
  270. dockerfile_config_map = client.V1ConfigMap(
  271. metadata=client.V1ObjectMeta(
  272. name=f"docker-config-{build_job_name}"
  273. ),
  274. data={
  275. "config.json": json.dumps(
  276. {
  277. "credHelpers": {
  278. f"{self.registry.registry_name}.azurecr.io": "acr-env"
  279. }
  280. }
  281. )
  282. },
  283. )
  284. await core_v1.create_namespaced_config_map(
  285. "wandb", dockerfile_config_map
  286. )
  287. if self.secret_name:
  288. await self._create_docker_ecr_config_map(
  289. build_job_name, core_v1, repo_uri
  290. )
  291. k8s_job = await batch_v1.create_namespaced_job(NAMESPACE, build_job)
  292. # wait for double the job deadline since it might take time to schedule
  293. if not await _wait_for_completion(
  294. batch_v1, build_job_name, 3 * _DEFAULT_BUILD_TIMEOUT_SECS
  295. ):
  296. if job_tracker:
  297. job_tracker.set_err_stage("build")
  298. msg = f"Failed to build image in kaniko for job {run_id}."
  299. pod_name = get_pod_name_safe(k8s_job)
  300. if pod_name:
  301. msg += f" View logs with `kubectl logs -n {NAMESPACE} {pod_name}`."
  302. raise Exception(msg) # noqa: TRY301
  303. try:
  304. pods_from_job = await core_v1.list_namespaced_pod(
  305. namespace=NAMESPACE, label_selector=f"job-name={build_job_name}"
  306. )
  307. if len(pods_from_job.items) != 1:
  308. raise Exception( # noqa: TRY301
  309. f"Expected 1 pod for job {build_job_name},"
  310. f" found {len(pods_from_job.items)}"
  311. )
  312. pod_name = pods_from_job.items[0].metadata.name
  313. logs = await core_v1.read_namespaced_pod_log(pod_name, NAMESPACE)
  314. warn_failed_packages_from_build_logs(
  315. logs, image_uri, launch_project.api, job_tracker
  316. )
  317. except Exception as e:
  318. wandb.termwarn(
  319. f"{LOG_PREFIX}Failed to get logs for kaniko job {build_job_name}: {e}"
  320. )
  321. except Exception as e:
  322. wandb.termerror(
  323. f"{LOG_PREFIX}Exception when creating Kubernetes resources: {e}\n"
  324. )
  325. raise
  326. finally:
  327. wandb.termlog(f"{LOG_PREFIX}Cleaning up resources")
  328. try:
  329. if (
  330. isinstance(self.registry, AzureContainerRegistry)
  331. and not DOCKER_CONFIG_SECRET
  332. ):
  333. await core_v1.delete_namespaced_config_map(
  334. f"docker-config-{build_job_name}", "wandb"
  335. )
  336. if self.secret_name:
  337. await self._delete_docker_ecr_config_map(build_job_name, core_v1)
  338. await batch_v1.delete_namespaced_job(build_job_name, NAMESPACE)
  339. except Exception as e:
  340. traceback.print_exc()
  341. raise LaunchError(
  342. f"Exception during Kubernetes resource clean up {e}"
  343. ) from e
  344. return image_uri
  345. async def _create_kaniko_job(
  346. self,
  347. job_name: str,
  348. repository: str,
  349. image_tag: str,
  350. build_context_path: str,
  351. core_client: client.CoreV1Api,
  352. api_client,
  353. ) -> dict[str, Any]:
  354. job = copy.deepcopy(self.kaniko_config)
  355. job_metadata = job.get("metadata", {})
  356. job_labels = job_metadata.get("labels", {})
  357. job_spec = job.get("spec", {})
  358. pod_template = job_spec.get("template", {})
  359. pod_metadata = pod_template.get("metadata", {})
  360. pod_labels = pod_metadata.get("labels", {})
  361. pod_spec = pod_template.get("spec", {})
  362. volumes = pod_spec.get("volumes", [])
  363. containers = pod_spec.get("containers") or [{}]
  364. if len(containers) > 1:
  365. raise LaunchError(
  366. "Multiple container configs not supported for kaniko builder."
  367. )
  368. container = containers[0]
  369. volume_mounts = container.get("volumeMounts", [])
  370. env = container.get("env", [])
  371. custom_args = container.get("args", [])
  372. if PVC_MOUNT_PATH:
  373. volumes.append(
  374. {"name": "kaniko-pvc", "persistentVolumeClaim": {"claimName": PVC_NAME}}
  375. )
  376. volume_mounts.append({"name": "kaniko-pvc", "mountPath": "/context"})
  377. if bool(self.secret_name) != bool(self.secret_key):
  378. raise LaunchError(
  379. "Both secret_name and secret_key or neither must be specified "
  380. "for kaniko build. You provided only one of them."
  381. )
  382. if isinstance(self.registry, ElasticContainerRegistry):
  383. env.append(
  384. {
  385. "name": "AWS_REGION",
  386. "value": self.registry.region,
  387. }
  388. )
  389. # TODO(ben): Refactor all of this environment/registry
  390. # specific stuff into methods of those classes.
  391. if isinstance(self.environment, AzureEnvironment):
  392. # Use the core api to check if the secret exists
  393. try:
  394. await core_client.read_namespaced_secret(
  395. "azure-storage-access-key",
  396. "wandb",
  397. )
  398. except Exception as e:
  399. raise LaunchError(
  400. "Secret azure-storage-access-key does not exist in "
  401. "namespace wandb. Please create it with the key password "
  402. "set to your azure storage access key."
  403. ) from e
  404. env.append(
  405. {
  406. "name": "AZURE_STORAGE_ACCESS_KEY",
  407. "valueFrom": {
  408. "secretKeyRef": {
  409. "name": "azure-storage-access-key",
  410. "key": "password",
  411. }
  412. },
  413. }
  414. )
  415. if DOCKER_CONFIG_SECRET:
  416. volumes.append(
  417. {
  418. "name": "kaniko-docker-config",
  419. "secret": {
  420. "secretName": DOCKER_CONFIG_SECRET,
  421. "items": [
  422. {
  423. "key": ".dockerconfigjson",
  424. "path": "config.json",
  425. }
  426. ],
  427. },
  428. }
  429. )
  430. volume_mounts.append(
  431. {"name": "kaniko-docker-config", "mountPath": "/kaniko/.docker"}
  432. )
  433. elif self.secret_name and self.secret_key:
  434. volumes.append(
  435. {
  436. "name": "docker-config",
  437. "configMap": {"name": f"docker-config-{job_name}"},
  438. }
  439. )
  440. volume_mounts.append(
  441. {"name": "docker-config", "mountPath": "/kaniko/.docker"}
  442. )
  443. # TODO(ben): I don't like conditioning on the registry type here. As a
  444. # future change I want the registry and environment classes to provide
  445. # a list of environment variables and volume mounts that need to be
  446. # added to the job. The environment class provides credentials for
  447. # build context access, and the registry class provides credentials
  448. # for pushing the image. This way we can have separate secrets for
  449. # each and support build contexts and registries that require
  450. # different credentials.
  451. if isinstance(self.registry, ElasticContainerRegistry):
  452. mount_path = "/root/.aws"
  453. key = "credentials"
  454. elif isinstance(self.registry, GoogleArtifactRegistry):
  455. mount_path = "/kaniko/.config/gcloud"
  456. key = "config.json"
  457. env.append(
  458. {
  459. "name": "GOOGLE_APPLICATION_CREDENTIALS",
  460. "value": "/kaniko/.config/gcloud/config.json",
  461. }
  462. )
  463. else:
  464. wandb.termwarn(
  465. f"{LOG_PREFIX}Automatic credential handling is not supported for registry type {type(self.registry)}. Build job: {self.build_job_name}"
  466. )
  467. volumes.append(
  468. {
  469. "name": self.secret_name,
  470. "secret": {
  471. "secretName": self.secret_name,
  472. "items": [{"key": self.secret_key, "path": key}],
  473. },
  474. }
  475. )
  476. volume_mounts.append(
  477. {
  478. "name": self.secret_name,
  479. "mountPath": mount_path,
  480. "readOnly": True,
  481. }
  482. )
  483. if (
  484. isinstance(self.registry, AzureContainerRegistry)
  485. and not DOCKER_CONFIG_SECRET
  486. ):
  487. # Add the docker config map
  488. volumes.append(
  489. {
  490. "name": "docker-config",
  491. "configMap": {"name": f"docker-config-{job_name}"},
  492. }
  493. )
  494. volume_mounts.append(
  495. {"name": "docker-config", "mountPath": "/kaniko/.docker/"}
  496. )
  497. # Kaniko doesn't want https:// at the beginning of the image tag.
  498. destination = image_tag
  499. if destination.startswith("https://"):
  500. destination = destination.replace("https://", "")
  501. args = {
  502. "--context": build_context_path,
  503. "--dockerfile": _WANDB_DOCKERFILE_NAME,
  504. "--destination": destination,
  505. "--cache": "true",
  506. "--cache-repo": repository.replace("https://", ""),
  507. "--snapshot-mode": "redo",
  508. "--compressed-caching": "false",
  509. }
  510. for custom_arg in custom_args:
  511. arg_name, arg_value = custom_arg.split("=", 1)
  512. args[arg_name] = arg_value
  513. parsed_args = [
  514. f"{arg_name}={arg_value}" for arg_name, arg_value in args.items()
  515. ]
  516. container["args"] = parsed_args
  517. # Apply the rest of our defaults
  518. pod_labels["wandb"] = "launch"
  519. # This annotation is required to enable azure workload identity.
  520. # Don't add this label if using a docker config secret for auth.
  521. if (
  522. isinstance(self.registry, AzureContainerRegistry)
  523. and not DOCKER_CONFIG_SECRET
  524. ):
  525. pod_labels["azure.workload.identity/use"] = "true"
  526. pod_spec["restartPolicy"] = pod_spec.get("restartPolicy", "Never")
  527. pod_spec["activeDeadlineSeconds"] = pod_spec.get(
  528. "activeDeadlineSeconds", _DEFAULT_BUILD_TIMEOUT_SECS
  529. )
  530. pod_spec["serviceAccountName"] = pod_spec.get(
  531. "serviceAccountName", SERVICE_ACCOUNT_NAME
  532. )
  533. job_spec["backoffLimit"] = job_spec.get("backoffLimit", 0)
  534. job_labels["wandb"] = "launch"
  535. job_metadata["namespace"] = job_metadata.get("namespace", NAMESPACE)
  536. job_metadata["name"] = job_metadata.get("name", job_name)
  537. job["apiVersion"] = "batch/v1"
  538. job["kind"] = "Job"
  539. # Apply all nested configs from the bottom up
  540. pod_metadata["labels"] = pod_labels
  541. pod_template["metadata"] = pod_metadata
  542. container["name"] = container.get("name", "wandb-container-build")
  543. container["image"] = container.get("image", self.image)
  544. container["volumeMounts"] = volume_mounts
  545. container["env"] = env
  546. pod_spec["containers"] = [container]
  547. pod_spec["volumes"] = volumes
  548. pod_template["spec"] = pod_spec
  549. job_spec["template"] = pod_template
  550. job_metadata["labels"] = job_labels
  551. job["metadata"] = job_metadata
  552. job["spec"] = job_spec
  553. return job