custom_func_checkpointing.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
# If you want to use checkpointing with a custom training function (not a Ray
# integration like PyTorch or TensorFlow), your function can read a checkpoint
# with ``ray.tune.get_checkpoint()`` and write one through the
# ``ray.tune.report(metrics, checkpoint=...)`` API.
  4. import argparse
  5. import json
  6. import os
  7. import tempfile
  8. import time
  9. from ray import tune
  10. from ray.tune import Checkpoint
  11. def evaluation_fn(step, width, height):
  12. time.sleep(0.1)
  13. return (0.1 + width * step / 100) ** (-1) + height * 0.1
  14. def train_func(config):
  15. step = 0
  16. width, height = config["width"], config["height"]
  17. checkpoint = tune.get_checkpoint()
  18. if checkpoint:
  19. with checkpoint.as_directory() as checkpoint_dir:
  20. with open(os.path.join(checkpoint_dir, "checkpoint.json")) as f:
  21. state = json.load(f)
  22. step = state["step"] + 1
  23. for current_step in range(step, 100):
  24. intermediate_score = evaluation_fn(current_step, width, height)
  25. with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
  26. with open(os.path.join(temp_checkpoint_dir, "checkpoint.json"), "w") as f:
  27. json.dump({"step": current_step}, f)
  28. tune.report(
  29. {"iterations": current_step, "mean_loss": intermediate_score},
  30. checkpoint=Checkpoint.from_directory(temp_checkpoint_dir),
  31. )
  32. if __name__ == "__main__":
  33. parser = argparse.ArgumentParser()
  34. parser.add_argument(
  35. "--smoke-test", action="store_true", help="Finish quickly for testing"
  36. )
  37. args, _ = parser.parse_known_args()
  38. tuner = tune.Tuner(
  39. train_func,
  40. run_config=tune.RunConfig(
  41. name="hyperband_test",
  42. stop={"training_iteration": 1 if args.smoke_test else 10},
  43. ),
  44. tune_config=tune.TuneConfig(
  45. metric="mean_loss",
  46. mode="min",
  47. num_samples=5,
  48. ),
  49. param_space={
  50. "steps": 10,
  51. "width": tune.randint(10, 100),
  52. "height": tune.loguniform(10, 100),
  53. },
  54. )
  55. results = tuner.fit()
  56. best_result = results.get_best_result()
  57. print("Best hyperparameters: ", best_result.config)
  58. best_checkpoint = best_result.checkpoint
  59. print("Best checkpoint: ", best_checkpoint)