| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
- import argparse
- import os
- import random
- from datetime import datetime
- import pandas as pd
- from ray.tune import run, sample_from
- from ray.tune.schedulers import PopulationBasedTraining
- from ray.tune.schedulers.pb2 import PB2
def explore(config):
    """Post-process a perturbed config so that it is still valid.

    Passed as ``custom_explore_fn`` to ``PopulationBasedTraining``, so it is
    only relevant when the PBT scheduler is used.

    Args:
        config: Hyperparameter dict. Must contain the keys
            ``"train_batch_size"``, ``"sgd_minibatch_size"`` and ``"lambda"``.

    Returns:
        The same ``config`` object, modified in place.
    """
    # Ensure we collect enough timesteps to do sgd: the train batch must hold
    # at least two minibatches.
    min_train_batch = config["sgd_minibatch_size"] * 2
    if config["train_batch_size"] < min_train_batch:
        config["train_batch_size"] = min_train_batch

    # Cap lambda at 1, the upper end of the range this script samples it from.
    if config["lambda"] > 1:
        config["lambda"] = 1

    # The batch size must be an integer; truncate any fractional value
    # (presumably introduced by the scheduler's perturbation step).
    config["train_batch_size"] = int(config["train_batch_size"])
    return config
# Columns copied from every trial's progress frame into the combined results.
RESULT_COLUMNS = (
    "timesteps_total",
    "episodes_total",
    "episode_reward_mean",
    "info/learner/default_policy/cur_kl_coeff",
)


def str2bool(value):
    """Convert a command-line token to a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value -- including ``"False"`` -- becomes ``True``. This
    converter interprets the text instead.

    Args:
        value: A ``bool`` (returned unchanged) or a string such as
            ``"true"``, ``"False"``, ``"1"``, ``"no"`` (case-insensitive).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in {"true", "t", "yes", "y", "1", "on"}:
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if text in {"false", "f", "no", "n", "0", "off", ""}:
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value)
    )


def build_parser():
    """Create the argument parser for this experiment script.

    Returns:
        An ``argparse.ArgumentParser`` with all experiment options registered.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--max", type=int, default=1000000)
    parser.add_argument("--algo", type=str, default="PPO")
    # NOTE(review): --num_workers is parsed but never read by this script.
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--num_samples", type=int, default=4)
    parser.add_argument("--t_ready", type=int, default=50000)
    parser.add_argument("--seed", type=int, default=0)
    # NOTE(review): --horizon is parsed but not consulted; the horizon is
    # derived from --env_name (see resolve_horizon).
    parser.add_argument("--horizon", type=int, default=1600)
    parser.add_argument("--perturb", type=float, default=0.25)  # if using PBT
    parser.add_argument("--env_name", type=str, default="BipedalWalker-v2")
    # Also usable: "training_iteration", "time_total_s".
    parser.add_argument("--criteria", type=str, default="timesteps_total")
    # May be important to use a larger network for bigger tasks.
    parser.add_argument("--net", type=str, default="32_32")
    parser.add_argument("--filename", type=str, default="")
    # Reject unknown methods at parse time instead of failing later on lookup.
    parser.add_argument(
        "--method", type=str, default="pb2", choices=["pbt", "pb2"]
    )
    # A bare ``--save_csv`` enables saving; ``--save_csv False`` disables it.
    parser.add_argument(
        "--save_csv", type=str2bool, nargs="?", const=True, default=False
    )
    return parser


def resolve_horizon(env_name):
    """Return the episode horizon used for ``env_name``.

    BipedalWalker (v2/v3) needs 1600 steps; every other environment uses 1000.
    """
    if env_name in ["BipedalWalker-v2", "BipedalWalker-v3"]:
        return 1600
    return 1000


def make_scheduler(args):
    """Build the Tune scheduler selected by ``args.method``.

    Only the requested scheduler is instantiated, so running with one method
    does not depend on the other one constructing successfully.

    Args:
        args: Parsed arguments; reads ``method``, ``criteria``, ``t_ready``
            and ``perturb``.

    Returns:
        A ``PopulationBasedTraining`` instance for ``"pbt"`` or a ``PB2``
        instance for ``"pb2"``.

    Raises:
        ValueError: If ``args.method`` is neither ``"pbt"`` nor ``"pb2"``.
    """
    shared = {
        "time_attr": args.criteria,
        "metric": "episode_reward_mean",
        "mode": "max",
        "perturbation_interval": args.t_ready,
        "quantile_fraction": args.perturb,  # copy bottom % with top %
    }
    if args.method == "pbt":
        return PopulationBasedTraining(
            resample_probability=args.perturb,
            # Specifies the search space for these hyperparams.
            hyperparam_mutations={
                "lambda": lambda: random.uniform(0.9, 1.0),
                "clip_param": lambda: random.uniform(0.1, 0.5),
                "lr": lambda: random.uniform(1e-5, 1e-3),
                "train_batch_size": lambda: random.randint(1000, 60000),
            },
            custom_explore_fn=explore,
            **shared
        )
    if args.method == "pb2":
        return PB2(
            # Specifies the hyperparam search space.
            hyperparam_bounds={
                "lambda": [0.9, 1.0],
                "clip_param": [0.1, 0.5],
                "lr": [1e-5, 1e-3],
                "train_batch_size": [1000, 60000],
            },
            **shared
        )
    raise ValueError("unknown method: {!r}".format(args.method))


def collect_results(trial_dataframes, columns=RESULT_COLUMNS):
    """Stack per-trial result frames into one frame tagged with an agent id.

    Args:
        trial_dataframes: Iterable of ``pandas.DataFrame`` objects, one per
            trial, each containing at least ``columns``.
        columns: Column names to keep from every frame.

    Returns:
        A single ``DataFrame`` with ``columns`` plus an ``"Agent"`` column
        holding the position of the source frame, with a fresh 0..n-1 index.
        An empty ``DataFrame`` if there are no trial frames.
    """
    wanted = list(columns)
    # ``assign`` returns a new frame, so nothing is written onto a slice of
    # the trial's own data.
    frames = [
        frame[wanted].assign(Agent=agent)
        for agent, frame in enumerate(trial_dataframes)
    ]
    if not frames:
        return pd.DataFrame()
    # One concat over the whole list rather than re-concatenating per trial.
    return pd.concat(frames).reset_index(drop=True)


def main():
    """Run the PBT/PB2 experiment and optionally save the results as CSV."""
    args = build_parser().parse_args()
    horizon = resolve_horizon(args.env_name)

    # Read the clock once so the date and time parts come from one instant.
    now = datetime.now()
    timelog = "{}_{}".format(now.date(), now.time())

    args.dir = "{}_{}_{}_Size{}_{}_{}".format(
        args.algo,
        args.filename,
        args.method,
        str(args.num_samples),
        args.env_name,
        args.criteria,
    )

    # "--net 32_32" -> two hidden layers of 32 units each.
    layer_sizes = args.net.split("_")

    analysis = run(
        args.algo,
        name="{}_{}_{}_seed{}_{}".format(
            timelog, args.method, args.env_name, str(args.seed), args.filename
        ),
        scheduler=make_scheduler(args),
        verbose=1,
        num_samples=args.num_samples,
        reuse_actors=True,
        stop={args.criteria: args.max},
        config={
            "env": args.env_name,
            "log_level": "INFO",
            "seed": args.seed,
            "kl_coeff": 1.0,
            "num_gpus": 0,
            "horizon": horizon,
            "observation_filter": "MeanStdFilter",
            "model": {
                "fcnet_hiddens": [int(layer_sizes[0]), int(layer_sizes[1])],
                "free_log_std": True,
            },
            "num_sgd_iter": 10,
            "sgd_minibatch_size": 128,
            # Initial hyperparameters, drawn from the scheduler's ranges.
            "lambda": sample_from(lambda spec: random.uniform(0.9, 1.0)),
            "clip_param": sample_from(lambda spec: random.uniform(0.1, 0.5)),
            "lr": sample_from(lambda spec: random.uniform(1e-5, 1e-3)),
            "train_batch_size": sample_from(
                lambda spec: random.randint(1000, 60000)
            ),
        },
    )

    results = collect_results(analysis.trial_dataframes.values())

    if args.save_csv:
        out_dir = os.path.join("data", args.dir)
        # exist_ok avoids the check-then-create race.
        os.makedirs(out_dir, exist_ok=True)
        results.to_csv(
            os.path.join(out_dir, "seed{}.csv".format(str(args.seed)))
        )


if __name__ == "__main__":
    main()
|