eager_tf_policy.py 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051
  1. """Eager mode TF policy built using build_tf_policy().
  2. It supports both traced and non-traced eager execution modes."""
  3. import functools
  4. import logging
  5. import os
  6. import threading
  7. from typing import Dict, List, Optional, Tuple, Union
  8. import tree # pip install dm_tree
  9. from ray._common.deprecation import (
  10. DEPRECATED_VALUE,
  11. deprecation_warning,
  12. )
  13. from ray.rllib.models.catalog import ModelCatalog
  14. from ray.rllib.models.repeated_values import RepeatedValues
  15. from ray.rllib.policy.policy import Policy, PolicyState
  16. from ray.rllib.policy.rnn_sequencing import pad_batch_to_sequences_of_same_size
  17. from ray.rllib.policy.sample_batch import SampleBatch
  18. from ray.rllib.utils import add_mixins, force_list
  19. from ray.rllib.utils.annotations import OldAPIStack, override
  20. from ray.rllib.utils.error import ERR_MSG_TF_POLICY_CANNOT_SAVE_KERAS_MODEL
  21. from ray.rllib.utils.framework import try_import_tf
  22. from ray.rllib.utils.metrics import (
  23. DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY,
  24. NUM_AGENT_STEPS_TRAINED,
  25. NUM_GRAD_UPDATES_LIFETIME,
  26. )
  27. from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY
  28. from ray.rllib.utils.numpy import convert_to_numpy
  29. from ray.rllib.utils.spaces.space_utils import normalize_action
  30. from ray.rllib.utils.tf_utils import get_gpu_devices
  31. from ray.rllib.utils.threading import with_lock
  32. from ray.rllib.utils.typing import (
  33. LocalOptimizer,
  34. ModelGradients,
  35. TensorStructType,
  36. TensorType,
  37. )
  38. from ray.util.debug import log_once
  39. tf1, tf, tfv = try_import_tf()
  40. logger = logging.getLogger(__name__)
  41. def _convert_to_tf(x, dtype=None):
  42. if isinstance(x, SampleBatch):
  43. dict_ = {k: v for k, v in x.items() if k != SampleBatch.INFOS}
  44. return tree.map_structure(_convert_to_tf, dict_)
  45. elif isinstance(x, Policy):
  46. return x
  47. # Special handling of "Repeated" values.
  48. elif isinstance(x, RepeatedValues):
  49. return RepeatedValues(
  50. tree.map_structure(_convert_to_tf, x.values), x.lengths, x.max_len
  51. )
  52. if x is not None:
  53. d = dtype
  54. return tree.map_structure(
  55. lambda f: _convert_to_tf(f, d)
  56. if isinstance(f, RepeatedValues)
  57. else tf.convert_to_tensor(f, d)
  58. if f is not None and not tf.is_tensor(f)
  59. else f,
  60. x,
  61. )
  62. return x
  63. def _convert_to_numpy(x):
  64. def _map(x):
  65. if isinstance(x, tf.Tensor):
  66. return x.numpy()
  67. return x
  68. try:
  69. return tf.nest.map_structure(_map, x)
  70. except AttributeError:
  71. raise TypeError(
  72. ("Object of type {} has no method to convert to numpy.").format(type(x))
  73. )
  74. def _convert_eager_inputs(func):
  75. @functools.wraps(func)
  76. def _func(*args, **kwargs):
  77. if tf.executing_eagerly():
  78. eager_args = [_convert_to_tf(x) for x in args]
  79. # TODO: (sven) find a way to remove key-specific hacks.
  80. eager_kwargs = {
  81. k: _convert_to_tf(v, dtype=tf.int64 if k == "timestep" else None)
  82. for k, v in kwargs.items()
  83. if k not in {"info_batch", "episodes"}
  84. }
  85. return func(*eager_args, **eager_kwargs)
  86. else:
  87. return func(*args, **kwargs)
  88. return _func
  89. def _convert_eager_outputs(func):
  90. @functools.wraps(func)
  91. def _func(*args, **kwargs):
  92. out = func(*args, **kwargs)
  93. if tf.executing_eagerly():
  94. out = tf.nest.map_structure(_convert_to_numpy, out)
  95. return out
  96. return _func
  97. def _disallow_var_creation(next_creator, **kw):
  98. v = next_creator(**kw)
  99. raise ValueError(
  100. "Detected a variable being created during an eager "
  101. "forward pass. Variables should only be created during "
  102. "model initialization: {}".format(v.name)
  103. )
  104. def _check_too_many_retraces(obj):
  105. """Asserts that a given number of re-traces is not breached."""
  106. def _func(self_, *args, **kwargs):
  107. if (
  108. self_.config.get("eager_max_retraces") is not None
  109. and self_._re_trace_counter > self_.config["eager_max_retraces"]
  110. ):
  111. raise RuntimeError(
  112. "Too many tf-eager re-traces detected! This could lead to"
  113. " significant slow-downs (even slower than running in "
  114. "tf-eager mode w/ `eager_tracing=False`). To switch off "
  115. "these re-trace counting checks, set `eager_max_retraces`"
  116. " in your config to None."
  117. )
  118. return obj(self_, *args, **kwargs)
  119. return _func
  120. @OldAPIStack
  121. class EagerTFPolicy(Policy):
  122. """Dummy class to recognize any eagerized TFPolicy by its inheritance."""
  123. pass
  124. def _traced_eager_policy(eager_policy_cls):
  125. """Wrapper class that enables tracing for all eager policy methods.
  126. This is enabled by the `--trace`/`eager_tracing=True` config when
  127. framework=tf2.
  128. """
  129. class TracedEagerPolicy(eager_policy_cls):
  130. def __init__(self, *args, **kwargs):
  131. self._traced_learn_on_batch_helper = False
  132. self._traced_compute_actions_helper = False
  133. self._traced_compute_gradients_helper = False
  134. self._traced_apply_gradients_helper = False
  135. super(TracedEagerPolicy, self).__init__(*args, **kwargs)
  136. @_check_too_many_retraces
  137. @override(Policy)
  138. def compute_actions_from_input_dict(
  139. self,
  140. input_dict: Dict[str, TensorType],
  141. explore: bool = None,
  142. timestep: Optional[int] = None,
  143. episodes=None,
  144. **kwargs,
  145. ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
  146. """Traced version of Policy.compute_actions_from_input_dict."""
  147. # Create a traced version of `self._compute_actions_helper`.
  148. if self._traced_compute_actions_helper is False and not self._no_tracing:
  149. self._compute_actions_helper = _convert_eager_inputs(
  150. tf.function(
  151. super(TracedEagerPolicy, self)._compute_actions_helper,
  152. autograph=False,
  153. reduce_retracing=True,
  154. )
  155. )
  156. self._traced_compute_actions_helper = True
  157. # Now that the helper method is traced, call super's
  158. # `compute_actions_from_input_dict()` (which will call the traced helper).
  159. return super(TracedEagerPolicy, self).compute_actions_from_input_dict(
  160. input_dict=input_dict,
  161. explore=explore,
  162. timestep=timestep,
  163. episodes=episodes,
  164. **kwargs,
  165. )
  166. @_check_too_many_retraces
  167. @override(eager_policy_cls)
  168. def learn_on_batch(self, samples):
  169. """Traced version of Policy.learn_on_batch."""
  170. # Create a traced version of `self._learn_on_batch_helper`.
  171. if self._traced_learn_on_batch_helper is False and not self._no_tracing:
  172. self._learn_on_batch_helper = _convert_eager_inputs(
  173. tf.function(
  174. super(TracedEagerPolicy, self)._learn_on_batch_helper,
  175. autograph=False,
  176. reduce_retracing=True,
  177. )
  178. )
  179. self._traced_learn_on_batch_helper = True
  180. # Now that the helper method is traced, call super's
  181. # apply_gradients (which will call the traced helper).
  182. return super(TracedEagerPolicy, self).learn_on_batch(samples)
  183. @_check_too_many_retraces
  184. @override(eager_policy_cls)
  185. def compute_gradients(self, samples: SampleBatch) -> ModelGradients:
  186. """Traced version of Policy.compute_gradients."""
  187. # Create a traced version of `self._compute_gradients_helper`.
  188. if self._traced_compute_gradients_helper is False and not self._no_tracing:
  189. self._compute_gradients_helper = _convert_eager_inputs(
  190. tf.function(
  191. super(TracedEagerPolicy, self)._compute_gradients_helper,
  192. autograph=False,
  193. reduce_retracing=True,
  194. )
  195. )
  196. self._traced_compute_gradients_helper = True
  197. # Now that the helper method is traced, call super's
  198. # `compute_gradients()` (which will call the traced helper).
  199. return super(TracedEagerPolicy, self).compute_gradients(samples)
  200. @_check_too_many_retraces
  201. @override(Policy)
  202. def apply_gradients(self, grads: ModelGradients) -> None:
  203. """Traced version of Policy.apply_gradients."""
  204. # Create a traced version of `self._apply_gradients_helper`.
  205. if self._traced_apply_gradients_helper is False and not self._no_tracing:
  206. self._apply_gradients_helper = _convert_eager_inputs(
  207. tf.function(
  208. super(TracedEagerPolicy, self)._apply_gradients_helper,
  209. autograph=False,
  210. reduce_retracing=True,
  211. )
  212. )
  213. self._traced_apply_gradients_helper = True
  214. # Now that the helper method is traced, call super's
  215. # `apply_gradients()` (which will call the traced helper).
  216. return super(TracedEagerPolicy, self).apply_gradients(grads)
  217. @classmethod
  218. def with_tracing(cls):
  219. # Already traced -> Return same class.
  220. return cls
  221. TracedEagerPolicy.__name__ = eager_policy_cls.__name__ + "_traced"
  222. TracedEagerPolicy.__qualname__ = eager_policy_cls.__qualname__ + "_traced"
  223. return TracedEagerPolicy
  224. class _OptimizerWrapper:
  225. def __init__(self, tape):
  226. self.tape = tape
  227. def compute_gradients(self, loss, var_list):
  228. return list(zip(self.tape.gradient(loss, var_list), var_list))
  229. @OldAPIStack
  230. def _build_eager_tf_policy(
  231. name,
  232. loss_fn,
  233. get_default_config=None,
  234. postprocess_fn=None,
  235. stats_fn=None,
  236. optimizer_fn=None,
  237. compute_gradients_fn=None,
  238. apply_gradients_fn=None,
  239. grad_stats_fn=None,
  240. extra_learn_fetches_fn=None,
  241. extra_action_out_fn=None,
  242. validate_spaces=None,
  243. before_init=None,
  244. before_loss_init=None,
  245. after_init=None,
  246. make_model=None,
  247. action_sampler_fn=None,
  248. action_distribution_fn=None,
  249. mixins=None,
  250. get_batch_divisibility_req=None,
  251. # Deprecated args.
  252. obs_include_prev_action_reward=DEPRECATED_VALUE,
  253. extra_action_fetches_fn=None,
  254. gradients_fn=None,
  255. ):
  256. """Build an eager TF policy.
  257. An eager policy runs all operations in eager mode, which makes debugging
  258. much simpler, but has lower performance.
  259. You shouldn't need to call this directly. Rather, prefer to build a TF
  260. graph policy and use set `.framework("tf2", eager_tracing=False) in your
  261. AlgorithmConfig to have it automatically be converted to an eager policy.
  262. This has the same signature as build_tf_policy()."""
  263. base = add_mixins(EagerTFPolicy, mixins)
  264. if obs_include_prev_action_reward != DEPRECATED_VALUE:
  265. deprecation_warning(old="obs_include_prev_action_reward", error=True)
  266. if extra_action_fetches_fn is not None:
  267. deprecation_warning(
  268. old="extra_action_fetches_fn", new="extra_action_out_fn", error=True
  269. )
  270. if gradients_fn is not None:
  271. deprecation_warning(old="gradients_fn", new="compute_gradients_fn", error=True)
  272. class eager_policy_cls(base):
        def __init__(self, observation_space, action_space, config):
            """Set up model, exploration, optimizer(s) and loss of this policy.

            Runs the optional builder hooks (`validate_spaces`, `before_init`,
            `before_loss_init`, `after_init`) at fixed points of the setup and
            initializes the loss via `_initialize_loss_from_dummy_batch()`.

            Args:
                observation_space: The observation space of the policy.
                action_space: The action space of the policy.
                config: The policy's config dict. Read here: "framework",
                    "explore", "model" (incl. "max_seq_len") and, if no
                    `optimizer_fn` was given, "lr".

            Raises:
                ValueError: If `action_sampler_fn` or `action_distribution_fn`
                    was provided to the builder, but `make_model` was not.
            """
            # If this class runs as a @ray.remote actor, eager mode may not
            # have been activated yet.
            if not tf1.executing_eagerly():
                tf1.enable_eager_execution()
            self.framework = config.get("framework", "tf2")
            EagerTFPolicy.__init__(self, observation_space, action_space, config)

            # Global timestep should be a tensor.
            self.global_timestep = tf.Variable(0, trainable=False, dtype=tf.int64)
            self.explore = tf.Variable(
                self.config["explore"], trainable=False, dtype=tf.bool
            )

            # Log the number of visible GPU devices (only if this policy is
            # assigned any GPUs).
            num_gpus = self._get_num_gpus_for_policy()
            if num_gpus > 0:
                gpu_ids = get_gpu_devices()
                logger.info(f"Found {len(gpu_ids)} visible cuda devices.")

            self._is_training = False

            # Only for `config.eager_tracing=True`: A counter to keep track of
            # how many times an eager-traced method (e.g.
            # `self._compute_actions_helper`) has been re-traced by tensorflow.
            # We will raise an error if more than n re-tracings have been
            # detected, since this would considerably slow down execution.
            # The variable below should only get incremented during the
            # tf.function trace operations, never when calling the already
            # traced function after that.
            self._re_trace_counter = 0

            self._loss_initialized = False
            # To ensure backward compatibility:
            # Old way: If `loss` provided here, use as-is (as a function).
            if loss_fn is not None:
                self._loss = loss_fn
            # New way: Convert the overridden `self.loss` into a plain
            # function, so it can be called the same way as `loss` would
            # be, ensuring backward compatibility.
            elif self.loss.__func__.__qualname__ != "Policy.loss":
                self._loss = self.loss.__func__
            # `loss` not provided nor overridden from Policy -> Set to None.
            else:
                self._loss = None

            # `get_batch_divisibility_req` may be a callable (taking the
            # policy), a plain value, or None (-> 1).
            self.batch_divisibility_req = (
                get_batch_divisibility_req(self)
                if callable(get_batch_divisibility_req)
                else (get_batch_divisibility_req or 1)
            )
            self._max_seq_len = config["model"]["max_seq_len"]

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            # (Re-)set `self.config` after the `before_init` hook has run.
            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given"
                    )
            else:
                # `logit_dim` is only defined on this path; it is only needed
                # for the default model creation below.
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"]
                )

            if make_model:
                self.model = make_model(self, observation_space, action_space, config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            # Lock used for locking some methods on the object-level.
            # This prevents possible race conditions when calling the model
            # first, then its value function (e.g. in a loss function), in
            # between of which another model call is made (e.g. to compute an
            # action).
            self._lock = threading.RLock()

            # Auto-update model's inference view requirements, if recurrent.
            self._update_model_view_requirements_from_init_state()
            # Combine view_requirements for Model and Policy.
            self.view_requirements.update(self.model.view_requirements)

            self.exploration = self._create_exploration()
            self._state_inputs = self.model.get_initial_state()
            self._is_recurrent = len(self._state_inputs) > 0

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            # Use the custom optimizer(s), if provided; default to Adam with
            # the configured learning rate.
            if optimizer_fn:
                optimizers = optimizer_fn(self, config)
            else:
                optimizers = tf.keras.optimizers.Adam(config["lr"])
            optimizers = force_list(optimizers)
            if self.exploration:
                optimizers = self.exploration.get_exploration_optimizer(optimizers)

            # The list of local (tf) optimizers (one per loss term).
            self._optimizers: List[LocalOptimizer] = optimizers
            # Backward compatibility: A user's policy may only support a single
            # loss term and optimizer (no lists).
            self._optimizer: LocalOptimizer = optimizers[0] if optimizers else None

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )
            self._loss_initialized = True

            if after_init:
                after_init(self, observation_space, action_space, config)

            # Got to reset global_timestep again after fake run-throughs.
            self.global_timestep.assign(0)
  381. @override(Policy)
  382. def compute_actions_from_input_dict(
  383. self,
  384. input_dict: Dict[str, TensorType],
  385. explore: bool = None,
  386. timestep: Optional[int] = None,
  387. episodes=None,
  388. **kwargs,
  389. ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
  390. if not self.config.get("eager_tracing") and not tf1.executing_eagerly():
  391. tf1.enable_eager_execution()
  392. self._is_training = False
  393. explore = explore if explore is not None else self.explore
  394. timestep = timestep if timestep is not None else self.global_timestep
  395. if isinstance(timestep, tf.Tensor):
  396. timestep = int(timestep.numpy())
  397. # Pass lazy (eager) tensor dict to Model as `input_dict`.
  398. input_dict = self._lazy_tensor_dict(input_dict)
  399. input_dict.set_training(False)
  400. # Pack internal state inputs into (separate) list.
  401. state_batches = [
  402. input_dict[k] for k in input_dict.keys() if "state_in" in k[:8]
  403. ]
  404. self._state_in = state_batches
  405. self._is_recurrent = state_batches != []
  406. # Call the exploration before_compute_actions hook.
  407. self.exploration.before_compute_actions(
  408. timestep=timestep, explore=explore, tf_sess=self.get_session()
  409. )
  410. ret = self._compute_actions_helper(
  411. input_dict,
  412. state_batches,
  413. # TODO: Passing episodes into a traced method does not work.
  414. None if self.config["eager_tracing"] else episodes,
  415. explore,
  416. timestep,
  417. )
  418. # Update our global timestep by the batch size.
  419. self.global_timestep.assign_add(tree.flatten(ret[0])[0].shape.as_list()[0])
  420. return convert_to_numpy(ret)
  421. @override(Policy)
  422. def compute_actions(
  423. self,
  424. obs_batch: Union[List[TensorStructType], TensorStructType],
  425. state_batches: Optional[List[TensorType]] = None,
  426. prev_action_batch: Union[List[TensorStructType], TensorStructType] = None,
  427. prev_reward_batch: Union[List[TensorStructType], TensorStructType] = None,
  428. info_batch: Optional[Dict[str, list]] = None,
  429. episodes: Optional[List] = None,
  430. explore: Optional[bool] = None,
  431. timestep: Optional[int] = None,
  432. **kwargs,
  433. ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:
  434. # Create input dict to simply pass the entire call to
  435. # self.compute_actions_from_input_dict().
  436. input_dict = SampleBatch(
  437. {
  438. SampleBatch.CUR_OBS: obs_batch,
  439. },
  440. _is_training=tf.constant(False),
  441. )
  442. if state_batches is not None:
  443. for i, s in enumerate(state_batches):
  444. input_dict[f"state_in_{i}"] = s
  445. if prev_action_batch is not None:
  446. input_dict[SampleBatch.PREV_ACTIONS] = prev_action_batch
  447. if prev_reward_batch is not None:
  448. input_dict[SampleBatch.PREV_REWARDS] = prev_reward_batch
  449. if info_batch is not None:
  450. input_dict[SampleBatch.INFOS] = info_batch
  451. return self.compute_actions_from_input_dict(
  452. input_dict=input_dict,
  453. explore=explore,
  454. timestep=timestep,
  455. episodes=episodes,
  456. **kwargs,
  457. )
  458. @with_lock
  459. @override(Policy)
  460. def compute_log_likelihoods(
  461. self,
  462. actions,
  463. obs_batch,
  464. state_batches=None,
  465. prev_action_batch=None,
  466. prev_reward_batch=None,
  467. actions_normalized=True,
  468. **kwargs,
  469. ):
  470. if action_sampler_fn and action_distribution_fn is None:
  471. raise ValueError(
  472. "Cannot compute log-prob/likelihood w/o an "
  473. "`action_distribution_fn` and a provided "
  474. "`action_sampler_fn`!"
  475. )
  476. seq_lens = tf.ones(len(obs_batch), dtype=tf.int32)
  477. input_batch = SampleBatch(
  478. {SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_batch)},
  479. _is_training=False,
  480. )
  481. if prev_action_batch is not None:
  482. input_batch[SampleBatch.PREV_ACTIONS] = tf.convert_to_tensor(
  483. prev_action_batch
  484. )
  485. if prev_reward_batch is not None:
  486. input_batch[SampleBatch.PREV_REWARDS] = tf.convert_to_tensor(
  487. prev_reward_batch
  488. )
  489. if self.exploration:
  490. # Exploration hook before each forward pass.
  491. self.exploration.before_compute_actions(explore=False)
  492. # Action dist class and inputs are generated via custom function.
  493. if action_distribution_fn:
  494. dist_inputs, dist_class, _ = action_distribution_fn(
  495. self, self.model, input_batch, explore=False, is_training=False
  496. )
  497. # Default log-likelihood calculation.
  498. else:
  499. dist_inputs, _ = self.model(input_batch, state_batches, seq_lens)
  500. dist_class = self.dist_class
  501. action_dist = dist_class(dist_inputs, self.model)
  502. # Normalize actions if necessary.
  503. if not actions_normalized and self.config["normalize_actions"]:
  504. actions = normalize_action(actions, self.action_space_struct)
  505. log_likelihoods = action_dist.logp(actions)
  506. return log_likelihoods
  507. @override(Policy)
  508. def postprocess_trajectory(
  509. self, sample_batch, other_agent_batches=None, episode=None
  510. ):
  511. assert tf.executing_eagerly()
  512. # Call super's postprocess_trajectory first.
  513. sample_batch = EagerTFPolicy.postprocess_trajectory(self, sample_batch)
  514. if postprocess_fn:
  515. return postprocess_fn(self, sample_batch, other_agent_batches, episode)
  516. return sample_batch
  517. @with_lock
  518. @override(Policy)
  519. def learn_on_batch(self, postprocessed_batch):
  520. # Callback handling.
  521. learn_stats = {}
  522. self.callbacks.on_learn_on_batch(
  523. policy=self, train_batch=postprocessed_batch, result=learn_stats
  524. )
  525. pad_batch_to_sequences_of_same_size(
  526. postprocessed_batch,
  527. max_seq_len=self._max_seq_len,
  528. shuffle=False,
  529. batch_divisibility_req=self.batch_divisibility_req,
  530. view_requirements=self.view_requirements,
  531. )
  532. self._is_training = True
  533. postprocessed_batch = self._lazy_tensor_dict(postprocessed_batch)
  534. postprocessed_batch.set_training(True)
  535. stats = self._learn_on_batch_helper(postprocessed_batch)
  536. self.num_grad_updates += 1
  537. stats.update(
  538. {
  539. "custom_metrics": learn_stats,
  540. NUM_AGENT_STEPS_TRAINED: postprocessed_batch.count,
  541. NUM_GRAD_UPDATES_LIFETIME: self.num_grad_updates,
  542. # -1, b/c we have to measure this diff before we do the update
  543. # above.
  544. DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY: (
  545. self.num_grad_updates
  546. - 1
  547. - (postprocessed_batch.num_grad_updates or 0)
  548. ),
  549. }
  550. )
  551. return convert_to_numpy(stats)
  552. @override(Policy)
  553. def compute_gradients(
  554. self, postprocessed_batch: SampleBatch
  555. ) -> Tuple[ModelGradients, Dict[str, TensorType]]:
  556. pad_batch_to_sequences_of_same_size(
  557. postprocessed_batch,
  558. shuffle=False,
  559. max_seq_len=self._max_seq_len,
  560. batch_divisibility_req=self.batch_divisibility_req,
  561. view_requirements=self.view_requirements,
  562. )
  563. self._is_training = True
  564. self._lazy_tensor_dict(postprocessed_batch)
  565. postprocessed_batch.set_training(True)
  566. grads_and_vars, grads, stats = self._compute_gradients_helper(
  567. postprocessed_batch
  568. )
  569. return convert_to_numpy((grads, stats))
  570. @override(Policy)
  571. def apply_gradients(self, gradients: ModelGradients) -> None:
  572. self._apply_gradients_helper(
  573. list(
  574. zip(
  575. [
  576. (tf.convert_to_tensor(g) if g is not None else None)
  577. for g in gradients
  578. ],
  579. self.model.trainable_variables(),
  580. )
  581. )
  582. )
  583. @override(Policy)
  584. def get_weights(self, as_dict=False):
  585. variables = self.variables()
  586. if as_dict:
  587. return {v.name: v.numpy() for v in variables}
  588. return [v.numpy() for v in variables]
  589. @override(Policy)
  590. def set_weights(self, weights):
  591. variables = self.variables()
  592. assert len(weights) == len(variables), (len(weights), len(variables))
  593. for v, w in zip(variables, weights):
  594. v.assign(w)
  595. @override(Policy)
  596. def get_exploration_state(self):
  597. return convert_to_numpy(self.exploration.get_state())
  598. @override(Policy)
  599. def is_recurrent(self):
  600. return self._is_recurrent
  601. @override(Policy)
  602. def num_state_tensors(self):
  603. return len(self._state_inputs)
  604. @override(Policy)
  605. def get_initial_state(self):
  606. if hasattr(self, "model"):
  607. return self.model.get_initial_state()
  608. return []
  609. @override(Policy)
  610. def get_state(self) -> PolicyState:
  611. # Legacy Policy state (w/o keras model and w/o PolicySpec).
  612. state = super().get_state()
  613. state["global_timestep"] = state["global_timestep"].numpy()
  614. if self._optimizer and len(self._optimizer.variables()) > 0:
  615. state["_optimizer_variables"] = self._optimizer.variables()
  616. # Add exploration state.
  617. if self.exploration:
  618. # This is not compatible with RLModules, which have a method
  619. # `forward_exploration` to specify custom exploration behavior.
  620. state["_exploration_state"] = self.exploration.get_state()
  621. return state
  622. @override(Policy)
  623. def set_state(self, state: PolicyState) -> None:
  624. # Set optimizer vars first.
  625. optimizer_vars = state.get("_optimizer_variables", None)
  626. if optimizer_vars and self._optimizer.variables():
  627. if not type(self).__name__.endswith("_traced") and log_once(
  628. "set_state_optimizer_vars_tf_eager_policy_v2"
  629. ):
  630. logger.warning(
  631. "Cannot restore an optimizer's state for tf eager! Keras "
  632. "is not able to save the v1.x optimizers (from "
  633. "tf.compat.v1.train) since they aren't compatible with "
  634. "checkpoints."
  635. )
  636. for opt_var, value in zip(self._optimizer.variables(), optimizer_vars):
  637. opt_var.assign(value)
  638. # Set exploration's state.
  639. if hasattr(self, "exploration") and "_exploration_state" in state:
  640. self.exploration.set_state(state=state["_exploration_state"])
  641. # Restore glbal timestep (tf vars).
  642. self.global_timestep.assign(state["global_timestep"])
  643. # Then the Policy's (NN) weights and connectors.
  644. super().set_state(state)
  645. @override(Policy)
  646. def export_model(self, export_dir, onnx: Optional[int] = None) -> None:
  647. """Exports the Policy's Model to local directory for serving.
  648. Note: Since the TfModelV2 class that EagerTfPolicy uses is-NOT-a
  649. tf.keras.Model, we need to assume that there is a `base_model` property
  650. within this TfModelV2 class that is-a tf.keras.Model. This base model
  651. will be used here for the export.
  652. TODO (kourosh): This restriction will be resolved once we move Policy and
  653. ModelV2 to the new Learner/RLModule APIs.
  654. Args:
  655. export_dir: Local writable directory.
  656. onnx: If given, will export model in ONNX format. The
  657. value of this parameter set the ONNX OpSet version to use.
  658. """
  659. if (
  660. hasattr(self, "model")
  661. and hasattr(self.model, "base_model")
  662. and isinstance(self.model.base_model, tf.keras.Model)
  663. ):
  664. # Store model in ONNX format.
  665. if onnx:
  666. try:
  667. import tf2onnx
  668. except ImportError as e:
  669. raise RuntimeError(
  670. "Converting a TensorFlow model to ONNX requires "
  671. "`tf2onnx` to be installed. Install with "
  672. "`pip install tf2onnx`."
  673. ) from e
  674. model_proto, external_tensor_storage = tf2onnx.convert.from_keras(
  675. self.model.base_model,
  676. output_path=os.path.join(export_dir, "model.onnx"),
  677. )
  678. # Save the tf.keras.Model (architecture and weights, so it can be
  679. # retrieved w/o access to the original (custom) Model or Policy code).
  680. else:
  681. try:
  682. self.model.base_model.save(export_dir, save_format="tf")
  683. except Exception:
  684. logger.warning(ERR_MSG_TF_POLICY_CANNOT_SAVE_KERAS_MODEL)
  685. else:
  686. logger.warning(ERR_MSG_TF_POLICY_CANNOT_SAVE_KERAS_MODEL)
  687. def variables(self):
  688. """Return the list of all savable variables for this policy."""
  689. if isinstance(self.model, tf.keras.Model):
  690. return self.model.variables
  691. else:
  692. return self.model.variables()
  693. def loss_initialized(self):
  694. return self._loss_initialized
  695. @with_lock
  def _compute_actions_helper(
      self, input_dict, state_batches, episodes, explore, timestep
  ):
      """Computes actions, state-outs and extra fetches for one input batch.

      Depending on what was configured, actions come from (in this order):
      a custom `action_sampler_fn`, a custom `action_distribution_fn`, a
      native keras model, or a call to `self.model(...)` followed by the
      Exploration object.

      Args:
          input_dict: The input batch; must contain `SampleBatch.OBS`.
          state_batches: List of RNN state inputs (may be empty/None).
          episodes: Passed through to `action_sampler_fn` only.
          explore: Whether to explore; forwarded to the sampler/exploration.
          timestep: The current timestep; forwarded like `explore`.

      Returns:
          Tuple of `(actions, state_out, extra_fetches)`.
      """
      # Increase the tracing counter to make sure we don't re-trace too
      # often. If eager_tracing=True, this counter should only get
      # incremented during the @tf.function trace operations, never when
      # calling the already traced function after that.
      self._re_trace_counter += 1

      # Calculate RNN sequence lengths (all 1s; only if states are given).
      batch_size = tree.flatten(input_dict[SampleBatch.OBS])[0].shape[0]
      seq_lens = tf.ones(batch_size, dtype=tf.int32) if state_batches else None

      # Add default and custom fetches.
      extra_fetches = {}

      # Use Exploration object.
      with tf.variable_creator_scope(_disallow_var_creation):
          if action_sampler_fn:
              action_sampler_outputs = action_sampler_fn(
                  self,
                  self.model,
                  input_dict[SampleBatch.CUR_OBS],
                  explore=explore,
                  timestep=timestep,
                  episodes=episodes,
              )
              # The sampler may return a 4-tuple or only (actions, logp).
              if len(action_sampler_outputs) == 4:
                  actions, logp, dist_inputs, state_out = action_sampler_outputs
              else:
                  dist_inputs = None
                  state_out = []
                  actions, logp = action_sampler_outputs
          else:
              if action_distribution_fn:
                  # Try new action_distribution_fn signature, supporting
                  # state_batches and seq_lens.
                  try:
                      (
                          dist_inputs,
                          self.dist_class,
                          state_out,
                      ) = action_distribution_fn(
                          self,
                          self.model,
                          input_dict=input_dict,
                          state_batches=state_batches,
                          seq_lens=seq_lens,
                          explore=explore,
                          timestep=timestep,
                          is_training=False,
                      )
                  # Trying the old way (to stay backward compatible).
                  # TODO: Remove in future.
                  except TypeError as e:
                      # Match on `str(e)` rather than `e.args[0]`: the latter
                      # fails with an IndexError (no args) or a secondary
                      # TypeError (non-str first arg) and would hide the
                      # actual error raised by the user's function.
                      err_msg = str(e)
                      if (
                          "positional argument" in err_msg
                          or "unexpected keyword argument" in err_msg
                      ):
                          (
                              dist_inputs,
                              self.dist_class,
                              state_out,
                          ) = action_distribution_fn(
                              self,
                              self.model,
                              input_dict[SampleBatch.OBS],
                              explore=explore,
                              timestep=timestep,
                              is_training=False,
                          )
                      else:
                          # Not a signature mismatch: re-raise unchanged.
                          raise

              elif isinstance(self.model, tf.keras.Model):
                  input_dict = SampleBatch(input_dict, seq_lens=seq_lens)
                  # Add the state inputs under `state_in_{i}` keys, unless
                  # the batch already carries them.
                  if state_batches and "state_in_0" not in input_dict:
                      for i, s in enumerate(state_batches):
                          input_dict[f"state_in_{i}"] = s
                  self._lazy_tensor_dict(input_dict)
                  dist_inputs, state_out, extra_fetches = self.model(input_dict)
              else:
                  dist_inputs, state_out = self.model(
                      input_dict, state_batches, seq_lens
                  )

              action_dist = self.dist_class(dist_inputs, self.model)

              # Get the exploration action from the forward results.
              actions, logp = self.exploration.get_exploration_action(
                  action_distribution=action_dist,
                  timestep=timestep,
                  explore=explore,
              )

      # Action-logp and action-prob.
      if logp is not None:
          extra_fetches[SampleBatch.ACTION_PROB] = tf.exp(logp)
          extra_fetches[SampleBatch.ACTION_LOGP] = logp
      # Action-dist inputs.
      if dist_inputs is not None:
          extra_fetches[SampleBatch.ACTION_DIST_INPUTS] = dist_inputs
      # Custom extra fetches.
      if extra_action_out_fn:
          extra_fetches.update(extra_action_out_fn(self))

      return actions, state_out, extra_fetches
  795. # TODO: Figure out, why _ray_trace_ctx=None helps to prevent a crash in
  796. # AlphaStar w/ framework=tf2; eager_tracing=True on the policy learner actors.
  797. # It seems there may be a clash between the traced-by-tf function and the
  798. # traced-by-ray functions (for making the policy class a ray actor).
  def _learn_on_batch_helper(self, samples, _ray_trace_ctx=None):
      """Runs one update on `samples`: compute gradients, then apply them.

      Args:
          samples: The train batch, forwarded to `_compute_gradients_helper`.
          _ray_trace_ctx: Not used inside this method.

      Returns:
          The stats returned by `_compute_gradients_helper`.
      """
      # Bump the tracing counter so that excessive re-tracing can be
      # detected. With eager_tracing=True, this line should only run while
      # @tf.function traces this method, not on calls of the traced function.
      self._re_trace_counter += 1

      # Gradients are computed under the `_disallow_var_creation` creator
      # scope; applying them happens outside of that scope.
      with tf.variable_creator_scope(_disallow_var_creation):
          gv_pairs, _, learn_stats = self._compute_gradients_helper(samples)
      self._apply_gradients_helper(gv_pairs)
      return learn_stats
  def _get_is_training_placeholder(self):
      """Return `self._is_training` converted to a tf tensor."""
      is_training_tensor = tf.convert_to_tensor(self._is_training)
      return is_training_tensor
  811. @with_lock
  def _compute_gradients_helper(self, samples):
      """Computes and returns grads as eager tensors.

      Args:
          samples: The train batch, passed on to the loss and stats functions.

      Returns:
          Tuple of `(grads_and_vars, grads, stats)`. If the config flag
          `_tf_policy_handles_more_than_one_loss` is set, `grads_and_vars` is
          a list (len=num optimizers/losses) of lists of (grad, var) tuples
          and `grads` a list of lists of grads. Otherwise, both are flat
          lists for the first loss only. `stats` is the result of
          `self._stats()`.
      """
      # Increase the tracing counter to make sure we don't re-trace too
      # often. If eager_tracing=True, this counter should only get
      # incremented during the @tf.function trace operations, never when
      # calling the already traced function after that.
      self._re_trace_counter += 1

      # Gather all variables for which to calculate losses.
      if isinstance(self.model, tf.keras.Model):
          variables = self.model.trainable_variables
      else:
          variables = self.model.trainable_variables()

      multi_loss = self.config["_tf_policy_handles_more_than_one_loss"]

      # Calculate the loss(es) inside a tf GradientTape.
      # The tape must be persistent whenever `gradient()` may be called more
      # than once on it: by a custom `compute_gradients_fn`, or by the
      # default path below, which calls it once per loss. A non-persistent
      # tape raises a RuntimeError on the second `gradient()` call.
      persistent_tape = compute_gradients_fn is not None or bool(multi_loss)
      with tf.GradientTape(persistent=persistent_tape) as tape:
          losses = self._loss(self, self.model, self.dist_class, samples)
      losses = force_list(losses)

      # User provided a compute_gradients_fn.
      if compute_gradients_fn:
          # Wrap our tape inside a wrapper, such that the resulting
          # object looks like a "classic" tf.optimizer. This way, custom
          # compute_gradients_fn will work on both tf static graph
          # and tf-eager.
          optimizer = _OptimizerWrapper(tape)
          # More than one loss terms/optimizers.
          if multi_loss:
              grads_and_vars = compute_gradients_fn(
                  self, [optimizer] * len(losses), losses
              )
          # Only one loss and one optimizer.
          else:
              grads_and_vars = [compute_gradients_fn(self, optimizer, losses[0])]
      # Default: Compute gradients using the above tape.
      else:
          grads_and_vars = [
              list(zip(tape.gradient(loss, variables), variables))
              for loss in losses
          ]

      # Log (only once) which variables actually receive a gradient.
      if log_once("grad_vars"):
          for g_and_v in grads_and_vars:
              for g, v in g_and_v:
                  if g is not None:
                      logger.info(f"Optimizing variable {v.name}")

      # `grads_and_vars` is returned a list (len=num optimizers/losses)
      # of lists of (grad, var) tuples.
      if multi_loss:
          grads = [[g for g, _ in g_and_v] for g_and_v in grads_and_vars]
      # `grads_and_vars` is returned as a list of (grad, var) tuples.
      else:
          grads_and_vars = grads_and_vars[0]
          grads = [g for g, _ in grads_and_vars]

      stats = self._stats(self, samples, grads)
      return grads_and_vars, grads, stats
  def _apply_gradients_helper(self, grads_and_vars):
      """Applies the given gradients via a custom fn or the optimizer(s).

      Args:
          grads_and_vars: With `_tf_policy_handles_more_than_one_loss` set,
              a list (indexed like `self._optimizers`) of lists of
              (grad, var) tuples; otherwise a single list of (grad, var)
              tuples.
      """
      # Bump the tracing counter so that excessive re-tracing can be
      # detected. With eager_tracing=True, this line should only run while
      # @tf.function traces this method, not on calls of the traced function.
      self._re_trace_counter += 1

      handles_multiple_losses = self.config["_tf_policy_handles_more_than_one_loss"]

      # A user-provided apply fn takes care of everything.
      if apply_gradients_fn:
          optimizers = (
              self._optimizers if handles_multiple_losses else self._optimizer
          )
          apply_gradients_fn(self, optimizers, grads_and_vars)
          return

      # Default: hand all (grad, var) pairs with an actual gradient to the
      # optimizer(s); pairs whose grad is None are dropped.
      if handles_multiple_losses:
          for idx, optim in enumerate(self._optimizers):
              optim.apply_gradients(
                  [
                      (grad, var)
                      for grad, var in grads_and_vars[idx]
                      if grad is not None
                  ]
              )
          return

      self._optimizer.apply_gradients(
          [(grad, var) for grad, var in grads_and_vars if grad is not None]
      )
  def _stats(self, outputs, samples, grads):
      """Assembles the fetches dict returned from a learning step.

      Args:
          outputs: First argument handed to `stats_fn`.
          samples: The train batch, handed to `stats_fn` and `grad_stats_fn`.
          grads: The computed gradients, handed to `grad_stats_fn`.

      Returns:
          A dict holding the learner stats under `LEARNER_STATS_KEY` (empty
          dict if there is no `stats_fn`), updated with the results of
          `extra_learn_fetches_fn` and `grad_stats_fn`, where provided.
      """
      learner_stats = dict(stats_fn(outputs, samples)) if stats_fn else {}
      fetches = {LEARNER_STATS_KEY: learner_stats}

      # Optional extras; later updates overwrite equally named earlier keys.
      if extra_learn_fetches_fn:
          fetches.update(dict(extra_learn_fetches_fn(self)))
      if grad_stats_fn:
          fetches.update(dict(grad_stats_fn(self, samples, grads)))

      return fetches
  def _lazy_tensor_dict(self, postprocessed_batch: SampleBatch):
      """Installs `_convert_to_tf` as the get-interceptor of the batch.

      Args:
          postprocessed_batch: The batch to prepare. Anything that is not a
              SampleBatch gets wrapped into a new SampleBatch first.

      Returns:
          The SampleBatch with the interceptor set (the same object that was
          passed in, if that was a SampleBatch already).
      """
      batch = postprocessed_batch
      # TODO: (sven): Keep for a while to ensure backward compatibility.
      if not isinstance(batch, SampleBatch):
          batch = SampleBatch(batch)
      batch.set_get_interceptor(_convert_to_tf)
      return batch
  902. @classmethod
  def with_tracing(cls):
      """Return the result of passing this class to `_traced_eager_policy`."""
      traced_cls = _traced_eager_policy(cls)
      return traced_cls
  905. eager_policy_cls.__name__ = name + "_eager"
  906. eager_policy_cls.__qualname__ = name + "_eager"
  907. return eager_policy_cls