vector_env.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542
  1. import logging
  2. from typing import Callable, List, Optional, Set, Tuple, Union
  3. import gymnasium as gym
  4. import numpy as np
  5. from ray.rllib.env.base_env import _DUMMY_AGENT_ID, BaseEnv
  6. from ray.rllib.utils.annotations import Deprecated, OldAPIStack, override
  7. from ray.rllib.utils.typing import (
  8. AgentID,
  9. EnvActionType,
  10. EnvID,
  11. EnvInfoDict,
  12. EnvObsType,
  13. EnvType,
  14. MultiEnvDict,
  15. )
  16. from ray.util import log_once
  17. logger = logging.getLogger(__name__)
  18. @OldAPIStack
  19. class VectorEnv:
  20. """An environment that supports batch evaluation using clones of sub-envs."""
  21. def __init__(
  22. self, observation_space: gym.Space, action_space: gym.Space, num_envs: int
  23. ):
  24. """Initializes a VectorEnv instance.
  25. Args:
  26. observation_space: The observation Space of a single
  27. sub-env.
  28. action_space: The action Space of a single sub-env.
  29. num_envs: The number of clones to make of the given sub-env.
  30. """
  31. self.observation_space = observation_space
  32. self.action_space = action_space
  33. self.num_envs = num_envs
  34. @staticmethod
  35. def vectorize_gym_envs(
  36. make_env: Optional[Callable[[int], EnvType]] = None,
  37. existing_envs: Optional[List[gym.Env]] = None,
  38. num_envs: int = 1,
  39. action_space: Optional[gym.Space] = None,
  40. observation_space: Optional[gym.Space] = None,
  41. restart_failed_sub_environments: bool = False,
  42. # Deprecated. These seem to have never been used.
  43. env_config=None,
  44. policy_config=None,
  45. ) -> "_VectorizedGymEnv":
  46. """Translates any given gym.Env(s) into a VectorizedEnv object.
  47. Args:
  48. make_env: Factory that produces a new gym.Env taking the sub-env's
  49. vector index as only arg. Must be defined if the
  50. number of `existing_envs` is less than `num_envs`.
  51. existing_envs: Optional list of already instantiated sub
  52. environments.
  53. num_envs: Total number of sub environments in this VectorEnv.
  54. action_space: The action space. If None, use existing_envs[0]'s
  55. action space.
  56. observation_space: The observation space. If None, use
  57. existing_envs[0]'s observation space.
  58. restart_failed_sub_environments: If True and any sub-environment (within
  59. a vectorized env) throws any error during env stepping, the
  60. Sampler will try to restart the faulty sub-environment. This is done
  61. without disturbing the other (still intact) sub-environment and without
  62. the RolloutWorker crashing.
  63. Returns:
  64. The resulting _VectorizedGymEnv object (subclass of VectorEnv).
  65. """
  66. return _VectorizedGymEnv(
  67. make_env=make_env,
  68. existing_envs=existing_envs or [],
  69. num_envs=num_envs,
  70. observation_space=observation_space,
  71. action_space=action_space,
  72. restart_failed_sub_environments=restart_failed_sub_environments,
  73. )
  74. def vector_reset(
  75. self, *, seeds: Optional[List[int]] = None, options: Optional[List[dict]] = None
  76. ) -> Tuple[List[EnvObsType], List[EnvInfoDict]]:
  77. """Resets all sub-environments.
  78. Args:
  79. seed: The list of seeds to be passed to the sub-environments' when resetting
  80. them. If None, will not reset any existing PRNGs. If you pass
  81. integers, the PRNGs will be reset even if they already exists.
  82. options: The list of options dicts to be passed to the sub-environments'
  83. when resetting them.
  84. Returns:
  85. Tuple consitsing of a list of observations from each environment and
  86. a list of info dicts from each environment.
  87. """
  88. raise NotImplementedError
  89. def reset_at(
  90. self,
  91. index: Optional[int] = None,
  92. *,
  93. seed: Optional[int] = None,
  94. options: Optional[dict] = None,
  95. ) -> Union[Tuple[EnvObsType, EnvInfoDict], Exception]:
  96. """Resets a single sub-environment.
  97. Args:
  98. index: An optional sub-env index to reset.
  99. seed: The seed to be passed to the sub-environment at index `index` when
  100. resetting it. If None, will not reset any existing PRNG. If you pass an
  101. integer, the PRNG will be reset even if it already exists.
  102. options: An options dict to be passed to the sub-environment at index
  103. `index` when resetting it.
  104. Returns:
  105. Tuple consisting of observations from the reset sub environment and
  106. an info dict of the reset sub environment. Alternatively an Exception
  107. can be returned, indicating that the reset operation on the sub environment
  108. has failed (and why it failed).
  109. """
  110. raise NotImplementedError
  111. def restart_at(self, index: Optional[int] = None) -> None:
  112. """Restarts a single sub-environment.
  113. Args:
  114. index: An optional sub-env index to restart.
  115. """
  116. raise NotImplementedError
  117. def vector_step(
  118. self, actions: List[EnvActionType]
  119. ) -> Tuple[
  120. List[EnvObsType], List[float], List[bool], List[bool], List[EnvInfoDict]
  121. ]:
  122. """Performs a vectorized step on all sub environments using `actions`.
  123. Args:
  124. actions: List of actions (one for each sub-env).
  125. Returns:
  126. A tuple consisting of
  127. 1) New observations for each sub-env.
  128. 2) Reward values for each sub-env.
  129. 3) Terminated values for each sub-env.
  130. 4) Truncated values for each sub-env.
  131. 5) Info values for each sub-env.
  132. """
  133. raise NotImplementedError
  134. def get_sub_environments(self) -> List[EnvType]:
  135. """Returns the underlying sub environments.
  136. Returns:
  137. List of all underlying sub environments.
  138. """
  139. return []
  140. # TODO: (sven) Experimental method. Make @PublicAPI at some point.
  141. def try_render_at(self, index: Optional[int] = None) -> Optional[np.ndarray]:
  142. """Renders a single environment.
  143. Args:
  144. index: An optional sub-env index to render.
  145. Returns:
  146. Either a numpy RGB image (shape=(w x h x 3) dtype=uint8) or
  147. None in case rendering is handled directly by this method.
  148. """
  149. pass
  150. def to_base_env(
  151. self,
  152. make_env: Optional[Callable[[int], EnvType]] = None,
  153. num_envs: int = 1,
  154. remote_envs: bool = False,
  155. remote_env_batch_wait_ms: int = 0,
  156. restart_failed_sub_environments: bool = False,
  157. ) -> "BaseEnv":
  158. """Converts an RLlib MultiAgentEnv into a BaseEnv object.
  159. The resulting BaseEnv is always vectorized (contains n
  160. sub-environments) to support batched forward passes, where n may
  161. also be 1. BaseEnv also supports async execution via the `poll` and
  162. `send_actions` methods and thus supports external simulators.
  163. Args:
  164. make_env: A callable taking an int as input (which indicates
  165. the number of individual sub-environments within the final
  166. vectorized BaseEnv) and returning one individual
  167. sub-environment.
  168. num_envs: The number of sub-environments to create in the
  169. resulting (vectorized) BaseEnv. The already existing `env`
  170. will be one of the `num_envs`.
  171. remote_envs: Whether each sub-env should be a @ray.remote
  172. actor. You can set this behavior in your config via the
  173. `remote_worker_envs=True` option.
  174. remote_env_batch_wait_ms: The wait time (in ms) to poll remote
  175. sub-environments for, if applicable. Only used if
  176. `remote_envs` is True.
  177. Returns:
  178. The resulting BaseEnv object.
  179. """
  180. env = VectorEnvWrapper(self)
  181. return env
  182. @Deprecated(new="vectorize_gym_envs", error=True)
  183. def wrap(self, *args, **kwargs) -> "_VectorizedGymEnv":
  184. pass
  185. @Deprecated(new="get_sub_environments", error=True)
  186. def get_unwrapped(self) -> List[EnvType]:
  187. pass
  188. @OldAPIStack
  189. class _VectorizedGymEnv(VectorEnv):
  190. """Internal wrapper to translate any gym.Envs into a VectorEnv object."""
  191. def __init__(
  192. self,
  193. make_env: Optional[Callable[[int], EnvType]] = None,
  194. existing_envs: Optional[List[gym.Env]] = None,
  195. num_envs: int = 1,
  196. *,
  197. observation_space: Optional[gym.Space] = None,
  198. action_space: Optional[gym.Space] = None,
  199. restart_failed_sub_environments: bool = False,
  200. # Deprecated. These seem to have never been used.
  201. env_config=None,
  202. policy_config=None,
  203. ):
  204. """Initializes a _VectorizedGymEnv object.
  205. Args:
  206. make_env: Factory that produces a new gym.Env taking the sub-env's
  207. vector index as only arg. Must be defined if the
  208. number of `existing_envs` is less than `num_envs`.
  209. existing_envs: Optional list of already instantiated sub
  210. environments.
  211. num_envs: Total number of sub environments in this VectorEnv.
  212. action_space: The action space. If None, use existing_envs[0]'s
  213. action space.
  214. observation_space: The observation space. If None, use
  215. existing_envs[0]'s observation space.
  216. restart_failed_sub_environments: If True and any sub-environment (within
  217. a vectorized env) throws any error during env stepping, we will try to
  218. restart the faulty sub-environment. This is done
  219. without disturbing the other (still intact) sub-environments.
  220. """
  221. self.envs = existing_envs
  222. self.make_env = make_env
  223. self.restart_failed_sub_environments = restart_failed_sub_environments
  224. # Fill up missing envs (so we have exactly num_envs sub-envs in this
  225. # VectorEnv.
  226. while len(self.envs) < num_envs:
  227. self.envs.append(make_env(len(self.envs)))
  228. super().__init__(
  229. observation_space=observation_space or self.envs[0].observation_space,
  230. action_space=action_space or self.envs[0].action_space,
  231. num_envs=num_envs,
  232. )
  233. @override(VectorEnv)
  234. def vector_reset(
  235. self, *, seeds: Optional[List[int]] = None, options: Optional[List[dict]] = None
  236. ) -> Tuple[List[EnvObsType], List[EnvInfoDict]]:
  237. seeds = seeds or [None] * self.num_envs
  238. options = options or [None] * self.num_envs
  239. # Use reset_at(index) to restart and retry until
  240. # we successfully create a new env.
  241. resetted_obs = []
  242. resetted_infos = []
  243. for i in range(len(self.envs)):
  244. while True:
  245. obs, infos = self.reset_at(i, seed=seeds[i], options=options[i])
  246. if not isinstance(obs, Exception):
  247. break
  248. resetted_obs.append(obs)
  249. resetted_infos.append(infos)
  250. return resetted_obs, resetted_infos
  251. @override(VectorEnv)
  252. def reset_at(
  253. self,
  254. index: Optional[int] = None,
  255. *,
  256. seed: Optional[int] = None,
  257. options: Optional[dict] = None,
  258. ) -> Tuple[Union[EnvObsType, Exception], Union[EnvInfoDict, Exception]]:
  259. if index is None:
  260. index = 0
  261. try:
  262. obs_and_infos = self.envs[index].reset(seed=seed, options=options)
  263. except Exception as e:
  264. if self.restart_failed_sub_environments:
  265. logger.exception(e.args[0])
  266. self.restart_at(index)
  267. obs_and_infos = e, {}
  268. else:
  269. raise e
  270. return obs_and_infos
  271. @override(VectorEnv)
  272. def restart_at(self, index: Optional[int] = None) -> None:
  273. if index is None:
  274. index = 0
  275. # Try closing down the old (possibly faulty) sub-env, but ignore errors.
  276. try:
  277. self.envs[index].close()
  278. except Exception as e:
  279. if log_once("close_sub_env"):
  280. logger.warning(
  281. "Trying to close old and replaced sub-environment (at vector "
  282. f"index={index}), but closing resulted in error:\n{e}"
  283. )
  284. env_to_del = self.envs[index]
  285. self.envs[index] = None
  286. del env_to_del
  287. # Re-create the sub-env at the new index.
  288. logger.warning(f"Trying to restart sub-environment at index {index}.")
  289. self.envs[index] = self.make_env(index)
  290. logger.warning(f"Sub-environment at index {index} restarted successfully.")
  291. @override(VectorEnv)
  292. def vector_step(
  293. self, actions: List[EnvActionType]
  294. ) -> Tuple[
  295. List[EnvObsType], List[float], List[bool], List[bool], List[EnvInfoDict]
  296. ]:
  297. obs_batch, reward_batch, terminated_batch, truncated_batch, info_batch = (
  298. [],
  299. [],
  300. [],
  301. [],
  302. [],
  303. )
  304. for i in range(self.num_envs):
  305. try:
  306. results = self.envs[i].step(actions[i])
  307. except Exception as e:
  308. if self.restart_failed_sub_environments:
  309. logger.exception(e.args[0])
  310. self.restart_at(i)
  311. results = e, 0.0, True, True, {}
  312. else:
  313. raise e
  314. obs, reward, terminated, truncated, info = results
  315. if not isinstance(info, dict):
  316. raise ValueError(
  317. "Info should be a dict, got {} ({})".format(info, type(info))
  318. )
  319. obs_batch.append(obs)
  320. reward_batch.append(reward)
  321. terminated_batch.append(terminated)
  322. truncated_batch.append(truncated)
  323. info_batch.append(info)
  324. return obs_batch, reward_batch, terminated_batch, truncated_batch, info_batch
  325. @override(VectorEnv)
  326. def get_sub_environments(self) -> List[EnvType]:
  327. return self.envs
  328. @override(VectorEnv)
  329. def try_render_at(self, index: Optional[int] = None):
  330. if index is None:
  331. index = 0
  332. return self.envs[index].render()
  333. @OldAPIStack
  334. class VectorEnvWrapper(BaseEnv):
  335. """Internal adapter of VectorEnv to BaseEnv.
  336. We assume the caller will always send the full vector of actions in each
  337. call to send_actions(), and that they call reset_at() on all completed
  338. environments before calling send_actions().
  339. """
  340. def __init__(self, vector_env: VectorEnv):
  341. self.vector_env = vector_env
  342. self.num_envs = vector_env.num_envs
  343. self._observation_space = vector_env.observation_space
  344. self._action_space = vector_env.action_space
  345. # Sub-environments' states.
  346. self.new_obs = None
  347. self.cur_rewards = None
  348. self.cur_terminateds = None
  349. self.cur_truncateds = None
  350. self.cur_infos = None
  351. # At first `poll()`, reset everything (all sub-environments).
  352. self.first_reset_done = False
  353. # Initialize sub-environments' state.
  354. self._init_env_state(idx=None)
  355. @override(BaseEnv)
  356. def poll(
  357. self,
  358. ) -> Tuple[
  359. MultiEnvDict,
  360. MultiEnvDict,
  361. MultiEnvDict,
  362. MultiEnvDict,
  363. MultiEnvDict,
  364. MultiEnvDict,
  365. ]:
  366. from ray.rllib.env.base_env import with_dummy_agent_id
  367. if not self.first_reset_done:
  368. self.first_reset_done = True
  369. # TODO(sven): We probably would like to seed this call here as well.
  370. self.new_obs, self.cur_infos = self.vector_env.vector_reset()
  371. new_obs = dict(enumerate(self.new_obs))
  372. rewards = dict(enumerate(self.cur_rewards))
  373. terminateds = dict(enumerate(self.cur_terminateds))
  374. truncateds = dict(enumerate(self.cur_truncateds))
  375. infos = dict(enumerate(self.cur_infos))
  376. # Empty all states (in case `poll()` gets called again).
  377. self.new_obs = []
  378. self.cur_rewards = []
  379. self.cur_terminateds = []
  380. self.cur_truncateds = []
  381. self.cur_infos = []
  382. return (
  383. with_dummy_agent_id(new_obs),
  384. with_dummy_agent_id(rewards),
  385. with_dummy_agent_id(terminateds, "__all__"),
  386. with_dummy_agent_id(truncateds, "__all__"),
  387. with_dummy_agent_id(infos),
  388. {},
  389. )
  390. @override(BaseEnv)
  391. def send_actions(self, action_dict: MultiEnvDict) -> None:
  392. from ray.rllib.env.base_env import _DUMMY_AGENT_ID
  393. action_vector = [None] * self.num_envs
  394. for i in range(self.num_envs):
  395. action_vector[i] = action_dict[i][_DUMMY_AGENT_ID]
  396. (
  397. self.new_obs,
  398. self.cur_rewards,
  399. self.cur_terminateds,
  400. self.cur_truncateds,
  401. self.cur_infos,
  402. ) = self.vector_env.vector_step(action_vector)
  403. @override(BaseEnv)
  404. def try_reset(
  405. self,
  406. env_id: Optional[EnvID] = None,
  407. *,
  408. seed: Optional[int] = None,
  409. options: Optional[dict] = None,
  410. ) -> Tuple[MultiEnvDict, MultiEnvDict]:
  411. from ray.rllib.env.base_env import _DUMMY_AGENT_ID
  412. if env_id is None:
  413. env_id = 0
  414. assert isinstance(env_id, int)
  415. obs, infos = self.vector_env.reset_at(env_id, seed=seed, options=options)
  416. # If exceptions were returned, return MultiEnvDict mapping env indices to
  417. # these exceptions (for obs and infos).
  418. if isinstance(obs, Exception):
  419. return {env_id: obs}, {env_id: infos}
  420. # Otherwise, return a MultiEnvDict (with single agent ID) and the actual
  421. # obs and info dicts.
  422. else:
  423. return {env_id: {_DUMMY_AGENT_ID: obs}}, {env_id: {_DUMMY_AGENT_ID: infos}}
  424. @override(BaseEnv)
  425. def try_restart(self, env_id: Optional[EnvID] = None) -> None:
  426. assert env_id is None or isinstance(env_id, int)
  427. # Restart the sub-env at the index.
  428. self.vector_env.restart_at(env_id)
  429. # Auto-reset (get ready for next `poll()`).
  430. self._init_env_state(env_id)
  431. @override(BaseEnv)
  432. def get_sub_environments(self, as_dict: bool = False) -> Union[List[EnvType], dict]:
  433. if not as_dict:
  434. return self.vector_env.get_sub_environments()
  435. else:
  436. return dict(enumerate(self.vector_env.get_sub_environments()))
  437. @override(BaseEnv)
  438. def try_render(self, env_id: Optional[EnvID] = None) -> None:
  439. assert env_id is None or isinstance(env_id, int)
  440. return self.vector_env.try_render_at(env_id)
  441. @property
  442. @override(BaseEnv)
  443. def observation_space(self) -> gym.Space:
  444. return self._observation_space
  445. @property
  446. @override(BaseEnv)
  447. def action_space(self) -> gym.Space:
  448. return self._action_space
  449. @override(BaseEnv)
  450. def get_agent_ids(self) -> Set[AgentID]:
  451. return {_DUMMY_AGENT_ID}
  452. def _init_env_state(self, idx: Optional[int] = None) -> None:
  453. """Resets all or one particular sub-environment's state (by index).
  454. Args:
  455. idx: The index to reset at. If None, reset all the sub-environments' states.
  456. """
  457. # If index is None, reset all sub-envs' states:
  458. if idx is None:
  459. self.new_obs = [None for _ in range(self.num_envs)]
  460. self.cur_rewards = [0.0 for _ in range(self.num_envs)]
  461. self.cur_terminateds = [False for _ in range(self.num_envs)]
  462. self.cur_truncateds = [False for _ in range(self.num_envs)]
  463. self.cur_infos = [{} for _ in range(self.num_envs)]
  464. # Index provided, reset only the sub-env's state at the given index.
  465. else:
  466. self.new_obs[idx], self.cur_infos[idx] = self.vector_env.reset_at(idx)
  467. # Reset all other states to null values.
  468. self.cur_rewards[idx] = 0.0
  469. self.cur_terminateds[idx] = False
  470. self.cur_truncateds[idx] = False