_dask.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. from __future__ import absolute_import, division, print_function
  2. import asyncio
  3. import concurrent.futures
  4. import contextlib
  5. import time
  6. import weakref
  7. from uuid import uuid4
  8. from ._utils import (
  9. _retrieve_traceback_capturing_wrapped_call,
  10. _TracebackCapturingWrapper,
  11. )
  12. from .parallel import AutoBatchingMixin, ParallelBackendBase, parallel_config
  13. try:
  14. import dask
  15. import distributed
  16. except ImportError:
  17. dask = None
  18. distributed = None
  19. if dask is not None and distributed is not None:
  20. from dask.distributed import (
  21. Client,
  22. as_completed,
  23. get_client,
  24. rejoin,
  25. secede,
  26. )
  27. from dask.sizeof import sizeof
  28. from dask.utils import funcname
  29. from distributed.utils import thread_state
  30. try:
  31. # asyncio.TimeoutError, Python3-only error thrown by recent versions of
  32. # distributed
  33. from distributed.utils import TimeoutError as _TimeoutError
  34. except ImportError:
  35. from tornado.gen import TimeoutError as _TimeoutError
  36. def is_weakrefable(obj):
  37. try:
  38. weakref.ref(obj)
  39. return True
  40. except TypeError:
  41. return False
  42. class _WeakKeyDictionary:
  43. """A variant of weakref.WeakKeyDictionary for unhashable objects.
  44. This datastructure is used to store futures for broadcasted data objects
  45. such as large numpy arrays or pandas dataframes that are not hashable and
  46. therefore cannot be used as keys of traditional python dicts.
  47. Furthermore using a dict with id(array) as key is not safe because the
  48. Python is likely to reuse id of recently collected arrays.
  49. """
  50. def __init__(self):
  51. self._data = {}
  52. def __getitem__(self, obj):
  53. ref, val = self._data[id(obj)]
  54. if ref() is not obj:
  55. # In case of a race condition with on_destroy.
  56. raise KeyError(obj)
  57. return val
  58. def __setitem__(self, obj, value):
  59. key = id(obj)
  60. try:
  61. ref, _ = self._data[key]
  62. if ref() is not obj:
  63. # In case of race condition with on_destroy.
  64. raise KeyError(obj)
  65. except KeyError:
  66. # Insert the new entry in the mapping along with a weakref
  67. # callback to automatically delete the entry from the mapping
  68. # as soon as the object used as key is garbage collected.
  69. def on_destroy(_):
  70. del self._data[key]
  71. ref = weakref.ref(obj, on_destroy)
  72. self._data[key] = ref, value
  73. def __len__(self):
  74. return len(self._data)
  75. def clear(self):
  76. self._data.clear()
  77. def _funcname(x):
  78. try:
  79. if isinstance(x, list):
  80. x = x[0][0]
  81. except Exception:
  82. pass
  83. return funcname(x)
  84. def _make_tasks_summary(tasks):
  85. """Summarize of list of (func, args, kwargs) function calls"""
  86. unique_funcs = {func for func, args, kwargs in tasks}
  87. if len(unique_funcs) == 1:
  88. mixed = False
  89. else:
  90. mixed = True
  91. return len(tasks), mixed, _funcname(tasks)
  92. class Batch:
  93. """dask-compatible wrapper that executes a batch of tasks"""
  94. def __init__(self, tasks):
  95. # collect some metadata from the tasks to ease Batch calls
  96. # introspection when debugging
  97. self._num_tasks, self._mixed, self._funcname = _make_tasks_summary(tasks)
  98. def __call__(self, tasks=None):
  99. results = []
  100. with parallel_config(backend="dask"):
  101. for func, args, kwargs in tasks:
  102. results.append(func(*args, **kwargs))
  103. return results
  104. def __repr__(self):
  105. descr = f"batch_of_{self._funcname}_{self._num_tasks}_calls"
  106. if self._mixed:
  107. descr = "mixed_" + descr
  108. return descr
  109. def _joblib_probe_task():
  110. # Noop used by the joblib connector to probe when workers are ready.
  111. pass
  112. class DaskDistributedBackend(AutoBatchingMixin, ParallelBackendBase):
  113. MIN_IDEAL_BATCH_DURATION = 0.2
  114. MAX_IDEAL_BATCH_DURATION = 1.0
  115. supports_retrieve_callback = True
  116. default_n_jobs = -1
  117. def __init__(
  118. self,
  119. scheduler_host=None,
  120. scatter=None,
  121. client=None,
  122. loop=None,
  123. wait_for_workers_timeout=10,
  124. **submit_kwargs,
  125. ):
  126. super().__init__()
  127. if distributed is None:
  128. msg = (
  129. "You are trying to use 'dask' as a joblib parallel backend "
  130. "but dask is not installed. Please install dask "
  131. "to fix this error."
  132. )
  133. raise ValueError(msg)
  134. if client is None:
  135. if scheduler_host:
  136. client = Client(scheduler_host, loop=loop, set_as_default=False)
  137. else:
  138. try:
  139. client = get_client()
  140. except ValueError as e:
  141. msg = (
  142. "To use Joblib with Dask first create a Dask Client"
  143. "\n\n"
  144. " from dask.distributed import Client\n"
  145. " client = Client()\n"
  146. "or\n"
  147. " client = Client('scheduler-address:8786')"
  148. )
  149. raise ValueError(msg) from e
  150. self.client = client
  151. if scatter is not None and not isinstance(scatter, (list, tuple)):
  152. raise TypeError(
  153. "scatter must be a list/tuple, got `%s`" % type(scatter).__name__
  154. )
  155. if scatter is not None and len(scatter) > 0:
  156. # Keep a reference to the scattered data to keep the ids the same
  157. self._scatter = list(scatter)
  158. scattered = self.client.scatter(scatter, broadcast=True)
  159. self.data_futures = {id(x): f for x, f in zip(scatter, scattered)}
  160. else:
  161. self._scatter = []
  162. self.data_futures = {}
  163. self.wait_for_workers_timeout = wait_for_workers_timeout
  164. self.submit_kwargs = submit_kwargs
  165. self.waiting_futures = as_completed(
  166. [], loop=client.loop, with_results=True, raise_errors=False
  167. )
  168. self._results = {}
  169. self._callbacks = {}
  170. async def _collect(self):
  171. while self._continue:
  172. async for future, result in self.waiting_futures:
  173. cf_future = self._results.pop(future)
  174. callback = self._callbacks.pop(future)
  175. if future.status == "error":
  176. typ, exc, tb = result
  177. cf_future.set_exception(exc)
  178. else:
  179. cf_future.set_result(result)
  180. callback(result)
  181. await asyncio.sleep(0.01)
  182. def __reduce__(self):
  183. return (DaskDistributedBackend, ())
  184. def get_nested_backend(self):
  185. return DaskDistributedBackend(client=self.client), -1
  186. def configure(self, n_jobs=1, parallel=None, **backend_args):
  187. self.parallel = parallel
  188. return self.effective_n_jobs(n_jobs)
  189. def start_call(self):
  190. self._continue = True
  191. self.client.loop.add_callback(self._collect)
  192. self.call_data_futures = _WeakKeyDictionary()
  193. def stop_call(self):
  194. # The explicit call to clear is required to break a cycling reference
  195. # to the futures.
  196. self._continue = False
  197. # wait for the future collection routine (self._backend._collect) to
  198. # finish in order to limit asyncio warnings due to aborting _collect
  199. # during a following backend termination call
  200. time.sleep(0.01)
  201. self.call_data_futures.clear()
  202. def effective_n_jobs(self, n_jobs):
  203. effective_n_jobs = sum(self.client.ncores().values())
  204. if effective_n_jobs != 0 or not self.wait_for_workers_timeout:
  205. return effective_n_jobs
  206. # If there is no worker, schedule a probe task to wait for the workers
  207. # to come up and be available. If the dask cluster is in adaptive mode
  208. # task might cause the cluster to provision some workers.
  209. try:
  210. self.client.submit(_joblib_probe_task).result(
  211. timeout=self.wait_for_workers_timeout
  212. )
  213. except _TimeoutError as e:
  214. error_msg = (
  215. "DaskDistributedBackend has no worker after {} seconds. "
  216. "Make sure that workers are started and can properly connect "
  217. "to the scheduler and increase the joblib/dask connection "
  218. "timeout with:\n\n"
  219. "parallel_config(backend='dask', wait_for_workers_timeout={})"
  220. ).format(
  221. self.wait_for_workers_timeout,
  222. max(10, 2 * self.wait_for_workers_timeout),
  223. )
  224. raise TimeoutError(error_msg) from e
  225. return sum(self.client.ncores().values())
  226. async def _to_func_args(self, func):
  227. itemgetters = dict()
  228. # Futures that are dynamically generated during a single call to
  229. # Parallel.__call__.
  230. call_data_futures = getattr(self, "call_data_futures", None)
  231. async def maybe_to_futures(args):
  232. out = []
  233. for arg in args:
  234. arg_id = id(arg)
  235. if arg_id in itemgetters:
  236. out.append(itemgetters[arg_id])
  237. continue
  238. f = self.data_futures.get(arg_id, None)
  239. if f is None and call_data_futures is not None:
  240. try:
  241. f = await call_data_futures[arg]
  242. except KeyError:
  243. pass
  244. if f is None:
  245. if is_weakrefable(arg) and sizeof(arg) > 1e3:
  246. # Automatically scatter large objects to some of
  247. # the workers to avoid duplicated data transfers.
  248. # Rely on automated inter-worker data stealing if
  249. # more workers need to reuse this data
  250. # concurrently.
  251. # set hash=False - nested scatter calls (i.e
  252. # calling client.scatter inside a dask worker)
  253. # using hash=True often raise CancelledError,
  254. # see dask/distributed#3703
  255. _coro = self.client.scatter(
  256. arg, asynchronous=True, hash=False
  257. )
  258. # Centralize the scattering of identical arguments
  259. # between concurrent apply_async callbacks by
  260. # exposing the running coroutine in
  261. # call_data_futures before it completes.
  262. t = asyncio.Task(_coro)
  263. call_data_futures[arg] = t
  264. f = await t
  265. if f is not None:
  266. out.append(f)
  267. else:
  268. out.append(arg)
  269. return out
  270. tasks = []
  271. for f, args, kwargs in func.items:
  272. args = list(await maybe_to_futures(args))
  273. kwargs = dict(zip(kwargs.keys(), await maybe_to_futures(kwargs.values())))
  274. tasks.append((f, args, kwargs))
  275. return (Batch(tasks), tasks)
  276. def apply_async(self, func, callback=None):
  277. cf_future = concurrent.futures.Future()
  278. cf_future.get = cf_future.result # achieve AsyncResult API
  279. async def f(func, callback):
  280. batch, tasks = await self._to_func_args(func)
  281. key = f"{repr(batch)}-{uuid4().hex}"
  282. dask_future = self.client.submit(
  283. _TracebackCapturingWrapper(batch),
  284. tasks=tasks,
  285. key=key,
  286. **self.submit_kwargs,
  287. )
  288. self.waiting_futures.add(dask_future)
  289. self._callbacks[dask_future] = callback
  290. self._results[dask_future] = cf_future
  291. self.client.loop.add_callback(f, func, callback)
  292. return cf_future
  293. def retrieve_result_callback(self, out):
  294. return _retrieve_traceback_capturing_wrapped_call(out)
  295. def abort_everything(self, ensure_ready=True):
  296. """Tell the client to cancel any task submitted via this instance
  297. joblib.Parallel will never access those results
  298. """
  299. with self.waiting_futures.lock:
  300. self.waiting_futures.futures.clear()
  301. while not self.waiting_futures.queue.empty():
  302. self.waiting_futures.queue.get()
  303. @contextlib.contextmanager
  304. def retrieval_context(self):
  305. """Override ParallelBackendBase.retrieval_context to avoid deadlocks.
  306. This removes thread from the worker's thread pool (using 'secede').
  307. Seceding avoids deadlock in nested parallelism settings.
  308. """
  309. # See 'joblib.Parallel.__call__' and 'joblib.Parallel.retrieve' for how
  310. # this is used.
  311. if hasattr(thread_state, "execution_state"):
  312. # we are in a worker. Secede to avoid deadlock.
  313. secede()
  314. yield
  315. if hasattr(thread_state, "execution_state"):
  316. rejoin()