dogbox.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. """
  2. Dogleg algorithm with rectangular trust regions for least-squares minimization.
  3. The description of the algorithm can be found in [Voglis]_. The algorithm does
  4. trust-region iterations, but the shape of trust regions is rectangular as
  5. opposed to conventional elliptical. The intersection of a trust region and
  6. an initial feasible region is again some rectangle. Thus, on each iteration a
  7. bound-constrained quadratic optimization problem is solved.
  8. A quadratic problem is solved by well-known dogleg approach, where the
  9. function is minimized along piecewise-linear "dogleg" path [NumOpt]_,
  10. Chapter 4. If Jacobian is not rank-deficient then the function is decreasing
  11. along this path, and optimization amounts to simply following along this
  12. path as long as a point stays within the bounds. A constrained Cauchy step
  13. (along the anti-gradient) is considered for safety in rank deficient cases,
  14. in this situations the convergence might be slow.
  15. If during iterations some variable hit the initial bound and the component
  16. of anti-gradient points outside the feasible region, then a next dogleg step
  17. won't make any progress. At this state such variables satisfy first-order
  18. optimality conditions and they are excluded before computing a next dogleg
  19. step.
  20. Gauss-Newton step can be computed exactly by `numpy.linalg.lstsq` (for dense
  21. Jacobian matrices) or by iterative procedure `scipy.sparse.linalg.lsmr` (for
  22. dense and sparse matrices, or Jacobian being LinearOperator). The second
  23. option allows to solve very large problems (up to couple of millions of
  24. residuals on a regular PC), provided the Jacobian matrix is sufficiently
  25. sparse. But note that dogbox is not very good for solving problems with
  26. large number of constraints, because of variables exclusion-inclusion on each
  27. iteration (a required number of function evaluations might be high or accuracy
  28. of a solution will be poor), thus its large-scale usage is probably limited
  29. to unconstrained problems.
  30. References
  31. ----------
  32. .. [Voglis] C. Voglis and I. E. Lagaris, "A Rectangular Trust Region Dogleg
  33. Approach for Unconstrained and Bound Constrained Nonlinear
  34. Optimization", WSEAS International Conference on Applied
  35. Mathematics, Corfu, Greece, 2004.
  36. .. [NumOpt] J. Nocedal and S. J. Wright, "Numerical optimization, 2nd edition".
  37. """
  38. import numpy as np
  39. from numpy.linalg import lstsq, norm
  40. from scipy.sparse.linalg import LinearOperator, aslinearoperator, lsmr
  41. from scipy.optimize import OptimizeResult
  42. from scipy._lib._util import _call_callback_maybe_halt
  43. from .common import (
  44. step_size_to_bound, in_bounds, update_tr_radius, evaluate_quadratic,
  45. build_quadratic_1d, minimize_quadratic_1d, compute_grad,
  46. compute_jac_scale, check_termination, scale_for_robust_loss_function,
  47. print_header_nonlinear, print_iteration_nonlinear)
  48. def lsmr_operator(Jop, d, active_set):
  49. """Compute LinearOperator to use in LSMR by dogbox algorithm.
  50. `active_set` mask is used to excluded active variables from computations
  51. of matrix-vector products.
  52. """
  53. m, n = Jop.shape
  54. def matvec(x):
  55. x_free = x.ravel().copy()
  56. x_free[active_set] = 0
  57. return Jop.matvec(x * d)
  58. def rmatvec(x):
  59. r = d * Jop.rmatvec(x)
  60. r[active_set] = 0
  61. return r
  62. return LinearOperator((m, n), matvec=matvec, rmatvec=rmatvec, dtype=float)
  63. def find_intersection(x, tr_bounds, lb, ub):
  64. """Find intersection of trust-region bounds and initial bounds.
  65. Returns
  66. -------
  67. lb_total, ub_total : ndarray with shape of x
  68. Lower and upper bounds of the intersection region.
  69. orig_l, orig_u : ndarray of bool with shape of x
  70. True means that an original bound is taken as a corresponding bound
  71. in the intersection region.
  72. tr_l, tr_u : ndarray of bool with shape of x
  73. True means that a trust-region bound is taken as a corresponding bound
  74. in the intersection region.
  75. """
  76. lb_centered = lb - x
  77. ub_centered = ub - x
  78. lb_total = np.maximum(lb_centered, -tr_bounds)
  79. ub_total = np.minimum(ub_centered, tr_bounds)
  80. orig_l = np.equal(lb_total, lb_centered)
  81. orig_u = np.equal(ub_total, ub_centered)
  82. tr_l = np.equal(lb_total, -tr_bounds)
  83. tr_u = np.equal(ub_total, tr_bounds)
  84. return lb_total, ub_total, orig_l, orig_u, tr_l, tr_u
  85. def dogleg_step(x, newton_step, g, a, b, tr_bounds, lb, ub):
  86. """Find dogleg step in a rectangular region.
  87. Returns
  88. -------
  89. step : ndarray, shape (n,)
  90. Computed dogleg step.
  91. bound_hits : ndarray of int, shape (n,)
  92. Each component shows whether a corresponding variable hits the
  93. initial bound after the step is taken:
  94. * 0 - a variable doesn't hit the bound.
  95. * -1 - lower bound is hit.
  96. * 1 - upper bound is hit.
  97. tr_hit : bool
  98. Whether the step hit the boundary of the trust-region.
  99. """
  100. lb_total, ub_total, orig_l, orig_u, tr_l, tr_u = find_intersection(
  101. x, tr_bounds, lb, ub
  102. )
  103. bound_hits = np.zeros_like(x, dtype=int)
  104. if in_bounds(newton_step, lb_total, ub_total):
  105. return newton_step, bound_hits, False
  106. to_bounds, _ = step_size_to_bound(np.zeros_like(x), -g, lb_total, ub_total)
  107. # The classical dogleg algorithm would check if Cauchy step fits into
  108. # the bounds, and just return it constrained version if not. But in a
  109. # rectangular trust region it makes sense to try to improve constrained
  110. # Cauchy step too. Thus, we don't distinguish these two cases.
  111. cauchy_step = -minimize_quadratic_1d(a, b, 0, to_bounds)[0] * g
  112. step_diff = newton_step - cauchy_step
  113. step_size, hits = step_size_to_bound(cauchy_step, step_diff,
  114. lb_total, ub_total)
  115. bound_hits[(hits < 0) & orig_l] = -1
  116. bound_hits[(hits > 0) & orig_u] = 1
  117. tr_hit = np.any((hits < 0) & tr_l | (hits > 0) & tr_u)
  118. return cauchy_step + step_size * step_diff, bound_hits, tr_hit
  119. def dogbox(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale,
  120. loss_function, tr_solver, tr_options, verbose, callback=None):
  121. f = f0
  122. f_true = f.copy()
  123. nfev = 1
  124. J = J0
  125. njev = 1
  126. if loss_function is not None:
  127. rho = loss_function(f)
  128. cost = 0.5 * np.sum(rho[0])
  129. J, f = scale_for_robust_loss_function(J, f, rho)
  130. else:
  131. cost = 0.5 * np.dot(f, f)
  132. g = compute_grad(J, f)
  133. jac_scale = isinstance(x_scale, str) and x_scale == 'jac'
  134. if jac_scale:
  135. scale, scale_inv = compute_jac_scale(J)
  136. else:
  137. scale, scale_inv = x_scale, 1 / x_scale
  138. Delta = norm(x0 * scale_inv, ord=np.inf)
  139. if Delta == 0:
  140. Delta = 1.0
  141. on_bound = np.zeros_like(x0, dtype=int)
  142. on_bound[np.equal(x0, lb)] = -1
  143. on_bound[np.equal(x0, ub)] = 1
  144. x = x0
  145. step = np.empty_like(x0)
  146. if max_nfev is None:
  147. max_nfev = x0.size * 100
  148. termination_status = None
  149. iteration = 0
  150. step_norm = None
  151. actual_reduction = None
  152. if verbose == 2:
  153. print_header_nonlinear()
  154. while True:
  155. active_set = on_bound * g < 0
  156. free_set = ~active_set
  157. g_free = g[free_set]
  158. g_full = g.copy()
  159. g[active_set] = 0
  160. g_norm = norm(g, ord=np.inf)
  161. if g_norm < gtol:
  162. termination_status = 1
  163. if verbose == 2:
  164. print_iteration_nonlinear(iteration, nfev, cost, actual_reduction,
  165. step_norm, g_norm)
  166. if termination_status is not None or nfev == max_nfev:
  167. break
  168. x_free = x[free_set]
  169. lb_free = lb[free_set]
  170. ub_free = ub[free_set]
  171. scale_free = scale[free_set]
  172. # Compute (Gauss-)Newton and build quadratic model for Cauchy step.
  173. if tr_solver == 'exact':
  174. J_free = J[:, free_set]
  175. newton_step = lstsq(J_free, -f, rcond=-1)[0]
  176. # Coefficients for the quadratic model along the anti-gradient.
  177. a, b = build_quadratic_1d(J_free, g_free, -g_free)
  178. elif tr_solver == 'lsmr':
  179. Jop = aslinearoperator(J)
  180. # We compute lsmr step in scaled variables and then
  181. # transform back to normal variables, if lsmr would give exact lsq
  182. # solution, this would be equivalent to not doing any
  183. # transformations, but from experience it's better this way.
  184. # We pass active_set to make computations as if we selected
  185. # the free subset of J columns, but without actually doing any
  186. # slicing, which is expensive for sparse matrices and impossible
  187. # for LinearOperator.
  188. lsmr_op = lsmr_operator(Jop, scale, active_set)
  189. newton_step = -lsmr(lsmr_op, f, **tr_options)[0][free_set]
  190. newton_step *= scale_free
  191. # Components of g for active variables were zeroed, so this call
  192. # is correct and equivalent to using J_free and g_free.
  193. a, b = build_quadratic_1d(Jop, g, -g)
  194. actual_reduction = -1.0
  195. while actual_reduction <= 0 and nfev < max_nfev:
  196. tr_bounds = Delta * scale_free
  197. step_free, on_bound_free, tr_hit = dogleg_step(
  198. x_free, newton_step, g_free, a, b, tr_bounds, lb_free, ub_free)
  199. step.fill(0.0)
  200. step[free_set] = step_free
  201. if tr_solver == 'exact':
  202. predicted_reduction = -evaluate_quadratic(J_free, g_free,
  203. step_free)
  204. elif tr_solver == 'lsmr':
  205. predicted_reduction = -evaluate_quadratic(Jop, g, step)
  206. # gh11403 ensure that solution is fully within bounds.
  207. x_new = np.clip(x + step, lb, ub)
  208. f_new = fun(x_new)
  209. nfev += 1
  210. step_h_norm = norm(step * scale_inv, ord=np.inf)
  211. if not np.all(np.isfinite(f_new)):
  212. Delta = 0.25 * step_h_norm
  213. continue
  214. # Usual trust-region step quality estimation.
  215. if loss_function is not None:
  216. cost_new = loss_function(f_new, cost_only=True)
  217. else:
  218. cost_new = 0.5 * np.dot(f_new, f_new)
  219. actual_reduction = cost - cost_new
  220. Delta, ratio = update_tr_radius(
  221. Delta, actual_reduction, predicted_reduction,
  222. step_h_norm, tr_hit
  223. )
  224. step_norm = norm(step)
  225. termination_status = check_termination(
  226. actual_reduction, cost, step_norm, norm(x), ratio, ftol, xtol)
  227. if termination_status is not None:
  228. break
  229. if actual_reduction > 0:
  230. on_bound[free_set] = on_bound_free
  231. x = x_new
  232. # Set variables exactly at the boundary.
  233. mask = on_bound == -1
  234. x[mask] = lb[mask]
  235. mask = on_bound == 1
  236. x[mask] = ub[mask]
  237. f = f_new
  238. f_true = f.copy()
  239. cost = cost_new
  240. J = jac(x)
  241. njev += 1
  242. if loss_function is not None:
  243. rho = loss_function(f)
  244. J, f = scale_for_robust_loss_function(J, f, rho)
  245. g = compute_grad(J, f)
  246. if jac_scale:
  247. scale, scale_inv = compute_jac_scale(J, scale_inv)
  248. else:
  249. step_norm = 0
  250. actual_reduction = 0
  251. iteration += 1
  252. # Call callback function and possibly stop optimization
  253. if callback is not None:
  254. intermediate_result = OptimizeResult(
  255. x=x, fun=f, nit=iteration, nfev=nfev)
  256. intermediate_result["cost"] = cost_new
  257. if _call_callback_maybe_halt(
  258. callback, intermediate_result
  259. ):
  260. termination_status = -2
  261. break
  262. if termination_status is None:
  263. termination_status = 0
  264. return OptimizeResult(
  265. x=x, cost=cost, fun=f_true, jac=J, grad=g_full, optimality=g_norm,
  266. active_mask=on_bound, nfev=nfev, njev=njev, status=termination_status)