_trustregion_exact.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. """Nearly exact trust-region optimization subproblem."""
  2. import numpy as np
  3. from scipy.linalg import (norm, get_lapack_funcs, solve_triangular,
  4. cho_solve)
  5. from ._trustregion import (_minimize_trust_region, BaseQuadraticSubproblem)
  6. __all__ = ['_minimize_trustregion_exact',
  7. 'estimate_smallest_singular_value',
  8. 'singular_leading_submatrix',
  9. 'IterativeSubproblem']
  10. def _minimize_trustregion_exact(fun, x0, args=(), jac=None, hess=None,
  11. **trust_region_options):
  12. """
  13. Minimization of scalar function of one or more variables using
  14. a nearly exact trust-region algorithm.
  15. Options
  16. -------
  17. initial_trust_radius : float
  18. Initial trust-region radius.
  19. max_trust_radius : float
  20. Maximum value of the trust-region radius. No steps that are longer
  21. than this value will be proposed.
  22. eta : float
  23. Trust region related acceptance stringency for proposed steps.
  24. gtol : float
  25. Gradient norm must be less than ``gtol`` before successful
  26. termination.
  27. subproblem_maxiter : int, optional
  28. Maximum number of iterations to perform per subproblem. Only affects
  29. trust-exact. Default is 25.
  30. .. versionadded:: 1.17.0
  31. """
  32. if jac is None:
  33. raise ValueError('Jacobian is required for trust region '
  34. 'exact minimization.')
  35. if not callable(hess):
  36. raise ValueError('Hessian matrix is required for trust region '
  37. 'exact minimization.')
  38. return _minimize_trust_region(fun, x0, args=args, jac=jac, hess=hess,
  39. subproblem=IterativeSubproblem,
  40. **trust_region_options)
  41. def estimate_smallest_singular_value(U):
  42. """Given upper triangular matrix ``U`` estimate the smallest singular
  43. value and the correspondent right singular vector in O(n**2) operations.
  44. Parameters
  45. ----------
  46. U : ndarray
  47. Square upper triangular matrix.
  48. Returns
  49. -------
  50. s_min : float
  51. Estimated smallest singular value of the provided matrix.
  52. z_min : ndarray
  53. Estimated right singular vector.
  54. Notes
  55. -----
  56. The procedure is based on [1]_ and is done in two steps. First, it finds
  57. a vector ``e`` with components selected from {+1, -1} such that the
  58. solution ``w`` from the system ``U.T w = e`` is as large as possible.
  59. Next it estimate ``U v = w``. The smallest singular value is close
  60. to ``norm(w)/norm(v)`` and the right singular vector is close
  61. to ``v/norm(v)``.
  62. The estimation will be better more ill-conditioned is the matrix.
  63. References
  64. ----------
  65. .. [1] Cline, A. K., Moler, C. B., Stewart, G. W., Wilkinson, J. H.
  66. An estimate for the condition number of a matrix. 1979.
  67. SIAM Journal on Numerical Analysis, 16(2), 368-375.
  68. """
  69. U = np.atleast_2d(U)
  70. m, n = U.shape
  71. if m != n:
  72. raise ValueError("A square triangular matrix should be provided.")
  73. # A vector `e` with components selected from {+1, -1}
  74. # is selected so that the solution `w` to the system
  75. # `U.T w = e` is as large as possible. Implementation
  76. # based on algorithm 3.5.1, p. 142, from reference [2]
  77. # adapted for lower triangular matrix.
  78. p = np.zeros(n)
  79. w = np.empty(n)
  80. # Implemented according to: Golub, G. H., Van Loan, C. F. (2013).
  81. # "Matrix computations". Forth Edition. JHU press. pp. 140-142.
  82. for k in range(n):
  83. wp = (1-p[k]) / U.T[k, k]
  84. wm = (-1-p[k]) / U.T[k, k]
  85. pp = p[k+1:] + U.T[k+1:, k]*wp
  86. pm = p[k+1:] + U.T[k+1:, k]*wm
  87. if abs(wp) + norm(pp, 1) >= abs(wm) + norm(pm, 1):
  88. w[k] = wp
  89. p[k+1:] = pp
  90. else:
  91. w[k] = wm
  92. p[k+1:] = pm
  93. # The system `U v = w` is solved using backward substitution.
  94. v = solve_triangular(U, w)
  95. v_norm = norm(v)
  96. w_norm = norm(w)
  97. # Smallest singular value
  98. s_min = w_norm / v_norm
  99. # Associated vector
  100. z_min = v / v_norm
  101. return s_min, z_min
  102. def gershgorin_bounds(H):
  103. """
  104. Given a square matrix ``H`` compute upper
  105. and lower bounds for its eigenvalues (Gregoshgorin Bounds).
  106. Defined ref. [1].
  107. References
  108. ----------
  109. .. [1] Conn, A. R., Gould, N. I., & Toint, P. L.
  110. Trust region methods. 2000. Siam. pp. 19.
  111. """
  112. H_diag = np.diag(H)
  113. H_diag_abs = np.abs(H_diag)
  114. H_row_sums = np.sum(np.abs(H), axis=1)
  115. lb = np.min(H_diag + H_diag_abs - H_row_sums)
  116. ub = np.max(H_diag - H_diag_abs + H_row_sums)
  117. return lb, ub
  118. def singular_leading_submatrix(A, U, k):
  119. """
  120. Compute term that makes the leading ``k`` by ``k``
  121. submatrix from ``A`` singular.
  122. Parameters
  123. ----------
  124. A : ndarray
  125. Symmetric matrix that is not positive definite.
  126. U : ndarray
  127. Upper triangular matrix resulting of an incomplete
  128. Cholesky decomposition of matrix ``A``.
  129. k : int
  130. Positive integer such that the leading k by k submatrix from
  131. `A` is the first non-positive definite leading submatrix.
  132. Returns
  133. -------
  134. delta : float
  135. Amount that should be added to the element (k, k) of the
  136. leading k by k submatrix of ``A`` to make it singular.
  137. v : ndarray
  138. A vector such that ``v.T B v = 0``. Where B is the matrix A after
  139. ``delta`` is added to its element (k, k).
  140. """
  141. # Compute delta
  142. delta = np.sum(U[:k-1, k-1]**2) - A[k-1, k-1]
  143. n = len(A)
  144. # Initialize v
  145. v = np.zeros(n)
  146. v[k-1] = 1
  147. # Compute the remaining values of v by solving a triangular system.
  148. if k != 1:
  149. v[:k-1] = solve_triangular(U[:k-1, :k-1], -U[:k-1, k-1])
  150. return delta, v
  151. class IterativeSubproblem(BaseQuadraticSubproblem):
  152. """Quadratic subproblem solved by nearly exact iterative method.
  153. Notes
  154. -----
  155. This subproblem solver was based on [1]_, [2]_ and [3]_,
  156. which implement similar algorithms. The algorithm is basically
  157. that of [1]_ but ideas from [2]_ and [3]_ were also used.
  158. References
  159. ----------
  160. .. [1] A.R. Conn, N.I. Gould, and P.L. Toint, "Trust region methods",
  161. Siam, pp. 169-200, 2000.
  162. .. [2] J. Nocedal and S. Wright, "Numerical optimization",
  163. Springer Science & Business Media. pp. 83-91, 2006.
  164. .. [3] J.J. More and D.C. Sorensen, "Computing a trust region step",
  165. SIAM Journal on Scientific and Statistical Computing, vol. 4(3),
  166. pp. 553-572, 1983.
  167. """
  168. # UPDATE_COEFF appears in reference [1]_
  169. # in formula 7.3.14 (p. 190) named as "theta".
  170. # As recommended there it value is fixed in 0.01.
  171. UPDATE_COEFF = 0.01
  172. # The subproblem may iterate infinitely for problematic
  173. # cases (see https://github.com/scipy/scipy/issues/12513).
  174. # When the `maxiter` setting is None, we need to apply a
  175. # default. An ad-hoc number (though tested quite extensively)
  176. # is 25, which is set below. To restore the old behavior (which
  177. # potentially hangs), this parameter may be changed to zero:
  178. MAXITER_DEFAULT = 25 # use np.inf for infinite number of iterations
  179. EPS = np.finfo(float).eps
  180. def __init__(self, x, fun, jac, hess, hessp=None,
  181. k_easy=0.1, k_hard=0.2, maxiter=None):
  182. super().__init__(x, fun, jac, hess)
  183. # When the trust-region shrinks in two consecutive
  184. # calculations (``tr_radius < previous_tr_radius``)
  185. # the lower bound ``lambda_lb`` may be reused,
  186. # facilitating the convergence. To indicate no
  187. # previous value is known at first ``previous_tr_radius``
  188. # is set to -1 and ``lambda_lb`` to None.
  189. self.previous_tr_radius = -1
  190. self.lambda_lb = None
  191. self.niter = 0
  192. # ``k_easy`` and ``k_hard`` are parameters used
  193. # to determine the stop criteria to the iterative
  194. # subproblem solver. Take a look at pp. 194-197
  195. # from reference _[1] for a more detailed description.
  196. self.k_easy = k_easy
  197. self.k_hard = k_hard
  198. # ``maxiter`` optionally limits the number of iterations
  199. # the solve method may perform. Useful for poorly conditioned
  200. # problems which may otherwise hang.
  201. self.maxiter = self.MAXITER_DEFAULT if maxiter is None else maxiter
  202. if self.maxiter < 0:
  203. raise ValueError("maxiter must not be set to a negative number"
  204. ", use np.inf to mean infinite.")
  205. # Get Lapack function for cholesky decomposition.
  206. # The implemented SciPy wrapper does not return
  207. # the incomplete factorization needed by the method.
  208. self.cholesky, = get_lapack_funcs(('potrf',), (self.hess,))
  209. # Get info about Hessian
  210. self.dimension = len(self.hess)
  211. self.hess_gershgorin_lb,\
  212. self.hess_gershgorin_ub = gershgorin_bounds(self.hess)
  213. self.hess_inf = norm(self.hess, np.inf)
  214. self.hess_fro = norm(self.hess, 'fro')
  215. # A constant such that for vectors smaller than that
  216. # backward substitution is not reliable. It was established
  217. # based on Golub, G. H., Van Loan, C. F. (2013).
  218. # "Matrix computations". Forth Edition. JHU press., p.165.
  219. self.CLOSE_TO_ZERO = self.dimension * self.EPS * self.hess_inf
  220. def _initial_values(self, tr_radius):
  221. """Given a trust radius, return a good initial guess for
  222. the damping factor, the lower bound and the upper bound.
  223. The values were chosen accordingly to the guidelines on
  224. section 7.3.8 (p. 192) from [1]_.
  225. """
  226. # Upper bound for the damping factor
  227. lambda_ub = max(0, self.jac_mag/tr_radius + min(-self.hess_gershgorin_lb,
  228. self.hess_fro,
  229. self.hess_inf))
  230. # Lower bound for the damping factor
  231. lambda_lb = max(0, -min(self.hess.diagonal()),
  232. self.jac_mag/tr_radius - min(self.hess_gershgorin_ub,
  233. self.hess_fro,
  234. self.hess_inf))
  235. # Improve bounds with previous info
  236. if tr_radius < self.previous_tr_radius:
  237. lambda_lb = max(self.lambda_lb, lambda_lb)
  238. # Initial guess for the damping factor
  239. if lambda_lb == 0:
  240. lambda_initial = 0
  241. else:
  242. lambda_initial = max(np.sqrt(lambda_lb * lambda_ub),
  243. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb))
  244. return lambda_initial, lambda_lb, lambda_ub
  245. def solve(self, tr_radius):
  246. """Solve quadratic subproblem"""
  247. lambda_current, lambda_lb, lambda_ub = self._initial_values(tr_radius)
  248. n = self.dimension
  249. hits_boundary = True
  250. already_factorized = False
  251. self.niter = 0
  252. while self.niter < self.maxiter:
  253. # Compute Cholesky factorization
  254. if already_factorized:
  255. already_factorized = False
  256. else:
  257. H = self.hess+lambda_current*np.eye(n)
  258. U, info = self.cholesky(H, lower=False,
  259. overwrite_a=False,
  260. clean=True)
  261. self.niter += 1
  262. # Check if factorization succeeded
  263. if info == 0 and self.jac_mag > self.CLOSE_TO_ZERO:
  264. # Successful factorization
  265. # Solve `U.T U p = s`
  266. p = cho_solve((U, False), -self.jac)
  267. p_norm = norm(p)
  268. # Check for interior convergence
  269. if p_norm <= tr_radius and lambda_current == 0:
  270. hits_boundary = False
  271. break
  272. # Solve `U.T w = p`
  273. w = solve_triangular(U, p, trans='T')
  274. w_norm = norm(w)
  275. # Compute Newton step accordingly to
  276. # formula (4.44) p.87 from ref [2]_.
  277. delta_lambda = (p_norm/w_norm)**2 * (p_norm-tr_radius)/tr_radius
  278. lambda_new = lambda_current + delta_lambda
  279. if p_norm < tr_radius: # Inside boundary
  280. s_min, z_min = estimate_smallest_singular_value(U)
  281. ta, tb = self.get_boundaries_intersections(p, z_min,
  282. tr_radius)
  283. # Choose `step_len` with the smallest magnitude.
  284. # The reason for this choice is explained at
  285. # ref [3]_, p. 6 (Immediately before the formula
  286. # for `tau`).
  287. step_len = min([ta, tb], key=abs)
  288. # Compute the quadratic term (p.T*H*p)
  289. quadratic_term = np.dot(p, np.dot(H, p))
  290. # Check stop criteria
  291. relative_error = ((step_len**2 * s_min**2)
  292. / (quadratic_term + lambda_current*tr_radius**2))
  293. if relative_error <= self.k_hard:
  294. p += step_len * z_min
  295. break
  296. # Update uncertainty bounds
  297. lambda_ub = lambda_current
  298. lambda_lb = max(lambda_lb, lambda_current - s_min**2)
  299. # Compute Cholesky factorization
  300. H = self.hess + lambda_new*np.eye(n)
  301. c, info = self.cholesky(H, lower=False,
  302. overwrite_a=False,
  303. clean=True)
  304. # Check if the factorization have succeeded
  305. #
  306. if info == 0: # Successful factorization
  307. # Update damping factor
  308. lambda_current = lambda_new
  309. already_factorized = True
  310. else: # Unsuccessful factorization
  311. # Update uncertainty bounds
  312. lambda_lb = max(lambda_lb, lambda_new)
  313. # Update damping factor
  314. lambda_current = max(
  315. np.sqrt(np.abs(lambda_lb * lambda_ub)),
  316. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb)
  317. )
  318. else: # Outside boundary
  319. # Check stop criteria
  320. relative_error = abs(p_norm - tr_radius) / tr_radius
  321. if relative_error <= self.k_easy:
  322. break
  323. # Update uncertainty bounds
  324. lambda_lb = lambda_current
  325. # Update damping factor
  326. lambda_current = lambda_new
  327. elif info == 0 and self.jac_mag <= self.CLOSE_TO_ZERO:
  328. # jac_mag very close to zero
  329. # Check for interior convergence
  330. if lambda_current == 0:
  331. p = np.zeros(n)
  332. hits_boundary = False
  333. break
  334. s_min, z_min = estimate_smallest_singular_value(U)
  335. step_len = tr_radius
  336. p = step_len * z_min
  337. # Check stop criteria
  338. if (step_len**2 * s_min**2
  339. <= self.k_hard * lambda_current * tr_radius**2):
  340. break
  341. # Update uncertainty bounds
  342. lambda_ub = lambda_current
  343. lambda_lb = max(lambda_lb, lambda_current - s_min**2)
  344. # Update damping factor
  345. lambda_current = max(
  346. np.sqrt(lambda_lb * lambda_ub),
  347. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb)
  348. )
  349. else: # Unsuccessful factorization
  350. # Compute auxiliary terms
  351. delta, v = singular_leading_submatrix(H, U, info)
  352. v_norm = norm(v)
  353. # Update uncertainty interval
  354. lambda_lb = max(lambda_lb, lambda_current + delta/v_norm**2)
  355. # Update damping factor
  356. lambda_current = max(
  357. np.sqrt(np.abs(lambda_lb * lambda_ub)),
  358. lambda_lb + self.UPDATE_COEFF*(lambda_ub-lambda_lb)
  359. )
  360. self.lambda_lb = lambda_lb
  361. self.lambda_current = lambda_current
  362. self.previous_tr_radius = tr_radius
  363. return p, hits_boundary