_hessian_update_strategy.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. """Hessian update strategies for quasi-Newton optimization methods."""
  2. import numpy as np
  3. from numpy.linalg import norm
  4. from scipy.linalg import get_blas_funcs, issymmetric
  5. from warnings import warn
  6. __all__ = ['HessianUpdateStrategy', 'BFGS', 'SR1']
  7. class HessianUpdateStrategy:
  8. """Interface for implementing Hessian update strategies.
  9. Many optimization methods make use of Hessian (or inverse Hessian)
  10. approximations, such as the quasi-Newton methods BFGS, SR1, L-BFGS.
  11. Some of these approximations, however, do not actually need to store
  12. the entire matrix or can compute the internal matrix product with a
  13. given vector in a very efficiently manner. This class serves as an
  14. abstract interface between the optimization algorithm and the
  15. quasi-Newton update strategies, giving freedom of implementation
  16. to store and update the internal matrix as efficiently as possible.
  17. Different choices of initialization and update procedure will result
  18. in different quasi-Newton strategies.
  19. Four methods should be implemented in derived classes: ``initialize``,
  20. ``update``, ``dot`` and ``get_matrix``. The matrix multiplication
  21. operator ``@`` is also defined to call the ``dot`` method.
  22. Notes
  23. -----
  24. Any instance of a class that implements this interface,
  25. can be accepted by the method ``minimize`` and used by
  26. the compatible solvers to approximate the Hessian (or
  27. inverse Hessian) used by the optimization algorithms.
  28. """
  29. def initialize(self, n, approx_type):
  30. """Initialize internal matrix.
  31. Allocate internal memory for storing and updating
  32. the Hessian or its inverse.
  33. Parameters
  34. ----------
  35. n : int
  36. Problem dimension.
  37. approx_type : {'hess', 'inv_hess'}
  38. Selects either the Hessian or the inverse Hessian.
  39. When set to 'hess' the Hessian will be stored and updated.
  40. When set to 'inv_hess' its inverse will be used instead.
  41. """
  42. raise NotImplementedError("The method ``initialize(n, approx_type)``"
  43. " is not implemented.")
  44. def update(self, delta_x, delta_grad):
  45. """Update internal matrix.
  46. Update Hessian matrix or its inverse (depending on how 'approx_type'
  47. is defined) using information about the last evaluated points.
  48. Parameters
  49. ----------
  50. delta_x : ndarray
  51. The difference between two points the gradient
  52. function have been evaluated at: ``delta_x = x2 - x1``.
  53. delta_grad : ndarray
  54. The difference between the gradients:
  55. ``delta_grad = grad(x2) - grad(x1)``.
  56. """
  57. raise NotImplementedError("The method ``update(delta_x, delta_grad)``"
  58. " is not implemented.")
  59. def dot(self, p):
  60. """Compute the product of the internal matrix with the given vector.
  61. Parameters
  62. ----------
  63. p : array_like
  64. 1-D array representing a vector.
  65. Returns
  66. -------
  67. Hp : array
  68. 1-D represents the result of multiplying the approximation matrix
  69. by vector p.
  70. """
  71. raise NotImplementedError("The method ``dot(p)``"
  72. " is not implemented.")
  73. def get_matrix(self):
  74. """Return current internal matrix.
  75. Returns
  76. -------
  77. H : ndarray, shape (n, n)
  78. Dense matrix containing either the Hessian
  79. or its inverse (depending on how 'approx_type'
  80. is defined).
  81. """
  82. raise NotImplementedError("The method ``get_matrix(p)``"
  83. " is not implemented.")
  84. def __matmul__(self, p):
  85. return self.dot(p)
  86. class FullHessianUpdateStrategy(HessianUpdateStrategy):
  87. """Hessian update strategy with full dimensional internal representation.
  88. """
  89. _syr = get_blas_funcs('syr', dtype='d') # Symmetric rank 1 update
  90. _syr2 = get_blas_funcs('syr2', dtype='d') # Symmetric rank 2 update
  91. # Symmetric matrix-vector product
  92. _symv = get_blas_funcs('symv', dtype='d')
  93. def __init__(self, init_scale='auto'):
  94. self.init_scale = init_scale
  95. # Until initialize is called we can't really use the class,
  96. # so it makes sense to set everything to None.
  97. self.first_iteration = None
  98. self.approx_type = None
  99. self.B = None
  100. self.H = None
  101. def initialize(self, n, approx_type):
  102. """Initialize internal matrix.
  103. Allocate internal memory for storing and updating
  104. the Hessian or its inverse.
  105. Parameters
  106. ----------
  107. n : int
  108. Problem dimension.
  109. approx_type : {'hess', 'inv_hess'}
  110. Selects either the Hessian or the inverse Hessian.
  111. When set to 'hess' the Hessian will be stored and updated.
  112. When set to 'inv_hess' its inverse will be used instead.
  113. """
  114. self.first_iteration = True
  115. self.n = n
  116. self.approx_type = approx_type
  117. if approx_type not in ('hess', 'inv_hess'):
  118. raise ValueError("`approx_type` must be 'hess' or 'inv_hess'.")
  119. # Create matrix
  120. if self.approx_type == 'hess':
  121. self.B = np.eye(n, dtype=float)
  122. else:
  123. self.H = np.eye(n, dtype=float)
  124. def _auto_scale(self, delta_x, delta_grad):
  125. # Heuristic to scale matrix at first iteration.
  126. # Described in Nocedal and Wright "Numerical Optimization"
  127. # p.143 formula (6.20).
  128. s_norm2 = np.dot(delta_x, delta_x)
  129. y_norm2 = np.dot(delta_grad, delta_grad)
  130. ys = np.abs(np.dot(delta_grad, delta_x))
  131. if ys == 0.0 or y_norm2 == 0 or s_norm2 == 0:
  132. return 1
  133. if self.approx_type == 'hess':
  134. return y_norm2 / ys
  135. else:
  136. return ys / y_norm2
  137. def _update_implementation(self, delta_x, delta_grad):
  138. raise NotImplementedError("The method ``_update_implementation``"
  139. " is not implemented.")
  140. def update(self, delta_x, delta_grad):
  141. """Update internal matrix.
  142. Update Hessian matrix or its inverse (depending on how 'approx_type'
  143. is defined) using information about the last evaluated points.
  144. Parameters
  145. ----------
  146. delta_x : ndarray
  147. The difference between two points the gradient
  148. function have been evaluated at: ``delta_x = x2 - x1``.
  149. delta_grad : ndarray
  150. The difference between the gradients:
  151. ``delta_grad = grad(x2) - grad(x1)``.
  152. """
  153. if np.all(delta_x == 0.0):
  154. return
  155. if np.all(delta_grad == 0.0):
  156. warn('delta_grad == 0.0. Check if the approximated '
  157. 'function is linear. If the function is linear '
  158. 'better results can be obtained by defining the '
  159. 'Hessian as zero instead of using quasi-Newton '
  160. 'approximations.',
  161. UserWarning, stacklevel=2)
  162. return
  163. if self.first_iteration:
  164. # Get user specific scale
  165. if isinstance(self.init_scale, str) and self.init_scale == "auto":
  166. scale = self._auto_scale(delta_x, delta_grad)
  167. else:
  168. scale = self.init_scale
  169. # Check for complex: numpy will silently cast a complex array to
  170. # a real one but not so for scalar as it raises a TypeError.
  171. # Checking here brings a consistent behavior.
  172. replace = False
  173. if np.size(scale) == 1:
  174. # to account for the legacy behavior having the exact same cast
  175. scale = float(scale)
  176. elif np.iscomplexobj(scale):
  177. raise TypeError("init_scale contains complex elements, "
  178. "must be real.")
  179. else: # test explicitly for allowed shapes and values
  180. replace = True
  181. if self.approx_type == 'hess':
  182. shape = np.shape(self.B)
  183. dtype = self.B.dtype
  184. else:
  185. shape = np.shape(self.H)
  186. dtype = self.H.dtype
  187. # copy, will replace the original
  188. scale = np.array(scale, dtype=dtype, copy=True)
  189. # it has to match the shape of the matrix for the multiplication,
  190. # no implicit broadcasting is allowed
  191. if shape != (init_shape := np.shape(scale)):
  192. raise ValueError("If init_scale is an array, it must have the "
  193. f"dimensions of the hess/inv_hess: {shape}."
  194. f" Got {init_shape}.")
  195. if not issymmetric(scale):
  196. raise ValueError("If init_scale is an array, it must be"
  197. " symmetric (passing scipy.linalg.issymmetric)"
  198. " to be an approximation of a hess/inv_hess.")
  199. # Scale initial matrix with ``scale * np.eye(n)`` or replace
  200. # This is not ideal, we could assign the scale directly in
  201. # initialize, but we would need to
  202. if self.approx_type == 'hess':
  203. if replace:
  204. self.B = scale
  205. else:
  206. self.B *= scale
  207. else:
  208. if replace:
  209. self.H = scale
  210. else:
  211. self.H *= scale
  212. self.first_iteration = False
  213. self._update_implementation(delta_x, delta_grad)
  214. def dot(self, p):
  215. """Compute the product of the internal matrix with the given vector.
  216. Parameters
  217. ----------
  218. p : array_like
  219. 1-D array representing a vector.
  220. Returns
  221. -------
  222. Hp : array
  223. 1-D represents the result of multiplying the approximation matrix
  224. by vector p.
  225. """
  226. if self.approx_type == 'hess':
  227. return self._symv(1, self.B, p)
  228. else:
  229. return self._symv(1, self.H, p)
  230. def get_matrix(self):
  231. """Return the current internal matrix.
  232. Returns
  233. -------
  234. M : ndarray, shape (n, n)
  235. Dense matrix containing either the Hessian or its inverse
  236. (depending on how `approx_type` was defined).
  237. """
  238. if self.approx_type == 'hess':
  239. M = np.copy(self.B)
  240. else:
  241. M = np.copy(self.H)
  242. li = np.tril_indices_from(M, k=-1)
  243. M[li] = M.T[li]
  244. return M
  245. class BFGS(FullHessianUpdateStrategy):
  246. """Broyden-Fletcher-Goldfarb-Shanno (BFGS) Hessian update strategy.
  247. Parameters
  248. ----------
  249. exception_strategy : {'skip_update', 'damp_update'}, optional
  250. Define how to proceed when the curvature condition is violated.
  251. Set it to 'skip_update' to just skip the update. Or, alternatively,
  252. set it to 'damp_update' to interpolate between the actual BFGS
  253. result and the unmodified matrix. Both exceptions strategies
  254. are explained in [1]_, p.536-537.
  255. min_curvature : float
  256. This number, scaled by a normalization factor, defines the
  257. minimum curvature ``dot(delta_grad, delta_x)`` allowed to go
  258. unaffected by the exception strategy. By default is equal to
  259. 1e-8 when ``exception_strategy = 'skip_update'`` and equal
  260. to 0.2 when ``exception_strategy = 'damp_update'``.
  261. init_scale : {float, np.array, 'auto'}
  262. This parameter can be used to initialize the Hessian or its
  263. inverse. When a float is given, the relevant array is initialized
  264. to ``np.eye(n) * init_scale``, where ``n`` is the problem dimension.
  265. Alternatively, if a precisely ``(n, n)`` shaped, symmetric array is given,
  266. this array will be used. Otherwise an error is generated.
  267. Set it to 'auto' in order to use an automatic heuristic for choosing
  268. the initial scale. The heuristic is described in [1]_, p.143.
  269. The default is 'auto'.
  270. Notes
  271. -----
  272. The update is based on the description in [1]_, p.140.
  273. References
  274. ----------
  275. .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
  276. Second Edition (2006).
  277. """
  278. def __init__(self, exception_strategy='skip_update', min_curvature=None,
  279. init_scale='auto'):
  280. if exception_strategy == 'skip_update':
  281. if min_curvature is not None:
  282. self.min_curvature = min_curvature
  283. else:
  284. self.min_curvature = 1e-8
  285. elif exception_strategy == 'damp_update':
  286. if min_curvature is not None:
  287. self.min_curvature = min_curvature
  288. else:
  289. self.min_curvature = 0.2
  290. else:
  291. raise ValueError("`exception_strategy` must be 'skip_update' "
  292. "or 'damp_update'.")
  293. super().__init__(init_scale)
  294. self.exception_strategy = exception_strategy
  295. def _update_inverse_hessian(self, ys, Hy, yHy, s):
  296. """Update the inverse Hessian matrix.
  297. BFGS update using the formula:
  298. ``H <- H + ((H*y).T*y + s.T*y)/(s.T*y)^2 * (s*s.T)
  299. - 1/(s.T*y) * ((H*y)*s.T + s*(H*y).T)``
  300. where ``s = delta_x`` and ``y = delta_grad``. This formula is
  301. equivalent to (6.17) in [1]_ written in a more efficient way
  302. for implementation.
  303. References
  304. ----------
  305. .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
  306. Second Edition (2006).
  307. """
  308. self.H = self._syr2(-1.0 / ys, s, Hy, a=self.H)
  309. self.H = self._syr((ys + yHy) / ys ** 2, s, a=self.H)
  310. def _update_hessian(self, ys, Bs, sBs, y):
  311. """Update the Hessian matrix.
  312. BFGS update using the formula:
  313. ``B <- B - (B*s)*(B*s).T/s.T*(B*s) + y*y^T/s.T*y``
  314. where ``s`` is short for ``delta_x`` and ``y`` is short
  315. for ``delta_grad``. Formula (6.19) in [1]_.
  316. References
  317. ----------
  318. .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
  319. Second Edition (2006).
  320. """
  321. self.B = self._syr(1.0 / ys, y, a=self.B)
  322. self.B = self._syr(-1.0 / sBs, Bs, a=self.B)
  323. def _update_implementation(self, delta_x, delta_grad):
  324. # Auxiliary variables w and z
  325. if self.approx_type == 'hess':
  326. w = delta_x
  327. z = delta_grad
  328. else:
  329. w = delta_grad
  330. z = delta_x
  331. # Do some common operations
  332. wz = np.dot(w, z)
  333. Mw = self @ w
  334. wMw = Mw.dot(w)
  335. # Guarantee that wMw > 0 by reinitializing matrix.
  336. # While this is always true in exact arithmetic,
  337. # indefinite matrix may appear due to roundoff errors.
  338. if wMw <= 0.0:
  339. scale = self._auto_scale(delta_x, delta_grad)
  340. # Reinitialize matrix
  341. if self.approx_type == 'hess':
  342. self.B = scale * np.eye(self.n, dtype=float)
  343. else:
  344. self.H = scale * np.eye(self.n, dtype=float)
  345. # Do common operations for new matrix
  346. Mw = self @ w
  347. wMw = Mw.dot(w)
  348. # Check if curvature condition is violated
  349. if wz <= self.min_curvature * wMw:
  350. # If the option 'skip_update' is set
  351. # we just skip the update when the condition
  352. # is violated.
  353. if self.exception_strategy == 'skip_update':
  354. return
  355. # If the option 'damp_update' is set we
  356. # interpolate between the actual BFGS
  357. # result and the unmodified matrix.
  358. elif self.exception_strategy == 'damp_update':
  359. update_factor = (1-self.min_curvature) / (1 - wz/wMw)
  360. z = update_factor*z + (1-update_factor)*Mw
  361. wz = np.dot(w, z)
  362. # Update matrix
  363. if self.approx_type == 'hess':
  364. self._update_hessian(wz, Mw, wMw, z)
  365. else:
  366. self._update_inverse_hessian(wz, Mw, wMw, z)
  367. class SR1(FullHessianUpdateStrategy):
  368. """Symmetric-rank-1 Hessian update strategy.
  369. Parameters
  370. ----------
  371. min_denominator : float
  372. This number, scaled by a normalization factor,
  373. defines the minimum denominator magnitude allowed
  374. in the update. When the condition is violated we skip
  375. the update. By default uses ``1e-8``.
  376. init_scale : {float, np.array, 'auto'}, optional
  377. This parameter can be used to initialize the Hessian or its
  378. inverse. When a float is given, the relevant array is initialized
  379. to ``np.eye(n) * init_scale``, where ``n`` is the problem dimension.
  380. Alternatively, if a precisely ``(n, n)`` shaped, symmetric array is given,
  381. this array will be used. Otherwise an error is generated.
  382. Set it to 'auto' in order to use an automatic heuristic for choosing
  383. the initial scale. The heuristic is described in [1]_, p.143.
  384. The default is 'auto'.
  385. Notes
  386. -----
  387. The update is based on the description in [1]_, p.144-146.
  388. References
  389. ----------
  390. .. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
  391. Second Edition (2006).
  392. """
  393. def __init__(self, min_denominator=1e-8, init_scale='auto'):
  394. self.min_denominator = min_denominator
  395. super().__init__(init_scale)
  396. def _update_implementation(self, delta_x, delta_grad):
  397. # Auxiliary variables w and z
  398. if self.approx_type == 'hess':
  399. w = delta_x
  400. z = delta_grad
  401. else:
  402. w = delta_grad
  403. z = delta_x
  404. # Do some common operations
  405. Mw = self @ w
  406. z_minus_Mw = z - Mw
  407. denominator = np.dot(w, z_minus_Mw)
  408. # If the denominator is too small
  409. # we just skip the update.
  410. if np.abs(denominator) <= self.min_denominator*norm(w)*norm(z_minus_Mw):
  411. return
  412. # Update matrix
  413. if self.approx_type == 'hess':
  414. self.B = self._syr(1/denominator, z_minus_Mw, a=self.B)
  415. else:
  416. self.H = self._syr(1/denominator, z_minus_Mw, a=self.H)