_linesearch.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896
  1. """
  2. Functions
  3. ---------
  4. .. autosummary::
  5. :toctree: generated/
  6. line_search_armijo
  7. line_search_wolfe1
  8. line_search_wolfe2
  9. scalar_search_wolfe1
  10. scalar_search_wolfe2
  11. """
  12. from warnings import warn
  13. from ._dcsrch import DCSRCH
  14. import numpy as np
  15. __all__ = ['LineSearchWarning', 'line_search_wolfe1', 'line_search_wolfe2',
  16. 'scalar_search_wolfe1', 'scalar_search_wolfe2',
  17. 'line_search_armijo']
  18. class LineSearchWarning(RuntimeWarning):
  19. pass
  20. def _check_c1_c2(c1, c2):
  21. if not (0 < c1 < c2 < 1):
  22. raise ValueError("'c1' and 'c2' do not satisfy"
  23. "'0 < c1 < c2 < 1'.")
  24. #------------------------------------------------------------------------------
  25. # Minpack's Wolfe line and scalar searches
  26. #------------------------------------------------------------------------------
  27. def line_search_wolfe1(f, fprime, xk, pk, gfk=None,
  28. old_fval=None, old_old_fval=None,
  29. args=(), c1=1e-4, c2=0.9, amax=50, amin=1e-8,
  30. xtol=1e-14):
  31. """
  32. As `scalar_search_wolfe1` but do a line search to direction `pk`
  33. Parameters
  34. ----------
  35. f : callable
  36. Function `f(x)`
  37. fprime : callable
  38. Gradient of `f`
  39. xk : array_like
  40. Current point
  41. pk : array_like
  42. Search direction
  43. gfk : array_like, optional
  44. Gradient of `f` at point `xk`
  45. old_fval : float, optional
  46. Value of `f` at point `xk`
  47. old_old_fval : float, optional
  48. Value of `f` at point preceding `xk`
  49. The rest of the parameters are the same as for `scalar_search_wolfe1`.
  50. Returns
  51. -------
  52. stp, f_count, g_count, fval, old_fval
  53. As in `line_search_wolfe1`
  54. gval : array
  55. Gradient of `f` at the final point
  56. Notes
  57. -----
  58. Parameters `c1` and `c2` must satisfy ``0 < c1 < c2 < 1``.
  59. """
  60. if gfk is None:
  61. gfk = fprime(xk, *args)
  62. gval = [gfk]
  63. gc = [0]
  64. fc = [0]
  65. def phi(s):
  66. fc[0] += 1
  67. return f(xk + s*pk, *args)
  68. def derphi(s):
  69. gval[0] = fprime(xk + s*pk, *args)
  70. gc[0] += 1
  71. return np.dot(gval[0], pk)
  72. derphi0 = np.dot(gfk, pk)
  73. stp, fval, old_fval = scalar_search_wolfe1(
  74. phi, derphi, old_fval, old_old_fval, derphi0,
  75. c1=c1, c2=c2, amax=amax, amin=amin, xtol=xtol)
  76. return stp, fc[0], gc[0], fval, old_fval, gval[0]
  77. def scalar_search_wolfe1(phi, derphi, phi0=None, old_phi0=None, derphi0=None,
  78. c1=1e-4, c2=0.9,
  79. amax=50, amin=1e-8, xtol=1e-14):
  80. """
  81. Scalar function search for alpha that satisfies strong Wolfe conditions
  82. alpha > 0 is assumed to be a descent direction.
  83. Parameters
  84. ----------
  85. phi : callable phi(alpha)
  86. Function at point `alpha`
  87. derphi : callable phi'(alpha)
  88. Objective function derivative. Returns a scalar.
  89. phi0 : float, optional
  90. Value of phi at 0
  91. old_phi0 : float, optional
  92. Value of phi at previous point
  93. derphi0 : float, optional
  94. Value derphi at 0
  95. c1 : float, optional
  96. Parameter for Armijo condition rule.
  97. c2 : float, optional
  98. Parameter for curvature condition rule.
  99. amax, amin : float, optional
  100. Maximum and minimum step size
  101. xtol : float, optional
  102. Relative tolerance for an acceptable step.
  103. Returns
  104. -------
  105. alpha : float
  106. Step size, or None if no suitable step was found
  107. phi : float
  108. Value of `phi` at the new point `alpha`
  109. phi0 : float
  110. Value of `phi` at `alpha=0`
  111. Notes
  112. -----
  113. Uses routine DCSRCH from MINPACK.
  114. Parameters `c1` and `c2` must satisfy ``0 < c1 < c2 < 1`` as described in [1]_.
  115. References
  116. ----------
  117. .. [1] Nocedal, J., & Wright, S. J. (2006). Numerical optimization.
  118. In Springer Series in Operations Research and Financial Engineering.
  119. (Springer Series in Operations Research and Financial Engineering).
  120. Springer Nature.
  121. """
  122. _check_c1_c2(c1, c2)
  123. if phi0 is None:
  124. phi0 = phi(0.)
  125. if derphi0 is None:
  126. derphi0 = derphi(0.)
  127. if old_phi0 is not None and derphi0 != 0:
  128. alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
  129. if alpha1 < 0:
  130. alpha1 = 1.0
  131. else:
  132. alpha1 = 1.0
  133. maxiter = 100
  134. dcsrch = DCSRCH(phi, derphi, c1, c2, xtol, amin, amax)
  135. stp, phi1, phi0, task = dcsrch(
  136. alpha1, phi0=phi0, derphi0=derphi0, maxiter=maxiter
  137. )
  138. return stp, phi1, phi0
  139. line_search = line_search_wolfe1
  140. #------------------------------------------------------------------------------
  141. # Pure-Python Wolfe line and scalar searches
  142. #------------------------------------------------------------------------------
  143. # Note: `line_search_wolfe2` is the public `scipy.optimize.line_search`
  144. def line_search_wolfe2(f, myfprime, xk, pk, gfk=None, old_fval=None,
  145. old_old_fval=None, args=(), c1=1e-4, c2=0.9, amax=None,
  146. extra_condition=None, maxiter=10):
  147. """Find alpha that satisfies strong Wolfe conditions.
  148. Parameters
  149. ----------
  150. f : callable f(x,*args)
  151. Objective function.
  152. myfprime : callable f'(x,*args)
  153. Objective function gradient.
  154. xk : ndarray
  155. Starting point.
  156. pk : ndarray
  157. Search direction. The search direction must be a descent direction
  158. for the algorithm to converge.
  159. gfk : ndarray, optional
  160. Gradient value for x=xk (xk being the current parameter
  161. estimate). Will be recomputed if omitted.
  162. old_fval : float, optional
  163. Function value for x=xk. Will be recomputed if omitted.
  164. old_old_fval : float, optional
  165. Function value for the point preceding x=xk.
  166. args : tuple, optional
  167. Additional arguments passed to objective function.
  168. c1 : float, optional
  169. Parameter for Armijo condition rule.
  170. c2 : float, optional
  171. Parameter for curvature condition rule.
  172. amax : float, optional
  173. Maximum step size
  174. extra_condition : callable, optional
  175. A callable of the form ``extra_condition(alpha, x, f, g)``
  176. returning a boolean. Arguments are the proposed step ``alpha``
  177. and the corresponding ``x``, ``f`` and ``g`` values. The line search
  178. accepts the value of ``alpha`` only if this
  179. callable returns ``True``. If the callable returns ``False``
  180. for the step length, the algorithm will continue with
  181. new iterates. The callable is only called for iterates
  182. satisfying the strong Wolfe conditions.
  183. maxiter : int, optional
  184. Maximum number of iterations to perform.
  185. Returns
  186. -------
  187. alpha : float or None
  188. Alpha for which ``x_new = x0 + alpha * pk``,
  189. or None if the line search algorithm did not converge.
  190. fc : int
  191. Number of function evaluations made.
  192. gc : int
  193. Number of gradient evaluations made.
  194. new_fval : float or None
  195. New function value ``f(x_new)=f(x0+alpha*pk)``,
  196. or None if the line search algorithm did not converge.
  197. old_fval : float
  198. Old function value ``f(x0)``.
  199. new_slope : float or None
  200. The local slope along the search direction at the
  201. new value ``<myfprime(x_new), pk>``,
  202. or None if the line search algorithm did not converge.
  203. Notes
  204. -----
  205. Uses the line search algorithm to enforce strong Wolfe
  206. conditions. See Wright and Nocedal, 'Numerical Optimization',
  207. 1999, pp. 59-61.
  208. The search direction `pk` must be a descent direction (e.g.
  209. ``-myfprime(xk)``) to find a step length that satisfies the strong Wolfe
  210. conditions. If the search direction is not a descent direction (e.g.
  211. ``myfprime(xk)``), then `alpha`, `new_fval`, and `new_slope` will be None.
  212. Examples
  213. --------
  214. >>> import numpy as np
  215. >>> from scipy.optimize import line_search
  216. An objective function and its gradient are defined.
  217. >>> def obj_func(x):
  218. ... return (x[0])**2+(x[1])**2
  219. >>> def obj_grad(x):
  220. ... return [2*x[0], 2*x[1]]
  221. We can find alpha that satisfies strong Wolfe conditions.
  222. >>> start_point = np.array([1.8, 1.7])
  223. >>> search_gradient = np.array([-1.0, -1.0])
  224. >>> line_search(obj_func, obj_grad, start_point, search_gradient)
  225. (1.0, 2, 1, 1.1300000000000001, 6.13, [1.6, 1.4])
  226. """
  227. fc = [0]
  228. gc = [0]
  229. gval = [None]
  230. gval_alpha = [None]
  231. def phi(alpha):
  232. fc[0] += 1
  233. return f(xk + alpha * pk, *args)
  234. fprime = myfprime
  235. def derphi(alpha):
  236. gc[0] += 1
  237. gval[0] = fprime(xk + alpha * pk, *args) # store for later use
  238. gval_alpha[0] = alpha
  239. return np.dot(gval[0], pk)
  240. if gfk is None:
  241. gfk = fprime(xk, *args)
  242. derphi0 = np.dot(gfk, pk)
  243. if extra_condition is not None:
  244. # Add the current gradient as argument, to avoid needless
  245. # re-evaluation
  246. def extra_condition2(alpha, phi):
  247. if gval_alpha[0] != alpha:
  248. derphi(alpha)
  249. x = xk + alpha * pk
  250. return extra_condition(alpha, x, phi, gval[0])
  251. else:
  252. extra_condition2 = None
  253. alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  254. phi, derphi, old_fval, old_old_fval, derphi0, c1, c2, amax,
  255. extra_condition2, maxiter=maxiter)
  256. if derphi_star is None:
  257. warn('The line search algorithm did not converge',
  258. LineSearchWarning, stacklevel=2)
  259. else:
  260. # derphi_star is a number (derphi) -- so use the most recently
  261. # calculated gradient used in computing it derphi = gfk*pk
  262. # this is the gradient at the next step no need to compute it
  263. # again in the outer loop.
  264. derphi_star = gval[0]
  265. return alpha_star, fc[0], gc[0], phi_star, old_fval, derphi_star
  266. def scalar_search_wolfe2(phi, derphi, phi0=None,
  267. old_phi0=None, derphi0=None,
  268. c1=1e-4, c2=0.9, amax=None,
  269. extra_condition=None, maxiter=10):
  270. """Find alpha that satisfies strong Wolfe conditions.
  271. alpha > 0 is assumed to be a descent direction.
  272. Parameters
  273. ----------
  274. phi : callable phi(alpha)
  275. Objective scalar function.
  276. derphi : callable phi'(alpha)
  277. Objective function derivative. Returns a scalar.
  278. phi0 : float, optional
  279. Value of phi at 0.
  280. old_phi0 : float, optional
  281. Value of phi at previous point.
  282. derphi0 : float, optional
  283. Value of derphi at 0
  284. c1 : float, optional
  285. Parameter for Armijo condition rule.
  286. c2 : float, optional
  287. Parameter for curvature condition rule.
  288. amax : float, optional
  289. Maximum step size.
  290. extra_condition : callable, optional
  291. A callable of the form ``extra_condition(alpha, phi_value)``
  292. returning a boolean. The line search accepts the value
  293. of ``alpha`` only if this callable returns ``True``.
  294. If the callable returns ``False`` for the step length,
  295. the algorithm will continue with new iterates.
  296. The callable is only called for iterates satisfying
  297. the strong Wolfe conditions.
  298. maxiter : int, optional
  299. Maximum number of iterations to perform.
  300. Returns
  301. -------
  302. alpha_star : float or None
  303. Best alpha, or None if the line search algorithm did not converge.
  304. phi_star : float
  305. phi at alpha_star.
  306. phi0 : float
  307. phi at 0.
  308. derphi_star : float or None
  309. derphi at alpha_star, or None if the line search algorithm
  310. did not converge.
  311. Notes
  312. -----
  313. Uses the line search algorithm to enforce strong Wolfe
  314. conditions. See Wright and Nocedal, 'Numerical Optimization',
  315. 1999, pp. 59-61.
  316. """
  317. _check_c1_c2(c1, c2)
  318. if phi0 is None:
  319. phi0 = phi(0.)
  320. if derphi0 is None:
  321. derphi0 = derphi(0.)
  322. alpha0 = 0
  323. if old_phi0 is not None and derphi0 != 0:
  324. alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
  325. else:
  326. alpha1 = 1.0
  327. if alpha1 < 0:
  328. alpha1 = 1.0
  329. if amax is not None:
  330. alpha1 = min(alpha1, amax)
  331. phi_a1 = phi(alpha1)
  332. #derphi_a1 = derphi(alpha1) evaluated below
  333. phi_a0 = phi0
  334. derphi_a0 = derphi0
  335. if extra_condition is None:
  336. def extra_condition(alpha, phi):
  337. return True
  338. for i in range(maxiter):
  339. if alpha1 == 0 or (amax is not None and alpha0 > amax):
  340. # alpha1 == 0: This shouldn't happen. Perhaps the increment has
  341. # slipped below machine precision?
  342. alpha_star = None
  343. phi_star = phi0
  344. phi0 = old_phi0
  345. derphi_star = None
  346. if alpha1 == 0:
  347. msg = 'Rounding errors prevent the line search from converging'
  348. else:
  349. msg = "The line search algorithm could not find a solution " + \
  350. f"less than or equal to amax: {amax}"
  351. warn(msg, LineSearchWarning, stacklevel=2)
  352. break
  353. not_first_iteration = i > 0
  354. if (phi_a1 > phi0 + c1 * alpha1 * derphi0) or \
  355. ((phi_a1 >= phi_a0) and not_first_iteration):
  356. alpha_star, phi_star, derphi_star = \
  357. _zoom(alpha0, alpha1, phi_a0,
  358. phi_a1, derphi_a0, phi, derphi,
  359. phi0, derphi0, c1, c2, extra_condition)
  360. break
  361. derphi_a1 = derphi(alpha1)
  362. if (abs(derphi_a1) <= -c2*derphi0):
  363. if extra_condition(alpha1, phi_a1):
  364. alpha_star = alpha1
  365. phi_star = phi_a1
  366. derphi_star = derphi_a1
  367. break
  368. if (derphi_a1 >= 0):
  369. alpha_star, phi_star, derphi_star = \
  370. _zoom(alpha1, alpha0, phi_a1,
  371. phi_a0, derphi_a1, phi, derphi,
  372. phi0, derphi0, c1, c2, extra_condition)
  373. break
  374. alpha2 = 2 * alpha1 # increase by factor of two on each iteration
  375. if amax is not None:
  376. alpha2 = min(alpha2, amax)
  377. alpha0 = alpha1
  378. alpha1 = alpha2
  379. phi_a0 = phi_a1
  380. phi_a1 = phi(alpha1)
  381. derphi_a0 = derphi_a1
  382. else:
  383. # stopping test maxiter reached
  384. alpha_star = alpha1
  385. phi_star = phi_a1
  386. derphi_star = None
  387. warn('The line search algorithm did not converge',
  388. LineSearchWarning, stacklevel=2)
  389. return alpha_star, phi_star, phi0, derphi_star
  390. def _cubicmin(a, fa, fpa, b, fb, c, fc):
  391. """
  392. Finds the minimizer for a cubic polynomial that goes through the
  393. points (a,fa), (b,fb), and (c,fc) with derivative at a of fpa.
  394. If no minimizer can be found, return None.
  395. """
  396. # f(x) = A *(x-a)^3 + B*(x-a)^2 + C*(x-a) + D
  397. with np.errstate(divide='raise', over='raise', invalid='raise'):
  398. try:
  399. C = fpa
  400. db = b - a
  401. dc = c - a
  402. denom = (db * dc) ** 2 * (db - dc)
  403. d1 = np.empty((2, 2))
  404. d1[0, 0] = dc ** 2
  405. d1[0, 1] = -db ** 2
  406. d1[1, 0] = -dc ** 3
  407. d1[1, 1] = db ** 3
  408. [A, B] = np.dot(d1, np.asarray([fb - fa - C * db,
  409. fc - fa - C * dc]).flatten())
  410. A /= denom
  411. B /= denom
  412. radical = B * B - 3 * A * C
  413. xmin = a + (-B + np.sqrt(radical)) / (3 * A)
  414. except ArithmeticError:
  415. return None
  416. if not np.isfinite(xmin):
  417. return None
  418. return xmin
  419. def _quadmin(a, fa, fpa, b, fb):
  420. """
  421. Finds the minimizer for a quadratic polynomial that goes through
  422. the points (a,fa), (b,fb) with derivative at a of fpa.
  423. """
  424. # f(x) = B*(x-a)^2 + C*(x-a) + D
  425. with np.errstate(divide='raise', over='raise', invalid='raise'):
  426. try:
  427. D = fa
  428. C = fpa
  429. db = b - a * 1.0
  430. B = (fb - D - C * db) / (db * db)
  431. xmin = a - C / (2.0 * B)
  432. except ArithmeticError:
  433. return None
  434. if not np.isfinite(xmin):
  435. return None
  436. return xmin
  437. def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
  438. phi, derphi, phi0, derphi0, c1, c2, extra_condition):
  439. """Zoom stage of approximate linesearch satisfying strong Wolfe conditions.
  440. Part of the optimization algorithm in `scalar_search_wolfe2`.
  441. Notes
  442. -----
  443. Implements Algorithm 3.6 (zoom) in Wright and Nocedal,
  444. 'Numerical Optimization', 1999, pp. 61.
  445. """
  446. maxiter = 10
  447. i = 0
  448. delta1 = 0.2 # cubic interpolant check
  449. delta2 = 0.1 # quadratic interpolant check
  450. phi_rec = phi0
  451. a_rec = 0
  452. while True:
  453. # interpolate to find a trial step length between a_lo and
  454. # a_hi Need to choose interpolation here. Use cubic
  455. # interpolation and then if the result is within delta *
  456. # dalpha or outside of the interval bounded by a_lo or a_hi
  457. # then use quadratic interpolation, if the result is still too
  458. # close, then use bisection
  459. dalpha = a_hi - a_lo
  460. if dalpha < 0:
  461. a, b = a_hi, a_lo
  462. else:
  463. a, b = a_lo, a_hi
  464. # minimizer of cubic interpolant
  465. # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
  466. #
  467. # if the result is too close to the end points (or out of the
  468. # interval), then use quadratic interpolation with phi_lo,
  469. # derphi_lo and phi_hi if the result is still too close to the
  470. # end points (or out of the interval) then use bisection
  471. if (i > 0):
  472. cchk = delta1 * dalpha
  473. a_j = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi,
  474. a_rec, phi_rec)
  475. if (i == 0) or (a_j is None) or (a_j > b - cchk) or (a_j < a + cchk):
  476. qchk = delta2 * dalpha
  477. a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
  478. if (a_j is None) or (a_j > b-qchk) or (a_j < a+qchk):
  479. a_j = a_lo + 0.5*dalpha
  480. # Check new value of a_j
  481. phi_aj = phi(a_j)
  482. if (phi_aj > phi0 + c1*a_j*derphi0) or (phi_aj >= phi_lo):
  483. phi_rec = phi_hi
  484. a_rec = a_hi
  485. a_hi = a_j
  486. phi_hi = phi_aj
  487. else:
  488. derphi_aj = derphi(a_j)
  489. if abs(derphi_aj) <= -c2*derphi0 and extra_condition(a_j, phi_aj):
  490. a_star = a_j
  491. val_star = phi_aj
  492. valprime_star = derphi_aj
  493. break
  494. if derphi_aj*(a_hi - a_lo) >= 0:
  495. phi_rec = phi_hi
  496. a_rec = a_hi
  497. a_hi = a_lo
  498. phi_hi = phi_lo
  499. else:
  500. phi_rec = phi_lo
  501. a_rec = a_lo
  502. a_lo = a_j
  503. phi_lo = phi_aj
  504. derphi_lo = derphi_aj
  505. i += 1
  506. if (i > maxiter):
  507. # Failed to find a conforming step size
  508. a_star = None
  509. val_star = None
  510. valprime_star = None
  511. break
  512. return a_star, val_star, valprime_star
  513. #------------------------------------------------------------------------------
  514. # Armijo line and scalar searches
  515. #------------------------------------------------------------------------------
  516. def line_search_armijo(f, xk, pk, gfk, old_fval, args=(), c1=1e-4, alpha0=1):
  517. """Minimize over alpha, the function ``f(xk+alpha pk)``.
  518. Parameters
  519. ----------
  520. f : callable
  521. Function to be minimized.
  522. xk : array_like
  523. Current point.
  524. pk : array_like
  525. Search direction.
  526. gfk : array_like
  527. Gradient of `f` at point `xk`.
  528. old_fval : float
  529. Value of `f` at point `xk`.
  530. args : tuple, optional
  531. Optional arguments.
  532. c1 : float, optional
  533. Value to control stopping criterion.
  534. alpha0 : scalar, optional
  535. Value of `alpha` at start of the optimization.
  536. Returns
  537. -------
  538. alpha
  539. f_count
  540. f_val_at_alpha
  541. Notes
  542. -----
  543. Uses the interpolation algorithm (Armijo backtracking) as suggested by
  544. Wright and Nocedal in 'Numerical Optimization', 1999, pp. 56-57
  545. """
  546. xk = np.atleast_1d(xk)
  547. fc = [0]
  548. def phi(alpha1):
  549. fc[0] += 1
  550. return f(xk + alpha1*pk, *args)
  551. if old_fval is None:
  552. phi0 = phi(0.)
  553. else:
  554. phi0 = old_fval # compute f(xk) -- done in past loop
  555. derphi0 = np.dot(gfk, pk)
  556. alpha, phi1 = scalar_search_armijo(phi, phi0, derphi0, c1=c1,
  557. alpha0=alpha0)
  558. return alpha, fc[0], phi1
  559. def line_search_BFGS(f, xk, pk, gfk, old_fval, args=(), c1=1e-4, alpha0=1):
  560. """
  561. Compatibility wrapper for `line_search_armijo`
  562. """
  563. r = line_search_armijo(f, xk, pk, gfk, old_fval, args=args, c1=c1,
  564. alpha0=alpha0)
  565. return r[0], r[1], 0, r[2]
  566. def scalar_search_armijo(phi, phi0, derphi0, c1=1e-4, alpha0=1, amin=0):
  567. """Minimize over alpha, the function ``phi(alpha)``.
  568. Uses the interpolation algorithm (Armijo backtracking) as suggested by
  569. Wright and Nocedal in 'Numerical Optimization', 1999, pp. 56-57
  570. alpha > 0 is assumed to be a descent direction.
  571. Returns
  572. -------
  573. alpha
  574. phi1
  575. """
  576. phi_a0 = phi(alpha0)
  577. if phi_a0 <= phi0 + c1*alpha0*derphi0:
  578. return alpha0, phi_a0
  579. # Otherwise, compute the minimizer of a quadratic interpolant:
  580. alpha1 = -(derphi0) * alpha0**2 / 2.0 / (phi_a0 - phi0 - derphi0 * alpha0)
  581. phi_a1 = phi(alpha1)
  582. if (phi_a1 <= phi0 + c1*alpha1*derphi0):
  583. return alpha1, phi_a1
  584. # Otherwise, loop with cubic interpolation until we find an alpha which
  585. # satisfies the first Wolfe condition (since we are backtracking, we will
  586. # assume that the value of alpha is not too small and satisfies the second
  587. # condition.
  588. while alpha1 > amin: # we are assuming alpha>0 is a descent direction
  589. factor = alpha0**2 * alpha1**2 * (alpha1-alpha0)
  590. a = alpha0**2 * (phi_a1 - phi0 - derphi0*alpha1) - \
  591. alpha1**2 * (phi_a0 - phi0 - derphi0*alpha0)
  592. a = a / factor
  593. b = -alpha0**3 * (phi_a1 - phi0 - derphi0*alpha1) + \
  594. alpha1**3 * (phi_a0 - phi0 - derphi0*alpha0)
  595. b = b / factor
  596. alpha2 = (-b + np.sqrt(abs(b**2 - 3 * a * derphi0))) / (3.0*a)
  597. phi_a2 = phi(alpha2)
  598. if (phi_a2 <= phi0 + c1*alpha2*derphi0):
  599. return alpha2, phi_a2
  600. if (alpha1 - alpha2) > alpha1 / 2.0 or (1 - alpha2/alpha1) < 0.96:
  601. alpha2 = alpha1 / 2.0
  602. alpha0 = alpha1
  603. alpha1 = alpha2
  604. phi_a0 = phi_a1
  605. phi_a1 = phi_a2
  606. # Failed to find a suitable step length
  607. return None, phi_a1
  608. #------------------------------------------------------------------------------
  609. # Non-monotone line search for DF-SANE
  610. #------------------------------------------------------------------------------
  611. def _nonmonotone_line_search_cruz(f, x_k, d, prev_fs, eta,
  612. gamma=1e-4, tau_min=0.1, tau_max=0.5):
  613. """
  614. Nonmonotone backtracking line search as described in [1]_
  615. Parameters
  616. ----------
  617. f : callable
  618. Function returning a tuple ``(f, F)`` where ``f`` is the value
  619. of a merit function and ``F`` the residual.
  620. x_k : ndarray
  621. Initial position.
  622. d : ndarray
  623. Search direction.
  624. prev_fs : float
  625. List of previous merit function values. Should have ``len(prev_fs) <= M``
  626. where ``M`` is the nonmonotonicity window parameter.
  627. eta : float
  628. Allowed merit function increase, see [1]_
  629. gamma, tau_min, tau_max : float, optional
  630. Search parameters, see [1]_
  631. Returns
  632. -------
  633. alpha : float
  634. Step length
  635. xp : ndarray
  636. Next position
  637. fp : float
  638. Merit function value at next position
  639. Fp : ndarray
  640. Residual at next position
  641. References
  642. ----------
  643. [1] "Spectral residual method without gradient information for solving
  644. large-scale nonlinear systems of equations." W. La Cruz,
  645. J.M. Martinez, M. Raydan. Math. Comp. **75**, 1429 (2006).
  646. """
  647. f_k = prev_fs[-1]
  648. f_bar = max(prev_fs)
  649. alpha_p = 1
  650. alpha_m = 1
  651. alpha = 1
  652. while True:
  653. xp = x_k + alpha_p * d
  654. fp, Fp = f(xp)
  655. if fp <= f_bar + eta - gamma * alpha_p**2 * f_k:
  656. alpha = alpha_p
  657. break
  658. alpha_tp = alpha_p**2 * f_k / (fp + (2*alpha_p - 1)*f_k)
  659. xp = x_k - alpha_m * d
  660. fp, Fp = f(xp)
  661. if fp <= f_bar + eta - gamma * alpha_m**2 * f_k:
  662. alpha = -alpha_m
  663. break
  664. alpha_tm = alpha_m**2 * f_k / (fp + (2*alpha_m - 1)*f_k)
  665. alpha_p = np.clip(alpha_tp, tau_min * alpha_p, tau_max * alpha_p)
  666. alpha_m = np.clip(alpha_tm, tau_min * alpha_m, tau_max * alpha_m)
  667. return alpha, xp, fp, Fp
  668. def _nonmonotone_line_search_cheng(f, x_k, d, f_k, C, Q, eta,
  669. gamma=1e-4, tau_min=0.1, tau_max=0.5,
  670. nu=0.85):
  671. """
  672. Nonmonotone line search from [1]
  673. Parameters
  674. ----------
  675. f : callable
  676. Function returning a tuple ``(f, F)`` where ``f`` is the value
  677. of a merit function and ``F`` the residual.
  678. x_k : ndarray
  679. Initial position.
  680. d : ndarray
  681. Search direction.
  682. f_k : float
  683. Initial merit function value.
  684. C, Q : float
  685. Control parameters. On the first iteration, give values
  686. Q=1.0, C=f_k
  687. eta : float
  688. Allowed merit function increase, see [1]_
  689. nu, gamma, tau_min, tau_max : float, optional
  690. Search parameters, see [1]_
  691. Returns
  692. -------
  693. alpha : float
  694. Step length
  695. xp : ndarray
  696. Next position
  697. fp : float
  698. Merit function value at next position
  699. Fp : ndarray
  700. Residual at next position
  701. C : float
  702. New value for the control parameter C
  703. Q : float
  704. New value for the control parameter Q
  705. References
  706. ----------
  707. .. [1] W. Cheng & D.-H. Li, ''A derivative-free nonmonotone line
  708. search and its application to the spectral residual
  709. method'', IMA J. Numer. Anal. 29, 814 (2009).
  710. """
  711. alpha_p = 1
  712. alpha_m = 1
  713. alpha = 1
  714. while True:
  715. xp = x_k + alpha_p * d
  716. fp, Fp = f(xp)
  717. if fp <= C + eta - gamma * alpha_p**2 * f_k:
  718. alpha = alpha_p
  719. break
  720. alpha_tp = alpha_p**2 * f_k / (fp + (2*alpha_p - 1)*f_k)
  721. xp = x_k - alpha_m * d
  722. fp, Fp = f(xp)
  723. if fp <= C + eta - gamma * alpha_m**2 * f_k:
  724. alpha = -alpha_m
  725. break
  726. alpha_tm = alpha_m**2 * f_k / (fp + (2*alpha_m - 1)*f_k)
  727. alpha_p = np.clip(alpha_tp, tau_min * alpha_p, tau_max * alpha_p)
  728. alpha_m = np.clip(alpha_tm, tau_min * alpha_m, tau_max * alpha_m)
  729. # Update C and Q
  730. Q_next = nu * Q + 1
  731. C = (nu * Q * (C + eta) + fp) / Q_next
  732. Q = Q_next
  733. return alpha, xp, fp, Fp, C, Q