# Line Search and Quasi-Newton Methods 线性搜索与拟牛顿法

2019年11月20日 阅读数：140

b=aαF(a)f(a)f(b)(1)(1)b=a−α∇F(a)⇒f(a)≥f(b)

xk+1=xkαkf(xk),0kn(2)(2)xk+1=xk−αk∇f(xk),0≤k≤n

ide

f(x0)f(x1)f(x2)f(xn)(3)(3)f(x0)≥f(x1)≥f(x2)≥⋯≥f(xn)

f(xk+αdk)<f(xk)α(0,ϵ](4)(4)f(xk+αdk)<f(xk)∀α∈(0,ϵ]

dk=Bkf(xk)(5)(5)dk=−Bk∇f(xk)

# Line Search

α=argminα0h(α)=argminα0f(xk+αdk)(6)(6)α=argminα≥0h(α)=argminα≥0f(xk+αdk)

ui

## Bisection Search

L=(12)nα^(7)(7)L=(12)nα^

Lϵk[log2(α^ϵ)](8)(8)L≤ϵ⇒k≤[log2⁡(α^ϵ)]

 1 def bisection(dfun,theta,args,d,low,high,maxiter=1e4):
2     """
3     #Functionality:find the root of the function(fun) in the interval [low,high]
4     #@Parameters
5     #dfun:compute the graident of function f(x)
6     #theta:Parameters of the model
7     #args:other variables needed to compute the value of dfun
8     #[low,high]:the interval which contains the root
9     #maxiter:the max number of iterations
10     """
11     eps=1e-6
12     val_low=np.sum(dfun(theta+low*d,args)*d.T)
13     val_high=np.sum(dfun(theta+high*d,args)*d.T)
14     if val_low*val_high>0:
15         raise Exception('Invalid interval!')
16     iter_num=1
17     while iter_num<maxiter:
18         mid=(low+high)/2
19         val_mid=np.sum(dfun(theta+mid*d,args)*d.T)
20         if abs(val_mid)<eps or abs(high-low)<eps:
21             return mid
22         elif val_mid*val_low>0:
23             low=mid
24         else:
25             high=mid
26         iter_num+=1

## Backtracking

f(xk+αdk)f(xk)+c1αf(xk)Tdk(9)(9)f(xk+αdk)≤f(xk)+c1αf′(xk)Tdk

h(0)<c1h(0)<0(10)(10)h′(0)<c1h′(0)<0

h(0)=limα0h(α)h(0)α=limα0f(xk+αdk)f(xk)α<ch(0)(11)(11)h′(0)=limα→0h(α)−h(0)α=limα→0f(xk+αdk)−f(xk)α<ch′(0)

f(xk+αdk)f(xk)α<cf(xk)Tdk(12)(12)f(xk+αdk)−f(xk)α<cf′(xk)Tdk

 1 def ArmijoBacktrack(fun,dfun,theta,args,d,stepsize=1,tau=0.5,c1=1e-3):
2     """
3     #Functionality:find an acceptable stepsize via backtrack under Armijo rule
4     #@Parameters
5     #fun:compute the value of objective function
6     #dfun:compute the gradient of objective function
7     #theta:a vector of parameters of the model
8     #stepsize:initial step size
9     #c1:sufficient decrease Parameters
10     #tau:rate of shrink of stepsize
11     """
12     slope=np.sum(dfun(theta,args)*d.T)
13     obj_old=costFunction(theta,args)
14     theta_new=theta+stepsize*d
15     obj_new=costFunction(theta_new,args)
16     while obj_new>obj_old+c1*stepsize*slope:
17         stepsize*=tau
18         theta_new=theta+stepsize*d
19         obj_new=costFunction(theta_new,args)
20     return stepsize

## Interpolation

hq(α)=(h(α0)h(0)α0h(0)α20)α2+h(0)α+h(0)(13)(13)hq(α)=(h(α0)−h(0)−α0h′(0)α02)α2+h′(0)α+h(0)

α1=h(0)α202[h(0)+h(0)α0h(α0)](14)(14)α1=h′(0)α022[h(0)+h′(0)α0−h(α0)]

hc(α)=aα3+bα2+h(0)α+h(0)(15)(15)hc(α)=aα3+bα2+h′(0)α+h(0)

[ab]=1α2i1α2i(αiαi1)[α2i1α3i1α2iα3i][h(αi)h(0)h(0)αih(αi1)h(0)h(0)αi1](16)(16)[ab]=1αi−12αi2(αi−αi−1)[αi−12−αi2−αi−13αi3][h(αi)−h(0)−h′(0)αih(αi−1)−h(0)−h′(0)αi−1]

αi+1=b+b23ah(0)−−−−−−−−−−√3a(17)(17)αi+1=−b+b2−3ah′(0)3a

H3(α)=[1+2αiααiαi1][ααi1αiαi1]2h(αi)+[1+2ααi1αiαi1][αi+1ααiαi1]2h(αi1)+(ααi)[ααi1αiαi1]2h(αi)+(ααi1)[αiααiαi1]2h(αi1)(18)(18)H3(α)=[1+2αi−ααi−αi−1][α−αi−1αi−αi−1]2h(αi)+[1+2α−αi−1αi−αi−1][αi+1−ααi−αi−1]2h(αi−1)+(α−αi)[α−αi−1αi−αi−1]2h′(αi)+(α−αi−1)[αi−ααi−αi−1]2h′(αi−1)

αi+1=αi(αiαi1)[h(αi)+d2d1h(αi)h(αi1)+2d2](19)(19)αi+1=αi−(αi−αi−1)[h′(αi)+d2−d1h′(αi)−h′(αi−1)+2d2]

d1=h(αi)+h(αi1)3[h(αi)h(αi1)αiαi1](20)(20)d1=h′(αi)+h′(αi−1)−3[h(αi)−h(αi−1)αi−αi−1]
d2=sign(αiαi1)d21h(αi1)h(αi)−−−−−−−−−−−−−−−√(21)(21)d2=sign(αi−αi−1)d12−h′(αi−1)h′(αi)

 1 def quadraticInterpolation(a,h,h0,g0):
2     """
3     #Functionality:Approximate h(a) with a quadratic function and return its stationary point
4     #@Parameters
5     #a:current stepsize
6     #h:a function value about stepsize,h(a)=f(x_k+a*d)
7     #h:h(0)=f(x_k)
8     #g0:h'(0)=f'(0)
9     """
10     numerator=g0*a**2
11     denominator=2*(g0*a+h0-h)
12     if abs(denominator)<1e-12:#indicates that a is almost 0
13         return a
14     return numerator/denominator
def cubicInterpolation(a0,h0,a1,h1,h,g):
"""
#Functionality:Approximate h(x) with a cubic function and return its stationary point
#This version of cubic interpolation computes h'(x) as few as possible,suitable for the case in which computing derivative is more expensive than computing function values
#@Parameters
#a0 and a1 are stepsize it previous two iterations
#h0:h(a0)
#h1:h(a1)
#h:h(0)=f(x)
#g:h'(0)
"""
mat=matlib.matrix([[a0**2,-a1**2],[-a0**3,a1**3]])
vec=matlib.matrix([[h1-h-g*a1],[h0-h-g*a0]])
ab=mat*vec/(a0**2*a1**2*(a1-a0))
a=ab[0,0]
b=ab[1,0]
if abs(a)<1e-12:#a=0 and cubic function is a quadratic one
return -g/(2*b)
return (-b+np.sqrt(b**2-3*a*g))/(3*a)
def cubicInterpolationHermite(a0,h0,g0,a1,h1,g1):
"""
#Functionality:Approximate h(a) with a cubic Hermite polynomial function and return its stationary point
#This version of cubic interpolation computes h(a) as few as possible,suitable for the case in which computing derivative is easier than computing function values
#@Parameters
#a0 and a1 are stepsize it previous two iterations
#h0:h(a0)
#g0:h'(a0)
#h1:h(a1)
#g1:h'(a1)
"""
d1=g0+g1-3*(h1-h0)/(a1-a0)
d2=np.sign(a1-a0)*np.sqrt(d1**2-g0*g1)
res=a1-(a1-a0)*(g1+d2-d2)/(g1-g0+2*d2)
return res

 1 def ArmijoLineSearch(fun,dfun,theta,args,d,a0=1,c1=1e-3,a_min=1e-7,max_iter=1e5):
2     """
3     #Functionality:Line search under Armijo condition with quadratic and cubic interpolation
4     #@Parameters
5     #fun:objective Function
6     #dfun:compute the gradient of fun
7     #theta:a vector of parameters of the model
8     #args:other variables needed for fun and func
9     #d:search direction
10     #a0:initial stepsize
11     #c1:constant used in Armijo condition
12     #a_min:minimun of stepsize
13     #max_iter:maximum of the number of iterations
14     """
15     eps=1e-6
16     c1=min(c1,0.5)#c1 should<=0.5
17     a_pre=h_pre=g_pre=0
18     a_cur=a0
19     f_val=fun(theta,args) #h(0)=f(0)
20     g_val=np.sum(dfun(theta,args)*d.T) #h'(0)=f'(x)^Td
21     h_cur=g_cur=0
22     k=0
23     while a_cur>a_min and k<max_iter:
24         h_cur=fun(theta+a_cur*d,args)
25         g_cur=np.sum(dfun(theta+a_cur*d,args)*d.T)
26         if h_cur<=f_val+c1*a_cur*g_val: #meet Armijo condition
27             return a_cur
28         if not k: #k=0,use quadratic interpolation
30         else: #k>0,use cubic Hermite interpolation
31             a_new=cubicInterpolationHermite(a_pre,h_pre,g_pre,a_cur,h_cur,g_cur)
32         if abs(a_new-a_cur)<eps or abs(a_new)<eps: #safeguard procedure
33             a_new=a_cur/2
34         a_pre=a_cur
35         a_cur=a_new
36         h_pre=h_cur
37         g_pre=g_cur
38         k+=1
39     return a_min #failed search

## Wolfe Search

h(α)=f(xk+αdk)Tdkc2f(xk)Tdk(22)(22)h′(α)=f′(xk+αdk)Tdk≥c2f′(xk)Tdk

{f(xk+αdk)f(xk+αdk)Tdkf(xk)+c1αf(xk)Tdkc2f(xk)Tdk(23)(23){f(xk+αdk)≤f(xk)+c1αf′(xk)Tdkf′(xk+αdk)Tdk≥c2f′(xk)Tdk

{f(xk+αdk)|f(xk+αdk)Tdk|f(xk)+c1αf(xk)Tdkc2|f(xk)Tdk|(24)(24){f(xk+αdk)≤f(xk)+c1αf′(xk)Tdk|f′(xk+αdk)Tdk|≥c2|f′(xk)Tdk|

f(xk+αdk)=f(xk)+αc1f(xk)Tdk(25)(25)f(xk+α′dk)=f(xk)+α′c1f′(xk)Tdk

f(xk+αdk)f(xk)=αf(xk+α′′dk)Tdk(26)(26)f(xk+α′dk)−f(xk)=α′f′(xk+α″dk)Tdk

f(xk+α′′dk)Tdk=c1f(xk)Tdk>c2f(xk)Tdk(27)(27)f′(xk+α″dk)Tdk=c1f′(xk)Tdk>c2f′(xk)Tdk

 1 def WolfeLineSearch(fun,dfun,theta,args,d,a0=1,c1=1e-4,c2=0.9,a_min=1e-7,max_iter=1e5):
2     """
3     #Functionality:find a stepsize meeting Wolfe condition
4     #@Parameters
5     #fun:objective Function
6     #dfun:compute the gradient of fun
7     #theta:a vector of parameters of the model
8     #args:other variables needed for fun and func
9     #d:search direction
10     #a0:intial stepsize
11     #c1:constant used in Armijo condition
12     #c2:constant used in curvature condition
13     #a_min:minimun of stepsize
14     #max_iter:maximum of the number of iterations
15     """
16     eps=1e-16
17     c1=min(c1,0.5)
18     a_pre=0
19     a_cur=a0
20     f_val=fun(theta,args) #h(0)=f(x)
21     g_val=np.sum(dfun(theta,args)*d.T)
22     h_pre=f_val #h'(0)=f'(x)^Td
23     k=0
24     while k<max_iter and abs(a_cur-a_pre)>=eps:
26         if h_cur>f_val+c1*a_cur*g_val or h_cur>=h_pre and k>0:
27             return zoom(fun,dfun,theta,args,d,a_pre,a_cur,c1,c2)
28         g_cur=np.sum(dfun(theta+a_cur*d,args)*d.T)
29         if abs(g_cur)<=-c2*g_val:#satisfy Wolfe condition
30             return a_cur
31         if g_cur>=0:
32             return zoom(fun,dfun,theta,args,d,a_pre,a_cur,c1,c2)
34         a_pre=a_cur
35         a_cur=a_new
36         h_pre=h_cur
37         k+=1
38     return a_min

zoom函数的算法描述见6。zoom函数中须要传入搜寻区间

zoom函数对应的Python代码以下：

 1 def zoom(fun,dfun,theta,args,d,a_low,a_high,c1=1e-3,c2=0.9,max_iter=1e4):
2     """
3     #Functionality:enlarge the interval to find a stepsize meeting Wolfe condition
4     #@Parameters
5     #fun:objective Function
6     #dfun:compute the gradient of fun
7     #theta:a vector of parameters of the model
8     #args:other variables needed for fun and func
9     #d:search direction
10     #[a_low,a_high]:interval containing a stepsize satisfying Wolfe condition
11     #c1:constant used in Armijo condition
12     #c2:constant used in curvature condition
13     #max_iter:maximum of the number of iterations
14     """
15     if a_low>a_high:
16         print('low:%f,high:%f'%(a_low,a_high))
17         raise Exception('Invalid interval of stepsize in zoom procedure')
18     eps=1e-16
19     h=fun(theta,args) #h(0)=f(x)
20     g=np.sum(dfun(theta,args)*d.T) #h'(0)=f'(x)^Td
21     k=0
22     h_low=fun(theta+a_low*d,args)
23     h_high=fun(theta+a_high*d,args)
24     if h_low>h+c1*a_low*g:
25         raise Exception('Left endpoint violates Armijo condition in zoom procedure')
26     while k<max_iter and abs(a_high-a_low)>=eps:
27         a_new=(a_low+a_high)/2
28         h_new=fun(theta+a_new*d,args)
29         if h_new>h+c1*a_new*g or h_new>h_low:
30             a_high=a_new
31             h_high=h_new
32         else:
33             g_new=np.sum(dfun(theta+a_new*d,args)*d.T)
34             if abs(g_new)<=-c2*g: #satisfy Wolfe condition
35                 return a_new
36             if g_new*(a_high-a_low)>=0:
37                 a_high=a_new
38                 h_high=h_new
39             else:
40                 a_low=a_new
41                 h_low=h_new
42         k+=1
43     return a_low #a_low definitely satisfy Armijo condition

# Newton's Method

f(xk+x)f(xk)+f(xk)x+12xTBkx(28)(28)f(xk+△x)≈f(xk)+f′(xk)△x+12△xTBk△x

f(xk+1)=f(xk)+Bk(xk+1xk)(29)(29)f′(xk+1)=f′(xk)+Bk(xk+1−xk)

xk+1=xkB1kf(xk)(30)(30)xk+1=xk−Bk−1f′(xk)

# Quasi-Newton Method

f(xk+1)f(xk)Bk+1(xk+1xk)(31)(31)f′(xk+1)−f′(xk)≈Bk+1(xk+1−xk)

 1 def BFGS(fun,dfun,theta,args,H=None,mode=0,eps=1e-12,max_iter=1e4):
2     """
3     #Functionality:find the minimum of objective function f(x)
4     #@Parameters
5     #fun:objective function f(x)
6     #dfun:compute the gradient of f(x)
7     #args:parameters needed by fun and dfun
8     #theta:start vector of parameters of the model
9     #H:initial inverse Hessian approximation
10     #mode:index of line search algorithm
11     """
12     x_pre=x_cur=theta
13     g=dfun(x_cur,args)
14     I=matlib.eye(theta.size)
15     if not H:#initialize H as an identity matrix
16         H=I
17     k=0
18     while k<max_iter and np.sum(np.abs(g))>eps:
19         d=-g*H
20         step=LineSearch(fun,dfun,x_pre,args,d,1,mode)
21         x_cur=x_pre+step*d
22         s=step*d
23         y=dfun(x_cur,args)-dfun(x_pre,args)
24         ys=np.sum(y*s.T)
25         if abs(ys)<eps:
26             return x_cur
27         change=(ys+np.sum(y*H*y.T))*(s.T*s)/(ys**2)-(H*y.T*s+s.T*y*H)/ys
28         H+=change
29         g=dfun(x_cur,args)
30         x_pre=x_cur
31         k+=1
32     return x_cur

===HkgkVTk1Hk1Vk1gk+sk1ρk1sTk1gkVTk1VTk2Hk2Vk2Vk1gk+VTk1sk2ρk2sTk2Vk1gk+sk1ρk1sTk1gk(VTk1VTk2VTkm)Hkm(VkmVk2Vk1)gk+(VTk1VTk2VTkm+1)skmρkmsTkm(Vkm+1Vk1Vk)gk+(VTk1VTk2VTkm+2)skm+1ρkm+1sTkm+1(Vkm+2Vk2Vk1)gk++VTk1sk2ρk2sTk2Vk1gk+sk1ρk1sTk1gk(32)(32)Hkgk=Vk−1THk−1Vk−1gk+sk−1ρk−1sk−1Tgk=Vk−1TVk−2THk−2Vk−2Vk−1gk+Vk−1Tsk−2ρk−2sk−2TVk−1gk+sk−1ρk−1sk−1Tgk=(Vk−1TVk−2T⋯Vk−mT)Hk−m(Vk−m⋯Vk−2Vk−1)gk+(Vk−1TVk−2T⋯Vk−m+1T)sk−mρk−msk−mT(Vk−m+1⋯Vk−1Vk)gk+(Vk−1TVk−2T⋯Vk−m+2T)sk−m+1ρk−m+1sk−m+1T(Vk−m+2⋯Vk−2Vk−1)gk+⋯+Vk−1Tsk−2ρk−2sk−2TVk−1gk+sk−1ρk−1sk−1Tgk

qi=(VkiVk2Vk1)gk(33)(33)qi=(Vk−i⋯Vk−2Vk−1)gk
ai=ρkisTkiqi1(34)(34)ai=ρk−isk−iTqi−1