# 参数的更新

## 随机梯度下降 SGD

$\newcommand{\bd}{\boldsymbol}$

$\bd{W} \leftarrow \bd{W} - \eta \frac{\partial L}{\partial \bd{W}}\\ \bd{W}：权重 \quad \eta：学习率 \quad L：损失函数$
import numpy as np

class SGD:
    """Plain stochastic gradient descent: W <- W - lr * dL/dW."""

    def __init__(self, lr=0.01):
        """Store the learning rate ``lr`` used to scale every gradient."""
        self.lr = lr

    def update(self, params, grads):
        """Apply one SGD step to every entry of ``params``.

        ``params`` and ``grads`` are dict-like and are indexed with the
        same keys; each parameter is decreased by ``lr`` times its gradient.
        """
        step = self.lr
        for name in params:
            params[name] -= step * grads[name]


import numpy as np
import matplotlib.pyplot as plt

def f(x, y):
    """Return ``x**2 / 20 + y**2``, the surface used in the optimizer demos."""
    x_term = x**2 / 20.0
    y_term = y**2
    return x_term + y_term
# Sample f on a 30 x 30 grid covering [-10, 10] along both axes.
x = np.linspace(-10, 10, 30)
y = np.linspace(-10, 10, 30)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)

# Draw the surface as 50 contour levels on a 3D axes.
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.contour3D(X, Y, Z, 50, cmap='binary')
ax.set(xlabel='x', ylabel='y', zlabel='z')
# Adjust the camera: elevation of 60 degrees, azimuth of 35 degrees.
ax.view_init(elev=60, azim=35)


SGD搜索最小值

## Momentum SGD

Momentum 表示动量，其原理就和物理世界一样，想象有个小球从最高点滑到最低点，小球在下降的过程中，会带有一定动量，从而其轨迹并不是沿梯度下降的方向，而是在动量惯性的作用下接近最低点。

$\bd{v} \leftarrow \alpha \bd{v} -\eta \frac{\partial L}{\partial \bd{W}}\\ \bd{W} \leftarrow \bd{W} + \bd{v}$

class Momentum:
    """Momentum SGD: v <- momentum * v - lr * dL/dW, then W <- W + v."""

    def __init__(self, lr=0.01, momentum=0.9):
        """Store the learning rate and momentum coefficient.

        The velocity dict ``v`` is created lazily on the first ``update``
        call, once the parameter shapes are known.
        """
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        """Advance every entry of ``params`` by one momentum step.

        ``params`` and ``grads`` are dict-like and indexed with the same keys.
        """
        if self.v is None:
            # One zero-filled velocity per parameter, matching its shape.
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity

Momentum搜索最小值

$\bd{h} \leftarrow \bd{h} + \frac{\partial L}{\partial \bd{W}} \odot \frac{\partial L}{\partial \bd{W}}\\ \bd{W} \leftarrow \bd{W} - \eta \frac{1}{\sqrt{\bd{h}}} \frac{\partial L}{\partial \bd{W}}$

$\odot$ 表示矩阵的对应元素相乘。$\bd{h}$ 会记录过去所有梯度的平方和，显然学习的次数越多，$\bd{h}$ 越大，则 $\eta \frac{1}{\sqrt{\bd{h}}}$ 越小。

class AdaGrad:
    """AdaGrad: per-element learning rates scaled by accumulated squared gradients.

    Update rule:
        h <- h + dL/dW * dL/dW          (element-wise)
        W <- W - lr * dL/dW / sqrt(h)
    """

    def __init__(self, lr=0.01):
        """Store the learning rate; the accumulator ``h`` is created lazily."""
        self.lr = lr
        self.h = None

    def update(self, params, grads):
        """Apply one AdaGrad step to every entry of ``params``.

        ``params`` and ``grads`` are dict-like and indexed with the same keys.
        """
        if self.h is None:
            # One zero-filled accumulator per parameter, matching its shape.
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)

        for key in params.keys():
            # Accumulate the squared gradient first; without this step h
            # stays at zero and every update divides by 1e-7 alone.
            self.h[key] += grads[key] * grads[key]
            # 1e-7 guards against division by zero.
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)


## 其他方法

class Adam:
    """Adam optimizer keeping running averages of the gradient and its square."""

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        """Store the hyper-parameters.

        ``beta1`` and ``beta2`` are the decay rates of the running averages
        ``m`` (gradient) and ``v`` (squared gradient). Both dicts are created
        lazily on the first ``update`` call; ``iter`` counts update calls.
        """
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        """Apply one Adam step to every entry of ``params``.

        ``params`` and ``grads`` are dict-like and indexed with the same keys.
        """
        if self.m is None:
            # Zero-filled running averages, one pair per parameter.
            self.m = {name: np.zeros_like(value) for name, value in params.items()}
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        self.iter += 1
        # Step size rescaled by sqrt(1 - beta2^t) / (1 - beta1^t), which
        # depends only on the iteration count t.
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)

        for name in params:
            grad = grads[name]
            # Incremental form of m = beta1 * m + (1 - beta1) * grad.
            self.m[name] += (1 - self.beta1) * (grad - self.m[name])
            # Incremental form of v = beta2 * v + (1 - beta2) * grad**2.
            self.v[name] += (1 - self.beta2) * (grad**2 - self.v[name])

            # 1e-7 keeps the denominator away from zero.
            params[name] -= lr_t * self.m[name] / (np.sqrt(self.v[name]) + 1e-7)


4种方法比较

# 权重的初始值

## 隐藏层的激活值分布

1. 构造一个5层神经网络，每层有100个节点，参数随机初始化
2. 随机构造1000个数据
3. 将数据输入到网络，看看每层的激活值的分布

# Batch Normalization

• 可以使学习快速进行
• 不那么依赖初始值
• 抑制过拟合

$\mu_B \leftarrow \frac{1}{m} \sum_{i=1}^m x_i\\ \sigma_B^2 \leftarrow \frac{1}{m} \sum_{i=1}^m (x_i-\mu_B)^2\\ \hat{x}_i \leftarrow \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \varepsilon}}$

Batch Norm 层一般插入在激活函数之前或之后，可以减小数据分布的偏向。