# 计算图

1. 先考虑 g 到 p 的反向传播：$\dfrac{\partial g}{\partial p}=z$
2. 然后再考虑 p 到 x 的反向传播：$\dfrac{\partial p}{\partial x}=1$
3. 将两个组合起来：$\dfrac{\partial g}{\partial p} \cdot \dfrac{\partial p}{\partial x}= z$

# 简单层的实现

## 乘法层和加法层

$z=x\cdot y\\ \frac{\partial z}{\partial x}=y\\ \frac{\partial z}{\partial y}=x$

$z=x+ y\\ \frac{\partial z}{\partial x}=1\\ \frac{\partial z}{\partial y}=1$
class MulLayer:
    """Multiplication node: forward computes x * y.

    Since d(xy)/dx = y and d(xy)/dy = x, backward multiplies the
    upstream gradient by the *swapped* forward inputs.
    """

    def __init__(self):
        # Cache the forward inputs; backward needs them.
        self.x = None
        self.y = None

    def forward(self, x, y):
        self.x = x
        self.y = y
        return x * y

    def backward(self, dout):
        # Swap: gradient w.r.t. x uses y, and vice versa.
        dx = dout * self.y
        dy = dout * self.x
        return dx, dy

class AddLayer:
    """Addition node: forward computes x + y.

    d(x+y)/dx = d(x+y)/dy = 1, so backward passes the upstream
    gradient through to both inputs unchanged.
    """

    def __init__(self):
        # Addition needs no cached state.
        pass

    def forward(self, x, y):
        return x + y

    def backward(self, dout):
        dx = dout * 1
        dy = dout * 1
        return dx, dy


## ReLU层

$y= \begin{cases} x, & (x>0)\\ 0, & (x\leq 0) \end{cases}\\ \frac{\partial y}{\partial x}=\begin{cases} 1, & (x>0)\\ 0, & (x\leq 0) \end{cases}$
import numpy as np

class Relu:
    """ReLU layer: y = x if x > 0 else 0 (formula above).

    Expects numpy-array input (uses .copy() and boolean indexing).
    """

    def __init__(self):
        # Boolean mask of positions where the forward input was <= 0.
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0  # zero the non-positive entries
        return out

    def backward(self, dout):
        # dy/dx = 1 where x > 0, 0 elsewhere — kill masked gradients.
        dout[self.mask] = 0
        dx = dout
        return dx



## Sigmoid层

$y=\frac{1}{1+\exp(-x)}\\ \frac{\partial y}{\partial x}=\frac{\exp(-x)}{[1+\exp(-x)]^2}=y(1-y)$
class Sigmoid:
    """Sigmoid layer: y = 1 / (1 + exp(-x)); dy/dx = y * (1 - y)."""

    def __init__(self):
        # Cache the forward output; backward is expressed in terms of y.
        self.out = None

    def forward(self, x):
        # Inlined sigmoid (the original called an undefined helper).
        out = 1.0 / (1.0 + np.exp(-x))
        self.out = out
        return out

    def backward(self, dout):
        # Uses the identity dy/dx = y * (1 - y) from the formula above.
        dx = dout * (1.0 - self.out) * self.out
        return dx


## Affine层

Affine 层包含多个权重以及偏置，其求导也相对复杂得多

$\newcommand{\bd}{\boldsymbol}$ $\bd{X}\cdot\bd{W}+\bd{B}=\bd{Y}$

$\begin{bmatrix} x_1 & x_2 \end{bmatrix} \cdot \begin{bmatrix} w_{11} & w_{12} & w_{13}\\ w_{21} & w_{22} & w_{23} \end{bmatrix} + \begin{bmatrix} b_1 & b_2 & b_3 \end{bmatrix}= \begin{bmatrix} y_1 & y_2 & y_3 \end{bmatrix}$

$\dfrac{\partial \bd{Y}}{\partial \bd{X}} = \begin{bmatrix} \dfrac{\partial y_1}{\partial x_1} & \dfrac{\partial y_1}{\partial x_2} \\ \dfrac{\partial y_2}{\partial x_1} & \dfrac{\partial y_2}{\partial x_2} \\ \dfrac{\partial y_3}{\partial x_1} & \dfrac{\partial y_3}{\partial x_2} \end{bmatrix} = \begin{bmatrix} w_{11} & w_{21} \\ w_{12} & w_{22} \\ w_{13} & w_{23} \end{bmatrix} = \bd{W}^T$

$\dfrac{\partial \bd{Y}}{\partial \bd{W}} = \bd{X}^T \quad \dfrac{\partial \bd{Y}}{\partial \bd{B}} = \bd{1}$

注意：批量输入时偏置 $\bd{B}$ 被广播加到每一行，因此 $\dfrac{\partial L}{\partial \bd{B}}$ 是 $\dfrac{\partial L}{\partial \bd{Y}}$ 沿批量方向（第 0 轴）求和，对应代码中的 `np.sum(dout, axis=0)`。

$\dfrac{\partial L}{\partial \bd{X}} = \dfrac{\partial L}{\partial \bd{Y}} \cdot \bd{W}^T\\ \dfrac{\partial L}{\partial \bd{W}} = \bd{X}^T \cdot \dfrac{\partial L}{\partial \bd{Y}}$

class Affine:
    """Fully connected layer: out = x @ W + b.

    Backward follows the formulas above:
    dL/dx = dout @ W^T,  dL/dW = x^T @ dout,  dL/db = column sum of dout.
    """

    def __init__(self, W, b):
        self.W = W  # weight matrix
        self.b = b  # bias vector

        self.x = None  # cached (flattened) forward input
        self.original_x_shape = None  # raw shape of the forward input
        # Parameter gradients filled in by backward.
        self.dW = None
        self.db = None

    def forward(self, x):
        # Flatten tensor input to 2-D (batch, features), remembering the
        # original shape so backward can restore it.
        self.original_x_shape = x.shape
        flat = x.reshape(x.shape[0], -1)
        self.x = flat
        return np.dot(flat, self.W) + self.b

    def backward(self, dout):
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)  # bias gradient sums over the batch
        dx = np.dot(dout, self.W.T)
        # Restore the original (possibly tensor) input shape.
        return dx.reshape(*self.original_x_shape)



# Softmax-with-Loss 层（Softmax 与交叉熵损失合并实现）

$y_k = \frac{\exp(a_k)}{\sum_{i=1}^n \exp(a_i)}$

class SoftmaxWithLoss:
    """Softmax activation combined with cross-entropy loss.

    Self-contained: softmax and cross-entropy are inlined (the original
    called undefined external helpers). Assumes batched 2-D input x;
    teacher data t may be one-hot or label indices.
    """

    def __init__(self):
        self.loss = None
        self.y = None  # softmax output
        self.t = None  # teacher data

    def forward(self, x, t):
        self.t = t
        # Numerically stable softmax: shift by the row-wise max first.
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exp_x = np.exp(shifted)
        self.y = exp_x / np.sum(exp_x, axis=-1, keepdims=True)

        batch_size = self.y.shape[0]
        if self.t.size == self.y.size:
            # one-hot teacher data -> convert to label indices
            labels = self.t.argmax(axis=1)
        else:
            labels = self.t
        # 1e-7 guards against log(0)
        self.loss = -np.sum(
            np.log(self.y[np.arange(batch_size), labels] + 1e-7)
        ) / batch_size
        return self.loss

    def backward(self, dout=1):
        # dout is conventionally 1 (loss is the graph terminus); the
        # combined softmax+cross-entropy gradient is simply (y - t)/N.
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:  # one-hot teacher data
            dx = (self.y - self.t) / batch_size
        else:  # label-index teacher data
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx


# 误差反向传播算法的实现

from collections import OrderedDict

class TwoLayerNet:
    """Two-layer net: Affine1 -> ReLU -> Affine2 -> SoftmaxWithLoss.

    NOTE(review): the original text lost the `numerical_gradient` /
    `gradient` method headers and the gradient-collection tail; they are
    reconstructed here from the visible fragments and layer structure.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Initialize weights with small Gaussian noise, biases with zeros.
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # Build the layers in forward order; OrderedDict preserves it.
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Forward pass through all layers except the loss layer."""
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        """Cross-entropy loss of input x against teacher data t."""
        y = self.predict(x)
        return self.lastLayer.forward(y, t)

    def accuracy(self, x, t):
        """Fraction of samples whose argmax prediction matches t."""
        y = np.argmax(self.predict(x), axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)  # one-hot -> label indices
        return np.sum(y == t) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Gradients by central differences (slow; for gradient checking)."""
        loss_W = lambda W: self.loss(x, t)

        def _num_grad(f, param):
            # Central difference on each element of `param`, in place:
            # the layers hold references to `param`, so mutating it is
            # what makes loss_W see the perturbed value.
            h = 1e-4
            grad = np.zeros_like(param)
            it = np.nditer(param, flags=['multi_index'], op_flags=['readwrite'])
            while not it.finished:
                idx = it.multi_index
                orig = param[idx]
                param[idx] = orig + h
                fxh1 = f(param)
                param[idx] = orig - h
                fxh2 = f(param)
                grad[idx] = (fxh1 - fxh2) / (2 * h)
                param[idx] = orig  # restore
                it.iternext()
            return grad

        return {key: _num_grad(loss_W, self.params[key])
                for key in ('W1', 'b1', 'W2', 'b2')}

    def gradient(self, x, t):
        """Gradients by backpropagation."""
        # forward
        self.loss(x, t)

        # backward: run the layers in reverse order
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients cached inside the Affine layers.
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        return grads

# 设定



# Load data (load_mnist presumably comes from the book's dataset.mnist
# module — TODO confirm against the surrounding project).
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000  # number of SGD iterations
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# One epoch = enough iterations to cover the whole training set once.
iter_per_epoch = max(train_size / batch_size, 1)

for i in range(iters_num):
    # Sample a random mini-batch.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient via backpropagation.
    grad = network.gradient(x_batch, t_batch)

    # SGD update (in place, so the layers see the new parameter values).
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Report accuracy once per epoch.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

OUTPUT:
0.12738333333333332 0.1288
0.90455 0.909
0.92495 0.928
0.9365 0.9395
0.9444166666666667 0.946
0.9498 0.9504
0.9550333333333333 0.9544
0.9592666666666667 0.9566
0.95915 0.9562
0.9624333333333334 0.9585
0.9674333333333334 0.9643
0.9693166666666667 0.9628
0.9727166666666667 0.9672
0.9727 0.9658
0.9735166666666667 0.9673
0.975 0.968
0.9780833333333333 0.97


# Load data (load_mnist presumably comes from the book's dataset.mnist
# module — TODO confirm against the surrounding project).
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

x_batch = x_train[:3]
t_batch = t_train[:3]

# Gradient check: the original fragment prepared the batch but lost the
# comparison code. Compare backprop gradients against numerical ones;
# the mean absolute difference should be tiny (~1e-10 or less).
grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))

OUTPUT