4.1 multilayer perceptron — the basics are skipped for now and will be filled in later.

# Activation functions decide whether a neuron should be activated by computing a weighted sum of its inputs and adding a bias.
# They are differentiable operators that transform input signals into outputs.
# Most activation functions are nonlinear.
%matplotlib inline
import torch
from d2l import torch as d2l

1 ReLU function

# rectified linear unit: ReLU(x) = max(x, 0), i.e., all negative elements are discarded (set to 0)
x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.relu(x)
d2l.plot(x.detach(), y.detach(), 'x', 'relu(x)', figsize=(5, 2.5))

# for a negative input the derivative of ReLU is 0; for a positive input it is 1
# note: at input = 0 ReLU is not differentiable (the left-hand derivative, 0, is used there)
y.backward(torch.ones_like(x), retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of relu', figsize=(5, 2.5))  # detach() keeps the values out of the autograd computation graph; we only want the data
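
A quick sanity check I added (this snippet is not part of the original notebook): the gradient reported by autograd should match the piecewise derivative, 0 for negative inputs and 1 for positive ones.

z = torch.tensor([-2.0, -0.5, 0.5, 3.0], requires_grad=True)
torch.relu(z).sum().backward()
print(z.grad)  # expected: tensor([0., 0., 1., 1.])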

2 sigmoid function

# sigmoid(x) = 1 / (1 + exp(-x)) squashes every input into the range (0, 1)
y = torch.sigmoid(x)
d2l.plot(x.detach(), y.detach(), 'x', 'sigmoid(x)', figsize=(5, 2.5))

# clear the previous gradients
x.grad.data.zero_()
y.backward(torch.ones_like(x), retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of sigmoid', figsize=(5, 2.5))
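
The derivative of sigmoid is sigmoid(x) * (1 - sigmoid(x)). A small check of that identity against autograd (my own snippet, assuming torch is already imported as above):

z = torch.tensor([-1.0, 0.0, 2.0], requires_grad=True)
s = torch.sigmoid(z)
s.sum().backward()
print(torch.allclose(z.grad, s.detach() * (1 - s.detach())))  # expected: True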


3 tanh function

y = torch.tanh(x)
d2l.plot(x.detach(), y.detach(), 'x', 'tanh(x)', figsize=(5, 2.5))

# the derivative of tanh is 1 - tanh(x)^2
x.grad.data.zero_()
y.backward(torch.ones_like(x), retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of tanh', figsize=(5, 2.5))
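
The same kind of check for tanh (my own snippet): autograd's gradient should equal 1 - tanh(x)^2.

z = torch.tensor([-1.0, 0.0, 2.0], requires_grad=True)
t = torch.tanh(z)
t.sum().backward()
print(torch.allclose(z.grad, 1 - t.detach() ** 2))  # expected: True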

4.2 implementation of the multilayer perceptron from scratch

# we keep using the image dataset from the previous chapter so the results can be compared with softmax regression
import torch
from torch import nn
from d2l import torch as d2l
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)  # load the dataset

4.2.1 initializing model parameters

# recall: each image in the dataset has 28*28 = 784 grayscale values and there are 10 classes;
# the training set has 60,000 images and the test set 10,000
# so the network has 784 input features and 10 output features
# we use a multilayer perceptron with a single hidden layer of 256 hidden units
# layer widths are usually chosen as powers of 2, which tends to make computation more efficient
# note: each layer gets its own weights and bias, so there are W1, b1 and W2, b2
num_inputs, num_outputs, num_hiddens = 784, 10, 256
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad = True) * 0.01)
# scale the initial weights to a small range; smaller initial weights help the model converge to a good solution faster

b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad = True))

W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad = True) * 0.01)

b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad = True))

params = [W1, b1, W2, b2]
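
A quick shape check I added (run right after the cell above) to confirm the parameters match the 784 -> 256 -> 10 architecture:

for name, p in zip(['W1', 'b1', 'W2', 'b2'], params):
    print(name, tuple(p.shape))
# W1 (784, 256)
# b1 (256,)
# W2 (256, 10)
# b2 (10,)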

4.2.2 activation function

def relu(X):
    a = torch.zeros_like(X)
    return torch.max(X, a)  # elementwise: each position takes the larger of X and 0
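
A tiny usage example of this relu helper (my own toy tensor, not from the original notes):

X_demo = torch.tensor([[-1.0, 2.0], [0.5, -3.0]])
print(relu(X_demo))
# tensor([[0.0000, 2.0000],
#         [0.5000, 0.0000]])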

4.2.3 model

# the images are fed in as flat 784-dimensional vectors, so reshape them first
def net(X):
    X = X.reshape(-1, num_inputs)  # -1 lets PyTorch infer the number of rows (the batch size)
    H = relu(X@W1 + b1)  # @ is matrix multiplication; matmul works just as well
    return (H@W2 + b2)
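
A small sanity check I added: pushing a fake batch shaped like two Fashion-MNIST images through net should give one row of 10 logits per image.

X_fake = torch.randn(2, 1, 28, 28)
print(net(X_fake).shape)  # expected: torch.Size([2, 10])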

4.2.4 loss function

loss = nn.CrossEntropyLoss(reduction='none')
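
With reduction='none' the criterion returns one loss value per example instead of a scalar; d2l's training routine reduces it (taking the mean when a torch optimizer is used) before backpropagating. A small illustration with dummy 3-class logits of my own:

logits = torch.tensor([[2.0, 0.5, 0.1], [0.2, 1.5, 0.3]])
labels = torch.tensor([0, 2])
l = loss(logits, labels)
print(l.shape)   # torch.Size([2]) -- one loss per example
print(l.mean())  # the scalar that is actually backpropagated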

4.2.5 training

num_epochs, lr = 10, 0.1  # train for 10 epochs with a learning rate of 0.1
updater = torch.optim.SGD(params, lr=lr)  # SGD updates the parameters from their gradients
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)
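
For reference, a stripped-down sketch of the kind of loop d2l.train_ch3 runs internally (my simplification; the real function also tracks training/test accuracy and draws the plot):

def simple_train(net, train_iter, loss, updater, num_epochs):
    for epoch in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X), y)   # per-example losses, because reduction='none'
            updater.zero_grad()
            l.mean().backward()   # reduce to a scalar, then backpropagate
            updater.step()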

d2l.predict_ch3(net, test_iter)

# keep everything else fixed and vary the hyperparameter num_hiddens to see how it affects the results and to find a good value for it
num_hiddens = 1024
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad = True) * 0.01)

b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad = True))

W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad = True) * 0.01)

b2 = nn.Parameter(torch.randn(num_outputs, requires_grad = True))

params = [W1, b1, W2, b2]
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr = lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)  # clearly, a larger hidden dimension gives higher accuracy and faster convergence
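
Instead of editing the cell by hand for each width, one could wrap the experiment in a small helper and scan several values. This is only a sketch; run_width is my own helper, not something provided by d2l:

def run_width(num_hiddens, num_epochs=10, lr=0.1):
    W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens) * 0.01)
    b1 = nn.Parameter(torch.zeros(num_hiddens))
    W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs) * 0.01)
    b2 = nn.Parameter(torch.zeros(num_outputs))
    def net(X):  # the same single-hidden-layer model, built from the local parameters
        H = relu(X.reshape(-1, num_inputs) @ W1 + b1)
        return H @ W2 + b2
    updater = torch.optim.SGD([W1, b1, W2, b2], lr=lr)
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)

for width in (64, 256, 1024):
    run_width(width)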

# what if we add a second hidden layer, for three layers in total?
num_hiddens_1 = 1024
num_hiddens_2 = 512
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens_1, requires_grad = True) * 0.01)

b1 = nn.Parameter(torch.zeros(num_hiddens_1, requires_grad = True))

W2 = nn.Parameter(torch.randn(num_hiddens_1, num_hiddens_2, requires_grad = True) * 0.01)

b2 = nn.Parameter(torch.randn(num_hiddens_2, requires_grad = True))
# everything above is the same as before
W3 = nn.Parameter(torch.randn(num_hiddens_2, num_outputs, requires_grad = True) * 0.01)

b3 = nn.Parameter(torch.randn(num_outputs, requires_grad = True))
params = [W1, b1, W2, b2, W3, b3]
def net(X):
    X = X.reshape(-1, num_inputs)
    H1 = relu(X@W1 + b1)
    H2 = relu(H1@W2 + b2)
    return (H2@W3 + b3)
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)  # not a huge change: convergence speeds up and accuracy improves a little

# what if we change the learning rate? increasing it moderately gives a large boost in accuracy
num_hiddens_1 = 1024
num_hiddens_2 = 512
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens_1, requires_grad = True) * 0.01)

b1 = nn.Parameter(torch.zeros(num_hiddens_1, requires_grad = True))

W2 = nn.Parameter(torch.randn(num_hiddens_1, num_hiddens_2, requires_grad = True) * 0.01)

b2 = nn.Parameter(torch.randn(num_hiddens_2, requires_grad = True))
# everything above is the same as before
W3 = nn.Parameter(torch.randn(num_hiddens_2, num_outputs, requires_grad = True) * 0.01)

b3 = nn.Parameter(torch.randn(num_outputs, requires_grad = True))
params = [W1, b1, W2, b2, W3, b3]
def net(X):
    X = X.reshape(-1, num_inputs)
    H1 = relu(X@W1 + b1)
    H2 = relu(H1@W2 + b2)
    return (H2@W3 + b3)
num_epochs, lr = 10, 0.5  # if the learning rate is too small, convergence slows down and accuracy falls short unless the number of epochs is raised as well
updater = torch.optim.SGD(params, lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)  # not a huge change: convergence speeds up and accuracy improves

4.3 concise implementation of the multilayer perceptron

import torch
from torch import nn
from d2l import torch as d2l
net = nn.Sequential(nn.Flatten(), nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))

def init_weights(m):
    if type(m) == nn.Linear:  # only the linear layers are reinitialized
        nn.init.normal_(m.weight, std = 0.01)  # initialize each linear layer's weight w from a normal distribution with std 0.01
net.apply(init_weights)
Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=256, bias=True)
  (2): ReLU()
  (3): Linear(in_features=256, out_features=10, bias=True)
)
batch_size, lr, num_epochs = 256, 0.1, 10
loss = nn.CrossEntropyLoss(reduction='none')
trainer = torch.optim.SGD(net.parameters(), lr=lr)
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
# this is again the case of a single hidden layer
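
For comparison with the two-hidden-layer experiment in the from-scratch section, a concise version with the same 1024/512 widths would look roughly like this (a sketch; net2 and trainer2 are my own names):

net2 = nn.Sequential(nn.Flatten(),
                     nn.Linear(784, 1024), nn.ReLU(),
                     nn.Linear(1024, 512), nn.ReLU(),
                     nn.Linear(512, 10))
net2.apply(init_weights)
trainer2 = torch.optim.SGD(net2.parameters(), lr=lr)
d2l.train_ch3(net2, train_iter, test_iter, loss, num_epochs, trainer2)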
