4.1 multilayer perceptron — the basics are skipped for now and will be filled in later.

# Activation functions decide whether a neuron should be activated by computing a weighted sum of its inputs and adding a bias.
# They are differentiable operators that transform input signals into outputs.
# Most activation functions are nonlinear.
%matplotlib inline
import torch
from d2l import torch as d2l

1 ReLU function

# rectified linear unit: ReLU(x) = max(x, 0), i.e., all negative elements are discarded (set to 0)
x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.relu(x)
d2l.plot(x.detach(), y.detach(), 'x', 'relu(x)', figsize=(5, 2.5))

# for a negative input the derivative of ReLU is 0; for a positive input it is 1
# note: at input = 0 ReLU is not differentiable (the left-hand derivative, 0, is used there)
y.backward(torch.ones_like(x), retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of relu', figsize=(5, 2.5))  # detach() keeps the values out of the autograd computation graph; we only want the data
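
A quick sanity check I added (this snippet is not part of the original notebook): the gradient reported by autograd should match the piecewise derivative, 0 for negative inputs and 1 for positive ones.

z = torch.tensor([-2.0, -0.5, 0.5, 3.0], requires_grad=True)
torch.relu(z).sum().backward()
print(z.grad)  # expected: tensor([0., 0., 1., 1.])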

2 sigmoid function

# sigmoid(x) = 1 / (1 + exp(-x)) squashes every input into the range (0, 1)
y = torch.sigmoid(x)
d2l.plot(x.detach(), y.detach(), 'x', 'sigmoid(x)', figsize=(5, 2.5))

# clear the previous gradients
x.grad.data.zero_()
y.backward(torch.ones_like(x), retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of sigmoid', figsize=(5, 2.5))
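
The derivative of sigmoid is sigmoid(x) * (1 - sigmoid(x)). A small check of that identity against autograd (my own snippet, assuming torch is already imported as above):

z = torch.tensor([-1.0, 0.0, 2.0], requires_grad=True)
s = torch.sigmoid(z)
s.sum().backward()
print(torch.allclose(z.grad, s.detach() * (1 - s.detach())))  # expected: True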


3 tanh function

y = torch.tanh(x)
d2l.plot(x.detach(), y.detach(), 'x', 'tanh(x)', figsize=(5, 2.5))

# the derivative of tanh is 1 - tanh(x)^2
x.grad.data.zero_()
y.backward(torch.ones_like(x), retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of tanh', figsize=(5, 2.5))
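
The same kind of check for tanh (my own snippet): autograd's gradient should equal 1 - tanh(x)^2.

z = torch.tensor([-1.0, 0.0, 2.0], requires_grad=True)
t = torch.tanh(z)
t.sum().backward()
print(torch.allclose(z.grad, 1 - t.detach() ** 2))  # expected: True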

4.2 implementation of the multilayer perceptron from scratch

# we keep using the image dataset from the previous chapter so the results can be compared with softmax regression
import torch
from torch import nn
from d2l import torch as d2l
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)  # load the dataset

4.2.1 initializing model parameters

# recall: each image in the dataset has 28*28 = 784 grayscale values and there are 10 classes;
# the training set has 60,000 images and the test set 10,000
# so the network has 784 input features and 10 output features
# we use a multilayer perceptron with a single hidden layer of 256 hidden units
# layer widths are usually chosen as powers of 2, which tends to make computation more efficient
# note: each layer gets its own weights and bias, so there are W1, b1 and W2, b2
num_inputs, num_outputs, num_hiddens = 784, 10, 256
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad = True) * 0.01)
# scale the initial weights to a small range; smaller initial weights help the model converge to a good solution faster

b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad = True))

W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad = True) * 0.01)

b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad = True))

params = [W1, b1, W2, b2]
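
A quick shape check I added (run right after the cell above) to confirm the parameters match the 784 -> 256 -> 10 architecture:

for name, p in zip(['W1', 'b1', 'W2', 'b2'], params):
    print(name, tuple(p.shape))
# W1 (784, 256)
# b1 (256,)
# W2 (256, 10)
# b2 (10,)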

4.2.2 activation function

def relu(X):
    a = torch.zeros_like(X)
    return torch.max(X, a)  # elementwise: each position takes the larger of X and 0
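
A tiny usage example of this relu helper (my own toy tensor, not from the original notes):

X_demo = torch.tensor([[-1.0, 2.0], [0.5, -3.0]])
print(relu(X_demo))
# tensor([[0.0000, 2.0000],
#         [0.5000, 0.0000]])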

4.2.3 model

# the images are fed in as flat 784-dimensional vectors, so reshape them first
def net(X):
    X = X.reshape(-1, num_inputs)  # -1 lets PyTorch infer the number of rows (the batch size)
    H = relu(X@W1 + b1)  # @ is matrix multiplication; matmul works just as well
    return (H@W2 + b2)
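
A small sanity check I added: pushing a fake batch shaped like two Fashion-MNIST images through net should give one row of 10 logits per image.

X_fake = torch.randn(2, 1, 28, 28)
print(net(X_fake).shape)  # expected: torch.Size([2, 10])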

4.2.4 loss function

loss = nn.CrossEntropyLoss(reduction='none')
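
With reduction='none' the criterion returns one loss value per example instead of a scalar; d2l's training routine reduces it (taking the mean when a torch optimizer is used) before backpropagating. A small illustration with dummy 3-class logits of my own:

logits = torch.tensor([[2.0, 0.5, 0.1], [0.2, 1.5, 0.3]])
labels = torch.tensor([0, 2])
l = loss(logits, labels)
print(l.shape)   # torch.Size([2]) -- one loss per example
print(l.mean())  # the scalar that is actually backpropagated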

4.2.5 training

num_epochs, lr = 10, 0.1  # train for 10 epochs with a learning rate of 0.1
updater = torch.optim.SGD(params, lr=lr)  # SGD updates the parameters from their gradients
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)
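
For reference, a stripped-down sketch of the kind of loop d2l.train_ch3 runs internally (my simplification; the real function also tracks training/test accuracy and draws the plot):

def simple_train(net, train_iter, loss, updater, num_epochs):
    for epoch in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X), y)   # per-example losses, because reduction='none'
            updater.zero_grad()
            l.mean().backward()   # reduce to a scalar, then backpropagate
            updater.step()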

d2l.predict_ch3(net, test_iter)

# keep everything else fixed and vary the hyperparameter num_hiddens to see how it affects the results and to find a good value for it
num_hiddens = 1024
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad = True) * 0.01)

b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad = True))

W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad = True) * 0.01)

b2 = nn.Parameter(torch.randn(num_outputs, requires_grad = True))

params = [W1, b1, W2, b2]
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr = lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)  # clearly, a larger hidden dimension gives higher accuracy and faster convergence
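
Instead of editing the cell by hand for each width, one could wrap the experiment in a small helper and scan several values. This is only a sketch; run_width is my own helper, not something provided by d2l:

def run_width(num_hiddens, num_epochs=10, lr=0.1):
    W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens) * 0.01)
    b1 = nn.Parameter(torch.zeros(num_hiddens))
    W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs) * 0.01)
    b2 = nn.Parameter(torch.zeros(num_outputs))
    def net(X):  # the same single-hidden-layer model, built from the local parameters
        H = relu(X.reshape(-1, num_inputs) @ W1 + b1)
        return H @ W2 + b2
    updater = torch.optim.SGD([W1, b1, W2, b2], lr=lr)
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)

for width in (64, 256, 1024):
    run_width(width)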

# what if we add a second hidden layer, for three layers in total?
num_hiddens_1 = 1024
num_hiddens_2 = 512
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens_1, requires_grad = True) * 0.01)

b1 = nn.Parameter(torch.zeros(num_hiddens_1, requires_grad = True))

W2 = nn.Parameter(torch.randn(num_hiddens_1, num_hiddens_2, requires_grad = True) * 0.01)

b2 = nn.Parameter(torch.randn(num_hiddens_2, requires_grad = True))
# everything above is the same as before
W3 = nn.Parameter(torch.randn(num_hiddens_2, num_outputs, requires_grad = True) * 0.01)

b3 = nn.Parameter(torch.randn(num_outputs, requires_grad = True))
params = [W1, b1, W2, b2, W3, b3]
def net(X):
    X = X.reshape(-1, num_inputs)
    H1 = relu(X@W1 + b1)
    H2 = relu(H1@W2 + b2)
    return (H2@W3 + b3)
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)  # not a huge change: convergence speeds up and accuracy improves a little

# what if we change the learning rate? increasing it moderately gives a large boost in accuracy
num_hiddens_1 = 1024
num_hiddens_2 = 512
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens_1, requires_grad = True) * 0.01)

b1 = nn.Parameter(torch.zeros(num_hiddens_1, requires_grad = True))

W2 = nn.Parameter(torch.randn(num_hiddens_1, num_hiddens_2, requires_grad = True) * 0.01)

b2 = nn.Parameter(torch.randn(num_hiddens_2, requires_grad = True))
# everything above is the same as before
W3 = nn.Parameter(torch.randn(num_hiddens_2, num_outputs, requires_grad = True) * 0.01)

b3 = nn.Parameter(torch.randn(num_outputs, requires_grad = True))
params = [W1, b1, W2, b2, W3, b3]
def net(X):
    X = X.reshape(-1, num_inputs)
    H1 = relu(X@W1 + b1)
    H2 = relu(H1@W2 + b2)
    return (H2@W3 + b3)
num_epochs, lr = 10, 0.5  # if the learning rate is too small, convergence slows down and accuracy falls short unless the number of epochs is raised as well
updater = torch.optim.SGD(params, lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)  # not a huge change: convergence speeds up and accuracy improves

4.3 concise implementation of the multilayer perceptron

import torch
from torch import nn
from d2l import torch as d2l
net = nn.Sequential(nn.Flatten(), nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))

def init_weights(m):
    if type(m) == nn.Linear:  # only the linear layers are reinitialized
        nn.init.normal_(m.weight, std = 0.01)  # initialize each linear layer's weight w from a normal distribution with std 0.01
net.apply(init_weights)
Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=256, bias=True)
  (2): ReLU()
  (3): Linear(in_features=256, out_features=10, bias=True)
)
batch_size, lr, num_epochs = 256, 0.1, 10
loss = nn.CrossEntropyLoss(reduction='none')
trainer = torch.optim.SGD(net.parameters(), lr=lr)
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)
# this is again the case of a single hidden layer
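
For comparison with the two-hidden-layer experiment in the from-scratch section, a concise version with the same 1024/512 widths would look roughly like this (a sketch; net2 and trainer2 are my own names):

net2 = nn.Sequential(nn.Flatten(),
                     nn.Linear(784, 1024), nn.ReLU(),
                     nn.Linear(1024, 512), nn.ReLU(),
                     nn.Linear(512, 10))
net2.apply(init_weights)
trainer2 = torch.optim.SGD(net2.parameters(), lr=lr)
d2l.train_ch3(net2, train_iter, test_iter, loss, num_epochs, trainer2)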
