4.6 Dropout (Temporary Retirement)

#Dropout (rendered as "temporary retirement" in the Chinese text) is a regularization technique used to prevent overfitting.
#To do so, dropout randomly drops (zeroes out) some hidden units on every training iteration.
#As a result, no neuron in the network can rely on any other specific neuron.
#Training then behaves like training many different subnetworks, which reduces overfitting and improves robustness.
#Another view: we inject noise into each layer's inputs during training (classically Gaussian noise); dropout injects noise by zeroing units while keeping each layer's expected output unchanged.
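#To make the "unchanged expectation" point concrete, here is a minimal sketch of my own (not from the book):
#a unit h becomes 0 with probability p and h/(1-p) otherwise, so E[h'] = p*0 + (1-p)*h/(1-p) = h.
import torch
h, p = 2.0, 0.5
keep = (torch.rand(100000) > p).float()   # 1 with probability 1-p, else 0
print((keep * h / (1 - p)).mean())        # close to 2.0, i.e. the original h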

4.6.4 Implementation from scratch

#To implement dropout for a single layer, we draw one sample from the uniform distribution U[0, 1] for each element of the input.
#We keep the elements whose samples are greater than p and drop (zero out) the rest.
import torch
from torch import nn
from d2l import torch as d2l

def dropout_layer(X, dropout):
    assert 0 <= dropout <= 1
    if dropout == 1:
        return torch.zeros_like(X)  # dropout is a probability: p = 1 drops every element
    if dropout == 0:
        return X  # p = 0 keeps everything unchanged
    mask = (torch.rand(X.shape) > dropout).float()
    # rescale the surviving elements by dividing by (1 - dropout)
    return mask * X / (1.0 - dropout)
#Now we test the dropout_layer function with a few probabilities.
X = torch.arange(16, dtype=torch.float32).reshape(2, 8)  # reshape((2, 8)) with a tuple works as well
# note that arange produces a 1-D vector before the reshape
print(X)
print(dropout_layer(X, 0.))   # p = 0: everything is kept
print(dropout_layer(X, 0.4))  # p = 0.4: each element is dropped with probability 0.4
print(dropout_layer(X, 1.))   # p = 1: everything is dropped, all zeros
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
tensor([[ 0.0000,  1.6667,  3.3333,  5.0000,  0.0000,  8.3333,  0.0000,  0.0000],
        [ 0.0000,  0.0000, 16.6667, 18.3333, 20.0000,  0.0000,  0.0000,  0.0000]])
tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
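
#A quick sanity check of my own (not from the book): over many elements, the fraction of zeroed entries
#should be close to the dropout probability.
Y = torch.ones(1000, 16)
print((dropout_layer(Y, 0.4) == 0).float().mean())  # close to 0.4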

1. Define model parameters

#We again use the Fashion-MNIST dataset from Section 3.5 and define a multilayer perceptron with two hidden layers.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

2. Define the model

dropout1, dropout2 = 0.2, 0.5

class Net(nn.Module):
    def __init__(self, num_inputs, num_outputs, num_hiddens1, num_hiddens2,
                 is_training=True):
        super(Net, self).__init__()
        # recall: super() initializes the parent class, here nn.Module,
        # so that Net correctly inherits from nn.Module
        self.num_inputs = num_inputs
        self.training = is_training
        self.lin1 = nn.Linear(num_inputs, num_hiddens1)
        self.lin2 = nn.Linear(num_hiddens1, num_hiddens2)
        self.lin3 = nn.Linear(num_hiddens2, num_outputs)
        self.relu = nn.ReLU()  # set up the model's layers

    def forward(self, X):
        # flatten the input and compute the first hidden layer's output
        H1 = self.relu(self.lin1(X.reshape(-1, self.num_inputs)))
        # dropout is only used while training the model
        if self.training == True:
            H1 = dropout_layer(H1, dropout1)  # apply dropout to the first hidden layer's output
        H2 = self.relu(self.lin2(H1))
        if self.training == True:
            # the second hidden layer's output also goes through dropout
            H2 = dropout_layer(H2, dropout2)
        out = self.lin3(H2)
        return out

net = Net(num_inputs, num_outputs, num_hiddens1, num_hiddens2)
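
#A small check of my own (not from the book): while dropout is active, two forward passes on the same
#input generally differ, because a different random set of units is dropped each time.
X_dummy = torch.randn(2, 784)  # hypothetical dummy batch
print(torch.allclose(net(X_dummy), net(X_dummy)))  # usually False in training mode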

3. Training and testing

num_epochs, lr, batch_size = 10, 0.5, 256
loss = nn.CrossEntropyLoss(reduction = 'none')
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
trainer = torch.optim.SGD(net.parameters(), lr = lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)  # as before: pass the model, data iterators, loss, number of epochs and optimizer

[training curve plot produced by d2l.train_ch3: loss and train/test accuracy over the epochs]
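
#A usage note of my own (not from the book): since Net subclasses nn.Module, net.eval() sets self.training
#to False, so both dropout_layer calls are skipped and predictions become deterministic; net.train() re-enables them.
net.eval()
with torch.no_grad():
    X_eval = torch.randn(4, 784)   # hypothetical dummy batch
    print(net(X_eval).shape)       # torch.Size([4, 10])
net.train()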

4.6.5 Concise implementation

#With the high-level API we only need to add an nn.Dropout layer after each fully connected layer's activation,
#passing the dropout probability p as its only argument.
net = nn.Sequential(nn.Flatten(),
                    nn.Linear(784, 256),
                    nn.ReLU(),
                    # add a dropout layer after the first hidden layer's activation
                    nn.Dropout(dropout1),
                    nn.Linear(256, 256),
                    nn.ReLU(),
                    # add a dropout layer after the second hidden layer's activation
                    nn.Dropout(dropout2),
                    nn.Linear(256, 10))  # finally the output layer

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)  # initialize the weights with mean 0 and standard deviation 0.01

net.apply(init_weights)
Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=256, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.2, inplace=False)
  (4): Linear(in_features=256, out_features=256, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.5, inplace=False)
  (7): Linear(in_features=256, out_features=10, bias=True)
)
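
#One point worth noting (my addition, not from the book): nn.Dropout checks the module's training flag,
#so after net.eval() it simply passes its input through unchanged, while net.train() restores the random zeroing.
dropout = nn.Dropout(0.5)
x = torch.ones(8)
dropout.eval()
print(dropout(x))   # identical to x: dropout is a no-op in evaluation mode
dropout.train()
print(dropout(x))   # roughly half the entries zeroed, the survivors scaled by 1 / (1 - 0.5) = 2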
trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)  # pass everything to the training loop, exactly as before

[training curve plot produced by d2l.train_ch3: loss and train/test accuracy over the epochs]

#Conclusion:
#Dropout zeroes out some neurons after the output of each fully connected layer is computed.
#This helps avoid overfitting and is applied only during training; at test time all units are kept.