4.6 Dropout (Temporary Retirement)

#Dropout (rendered as "temporary retirement" in the Chinese text) is a regularization technique used to prevent overfitting.
#To do so, dropout randomly drops (zeroes out) some hidden units on every training iteration.
#As a result, no neuron in the network can rely on any other specific neuron.
#Training then behaves like training many different subnetworks, which reduces overfitting and improves robustness.
#Another view: we inject noise into each layer's inputs during training (classically Gaussian noise); dropout injects noise by zeroing units while keeping each layer's expected output unchanged.
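#To make the "unchanged expectation" point concrete, here is a minimal sketch of my own (not from the book):
#a unit h becomes 0 with probability p and h/(1-p) otherwise, so E[h'] = p*0 + (1-p)*h/(1-p) = h.
import torch
h, p = 2.0, 0.5
keep = (torch.rand(100000) > p).float()   # 1 with probability 1-p, else 0
print((keep * h / (1 - p)).mean())        # close to 2.0, i.e. the original h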

4.6.4 Implementation from scratch

#To implement dropout for a single layer, we draw one sample from the uniform distribution U[0, 1] for each element of the input.
#We keep the elements whose samples are greater than p and drop (zero out) the rest.
import torch
from torch import nn
from d2l import torch as d2l

def dropout_layer(X, dropout):
    assert 0 <= dropout <= 1
    if dropout == 1:
        return torch.zeros_like(X)  # dropout is a probability: p = 1 drops every element
    if dropout == 0:
        return X  # p = 0 keeps everything unchanged
    mask = (torch.rand(X.shape) > dropout).float()
    # rescale the surviving elements by dividing by (1 - dropout)
    return mask * X / (1.0 - dropout)
#Now we test the dropout_layer function with a few probabilities.
X = torch.arange(16, dtype=torch.float32).reshape(2, 8)  # reshape((2, 8)) with a tuple works as well
# note that arange produces a 1-D vector before the reshape
print(X)
print(dropout_layer(X, 0.))   # p = 0: everything is kept
print(dropout_layer(X, 0.4))  # p = 0.4: each element is dropped with probability 0.4
print(dropout_layer(X, 1.))   # p = 1: everything is dropped, all zeros
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])
tensor([[ 0.0000,  1.6667,  3.3333,  5.0000,  0.0000,  8.3333,  0.0000,  0.0000],
        [ 0.0000,  0.0000, 16.6667, 18.3333, 20.0000,  0.0000,  0.0000,  0.0000]])
tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
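
#A quick sanity check of my own (not from the book): over many elements, the fraction of zeroed entries
#should be close to the dropout probability.
Y = torch.ones(1000, 16)
print((dropout_layer(Y, 0.4) == 0).float().mean())  # close to 0.4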

1. Define model parameters

#We again use the Fashion-MNIST dataset from Section 3.5 and define a multilayer perceptron with two hidden layers.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

2. Define the model

dropout1, dropout2 = 0.2, 0.5

class Net(nn.Module):
    def __init__(self, num_inputs, num_outputs, num_hiddens1, num_hiddens2,
                 is_training=True):
        super(Net, self).__init__()
        # recall: super() initializes the parent class, here nn.Module,
        # so that Net correctly inherits from nn.Module
        self.num_inputs = num_inputs
        self.training = is_training
        self.lin1 = nn.Linear(num_inputs, num_hiddens1)
        self.lin2 = nn.Linear(num_hiddens1, num_hiddens2)
        self.lin3 = nn.Linear(num_hiddens2, num_outputs)
        self.relu = nn.ReLU()  # set up the model's layers

    def forward(self, X):
        # flatten the input and compute the first hidden layer's output
        H1 = self.relu(self.lin1(X.reshape(-1, self.num_inputs)))
        # dropout is only used while training the model
        if self.training == True:
            H1 = dropout_layer(H1, dropout1)  # apply dropout to the first hidden layer's output
        H2 = self.relu(self.lin2(H1))
        if self.training == True:
            # the second hidden layer's output also goes through dropout
            H2 = dropout_layer(H2, dropout2)
        out = self.lin3(H2)
        return out

net = Net(num_inputs, num_outputs, num_hiddens1, num_hiddens2)
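
#A small check of my own (not from the book): while dropout is active, two forward passes on the same
#input generally differ, because a different random set of units is dropped each time.
X_dummy = torch.randn(2, 784)  # hypothetical dummy batch
print(torch.allclose(net(X_dummy), net(X_dummy)))  # usually False in training mode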

3. Training and testing

num_epochs, lr, batch_size = 10, 0.5, 256
loss = nn.CrossEntropyLoss(reduction = 'none')
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
trainer = torch.optim.SGD(net.parameters(), lr = lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)  # as before: pass the model, data iterators, loss, number of epochs and optimizer

[training curve plot produced by d2l.train_ch3: loss and train/test accuracy over the epochs]
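
#A usage note of my own (not from the book): since Net subclasses nn.Module, net.eval() sets self.training
#to False, so both dropout_layer calls are skipped and predictions become deterministic; net.train() re-enables them.
net.eval()
with torch.no_grad():
    X_eval = torch.randn(4, 784)   # hypothetical dummy batch
    print(net(X_eval).shape)       # torch.Size([4, 10])
net.train()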

4.6.5 Concise implementation

#With the high-level API we only need to add an nn.Dropout layer after each fully connected layer's activation,
#passing the dropout probability p as its only argument.
net = nn.Sequential(nn.Flatten(),
                    nn.Linear(784, 256),
                    nn.ReLU(),
                    # add a dropout layer after the first hidden layer's activation
                    nn.Dropout(dropout1),
                    nn.Linear(256, 256),
                    nn.ReLU(),
                    # add a dropout layer after the second hidden layer's activation
                    nn.Dropout(dropout2),
                    nn.Linear(256, 10))  # finally the output layer

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)  # initialize the weights with mean 0 and standard deviation 0.01

net.apply(init_weights)
Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=256, bias=True)
  (2): ReLU()
  (3): Dropout(p=0.2, inplace=False)
  (4): Linear(in_features=256, out_features=256, bias=True)
  (5): ReLU()
  (6): Dropout(p=0.5, inplace=False)
  (7): Linear(in_features=256, out_features=10, bias=True)
)
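
#One point worth noting (my addition, not from the book): nn.Dropout checks the module's training flag,
#so after net.eval() it simply passes its input through unchanged, while net.train() restores the random zeroing.
dropout = nn.Dropout(0.5)
x = torch.ones(8)
dropout.eval()
print(dropout(x))   # identical to x: dropout is a no-op in evaluation mode
dropout.train()
print(dropout(x))   # roughly half the entries zeroed, the survivors scaled by 1 / (1 - 0.5) = 2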
trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)  # pass everything to the training loop, exactly as before

[training curve plot produced by d2l.train_ch3: loss and train/test accuracy over the epochs]

#Conclusion:
#Dropout zeroes out some neurons after the output of each fully connected layer is computed.
#This helps avoid overfitting and is applied only during training; at test time all units are kept.