Dropout is a regularization technique that helps prevent overfitting: during every training iteration it randomly drops (zeroes out) some of the hidden units. Because any unit may be dropped, no neuron can rely on a few specific other neurons. Training therefore resembles training many subnetworks at once, which reduces overfitting and improves robustness.
Another way to view dropout is as injecting noise into the inputs of each layer (analogous to adding noise drawn from a normal distribution to the inputs); the key point is that the noise is injected in an unbiased way, so the expected value of each activation stays unchanged.
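To make the unbiasedness concrete, here is a small added check (an illustrative sketch, not from the original notes): with inverted dropout each unit is zeroed with probability p and the survivors are scaled by 1/(1 - p), so the average activation over many trials stays close to the original value.

# A quick Monte Carlo check that inverted dropout is unbiased: E[h'] = h.
import torch

p = 0.4                                   # dropout probability (illustrative value)
h = torch.full((1000, 100), 2.0)          # pretend activations, all equal to 2.0
mask = (torch.rand(h.shape) > p).float()  # keep each unit with probability 1 - p
h_dropped = mask * h / (1.0 - p)          # rescale survivors by 1 / (1 - p)
print(h_dropped.mean())                   # ≈ 2.0, matching the original expectation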
4.6.4 Implementation from Scratch
# To implement a single-layer dropout function, we draw samples from the uniform distribution U[0, 1], one per element of the input.
# We keep the nodes whose sample is greater than p and drop out the rest.
import torch
from torch import nn
from d2l import torch as d2l
def dropout_layer(X, dropout):
    assert 0 <= dropout <= 1
    if dropout == 1:
        return torch.zeros_like(X)  # dropout is a probability; at 1 every element is dropped
    if dropout == 0:
        return X  # at 0 every element is kept
    mask = (torch.rand(X.shape) > dropout).float()
    # the surviving part is rescaled: divide by (1 - dropout) below
    return mask * X / (1.0 - dropout)
# Now we test the dropout_layer function.
X = torch.arange(16, dtype=torch.float32).reshape(2, 8)  # reshape((2, 8)) with an extra pair of parentheses would also work
# note that arange produces a 1-D vector before the reshape
print(X)
print(dropout_layer(X, 0.))   # dropout = 0: everything is kept
print(dropout_layer(X, 0.4))  # dropout = 0.4: each element is dropped with probability 0.4
print(dropout_layer(X, 1.))   # dropout = 1: nothing survives, the output is all zeros
# As in chapter 3.5 we use the Fashion-MNIST dataset, and we define a multilayer perceptron with two hidden layers.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256
dropout1, dropout2 = 0.2, 0.5  # drop probabilities for the first and second hidden layer (values taken from the d2l text)
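Since the heading promises an implementation from scratch, the sketch below (an assumption roughly following the d2l chapter, not part of the original notes) shows one way to wire dropout_layer into a model; dropout is applied only while the model is in training mode.

# A from-scratch model that applies dropout_layer after each hidden layer's ReLU.
class Net(nn.Module):
    def __init__(self, num_inputs, num_outputs, num_hiddens1, num_hiddens2,
                 is_training=True):
        super().__init__()
        self.num_inputs = num_inputs
        self.training = is_training
        self.lin1 = nn.Linear(num_inputs, num_hiddens1)
        self.lin2 = nn.Linear(num_hiddens1, num_hiddens2)
        self.lin3 = nn.Linear(num_hiddens2, num_outputs)
        self.relu = nn.ReLU()

    def forward(self, X):
        H1 = self.relu(self.lin1(X.reshape((-1, self.num_inputs))))
        if self.training:                      # use dropout only during training
            H1 = dropout_layer(H1, dropout1)
        H2 = self.relu(self.lin2(H1))
        if self.training:
            H2 = dropout_layer(H2, dropout2)
        return self.lin3(H2)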
# With the high-level API we only need to add an nn.Dropout layer after each fully connected layer's activation, passing the dropout probability as its only argument.
net = nn.Sequential(nn.Flatten(),
                    nn.Linear(784, 256),
                    nn.ReLU(),            # output of hidden layer H1; it passes through dropout before the second layer
                    # add dropout
                    nn.Dropout(dropout1), # the first hidden layer's output is dropped out here
                    nn.Linear(256, 256),
                    nn.ReLU(),            # the second hidden layer's output also passes through dropout
                    # add dropout
                    nn.Dropout(dropout2),
                    nn.Linear(256, 10))   # finally the output layer

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)  # initialize the weights from a normal distribution with mean 0 and std 0.01

net.apply(init_weights)
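The notes stop before training. Below is a minimal training sketch, assuming the classic d2l helpers load_data_fashion_mnist and train_ch3 from chapter 3 are available (their availability and exact signatures depend on the d2l version; the hyperparameter values follow the d2l text).

# Hypothetical training run using the d2l chapter-3 helpers (an assumption, not in the original notes).
num_epochs, lr, batch_size = 10, 0.5, 256
loss = nn.CrossEntropyLoss(reduction='none')
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, trainer, num_epochs)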
Conclusion: dropout zeroes out some neurons after the output of each fully connected layer is computed. It helps prevent overfitting and is applied only during training; at evaluation time all neurons are kept.
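As a small added illustration of the "training only" point (not part of the original notes): nn.Dropout is active in training mode and becomes an identity map in evaluation mode.

# nn.Dropout behaves differently in train vs. eval mode.
layer = nn.Dropout(0.5)
x = torch.ones(1, 8)

layer.train()   # training mode: roughly half the entries are zeroed, the rest are scaled by 2
print(layer(x))

layer.eval()    # evaluation mode: dropout is a no-op, the input passes through unchanged
print(layer(x))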