# Activation functions compute the weighted sum of the inputs plus a bias to decide whether a neuron should be activated.
# They transform input signals into outputs through differentiable operations.
# Most activation functions are nonlinear.
%matplotlib inline
import torch
from d2l import torch as d2l
1. ReLU Function
# Rectified linear unit: ReLU(x) = max(x, 0), i.e. all negative elements are discarded (set to 0).
x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.relu(x)
d2l.plot(x.detach(), y.detach(), 'x', 'relu(x)', figsize=(5, 2.5))
# For negative inputs the derivative of ReLU is 0; for positive inputs it is 1.
# Note: at input = 0 the derivative is not defined; in practice the value 0 is used.
y.backward(torch.ones_like(x), retain_graph=True)
# detach() keeps the plotted tensors out of the autograd computation graph; we only want the data.
d2l.plot(x.detach(), x.grad, 'x', 'grad of relu', figsize=(5, 2.5))
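Although ReLU has no true derivative at 0, autograd still returns a value there. A small check (not part of the original cell) that probes PyTorch's behavior at exactly x = 0, which matches the convention assumed in the comment above:

x0 = torch.tensor(0.0, requires_grad=True)
torch.relu(x0).backward()
print(x0.grad)  # tensor(0.) -- PyTorch uses 0 as the (sub)gradient of ReLU at 0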
# Sigmoid: sigmoid(x) = 1 / (1 + exp(-x)) squashes any input into the range (0, 1).
y = torch.sigmoid(x)
d2l.plot(x.detach(), y.detach(), 'x', 'sigmoid(x)', figsize=(5, 2.5))
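By analogy with the ReLU gradient plot above, the gradient of sigmoid, sigmoid(x) * (1 - sigmoid(x)), can be plotted the same way. This sketch is not part of the original cell; it clears x.grad first because PyTorch accumulates gradients across backward passes.

x.grad.data.zero_()  # clear the gradient left over from the ReLU backward pass
y.backward(torch.ones_like(x), retain_graph=True)
d2l.plot(x.detach(), x.grad, 'x', 'grad of sigmoid', figsize=(5, 2.5))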
# Recall the Fashion-MNIST dataset: each image has 28 * 28 = 784 grayscale values, there are 10 classes,
# 60,000 training images and 10,000 test images, so the input dimension is 784 and the output dimension is 10.
# We use a single hidden layer in the multilayer perceptron with 256 hidden units.
# Layer widths are usually chosen as powers of 2, which makes computation more efficient.
# Note: every layer has its own weights and bias, so there are W1, b1 and W2, b2.
from torch import nn

num_inputs, num_outputs, num_hiddens = 784, 10, 256
# Scale the initial weights to a very small range; small initial weights help the model converge faster to a good solution.
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad=True) * 0.01)
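The comment above mentions W1, b1 and W2, b2, but the cell shown here stops after W1. A hedged sketch of how the remaining parameters and the forward pass are typically completed in the d2l from-scratch MLP (the names relu and net below are illustrative):

b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))
W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad=True) * 0.01)
b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))
params = [W1, b1, W2, b2]

def relu(X):
    # Element-wise max(X, 0).
    a = torch.zeros_like(X)
    return torch.max(X, a)

def net(X):
    # Flatten each 28x28 image into a length-784 vector, then apply the hidden layer and the output layer.
    X = X.reshape((-1, num_inputs))
    H = relu(X @ W1 + b1)
    return H @ W2 + b2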