%matplotlib inline
# This magic command displays the function curves directly in the notebook, not interactively.
import numpy as np
from matplotlib_inline import backend_inline
from d2l import torch as d2l
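The next cell calls f and numerical_lim, which this excerpt never defines; judging from the printed limits (they approach 2, the derivative of 3x**2 - 4x at x = 1), they are presumably the definitions from the d2l book:

def f(x):
    return 3 * x ** 2 - 4 * x  # the running example function

def numerical_lim(f, x, h):
    return (f(x + h) - f(x)) / h  # forward-difference quotient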
h = 0.1
for i in range(5):
    print(f'h = {h:.5f}, numerical limit = {numerical_lim(f, 1, h):.5f}')
    # i.e., estimate the derivative at x = 1
    h *= 0.1
# the results tend to 2
h = 0.10000, numerical limit = 2.30000
h = 0.01000, numerical limit = 2.03000
h = 0.00100, numerical limit = 2.00300
h = 0.00010, numerical limit = 2.00030
h = 0.00001, numerical limit = 2.00003
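The error shrinks linearly with h, exactly as the forward difference predicts: for f(x) = 3x**2 - 4x we have f''(x) = 6, so (f(1 + h) - f(1)) / h = f'(1) + (f''(1) / 2) * h = 2 + 3h, which matches every row above.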
# Set the figure output format.
def use_svg_display():  #@save
    # "#@save" is a special marker that saves the function into the d2l package.
    backend_inline.set_matplotlib_formats('svg')

def set_figsize(figsize=(3.5, 2.5)):  #@save
    """Set the matplotlib figure size."""
    use_svg_display()
    # We can use plt directly because it is stored in d2l via "from d2l import torch as d2l".
    d2l.plt.rcParams['figure.figsize'] = figsize

#@save
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    # Set the attributes of the axes.
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    axes.grid()
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-', 'r:'), figsize=(3.5, 2.5), axes=None):
    # Concisely plot one or several curves (plot the data points).
    if legend is None:
        legend = []

    set_figsize(figsize)
    axes = axes if axes else d2l.plt.gca()

    # Return True if X has one axis (is one-dimensional).
    def has_one_axis(X):
        return (hasattr(X, "ndim") and X.ndim == 1 or
                isinstance(X, list) and not hasattr(X[0], "__len__"))
    # A neat one-dimensionality test: use ndim directly when available; for a list,
    # check whether the first element is itself iterable.

    if has_one_axis(X):
        X = [X]
        # If X is a one-dimensional array or list, wrap it as a nested list [X] so the
        # data becomes two-dimensional, consistent with the processing below.
        # The same reasoning applies to Y = [Y] further down.
    if Y is None:
        X, Y = [[]] * len(X), X
        # If Y is None, replace X with a list of empty lists of the same length and
        # assign the old X to Y, so that X and Y have equal length and Y is not None.
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        X = X * len(Y)
        # If X and Y differ in length, replicate X until the lengths match. Each y is a
        # different function, so every curve needs its own x list (see the next cell).
    axes.cla()  # Clear the axes so the figure is redrawn from scratch.
    for x, y, fmt in zip(X, Y, fmts):
        if len(x):
            axes.plot(x, y, fmt)
        else:
            axes.plot(y, fmt)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
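To make the normalization concrete, a minimal sketch of what happens for the call in the next cell:

# X = x (1-D array)      -> X = [x]                        wrapped into a nested list
# Y = [f(x), 2*x - 3]    -> len(X) = 1 != len(Y) = 2  ->  X = [x, x]
# zip(X, Y, fmts) then pairs each curve with its own copy of x and a format string.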
x = np.arange(0, 3, 0.1)  # the array of x values
plot(x, [f(x), 2 * x - 3], 'x', 'f(x)', legend=['f(x)', 'Tangent line (x=1)'])
2.5 Automatic differentiation
import torch

x = torch.arange(4.0)
x, torch.ones(len(x))
x.requires_grad_(True)  # allocate a place to store the gradient, namely x.grad
x.grad  # defaults to None
y = 2 * torch.dot(x, x)
y
tensor(28., grad_fn=<MulBackward0>)
y.backward()
x.grad  # we know y = 2 * x.dot(x), so the gradient is 4x
tensor([ 0., 4., 8., 12.])
x.grad == 4 * x  # verified correct
tensor([True, True, True, True])
# By default, PyTorch accumulates gradients, so we need to clear the previous values.
x.grad.zero_()
y = x.sum()
y.backward()
x.grad
# Clearly y = x1 + x2 + x3 + x4, so each partial derivative is 1.
tensor([1., 1., 1., 1.])
2.5.2 Backpropagation for vectors
# We can reasonably expect the derivative of a vector y with respect to a vector x to be a matrix.
# When calling backward() on a non-scalar y, we must pass in a gradient argument, namely the
# gradient of the function being differentiated with respect to y itself.

x.grad.zero_()  # clear the previous values
y = x * x  # y is also a vector; note that the elementwise product of 1-D tensors is a 1-D tensor
y.sum().backward()
# Equivalent to y.backward(torch.ones(len(x))), i.e., y.backward(torch.tensor([1., 1., 1., 1.]))
y, x.grad
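Since x still holds [0., 1., 2., 3.] from above and the gradient of sum(x * x) is 2x, the cell should print y = tensor([0., 1., 4., 9.]) and x.grad = tensor([0., 2., 4., 6.]).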
x.grad.zero_()
y = x * x
u = y.detach()
print(u)
z = u * x
# z is really x**3, but we want to treat y as a constant, so we need a variable u that is
# independent of x yet equal in value to y. Then dz/dx = u, because u acts as a constant here.
# Backpropagation computes a partial derivative for every parameter, so we detach u from the
# graph to treat it as a constant and skip its partial derivatives.
z.sum().backward()
print(z.sum())
print(x.grad)
x.grad == u
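Because detaching u does not touch the computation graph of y itself, we can still backpropagate through y afterwards; a quick check in the same spirit:

x.grad.zero_()
y.sum().backward()  # y = x * x still has its own computation graph
x.grad == 2 * x     # expected: tensor([True, True, True, True])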
# In the code below, the number of while-loop iterations and the branch taken by the
# if statement both depend on the value of the input a.
def f(a):
    b = a * 2
    while b.norm() < 1000:
        b = b * 2
    if b.sum() > 0:
        c = b
    else:
        c = 100 * b
    return c
# Note that after all this work the function is still just proportional to a,
# so its derivative is a constant!

# Now compute the gradient.
a = torch.randn(size=[1], requires_grad=True)
# a = torch.randn(size=[2], requires_grad=True) would fail: backward() without a gradient
# argument requires the result to be a scalar (a single-element tensor).
print(a)
d = f(a)
d.backward()
tensor([-1.3762], requires_grad=True)
# a.grad.zero_()
a.grad == d / a  # clearly d / a is exactly the coefficient; a.grad is the gradient w.r.t. a
print(a.grad)
tensor([102400.])
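The printed coefficient checks out: b starts at 2a and, with |a| ≈ 1.3762, is doubled nine more times before |b| = |a| * 2**10 ≈ 1409 ≥ 1000; b.sum() < 0 then selects the c = 100 * b branch, so d = 100 * 2**10 * a = 102400 * a and a.grad = 102400.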
def g(t):
    m = t * t
    while m.norm() < 1000:
        m = m * 2
    return m

a = torch.randn(size=[2], requires_grad=True)
print(a)
d = g(a)
print(d)
d.sum().backward()
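Unlike f, g is quadratic in its input: d = c * a**2 elementwise, where the constant c is fixed by the while loop. The gradient is therefore 2 * c * a rather than d / a, so the proportionality check from above would fail here; instead, c cancels out in

a.grad == 2 * d / a  # expected: tensor([True, True]), since d = c * a**2 gives grad = 2 * c * a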
# Exercise 5: use automatic differentiation on sin(x) to plot its derivative, cos(x).
import numpy as np
from d2l import torch as d2l

x = np.linspace(-np.pi, np.pi, 100)  # the range of x values
x = torch.tensor(x, requires_grad=True)
y = torch.sin(x)
for i in range(100):
    y[i].backward(retain_graph=True)
print(x.grad)
d2l.plot(x.detach(), (y.detach(), x.grad), legend=("sin(x)", "cos(x)"))
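An equivalent shortcut: since each y[i] depends only on x[i] (the Jacobian is diagonal), one y.sum().backward() call accumulates the same cos(x) without the Python loop.

x.grad.zero_()
y = torch.sin(x)
y.sum().backward()
x.grad  # equals cos(x), same as the element-by-element loop above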
# A detailed write-up:
# https://zhang-yang.medium.com/the-gradient-argument-in-pytorchs-backward-function-explained-by-examples-68f266950c29
# An example:
# x = [x1, x2] = [1, 2]
# y = [3*x1**2, x1**2 + 2*x2**3, 10*x2]
# This is a derivative of a vector y with respect to a vector x, so we get a 3-by-2
# Jacobian matrix, and we need a 1-by-3 gradient argument, which yields a 1-by-2
# result (a row vector!).
x = torch.tensor([1.0, 2.0], requires_grad=True)
print("x: ", x)
y = torch.empty(3)
y[0] = 3 * x[0] ** 2
y[1] = x[0] ** 2 + 2 * x[1] ** 3
y[2] = 10 * x[1]
print("y:", y)
gradient_value = [1., 10., 100.]  # floats, to match y's dtype
y.backward(torch.tensor(gradient_value))
x.grad
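To confirm the arithmetic: the Jacobian at x = (1, 2) is [[6, 0], [2, 24], [0, 10]], so with v = [1, 10, 100] the vector-Jacobian product is vT J = [1*6 + 10*2 + 100*0, 1*0 + 10*24 + 100*10] = [26, 1240], and x.grad should be tensor([26., 1240.]).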
# A slightly special case.
x = torch.tensor(1., requires_grad=True)
print('x:', x)
y = x ** 2
print('y:', y)
gradient_value = [1., 10., 100., 1000.]
# y.backward(torch.tensor(gradient_value))
# print('x.grad:', x.grad)
# The two lines above raise an error, as they should: y is a scalar, so a four-element
# gradient argument violates the rules of matrix multiplication.
# Instead, multiply by each entry of gradient_value in turn:
for v in gradient_value:
    y.backward(torch.tensor(v), retain_graph=True)
print("x.grad: ", x.grad)
# No error this way: multiply one value at a time without zeroing, and the gradients accumulate!
# The next cell accumulates gradients in a similar way.
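Since dy/dx = 2x = 2 here, each pass adds 2 * v to x.grad, and the accumulated total should be 2 * (1 + 10 + 100 + 1000) = 2222.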
x = torch.tensor([1., 1.], requires_grad=True)
y = sum(x)  # a scalar-valued function
gradient_value = [1., 10., 100., 1000.]
for v in gradient_value:
    # To satisfy matrix multiplication, gradient_value is fed one 1-by-1 value at a time.
    y.backward(torch.tensor(v), retain_graph=True)
print("x.grad = ", x.grad)
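Each partial derivative of y = x1 + x2 is 1, so every backward call adds v to both entries; the final x.grad should be tensor([1111., 1111.]).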