2.4 Calculus

%matplotlib inline
# A magic command that renders figure output inline in the notebook as static images rather than interactive widgets.
import numpy as np
from matplotlib_inline import backend_inline
from d2l import torch as d2l

def f(x):
    return 3 * x ** 2 - 4 * x
def numerical_lim(f, x, h):
    return (f(x + h) - f(x)) / h

h = 0.1
for i in range(5):
    # estimate the derivative of f at x = 1 by letting h shrink toward 0
    print(f'h = {h:.5f}, numerical limit = {numerical_lim(f, 1, h):.5f}')
    h *= 0.1
# the numerical limit approaches 2, the true derivative f'(1) = 6*1 - 4 = 2
h = 0.10000, numerical limit = 2.30000
h = 0.01000, numerical limit = 2.03000
h = 0.00100, numerical limit = 2.00300
h = 0.00010, numerical limit = 2.00030
h = 0.00001, numerical limit = 2.00003
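As a side note (a sketch of my own, not part of the original notebook): the symmetric difference quotient (f(x+h) - f(x-h)) / (2h) generally converges faster than the one-sided quotient above, and for this quadratic f it is exact, since its error term depends on the third derivative, which is zero here. It returns the true derivative f'(1) = 2 for every h:

def central_diff(f, x, h):
    return (f(x + h) - f(x - h)) / (2 * h)

h = 0.1
for i in range(5):
    print(f'h = {h:.5f}, central difference = {central_diff(f, 1, h):.5f}')
    h *= 0.1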
# Set the figure output format
def use_svg_display():  #@save
    # "#@save" is a special marker that saves the function into the d2l package.
    backend_inline.set_matplotlib_formats('svg')

def set_figsize(figsize=(3.5, 2.5)):  #@save
    # Set the matplotlib figure size
    use_svg_display()
    # We can use plt directly because the d2l package exposes it ("from d2l import torch as d2l").
    d2l.plt.rcParams['figure.figsize'] = figsize

#@save
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    # Set the attributes of the matplotlib axes
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    axes.grid()
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None, ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-', 'r:'), figsize=(3.5, 2.5), axes=None):
    # Concisely plot one or several curves
    if legend is None:
        legend = []

    set_figsize(figsize)
    axes = axes if axes else d2l.plt.gca()

    # Return True if X has exactly one axis
    def has_one_axis(X):
        return (hasattr(X, "ndim") and X.ndim == 1 or isinstance(X, list) and not hasattr(X[0], "__len__"))
    # A neat way to test for one-dimensionality: first check ndim directly; for a list,
    # check whether its first element is itself iterable; if not, the list is one-dimensional.

    if has_one_axis(X):
        X = [X]
    # If X is a one-dimensional array or list, wrap it as [X]. This turns a 1D X into a nested
    # (two-dimensional) structure so that the later processing is uniform. The same applies to Y = [Y] below.

    if Y is None:
        X, Y = [[]] * len(X), X
        # If Y is None, replace X with a list of empty lists of the same length and assign the old X to Y.
        # This guarantees that X and Y have the same length and that Y is not None.
    elif has_one_axis(Y):
        Y = [Y]

    if len(X) != len(Y):
        X = X * len(Y)
        # If X and Y have different lengths, repeat X so that its length matches Y.
        # Each curve y is a different function of the same x values, so the x list is simply replicated.

    axes.cla()  # clear the axes so the figure can be redrawn
    for x, y, fmt in zip(X, Y, fmts):
        if len(x):
            axes.plot(x, y, fmt)
        else:
            axes.plot(y, fmt)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
x = np.arange(0, 3, 0.1)  # the array of x values
# The tangent line at x = 1: f(1) = -1 and f'(1) = 2, so y = f(1) + f'(1)(x - 1) = 2x - 3.
plot(x, [f(x), 2 * x - 3], 'x', 'f(x)', legend=['f(x)', 'Tangent line (x=1)'])

[Figure: f(x) = 3x^2 - 4x and its tangent line at x = 1]

2.5 Automatic differentiation

import torch
x = torch.arange(4.0)
x, torch.ones(len(x))
(tensor([0., 1., 2., 3.]), tensor([1., 1., 1., 1.]))
x.requires_grad_(True)
# Allocate storage for the gradient, accessible later as x.grad
x.grad  # defaults to None
y = 2 * torch.dot(x, x)
y
tensor(28., grad_fn=<MulBackward0>)
y.backward()
x.grad
# we know y = 2 * x . x = 2 * sum(x_i^2), so the gradient is 4x
tensor([ 0.,  4.,  8., 12.])
x.grad == 4 * x  # verify that the result is correct
tensor([True, True, True, True])
# By default, PyTorch accumulates gradients, so we need to clear the previous values first.
x.grad.zero_()
y = x.sum()
y.backward()
x.grad
# y = x1 + x2 + x3 + x4, so the partial derivative with respect to each component is 1.
tensor([1., 1., 1., 1.])
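A short sketch of my own (using a fresh tensor x2 so the x above is left untouched) to show why the zeroing matters: calling backward() a second time without clearing the gradient adds the new gradient onto the stored one.

x2 = torch.arange(4.0, requires_grad=True)
y2 = 2 * torch.dot(x2, x2)
y2.backward()
print(x2.grad)  # tensor([ 0.,  4.,  8., 12.])
y2 = 2 * torch.dot(x2, x2)
y2.backward()
print(x2.grad)  # tensor([ 0.,  8., 16., 24.]), accumulated rather than replaced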

2.5.2 Backpropagation for vectors

# The derivative of a vector y with respect to a vector x is, in general, a matrix (the Jacobian).
# When calling backward() on a non-scalar y, we must pass in a gradient argument, which gives the
# gradient of some downstream scalar with respect to y itself.

x.grad.zero_()  # clear the previous values
y = x * x  # y is now also a vector; note that elementwise multiplication of 1D tensors yields another 1D tensor
y.sum().backward()
# equivalent to y.backward(torch.ones(len(x))), i.e. y.backward(torch.tensor([1., 1., 1., 1.]))
y, x.grad
(tensor([0., 1., 4., 9.], grad_fn=<MulBackward0>), tensor([0., 2., 4., 6.]))
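To double-check the equivalence claimed in the comment above (a sketch of my own): passing a vector of ones as the gradient argument gives the same x.grad as summing first.

x.grad.zero_()
y = x * x
y.backward(torch.ones(len(x)))  # same effect as y.sum().backward()
x.grad  # tensor([0., 2., 4., 6.])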

2.5.3 Detaching computation

x.grad.zero_()
y = x * x
u = y.detach()
print(u)
z = u * x
# z is really x^3, but we want to treat y as a constant, so we need a variable u that has the same
# values as y yet is disconnected from x. Then dz/dx = u, because u acts as a constant here.
# Backpropagation computes the partial derivative with respect to every node in the graph,
# so detaching u cuts the connection and excludes that path from differentiation.
z.sum().backward()
print(z.sum())
print(x.grad)
x.grad == u
tensor([0., 1., 4., 9.])
tensor(36., grad_fn=<SumBackward0>)
tensor([0., 1., 4., 9.])





tensor([True, True, True, True])
# Supplementary check: elementwise multiplication of two 1D tensors yields another 1D tensor
a = torch.tensor([1, 2, 3, 4])
b = torch.tensor([5, 6, 7, 8])
a * b
tensor([ 5, 12, 21, 32])
# Because the computation of y was recorded, we can still call backward on y afterwards
# and obtain the derivative of y = x * x with respect to x, namely 2x.
x.grad.zero_()
y.sum().backward()
x.grad
tensor([0., 2., 4., 6.])
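For contrast (my own sketch, not in the original): without detach, z = y * x = x**3 depends on x through y as well, so backpropagation follows that path too and yields dz/dx = 3x^2.

x.grad.zero_()
y = x * x
z = y * x  # no detach: the chain through y is kept
z.sum().backward()
x.grad == 3 * x ** 2  # tensor([True, True, True, True])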

2.5.4 Gradients of Python control flow

# In the code below, the number of while-loop iterations and the branch taken by the if statement
# both depend on the value of the input a.
def f(a):
    b = a * 2
    while b.norm() < 1000:
        b = b * 2
    if b.sum() > 0:
        c = b
    else:
        c = 100 * b
    return c
# Note that, for any fixed input, this function is just a linear function c = k * a,
# so its derivative is the constant k.
# Now compute the gradient
a = torch.randn(size=[1], requires_grad=True)
# a = torch.randn(size=[2], requires_grad=True) would not work here: without a gradient argument,
# backward() requires the result to be a scalar (a single-element tensor).
print(a)
d = f(a)
d.backward()
tensor([-1.3762], requires_grad=True)
# a.grad.zero_()
a.grad == d / a  # d / a recovers the constant coefficient k; a.grad holds the gradient with respect to a
print(a.grad)
tensor([102400.])
def g(t):
    m = t * t
    while m.norm() < 1000:
        m = m * 2
    return m

a = torch.randn(size=[2], requires_grad=True)
print(a)
d = g(a)
print(d)
d.sum().backward()
tensor([0.1107, 2.5644], requires_grad=True)
tensor([   3.1388, 1683.4849], grad_fn=<MulBackward0>)
a.grad  # the input is a 2-element vector, so the result is differentiated with respect to both components
tensor([  56.6937, 1312.9694])
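A sanity check of my own: for this particular input, g returns d = 2^k * a^2 elementwise, where k is the number of doublings in the while loop, so the gradient should be 2^(k+1) * a, i.e. 2 * d / a.

torch.allclose(a.grad, 2 * d / a)  # True: autograd matches the hand-derived gradient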
# Exercise 5: use automatic differentiation to plot the derivative of sin(x), which should be cos(x)
import numpy as np
from d2l import torch as d2l

x = np.linspace(-np.pi, np.pi, 100)  # the range of x values
x = torch.tensor(x, requires_grad=True)
y = torch.sin(x)
for i in range(100):
    y[i].backward(retain_graph=True)
print(x.grad)
d2l.plot(x.detach(), (y.detach(), x.grad), legend=("sin(x)", "cos(x)"))
tensor([-1.0000, -0.9980, -0.9920, -0.9819, -0.9679, -0.9501, -0.9284, -0.9029,
        -0.8738, -0.8413, -0.8053, -0.7660, -0.7237, -0.6785, -0.6306, -0.5801,
        -0.5272, -0.4723, -0.4154, -0.3569, -0.2969, -0.2358, -0.1736, -0.1108,
        -0.0476,  0.0159,  0.0792,  0.1423,  0.2048,  0.2665,  0.3271,  0.3863,
         0.4441,  0.5000,  0.5539,  0.6056,  0.6549,  0.7015,  0.7453,  0.7861,
         0.8237,  0.8580,  0.8888,  0.9161,  0.9397,  0.9595,  0.9754,  0.9874,
         0.9955,  0.9995,  0.9995,  0.9955,  0.9874,  0.9754,  0.9595,  0.9397,
         0.9161,  0.8888,  0.8580,  0.8237,  0.7861,  0.7453,  0.7015,  0.6549,
         0.6056,  0.5539,  0.5000,  0.4441,  0.3863,  0.3271,  0.2665,  0.2048,
         0.1423,  0.0792,  0.0159, -0.0476, -0.1108, -0.1736, -0.2358, -0.2969,
        -0.3569, -0.4154, -0.4723, -0.5272, -0.5801, -0.6306, -0.6785, -0.7237,
        -0.7660, -0.8053, -0.8413, -0.8738, -0.9029, -0.9284, -0.9501, -0.9679,
        -0.9819, -0.9920, -0.9980, -1.0000], dtype=torch.float64)

[Figure: sin(x) and its derivative cos(x) obtained by automatic differentiation]
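Calling backward() element by element takes 100 separate passes. A cheaper equivalent (a sketch of my own, with xs and ys as fresh names): sum first, since the partial derivative of sum_i sin(x_i) with respect to x_j is cos(x_j), so one backward pass fills the whole gradient.

xs = torch.linspace(-np.pi, np.pi, 100, requires_grad=True)
ys = torch.sin(xs)
ys.sum().backward()  # a single backward pass instead of 100
d2l.plot(xs.detach(), (ys.detach(), xs.grad), legend=("sin(x)", "cos(x)"))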

# A detailed explanation of the gradient argument: https://zhang-yang.medium.com/the-gradient-argument-in-pytorchs-backward-function-explained-by-examples-68f266950c29
# An example:
# x = [x1, x2] = [1, 2]
# y = [3*x1^2, x1^2 + 2*x2^3, 10*x2]
# The derivative of the vector y with respect to the vector x is the 3-by-2 Jacobian matrix,
# so the gradient argument must be a 1-by-3 vector v, and backward() produces v @ J, a 1-by-2 row vector.
x = torch.tensor([1.0, 2.0], requires_grad=True)
print("x: ", x)
y = torch.empty(3)
y[0] = 3 * x[0] ** 2
y[1] = x[0] ** 2 + 2 * x[1] ** 3
y[2] = 10 * x[1]
print("y:", y)
gradient_value = [1, 10, 100]
y.backward(torch.tensor(gradient_value))
x.grad
x:  tensor([1., 2.], requires_grad=True)
y: tensor([ 3., 17., 20.], grad_fn=<CopySlices>)





tensor([  26., 1240.])
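To connect this with the Jacobian picture (a cross-check of my own; it assumes torch.autograd.functional.jacobian, available in recent PyTorch releases): building the full 3-by-2 Jacobian explicitly and forming the vector-Jacobian product v @ J reproduces x.grad.

from torch.autograd.functional import jacobian

def func(t):
    return torch.stack([3 * t[0] ** 2, t[0] ** 2 + 2 * t[1] ** 3, 10 * t[1]])

J = jacobian(func, torch.tensor([1.0, 2.0]))
v = torch.tensor([1.0, 10.0, 100.0])
print(J)      # tensor([[ 6.,  0.], [ 2., 24.], [ 0., 10.]])
print(v @ J)  # tensor([  26., 1240.]), the same values as x.grad above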
# A slightly special case.
x = torch.tensor(1., requires_grad=True)
print('x:', x)
y = x ** 2
print('y:', y)
gradient_value = [1., 10., 100., 1000.]
# y.backward(torch.tensor(gradient_value))
# print('x.grad:', x.grad)
# The two lines above raise an error, as they should: passing a length-4 gradient for a scalar y
# violates the matrix-multiplication rules of the vector-Jacobian product.
# Instead, multiply by each element of gradient_value separately:
for v in gradient_value:
    y.backward(torch.tensor(v), retain_graph=True)
    print("x.grad: ", x.grad)
# No error this time: the values are applied one at a time and, since x.grad is never zeroed,
# the gradients accumulate. With dy/dx = 2 at x = 1, the running totals are 2, 22, 222, 2222.
# The next cell shows a similar accumulation of gradients.
#下面框是和这个类似的一个积累梯度
x: tensor(1., requires_grad=True)
y: tensor(1., grad_fn=<PowBackward0>)
x.grad:  tensor(2.)
x.grad:  tensor(22.)
x.grad:  tensor(222.)
x.grad:  tensor(2222.)
x = torch.tensor([1., 1.], requires_grad=True)
y = sum(x)  # a scalar obtained by summing the elements of x
gradient_value = [1., 10., 100., 1000.]
for v in gradient_value:
    # To satisfy the matrix-multiplication rules, each gradient passed in must be a scalar,
    # so the values are fed in one at a time; the resulting gradients accumulate in x.grad.
    y.backward(torch.tensor(v), retain_graph=True)
    print("x.grad = ", x.grad)
x.grad =  tensor([1., 1.])
x.grad =  tensor([11., 11.])
x.grad =  tensor([111., 111.])
x.grad =  tensor([1111., 1111.])