4.4 Model selection, underfitting and overfitting

4.4.4 Polynomial regression

import math
import numpy as np
import torch
from torch import nn
from d2l import torch as d2l

4.4.4多项式回归

1. generate datasets

# Supplementary notes:
# Generalization error: estimated by applying the model to an independent,
# randomly drawn test set. Test examples must not have appeared in the training
# samples; otherwise overfitting would go undetected.
# In other words, the generalization error is the expected error of the model on
# an unlimited number of examples drawn from the same distribution as the
# original samples.
# Training error: the error the model attains on the training dataset.

# Given x, the labels of both the training and the test data are generated from
# the following third-order polynomial:
#   y = 5 + 1.2x - 3.4x^2 / 2! + 5.6x^3 / 3! + epsilon,  epsilon ~ N(0, 0.1^2)
max_degree = 20  # length of the coefficient vector (number of polynomial terms)
n_train, n_test = 100, 100  # sizes of the training and test splits
# One-dimensional coefficient vector: the four non-zero coefficients of the
# cubic, followed by zeros for every remaining term.
true_w = np.concatenate(([5, 1.2, -3.4, 5.6], np.zeros(max_degree - 4)))
true_w

array([ 5. ,  1.2, -3.4,  5.6,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
        0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ])

import math

# Draw n_train + n_test inputs x ~ N(0, 1) as a 2-D column of shape (n_train + n_test, 1).
features = np.random.normal(size=(n_train + n_test, 1))
# Shuffle the rows in place.
np.random.shuffle(features)

# Broadcasting the column against the exponents 0 .. max_degree-1 produces a
# (n_train + n_test, max_degree) matrix: row r holds x_r^0, x_r^1, ..., x_r^(max_degree-1).
poly_features = features ** np.arange(max_degree)
# Divide column k by k! (math.gamma(k + 1) == k!), all columns in one broadcast step.
poly_features /= np.array([math.gamma(k + 1) for k in range(max_degree)])

# Matrix (2-D) times coefficient vector (1-D): each row of scaled powers is dotted
# with true_w, i.e. the polynomial is evaluated at every sampled x.
# The result is 1-D with n_train + n_test entries.
labels = poly_features @ true_w
# Add the noise term epsilon ~ N(0, 0.1^2) from the generating formula.
labels = labels + np.random.normal(scale=0.1, size=labels.shape)
labels

array([  4.8692312 ,   5.16634179,   5.17157219,   4.44992635,
         1.46280344,  -5.06218181,   5.889207  ,   4.56238664,
         4.87028183,   4.54172465,   5.24859217,   5.04015151,
         5.18181701,   5.38424111,   3.39873268,   4.64275208,
         4.35271003,   5.16832895,   4.14543903,   4.65063877,
       -14.04214122,   8.89788376,   2.31341084,  15.59443065,
         5.27895688,   5.20009563,   5.52324288,   4.79111697,
         4.83454092,   4.21118178,   5.34829524,   4.98778878,
         4.78903991,   5.42121524,  -0.08646758,   5.22075864,
         3.04052132,   5.37641382,   5.31427983,   1.95115128,
         5.20895355,   4.78196463,   4.86370408,   3.10946228,
        -0.0601124 ,   2.08128196,   4.96359631,   4.95759656,
         4.96459902,  -1.95144794,   2.27218026,   5.06122989,
         5.15736863,   8.24413866,   2.93171763,   5.10899302,
         5.16873876,   3.97678117,   5.22783075, -14.22600597,
         5.89299691,   1.4600962 ,   5.39673791,   5.34356736,
         1.41960834,   4.74550209,   5.30386891,   5.1905644 ,
         8.63503162,   5.15137852,   4.91717414,   5.25393608,
         4.22608381,   5.19507594,  -2.33845945,   5.03897046,
         4.83550022,   3.99346892,   6.13130488,   5.35084292,
         5.29180567,   5.13371325,  -5.39315402,   3.64932434,
         5.4949872 ,   3.23705686,   5.01227656,   5.21709494,
         4.89355881,  -2.32141592,   5.16010209,   5.30992539,
         5.05714421,   3.77927268,   1.67874088,  -0.74648169,
         5.23818345,   4.46044286, -14.47000192,   5.40050032,
        -0.41654157,   5.22906079,   5.99768845,  -0.39934549,
         0.04835856,   5.35289082,   5.14103181,   5.2558319 ,
         5.24250871,   5.26817837,   5.33336094,   5.37566382,
         5.21431961,   1.89708795,   4.41728779,   2.90170984,
         5.23697246,   5.33037079,   3.70246128,   8.07963553,
         5.21835657,   5.24508515,   5.17449204,   1.72924506,
         5.72796592,   5.31269464,   5.03416459,   4.97580148,
         5.17061158,   5.20614036,   4.48565089,   5.33699495,
         4.7308748 ,   5.82422954,   4.32557951,   6.60313714,
         6.61531613,   5.01810497,   3.76648818,   3.68447076,
         4.66927153,   5.5730621 ,  -1.39018833,   5.43207817,
         5.48576896,   5.12510468,   5.50463484,   5.34312639,
         6.09526137,   5.46858919,   5.35950711,   3.90834397,
         4.93954786,   1.84312552, -24.41216278,   5.13337525,
        -7.75916151,  10.15291851,   5.16810522,   4.70395796,
         5.51297724,   4.69024493,   3.52962049,   5.5393479 ,
         4.79681432,   4.77456907,   5.4856192 ,  -3.6675196 ,
         5.41595652,  -0.67486994,   3.51052895,   5.2695433 ,
         5.32002376,   4.9454762 ,   4.43202639,   4.78441846,
         4.58334594,   3.90972686,   3.7824209 ,   5.69007162,
         3.49141187,   5.162478  ,   4.27201393,   5.10804769,
         5.43021279,   3.43238998,   6.4019248 ,  -0.20671286,
         6.09722773,   2.12787315, -10.43890134,   5.4582987 ,
         4.89111854,  -0.37229775,   5.19130977,   5.11395338,
         5.31805814,   5.23543407,   2.61401117,  -0.30074837])

import torch

# Convert every NumPy ndarray to a float32 tensor; the number of dimensions is unchanged.
#   true_w        - 1-D coefficient vector of the polynomial
#   features      - 2-D column of the sampled x values
#   poly_features - one row per x; column k holds x^k / k! for k = 0 .. 19
#   labels        - 1-D vector of noisy polynomial values, one per x
true_w = torch.tensor(true_w, dtype=torch.float32)
features = torch.tensor(features, dtype=torch.float32)
poly_features = torch.tensor(poly_features, dtype=torch.float32)
labels = torch.tensor(labels, dtype=torch.float32)
# Show the coefficients and the first two samples of each tensor.
true_w, features[:2], poly_features[:2, :], labels[:2]

(tensor([ 5.0000,  1.2000, -3.4000,  5.6000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000]),
 tensor([[-0.0973],
         [ 0.1975]]),
 tensor([[ 1.0000e+00, -9.7273e-02,  4.7310e-03, -1.5340e-04,  3.7305e-06,
          -7.2575e-08,  1.1766e-09, -1.6350e-11,  1.9880e-13, -2.1487e-15,
           2.0901e-17, -1.8483e-19,  1.4982e-21, -1.1211e-23,  7.7893e-26,
          -5.0513e-28,  3.0710e-30, -1.7572e-32,  9.4960e-35, -4.8616e-37],
         [ 1.0000e+00,  1.9748e-01,  1.9499e-02,  1.2835e-03,  6.3367e-05,
           2.5027e-06,  8.2371e-08,  2.3238e-09,  5.7362e-11,  1.2586e-12,
           2.4855e-14,  4.4621e-16,  7.3431e-18,  1.1155e-19,  1.5734e-21,
           2.0714e-23,  2.5566e-25,  2.9698e-27,  3.2582e-29,  3.3864e-31]]),
 tensor([4.8692, 5.1663]))

2. train and test the model

def evaluate_loss(net, data_iter, loss): #@save
    """Return the average loss of ``net`` over every batch in ``data_iter``.

    Args:
        net: Callable mapping a feature batch ``X`` to predictions.
        data_iter: Iterable yielding ``(X, y)`` batches.
        loss: Callable ``loss(out, y)`` returning a tensor of losses, e.g.
            ``nn.MSELoss(reduction='none')`` for one loss per element.

    Returns:
        float: sum of all loss values divided by the number of loss elements.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no batches.
    """
    total_loss, num_elements = 0.0, 0
    # Pure evaluation: no backward pass follows, so do not record autograd graphs.
    with torch.no_grad():
        for X, y in data_iter:
            out = net(X)
            # Give the labels the prediction's shape so the loss is computed
            # element against element instead of being broadcast.
            y = y.reshape(out.shape)
            l = loss(out, y)
            total_loss += float(l.sum())
            num_elements += l.numel()
    return total_loss / num_elements

# Training routine
def train(train_features, test_features, train_labels, test_labels, num_epochs = 400):
    """Fit a bias-free linear model with SGD and plot train/test loss per epoch.

    Args:
        train_features: Training inputs; the size of the last dimension is the
            number of model inputs.
        test_features: Test inputs with the same feature layout.
        train_labels: Training targets (reshaped to a column internally).
        test_labels: Test targets (reshaped to a column internally).
        num_epochs: Number of passes over the training data.

    Prints the learned weights after the last epoch; returns nothing.
    """
    # reduction='none' keeps one loss value per sample.
    loss = nn.MSELoss(reduction='none')
    num_inputs = train_features.shape[-1]
    # Single linear layer with one output and no bias. The callers in this file
    # pass polynomial features whose first column is x^0 = 1, so the constant
    # term is learned as an ordinary weight.
    net = nn.Sequential(nn.Linear(num_inputs, 1, bias=False))
    batch_size = min(10, train_labels.shape[0])
    train_iter = d2l.load_array(
        (train_features, train_labels.reshape(-1, 1)), batch_size)
    # is_train=False: the test data is not shuffled.
    test_iter = d2l.load_array(
        (test_features, test_labels.reshape(-1, 1)), batch_size, is_train=False)
    trainer = torch.optim.SGD(net.parameters(), lr=0.01)
    animator = d2l.Animator(
        xlabel='epoch', ylabel='loss', yscale='log',
        xlim=[1, num_epochs], ylim=[1e-3, 1e2],
        legend=['train', 'test'])
    for epoch in range(1, num_epochs + 1):
        d2l.train_epoch_ch3(net, train_iter, loss, trainer)
        # Record the losses after the first epoch and then after every 20th epoch.
        if epoch == 1 or epoch % 20 == 0:
            train_loss = evaluate_loss(net, train_iter, loss)
            test_loss = evaluate_loss(net, test_iter, loss)
            animator.add(epoch, (train_loss, test_loss))
    print('weight:', net[0].weight.detach().numpy())

3. Third-order polynomial fitting (normal)

1
2
3

# Use only the first four polynomial feature columns: 1, x, x^2/2!, x^3/3!.
# The first n_train rows form the training split; the remaining rows form the test split.
train(train_features=poly_features[:n_train, :4],
      test_features=poly_features[n_train:, :4],
      train_labels=labels[:n_train],
      test_labels=labels[n_train:])

weight: [[ 4.996408   1.1785661 -3.4105065  5.635429 ]]

svg

4. Linear function fitting (underfitting)

1
2
3

# Use only the first two feature columns: 1 and x.
# A first-degree model is fitted to data generated by a cubic, so the test error
# stays large (underfitting).
train(train_features=poly_features[:n_train, :2],
      test_features=poly_features[n_train:, :2],
      train_labels=labels[:n_train],
      test_labels=labels[n_train:])

weight: [[3.8133729 3.794563 ]]

svg

5. High-order polynomial fitting (overfitting)

# Use every polynomial feature column and train for more epochs.
# The more complex model lowers the training loss while the test loss stays
# high, which is overfitting.
train(train_features=poly_features[:n_train, :],
      test_features=poly_features[n_train:, :],
      train_labels=labels[:n_train],
      test_labels=labels[n_train:],
      num_epochs=1500)

weight: [[ 4.974611    1.2390194  -3.2549357   5.2375994  -0.4758543   1.3785309
  -0.08715282  0.22586435 -0.0352993  -0.09998694 -0.03311426 -0.0362402
  -0.15619889  0.04489927  0.04778471 -0.15157764  0.19884372 -0.04693874
  -0.07462315  0.01904617]]

svg