4.10.1 download and load datasets: a complete set of file download and save utilities

#We will download various datasets and implement some helper functions to make downloading them easy.

#First we define a dictionary DATA_HUB that maps the name of each dataset to a 2-tuple describing it.

#The 2-tuple holds the dataset's URL and a SHA-1 hash used to verify the integrity of the downloaded file.

#All such datasets are hosted at the address stored in DATA_URL.

import hashlib
import os
import tarfile
import zipfile
import requests
#@save
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'  # base URL from which the datasets are downloaded
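For the Kaggle house-price files used later in this section, DATA_HUB gets two entries. A sketch of the registration (the file names follow the d2l book, but the SHA-1 strings below are placeholders; copy the real hashes from the d2l source rather than these):

DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv', '<sha1-of-train-csv>')  # placeholder hash
DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv', '<sha1-of-test-csv>')  # placeholder hash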
#Next we define a download function that downloads a dataset, caches it in a local directory
#(../data by default), and returns the name of the downloaded file.
#If a file for this dataset already exists in the cache and its SHA-1 matches the one recorded
#in DATA_HUB, we use the cached file and avoid downloading it again.
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return the local filename."""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)  # i.e. the default directory ../data
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):  # the file is already cached
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:  # open it
            while True:
                data = f.read(1048576)  # read in 1 MiB chunks, feeding each chunk into the hash object
                if not data:
                    break  # keep reading until end of file
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:  # finally compare the digest
            return fname  # cache hit: skip the download
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname
def download_extract(name, folder=None):  #@save
    """Download and extract a zip/tar file."""
    fname = download(name)
    base_dir = os.path.dirname(fname)  # the directory containing the archive serves as base_dir
    data_dir, ext = os.path.splitext(fname)  # split the extension from the file name
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')  # open the zip archive for reading
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')  # open the tar archive for reading
    else:
        assert False, 'Only zip/tar files can be extracted'
    fp.extractall(base_dir)  # extract everything into the archive's directory
    return os.path.join(base_dir, folder) if folder else data_dir  # return the path of the dataset directory

def download_all():  #@save
    """Download all files in DATA_HUB."""
    for name in DATA_HUB:
        download(name)
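With these helpers in place, fetching a registered dataset is a single call. A minimal usage sketch, assuming the 'kaggle_house_train' entry above has been registered with a valid hash:

train_csv = download('kaggle_house_train')  # e.g. returns '../data/kaggle_house_pred_train.csv'
# For archives, download_extract(name) additionally unpacks the zip/tar next to the
# downloaded file and returns the directory containing the extracted data.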

4.10.2 Kaggle

dataset_dir_name = "data/house-prices-advanced-regression-techniques/"
%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
train_data = pd.read_csv(dataset_dir_name + "train.csv")
test_data = pd.read_csv(dataset_dir_name + "test.csv")
print(train_data.shape)
print(test_data.shape)
(1460, 81)
(1459, 80)
#Let's look at the first four features, the last two features, and the corresponding label (the price)
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])  # 0,1,2,3 are the first four features; -3,-2 are the last two, since the price itself sits at position -1
   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data))  # all train columns except Id and the label; Id carries no predictive information
# Chain train_data and test_data together. Note that as written this keeps test_data's Id column
# (visible as the trailing Id column below, NaN for the train rows); the d2l book drops it too, via test_data.iloc[:, 1:].
all_features

The output:

MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition Id
0 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 2 2008 WD Normal NaN
1 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 ... 0 NaN NaN NaN 0 5 2007 WD Normal NaN
2 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside ... 0 NaN NaN NaN 0 9 2008 WD Normal NaN
3 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner ... 0 NaN NaN NaN 0 2 2006 WD Abnorml NaN
4 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 ... 0 NaN NaN NaN 0 12 2008 WD Normal NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1454 160 RM 21.0 1936 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 6 2006 WD Normal 2915.0
1455 160 RM 21.0 1894 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 4 2006 WD Abnorml 2916.0
1456 20 RL 160.0 20000 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 9 2006 WD Abnorml 2917.0
1457 85 RL 62.0 10441 Pave NaN Reg Lvl AllPub Inside ... 0 NaN MnPrv Shed 700 7 2006 WD Normal 2918.0
1458 60 RL 74.0 9627 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 11 2006 WD Normal 2919.0

2919 rows × 80 columns

4.10.4 data preprocessing

#First we standardize the numeric features to zero mean and unit variance.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index  # .index extracts the names of all columns whose dtype is not 'object'
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# a neat way to apply the formula to every column
# After standardization all the means are zero, so we can set the missing values to 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)  # fillna fills the NA values, i.e. with what used to be the mean
all_features  # every feature that could be converted has been: all now have fixed mean and variance
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition Id
0 0.067320 RL -0.184443 -0.217841 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 -1.551918 0.157619 WD Normal 0.000000
1 -0.873466 RL 0.458096 -0.072032 Pave NaN Reg Lvl AllPub FR2 ... -0.063139 NaN NaN NaN -0.089577 -0.446848 -0.602858 WD Normal 0.000000
2 0.067320 RL -0.055935 0.137173 Pave NaN IR1 Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 1.026577 0.157619 WD Normal 0.000000
3 0.302516 RL -0.398622 -0.078371 Pave NaN IR1 Lvl AllPub Corner ... -0.063139 NaN NaN NaN -0.089577 -1.551918 -1.363335 WD Abnorml 0.000000
4 0.067320 RL 0.629439 0.518814 Pave NaN IR1 Lvl AllPub FR2 ... -0.063139 NaN NaN NaN -0.089577 2.131647 0.157619 WD Normal 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1454 2.419286 RM -2.069222 -1.043758 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 -0.078492 -1.363335 WD Normal 1.720777
1455 2.419286 RM -2.069222 -1.049083 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 -0.815205 -1.363335 WD Abnorml 1.723150
1456 -0.873466 RL 3.884968 1.246594 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 1.026577 -1.363335 WD Abnorml 1.725524
1457 0.655311 RL -0.312950 0.034599 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN MnPrv Shed 1.144116 0.289865 -1.363335 WD Normal 1.727897
1458 0.067320 RL 0.201080 -0.068608 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 1.763290 -1.363335 WD Normal 1.730271

2919 rows × 80 columns
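A tiny self-contained example of what the two preprocessing lines do to a numeric column with a missing value (toy data, not from the competition):

toy = pd.DataFrame({'x': [1.0, 3.0, None, 5.0]})
toy['x'] = (toy['x'] - toy['x'].mean()) / toy['x'].std()  # mean/std skip NaN, and NaN stays NaN
toy['x'] = toy['x'].fillna(0)  # 0 is exactly where the old mean now sits
print(toy['x'].tolist())  # [-1.0, 0.0, 0.0, 1.0]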

# Next we handle the discrete values, including features such as MSZoning. We replace them with one-hot encodings,
# the same way we previously converted multi-class labels into vectors.
# For example, MSZoning takes the values RL and RM. We create two new indicator features, MSZoning_RL and MSZoning_RM,
# each taking the value 0 or 1. Under one-hot encoding, if the original MSZoning value is RL,
# then MSZoning_RL is 1 and MSZoning_RM is 0.

# Note: RL and RM stand for residential low density and residential medium density.
all_features = pd.get_dummies(all_features, dummy_na=True)
# get_dummies converts categorical features into numeric ones: for each category it creates a binary column,
# which is 1 if the row belongs to that category and 0 otherwise; dummy_na=True also adds an indicator for NaN.
all_features.shape
# We did this same operation back in chapters 2 and 3: split values like RL and RM apart and encode membership in binary.

# Now every feature is represented numerically.
(2919, 332)
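A minimal illustration of pd.get_dummies with dummy_na=True on a toy column (depending on the pandas version, the dummies print as 0/1 integers or as True/False booleans):

toy = pd.DataFrame({'MSZoning': ['RL', 'RM', None]})
print(pd.get_dummies(toy, dummy_na=True))
#    MSZoning_RL  MSZoning_RM  MSZoning_nan
# 0            1            0             0
# 1            0            1             0
# 2            0            0             1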
# With everything converted to sensible numeric data, we can train.
n_train = train_data.shape[0]  # number of training samples
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)  # the first n_train rows
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
# i.e. reshape the prices into a single-column tensor
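A quick shape check (the numbers follow from the outputs above: 1460 training rows, 1459 test rows, 332 feature columns):

print(train_features.shape)  # torch.Size([1460, 332])
print(test_features.shape)   # torch.Size([1459, 332])
print(train_labels.shape)    # torch.Size([1460, 1])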

4.10.5 training

# First we train a linear model with squared loss. A linear model obviously will not win the competition,
# but it provides a sanity check: is there meaningful signal in the data at all?
# If everything goes well, the linear model will serve as the baseline.
loss = nn.MSELoss()
in_features = train_features.shape[1]

def get_net():
    net = nn.Sequential(nn.Linear(in_features, 1))
    return net
# To account for relative error, we compute the root mean squared error between the logarithms
# of the predictions and of the true prices.
def log_rmse(net, features, labels):
    clipped_preds = torch.clamp(net(features), 1, float('inf'))  # clip predictions to [1, inf) so the logarithm stays well defined
    rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()
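Spelled out, log_rmse computes

$\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log y_i - \log \hat{y}_i\right)^2}$

so being off by the same ratio costs the same for a cheap house as for an expensive one; this is also the error measure Kaggle uses to score this competition.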
# We train with the Adam optimizer; its main appeal here is that it is not very sensitive to the initial learning rate.
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)  # split into minibatches for training
    # the Adam optimizer, here with weight decay
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()  # reset the gradients at each step
            l = loss(net(X), y)
            l.backward()  # compute the gradients, then update the parameters below
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

4.10.6 K-fold cross-validation

# K-fold cross-validation helps with model selection and hyperparameter tuning. We first define a function that
# returns the i-th fold of the data: slice i becomes the validation set and the rest becomes the training data.
# That is, the data is split into k parts; whichever slice is chosen for validation, the remaining slices form the training set.
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k  # the sample count split into k parts: the size of each fold
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)  # slice() gives a half-open interval, exactly the split we want
        X_part, y_part = X[idx, :], y[idx]  # idx indexes the rows belonging to fold j
        if j == i:
            X_valid, y_valid = X_part, y_part  # fold i becomes the validation set; keep looping
        elif X_train is None:
            X_train, y_train = X_part, y_part  # the training set is still empty, so start it with this fold and extend below
        else:
            X_train = torch.cat([X_train, X_part], 0)  # concatenate the remaining folds one by one;
            # dim 0 stacks along the row dimension, i.e. it adds rows
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid
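A quick check of the resulting split sizes on this dataset (1460 rows, k=5, so fold_size = 292; note that when k does not divide the sample count evenly, the remainder rows are silently dropped):

X_tr, y_tr, X_va, y_va = get_k_fold_data(5, 0, train_features, train_labels)
print(X_tr.shape, X_va.shape)  # torch.Size([1168, 332]) torch.Size([292, 332])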
# Now we train K times under K-fold cross-validation and return the average training and validation errors.
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0  # accumulated training and validation losses
    for i in range(k):  # each round can be viewed as a fresh dataset
        data = get_k_fold_data(k, i, X_train, y_train)  # slice the data
        net = get_net()  # build a fresh network
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate, weight_decay, batch_size)
        # *data unpacks the 4-tuple (X_train, y_train, X_valid, y_valid) into the arguments of train()
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]  # accumulate the error of each fold
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls], xlabel='epoch', ylabel='rmse',
                     xlim=[1, num_epochs], legend=['train', 'valid'], yscale='log')
        # plot the training/validation curves for the first fold, print the loss of every fold,
        # and finally return the averaged losses
        print(f'fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

4.10.7 model selection

# We pick one set of untuned hyperparameters and leave improving it as an exercise for the reader;
# K-fold cross-validation tends to be fairly stable across repeated runs.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print(f'{k}-fold validation: avg train log rmse {float(train_l):f}, '
      f'avg valid log rmse {float(valid_l):f}')
fold 1, train log rmse 0.169717, valid log rmse 0.156305
fold 2, train log rmse 0.162615, valid log rmse 0.193323
fold 3, train log rmse 0.163744, valid log rmse 0.168370
fold 4, train log rmse 0.168188, valid log rmse 0.154512
fold 5, train log rmse 0.162700, valid log rmse 0.182809
5-fold validation: avg train log rmse 0.165393, avg valid log rmse 0.171064

[Figure: training and validation log rmse curves for the first fold (log-scale y-axis)]

# Sometimes one set of hyperparameters gives a very low training error while its K-fold cross-validation error is much higher; that is a clear sign of overfitting.

4.10.8 submit predictions to Kaggle

# Save the predictions in a csv file for submission.
def train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size):
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size)
    # no test labels are passed in, so only the training loss is recorded
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch', ylabel='log rmse',
             xlim=[1, num_epochs], yscale='log')
    print(f'train log rmse: {float(train_ls[-1]):f}')
    preds = net(test_features).detach().numpy()  # detach the predictions from the computation graph and convert to a numpy array
    # reformat and export for Kaggle
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])  # reshape(1, -1) gives a 2-D row vector; [0] drops a dimension, and Series turns it into a column
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)  # join the two columns
    submission.to_csv('submission.csv', index=False)  # finally write out the table
train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
# Note: this just trains on the full training set and produces predictions for submission; the held-out
# folds above were only for checking accuracy locally during model selection.
train log rmse: 0.162730

[Figure: training log rmse curve over the 100 epochs (log-scale y-axis)]
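A quick sanity check of the generated file before uploading, assuming the code above ran in the current working directory:

submission = pd.read_csv('submission.csv')
print(submission.columns.tolist())  # ['Id', 'SalePrice']
print(submission.shape)             # (1459, 2): one prediction per test row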