4.10.1 download and load datasets: a complete set of file download and save utilities

#We will download various datasets and implement some helper functions to make downloading them easy.

#First we define a dictionary DATA_HUB that maps the name of each dataset to a 2-tuple describing it.

#The 2-tuple holds the dataset's URL and a SHA-1 hash used to verify the integrity of the downloaded file.

#All such datasets are hosted at the address stored in DATA_URL.

import hashlib
import os
import tarfile
import zipfile
import requests
#@save
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'  # base URL from which the datasets are downloaded
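For the Kaggle house-price files used later in this section, DATA_HUB gets two entries. A sketch of the registration (the file names follow the d2l book, but the SHA-1 strings below are placeholders; copy the real hashes from the d2l source rather than these):

DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv', '<sha1-of-train-csv>')  # placeholder hash
DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv', '<sha1-of-test-csv>')  # placeholder hash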
#Next we define a download function that downloads a dataset, caches it in a local directory
#(../data by default), and returns the name of the downloaded file.
#If a file for this dataset already exists in the cache and its SHA-1 matches the one recorded
#in DATA_HUB, we use the cached file and avoid downloading it again.
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return the local filename."""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)  # i.e. the default directory ../data
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):  # the file is already cached
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:  # open it
            while True:
                data = f.read(1048576)  # read in 1 MiB chunks, feeding each chunk into the hash object
                if not data:
                    break  # keep reading until end of file
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:  # finally compare the digest
            return fname  # cache hit: skip the download
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname
def download_extract(name, folder=None):  #@save
    """Download and extract a zip/tar file."""
    fname = download(name)
    base_dir = os.path.dirname(fname)  # the directory containing the archive serves as base_dir
    data_dir, ext = os.path.splitext(fname)  # split the extension from the file name
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')  # open the zip archive for reading
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')  # open the tar archive for reading
    else:
        assert False, 'Only zip/tar files can be extracted'
    fp.extractall(base_dir)  # extract everything into the archive's directory
    return os.path.join(base_dir, folder) if folder else data_dir  # return the path of the dataset directory

def download_all():  #@save
    """Download all files in DATA_HUB."""
    for name in DATA_HUB:
        download(name)
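With these helpers in place, fetching a registered dataset is a single call. A minimal usage sketch, assuming the 'kaggle_house_train' entry above has been registered with a valid hash:

train_csv = download('kaggle_house_train')  # e.g. returns '../data/kaggle_house_pred_train.csv'
# For archives, download_extract(name) additionally unpacks the zip/tar next to the
# downloaded file and returns the directory containing the extracted data.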

4.10.2 Kaggle

dataset_dir_name = "data/house-prices-advanced-regression-techniques/"
%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
train_data = pd.read_csv(dataset_dir_name + "train.csv")
test_data = pd.read_csv(dataset_dir_name + "test.csv")
print(train_data.shape)
print(test_data.shape)
(1460, 81)
(1459, 80)
#Let's look at the first four features, the last two features, and the corresponding label (the price)
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])  # 0,1,2,3 are the first four features; -3,-2 are the last two, since the price itself sits at position -1
   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data))  # all train columns except Id and the label; Id carries no predictive information
# Chain train_data and test_data together. Note that as written this keeps test_data's Id column
# (visible as the trailing Id column below, NaN for the train rows); the d2l book drops it too, via test_data.iloc[:, 1:].
all_features

The output:

MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition Id
0 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 2 2008 WD Normal NaN
1 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 ... 0 NaN NaN NaN 0 5 2007 WD Normal NaN
2 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside ... 0 NaN NaN NaN 0 9 2008 WD Normal NaN
3 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner ... 0 NaN NaN NaN 0 2 2006 WD Abnorml NaN
4 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 ... 0 NaN NaN NaN 0 12 2008 WD Normal NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1454 160 RM 21.0 1936 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 6 2006 WD Normal 2915.0
1455 160 RM 21.0 1894 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 4 2006 WD Abnorml 2916.0
1456 20 RL 160.0 20000 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 9 2006 WD Abnorml 2917.0
1457 85 RL 62.0 10441 Pave NaN Reg Lvl AllPub Inside ... 0 NaN MnPrv Shed 700 7 2006 WD Normal 2918.0
1458 60 RL 74.0 9627 Pave NaN Reg Lvl AllPub Inside ... 0 NaN NaN NaN 0 11 2006 WD Normal 2919.0

2919 rows × 80 columns

4.10.4 data preprocessing

#First we standardize the numeric features to zero mean and unit variance.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index  # .index extracts the names of all columns whose dtype is not 'object'
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# a neat way to apply the formula to every column
# After standardization all the means are zero, so we can set the missing values to 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)  # fillna fills the NA values, i.e. with what used to be the mean
all_features  # every feature that could be converted has been: all now have fixed mean and variance
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition Id
0 0.067320 RL -0.184443 -0.217841 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 -1.551918 0.157619 WD Normal 0.000000
1 -0.873466 RL 0.458096 -0.072032 Pave NaN Reg Lvl AllPub FR2 ... -0.063139 NaN NaN NaN -0.089577 -0.446848 -0.602858 WD Normal 0.000000
2 0.067320 RL -0.055935 0.137173 Pave NaN IR1 Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 1.026577 0.157619 WD Normal 0.000000
3 0.302516 RL -0.398622 -0.078371 Pave NaN IR1 Lvl AllPub Corner ... -0.063139 NaN NaN NaN -0.089577 -1.551918 -1.363335 WD Abnorml 0.000000
4 0.067320 RL 0.629439 0.518814 Pave NaN IR1 Lvl AllPub FR2 ... -0.063139 NaN NaN NaN -0.089577 2.131647 0.157619 WD Normal 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1454 2.419286 RM -2.069222 -1.043758 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 -0.078492 -1.363335 WD Normal 1.720777
1455 2.419286 RM -2.069222 -1.049083 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 -0.815205 -1.363335 WD Abnorml 1.723150
1456 -0.873466 RL 3.884968 1.246594 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 1.026577 -1.363335 WD Abnorml 1.725524
1457 0.655311 RL -0.312950 0.034599 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN MnPrv Shed 1.144116 0.289865 -1.363335 WD Normal 1.727897
1458 0.067320 RL 0.201080 -0.068608 Pave NaN Reg Lvl AllPub Inside ... -0.063139 NaN NaN NaN -0.089577 1.763290 -1.363335 WD Normal 1.730271

2919 rows × 80 columns
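A tiny self-contained example of what the two preprocessing lines do to a numeric column with a missing value (toy data, not from the competition):

toy = pd.DataFrame({'x': [1.0, 3.0, None, 5.0]})
toy['x'] = (toy['x'] - toy['x'].mean()) / toy['x'].std()  # mean/std skip NaN, and NaN stays NaN
toy['x'] = toy['x'].fillna(0)  # 0 is exactly where the old mean now sits
print(toy['x'].tolist())  # [-1.0, 0.0, 0.0, 1.0]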

# Next we handle the discrete values, including features such as MSZoning. We replace them with one-hot encodings,
# the same way we previously converted multi-class labels into vectors.
# For example, MSZoning takes the values RL and RM. We create two new indicator features, MSZoning_RL and MSZoning_RM,
# each taking the value 0 or 1. Under one-hot encoding, if the original MSZoning value is RL,
# then MSZoning_RL is 1 and MSZoning_RM is 0.

# Note: RL and RM stand for residential low density and residential medium density.
all_features = pd.get_dummies(all_features, dummy_na=True)
# get_dummies converts categorical features into numeric ones: for each category it creates a binary column,
# which is 1 if the row belongs to that category and 0 otherwise; dummy_na=True also adds an indicator for NaN.
all_features.shape
# We did this same operation back in chapters 2 and 3: split values like RL and RM apart and encode membership in binary.

# Now every feature is represented numerically.
(2919, 332)
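A minimal illustration of pd.get_dummies with dummy_na=True on a toy column (depending on the pandas version, the dummies print as 0/1 integers or as True/False booleans):

toy = pd.DataFrame({'MSZoning': ['RL', 'RM', None]})
print(pd.get_dummies(toy, dummy_na=True))
#    MSZoning_RL  MSZoning_RM  MSZoning_nan
# 0            1            0             0
# 1            0            1             0
# 2            0            0             1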
# With everything converted to sensible numeric data, we can train.
n_train = train_data.shape[0]  # number of training samples
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)  # the first n_train rows
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
# i.e. reshape the prices into a single-column tensor
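A quick shape check (the numbers follow from the outputs above: 1460 training rows, 1459 test rows, 332 feature columns):

print(train_features.shape)  # torch.Size([1460, 332])
print(test_features.shape)   # torch.Size([1459, 332])
print(train_labels.shape)    # torch.Size([1460, 1])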

4.10.5 training

# First we train a linear model with squared loss. A linear model obviously will not win the competition,
# but it provides a sanity check: is there meaningful signal in the data at all?
# If everything goes well, the linear model will serve as the baseline.
loss = nn.MSELoss()
in_features = train_features.shape[1]

def get_net():
    net = nn.Sequential(nn.Linear(in_features, 1))
    return net
# To account for relative error, we compute the root mean squared error between the logarithms
# of the predictions and of the true prices.
def log_rmse(net, features, labels):
    clipped_preds = torch.clamp(net(features), 1, float('inf'))  # clip predictions to [1, inf) so the logarithm stays well defined
    rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()
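Spelled out, log_rmse computes

$\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log y_i - \log \hat{y}_i\right)^2}$

so being off by the same ratio costs the same for a cheap house as for an expensive one; this is also the error measure Kaggle uses to score this competition.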
# We train with the Adam optimizer; its main appeal here is that it is not very sensitive to the initial learning rate.
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)  # split into minibatches for training
    # the Adam optimizer, here with weight decay
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()  # reset the gradients at each step
            l = loss(net(X), y)
            l.backward()  # compute the gradients, then update the parameters below
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

4.10.6 K-fold cross-validation

# K-fold cross-validation helps with model selection and hyperparameter tuning. We first define a function that
# returns the i-th fold of the data: slice i becomes the validation set and the rest becomes the training data.
# That is, the data is split into k parts; whichever slice is chosen for validation, the remaining slices form the training set.
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k  # the sample count split into k parts: the size of each fold
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)  # slice() gives a half-open interval, exactly the split we want
        X_part, y_part = X[idx, :], y[idx]  # idx indexes the rows belonging to fold j
        if j == i:
            X_valid, y_valid = X_part, y_part  # fold i becomes the validation set; keep looping
        elif X_train is None:
            X_train, y_train = X_part, y_part  # the training set is still empty, so start it with this fold and extend below
        else:
            X_train = torch.cat([X_train, X_part], 0)  # concatenate the remaining folds one by one;
            # dim 0 stacks along the row dimension, i.e. it adds rows
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid
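A quick check of the resulting split sizes on this dataset (1460 rows, k=5, so fold_size = 292; note that when k does not divide the sample count evenly, the remainder rows are silently dropped):

X_tr, y_tr, X_va, y_va = get_k_fold_data(5, 0, train_features, train_labels)
print(X_tr.shape, X_va.shape)  # torch.Size([1168, 332]) torch.Size([292, 332])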
# Now we train K times under K-fold cross-validation and return the average training and validation errors.
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0  # accumulated training and validation losses
    for i in range(k):  # each round can be viewed as a fresh dataset
        data = get_k_fold_data(k, i, X_train, y_train)  # slice the data
        net = get_net()  # build a fresh network
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate, weight_decay, batch_size)
        # *data unpacks the 4-tuple (X_train, y_train, X_valid, y_valid) into the arguments of train()
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]  # accumulate the error of each fold
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls], xlabel='epoch', ylabel='rmse',
                     xlim=[1, num_epochs], legend=['train', 'valid'], yscale='log')
        # plot the training/validation curves for the first fold, print the loss of every fold,
        # and finally return the averaged losses
        print(f'fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

4.10.7 model selection

# We pick one set of untuned hyperparameters and leave improving it as an exercise for the reader;
# K-fold cross-validation tends to be fairly stable across repeated runs.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print(f'{k}-fold validation: avg train log rmse {float(train_l):f}, '
      f'avg valid log rmse {float(valid_l):f}')
fold 1, train log rmse 0.169717, valid log rmse 0.156305
fold 2, train log rmse 0.162615, valid log rmse 0.193323
fold 3, train log rmse 0.163744, valid log rmse 0.168370
fold 4, train log rmse 0.168188, valid log rmse 0.154512
fold 5, train log rmse 0.162700, valid log rmse 0.182809
5-fold validation: avg train log rmse 0.165393, avg valid log rmse 0.171064

[Figure: training and validation log rmse curves for the first fold (log-scale y-axis)]

# Sometimes one set of hyperparameters gives a very low training error while its K-fold cross-validation error is much higher; that is a clear sign of overfitting.

4.10.8 submit predictions to Kaggle

# Save the predictions in a csv file for submission.
def train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size):
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size)
    # no test labels are passed in, so only the training loss is recorded
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch', ylabel='log rmse',
             xlim=[1, num_epochs], yscale='log')
    print(f'train log rmse: {float(train_ls[-1]):f}')
    preds = net(test_features).detach().numpy()  # detach the predictions from the computation graph and convert to a numpy array
    # reformat and export for Kaggle
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])  # reshape(1, -1) gives a 2-D row vector; [0] drops a dimension, and Series turns it into a column
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)  # join the two columns
    submission.to_csv('submission.csv', index=False)  # finally write out the table
train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
# Note: this just trains on the full training set and produces predictions for submission; the held-out
# folds above were only for checking accuracy locally during model selection.
train log rmse: 0.162730

[Figure: training log rmse curve over the 100 epochs (log-scale y-axis)]
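A quick sanity check of the generated file before uploading, assuming the code above ran in the current working directory:

submission = pd.read_csv('submission.csv')
print(submission.columns.tolist())  # ['Id', 'SalePrice']
print(submission.shape)             # (1459, 2): one prediction per test row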