def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return the local filename.

    Files are cached in `cache_dir` (``../data`` by default). If a cached
    copy already exists and its SHA-1 digest matches the one recorded in
    DATA_HUB, the cached file is reused and no download happens.

    Args:
        name: key into the global DATA_HUB dict mapping to (url, sha1_hash).
        cache_dir: directory where downloaded files are stored.

    Returns:
        The local path of the (possibly cached) file.
    """
    assert name in DATA_HUB, f"{name} 不存在于{DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):  # a cached copy exists -- verify its checksum
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)  # hash in 1 MB chunks to bound memory
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:  # checksum matches: reuse cache
            return fname
    print(f'正在从{url}下载{fname}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname
def download_extract(name, folder=None):  #@save
    """Download and extract a zip/tar archive, returning the data directory.

    Args:
        name: key into DATA_HUB identifying the archive to download.
        folder: optional subfolder name; if given, it is joined onto the
            archive's directory and returned instead of the default path.

    Returns:
        Path of the extracted dataset directory.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)          # directory holding the archive
    data_dir, ext = os.path.splitext(fname)    # split off the extension
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, '只有zip/tar文件才能被解压'
    # Use a context manager so the archive handle is closed after extraction
    # (the original left `fp` open).
    with fp:
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir


def download_all():  #@save
    """Download every file registered in DATA_HUB."""
    for name in DATA_HUB:
        download(name)
4.10.2 Kaggle
dataset_dir_name = "data/house-prices-advanced-regression-techniques/" %matplotlib inline import numpy as np import pandas as pd import torch from torch import nn from d2l import torch as d2l
# Preview the first four training examples: the first four feature columns,
# the last two feature columns, and the label. Note that SalePrice (the
# label) sits at position -1, so -3 and -2 are the last two features.
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])
Id MSSubClass MSZoning LotFrontage SaleType SaleCondition SalePrice
0 1 60 RL 65.0 WD Normal 208500
1 2 20 RL 80.0 WD Normal 181500
2 3 60 RL 68.0 WD Normal 223500
3 4 70 RL 60.0 WD Abnorml 140000
# Standardize every numeric feature to zero mean and unit variance.
# `.index` selects the column labels whose dtype is not 'object'
# (i.e. the numeric columns).
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# After standardization each feature's mean is 0, so missing values can be
# replaced by 0 -- i.e. by the (standardized) mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)
all_features  # notebook cell output: the standardized feature table
# One-hot encode the categorical features (e.g. MSZoning's 'RL' = residential
# low density and 'RM' = residential medium density). get_dummies creates one
# binary indicator column per category (1 = belongs to that category);
# dummy_na=True additionally adds an indicator column for missing values.
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.shape  # notebook cell output: the expanded feature matrix shape