House Prices Advanced Regression Techniques V2

Kaggle Competition 的练习

房价预测

# 安装 vecstack
!pip install vecstack
# 数据分析库
import pandas as pd
import numpy as np
import random

# 数据可视化
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# 机器学习库
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold, learning_curve, KFold, train_test_split

# Ensemble Models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Package for stacking models
from vecstack import stacking

导入数据

train = pd.read_csv('/train.csv', index_col='Id')
test = pd.read_csv('/test.csv', index_col='Id')

train.head()
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
Id
1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 NaN Attchd 2003.0 RFn 2 548 TA TA Y 0 61 0 0 0 0 NaN NaN NaN 0 2 2008 WD Normal 208500
2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.0 RFn 2 460 TA TA Y 298 0 0 0 0 0 NaN NaN NaN 0 5 2007 WD Normal 181500
3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.0 RFn 2 608 TA TA Y 0 42 0 0 0 0 NaN NaN NaN 0 9 2008 WD Normal 223500
4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.0 Unf 3 642 TA TA Y 0 35 272 0 0 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.0 RFn 3 836 TA TA Y 192 84 0 0 0 0 NaN NaN NaN 0 12 2008 WD Normal 250000

浏览数据

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
MSSubClass 1460 non-null int64
MSZoning 1460 non-null object
LotFrontage 1201 non-null float64
LotArea 1460 non-null int64
Street 1460 non-null object
Alley 91 non-null object
LotShape 1460 non-null object
LandContour 1460 non-null object
Utilities 1460 non-null object
LotConfig 1460 non-null object
LandSlope 1460 non-null object
Neighborhood 1460 non-null object
Condition1 1460 non-null object
Condition2 1460 non-null object
BldgType 1460 non-null object
HouseStyle 1460 non-null object
OverallQual 1460 non-null int64
OverallCond 1460 non-null int64
YearBuilt 1460 non-null int64
YearRemodAdd 1460 non-null int64
RoofStyle 1460 non-null object
RoofMatl 1460 non-null object
Exterior1st 1460 non-null object
Exterior2nd 1460 non-null object
MasVnrType 1452 non-null object
MasVnrArea 1452 non-null float64
ExterQual 1460 non-null object
ExterCond 1460 non-null object
Foundation 1460 non-null object
BsmtQual 1423 non-null object
BsmtCond 1423 non-null object
BsmtExposure 1422 non-null object
BsmtFinType1 1423 non-null object
BsmtFinSF1 1460 non-null int64
BsmtFinType2 1422 non-null object
BsmtFinSF2 1460 non-null int64
BsmtUnfSF 1460 non-null int64
TotalBsmtSF 1460 non-null int64
Heating 1460 non-null object
HeatingQC 1460 non-null object
CentralAir 1460 non-null object
Electrical 1459 non-null object
1stFlrSF 1460 non-null int64
2ndFlrSF 1460 non-null int64
LowQualFinSF 1460 non-null int64
GrLivArea 1460 non-null int64
BsmtFullBath 1460 non-null int64
BsmtHalfBath 1460 non-null int64
FullBath 1460 non-null int64
HalfBath 1460 non-null int64
BedroomAbvGr 1460 non-null int64
KitchenAbvGr 1460 non-null int64
KitchenQual 1460 non-null object
TotRmsAbvGrd 1460 non-null int64
Functional 1460 non-null object
Fireplaces 1460 non-null int64
FireplaceQu 770 non-null object
GarageType 1379 non-null object
GarageYrBlt 1379 non-null float64
GarageFinish 1379 non-null object
GarageCars 1460 non-null int64
GarageArea 1460 non-null int64
GarageQual 1379 non-null object
GarageCond 1379 non-null object
PavedDrive 1460 non-null object
WoodDeckSF 1460 non-null int64
OpenPorchSF 1460 non-null int64
EnclosedPorch 1460 non-null int64
3SsnPorch 1460 non-null int64
ScreenPorch 1460 non-null int64
PoolArea 1460 non-null int64
PoolQC 7 non-null object
Fence 281 non-null object
MiscFeature 54 non-null object
MiscVal 1460 non-null int64
MoSold 1460 non-null int64
YrSold 1460 non-null int64
SaleType 1460 non-null object
SaleCondition 1460 non-null object
SalePrice 1460 non-null int64
dtypes: float64(3), int64(34), object(43)
memory usage: 923.9+ KB

train.describe(include="O")
MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 Heating HeatingQC CentralAir Electrical KitchenQual Functional FireplaceQu GarageType GarageFinish GarageQual GarageCond PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition
count 1460 1460 91 1460 1460 1460 1460 1460 1460 1460 1460 1460 1460 1460 1460 1460 1460 1452 1460 1460 1460 1423 1423 1422 1423 1422 1460 1460 1460 1459 1460 1460 770 1379 1379 1379 1379 1460 7 281 54 1460 1460
unique 5 2 2 4 4 2 5 3 25 9 8 5 8 6 8 15 16 4 4 5 6 4 4 4 6 6 6 5 2 5 4 7 5 6 3 5 5 3 3 4 4 9 6
top RL Pave Grvl Reg Lvl AllPub Inside Gtl NAmes Norm Norm 1Fam 1Story Gable CompShg VinylSd VinylSd None TA TA PConc TA TA No Unf Unf GasA Ex Y SBrkr TA Typ Gd Attchd Unf TA TA Y Gd MnPrv Shed WD Normal
freq 1151 1454 50 925 1311 1459 1052 1382 225 1260 1445 1220 726 1141 1434 515 504 864 906 1282 647 649 1311 953 430 1256 1428 741 1365 1334 735 1360 380 870 605 1311 1326 1340 3 157 49 1267 1198

检查缺失数据

train_missing = train.isnull().sum()
train_missing = train_missing[train_missing > 0]
train_missing

LotFrontage 259
Alley 1369
MasVnrType 8
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406
dtype: int64

test_missing = test.isnull().sum()
test_missing = test_missing[test_missing > 0]
test_missing

MSZoning 4
LotFrontage 227
Alley 1352
Utilities 2
Exterior1st 1
Exterior2nd 1
MasVnrType 16
MasVnrArea 15
BsmtQual 44
BsmtCond 45
BsmtExposure 44
BsmtFinType1 42
BsmtFinSF1 1
BsmtFinType2 42
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
BsmtFullBath 2
BsmtHalfBath 2
KitchenQual 1
Functional 2
FireplaceQu 730
GarageType 76
GarageYrBlt 78
GarageFinish 78
GarageCars 1
GarageArea 1
GarageQual 78
GarageCond 78
PoolQC 1456
Fence 1169
MiscFeature 1408
SaleType 1
dtype: int64

# 可视化缺失数据
def plot_missing(df):
    # 寻找缺失的列
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    missing.sort_values(inplace=True)
    
    # 画出缺失值的柱状图。
    missing.plot.bar(figsize=(10,8))
    plt.xlabel('Columns with missing values')
    plt.ylabel('Count')
    
    # 搜索缺失值
    import missingno as msno
    msno.matrix(df=df, figsize=(10,8))
    # 查看相关性
    #msno.heatmap(df=df,figsize=(10,8))
plot_missing(train)

png

png

plot_missing(test)

png

png

分析概要

以下是缺失比较多的feature

Feature Train miss Test miss Dispos
LotFrontage 259 227 填个中位数吧
Alley 1369 1352 删除
FireplaceQu 690 730 fireplaceQU 和 fireplaces 有关 缺失项貌似都是没有fireplace的
PoolQC 1453 1456 删除
Fence 1179 1169 删除
MiscFeature 1406 1408 删除

处理缺失值

# 删除缺失过多的feature
train = train.drop(['Alley', 'PoolQC', 'MiscFeature', 'Fence'], axis=1)
test = test.drop(['Alley', 'PoolQC', 'MiscFeature', 'Fence'], axis=1)
# FireplaceQu 缺失的都认为是没有的
train['FireplaceQu'] = train['FireplaceQu'].fillna('NA')
test['FireplaceQu'] = test['FireplaceQu'].fillna('NA')
# 填充其他缺失值
def fill_missing_values(df):
    # 查找缺失值
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    
    for column in missing.index:
        # 类型为 object 的填众数
        if df[column].dtype == 'object':
            df[column].fillna(df[column].value_counts().index[0], inplace=True)
        # 其他类型填中位数
        else:
            df[column].fillna(df[column].median(), inplace=True)
fill_missing_values(train)
train.isnull().sum().max()

0

fill_missing_values(test)
test.isnull().sum().max()

0

好的,已经没有缺失数据了。

整理 description 文件

数据描述文件记录了所有特征所代表的含义,其中许多特征是字符串,现在我们要整理为个字典,便于我们查询。

description_dict = {}
with open('/data_description.txt','r') as description:
    description_data = description.read()
    description.close()
    
description_data = description_data.split('\n')
for i in description_data:
    if ':' in i:
        key = i.split(':')[0]
        description_dict[key] = []
    elif i.split() and '       ' in i:
        value = i.split()[0]
        description_dict[key].append(value)

print(description_dict)

{'MSSubClass': ['20', '30', '40', '45', '50', '60', '70', '75', '80', '85', '90', '120', '150', '160', '180', '190'], 'MSZoning': ['A', 'C', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'], 'LotFrontage': [], 'LotArea': [], 'Street': ['Grvl', 'Pave'], 'Alley': ['Grvl', 'Pave', 'NA'], 'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'], 'LandContour': ['Lvl', 'Bnk', 'HLS', 'Low'], 'Utilities': ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'], 'LotConfig': ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'], 'LandSlope': ['Gtl', 'Mod', 'Sev'], 'Neighborhood': ['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'Names', 'NoRidge', 'NPkVill', 'NridgHt', 'NWAmes', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'], 'Condition1': ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe'], 'Condition2': ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe'], 'BldgType': ['1Fam', '2FmCon', 'Duplx', 'TwnhsE', 'TwnhsI'], 'HouseStyle': ['1Story'], ' 1.5Fin\tOne and one-half story': [], ' 1.5Unf\tOne and one-half story': ['2Story'], ' 2.5Fin\tTwo and one-half story': [], ' 2.5Unf\tTwo and one-half story': ['SFoyer', 'SLvl'], 'OverallQual': ['10', '9', '8', '7', '6', '5', '4', '3', '2', '1'], 'OverallCond': ['10', '9', '8', '7', '6', '5', '4', '3', '2', '1'], 'YearBuilt': [], 'YearRemodAdd': [], 'RoofStyle': ['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed'], 'RoofMatl': ['ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll', 'Tar&Grv', 'WdShake', 'WdShngl'], 'Exterior1st': ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd', 'WdShing'], 'Exterior2nd': ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd', 'WdShing'], 'MasVnrType': ['BrkCmn', 'BrkFace', 'CBlock', 'None', 'Stone'], 'MasVnrArea': [], 'ExterQual': ['Ex', 'Gd', 'TA', 'Fa', 'Po'], 'ExterCond': ['Ex', 'Gd', 'TA', 'Fa', 'Po'], 'Foundation': ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'], 'BsmtQual': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], 'BsmtCond': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], 'BsmtExposure': ['Gd', 'Av', 'Mn', 'No', 'NA'], 'BsmtFinType1': ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'], 'BsmtFinSF1': [], 'BsmtFinType2': ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA'], 'BsmtFinSF2': [], 'BsmtUnfSF': [], 'TotalBsmtSF': [], 'Heating': ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'], 'HeatingQC': ['Ex', 'Gd', 'TA', 'Fa', 'Po'], 'CentralAir': ['N', 'Y'], 'Electrical': ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix'], '1stFlrSF': [], '2ndFlrSF': [], 'LowQualFinSF': [], 'GrLivArea': [], 'BsmtFullBath': [], 'BsmtHalfBath': [], 'FullBath': [], 'HalfBath': [], 'Bedroom': [], 'Kitchen': [], 'KitchenQual': ['Ex', 'Gd', 'TA', 'Fa', 'Po'], 'TotRmsAbvGrd': [], 'Functional': ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'], 'Fireplaces': [], 'FireplaceQu': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], 'GarageType': ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd', 'NA'], 'GarageYrBlt': [], 'GarageFinish': ['Fin', 'RFn', 'Unf', 'NA'], 'GarageCars': [], 'GarageArea': [], 'GarageQual': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], 'GarageCond': ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], 'PavedDrive': ['Y', 'P', 'N'], 'WoodDeckSF': [], 'OpenPorchSF': [], 'EnclosedPorch': [], '3SsnPorch': [], 'ScreenPorch': [], 'PoolArea': [], 'PoolQC': ['Ex', 'Gd', 'TA', 'Fa', 'NA'], 'Fence': ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA'], 'MiscFeature': ['Elev', 'Gar2', 'Othr', 'Shed', 'TenC', 'NA'], 'MiscVal': [], 'MoSold': [], 'YrSold': [], 'SaleType': ['WD', 'CWD', 'VWD', 'New', 'COD', 'Con', 'ConLw', 'ConLI', 'ConLD', 'Oth'], 'SaleCondition': ['Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial']}

字符串类型的 feature 重编码

# 这个函数的作用是得到数据集中非数字的feature column
def get_object_column(df):
    object_column = []
    for column in df.columns:
        if df[column].dtype == 'object':
            object_column.append(column)
    return object_column
#object_column = get_object_column(train)
# 这个函数的作用是把 description_dict 中的 value 转换为对应数字的字典
def generate_map(map_list, end_index=1):
    d = {}
    j = len(map_list) - end_index
    for i in map_list:
        d[i] = j
        j -= 1
    return d
def preprocess_order_feature(df):
    for i in get_object_column(df):
        order_map = generate_map(description_dict[i], 0)
        df[i] = df[i].map(order_map)
        df[i] = df[i].fillna(0)
    return df
train = preprocess_order_feature(train)
test = preprocess_order_feature(test)

观察数据

train.describe()
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition SalePrice
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.00000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 56.897260 2.825342 69.863699 10516.828082 1.004110 3.591781 3.814384 3.998630 4.583562 2.937671 10.747945 6.969178 6.993151 4.334247 0.497260 6.099315 5.575342 1971.267808 1984.865753 4.589726 6.924658 5.958904 5.271918 2.552740 103.117123 3.39589 3.083562 4.603425 4.565068 4.010959 2.656164 4.571233 443.639726 2.273288 46.549315 567.240411 1057.429452 4.963699 4.145205 1.065068 4.889726 1162.626712 346.992466 5.844521 1515.463699 0.425342 0.057534 1.565068 0.382877 2.866438 1.046575 3.511644 6.517808 7.841781 0.613014 2.825342 4.791781 1978.589041 2.771233 1.767123 472.980137 3.976712 3.975342 2.856164 94.244521 46.660274 21.954110 3.409589 15.060959 2.758904 43.489041 6.321918 2007.815753 9.509589 5.417808 180921.195890
std 42.300571 1.020174 22.027677 9981.264932 0.063996 0.582296 0.606509 0.052342 0.773448 0.276232 7.565716 0.878349 0.248272 1.555218 0.500164 1.382997 1.112799 30.202904 20.645407 0.834998 0.599127 4.426038 4.263353 1.046204 180.731373 0.57428 0.351054 0.722394 0.678071 0.284178 1.039123 2.070649 456.098091 0.869859 161.319273 441.866955 438.705324 0.295124 0.959501 0.246731 0.394658 386.587738 436.528436 48.623081 525.480383 0.518911 0.238753 0.550916 0.502885 0.815778 0.220338 0.663760 1.625393 0.667698 0.644666 1.810877 1.759864 23.997022 0.811835 0.747315 213.804841 0.241665 0.232860 0.496592 125.338794 66.256028 61.119149 29.317331 55.757415 40.177307 496.123024 2.703626 1.328095 1.368616 1.475209 79442.502883
min 20.000000 0.000000 21.000000 1300.000000 1.000000 1.000000 1.000000 2.000000 1.000000 1.000000 0.000000 1.000000 1.000000 0.000000 0.000000 1.000000 1.000000 1872.000000 1950.000000 1.000000 1.000000 0.000000 0.000000 1.000000 0.000000 2.00000 1.000000 1.000000 3.000000 2.000000 2.000000 2.000000 0.000000 2.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 334.000000 0.000000 0.000000 334.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 2.000000 2.000000 0.000000 1.000000 2.000000 1900.000000 2.000000 0.000000 0.000000 2.000000 2.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 2006.000000 1.000000 1.000000 34900.000000
25% 20.000000 3.000000 60.000000 7553.500000 1.000000 3.000000 4.000000 4.000000 4.000000 3.000000 4.000000 7.000000 7.000000 5.000000 0.000000 5.000000 5.000000 1954.000000 1967.000000 5.000000 7.000000 3.000000 3.000000 2.000000 0.000000 3.00000 3.000000 4.000000 4.000000 4.000000 2.000000 2.000000 0.000000 2.000000 0.000000 223.000000 795.750000 5.000000 3.000000 1.000000 5.000000 882.000000 0.000000 0.000000 1129.500000 0.000000 0.000000 1.000000 0.000000 2.000000 1.000000 3.000000 5.000000 8.000000 0.000000 1.000000 2.000000 1962.000000 2.000000 1.000000 334.500000 4.000000 4.000000 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.000000 2007.000000 10.000000 6.000000 129975.000000
50% 50.000000 3.000000 69.000000 9478.500000 1.000000 4.000000 4.000000 4.000000 5.000000 3.000000 10.000000 7.000000 7.000000 5.000000 0.000000 6.000000 5.000000 1973.000000 1994.000000 5.000000 7.000000 3.000000 3.000000 2.000000 0.000000 3.00000 3.000000 5.000000 5.000000 4.000000 2.000000 5.000000 383.500000 2.000000 0.000000 477.500000 991.500000 5.000000 5.000000 1.000000 5.000000 1087.000000 0.000000 0.000000 1464.000000 0.000000 0.000000 2.000000 0.000000 3.000000 1.000000 3.000000 6.000000 8.000000 1.000000 3.000000 6.000000 1980.000000 3.000000 2.000000 480.000000 4.000000 4.000000 3.000000 0.000000 25.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.000000 2008.000000 10.000000 6.000000 163000.000000
75% 70.000000 3.000000 79.000000 11601.500000 1.000000 4.000000 4.000000 4.000000 5.000000 3.000000 18.000000 7.000000 7.000000 5.000000 1.000000 7.000000 6.000000 2000.000000 2004.000000 5.000000 7.000000 9.000000 9.000000 4.000000 164.250000 4.00000 3.000000 5.000000 5.000000 4.000000 3.000000 7.000000 712.250000 2.000000 0.000000 808.000000 1298.250000 5.000000 5.000000 1.000000 5.000000 1391.250000 728.000000 0.000000 1776.750000 1.000000 0.000000 2.000000 1.000000 3.000000 1.000000 4.000000 7.000000 8.000000 1.000000 5.000000 6.000000 2001.000000 3.000000 2.000000 576.000000 4.000000 4.000000 3.000000 168.000000 68.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 2009.000000 10.000000 6.000000 214000.000000
max 190.000000 6.000000 313.000000 215245.000000 2.000000 4.000000 4.000000 4.000000 5.000000 3.000000 25.000000 9.000000 9.000000 5.000000 1.000000 10.000000 9.000000 2010.000000 2010.000000 6.000000 8.000000 17.000000 17.000000 5.000000 1600.000000 5.00000 5.000000 6.000000 6.000000 5.000000 5.000000 7.000000 5644.000000 7.000000 1474.000000 2336.000000 6110.000000 6.000000 5.000000 2.000000 5.000000 4692.000000 2065.000000 572.000000 5642.000000 3.000000 2.000000 3.000000 2.000000 8.000000 3.000000 5.000000 14.000000 8.000000 3.000000 6.000000 7.000000 2010.000000 4.000000 4.000000 1418.000000 6.000000 6.000000 3.000000 857.000000 547.000000 552.000000 508.000000 480.000000 738.000000 15500.000000 12.000000 2010.000000 10.000000 6.000000 755000.000000

观察 feature 之间的相关性

corr_mat = train[["SalePrice","MSSubClass","MSZoning","LotFrontage","LotArea", "BldgType",
                       "OverallQual", "OverallCond","YearBuilt", "BedroomAbvGr", "PoolArea", "GarageArea",
                       "SaleType", "MoSold"]].corr()
# corr_mat = train.corr()
f, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(corr_mat, vmax=1 , square=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb6c086fa20>

png

观察热力图,颜色越浅相关性越大。关于热力图-> this video.

观察年限和售价的规律

f, ax = plt.subplots(figsize=(16, 8))
sns.lineplot(x='YearBuilt', y='SalePrice', data=train)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb6a5211320>

png

在二十世纪的时候价格增长迅速。

综合质量和售价有明显的相关性

f, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x='OverallQual', y='SalePrice', color='green',data=train)

<matplotlib.axes._subplots.AxesSubplot at 0x7fb6a3e6d8d0>

png

我们可以看到,随着房屋整体质量的提高,销售价格快速上涨,这是非常合理的。

观察售价

f, ax = plt.subplots(figsize=(10, 6))
sns.distplot(train['SalePrice'])

<matplotlib.axes._subplots.AxesSubplot at 0x7fb6a25cdba8>

png

大部分的房屋售价在 100000 到 200000 之间。

构建预测模型

X = train.drop('SalePrice', axis=1)
y = np.ravel(np.array(train[['SalePrice']]))
print(y.shape)
y

(1460,)

array([208500, 181500, 223500, ..., 266500, 142125, 147500])

# Use train_test_split from sci-kit learn to segment our data into train and a local testset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

定义评估函数

评估函数基于预测值的对数与观察到的销售价格的对数之间的均方根误差(RMSE)。

def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(np.log(y), np.log(y_pred)))

随机森林

random_forest = RandomForestRegressor(n_estimators=1200,
                                      max_depth=15,
                                      min_samples_split=5,
                                      min_samples_leaf=5,
                                      max_features=None,
                                      random_state=42,
                                      oob_score=True)
kf = KFold(n_splits=5)
y_pred = cross_val_score(random_forest, X, y, cv=kf)
y_pred.mean()

0.8500001566166802

random_forest.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=5, min_samples_split=5,
min_weight_fraction_leaf=0.0, n_estimators=1200,
n_jobs=None, oob_score=True, random_state=42, verbose=0,
warm_start=False)

rf_pred = random_forest.predict(test)
rf_pred

array([126945.71699684, 153924.56961003, 182182.80294353, ...,
156066.28489667, 117296.65091637, 224995.13115853])

XG Boost

xg_boost = XGBRegressor(learning_rate=0.01,
                        n_estimators=6000,
                        max_depth=4, 
                        min_child_weight=1,
                        gamma=0.6,
                        subsample=0.7,
                        colsample_bytree=0.2,
                        objective='reg:linear',
                        nthread=-1,
                        scale_pos_weight=1,
                        seed=27,
                        reg_alpha=0.00006)

kf = KFold(n_splits=5)
y_pred = cross_val_score(xg_boost, X, y, cv=kf)
y_pred.mean()

[03:11:43] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[03:11:51] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[03:11:59] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[03:12:06] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[03:12:13] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

0.8959027545454475

xg_boost.fit(X, y)

[03:12:21] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.2, gamma=0.6,
importance_type='gain', learning_rate=0.01, max_delta_step=0,
max_depth=4, min_child_weight=1, missing=None, n_estimators=6000,
n_jobs=1, nthread=-1, objective='reg:linear', random_state=0,
reg_alpha=6e-05, reg_lambda=1, scale_pos_weight=1, seed=27,
silent=None, subsample=0.7, verbosity=1)

xgb_pred = xg_boost.predict(test)
xgb_pred

array([127263.77 , 163056.02 , 193208.25 , ..., 173218.75 , 115712.914,
211616.88 ], dtype=float32)

Gradient Boost Regressor(GBM)

g_boost = GradientBoostingRegressor(n_estimators=6000,
                                    learning_rate=0.01,
                                    max_depth=5,
                                    max_features='sqrt',
                                    min_samples_leaf=15,
                                    min_samples_split=10,
                                    loss='ls',
                                    random_state=42
                                    )

kf = KFold(n_splits=5)
y_pred = cross_val_score(g_boost, X, y, cv=kf)
y_pred.mean()

0.8905525972502479

g_boost.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
learning_rate=0.01, loss='ls', max_depth=5,
max_features='sqrt', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=15, min_samples_split=10,
min_weight_fraction_leaf=0.0, n_estimators=6000,
n_iter_no_change=None, presort='auto',
random_state=42, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0, warm_start=False)

gbm_pred = g_boost.predict(test)
gbm_pred

array([125909.87154943, 163184.8652622 , 187872.72372976, ...,
176429.22616544, 119620.23912638, 211953.00041747])

Light GBM

lightgbm = LGBMRegressor(objective='regression',
                         num_leaves=6,
                         learning_rate=0.01,
                         n_estimators=6400,
                         verbose=-1,
                         bagging_fraction=0.8,
                         bagging_freq=4,
                         bagging_seed=6,
                         feature_fraction=0.2,
                         feature_fraction_seed=7
                         )
kf = KFold(n_splits=5)
y_pred = cross_val_score(lightgbm, X, y, cv=kf)
y_pred.mean()

0.8881867437856128

lightgbm.fit(X,y)

LGBMRegressor(bagging_fraction=0.8, bagging_freq=4, bagging_seed=6,
boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
feature_fraction=0.2, feature_fraction_seed=7,
importance_type='split', learning_rate=0.01, max_depth=-1,
min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
n_estimators=6400, n_jobs=-1, num_leaves=6,
objective='regression', random_state=None, reg_alpha=0.0,
reg_lambda=0.0, silent=True, subsample=1.0,
subsample_for_bin=200000, subsample_freq=0, verbose=-1)

lgb_pred = lightgbm.predict(test)
lgb_pred

array([124759.4703253 , 161206.70919126, 187680.444818 , ...,
168310.83365532, 123698.90457326, 206480.92047866])

Logistic Regression

logreg = LogisticRegression()
kf = KFold(n_splits=5)
y_pred = cross_val_score(logreg, X, y, cv=kf)
y_pred.mean()
logreg.fit(X, y)

/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.
"this warning.", FutureWarning)
/usr/local/lib/python3.6/dist-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)

round(logreg.score(X, y) * 100, 2)

88.9

log_pred = logreg.predict(test)
log_pred

array([135500, 128950, 175000, ..., 133900, 190000, 187500])

模型的叠加

叠加(也称为元集成)是一种模型集成技术,用于组合来自多个预测模型的信息,生成性能更好的新模型。在这个项目中,我们使用名为vecstack的python包,它可以帮助我们对前面导入的模型进行堆栈。它实际上非常容易使用,可以查看文档了解更多信息。vecstack

models = [g_boost, xg_boost, lightgbm, random_forest]
Strain, S_test = stacking(models,
                          X_train,
                          y_train,
                          X_test,
                          regression=True,
                          mode='oof_pred_bag',
                          metric=rmse,
                          n_folds=5,
                          random_state=25,
                          verbose=2)

task: [regression]
metric: [rmse]
mode: [oof_pred_bag]
n_models: [4]

model 0: [GradientBoostingRegressor]
fold 0: [0.12653004]
fold 1: [0.13818165]
fold 2: [0.10747644]
fold 3: [0.14980732]
fold 4: [0.11127270]
----
MEAN: [0.12665363] + [0.01595833]
FULL: [0.12764756]

model 1: [XGBRegressor]
[03:23:54] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
fold 0: [0.11631560]
[03:24:00] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
fold 1: [0.14701253]
[03:24:06] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
fold 2: [0.10450330]
[03:24:12] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
fold 3: [0.14328067]
[03:24:18] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
fold 4: [0.10632026]
----
MEAN: [0.12348647] + [0.01817552]
FULL: [0.12481458]

model 2: [LGBMRegressor]
fold 0: [0.12668239]
fold 1: [0.14251415]
fold 2: [0.11409004]
fold 3: [0.15394461]
fold 4: [0.11576550]
----
MEAN: [0.13059934] + [0.01545902]
FULL: [0.13150292]

model 3: [RandomForestRegressor]
fold 0: [0.13803357]
fold 1: [0.16746496]
fold 2: [0.13370269]
fold 3: [0.17907099]
fold 4: [0.13625091]
----
MEAN: [0.15090463] + [0.01867560]
FULL: [0.15204350]

Strain, S_test

(array([[145154.57609501, 140247.640625 , 144708.92814448,
136304.80002144],
[441586.84575012, 453786.875 , 476049.8262998 ,
433615.5690085 ],
[205559.38156983, 199459.953125 , 204548.62741617,
189012.87710637],
...,
[229773.83814053, 245324.03125 , 222988.34529258,
233726.54189435],
[ 78529.68615301, 81706.46875 , 74919.86211206,
94651.91862458],
[126564.42955093, 118016.921875 , 131591.97464745,
134449.34870648]]),
array([[156946.11019358, 162235.903125 , 156274.58204718,
174271.19166786],
[168719.74644755, 171368.26875 , 172660.15892698,
168988.4858204 ],
[165875.73697659, 167827.703125 , 166511.09786993,
144720.7621456 ],
...,
[235105.18179731, 240780.803125 , 236012.18528746,
224310.44197611],
[311340.99357469, 306275.29375 , 305036.50098821,
319953.95931372],
[100400.26285948, 97430.8671875 , 97576.05463032,
109458.70614352]]))

# Initialize 2nd level model
xgb_lev2 = XGBRegressor(learning_rate=0.1, 
                        n_estimators=500,
                        max_depth=3,
                        n_jobs=-1,
                        random_state=17
                       )

# Fit the 2nd level model on the output of level 1
xgb_lev2.fit(Strain, y_train)

[03:25:26] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
importance_type='gain', learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
n_jobs=-1, nthread=None, objective='reg:linear', random_state=17,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=None, subsample=1, verbosity=1)

# Make predictions on the localized test set
stacked_pred = xgb_lev2.predict(S_test)
print("RMSE of Stacked Model: {}".format(rmse(y_test,stacked_pred)))

RMSE of Stacked Model: 0.12290530277450722

y1_pred_L1 = models[0].predict(test)
y2_pred_L1 = models[1].predict(test)
y3_pred_L1 = models[2].predict(test)
y4_pred_L1 = models[3].predict(test)
S_test_L1 = np.c_[y1_pred_L1, y2_pred_L1, y3_pred_L1, y4_pred_L1]
test_stacked_pred = xgb_lev2.predict(S_test_L1)
# Save the predictions in form of a dataframe
submission = pd.DataFrame()

submission['Id'] = np.array(test.index)
submission['SalePrice'] = test_stacked_pred
submission.to_csv('/submissionV2.csv', index=False)

混合较好得分的 submission

因为不知道最终的测试集合的正真数据是什么,只能一遍一遍提交去蒙,看到别人的方法是混合他人较好的提交去验证,尝试下看看。

submission_v1 = pd.read_csv('/House_price_submission_v44.csv')
submission_v2 = pd.read_csv('/submissionV19.csv')
submission_v3 = pd.read_csv('/blended_submission.csv')
final_blend = 0.5*submission_v1.SalePrice.values + 0.2*submission_v2.SalePrice.values + 0.3*submission_v3.SalePrice.values

blended_submission = pd.DataFrame()

blended_submission['Id'] = submission_v1.Id.values
blended_submission['SalePrice'] = final_blend
blended_submission.to_csv('/submissionV20.csv', index=False)
blended_submission

Top4

呃,就这样吧,最终和前十名差不到0.004。

用 Tensorfolw 试试

import math
import tensorflow as tf
from tensorflow.python.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn import metrics
tf.logging.set_verbosity(tf.logging.ERROR)
correlation_dataframe = train.copy()
saleprice_corr = correlation_dataframe.corr()['SalePrice']
saleprice_corr = saleprice_corr[saleprice_corr > 0]
corr_feature = saleprice_corr.index
corr_feature

Index(['MSZoning', 'LotFrontage', 'LotArea', 'Utilities', 'BldgType',
'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrType', 'MasVnrArea',
'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
'BsmtFinType1', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'Electrical', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', '3SsnPorch',
'ScreenPorch', 'PoolArea', 'MoSold', 'SalePrice'],
dtype='object')

def preprocess_feature(df, corr_feature):
    newdf = pd.DataFrame()
    for feature in corr_feature:
        newdf[feature] = df[feature]
    return newdf
X = preprocess_feature(train, corr_feature).drop('SalePrice', axis=1)
y = np.ravel(np.array(train[['SalePrice']]))
X.describe()
MSZoning LotFrontage LotArea Utilities BldgType OverallQual YearBuilt YearRemodAdd MasVnrType MasVnrArea ExterQual ExterCond BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtUnfSF TotalBsmtSF Heating HeatingQC Electrical 1stFlrSF 2ndFlrSF GrLivArea BsmtFullBath FullBath HalfBath BedroomAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF 3SsnPorch ScreenPorch PoolArea MoSold
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.00000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 2.825342 69.863699 10516.828082 3.998630 4.334247 6.099315 1971.267808 1984.865753 2.552740 103.117123 3.39589 3.083562 4.565068 4.010959 2.656164 4.571233 443.639726 567.240411 1057.429452 4.963699 4.145205 4.889726 1162.626712 346.992466 1515.463699 0.425342 1.565068 0.382877 2.866438 3.511644 6.517808 7.841781 0.613014 2.825342 4.791781 1978.589041 2.771233 1.767123 472.980137 3.976712 3.975342 2.856164 94.244521 46.660274 3.409589 15.060959 2.758904 6.321918
std 1.020174 22.027677 9981.264932 0.052342 1.555218 1.382997 30.202904 20.645407 1.046204 180.731373 0.57428 0.351054 0.678071 0.284178 1.039123 2.070649 456.098091 441.866955 438.705324 0.295124 0.959501 0.394658 386.587738 436.528436 525.480383 0.518911 0.550916 0.502885 0.815778 0.663760 1.625393 0.667698 0.644666 1.810877 1.759864 23.997022 0.811835 0.747315 213.804841 0.241665 0.232860 0.496592 125.338794 66.256028 29.317331 55.757415 40.177307 2.703626
min 0.000000 21.000000 1300.000000 2.000000 0.000000 1.000000 1872.000000 1950.000000 1.000000 0.000000 2.00000 1.000000 3.000000 2.000000 2.000000 2.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 334.000000 0.000000 334.000000 0.000000 0.000000 0.000000 0.000000 2.000000 2.000000 2.000000 0.000000 1.000000 2.000000 1900.000000 2.000000 0.000000 0.000000 2.000000 2.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 3.000000 60.000000 7553.500000 4.000000 5.000000 5.000000 1954.000000 1967.000000 2.000000 0.000000 3.00000 3.000000 4.000000 4.000000 2.000000 2.000000 0.000000 223.000000 795.750000 5.000000 3.000000 5.000000 882.000000 0.000000 1129.500000 0.000000 1.000000 0.000000 2.000000 3.000000 5.000000 8.000000 0.000000 1.000000 2.000000 1962.000000 2.000000 1.000000 334.500000 4.000000 4.000000 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.000000
50% 3.000000 69.000000 9478.500000 4.000000 5.000000 6.000000 1973.000000 1994.000000 2.000000 0.000000 3.00000 3.000000 5.000000 4.000000 2.000000 5.000000 383.500000 477.500000 991.500000 5.000000 5.000000 5.000000 1087.000000 0.000000 1464.000000 0.000000 2.000000 0.000000 3.000000 3.000000 6.000000 8.000000 1.000000 3.000000 6.000000 1980.000000 3.000000 2.000000 480.000000 4.000000 4.000000 3.000000 0.000000 25.000000 0.000000 0.000000 0.000000 6.000000
75% 3.000000 79.000000 11601.500000 4.000000 5.000000 7.000000 2000.000000 2004.000000 4.000000 164.250000 4.00000 3.000000 5.000000 4.000000 3.000000 7.000000 712.250000 808.000000 1298.250000 5.000000 5.000000 5.000000 1391.250000 728.000000 1776.750000 1.000000 2.000000 1.000000 3.000000 4.000000 7.000000 8.000000 1.000000 5.000000 6.000000 2001.000000 3.000000 2.000000 576.000000 4.000000 4.000000 3.000000 168.000000 68.000000 0.000000 0.000000 0.000000 8.000000
max 6.000000 313.000000 215245.000000 4.000000 5.000000 10.000000 2010.000000 2010.000000 5.000000 1600.000000 5.00000 5.000000 6.000000 5.000000 5.000000 7.000000 5644.000000 2336.000000 6110.000000 6.000000 5.000000 5.000000 4692.000000 2065.000000 5642.000000 3.000000 3.000000 2.000000 8.000000 5.000000 14.000000 8.000000 3.000000 6.000000 7.000000 2010.000000 4.000000 4.000000 1418.000000 6.000000 6.000000 3.000000 857.000000 547.000000 508.000000 480.000000 738.000000 12.000000
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    features = {key: np.array(value) for key, value in dict(features).items()}
    
    ds = Dataset.from_tensor_slices((features, targets))
    ds = ds.batch(batch_size).repeat(num_epochs)
    
    if shuffle:
        ds = ds.shuffle(10000)
    
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
def construct_feature_columns(input_features):
    """构造TensorFlow特征列
        参数:
            input_features:要使用的数字输入特性的名称。
        返回:
            一个 feature columns 集合
    """
    return set([tf.feature_column.numeric_column(my_feature) 
                for my_feature in input_features])
def train_model(
    learning_rate,
    strps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
    """训练多元特征的线性回归模型
        除训练外,此功能还打印训练进度信息,
        以及随着时间的推移而失去的训练和验证。
    参数:
        learning_rate:一个float,表示学习率
        steps:一个非零的int,训练步骤的总数。训练步骤
            由使用单个批处理的向前和向后传递组成。
        batch_size:一个非零的int
        training_example: DataFrame 包含一个或多个列
        '  california_housing_dataframe '作为训练的输入feature
        training_targets:一个' DataFrame ',它只包含一列
        ' california_housing_dataframe '作为训练的目标。
        validation_example: ' DataFrame '包含一个或多个列
        ' california_housing_dataframe '作为验证的输入feature
        validation_targets: ' DataFrame ',仅包含来自其中的一列
        ' california_housing_dataframe '作为验证的目标。
    返回:
        在训练数据上训练的“线性回归器”对象
    """
    periods = 10
    steps_per_period = strps / periods
    
    # 创建一个线性回归对象
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=construct_feature_columns(training_examples),
        optimizer=my_optimizer
    )
    
    # 创建输入函数
    training_input_fn = lambda: my_input_fn(
        training_examples,
        training_targets,
        batch_size=batch_size)
    
    predict_training_input_fn = lambda: my_input_fn(
      training_examples, 
      training_targets, 
      num_epochs=1, 
      shuffle=False)
    
    predict_validation_input_fn = lambda: my_input_fn(
        validation_examples, 
        validation_targets,
        num_epochs=1,
        shuffle=False)
    
    #训练模型,但要在循环中进行,这样我们才能定期评估
    #损失指标
    print("Training model...")
    print("RMSE (on training data):")
    training_rmse = []
    validation_rmse = []
    for period in range (0, periods):
      # Train the model, starting from the prior state.
      linear_regressor.train(
          input_fn=training_input_fn,
          steps=steps_per_period,
      )
      # Take a break and compute predictions.
      training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
      training_predictions = np.array([item['predictions'][0] for item in training_predictions])
      
      validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)
      validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])
      

      # Compute training and validation loss.
      training_root_mean_squared_error = math.sqrt(
          metrics.mean_squared_error(training_predictions, training_targets))
      validation_root_mean_squared_error = math.sqrt(
          metrics.mean_squared_error(validation_predictions, validation_targets))
      # Occasionally print the current loss.
      print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
      # Add the loss metrics from this period to our list.
      training_rmse.append(training_root_mean_squared_error)
      validation_rmse.append(validation_root_mean_squared_error)
    print("Model training finished.")

    # Output a graph of loss metrics over periods.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    plt.plot(training_rmse, label="training")
    plt.plot(validation_rmse, label="validation")
    plt.legend()

    return linear_regressor
linear_regressor = train_model(
    learning_rate=0.5,
    strps=200,
    batch_size=5,
    training_examples=X_train,
    training_targets=y_train,
    validation_examples=X_test,
    validation_targets=y_test)

Training model...
RMSE (on training data):
period 00 : 115236.06
period 01 : 135729.29
period 02 : 94007.13
period 03 : 94940.56
period 04 : 78776.40
period 05 : 73407.86
period 06 : 79100.02
period 07 : 76995.32
period 08 : 94657.55
period 09 : 55721.83
Model training finished.

png

linear_regressor2 = train_model(
    learning_rate=0.25,
    strps=500,
    batch_size=5,
    training_examples=X_train,
    training_targets=y_train,
    validation_examples=X_test,
    validation_targets=y_test)

Training model...
RMSE (on training data):
period 00 : 126663.18
period 01 : 89900.76
period 02 : 67300.53
period 03 : 73597.13
period 04 : 59429.95
period 05 : 60645.34
period 06 : 57056.42
period 07 : 55974.51
period 08 : 59490.64
period 09 : 59963.44
Model training finished.

png

predict_test_input_fn = lambda: my_input_fn(
      test, 
      test, 
      num_epochs=1, 
      shuffle=False)

test_predictions = linear_regressor.predict(input_fn=predict_test_input_fn)
test_predictions = np.array([item['predictions'][0] for item in test_predictions])
test_predictions

array([152403.8 , 178793.02, 186845.23, ..., 203018.39, 142496.02,
197809.12], dtype=float32)

就先这样吧。