房价预测

题目

Kaggle: House Prices - Advanced Regression Techniques

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# End-to-end script for the Kaggle "House Prices" competition:
# load the CSVs, impute missing values, one-hot encode categoricals,
# tune an XGBoost regressor with a randomized search, report validation
# error and write the submission file.

# Load the data.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Keep the test Ids for the submission file, then remove the Id column from
# both frames: it is a row identifier and must not be used as a feature.
test_ids = test_data['Id']
train_data = train_data.drop(columns='Id', errors='ignore')
test_data = test_data.drop(columns='Id', errors='ignore')

# Separate the target before any preprocessing so it is never imputed,
# encoded or aligned into the test frame.
y = train_data['SalePrice']
train_data = train_data.drop(columns='SalePrice')

# Handle missing values.
numeric_features = train_data.select_dtypes(include=[np.number]).columns
non_numeric_features = train_data.select_dtypes(exclude=[np.number]).columns

# Numeric features: fill with the TRAINING means in both frames so the test
# set is preprocessed with exactly the statistics the model was trained on.
numeric_means = train_data[numeric_features].mean()
train_data[numeric_features] = train_data[numeric_features].fillna(numeric_means)
test_data[numeric_features] = test_data[numeric_features].fillna(numeric_means)

# Categorical features: treat "missing" as its own category.
for feature in non_numeric_features:
    train_data.loc[:, feature] = train_data[feature].fillna('Missing')
    test_data.loc[:, feature] = test_data[feature].fillna('Missing')

# One-hot encoding.
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

# Align the test columns to the training columns: categories seen only in
# the test set are dropped, categories seen only in training are added as 0.
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

# Feature matrix (the target y was split off above).
X = train_data

# Hold out 20% of the training data for validation.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(test_data)

# XGBoost regressor.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, missing=np.nan)

# Hyper-parameter search: 10 random draws from the grid, 3-fold CV.
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
}
grid_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
)
grid_search.fit(X_train_scaled, y_train)

# Best model found by the search (refitted on the whole training split).
best_model = grid_search.best_estimator_

# Validation error.
y_pred = best_model.predict(X_valid_scaled)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"Validation RMSE: {rmse:.2f}")

# Predict on the test set.
test_predictions = best_model.predict(X_test_scaled)

# Write the submission file.
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv('submission.csv', index=False)

# RMSE relative to the mean validation price, as a scale-free sanity check.
mean_price = y_valid.mean()
relative_rmse = rmse / mean_price
print(f"Relative RMSE: {relative_rmse:.2%}")

Contents
  1. 题目
  2. 代码