1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
# Train an XGBoost regressor on train.csv and write predictions for test.csv.
#
# Pipeline: load both CSVs -> impute missing values -> one-hot encode ->
# hold out 20% of the training rows for validation -> standardise ->
# randomised hyper-parameter search -> report validation RMSE ->
# write submission.csv with columns Id and SalePrice.

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Keep the test identifiers for the submission file before Id is removed.
test_ids = test_data['Id']

# Id is only used as the submission key below, so it is removed from the
# feature set; otherwise it would be picked up as a numeric predictor.
# NOTE(review): assumes Id is a pure row identifier -- confirm against the data.
train_data = train_data.drop(columns='Id', errors='ignore')
test_data = test_data.drop(columns='Id', errors='ignore')

# Column groups are derived from the training frame; the target is excluded
# from the numeric group so it is never imputed or looked up in test_data.
numeric_features = (
    train_data.select_dtypes(include=[np.number])
    .columns.drop('SalePrice', errors='ignore')
)
non_numeric_features = train_data.select_dtypes(exclude=[np.number]).columns

# Impute numeric gaps with the TRAINING means in both frames so that a missing
# value is encoded identically at fit time and at prediction time.
train_means = train_data[numeric_features].mean()
train_data[numeric_features] = train_data[numeric_features].fillna(train_means)
test_data[numeric_features] = test_data[numeric_features].fillna(train_means)

# Missing categorical values become their own explicit 'Missing' category.
train_data[non_numeric_features] = (
    train_data[non_numeric_features].fillna('Missing')
)
test_data[non_numeric_features] = (
    test_data[non_numeric_features].fillna('Missing')
)

# One-hot encode each frame, then force the test frame onto the training
# columns: categories seen only in test are dropped, categories seen only in
# train (and the SalePrice column) are added to test filled with 0.
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)
train_data, test_data = train_data.align(
    test_data, join='left', axis=1, fill_value=0
)

X = train_data.drop(columns='SalePrice')
y = train_data['SalePrice']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
# align() above added a zero-filled SalePrice column to test_data; drop it so
# the test matrix has the same columns as X.
X_test_scaled = scaler.transform(
    test_data.drop(columns='SalePrice', errors='ignore')
)

xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    missing=np.nan,
)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
}

# Randomised search: 10 sampled parameter combinations, 3-fold CV each,
# scored by negative mean squared error.
grid_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=42,
)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_

# Hold-out evaluation on the 20% validation split.
y_pred = best_model.predict(X_valid_scaled)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"Validation RMSE: {rmse:.2f}")

test_predictions = best_model.predict(X_test_scaled)

submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv('submission.csv', index=False)

# RMSE expressed as a fraction of the mean validation target.
mean_price = y_valid.mean()
relative_rmse = rmse / mean_price
print(f"Relative RMSE: {relative_rmse:.2%}")
|