1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115 | import pathlib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.cross_decomposition import PLSRegression
# get relative data folder
PATH = pathlib.Path(__file__).resolve().parents[1]
DATA_PATH = PATH.joinpath("data").resolve()
data = pd.read_csv(DATA_PATH.joinpath("prostate.csv"), header=0)
data_train = data.loc[data['train'] == 'T']
data_test = data.loc[data['train'] == 'F']
X_train = data_train.loc[:, 'lcavol':'pgg45']
X_test = data_test.loc[:, 'lcavol':'pgg45']
y_train = pd.DataFrame(data_train.loc[:, 'lpsa'])
y_test = pd.DataFrame(data_test.loc[:, 'lpsa'])
# a utility function to print results
def displayResults(name, model):
if name == "Best Subset":
intercept = model.estimator_.intercept_[0]
coef = model.estimator_.coef_
elif name == "PCR":
lr_tuple = model.steps[1]
lr_est = lr_tuple[1]
intercept = lr_est.intercept_[0]
coef = lr_est.coef_
elif name == "PLS":
intercept = model.y_mean_[0]
coef = model.coef_
else: # default
intercept = model.intercept_[0]
coef = model.coef_
print("Intercept of final {} model is: {:.3f}".format(name, intercept))
print("Coefficient of final {} model is: {}".format(name, coef))
# pre-processing
pipeline = Pipeline([('std_scaler', StandardScaler())])
X_train_prepared = pipeline.fit_transform(X_train)
X_test_prepared = pipeline.transform(X_test)
# define various models and their parameter grids
models = [
{"name": "LS",
"params": [{'fit_intercept': [True]}],
"estimator": LinearRegression()
},
{"name": "Ridge",
"params": [
{'alpha': [0.01, 0.1, 1, 2, 3, 4, 5, 7, 10]}
],
"estimator": Ridge()
},
{"name": "Lasso",
"params": [
{'alpha': [0.001, 0.01, 0.1, 0.2, 0.5, 1, 5]}
],
"estimator": Lasso()
},
{"name": "PLS",
"params": [
{'n_components': np.arange(1, 9)}
],
"estimator": PLSRegression(scale=False)
},
{"name": "PCR",
"params": [
{'pca__n_components': np.arange(1, 9)}
],
"estimator": Pipeline(steps=[('pca', PCA()), ('linear regression', LinearRegression())])
},
{"name": "Best Subset",
"params": [
{'n_features_to_select': np.arange(1, 9)}
],
"estimator": RFE(LinearRegression())
}
]
for model in models:
name = model["name"]
print("******** Start running model: {} **********".format(name))
estimator = model["estimator"]
param_grid = model["params"]
grid_search = GridSearchCV(estimator, param_grid, cv=10,
scoring='neg_mean_squared_error',
return_train_score=True)
grid_search.fit(X_train_prepared, y_train)
cv_res = grid_search.cv_results_
for mean_score, params in zip(cv_res["mean_test_score"], cv_res["params"]):
print("CV results for model {} with parameter {} is {}".format(name, params, np.sqrt(-mean_score)))
final_model = grid_search.best_estimator_
final_y_pred = final_model.predict(X_test_prepared)
final_test_error = mean_squared_error(final_y_pred, y_test)
displayResults(name, final_model)
print("Test error of final {} model is : {:.3f}".format(name, final_test_error))
print("******** End running model: {} **********".format(name))
|