Validation Curves
Transcript of Validation Curves
TODOs
- Weight classes
- Stratified KFold
- Ordinal categorics: Age
- Optimize GBRT tuning
- Add additional categoricals
- Try Logreg
- Try factorizing GBRT
- Clean code
- Normalize numerics
- Tune RFs / GBRTs
- Inverse transform one-hot encodings
- Revisit NA encodings
- Remove rows with > threshold NAs
In[5]: import pandas as pd
import numpy as np
import os
from utility import *
Load Original Data Sets
In[6]: train = pd.read_csv("../training.csv",index_col=0)
test = pd.read_csv("../test.csv",index_col=0)
all_numerics,all_categorics = getDataTypes(train)
#Categoricals encoded as integers
categoricals_as_integers = {"ADMISSION_SOURCE","ADMISSION_TYPE","DISCHARGE_TO",
"ETHNICITY","INSURANCE",
"PHYSICIAN_SPECIALTY","WEIGHT"}
categoricals_as_strings = set(all_categorics) - categoricals_as_integers
#NOTE: WEIGHT is encoded as an integer
threshold = .50
num_pt,num_features = train.shape
#Only retain categorical variables with > threshold * num_pt non-null values
categorics_to_keep = list(train[all_categorics].columns[train[all_categorics].count() > threshold * num_pt])
int_categorics = filter(lambda col: col in categoricals_as_integers,categorics_to_keep)
str_categorics = filter(lambda col: col in categoricals_as_strings,categorics_to_keep)
print sum(train[all_categorics].count() > threshold * num_pt)
10
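The TODO list also mentions removing rows with more than a threshold of NAs. A hypothetical row-wise counterpart to the column filter above (not applied in this notebook) could use DataFrame.dropna with a thresh argument:

#Sketch only: keep rows with at least threshold * num_features non-null values
train_dense_rows = train.dropna(thresh=int(threshold * num_features))
print(train_dense_rows.shape)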
In[7]: #Fill NAs encoded as integers with 0 and convert all to str type
cat_int_predictors_train = train[int_categorics].fillna(0).astype(str)
cat_int_predictors_test = test[int_categorics].fillna(0).astype(str)
#Fill NAs encoded as strings with "NA"
cat_str_predictors_train = train[str_categorics].fillna("NA")
cat_str_predictors_test = test[str_categorics].fillna("NA")
#DictVectorize string categoricals
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
combined_categorics_train = pd.concat([cat_int_predictors_train,cat_str_predictors_train],axis=1)
combined_categorics_test = pd.concat([cat_int_predictors_test,cat_str_predictors_test],axis=1)
#First convert dataframes to dictionaries
df_dict_train = combined_categorics_train.T.to_dict().values()
df_dict_test = combined_categorics_test.T.to_dict().values()
#Create dictionary vectorizer to factorize original categoricals
dv = DictVectorizer()
dv.fit(df_dict_train)
cat_train_vectorized = dv.transform(df_dict_train).toarray()
cat_test_vectorized = dv.transform(df_dict_test).toarray()
#Define datasets gbrt_categorical.csv
X_train_full = np.hstack((train[all_numerics].values,cat_train_vectorized))
y_train_full = pd.factorize(train.HIGH_COST)[0]
X_test_full = np.hstack((test[all_numerics].values,cat_test_vectorized))
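The TODO list mentions inverse-transforming the one-hot encodings. A minimal sketch (not run in the notebook) using the fitted DictVectorizer `dv` and `cat_train_vectorized` from the cell above:

#Recover the expanded feature names (one column per categorical level, e.g. "ETHNICITY=2")
feature_names = dv.get_feature_names()
print(feature_names[:10])
#Map a one-hot encoded row back to its original {column: level} dictionary
recovered = dv.inverse_transform(cat_train_vectorized[:1])
print(recovered[0])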
In[14]: from sklearn.grid_search import ParameterGrid
from sklearn.cross_validation import ShuffleSplit,StratifiedKFold,KFold,StratifiedShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier
gbm_params = {
    'n_estimators': 2000,
    'learning_rate': 0.005,
    'max_depth': 6,
    'subsample': .5,
#    'max_features': 'log2',
#    'min_samples_leaf': 20
}
model = GradientBoostingClassifier()
model.set_params(**gbm_params)
cv = StratifiedKFold(y_train_full,n_folds=5)
scoring = "roc_auc"
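ParameterGrid is imported in this cell but not used yet. As a rough sketch of the GBRT tuning noted in the TODOs (the grid values below are illustrative, not tuned settings), each candidate could be scored with cross_val_score:

from sklearn.cross_validation import cross_val_score
#Illustrative grid only; a real search would cover wider ranges
grid = ParameterGrid({"max_depth": [3, 6], "learning_rate": [0.005, 0.05]})
for params in grid:
    candidate = GradientBoostingClassifier(n_estimators=500, subsample=.5, **params)
    scores = cross_val_score(candidate, X_train_full, y_train_full,
                             scoring=scoring, cv=cv, n_jobs=-1)
    print("%s -> %.3f" % (params, scores.mean()))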
In[19]: param_name="n_estimators"
param_range = [10,100,500,1000,2000]
In[20]: plot_validation_curve(model=model,X=X_train_full,y=y_train_full,param_name=param_name,
                              param_range=param_range,cv=cv)
In[17]: print(__doc__)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.learning_curve import validation_curve
def plot_validation_curve(model,X,y,param_name,param_range,cv,scoring="roc_auc",n_jobs=-1):
    train_scores, test_scores = validation_curve(model,X,y,param_name=param_name,
                                                 param_range=param_range,cv=cv,
                                                 scoring=scoring, n_jobs=n_jobs)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Validation Curve")
    plt.xlabel(param_name)
    plt.ylabel("AUC")
    plt.ylim(0.0, 1.1)
    plt.plot(param_range, train_scores_mean, label="Training score", color="r")
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2, color="r")
    plt.plot(param_range, test_scores_mean, label="Cross-validation score", color="g")
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2, color="g")
    plt.legend(loc="best")
    plt.show()
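The same helper can be reused for other GBRT hyperparameters; a hypothetical second call for max_depth (the range below is arbitrary, for illustration only) would look like:

#Hypothetical curve over tree depth
plot_validation_curve(model=model, X=X_train_full, y=y_train_full,
                      param_name="max_depth", param_range=[2, 4, 6, 8], cv=cv)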
In[1]: print(__doc__)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.learning_curve import validation_curve
digits = load_digits()
X, y = digits.data, digits.target
param_range = np.logspace(-6, -1, 5)
train_scores, test_scores = validation_curve(
SVC(), X, y, param_name="gamma", param_range=param_range,
cv=10, scoring="accuracy", n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve with SVM")
plt.xlabel("$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.semilogx(param_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(param_range, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2, color="r")
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
color="g")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()