Validation Curves


Transcript of Validation Curves

  • TODOs:
    - Weight classes (see the sample-weight sketch below)
    - Stratified KFold
    - Ordinal categorics: Age
    - Optimize GBRT tuning
    - Add additional categoricals
    - Try Logreg
    - Try factorizing GBRT
    - Clean code
    - Normalize numerics
    - Tune RFs / GBRTs
    - Inverse transform one-hot encodings
    - Revisit NA encodings
    - Remove rows with > threshold NAs
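    The "Weight classes" item could be handled with inverse-frequency sample
    weights; a minimal sketch, assuming y_train_full (the factorized 0/1 labels
    defined in In[7] below), and relying on the fact that
    GradientBoostingClassifier.fit accepts a sample_weight array:

        import numpy as np

        # Inverse-frequency weights: each class contributes equally overall.
        counts = np.bincount(y_train_full)
        class_weights = len(y_train_full) / (len(counts) * counts.astype(float))
        sample_weight = class_weights[y_train_full]   # one weight per row
        # model.fit(X_train_full, y_train_full, sample_weight=sample_weight)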

    In[5]: import pandas as pd
           import numpy as np   # used below for np.hstack; utility may already provide it
           import os
           from utility import *

    Load Original Data Sets

    In[6]: train = pd.read_csv("../training.csv", index_col=0)
           test = pd.read_csv("../test.csv", index_col=0)
           all_numerics, all_categorics = getDataTypes(train)

           # Categoricals encoded as integers
           categoricals_as_integers = {"ADMISSION_SOURCE", "ADMISSION_TYPE",
                                       "DISCHARGE_TO", "ETHNICITY", "INSURANCE",
                                       "PHYSICIAN_SPECIALTY", "WEIGHT"}
           categoricals_as_strings = set(all_categorics) - categoricals_as_integers
           # NOTE: WEIGHT is encoded as an integer
           threshold = .50
           num_pt, num_features = train.shape

           # Only retain categorical variables with > threshold * num_pt non-null values
           categorics_to_keep = list(train[all_categorics].columns[
               train[all_categorics].count() > threshold * num_pt])
           int_categorics = filter(lambda col: col in categoricals_as_integers,
                                   categorics_to_keep)
           str_categorics = filter(lambda col: col in categoricals_as_strings,
                                   categorics_to_keep)

  • print sum(train[all_categorics].count() > threshold * num_pt)
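    The "normalize numerics" TODO could be handled with a scaler fit on the
    training numerics only and reused on the test set; a minimal sketch,
    assuming the numeric columns are NA-free (StandardScaler does not handle
    missing values):

        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        num_train = scaler.fit_transform(train[all_numerics].values)  # fit on train only
        num_test = scaler.transform(test[all_numerics].values)        # reuse train statistics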

    In[7]: # Fill NAs encoded as integers with 0 and convert all to str type
           cat_int_predictors_train = train[int_categorics].fillna(0).astype(str)
           cat_int_predictors_test = test[int_categorics].fillna(0).astype(str)
           # Fill NAs encoded as strings with "NA"
           cat_str_predictors_train = train[str_categorics].fillna("NA")
           cat_str_predictors_test = test[str_categorics].fillna("NA")

           # DictVectorize string categoricals
           from sklearn.preprocessing import OneHotEncoder
           from sklearn.feature_extraction import DictVectorizer
           combined_categorics_train = pd.concat([cat_int_predictors_train,
                                                  cat_str_predictors_train], axis=1)
           combined_categorics_test = pd.concat([cat_int_predictors_test,
                                                 cat_str_predictors_test], axis=1)

           # First convert dataframes to dictionaries (one dict per row)
           df_dict_train = combined_categorics_train.T.to_dict().values()
           df_dict_test = combined_categorics_test.T.to_dict().values()

           # Create dictionary vectorizer to one-hot encode original categoricals
           dv = DictVectorizer()
           dv.fit(df_dict_train)
           cat_train_vectorized = dv.transform(df_dict_train).toarray()
           cat_test_vectorized = dv.transform(df_dict_test).toarray()

           # Define datasets gbrt_categorical.csv
           X_train_full = np.hstack((train[all_numerics].values, cat_train_vectorized))
           y_train_full = pd.factorize(train.HIGH_COST)[0]
           X_test_full = np.hstack((test[all_numerics].values, cat_test_vectorized))
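    For the "inverse transform one-hot encodings" TODO, DictVectorizer already
    keeps the mapping: get_feature_names() labels each output column in
    COLUMN=value form, and inverse_transform() maps an encoded matrix back to
    those indicators. A minimal sketch on hypothetical toy records (not from
    the actual data):

        from sklearn.feature_extraction import DictVectorizer

        toy = [{"INSURANCE": "A", "WEIGHT": "0"},
               {"INSURANCE": "B", "WEIGHT": "1"}]
        dv_demo = DictVectorizer()
        M = dv_demo.fit_transform(toy).toarray()
        print dv_demo.get_feature_names()   # ['INSURANCE=A', 'INSURANCE=B', 'WEIGHT=0', 'WEIGHT=1']
        print dv_demo.inverse_transform(M)  # [{'INSURANCE=A': 1.0, 'WEIGHT=0': 1.0}, ...]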

    In[14]: from sklearn.grid_search import ParameterGrid

    /Library/Python/2.7/site-packages/pandas/io/parsers.py:1154: DtypeWarning:
    Columns (23,28,32,33) have mixed types. Specify dtype option on import or
    set low_memory=False.
      data = self._reader.read(nrows)
    /Library/Python/2.7/site-packages/pandas/io/parsers.py:1154: DtypeWarning:
    Columns (20,34) have mixed types. Specify dtype option on import or set
    low_memory=False.
      data = self._reader.read(nrows)
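    As the warning message itself suggests, these DtypeWarnings from the
    read_csv calls in In[6] can be avoided by disabling chunked type inference:

        train = pd.read_csv("../training.csv", index_col=0, low_memory=False)
        test = pd.read_csv("../test.csv", index_col=0, low_memory=False)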


  • from sklearn.cross_validation import (ShuffleSplit, StratifiedKFold, KFold,
                                          StratifiedShuffleSplit)
    from sklearn.ensemble import GradientBoostingClassifier  # needed below

    gbm_params = {
        'n_estimators': 2000,
        'learning_rate': 0.005,
        'max_depth': 6,
        'subsample': .5,
        # 'max_features': 'log2',
        # 'min_samples_leaf': 20
    }
    model = GradientBoostingClassifier()
    model.set_params(**gbm_params)
    cv = StratifiedKFold(y_train_full, n_folds=5)
    scoring = "roc_auc"

    In[19]: param_name = "n_estimators"
            param_range = [10, 100, 500, 1000, 2000]

    In[20]: plot_validation_curve(model=model, X=X_train_full, y=y_train_full,
                                  param_name=param_name, param_range=param_range,
                                  cv=cv)
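    ParameterGrid (imported in In[14] above but not yet used) could drive the
    "Optimize GBRT tuning" TODO; a minimal sketch, reusing the model and cv
    objects defined above:

        from sklearn.grid_search import ParameterGrid
        from sklearn.cross_validation import cross_val_score

        grid = ParameterGrid({'max_depth': [3, 6],
                              'learning_rate': [0.005, 0.05]})
        for params in grid:
            model.set_params(**params)
            scores = cross_val_score(model, X_train_full, y_train_full,
                                     scoring="roc_auc", cv=cv)
            print params, scores.mean()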

    In[17]: print(__doc__)
            import matplotlib.pyplot as plt
            import numpy as np
            from sklearn.datasets import load_digits
            from sklearn.learning_curve import validation_curve

            def plot_validation_curve(model, X, y, param_name, param_range, cv,
                                      scoring="roc_auc", n_jobs=-1):
                train_scores, test_scores = validation_curve(
                    model, X, y, param_name=param_name, param_range=param_range,
                    cv=cv, scoring=scoring, n_jobs=n_jobs)
                train_scores_mean = np.mean(train_scores, axis=1)
                train_scores_std = np.std(train_scores, axis=1)
                test_scores_mean = np.mean(test_scores, axis=1)
                test_scores_std = np.std(test_scores, axis=1)
                plt.title("Validation Curve")
                plt.xlabel(param_name)
                plt.ylabel("AUC")
                plt.ylim(0.0, 1.1)
                plt.plot(param_range, train_scores_mean, label="Training score",
                         color="r")
                plt.fill_between(param_range, train_scores_mean - train_scores_std,
                                 train_scores_mean + train_scores_std,
                                 alpha=0.2, color="r")
                plt.plot(param_range, test_scores_mean,
                         label="Cross-validation score", color="g")
                plt.fill_between(param_range, test_scores_mean - test_scores_std,
                                 test_scores_mean + test_scores_std,
                                 alpha=0.2, color="g")
                plt.legend(loc="best")
                plt.show()

    In[1]: print(__doc__)
           import matplotlib.pyplot as plt
           import numpy as np
           from sklearn.datasets import load_digits
           from sklearn.svm import SVC
           from sklearn.learning_curve import validation_curve

           digits = load_digits()
           X, y = digits.data, digits.target

           param_range = np.logspace(-6, -1, 5)
           train_scores, test_scores = validation_curve(
               SVC(), X, y, param_name="gamma", param_range=param_range,
               cv=10, scoring="accuracy", n_jobs=1)
           train_scores_mean = np.mean(train_scores, axis=1)
           train_scores_std = np.std(train_scores, axis=1)
           test_scores_mean = np.mean(test_scores, axis=1)
           test_scores_std = np.std(test_scores, axis=1)
           plt.title("Validation Curve with SVM")
           plt.xlabel("$\gamma$")
           plt.ylabel("Score")
           plt.ylim(0.0, 1.1)
           plt.semilogx(param_range, train_scores_mean, label="Training score",
                        color="r")
           plt.fill_between(param_range, train_scores_mean - train_scores_std,
                            train_scores_mean + train_scores_std,
                            alpha=0.2, color="r")
           plt.semilogx(param_range, test_scores_mean,
                        label="Cross-validation score", color="g")
           plt.fill_between(param_range, test_scores_mean - test_scores_std,
                            test_scores_mean + test_scores_std,
                            alpha=0.2, color="g")
           plt.legend(loc="best")
           plt.show()

    Automatically created module for IPython interactive environment

    In[]: