
📚 The CoCalc Library - books, templates and other resources

License: OTHER
Kernel: Python [conda env:py37]
from preamble import *
%matplotlib inline

Model Evaluation and Improvement

from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# create a synthetic dataset
X, y = make_blobs(random_state=0)
# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# instantiate a model and fit it to the training set
logreg = LogisticRegression().fit(X_train, y_train)
# evaluate the model on the test set
print("Test set score: {:.2f}".format(logreg.score(X_test, y_test)))
Test set score: 0.88

Cross-Validation

mglearn.plots.plot_cross_validation()
Image in a Jupyter notebook

Cross-Validation in scikit-learn

from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
logreg = LogisticRegression()

scores = cross_val_score(logreg, iris.data, iris.target)
print("Cross-validation scores: {}".format(scores))
Cross-validation scores: [0.961 0.922 0.958]
scores = cross_val_score(logreg, iris.data, iris.target, cv=5)
print("Cross-validation scores: {}".format(scores))
Cross-validation scores: [1. 0.967 0.933 0.9 1. ]
print("Average cross-validation score: {:.2f}".format(scores.mean()))
Average cross-validation score: 0.96
from sklearn.model_selection import cross_validate
res = cross_validate(logreg, iris.data, iris.target, cv=5,
                     return_train_score=True)
display(res)
{'fit_time': array([0.001, 0.001, 0.001, 0.001, 0.001]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_score': array([1. , 0.967, 0.933, 0.9 , 1. ]), 'train_score': array([0.95 , 0.967, 0.967, 0.975, 0.958])}
res_df = pd.DataFrame(res)
display(res_df)
print("Mean times and scores:\n", res_df.mean())
fit_time score_time test_score train_score
0 9.79e-04 2.69e-04 1.00 0.95
1 7.50e-04 2.07e-04 0.97 0.97
2 8.71e-04 2.95e-04 0.93 0.97
3 8.22e-04 2.60e-04 0.90 0.97
4 1.44e-03 4.58e-04 1.00 0.96
Mean times and scores:
 fit_time       9.73e-04
score_time     2.98e-04
test_score     9.60e-01
train_score    9.63e-01
dtype: float64

Benefits of Cross-Validation

Stratified K-Fold cross-validation and other strategies

from sklearn.datasets import load_iris

iris = load_iris()
print("Iris labels:\n{}".format(iris.target))
Iris labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
mglearn.plots.plot_stratified_cross_validation()
Image in a Jupyter notebook
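The plot above illustrates stratified splitting, which cross_val_score uses by default for classifiers. The notebook does not instantiate StratifiedKFold directly here; a minimal sketch of doing so explicitly (not part of the original notebook), reusing logreg and iris from the cells above:

from sklearn.model_selection import StratifiedKFold

# stratified 3-fold CV: each fold preserves the class proportions of iris.target
stratified_kfold = StratifiedKFold(n_splits=3)
print("Stratified cross-validation scores:\n{}".format(
    cross_val_score(logreg, iris.data, iris.target, cv=stratified_kfold)))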

More control over cross-validation

from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)
print("Cross-validation scores:\n{}".format( cross_val_score(logreg, iris.data, iris.target, cv=kfold)))
Cross-validation scores: [1. 0.933 0.433 0.967 0.433]
kfold = KFold(n_splits=3)
print("Cross-validation scores:\n{}".format(
    cross_val_score(logreg, iris.data, iris.target, cv=kfold)))
Cross-validation scores: [0. 0. 0.]
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
print("Cross-validation scores:\n{}".format(
    cross_val_score(logreg, iris.data, iris.target, cv=kfold)))
Cross-validation scores: [0.9 0.96 0.96]

Leave-one-out cross-validation

from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)
print("Number of cv iterations: ", len(scores))
print("Mean accuracy: {:.2f}".format(scores.mean()))
Number of cv iterations:  150
Mean accuracy: 0.95

Shuffle-split cross-validation

mglearn.plots.plot_shuffle_split()
Image in a Jupyter notebook
from sklearn.model_selection import ShuffleSplit

shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
scores = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
print("Cross-validation scores:\n{}".format(scores))
Cross-validation scores: [0.973 0.933 0.933 0.933 0.92 0.853 0.947 0.813 0.947 0.96 ]
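There is also a stratified variant, StratifiedShuffleSplit, which draws each random split so that class proportions are preserved; a short sketch (not part of the original notebook), again reusing logreg and iris from above:

from sklearn.model_selection import StratifiedShuffleSplit

# like ShuffleSplit, but each of the 10 random splits keeps the class balance
stratified_shuffle_split = StratifiedShuffleSplit(test_size=.5, train_size=.5,
                                                  n_splits=10, random_state=0)
scores = cross_val_score(logreg, iris.data, iris.target,
                         cv=stratified_shuffle_split)
print("Cross-validation scores:\n{}".format(scores))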
Cross-validation with groups
mglearn.plots.plot_group_kfold()
Image in a Jupyter notebook
from sklearn.model_selection import GroupKFold

# create synthetic dataset
X, y = make_blobs(n_samples=12, random_state=0)
# assume the first three samples belong to the same group,
# then the next four, etc.
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
scores = cross_val_score(logreg, X, y, groups, cv=GroupKFold(n_splits=3))
print("Cross-validation scores:\n{}".format(scores))
Cross-validation scores: [0.75 0.8 0.667]
# naive grid search implementation
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0)
print("Size of training set: {} size of test set: {}".format(
    X_train.shape[0], X_test.shape[0]))

best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the test set
        score = svm.score(X_test, y_test)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))
Size of training set: 112 size of test set: 38
Best score: 0.97
Best parameters: {'C': 100, 'gamma': 0.001}

The danger of overfitting the parameters and the validation set

mglearn.plots.plot_threefold_split()
Image in a Jupyter notebook
from sklearn.svm import SVC

# split data into train+validation set and test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
    iris.data, iris.target, random_state=0)
# split train+validation set into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, random_state=1)
print("Size of training set: {} size of validation set: {} size of test set:"
      " {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the validation set
        score = svm.score(X_valid, y_valid)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

# rebuild a model on the combined training and validation set,
# and evaluate it on the test set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Test set score with best parameters: {:.2f}".format(test_score))
Size of training set: 84 size of validation set: 28 size of test set: 38

Best score on validation set: 0.96
Best parameters:  {'C': 10, 'gamma': 0.001}
Test set score with best parameters: 0.92

Grid Search with Cross-Validation

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters,
        # train an SVC
        svm = SVC(gamma=gamma, C=C)
        # perform cross-validation
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
        # compute mean cross-validation accuracy
        score = np.mean(scores)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
# rebuild a model on the combined training and validation set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
mglearn.plots.plot_cross_val_selection()
Image in a Jupyter notebook
mglearn.plots.plot_grid_search_overview()
Image in a Jupyter notebook
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
print("Parameter grid:\n{}".format(param_grid))
Parameter grid: {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

grid_search = GridSearchCV(SVC(), param_grid, cv=5, return_train_score=True)
X_train, X_test, y_train, y_test = train_test_split( iris.data, iris.target, random_state=0)
grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating', estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), iid='warn', n_jobs=None, param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=True, scoring=None, verbose=0)
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
Test set score: 0.97
print("Best parameters: {}".format(grid_search.best_params_)) print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
Best parameters: {'C': 100, 'gamma': 0.01} Best cross-validation score: 0.97
print("Best estimator:\n{}".format(grid_search.best_estimator_))
Best estimator: SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
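Because refit=True by default, GridSearchCV retrains the best estimator on the whole training set, and the grid_search object can then be used like an ordinary classifier. A short sketch (not part of the original notebook), assuming grid_search, X_test, and y_test from the cells above:

# predict and score are delegated to grid_search.best_estimator_
pred = grid_search.predict(X_test)
print("First ten predictions: {}".format(pred[:10]))
print("Test set accuracy via the search object: {:.2f}".format(
    np.mean(pred == y_test)))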
Analyzing the result of cross-validation
import pandas as pd

# convert to DataFrame
results = pd.DataFrame(grid_search.cv_results_)
# show the first 5 rows
display(results.head())
mean_fit_time std_fit_time mean_score_time std_score_time ... split3_train_score split4_train_score mean_train_score std_train_score
0 1.40e-03 6.85e-04 4.92e-04 1.69e-04 ... 0.37 0.36 0.37 2.85e-03
1 1.24e-03 5.33e-04 4.27e-04 1.46e-04 ... 0.37 0.36 0.37 2.85e-03
2 1.10e-03 3.77e-04 6.01e-04 4.14e-04 ... 0.37 0.36 0.37 2.85e-03
3 8.72e-04 3.81e-05 3.39e-04 3.83e-05 ... 0.37 0.36 0.37 2.85e-03
4 1.40e-03 4.82e-04 6.03e-04 1.58e-04 ... 0.37 0.36 0.37 2.85e-03

5 rows × 22 columns

scores = np.array(results.mean_test_score).reshape(6, 6)

# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='gamma', xticklabels=param_grid['gamma'],
                      ylabel='C', yticklabels=param_grid['C'], cmap="viridis")
<matplotlib.collections.PolyCollection at 0x7fcb17fd24a8>
Image in a Jupyter notebook
fig, axes = plt.subplots(1, 3, figsize=(13, 5))

param_grid_linear = {'C': np.linspace(1, 2, 6),
                     'gamma': np.linspace(1, 2, 6)}

param_grid_one_log = {'C': np.linspace(1, 2, 6),
                      'gamma': np.logspace(-3, 2, 6)}

param_grid_range = {'C': np.logspace(-3, 2, 6),
                    'gamma': np.logspace(-7, -2, 6)}

for param_grid, ax in zip([param_grid_linear, param_grid_one_log,
                           param_grid_range], axes):
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    scores = grid_search.cv_results_['mean_test_score'].reshape(6, 6)

    # plot the mean cross-validation scores
    scores_image = mglearn.tools.heatmap(
        scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'],
        yticklabels=param_grid['C'], cmap="viridis", ax=ax)

plt.colorbar(scores_image, ax=axes.tolist())
<matplotlib.colorbar.Colorbar at 0x7fcb184cf940>
Image in a Jupyter notebook
param_grid = [{'kernel': ['rbf'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100],
               'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
              {'kernel': ['linear'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
print("List of grids:\n{}".format(param_grid))
List of grids: [{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}, {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
grid_search = GridSearchCV(SVC(), param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
Best parameters: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Best cross-validation score: 0.97
results = pd.DataFrame(grid_search.cv_results_)
# we display the transposed table so that it better fits on the page:
display(results.T)
0 1 2 3 ... 38 39 40 41
mean_fit_time 0.00073 0.00071 0.00085 0.0011 ... 0.00033 0.0003 0.0003 0.00031
std_fit_time 5.5e-05 4.3e-05 0.00028 0.00036 ... 2.1e-05 9.6e-06 1.4e-05 2.5e-05
mean_score_time 0.00028 0.00027 0.00033 0.00047 ... 0.00017 0.00016 0.00016 0.00016
std_score_time 2e-05 1.2e-05 0.00013 0.00025 ... 4.2e-07 2.3e-06 4.1e-06 9.9e-07
param_C 0.001 0.001 0.001 0.001 ... 0.1 1 10 100
param_gamma 0.001 0.01 0.1 1 ... NaN NaN NaN NaN
param_kernel rbf rbf rbf rbf ... linear linear linear linear
params {'C': 0.001, 'gamma': 0.001, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 0.01, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 0.1, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 1, 'kernel': 'rbf'} ... {'C': 0.1, 'kernel': 'linear'} {'C': 1, 'kernel': 'linear'} {'C': 10, 'kernel': 'linear'} {'C': 100, 'kernel': 'linear'}
split0_test_score 0.38 0.38 0.38 0.38 ... 0.96 1 0.96 0.96
split1_test_score 0.35 0.35 0.35 0.35 ... 0.91 0.96 1 1
split2_test_score 0.36 0.36 0.36 0.36 ... 1 1 1 1
split3_test_score 0.36 0.36 0.36 0.36 ... 0.91 0.95 0.91 0.91
split4_test_score 0.38 0.38 0.38 0.38 ... 0.95 0.95 0.95 0.95
mean_test_score 0.37 0.37 0.37 0.37 ... 0.95 0.97 0.96 0.96
std_test_score 0.011 0.011 0.011 0.011 ... 0.033 0.022 0.034 0.034
rank_test_score 27 27 27 27 ... 11 1 3 3
split0_train_score 0.36 0.36 0.36 0.36 ... 0.97 0.99 0.99 0.99
split1_train_score 0.37 0.37 0.37 0.37 ... 0.98 0.98 0.99 0.99
split2_train_score 0.37 0.37 0.37 0.37 ... 0.94 0.98 0.98 0.99
split3_train_score 0.37 0.37 0.37 0.37 ... 0.98 0.99 0.99 1
split4_train_score 0.36 0.36 0.36 0.36 ... 0.97 0.99 1 1
mean_train_score 0.37 0.37 0.37 0.37 ... 0.97 0.98 0.99 0.99
std_train_score 0.0029 0.0029 0.0029 0.0029 ... 0.012 0.0055 0.007 0.0055

23 rows × 42 columns

Nested cross-validation

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5),
                         iris.data, iris.target, cv=5)
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())
Cross-validation scores:  [0.967 1.    0.967 0.967 1.   ]
Mean cross-validation score:  0.9800000000000001
def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    outer_scores = []
    # for each split of the data in the outer cross-validation
    # (split method returns indices of training and test parts)
    for training_samples, test_samples in outer_cv.split(X, y):
        # find best parameter using inner cross-validation
        best_params = {}
        best_score = -np.inf
        # iterate over parameters
        for parameters in parameter_grid:
            # accumulate score over inner splits
            cv_scores = []
            # iterate over inner cross-validation
            # (these indices refer to the outer training part only)
            for inner_train, inner_test in inner_cv.split(
                    X[training_samples], y[training_samples]):
                # build classifier given parameters and training data
                clf = Classifier(**parameters)
                clf.fit(X[training_samples][inner_train],
                        y[training_samples][inner_train])
                # evaluate on inner test set
                score = clf.score(X[training_samples][inner_test],
                                  y[training_samples][inner_test])
                cv_scores.append(score)
            # compute mean score over inner folds
            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                # if better than so far, remember parameters
                best_score = mean_score
                best_params = parameters
        # build classifier on best parameters using outer training set
        clf = Classifier(**best_params)
        clf.fit(X[training_samples], y[training_samples])
        # evaluate
        outer_scores.append(clf.score(X[test_samples], y[test_samples]))
    return np.array(outer_scores)
from sklearn.model_selection import ParameterGrid, StratifiedKFold

scores = nested_cv(iris.data, iris.target, StratifiedKFold(5),
                   StratifiedKFold(5), SVC, ParameterGrid(param_grid))
print("Cross-validation scores: {}".format(scores))
Cross-validation scores: [0.967 1. 0.967 0.967 1. ]
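Nested cross-validation trains a large number of models and can be slow. Both cross_val_score and GridSearchCV accept an n_jobs parameter to distribute the work over CPU cores; a minimal sketch (not part of the original notebook) of the scikit-learn-based nested evaluation from above, with a parallelized inner grid search:

# n_jobs=-1 uses all available cores for the inner grid search
scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1),
                         iris.data, iris.target, cv=5)
print("Cross-validation scores: ", scores)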

Evaluation Metrics and Scoring

Keep the End Goal in Mind

Metrics for Binary Classification

Kinds of errors
Imbalanced datasets
from sklearn.datasets import load_digits

digits = load_digits()
y = digits.target == 9

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, y, random_state=0)
from sklearn.dummy import DummyClassifier
dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
pred_most_frequent = dummy_majority.predict(X_test)
print("Unique predicted labels: {}".format(np.unique(pred_most_frequent)))
print("Test score: {:.2f}".format(dummy_majority.score(X_test, y_test)))
Unique predicted labels: [False]
Test score: 0.90
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
pred_tree = tree.predict(X_test)
print("Test score: {:.2f}".format(tree.score(X_test, y_test)))
Test score: 0.92
from sklearn.linear_model import LogisticRegression

dummy = DummyClassifier().fit(X_train, y_train)
pred_dummy = dummy.predict(X_test)
print("dummy score: {:.2f}".format(dummy.score(X_test, y_test)))

logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg.predict(X_test)
print("logreg score: {:.2f}".format(logreg.score(X_test, y_test)))
dummy score: 0.80
logreg score: 0.98
Confusion matrices
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_test, pred_logreg)
print("Confusion matrix:\n{}".format(confusion))
Confusion matrix:
[[401   2]
 [  8  39]]
mglearn.plots.plot_confusion_matrix_illustration()
Image in a Jupyter notebook
mglearn.plots.plot_binary_confusion_matrix()
Image in a Jupyter notebook
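For a binary problem, the four entries of the confusion matrix can also be unpacked directly; a minimal sketch (not part of the original notebook), assuming y_test and pred_logreg from the cells above:

# rows are true labels, columns are predicted labels, so for a binary task
# ravel() returns the counts in the order TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_test, pred_logreg).ravel()
print("TN: {}  FP: {}  FN: {}  TP: {}".format(tn, fp, fn, tp))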
print("Most frequent class:") print(confusion_matrix(y_test, pred_most_frequent)) print("\nDummy model:") print(confusion_matrix(y_test, pred_dummy)) print("\nDecision tree:") print(confusion_matrix(y_test, pred_tree)) print("\nLogistic Regression") print(confusion_matrix(y_test, pred_logreg))
Most frequent class: [[403 0] [ 47 0]] Dummy model: [[366 37] [ 43 4]] Decision tree: [[390 13] [ 24 23]] Logistic Regression [[401 2] [ 8 39]]
Relation to accuracy
\begin{equation}
\text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}
\end{equation}
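As a quick sanity check (not part of the original notebook), computing accuracy from the confusion-matrix entries tn, fp, fn, tp from the sketch above gives the same value as accuracy_score:

from sklearn.metrics import accuracy_score

manual_accuracy = (tp + tn) / (tp + tn + fp + fn)
print("Accuracy from the confusion matrix: {:.3f}".format(manual_accuracy))
print("accuracy_score: {:.3f}".format(accuracy_score(y_test, pred_logreg)))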
Precision, recall and f-score
\begin{equation}
\text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}
\end{equation}
\begin{equation}
\text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}
\end{equation}

\begin{equation}
\text{F} = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}
\end{equation}
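scikit-learn implements these metrics directly in sklearn.metrics; a minimal sketch (not part of the original notebook), again using y_test and pred_logreg from above:

from sklearn.metrics import precision_score, recall_score, f1_score

print("Precision: {:.2f}".format(precision_score(y_test, pred_logreg)))
print("Recall: {:.2f}".format(recall_score(y_test, pred_logreg)))
# the f-score is the harmonic mean of precision and recall
print("f1: {:.2f}".format(f1_score(y_test, pred_logreg)))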
from sklearn.metrics import f1_score
print("f1 score most frequent: {:.2f}".format(
    f1_score(y_test, pred_most_frequent)))
print("f1 score dummy: {:.2f}".format(f1_score(y_test, pred_dummy)))
print("f1 score tree: {:.2f}".format(f1_score(y_test, pred_tree)))
print("f1 score logistic regression: {:.2f}".format(
    f1_score(y_test, pred_logreg)))
f1 score most frequent: 0.00
f1 score dummy: 0.09
f1 score tree: 0.55
f1 score logistic regression: 0.89
/home/andy/checkout/scikit-learn/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples. 'precision', 'predicted', average, warn_for)
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_most_frequent,
                            target_names=["not nine", "nine"]))
              precision    recall  f1-score   support

    not nine       0.90      1.00      0.94       403
        nine       0.00      0.00      0.00        47

   micro avg       0.90      0.90      0.90       450
   macro avg       0.45      0.50      0.47       450
weighted avg       0.80      0.90      0.85       450
/home/andy/checkout/scikit-learn/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 'precision', 'predicted', average, warn_for)
print(classification_report(y_test, pred_dummy, target_names=["not nine", "nine"]))
              precision    recall  f1-score   support

    not nine       0.89      0.91      0.90       403
        nine       0.10      0.09      0.09        47

   micro avg       0.82      0.82      0.82       450
   macro avg       0.50      0.50      0.50       450
weighted avg       0.81      0.82      0.82       450
print(classification_report(y_test, pred_logreg, target_names=["not nine", "nine"]))
              precision    recall  f1-score   support

    not nine       0.98      1.00      0.99       403
        nine       0.95      0.83      0.89        47

   micro avg       0.98      0.98      0.98       450
   macro avg       0.97      0.91      0.94       450
weighted avg       0.98      0.98      0.98       450
Taking uncertainty into account
X, y = make_blobs(n_samples=(400, 50), cluster_std=[7.0, 2], random_state=22)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
svc = SVC(gamma=.05).fit(X_train, y_train)
mglearn.plots.plot_decision_threshold()
Image in a Jupyter notebook
print(classification_report(y_test, svc.predict(X_test)))
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       104
           1       0.35      0.67      0.46         9

   micro avg       0.88      0.88      0.88       113
   macro avg       0.66      0.78      0.70       113
weighted avg       0.92      0.88      0.89       113
y_pred_lower_threshold = svc.decision_function(X_test) > -.8
print(classification_report(y_test, y_pred_lower_threshold))
              precision    recall  f1-score   support

           0       1.00      0.82      0.90       104
           1       0.32      1.00      0.49         9

   micro avg       0.83      0.83      0.83       113
   macro avg       0.66      0.91      0.69       113
weighted avg       0.95      0.83      0.87       113
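Lowering the decision threshold trades precision for recall. For models that provide predict_proba instead of decision_function, the same effect is obtained by thresholding the predicted probability of the positive class; a minimal sketch (not part of the original notebook), fitting a hypothetical probabilistic classifier on the same split:

from sklearn.ensemble import RandomForestClassifier

rf_proba = RandomForestClassifier(n_estimators=100, random_state=0)
rf_proba.fit(X_train, y_train)
# the default predict() corresponds to a probability threshold of 0.5;
# lowering it increases recall for the positive class at the cost of precision
y_pred_lower_proba = rf_proba.predict_proba(X_test)[:, 1] > 0.25
print(classification_report(y_test, y_pred_lower_proba))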
Precision-Recall curves and ROC curves
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(
    y_test, svc.decision_function(X_test))
# Use more data points for a smoother curve
X, y = make_blobs(n_samples=(4000, 500), cluster_std=[7.0, 2], random_state=22)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
svc = SVC(gamma=.05).fit(X_train, y_train)
precision, recall, thresholds = precision_recall_curve(
    y_test, svc.decision_function(X_test))
# find threshold closest to zero
close_zero = np.argmin(np.abs(thresholds))
plt.plot(precision[close_zero], recall[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c='k', mew=2)
plt.plot(precision, recall, label="precision recall curve")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="best")
<matplotlib.legend.Legend at 0x7fcb17f90e10>
Image in a Jupyter notebook
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=2)
rf.fit(X_train, y_train)

# RandomForestClassifier has predict_proba, but not decision_function
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(
    y_test, rf.predict_proba(X_test)[:, 1])

plt.plot(precision, recall, label="svc")
plt.plot(precision[close_zero], recall[close_zero], 'o', markersize=10,
         label="threshold zero svc", fillstyle="none", c='k', mew=2)

plt.plot(precision_rf, recall_rf, label="rf")
close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
plt.plot(precision_rf[close_default_rf], recall_rf[close_default_rf], '^', c='k',
         markersize=10, label="threshold 0.5 rf", fillstyle="none", mew=2)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="best")
<matplotlib.legend.Legend at 0x7fcb18404240>
Image in a Jupyter notebook
print("f1_score of random forest: {:.3f}".format( f1_score(y_test, rf.predict(X_test)))) print("f1_score of svc: {:.3f}".format(f1_score(y_test, svc.predict(X_test))))
f1_score of random forest: 0.610 f1_score of svc: 0.656
from sklearn.metrics import average_precision_score
ap_rf = average_precision_score(y_test, rf.predict_proba(X_test)[:, 1])
ap_svc = average_precision_score(y_test, svc.decision_function(X_test))
print("Average precision of random forest: {:.3f}".format(ap_rf))
print("Average precision of svc: {:.3f}".format(ap_svc))
Average precision of random forest: 0.660
Average precision of svc: 0.666
Receiver Operating Characteristics (ROC) and AUC
\begin{equation}
\text{FPR} = \frac{\text{FP}}{\text{FP} + \text{TN}}
\end{equation}
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, svc.decision_function(X_test))

plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
# find threshold closest to zero
close_zero = np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c='k', mew=2)
plt.legend(loc=4)
<matplotlib.legend.Legend at 0x7fcb180db978>
Image in a Jupyter notebook
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf.predict_proba(X_test)[:, 1])

plt.plot(fpr, tpr, label="ROC Curve SVC")
plt.plot(fpr_rf, tpr_rf, label="ROC Curve RF")

plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label="threshold zero SVC", fillstyle="none", c='k', mew=2)
close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
# mark the default RF threshold on the RF curve (using tpr_rf, not the SVC tpr)
plt.plot(fpr_rf[close_default_rf], tpr_rf[close_default_rf], '^', markersize=10,
         label="threshold 0.5 RF", fillstyle="none", c='k', mew=2)
plt.legend(loc=4)
<matplotlib.legend.Legend at 0x7fcb180dba20>
Image in a Jupyter notebook
from sklearn.metrics import roc_auc_score
rf_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
svc_auc = roc_auc_score(y_test, svc.decision_function(X_test))
print("AUC for Random Forest: {:.3f}".format(rf_auc))
print("AUC for SVC: {:.3f}".format(svc_auc))
AUC for Random Forest: 0.937
AUC for SVC: 0.916
y = digits.target == 9

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, y, random_state=0)

plt.figure()

for gamma in [1, 0.05, 0.01]:
    svc = SVC(gamma=gamma).fit(X_train, y_train)
    accuracy = svc.score(X_test, y_test)
    auc = roc_auc_score(y_test, svc.decision_function(X_test))
    fpr, tpr, _ = roc_curve(y_test, svc.decision_function(X_test))
    print("gamma = {:.2f} accuracy = {:.2f} AUC = {:.2f}".format(
        gamma, accuracy, auc))
    plt.plot(fpr, tpr, label="gamma={:.3f}".format(gamma))

plt.xlabel("FPR")
plt.ylabel("TPR")
plt.xlim(-0.01, 1)
plt.ylim(0, 1.02)
plt.legend(loc="best")
gamma = 1.00 accuracy = 0.90 AUC = 0.50
gamma = 0.05 accuracy = 0.90 AUC = 1.00
gamma = 0.01 accuracy = 0.90 AUC = 1.00
<matplotlib.legend.Legend at 0x7fcb17f67e48>
Image in a Jupyter notebook

Metrics for Multiclass Classification

from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)
lr = LogisticRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
print("Accuracy: {:.3f}".format(accuracy_score(y_test, pred)))
print("Confusion matrix:\n{}".format(confusion_matrix(y_test, pred)))
Accuracy: 0.953
Confusion matrix:
[[37  0  0  0  0  0  0  0  0  0]
 [ 0 39  0  0  0  0  2  0  2  0]
 [ 0  0 41  3  0  0  0  0  0  0]
 [ 0  0  1 43  0  0  0  0  0  1]
 [ 0  0  0  0 38  0  0  0  0  0]
 [ 0  1  0  0  0 47  0  0  0  0]
 [ 0  0  0  0  0  0 52  0  0  0]
 [ 0  1  0  1  1  0  0 45  0  0]
 [ 0  3  1  0  0  0  0  0 43  1]
 [ 0  0  0  1  0  1  0  0  1 44]]
scores_image = mglearn.tools.heatmap(
    confusion_matrix(y_test, pred), xlabel='Predicted label',
    ylabel='True label', xticklabels=digits.target_names,
    yticklabels=digits.target_names, cmap=plt.cm.gray_r, fmt="%d")
plt.title("Confusion matrix")
plt.gca().invert_yaxis()
Image in a Jupyter notebook
print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.89      0.91      0.90        43
           2       0.95      0.93      0.94        44
           3       0.90      0.96      0.92        45
           4       0.97      1.00      0.99        38
           5       0.98      0.98      0.98        48
           6       0.96      1.00      0.98        52
           7       1.00      0.94      0.97        48
           8       0.93      0.90      0.91        48
           9       0.96      0.94      0.95        47

   micro avg       0.95      0.95      0.95       450
   macro avg       0.95      0.95      0.95       450
weighted avg       0.95      0.95      0.95       450
print("Micro average f1 score: {:.3f}".format( f1_score(y_test, pred, average="micro"))) print("Macro average f1 score: {:.3f}".format( f1_score(y_test, pred, average="macro")))
Micro average f1 score: 0.953 Macro average f1 score: 0.954

Regression metrics
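The notebook contains no code cell for this section. As a minimal sketch (an illustration on synthetic data, not the author's code): for regressors, the default score is R^2, and error metrics such as mean squared error are exposed as negated scorers so that greater is always better.

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

# hypothetical synthetic regression task, analogous to make_blobs above
X_reg, y_reg = make_regression(n_samples=200, n_features=10, noise=10,
                               random_state=0)
ridge = Ridge()

print("R^2 scores: {}".format(cross_val_score(ridge, X_reg, y_reg, cv=5)))
print("Negative MSE scores: {}".format(
    cross_val_score(ridge, X_reg, y_reg, cv=5,
                    scoring="neg_mean_squared_error")))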

Using evaluation metrics in model selection

# default scoring for classification is accuracy
print("Default scoring: {}".format(
    cross_val_score(SVC(), digits.data, digits.target == 9, cv=5)))
# providing scoring="accuracy" doesn't change the results
explicit_accuracy = cross_val_score(SVC(), digits.data, digits.target == 9,
                                    scoring="accuracy", cv=5)
print("Explicit accuracy scoring: {}".format(explicit_accuracy))
roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9,
                          scoring="roc_auc", cv=5)
print("AUC scoring: {}".format(roc_auc))
Default scoring: [0.9 0.9 0.9 0.9 0.9]
Explicit accuracy scoring: [0.9 0.9 0.9 0.9 0.9]
AUC scoring: [0.997 0.997 0.996 0.998 0.992]
res = cross_validate(SVC(), digits.data, digits.target == 9,
                     scoring=["accuracy", "roc_auc", "recall_macro"],
                     return_train_score=True, cv=5)
display(pd.DataFrame(res))
fit_time score_time test_accuracy train_accuracy test_roc_auc train_roc_auc test_recall_macro train_recall_macro
0 0.24 0.15 0.9 1.0 1.00 1.0 0.5 1.0
1 0.23 0.16 0.9 1.0 1.00 1.0 0.5 1.0
2 0.22 0.15 0.9 1.0 1.00 1.0 0.5 1.0
3 0.22 0.16 0.9 1.0 1.00 1.0 0.5 1.0
4 0.22 0.15 0.9 1.0 0.99 1.0 0.5 1.0
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target == 9, random_state=0)

# we provide a somewhat bad grid to illustrate the point:
param_grid = {'gamma': [0.0001, 0.01, 0.1, 1, 10]}
# using the default scoring of accuracy:
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("Grid-Search with accuracy")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (accuracy): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))
Grid-Search with accuracy
Best parameters: {'gamma': 0.0001}
Best cross-validation score (accuracy): 0.970
Test set AUC: 0.992
Test set accuracy: 0.973
# using AUC scoring instead:
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(X_train, y_train)
print("\nGrid-Search with AUC")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (AUC): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))
Grid-Search with AUC
Best parameters: {'gamma': 0.01}
Best cross-validation score (AUC): 0.997
Test set AUC: 1.000
Test set accuracy: 1.000
from sklearn.metrics.scorer import SCORERS
print("Available scorers:")
print(sorted(SCORERS.keys()))
Available scorers: ['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted']
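For a metric that is not in this list, or to change a metric's parameters, sklearn.metrics.make_scorer wraps a metric function into a scorer object; a minimal sketch (not part of the original notebook), reusing digits from the cells above:

from sklearn.metrics import fbeta_score, make_scorer

# a scorer that weights recall twice as much as precision (beta=2)
ftwo_scorer = make_scorer(fbeta_score, beta=2)
scores = cross_val_score(SVC(), digits.data, digits.target == 9,
                         scoring=ftwo_scorer, cv=5)
print("F2 scores: {}".format(scores))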

Summary and Outlook