Kernel: Python [conda env:py37]
In [1]:
Algorithm Chains and Pipelines
In [2]:
In [3]:
Test score: 0.95
Parameter Selection with Preprocessing
In [4]:
Best cross-validation accuracy: 0.98
Best parameters: {'C': 1, 'gamma': 1}
Test set accuracy: 0.97
In [5]:
[Figure output omitted: the plot produced by this cell could not be rendered during export]
Building Pipelines
In [6]:
In [7]:
Pipeline(memory=None,
steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False))])
In [8]:
Test score: 0.95
Using Pipelines in Grid-searches
In [9]:
In [10]:
Best cross-validation accuracy: 0.98
Test set score: 0.97
Best parameters: {'svm__C': 1, 'svm__gamma': 1}
In [11]:
[Figure output omitted: the plot produced by this cell could not be rendered during export]
In [12]:
In [13]:
X_selected.shape: (100, 500)
In [14]:
Cross-validation accuracy (cv only on ridge): 0.91
In [15]:
Cross-validation accuracy (pipeline): -0.25
The General Pipeline Interface
In [16]:
In [17]:
Convenient Pipeline creation with make_pipeline
In [18]:
In [19]:
Pipeline steps:
[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svc', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False))]
In [20]:
Pipeline steps:
[('standardscaler-1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)), ('standardscaler-2', StandardScaler(copy=True, with_mean=True, with_std=True))]
Accessing step attributes
In [21]:
components.shape: (2, 30)
Accessing Attributes in a Pipeline inside GridSearchCV
In [22]:
In [23]:
In [24]:
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=Pipeline(memory=None,
steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='warn',
n_jobs=None, penalty='l2', random_state=None, solver='warn',
tol=0.0001, verbose=0, warm_start=False))]),
iid='warn', n_jobs=None,
param_grid={'logisticregression__C': [0.01, 0.1, 1, 10, 100]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)
In [25]:
Best estimator:
Pipeline(memory=None,
steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='warn',
n_jobs=None, penalty='l2', random_state=None, solver='warn',
tol=0.0001, verbose=0, warm_start=False))])
In [26]:
Logistic regression step:
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='warn',
n_jobs=None, penalty='l2', random_state=None, solver='warn',
tol=0.0001, verbose=0, warm_start=False)
In [27]:
Logistic regression coefficients:
[[-0.389 -0.375 -0.376 -0.396 -0.115 0.017 -0.355 -0.39 -0.058 0.209
-0.495 -0.004 -0.371 -0.383 -0.045 0.198 0.004 -0.049 0.21 0.224
-0.547 -0.525 -0.499 -0.515 -0.393 -0.123 -0.388 -0.417 -0.325 -0.139]]
Grid-searching preprocessing steps and model parameters
In [28]:
In [29]:
In [30]:
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=Pipeline(memory=None,
steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
order='C')), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='auto', tol=0.001))]),
iid='warn', n_jobs=-1,
param_grid={'polynomialfeatures__degree': [1, 2, 3], 'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)
In [31]:
<matplotlib.collections.PolyCollection at 0x7fecac083ba8>
[Figure output omitted: the heatmap of grid-search scores could not be rendered during export]
In [32]:
Best parameters: {'polynomialfeatures__degree': 2, 'ridge__alpha': 10}
In [33]:
Test-set score: 0.77
In [34]:
Score without poly features: 0.63
In [35]:
In [36]:
In [37]:
Best params:
{'classifier': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False), 'classifier__C': 10, 'classifier__gamma': 0.01, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}
Best cross-validation score: 0.99
Test-set score: 0.98
Avoiding Redundant Computation
In [38]: