CoCalc -- kernridgeCV1.py

Project: Jamlah Travis - chemistry/chemistry
Path: chemistry/Machine Learning/kernridgeCV1.py
Views: ⁵⁷
#!/usr/bin/python
# `from __future__` imports must come before every other statement in the
# module, otherwise Python rejects the file with a SyntaxError.
from __future__ import print_function

# Import libraries needed
import argparse
import sys

import matplotlib
matplotlib.use('Agg')  # pick the non-interactive backend before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
#from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
#from sklearn.svm import SVR

# Command-line interface: a file-name prefix ("input") and the integer
# polynomial degree used for the kernel.
parser = argparse.ArgumentParser(
    description="ploting a polynomial fit of x and y data values",
)
parser.add_argument("input")
parser.add_argument("degree", type=int)
#parser.add_argument("C", type = int)
arguments = parser.parse_args()

# File names derived from the prefix given on the command line.
training_data = "{0}.training.xy".format(arguments.input)
validate_data = "{0}.validate.xy".format(arguments.input)
plot_name = "{0}.simple_polynomial_fit.png".format(arguments.input)

def _split_xy(table):
    """Split a two-column table into scikit-learn style (X, y) arrays.

    Column 0 becomes X with shape (n_samples, 1), the 2-D
    [n_samples, n_features] layout estimators require for features.
    Column 1 becomes y and stays one-dimensional, shape (n_samples,).
    """
    return table[:, 0].reshape(-1, 1), table[:, 1]


# Training set: tab-separated text, x in the first column and y in the second.
# ndmin=2 keeps the array two-dimensional even when the file holds a single
# row, so the column slicing in _split_xy cannot fail with an IndexError.
tx_y_data = np.loadtxt(training_data, delimiter='\t', ndmin=2)
xtdata, ytdata = _split_xy(tx_y_data)

# Validation set: same file layout, same treatment.
vx_y_data = np.loadtxt(validate_data, delimiter='\t', ndmin=2)
x_vdata, y_vdata = _split_xy(vx_y_data)

# Define the model used for the fit: kernel ridge regression with a polynomial
# kernel whose degree comes from the command line.  (Plain linear regression,
# a PolynomialFeatures pipeline, SVR and k-nearest-neighbours were tried here
# previously.)
kr = KernelRidge(alpha=1.0, coef0=1, kernel='polynomial', gamma=None,
                 degree=arguments.degree, kernel_params=None)
kr.fit(xtdata, ytdata)

# Root-mean-square error of the predictions on the validation set, kept in a
# one-element list exactly as before.
fit_stdev = [np.sqrt(np.mean((kr.predict(x_vdata) - y_vdata) ** 2))]
# print must be called as a function: the module imports print_function, under
# which the old `print "..."` statement form is a syntax error.
print("The standard deviation of the fit is: %s" % fit_stdev)

# Cross-validated hyper-parameter search for the kernel ridge model.
#
# This section was adapted from the scikit-learn grid-search example, which
# tunes an SVC classifier on the digits data.  In its adapted form it could not
# run: the classifier was fitted on the continuous x/y regression data,
# 'polynomial' is not a kernel name SVC accepts, and the final report read
# X_test / y_test, which were never defined because the train/test split was
# commented out.  It now tunes KernelRidge on the training set with regression
# metrics and reports on the validation set.
# NOTE(review): this assumes the goal was to tune the kernel ridge fit above;
# confirm the parameter grid below matches what you want to explore.

# Set the parameters by cross-validation.  `alpha` is the regularisation
# strength of KernelRidge and takes the place of SVC's `C`.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
     'alpha': [1e0, 1e-1, 1e-2, 1e-3]},
    {'kernel': ['polynomial'], 'degree': [arguments.degree],
     'alpha': [1e0, 1e-1, 1e-2, 1e-3]},
]
# Regression scorers; both follow the "greater is better" convention, so the
# mean-squared-error scores are reported as negative numbers.
scores = ['neg_mean_squared_error', 'r2']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(KernelRidge(), tuned_parameters, cv=5, scoring=score)
    clf.fit(xtdata, ytdata)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed regression report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # With the default refit=True, predict() uses the best parameter set
    # refitted on the whole training set.
    y_true, y_pred = y_vdata, clf.predict(x_vdata)
    print("Validation RMSE: %f" % np.sqrt(np.mean((y_pred - y_true) ** 2)))
    print("Validation R^2: %f" % clf.best_estimator_.score(x_vdata, y_true))
    print()

# Compare the kernel model with a plain ridge regression (no intercept) that
# uses the same regularisation strength.
R = Ridge(alpha=1.0, fit_intercept=False)
kr.fit(xtdata, ytdata)
R.fit(xtdata, ytdata)
# X^T . dual_coef_ maps the kernel model's dual coefficients back to weights in
# the input space.
# NOTE(review): this product only matches Ridge's coef_ for a linear kernel,
# while `kr` is built with a polynomial kernel - confirm the intended check.
# print is called as a function because the module imports print_function.
print(np.dot(xtdata.transpose(), kr.dual_coef_))
print(R.coef_)

#fit_coefficients = regr.coef_ to pull out the linear regression
#fit_coefficients = regr.named_steps['linearregression'].coef_
#print 'The coefficients of the regression are' +str(fit_coefficients)
#fit_intercept = regr.intercept_
#fit_intercept = regr.named_steps['linearregression'].intercept_
#print "The intercept of the line is %d" % (fit_intercept)

# Evaluate how well the model did on the validation data: predict y for the
# validation x values, take the difference from the true y values, and average
# the squared differences to get the mean squared error.
mean_sq_error = np.mean((kr.predict(x_vdata) - y_vdata) ** 2)
sqrt_mean_sq_error = (mean_sq_error) ** .5
# The variance score reported by the estimator's score() method; 1 is a
# perfect prediction.
variance_score = kr.score(x_vdata, y_vdata)
# %f instead of %d: all three values are floats, and %d truncated them to
# integers (a score of 0.97 was printed as 0).  print is called as a function
# because the module imports print_function.
print("The varience score is %f" % (variance_score))
print("The mean sqaured error of the fit is %f" % (mean_sq_error))
print("The square root of the mean squared error is %f" % (sqrt_mean_sq_error))
#print 'regr is:\n', regr

# Plot training data
plt.plot(xtdata, ytdata, 'bo', label="Training Data")
# Plot validation data
plt.plot(x_vdata, y_vdata, 'ro', label="Validation Data")
# Plot fit line.  The line connects points in the order given, so the
# validation x values are sorted first; drawn in file order, unsorted data
# would turn the curve into a zigzag.
order = np.argsort(x_vdata[:, 0])
x_sorted = x_vdata[order]
plt.plot(x_sorted, kr.predict(x_sorted),
         label="Fit Function with degree %d." % (arguments.degree))
# Create a legend
plt.legend(shadow=True, loc="upper left")

# Label the plot
plt.xlabel('Time (sec)')
plt.ylabel('Signal Strength')
plt.title('Signal Strength vs. Time for Relative Activation Energies')
# NOTE(review): `plot_name` is derived from the input prefix earlier in the
# script but is not used; the figure always goes to this fixed file name.
plt.savefig("kernel_ridge_poly_fit.png")