CoCalc -- kernrrg.py

Project: Jamlah Travis - chemistry/chemistry
Path: chemistry/Machine Learning/kernrrg.py
Views: ⁶⁰
1
#!/usr/bin/python
2

3
#import modules used for this code. Important to have all of them to prevent future error messages in the terminal
4

5
import pandas as pd
6
import matplotlib
7
matplotlib.use('Agg')
8
import matplotlib.pyplot as plt
9
import numpy as np
10
from sklearn.neural_network import MLPRegressor as mlp
11
from sklearn.preprocessing import StandardScaler
12
from sklearn.model_selection import GridSearchCV
13
from sklearn.kernel_ridge import KernelRidge
14

15
#Tell the program which data file the code will be reading from. These are big data files so they will read slower in the terminal.
16
raw_data_path = "UW_diffusion_data_for_Brent_with_AN_sorted_by_num.csv" #big csv file
17
#Tell sage math which row in the data file is the header and to ignore that row. *Not useful for analysis*
18
raw_data = pd.read_csv(raw_data_path, header=3)
19
#What are we looking for?
20
target_value = "E(Impurity) - E(Host) [eV]"
21

22
# ************Organize data for fitting************
23
# Make a copy of the full training frame that has all of the X data columns.
24
x_data_matrix = raw_data.loc[:,"AREmp (pm) I":"SpaceGroup I"]
25

26
# grab just the one column to use as y data. Once as a dataframe and once as a simple list. Create an empty list of y values and load in the text file
27
target_data = raw_data.loc[:,target_value] #This grabs the column that will be used as the y value for training. This trains the model by pairing the input with expected output.
28
target_data_list = raw_data[target_value].tolist() #this line of code creates a simplers list version of the pervious line of y values for training data.
29

30
# grab just the one column that has the host element labels.
31
host = raw_data.loc[:,"Host"]  #mini dataframe that has just the host element
32
hostlist = raw_data['Host'].tolist()  #simple list of the host elements for each row in the x data matrix
33
impurity = raw_data.loc[:,"Impurity"]  #mini dataframe that as just the impurity element
34
impuritylist = raw_data['Impurity'].tolist()  #simple list of the impurities for each row in the x data matrix
35
host_element_list = ["Al","Au","Ca","Cu","Ir","Ni","Pb","Pd","Pt"]  #list of host elements in case it is useful
36

37

38
# ************Scaling and Fitting************
39
#Kernel Ridge and scaling implemenation.
40
#Scales the procceessed data frames
41
scaler = StandardScaler()
42
scaler.fit(x_data_matrix) #Train the scaler with x training data frame
43
x_data_matrix = scaler.transform(x_data_matrix)  #Now scale the x training frame
44

45

46
#Kernel Ridge Regression
47
# This sets up the GridSearch process.
48
# Here, alpha and gamma will be optimized using CV.
49
#Note that more than one thing can be optimized simultaneously, just put them all within the curly braces.
50
regr = GridSearchCV(KernelRidge(kernel='polynomial', coef0=1), cv=20,
51
                    param_grid={"alpha": [.1,1e-2, 1e-3],
52
                                "gamma": [10, 25, 50, 75, 100]})
53
#Without GridSearchCV, the best values for each coefficient can be determined by the outputted R^2 value. Better R^2 values resulted in higher gamma (100) and smaller alpha (.0001).
54
#regr = KernelRidge(alpha=.000001, coef0=1, kernel='polynomial', gamma=100)
55
#regr = KernelRidge(coef0=1, kernel='polynomial', kernel_params = {'alpha':[1e10, 0.01, 1e-2, 1e-3, 1e-4], 'gamma': [-10, 1, 10, 100]})
56

57

58
regr.fit(x_data_matrix, target_data)
59
#predicted_data=regr.predict(validation_frame)
60
predicted_data=regr.predict(x_data_matrix)
61
#predicted_data = grid.predict(x_data_matrix)  #This output is just a simple list of numbers
62
fit_error = predicted_data - target_data
63

64
#Print the R^2 value. R-squared is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination for multiple regression. 
65
#The closer R^2 is to 1 indicates that the model explains the variability of the response data around its mean very well.
66
print "R^2: " +str(regr.score(x_data_matrix, target_data))
67
#print "R^2: " +str(grid.score(x_data_matrix, target_data))
68

69

70
# ************Plotting the Data************
71
# We will make one big figure with lots of little subplots -- one for each host element.
72

73
# Setup the full figure with nine subplots
74
fullfig, (axAl, axAu, axCa, axCu, axIr, axNi, axPb, axPd, axPt) = plt.subplots(9)
75
fullfig.subplots_adjust(hspace=0.5)  #sets the spacing between plots -- default was too tight.
76
fullfig.suptitle('Activation Energies (eV)')  #overall title for the whole thing
77
fullfig.set_figheight(20)  #default is to squash all subplots together. This gives more space.
78

79
#Host Al, rows 0-41 (42 entries)
80
axAl.set_ylabel('Host Al')
81
x = range(0,42)
82
# Set number of ticks for x-axis
83
axAl.set_xticks(x)
84
# Set ticks labels for x-axis
85
axAl.set_xticklabels(impuritylist[0:42], rotation=270, fontsize=12)
86
axAl.plot(x, target_data_list[0:42], marker='o', ls='', color='black', label='Test Data')
87
axAl.plot(x, predicted_data[0:42], color='red', label='Fit')
88

89
#Host Au, rows 42-52 (11 entries)
90
axAu.set_ylabel('Host Au')
91
#print impuritylist[42:53]   #To double-check that we are plotting the right data
92
#print target_data_list[42:53]
93
x = range(0,11)
94
# Set number of ticks for x-axis
95
axAu.set_xticks(x)
96
# Set ticks labels for x-axis
97
axAu.set_xticklabels(impuritylist[42:53], rotation=270, fontsize=12)
98
axAu.plot(x, target_data_list[42:53], marker='o', ls='', color='black', label='Test Data')
99
axAu.plot(x, predicted_data[42:53], color='red', label='Fit')
100

101

102
#Host Ca, rows 53-58 (6 entries)
103
axCa.set_ylabel('Host Ca')
104
#print impuritylist[53:59]   #To double-check that we are plotting the right data
105
#print target_data_list[53:59]
106
x = range(0,6)
107
# Set number of ticks for x-axis
108
axCa.set_xticks(x)
109
# Set ticks labels for x-axis
110
axCa.set_xticklabels(impuritylist[53:59], rotation=270, fontsize=12)
111
axCa.plot(x, target_data_list[53:59], marker='o', ls='', color='black', label='Test Data')
112
axCa.plot(x, predicted_data[53:59], color='red', label='Fit')
113

114
#Host Cu, rows 59-98 (39 entries)
115
axCu.set_ylabel('Host Cu')
116
#print impuritylist[59-98]   #To double-check that we are plotting the right data
117
#print target_data_list[59-98]
118
x = range(0,39)
119
# Set number of ticks for x-axis
120
axCu.set_xticks(x)
121
# Set ticks labels for x-axis
122
axCu.set_xticklabels(impuritylist[59:98], rotation=270, fontsize=12)
123
axCu.plot(x, target_data_list[59:98], marker='o', ls='', color='black', label='Test Data')
124
axCu.plot(x, predicted_data[59:98], color='red', label='Fit')
125

126
#Host Ir, rows 99-109
127
axIr.set_ylabel('Host Ir')
128
#print impuritylist[99-110]   #To double-check that we are plotting the right data
129
#print target_data_list[99-110]
130
x = range(0,10)
131
# Set number of ticks for x-axis
132
axIr.set_xticks(x)
133
# Set ticks labels for x-axis
134
axIr.set_xticklabels(impuritylist[99:109], rotation=270, fontsize=12)
135
axIr.plot(x, target_data_list[99:109], marker='o', ls='', color='black', label='Test Data')
136
axIr.plot(x, predicted_data[99:109], color='red', label='Fit')
137

138
#Host Ni, rows 110-149
139
axNi.set_ylabel('Host Ni')
140
#print impuritylist[110-149]   #To double-check that we are plotting the right data
141
#print target_data_list[110-149]
142
x = range(0,39)
143
# Set number of ticks for x-axis
144
axNi.set_xticks(x)
145
# Set ticks labels for x-axis
146
axNi.set_xticklabels(impuritylist[110:149], rotation=270, fontsize=12)
147
axNi.plot(x, target_data_list[110:149], marker='o', ls='', color='black', label='Test Data')
148
axNi.plot(x, predicted_data[110:149], color='red', label='Fit')
149

150

151
#Host Pb, rows 150-157
152
axPb.set_ylabel('Host Pb')
153
#print impuritylist[150-157]   #To double-check that we are plotting the right data
154
#print target_data_list[150-157]
155
x = range(0,7)
156
# Set number of ticks for x-axis
157
axPb.set_xticks(x)
158
# Set ticks labels for x-axis
159
axPb.set_xticklabels(impuritylist[150:157], rotation=270, fontsize=12)
160
axPb.plot(x, target_data_list[150:157], marker='o', ls='', color='black', label='Test Data')
161
axPb.plot(x, predicted_data[150:157], color='red', label='Fit')
162

163
#Host Pd, rows 158-187
164
axPd.set_ylabel('Host Pd')
165
#print impuritylist[158-187]   #To double-check that we are plotting the right data
166
#print target_data_list[158-187]
167
x = range(0,29)
168
# Set number of ticks for x-axis
169
axPd.set_xticks(x)
170
# Set ticks labels for x-axis
171
axPd.set_xticklabels(impuritylist[158:187], rotation=270, fontsize=12)
172
axPd.plot(x, target_data_list[158:187], marker='o', ls='', color='black', label='Test Data')
173
axPd.plot(x, predicted_data[158:187], color='red', label='Fit')
174

175
#Host Pt, rows 188-217
176
axPt.set_ylabel('Host Pt')
177
#print impuritylist[188-217]   #To double-check that we are plotting the right data
178
#print target_data_list[188-217]
179
x = range(0,29)
180
# Set number of ticks for x-axis
181
axPt.set_xticks(x)
182
# Set ticks labels for x-axis
183
axPt.set_xticklabels(impuritylist[188:217], rotation=270, fontsize=12)
184
axPt.plot(x, target_data_list[188:217], marker='o', ls='', color='black', label='Test Data')
185
axPt.plot(x, predicted_data[188:217], color='red', label='Fit')
186

187

188
fullfig.savefig("activation_energies.png")
189

190