#!/usr/bin/python12#import modules used for this code. Important to have all of them to prevent future error messages in the terminal34import pandas as pd5import matplotlib6matplotlib.use('Agg')7import matplotlib.pyplot as plt8import numpy as np9from sklearn.neural_network import MLPRegressor as mlp10from sklearn.preprocessing import StandardScaler11from sklearn.model_selection import GridSearchCV12from sklearn.kernel_ridge import KernelRidge1314#Tell the program which data file the code will be reading from. These are big data files so they will read slower in the terminal.15raw_data_path = "UW_diffusion_data_for_Brent_with_AN_sorted_by_num.csv" #big csv file16#Tell sage math which row in the data file is the header and to ignore that row. *Not useful for analysis*17raw_data = pd.read_csv(raw_data_path, header=3)18#What are we looking for?19target_value = "E(Impurity) - E(Host) [eV]"2021# ************Organize data for fitting************22# Make a copy of the full training frame that has all of the X data columns.23x_data_matrix = raw_data.loc[:,"AREmp (pm) I":"SpaceGroup I"]2425# grab just the one column to use as y data. Once as a dataframe and once as a simple list. Create an empty list of y values and load in the text file26target_data = raw_data.loc[:,target_value] #This grabs the column that will be used as the y value for training. This trains the model by pairing the input with expected output.27target_data_list = raw_data[target_value].tolist() #this line of code creates a simplers list version of the pervious line of y values for training data.2829# grab just the one column that has the host element labels.30host = raw_data.loc[:,"Host"] #mini dataframe that has just the host element31hostlist = raw_data['Host'].tolist() #simple list of the host elements for each row in the x data matrix32impurity = raw_data.loc[:,"Impurity"] #mini dataframe that as just the impurity element33impuritylist = raw_data['Impurity'].tolist() #simple list of the impurities for each row in the x data matrix34host_element_list = ["Al","Au","Ca","Cu","Ir","Ni","Pb","Pd","Pt"] #list of host elements in case it is useful353637# ************Scaling and Fitting************38#Kernel Ridge and scaling implemenation.39#Scales the procceessed data frames40scaler = StandardScaler()41scaler.fit(x_data_matrix) #Train the scaler with x training data frame42x_data_matrix = scaler.transform(x_data_matrix) #Now scale the x training frame434445#Kernel Ridge Regression46# This sets up the GridSearch process.47# Here, alpha and gamma will be optimized using CV.48#Note that more than one thing can be optimized simultaneously, just put them all within the curly braces.49regr = GridSearchCV(KernelRidge(kernel='polynomial', coef0=1), cv=20,50param_grid={"alpha": [.1,1e-2, 1e-3],51"gamma": [10, 25, 50, 75, 100]})52#Without GridSearchCV, the best values for each coefficient can be determined by the outputted R^2 value. Better R^2 values resulted in higher gamma (100) and smaller alpha (.0001).53#regr = KernelRidge(alpha=.000001, coef0=1, kernel='polynomial', gamma=100)54#regr = KernelRidge(coef0=1, kernel='polynomial', kernel_params = {'alpha':[1e10, 0.01, 1e-2, 1e-3, 1e-4], 'gamma': [-10, 1, 10, 100]})555657regr.fit(x_data_matrix, target_data)58#predicted_data=regr.predict(validation_frame)59predicted_data=regr.predict(x_data_matrix)60#predicted_data = grid.predict(x_data_matrix) #This output is just a simple list of numbers61fit_error = predicted_data - target_data6263#Print the R^2 value. R-squared is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination for multiple regression.64#The closer R^2 is to 1 indicates that the model explains the variability of the response data around its mean very well.65print "R^2: " +str(regr.score(x_data_matrix, target_data))66#print "R^2: " +str(grid.score(x_data_matrix, target_data))676869# ************Plotting the Data************70# We will make one big figure with lots of little subplots -- one for each host element.7172# Setup the full figure with nine subplots73fullfig, (axAl, axAu, axCa, axCu, axIr, axNi, axPb, axPd, axPt) = plt.subplots(9)74fullfig.subplots_adjust(hspace=0.5) #sets the spacing between plots -- default was too tight.75fullfig.suptitle('Activation Energies (eV)') #overall title for the whole thing76fullfig.set_figheight(20) #default is to squash all subplots together. This gives more space.7778#Host Al, rows 0-41 (42 entries)79axAl.set_ylabel('Host Al')80x = range(0,42)81# Set number of ticks for x-axis82axAl.set_xticks(x)83# Set ticks labels for x-axis84axAl.set_xticklabels(impuritylist[0:42], rotation=270, fontsize=12)85axAl.plot(x, target_data_list[0:42], marker='o', ls='', color='black', label='Test Data')86axAl.plot(x, predicted_data[0:42], color='red', label='Fit')8788#Host Au, rows 42-52 (11 entries)89axAu.set_ylabel('Host Au')90#print impuritylist[42:53] #To double-check that we are plotting the right data91#print target_data_list[42:53]92x = range(0,11)93# Set number of ticks for x-axis94axAu.set_xticks(x)95# Set ticks labels for x-axis96axAu.set_xticklabels(impuritylist[42:53], rotation=270, fontsize=12)97axAu.plot(x, target_data_list[42:53], marker='o', ls='', color='black', label='Test Data')98axAu.plot(x, predicted_data[42:53], color='red', label='Fit')99100101#Host Ca, rows 53-58 (6 entries)102axCa.set_ylabel('Host Ca')103#print impuritylist[53:59] #To double-check that we are plotting the right data104#print target_data_list[53:59]105x = range(0,6)106# Set number of ticks for x-axis107axCa.set_xticks(x)108# Set ticks labels for x-axis109axCa.set_xticklabels(impuritylist[53:59], rotation=270, fontsize=12)110axCa.plot(x, target_data_list[53:59], marker='o', ls='', color='black', label='Test Data')111axCa.plot(x, predicted_data[53:59], color='red', label='Fit')112113#Host Cu, rows 59-98 (39 entries)114axCu.set_ylabel('Host Cu')115#print impuritylist[59-98] #To double-check that we are plotting the right data116#print target_data_list[59-98]117x = range(0,39)118# Set number of ticks for x-axis119axCu.set_xticks(x)120# Set ticks labels for x-axis121axCu.set_xticklabels(impuritylist[59:98], rotation=270, fontsize=12)122axCu.plot(x, target_data_list[59:98], marker='o', ls='', color='black', label='Test Data')123axCu.plot(x, predicted_data[59:98], color='red', label='Fit')124125#Host Ir, rows 99-109126axIr.set_ylabel('Host Ir')127#print impuritylist[99-110] #To double-check that we are plotting the right data128#print target_data_list[99-110]129x = range(0,10)130# Set number of ticks for x-axis131axIr.set_xticks(x)132# Set ticks labels for x-axis133axIr.set_xticklabels(impuritylist[99:109], rotation=270, fontsize=12)134axIr.plot(x, target_data_list[99:109], marker='o', ls='', color='black', label='Test Data')135axIr.plot(x, predicted_data[99:109], color='red', label='Fit')136137#Host Ni, rows 110-149138axNi.set_ylabel('Host Ni')139#print impuritylist[110-149] #To double-check that we are plotting the right data140#print target_data_list[110-149]141x = range(0,39)142# Set number of ticks for x-axis143axNi.set_xticks(x)144# Set ticks labels for x-axis145axNi.set_xticklabels(impuritylist[110:149], rotation=270, fontsize=12)146axNi.plot(x, target_data_list[110:149], marker='o', ls='', color='black', label='Test Data')147axNi.plot(x, predicted_data[110:149], color='red', label='Fit')148149150#Host Pb, rows 150-157151axPb.set_ylabel('Host Pb')152#print impuritylist[150-157] #To double-check that we are plotting the right data153#print target_data_list[150-157]154x = range(0,7)155# Set number of ticks for x-axis156axPb.set_xticks(x)157# Set ticks labels for x-axis158axPb.set_xticklabels(impuritylist[150:157], rotation=270, fontsize=12)159axPb.plot(x, target_data_list[150:157], marker='o', ls='', color='black', label='Test Data')160axPb.plot(x, predicted_data[150:157], color='red', label='Fit')161162#Host Pd, rows 158-187163axPd.set_ylabel('Host Pd')164#print impuritylist[158-187] #To double-check that we are plotting the right data165#print target_data_list[158-187]166x = range(0,29)167# Set number of ticks for x-axis168axPd.set_xticks(x)169# Set ticks labels for x-axis170axPd.set_xticklabels(impuritylist[158:187], rotation=270, fontsize=12)171axPd.plot(x, target_data_list[158:187], marker='o', ls='', color='black', label='Test Data')172axPd.plot(x, predicted_data[158:187], color='red', label='Fit')173174#Host Pt, rows 188-217175axPt.set_ylabel('Host Pt')176#print impuritylist[188-217] #To double-check that we are plotting the right data177#print target_data_list[188-217]178x = range(0,29)179# Set number of ticks for x-axis180axPt.set_xticks(x)181# Set ticks labels for x-axis182axPt.set_xticklabels(impuritylist[188:217], rotation=270, fontsize=12)183axPt.plot(x, target_data_list[188:217], marker='o', ls='', color='black', label='Test Data')184axPt.plot(x, predicted_data[188:217], color='red', label='Fit')185186187fullfig.savefig("activation_energies.png")188189190