Sharedtumor_classifier.ipynbOpen in CoCalc
Author: sepidehparhami sepidehparhami
Views : 70
Description: A series of classifiers for predicting tumors from CDC NHANES data
In [1]:
import pdb import glob import numpy as np import pandas as pd import matplotlib.pyplot as plt import sklearn from sklearn.ensemble import RandomForestClassifier from sklearn.svm import SVC from sklearn.linear_model import LogisticRegression from sklearn.metrics import precision_recall_fscore_support from sklearn.metrics import accuracy_score from sklearn.tree import DecisionTreeClassifier from sklearn.neural_network import MLPClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import confusion_matrix from sklearn.model_selection import train_test_split from sklearn.feature_selection import mutual_info_classif from sklearn.metrics import roc_curve from sklearn.metrics import roc_auc_score from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RandomizedSearchCV import nhanes as nhanes import seaborn as sns %matplotlib inline import time from imblearn.pipeline import make_pipeline, Pipeline from imblearn.over_sampling import RandomOverSampler from imblearn.over_sampling import SMOTE from sklearn.ensemble import ExtraTreesClassifier from sklearn.metrics import normalized_mutual_info_score import re import gc
In [2]:
import joblib import scipy from scipy.optimize import curve_fit import torch import importlib
In [3]:
# Re-import the local `nhanes` module so that edits made to nhanes.py are
# picked up without restarting the kernel.
importlib.reload(nhanes)
<module 'nhanes' from '/home/user/nhanes.py'>
In [3]:
# Relative directory handed to nhanes.Dataset when the data is loaded.
DATA_PATH = 'CDC/SELECTED/'
In [4]:
# Build the NHANES dataset wrapper and load the cancer task into it.
ds = nhanes.Dataset(DATA_PATH)
ds.load_cancer()

# Problem dimensions used later in the notebook.
n_classes = 2                  # presumably tumor vs. no tumor -- confirm against nhanes.py
n_fe = ds.features.shape[1]    # number of feature columns
In [5]:
# Unpack the column labels, target vector and feature matrix from the dataset.
names, target, features = ds.names, ds.targets, ds.features

# Labelled DataFrame view of the features, one column per entry in `names`.
features_df = pd.DataFrame(ds.features)
features_df.columns = names
In [7]:
# Sanity check: size of the target vector (swap in features.shape to inspect
# the feature matrix instead).
target.shape #features.shape
(49454,)
In [324]:
# Seaborn styling for document
PALETTE_SEED = 0

sns.set_style('whitegrid')
c = sns.color_palette('cubehelix')
# Shuffle with a dedicated, seeded generator: the colour order (and therefore
# every figure) is identical on each run, and the global NumPy RNG state is
# left untouched.
np.random.RandomState(PALETTE_SEED).shuffle(c)
sns.set_palette(c)

Feature Importance by Random Forest

In [7]:
# # https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
# forest = ExtraTreesClassifier(n_estimators=250)
# forest.fit(features, target)
# importances = forest.feature_importances_
# std = np.std([tree.feature_importances_ for tree in forest.estimators_],
#              axis=0)
# indices = np.argsort(importances)[::-1]
In [8]:
# sorted_imps = importances[indices]
# sorted_labels = features_df.columns[indices]
In [11]:
# plt.figure(figsize=(8,4))
# p=sns.scatterplot(range(len(importances)),sorted_imps);
# p.set_xlabel('Ranking',fontsize=15)
# p.set_ylabel('Importance',fontsize=15)
# p.set_title('Sorted Feature Importances by Random Forest',fontsize=20)
Text(0.5, 1.0, 'Sorted Feature Importances by Random Forest')
In [12]:
# sorted_labels
Index(['BMXBMI', 'BMXWAIST', 'BPXSY1', 'BPXDI1', 'RIDAGEYR', 'SLD010H', 'INDHHINC', 'SXQ272#2.0', 'RIDRETH1#3.0', 'DMDEDUC2#4.0', 'DMDEDUC2#3.0', 'DMDEDUC2#5.0', 'DMDEDUC2#2.0', 'RHQ131#1.0', 'RHD180', 'PAQ605#2.0', 'BPQ057#2.0', 'BPQ020#1.0', 'DMDEDUC2#1.0', 'SMD650#1.0', 'DIQ170#2.0', 'BPQ020#2.0', 'RIAGENDR#1.0', 'RIAGENDR#2.0', 'ALQ141Q', 'URXUCL#2.0', 'ALQ151#2.0', 'DIQ160#2.0', 'SMQ020#1.0', 'SMQ020#2.0', 'DIQ010#2.0', 'LBXTC', 'PAQ610', 'DIQ170#1.0', 'BPQ057#1.0', 'DIQ010#1.0', 'ARQ125C#2.0', 'MCQ160J#2.0', 'RIDRETH3#3.0', 'RIDRETH1#1.0', 'MCQ160J#1.0', 'PAQ605#1.0', 'MCQ200#2.0', 'DIQ160#1.0', 'RIDRETH1#4.0', 'RHQ131#2.0', 'SMD470#0.0', 'HEQ030#2.0', 'HEQ010#2.0', 'ALQ151#1.0', 'ALQ120Q', 'LBXHP1#0.0', 'RIDRETH1#2.0', 'MCQ200#1.0', 'LBXHP1#1.0', 'RIDRETH1#5.0', 'SXQ753#2.0', 'LBXHIVC#2.0', 'SMD650#0.0', 'RIDRETH3#4.0', 'SMD470#1.0', 'SXQ753#1.0', 'SMD480#1.0', 'RIDRETH3#2.0', 'RIDRETH3#6.0', 'RIDRETH3#7.0', 'RIDRETH3#1.0', 'SMD480#0.0', 'HEQ010#1.0', 'HEQ030#1.0', 'ARQ125C#1.0', 'SXQ272#1.0', 'LBXHIVC#1.0', 'DMDEDUC2#9.0', 'DMDEDUC2#7.0', 'URXUCL#1.0', 'ALQ160', 'SSEBV#1.0', 'SSEBV#2.0', 'PAQ560', 'PAQ677'], dtype='object')

Mutual Information

In [7]:
# Mutual information between every feature and the cancer target.
# mutual_info_classif is a randomized estimator, so fix its seed: without it
# the scores -- and the feature ranking derived from them -- change on each run.
MI_RANDOM_STATE = 0
mi_cancer = mutual_info_classif(features, target, random_state=MI_RANDOM_STATE)

# NOTE(review): with the default discrete_features='auto', a dense feature
# array is treated as all-continuous. The '#<value>'-suffixed columns look
# one-hot encoded -- consider passing an explicit discrete_features mask.

# Rank features from most to least informative.
inds = mi_cancer.argsort()
sorted_names = names[inds[::-1]]
sorted_mi = mi_cancer[inds[::-1]]
In [114]:
# Feature names ordered by decreasing mutual information with the target.
print(sorted_names)
Index(['SXQ272#2.0', 'RIDAGEYR', 'RIDRETH1#3.0', 'URXUCL#2.0', 'BPQ020#2.0', 'DIQ010#2.0', 'BPQ020#1.0', 'RHD180', 'ALQ141Q', 'RIDRETH1#1.0', 'SMQ020#1.0', 'BPXSY1', 'SMQ020#2.0', 'DIQ170#2.0', 'RIAGENDR#1.0', 'DIQ160#2.0', 'RIDRETH3#6.0', 'RIDRETH1#4.0', 'HEQ010#2.0', 'PAQ605#2.0', 'RHQ131#1.0', 'RIDRETH3#1.0', 'RIDRETH1#5.0', 'BPQ057#2.0', 'ALQ151#2.0', 'SLD010H', 'SMD650#1.0', 'BPXDI1', 'DMDEDUC2#2.0', 'LBXHP1#1.0', 'SXQ753#2.0', 'DIQ010#1.0', 'ARQ125C#1.0', 'MCQ200#1.0', 'SXQ753#1.0', 'DMDEDUC2#9.0', 'MCQ200#2.0', 'LBXHIVC#2.0', 'INDHHINC', 'DIQ170#1.0', 'ALQ151#1.0', 'RIDRETH3#3.0', 'DMDEDUC2#1.0', 'DMDEDUC2#5.0', 'SMD650#0.0', 'RIAGENDR#2.0', 'HEQ030#2.0', 'LBXHIVC#1.0', 'DMDEDUC2#4.0', 'LBXTC', 'DMDEDUC2#3.0', 'RIDRETH1#2.0', 'SMD480#1.0', 'PAQ605#1.0', 'ALQ120Q', 'PAQ610', 'ALQ160', 'SXQ272#1.0', 'BMXWAIST', 'SMD470#1.0', 'HEQ030#1.0', 'RIDRETH3#4.0', 'RIDRETH3#2.0', 'URXUCL#1.0', 'PAQ677', 'DMDEDUC2#7.0', 'PAQ560', 'HEQ010#1.0', 'BPQ057#1.0', 'MCQ160J#2.0', 'MCQ160J#1.0', 'RIDRETH3#7.0', 'BMXBMI', 'SSEBV#1.0', 'SSEBV#2.0', 'DIQ160#1.0', 'RHQ131#2.0', 'LBXHP1#0.0', 'ARQ125C#2.0', 'SMD480#0.0', 'SMD470#0.0'], dtype='object')
In [210]:
# Mutual-information scores in the same (descending) order as sorted_names.
sorted_mi
array([2.06626772e-02, 1.89693913e-02, 1.38144986e-02, 1.32518869e-02, 1.28380246e-02, 1.07593222e-02, 9.27515420e-03, 8.03260258e-03, 7.36387332e-03, 7.23458529e-03, 6.46033268e-03, 5.87808657e-03, 4.81656519e-03, 4.48231657e-03, 4.34064741e-03, 2.95228857e-03, 2.83435668e-03, 2.79594060e-03, 2.63464534e-03, 2.57905674e-03, 2.34468943e-03, 2.19680156e-03, 2.18873058e-03, 2.18448058e-03, 2.17391712e-03, 2.11242237e-03, 1.90903123e-03, 1.89141768e-03, 1.86682013e-03, 1.80251723e-03, 1.59737350e-03, 1.57684555e-03, 1.43096499e-03, 1.40136210e-03, 1.34907328e-03, 1.27214477e-03, 1.18691828e-03, 1.15240635e-03, 1.13675840e-03, 1.09340922e-03, 1.08565166e-03, 1.06212281e-03, 1.05321770e-03, 7.71489952e-04, 7.31147870e-04, 7.08363914e-04, 6.73932548e-04, 6.06556095e-04, 4.62942093e-04, 4.51162952e-04, 4.48805360e-04, 4.37280892e-04, 3.39414191e-04, 3.11952234e-04, 1.96938758e-04, 1.96684621e-04, 1.10698902e-04, 4.77270690e-05, 1.84312855e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])
In [192]:
# Scatter of mutual information against rank (rank 0 = most informative).
plt.figure(figsize=(8, 4))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
p = sns.scatterplot(x=np.arange(len(sorted_mi)), y=sorted_mi)
p.set_xlabel('Ranking', fontsize=15)
p.set_ylabel('Mutual Information', fontsize=15)
p.set_title('Sorted Mutual Information of Features with Cancer Target', fontsize=20);
In [67]:
def reset_original():
    """Rebuild the working feature matrix and its labelled DataFrame from ``ds``.

    Reads the notebook-level ``ds`` (the loaded dataset) and ``names`` (the
    column labels).

    Returns:
        tuple: ``(features, features_df)`` -- a copy of ``ds.features`` and a
        DataFrame built from an independent copy of the same values, with its
        columns set to ``names``.
    """
    # Hand back copies rather than ds.features itself: otherwise an in-place
    # edit of the returned objects would also modify the "original", and a
    # later reset would return the already-modified data.
    features = ds.features.copy()
    features_df = pd.DataFrame(ds.features, copy=True)
    features_df.columns = names
    return features, features_df
In [101]:
# Restore the full feature matrix and its labelled DataFrame from the dataset.
features, features_df = reset_original()
In [69]:
# Sanity check: (samples, features) after the reset.
features.shape
(49454, 81)
In [8]:
# Keep only the features with the highest mutual information with the target.
# (An earlier variant, now disabled, kept the intersection of the random-forest
# and mutual-information rankings; it needs `sorted_labels` from the
# commented-out feature-importance cells.)
N_TOP_FEATURES = 40  # how many top-ranked features to retain

top = sorted_names[:N_TOP_FEATURES]
remaining = features_df[top].copy()  # copy so later edits leave features_df intact
remaining.shape
(49454, 40)
In [58]:
# Normalized mutual information for every pair drawn from the cancer target
# (MCQ220) and the retained features.
t = pd.DataFrame(target, columns=['MCQ220'])
c = pd.concat((t, remaining), axis=1)
compare = ['MCQ220'] + list(remaining.columns)
dim = len(compare)
mi = np.zeros([dim] * 2)

# The score is symmetric, so evaluate only the upper triangle (diagonal
# included) and mirror each value into the lower triangle.
for i in range(dim):
    compare1 = compare[i]
    for j in range(i, dim):
        compare2 = compare[j]
        mi[i, j] = mi[j, i] = normalized_mutual_info_score(
            c[compare1], c[compare2], average_method='arithmetic'
        )
In [59]:
# Round the scores so the cell annotations stay readable.
mi_round = np.round(mi, 4)
# The diagonal holds each variable scored against itself; zero it in one call
# (np.round returned a new array, so `mi` itself is not modified).
# Presumably this keeps the self-scores from dominating the colour scale.
np.fill_diagonal(mi_round, 0)

plt.figure(figsize=(30, 30))
sns.heatmap(
    mi_round,
    annot=True,
    square=True,
    xticklabels=compare,
    yticklabels=compare,
    cmap='Blues',
);