CoCalc -- Collaborative Calculation in the Cloud
Shared: tumor_classifier.ipynb (Open in CoCalc)

A series of classifiers for predicting tumors from CDC NHANES data

import pdb
import glob

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import nhanes as nhanes

import seaborn as sns
%matplotlib inline

import time

from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import normalized_mutual_info_score

import re

import gc
import joblib
import scipy
from scipy.optimize import curve_fit
import torch
import importlib
importlib.reload(nhanes)
<module 'nhanes' from '/home/user/nhanes.py'>
# Location of the selected CDC files handed to the project-local loader.
DATA_PATH = 'CDC/SELECTED/'

# Build the dataset wrapper and run its cancer loader; the attributes read
# below (features / targets / names) are expected to be populated by
# load_cancer() -- see nhanes.py to confirm.
ds = nhanes.Dataset(DATA_PATH)
ds.load_cancer()

# Plain module-level aliases for the pieces used throughout the notebook.
features = ds.features
target = ds.targets
names = ds.names

# Feature count is the second axis of the feature matrix; the class count
# is hard-coded.
n_fe = ds.features.shape[1]
n_classes = 2

# Labelled copy of the feature matrix for column-name based selection.
features_df = pd.DataFrame(ds.features)
features_df.columns = names

target.shape
#features.shape
(49454,)
# Seaborn styling for document

# Apply seaborn's 'whitegrid' style to every figure that follows.
sns.set_style('whitegrid')

# Take the 'cubehelix' palette and shuffle its colors in place. The shuffle
# draws from NumPy's global RNG, which is not seeded in this cell, so the
# color order can differ from run to run.
c=sns.color_palette('cubehelix')
np.random.shuffle(c)
# Install the shuffled colors as the default palette.
sns.set_palette(c)

Feature Importance by Random Forest

# # https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
# forest = ExtraTreesClassifier(n_estimators=250)

# forest.fit(features, target)
# importances = forest.feature_importances_
# std = np.std([tree.feature_importances_ for tree in forest.estimators_],
#              axis=0)
# indices = np.argsort(importances)[::-1]
# sorted_imps = importances[indices]
# sorted_labels = features_df.columns[indices]
# plt.figure(figsize=(8,4))
# p=sns.scatterplot(range(len(importances)),sorted_imps);
# p.set_xlabel('Ranking',fontsize=15)
# p.set_ylabel('Importance',fontsize=15)
# p.set_title('Sorted Feature Importances by Random Forest',fontsize=20)
Text(0.5, 1.0, 'Sorted Feature Importances by Random Forest')
# sorted_labels
Index(['BMXBMI', 'BMXWAIST', 'BPXSY1', 'BPXDI1', 'RIDAGEYR', 'SLD010H', 'INDHHINC', 'SXQ272#2.0', 'RIDRETH1#3.0', 'DMDEDUC2#4.0', 'DMDEDUC2#3.0', 'DMDEDUC2#5.0', 'DMDEDUC2#2.0', 'RHQ131#1.0', 'RHD180', 'PAQ605#2.0', 'BPQ057#2.0', 'BPQ020#1.0', 'DMDEDUC2#1.0', 'SMD650#1.0', 'DIQ170#2.0', 'BPQ020#2.0', 'RIAGENDR#1.0', 'RIAGENDR#2.0', 'ALQ141Q', 'URXUCL#2.0', 'ALQ151#2.0', 'DIQ160#2.0', 'SMQ020#1.0', 'SMQ020#2.0', 'DIQ010#2.0', 'LBXTC', 'PAQ610', 'DIQ170#1.0', 'BPQ057#1.0', 'DIQ010#1.0', 'ARQ125C#2.0', 'MCQ160J#2.0', 'RIDRETH3#3.0', 'RIDRETH1#1.0', 'MCQ160J#1.0', 'PAQ605#1.0', 'MCQ200#2.0', 'DIQ160#1.0', 'RIDRETH1#4.0', 'RHQ131#2.0', 'SMD470#0.0', 'HEQ030#2.0', 'HEQ010#2.0', 'ALQ151#1.0', 'ALQ120Q', 'LBXHP1#0.0', 'RIDRETH1#2.0', 'MCQ200#1.0', 'LBXHP1#1.0', 'RIDRETH1#5.0', 'SXQ753#2.0', 'LBXHIVC#2.0', 'SMD650#0.0', 'RIDRETH3#4.0', 'SMD470#1.0', 'SXQ753#1.0', 'SMD480#1.0', 'RIDRETH3#2.0', 'RIDRETH3#6.0', 'RIDRETH3#7.0', 'RIDRETH3#1.0', 'SMD480#0.0', 'HEQ010#1.0', 'HEQ030#1.0', 'ARQ125C#1.0', 'SXQ272#1.0', 'LBXHIVC#1.0', 'DMDEDUC2#9.0', 'DMDEDUC2#7.0', 'URXUCL#1.0', 'ALQ160', 'SSEBV#1.0', 'SSEBV#2.0', 'PAQ560', 'PAQ677'], dtype='object')

Mutual Information

# Estimate the mutual information between every feature column and the
# target. random_state is pinned because the estimator adds small random
# noise to continuous features; without a seed the ranking (and therefore
# the top-N cut made from it later) changes between runs.
# NOTE(review): columns whose names contain '#' look like one-hot
# indicators; if so, passing a `discrete_features` mask may be more
# appropriate than the default 'auto' -- confirm against nhanes.py.
mi_cancer = mutual_info_classif(features, target, random_state=0)

# argsort is ascending; reverse it once to rank from most to least informative.
inds = mi_cancer.argsort()
descending = inds[::-1]
sorted_names = names[descending]
sorted_mi = mi_cancer[descending]
print(sorted_names)
Index(['SXQ272#2.0', 'RIDAGEYR', 'RIDRETH1#3.0', 'URXUCL#2.0', 'BPQ020#2.0', 'DIQ010#2.0', 'BPQ020#1.0', 'RHD180', 'ALQ141Q', 'RIDRETH1#1.0', 'SMQ020#1.0', 'BPXSY1', 'SMQ020#2.0', 'DIQ170#2.0', 'RIAGENDR#1.0', 'DIQ160#2.0', 'RIDRETH3#6.0', 'RIDRETH1#4.0', 'HEQ010#2.0', 'PAQ605#2.0', 'RHQ131#1.0', 'RIDRETH3#1.0', 'RIDRETH1#5.0', 'BPQ057#2.0', 'ALQ151#2.0', 'SLD010H', 'SMD650#1.0', 'BPXDI1', 'DMDEDUC2#2.0', 'LBXHP1#1.0', 'SXQ753#2.0', 'DIQ010#1.0', 'ARQ125C#1.0', 'MCQ200#1.0', 'SXQ753#1.0', 'DMDEDUC2#9.0', 'MCQ200#2.0', 'LBXHIVC#2.0', 'INDHHINC', 'DIQ170#1.0', 'ALQ151#1.0', 'RIDRETH3#3.0', 'DMDEDUC2#1.0', 'DMDEDUC2#5.0', 'SMD650#0.0', 'RIAGENDR#2.0', 'HEQ030#2.0', 'LBXHIVC#1.0', 'DMDEDUC2#4.0', 'LBXTC', 'DMDEDUC2#3.0', 'RIDRETH1#2.0', 'SMD480#1.0', 'PAQ605#1.0', 'ALQ120Q', 'PAQ610', 'ALQ160', 'SXQ272#1.0', 'BMXWAIST', 'SMD470#1.0', 'HEQ030#1.0', 'RIDRETH3#4.0', 'RIDRETH3#2.0', 'URXUCL#1.0', 'PAQ677', 'DMDEDUC2#7.0', 'PAQ560', 'HEQ010#1.0', 'BPQ057#1.0', 'MCQ160J#2.0', 'MCQ160J#1.0', 'RIDRETH3#7.0', 'BMXBMI', 'SSEBV#1.0', 'SSEBV#2.0', 'DIQ160#1.0', 'RHQ131#2.0', 'LBXHP1#0.0', 'ARQ125C#2.0', 'SMD480#0.0', 'SMD470#0.0'], dtype='object')
sorted_mi
array([2.06626772e-02, 1.89693913e-02, 1.38144986e-02, 1.32518869e-02, 1.28380246e-02, 1.07593222e-02, 9.27515420e-03, 8.03260258e-03, 7.36387332e-03, 7.23458529e-03, 6.46033268e-03, 5.87808657e-03, 4.81656519e-03, 4.48231657e-03, 4.34064741e-03, 2.95228857e-03, 2.83435668e-03, 2.79594060e-03, 2.63464534e-03, 2.57905674e-03, 2.34468943e-03, 2.19680156e-03, 2.18873058e-03, 2.18448058e-03, 2.17391712e-03, 2.11242237e-03, 1.90903123e-03, 1.89141768e-03, 1.86682013e-03, 1.80251723e-03, 1.59737350e-03, 1.57684555e-03, 1.43096499e-03, 1.40136210e-03, 1.34907328e-03, 1.27214477e-03, 1.18691828e-03, 1.15240635e-03, 1.13675840e-03, 1.09340922e-03, 1.08565166e-03, 1.06212281e-03, 1.05321770e-03, 7.71489952e-04, 7.31147870e-04, 7.08363914e-04, 6.73932548e-04, 6.06556095e-04, 4.62942093e-04, 4.51162952e-04, 4.48805360e-04, 4.37280892e-04, 3.39414191e-04, 3.11952234e-04, 1.96938758e-04, 1.96684621e-04, 1.10698902e-04, 4.77270690e-05, 1.84312855e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])
# Scatter of the sorted mutual-information scores against their rank.
plt.figure(figsize=(8,4))
# x and y are passed by keyword: seaborn deprecated positional data vectors
# in 0.11 and raises TypeError for them from 0.12 onward.
p = sns.scatterplot(x=np.arange(len(sorted_mi)), y=sorted_mi)
p.set_xlabel('Ranking', fontsize=15)
p.set_ylabel('Mutual Information', fontsize=15)
# Trailing semicolon keeps the notebook from echoing the Text object.
p.set_title('Sorted Mutual Information of Features with Cancer Target', fontsize=20);
def reset_original():
    """Return the dataset's feature matrix and a labelled DataFrame of it.

    Reads the module-level ``ds`` and ``names``.

    Returns:
        tuple: ``(ds.features, frame)`` where the first item is
        ``ds.features`` as-is (no copy is made here) and ``frame`` is a
        new ``pd.DataFrame`` built from it with ``names`` as its columns.
    """
    frame = pd.DataFrame(ds.features)
    frame.columns = names
    return ds.features, frame


# Rebind the working variables to the values taken from the dataset.
features, features_df = reset_original()
features.shape
(49454, 81)
# Disabled alternative (kept for reference): intersect the Random Forest
# ranking with the mutual-information ranking.
# top=pd.concat((pd.DataFrame(sorted_labels),pd.DataFrame(sorted_names)),axis=1)
# top.columns = ['Random Forest','MI']
# s1=set(top['Random Forest'])
# s2=set(top['MI'])
# shared = s1.intersection(s2)
# shared=list(shared)
# remaining = features_df[shared].copy()

# Active selection: the first 40 names of the mutual-information ranking.
top = sorted_names[:40]

# Independent copy of just those columns, so later edits do not touch
# features_df.
remaining = features_df.loc[:, top].copy()

remaining.shape
(49454, 40)
# Pairwise normalized mutual information between the target column and
# every selected feature column.

# Put the target (labelled 'MCQ220') side by side with the selected features.
t = pd.DataFrame(target, columns=['MCQ220'])
c = pd.concat((t, remaining), axis=1)

# Row/column order of the score matrix: target first, then the features.
compare = ['MCQ220'] + list(remaining.columns)
dim = len(compare)
mi = np.zeros((dim, dim))

# Only the upper triangle (diagonal included) is evaluated; each score is
# mirrored into the lower triangle.
for i, compare1 in enumerate(compare):
    for j in range(i, dim):
        compare2 = compare[j]
        score = normalized_mutual_info_score(
            c[compare1], c[compare2], average_method='arithmetic'
        )
        mi[i, j] = score
        mi[j, i] = score
# Round the scores to four decimals for the cell annotations.
mi_round = np.round(mi, 4)

# Zero the diagonal (each column paired with itself) in place;
# np.fill_diagonal replaces the previous element-by-element double loop.
np.fill_diagonal(mi_round, 0)

# Annotated square heatmap, labelled with the compared column names on both axes.
plt.figure(figsize=(30, 30))
sns.heatmap(
    mi_round,
    annot=True,
    square=True,
    xticklabels=compare,
    yticklabels=compare,
    cmap='Blues',
);