import pdb
import glob
import copy
import os
import pickle
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import sklearn.feature_selection
class FeatureColumn:
    """Describe one raw field and how it should be turned into features.

    Attributes:
        category: Name of the sub-directory (below the database root) whose
            ``.XPT`` files are searched for the field.
        field: Column name to extract from those files.
        preprocessor: Callable applied to the extracted column, or ``None``
            to use the column unchanged.
        args: Optional dict of keyword arguments for ``preprocessor``.
        cost: Optional acquisition cost attached to the field.
        data: Raw column values; ``None`` until a loader fills it in.
    """

    def __init__(self, category, field, preprocessor, args=None, cost=None):
        # Where the field lives and what it is called.
        self.category, self.field = category, field
        # How the raw column is transformed.
        self.preprocessor, self.args = preprocessor, args
        self.cost = cost
        # Populated later, once the raw column has been read.
        self.data = None
class NHANES:
    """Assemble a feature table from NHANES ``.XPT`` (SAS transport) files.

    Attributes:
        db_path: Root directory of the database; each column's ``category``
            names a sub-directory of it.
        columns: Iterable of column descriptors exposing ``category``,
            ``field``, ``preprocessor``, ``args``, ``data`` and ``cost``.
        dataset: The processed DataFrame, set by :meth:`process`.
        column_data, column_info, df_features, df_targets, costs:
            Initialised to ``None``; not written by this class.
    """

    # Files skipped while scanning a category directory.  The check is an
    # exact string match against the paths returned by ``glob``, so these
    # entries only take effect when ``db_path`` is 'CDC/SELECTED/'.
    _BAD_FILES = frozenset((
        'CDC/SELECTED/Questionnaire/PAQ_G.XPT',
        'CDC/SELECTED/Questionnaire/PAQ_D.XPT',
    ))

    def __init__(self, db_path=None, columns=None):
        self.db_path = db_path
        self.columns = columns
        self.dataset = None
        self.column_data = None
        self.column_info = None
        self.df_features = None
        self.df_targets = None
        self.costs = None

    def process(self):
        """Load every configured field, preprocess it and merge the results.

        For each column descriptor the raw values are read from disk, a copy
        is stored in ``descriptor.data``, the optional preprocessor is applied
        and ``descriptor.cost`` is replaced by a list holding one entry per
        produced output column.

        NOTE: because ``cost`` is rewritten as a list, calling ``process``
        twice on the same descriptors nests those lists.

        Returns:
            The merged, preprocessed DataFrame indexed by ``SEQN`` (also
            stored in ``self.dataset``).

        Raises:
            Exception: ``'Failed to process <field>'`` when the rows of a
                field cannot be concatenated (for example when no file
                provides it); the underlying error is chained as
                ``__cause__``.
        """
        cache = {}  # file path -> parsed DataFrame, shared across fields
        raw = pd.concat(
            [self._load_field(fe_col, cache) for fe_col in self.columns],
            axis=1,
        )

        processed = []
        for fe_col in self.columns:
            series = raw[fe_col.field]
            fe_col.data = series.copy()
            if fe_col.preprocessor is None:
                out = series
            else:
                kwargs = {} if fe_col.args is None else fe_col.args
                # Hand the preprocessor its own copy so that any in-place
                # edits it performs cannot leak back into the merged frame.
                out = fe_col.preprocessor(series.copy(), **kwargs)
            # One cost entry per output column (one-hot encoders fan out).
            n_out = out.shape[1] if len(out.shape) > 1 else 1
            fe_col.cost = [fe_col.cost] * n_out
            processed.append(out)

        self.dataset = pd.concat(processed, axis=1)
        return self.dataset

    def _load_field(self, fe_col, cache):
        """Return the rows of ``fe_col.field`` from all files, indexed by SEQN."""
        field = fe_col.field
        pattern = os.path.join(self.db_path, fe_col.category, '*.XPT')
        pieces = []
        # Sorted so the row order does not depend on directory listing order.
        for dfile in sorted(glob.glob(pattern)):
            if dfile in self._BAD_FILES:
                continue
            if dfile not in cache:
                cache[dfile] = pd.read_sas(dfile)
            table = cache[dfile]
            if 'SEQN' not in table.columns or field not in table.columns:
                continue
            pieces.append(table[['SEQN', field]].set_index('SEQN'))
        try:
            return pd.concat(pieces)
        except Exception as err:
            raise Exception('Failed to process ' + field) from err
def preproc_onehot(df_col, args=None):
    """One-hot encode a column.

    Every distinct value in ``df_col`` becomes an indicator column named
    ``<series name>#<value>``.  ``args`` is accepted but not used.
    """
    column_prefix = df_col.name
    indicators = pd.get_dummies(df_col, prefix_sep='#', prefix=column_prefix)
    return indicators
def preproc_real(df_col, cutoff=np.inf):
    """Standardise a real-valued column.

    Values strictly greater than ``cutoff`` are treated as missing, missing
    values are replaced by the mean of the remaining ones (the same rule as
    ``preproc_impute``), and the result is shifted and scaled to zero mean
    and unit standard deviation.

    The input Series is not modified; a new Series is returned.  A column
    whose standard deviation is zero yields NaN values.

    Args:
        df_col: Numeric pandas Series.
        cutoff: Upper bound above which values count as missing.

    Returns:
        The standardised Series.
    """
    # ``mask`` builds a new Series, leaving the caller's data untouched.
    col = df_col.mask(df_col > cutoff)
    col = col.fillna(col.mean())
    return (col - col.mean()) / col.std()
def preproc_impute(df_col, args=None):
    """Replace missing values with the column mean.

    The input Series is left unchanged; a filled copy is returned, so callers
    must use the return value.  ``args`` is accepted but not used.

    Args:
        df_col: Numeric pandas Series, possibly containing NaN.

    Returns:
        A new Series in which every NaN has been replaced by the mean of the
        non-missing values.
    """
    return df_col.fillna(df_col.mean())
def preproc_cut(df_col, bins):
    """Discretise a column into integer bin codes.

    Values at or below the first bin edge, or at or above the last one, are
    treated as missing.  Missing values are then replaced by the mean of the
    remaining values (the same rule as ``preproc_impute``) before binning.

    The input Series is not modified.

    Args:
        df_col: Numeric pandas Series.
        bins: Sequence of bin edges; the first and last entries bound the
            range of accepted values.

    Returns:
        Series of bin indices as produced by ``pd.cut(..., labels=False)``.
    """
    lowest, highest = bins[0], bins[-1]
    # ``mask`` returns a new Series, so the caller's column stays intact.
    col = df_col.mask((df_col <= lowest) | (df_col >= highest))
    col = col.fillna(col.mean())
    return pd.cut(col, bins=bins, labels=False, include_lowest=True)
def preproc_cut_dummy(df_col, bins):
    """Discretise a column and one-hot encode the resulting bins.

    Values at or below the first bin edge, or at or above the last one, are
    treated as missing and are not imputed, so they fall into no bin.  The
    bin codes are then expanded with ``preproc_onehot``.

    The input Series is not modified.

    Args:
        df_col: Numeric pandas Series.
        bins: Sequence of bin edges.

    Returns:
        DataFrame of indicator columns, one per occupied bin.
    """
    lowest, highest = bins[0], bins[-1]
    # ``mask`` returns a new Series, so the caller's column stays intact.
    col = df_col.mask((df_col <= lowest) | (df_col >= highest))
    binning = pd.cut(col, bins=bins, labels=False, include_lowest=True)
    return preproc_onehot(binning)
def preproc_dropna(df_col, args=None):
    """Return ``df_col`` without its missing entries.

    The input is left unchanged; callers must use the returned object.
    ``args`` is accepted but not used.

    Args:
        df_col: pandas Series, possibly containing NaN.

    Returns:
        A new Series holding only the non-missing entries, with their
        original index labels.
    """
    return df_col.dropna()
def preproc_binary(df_col, args=None):
    """One-hot encode a column whose only valid answers are 1 and 2.

    Every entry that is neither 1 nor 2 (including NaN) is treated as
    missing, so it gets no indicator; the remaining values are expanded with
    ``preproc_onehot``.  ``args`` is accepted but not used.

    The input Series is not modified.

    Args:
        df_col: pandas Series of coded answers.

    Returns:
        DataFrame of indicator columns for the values 1 and 2.
    """
    # Vectorised filter; ``where`` keeps matching entries and blanks the rest
    # in a new Series instead of writing into the caller's data row by row.
    cleaned = df_col.where(df_col.isin([1, 2]))
    return preproc_onehot(cleaned)
class Dataset():
    """
    Dataset manager class

    Attributes:
        data_path: Root directory handed to the NHANES loader.
        features: 2-D array of feature values, set by ``load_cancer``.
        names: Column labels matching ``features``.
        targets: 1-D integer array of labels.
        costs: Array with one cost entry per feature column.
    """

    def __init__(self, data_path=None):
        """
        Class initializer.

        ``data_path`` defaults to './run_data/' when not given.
        """
        self.data_path = './run_data/' if data_path is None else data_path
        self.features = None
        self.names = None
        self.targets = None
        self.costs = None

    def load_cancer(self, opts=None):
        """
        Build the dataset whose target is the 'MCQ220' field.

        The field list is read from 'Fields.csv' in the current working
        directory (columns 'NHANES Category', 'Field', 'Preprocessing' and
        'Args').  Rows whose target is 1 become label 1, rows whose target
        is 2 become label 0, and every other row is discarded.  The
        remaining rows are shuffled with ``np.random.permutation`` and the
        results stored in ``features``, ``targets``, ``names`` and
        ``costs``.  ``opts`` is accepted but not used.
        """
        data = pd.read_csv('Fields.csv', dtype='str')
        proc_dict = {
            'preproc_onehot': preproc_onehot,
            'preproc_real': preproc_real,
            'preproc_impute': preproc_impute,
            'preproc_cut': preproc_cut,
            'preproc_cut_dummy': preproc_cut_dummy,
            'preproc_dropna': preproc_dropna,
            'preproc_binary': preproc_binary,
            'None': None,
        }

        columns = []
        for _, row in data.iterrows():
            proc_name = row['Preprocessing']
            # ``read_csv`` can parse an empty cell (and, in some pandas
            # versions, the literal text 'None') as a missing value; treat
            # that as "no preprocessor" instead of failing on the lookup.
            proc_fn = None if pd.isna(proc_name) else proc_dict[proc_name]

            args_init = row['Args']
            if pd.isna(args_init):
                args = None
            else:
                # SECURITY: ``eval`` executes whatever text the CSV holds.
                # Only use a trusted 'Fields.csv'.
                args = eval(str(args_init))

            columns.append(
                FeatureColumn(row['NHANES Category'], row['Field'], proc_fn, args))

        df = NHANES(self.data_path, columns).process()

        fe_cols = df.drop(columns=['MCQ220'])
        target = df['MCQ220'].values

        # Keep only rows with a definite answer (1 or 2); NaN and any other
        # code are dropped.
        answered = np.isin(target, (1, 2))
        features = fe_cols.values[answered]
        targets = (target[answered] == 1).astype(int)

        perm = np.random.permutation(targets.shape[0])
        self.features = features[perm]
        self.targets = targets[perm]

        # One cost per feature column, in column order.  The target field is
        # skipped by name so the costs stay aligned with ``names`` wherever
        # the target sits in the field list.
        self.costs = np.array(
            [cost for col in columns if col.field != 'MCQ220'
             for cost in col.cost])
        self.names = fe_cols.columns