"""
CoCalc -- Collaborative Calculation in the Cloud: nhanes.py

Processing functions for NHANES data.
"""

import pdb
import glob
import copy
import os
import pickle
import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import sklearn.feature_selection



class FeatureColumn:
    """Describes a single NHANES field: the category (sheet) it lives in,
    the field name, an optional preprocessing callable with its keyword
    arguments, and an optional acquisition cost."""

    def __init__(self, category, field, preprocessor, args=None, cost=None):
        self.category, self.field = category, field
        self.preprocessor, self.args = preprocessor, args
        self.data = None  # raw column values, filled in by NHANES.process()
        self.cost = cost  # expanded into a per-output-column list by process()

class NHANES:
    """Loads NHANES .XPT files and assembles a preprocessed feature table.

    Parameters:
        db_path -- root directory holding one sub-directory per NHANES
                   category (FeatureColumn.category).
        columns -- list of FeatureColumn objects describing the fields
                   to load and how to preprocess them.
    """
    def __init__(self, db_path=None, columns=None):
        self.db_path = db_path
        self.columns = columns  # deprecated
        self.dataset = None  # deprecated
        self.column_data = None
        self.column_info = None
        self.df_features = None
        self.df_targets = None
        self.costs = None

    def process(self):
        """Read every requested field from disk, preprocess it, and return
        the combined DataFrame (also stored in ``self.dataset``).

        Raises:
            Exception: if no data file contributed data for a requested field.
        """
        # cache parsed files: several FeatureColumns often share one .XPT file
        cache = {}
        # collect relevant data, one concatenated frame per FeatureColumn
        df = []
        for fe_col in self.columns:
            sheet = fe_col.category
            field = fe_col.field

            data_files = glob.glob(self.db_path + sheet + '/*.XPT')

            df_col = []
            for dfile in data_files:
                # files known to be broken/unreadable are skipped outright
                bad_files = ['CDC/SELECTED/Questionnaire/PAQ_G.XPT',
                             'CDC/SELECTED/Questionnaire/PAQ_D.XPT']
                if dfile in bad_files:
                    continue

                # read the file (or reuse the cached parse)
                if dfile in cache:
                    df_tmp = cache[dfile]
                else:
                    df_tmp = pd.read_sas(dfile)
                    cache[dfile] = df_tmp
                # skip if there is no SEQN (respondent id) column
                if 'SEQN' not in df_tmp.columns:
                    continue
                # skip if the requested field is absent from this file
                sel_cols = set(df_tmp.columns).intersection([field])
                if not sel_cols:
                    continue
                df_tmp = df_tmp[['SEQN'] + list(sel_cols)]
                df_tmp.set_index('SEQN', inplace=True)
                df_col.append(df_tmp)

            try:
                df_col = pd.concat(df_col)
            except ValueError as exc:
                # pd.concat raises ValueError on an empty list, i.e. no file
                # supplied this field; narrow except (the old bare `except:`
                # also swallowed KeyboardInterrupt/SystemExit) and chain cause
                raise Exception('Failed to process ' + field) from exc
            df.append(df_col)
        df = pd.concat(df, axis=1)

        # do preprocessing steps
        df_proc = []
        for fe_col in self.columns:
            field = fe_col.field
            fe_col.data = df[field].copy()
            # do preprocessing
            if fe_col.preprocessor is not None:
                if fe_col.args is not None:
                    prepr_col = fe_col.preprocessor(df[field], **fe_col.args)
                else:
                    prepr_col = fe_col.preprocessor(df[field])
            else:
                prepr_col = df[field]
            # a preprocessor may expand one field into several columns
            # (e.g. one-hot); replicate the cost once per produced column
            if len(prepr_col.shape) > 1:
                fe_col.cost = [fe_col.cost] * prepr_col.shape[1]
            else:
                fe_col.cost = [fe_col.cost]
            df_proc.append(prepr_col)
        self.dataset = pd.concat(df_proc, axis=1)

        return self.dataset


# Preprocessing functions
def preproc_onehot(df_col, args=None):
    """One-hot encode a column; output columns are named '<field>#<value>'."""
    encoded = pd.get_dummies(df_col, prefix=df_col.name, prefix_sep='#')
    return encoded

def preproc_real(df_col, cutoff=np.inf):
    """Treat values above ``cutoff`` as missing, impute with the column mean,
    then z-score normalize.

    Works on a copy: the original version assigned through a boolean mask on
    the caller's Series, mutating it in place (a chained-assignment hazard).
    """
    df_col = df_col.copy()
    # answers above the cutoff (e.g. refused/don't-know codes) become NaN
    df_col[df_col > cutoff] = np.nan
    # NaN replaced by the column mean (same behavior as preproc_impute)
    df_col = df_col.fillna(df_col.mean())
    # statistical (z-score) normalization
    return (df_col - df_col.mean()) / df_col.std()

def preproc_impute(df_col, args=None):
    """Return the column with NaN entries replaced by the column mean.

    ``fillna`` is the idiomatic equivalent of the old boolean-mask
    assignment, and returns a new Series instead of mutating the
    caller's column in place.
    """
    return df_col.fillna(df_col.mean())

def preproc_cut(df_col, bins):
    """Bin a numeric column into integer bin labels (0 .. len(bins)-2).

    Values at or outside the outer bin edges are treated as missing and
    imputed with the column mean before cutting. Works on a copy: the
    original mutated the caller's Series in place.
    """
    df_col = df_col.copy()
    # limit values to the bins range: out-of-range entries become NaN ...
    df_col[df_col <= bins[0]] = np.nan
    df_col[df_col >= bins[-1]] = np.nan
    # ... and are imputed with the mean (same behavior as preproc_impute)
    df_col = df_col.fillna(df_col.mean())
    return pd.cut(df_col, bins=bins, labels=False, include_lowest=True)


def preproc_cut_dummy(df_col, bins):
    """Bin a numeric column and one-hot encode the resulting bin labels.

    Values at or outside the outer bin edges become NaN and are dropped by
    the one-hot encoding (their rows are all-zero). Works on a copy: the
    original mutated the caller's Series in place.
    """
    df_col = df_col.copy()
    # limit values to the bins range
    df_col[df_col <= bins[0]] = np.nan
    df_col[df_col >= bins[-1]] = np.nan
    binning = pd.cut(df_col, bins=bins, labels=False, include_lowest=True)
    # one-hot encode with the same naming scheme as preproc_onehot
    return pd.get_dummies(binning, prefix=binning.name, prefix_sep='#')


def preproc_dropna(df_col, args=None):
    """Return the column with all NaN entries removed.

    Returns a new object: the original used ``inplace=True``, mutating the
    caller's column, which the other preprocessors here avoid.
    """
    return df_col.dropna(axis=0, how='any')


def preproc_binary(df_col, args=None):
    """One-hot encode a yes(1)/no(2) NHANES field.

    Codes other than 1 and 2 (e.g. refused/don't-know) are treated as
    missing and produce an all-zero row in the encoding. Vectorized
    replacement for the original element-by-element ``.iloc`` write loop,
    which was O(n) Python-level work and mutated the caller's Series.
    """
    # non-1/2 codes -> NaN; where() preserves the Series name for prefixing
    cleaned = df_col.where(df_col.isin([1, 2]))
    # same naming scheme as preproc_onehot: '<field>#<value>'
    return pd.get_dummies(cleaned, prefix=cleaned.name, prefix_sep='#')


#### Add your own preprocessing functions ####

#def preproc_binary(df_col, args=None):




# Dataset loader
# Dataset loader
class Dataset:
    """
    Dataset manager class: builds feature/target arrays from NHANES data.
    """
    def __init__(self, data_path=None):
        """
        Class initializer.

        data_path -- root directory of the NHANES .XPT files
                     (defaults to './run_data/').
        """
        # set database path ('is None', not '== None': identity check is the
        # correct idiom for None comparisons)
        if data_path is None:
            self.data_path = './run_data/'
        else:
            self.data_path = data_path
        # feature and target vectors, populated by the load_* methods
        self.features = None
        self.names = None
        self.targets = None
        self.costs = None


    #### Add your own dataset loader ####

    def load_cancer(self, opts=None):
        """Build a binary cancer-prediction dataset described by 'Fields.csv'.

        The target is NHANES field MCQ220 (1 = cancer -> label 1,
        2 = no cancer -> label 0); NaN labels and other responses are
        dropped. Populates self.features, self.targets, self.names and
        self.costs, shuffled with a random permutation.
        """
        data = pd.read_csv('Fields.csv', dtype='str')

        # map preprocessor names used in the CSV onto the functions above
        proc_dict = {'preproc_onehot': preproc_onehot, 'preproc_real': preproc_real, 'preproc_impute': preproc_impute, 'preproc_cut': preproc_cut, 'preproc_cut_dummy': preproc_cut_dummy, 'preproc_dropna': preproc_dropna, 'preproc_binary': preproc_binary, 'None': None}

        columns = []
        nf = data.shape[0]

        for i in range(nf):
            row = data.loc[i]

            proc_fn = proc_dict[row['Preprocessing']]

            args_init = row['Args']

            if pd.isna(args_init):
                args = None
            else:
                # SECURITY NOTE: eval() executes arbitrary code read from the
                # CSV. Only use with a trusted Fields.csv; if the args are
                # plain literals, ast.literal_eval would be a safer choice.
                args = eval(str(args_init))

            fc = FeatureColumn(row['NHANES Category'], row['Field'], proc_fn, args)
            columns.append(fc)

        nhanes_dataset = NHANES(self.data_path, columns)
        df = nhanes_dataset.process()

        # MCQ220 is the label column; everything else is a feature
        fe_cols = df.drop(['MCQ220'], axis=1)
        features = fe_cols.values
        names = fe_cols.columns
        target = df['MCQ220'].values

        # remove nan labeled samples
        inds_valid = ~np.isnan(target)
        features = features[inds_valid]
        target = target[inds_valid]

        # Put each person in the corresponding bin (3 marks non-response)
        targets = np.full((target.shape[0]), 3)
        targets[target == 1] = 1  # yes cancer
        targets[target == 2] = 0  # no cancer

        # Exclude non-responders
        inds_response = ~(targets == 3)
        features = features[inds_response]
        targets = targets[inds_response]

        # random permutation so downstream train/test splits are unbiased
        perm = np.random.permutation(targets.shape[0])
        self.features = features[perm]
        self.targets = targets[perm]
        # NOTE(review): costs skip columns[0] — presumably the first CSV row
        # is the target field; verify against the Fields.csv ordering
        self.costs = [c.cost for c in columns[1:]]
        self.costs = np.array(
            [item for sublist in self.costs for item in sublist])

        self.names = names