import pdb
import glob
import copy
import os
import pickle
import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import sklearn.feature_selection


class FeatureColumn:
    """Specification of one NHANES field: its sheet (category), field code,
    preprocessing function with optional arguments, and acquisition cost."""
    def __init__(self, category, field, preprocessor, args=None, cost=None):
        self.category = category
        self.field = field
        self.preprocessor = preprocessor
        self.args = args
        self.data = None
        self.cost = cost

class NHANES:
    """Reads the raw NHANES XPT files and assembles the requested columns into
    a single preprocessed DataFrame indexed by respondent SEQN."""
    def __init__(self, db_path=None, columns=None):
        self.db_path = db_path
        self.columns = columns   # deprecated
        self.dataset = None      # deprecated
        self.column_data = None
        self.column_info = None
        self.df_features = None
        self.df_targets = None
        self.costs = None

    def process(self):
        cache = {}
        # collect the relevant data, one column per requested field
        df = []
        for fe_col in self.columns:
            sheet = fe_col.category
            field = fe_col.field
            data_files = glob.glob(self.db_path + sheet + '/*.XPT')

            df_col = []
            for dfile in data_files:
                # files known to be problematic for this layout
                bad_files = ['CDC/SELECTED/Questionnaire/PAQ_G.XPT',
                             'CDC/SELECTED/Questionnaire/PAQ_D.XPT']
                if dfile in bad_files:
                    continue
                # read the file (cached so each XPT is parsed only once)
                if dfile in cache:
                    df_tmp = cache[dfile]
                else:
                    df_tmp = pd.read_sas(dfile)
                    cache[dfile] = df_tmp
                # skip if there is no SEQN (respondent id) column
                if 'SEQN' not in df_tmp.columns:
                    continue
                # skip if there is nothing interesting there
                sel_cols = set(df_tmp.columns).intersection([field])
                if not sel_cols:
                    continue
                df_tmp = df_tmp[['SEQN'] + list(sel_cols)]
                df_tmp.set_index('SEQN', inplace=True)
                df_col.append(df_tmp)

            try:
                df_col = pd.concat(df_col)
            except Exception:
                raise Exception('Failed to process ' + field)
            df.append(df_col)
        df = pd.concat(df, axis=1)

        # apply the per-column preprocessing steps
        df_proc = []
        for fe_col in self.columns:
            field = fe_col.field
            fe_col.data = df[field].copy()
            if fe_col.preprocessor is not None:
                if fe_col.args is not None:
                    prepr_col = fe_col.preprocessor(df[field], **fe_col.args)
                else:
                    prepr_col = fe_col.preprocessor(df[field])
            else:
                prepr_col = df[field]
            # one-to-many preprocessors (e.g. one-hot) expand the cost list accordingly
            if len(prepr_col.shape) > 1:
                fe_col.cost = [fe_col.cost] * prepr_col.shape[1]
            else:
                fe_col.cost = [fe_col.cost]
            df_proc.append(prepr_col)
        self.dataset = pd.concat(df_proc, axis=1)
        return self.dataset

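
# A minimal usage sketch (illustrative, not part of the original pipeline): build a few
# FeatureColumn specs by hand and run NHANES.process() on them. The category names,
# field codes, and db_path below are assumptions and must match the local XPT layout.
def _example_nhanes_usage(db_path='./run_data/'):
    columns = [
        FeatureColumn('Questionnaire', 'MCQ220', None, None),                  # target: cancer diagnosis
        FeatureColumn('Demographics', 'RIDAGEYR', preproc_real, None),         # age in years, z-scored
        FeatureColumn('Examination', 'BMXBMI', preproc_real, {'cutoff': 60}),  # BMI, values above 60 treated as missing
    ]
    return NHANES(db_path, columns).process()
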
# Preprocessing functions
def preproc_onehot(df_col, args=None):
    # expand a categorical column into indicator (dummy) columns
    return pd.get_dummies(df_col, prefix=df_col.name, prefix_sep='#')

def preproc_real(df_col, cutoff=np.inf):
    # answers above the cutoff (e.g. "refused"/"don't know" codes) become missing
    df_col[df_col > cutoff] = np.nan
    # NaN replaced by the column mean
    df_col = preproc_impute(df_col)
    # statistical (z-score) normalization
    df_col = (df_col - df_col.mean()) / df_col.std()
    return df_col

def preproc_impute(df_col, args=None):
    # NaN replaced by the column mean
    df_col[pd.isna(df_col)] = df_col.mean()
    return df_col

def preproc_cut(df_col, bins):
    # values outside the bin range are treated as missing, then imputed
    df_col[df_col <= bins[0]] = np.nan
    df_col[df_col >= bins[-1]] = np.nan
    df_col = preproc_impute(df_col)
    return pd.cut(df_col, bins=bins, labels=False, include_lowest=True)

def preproc_cut_dummy(df_col, bins):
    # values outside the bin range become missing; the resulting bin index is one-hot encoded
    df_col[df_col <= bins[0]] = np.nan
    df_col[df_col >= bins[-1]] = np.nan
    binning = pd.cut(df_col, bins=bins, labels=False, include_lowest=True)
    return preproc_onehot(binning)

def preproc_dropna(df_col, args=None):
    # drop rows with any missing value
    df_col.dropna(axis=0, how='any', inplace=True)
    return df_col

def preproc_binary(df_col, args=None):
    # keep only the NHANES yes/no codes (1/2); everything else becomes missing
    for i in range(len(df_col)):
        if df_col.iloc[i] != 1 and df_col.iloc[i] != 2:
            df_col.iloc[i] = np.nan
    return preproc_onehot(df_col)


#### Add your own preprocessing functions ####

# def preproc_binary(df_col, args=None):

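
# An illustrative custom preprocessing function (a sketch, not part of the original
# field list): log-scale a strictly positive column, then impute and z-score it.
# The name preproc_log_real and the choice of transform are assumptions.
def preproc_log_real(df_col, cutoff=np.inf):
    df_col = df_col.copy()
    # out-of-range or non-positive answers are treated as missing
    df_col[df_col > cutoff] = np.nan
    df_col[df_col <= 0] = np.nan
    df_col = np.log(df_col)
    # NaN replaced by the column mean, then z-score normalization
    df_col = preproc_impute(df_col)
    return (df_col - df_col.mean()) / df_col.std()
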
# Dataset loader
class Dataset():
    """
    Dataset manager class
    """
    def __init__(self, data_path=None):
        """
        Class initializer.
        """
        # set database path
        if data_path is None:
            self.data_path = './run_data/'
        else:
            self.data_path = data_path
        # feature and target vectors
        self.features = None
        self.names = None
        self.targets = None
        self.costs = None

    #### Add your own dataset loader ####

    def load_cancer(self, opts=None):
        data = pd.read_csv('Fields.csv', dtype='str')

        proc_dict = {
            'preproc_onehot': preproc_onehot,
            'preproc_real': preproc_real,
            'preproc_impute': preproc_impute,
            'preproc_cut': preproc_cut,
            'preproc_cut_dummy': preproc_cut_dummy,
            'preproc_dropna': preproc_dropna,
            'preproc_binary': preproc_binary,
            'None': None,
        }

        # build one FeatureColumn per row of Fields.csv
        columns = []
        nf = data.shape[0]
        for i in range(nf):
            row = data.loc[i]
            proc_fn = proc_dict[row['Preprocessing']]
            args_init = row['Args']
            if pd.isna(args_init):
                args = None
            else:
                # Args holds a Python expression with the keyword arguments (used as **kwargs)
                args = eval(str(args_init))
            fc = FeatureColumn(row['NHANES Category'], row['Field'], proc_fn, args)
            columns.append(fc)

        nhanes_dataset = NHANES(self.data_path, columns)
        df = nhanes_dataset.process()

        # MCQ220 ("ever told you had cancer or malignancy") is the prediction target
        fe_cols = df.drop(['MCQ220'], axis=1)
        features = fe_cols.values
        names = fe_cols.columns
        target = df['MCQ220'].values

        # remove samples with a NaN label
        inds_valid = ~np.isnan(target)
        features = features[inds_valid]
        target = target[inds_valid]

        # map the NHANES answer codes onto binary labels
        targets = np.full((target.shape[0]), 3)
        targets[target == 1] = 1  # yes cancer
        targets[target == 2] = 0  # no cancer

        # exclude non-responders (refused / don't know)
        inds_response = ~(targets == 3)
        features = features[inds_response]
        targets = targets[inds_response]

        # random permutation of the samples
        perm = np.random.permutation(targets.shape[0])
        self.features = features[perm]
        self.targets = targets[perm]
        # flatten the per-column cost lists into one cost vector (skipping the target column)
        self.costs = [c.cost for c in columns[1:]]
        self.costs = np.array(
            [item for sublist in self.costs for item in sublist])

        self.names = names
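
# A minimal usage sketch (assumption: Fields.csv and the ./run_data/ XPT folders
# are present as described above); it is not part of the original module.
if __name__ == '__main__':
    ds = Dataset()
    ds.load_cancer()
    print('features:', ds.features.shape)
    print('targets: ', ds.targets.shape)
    print('costs:   ', ds.costs.shape)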