"""
CoCalc -- Collaborative Calculation in the Cloud: nhanes.py

Processing functions for NHANES data.
"""

import pdb
import glob
import copy
import os
import pickle
import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import sklearn.feature_selection



class FeatureColumn:
    """Describes a single NHANES field: the category (sheet) it lives in,
    the field name, an optional preprocessing callable with its keyword
    arguments, and an optional acquisition cost."""

    def __init__(self, category, field, preprocessor, args=None, cost=None):
        self.category, self.field = category, field
        self.preprocessor, self.args = preprocessor, args
        self.data = None  # raw column values, filled in by NHANES.process()
        self.cost = cost  # expanded into a per-output-column list by process()

class NHANES:
    """Loads NHANES .XPT files and assembles a preprocessed feature table.

    Parameters:
        db_path -- root directory holding one sub-directory per NHANES
                   category (FeatureColumn.category).
        columns -- list of FeatureColumn objects describing the fields
                   to load and how to preprocess them.
    """
    def __init__(self, db_path=None, columns=None):
        self.db_path = db_path
        self.columns = columns  # deprecated
        self.dataset = None  # deprecated
        self.column_data = None
        self.column_info = None
        self.df_features = None
        self.df_targets = None
        self.costs = None

    def process(self):
        """Read every requested field from disk, preprocess it, and return
        the combined DataFrame (also stored in ``self.dataset``).

        Raises:
            Exception: if no data file contributed data for a requested field.
        """
        # cache parsed files: several FeatureColumns often share one .XPT file
        cache = {}
        # collect relevant data, one concatenated frame per FeatureColumn
        df = []
        for fe_col in self.columns:
            sheet = fe_col.category
            field = fe_col.field

            data_files = glob.glob(self.db_path + sheet + '/*.XPT')

            df_col = []
            for dfile in data_files:
                # files known to be broken/unreadable are skipped outright
                bad_files = ['CDC/SELECTED/Questionnaire/PAQ_G.XPT',
                             'CDC/SELECTED/Questionnaire/PAQ_D.XPT']
                if dfile in bad_files:
                    continue

                # read the file (or reuse the cached parse)
                if dfile in cache:
                    df_tmp = cache[dfile]
                else:
                    df_tmp = pd.read_sas(dfile)
                    cache[dfile] = df_tmp
                # skip if there is no SEQN (respondent id) column
                if 'SEQN' not in df_tmp.columns:
                    continue
                # skip if the requested field is absent from this file
                sel_cols = set(df_tmp.columns).intersection([field])
                if not sel_cols:
                    continue
                df_tmp = df_tmp[['SEQN'] + list(sel_cols)]
                df_tmp.set_index('SEQN', inplace=True)
                df_col.append(df_tmp)

            try:
                df_col = pd.concat(df_col)
            except ValueError as exc:
                # pd.concat raises ValueError on an empty list, i.e. no file
                # supplied this field; narrow except (the old bare `except:`
                # also swallowed KeyboardInterrupt/SystemExit) and chain cause
                raise Exception('Failed to process ' + field) from exc
            df.append(df_col)
        df = pd.concat(df, axis=1)

        # do preprocessing steps
        df_proc = []
        for fe_col in self.columns:
            field = fe_col.field
            fe_col.data = df[field].copy()
            # do preprocessing
            if fe_col.preprocessor is not None:
                if fe_col.args is not None:
                    prepr_col = fe_col.preprocessor(df[field], **fe_col.args)
                else:
                    prepr_col = fe_col.preprocessor(df[field])
            else:
                prepr_col = df[field]
            # a preprocessor may expand one field into several columns
            # (e.g. one-hot); replicate the cost once per produced column
            if len(prepr_col.shape) > 1:
                fe_col.cost = [fe_col.cost] * prepr_col.shape[1]
            else:
                fe_col.cost = [fe_col.cost]
            df_proc.append(prepr_col)
        self.dataset = pd.concat(df_proc, axis=1)

        return self.dataset


# Preprocessing functions
def preproc_onehot(df_col, args=None):
    """One-hot encode a column; output columns are named '<field>#<value>'."""
    encoded = pd.get_dummies(df_col, prefix=df_col.name, prefix_sep='#')
    return encoded

def preproc_real(df_col, cutoff=np.inf):
    """Treat values above ``cutoff`` as missing, impute with the column mean,
    then z-score normalize.

    Works on a copy: the original version assigned through a boolean mask on
    the caller's Series, mutating it in place (a chained-assignment hazard).
    """
    df_col = df_col.copy()
    # answers above the cutoff (e.g. refused/don't-know codes) become NaN
    df_col[df_col > cutoff] = np.nan
    # NaN replaced by the column mean (same behavior as preproc_impute)
    df_col = df_col.fillna(df_col.mean())
    # statistical (z-score) normalization
    return (df_col - df_col.mean()) / df_col.std()

def preproc_impute(df_col, args=None):
    """Return the column with NaN entries replaced by the column mean.

    ``fillna`` is the idiomatic equivalent of the old boolean-mask
    assignment, and returns a new Series instead of mutating the
    caller's column in place.
    """
    return df_col.fillna(df_col.mean())

def preproc_cut(df_col, bins):
    """Bin a numeric column into integer bin labels (0 .. len(bins)-2).

    Values at or outside the outer bin edges are treated as missing and
    imputed with the column mean before cutting. Works on a copy: the
    original mutated the caller's Series in place.
    """
    df_col = df_col.copy()
    # limit values to the bins range: out-of-range entries become NaN ...
    df_col[df_col <= bins[0]] = np.nan
    df_col[df_col >= bins[-1]] = np.nan
    # ... and are imputed with the mean (same behavior as preproc_impute)
    df_col = df_col.fillna(df_col.mean())
    return pd.cut(df_col, bins=bins, labels=False, include_lowest=True)


def preproc_cut_dummy(df_col, bins):
    """Bin a numeric column and one-hot encode the resulting bin labels.

    Values at or outside the outer bin edges become NaN and are dropped by
    the one-hot encoding (their rows are all-zero). Works on a copy: the
    original mutated the caller's Series in place.
    """
    df_col = df_col.copy()
    # limit values to the bins range
    df_col[df_col <= bins[0]] = np.nan
    df_col[df_col >= bins[-1]] = np.nan
    binning = pd.cut(df_col, bins=bins, labels=False, include_lowest=True)
    # one-hot encode with the same naming scheme as preproc_onehot
    return pd.get_dummies(binning, prefix=binning.name, prefix_sep='#')


def preproc_dropna(df_col, args=None):
    """Return the column with all NaN entries removed.

    Returns a new object: the original used ``inplace=True``, mutating the
    caller's column, which the other preprocessors here avoid.
    """
    return df_col.dropna(axis=0, how='any')


def preproc_binary(df_col, args=None):
    """One-hot encode a yes(1)/no(2) NHANES field.

    Codes other than 1 and 2 (e.g. refused/don't-know) are treated as
    missing and produce an all-zero row in the encoding. Vectorized
    replacement for the original element-by-element ``.iloc`` write loop,
    which was O(n) Python-level work and mutated the caller's Series.
    """
    # non-1/2 codes -> NaN; where() preserves the Series name for prefixing
    cleaned = df_col.where(df_col.isin([1, 2]))
    # same naming scheme as preproc_onehot: '<field>#<value>'
    return pd.get_dummies(cleaned, prefix=cleaned.name, prefix_sep='#')


#### Add your own preprocessing functions ####

#def preproc_binary(df_col, args=None):




# Dataset loader
# Dataset loader
class Dataset:
    """
    Dataset manager class: builds feature/target arrays from NHANES data.
    """
    def __init__(self, data_path=None):
        """
        Class initializer.

        data_path -- root directory of the NHANES .XPT files
                     (defaults to './run_data/').
        """
        # set database path ('is None', not '== None': identity check is the
        # correct idiom for None comparisons)
        if data_path is None:
            self.data_path = './run_data/'
        else:
            self.data_path = data_path
        # feature and target vectors, populated by the load_* methods
        self.features = None
        self.names = None
        self.targets = None
        self.costs = None


    #### Add your own dataset loader ####

    def load_cancer(self, opts=None):
        """Build a binary cancer-prediction dataset described by 'Fields.csv'.

        The target is NHANES field MCQ220 (1 = cancer -> label 1,
        2 = no cancer -> label 0); NaN labels and other responses are
        dropped. Populates self.features, self.targets, self.names and
        self.costs, shuffled with a random permutation.
        """
        data = pd.read_csv('Fields.csv', dtype='str')

        # map preprocessor names used in the CSV onto the functions above
        proc_dict = {'preproc_onehot': preproc_onehot, 'preproc_real': preproc_real, 'preproc_impute': preproc_impute, 'preproc_cut': preproc_cut, 'preproc_cut_dummy': preproc_cut_dummy, 'preproc_dropna': preproc_dropna, 'preproc_binary': preproc_binary, 'None': None}

        columns = []
        nf = data.shape[0]

        for i in range(nf):
            row = data.loc[i]

            proc_fn = proc_dict[row['Preprocessing']]

            args_init = row['Args']

            if pd.isna(args_init):
                args = None
            else:
                # SECURITY NOTE: eval() executes arbitrary code read from the
                # CSV. Only use with a trusted Fields.csv; if the args are
                # plain literals, ast.literal_eval would be a safer choice.
                args = eval(str(args_init))

            fc = FeatureColumn(row['NHANES Category'], row['Field'], proc_fn, args)
            columns.append(fc)

        nhanes_dataset = NHANES(self.data_path, columns)
        df = nhanes_dataset.process()

        # MCQ220 is the label column; everything else is a feature
        fe_cols = df.drop(['MCQ220'], axis=1)
        features = fe_cols.values
        names = fe_cols.columns
        target = df['MCQ220'].values

        # remove nan labeled samples
        inds_valid = ~np.isnan(target)
        features = features[inds_valid]
        target = target[inds_valid]

        # Put each person in the corresponding bin (3 marks non-response)
        targets = np.full((target.shape[0]), 3)
        targets[target == 1] = 1  # yes cancer
        targets[target == 2] = 0  # no cancer

        # Exclude non-responders
        inds_response = ~(targets == 3)
        features = features[inds_response]
        targets = targets[inds_response]

        # random permutation so downstream train/test splits are unbiased
        perm = np.random.permutation(targets.shape[0])
        self.features = features[perm]
        self.targets = targets[perm]
        # NOTE(review): costs skip columns[0] — presumably the first CSV row
        # is the target field; verify against the Fields.csv ordering
        self.costs = [c.cost for c in columns[1:]]
        self.costs = np.array(
            [item for sublist in self.costs for item in sublist])

        self.names = names