Kernel: Python 3 (Ubuntu Linux)
In [2]:
import sys
import os
import re
import glob
import pickle

import numpy as np
import pandas as pd
from lxml import etree
from sklearn.model_selection import StratifiedKFold

from keras.models import Sequential, model_from_json
from keras.layers import Input, Dense, Flatten, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam, RMSprop, SGD
Data
In [0]:
# Map ns-3 header class names to short protocol labels.
proto_dict = {
    'ns3::WifiMacHeader': 'MAC',
    'ns3::LlcSnapHeader': 'LLC',
    'ns3::ArpHeader': 'ARP',
    'ns3::Ipv4Header': 'IPv4',
    'ns3::UdpHeader': 'UDP',
    'ns3::aodv::TypeHeader': 'AODV_Type',
    'ns3::aodv::RrepHeader': 'AODV_RREP',
    'ns3::aodv::RreqHeader': 'AODV_RREQ',
    'ns3::aodv::RrepAckHeader': 'AODV_RACK',
    'ns3::aodv::RerrHeader': 'AODV_RERR',
    'ns3::Icmpv4Header': 'ICMPv4_HEADER',
    'ns3::Icmpv4TimeExceeded': 'ICMPv4_TE',
    'ns3::Icmpv4DestinationUnreachable': 'ICMPv4_DU',
}


def parse_packets(input_file):
    """Extract per-packet protocol fields from the meta-info attributes
    of an ns-3 animation trace XML file."""
    with open(input_file, 'r') as f:
        data = f.read()
    # Every packet record carries a meta-info="..." attribute.
    data = data.split('meta-info="')
    data.pop(0)
    meta_info = [b.split('"')[0] for b in data]
    packets = []
    for m in meta_info:
        # Collect the raw text of every known header present in this record.
        raw = {}
        for proto in proto_dict:
            if m.find(proto) != -1:
                raw[proto_dict[proto]] = m.split(proto + ' (')[1].split(') ns3')[0]
        p = {}
        if 'MAC' in raw and 'LLC' in raw:
            p['MAC'] = {}
            p['MAC']['src'] = raw['MAC'].split('SA=')[1].split(',')[0]
            p['MAC']['dst'] = raw['MAC'].split('DA=')[1].split(',')[0]
            p['MAC']['type'] = int(raw['LLC'].split('type ')[1], 16)
        if 'ARP' in raw:
            p['ARP'] = {}
            p['ARP']['type'] = 'request' if raw['ARP'].find('request') != -1 else 'reply'
            p['ARP']['MAC_dst'] = raw['ARP'].split('dest mac: ')[1].split(' ')[0]
            p['ARP']['MAC_src'] = raw['ARP'].split('source mac: ')[1].split(' ')[0]
            p['ARP']['IP_src'] = raw['ARP'].split('source ipv4: ')[1].split(' ')[0]
            p['ARP']['IP_dst'] = raw['ARP'].split('dest ipv4: ')[1]
        if 'IPv4' in raw:
            p['IPv4'] = {}
            ipv4data = raw['IPv4'].split(' ')
            p['IPv4']['src'] = ipv4data[-3]
            p['IPv4']['dst'] = ipv4data[-1]
            p['IPv4']['proto'] = int(raw['IPv4'].split('protocol ')[1].split(' ')[0])
            p['IPv4']['ttl'] = int(raw['IPv4'].split('ttl ')[1].split(' ')[0])
            p['IPv4']['length'] = int(raw['IPv4'].split('length: ')[1].split(' ')[0])
        if 'UDP' in raw:
            p['UDP'] = {}
            p['UDP']['length'] = int(raw['UDP'].split('length: ')[1].split(' ')[0])
        if 'AODV_Type' in raw:
            p['AODV'] = {}
            p['AODV']['type'] = raw['AODV_Type']
        if 'AODV_RREP' in raw:
            p['AODV']['RREP'] = {}
            p['AODV']['RREP']['dst'] = raw['AODV_RREP'].split('destination: ipv4 ')[1].split(' ')[0]
            p['AODV']['RREP']['src'] = raw['AODV_RREP'].split('source ipv4 ')[1].split(' ')[0]
            p['AODV']['RREP']['SN'] = int(raw['AODV_RREP'].split('sequence number ')[1].split(' ')[0])
            p['AODV']['RREP']['lifetime'] = int(raw['AODV_RREP'].split('lifetime ')[1].split(' ')[0])
            p['AODV']['RREP']['acknowledgment'] = raw['AODV_RREP'].split('acknowledgment ')[1].split(' ')[0]
            p['AODV']['RREP']['flag'] = raw['AODV_RREP'].split('flag ')[1]
        if 'AODV_RREQ' in raw:
            p['AODV']['RREQ'] = {}
            p['AODV']['RREQ']['ID'] = int(raw['AODV_RREQ'].split('ID ')[1].split(' ')[0])
            p['AODV']['RREQ']['dst'] = raw['AODV_RREQ'].split('destination: ipv4 ')[1].split(' ')[0]
            p['AODV']['RREQ']['src'] = raw['AODV_RREQ'].split('source: ipv4 ')[1].split(' ')[0]
            p['AODV']['RREQ']['SN'] = int(raw['AODV_RREQ'].split('sequence number ')[1].split(' ')[0])
            p['AODV']['RREQ']['flags'] = raw['AODV_RREQ'].split('flags: ')[1]
        if 'AODV_RACK' in raw:
            pass  # RREP-ACK headers seem to carry no extractable fields
        if 'AODV_RERR' in raw:
            p['AODV']['RERR'] = raw['AODV_RERR']
        if 'ICMPv4_HEADER' in raw:
            p['ICMPv4'] = {}
            p['ICMPv4']['type'] = int(raw['ICMPv4_HEADER'].split('type=')[1].split(',')[0])
            p['ICMPv4']['code'] = int(raw['ICMPv4_HEADER'].split('code=')[1])
        if 'ICMPv4_TE' in raw:
            p['ICMPv4']['data'] = raw['ICMPv4_TE']
        if 'ICMPv4_DU' in raw:
            p['ICMPv4']['data'] = raw['ICMPv4_DU']
        if len(p) > 0:
            packets.append(p)
    return packets
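A quick smoke test of the parser. The file name below is hypothetical and stands in for any per-run animation trace written by the simulations:
In [0]:
# Hypothetical path; any ns-3 animation XML from a single run will do.
sample = parse_packets('data/normal/output_0/anim.xml')
len(sample), sample[0] if sample else None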
In [0]:
def mac_to_id(mac):
    # Last octet of the MAC address, zero-based: '00:00:00:00:00:01' -> 0
    return int(mac.split(':')[-1], 16) - 1


def ip_to_id(ip):
    # Last octet of the IPv4 address, zero-based: '10.0.0.1' -> 0
    return int(ip.split('.')[-1]) - 1


def entropy(arr):
    # Shannon entropy of a positive count vector, normalized into [0, 1]
    # by dividing by log(len(arr)). The original mixed log bases (bits in
    # the numerator, nats in the denominator); base 2 is used consistently here.
    # Zero counts produce NaN, which is zeroed out later via X[np.isnan(X)] = 0.
    p = arr / arr.sum()
    return -(p * np.log2(p)).sum() / np.log2(len(arr))
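A small sanity check of the normalized entropy on made-up count vectors: a uniform distribution scores 1, a concentrated one close to 0.
In [0]:
# Illustrative counts only.
print(entropy(np.array([5., 5., 5., 5.])))   # uniform -> 1.0
print(entropy(np.array([97., 1., 1., 1.])))  # skewed -> close to 0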
In [0]:
def load_data(path, rng):
    routes = list()
    flows = list()
    packets = list()
    for i in range(rng):
        # 1) Routing tables dumped by ns-3 (*.routes).
        for routes_file in glob.glob('{}/output_{}/*.routes'.format(path, i)):
            info = re.findall(r'(\w+)/output_(\d+)', routes_file)
            with open(routes_file, 'r') as handle:
                data = handle.read()
            nodes = re.split('\n\n', data)
            nodes.pop()
            for node in nodes:
                header = re.findall(r'Node:\s+(\d+)\s+Time:\s+(\d+)', node)
                lines = re.findall(
                    r'(\d{1,3}(?:\.\d{1,3}){3})\s+(\d{1,3}(?:\.\d{1,3}){3})\s+'
                    r'(\d{1,3}(?:\.\d{1,3}){3})\s+(\w+)\s+(-?\d+\.\d+)\s+(\d+)', node)
                for line in lines:
                    l = list(line)
                    l[4] = float(line[4])  # Expire
                    l[5] = int(line[5])    # Hops
                    routes.append(header[0] + tuple(l) + info[0])
        # 2) FlowMonitor statistics (*.flowmon).
        for flowmon_file in glob.glob('{}/output_{}/*.flowmon'.format(path, i)):
            info = re.findall(r'(\w+)/output_(\d+)', flowmon_file)
            with open(flowmon_file) as fobj:
                xml = fobj.read()
            root = etree.fromstring(xml)
            for flow in root.xpath('/FlowMonitor/FlowStats/Flow'):
                attributes = list()
                for attrib in flow.attrib:
                    attr = flow.attrib[attrib]
                    if 'ns' in attr:
                        # Times come as nanosecond strings; keep the integer part.
                        attr = re.findall(r'(\d+)', attr)[0]
                    attributes.append(int(attr))
                flows.append(tuple(attributes) + info[0])
        # 3) Per-packet traces from the animation XML.
        for packets_file in glob.glob('{}/output_{}/*.xml'.format(path, i)):
            if "routingtable-wireless.xml" in packets_file:
                continue
            info = re.findall(r'(\w+)/output_(\d+)', packets_file)
            packets_dict = parse_packets(packets_file)
            table_packets = []
            for p in packets_dict:
                if 'IPv4' in p:
                    table_packets.append([mac_to_id(p['MAC']['src']),
                                          mac_to_id(p['MAC']['dst']),
                                          ip_to_id(p['IPv4']['src']),
                                          ip_to_id(p['IPv4']['dst'])])
            table_packets = pd.DataFrame(table_packets,
                                         columns=['mac_src', 'mac_dst', 'ip_src', 'ip_dst'])
            # Drop broadcasts (x.x.x.255 maps to id 254).
            table_packets = table_packets[table_packets['ip_dst'] != 254]
            table_packets['input'] = 0
            table_packets['output'] = 0
            input_cnt = table_packets.groupby('mac_src').agg({'input': 'count'})
            output_cnt = table_packets.groupby('mac_dst').agg({'output': 'count'})
            io_mac = input_cnt.join(output_cnt)
            io_mac["diff"] = io_mac['input'] - io_mac['output']
            # Shift so every value is >= 1, keeping the entropy well defined.
            io_mac["normalized"] = io_mac["diff"] - io_mac["diff"].min() + 1
            input_stats = io_mac.agg({'input': [entropy, 'min', 'max', 'mean', 'var']}).values.flatten()
            output_stats = io_mac.agg({'output': [entropy, 'min', 'max', 'mean', 'var']}).values.flatten()
            diff_stats = io_mac.agg({'diff': ['min', 'max', 'mean', 'var']}).values.flatten()
            normalized_stats = io_mac.agg({'normalized': [entropy, 'min', 'max', 'mean', 'var']}).values.flatten()
            # Packets whose transmitting/receiving MAC matches the IP endpoint,
            # i.e. originated or terminated at that node rather than forwarded.
            src_count = len(table_packets[table_packets['mac_src'] == table_packets['ip_src']])
            dst_count = len(table_packets[table_packets['mac_dst'] == table_packets['ip_dst']])
            packets.append(np.concatenate((input_stats, output_stats, diff_stats,
                                           normalized_stats, [src_count, dst_count], info[0])))
    # Aggregate routing-table features.
    routes_table = pd.DataFrame(routes, columns=['Node', 'Time', 'Destination', 'Gateway',
                                                 'Interface', 'Flag', 'Expire', 'Hops',
                                                 'Type', 'Test'])
    flag_count = routes_table.groupby(['Type', 'Test', 'Time', 'Flag']).agg({'Flag': ['count']})
    flag_count = flag_count.reset_index(col_level=1)
    flag_count.columns = flag_count.columns.droplevel()
    flag_agg = flag_count.groupby(['Type', 'Test', 'Flag']).agg(
        {'count': ['max', 'min', 'mean', 'var']}).unstack().reset_index()
    flag_agg.columns = [col[0] + col[1] + col[2] for col in flag_agg.columns]
    flag_agg = flag_agg.set_index(['Type', 'Test'])
    hops_node_dest_agg = routes_table.groupby(['Type', 'Test', 'Node', 'Destination']).agg(
        {'Hops': ['min', 'max', 'mean']})
    hops_node_agg = hops_node_dest_agg.reset_index().groupby(['Type', 'Test', 'Node']).agg(
        ['min', 'max', 'mean']).reset_index(col_level=1)
    hops_node_agg.columns = hops_node_agg.columns.droplevel()
    hops_agg = hops_node_agg.reset_index().groupby(['Type', 'Test']).agg(
        ['min', 'max', 'mean']).reset_index(col_level=1)
    hops_agg.columns = [col[0] + col[1] + col[2] for col in hops_agg.columns]
    hops_agg = hops_agg.set_index(['Type', 'Test'])
    # Aggregate FlowMonitor features.
    flows_table = pd.DataFrame(flows, columns=['flowId', 'timeFirstTxPacket', 'timeFirstRxPacket',
                                               'timeLastTxPacket', 'timeLastRxPacket', 'delaySum',
                                               'jitterSum', 'lastDelay', 'txBytes', 'rxBytes',
                                               'txPackets', 'rxPackets', 'lostPackets',
                                               'timesForwarded', 'Type', 'Test'])
    lost_agg = flows_table.groupby(['Type', 'Test']).agg({'lostPackets': ['sum', 'mean']})
    forwarded_agg = flows_table.groupby(['Type', 'Test']).agg(
        {'timesForwarded': ['sum', 'max', 'mean', 'var']})
    # Per-packet features.
    packets_table = pd.DataFrame(packets, columns=[
        'entropy_input', 'min_input', 'max_input', 'mean_input', 'var_input',
        'entropy_output', 'min_output', 'max_output', 'mean_output', 'var_output',
        'min_diff', 'max_diff', 'mean_diff', 'var_diff',
        'entropy_normalized', 'min_normalized', 'max_normalized', 'mean_normalized', 'var_normalized',
        'src_count', 'dst_count', 'Type', 'Test'])
    packets_table = packets_table.set_index(['Type', 'Test'])
    return flag_agg.join(hops_agg).join(lost_agg).join(forwarded_agg).join(packets_table)
In [0]:
normal = load_data("data/normal", 100)
blackhole = load_data("data/blackhole", 100)
grayhole = load_data("data/greyhole", 100)
wormhole = load_data("data/wormhole", 100)
ddos = load_data("data/ddos", 100)
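A quick sanity check that each call produced one row per simulation run (illustrative; the exact feature count depends on the simulation output):
In [0]:
print(normal.shape, blackhole.shape, grayhole.shape, wormhole.shape, ddos.shape)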
In [0]:
# Class 0 = normal traffic, class 1 = attack (blackhole, grayhole, DDoS).
# (The wormhole runs are loaded above but not included in this experiment.)
X = np.concatenate((normal, blackhole, grayhole, ddos)).astype(np.float64)
y = np.concatenate((np.zeros((100, 1)), np.ones((300, 1))))
In [0]:
# Shuffle samples and labels together.
indexes = np.arange(len(y))
np.random.shuffle(indexes)
X = X[indexes]
y = y[indexes]
In [0]:
# Zero out NaNs left by zero counts in entropy() and by variances of single-row groups.
X[np.isnan(X)] = 0
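A quick check that nothing non-finite is left before training:
In [0]:
np.isnan(X).sum()  # expected: 0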
Model
In [0]:
class GAN:
    def __init__(self, shape):
        self.SHAPE = shape
        self.OPTIMIZER = Adam()
        self.compile_models()

    def __generator(self):
        # Maps 1000-dim Gaussian noise to a synthetic feature vector.
        model = Sequential()
        model.add(Dense(2048, input_shape=(1000,)))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(1024))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(256))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(self.SHAPE, activation='sigmoid'))
        # Alternative tried: Dense(2048) followed by six Dense(4096) layers,
        # then the sigmoid output.
        return model

    def __discriminator(self):
        # Binary classifier: scores how "attack-like" a feature vector is.
        model = Sequential()
        model.add(Dense(10, input_shape=(self.SHAPE,)))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(256))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(1024))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(1, activation='sigmoid'))
        # Alternative tried: Dense(10) followed by six Dense(4096) layers,
        # then the sigmoid output.
        return model

    def __stacked(self, generator, discriminator):
        # Generator followed by a frozen discriminator; training this
        # stack updates only the generator's weights.
        discriminator.trainable = False
        model = Sequential()
        model.add(generator)
        model.add(discriminator)
        return model

    def compile_models(self):
        self.generator = self.__generator()
        self.discriminator = self.__discriminator()
        self.generator.compile(loss='binary_crossentropy', optimizer=self.OPTIMIZER)
        # Compile the discriminator while it is still trainable; __stacked
        # freezes it afterwards. (Compiling after freezing, as the original
        # did, would leave the discriminator untrainable in Keras 2.)
        self.discriminator.compile(loss='binary_crossentropy', optimizer=self.OPTIMIZER,
                                   metrics=['accuracy'])
        self.stacked = self.__stacked(self.generator, self.discriminator)
        self.stacked.compile(loss='binary_crossentropy', optimizer=self.OPTIMIZER)

    def save_models(self):
        self.generator.save_weights("generator_weights.h5")
        self.discriminator.save_weights("discriminator_weights.h5")
        self.stacked.save_weights("stacked_weights.h5")
        with open("generator_model.json", "w") as json_file:
            json_file.write(self.generator.to_json())
        with open("discriminator_model.json", "w") as json_file:
            json_file.write(self.discriminator.to_json())
        with open("stacked_model.json", "w") as json_file:
            json_file.write(self.stacked.to_json())

    def load_models(self):
        with open("generator_model.json", "r") as json_file:
            self.generator = model_from_json(json_file.read())
        self.generator.load_weights("generator_weights.h5")
        with open("discriminator_model.json", "r") as json_file:
            self.discriminator = model_from_json(json_file.read())
        self.discriminator.load_weights("discriminator_weights.h5")
        with open("stacked_model.json", "r") as json_file:
            self.stacked = model_from_json(json_file.read())
        self.stacked.load_weights("stacked_weights.h5")
        self.generator.compile(loss='binary_crossentropy', optimizer=self.OPTIMIZER)
        self.discriminator.compile(loss='binary_crossentropy', optimizer=self.OPTIMIZER,
                                   metrics=['accuracy'])
        self.stacked.compile(loss='binary_crossentropy', optimizer=self.OPTIMIZER)

    def train(self, X, y, epochs=200, batch=100, debug=False):
        for cnt in range(epochs):
            # Train the discriminator on a real batch (keeping its attack
            # labels) plus an equally sized synthetic batch labeled 0.
            random_index = np.random.randint(0, len(y) - batch)
            X_batch = X[random_index:random_index + batch]
            y_batch = y[random_index:random_index + batch]
            gen_noise = np.random.normal(0, 1, (batch, 1000))
            synthetic = self.generator.predict(gen_noise)
            x_combined_batch = np.concatenate((X_batch, synthetic))
            y_combined_batch = np.concatenate((y_batch, np.zeros((batch, 1))))
            d_loss = self.discriminator.train_on_batch(x_combined_batch, y_combined_batch)
            # Train the generator to push the frozen discriminator toward 1.
            noise = np.random.normal(0, 1, (batch, 1000))
            y_mislabeled = np.ones((batch, 1))
            g_loss = self.stacked.train_on_batch(noise, y_mislabeled)
            if debug:
                print('epoch: %d, [Discriminator :: d_loss: %f], [Generator :: loss: %f]'
                      % (cnt, d_loss[0], g_loss))
Train
In [0]:
gan = GAN(X.shape[1])
gan.train(X, y, debug=True, epochs=100, batch=20)
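As a quick post-training sanity check, the trained generator can be sampled directly; a minimal sketch (the batch size of 5 is arbitrary):
In [0]:
# Draw a few synthetic feature vectors from the trained generator.
noise = np.random.normal(0, 1, (5, 1000))
synthetic_samples = gan.generator.predict(noise)
synthetic_samples.shape  # expected: (5, X.shape[1])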
Prediction
In [0]:
predict = gan.discriminator.predict(X)
In [0]:
# Accuracy: fraction of samples where the 0.5-thresholded score matches the label.
np.mean(y == (predict > 0.5))
Metrics
In [0]:
from sklearn import metrics

print(metrics.confusion_matrix(y, (predict > 0.5)))
In [0]:
metrics.accuracy_score(y, (predict > 0.5))
In [0]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)
cvscores = []
for train, test in kfold.split(X, y.ravel()):  # ravel(): sklearn expects a 1-d label vector
    # Train a fresh GAN on each fold and score its discriminator on the held-out part.
    gan = GAN(X.shape[1])
    gan.train(X[train], y[train], epochs=100, batch=20, debug=False)
    predict = gan.discriminator.predict(X[test])
    acc = metrics.f1_score(y[test], (predict > 0.5))
    cvscores.append(acc)
    print(acc)
# Mean and standard deviation of the per-fold F1 scores (fractions in [0, 1]).
print("%.2f (+/- %.2f)" % (np.mean(cvscores), np.std(cvscores)))
In [0]:
# The loop above left `predict` holding scores for the last test fold only,
# so recompute it on the full dataset before scoring.
predict = gan.discriminator.predict(X)
metrics.f1_score(y, (predict > 0.5))
In [0]:
predict.max()
In [0]:
gan.save_models()
In [0]:
# Round-trip check: restore the saved models and re-score.
gan.load_models()
predict = gan.discriminator.predict(X)
np.mean(y == (predict > 0.5))