diff --git a/scripts/createOptions.py b/scripts/createOptions.py
new file mode 100644
index 0000000..2154416
--- /dev/null
+++ b/scripts/createOptions.py
@@ -0,0 +1,120 @@
+import os, sys
+from collections import OrderedDict
+
+#Name of the options file to write is the only command-line argument
+try:
+    outFileName = sys.argv[1]
+except IndexError:
+    print("File name required")
+    exit()
+
+args = OrderedDict([("Working_Dir", None),
+                    ("Source_Graph_File", None),
+                    ("Source_Attributes_File", None),
+                    ("Target_Graph_File", None),
+                    ("Target_Attributes_File", None),
+                    ("Number_Target_Graph_Nodes", None),
+                    ("Number_Target_Graph_Edges", None),
+                    ("Number_Attributes", None),
+                    ("Attribute_Cardinalities", None),
+                    ("Node_Probability_Threshold", None),
+                    ("Edge_Probability_Threshold", None),
+                    ("Augment_Label_Cardinality", None),
+                    ("Random_Seed", None),
+                    ("Augmentation_Type", None),
+                    ("OpenMP", None),
+                    ("Attributes_Input_Type", None)])
+
+keys = list(args.keys())
+
+for key in keys:
+    val = input(str(key) + "\n")
+
+    #Path Arguments
+    if key == "Working_Dir" and not val.endswith("/"):
+        val = val + "/"
+
+    #Integer Arguments
+    if key in ("Number_Target_Graph_Nodes", "Number_Target_Graph_Edges",
+               "Number_Attributes", "Augment_Label_Cardinality",
+               "Random_Seed", "OpenMP"):
+        while True:
+            try:
+                int(val)
+                break
+            except ValueError:
+                print("This value must be an integer")
+                val = input(str(key) + "\n")
+
+    #List Arguments
+    if key == "Attribute_Cardinalities":
+        while True:
+            try:
+                tmp_val = val.strip().split(" ")
+                #Cardinality count must match Number_Attributes
+                assert len(tmp_val) == int(args["Number_Attributes"])
+                #Every cardinality must be an integer
+                for v in tmp_val:
+                    int(v)
+                break
+            except (AssertionError, ValueError):
+                print("Cardinalities must be " + str(args["Number_Attributes"]) +
+                      " space-separated integers")
+                val = input(str(key) + "\n")
+
+    #Float Arguments
+    if key == "Node_Probability_Threshold" or key == "Edge_Probability_Threshold":
+        while True:
+            try:
+                float(val)
+                break
+            except ValueError:
+                print("This value must be a float")
+                val = input(str(key) + "\n")
+
+    #String Arguments
+    if key == "Augmentation_Type":
+        while val != "Linear" and val != "Logarithmic":
+            print("Augmentation_Type must be 'Linear' or 'Logarithmic'")
+            val = input(str(key) + "\n")
+
+    #Optional argument: kept only if the user supplies the expected value
+    if key == "Attributes_Input_Type":
+        include = True
+        while val != "Node_Id_Implicit":
+            response = input("Would you like to include this parameter (y/n)?")
+            include = response == "y"
+            if include:
+                val = input(str(key) + "\n")
+            else:
+                break
+        if not include:
+            del args[key]
+            continue
+
+    args[key] = val
+
+#Write one "<key> <value>" pair per line
+outFP = open(outFileName, "w")
+for key in args:
+    outFP.write(key + " " + str(args[key]) + "\n")
+outFP.close()
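+
+#Usage sketch (not part of the original script; values below are hypothetical):
+#running `python createOptions.py options.txt` and answering the prompts
+#produces an options file with one "<key> <value>" pair per line, e.g.:
+#    Working_Dir /home/user/experiment/
+#    Number_Attributes 3
+#    Attribute_Cardinalities 2 2 4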
diff --git a/scripts/dns_to_prop_graph.py b/scripts/dns_to_prop_graph.py
new file mode 100644
index 0000000..ee7b09d
--- /dev/null
+++ b/scripts/dns_to_prop_graph.py
@@ -0,0 +1,234 @@
+import collections
+import numpy as np
+import os, sys
+import pandas as pd
+import argparse
+import statistics
+from threading import Thread, Lock
+
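+#Converts a DNS log in CSV form into a property graph. From the field index
+#used below, the input must have a header row with at least the columns
+#'src', 'res' and 'time'; each record adds a src -> res edge (self-loops and
+#reverse edges are skipped), and the 'time' values are summed per node and
+#discretized against the median to produce binary labels.
+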
+#Checks to see if an edge is valid
+def update_edge(key, value, property_map):
+    #No self-loops (i.e. staying on one node)
+    if key == value:
+        return False
+
+    prop_keys = list(property_map.keys())
+    #Check to see if value is already a source node
+    if value in prop_keys:
+        #Check to see if the reverse edge already exists (i.e. a cycle)
+        if key in property_map[value]:
+            return False
+
+    #Check to see if the node is a new node that hasn't been evaluated
+    if key not in prop_keys:
+        property_map[key] = []
+    property_map[key].append(value)
+    return True
+
+#Updates a property for a given node
+def update_property(key, value, property_map):
+    if key not in property_map:
+        property_map[key] = []
+    property_map[key].append(value)
+    return
+
+#Creates IDs for the individual nodes. Node IDs could be written to a file,
+#but that currently isn't implemented
+def set_id(node_id_map, node_str):
+    if node_str not in node_id_map:
+        node_id_map[node_str] = len(node_id_map)
+
+#Creates edges between nodes
+def createEdges(src_map, node_id_map):
+    edge_dict = dict()
+    for key in src_map:
+        edge_dict[int(node_id_map[key])] = []
+        for elem in src_map[key]:
+            edge_dict[int(node_id_map[key])].append(int(node_id_map[elem]))
+    return edge_dict
+
+
+#Takes a property (src_map) and generates a label for each node based on that property
+def createLabels(src_map, node_id_map):
+    node_ids = list(node_id_map.values())
+    node_keys = list(node_id_map.keys())
+
+    label_dict = dict.fromkeys(node_ids)
+    src_keys = list(src_map.keys())
+
+    threads = []
+    lock = Lock()
+    numActiveThreads = 0
+
+    for i, node_key in enumerate(node_keys):
+        t = Thread(target=setLabels, args=(node_key, node_id_map, src_keys, src_map, label_dict, lock))
+        threads.append(t)
+        t.start()
+        numActiveThreads += 1
+        #Join in batches of 500 to bound the number of live threads
+        if numActiveThreads == 500:
+            for j in reversed(range(numActiveThreads)):
+                threads[i - j].join()
+            numActiveThreads = 0
+
+    #Join the remaining threads of the final partial batch
+    for i in range(len(threads) - numActiveThreads, len(threads)):
+        threads[i].join()
+
+    discretizeData(label_dict)
+    return label_dict
+
+#Sets the (pre-discretization) label for one node: the sum of its property values
+def setLabels(node_key, node_id_map, src_keys, src_map, label_dict, lock):
+    label_dict[node_id_map[node_key]] = 0
+    if node_key in src_keys:
+        for val in src_map[node_key]:
+            lock.acquire()
+            label_dict[node_id_map[node_key]] += val
+            lock.release()
+
+
+#Since the data is not categorical, it must be discretized against the median to create a label
+def discretizeData(labels):
+    keys = list(labels.keys())
+    values = list(labels.values())
+    med = statistics.median(values)
+
+    threads = []
+    lock = Lock()
+    numActiveThreads = 0
+
+    for i, key in enumerate(keys):
+        t = Thread(target=discretizeThread, args=(key, labels, med, lock))
+        threads.append(t)
+        t.start()
+        numActiveThreads += 1
+        if numActiveThreads == 500:
+            for j in reversed(range(numActiveThreads)):
+                threads[i - j].join()
+            numActiveThreads = 0
+    for i in range(len(threads) - numActiveThreads, len(threads)):
+        threads[i].join()
+
+#Discretizes the property for an individual node
+def discretizeThread(key, labels, med, lock):
+    lock.acquire()
+    labels[key] = 1 if labels[key] > med else 0
+    lock.release()
+
+
+#Main function. Reads the input file, generates edges and labels, and
+#writes the output to outDir
+def processData(outDir, fieldIndex, fP):
+    sep = ','
+    node_id_map = dict()
+    edges = {}
+    src_times = {}
+    res_times = {}
+    totalEdges = 0
+
+    #Read and parse the input file
+    while True:
+        line = fP.readline()
+        if line == '':
+            break
+        totalEdges += 1
+        line = line.strip().replace(" ", "")
+        tokens = line.split(sep)
+        src = tokens[fieldIndex['src']]
+        res = tokens[fieldIndex['res']]
+        time = int(tokens[fieldIndex['time']])
+
+        if not update_edge(src, res, edges):
+            continue
+
+        update_property(src, time, src_times)
+        update_property(res, time, res_times)
+        set_id(node_id_map, src)
+        set_id(node_id_map, res)
+
+    fP.close()
+    #Dictionary of edges plus one label per property
+    edge_dict = createEdges(edges, node_id_map)
+    src_time_dict = createLabels(src_times, node_id_map)
+    res_time_dict = createLabels(res_times, node_id_map)
+
+    edge_keys = list(edge_dict.keys())
+    node_ids = list(node_id_map.values())
+
+    print("Total Nodes:\t", len(node_ids))
+    print("Total Edges:\t", totalEdges)
+
+    edgeFile = open(outDir + ".edges", "w")
+    labelFile = open(outDir + '-labels.txt', "w")
+    for key in node_ids:
+        edge_list = []
+        if key in edge_keys:
+            edge_list = edge_dict[key]
+        for i in range(len(edge_list)):
+            edgeFile.write(str(key) + "\t" + str(edge_list[i]) + "\n")
+        #One label line per node, regardless of how many edges it has
+        labelFile.write(str(src_time_dict[key]) + "\t" + str(res_time_dict[key]) + "\n")
+    edgeFile.close()
+    labelFile.close()
+
+
+#Creates the field index based on the input header
+def createFieldIndex(path):
+    fieldIndex = {"src": None, "res": None, "time": None}
+    header = ""
+    sep = ","
+    fp = open(path, "r")
+    #Skip any empty lines before the header
+    while header == "":
+        header = fp.readline()
+
+    tokens = []
+    for h in header.strip().split(sep):
+        if h == '':
+            continue
+        tokens.append(h.strip())
+
+    fieldIndex = getIndex(fieldIndex, tokens)
+    status, invalidCol = checkFieldIndex(fieldIndex)
+    if status:
+        return fieldIndex, fp
+    else:
+        print("Error. Cannot find column corresponding to " + str(invalidCol))
+        exit()
+
+
+#Creates an index for each required header element in the file
+def getIndex(fieldIndex, header):
+    keys = list(fieldIndex.keys())
+    for i, col in enumerate(header):
+        if col in keys:
+            fieldIndex[col] = i
+    return fieldIndex
+
+#Checks that src, res and time all have values
+def checkFieldIndex(fieldIndex):
+    for key in fieldIndex:
+        if fieldIndex[key] is None:
+            return False, key
+    return True, None
+
+def driver():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('inPath')
+    parser.add_argument('outPath')
+    args = parser.parse_args()
+
+    path = args.inPath
+    outDir = args.outPath
+    fieldIndex, filePointer = createFieldIndex(path)
+
+    fileName = os.path.split(path)[1]
+    outDir = outDir + fileName if outDir.endswith("/") else outDir + "/" + fileName
+
+    processData(outDir, fieldIndex, filePointer)
+
+
+if __name__ == '__main__':
+    driver()
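+
+#Hypothetical invocation (file names are illustrative only):
+#    python dns_to_prop_graph.py dns_records.csv output/
+#would write output/dns_records.csv.edges and output/dns_records.csv-labels.txt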
diff --git a/scripts/netflow_to_prop_graph.py b/scripts/netflow_to_prop_graph.py
new file mode 100644
index 0000000..2bcad4f
--- /dev/null
+++ b/scripts/netflow_to_prop_graph.py
@@ -0,0 +1,222 @@
+import collections
+#import networkx as nx
+import numpy as np
+import os, sys
+import pandas as pd
+
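+#Converts a NetFlow-style CSV (Argus/binetflow-like column names) into a
+#property graph. From the field index used below, the header must contain the
+#columns SrcAddr, DstAddr, Sport, Dport, Proto, TotPkts and TotBytes. Per-node
+#counters are discretized against their median into binary labels, and node
+#ids are remapped to consecutive integers at the end.
+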
+#Adds value to the set of values recorded for key
+def update_property(prop_map, key, value):
+    if key not in prop_map:
+        prop_map[key] = set()
+    prop_map[key].add(value)
+    return
+
+#Adds count to the running total for key
+def update_count(counter, key, count):
+    counter[key] += int(count)
+    return
+
+#Binarizes a numeric property: 1 if above the median, 0 otherwise
+def discretize_values(prop_map):
+    median = np.median(np.asarray(list(prop_map.values())))
+    new_map = dict()
+    for k, v in prop_map.items():
+        new_map[k] = 1 if v > median else 0
+    return new_map
+
+#Maps a node string (IP address) to a stable integer id
+def get_id(node_id_map, node_str):
+    if node_str not in node_id_map:
+        node_id_map[node_str] = str(len(node_id_map))
+    return node_id_map[node_str]
+
+#Rewrites the temporary edge and label files using integer node ids and
+#writes the string-to-id mapping to <prefix>.ids.tsv
+def transform_node_ids(prefix):
+    node_id_map = dict()
+    in_edge_path = prefix + '.TEMP.edges'
+    in_labels_path = prefix + '.TEMP-labels.txt'
+    out_edge_path = prefix + '.edges'
+    out_labels_path = prefix + '-labels.txt'
+    out_ids_path = prefix + '.ids.tsv'
+
+    f_out_edges = open(out_edge_path, 'w')
+    with open(in_edge_path) as f_in:
+        for line in f_in:
+            tokens = line.strip().split('\t')
+            u = get_id(node_id_map, tokens[0])
+            v = get_id(node_id_map, tokens[1])
+            f_out_edges.write(u + '\t' + v + '\n')
+    f_out_edges.close()
+
+    f_out_labels = open(out_labels_path, 'w')
+    line_count = 0
+    with open(in_labels_path) as f_in:
+        node_labels = dict()
+        for line in f_in:
+            line_count += 1
+            #Skip the header line written by pandas
+            if line_count == 1:
+                continue
+            tokens = line.strip().split('\t')
+            node_labels[get_id(node_id_map, tokens[0])] = '\t'.join(tokens[1:])
+        #Write labels in numeric node-id order
+        for node_id in sorted(node_labels.keys(), key=int):
+            f_out_labels.write(node_labels[node_id] + '\n')
+    f_out_labels.close()
+
+    with open(out_ids_path, 'w') as f:
+        for k in sorted(node_id_map.keys()):
+            f.write(k + '\t' + node_id_map[k] + '\n')
+
+#Reads the flow file, builds the edge list and per-node counters, and writes
+#the temporary edge and label files
+def process_flow(path, netflow_field_index, out_prefix):
+    sep = ','
+    line_count = 0
+    edge_list = []
+    incoming_ports = dict()
+    incoming_protocols = dict()
+    outgoing_protocols = dict()
+    incoming_packets = collections.Counter()
+    outgoing_packets = collections.Counter()
+    incoming_bytes = collections.Counter()
+    outgoing_bytes = collections.Counter()
+    num_in_flows = collections.Counter()
+    num_out_flows = collections.Counter()
+
+    with open(path) as f_in:
+        print('Reading file ...')
+        last_s = ''
+        last_d = ''
+        last_s_port = ''
+        last_d_port = ''
+        for line in f_in:
+            line_count += 1
+            #Skip the header line
+            if line_count == 1:
+                continue
+            tokens = [t.strip() for t in line.strip().split(sep)]
+            src_ip = tokens[netflow_field_index['SrcAddr']]
+            dst_ip = tokens[netflow_field_index['DstAddr']]
+            src_port = tokens[netflow_field_index['Sport']]
+            dst_port = tokens[netflow_field_index['Dport']]
+            #Skip a record that is the immediate reverse of the previous flow
+            if src_ip == last_d and dst_ip == last_s and src_port == last_d_port and dst_port == last_s_port:
+                continue
+            protocol = tokens[netflow_field_index['Proto']]
+            num_packets = tokens[netflow_field_index['TotPkts']]
+            num_bytes = tokens[netflow_field_index['TotBytes']]
+            edge_list.append(src_ip + '\t' + dst_ip)
+            update_property(incoming_ports, dst_ip, dst_port)
+            update_property(incoming_protocols, dst_ip, protocol)
+            update_property(outgoing_protocols, src_ip, protocol)
+
+            update_count(incoming_packets, dst_ip, num_packets)
+            update_count(outgoing_packets, src_ip, num_packets)
+            update_count(incoming_bytes, dst_ip, num_bytes)
+            update_count(outgoing_bytes, src_ip, num_bytes)
+            num_in_flows[dst_ip] += 1
+            num_out_flows[src_ip] += 1
+
+            last_s = src_ip
+            last_d = dst_ip
+            last_s_port = src_port
+            last_d_port = dst_port
+
+    print('Read ' + str(line_count - 1) + ' records')
+    print('Number of edges = ' + str(len(edge_list)))
+    print('Building additional counters ...')
+    incoming_port_counts = dict([(k, len(v)) for k, v in incoming_ports.items()])
+    incoming_protocol_counts = dict([(k, len(v)) for k, v in incoming_protocols.items()])
+    outgoing_protocol_counts = dict([(k, len(v)) for k, v in outgoing_protocols.items()])
+
+    print('Computing binary features ...')
+    incoming_port_counts_bin = discretize_values(incoming_port_counts)
+    incoming_protocol_counts_bin = discretize_values(incoming_protocol_counts)
+    outgoing_protocol_counts_bin = discretize_values(outgoing_protocol_counts)
+    incoming_packets_bin = discretize_values(incoming_packets)
+    outgoing_packets_bin = discretize_values(outgoing_packets)
+    incoming_bytes_bin = discretize_values(incoming_bytes)
+    outgoing_bytes_bin = discretize_values(outgoing_bytes)
+    incoming_flows_bin = discretize_values(num_in_flows)
+    outgoing_flows_bin = discretize_values(num_out_flows)
+
+    print('Creating input to dataframe ...')
+    #Only three of the binary features are written out; the protocol, packet
+    #and flow-count features are computed above but left out of the labels
+    label_data = {'incoming_port_counts_high': pd.Series(list(incoming_port_counts_bin.values()),
+                                                         index=incoming_port_counts_bin.keys()),
+                  'incoming_bytes_high': pd.Series(list(incoming_bytes_bin.values()),
+                                                   index=incoming_bytes_bin.keys()),
+                  'outgoing_bytes_high': pd.Series(list(outgoing_bytes_bin.values()),
+                                                   index=outgoing_bytes_bin.keys())}
+
+    out_temp_edges = out_prefix + '.TEMP.edges'
+    out_temp_labels = out_prefix + '.TEMP-labels.txt'
+    with open(out_temp_edges, 'w') as f:
+        for e in edge_list:
+            f.write(e + '\n')
+
+    print('Creating dataframe ...')
+    label_df = pd.DataFrame.from_dict(label_data).fillna(0).astype(np.int8)
+    print('Saving dataframes ...')
+    label_df.to_csv(out_temp_labels, sep='\t', index_label='ip_addr')
+
+    transform_node_ids(out_prefix)
+
+#Creates the field index based on the input header
+def createFieldIndex(path):
+    field_index = {"SrcAddr": None, "DstAddr": None, 'Sport': None, 'Dport': None,
+                   'Proto': None, 'TotPkts': None, 'TotBytes': None}
+    header = ""
+    sep = ","
+    with open(path) as fp:
+        #In case the file has an empty line prior to the header
+        while header == "":
+            header = fp.readline()
+
+    tokens = []
+    for h in header.strip().split(sep):
+        if h == '':
+            continue
+        tokens.append(h.strip())
+
+    field_index = getIndex(field_index, tokens)
+    status, invalidCol = checkFieldIndex(field_index)
+    if status:
+        return field_index
+    else:
+        print("Error. Cannot find column corresponding to " + str(invalidCol))
+        exit()
+
+
+#Creates an index for each required header element in the file
+def getIndex(field_index, header):
+    keys = list(field_index.keys())
+    for i, col in enumerate(header):
+        if col in keys:
+            field_index[col] = i
+    return field_index
+
+#Checks that every required column was found in the header
+def checkFieldIndex(field_index):
+    for key in field_index:
+        if field_index[key] is None:
+            return False, key
+    return True, None
+
+def driver():
+    if len(sys.argv) < 3:
+        print("File Expects 2 Arguments:")
+        print("1. Path To Data")
+        print("2. Path To Output")
+        exit()
+    path = sys.argv[1]
+    out_dir = sys.argv[2]
+    field_index = createFieldIndex(path)
+    out_prefix = out_dir + '/' + os.path.basename(path)
+    process_flow(path, field_index, out_prefix)
+
+if __name__ == '__main__':
+    driver()
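+
+#Hypothetical invocation (file names are illustrative only):
+#    python netflow_to_prop_graph.py capture.binetflow output
+#would write output/capture.binetflow.edges, output/capture.binetflow-labels.txt
+#and output/capture.binetflow.ids.tsv (plus the intermediate .TEMP files)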