"""Turn an Excel matrix of likeliness scores into binary fingerprints.

The script reads an Excel sheet whose column headers are genome IDs and
whose cells hold likeliness values.  Every value is thresholded: values
greater than or equal to 0.90 become 1, everything else becomes 0.  The
resulting column for each genome ID is then written to a text file.

Usage:
    python script.py --input matrix.xlsx --output fingerprints.txt

If --input or --output is omitted, the user is prompted for it.
"""

# argparse handles the --input / --output command-line options
import argparse
# OrderedDict keeps the genome IDs in sheet order on every Python version
from collections import OrderedDict

# imports numpy as needed by the pandas module
import numpy as np

# imports the pandas module to read through the Excel sheet
import pandas as pd

# Likeliness values at or above this cut-off are encoded as 1, others as 0.
LIKELINESS_THRESHOLD = .90

# raw_input only exists on Python 2; fall back to input on Python 3 so the
# interactive prompts work on both.
try:
    _prompt = raw_input
except NameError:
    _prompt = input


def parse_arguments(argv=None):
    """Parse the command-line options.

    argv -- list of argument strings; None means "use sys.argv[1:]".

    Returns the argparse namespace with the attributes ``input`` and
    ``output`` (each is None when the option was not supplied).
    """
    parser = argparse.ArgumentParser()

    # argument asks the user for the name of the file containing the xl data matrix
    parser.add_argument('--input', help='Enter the name of the xl file .')
    # argument asks the user for an output file
    parser.add_argument('--output', help= 'Enter the name of the text file you want to output your aprroximate patterns found in the sequence.')
    return parser.parse_args(argv)


def binarize_column(column, threshold=LIKELINESS_THRESHOLD):
    """Return a thresholded copy of a pandas Series.

    Values >= threshold become 1 and all other values (including NaN,
    which never compares >=) become 0.  The result keeps the index, the
    name and the dtype of ``column``; ``column`` itself is not modified.
    """
    # One vectorized comparison replaces the per-element .ix loop
    # (Series.ix no longer exists in current pandas releases).
    return (column >= threshold).astype(column.dtype)


def build_fingerprints(data_file, threshold=LIKELINESS_THRESHOLD):
    """Map every column header of ``data_file`` to its binary fingerprint.

    For a sheet such as

             A  B  C
        I    4  3  2
        II   5  2  4
        III  3  4  5

    ``data_file.loc[:, 'A']`` selects all rows under header A; that
    column is thresholded and stored under the key 'A'.

    Returns an OrderedDict {header: thresholded Series} in column order.
    If a header occurs more than once only its first occurrence is kept.
    """
    fingerprints = OrderedDict()
    for header in data_file.columns:
        if header not in fingerprints:
            fingerprints[header] = binarize_column(
                data_file.loc[:, header], threshold)
    return fingerprints


def format_fingerprint(genome_id, values):
    """Return the text record written to the output file for one genome."""
    return 'GENOME ID: %s\n%s\n\n' % (genome_id, values)


def write_fingerprints(fingerprints, output):
    """Write one record per genome ID to the file named ``output``.

    The file is opened in a with-block so it is flushed and closed even
    if writing fails part-way through.
    """
    with open(output, 'w') as outputfile:
        for genome_id in fingerprints:
            outputfile.write(
                format_fingerprint(genome_id, fingerprints[genome_id]))


def main(argv=None):
    """Run the script: read the Excel file, threshold it, write the result."""
    arguments = parse_arguments(argv)

    # if the user doesn't pass an input file, prompt them for one
    input_xl = arguments.input
    if input_xl is None:
        input_xl = _prompt('Please enter the name of a xl file : ')

    # if the user doesn't pass an output file, prompt them for one
    output = arguments.output
    if output is None:
        output = _prompt('Please enter the name of a output file : ')

    # Using the pandas module, read the Excel file into a DataFrame
    data_file = pd.read_excel(input_xl)

    write_fingerprints(build_fingerprints(data_file), output)


if __name__ == '__main__':
    main()