Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download
Views: 61
1
"""Turn an Excel matrix of likeness scores into binary genome fingerprints.

Every column of the input sheet is handled as one genome ID. Each score in
a column becomes 1 when it is at least 0.90 and 0 otherwise, and the
resulting columns are written to a text file, one "GENOME ID" section per
column.

Usage:
    python script.py --input scores.xlsx --output fingerprints.txt

Either option may be omitted, in which case the script asks for it.
"""

import argparse

# numpy is imported because the pandas module needs it
import numpy as np
# pandas reads the Excel sheet into a DataFrame
import pandas as pd

try:
    # Python 2: raw_input() returns the typed text unevaluated.
    _prompt = raw_input
except NameError:
    # Python 3: input() has the semantics raw_input() had on Python 2.
    _prompt = input

# Scores at or above this value count as a match (1); everything else is 0.
THRESHOLD = .90


def binarize(scores, threshold=THRESHOLD):
    """Return a new Series holding 1 where ``scores >= threshold``, else 0.

    The comparison is vectorized, so it works for any index (not only the
    default 0..n-1 one) and does not modify *scores*. Missing values compare
    as "not >= threshold" and therefore become 0. The result is cast back to
    the dtype of *scores* so that a float column still prints as 1.0 / 0.0.

    Args:
        scores: pandas Series of numeric likeness scores.
        threshold: inclusive cut-off for a 1.

    Returns:
        A pandas Series with the same index and name as *scores*.
    """
    return (scores >= threshold).astype(int).astype(scores.dtype)


def build_fingerprints(data_file, threshold=THRESHOLD):
    """Map every column header of *data_file* to its binarized column.

    Args:
        data_file: pandas DataFrame whose columns are the genome IDs.
        threshold: inclusive cut-off passed on to :func:`binarize`.

    Returns:
        dict of header -> binarized Series. If a header occurs more than
        once, only its first occurrence is stored.
    """
    g_dict = {}
    for header in data_file.columns:
        if header not in g_dict:
            g_dict[header] = binarize(data_file.loc[:, header], threshold)
    return g_dict


def format_fingerprints(fingerprints):
    """Render *fingerprints* as the text that goes into the output file.

    Args:
        fingerprints: dict of genome ID -> fingerprint (anything printable).

    Returns:
        One ``GENOME ID: <id>`` section per entry, each followed by the
        fingerprint and a blank line; the empty string for an empty dict.
    """
    return ''.join(
        'GENOME ID: %s\n%s\n\n' % (genome_id, column)
        for genome_id, column in fingerprints.items()
    )


def parse_arguments(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser()
    # name of the Excel file containing the data matrix
    parser.add_argument('--input', help='Enter the name of the xl file .')
    # name of the text file the fingerprints are written to
    parser.add_argument('--output', help= 'Enter the name of the text file you want to output your aprroximate patterns found in the sequence.')
    return parser.parse_args(argv)


def main(argv=None):
    """Read the Excel sheet, binarize every column and write the result.

    Args:
        argv: optional argument list; defaults to the process command line.
    """
    arguments = parse_arguments(argv)

    # Ask interactively for whatever was not given on the command line.
    input_xl = arguments.input
    if input_xl is None:
        input_xl = _prompt('Please enter the name of a xl file : ')

    output = arguments.output
    if output is None:
        output = _prompt('Please enter the name of a output file : ')

    data_file = pd.read_excel(input_xl)
    g_dict = build_fingerprints(data_file)

    # The context manager guarantees the file is flushed and closed.
    with open(output, 'w') as outputfile:
        outputfile.write(format_fingerprints(g_dict))

    # Historical side effect: an empty 'o.txt' is created/truncated in the
    # working directory and nothing is ever written to it.
    # NOTE(review): confirm nothing depends on this file, then remove it.
    open('o.txt', 'w').close()


if __name__ == "__main__":
    main()
101