# CoCalc -- ls-bsr_convert.py
#
# Project: Cody L Berkobien - 2017-01-CSCI195/2017-01-CSCI195
# Views: 61
1
#imports numpy as needed by pandas module
2
import numpy as np
3

4
# Import the pandas module to read through the Excel sheet.
5
import pandas as pd
6

7
# Import argparse to handle the command-line arguments.
8
import argparse
9

10

11
parser = argparse.ArgumentParser()
12

13
#argument requires user to input the name of a textfile containing the xl data matrix
14
parser.add_argument('--input', help='Enter the name of the xl file .')
15
#argument requires user for an output file
16
parser.add_argument('--output', help= 'Enter the name of the text file you want to output your aprroximate patterns found in the sequence.')
17
arguments=parser.parse_args()
18

19
#if the user doesnt put in an input file its prompts them to do so
20
if arguments.input == None:
21
    input_xl = raw_input('Please enter the name of a xl file : ')
22
    
23
else:
24
    input_xl = arguments.input
25
    
26
#if the user doesnt put in an input file its prompts them to do so
27
if arguments.output == None:
28
    output = raw_input('Please enter the name of a output file : ')
29
    
30
else:
31
    output = arguments.output
32

33
#USing the pandas module it reads in the excell file into a data structure that can be read/modified in python
34
data_file =pd.read_excel(input_xl)
35

36
# Creates a list of all the headers at the top of the excel file
37
headers = data_file.columns
38

39
#Creates a dictionary. The dictionary will have the genome id's as the keys and a select data set as its value.
40
g_dict= {}
41
values= data_file.index
42

43
#iterates through all of the genome id's in the excell file
44
#Uses the pandas function .loc to select a specific data set. In this case it is saying to set the variable clms to a data set to all the rows under the specific column containing the header we are iterating through. i.e)      A B C
45
    #                                                                                 I  4 3 2
46
    #                                                                                II  5 2 4
47
    #                                                                               III  3 4 5
48
    #  clms = data_file.loc[:(or all rows),h(the firs iteration h=A)]
49
    #print clms
50
    #     A 
51
    #  I  4 
52
    # II  5 
53
    #III  3 
54
#Then if the header is not a key in the dicitonary G dict it will set put the header in the dict with the sub data set coresponding to the 6 fingerprints 
55
for h in headers:
56
    
57
    clms = data_file.loc[:,h]
58
    if h not in g_dict:
59
        g_dict[h]= clms
60
#Next it goes through each of the keys in the dictionary looking at the percent values for each likeliness
61
#If the likeliness factor is less than 90% it sets the value to 0 if it is greater than 90% it sets it equal to 1
62
for keys in g_dict:
63
    
64
    pb = g_dict[keys]
65

66
    for x in range(len(pb)):
67
        if pb.ix[x] >= .90:
68
            pb.ix[x]= 1
69
        
70
        else:
71
            pb.ix[x] = 0
72

73
# Creates a new dictionary with the key as the genome ID and the value a list of the binary numbers
74
binary= {}
75
for keys in g_dict:
76
    p = g_dict[keys]
77
    if keys not in binary:
78
        binary[keys]= []
79
    
80
    
81
    for x in range(len(pb)):
82
        binary[keys].append(p.ix[x])
83
        
84
        
85
       
86
outputfile = open(output,'w')
87

88
# iterates through all the keys in the dictionary and writes the binary fingerprint to an output file.
89
for keys in g_dict:
90
    outputfile.write('GENOME ID: %s\n%s\n\n' %(keys,g_dict[keys]))
91
out=open('o.txt','w')
92

93
    
94

95

96
    
97

98
   
99
    
100
    
101