CoCalc -- group7

Project: Liyue Chang - BIOL60201-ProgrammingSkills-2021
Views: ⁵⁸⁵
Image: ubuntu2004
1
#task4-group7
2
#the ion statistics calculator
3
#Liyue Chang
4
#import necessary libraries
5
import argparse
6

7
#use argparse to define command lines
8
parser=argparse.ArgumentParser()
9
parser.add_argument("inputfile",
10
                    help="directory to the input file")
11
parser.add_argument("-i", "--minimum_mass_to_charge_ratio_range",
12
                    help="the minimum range of mass-to-charge of the peptide fragment", type=float,default=1000)
13
parser.add_argument("-a", "--maximum_mass_to_charge_ratio_range",
14
                    help="the maximum range of mass-to-charge of the peptide fragment", type=float,default=1500)
15
parser.add_argument("-w", "--window_size",
16
                    help="size of the sliding window", type=float, default=0.5)
17
parser.add_argument("-s", "--step_size",
18
                    help="size of steps the sliding window slides(must be smaller than window size)", type=float, default=0.2)
19
parser.add_argument("-m", "--match_pattern",
20
                    help="how the input amino acid matches the sequence", choices=['start with','contain'], default='contain')
21
parser.add_argument("-p", "--amino_acid_matches_sequence",
22
                    help="input amino acid matching the sequence",
23
                    choices=['K','N','R','T','S','I','M','Q','H','P','L','E','D','A','G','V','Y','S','C','W','F','no'],default='no')
24
parser.add_argument("create_file",
25
                    help="name of the output file")
26

27
args = parser.parse_args()
28
filename=args.inputfile
29
min_mz=round(args.minimum_mass_to_charge_ratio_range,4)
30
max_mz=round(args.maximum_mass_to_charge_ratio_range,4)
31
window_size=args.window_size
32
step_size=args.step_size
33
match_pattern=args.match_pattern
34
pep_find=args.amino_acid_matches_sequence
35
create_file=args.create_file
36

37
#lists of useful elements
38
protein_name=[]
39
mass_to_charge=[]
40
sequence=[]
41
output=open(create_file,'w')
42

43
#define a function to read file and store data
44
def readfile(filename):
45
    global output
46
    readfile=open(filename,'r')
47
    next(readfile)
48
    lines=readfile.readlines()
49
    list1=[]
50
    cleavage_site=[]
51
    for line in lines:
52
        #use split to handle the data
53
        data=line.split()
54
        protein_name.append(data[0])
55
        mass_to_charge.append(data[3])
56
        cleavage_site.append(data[4])
57
        sequence.append(data[5])
58
    cleavage_site=set(cleavage_site)
59
    for i in cleavage_site:
60
        if i==0:
61
            print("There is no missed cleavage site")
62
            list1.append("no missed cleavage site")
63
        else:
64
            print("Missed cleavage site: ",i)
65
            list1.append("cleavage site: ")
66
            list1.append(i)
67
    output.writelines([list1[0],str(list1[1]),'\n'])
68

69
    #define a function to count number of peptides
70
def count_num(min_mz,max_mz):
71
    count_pep=0
72
    #use lists to store output data
73
    list1=[]
74
    list2=[]
75
    list3=[]
76
    global output
77
    #use defined list mass_to_charge, loops and if statement to count
78
    for m in mass_to_charge:
79
        mz=float(m)
80
        if ((min_mz<mz)&(mz<max_mz)):
81
            count_pep+=1
82
    print("mass range:", "minimum:",round(min_mz,4),", maximum:",round(max_mz,4)," count:",count_pep)
83
    list1.append(round(min_mz,4))
84
    list2.append(round(max_mz,4))
85
    list3.append(count_pep)
86
    for i in zip(list1,list2,list3):
87
        output.writelines([str(i[0]),'\t',str(i[1]),'\t',str(i[2]),'\n'])
88
    return count_pep
89

90
#define a function to find unique protein
91
def find_protein():
92
    mz_num={}
93
    uni_pr=[]
94
    rep_pr={}
95
    rep_pr1=[]
96
    list1=[]
97
    list2=[]
98
    num_rep={}
99
    num_all={}
100
    cons_pr={}
101
    cons_pr1=[]
102
    num_rep1={}
103
    uni_pr1=[]
104
    global output,mass_to_charge,protein_name
105

106
    #record the frequency of matched m/z of sequences
107
    for i in range(len(mass_to_charge)):
108
        if mass_to_charge[i] in mz_num:
109
            mz_num[mass_to_charge[i]]+=1
110
        else:
111
            mz_num[mass_to_charge[i]]=1
112

113
    #a dictionary containing all duplicated m/z and corresponding protein name
114
    for key in mz_num:
115
        if mz_num[key]!=1:
116
            rep_pr[key]=[]
117
    for key in rep_pr.keys():
118
        for index,nums in enumerate(mass_to_charge):
119
            if nums==key:
120
                rep_pr[key].append(protein_name[index])
121
        #a dictionary containing m/z within the range (1000,1500)
122
        if float(key)<1500.0 and float(key)>1000.0:
123
            cons_pr[key]=[]
124
    for key in cons_pr.keys():
125
        for index,nums in enumerate(mass_to_charge):
126
            if nums==key:
127
                cons_pr[key].append(protein_name[index])
128

129
    #compare protein with same m/z
130
    for i in rep_pr:
131
        check_pr=[]
132
        check_p=[]
133
        for i in rep_pr[i]:
134
            check_pr.append(i)
135
            check_p=set(check_pr)
136
            if check_p!=1:
137
                rep_pr1.append(i)
138

139
    #use dictionary to compare duplicated protein with all proteins and proteins within m/z range of 1000 and 1500
140
    for rp in range(len(rep_pr1)):
141
        if rep_pr1[rp] in num_rep:
142
            num_rep[rep_pr1[rp]]+=1
143
        else:
144
            num_rep[rep_pr1[rp]]=1
145
    for i in cons_pr:
146
        check_pr1=[]
147
        check_p1=[]
148
        for i in cons_pr[i]:
149
            check_pr1.append(i)
150
            check_p1=set(check_pr)
151
            if check_p!=1:
152
                cons_pr1.append(i)
153
    for cp in range(len(cons_pr1)):
154
        if cons_pr1[cp] in num_rep1:
155
            num_rep1[cons_pr1[cp]]+=1
156
        else:
157
            num_rep1[cons_pr1[cp]]=1
158
    for pr in range(len(protein_name)):
159
        if protein_name[pr] in num_all:
160
            num_all[protein_name[pr]]+=1
161
        else:
162
            num_all[protein_name[pr]]=1
163
    for m in num_rep.keys():
164
        if m in num_all.keys():
165
            if num_rep[m]<num_all[m]:
166
                uni_pr.append(m)
167
    for m in num_rep1.keys():
168
        if m in num_all.keys():
169
            if num_rep1[m]<num_all[m]:
170
                uni_pr1.append(m)
171
    if len(uni_pr)==1:
172
        print(len(uni_pr)," protein has been unambiguously identified in all proteins. ")
173
        #print(" ".join(str(i) for i in uni_pr))
174
    else:
175
        print(len(uni_pr)," proteins have been unambiguously identified in all proteins. ")
176
        #print(",".join(str(i) for i in uni_pr))
177
    if len(uni_pr1)==1:
178
        print(len(uni_pr1)," protein has been unambiguously identified within 1000 to 1500 m/z range. ")
179
        #print(" ".join(str(i) for i in uni_pr1))
180
    else:
181
        print(len(uni_pr1)," proteins have been unambiguously identified within 1000 to 1500 m/z range. ")
182
        #print(",".join(str(i) for i in uni_pr1))
183
    list1.append(len(uni_pr))
184
    list1.append(len(uni_pr1))
185
    output.writelines(["all unique proteins: ",str(list1[0]),'\n',"unique proteins within 1000 to 1500 m/z: ", str(list1[1]),'\n'])
186

187
#define a function to match peptide
188
def match_peptide(pep_find,match_pattern):
189
    list1=[]
190
    list2=[]
191
    match=[]
192
    global output
193
    if pep_find!="no":
194
        #differentiate different match pattern
195
        if match_pattern=="start with":
196
            for s in sequence:
197
                if s.startswith(pep_find):
198
                    match.append(s)
199
            if len(match)==0:
200
                print("There is no sequence starts with ",pep_find," in all peptides.")
201
        else:
202
            for s in sequence:
203
                match.append(s)
204
            if len(match)==0:
205
                print("There is no sequence contains ",pep_find," in all peptides.")
206
        if len(match)==1:
207
            print("There is ",len(match)," sequence ",match_pattern, pep_find," in all peptides.")
208
        else:
209
            print("There are ",len(match)," sequences ",match_pattern, pep_find," in all peptides.")
210
        list1.append("number of matched sequence")
211
        list2.append(len(match))
212
        for i in zip(list1,list2):
213
            output.writelines([str(i[0]),'\t',str(i[1]),'\n'])
214
#main code starts here
215
readfile(filename)
216
match_peptide(pep_find,match_pattern)
217
find_protein()
218
output.writelines(['min_range','\t','max_range','\t','peps_in_range','\n'])
219
count_num(min_mz,max_mz)
220
#use while loop to do sliding window
221
while min_mz<max_mz:
222
    window=min_mz+window_size
223
    if window>max_mz:
224
        window=max_mz
225
    count_num(min_mz,window)
226
    min_mz+=step_size
227
output.close()
228