Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download
Views: 585
Image: ubuntu2004
1
#task4-group7
2
#the ion statistics calculator
3
#Liyue Chang
4
#import necessary libraries
5
import argparse
6
7
#use argparse to define command lines
8
parser=argparse.ArgumentParser()
9
parser.add_argument("inputfile",
10
help="directory to the input file")
11
parser.add_argument("-i", "--minimum_mass_to_charge_ratio_range",
12
help="the minimum range of mass-to-charge of the peptide fragment", type=float,default=1000)
13
parser.add_argument("-a", "--maximum_mass_to_charge_ratio_range",
14
help="the maximum range of mass-to-charge of the peptide fragment", type=float,default=1500)
15
parser.add_argument("-w", "--window_size",
16
help="size of the sliding window", type=float, default=0.5)
17
parser.add_argument("-s", "--step_size",
18
help="size of steps the sliding window slides(must be smaller than window size)", type=float, default=0.2)
19
parser.add_argument("-m", "--match_pattern",
20
help="how the input amino acid matches the sequence", choices=['start with','contain'], default='contain')
21
parser.add_argument("-p", "--amino_acid_matches_sequence",
22
help="input amino acid matching the sequence",
23
choices=['K','N','R','T','S','I','M','Q','H','P','L','E','D','A','G','V','Y','S','C','W','F','no'],default='no')
24
parser.add_argument("create_file",
25
help="name of the output file")
26
27
args = parser.parse_args()
28
filename=args.inputfile
29
min_mz=round(args.minimum_mass_to_charge_ratio_range,4)
30
max_mz=round(args.maximum_mass_to_charge_ratio_range,4)
31
window_size=args.window_size
32
step_size=args.step_size
33
match_pattern=args.match_pattern
34
pep_find=args.amino_acid_matches_sequence
35
create_file=args.create_file
36
37
#lists of useful elements
38
protein_name=[]
39
mass_to_charge=[]
40
sequence=[]
41
output=open(create_file,'w')
42
43
#define a function to read file and store data
44
def readfile(filename):
45
global output
46
readfile=open(filename,'r')
47
next(readfile)
48
lines=readfile.readlines()
49
list1=[]
50
cleavage_site=[]
51
for line in lines:
52
#use split to handle the data
53
data=line.split()
54
protein_name.append(data[0])
55
mass_to_charge.append(data[3])
56
cleavage_site.append(data[4])
57
sequence.append(data[5])
58
cleavage_site=set(cleavage_site)
59
for i in cleavage_site:
60
if i==0:
61
print("There is no missed cleavage site")
62
list1.append("no missed cleavage site")
63
else:
64
print("Missed cleavage site: ",i)
65
list1.append("cleavage site: ")
66
list1.append(i)
67
output.writelines([list1[0],str(list1[1]),'\n'])
68
69
#define a function to count number of peptides
70
def count_num(min_mz,max_mz):
71
count_pep=0
72
#use lists to store output data
73
list1=[]
74
list2=[]
75
list3=[]
76
global output
77
#use defined list mass_to_charge, loops and if statement to count
78
for m in mass_to_charge:
79
mz=float(m)
80
if ((min_mz<mz)&(mz<max_mz)):
81
count_pep+=1
82
print("mass range:", "minimum:",round(min_mz,4),", maximum:",round(max_mz,4)," count:",count_pep)
83
list1.append(round(min_mz,4))
84
list2.append(round(max_mz,4))
85
list3.append(count_pep)
86
for i in zip(list1,list2,list3):
87
output.writelines([str(i[0]),'\t',str(i[1]),'\t',str(i[2]),'\n'])
88
return count_pep
89
90
#define a function to find unique protein
91
def find_protein():
92
mz_num={}
93
uni_pr=[]
94
rep_pr={}
95
rep_pr1=[]
96
list1=[]
97
list2=[]
98
num_rep={}
99
num_all={}
100
cons_pr={}
101
cons_pr1=[]
102
num_rep1={}
103
uni_pr1=[]
104
global output,mass_to_charge,protein_name
105
106
#record the frequency of matched m/z of sequences
107
for i in range(len(mass_to_charge)):
108
if mass_to_charge[i] in mz_num:
109
mz_num[mass_to_charge[i]]+=1
110
else:
111
mz_num[mass_to_charge[i]]=1
112
113
#a dictionary containing all duplicated m/z and corresponding protein name
114
for key in mz_num:
115
if mz_num[key]!=1:
116
rep_pr[key]=[]
117
for key in rep_pr.keys():
118
for index,nums in enumerate(mass_to_charge):
119
if nums==key:
120
rep_pr[key].append(protein_name[index])
121
#a dictionary containing m/z within the range (1000,1500)
122
if float(key)<1500.0 and float(key)>1000.0:
123
cons_pr[key]=[]
124
for key in cons_pr.keys():
125
for index,nums in enumerate(mass_to_charge):
126
if nums==key:
127
cons_pr[key].append(protein_name[index])
128
129
#compare protein with same m/z
130
for i in rep_pr:
131
check_pr=[]
132
check_p=[]
133
for i in rep_pr[i]:
134
check_pr.append(i)
135
check_p=set(check_pr)
136
if check_p!=1:
137
rep_pr1.append(i)
138
139
#use dictionary to compare duplicated protein with all proteins and proteins within m/z range of 1000 and 1500
140
for rp in range(len(rep_pr1)):
141
if rep_pr1[rp] in num_rep:
142
num_rep[rep_pr1[rp]]+=1
143
else:
144
num_rep[rep_pr1[rp]]=1
145
for i in cons_pr:
146
check_pr1=[]
147
check_p1=[]
148
for i in cons_pr[i]:
149
check_pr1.append(i)
150
check_p1=set(check_pr)
151
if check_p!=1:
152
cons_pr1.append(i)
153
for cp in range(len(cons_pr1)):
154
if cons_pr1[cp] in num_rep1:
155
num_rep1[cons_pr1[cp]]+=1
156
else:
157
num_rep1[cons_pr1[cp]]=1
158
for pr in range(len(protein_name)):
159
if protein_name[pr] in num_all:
160
num_all[protein_name[pr]]+=1
161
else:
162
num_all[protein_name[pr]]=1
163
for m in num_rep.keys():
164
if m in num_all.keys():
165
if num_rep[m]<num_all[m]:
166
uni_pr.append(m)
167
for m in num_rep1.keys():
168
if m in num_all.keys():
169
if num_rep1[m]<num_all[m]:
170
uni_pr1.append(m)
171
if len(uni_pr)==1:
172
print(len(uni_pr)," protein has been unambiguously identified in all proteins. ")
173
#print(" ".join(str(i) for i in uni_pr))
174
else:
175
print(len(uni_pr)," proteins have been unambiguously identified in all proteins. ")
176
#print(",".join(str(i) for i in uni_pr))
177
if len(uni_pr1)==1:
178
print(len(uni_pr1)," protein has been unambiguously identified within 1000 to 1500 m/z range. ")
179
#print(" ".join(str(i) for i in uni_pr1))
180
else:
181
print(len(uni_pr1)," proteins have been unambiguously identified within 1000 to 1500 m/z range. ")
182
#print(",".join(str(i) for i in uni_pr1))
183
list1.append(len(uni_pr))
184
list1.append(len(uni_pr1))
185
output.writelines(["all unique proteins: ",str(list1[0]),'\n',"unique proteins within 1000 to 1500 m/z: ", str(list1[1]),'\n'])
186
187
#define a function to match peptide
188
def match_peptide(pep_find,match_pattern):
189
list1=[]
190
list2=[]
191
match=[]
192
global output
193
if pep_find!="no":
194
#differentiate different match pattern
195
if match_pattern=="start with":
196
for s in sequence:
197
if s.startswith(pep_find):
198
match.append(s)
199
if len(match)==0:
200
print("There is no sequence starts with ",pep_find," in all peptides.")
201
else:
202
for s in sequence:
203
match.append(s)
204
if len(match)==0:
205
print("There is no sequence contains ",pep_find," in all peptides.")
206
if len(match)==1:
207
print("There is ",len(match)," sequence ",match_pattern, pep_find," in all peptides.")
208
else:
209
print("There are ",len(match)," sequences ",match_pattern, pep_find," in all peptides.")
210
list1.append("number of matched sequence")
211
list2.append(len(match))
212
for i in zip(list1,list2):
213
output.writelines([str(i[0]),'\t',str(i[1]),'\n'])
214
#main code starts here
215
readfile(filename)
216
match_peptide(pep_find,match_pattern)
217
find_protein()
218
output.writelines(['min_range','\t','max_range','\t','peps_in_range','\n'])
219
count_num(min_mz,max_mz)
220
#use while loop to do sliding window
221
while min_mz<max_mz:
222
window=min_mz+window_size
223
if window>max_mz:
224
window=max_mz
225
count_num(min_mz,window)
226
min_mz+=step_size
227
output.close()
228