import argparse
parser=argparse.ArgumentParser()
parser.add_argument("inputfile",
help="directory to the input file")
parser.add_argument("-i", "--minimum_mass_to_charge_ratio_range",
help="the minimum range of mass-to-charge of the peptide fragment", type=float,default=1000)
parser.add_argument("-a", "--maximum_mass_to_charge_ratio_range",
help="the maximum range of mass-to-charge of the peptide fragment", type=float,default=1500)
parser.add_argument("-w", "--window_size",
help="size of the sliding window", type=float, default=0.5)
parser.add_argument("-s", "--step_size",
help="size of steps the sliding window slides(must be smaller than window size)", type=float, default=0.2)
parser.add_argument("-m", "--match_pattern",
help="how the input amino acid matches the sequence", choices=['start with','contain'], default='contain')
parser.add_argument("-p", "--amino_acid_matches_sequence",
help="input amino acid matching the sequence",
choices=['K','N','R','T','S','I','M','Q','H','P','L','E','D','A','G','V','Y','S','C','W','F','no'],default='no')
parser.add_argument("create_file",
help="name of the output file")
args = parser.parse_args()
filename=args.inputfile
min_mz=round(args.minimum_mass_to_charge_ratio_range,4)
max_mz=round(args.maximum_mass_to_charge_ratio_range,4)
window_size=args.window_size
step_size=args.step_size
match_pattern=args.match_pattern
pep_find=args.amino_acid_matches_sequence
create_file=args.create_file
protein_name=[]
mass_to_charge=[]
sequence=[]
output=open(create_file,'w')
def readfile(filename):
global output
readfile=open(filename,'r')
next(readfile)
lines=readfile.readlines()
list1=[]
cleavage_site=[]
for line in lines:
data=line.split()
protein_name.append(data[0])
mass_to_charge.append(data[3])
cleavage_site.append(data[4])
sequence.append(data[5])
cleavage_site=set(cleavage_site)
for i in cleavage_site:
if i==0:
print("There is no missed cleavage site")
list1.append("no missed cleavage site")
else:
print("Missed cleavage site: ",i)
list1.append("cleavage site: ")
list1.append(i)
output.writelines([list1[0],str(list1[1]),'\n'])
def count_num(min_mz,max_mz):
count_pep=0
list1=[]
list2=[]
list3=[]
global output
for m in mass_to_charge:
mz=float(m)
if ((min_mz<mz)&(mz<max_mz)):
count_pep+=1
print("mass range:", "minimum:",round(min_mz,4),", maximum:",round(max_mz,4)," count:",count_pep)
list1.append(round(min_mz,4))
list2.append(round(max_mz,4))
list3.append(count_pep)
for i in zip(list1,list2,list3):
output.writelines([str(i[0]),'\t',str(i[1]),'\t',str(i[2]),'\n'])
return count_pep
def find_protein():
mz_num={}
uni_pr=[]
rep_pr={}
rep_pr1=[]
list1=[]
list2=[]
num_rep={}
num_all={}
cons_pr={}
cons_pr1=[]
num_rep1={}
uni_pr1=[]
global output,mass_to_charge,protein_name
for i in range(len(mass_to_charge)):
if mass_to_charge[i] in mz_num:
mz_num[mass_to_charge[i]]+=1
else:
mz_num[mass_to_charge[i]]=1
for key in mz_num:
if mz_num[key]!=1:
rep_pr[key]=[]
for key in rep_pr.keys():
for index,nums in enumerate(mass_to_charge):
if nums==key:
rep_pr[key].append(protein_name[index])
if float(key)<1500.0 and float(key)>1000.0:
cons_pr[key]=[]
for key in cons_pr.keys():
for index,nums in enumerate(mass_to_charge):
if nums==key:
cons_pr[key].append(protein_name[index])
for i in rep_pr:
check_pr=[]
check_p=[]
for i in rep_pr[i]:
check_pr.append(i)
check_p=set(check_pr)
if check_p!=1:
rep_pr1.append(i)
for rp in range(len(rep_pr1)):
if rep_pr1[rp] in num_rep:
num_rep[rep_pr1[rp]]+=1
else:
num_rep[rep_pr1[rp]]=1
for i in cons_pr:
check_pr1=[]
check_p1=[]
for i in cons_pr[i]:
check_pr1.append(i)
check_p1=set(check_pr)
if check_p!=1:
cons_pr1.append(i)
for cp in range(len(cons_pr1)):
if cons_pr1[cp] in num_rep1:
num_rep1[cons_pr1[cp]]+=1
else:
num_rep1[cons_pr1[cp]]=1
for pr in range(len(protein_name)):
if protein_name[pr] in num_all:
num_all[protein_name[pr]]+=1
else:
num_all[protein_name[pr]]=1
for m in num_rep.keys():
if m in num_all.keys():
if num_rep[m]<num_all[m]:
uni_pr.append(m)
for m in num_rep1.keys():
if m in num_all.keys():
if num_rep1[m]<num_all[m]:
uni_pr1.append(m)
if len(uni_pr)==1:
print(len(uni_pr)," protein has been unambiguously identified in all proteins. ")
else:
print(len(uni_pr)," proteins have been unambiguously identified in all proteins. ")
if len(uni_pr1)==1:
print(len(uni_pr1)," protein has been unambiguously identified within 1000 to 1500 m/z range. ")
else:
print(len(uni_pr1)," proteins have been unambiguously identified within 1000 to 1500 m/z range. ")
list1.append(len(uni_pr))
list1.append(len(uni_pr1))
output.writelines(["all unique proteins: ",str(list1[0]),'\n',"unique proteins within 1000 to 1500 m/z: ", str(list1[1]),'\n'])
def match_peptide(pep_find,match_pattern):
list1=[]
list2=[]
match=[]
global output
if pep_find!="no":
if match_pattern=="start with":
for s in sequence:
if s.startswith(pep_find):
match.append(s)
if len(match)==0:
print("There is no sequence starts with ",pep_find," in all peptides.")
else:
for s in sequence:
match.append(s)
if len(match)==0:
print("There is no sequence contains ",pep_find," in all peptides.")
if len(match)==1:
print("There is ",len(match)," sequence ",match_pattern, pep_find," in all peptides.")
else:
print("There are ",len(match)," sequences ",match_pattern, pep_find," in all peptides.")
list1.append("number of matched sequence")
list2.append(len(match))
for i in zip(list1,list2):
output.writelines([str(i[0]),'\t',str(i[1]),'\n'])
readfile(filename)
match_peptide(pep_find,match_pattern)
find_protein()
output.writelines(['min_range','\t','max_range','\t','peps_in_range','\n'])
count_num(min_mz,max_mz)
while min_mz<max_mz:
window=min_mz+window_size
if window>max_mz:
window=max_mz
count_num(min_mz,window)
min_mz+=step_size
output.close()