#task4-group7
#the ion statistics calculator
#Liyue Chang
#import necessary libraries
import argparse

#use argparse to define command lines
parser=argparse.ArgumentParser()
parser.add_argument("inputfile",
                    help="directory to the input file")
parser.add_argument("-i", "--minimum_mass_to_charge_ratio_range",
                    help="the minimum range of mass-to-charge of the peptide fragment", type=float,default=1000)
parser.add_argument("-a", "--maximum_mass_to_charge_ratio_range",
                    help="the maximum range of mass-to-charge of the peptide fragment", type=float,default=1500)
parser.add_argument("-w", "--window_size",
                    help="size of the sliding window", type=float, default=0.5)
parser.add_argument("-s", "--step_size",
                    help="size of steps the sliding window slides(must be smaller than window size)", type=float, default=0.2)
parser.add_argument("-m", "--match_pattern",
                    help="how the input amino acid matches the sequence", choices=['start with','contain'], default='contain')
parser.add_argument("-p", "--amino_acid_matches_sequence",
                    help="input amino acid matching the sequence",
                    choices=['K','N','R','T','S','I','M','Q','H','P','L','E','D','A','G','V','Y','S','C','W','F','no'],default='no')
parser.add_argument("create_file",
                    help="name of the output file")

args = parser.parse_args()
filename=args.inputfile
min_mz=round(args.minimum_mass_to_charge_ratio_range,4)
max_mz=round(args.maximum_mass_to_charge_ratio_range,4)
window_size=args.window_size
step_size=args.step_size
match_pattern=args.match_pattern
pep_find=args.amino_acid_matches_sequence
create_file=args.create_file

#lists of useful elements
protein_name=[]
mass_to_charge=[]
sequence=[]
output=open(create_file,'w')

#define a function to read file and store data
def readfile(filename):
    global output
    readfile=open(filename,'r')
    next(readfile)
    lines=readfile.readlines()
    list1=[]
    cleavage_site=[]
    for line in lines:
        #use split to handle the data
        data=line.split()
        protein_name.append(data[0])
        mass_to_charge.append(data[3])
        cleavage_site.append(data[4])
        sequence.append(data[5])
    cleavage_site=set(cleavage_site)
    for i in cleavage_site:
        if i==0:
            print("There is no missed cleavage site")
            list1.append("no missed cleavage site")
        else:
            print("Missed cleavage site: ",i)
            list1.append("cleavage site: ")
            list1.append(i)
    output.writelines([list1[0],str(list1[1]),'\n'])

    #define a function to count number of peptides
def count_num(min_mz,max_mz):
    count_pep=0
    #use lists to store output data
    list1=[]
    list2=[]
    list3=[]
    global output
    #use defined list mass_to_charge, loops and if statement to count
    for m in mass_to_charge:
        mz=float(m)
        if ((min_mz<mz)&(mz<max_mz)):
            count_pep+=1
    print("mass range:", "minimum:",round(min_mz,4),", maximum:",round(max_mz,4)," count:",count_pep)
    list1.append(round(min_mz,4))
    list2.append(round(max_mz,4))
    list3.append(count_pep)
    for i in zip(list1,list2,list3):
        output.writelines([str(i[0]),'\t',str(i[1]),'\t',str(i[2]),'\n'])
    return count_pep

#define a function to find unique protein
def find_protein():
    mz_num={}
    uni_pr=[]
    rep_pr={}
    rep_pr1=[]
    list1=[]
    list2=[]
    num_rep={}
    num_all={}
    cons_pr={}
    cons_pr1=[]
    num_rep1={}
    uni_pr1=[]
    global output,mass_to_charge,protein_name

    #record the frequency of matched m/z of sequences
    for i in range(len(mass_to_charge)):
        if mass_to_charge[i] in mz_num:
            mz_num[mass_to_charge[i]]+=1
        else:
            mz_num[mass_to_charge[i]]=1

    #a dictionary containing all duplicated m/z and corresponding protein name
    for key in mz_num:
        if mz_num[key]!=1:
            rep_pr[key]=[]
    for key in rep_pr.keys():
        for index,nums in enumerate(mass_to_charge):
            if nums==key:
                rep_pr[key].append(protein_name[index])
        #a dictionary containing m/z within the range (1000,1500)
        if float(key)<1500.0 and float(key)>1000.0:
            cons_pr[key]=[]
    for key in cons_pr.keys():
        for index,nums in enumerate(mass_to_charge):
            if nums==key:
                cons_pr[key].append(protein_name[index])

    #compare protein with same m/z
    for i in rep_pr:
        check_pr=[]
        check_p=[]
        for i in rep_pr[i]:
            check_pr.append(i)
            check_p=set(check_pr)
            if check_p!=1:
                rep_pr1.append(i)

    #use dictionary to compare duplicated protein with all proteins and proteins within m/z range of 1000 and 1500
    for rp in range(len(rep_pr1)):
        if rep_pr1[rp] in num_rep:
            num_rep[rep_pr1[rp]]+=1
        else:
            num_rep[rep_pr1[rp]]=1
    for i in cons_pr:
        check_pr1=[]
        check_p1=[]
        for i in cons_pr[i]:
            check_pr1.append(i)
            check_p1=set(check_pr)
            if check_p!=1:
                cons_pr1.append(i)
    for cp in range(len(cons_pr1)):
        if cons_pr1[cp] in num_rep1:
            num_rep1[cons_pr1[cp]]+=1
        else:
            num_rep1[cons_pr1[cp]]=1
    for pr in range(len(protein_name)):
        if protein_name[pr] in num_all:
            num_all[protein_name[pr]]+=1
        else:
            num_all[protein_name[pr]]=1
    for m in num_rep.keys():
        if m in num_all.keys():
            if num_rep[m]<num_all[m]:
                uni_pr.append(m)
    for m in num_rep1.keys():
        if m in num_all.keys():
            if num_rep1[m]<num_all[m]:
                uni_pr1.append(m)
    if len(uni_pr)==1:
        print(len(uni_pr)," protein has been unambiguously identified in all proteins. ")
        #print(" ".join(str(i) for i in uni_pr))
    else:
        print(len(uni_pr)," proteins have been unambiguously identified in all proteins. ")
        #print(",".join(str(i) for i in uni_pr))
    if len(uni_pr1)==1:
        print(len(uni_pr1)," protein has been unambiguously identified within 1000 to 1500 m/z range. ")
        #print(" ".join(str(i) for i in uni_pr1))
    else:
        print(len(uni_pr1)," proteins have been unambiguously identified within 1000 to 1500 m/z range. ")
        #print(",".join(str(i) for i in uni_pr1))
    list1.append(len(uni_pr))
    list1.append(len(uni_pr1))
    output.writelines(["all unique proteins: ",str(list1[0]),'\n',"unique proteins within 1000 to 1500 m/z: ", str(list1[1]),'\n'])

#define a function to match peptide
def match_peptide(pep_find,match_pattern):
    list1=[]
    list2=[]
    match=[]
    global output
    if pep_find!="no":
        #differentiate different match pattern
        if match_pattern=="start with":
            for s in sequence:
                if s.startswith(pep_find):
                    match.append(s)
            if len(match)==0:
                print("There is no sequence starts with ",pep_find," in all peptides.")
        else:
            for s in sequence:
                match.append(s)
            if len(match)==0:
                print("There is no sequence contains ",pep_find," in all peptides.")
        if len(match)==1:
            print("There is ",len(match)," sequence ",match_pattern, pep_find," in all peptides.")
        else:
            print("There are ",len(match)," sequences ",match_pattern, pep_find," in all peptides.")
        list1.append("number of matched sequence")
        list2.append(len(match))
        for i in zip(list1,list2):
            output.writelines([str(i[0]),'\t',str(i[1]),'\n'])
#main code starts here
readfile(filename)
match_peptide(pep_find,match_pattern)
find_protein()
output.writelines(['min_range','\t','max_range','\t','peps_in_range','\n'])
count_num(min_mz,max_mz)
#use while loop to do sliding window
while min_mz<max_mz:
    window=min_mz+window_size
    if window>max_mz:
        window=max_mz
    count_num(min_mz,window)
    min_mz+=step_size
output.close()