Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download
Views: 180

A beta-splitting model for evolutionary trees

2014, 2015, Raazesh Sainudiin and Amandine Veber

This is code accompanying the technical report 'A beta-splitting model for evolutionary trees' by Raazesh Sainudiin and Amandine Veber. The code is for clarity and simplicity of implementation (it is not efficient for large scale simulations).

def split01ScaledCD(x,I): '''x \in (0,1), c=I[0] < d=I[1]''' c=I[0]; d=I[1]; e=c+(d-c)*x; return [[c,e],[e,d]]; def CountsDictWithFirstIndex(X): '''convert a list X into a Dictionary of counts or frequencies with first index of each Key saved''' CD = {} for i in range(len(X)): x=X[i] if (x in CD): CD[x][1] = CD[x][1]+1 else: CD[x] = [i,1] return CD def SplittingPermutation(splitpointsequence): '''return the permutation of [n] given by the map from the sequence of n real numbers in the list splitpointsequence to an ordering by indices in [n]''' sss=sorted(splitpointsequence) return tuple([splitpointsequence.index(i)+1 for i in sss]) def MakePartitionAndTree(n,m,a,b): '''This creates m independent trees with n+1 leaves and alpha=a, beta=b n >= 1, where n+1 is the number of leaves m is the number of replicates a,b>-1, where (a+1,b+1) are the parameters of the beta distribution''' PlanarShapeSamples=[]; PhyloShapeSamples=[]; SplittingPermutationSamples=[]; BetaD = RealDistribution('beta', [a+1, b+1],seed=0) show(BetaD.plot(xmin=0,xmax=1),figsize=[7,2]); print '\n'; for reps in range(m): # generating i.i.d. samples from BetaD B=[BetaD.get_random_element() for _ in range(n)] #initialize LBT = LabelledBinaryTree t = LBT(None) SplitPoints=[B[0]]# keep order of split points Splits=split01ScaledCD(B[0],[0,1]) t = t.binary_search_insert(B[0]) #iterate for i in range(1,n): Widths=[x[1]-x[0] for x in Splits]; W = GeneralDiscreteDistribution(Widths); nextSplitI=Splits[W.get_random_element()]; Splits.remove(nextSplitI); NewLeaves=split01ScaledCD(B[i],[nextSplitI[0], nextSplitI[1]]); # find the split point between the new leaves RescaledG=NewLeaves[0][1]; SplitPoints.append(RescaledG); # insert the new split point into the tree t = t.binary_search_insert(RescaledG); Splits.append(NewLeaves[0]); Splits.append(NewLeaves[1]); SplittingPermutationSamples.append( SplittingPermutation(SplitPoints)); sh=t.shape(); PlanarShapeSamples.append(sh); PhyloShapeSamples.append(Graph(sh.to_undirected_graph (with_leaves=True),immutable=True)); return (PlanarShapeSamples,PhyloShapeSamples,SplittingPermutationSamples);

Probability of tree under the generating-organizing DTDS Markov chain at various resolutions.

def splitsSequence(T): '''return a list of tuples (left,right) split sizes at each split node''' l = [] T.post_order_traversal(lambda node: l.append((node[0].node_number(),node[1].node_number()))) return l def isIso(N): '''does node N of binary tree have the same left and right subtree shapes (are left and right subtrees of node N in tree isomorphic)''' L=Graph(N[0].canonical_labelling() .shape().to_undirected_graph(with_leaves=True),immutable=True) R=Graph(N[1].canonical_labelling() .shape().to_undirected_graph(with_leaves=True),immutable=True) #return (N.label(),N[0].label(),N[1].label(), L==R) return 1 if L==R else 0 def numIso(T): '''number of internal nodes that have isomorphic left and right sub-trees''' l = [] T.post_order_traversal(lambda node: l.append(isIso(node))) return sum(l) def prob_RPT(T,a,b): '''probability of ranked planar tree T under beta-splitting model a,b>-1, where (a+1,b+1) are the parameters of the beta distribution''' ncspS=filter(lambda x: x!=(0,0), splitsSequence(T)) # non-cherry splits return prod(map(lambda x: beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1),ncspS)) def prob_PT(T,a,b): '''probability of planar tree T under beta-splitting model a,b>-1, where (a+1,b+1) are the parameters of the beta distribution''' ncspS=filter(lambda x: x!=(0,0), splitsSequence(T)) # non-cherry splits return prod(map(lambda x: binomial(x[0]+x[1],x[1])*beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1),ncspS)) def prob_RT(T,a,b): '''probability of ranked (nonplar) tree T under beta-splitting model a,b>-1, where (a+1,b+1) are the parameters of the beta distribution''' spS=splitsSequence(T) numSplits=len(spS) ncspS=filter(lambda x: x!=(0,0), spS) # non-cherry splits numCherries=numSplits-len(ncspS) probRPT = prod(map(lambda x: beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1), ncspS)) return 2^(numSplits-numCherries)*probRPT def prob_T(T,a,b): '''probability of tree T (phylo tree shape) under beta-splitting model a,b>-1, where (a+1,b+1) are the parameters of the beta distribution''' spS=splitsSequence(T) numSplits=len(spS) ncspS=filter(lambda x: x!=(0,0), spS) # non-cherry splits probPT = prod(map(lambda x: binomial(x[0]+x[1],x[1])*beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1),ncspS)) numIsoSplits=numIso(T) return 2^(numSplits-numIsoSplits)*probPT def stats_probs_Tree(T,a,b): '''probability of various resolutions of tree T under beta-splitting model a,b>-1, where (a+1,b+1) are the parameters of the beta distribution''' spS=splitsSequence(T) numSplits=len(spS) ncspS=filter(lambda x: x!=(0,0), spS) # non-cherry splits numCherries=numSplits-len(ncspS) probRPT = prod(map(lambda x: beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1), ncspS)) catCoeff = prod(map(lambda x: binomial(x[0]+x[1],x[1]),ncspS)) probPT = catCoeff * probRPT # prob of (non-ranked) planar tree probRT = 2^(numSplits-numCherries)*probRPT numIsoSplits=numIso(T) probT=2^(numSplits-numIsoSplits)*probPT return (numSplits,numIsoSplits,numCherries,catCoeff,probRPT,probPT,probRT,probT)

Demonstration with a=b=0 (Yule model)

#Yule works B=0 a=0; b=0; m=10000; (bts,pts,sps)=MakePartitionAndTree(3,m,a,b)
BtcCnts=CountsDictWithFirstIndex(sps) for x in BtcCnts: print (sps[BtcCnts[x][0]],prob_RPT(bts[BtcCnts[x][0]],a,b).N(digits=4),(BtcCnts[x][1]/m).N(digits=4))
((1, 3, 2), 0.1667, 0.1700) ((3, 2, 1), 0.1667, 0.1666) ((2, 1, 3), 0.1667, 0.1625) ((3, 1, 2), 0.1667, 0.1664) ((1, 2, 3), 0.1667, 0.1683) ((2, 3, 1), 0.1667, 0.1662)
BtcCnts=CountsDictWithFirstIndex(bts) for x in BtcCnts: print (bts[BtcCnts[x][0]],prob_PT(bts[BtcCnts[x][0]],a,b).N(digits=5),(BtcCnts[x][1]/m).N(digits=5))
([., [[., .], .]], 0.16667, 0.17000) ([., [., [., .]]], 0.16667, 0.16830) ([[[., .], .], .], 0.16667, 0.16660) ([[., [., .]], .], 0.16667, 0.16620) ([[., .], [., .]], 0.33333, 0.32890)
BtcCnts=CountsDictWithFirstIndex(pts) for x in BtcCnts: pts[BtcCnts[x][0]].show() print (prob_T(bts[BtcCnts[x][0]],a,b).N(digits=5),(BtcCnts[x][1]/m).N(digits=5))
(0.66667, 0.67110)
(0.33333, 0.32890)

Demonstration for more balanced trees with a=b=1000()a=b=1000 \qquad (\to \infty)

#more balanced works B -> 100 a=1000; b=1000; m=10000; (bts,pts,sps)=MakePartitionAndTree(3,m,a,b)
BtcCnts=CountsDictWithFirstIndex(sps) for x in BtcCnts: print (sps[BtcCnts[x][0]],prob_RPT(bts[BtcCnts[x][0]],a,b).N(digits=4),(BtcCnts[x][1]/m).N(digits=4))
((1, 3, 2), 0.1251, 0.1229) ((3, 2, 1), 0.1251, 0.1242) ((2, 3, 1), 0.1251, 0.1230) ((3, 1, 2), 0.2499, 0.2571) ((1, 2, 3), 0.1251, 0.1216) ((2, 1, 3), 0.2499, 0.2512)
BtcCnts=CountsDictWithFirstIndex(bts) for x in BtcCnts: print (bts[BtcCnts[x][0]],prob_PT(bts[BtcCnts[x][0]],a,b).N(digits=4),(BtcCnts[x][1]/m).N(digits=4))
([[., [., .]], .], 0.1251, 0.1230) ([., [., [., .]]], 0.1251, 0.1216) ([[[., .], .], .], 0.1251, 0.1242) ([., [[., .], .]], 0.1251, 0.1229) ([[., .], [., .]], 0.4998, 0.5083)
BtcCnts=CountsDictWithFirstIndex(pts) for x in BtcCnts: pts[BtcCnts[x][0]].show() print (prob_T(bts[BtcCnts[x][0]],a,b).N(digits=5),(BtcCnts[x][1]/m).N(digits=5))
(0.50025, 0.49170)
(0.49975, 0.50830)
#more balanced works B -> 100 a=1000; b=1000; m=10000; (bts,pts,sps)=MakePartitionAndTree(5,m,a,b)
BtcCnts=CountsDictWithFirstIndex(pts) for x in BtcCnts: pts[BtcCnts[x][0]].show() print (prob_T(bts[BtcCnts[x][0]],a,b).N(digits=5),(BtcCnts[x][1]/m).N(digits=5))
(0.093984, 0.098800)
(0.37463, 0.37940)
(0.015703, 0.014600)
(0.015687, 0.016600)
(0.24988, 0.24850)
(0.25012, 0.24210)
︠fd53ce0b-4609-4e43-bb01-37dc590e09e1︠