A beta-splitting model for evolutionary trees

2014, 2015, Raazesh Sainudiin and Amandine Veber

This is code accompanying the technical report 'A beta-splitting model for evolutionary trees' by Raazesh Sainudiin and Amandine Veber. The code is for clarity and simplicity of implementation (it is not efficient for large scale simulations).

def split01ScaledCD(x,I):
    '''x \in (0,1), c=I[0] < d=I[1]'''
    c=I[0]; d=I[1]; e=c+(d-c)*x;
    return [[c,e],[e,d]];

def CountsDictWithFirstIndex(X):
    '''convert a list X into a Dictionary of counts or frequencies with first index of each Key saved'''
    CD = {}
    for i in range(len(X)):
        x=X[i]
        if (x in CD):
            CD[x][1] = CD[x][1]+1
        else:
            CD[x] = [i,1]
    return CD

def SplittingPermutation(splitpointsequence):
    '''return the permutation of [n] given by the map from
    the sequence of n real numbers in the list
    splitpointsequence to an ordering by indices in [n]'''
    sss=sorted(splitpointsequence)
    return tuple([splitpointsequence.index(i)+1 for i in sss])

def MakePartitionAndTree(n,m,a,b):
    '''This creates m independent trees with n+1 leaves and
    alpha=a, beta=b
       n >= 1, where n+1 is the number of leaves
       m is the number of replicates
       a,b>-1, where (a+1,b+1) are the parameters of the
       beta distribution'''
    PlanarShapeSamples=[];
    PhyloShapeSamples=[];
    SplittingPermutationSamples=[];
    BetaD = RealDistribution('beta', [a+1, b+1],seed=0)
    show(BetaD.plot(xmin=0,xmax=1),figsize=[7,2]);
    print '\n';
    for reps in range(m):
        # generating i.i.d. samples from BetaD
        B=[BetaD.get_random_element() for _ in range(n)]
        #initialize
        LBT = LabelledBinaryTree
        t = LBT(None)
        SplitPoints=[B[0]]# keep order of split points
        Splits=split01ScaledCD(B[0],[0,1])
        t = t.binary_search_insert(B[0])

        #iterate
        for i in range(1,n):
            Widths=[x[1]-x[0] for x in Splits];
            W = GeneralDiscreteDistribution(Widths);
            nextSplitI=Splits[W.get_random_element()];
            Splits.remove(nextSplitI);
            NewLeaves=split01ScaledCD(B[i],[nextSplitI[0],
                                          nextSplitI[1]]);
            # find the split point between the new leaves
            RescaledG=NewLeaves[0][1];
            SplitPoints.append(RescaledG);
            # insert the new split point into the tree
            t = t.binary_search_insert(RescaledG);
            Splits.append(NewLeaves[0]);
            Splits.append(NewLeaves[1]);
        SplittingPermutationSamples.append(
                       SplittingPermutation(SplitPoints));
        sh=t.shape();
        PlanarShapeSamples.append(sh);
        PhyloShapeSamples.append(Graph(sh.to_undirected_graph
                      (with_leaves=True),immutable=True));
    return (PlanarShapeSamples,PhyloShapeSamples,SplittingPermutationSamples);

Probability of tree under the generating-organizing DTDS Markov chain at various resolutions.

def splitsSequence(T):
    '''return a list of tuples (left,right) split sizes at each split node'''
    l = []
    T.post_order_traversal(lambda node:
       l.append((node[0].node_number(),node[1].node_number())))
    return l

def isIso(N):
    '''does node N of binary tree have the same left and right subtree shapes
       (are left and right subtrees of node N in tree isomorphic)'''
    L=Graph(N[0].canonical_labelling()
        .shape().to_undirected_graph(with_leaves=True),immutable=True)
    R=Graph(N[1].canonical_labelling()
        .shape().to_undirected_graph(with_leaves=True),immutable=True)
    #return (N.label(),N[0].label(),N[1].label(), L==R)
    return 1 if L==R else 0

def numIso(T):
    '''number of internal nodes that have isomorphic left and right sub-trees'''
    l = []
    T.post_order_traversal(lambda node: l.append(isIso(node)))
    return sum(l)

def prob_RPT(T,a,b):
    '''probability of ranked planar tree T under beta-splitting model
       a,b>-1, where (a+1,b+1) are the parameters of the beta distribution'''
    ncspS=filter(lambda x: x!=(0,0), splitsSequence(T)) # non-cherry splits
    return prod(map(lambda x: beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1),ncspS))

def prob_PT(T,a,b):
    '''probability of planar tree T under beta-splitting model
       a,b>-1, where (a+1,b+1) are the parameters of the
       beta distribution'''
    ncspS=filter(lambda x: x!=(0,0), splitsSequence(T)) # non-cherry splits
    return prod(map(lambda x: binomial(x[0]+x[1],x[1])*beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1),ncspS))

def prob_RT(T,a,b):
    '''probability of ranked (nonplar) tree T under beta-splitting model
       a,b>-1, where (a+1,b+1) are the parameters of the beta distribution'''
    spS=splitsSequence(T)
    numSplits=len(spS)
    ncspS=filter(lambda x: x!=(0,0), spS) # non-cherry splits
    numCherries=numSplits-len(ncspS)
    probRPT = prod(map(lambda x: beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1), ncspS))
    return 2^(numSplits-numCherries)*probRPT

def prob_T(T,a,b):
    '''probability of tree T (phylo tree shape) under beta-splitting model
       a,b>-1, where (a+1,b+1) are the parameters of the beta distribution'''
    spS=splitsSequence(T)
    numSplits=len(spS)
    ncspS=filter(lambda x: x!=(0,0), spS) # non-cherry splits
    probPT = prod(map(lambda x: binomial(x[0]+x[1],x[1])*beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1),ncspS))
    numIsoSplits=numIso(T)
    return 2^(numSplits-numIsoSplits)*probPT

def stats_probs_Tree(T,a,b):
    '''probability of various resolutions of tree T under beta-splitting model
       a,b>-1, where (a+1,b+1) are the parameters of the beta distribution'''
    spS=splitsSequence(T)
    numSplits=len(spS)
    ncspS=filter(lambda x: x!=(0,0), spS) # non-cherry splits
    numCherries=numSplits-len(ncspS)
    probRPT = prod(map(lambda x: beta(x[0]+a+1,x[1]+b+1)/beta(a+1,b+1), ncspS))
    catCoeff = prod(map(lambda x: binomial(x[0]+x[1],x[1]),ncspS))
    probPT = catCoeff * probRPT # prob of (non-ranked) planar tree
    probRT = 2^(numSplits-numCherries)*probRPT
    numIsoSplits=numIso(T)
    probT=2^(numSplits-numIsoSplits)*probPT
    return (numSplits,numIsoSplits,numCherries,catCoeff,probRPT,probPT,probRT,probT)

Demonstration with a=b=0 (Yule model)

#Yule works B=0
a=0; b=0; m=10000; (bts,pts,sps)=MakePartitionAndTree(3,m,a,b)

BtcCnts=CountsDictWithFirstIndex(sps)
for x in BtcCnts:
    print (sps[BtcCnts[x][0]],prob_RPT(bts[BtcCnts[x][0]],a,b).N(digits=4),(BtcCnts[x][1]/m).N(digits=4))

((1, 3, 2), 0.1667, 0.1700)
((3, 2, 1), 0.1667, 0.1666)
((2, 1, 3), 0.1667, 0.1625)
((3, 1, 2), 0.1667, 0.1664)
((1, 2, 3), 0.1667, 0.1683)
((2, 3, 1), 0.1667, 0.1662)

BtcCnts=CountsDictWithFirstIndex(bts)
for x in BtcCnts:
    print (bts[BtcCnts[x][0]],prob_PT(bts[BtcCnts[x][0]],a,b).N(digits=5),(BtcCnts[x][1]/m).N(digits=5))

([., [[., .], .]], 0.16667, 0.17000)
([., [., [., .]]], 0.16667, 0.16830)
([[[., .], .], .], 0.16667, 0.16660)
([[., [., .]], .], 0.16667, 0.16620)
([[., .], [., .]], 0.33333, 0.32890)

BtcCnts=CountsDictWithFirstIndex(pts)
for x in BtcCnts:
    pts[BtcCnts[x][0]].show()
    print (prob_T(bts[BtcCnts[x][0]],a,b).N(digits=5),(BtcCnts[x][1]/m).N(digits=5))

(0.66667, 0.67110)

(0.33333, 0.32890)

Demonstration for more balanced trees with $a=b=1000 \qquad (\to \infty)$

#more balanced works B -> 100
a=1000; b=1000; m=10000; (bts,pts,sps)=MakePartitionAndTree(3,m,a,b)

BtcCnts=CountsDictWithFirstIndex(sps)
for x in BtcCnts:
 print (sps[BtcCnts[x][0]],prob_RPT(bts[BtcCnts[x][0]],a,b).N(digits=4),(BtcCnts[x][1]/m).N(digits=4))

((1, 3, 2), 0.1251, 0.1229)
((3, 2, 1), 0.1251, 0.1242)
((2, 3, 1), 0.1251, 0.1230)
((3, 1, 2), 0.2499, 0.2571)
((1, 2, 3), 0.1251, 0.1216)
((2, 1, 3), 0.2499, 0.2512)

BtcCnts=CountsDictWithFirstIndex(bts)
for x in BtcCnts:
    print (bts[BtcCnts[x][0]],prob_PT(bts[BtcCnts[x][0]],a,b).N(digits=4),(BtcCnts[x][1]/m).N(digits=4))

([[., [., .]], .], 0.1251, 0.1230)
([., [., [., .]]], 0.1251, 0.1216)
([[[., .], .], .], 0.1251, 0.1242)
([., [[., .], .]], 0.1251, 0.1229)
([[., .], [., .]], 0.4998, 0.5083)

BtcCnts=CountsDictWithFirstIndex(pts)
for x in BtcCnts:
    pts[BtcCnts[x][0]].show()
    print (prob_T(bts[BtcCnts[x][0]],a,b).N(digits=5),(BtcCnts[x][1]/m).N(digits=5))

(0.50025, 0.49170)

(0.49975, 0.50830)

#more balanced works B -> 100
a=1000; b=1000; m=10000; (bts,pts,sps)=MakePartitionAndTree(5,m,a,b)

BtcCnts=CountsDictWithFirstIndex(pts)
for x in BtcCnts:
    pts[BtcCnts[x][0]].show()
    print (prob_T(bts[BtcCnts[x][0]],a,b).N(digits=5),(BtcCnts[x][1]/m).N(digits=5))

(0.093984, 0.098800)

(0.37463, 0.37940)

(0.015703, 0.014600)

(0.015687, 0.016600)

(0.24988, 0.24850)

(0.25012, 0.24210)

︠fd53ce0b-4609-4e43-bb01-37dc590e09e1︠

A beta-splitting model for evolutionary trees

2014, 2015, Raazesh Sainudiin and Amandine Veber

Probability of tree under the generating-organizing DTDS Markov chain at various resolutions.

Demonstration with a=b=0 (Yule model)

Demonstration for more balanced trees with a=b=1000(→∞)a=b=1000 \qquad (\to \infty)a=b=1000(→∞)

Demonstration for more balanced trees with $a=b=1000 \qquad (\to \infty)$