Source code for egglib._tree

"""
    Copyright 2008-2023 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys, functools
from . import eggwrapper as _eggwrapper, random

[docs]class Node(object): """ Manage a single node of a given tree. This class provides an interface to :class:`~.Tree` instance's nodes and allows access and modification of data attached to a given node as well as the tree descending from that node. A node must be understood as the point *below* a branch (in the direction towards leaves). So the length describe in the instance concerns the branch *above* the corresponding node (towards the base of the tree). Branches (connections between nodes) have a direction: they go *from* a node *to* another node. Nodes have therefore *children* and *parents* (a given node have one parent except the base of the tree which has none). Connecting a node to itself, making a two-way branch (to branches connecting the same two nodes in opposite directions) or duplicate branches (between the same two nodes and in the same direction) are illegal. :param label: node label (in case of a terminal node, its leaf label), if needed. Labels, if provided, are expected to be strings for terminal nodes, and numeric values for internal nodes, but technically, all user-supplied values are accepted (however, some :class:`~.Tree` methods require proper types). """ def __init__(self, label=None): self._parent = None self._parent_branch = None self._children = [] self._child_branches = {} self._label = label
[docs] def newick(self, skip_labels=False, skip_brlens=False): """ Formats the node and the subtree descending from is as a newick string. :param skip_labels: omit internal branch labels. :param skip_brlens: omit the branch lengths. """ return ''.join(self._newick(skip_labels, skip_brlens) + [';'])
def _newick(self, skip_labels, skip_brlens): string_bits = [] if len(self._children): string_bits.append('(') for child in self._children: string_bits.extend(child._newick(skip_labels, skip_brlens)) brlen = self._child_branches[child] if brlen != None and not skip_brlens: string_bits.append(':' + str(brlen)) string_bits.append(',') string_bits[-1] = ')' if self._label != None and (not skip_labels or not len(self._children)): string_bits.append(str(self._label)) return string_bits
[docs] def leaves_down(self): """ Recursively gets all leaf labels descending from that node. If this is a terminal node, returns its label in a one-item list. """ leaves = [] if len(self._children) > 0: for child in self._children: leaves += child.leaves_down() return leaves else: return [self._label]
[docs] def leaves_up(self): """ Recursively gets all leaf labels contained on the other side of the tree. In other words, get all leaves of the tree except those descending from this node). If this is the root node, returns an empty list. """ leaves = [] if self._parent == None: return [] else: for brother in self._parent._children: if brother != self: leaves += brother.leaves_down() leaves += self._parent.leaves_up() return leaves
def _unlink(self): # Clears all references to other :class:`~.Node` instances # (parent and children) from this node. The nodes in question # are not affected (they are not deleted, and references they # hold to the current node are not erased). self._parent = None self._parent_branch = None self._children = [] self._child_branches = {} def _add_child(self, label, brlen): # Generates a new :class:`~.Node` instance descending from the # current instance (add a child to this instance). # :param label: label of the new node. # :param brlen: length of the branch connecting this node to the # new node. # :return: The new node. child = Node(label) child._parent = self child._parent_branch = brlen self._children.append(child) self._child_branches[child] = brlen return child def _connect(self, node, brlen): # Connect this node to an other, existing, node. The orientation # of the link is *from* the current instance *to* the passed # instance. If the passed node has children, they will be # retained, allowing to connect a subtree to this tree. It is # not allowed to connect a node that has a (non-``None`) parent. # To do so, one must first disconnect the node from its parent # and then connect it to a new one. # :param node: an existing node. # :param brlen: length of the branch between this node and the # passed one. # tests are not performed for a hidden method # if node._parent is not None: raise ValueError('cannot connect node): already has a parent (disconnect first)' self._children.append(node) self._child_branches[node] = brlen node._parent = self node._parent_branch = brlen
[docs] def has_descendant(self, node): """ Tell if a :class:`.Node` is a descendant of this node. """ for child in self._children: if child == node: return True if child.has_descendant(node): return True return False
[docs] def is_parent(self, node): """ Tell if a :class:`.Node` is the parent of this node. Passing ``None`` to an instance that has no parent will return ``True``. """ return self._parent == node
[docs] def is_child(self, node): """ Tell if a :class:`.Node` is a child of this node. """ for child in self._children: if node == child: return True return False
@property def parent(self): """ Parent of this node. ``None`` if the node has no parent. """ return self._parent
[docs] def children(self): """ Return an iterator over this node's children. """ for node in self._children: yield node
[docs] def child(self, idx): """ Return a given child, as a :class:`~.Node` instance. """ try: return self._children[idx] except IndexError: raise IndexError('node index out of range')
[docs] def siblings(self): """ List of other children of this node's parent. It is required that this node has a parent. """ if self._parent == None: raise ValueError('cannot access siblings: this node has no parent') return [node for node in self._parent._children if node != self]
def _remove_parent(self): # Removes the branch between this node and its parent. Note that # this method removes also this node from its parent's children # (both sides of the branch are removed). self._parent._children.remove(self) del self._parent._child_branches[self] self._parent = None self._parent_branch = None def _remove_child(self, node): # Removes the branch between this node and one of its children. # Note that this method also removes the parent of child in # question (both sides of the branch are removed). self._children.remove(node) del self._child_branches[node] node._parent = None node._parent_branch = None
[docs] def set_branch_to(self, child, brlen): """ Set the length of the branch to a child node. :param child: node whose branch should be resized. It can be represented by a direct reference (as a :class:`.Node`), or by its index in the children list. In that case, ensure that the index is currently valid. :param brlen: new branch length (it is allowed to pass ``None``). """ if isinstance(child, Node): if child not in self._children: raise ValueError('invalid node: not part of the list of children') self._child_branches[child] = brlen child._parent_branch = brlen else: if child >= len(self._children): raise ValueError('invalid node index') self._child_branches[self._children[child]] = brlen self._children[child]._parent_branch = brlen
[docs] def branch_to(self, child): """ Length of the branch to a child node. Non-specified branch lengths are represented by ``None``. :param child: node whose branch length is requested. It can be represented by a direct reference (as a :class:`.Node`), or by its index in the children list. In that case, ensure that the index is currently valid. """ if isinstance(child, Node): if child not in self._children: raise ValueError('invalid node: not part of the list of children') return self._child_branches[child] else: if child >= len(self._children): raise ValueError('invalid node index') return self._child_branches[self._children[child]]
@property def parent_branch(self): """ Length of the branch to the parent. Non-specified branch lengths are represented by ``None``. An exception is thrown upon accessing this attribute if this node has no parent. This attribute can be modified. """ if self._parent is None: raise ValueError('this node has not parent') return self._parent_branch @parent_branch.setter def parent_branch(self, brlen): if self._parent == None: raise ValueError('this node has no parent') self._parent.set_branch_to(self, brlen) # will set both brlen at both sides @property def label(self): """ Node's label (modifiable). """ return self._label @label.setter def label(self, label): self._label = label @property def num_children(self): """ Number of children connected to this node. """ return len(self._children)
[docs]class Tree(object): """ Editable genealogical or phylogenetic tree. A tree is a linked collection of nodes which all have one parent (except the ultimate base node) and any number of children. :param fname: name of a newick-formatted file containing the tree to import. :param string: newick-formatted string representing the the tree to import. If it not allowed to specified both *fname* and *string*. The instance can be initialized as an empty tree (with only a root node), or from a newick-formatted string. By default, the string is read from the file name passed as the *fname* argument to the constructor, but it can be passed directly through the constructor argument *string*. It is not allowed to set both *fname* and *string* at the same time. If neither is specified, an empty tree, containing only a bare base node, is created. The newick parser expects a well-formed newick string (including the trailing semicolon). Nodes are implemented as :class:`.Node` instances. A node without children is a *leaf* and otherwise it is *internal*. A node with exactly one child is generally meaningless, but is allowed. All nodes (internal nodes as well as leaves) have a *label* which in the case of leaves can be used as a sample name. It is not possible to apply a name *and* a label to leaf node, in agreement with the newick format. All connections between nodes (*branches*) are oriented and can have a length (although the lengths can be omitted) but note that labels are applied to nodes, not branches. All :class:`!Tree` instances have one base node which is the only one allowed not to have a parent. Network-like structures are not allowed (because nodes must have exactly one parent). Import and export to/from strings and files are in the bracket-based `newick <http://evolution.genetics.washington.edu/phylip/newicktree.html>`_ format (the parser treats terminal node labels as strings, and internal node labels as integers or, by default, floats). :class:`!Tree` instances can be exported as a built-in :class:`str` function by the syntax ``str(tree)`` or using the method :meth:`~.Tree.newick`. Nodes also have a :meth:`~.Node.newick` method. :class:`!Tree` instances are iterable. Three iterators are provided: one is depth-first (:meth:`~.Tree.depth_iter`), another is breath-first (:meth:`~.Tree.breadth_iter`) and one iterates on terminal nodes (leaves) only (:meth:`~.Tree.iter_leaves`). """ def __init__(self, fname=None, string=None): self._base = Node() self._leaves = [] self._nodes = [self._base] if fname != None and string != None: raise ValueError('cannot set both arguments of Tree constructor') if fname != None: f = open(fname) string = f.read() f.close() if string != None: try: string = string.translate(None, ' \n\r\t') except TypeError: string = string.translate(dict.fromkeys(map(ord, ' \n\r\t'))) if len(string) == 0: raise ValueError('empty newick string') if string[0] != '(' or string[-2:]!= ');': raise ValueError('invalid newick string') self._parse(string[:-1], self._base) @staticmethod def _from_coalesce(coal, idx, fun): # Create a :class:`~.Tree` instance from the results of a # coalescent simulation. # # :param coal: a :cpp:class:`~.Coalesce` object which has run a # valid simulation. # :param idx: the index of a tree (must be within range of # :cpp:meth:`~.Coalesce.number_of_trees`). # :param fun: executable expression converting the label to a # string. # # :return: A new :class:`~.Tree` instance. source = coal.tree(idx) root = source.root() tree = Tree.__new__(Tree) tree._base = Node() tree._leaves = [] tree._nodes = [tree._base] tree._from_coalesce_helper(tree._base, source, root, fun) return tree def _from_coalesce_helper(self, node_dst, tree_src, node_src, fun): # Helper for _from_coalesce # :param node_dst: current node in self. # :param tree_src: the reference C++ tree. # :param node_src: current C++ node in reference tree # :param fun: like for :meth:`~._from_coalesce`. if node_src.is_terminal(): self._leaves.append(node_dst) node_dst._label = fun(node_src.label()) else: son1 = tree_src.node(node_src.son1()) child1 = node_dst._add_child(None, son1.get_L()) self._from_coalesce_helper(child1, tree_src, son1, fun) self._nodes.append(child1) son2 = tree_src.node(node_src.son2()) child2 = node_dst._add_child(None, son2.get_L()) self._from_coalesce_helper(child2, tree_src, son2, fun) self._nodes.append(child2)
[docs] def copy(self, node=None): """ Make a deep copy of the tree. Create a new instance of :class:`.Tree` that is a deep copy of a subtree of the the current tree, or a deep copy of it all. :param node: a :class:`.Node` instance (one of the nodes of the current tree) at the base of the subtree that should be copied. By default, or if ``None``, or the base of the tree is passed, the whole tree is copied. It is not allowed to pass a leaf. """ if node == None or node == self._base: return self._copy_full() if node not in self._nodes: raise ValueError('cannot copy tree: node is not part of tree') if node in self._leaves: raise ValueError('cannot copy tree: node is terminal') clone = Tree.__new__(Tree) clone._base = Node() clone._nodes = [clone._base] clone._leaves = [] self._copy_helper(clone, clone._base, node) return clone
def _copy_helper(self, clone, dest_node, node): dest_node._label = node._label if node in self._leaves: clone._leaves.append(dest_node) for child in node._children: new_node = dest_node._add_child(child._label, child._parent_branch) self._copy_helper(clone, new_node, child) clone._nodes.append(new_node) def _copy_full(self): # create a new bare tree (all nodes are also disconnected) clone = Tree.__new__(Tree) clone._nodes = [Node() for i in range(len(self._nodes))] clone._leaves = [] # locate the base by its index base_index = self._nodes.index(self._base) clone._base = clone._nodes[base_index] # connect nodes bases on their indexes for node, copy in zip(self._nodes, clone._nodes): copy._label = node._label # copy label copy._parent_branch = node._parent_branch # copy parent brlen if node._parent == None: copy._parent = None else: idx = self._nodes.index(node._parent) copy._parent = clone._nodes[idx] # copy parent for child in node._children: idx = self._nodes.index(child) child_copy = clone._nodes[idx] copy._children.append(child_copy) # copy a child copy._child_branches[child_copy] = node._child_branches[child] # copy child brlen if node in self._leaves: clone._leaves.append(copy) return clone
[docs] def extract(self, node, label=None): """ Remove a subtree. The subtree is returned as a new tree. All nodes of the subtree descending from the requested node will now belong to the new tree. The label of the node at the base of the selected clade is deleted. In the original tree, the extracted clade is replaced by a terminal node which has, by default, the label of the node at the base of the extracted clade, or the label passed as argument. :param node: a :class:`.Node` instance (one of the nodes of the current tree) at the base of the subtree that should be extracted. It is not allowed to pass a leaf or the base of the tree. :param label: label to affect to the terminal node that is introduced to replace the extracted clade in the orginal tree. By default (or if ``None``), use the label of the node passed as first argument (note that this label should be in principle a number). :return: A new :class:`.Tree` instance. """ if node not in self._nodes: raise ValueError('cannot extract subtree: node is not part of tree') if node in self._leaves: raise ValueError('cannot extract subtree: node is terminal') if node == self._base: raise ValueError('cannot extract subtree: node is base') parent = node._parent brlen = node._parent_branch node._remove_parent() if label == None: label = node._label parent._add_child(label, brlen) clone = Tree.__new__(Tree) clone._base = node clone._nodes = [] clone._leaves = [] self._extract_helper(clone, node) return clone
def _extract_helper(self, clone, node): clone._nodes.append(node) self._nodes.remove(node) if node in self._leaves: clone._leaves.append(node) self._leaves.remove(node) else: for child in node._children: self._extract_helper(clone, child) def __del__(self): for node in self._nodes: node._unlink() def _parse(self, string, cur): # This parser expects a string starting with an open round # bracket and ending with a closed one. if not len(string): raise IOError('invalid newick string (empty string)') string= string[1:-1] i=0 while(True): # case of a subtree subtree = [] if string[i] == '(': # we gather everything until the matching closing bracket subtree.append(string[i]) acc = 1 while acc: i+=1 if i == len(string): raise ValueError('invalid newick string') subtree.append(string[i]) if string[i] == '(': acc+=1 if string[i] == ')': acc-=1 i+=1 # get all until the next comma or end of string buff = '' while i < len(string) and string[i] != ',': buff += string[i] i+=1 # convert to label and brlen if not len(buff): label = None brlen = None else: tbuff = buff.split(':') if len(tbuff) == 1: label = buff brlen = None elif len(tbuff) == 2: if len(tbuff[0]) == 0: label = None else: label = tbuff[0] try: brlen = float(tbuff[1]) except ValueError: raise ValueError('invalid newick string (invalid branch length: {0}'.format(tbuff[1])) else: raise ValueError('invalid newick string') # add the node (whichever internal or terminal) node = cur._add_child(label=label, brlen=brlen) self._nodes.append(node) # in case there is a subtree, it is recursively parsed if len(subtree): self._parse(''.join(subtree), node) # try to convert label to int of float (silent otherwise) if node._label != None: try: node._label = int(node._label) except ValueError: try: node._label = float(node._label) except ValueError: pass # else, cache the leaf else: self._leaves.append(node) # if we didn't reach the end of the string, we continue parsing if i == len(string): break else: i+=1 continue def __str__(self): return self._base.newick() @property def num_nodes(self): """ Total number of nodes in the tree. This number is never smaller than 1, even for empty trees. """ return len(self._nodes) @property def num_leaves(self): """ Number of terminal nodes in the tree. If the tree is empty (only the default base node), the number of leaves is 0. """ return len(self._leaves)
[docs] def newick(self, skip_labels=False, skip_brlens=False): """ Return the newick-formatted string representing the instance. :param skip_labels: omit internal branch labels. :param skip_brlens: omit the branch lengths. """ return self._base.newick(skip_labels, skip_brlens)
@property def base(self): """ Basal node of the tree. If the tree is unrooted, this is a trifurcation whose location should be considered as arbitrary, unless one the three clades below this node is the outgroup. If the tree is rooted, this is the root). This attribute is a :class:`.Node` instance which can be modified, but it cannot be replaced. """ return self._base
[docs] def add_node(self, parent, label=None, brlen=None): """ Add a node to the tree. :param parent: one of the nodes of this instance, as a :class:`.Node` instance. :param label: node label which will be internal node label or leaf name according to the final structure of the tree. The new node has initially no children and is therefore a leaf until it is itself connected to a child (if ever). :param brlen: length of the branche connecting *parent* to the new node. :return: The new node as a :class:`.Node` reference. """ if parent not in self._nodes: raise ValueError('cannot add node to a node that does not belong to this tree') child = parent._add_child(label, brlen) self._leaves.append(child) self._nodes.append(child) if parent in self._leaves: self._leaves.remove(parent)
[docs] def iter_leaves(self): """ Return an iterator over the leaves as :class:`.Node` instances. """ for node in self._leaves: yield node
[docs] def get_leaf(self, label): """ Get a terminal node. Return the node (as a :class:`.Node` instance) that has the requested leaf label. If several nodes have this label, returns the first one. If no nodes have this label, returns ``None``. """ for leaf in self._leaves: if leaf._label == label: return leaf else: return None
[docs] def depth_iter(self, start=None): """ Return a depth-first iterator. Iterate over the :class:`~.Node` instances of the trees, starting from the base but, then, following a depth-first order. :param start: start point of the iteration, as a :class:`.Node` instance of this tree. By default, start from the base of the tree. """ if start is None: start = self._base elif start not in self._nodes: raise ValueError('node does not belong to this tree') return _tree_depth_first_iterator(self, start)
[docs] def breadth_iter(self, start=None): """ Return a breadth-first iterator. Iterate over the :class:`~.Node` instances of the trees, starting from the base but, then, following a breadth-first order. :param start: start point of the iteration, as a :class:`.Node` instance of this tree. By default, start from the base of the tree. """ if start is None: start = self._base elif start not in self._nodes: raise ValueError('node does not belong to this tree') return _tree_breath_first_iterator(self, start)
[docs] def total_length(self): """ Compute the sum of branch lengths. All branch lengths must be defined (non-``None``), otherwise a :exc:`ValueError` will be raised. """ L = 0 for node in self._nodes: for child in node.children(): l = node.branch_to(child) if l == None: raise ValueError('cannot compute tree\'s length: at least one branch has no defined length') L += l return L
[docs] def find_clade(self, names, ancestral=False, both_sides=False): """ Check whether a group is one of the clades defined by the tree. The leaf names must be provided as an iterable (most logically, a :class:`set`). Leaf names are normally :class:`str` instances. All leaves must be present in the tree. If the*ancestral* is ``False``, search for the clade that contains the provided list of names as descendant. There must not be any other name amongst its descendants. If the tree is unrooted and oriented in such a way that the a base lies within the requested clade, it will not be detected. It is possible (if *both_sides* is ``True``) to allow searching for the complement of the clade, thereby detecting the right clade even if it is at the base of the tree. By default, if this situation occurs, the clade will not be detected. If the option *ancestral* is ``True``, search for the most recent common ancestor of all leaves specified in *names*. Use of this method necessarily supposes that the tree is rooted (however, there is no requirement regarding its shape such as bifurcation at the base) and it is not allowed to *both_sides* is ``True``. With this option, it is not possible to have a ``None`` return value (since it is required that all leaves are present in the tree, in the worse case the base of the tree is returned). :param name: a :class:`set` (or compatible) specifying the requested leaves (as node labels, normally :class:`str` instances). :param ancestral: whether to look for the most recent common ancestral clade containing requested leaves (by default, look for the clade containing the exact same list of leaves). :param both_sides: only allowed if *ancestral* is ``False``. Look for both the requested list of leaves and its complement, allowing to detect a clade even if it is spanning the base of the tree. :return: The :class:`.Node` instance, if it exists, which has the exact same list of descendants than *taxa*. If no such clade is found, ``None``. .. warning:: This method assumes that all leaf names of the tree are unique, as well as the list of names provided as argument. If this condition is not fulfilled, the right clade might not be found even if it exists. .. versionchanged:: 3.0.0 Replaces previous methods :meth:`!findGroup`, :meth:`!findMonophyleticGroup`, :meth:`!smallest_group` and :meth:`!smallest_monophyleticGroup` with a modification of the underlying algorithm. """ query = set(names) # capture errors or trivial cases if ancestral == True and both_sides == True: raise ValueError('cannot combine `ancestral` and `both_sides` options') if len(names) == 0: raise ValueError('cannot find clade: empty list of names') leaves_set = set([leaf._label for leaf in self._leaves]) if query < leaves_set: pass elif query == leaves_set: return self._base # clade is the tree itself else: raise ValueError('cannot find clade: names are not all present in the tree') if both_sides: compl = leaves_set - query # find any leaf belonging to query for node in self._leaves: if node._label in query: query.discard(node._label) break # at least one of the names should be found else: raise RuntimeError('unexpected case in `Tree.find_clade()`: please report is as a bug') # climbs up the tree collecting other leaves from the query while node != None: # the descendant of the current node must have been all processed # they are all found to be part of the query (and removed from it) # if query completed, it is the right node if len(query) == 0: return node # there should be a parent because if node is the root, then we should have collected all queried leaves if node.parent == None: raise RuntimeError('unexpected case in `Tree.find_clade()`: please report is as a bug') # collect descendants of the parent (except those already processed through the current node) siblings = node.siblings() # move one level up node = node.parent for sib in siblings: for label in sib.leaves_down(): # discard leaves that are in query if label in query: query.discard(label) # if non-query leaf found else: # if we are looking for the most recent common ancestor, it is fine if ancestral == True: pass # if the current node is the base, the clade might be spanning the base # try the complement (if user allowed it) elif node.parent == None and both_sides == True: return self.find_clade(names=compl, both_sides=False) # otherwise there is no such clade in this tree else: return None # in principle, we never reach this point if the query raise RuntimeError('unexpected case in `Tree.find_clade()`: please report is as a bug')
[docs] def collapse(self, node, ignore_len=False, ignore_label=False): # note: this API method is also used by remove_node() with explicit option values """ Collapse a branch of the tree. :param node: :class:`~.Node` representing the branch to remove. :param ignore_len: don't try to transfer branch lengths to children. :param ignore_label: don't transfer label of the destroyed node to its parent. *node* represents the branch that must be removed from the tree (this node is destroyed in the process). It must be one of the nodes contained in the tree (as a :class:`~.Node` instance), but not the base of the tree. It cannot be an terminal node (leaf). If *ignore_label* is not set to ``True``, the label of the destroyed node is transferred to the parent based on the following procedure: (1) if the destroyed node's label is ``None``, nothing is done; (2) if the destroyed node's parent's label is ``None``, the destroyed node's label is copied to its parent as is; (3) otherwise both labels are converted to strings (if they are not yet) and concatenated as in the string ``a;b`` (where ``a`` is the parent's label and ``b`` is the destroyed node's label), even if the two labels are identical. If *ignore_len* is not set to ``True``, and if the length of the removed branch (branch from the specified node to its parent) is specified, it will be spread equally among the branches to its children (see example below). This requires that the branch length to all children are specified. If the removed branch has no specified length, nothing is done. Collapsing node ``[4]`` on the following tree:: /------------------------------------------->[1] | | /----------------------------->[3] | | |----------->[2] /-------------->[5] | | | | \------------>[4] [0] | | \-------------->[6] | | /---------------------------->[8] | | \------------>[7] /------------->[10] | | \----------->[9] | \------------->[11] will generate the following tree, with the correction of edge lengths as depicted:: /------------------------------------------->[1] | | /----------------------------->[3] | | |----------->[2] | | | |-------------------->[5] L5 = L5+L4/2 [0] | | \-------------------->[6] L6 = L6+L4/2 | | /---------------------------->[8] | | \------------>[7] /------------->[10] | | \----------->[9] | \------------->[11] Although the total edge length of the tree is not modified, the relationships will be altered: the distance between the descendants of the collapsed node (nodes 5 and 6 in the example above) will be artificially increased. """ # sanity checks if node not in self._nodes != self: raise ValueError('cannot collapse a node that does not belong to this tree') if node == self._base: raise ValueError('cannot collapse the base of the tree') if node.num_children == 0: raise ValueError('cannot collapse a terminal node') parent = node.parent children = list(node.children()) # process branch lengths p_len = node.parent_branch s_len = [node.branch_to(child) for child in children] if not ignore_len and p_len != None: if None in s_len: raise ValueError('cannot collapse node: branches to children are required') s_len = [length + p_len/node.num_children for length in s_len] # disconnects the node to be collapsed and remove it node._remove_parent() for child in children: node._remove_child(child) self._nodes.remove(node) # connects the sons their grandparent for child, length in zip(children, s_len): parent._connect(child, length) # saves the label information if not ignore_label: plab = parent._label nlab = node._label if nlab != None: if plab == None: parent.label = nlab else: parent.label = '%s;%s' %(plab, nlab)
[docs] def root(self, outgroup, branch_split=0.5, reoriente=False): """ Root or reoriente the tree. By default, a new node is created to represent the root and is placed on the branch leading to the provided outgroup node (the second argument determines where the new node is placed on this branch). Otherwise, the tree is reoriented such as its base is placed at the location of the provided outgroup. In the former case, its ends with a bifurcation at the root; in the latter case, a trifurcation. It is illegal to call this method on trees that are already rooted (have a difurcation at the root). :param outgroup: :class:`~.Node` instance contained in this tree. It can be a leaf or any internal node, but not the current base of the tree (unless *reoriente* is ``True``: in that case, it *might* be the base of the tree [it will not change anything] and it *cannot* be a leaf). :param branch_split: where to cut the branch leading to the outgroup. :param reoriente: don't create any root node *branch_site* is therefore not considered) and only place the node provided as *outgroup* at the base of the tree, thereby merely changing the representation of the tree. The information below describes the case where *reoriente* is ``False`` (proper rooting). If the branch to the provided outgroup doesn't have a branch length, the *branch_split* argument is ignored. Otherwise, *branch_split* must be a real number between 0 and 1 and give the proportion of the branch that must be allocated to the basal branch leading to the outgroup, the complement being allocated to the branch leading to the rest of the tree. If *branch_split* is either 0 or 1, one of the branch will have a length of 0, but it will exist anyway. If the original tree has this structure:: /------------------------------------------->[1] | | /----------------------------->[3] | | |----------->[2] /-------------->[5] | | | | \------------>[4] [0] | | \-------------->[6] | | /---------------------------->[8] | | \------------>[7] /------------->[10] | | \---[ROOT]-->[9] | \------------->[11] And rooting is requested at node ``[9]``, the root will be placed on the branch marked by ``[ROOT]``. The outcome will be as depicted below, with the introduction of a new node (marked ``[ROOT]``) and the reorientation of the tree to place it at the base:: /-------------------------[1] | /-----[0] /------------------[3] | | | | \-----[2] /----------[5] | | | /---[E2]---[7] \------[4] | | | | | \----------[6] | | [ROOT] \--------------------------------[8] | | /---------------------[10] | | \--------[E1]--------[9] | \---------------------[11] In this example, the relationship between nodes ``[7]`` and ``[0]`` (the previous base of the tree) is reverted. The label of node ``[7]`` is automatically transferred to node ``[0]``. This is consistent with the idea that internal node labels describe a property of the branch. The original label of the base, if it exists, is discarded. Since the branch between ``[7]`` and ``[9]`` is cut in two, the original label of node ``[9]`` is copied to node ``[7]``, leaving them both with the same label. However, if the outgroup is a terminal node, the label is not copied and the other basal branch is left without label. Let :math:`L` be the length of the branch from ``[7]`` and ``[9]`` in the original tree, and :math:`r` the value of the parameter *branch_split*. The length of the branch ``[E1]`` will be set to :math:`rL`, and the branch ``[E2]`` to :math:`(1-r)L`. Overall, the length of the tree will not be modified. In the case that *reoriente* is ``True``, the final tree is rather:: /------------------------------------------->[10] | |------------------------------------------->[11] | [9] /----------------------------------->[8] | | \------>[7] /-------------------------->[1] | | \------>[0] /------------------>[3] | | \----->[2] /-------->[5] | | \------->[4] | \-------->[6] The topology of the tree is the same as the initial one, except that the base is now ``[9]``. The lengths of all branches are conserved. However, node labels between the old and the new base are reverted: the node label of the new base (``[9]`` in the example) is affected to the next node (``[7]`` in the example) and so on until the old base (``[0]`` in the example), whose label, if it exists, is discarded. """ # sanity checking if outgroup not in self._nodes: raise ValueError('cannot root tree: outgroup node is not part of the tree') if outgroup == self._base and reoriente == False: raise ValueError('cannot root tree: outgroup node is the current base of the tree') if outgroup.num_children == 0 and reoriente == True: raise ValueError('cannot reoriente tree: outgroup node is a terminal node') if self._base.num_children == 2: raise ValueError('cannot root tree: tree is already rooted') if self._base.num_children < 2: raise ValueError('cannot root tree: tree has an invalid structure') L = outgroup.parent_branch if not reoriente and L != None and (branch_split < 0.0 or branch_split > 1.0): raise ValueError('cannot root tree: invalid `branch_split` value') # collect all nodes on the path from the new to the old root path = [] labels = [] # collect also labels brlens = [] # and branch lengths if reoriente: cur = outgroup else: cur = outgroup.parent while cur != None: path.append(cur) labels.append(cur._label) # last one will be discarded brlens.append(cur._parent_branch) # last one is None cur = cur.parent # path[0] = node after new root (or new base if reoriente) # path[...] = intermediate nodes # path[-1] = base of the tree if not reoriente: # remove the branch where the root is placed outgroup._remove_parent() # create root and add it to nodes root = Node() self._nodes.append(root) self._base = root else: self._base = outgroup # disconnect all nodes on the way from new base to old base (old base not included) for node in path[:-1]: node._remove_parent() # reconnect in the other order for i in range(len(path)-1): path[i]._connect(path[i+1], brlens[i]) # note that brlens[-1] is discarded / is None anyway path[i+1]._label = labels[i] # labels[-1] is discarded and path[0]._label unset yet if not reoriente: # compute length of basal branches if L != None: L1 = L * branch_split L2 = L * (1.0 - branch_split) else: L1 = None L2 = None # connect the new root to the two subtrees root._connect(outgroup, L1) root._connect(path[0], L2) if not reoriente: # copy the root branch label to the new branch if outgroup.num_children > 0: path[0]._label = outgroup._label else: path[0]._label = None else: outgroup._label = None
[docs] def map_descendants(self): """ Map all leaves of the trees to internal nodes. Generate and return a :class:`dict` which gives, for all internal nodes of the trees (excluding the base), the list of terminal nodes that are ultimately connected when reading the tree away from its base. """ mapping = {} queue = {} # add the parent of all leaves to the queue for leaf in self._leaves: if leaf._parent not in queue: queue[leaf._parent] = [] queue[leaf._parent].append([leaf._label]) # process queue'd node and their parent until exhaustion while len(queue): to_del = set() to_add = {} for node in queue: items = queue[node] # this checks if the node has been fully processed if len(items) == len(node._children): # if the node is the base of the tree, just trash the labels if node._parent != None: # otherwise, send the data to the mapping... mapping[node] = functools.reduce(list.__add__, items) # ... and propagate labels to higher level if node._parent in queue: queue[node._parent].append(mapping[node]) elif node._parent in to_add: to_add[node._parent].append(mapping[node]) else: to_add[node._parent] = [mapping[node]] # remove the node from the queue to_del.add(node) for node in to_del: del queue[node] queue.update(to_add) for node in mapping: mapping[node] = tuple(mapping[node]) return mapping
[docs] def frequency_nodes(self, trees, relative=False): """ Label nodes based on their number of occurrences in a list of trees. Each node receives an integer as label counting the number of trees where the same node exists among the trees provided as argument. It is required that all leaf labels are unique. :param trees: an iterable containing :class:`.Tree` instances with exactly the same set of leaf labels. :param relative: node frequencies are expressed as fractions. The use of this option requires that at least one tree is provided. .. note:: With the exception of the base of the tree (which is ignored by this function) and leaf labels, all previously set labels are erased. """ # prepare a set for safety checking leaves = [node._label for node in self._leaves] n = len(leaves) leaves = set(leaves) if len(leaves) != n: raise ValueError('cannot compute node frequencies: not all leaf labels are unique') # get own list of nodes and reverse relationships nodes = dict([(clade, node) for (node, clade) in self.map_descendants().items()]) # initialize counter dict counts = dict.fromkeys([nodes[i] for i in nodes], 0) # process all provided trees n = 0 for tree in trees: n += 1 # check consistency if set([node._label for node in tree._leaves]) != leaves: raise ValueError('cannot compute node frequencies: the list of leaves differs between trees') # count if any set clade of provided tree matches with a clade of the reference tree desc = tree.map_descendants() for i in desc: if desc[i] in nodes: counts[nodes[desc[i]]] += 1 # compute relative frequencies if relative: if n == 0: raise ValueError('cannot compute relative node frequencies: no trees provided') for node in counts: counts[node] = counts[node] / n # annotate nodes for node, count in counts.items(): node._label = count
[docs] def clean_internal_labels(self): """ Remove all internal node labels. This included the base of the tree. In practice, they are set to ``None``. """ for node in self._nodes: if len(node._children) > 0: node._label = None
[docs] def clean_branch_lengths(self): """ Remove all branch lengths. In practice, they are set to ``None``. """ for node in self._nodes: node._parent_branch = None # also the base for child in node._children: node._child_branches[child] = None
[docs] def remove_node(self, node, keep_parent=False): """ Remove a node from the tree, as well as all its descendants. Since this operation may create a node with a single child, this method may remove the parent or the brother of the removed node depending on the structure of the tree, unless specified otherwise (see *keep_parent*). :param node: the node to remove, as a :class:`.Node` instance belowing to the current tree. Terminal nodes can be removed, but not the base of the tree. :param keep_parent: don't remove the parent of the removed node if it is left with only one child. If the parent is the base of the tree, keep the other descendant if it is not terminal (see below). Assume we remove node ``[3]`` from the tree with this structure:: /---------------------------->[2] | /----------->[1] /--------------->[4] | | | | \---------->[3] | | [0] \--------------->[5] | | /--------------------------->[7] | | \------------>[6] /------------->[9] | | \----------->[8] | \------------>[10] Then, we would end up with the following tree:: /----------->[1]--------------------------->[2] | | [0] | /--------------------------->[7] | | \------------>[6] /------------->[9] | | \----------->[8] | \------------>[10] The default behaviour is then to remove node ``[1]`` (and delete its label if it exists) and to set the length of the branch from ``[0]`` to ``[2]`` to the sum of the ``[0]`` to ``[1]`` and ``[1]`` to ``[2]``. But, with *keep_parent* is ``True``, the tree is left as is. There is a special case with the base of the tree. Assume that we remove node ``[1]`` from the original tree above. We then would have a non-standard structure with a single child to the base of the tree:: /--------------------------->[7] | [0]----------->[6] /------------->[9] | | \----------->[8] | \------------>[10] In that case, the base is not removed, but node ``[6]`` is removed using the :meth:`~.Tree.collapse` method (using option *ignore_len* set to ``False`` but *ignore_label* set to ``True`` since the base is not supposed to bear a label). We end up with the following structure:: /--------------------------------->[7] | [0] /------------->[9] | | \----------------->[8] | \------------>[10] The length of the branch from ``[0]`` to ``[6]`` is spread equally between the branch from ``[0]`` to ``[7]`` and the branch from ``[0]`` to ``[8]`` (and so on if there are actually more than one descendants). If *keep_parent* is ``True`` or if ``[6]`` is a terminal node, it is not removed. """ # sanity check if node not in self._nodes: raise ValueError('cannot delete a node that is not part of the tree') if node == self._base: raise ValueError('cannot delete the base of the tree') # record parent parent = node._parent # disconnect the subtree node._remove_parent() # remove nodes of the subtree for node in Tree._gather_nodes(node): node._unlink() self._nodes.remove(node) if node in self._leaves: self._leaves.remove(node) # collapse the ascendant/brother if needed if not keep_parent: if parent.num_children == 1: if parent == self._base: brother = parent._children[0] if brother.num_children != 0: self.collapse(brother, ignore_len=False, ignore_label=True) else: self.collapse(parent, ignore_len=False, ignore_label=True)
@staticmethod def _gather_nodes(node): # collect all nodes from a subtree: return list of this node and # all its descendants nodes = [node] for child in node._children: nodes.extend(Tree._gather_nodes(child)) return nodes
[docs] def lateralize(self, reverse=False): """ Flush bigger clades to one side of the tree. Modify the order of children of all nodes of the trees in such a way that they are sorted from the smallest to the largest number of descending leaves. :param reverse: sort from in the more-descendants to less-descendants order instead. """ # get node mapping (only store number of descendants) nodes = self.map_descendants() for node in nodes: nodes[node] = len(nodes[node]) # add leaves for leaf in self._leaves: nodes[leaf] = 1 # sort all nodes of the tree for node in self._nodes: numdict = dict(zip(node._children, [nodes[child] for child in node._children])) node._children.sort(key=numdict.get, reverse=reverse)
[docs] def midroot(self): """ Automatic midpoint rooting of the tree. The tree must be initially unrooted (trifurcation at the root). This method identifies the most distant pair of terminal nodes (in case of a draw, one pair is picked randomly) and the root of the tree (as a new node) placed at the middle point of this path. """ # sanity check if len(self._nodes) < 3: raise ValueError('cannot perform automatic rooting: not enough branches in tree') if self._base.num_children < 3: raise ValueError('cannot perform automatic rooting: tree must have a trifurcation at the root') # collect the path to root for all leaves paths = [] for node in self._leaves: path = [] while node.parent != None: L = node.parent_branch if L == None: raise ValueError('cannot perform automatic rooting: all branch lengths must be specified') path.append((node, L)) node = node.parent paths.append(path) # find shortest path(s) best = [] best_dist = - sys.float_info.max for i in range(len(paths)): for j in range(i+1, len(paths)): part1 = list(paths[i]) # deep copies to allow editing part2 = list(paths[j]) while part1[-1] == part2[-1]: # the first of each at least is different del part1[-1] del part2[-1] path = [(node, L, False) for (node, L) in part1] + [(node, L, True) for (node, L) in part2[::-1]] d = sum([L for (node, L, reverse) in path]) if (d - best_dist) > -0.000000001: if (d - best_dist) > +0.000000001: best_dist = d best = [] best.append(path) # if more than several best, pick one randomly if len(best) > 1: best = best[random.irand(len(best))] else: best = best[0] # find midpoint lim = best_dist / 2 acc = 0.0 for node, L, reverse in best: acc += L if acc - lim > -0.000000001: outgroup = node if reverse: offset = min(1.0, (acc - lim) / L) else: offset = max(0.0, (acc - lim) / L) break else: raise RuntimeError('midpoint rooting: cannot find root location (please report bug)') # root itself self.root(outgroup, offset)
[docs] def unroot(self, reverse=False): """ Remove the root. The tree must be initially rooted (bifurcation at the root). This method removes the root node and places the base of the tree at one of the two basal nodes (the nodes that are ancestral to the two basal groups). This method does not change to total length of the tree. And error is raised if only one of the two basal branches has a length. If the initial basal node has a label, it is lost. If the node that becomes the base has a label, it is left there (it will appear a the base of the tree). :param reverse: if ``True``, place the base of the tree at the second basal node (by default, the first basal node is used). """ if self._base.num_children != 2: raise ValueError('cannot unroot tree: tree is not rooted') node1 = self._base.child(0) node2 = self._base.child(1) L1 = node1.parent_branch L2 = node2.parent_branch if L1 == None and L2 == None: L = None elif L1 != None and L2 != None: L = L1 + L2 else: raise ValueError('cannot unroot tree: only one of the two basal branches has a length') self._base._remove_child(node1) self._base._remove_child(node2) self._nodes.remove(self._base) if reverse: node2._connect(node1, L) self._base = node2 else: node1._connect(node2, L) self._base = node1
class _tree_breath_first_iterator(object): def __init__(self, tree, start): self._this_level = [start] self._next_level = list(tree._base._children) # must be a deep copy def __iter__(self): return self def __next__(self): if len(self._this_level) == 0: if len(self._next_level) == 0: raise StopIteration else: self._this_level = self._next_level self._next_level = functools.reduce(list.__add__, [node._children for node in self._this_level]) node = self._this_level.pop(0) return node class _tree_depth_first_iterator(object): def __init__(self, tree, start): self._cur = start self._pile = [] def __iter__(self): return self def __next__(self): if self._cur == None: raise StopIteration node = self._cur children = list(node.children()) if len(children) == 0: if len(self._pile) == 0: self._cur = None else: self._cur = self._pile.pop() else: self._cur = children[0] self._pile.extend(children[1:]) return node