Source code for egglib.stats._coding_diversity

"""
    Copyright 2015-2021 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

from .. import eggwrapper as _eggwrapper
from .. import _interface, alphabets
from ..tools import _code_tools
from .. import _site

[docs]class CodingDiversity(object): """ Detection of synonymous and non-synonymous sites. This class processes alignments with a reading frame specification in order to detect synonymous and non-synonymous variable positions. It provides basic statistics, but it can also filter data to let the user compute all other statistics on synonymous-only, or non-synonymous-only variation (e.g. :math:`\pi` or ``D``). The constructor takes optional arguments. By default, build an empty instance. If arguments are passed, they must match the signature of :meth:`~.CodingDiversity.process` that will be called. Note that codons with missing data are never considered, even if the resolution of all possibilities consistent translate to the same nucleotide. This applies to stop and start codons. Alternative start codons are not considered either. """ def _get_site(self): if len(self._site_pool) > 0: return self._site_pool.pop(0) else: return _site.Site(alphabets.codons) def _defaults(self): self._num_tot = 0 self._num_eff = 0 self._num_NS = 0.0 self._num_S = 0.0 self._num_pol_single = 0 self._num_multiple_hits = 0 self._num_multiple_alleles = 0 self._num_pol_NS = 0 self._num_pol_S = 0 self._num_stop = 0 def __init__(self, *args, **kwargs): # internal processing helpers self._site_pool = [] self._sites_S = [] self._sites_NS = [] self._alleles_S = [] self._alleles_NS = [] self._positions_S = [] self._positions_NS = [] # run process if len(args) + len(kwargs) > 0: self.process(*args, **kwargs) # set internal variables to default values else: self._defaults()
[docs] def process(self, align, code=1, struct=None, max_missing=0.0, skipstop=True, raise_stop=False, multiple_alleles=False, multiple_hits=False): """ Process an alignment. It this instance already had data in memory, they will all be erased. :param align: an :class:`.Align` instance containing the coding sequence to process. The alphabet must be codons. :param code: genetic code identifier (see :ref:`here <genetic-codes>`). Required to be an integer among the valid values. The default value is the standard genetic code. :param struct: a class:`Structure` object specifying samples to include and outgroup samples to skip for analysis (outgroup samples are ignored to determine if site is polymorphic, coding, or non-coding, but included in generated sites). By default, use all samples. :param max_missing: maximum relative proportion of missing data (per codon site) to allow (including stop codons if *skipstop* if ``true``). By default, all codon sites with any missing data are excluded. Outgroup is not considered. .. note:: Here, *max_missing* is a relative proportion to allow using the same value for different alignments that might not have the same number of samples (to avoid reevaluating *max_missing* if the user wants the same maximum rate of missing data). In other functions, *max_missing* is the maximum *number* of missing and is an integer. :param skipstop: if ``True``, stop codons are treated as missing data and skipped. If so, potential mutations to stop codons are not taken into account when estimating the number of non-synonymous sites. Warning (this may be counter-intuitive): it actually assumes that stop codons are not biologically plausible and considers them as missing data. On the other hand, if *skipstop* is ``False``, it considers stop codons as if they were valid amino acids. This option has no effect if *raise_stop* is ``True``. :param raise_stop: raise a :exc:`ValueError` if a stop codon is met. If ``True``, *skipstop* has no effect. Outgroup is not considered. :param multiple_alleles: include coding sites with more than two alleles (regardless of whether mutations hit the same position within the triplet). If there are more than two different codons, they must either encode for the same amino acid or all encode for different amino acids (otherwise the site is rejected). If more than one of the three codon position are mutated, the option *multiple_hits* is considered. :param multiple_hits: include coding sites for which more than one of the three positions has changed (regardless of the number of alleles). If there are more than two alleles, the option *multiple_alleles* is also considered. """ if not isinstance(align, _interface.Align): raise TypeError('an Align instance is required') if align._alphabet._obj.get_type() != 'codons': raise ValueError('alphabet must be codons') if code not in _code_tools._codes: raise ValueError('unknown genetic code: {0}'.format(code)) code = _code_tools._codes[code] if struct is None: struct = _eggwrapper.StructureHolder() struct.mk_dummy_structure(align.ns, 1) else: struct = struct._obj # if raise_stop is on: force skipstop to be false to make them appear if raise_stop and skipstop: skipstop = False # return all CodingSite's to stock self._site_pool.extend(self._sites_S) self._site_pool.extend(self._sites_NS) del self._sites_S[:] del self._sites_NS[:] del self._alleles_S[:] del self._alleles_NS[:] del self._positions_S[:] del self._positions_NS[:] # get the first coding site current = self._get_site() # initialize variables self._num_tot = align.ls self._num_eff = 0 self._num_stop = 0 self._num_NS = 0.0 self._num_S = 0.0 self._num_pol_single = 0 self._num_multiple_hits = 0 self._num_multiple_alleles = 0 self._num_pol_NS = 0 self._num_pol_S = 0 ns = struct.get_ni() max_missing = int(max_missing * ns) # process all codon sites for idx in range(self._num_tot): # process site (don't care about missing data) current._obj.reset() current._obj.process_align(align._obj, idx, struct) # WE ASSUME HERE THAT INGROUP SAMPLES ARE PACKED AT THE FRONT OF THE SITE # count the number of stop codon and missing data nstop = 0 nmiss = 0 for i in range(ns): if current._obj.get_sample(i) < 0: nmiss += 1 elif code.is_stop_unsmart(current._obj.get_sample(i)): if raise_stop: raise ValueError('stop codon found in sequences') nstop += 1 if skipstop: nmiss += nstop # check stop codon if nstop > 0: self._num_stop += 1 # incremented even if skipstop is False # skip if too many missing data (but still count stop codon) if nmiss > max_missing: continue good = True # check number of alleles (exclude if no alleles) alleles = set() for i in range(ns): a = current._obj.get_sample(i) if a >= 0 and (not skipstop or not code.is_stop_unsmart(a)): alleles.add(a) na = len(alleles) if na < 1: continue if na == 2: self._num_pol_single += 1 # decremented below if the two alleles differ at more than one position if na > 2: self._num_multiple_alleles += 1 good &= multiple_alleles # check for multiple hits na1, na2, na3 = map(len, map(set, zip(* map(alphabets.codons.get_value, alleles)))) if (na1 > 1) + (na2 > 1) + (na3 > 1) > 1: self._num_multiple_hits += 1 good &= multiple_hits if na == 2: self._num_pol_single -= 1 # exclude site if too multiple alleles/hits if not good: continue # process alleles if na > 1: aas = set(map(code.translate, alleles)) if len(aas) == 1: SYN = True elif len(aas) == na: SYN = False elif len(aas) < na: continue # skip because ambiguous syn/non-syn else: raise RuntimeError('unexpected error in CodingDiversity') if SYN: self._num_pol_S += 1 self._sites_S.append(current) self._alleles_S.append(alleles) self._positions_S.append(idx) else: self._num_pol_NS += 1 self._sites_NS.append(current) self._alleles_NS.append(alleles) self._positions_NS.append(idx) # count site as exploitable self._num_eff += 1 num_NS = 0.0 num_S = 0.0 c = 0 for i in range(ns): codon = current._obj.get_sample(i) if codon >= 0 and (not skipstop or not code.is_stop_unsmart(current._obj.get_sample(i))): num_NS += code.NSsites(codon, skipstop) num_S += code.Ssites(codon, skipstop) c += 1 if c > 0: self._num_NS += num_NS / c self._num_S += num_S / c if na > 1: # generate a new `current` site current = self._get_site()
@property def num_codons_tot(self): """ Total number of considered codon sites. Only complete codons have been considered, but this value includes codons that have been rejected because of missing data. """ return self._num_tot @property def num_codons_eff(self): """ Number of analysed codon sites. Like :attr:`.num_codons_tot` but excluding sites rejected because of missing data. """ return self._num_eff @property def num_codons_stop(self): """ Number of codon sites with at least one codon stop. """ return self._num_stop @property def num_sites_NS(self): """ Estimated number of non-synonymous sites. Note that the total number of sites per codon is always 3. The numbers of non-synonymous and synonymous sites are estimated using the method of Nei & Gojobori (*Mol. Biol. Evol.* 1986 **3**:418-426). """ return self._num_NS @property def num_sites_S(self): """ Estimated number of synonymous sites. Note that the total number of sites per codon is always 3. """ return self._num_S @property def num_pol_single(self): """ Number of polymorphic coding sites with only one mutation. All these sites are always included. """ return self._num_pol_single @property def num_multiple_alleles(self): """ Number of polymorphic coding sites with more than two alleles. These sites are included only if *multiple_alleles* is ``True`` except those who mix synonymous and non-synonymous changes (they can be rejected if there are more than two alleles in total as well). """ return self._num_multiple_alleles @property def num_multiple_hits(self): """ Number of polymorphic codons for which more than one position is changed. These sites are included only if *multiple_hits* is ``True`` and depdenting on the total number of alleles. """ return self._num_multiple_hits @property def num_pol_NS(self): """ Number of polymorphic codon sites with only non-synonymous variation. """ return self._num_pol_NS @property def num_pol_S(self): """ Number of polymorphic codon sites with only synonymous variation. """ return self._num_pol_S @property def sites_S(self): """ List of sites with only synonymous variation """ return self._sites_S @property def positions_S(self): """ List of positions of sites with only synonymous variation """ return [site.position for site in self._sites_S] @property def sites_NS(self): """ List of sites with only non-synonymous variation """ return self._sites_NS @property def positions_NS(self): """ List of positions of sites with only non-synonymous variation """ return [site.position for site in self._sites_NS]