Source code for egglib.stats._ehh

"""
    Copyright 2015-2023 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

from .. import eggwrapper as _eggwrapper
from .. import _interface
from .. import _site

[docs]class EHH(object): """ Extended Haplotype Homozygosity statistics. Statistics can be computed for unphased genotypic data. The usage of this class is to: to first set the core haplotypes by passing a :class:`.Site` to :meth:`.set_core` and then to load repetitively distant sites (always with increasing distance from the core), through :meth:`.load_distant`, until the list of sites to process is exhausted or one of the thresholds has been reached. In order to process distant sites using the same core region to the opposite direction, or to use a different core region, it is always required to call :meth:`!set_core` again (this is the only way to reset the instance). After at least one distant site site is loaded, EHH statistics can be accessed using there accessors. EHH statistics fell into four categories: 1. Raw EHH statistics, provided for each loaded distant sites and separately for each core haplotype. See the methods: :meth:`.get_EHH`, :meth:`.get_EHHc`, and :meth:`.get_rEHH`. .. admonition:: Reference Sabeti P.C., D.E. Reich, J.M. Higgins, H.Z.P. Levine, D.J. Richter, S.F. Schaffner, S.B. Gabriel, J.V. Platko, N.J. Patterson, G.J. McDonald, H.C. Ackerman, S.J. Campbell, D. Altshuler, R. Cooper, D. Kwiatkowski, R. Ward & E.S. Lander. 2002. Detecting recent positive selection in the human genome from haplotype structure. *Nature*\ \xa0\ **419**\ : 832-837. 2. Integrated EHH statistics, computed separately for each core haplotype and incremented at each provided sites until the EHH value reaches the thresholds provided as the *EHH_thr* and *EHHc_thr* arguments to :meth:`!set_core`. See the methods :meth:`.get_iHH`, :meth:`.get_iHHc`, and :meth:`.get_iHS`. The methods :meth:`.done_EHH` and :meth:`.done_EHHc` allow to check whether the threshold has been reached for all genotypes. .. admonition:: Reference Voight B.F., S. Kudaravalli, X. Wen & J.K. Pritchard. 2006. A map of recent positive selection in the human genome. *PLoS Biol*\ \xa0\ **4**\ : e772. 3. Whole-site EHHS statistic and its integrated version iES, which is incremented as long that the EHHS value is larger than or equal to the threshold provided as the *EHHS_thr* argument to :meth:`!set_core`. See the methods :meth:`.get_EHHS` and :meth:`.get_iES`. If data are unphased, a specific EHHS estimate based on homozygosity is provided (EHHG, with its iEG integrated counter-part). .. admonition:: Reference Tang K., K.R. Thornton & M. Stoneking. 2007. A new approach for using genome scans to detect recent positive selection in the human genome. *PLoS Biol.*\ \xa0\ **5**\ : e171. 4. EHH, EHHS, and EHHG decay statistics, that give the minimal distance at which, respectively, EHH starts to be smaller than the threshold provided as the *EHH_thr* argument to :meth:`!set_core`, EHHS starts to be smaller than the threshold provided as the *EHHS_thr* argument to :meth:`!set_core`, and EHHG starts to be smaller than the threshold provided as the *EHHG_thr* argument to :meth:`!set_core`. EHH decay is computed separately for each core haplotype. See the methods :meth:`.get_dEHH`, :meth:`.get_dEHHS`, and :meth:`.get_dEHHG`. The values are not available until the respective threshold is reached (``None`` is returned). The method :meth:`.done_EHH` allows to check whether the threshold for the EHH decay has been reached for all core haplotypes. The maximum and average value of the EHH decay across all core haplotypes can be accessed with :meth:`.get_dEHH_max` and :meth:`.get_dEHH_mean`, respectively. .. admonition:: Reference Ram\u00EDrez-Soriano A., S.E. Ramos-Onsins, J. Rozas, F. Calafell & A. Navarro. 2008. Statistical power analysis of neutrality tests under demographic expansions, contractions and bottlenecks with recombination. *Genetics*\ \xa0\ **179** \: 555-567. In all cases, ``None`` is returned when the value is not available (no data loaded, division by zero in the case of ratios, or threshold not reached in the case of decay statistics). The thresholds must all be within the range between 0 and 1. """ def __init__(self): self._obj = _eggwrapper.EHH() self._struct = _eggwrapper.StructureHolder() self._valid = False self._distance = 0.0 self._K_core = None self._K_tot = None self._K_cur = None self._R = None
[docs] def set_core(self, site, struct=None, min_freq=None, min_sam=2, EHH_thr=None, EHHc_thr=None, EHHS_thr=None, EHHG_thr=None, crop_EHHS=False): """ Specify the core site. If the instance already contains data, it will be reset. :param site: a :class:`.Site` instance. The site must have a specified :py:obj:`~.Site.position`. It is assumed that data are phased (samples should be phased; but it is possible to load genotypes whose phase is unknown, as long as the genotypes are phased at the individual level: see *indiv_struct*). :param struct: This argument must be set to a non-``None`` value if genotypic data are provided and if unphased versions of statistics should be computed. If not ``None``, this argument should either be a :class:`.Structure` object mapping samples to individuals (other structure levels are not considered), or an integer specifying the ploidy (in the latter case, samples of any individuals are supposed to be consecutive in the site. It is still required that individuals are phased (that is, that they are entered in the same order in all distant sites). The integer 0 is accepted as a synonym of ``None`` (data will be treated as haploid). :param min_freq: minimal absolute frequency for haplotypes (haplotypes with lower frequencies are ignored). By default, all haplotypes are considered. :param min_sam: minimal absolute number of non-missing samples to compute statistics or stop incrementing them (for statistics expressed per core haplotype, number of samples for this haplotype). :param EHH_thr: threshold determining when iHH should stop integrating and dEHH must be evaluated. By default (``None``), iHH is permanently incremented and dEHH is not evaluated at all. :param EHHc_thr: threshold determining when iHH should stop integrating and dEHH must be evaluated. By default (if ``None``), use the same value as for *EHH_thr*. :param EHHS_thr: threshold determining when iES should stop integrating and dEHHS must be evaluated. By default (``None``), iES is permanently incremented and dEHHS is not evaluated at all. :param EHHG_thr: threshold determining when iEG (iES computed for unphased data) should stop integrating and dEHHG (equivalent to dEHHS) must be evaluated. Must be ``None`` if *genotypes* is ``True``. By default (``None``), iEG is permanently incremented and dEHHG is not evaluated at all. By default: equal to EHHS_thr. :param crop_EHHS: if True, set values of EHHS that are below the threshold to 0 to emulate the behaviour of the R package rehh (also affects iES). """ # check validity of instance if not isinstance(site, _site.Site): raise TypeError('a Site instance is expected') if min_freq is None: min_freq = 1 elif min_freq < 1: raise ValueError('minimal frequency must be strictly larger than 0') if min_sam < 2: raise ValueError('minimal number of non-missing samples must be at least 2') if site.position is None: raise ValueError('core site must have a position') # load core site and get core statistics if EHH_thr is None: EHH_thr = 0.0 elif EHH_thr < 0.0 or EHH_thr > 1.0: raise ValueError('invalid value for `EHH_thr`') if EHHc_thr is None: EHHc_thr = EHH_thr elif EHHc_thr < 0.0 or EHHc_thr > 1.0: raise ValueError('invalid value for `EHHc_thr`') if EHHS_thr is None: EHHS_thr = 0.0 elif EHHS_thr < 0.0 or EHHS_thr > 1.0: raise ValueError('invalid value for `EHHS_thr`') if EHHG_thr is None: EHHG_thr = EHHS_thr elif EHHG_thr < 0.0 or EHHG_thr > 1.0: raise ValueError('invalid value for `EHHG_thr`') if struct == 0: struct = None elif struct is None: pass elif isinstance(struct, _interface.Structure): struct = struct._obj if struct.get_req() > site._obj.get_ns(): raise ValueError('structure does not match site') elif isinstance(struct, int): ns = site.ns pl = struct if ns % pl != 0: raise ValueError('number of samples is not a multiple of the specified ploidy') self._struct.mk_dummy_structure(ns//pl, pl) struct = self._struct else: raise TypeError('invalid type for `struct` argument') self._obj.set_core(site._obj, struct, EHH_thr, EHHc_thr, EHHS_thr, EHHG_thr, min_freq, min_sam, crop_EHHS) self._K_core = self._obj.K_core() if self._K_core == 0: raise ValueError('cannot analyse core site: no valid data') # initialize flags self._valid = True self._ns = site._obj.get_ns()
[docs] def load_distant(self, site): """ Process a distant site. The core site must have been specified. The site must have a specified :py:obj:`~.Site.position`. :param site: a :class:`.Site` instance containing data for the distant site. It must be consistent with the core site (same list of samples, and in the same order). Missing data are supported. """ # check validity of argument if not self._valid: raise ValueError('no core site has been loaded') if site._obj.get_ns() != self._ns: raise ValueError('distant site must be consistent with core') if site.position is None: raise ValueError('distant site must have a position') # pass site self._obj.load_distant(site._obj)
@property def num_haplotypes(self): """ Number of core haplotypes. ``None`` if core has not been set. """ return self._K_core @property def cur_haplotypes(self): """ Current number of haplotypes. ``None`` if core has not been set, equal to :obj:`.num_haplotypes` if no distant site has been loaded. """ return self._obj.K_cur() @property def nsam(self): """ Current number of non-missing samples. """ if not self._valid: raise ValueError('cannot access statistics (no core site provided)') return self._obj.num_avail_tot()
[docs] def nsam_core(self, hap): """ Number of samples for a core haplotype. Number of non-missing samples for one of the core haplotypes. """ if not self._valid: raise ValueError('no data processed') if hap < 0 or hap >= self._K_core: raise IndexError('invalid haplotype index') return self._obj.num_avail_core(hap)
[docs] def nsam_hap(self, hap): """ Number of samples for a haplotype. Number of non-missing samples for one of the current haplotypes. """ if not self._valid: raise ValueError('no data processed') if hap < 0 or hap >= self._obj.K_cur(): raise IndexError('invalid haplotype index') return self._obj.num_avail_cur(hap)
[docs] def get_EHH(self, i): """ Get an EHH value. Get the EHH value for the last processed distant site for core haplotype *i*. Return ``None`` if the value cannot be computed (no available samples). """ if not self._valid: raise ValueError('no data processed') if i<0 or i>=self._K_core: raise IndexError('invalid haplotype index') value = self._obj.EHHi(i) if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_EHHc(self, i): """ Get an EHHc value. Get the EHHc value for the last processed distant site for core haplotype *i*. Return ``None`` if the value cannot be computed (no available samples). """ if not self._valid: raise ValueError('no data processed') if i<0 or i>=self._K_core: raise IndexError('invalid haplotype index') value = self._obj.EHHc(i) if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_rEHH(self, i): """ Get a rEHH value. Get the rEHH value for the last processed distant site for core haplotype *i*. Return ``None`` if the ratio cannot be computed (no available sample or division by zero). """ if not self._valid: raise ValueError('no data processed') if i<0 or i>=self._K_core: raise IndexError('invalid haplotype index') value = self._obj.rEHH(i) if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_iHH(self, i): """ Get an iHH value. Get the iHH value for the last processed distant site for core haplotype *i* . Return ``None`` if the value cannot be computed (no available samples). """ if not self._valid: raise ValueError('no data processed') if i<0 or i>=self._K_core: raise IndexError('invalid haplotype index') value = self._obj.iHH(i) if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_iHHc(self, i): """ Get an iHHc value. Get the iHHc value for the last processed distant site for core haplotype *i*. Return ``None`` if the value cannot be computed (no available samples). """ if not self._valid: raise ValueError('no data processed') if i<0 or i>=self._K_core: raise IndexError('invalid haplotype index') value = self._obj.iHHc(i) if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_iHS(self, i): """ Get an iHS value. Get the iHS value for the last processed distant site for core haplotype *i*. Return ``None`` if the ratio cannot be computed (no available sample or division by zero). """ if not self._valid: raise ValueError('no data processed') if i<0 or i>=self._K_core: raise IndexError('invalid haplotype index') value = self._obj.iHS(i) if value == _eggwrapper.UNDEF: return None else: return value
[docs] def done_EHH(self): """Completed EHH calculation. Return ``True`` if the values of iHH for all core haplotypes have finished integrating (and all dEHH values have been evaluated). """ if not self._valid: raise ValueError('no data processed') return self._obj.num_EHH_done() == self._K_core
[docs] def done_EHHc(self): """ Completed EHHc calculation. Return ``True`` if the values of iHHc for all core haplotypes have finished integrating (and all dEHH values have been evaluated). """ if not self._valid: raise ValueError('no data processed') return self._obj.num_EHHc_done() == self._K_core
[docs] def get_EHHS(self): """ Get EHHS value. Get the EHHS value for the last processed distant site. Return ``None`` if the value cannot be computed (no available samples). """ if not self._valid: raise ValueError('no data processed') value = self._obj.EHHS() if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_EHHG(self): """ Get EHHG value. Get the EHHS (computed with genotypes) value for the last processed distant site. Return ``None`` if the value cannot be computed (no available samples, no indiv_struct option used). """ if not self._valid: raise ValueError('no data processed') value = self._obj.EHHG() if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_iES(self): """ Get iES value. Get the iES value for the last processed distant site. Return ``None`` if the value cannot be computed (no available samples). """ if not self._valid: raise ValueError('no data processed') value = self._obj.iES() if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_iEG(self): """ Get iEG value. Get the iES (computed with genotypes) value for the last processed distant site. Return ``None`` if the value cannot be computed (no available samples or indiv_struct option was not used). """ if not self._valid: raise ValueError('no data processed') value = self._obj.iEG() if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_dEHH(self, i): """ EHH decay. Get the EHH decay distance for core haplotype *i*. Return ``None`` if the EHH threshold has not been reached. """ if not self._valid: raise ValueError('no data processed') if i<0 or i>=self._K_core: raise IndexError('invalid haplotype index') value = self._obj.dEHH(i) if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_dEHHc(self, i): """ EHHc decay. Get the EHHc decay distance for core haplotype *i*. Return ``None`` if the EHHc threshold has not been reached. """ if not self._valid: raise ValueError('no data processed') if i<0 or i>=self._K_core: raise IndexError('invalid haplotype index') value = self._obj.dEHHc(i) if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_dEHH_max(self): """ Maximum EHH decay. Get the maximum EHH decay distance across core haplotypes. Return ``None`` if the EHH threshold has not been reached for at least one of the core haplotypes. """ if not self._valid: raise ValueError('no data processed') value = self._obj.dEHH_max() if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_dEHH_mean(self): """ Average EHH decay. Get the average EHH decay distance across core haplotypes. Return ``None`` if the EHH threshold has not been reached for at least one of the core haplotypes. """ if not self._valid: raise ValueError('no data processed') value = self._obj.dEHH_mean() if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_dEHHS(self): """ EHHS decay. Get the EHHS decay distance. Return ``None`` if the EHHS threshold has not been reached. """ if not self._valid: raise ValueError('no data processed') value = self._obj.dEHHS() if value == _eggwrapper.UNDEF: return None else: return value
[docs] def get_dEHHG(self): """ EHHG decay. Get the EHHS (computed with genotypes) decay distance. Return ``None`` if the EHHG threshold has not been reached. """ if not self._valid: raise ValueError('no data processed') value = self._obj.dEHHG() if value == _eggwrapper.UNDEF: return None else: return value