Source code for egglib._freq

"""
    Copyright 2016-2023 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

import numbers
from . import eggwrapper as _eggwrapper
from . import alphabets, _interface

[docs]def freq_from_site(site, struct=None): """ Compute allele frequencies from a site. :param site: a :class:`.Site` instance. :param struct: this argument can be: * A :class:`.Structure` instance , allowing to select a subset of samples and/or define the structure. * A :class:`list` (or compatible) of at least one integer providing the sample size (as numbers of samples) of all populations, assuming the individuals are organized in the corresponding order (all samples of a given population grouped together). Samples are grouped in haploid individuals. * ``None`` (no structure, all samples placed in haploid individuals and in a single population). :return: A new :class:`.Freq` instance. """ obj = Freq() obj.from_site(site, struct) return obj
[docs]def freq_from_list(ingroup, outgroup, geno_list=None, alphabet=None): """ Import allele frequencies. :param ingroup: a list of genotype or allele frequencies (based on the value of *geno_list*). There must be one item for each allele. If there is a single list, a unique population is assumed. To specify multiple populations, the user must pass a list of lists (each inner list containing the same number of items representing allele frequencies). To specify multiple clusters, the user must pass a list of lists of lists of allele frequencies. There may also be single-item lists when necessary. The frequencies must be null or positive integers. The number of frequencies per population is required to be constant for all populations (corresponding to the number of alleles or genotypes). If *geno_list* is defined, data must be the frequencies of the provided genotypes, in the same order. Otherwise, data must be allelic frequencies, in the order of increasing allele index (in the latter case, data will be loaded as haploid). :param outgroup: a list of allele/genotype frequencies for the outgroup. The number of alleles or genotypes is also required to match. If ``None``, no outgroup samples (equivalent to a list of zeros). :param geno_list: list of genotypes. Genotypes must be provided as tuples or lists. Their length is equal to the ploidy and is required to be at least one and constant for all genotypes. For haploid data, it is still required to passed allelic values as one-item lists (with a ploidy of one). Order of alleles within genotypes is significant. If ``None``, data are loaded as haploid alleles and the index of alleles is taken as allelic value. :param alphabet: alphabet to be used to describe alleles. Required if *geno_list* is used. By default, use a :return: A new :class:`.Freq` instance. .. note:: It is required that there is at least one cluster and one population. """ obj = Freq() obj.from_list(ingroup, outgroup, geno_list, alphabet) return obj
[docs]def freq_from_vcf(vcf): """ Import allelic frequencies from a VCF parser. The VCF parser must have processed a variant and the variant is required to have frequency data available as the AC format field along with the AN field. An exception is raised otherwise. This function only imports haploid allele frequencies in the ingroup (without structure). The first allele is the reference, by construction, then all the alternate alleles in the order in which they are provided in the VCF file. :param vcf: a :class:`.VcfParser` instance containing data. There must at least one sample and the AN/AC format fields must be available. It is not required to extract variant data manually. """ obj = Freq() obj.from_vcf(vcf) return obj
[docs]class Freq(object): """ Hold allelic and genotypic frequencies for a single site. Instances can be created using the three functions :func:`.freq_from_site`, :func:`.freq_from_list`, and :func:`.freq_from_vcf`, or using the default constructor. After it is created by any way, instances can be re-used (which is faster), using their methods :meth:`~.Freq.from_site`, :meth:`~.Freq.from_list`, and :meth:`~.Freq.from_vcf`. .. _compartments: :cvar ingroup: whole ingroup. :cvar outgroup: outgroup. :cvar cluster: a specific cluster (identified by its index). :cvar population: a specific population (identified by its index). The variables above specify what subset of data should be considered. You should never try to modify them. For clusters and populations, an index should be provided as well. They can be used as:: >>> freq.freq_allele(allele_index, cpt=egglib.Freq.ingroup) >>> freq.freq_allele(allele_index, cpt=egglib.Freq.population, idx=pop_index) or:: >>> freq.freq_allele(allele_index, cpt=freq.ingroup) >>> freq.freq_allele(allele_index, cpt=freq.population, idx=pop_index) """ ingroup = 0 outgroup = 1 cluster = 2 population = 3 def __init__(self): self._obj = _eggwrapper.FreqHolder() self._alphabet = None
[docs] def from_site(self, site, struct=None): """ Import frequencies from a site. Reset the instance as if it was created using :func:`.freq_from_site`. Arguments are identical to this function. """ if struct is None: struct = _eggwrapper.StructureHolder() struct.mk_dummy_structure(site._obj.get_ns(), 1) elif isinstance(struct, _interface.Structure): struct = struct._obj if struct.get_req() > site._obj.get_ns(): raise ValueError('invalid structure (sample index out of range)') else: if len(struct) < 1: raise ValueError('there must be at least one population size') if sum(struct) != site._obj.get_ns(): raise ValueError('invalid structure (sample size is required to match)') struct = _eggwrapper.StructureHolder() struct.mk_dummy_structure(struct[0], 1) for i in struct[1:]: struct.dummy_add_pop(i) self._obj.setup_structure(struct) self._obj.process_site(site._obj) self._alphabet = site._alphabet
[docs] def from_list(self, ingroup, outgroup, geno_list=None, alphabet=None): """ Import frequencies from lists. Reset the instance as if it was created using :func:`.freq_from_list`. Arguments are identical to this function. """ # reformat ingroup argument if all([isinstance(i, numbers.Integral) for i in ingroup]): ingroup = [[ingroup]] elif all([isinstance(j, numbers.Integral) for i in ingroup for j in i]): ingroup = [ingroup] else: if not all([isinstance(k, numbers.Integral) for i in ingroup for j in i for k in j]): raise ValueError('invalid form for the `ingroup` argument') # get structure properties nc = len(ingroup) ns = [sum(pop) for clu in ingroup for pop in clu] np = len(ns) ni = sum(ns) # get number of genotypes ng = set(map(len, [j for i in ingroup for j in i])) if outgroup is not None: ng.add(len(outgroup)) if len(ng) != 1: raise ValueError('number of frequencies must be the same for all populations and outgroup') ng = ng.pop() if ng < 1: raise ValueError('there must be at least one allele') if outgroup is None: outgroup = [0] * ng no = sum(outgroup) if geno_list is None: pl = 1 geno_list = [[i,] for i in range(ng)] if alphabet is None: self._alphabet = alphabets.positive_infinite else: self._alphabet = alphabet try: [self._alphabet.get_value(i) for i in range(ng)] except ValueError: raise ValueError('alphabet does not have enough values') else: if ng != len(geno_list): raise ValueError('invalid number of genotypes') pl = set(map(len, geno_list)) if len(pl) != 1: raise ValueError('ploidy is not constant in genotypes') pl = pl.pop() if pl < 1: raise ValueError('ploidy must be at least 1') geno_set = set() for g in geno_list: if g in geno_set: raise ValueError('genotype {0} is repeated'.format(g)) geno_set.add(g) if alphabet is None: raise ValueError('alphabet must be specified') self._alphabet = alphabet # list with allele index (not code!) instead of allele value from geno_list geno_indexes = [[None for j in i] for i in geno_list] # setup the instance self._obj.setup_raw(nc, np, pl) idx = 0 for i, clu in enumerate(ingroup): for j, pop in enumerate(clu): self._obj.setup_pop(idx, i, j, sum(pop)) idx += 1 self._obj.set_ngeno(ng) for i, g in enumerate(geno_list): for j, v in enumerate(g): v = self._alphabet.get_code(v) if v < 0: raise ValueError('missing data found in genotypes') self._obj.set_genotype_item(i, j, v) geno_indexes[i][j] = self._obj.find_allele(v) # load data pop_idx = 0 ing_frq = self._obj.frq_ingroup() for clu_idx, clu_data in enumerate(ingroup): clu_frq = self._obj.frq_cluster(clu_idx) for pop_data in clu_data: pop_frq = self._obj.frq_population(pop_idx) for i, n in enumerate(pop_data): ing_frq.incr_genotype(i, n) clu_frq.incr_genotype(i, n) pop_frq.incr_genotype(i, n) for j in geno_indexes[i]: ing_frq.incr_allele(j, n) clu_frq.incr_allele(j, n) pop_frq.incr_allele(j, n) pop_idx += 1 otg_frq = self._obj.frq_outgroup() for i, n in enumerate(outgroup): otg_frq.incr_genotype(i, n) for j in geno_indexes[i]: otg_frq.incr_allele(j, n) # load heterozygote genotypes if pl > 1: for i, g in enumerate(geno_indexes): g = set(g) if len(g) > 1: for j in g: self._obj.frq_ingroup().tell_het(i, j) self._obj.frq_outgroup().tell_het(i, j) for k in range(self._obj.num_clusters()): self._obj.frq_cluster(k).tell_het(i, j) for k in range(self._obj.num_populations()): self._obj.frq_population(k).tell_het(i, j)
[docs] def from_vcf(self, vcf): """ Import frequencies from a VCF file. Reset the instance as if it was created using :func:`.freq_from_vcf`. Argument is identical to this function. """ if vcf._parser.has_data() == False: raise ValueError('data must have been read from VCF parser') if vcf._parser.has_AC() == False or vcf._parser.has_AN() == False: raise ValueError('VCF data must have AC and AN info fields') self._obj.process_vcf(vcf._parser) self._alphabet = alphabets.positive_infinite
@property def ploidy(self): """ Ploidy. """ return self._obj.ploidy() @property def num_alleles(self): """ Number of alleles in the whole site. """ return self._obj.num_alleles() @property def num_genotypes(self): """ Number of genotypes in the whole site. """ return self._obj.num_genotypes() @property def num_clusters(self): """ Number of clusters. """ return self._obj.num_clusters() @property def num_populations(self): """ Number of populations. """ return self._obj.num_populations()
[docs] def allele(self, idx): """Get an allele.""" if idx<0 or idx>=self._obj.num_alleles(): raise IndexError('invalid allele index') return self._alphabet.get_value(self._obj.allele(idx))
[docs] def genotype(self, idx): """ Get a genotype, as a tuple of alleles. """ if idx<0 or idx>=self._obj.num_genotypes(): raise IndexError('invalid genotype index') return tuple(self._alphabet.get_value(self._obj.genotype_item(idx, i)) for i in range(self._obj.ploidy()))
def _getter(self, cpt, idx): if cpt == self.ingroup: return self._obj.frq_ingroup() elif cpt == self.outgroup: return self._obj.frq_outgroup() elif cpt == self.cluster: if idx is None: raise ValueError('cluster index is required') if idx<0 or idx>= self._obj.num_clusters(): raise IndexError('invalid cluster index') return self._obj.frq_cluster(idx) elif cpt == self.population: if idx is None: raise ValueError('population index is required') if idx<0 or idx>= self._obj.num_populations(): raise IndexError('invalid population index') return self._obj.frq_population(idx) else: raise ValueError('invalid compartment identifier')
[docs] def freq_allele(self, allele, cpt=ingroup, idx=None): """ Get the frequency of an allele. :param allele: allele index. :param cpt: :ref:`compartment <compartments>` identifier. :param idx: compartment index (required for clusters and populations, ignored otherwise). """ if allele<0 or allele>=self._obj.num_alleles(): raise IndexError('invalid allele index') return self._getter(cpt, idx).frq_all(allele)
[docs] def freq_genotype(self, genotype, cpt=ingroup, idx=None): """ Get the frequency of an genotype. :param genotype: genotype index. :param cpt: :ref:`compartment <compartments>` identifier. :param idx: compartment index (required for clusters and populations, ignored otherwise). """ if genotype<0 or genotype>=self._obj.num_genotypes(): raise IndexError('invalid genotype index') return self._getter(cpt, idx).frq_gen(genotype)
[docs] def nieff(self, cpt=ingroup, idx=None): """ Get the number of individuals within a given compartment. In the haploid case, this method is identical to :meth:`.nseff`. :param cpt: :ref:`compartment <compartments>` identifier. :param idx: compartment index (required for clusters and populations, ignored otherwise). """ return self._getter(cpt, idx).nieff()
[docs] def nseff(self, cpt=ingroup, idx=None): """ Get the number of samples within a given compartment. :param cpt: :ref:`compartment <compartments>` identifier. :param idx: compartment index (required for clusters and populations, ignored otherwise). """ return self._getter(cpt, idx).nseff()