Source code for egglib.tools._reading_frame

"""
    Copyright 2008-2023 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

from .. import alphabets

[docs]class ReadingFrame(object): """ Handle reading frame positions. The reading frame positions can be loaded as constructor argument or using the method :meth:`~.ReadingFrame.process`. By default, build an instance with no exons. .. _reading-frame-constructor-arguments: :param frame: the reading frame specification must be a sequence of ``(start, stop[, codon_start])`` pairs or triplets where *start* and *stop* give the limits of an exon, such that ``sequence[start:stop]`` returns the exon sequence, and *codon_start*, if specified, can be: * 1 if the first position of the exon is the first position of a codon (e.g. ``ATG ATG``), * 2 if the first position of the segment is the second position of a codon (e.g. ``TG ATG``), * 3 if the first position of the segment is the third position a of codon (e.g. ``G ATG``), * ``None`` if the reading frame is continuing the previous exon. :param keep_truncated: if ``True``, codons that are truncated (either because the reading frame is 5', internally, or 3' partial, or if the number of bases is not a multiple of three) are not skipped (missing positions of truncated codons are replaced by ``None``). If *codon_start* of the first segment is ``None``, 1 will be assumed. If *codon_start* of any non-first segment is not ``None``, the reading frame is supposed to be interupted. This means that if any codon was not completed at the end of the previous exon, it will assumed to be assumed incomplete, and the first codon of the current codon will be incomplete as well. """ def __init__(self, frame=None, keep_truncated=False): if frame is None: frame = [] self.process(frame, keep_truncated)
[docs] def process(self, frame, keep_truncated=False): """ Reset instance with a new reading frame. All previously loaded data are discarded. The arguments of this method are identical to the :ref:`constructor <reading-frame-constructor-arguments>`. """ cur = -1 self._starts = [] self._stops = [] self._codon_starts = [] for exon in frame: if len(exon) == 2: start, stop = exon codon_start = None elif len(exon) == 3: start, stop, codon_start = exon if codon_start not in set([1, 2, 3, None]): raise ValueError('invalid value for `codon_start`: {0}'.format(codon_start)) if start < cur: raise ValueError('exon limits must in increasing order') if stop <= start: raise ValueError('exon limits must in increasing order') cur = stop self._starts.append(start) self._stops.append(stop) self._codon_starts.append(codon_start) self._num_exons = len(self._starts) if self._num_exons > 0: self._num_tot_bases = self._stops[-1] - self._starts[0] self._num_exon_bases = sum([stop-start for (start, stop) in zip(self._starts, self._stops)]) if self._codon_starts[0] is None: self._codon_starts[0] = 1 # important else: self._num_tot_bases = 0 self._num_exon_bases = 0 self._codons = [] self._bases = {} # keys are bases, values are [exon, codon] indexes pairs for idx, (start, stop, codon_start) in enumerate(zip(self._starts, self._stops, self._codon_starts)): if codon_start != None: # if codon_start is specified, complete the previous codon if len(self._codons): if keep_truncated: self._codons[-1] += [None] * (3 - len(self._codons[-1])) else: if len(self._codons[-1]) != 3: del self._codons[-1] # add non codon with shift if needed if keep_truncated: if codon_start == 1: self._codons.append([]) elif codon_start == 2: self._codons.append([None]) elif codon_start == 3: self._codons.append([None, None]) else: raise ValueError('invalid value for codon start') else: self._codons.append([]) if codon_start == 1: pass elif codon_start == 2: self._bases[start] = [idx, None] # avoid side effect of deleting bases due to shift self._bases[start+1] = [idx, None] start += 2 elif codon_start == 3: self._bases[start] = [idx, None] start += 1 else: raise ValueError('invalid value for codon start') # loop over bases, adding codons when the current is filled # it is guaranteed that there will always be one codon (because 1st exon is in frame 1) for i in range(start, stop): if len(self._codons[-1]) == 3: self._codons.append([]) self._codons[-1].append(i) self._bases[i] = [idx, None] # complete last codon if needed if len(self._codons) > 0: if keep_truncated: self._codons[-1] += [None] * (3 - len(self._codons[-1])) else: if len(self._codons[-1]) != 3: del self._codons[-1] # final processing self._codons = tuple(map(tuple, self._codons)) for idx, codon in enumerate(self._codons): for i in codon: if i is not None: if i not in self._bases or self._bases[i][1] is not None: raise RuntimeError('error in ReadingFrame code') self._bases[i][1] = idx if self._num_exons > 0: self._needed_bases = self._stops[-1] else: self._needed_bases = 0
@property def num_needed_bases(self): """ Number of bases needed to apply this reading frame. Minimum size of a sequence to be consistent with this reading frame. In practice, the value equals to the end of the last exon or zero if there are no exons. If the reading frame is used with a shorter sequence, it can lead to errors. """ return self._needed_bases @property def num_tot_bases(self): """ Number of bases of the reading frame. All introns and exons are included, starting from the start of the first exon up to end of the last one. """ return self._num_tot_bases @property def num_exon_bases(self): """ Number of bases in exons. """ return self._num_exon_bases @property def num_exons(self): """ Number of exons. """ return self._num_exons @property def num_codons(self): """ Number of codons. Truncated codons are included. """ return len(self._codons)
[docs] def exon_index(self, base): """ Find the exon in which a given base falls. :param base: any base index. :return: The index of the corresponding exon, or ``None`` if the base does not fall in any exon. """ if base in self._bases: return self._bases[base][0] else: return None
[docs] def codon_index(self, base): """ Find the codon in which a given base falls. :param base: any base index. :return: The index of the corresponding codon, or ``None`` if the base does not fall in any codon. """ if base in self._bases: return self._bases[base][1] else: return None
[docs] def codon_position(self, base): """ Tell the position of a base within the codon in which it falls. :param base: any base index. :return: The index of the base in the codon (0, 1 or 3), or ``None`` if the base does not fall in any codon. """ if base in self._bases and self._bases[base][1] is not None: return self._codons[self._bases[base][1]].index(base) else: return None
[docs] def codon_bases(self, codon): """ Give the position of the three bases of a given codon. One or two positions (but never the middle one alone) will be ``None`` if the codon is truncated (beginning/end of an exon without coverage of the previous/next one). If the codon index is out of range, return ``None``. :param codon: any codon index. :return: A tuple with the three base positions (containing potentially one or two ``None``) or ``None`` if the codon index is out of range. """ try: return self._codons[codon] except IndexError: return None
[docs] def iter_exon_bounds(self): """ Iterate over exons. This iterator returns ``(start, stop)`` tuples of the positions of the limits of each exon. """ for start, stop in zip(self._starts, self._stops): yield start, stop
[docs] def iter_codons(self): """ Iterate over codons. This iterator returns ``(first, second, third)`` tuples of the positions of the three bases of each codon. Positions might be ``None`` if *keep_truncated* has been set to ``True``. """ for codon in self._codons: yield codon