Source code for egglib.tools._code_tools

"""
    Copyright 2008-2023 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

import re, itertools
from .. import eggwrapper as _eggwrapper
from .. import _interface, alphabets
from . import _reading_frame, _seq_manip

# preload all genetic codes
_codes = {}
for i in range(_eggwrapper.GeneticCode.num_codes()):
    code = _eggwrapper.GeneticCode(i)
    key = code.get_code()
    _codes[key] = code

[docs]class Translator(object): """ Class to translate nucleotide sequences to proteins. :param code: genetic code identifier (see :ref:`here <genetic-codes>`). Required to be an integer among the valid values. The default value is the standard genetic code. """ def _help_frame_processor(self, ls, frame): # take the frame argument and ls and generate self._frame and self._naa if frame is None: if ls == 0: self._frame = _reading_frame.ReadingFrame([]) elif ls % 3 != 0: raise ValueError('alignment length must be a multiple of 3') else: self._frame = _reading_frame.ReadingFrame([(0, ls)]) else: if frame.num_needed_bases > ls: raise ValueError('reading frame is extending past the end of the sequence') self._frame = frame self._naa = self._frame.num_codons def __init__(self, code=1): if code not in _codes: raise ValueError('unknown genetic code: {0}'.format(code)) self._code = _codes[code]
[docs] def translate_codon(self, codon): """ Translate a single codon. Translation is based on the genetic code defined at construction time. Alternative start codons are not supported. :param codon: codon as a three-character string :return: The one-letter amino acid code if the codon can be translated, or ``-`` if the codon is ``---``, or ``X`` in all other cases (including all cases with invalid nucleotides). """ return alphabets.protein.get_value( self._code.translate( alphabets.codons.get_code(codon)))
[docs] def translate_sequence(self, sequence, allow_alt=False): """ Translate a sequence. :param sequence: a :class:`str`, :class:`~.SequenceView` or compatible instance containing codon sequences. Only upper-case strings. :param frame: a :class:`~.ReadingFrame` instance providing the exon positions in the correct frame. By default, a non-segmented frame covering all sequences is assumed (in case the provided alignment is the coding region and length must be a multiple of 3). :param allow_alt: a boolean telling whether alternative start (initiation) codons should be considered. If ``False``, codons are translated as a methionine (M) if, and only if, there are among the alternative start codons for the considered genetic code and they appear at the first position for the sequence. :return: A new :class:`str` instance containing translated (protein) sequences. """ if isinstance(sequence, str): if len(sequence) % 3 != 0: raise ValueError('number of bases must be a multiple of 3') sequence = re.findall('...', sequence.upper()) if allow_alt: for i, codon in enumerate(sequence): if codon == '---': continue if self._code.start(alphabets.codons.get_code(codon)): start = i break if alphabets.codons.get_code(codon) < 0: start = None break else: start = None dest = [self._code.translate(alphabets.codons.get_code(codon)) for codon in sequence] if allow_alt and start is not None and dest[start] != 10: dest[start] = 10 return ''.join(map(alphabets.protein.get_value, dest))
[docs] def translate_align(self, src, allow_alt=False, in_place=False): """ Translate an :class:`.Align` instance. :param src: an :class:`!Align` containing codon sequences and using the :py:obj:`.alphabets.codons` alphabet. :param allow_alt: a boolean telling whether alternative start (initiation) codons should be considered. If ``True``, codons are translated as a methionine (M) if they are among the alternative start codons for the considered genetic code and they appear at the first position for the considered sequence (excluding all triplets of gap symbols appearing at the 5' end of the sequence). :param in_place: place translated sequences into the original :class:`!Align` instance (this discards original data and replaces the original alphabet by the protein alphabet). By default, returns a new instance. :return: By default, an original :class:`!Align` instance containing translated (protein) sequences. If *in_place* was ``True``, return ``None``. """ # check input if not isinstance(src, _interface.Align): raise TypeError('expect an Align instance') if src._alphabet._obj.get_type() != 'codons': raise ValueError('alphabet must be codons') ns = src.ns ls = src.ls # create destination if not in_place: dest = _interface.Align(nsam=ns, nsit=ls, alphabet=alphabets.protein) for i in range(ns): dest.set_name(i, src.get_name(i)) dest._obj.set_nlabels(i, src._obj.get_nlabels(i)) for j in range(src._obj.get_nlabels(i)): dest.set_label(i, j, src.get_label(i, j)) else: dest = src dest._alphabet = alphabets.protein # iterate over sequences for i in range(ns): if allow_alt: for j in range(ls): codon = src._obj.get_sample(i, j) if codon == -1165: continue if self._code.start(codon): start = j break if codon < 0: start = None break else: start = None for j in range(ls): dest._obj.set_sample(i, j, self._code.translate(src._obj.get_sample(i, j))) if allow_alt and start is not None and dest._obj.get_sample(i, start) != 10: dest._obj.set_sample(i, start, 10) # return if not in_place: return dest
[docs] def translate_container(self, src, allow_alt=False, in_place=False): """ Translate a :class:`.Container` instance. :param align: a :class:`!Container` containing codons sequences. :param allow_alt: a boolean telling whether alternative start (initiation) codons should be considered. If ``False``, codons are translated as a methionine (M) if, and only if, they are among the alternative start codons for the considered genetic code and they appear at the first position for the considered sequence. :param in_place: place translated sequences into the original :class:`!Container` instance (this discards original data). By default, return a new instance. :return: By default, a new :class:`!Container` instance containing translated (protein) sequences. If *in_place* was ``True``, return ``None``. """ # check input if not isinstance(src, _interface.Container): raise TypeError('expect a Container instance') if src._alphabet._obj.get_type() != 'codons': raise ValueError('alphabet must be codons') ns = src.ns # create destination if not in_place: dest = _interface.Container(alphabet=alphabets.protein) dest._obj.set_nsam(ns) dest._ns = ns for i in range(ns): dest.set_name(i, src.get_name(i)) dest._obj.set_nlabels(i, src._obj.get_nlabels(i)) for j in range(src._obj.get_nlabels(i)): dest.set_label(i, j, src.get_label(i, j)) else: dest = src dest._alphabet = alphabets.protein # iterate over sequences for i in range(ns): ls = src._obj.get_nsit_sample(i) if not in_place: dest._obj.set_nsit_sample(i, ls) if allow_alt: for j in range(ls): codon = src._obj.get_sample(i, j) if codon == -1165: continue if self._code.start(codon): start = j break if codon < 0: start = None break else: start = None for j in range(ls): dest._obj.set_sample(i, j, self._code.translate(src._obj.get_sample(i, j))) if allow_alt and start is not None and dest._obj.get_sample(i, start) != 10: dest._obj.set_sample(i, start, 10) # return if not in_place: return dest
[docs]def translate(seq, code=1, allow_alt=False, in_place=False): """ Translates coding nucleotide sequences to proteins. See the class :class:`.tools.Translator` for more details. This is a convenience method allowing to translate nucleotide sequences in a single call. For repetitive calls, direct use of :class:`!Translator` can be more efficient. :param seq: input codon sequences. Accepted types are :class:`.Align`, :class:`.Container`, :class:`.SequenceView` or :class:`str` (or also types compatible with :class:`str`). If the input object is an :class:`!Align` or a :class:`!Container`, the alphabet must be :py:obj:`.alphabets.codons`. :param code: genetic code identifier (see below). Required to be an integer among the valid values. The default value is the standard genetic code. :param allow_alt: a boolean telling whether alternative start (initiation) codons should be considered. If ``True``, codons are translated as a methionine (M) if, and only if, there are among the alternative start codons for the considered genetic code and they appear at the first position for the sequence. If *seq* is an :class:`!Align`, leading gaps are ignored as long as they appear as multiples of 3 (fully missing codons). :param in_place: place translated sequences in the provided :class:`!Align` or :class:`!Container` instance, overwriting initial data. Not allowed if *seq* is not of one of these two types. Replace the original alphabet by :py:obj:`.alphabets.protein`. By default, return a new instance. :return: Protein sequences as an :class:`!Align` or a :class:`!Container` if either of these types have been provided as *seq*, or as a string otherwise. If *in_place* has been specified, return ``None``. .. _genetic-codes: All genetic codes defined by the National Center for Biotechnology Information are supported and can be accessed using codes corresponding to the GenBank ``/trans_table`` qualifier. The codes are integers. .. include:: list_genetic_codes.txt Reference: National Center for Biotechnology Information [`<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>`_] """ T = Translator(code=code) if isinstance(seq, _interface.Align): return T.translate_align(seq, allow_alt=allow_alt, in_place=in_place) elif isinstance(seq, _interface.Container): return T.translate_container(seq, allow_alt=allow_alt, in_place=in_place) else: if in_place == True: raise ValueError('cannot translate sequence in place') return T.translate_sequence(seq, allow_alt=allow_alt)
[docs]class orf_iter: """ Iterate over open reading frames. Return an iterator over non-segmented open reading frames (ORFs) found in a provided codon sequence in any of the six possible frames. :param sequence: a string or a :class:`.SequenceView` representing a codon sequence. :param code: genetic code identifier (see :ref:`here <genetic-codes>`). Required to be an integer among the valid values. The default value is the standard genetic code. :param min_length: minimum length of returned ORFs. This value must be at least 1. It is understood as the length of the encoded peptide (since ORFs are returned as nucleotide sequences, they will have a length of least three times this value). :param forward_only: consider only the three forward frames (do not consider the reverse strand). :param force_start: if ``True``, all returned ORFs are required to start with a start codon. Otherwise, all non-stop codons are included in ORFs. Note that alternative start codons (``CTG`` and ``TTG`` for the standard genetic code) are also supported. :param allow_alt: allow alternative start codons (only considered if *force_start* is set). :param force_stop: require that ORFs end with a stop codon (this only excludes 3'-partial ORFs, that is ORFs that end with the end of the provided sequennce). :return: An iterator over all detected ORFs. Each ORF is represented by a ``(start, stop, length, frame)`` tuple where ``start`` is the start position of the ORF and ``stop`` the stop position (such as ``sequence[start:stop]`` returns the ORF sequence or its reverse complement [note that the stop codon, if present, is included]), ``length`` is the ORF length (number of amino acids, excluding any stop codon) and ``frame`` is the reading frame on which it was found: +1, +2, +3 are the frames on the forward strand (starting respectively at the first, second, and third base), and -1, -2, -3 are the frames on the reverse strand (starting respectively at the last, last but one, and last but two bases). """ def __init__(self, sequence, code=1, min_length=1, forward_only=False, force_start=True, allow_alt=False, force_stop=True): # get/check arguments if code not in _codes: raise ValueError('unknown genetic code: {0}'.format(code)) code = _codes[code] if min_length < 1: raise ValueError('value for `min_length` is too small') self._ls = len(sequence) self._sequence = sequence self._code = code self._min_length = min_length self._forward_only = forward_only self._force_start = force_start self._force_stop = force_stop self._frames = [None] * 6 # make a list of codon status (1:start/2:stop/3:valid/4:missing) self._codon_table = {} aa_stop = alphabets.protein.get_code('*') valid, missing = alphabets.codons.get_alleles() for cod in valid: c = alphabets.codons.get_code(cod) if cod == 'ATG' or (allow_alt and code.start(c)): self._codon_table[cod] = 1 elif code.translate(c) == aa_stop: self._codon_table[cod] = 2 else: self._codon_table[cod] = 3 for cod in missing: self._codon_table[cod] = 4 def __iter__(self): self._pos = 0 self._iter_f = itertools.cycle([1, 2, 3]) self._iter_r = itertools.cycle([-1, -3, -2]) if self._ls % 3 == 1: next(self._iter_r) next(self._iter_r) elif self._ls % 3 == 2: next(self._iter_r) self._cache = None return self def __next__(self): # return cache frame if any if self._cache: r = self._cache self._cache = None return r # proceed until an ORF (or the end) is found while True: # if reached end of sequence, process hanging ORFs and finish if self._pos > self._ls-3: return self.finishing() # ultimately raises StopIteration orf_f = None orf_r = None # process forward frame F = next(self._iter_f) cod = self._codon_table[self._sequence[self._pos:self._pos + 3]] if self._frames[F-1] is None: if (self._force_start == True and cod == 1) or (self._force_start == False and cod in [1, 3]): self._frames[F-1] = [self._pos, None, None, F] else: if cod == 2: self._frames[F-1][1] = self._pos + 3 self._frames[F-1][2] = (self._pos + 3 - self._frames[F-1][0]) // 3 - 1 # -1 because of stop if self._frames[F-1][2] >= self._min_length: orf_f = tuple(self._frames[F-1]) self._frames[F-1] = None # process reverse frame if not self._forward_only: R = next(self._iter_r) cod = self._codon_table[_seq_manip.rc(self._sequence[self._pos:self._pos + 3])] if self._frames[2-R] is None: if cod == 2: # start, stop, len, frame, final_stop, last_M, has_non_stop self._frames[2-R] = [self._pos, None, None, R, 1, None, False] elif cod != 4 and not self._force_stop: self._frames[2-R] = [self._pos, None, None, R, 0, self._pos + 3 if cod==1 else None, True] else: if self._force_start == True and cod == 1: self._frames[2-R][5] = self._pos + 3 self._frames[2-R][6] |= True elif cod == 2: if self._force_start: if self._frames[2-R][5] is not None: # there is an M self._frames[2-R][1] = self._frames[2-R][5] self._frames[2-R][2] = (self._frames[2-R][5] - self._frames[2-R][0]) // 3 - self._frames[2-R][4] if self._frames[2-R][2] >= self._min_length: orf_r = tuple(self._frames[2-R][:4]) elif self._frames[2-R][6]: self._frames[2-R][1] = self._pos self._frames[2-R][2] = (self._pos - self._frames[2-R][0]) // 3 - self._frames[2-R][4] if self._frames[2-R][2] >= self._min_length: orf_r = tuple(self._frames[2-R][:4]) self._frames[2-R] = [self._pos, None, None, R, 1, None, False] elif cod != 4: self._frames[2-R][6] |= True # return frame (return orf_f and cache orf_r if both) self._pos += 1 if orf_f and orf_r: self._cache = orf_r return orf_f if orf_f: return orf_f if orf_r: return orf_r def finishing(self): # if we reached the end, return any remaining still-running ORF (if allowed by options) # forward frames if not self._force_stop: for i in range(3): if self._frames[i] is not None: self._frames[i][1] = (self._ls - i) // 3 * 3 + i self._frames[i][2] = (self._frames[i][1] - self._frames[i][0]) // 3 r = self._frames[i] self._frames[i] = None if r[2] >= self._min_length: return tuple(r) # reverse frames for i in range(3): if self._frames[3+i] is not None: if self._force_start: if self._frames[3+i][5] is None: self._frames[3+i] = None continue self._frames[3+i][1] = self._frames[3+i][5] self._frames[3+i][2] = (self._frames[3+i][5] - self._frames[3+i][0]) // 3 - self._frames[3+i][4] elif self._frames[3+i][6]: self._frames[3+i][1] = self._ls - i self._frames[3+i][2] = (self._frames[3+i][1] - self._frames[3+i][0]) // 3 - self._frames[3+i][4] if self._frames[3+i][6]: r = self._frames[3+i][:4] self._frames[3+i] = None if r[2] >= self._min_length: return tuple(r) else: self._frames[3+i] = None raise StopIteration
[docs]def longest_orf(*args, **kwargs): """ longest_orf(sequence, code=1, min_length=1, forward_only=False, force_start=True, allow_alt=False, force_stop=True) Detect the longest open reading frame. Arguments are identical to :func:`.orf_iter`. A :exc:`ValueError` is raised if two or more open reading frames have the largest length. :return: A ``(start, stop, length, frame)`` tuple (see the return value of :func:`.orf_iter` for details), or ``None`` if no open reading frame fits the requirements. """ max_length = 0 candidate = None for start, stop, length, frame in orf_iter(*args, **kwargs): if length == max_length: candidate = None num += 1 elif length > max_length: max_length = length candidate = start, stop, length, frame num = 1 if max_length == 0: return None if num > 1: raise ValueError('several equally long ORFs found') return candidate
[docs]def backalign(nucl, aln, code=1, ignore_names=False, ignore_mismatches=False, fix_stop=False, allow_alt=True): """ Align coding sequence based on the protein alignment. :param nucl: a :class:`.Container` or :class:`.Align` instance containing coding sequences that should be aligned. There should not be any alignment gap in the sequences. The alphabet must be :class:`.alphabets.codons`. :param aln: an :class:`!Align` instance containing an alignment of the protein sequences encoded by the coding sequences provided as *nucl*. Group labels of *aln* are not taken into account. The alphabet must be :class:`.alphabets.protein`. :param code: genetic code identifier (see :ref:`here <genetic-codes>`). Required to be an integer among the valid values. The default value is the standard genetic code. :param ignore_names: if ``True``, ignore names for matching sequences in the protein alignment to coding sequences. Sequences will be matched using their rank and the names in the returned alignment will be taken from *nucl*. :param ignore_mismatches: if ``True``, do not generate any exception if a predicted protein does not match the provided protein sequence (if the lengths differ, an exception is always raised). :param fix_stop: if ``True``, support a single trailing stop codon in coding sequences not represented by a ``*`` in the provided protein alignment (if the final stop codons have been stripped during alignment). If found, this stop codon will be flushed as left as possible (immediately after the last non-gap character) in the returned coding alignment. :return: An :class:`.Align` instance containing aligned coding codon sequences. If a mismatch is detected between a protein from *aln* and the corresponding prediction from *nucl*, an instance of :exc:`.tools.BackalignError` (a subclass of :exc:`ValueError`) is raised. Its attribute :py:obj:`~.tools.BackalignError.alignment` can be used to help identify the reason of the error. Mismatches (but not differences of length) can be ignored with the option *ignore_mismatches*. """ # checking if not isinstance(aln, _interface.Align): raise TypeError('expect an Align instance') if aln._alphabet._obj.get_name() != 'protein': raise ValueError('alignment alphabet must be protein') if not isinstance(nucl, _interface.Container): raise TypeError('argument must be a Container instance') if nucl._alphabet._obj.get_type() != 'codons': raise ValueError('alignment alphabet must be codons') # get code translator = Translator(code=code) # initialize the return object ns = aln.ns ls = aln.ls result = _eggwrapper.DataHolder(True) result.set_nsam(ns) result.set_nsit_all(ls + int(fix_stop)) # set a trailing gape at the end of all sequences to allow 1 or more additional stop codons if fix_stop: for i in range(ns): result.set_sample(i, ls, -1165) last_used = False # set to True if at least one sequence uses the last three bases # get mapping of indexes nucl = nucl._obj aln = aln._obj if nucl.get_nsam() != ns: raise ValueError('numbers of sequences don\'t match') names = list(map(nucl.get_name, range(ns))) if not ignore_names: if len(set(names)) < ns: raise ValueError('duplicate name found') aln_names = list(map(aln.get_name, range(ns))) ranks = [] for name in names: try: ranks.append(aln_names.index(name)) except ValueError: 'sequence name not found in protein alignment: {0}'.format(name) else: ranks = list(range(ns)) # set names and group labels for idx_cds in range(ns): result.set_name(idx_cds, names[idx_cds]) result.set_nlabels(idx_cds, nucl.get_nlabels(idx_cds)) for k in range(nucl.get_nlabels(idx_cds)): result.set_label(idx_cds, k, nucl.get_label(idx_cds, k)) # main loop last_used = False for idx_cds, idx_aln in enumerate(ranks): ls_nucl = nucl.get_nsit_sample(idx_cds) if ls_nucl < 1: raise ValueError('empty nucleotide sequence') # process all aligned aa positions c = 0 for idx in range(ls): aa = aln.get_sample(idx_aln, idx) # process gaps if aa == -1: result.set_sample(idx_cds, idx, -1165) # non-gap else: codon = nucl.get_sample(idx_cds, c) c += 1 if (not ignore_mismatches and translator._code.translate(codon) != aa): # check mismatch raise BackalignError(names[idx_cds], nucl.get_sample, aln.get_sample, idx_cds, idx_aln, ls, ls_nucl, translator) result.set_sample(idx_cds, idx, codon) # set codon # if there is an additional codon at the end if c != ls_nucl: # if requested, check that it is a stop codon if fix_stop: codon = nucl.get_sample(idx_cds, c) c += 1 # fix stop codon (only if all bases have been read) if c == ls_nucl and translator._code.translate(codon) == 20: i = ls # there is necessarily room there, filled by gaps while i > 0 and result.get_sample(idx_cds, i-1) == -1165: i -= 1 result.set_sample(idx_cds, i, codon) if i == ls: last_used = True else: raise BackalignError(names[idx_cds], nucl.get_sample, aln.get_sample, idx_cds, idx_aln, ls, ls_nucl, translator) else: raise BackalignError(names[idx_cds], nucl.get_sample, aln.get_sample, idx_cds, idx_aln, ls, ls_nucl, translator) # if the last codon position has not been used at least once by fix_stop, remove them if fix_stop and not last_used: result.set_nsit_all(ls) # decrease by three # that's all, folks! return _interface.Align._create_from_data_holder(result, alphabets.codons)
[docs]class BackalignError(ValueError): """ Exception used to report errors occurring during the use of :func:`.tools.backalign` because of mismatches between the provided alignment and predicted proteins. """ def __init__(self, name, fnuc, faln, i_nuc, i_aln, ls_aln, ls_nuc, translator): self._name = name message = 'mismatch between provided and predicted proteins for `{0}`'.format(name) prov = ''.join(map(alphabets.protein.get_value, [faln(i_aln, i) for i in range(ls_aln)])).replace('-', '') pred = ''.join(map(alphabets.codons.get_value, [fnuc(i_nuc, i) for i in range(ls_nuc)])).replace('-', '') if len(pred) % 3 != 0: message = 'length of nucleotide sequence not a multiple of 3 for {0}'.format(name) pred = ''.join([translator.translate_codon(''.join(codon)) for codon in zip(pred[::3], pred[1::3], pred[2::3])]) n = max([len(prov), len(pred)]) mid = [] for i in range(n): if i >= len(prov): prov += '-' mid.append('~') elif i >= len(pred): pred += '-' mid.append('~') elif prov[i] != pred[i]: mid.append('#') else: mid.append('|') mid = ''.join(mid) c = 0 self._alignment = [] while True: A = prov[c:c+60] A = ' '.join([A[i*10:i*10+10] for i in range(6)]) B = mid[c:c+60] B = ' '.join([B[i*10:i*10+10] for i in range(6)]) C = pred[c:c+60] C = ' '.join([C[i*10:i*10+10] for i in range(6)]) self._alignment.append('[provided] {0}'.format(A) + '[{0}]\n'.format(c+60).rjust(7)) self._alignment.append(' {0}\n'.format(B)) self._alignment.append('[predicted] {0}'.format(C) + '[{0}]\n'.format(c+60).rjust(7)) c += 40 if c < n: self._alignment.append('\n') else: break self._alignment = ''.join(self._alignment) ValueError.__init__(self, message) @property def name(self): """ Name of sequence for which the error occurred. """ return self._name @property def alignment(self): """ Comparaison of provided and predicted proteins. This string shows the two proteins with a middle line composed of the following symbols: * ``|``: match. * ``#``: mismatch. * ``~``: one protein shorter. """ return self._alignment
[docs]def trailing_stops(align, action=0, code=1, replacement='???'): """ Process stop codons at the end of the sequences. This function detects and (optionally) fix trailing stop codons. The algorithm consists in locating the last non-gap codon of each sequence. If the last codon is partially gapped (which can happen if coding sequences are aligned using a DNA-based algorithm), it will not be detected and ignored. :param align: an :class:`.Align` containing aligned coding sequences. The alphabet must be :py:obj:`.alphabets.codons`. :param action: an integer specifying what should be done if a stop codon is found at the end of a given sequence. Possible actions are listed in the following table: +------+---------------------------------------------------+ | Code | Action + +======+===================================================+ | 0 | Nothing (just count them). + +------+---------------------------------------------------+ | 1 | Replace them by a gap, and delete the last + | | position if it is made by gaps only. + +------+---------------------------------------------------+ | 2 | Replace them by the value given as *replacement*. + +------+---------------------------------------------------+ Note that using ``action=1`` is not stricly equivalent to using ``action=2, replacement='---'`` because the former deletes the last three positions of the alignment if needed while the latter does not. :param code: genetic code identifier (see :ref:`here <genetic-codes>`). Required to be an integer among the valid values. The default value is the standard genetic code. :param replacement: if *action* is set to 2, provide the codon that should be used to replace stop codons. :return: The number of sequences that had a trailing stop codons among the considered sequences. """ # argument check if not isinstance(align, _interface.Align): raise TypeError('expect an Align instance') if align._alphabet.type != 'codons': raise ValueError('alignment alphabet must be codons') if code not in _codes: raise ValueError('unknown genetic code: {0}'.format(code)) code = _codes[code] ns = align.ns ls = align.ls try: replacement = alphabets.codons.get_code(replacement) except ValueError: raise ValueError('invalid codon: {0}'.format(replacement)) cnt = 0 num_gapped = 0 for i in range(ns): cur = ls - 1 for cur in range(ls-1, -1, -1): if align._obj.get_sample(i, cur) != -1165: if code.translate(align._obj.get_sample(i, cur)) == 20: cnt += 1 if action == 1: align._obj.set_sample(i, cur, -1165) num_gapped += 1 elif action == 2: align._obj.set_sample(i, cur, replacement) break # remove final 3 gaps (only if ALL samples where replaced by gaps, in particular only if include_outgroup was true) if action == 1 and num_gapped == ns: align._obj.set_nsit_all(ls-3) return cnt
[docs]def iter_stops(src, code=1): """ Iterate over stop codons. Return an iterator providing the coordinates of all stop codons over all sequences. Each iteration returns a ``(sample, position)`` where ``sample`` is the sample index and ``position`` is the index of the stop codon. :param src: an :class:`.Align` or :class:`.Container` containing coding sequences. The alphabet must be :py:obj:`.alphabets.codons`. :param code: genetic code identifier (see :ref:`here <genetic-codes>`). Required to be an integer among the valid values. The default value is the standard genetic code. :return: An iterator over the ``(sample, position)`` tuples corresponding to each stop codon found in the alignment (see above). """ # argument check if not isinstance(src, (_interface.Align, _interface.Container)): raise TypeError('expect an Align or Container instance') if src._alphabet.type != 'codons': raise ValueError('alignment alphabet must be codons') ns = src.ns if code not in _codes: raise ValueError('unknown genetic code: {0}'.format(code)) code = _codes[code] if isinstance(src, _interface.Align): ls = src.ls for i in range(ns): if isinstance(src, _interface.Container): ls = src.ls(i) for j in range(ls): if code.translate(src._obj.get_sample(i, j)) == 20: yield (i, j)
[docs]def has_stop(src, code=1): """ Test if there is at least one stop codon. Return ``True`` if the sequences contain at least one codon stop at any position in any sequence, and ``False`` otherwise. :param src: an :class:`.Align` or :class:`.Container` containing aligned coding sequences. :param code: genetic code identifier (see :ref:`here <genetic-codes>`). Required to be an integer among the valid values. The default value is the standard genetic code. :return: A boolean. """ for stop in iter_stops(src, code=code): break else: return False return True