Source code for egglib.io._fasta

"""
    Copyright 2015-2023 Stephane De Mita, Mathieu Siol

    This file is part of EggLib.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
"""

from .. import eggwrapper as _eggwrapper
from .. import _interface
from .. import alphabets

def from_fasta(fname, alphabet, labels=False, label_marker='@', label_separator=',', cls=None):
    """
    Import sequences from a fasta file.

    Create a new instance of either :class:`.Align` or
    :class:`.Container` from data read from a fasta-formatted file. To
    process data from a fasta-formatted string, use
    :func:`.io.from_fasta_string`.

    :param fname: name of a fasta-formatted sequence file.

    :param alphabet: an :class:`.Alphabet` instance defining the type
        of data. Only character alphabets are allowed (such as
        :py:obj:`.alphabets.DNA`, or :py:obj:`.alphabets.protein`).

    :param labels: import group labels. If so, they are not actually
        required to be present for each (or any) sequence. By default,
        tags in sequence names are considered to be part of the name
        and not as labels.

    :param label_marker: this option allows changing the character
        indicating the start of the labels.

    :param label_separator: this option allows changing the character
        used to separate labels.

    :param cls: type that should be generated. Possible values are:
        :class:`!Align` (then, data must be aligned),
        :class:`!Container`, or ``None``. In the latter case, an
        :class:`!Align` is returned if data are found to be aligned or
        if the data set is empty, and otherwise a :class:`!Container`
        is returned.

    :return: A new :class:`.Container` or :class:`.Align` instance
        depending on the value of the *cls* option.

    :raises ValueError: if *alphabet* is not a character alphabet, if
        *cls* is not one of the accepted values, or if *cls* is
        :class:`!Align` while the sequences have different lengths.
    """
    if not isinstance(alphabet._obj, _eggwrapper.CharAlphabet):
        raise ValueError('invalid alphabet for parsing fasta data: {0}'.format(alphabet.name))
    fasta_parser = _eggwrapper.FastaParser()
    # opened outside the try block: if opening fails there is nothing to close
    fasta_parser.open_file(fname, alphabet._obj)
    try:
        return _from_fasta(fasta_parser, alphabet, labels, label_marker, label_separator, cls)
    finally:
        # release the input file deterministically, including when
        # parsing or the `cls` checks raise
        fasta_parser.close()
def from_fasta_string(string, alphabet, labels=False, label_marker='@', label_separator=',', cls=None):
    """
    Import sequences from a fasta-formatted string.

    Works like :func:`.io.from_fasta`, except that the first argument
    is the fasta-formatted data itself instead of a file name. All
    other arguments, the return value and the raised errors are the
    same.

    :param string: a fasta-formatted string.
    :param alphabet: an :class:`.Alphabet` instance; only character
        alphabets are accepted.
    :param labels: import group labels.
    :param label_marker: character indicating the start of the labels.
    :param label_separator: character separating labels.
    :param cls: :class:`!Align`, :class:`!Container`, or ``None``.

    :return: A new :class:`.Container` or :class:`.Align` instance
        depending on the value of the *cls* option.
    """
    low_level_alphabet = alphabet._obj
    if not isinstance(low_level_alphabet, _eggwrapper.CharAlphabet):
        raise ValueError('invalid alphabet for parsing fasta data: {0}'.format(alphabet.name))

    parser = _eggwrapper.FastaParser()
    parser.set_string(string, low_level_alphabet)
    return _from_fasta(parser, alphabet, labels, label_marker, label_separator, cls)
def _from_fasta(fasta_parser, alphabet, labels, label_marker, label_separator, cls):
    """
    Read everything from a ready-to-use fasta parser and wrap the data.

    Shared implementation behind :func:`.io.from_fasta` and
    :func:`.io.from_fasta_string`.

    :param fasta_parser: a ``FastaParser`` on which a file has been
        opened or a string has been set.
    :param alphabet: the :class:`.Alphabet` passed on to the created
        object.
    :param labels: forwarded to the parser's ``read_all`` method.
    :param label_marker: forwarded to the parser's ``read_all`` method.
    :param label_separator: forwarded to the parser's ``read_all``
        method.
    :param cls: :class:`.Align`, :class:`.Container`, or ``None`` to
        pick :class:`.Align` when all sequences have the same length
        (or when there is no sequence) and :class:`.Container`
        otherwise.

    :return: A new :class:`.Align` or :class:`.Container` instance.

    :raises ValueError: if *cls* is not an accepted value, or if *cls*
        is :class:`.Align` and the sequence lengths differ.
    """
    # data are always read first, so the parser is consumed even when
    # `cls` turns out to be invalid
    holder = _eggwrapper.DataHolder(False)
    fasta_parser.read_all(labels, holder, label_marker, label_separator)

    if cls is _interface.Container:
        return _interface.Container._create_from_data_holder(holder, alphabet)
    if cls is not None and cls is not _interface.Align:
        raise ValueError('invalid value provided for `cls`')

    # from here on, cls is either Align or None (automatic detection)
    lengths = {holder.get_nsit_sample(i) for i in range(holder.get_nsam())}
    if len(lengths) <= 1:
        # no sequence at all, or all sequences have the same length
        return _interface.Align._create_from_data_holder(holder, alphabet)
    if cls is _interface.Align:
        raise ValueError('cannot create `Align`: lengths of sequences do not match')
    return _interface.Container._create_from_data_holder(holder, alphabet)
class fasta_iter(object):
    """
    Iterative sequence-by-sequence fasta parser.

    :param fname: name of a fasta-formatted file.

    :param alphabet: an :class:`.Alphabet` instance defining the type
        of data. Only character alphabets are allowed (such as
        :py:obj:`.alphabets.DNA` and :py:obj:`.alphabets.protein`).

    :param labels: import group labels from sequence names (by default,
        they are considered as part of the name).

    :raises ValueError: if *alphabet* is not a character alphabet.

    This type can be used in an iteration as shown below:

    .. code-block:: python

       >>> for item in egglib.io.fasta_iter(fname, egglib.alphabets.DNA):
       ...     ...

    The ``with`` statement is also supported, which ensures that the
    input file is properly closed whenever the ``with`` statement
    completes:

    .. code-block:: python

       >>> with egglib.io.fasta_iter(fname, egglib.alphabets.DNA) as f:
       ...     for item in f:
       ...         ...

    Each iteration yields a :class:`.SampleView` instance (which is
    valid only during the iteration round, see the warning below).

    .. warning::

        The aim of this iterator is to iterate over large fasta files
        without actually storing all data in memory at the same time.
        The :class:`.SampleView` instance provided at each iteration is
        a proxy to a local :class:`.Container` instance that is
        recycled at each iteration step. The iteration variable should
        be used immediately and never stored as is. If one wants to
        store sequence data, they should copy them immediately
        (typically using the :meth:`~.Container.add_sample` method of a
        separate :class:`.Container` instance).
    """

    def __init__(self, fname, alphabet, labels=False):
        # the parser is created before anything that can fail, so that
        # the finalizer finds it even if the checks below raise
        self._parser = _eggwrapper.FastaParser()
        if not isinstance(alphabet._obj, _eggwrapper.CharAlphabet):
            raise ValueError('invalid alphabet for parsing fasta data: {0}'.format(alphabet.name))
        self._parser.open_file(fname, alphabet._obj)
        # single container recycled at each iteration step
        self._cont = _interface.Container(alphabet)
        self._labels = labels

    def __enter__(self):
        """Return the iterator itself as the ``with`` target."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the parser; exceptions from the block are not suppressed."""
        self._parser.close()

    def __del__(self):
        """Close the parser when the iterator is discarded."""
        # the attribute is missing if the instance was never initialized
        # (e.g. the parser itself could not be created)
        parser = getattr(self, '_parser', None)
        if parser is not None:
            parser.close()

    def __iter__(self):
        """Return the iterator itself."""
        return self

    def __next__(self):
        """
        Read the next sequence and return it as a :class:`.SampleView`.

        The returned view points to index 0 of the internal container,
        which is reset at each call.

        :raises StopIteration: when the parser reports it is no longer
            in a good state.
        """
        if not self._parser.good():
            raise StopIteration
        self._cont.reset()
        self._parser.read_sequence(self._labels, self._cont._obj)
        # the low-level object was filled directly: refresh the sample
        # count held by the wrapping container
        self._cont._ns = self._cont._obj.get_nsam()
        return _interface.SampleView(self._cont, 0)