Source code for dnachisel.biotools.genbank_operations

"""Generic methods for reading/modifying Genbank/Biopython records"""

from copy import deepcopy

import numpy as np
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import DNAAlphabet
from Bio import SeqIO

try:
    from snapgene_reader import snapgene_file_to_seqrecord
except ImportError:

    def snapgene_file_to_seqrecord(*a, **k):
        """Please install the snapgene_reader library to use this function."""
        raise ImportError(
            "Please install snapgene_reader to import Snapgene .dna files"
        )


[docs]def load_record(filepath, linear=True, name="unnamed", file_format="auto"): """Load a FASTA/Genbank/Snapgene record. Note that reading Snapgene records requires the library snapgene_reader installed. """ if file_format != "auto": record = SeqIO.read(filepath, file_format) elif filepath.lower().endswith(("gb", "gbk")): record = SeqIO.read(filepath, "genbank") elif filepath.lower().endswith(("fa", "fasta")): record = SeqIO.read(filepath, "fasta") elif filepath.lower().endswith(".dna"): record = snapgene_file_to_seqrecord(filepath) else: raise ValueError("Unknown format for file: %s" % filepath) record.linear = linear if name != "unnamed": record.id = name record.name = name.replace(" ", "_")[:20] return record
[docs]def annotate_record( seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers ): """Add a feature to a Biopython SeqRecord. Parameters ---------- seqrecord The biopython seqrecord to be annotated. location Either (start, end) or (start, end, strand). (strand defaults to +1) feature_type The type associated with the feature margin Number of extra bases added on each side of the given location. qualifiers Dictionnary that will be the Biopython feature's `qualifiers` attribute. """ if location == "full": location = (margin, len(seqrecord) - margin) strand = location[2] if len(location) == 3 else 1 seqrecord.features.append( SeqFeature( FeatureLocation(location[0], location[1], strand), qualifiers=qualifiers, type=feature_type, ) )
[docs]def annotate_differences( record, reference, feature_type="misc_feature", prefix="#" ): """Annotate differences between two records in a new record. Returns a version of SeqRecord ``record`` where differences with the references are annotated as new features. Parameters ---------- record The SeqRecord to be compared to the reference reference The reference SeqRecord. Must be the same size as ``reference`` feature_type The type of the features added to mark differences. prefix Each new feature will be labeled "po" where p is the prefix and o the original sequence at the feature's location. For instance "#A" or "#TT". """ seq1 = str(record.seq) seq2 = str(reference.seq) indices_diff = ( np.fromstring(seq1, dtype="uint8") - np.fromstring(seq2, dtype="uint8") ).nonzero()[0] indices_diff = [int(e) for e in indices_diff] locations = [[indices_diff[0], indices_diff[0]]] for ind in indices_diff[1:]: if ind - locations[-1][-1] == 1: locations[-1][-1] = ind else: locations.append([ind, ind]) new_record = deepcopy(record) for (start, end) in locations: annotate_record( new_record, location=(start, end + 1), feature_type=feature_type, label=prefix + seq2[start : end + 1], ) return new_record
[docs]def annotate_pattern_occurrences( record, pattern, feature_type="misc_feature", prefix="!" ): """Return a new record annotated w. all occurences of pattern in sequence. Parameters ----------- record A Biopython record pattern A DnaChisel SequencePattern object (such as DnaPAttern) feature_type Type of the annotations in the returned record """ new_record = deepcopy(record) label = prefix + str(pattern) for location in pattern.find_matches(str(record.seq)): annotate_record( new_record, location=(location.start, location.end), feature_type=feature_type, label=label, ) return new_record
[docs]def change_biopython_record_sequence(record, new_seq): """Return a version of the record with the sequence set to new_seq""" new_record = deepcopy(record) new_record.seq = Seq(new_seq, alphabet=DNAAlphabet()) return new_record
[docs]def sequence_to_biopython_record( sequence, id="<unknown id>", name="<unknown name>", features=() ): """Return a SeqRecord of the sequence, ready to be Genbanked.""" return SeqRecord( Seq(sequence, alphabet=DNAAlphabet()), id=id, name=name, features=list(features), )
[docs]def find_specification_label_in_feature(feature): """Analyse a Biopython feature to find a DnaChisel Specification in it. The specification should start with either "@" or "~", in the feature's field "label" or "note". """ for labelfield in ["label", "note"]: if labelfield not in feature.qualifiers: continue potential_label = feature.qualifiers.get(labelfield, "_") if isinstance(potential_label, list): potential_label = potential_label[0] if (potential_label != "") and (potential_label[0] in "@~"): return potential_label return None
[docs]def write_record( record, target, file_format="genbank", remove_locationless_features=True, max_name_length=20, ): """Write a record as genbank, fasta, etc. via Biopython, with fixes. Parameters ---------- record A biopython record target Path to a file or filelike object. file_format Format, either Genbank or fasta remove_locationless_features If True, will remove all features whose location is None, to avoid a Biopython bug max_name_length The record's name will be truncated if longer than this (also here to avoid a biopython bug). """ record = deepcopy(record) if remove_locationless_features: record.features = [ f for f in record.features if f.location is not None ] record.name = record.name[:max_name_length] if str(record.seq.alphabet.__class__.__name__) != "DNAAlphabet": record.seq.alphabet = DNAAlphabet() if hasattr(target, "open"): target = target.open("w") SeqIO.write(record, target, file_format)