Source code for dnachisel.reports.constraints_reports

"""Misc. plotting and reporting methods, some of which are really arbitrary.


Here is a typical example of use:

>>> import dnachisel.reports.constraint_reports as cr
>>> dataframe = cr.constraints_breaches_dataframe(constraints, sequences)
>>> dataframe.to_excel("output_breaches.xlsx")
>>> records = cr.records_from_breaches_dataframe(dataframe, sequences)
>>> cr.breaches_records_to_pdf(records, 'output_breaches_plots.pdf')
"""

from copy import deepcopy
import re
from io import BytesIO

import proglog

try:
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    from dna_features_viewer import BiopythonTranslator

    DFV_AVAILABLE = True
except ImportError:
    BiopythonTranslator = object
    DFV_AVAILABLE = False

try:
    import pandas

    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False

from ..DnaOptimizationProblem import DnaOptimizationProblem
from ..biotools import sequence_to_biopython_record, annotate_record
from ..builtin_specifications import (
    EnforceGCContent,
    AvoidPattern,
    AvoidHairpins,
)
from ..Location import Location
from .colors_cycle import colors_cycle


__all__ = [
    "records_from_breaches_dataframe",
    "plot_breaches_record",
    "breaches_records_to_pdf",
    "constraints_breaches_dataframe",
]


def _sequences_to_new_records(sequences):
    """Turn acceptable sequences input into a records list.

    Acceptable formats are
    - ('name', 'sequence')
    - {'name': 'sequence'}
    - [records] (will be deepcopied)
    """
    if isinstance(sequences, dict):
        sequences = list(sequences.items())
    records = []
    for seq in sequences:
        if hasattr(seq, "id"):
            records.append(deepcopy(seq))
        else:
            name, seq = seq
            records.append(
                sequence_to_biopython_record(seq, id=name, name=name)
            )
    return records


def _parse_location(location_string):
    """Parses locations like 235-350(+)"""
    location_regex = r"(\d+)-(\d+)(\(+\)|\(-\)|)"
    match = re.match(location_regex, location_string.strip())
    start, end, strand = match.groups()
    return int(start), int(end), -1 if strand == "(-)" else 1


def _install_extras_message(libname):
    return (
        "Could not load %s (is it installed ?). You can install it separately "
        " with:  pip install %s\n\n"
        "Install all dependencies for generating DNA Chisel reports with:"
        "\n\npip install dnachisel[reports]"
        % (libname, libname.lower().replace(" ", "_"))
    )


def _breaches(constraint, sequence):
    problem = DnaOptimizationProblem(sequence, mutation_space={})
    new_constraint = constraint.initialized_on_problem(problem, role=None)
    evaluation = new_constraint.evaluate(problem)
    locations = Location.merge_overlapping_locations(evaluation.locations)
    return ", ".join(map(str, locations))


[docs]def constraints_breaches_dataframe( constraints, sequences, display_constraints_locations=False, ): """Return a dataframe summarizing constraints breaches in the sequences. Output dataframe schema (cst = constraint): ===== ======== ====================== / Cst1 Cst2 ===== ======== ====================== Seq1 10-50(+) 100-200(+), 300-350(+) seq2 Seq3 2-10(+) Seq4 500-1000(-) ===== ======== ====================== Parameters ---------- constraints A list of DNA Chisel Specifications. sequences Either a list [("name", "sequence")...] or a dict {"name": "sequence"} or a list of biopython records whole id is the sequence name. Examples -------- >>> import dnachisel as dc >>> from dnachisel.utils import constraints_breaches_dataframe >>> sequences = [ >>> ("SEQ 1", "ATTGTGCATGTGACAC"), >>> ("SEQ 2", "ACATGTGTTGTGACAC"), >>> ("SEQ 3", "TTGTGCACACATGTGA"), >>> ] >>> constraints = [ >>> dc.AvoidPattern('ATTG'), >>> dc.EnforceGCContent(0.4, 0.6), >>> dc.UniquifyAllKmers(5) >>> ] >>> dataframe = constraints_breaches_dataframe(constraints, sequences) >>> dataframe.to_excel('summary_spreadsheet.xlsx') """ if not PANDAS_AVAILABLE: raise ImportError(_install_extras_message("Pandas")) if isinstance(sequences, dict): sequences = list(sequences.items()) if hasattr(sequences[0], "id"): sequences = [(s.id, s) for s in sequences] dataframe_records = [ dict( [("sequence", name)] + [ ( constraint.label( use_short_form=True, with_location=display_constraints_locations, ), _breaches(constraint, sequence), ) for constraint in constraints ] ) for (name, sequence) in sequences ] return pandas.DataFrame.from_records(dataframe_records, index="sequence")
[docs]def records_from_breaches_dataframe(dataframe, sequences): """Generate records with annotations indicating constraints breaches. Parameters ---------- dataframe A breaches dataframe returned by ``constraints_breaches_dataframe`` sequences Either a list [("name", "sequence")...] or a dict {"name": "sequence"} or a list of biopython records whole id is the sequence name. """ records = _sequences_to_new_records(sequences) for record in records: record.features = [ f for f in record.features if not f.qualifiers.get("is_a_breach", False) ] colors_cycle_iterator = colors_cycle() columns_colors = { c: next(colors_cycle_iterator) for c in dataframe.columns } for rec, (i, row) in zip(records, dataframe.iterrows()): for column in dataframe.columns: locations = row[column] if not locations: continue for location in locations.split(","): annotate_record( rec, location=_parse_location(location), label=column, color=columns_colors[column], ApEinfo_fwdcolor=columns_colors[column], ApEinfo_revcolor=columns_colors[column], is_a_breach=True, ) return records
class Translator(BiopythonTranslator): """A Biopython record translator for DNA Features Viewer. This translator produces label-free plots. """ @staticmethod def compute_feature_box_linewidth(f): return 1 if f.qualifiers.get("is_a_breach", False) else 0 @staticmethod def compute_feature_fontdict(f): return { "fontsize": 12 if f.qualifiers.get("is_a_breach", False) else 9 } def compute_feature_label(self, f): label = BiopythonTranslator.compute_feature_label(self, f) if not f.qualifiers.get("is_a_breach", False): if len(label) > 20: label = label[:19] + "…" return label def compute_feature_color(self, f): if f.qualifiers.get("is_a_breach", False): return f.qualifiers.get("color", "red") else: return "#ffffff" def plot_breaches_record(record, ax=None, figure_width=10): translator = Translator() graphic_record = translator.translate_record(record) ax, _ = graphic_record.plot(ax=ax, figure_width=figure_width) ax.set_title(record.id, loc="left", fontweight="bold") ax.set_ylim(top=ax.get_ylim()[1] + 1) return ax
[docs]def breaches_records_to_pdf( breaches_records, pdf_path=None, figure_width=10, logger="bar" ): """Plots figures of the breaches annotated in the records into a PDF file. Parameters ---------- breaches_records A least of records annotated with breaches, as returned by the pdf_path Either the path to a PDF, or a file handle (open in wb mode) or None for this method to return binary PDF data. logger Either "bar" for a progress bar, None for no logging, or any Proglog logger. The bar name is "sequence". """ pdf_io = BytesIO() if pdf_path is None else pdf_path logger = proglog.default_bar_logger(logger, min_time_interval=0.2) with PdfPages(pdf_io) as pdf: for record in logger.iter_bar(sequence=breaches_records): ax = plot_breaches_record(record, figure_width=figure_width) pdf.savefig(ax.figure, bbox_inches="tight") plt.close(ax.figure) if pdf_path is None: return pdf_io.getvalue()
EXAMPLE_MANUFACTURING_CONSTRAINTS = [ AvoidPattern("BsaI_site"), AvoidPattern("BsmBI_site"), AvoidPattern("BbsI_site"), AvoidPattern("SapI_site"), AvoidPattern("9xA"), AvoidPattern("9xT"), AvoidPattern("6xG"), AvoidPattern("6xC"), AvoidPattern("5x3mer"), AvoidPattern("9x2mer"), AvoidHairpins(stem_size=20, hairpin_window=200), EnforceGCContent(mini=0.3, maxi=0.7, window=100), ]