# Source code for dnachisel.reports.optimization_reports

"""Methods to generate optimization reports."""

import os
import textwrap
from collections import OrderedDict
import hashlib

from Bio import SeqIO
import pandas
import flametree
import numpy as np

from ..biotools import (
    sequence_to_biopython_record,
    find_specification_label_in_feature,
)
from ..version import __version__
from .SpecAnnotationsTranslator import SpecAnnotationsTranslator
from .tools import install_extras_message
from ..Location import Location

# Optional dependency: ``sequenticon`` is only needed by
# ``write_optimization_report``, which checks SEQUENTICON_AVAILABLE first.
try:
    from sequenticon import sequenticon

    SEQUENTICON_AVAILABLE = True
except Exception:
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit still propagate. Deliberately broader than ImportError: any
    # failure while importing the optional package just disables the feature.
    SEQUENTICON_AVAILABLE = False

# Optional dependency: Matplotlib. The flag is True only when every one of
# the three imports below succeeded.
try:
    import matplotlib.cm as cm
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
except ImportError:
    MATPLOTLIB_AVAILABLE = False
else:
    MATPLOTLIB_AVAILABLE = True


# Optional dependency: ``geneblocks`` provides DiffBlocks, used by
# ``plot_optimization_changes`` to draw the before/after differences.
try:
    from geneblocks import DiffBlocks

    GENEBLOCKS_AVAILABLE = True
except Exception:
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit still propagate. Deliberately broader than ImportError: any
    # failure while importing the optional package just disables the feature.
    GENEBLOCKS_AVAILABLE = False

# Optional dependency: ``pdf_reports`` renders the PDF optimization report.
try:
    from pdf_reports import ReportWriter
    import pdf_reports.tools as pdf_tools

    PDF_REPORTS_AVAILABLE = True
except Exception:
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit still propagate. Deliberately broader than ImportError: any
    # failure while importing the optional package just disables the feature.

    def ReportWriter(*a, **kw):
        """Placeholder that accepts any arguments and returns None.

        It lets the module-level ``ReportWriter(...)`` call succeed when
        pdf_reports cannot be imported.
        """
        return None

    PDF_REPORTS_AVAILABLE = False

# Directory containing this module, and the "assets" folder next to it
# (logo, stylesheet and report template are looked up there).
THIS_DIR = os.path.split(os.path.realpath(__file__))[0]
ASSETS_DIR = os.path.join(THIS_DIR, "assets")

# Font settings used for the titles of the plots. The module-level name
# ``fontdict`` is bound to the very same dict object.
TITLE_FONTDICT = fontdict = {"size": 14, "weight": "bold"}

# Message template; ``%s`` is meant to receive the name of the missing package.
install_reports_extra_message = (
    "Could not load %s (is it installed ?). "
    "You can install all dependencies for generating reports in DNA Chisel "
    "with this command:\n\n pip install dnachisel[reports]"
)

# Shared writer used to render the PDF reports. When pdf_reports could not be
# imported, ``ReportWriter`` is a placeholder and this is None.
report_writer = ReportWriter(
    dnachisel_logo_url=os.path.join(ASSETS_DIR, "logo.png"),
    version=__version__,
    default_stylesheets=(os.path.join(ASSETS_DIR, "style.css"),),
)


def write_no_solution_report(
    target, problem, error, file_content=None, file_path=None
):
    """Write a report on incompatibility found in the problem's constraints.

    The report comprises a PDF of plots of the sequence (global constraints,
    local constraints around the problem) and an annotated genbank.

    Parameters
    ----------
    target
      Either a path to a folder, or a path to a zip archive, or "@memory" to
      return raw data of a zip archive containing the report. A non-string
      target is used directly as the root of the report's file tree.

    problem
      A DnaOptimizationProblem

    error
      A NoSolutionError (carries a message and a location)

    file_content
      Bytes of the file from which the problem was created. Only used when
      ``file_path`` is provided; if None, the content is read from
      ``file_path``.

    file_path
      Path to the file from which the problem was created. When provided, a
      copy of the file (prefixed with the first 8 characters of the MD5 digest
      of its content) is added to the report.

    Returns
    -------
    The result of closing the report's file tree when ``target`` is a string
    (the zip data if target == '@memory'), otherwise None.

    Raises
    ------
    ImportError
      If Matplotlib is not available.

    The provided ``error`` itself is re-raised if it carries no location.
    """
    if not MATPLOTLIB_AVAILABLE:
        raise ImportError(install_extras_message("Matplotlib"))
    if isinstance(target, str):
        root = flametree.file_tree(target, replace=True)
    else:
        root = target

    # TRANSFER THE ORIGINAL FILE
    if file_path is not None:
        if file_content is None:
            with open(file_path, "rb") as f:
                file_content = f.read()
        basename = os.path.basename(file_path)
        file_hash = hashlib.md5(file_content).hexdigest()[:8]
        root._file("_".join([file_hash, basename])).write(file_content)

    # A single translator serves both figures.
    translator = SpecAnnotationsTranslator()
    with PdfPages(root._file("plots.pdf").open("wb")) as pdf_io:

        # Without a location there is nothing to highlight. Re-raise the
        # error BEFORE any figure is created, so no figure is left open.
        if error.location is None:
            raise error

        # PLOT GLOBAL LOCATION OF ERROR
        record = problem.to_record()
        graphical_record = translator.translate_record(record)
        ax, _ = graphical_record.plot(figure_width=min(20, 0.3 * len(record)))
        if len(record) < 60:
            # Short sequences: also display the nucleotides.
            graphical_record.plot_sequence(ax)
        start, end, _ = error.location.to_tuple()
        ax.fill_between(
            [start, end], -10, 10, zorder=-1000, facecolor="#ffcccc"
        )
        title = "\n".join(
            textwrap.wrap(
                "No solution found in zone [%d, %d]:%s"
                % (start, end, str(error)),
                width=120,
            )
        )
        ax.set_title(title, fontdict=TITLE_FONTDICT)
        # NOTE: no ``alpha`` keyword here; it is not a documented ``savefig``
        # parameter.
        pdf_io.savefig(ax.figure, bbox_inches="tight")
        plt.close(ax.figure)

        # CREATE AND SAVE THE LOCAL CONSTRAINTS BREACHES RECORD
        record = error.problem.to_record(
            with_original_spec_features=False,
            with_constraints=False,
            with_objectives=False,
        )
        # Focus window: a few nucleotides around the error's location,
        # clipped to the bounds of the sequence.
        start = max(0, error.location.start - 5)
        end = min(len(record), error.location.end + 4)
        focus_location = Location(start, end)

        def is_in_focus(location):
            return location.overlap_region(focus_location) is not None

        evals = error.problem.constraints_evaluations()
        passing = evals.filter("passing")
        record.features += passing.success_and_failures_as_features()
        failing = evals.filter("failing")
        record.features += failing.locations_as_features(
            label_prefix="BREACH", locations_filter=is_in_focus
        )
        SeqIO.write(
            record,
            root._file("local_constraints_breaches.gb").open("w"),
            "genbank",
        )

        # CREATE A FIGURE OF THE LOCAL CONSTRAINTS BREACHES AS A NEW PDF PAGE
        graphical_record = translator.translate_record(record)
        graphical_record = graphical_record.crop((start, end))
        figure_width = min(20, 0.3 * (end - start))
        ax, _ = graphical_record.plot(figure_width=figure_width)
        graphical_record.plot_sequence(ax)
        ax.set_title(
            "Local constraints breaches in [%d, %d]" % (start, end)
            + "     (green = passing constraints)",
            fontdict=TITLE_FONTDICT,
        )
        # Leave some room between the annotations and the title.
        ax.set_ylim(top=ax.get_ylim()[1] + 1)
        pdf_io.savefig(ax.figure, bbox_inches="tight")
        plt.close(ax.figure)

    root._file("logs.txt").write(problem.logger.dump_logs())

    # returns zip data if target == '@memory'
    if isinstance(target, str):
        return root._close()
def _location_edits_stats(edits, location):
    """Return ``(start, end, edits_sum, edits_percent)`` for a location.

    ``edits`` is the array returned by ``problem.sequence_edits_as_array()``;
    ``edits_percent`` is the share of edited positions in ``[start, end)``,
    rounded to 2 decimals.
    """
    start, end, _ = location.to_tuple()
    edits_sum = edits[start:end].sum()
    edits_percent = np.round(100 * edits_sum / (end - start), 2)
    return start, end, edits_sum, edits_percent


def _records_to_sorted_dataframe(records):
    """Build a DataFrame from the records, sorted by "start" if non-empty."""
    dataframe = pandas.DataFrame.from_records(list(records))
    # An empty dataframe has no "start" column to sort on.
    if len(dataframe):
        dataframe = dataframe.sort_values(by="start")
    return dataframe


def constraints_before_after_dataframe(problem, constraints_evaluations=None):
    """Return a dataframe comparing each constraint before/after optimization.

    Parameters
    ----------
    problem
      The problem; it must provide ``constraints_before``,
      ``sequence_edits_as_array()`` and ``constraints_evaluations()``.

    constraints_evaluations
      Precomputed "after" evaluations, paired in order with
      ``problem.constraints_before``. Computed from the problem if None.

    Returns
    -------
    A pandas DataFrame with columns "constraint", "start", "end", "before",
    "after" (each "PASS" or "FAIL"), "edits" and "edits (%)", sorted by
    "start". The dataframe is empty if there are no constraints.
    """
    if constraints_evaluations is None:
        constraints_evaluations = problem.constraints_evaluations()
    edits = problem.sequence_edits_as_array()

    def constraint_record(evaluation_before, evaluation_after):
        constraint = evaluation_before.specification
        start, end, edits_sum, edits_percent = _location_edits_stats(
            edits, constraint.location
        )
        label = constraint.label(use_short_form=True, with_location=False)
        return OrderedDict(
            [
                ("constraint", label),
                ("start", start),
                ("end", end),
                ("before", "PASS" if evaluation_before.passes else "FAIL"),
                ("after", "PASS" if evaluation_after.passes else "FAIL"),
                ("edits", edits_sum),
                ("edits (%)", edits_percent),
            ]
        )

    return _records_to_sorted_dataframe(
        constraint_record(before, after)
        for (before, after) in zip(
            problem.constraints_before, constraints_evaluations
        )
    )


def objectives_before_after_dataframe(problem, objectives_evaluations=None):
    """Return a dataframe comparing each objective before/after optimization.

    Parameters
    ----------
    problem
      The problem; it must provide ``objectives_before``,
      ``sequence_edits_as_array()`` and ``objectives_evaluations()``.

    objectives_evaluations
      Precomputed "after" evaluations, paired in order with
      ``problem.objectives_before``. Computed from the problem if None.

    Returns
    -------
    A pandas DataFrame with columns "objective", "boost", "start", "end",
    "before", "after" (the evaluations' ``score_to_formatted_string``
    attribute), "edits" and "edits (%)", sorted by "start". The dataframe is
    empty if there are no objectives.
    """
    if objectives_evaluations is None:
        objectives_evaluations = problem.objectives_evaluations()
    edits = problem.sequence_edits_as_array()

    def objective_record(evaluation_before, evaluation_after):
        objective = evaluation_before.specification
        start, end, edits_sum, edits_percent = _location_edits_stats(
            edits, objective.location
        )
        label = objective.label(use_short_form=True, with_location=False)
        return OrderedDict(
            [
                ("objective", label),
                ("boost", objective.boost),
                ("start", start),
                ("end", end),
                ("before", evaluation_before.score_to_formatted_string),
                ("after", evaluation_after.score_to_formatted_string),
                ("edits", edits_sum),
                ("edits (%)", edits_percent),
            ]
        )

    return _records_to_sorted_dataframe(
        objective_record(before, after)
        for (before, after) in zip(
            problem.objectives_before, objectives_evaluations
        )
    )


def plot_optimization_changes(problem):
    """Plot the differences between the sequence before and after.

    Parameters
    ----------
    problem
      The problem; ``problem.sequence_before`` is compared with
      ``problem.to_record()``.

    Returns
    -------
    The second element returned by the Geneblocks ``DiffBlocks.plot`` call
    (presumably the Matplotlib axes of the differences plot; callers in this
    module read its ``.figure``).

    Raises
    ------
    ImportError
      If Geneblocks is not available.
    """
    if not GENEBLOCKS_AVAILABLE:
        raise ImportError(
            "Install Geneblocks to use plot_optimization_changes()"
        )
    sequence_before = sequence_to_biopython_record(problem.sequence_before)
    sequence_after = problem.to_record()
    diffs = DiffBlocks.from_sequences(sequence_before, sequence_after)
    # Merge neighbouring diff blocks, with gaps scaled to the sequence length.
    span = max(2, len(sequence_after) / 20)
    diffs = diffs.merged(
        blocks_per_span=(3, span), replace_gap=span / 2, change_gap=span / 2
    )
    _, diffs_ax = diffs.plot(
        translator_class=SpecAnnotationsTranslator,
        annotate_inline=True,
        figure_width=15,
    )
    return diffs_ax
def write_optimization_report(
    target,
    problem,
    project_name="unnammed",
    plot_figure=True,
    constraints_evaluations=None,
    objectives_evaluations=None,
    figure_width=20,
    max_features_in_plots=300,
    file_path=None,
    file_content=None,
):
    """Write an optimization report with a PDF summary, plots, and genbanks.

    Files written, in order: an optional copy of the original file,
    "constraints_before_and_after.csv", "objectives_before_and_after.csv",
    "Report.pdf", "final_sequence_with_edits.gb" and "final_sequence.gb".

    Parameters
    ----------
    target
      Path to a directory or zip file, or "@memory" for returning raw data of
      a zip file created in-memory. A non-string target is used directly as
      the root of the report's file tree.

    problem
      A DnaOptimizationProblem to be solved and optimized

    project_name
      Name of the project that will appear on the PDF report

    plot_figure
      If False, the figure of the sequence differences is not generated (it
      is also skipped when Geneblocks is not available).

    constraints_evaluations
      Precomputed constraints evaluations. If None provided, they will be
      computed again from the problem.

    objectives_evaluations
      Precomputed objectives evaluations. If None provided, they will be
      computed again from the problem.

    figure_width
      Width of the report's figure, in inches. NOTE: this parameter is not
      read anywhere in the body of this function.

    max_features_in_plots
      Limit to the number of features to plot. NOTE: this parameter is not
      read anywhere in the body of this function.

    file_path
      Path to the file from which the problem was created. When provided, a
      copy of the file, prefixed with a short MD5-based hash of its content,
      is added to the report.

    file_content
      Bytes of the original file. Only used when ``file_path`` is provided;
      if None, the content is read from ``file_path``.

    Returns
    -------
    The result of closing the report's file tree when ``target`` is a string,
    otherwise None.

    Raises
    ------
    ImportError
      If PDF Reports or Sequenticon is not available.
    """
    # Both optional packages are required; PDF Reports is checked first.
    for is_available, package_name in (
        (PDF_REPORTS_AVAILABLE, "PDF Reports"),
        (SEQUENTICON_AVAILABLE, "Sequenticon"),
    ):
        if not is_available:
            raise ImportError(install_extras_message(package_name))

    if constraints_evaluations is None:
        constraints_evaluations = problem.constraints_evaluations()
    if objectives_evaluations is None:
        objectives_evaluations = problem.objectives_evaluations()

    target_is_path = isinstance(target, str)
    root = (
        flametree.file_tree(target, replace=True) if target_is_path else target
    )

    # TRANSFER THE ORIGINAL FILE
    file_hash = None
    if file_path is not None:
        if file_content is None:
            with open(file_path, "rb") as original_file:
                file_content = original_file.read()
        file_hash = hashlib.md5(file_content).hexdigest()[:8]
        copy_name = "%s_%s" % (file_hash, os.path.basename(file_path))
        root._file(copy_name).write(file_content)

    # CREATE FIGURES AND GENBANKS
    diffs_figure_data = None
    if GENEBLOCKS_AVAILABLE and plot_figure:
        diffs_figure = plot_optimization_changes(problem).figure
        diffs_figure_data = pdf_tools.figure_data(diffs_figure, fmt="svg")
        plt.close(diffs_figure)

    # GENERATE AND SAVE THE CONSTRAINTS SUMMARY
    constraints_before_after = constraints_before_after_dataframe(
        problem=problem, constraints_evaluations=constraints_evaluations
    )
    constraints_csv = root._file("constraints_before_and_after.csv")
    constraints_before_after.to_csv(constraints_csv.open("w"), index=False)

    # GENERATE AND SAVE THE OBJECTIVES SUMMARY
    objectives_before_after = objectives_before_after_dataframe(
        problem=problem, objectives_evaluations=objectives_evaluations
    )
    objectives_csv = root._file("objectives_before_and_after.csv")
    objectives_before_after.to_csv(objectives_csv.open("w"), index=False)

    # CREATE PDF REPORT
    total_edits = problem.sequence_edits_as_array().sum()
    sequenticons = {
        "before": sequenticon(
            problem.sequence_before, output_format="html_image", size=24
        ),
        "after": sequenticon(
            problem.sequence, output_format="html_image", size=24
        ),
    }
    template_variables = dict(
        project_name=project_name,
        problem=problem,
        constraints_evaluations=constraints_evaluations,
        objectives_evaluations=objectives_evaluations,
        constraints_before_after=constraints_before_after,
        objectives_before_after=objectives_before_after,
        edits=total_edits,
        diffs_figure_data=diffs_figure_data,
        file_hash=file_hash,
        sequenticons=sequenticons,
    )
    html = report_writer.pug_to_html(
        path=os.path.join(ASSETS_DIR, "optimization_report.pug"),
        **template_variables
    )
    report_writer.write_report(html, root._file("Report.pdf"))

    # CREATE THE "SEQUENCE EDITS" REPORT
    record = problem.to_record(with_sequence_edits=True)
    initial_breaches = problem.constraints_before.filter("failing")
    record.features += initial_breaches.locations_as_features(
        label_prefix="Breach from", merge_overlapping=True
    )
    edits_genbank = root._file("final_sequence_with_edits.gb")
    SeqIO.write(record, edits_genbank.open("w"), "genbank")

    # CREATE THE "FINAL SEQUENCE" REPORT
    final_genbank = root._file("final_sequence.gb")
    problem.to_record(
        final_genbank.open("w"),
        with_constraints=False,
        with_objectives=False,
    )

    if target_is_path:
        return root._close()