Source code for dnachisel.SequencePattern

"""Implements the SequencePattern, DnaNotationPattern classes.

These classes are responsible for looking for a pattern in a sequence
(including overlapping patterns !), separating patterns with fixed size
and patterns with maximal size (from problem localization purposes).

The module also implements functions to specify common DNA patterns:
homopolymers, repeats, enzymatic restriction sites.


"""

import re
import itertools
from .Location import Location
from .biotools import (
    reverse_complement,
    NUCLEOTIDE_TO_REGEXPR,
    IUPAC_NOTATION,
)
from Bio.Restriction.Restriction_Dictionary import rest_dict


[docs]class SequencePattern: """Pattern/ that will be looked for in a DNA sequence. Use this class for matching regular expression patterns, and DnaNotationPattern for matching explicit sequences or sequences using Ns etc. Examples -------- >>> expression = "A[ATGC]{3,}" >>> pattern = SequencePattern(expression) >>> constraint = AvoidPattern(pattern) Parameters ---------- expression Any string or regular expression for matching ATGC nucleotides. Note that multi-nucleotides symbols such as "N" (for A-T-G-C), or "K" are not supported by this class, see DnaNotationPattern instead. size Size of the pattern, in number of characters (if none provided, the size of the ``pattern`` string is used). The ``size`` is used to determine the size of windows when performing local optimization and constraint solving. It can be important to provide the size when the ``pattern`` string provided represents a complex regular expression whose maximal matching size cannot be easily evaluated. name Name of the pattern (will be displayed e.g. when the pattern is printed) in_both_strands Set to True (default) if the pattern should also be looked for on the reverse-complement of sequences. """ registered_string_pattern_classes = [] def __init__( self, expression, size=None, name=None, in_both_strands=True, lookahead="loop", ): if size is None: size = len(expression) self.expression = expression self.lookahead = lookahead if lookahead == "re": expression = "(?=(%s))" % expression if "(" not in expression: expression = "(%s)" % expression self.lookahead_expression = expression self.compiled_expression = re.compile(self.lookahead_expression) self.size = size self.name = name self.in_both_strands = in_both_strands
[docs] def find_matches(self, sequence, location=None): """Return the locations where the sequence matches the expression. Parameters ---------- sequence A string of "ATGC..." location Location indicating a segment to which to restrict the search. Only patterns entirely included in the segment will be returned Returns ------- matches A list of the locations of matches, of the form ``[(start1, end1), (start2, end2),...]``. """ if location is not None: subsequence = location.extract_sequence(sequence) return [ (loc + location.start) if (location.strand != -1) else Location( location.end - loc.end, location.end - loc.start, strand=-1 ) for loc in self.find_matches(subsequence) ] matches = self.find_all_re_matches(sequence) if self.in_both_strands: reverse = reverse_complement(sequence) L = len(sequence) matches += [ (L - end, L - start, -1) for (start, end, strand) in self.find_all_re_matches(reverse) ] return [Location(start, end, strand) for start, end, strand in matches]
def find_all_re_matches(self, sequence): if self.lookahead == "loop": matches = [] position = 0 while True: result = re.search(self.compiled_expression, sequence) if result is None: return matches start, end = result.start(), result.end() matches.append((start + position, end + position, 1)) sequence = sequence[start + 1 :] position += start + 1 else: return [ (match.start(), match.start() + len(match.groups()[0]), 1) for match in re.finditer(self.compiled_expression, sequence) ] def __str__(self): return self.expression + ( "" if self.name is None else " (%s)" % self.name ) @classmethod def from_string(cls, string): for myclass in cls.registered_string_pattern_classes: pattern = myclass.from_string(string) if pattern is not None: return pattern return SequencePattern(string)
[docs]class DnaNotationPattern(SequencePattern): """Class for patterns in plain DNA notation: ATTGCCA, GCNNKTA, etc. If the sequence is not palyndromic, the pattern will be looked for in both strands of sequences. """ def __init__(self, sequence, name=None, in_both_strands="auto"): """Initialize""" if in_both_strands == "auto": # If the pattern sequence is palydromic there is no use looking # for the pattern in both strands in_both_strands = not (reverse_complement(sequence) == sequence) SequencePattern.__init__( self, size=len(sequence), expression=self.dna_sequence_to_regexpr(sequence), name=name, in_both_strands=in_both_strands, ) self.sequence = sequence
[docs] @staticmethod def dna_sequence_to_regexpr(sequence): """Return a regular expression to find the pattern in a sequence.""" regexpr = "".join([NUCLEOTIDE_TO_REGEXPR[n] for n in sequence]) return regexpr
[docs] def all_variants(self): """Return all ATGC sequence variants of a sequence""" return [ "".join(nucleotides) for nucleotides in itertools.product( *[IUPAC_NOTATION[n] for n in self.sequence] ) ]
def __repr__(self): """Represent the pattern as PatternType(name) """ return self.sequence + ( "" if self.name is None else " (%s)" % self.name ) def __str__(self): """Represent the pattern as PatternType(name) """ return self.sequence + ( "" if self.name is None else " (%s)" % self.name ) @staticmethod def from_string(string): if set(string) <= set(NUCLEOTIDE_TO_REGEXPR.keys()): return DnaNotationPattern(string)
# DEFINITION OF COMMON PATTERNS
[docs]class EnzymeSitePattern(DnaNotationPattern): """Class to represent Enzyme site patterns Examples -------- >>> enzyme_pattern = EnzymeSitePattern("BsaI") >>> constraint = AvoidPattern(enzyme_pattern) """ def __init__(self, enzyme_name): self.enzyme_site = rest_dict[enzyme_name]["site"] DnaNotationPattern.__init__(self, self.enzyme_site, name=enzyme_name)
[docs] @staticmethod def from_string(string): """Convert BsmBI_site to EnzymeSitePattern(BsmBI)""" match = re.match(r"(\S+)_site", string) if match is not None: enzyme_name = match.groups()[0] if enzyme_name in rest_dict: return EnzymeSitePattern(enzyme_name)
def __str__(self): return "%s(%s)" % (self.name, self.enzyme_site)
[docs]class HomopolymerPattern(DnaNotationPattern): """Homopolymer of the form AAAAAAA, TTTTT, etc. Shorthand string version: "7xA", "9xC", etc. Examples -------- >>> pattern = HomopolymerPattern("A", 6) >>> constraint = AvoidPattern(pattern) """ def __init__(self, nucleotide, number): self.nucleotide = nucleotide self.number = number DnaNotationPattern.__init__( self, number * nucleotide, in_both_strands=True ) @staticmethod def from_string(string): match = re.match(r"(\d+)x(\S)$", string) if match is not None: number, nucleotide = match.groups() return HomopolymerPattern(nucleotide, int(number)) def __str__(self): return "%sx%s" % (self.number, self.nucleotide)
[docs]class RepeatedKmerPattern(SequencePattern): """Direct repeats like ATT-ATT, ATGC-ATGC-ATGC, etc. Shorthand string version: "3x4mer", "5x2mer", etc. Examples -------- >>> RepeatedKmerPattern(3, 2) # dimers repeated 3 times """ def __init__(self, n_repeats, kmer_size): self.n_repeats = n_repeats self.kmer_size = kmer_size SequencePattern.__init__( self, size=kmer_size * n_repeats, expression=r"([ATGC]{%d})\1{%d}" % (kmer_size, n_repeats - 1), name="%d-repeats %d-mers" % (n_repeats, kmer_size), in_both_strands=False, # a repeat on a strand is also on the other lookahead="loop", ) @staticmethod def from_string(string): match = re.match(r"(\d+)x(\d+)mer$", string) if match is not None: n_repeats, kmer_size = match.groups() return RepeatedKmerPattern(int(n_repeats), int(kmer_size)) def __str__(self): return "%sx%smer" % (self.n_repeats, self.kmer_size)
SequencePattern.registered_string_pattern_classes = [ HomopolymerPattern, RepeatedKmerPattern, EnzymeSitePattern, DnaNotationPattern, ]