Source code for tidymut.core.sequence

# tidymut/core/sequence.py
from __future__ import annotations

import warnings
from abc import ABC
from pathlib import Path
from typing import TYPE_CHECKING

from .alphabet import DNAAlphabet, RNAAlphabet, ProteinAlphabet
from .codon import CodonTable
from .constants import DNA_BASE_COMPLEMENTS, RNA_BASE_COMPLEMENTS
from .mutation import (
    BaseMutation,
    CodonMutation,
    AminoAcidMutation,
    MutationSet,
    CodonMutationSet,
    AminoAcidMutationSet,
)

if TYPE_CHECKING:
    from typing import Callable, Dict, Optional, List, Literal, Union, Type

    from .alphabet import BaseAlphabet
    from .types import SequenceType


__all__ = ["BaseSequence", "DNASequence", "ProteinSequence", "RNASequence"]


def __dir__() -> List[str]:
    """Return the module's public names, i.e. the contents of ``__all__``."""
    public_names = __all__
    return public_names


class BaseSequence(ABC):
    """Base class for biological sequences.

    An instance couples a validated residue string with the alphabet that
    validated it, an optional name and a metadata dictionary.
    """

    def __init__(
        self,
        sequence: str,
        alphabet: BaseAlphabet,
        name: Optional[str] = None,
        metadata: Optional[Dict] = None,
    ):
        """
        Parameters
        ----------
        sequence : str
            Raw sequence string; stored as returned by
            ``alphabet.validate_sequence``.
        alphabet : BaseAlphabet
            Alphabet used to validate the sequence.
        name : Optional[str], default=None
            Optional name of the sequence.
        metadata : Optional[Dict], default=None
            Optional metadata; an empty dict is used when falsy.
        """
        self.alphabet = alphabet
        self.sequence = self.alphabet.validate_sequence(sequence)
        self.name = name
        self.metadata = metadata or {}

    def __len__(self) -> int:
        return len(self.sequence)

    def __str__(self) -> str:
        return self.sequence

    def __getitem__(self: SequenceType, key) -> SequenceType:
        """
        Return a new sequence of the same class for an index or a slice.

        The new sequence keeps the alphabet, receives a shallow copy of the
        metadata and, when this sequence is named, is named
        ``"{name}_{start}_{stop}"`` using the resolved (non-negative,
        length-clamped) coordinates of the selection.

        Raises
        ------
        TypeError
            If ``key`` is neither an ``int`` nor a ``slice``.
        IndexError
            If an integer ``key`` is out of range.
        """
        length = len(self.sequence)
        if isinstance(key, slice):
            # slice.indices() resolves None and negative bounds, so the name
            # never contains "None" or a negative coordinate.
            start, stop, _ = key.indices(length)
        elif isinstance(key, int):
            start = key + length if key < 0 else key
            stop = start + 1
        else:
            raise TypeError(f"Invalid argument type: {type(key).__name__}")

        return type(self)(
            self.sequence[key],
            self.alphabet,
            f"{self.name}_{start}_{stop}" if self.name else None,
            self.metadata.copy(),
        )

    def __eq__(self, other) -> bool:
        """
        Compare with a plain string (sequence text only) or with another
        sequence (sequence text and alphabet class must both match).
        """
        # NOTE: defining __eq__ without __hash__ makes instances unhashable.
        if isinstance(other, str):
            return self.sequence == other
        if not isinstance(other, BaseSequence):
            return False
        return self.sequence == other.sequence and type(self.alphabet) is type(
            other.alphabet
        )

    def get_subsequence(
        self: SequenceType, start: int, end: Optional[int] = None
    ) -> SequenceType:
        """
        Get a subsequence (0-indexed; ``start`` inclusive, ``end`` exclusive).

        Equivalent to ``self[start:end]`` after validating the bounds, so
        ``end == start`` selects no residues.

        Parameters
        ----------
        start : int
            0-based index of the first residue to include.
        end : Optional[int], default=None
            0-based index one past the last residue; ``None`` means the end
            of the sequence.

        Raises
        ------
        IndexError
            If ``start`` is negative.
        ValueError
            If ``end`` is smaller than ``start``.
        """
        if start < 0:
            raise IndexError("Start position must be greater than or equal to 0")
        if end is not None and end < start:
            raise ValueError("End position must be greater than or equal to start.")
        return self[start:end]

    def apply_mutation(
        self: SequenceType,
        mutation: Union[BaseMutation, MutationSet, MutationSet[BaseMutation]],
    ) -> SequenceType:
        """
        Apply a mutation or set of mutations to the sequence and return a new sequence.

        The original sequence and its metadata are left untouched. Every
        applied mutation is recorded under ``metadata["mutations_applied"]``
        of the returned sequence.

        Parameters
        ----------
        mutation : Union[BaseMutation, MutationSet]
            A single mutation or a set of mutations to apply

        Returns
        -------
        SequenceType
            A new sequence with the mutation(s) applied

        Raises
        ------
        ValueError
            If mutation position is invalid or the wild-type codon / amino
            acid does not match the sequence
        TypeError
            If mutation type is not supported or a codon mutation does not
            match the alphabet of this sequence
        """
        # Mutation sets: apply the members one by one, highest position first.
        # Assumes every MutationSet exposes ``.mutations`` like the codon and
        # amino-acid sets do.
        if isinstance(mutation, (MutationSet, AminoAcidMutationSet, CodonMutationSet)):
            ordered = sorted(
                mutation.mutations, key=lambda m: m.position, reverse=True
            )
            result_sequence = self
            for single_mutation in ordered:
                result_sequence = result_sequence.apply_mutation(single_mutation)
            return result_sequence

        if not isinstance(mutation, BaseMutation):
            raise TypeError(f"Unsupported mutation type: {type(mutation).__name__}")

        position = mutation.position
        length = len(self.sequence)

        if isinstance(mutation, CodonMutation):
            # A codon occupies position, position + 1 and position + 2.
            if position < 0 or position + 2 >= length:
                raise ValueError(
                    f"Codon mutation at position {position} extends beyond sequence length {length}"
                )

            # "Both" marks a codon that is valid as DNA and as RNA, so it is
            # accepted on either alphabet.
            accepted_alphabets = {
                "DNA": (DNAAlphabet,),
                "RNA": (RNAAlphabet,),
                "Both": (DNAAlphabet, RNAAlphabet),
            }.get(mutation.seq_type)
            if accepted_alphabets is None or not isinstance(
                self.alphabet, accepted_alphabets
            ):
                raise TypeError(
                    f"Unmatching mutation subtype: {mutation.seq_type} "
                    f"with {type(self.alphabet).__name__} sequence"
                )

            actual_codon = self.sequence[position : position + 3]
            if actual_codon != mutation.wild_codon:
                raise ValueError(
                    f"Expected codon '{mutation.wild_codon}' at position {position}, "
                    f"but found '{actual_codon}'"
                )

            new_sequence = (
                self.sequence[:position]
                + mutation.mutant_codon
                + self.sequence[position + 3 :]
            )
            details = {
                "wild_codon": mutation.wild_codon,
                "mutant_codon": mutation.mutant_codon,
                "seq_type": mutation.seq_type,
            }

        elif isinstance(mutation, AminoAcidMutation):
            if position < 0 or position >= length:
                raise ValueError(
                    f"Amino acid mutation position {position} is out of bounds for sequence of length {length}"
                )

            actual_aa = self.sequence[position]
            if actual_aa != mutation.wild_amino_acid:
                raise ValueError(
                    f"Expected amino acid '{mutation.wild_amino_acid}' at position {position}, "
                    f"but found '{actual_aa}'"
                )

            new_sequence = (
                self.sequence[:position]
                + mutation.mutant_amino_acid
                + self.sequence[position + 1 :]
            )
            details = {
                "wild_amino_acid": mutation.wild_amino_acid,
                "mutant_amino_acid": mutation.mutant_amino_acid,
                "effect_type": mutation.effect_type,
            }

        else:
            # Other BaseMutation subclasses: neither the wild type nor the
            # replacement length is known, so they cannot be applied.
            if position < 0 or position >= length:
                raise ValueError(
                    f"Mutation position {position} is out of bounds for sequence of length {length}"
                )
            raise TypeError(
                f"Unsupported mutation subtype: {type(mutation).__name__}. "
                f"Only CodonMutation and AminoAcidMutation are supported."
            )

        mutation_record = {
            "type": type(mutation).__name__,
            "mutation_type": mutation.type,
            "position": position,
        }
        mutation_record.update(details)

        # The metadata copy is shallow: build a NEW history list instead of
        # appending, otherwise the source sequence's history would grow too.
        new_metadata = self.metadata.copy()
        new_metadata["mutations_applied"] = [
            *new_metadata.get("mutations_applied", []),
            mutation_record,
        ]

        return type(self)(new_sequence, self.alphabet, self.name, new_metadata)
class ProteinSequence(BaseSequence):
    """Protein sequence with amino acid validation"""

    def __init__(
        self,
        sequence: str,
        alphabet: Optional[ProteinAlphabet] = None,
        name: Optional[str] = None,
        metadata: Optional[Dict] = None,
    ):
        """
        Create a protein sequence.

        When no alphabet is given, ``ProteinAlphabet(include_stop=True)`` is
        used.
        """
        if alphabet is None:
            alphabet = ProteinAlphabet(include_stop=True)
        super().__init__(sequence, alphabet, name, metadata)

    def get_residue(self, position: int) -> str:
        """
        Get amino acid at specific position (0-indexed)

        Raises
        ------
        IndexError
            If ``position`` is negative or not smaller than the sequence
            length.
        """
        length = len(self.sequence)
        if position < 0 or position >= length:
            # The last valid index is length - 1, not length.
            raise IndexError(f"Position {position} out of range (0-{length - 1})")
        return self.sequence[position]

    def find_motif(self, motif: str) -> List[int]:
        """
        Find all positions where motif occurs (0-indexed)

        The motif is upper-cased before searching. Overlapping occurrences
        are reported because the search resumes one position after each hit.
        An empty motif yields an empty list.
        """
        motif = motif.upper()
        if not motif:
            # str.find("") matches at every offset, which is never a real hit.
            return []

        positions = []
        start = 0
        while True:
            pos = self.sequence.find(motif, start)
            if pos == -1:
                break
            positions.append(pos)
            start = pos + 1
        return positions
class RNASequence(BaseSequence):
    """RNA sequence with nucleotide validation"""

    def __init__(
        self,
        sequence: str,
        alphabet: Optional[RNAAlphabet] = None,
        name: Optional[str] = None,
        metadata: Optional[Dict] = None,
    ):
        """Create an RNA sequence; defaults to ``RNAAlphabet()`` when no alphabet is given."""
        if alphabet is None:
            alphabet = RNAAlphabet()
        super().__init__(sequence, alphabet, name, metadata)

    def reverse_complement(self) -> "RNASequence":
        """
        Get reverse complement of RNA sequence

        Returns a new ``RNASequence`` (default alphabet) named
        ``"{name}_rc"`` when this sequence is named, with a copy of the
        metadata.

        Raises
        ------
        ValueError
            If a base has no entry in ``RNA_BASE_COMPLEMENTS``.
        """
        try:
            rev_comp = "".join(
                RNA_BASE_COMPLEMENTS[base] for base in self.sequence[::-1]
            )
        except KeyError as e:
            raise ValueError(f"Invalid RNA base found: {e}") from e
        return RNASequence(
            sequence=rev_comp,
            name=f"{self.name}_rc" if self.name else None,
            # Copy so the derived sequence does not share a dict with self.
            metadata=self.metadata.copy(),
        )

    def back_transcribe(self) -> "DNASequence":
        """
        Back-transcribe RNA sequence into DNA sequence

        Every ``"U"`` is replaced by ``"T"``. The result is named
        ``"{name}_back_transcribe"`` when this sequence is named and receives
        a copy of the metadata.
        """
        dna_seq = self.sequence.replace("U", "T")
        return DNASequence(
            sequence=dna_seq,
            name=f"{self.name}_back_transcribe" if self.name else None,
            metadata=self.metadata.copy(),
        )

    def translate(
        self,
        codon_table: Optional[CodonTable] = None,
        start_at_first_met: bool = False,
        stop_at_stop_codon: bool = False,
        require_mod3: bool = True,
        start: Optional[int] = None,
        end: Optional[int] = None,
    ) -> ProteinSequence:
        """
        Translate RNA sequence into amino acid sequence using this codon table.

        Delegates to the module-level ``translate`` function with
        ``seq_type="RNA"``.

        Parameters
        ----------
        codon_table : Optional[CodonTable], default=None
            Codon table to use for translation. If None, uses standard genetic code.
        start_at_first_met : bool, default=False
            Start translation at the first start codon if found.
        stop_at_stop_codon : bool, default=False
            Stop translation when a stop codon is encountered.
        require_mod3 : bool, default=True
            Whether the sequence must be a multiple of 3 in length.
        start : Optional[int], default=None
            Custom 0-based start position. Overrides `start_at_first_met`.
        end : Optional[int], default=None
            Custom 0-based end position. Overrides `stop_at_stop_codon`.

        Returns
        -------
        ProteinSequence
            Translated amino acid sequence, named ``"{name}_translation"``
            when this sequence is named, with a copy of the metadata.
        """
        aa_seq = translate(
            sequence=self.sequence,
            seq_type="RNA",
            codon_table=codon_table,
            start_at_first_met=start_at_first_met,
            stop_at_stop_codon=stop_at_stop_codon,
            require_mod3=require_mod3,
            start=start,
            end=end,
        )
        return ProteinSequence(
            sequence=aa_seq,
            name=f"{self.name}_translation" if self.name else None,
            metadata=self.metadata.copy(),
        )
class DNASequence(BaseSequence):
    """DNA sequence with nucleotide validation"""

    def __init__(
        self,
        sequence: str,
        alphabet: Optional[DNAAlphabet] = None,
        name: Optional[str] = None,
        metadata: Optional[Dict] = None,
    ):
        """Create a DNA sequence; defaults to ``DNAAlphabet()`` when no alphabet is given."""
        if alphabet is None:
            alphabet = DNAAlphabet()
        super().__init__(sequence, alphabet, name, metadata)

    def reverse_complement(self) -> "DNASequence":
        """
        Get reverse complement of DNA sequence

        Returns a new ``DNASequence`` (default alphabet) named
        ``"{name}_rc"`` when this sequence is named, with a copy of the
        metadata.

        Raises
        ------
        ValueError
            If a base has no entry in ``DNA_BASE_COMPLEMENTS``.
        """
        try:
            rev_comp = "".join(
                DNA_BASE_COMPLEMENTS[base] for base in self.sequence[::-1]
            )
        except KeyError as e:
            raise ValueError(f"Invalid DNA base found: {e}") from e
        return DNASequence(
            sequence=rev_comp,
            name=f"{self.name}_rc" if self.name else None,
            # Copy so the derived sequence does not share a dict with self.
            metadata=self.metadata.copy(),
        )

    def translate(
        self,
        codon_table: Optional[CodonTable] = None,
        start_at_first_met: bool = False,
        stop_at_stop_codon: bool = False,
        require_mod3: bool = True,
        start: Optional[int] = None,
        end: Optional[int] = None,
    ) -> ProteinSequence:
        """
        Translate DNA sequence into amino acid sequence using this codon table.

        Delegates to the module-level ``translate`` function with
        ``seq_type="DNA"``.

        Parameters
        ----------
        codon_table : Optional[CodonTable], default=None
            Codon table to use for translation. If None, uses standard genetic code.
        start_at_first_met : bool, default=False
            Start translation at the first start codon if found.
        stop_at_stop_codon : bool, default=False
            Stop translation when a stop codon is encountered.
        require_mod3 : bool, default=True
            Whether the sequence must be a multiple of 3 in length.
        start : Optional[int], default=None
            Custom 0-based start position. Overrides `start_at_first_met`.
        end : Optional[int], default=None
            Custom 0-based end position. Overrides `stop_at_stop_codon`.

        Returns
        -------
        ProteinSequence
            Translated amino acid sequence, named ``"{name}_translation"``
            when this sequence is named, with a copy of the metadata.
        """
        aa_seq = translate(
            sequence=self.sequence,
            seq_type="DNA",
            codon_table=codon_table,
            start_at_first_met=start_at_first_met,
            stop_at_stop_codon=stop_at_stop_codon,
            require_mod3=require_mod3,
            start=start,
            end=end,
        )
        return ProteinSequence(
            sequence=aa_seq,
            name=f"{self.name}_translation" if self.name else None,
            metadata=self.metadata.copy(),
        )

    def transcribe(self) -> "RNASequence":
        """
        Transcribe DNA sequence into RNA sequence

        Every ``"T"`` is replaced by ``"U"``. The result is named
        ``"{name}_transcribed"`` when this sequence is named and receives a
        copy of the metadata.
        """
        rna_seq = self.sequence.replace("T", "U")
        return RNASequence(
            sequence=rna_seq,
            name=f"{self.name}_transcribed" if self.name else None,
            metadata=self.metadata.copy(),
        )
def translate(
    sequence: str,
    seq_type: Literal["DNA", "RNA"] = "DNA",
    codon_table: Optional[CodonTable] = None,
    start_at_first_met: bool = False,
    stop_at_stop_codon: bool = False,
    require_mod3: bool = True,
    start: Optional[int] = None,
    end: Optional[int] = None,
) -> str:
    """
    Translate DNA or RNA sequence into amino acid sequence using this codon table.

    This function should not be called directly. Instead, use the
    DNASequence or RNASequence classes.

    Parameters
    ----------
    sequence : str
        DNA or RNA sequence to translate.
    seq_type : Literal["DNA", "RNA"], default="DNA"
        Passed to ``CodonTable.get_standard_table`` when no codon table is
        given.
    codon_table : Optional[CodonTable], default=None
        Codon table to use for translation. If None, uses standard genetic code.
    start_at_first_met : bool, default=False
        Start translation at the first start codon if found. Only codons in
        the reading frame beginning at index 0 are inspected; an empty
        string is returned when none is a start codon.
    stop_at_stop_codon : bool, default=False
        Stop translation when a stop codon is encountered. The stop codon
        itself is included in the translated region.
    require_mod3 : bool, default=True
        Whether the sequence must be a multiple of 3 in length. When False,
        trailing nucleotides are discarded with a warning.
    start : Optional[int], default=None
        Custom 0-based start position. Overrides `start_at_first_met`.
    end : Optional[int], default=None
        Custom 0-based end position. Overrides `stop_at_stop_codon`.

    Returns
    -------
    str
        Translated amino acid sequence.

    Raises
    ------
    ValueError
        If the selected region is not a multiple of 3 long and
        ``require_mod3`` is True.
    """
    n = len(sequence)

    # Use standard table if none provided
    if codon_table is None:
        codon_table = CodonTable.get_standard_table(seq_type=seq_type)

    # Auto detect start if `start` not provided
    if start is None:
        if start_at_first_met:
            for i in range(0, n - 2, 3):
                if codon_table.is_start_codon(sequence[i : i + 3]):
                    start = i
                    break
            else:
                return ""  # No start codon found
        else:
            start = 0

    # Auto detect end if `end` not provided
    if end is None:
        end = n
        if stop_at_stop_codon:
            for i in range(start, n - 2, 3):
                if codon_table.is_stop_codon(sequence[i : i + 3]):
                    end = i + 3  # keep the stop codon in the region
                    break

    sub_seq = sequence[start:end]
    remainder = len(sub_seq) % 3
    if remainder:
        if require_mod3:
            raise ValueError(
                f"Sequence length from start={start} to end={end} is not divisible by 3 "
                f"(remainder = {remainder})."
            )
        warnings.warn(
            f"Sequence length from start={start} to end={end} is not divisible by 3. "
            f"Discarding {remainder} trailing nucleotide(s): {sub_seq[-remainder:]}"
        )
        sub_seq = sub_seq[: len(sub_seq) - remainder]

    # Translate codon by codon using the codon table
    return "".join(
        codon_table.translate_codon(sub_seq[i : i + 3])
        for i in range(0, len(sub_seq), 3)
    )


def load_sequences_from_fasta(
    fasta_path: Union[str, Path],
    sequence_class: Type[BaseSequence],
    alphabet: Optional[BaseAlphabet] = None,
    header_func: Optional[Callable[[str], tuple[str, str]]] = None,
    allow_duplicates: bool = False,
) -> Dict[str, BaseSequence]:
    """
    Load sequences from a FASTA file into a dictionary of BaseSequence-derived objects.

    Sequence lines that appear before the first header are ignored.
    Records whose sequence is empty are skipped with a warning.

    Parameters
    ----------
    fasta_path : Union[str, Path]
        Path to the FASTA file
    sequence_class : Type[BaseSequence]
        A subclass of BaseSequence to instantiate (e.g. DNASequence)
    alphabet : Optional[BaseAlphabet], default=None
        Optional alphabet to validate the sequence. If None, uses default for sequence class.
    header_func : Optional[Callable[[str], tuple[str, str]]], default=None
        Function to process header line and extract (sequence_id, description).
        If None, uses default parsing (first word as ID, rest as description).
    allow_duplicates : bool, default=False
        If False, raises error on duplicate sequence IDs. If True, overwrites.

    Returns
    -------
    Dict[str, BaseSequence]
        Dictionary of {sequence_id: sequence_object}

    Raises
    ------
    FileNotFoundError
        If FASTA file doesn't exist
    ValueError
        If the file cannot be decoded, contains no valid sequence, or any
        error occurs while reading it (duplicate IDs with
        allow_duplicates=False, header parsing errors, a missing default
        alphabet, sequence construction errors)
    TypeError
        If sequence_class is not a subclass of BaseSequence
    """
    # Validate inputs
    if not issubclass(sequence_class, BaseSequence):
        raise TypeError(
            f"sequence_class must be a subclass of BaseSequence, got {sequence_class}"
        )

    fasta_path = Path(fasta_path)
    if not fasta_path.exists():
        raise FileNotFoundError(f"FASTA file not found: {fasta_path}")

    # Default header processing function
    def default_header_func(header_line: str) -> tuple[str, str]:
        """Extract ID and description from header line (without '>')"""
        parts = header_line.split(maxsplit=1)
        seq_id = parts[0]
        description = parts[1] if len(parts) > 1 else ""
        return seq_id, description

    # Use provided header function or default
    process_header = header_func or default_header_func

    # Get default alphabet for the sequence class if not provided
    def get_default_alphabet():
        """Get default alphabet for the sequence class"""
        if sequence_class == DNASequence:
            return DNAAlphabet()
        elif sequence_class == RNASequence:
            return RNAAlphabet()
        elif sequence_class == ProteinSequence:
            return ProteinAlphabet(include_stop=True)
        else:
            # TODO: For custom sequence classes, try to instantiate with None
            # and let the class handle default alphabet
            # but for easy, raise an error
            raise TypeError(
                f"sequence_class {sequence_class} does not have a default alphabet, "
                f"please provide it with the alphabet parameter."
            )

    sequences: Dict[str, BaseSequence] = {}
    # Resolved on first use (inside the try block below) and then reused, so
    # a default alphabet is built once per file rather than once per record.
    resolved_alphabet = alphabet
    current_id = None
    current_description = ""
    current_seq_lines = []
    line_number = 0

    def save_current_record() -> None:
        """Store the record collected so far, if a header has been seen."""
        nonlocal resolved_alphabet
        if current_id is None:
            return
        if not resolved_alphabet:
            resolved_alphabet = get_default_alphabet()
        _save_sequence(
            sequences,
            current_id,
            current_description,
            current_seq_lines,
            sequence_class,
            resolved_alphabet,
            allow_duplicates,
            line_number,
        )

    try:
        with fasta_path.open("r", encoding="utf-8") as f:
            for line in f:
                line_number += 1
                line = line.strip()

                # Skip empty lines
                if not line:
                    continue

                if line.startswith(">"):
                    # A new header closes the previous record
                    save_current_record()

                    # Process new header
                    try:
                        current_id, current_description = process_header(line[1:])
                    except Exception as e:
                        raise ValueError(
                            f"Error processing header at line {line_number}: '{line}'. {str(e)}"
                        ) from e

                    current_seq_lines = []
                else:
                    # Accumulate sequence lines with all whitespace removed
                    clean_line = "".join(line.split())
                    if clean_line:  # Only add non-empty lines
                        current_seq_lines.append(clean_line)

            # Save last entry
            save_current_record()

    except UnicodeDecodeError as e:
        # Must precede the generic handler to keep its dedicated message.
        raise ValueError(
            f"Unable to decode file {fasta_path}. Please ensure it's a valid text file."
        ) from e
    except Exception as e:
        raise ValueError(
            f"Error reading FASTA file at line {line_number}: {str(e)}"
        ) from e

    if not sequences:
        raise ValueError(f"No valid sequences found in {fasta_path}")

    return sequences


def _save_sequence(
    sequences: Dict[str, BaseSequence],
    seq_id: str,
    description: str,
    seq_lines: List[str],
    sequence_class: Type[BaseSequence],
    alphabet: BaseAlphabet,
    allow_duplicates: bool,
    line_number: int,
) -> None:
    """
    Helper function to save a sequence to the dictionary

    The sequence lines are joined and upper-cased, then stored in
    ``sequences`` under ``seq_id`` as a ``sequence_class`` instance whose
    metadata holds the description and ``"source": "fasta"``. Empty
    sequences are skipped with a warning.

    Raises
    ------
    ValueError
        If ``seq_id`` is already present and ``allow_duplicates`` is False,
        or if constructing the sequence object fails.
    """
    # Check for duplicate IDs
    if seq_id in sequences and not allow_duplicates:
        raise ValueError(
            f"Duplicate sequence ID '{seq_id}' found near line {line_number}"
        )

    # Join sequence lines and convert to uppercase
    full_seq = "".join(seq_lines).upper()

    # Skip empty sequences
    if not full_seq:
        warnings.warn(f"Empty sequence found for ID '{seq_id}', skipping")
        return

    try:
        # Create sequence object
        sequences[seq_id] = sequence_class(
            sequence=full_seq,
            alphabet=alphabet,
            name=seq_id,
            metadata={"description": description, "source": "fasta"},
        )
    except Exception as e:
        raise ValueError(
            f"Error creating sequence object for ID '{seq_id}': {str(e)}"
        ) from e