Source code for tidymut.core.mutation

# tidymut/core/mutation.py
from __future__ import annotations

import re
from abc import ABC, abstractmethod
from typing import Generic, TYPE_CHECKING

from .alphabet import ProteinAlphabet, DNAAlphabet, RNAAlphabet
from .codon import CodonTable
from .types import MutationType

if TYPE_CHECKING:
    from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Type

    from .alphabet import BaseAlphabet

# Names exported by ``from ... import *``; the module-level ``__dir__``
# defined below returns this same list.
__all__ = [
    "AminoAcidMutation",
    "AminoAcidMutationSet",
    "BaseMutation",
    "CodonMutation",
    "CodonMutationSet",
    "MutationSet",
]


def __dir__() -> List[str]:
    """Return the public names of this module.

    A copy of ``__all__`` is returned so that a caller mutating the result
    cannot alter the module's export list.
    """
    return list(__all__)


class BaseMutation(ABC):
    """Base class for all mutations.

    Stores the wild-type state, the mutant state, a 0-indexed position, an
    optional alphabet and a metadata dictionary. Subclasses provide the
    string form, the validity check, the category and the string parser.

    Parameters
    ----------
    wild_type : str
        State before the mutation.
    mutant_type : str
        State after the mutation.
    position : int
        0-indexed position of the mutation.
    alphabet : Optional[BaseAlphabet], default=None
        Alphabet associated with the mutation.
    metadata : Optional[Dict[str, Any]], default=None
        Free-form metadata; a new empty dict is used when omitted or empty.

    Raises
    ------
    ValueError
        If ``position`` is negative.
    """

    def __init__(
        self,
        wild_type: str,
        mutant_type: str,
        position: int,
        alphabet: Optional[BaseAlphabet] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        if position < 0:
            raise ValueError("Position must be non-negative (0-indexed)")

        self.wild_type = wild_type
        self.mutant_type = mutant_type
        self.position = position
        self.alphabet = alphabet
        self.metadata = metadata or {}

    @property
    @abstractmethod
    def type(self) -> str:
        """Get the type of mutation"""

    @abstractmethod
    def __str__(self) -> str:
        """Return string representation of the mutation"""

    @abstractmethod
    def _is_valid(self) -> bool:
        """Check if the mutation is valid"""

    @abstractmethod
    def get_mutation_category(self) -> str:
        """Get mutation category"""

    @classmethod
    @abstractmethod
    def from_string(
        cls,
        mutation_string: str,
        is_zero_based: bool,
        alphabet: Optional[BaseAlphabet] = None,
    ) -> "BaseMutation":
        """Parse mutation from string format like 'A123V' or 'Ala123Val'"""

    def __eq__(self, other: object) -> bool:
        """Check if two mutations are equal.

        Two mutations are equal when they are of exactly the same class and
        agree on position, string form and ``type``. The exact-class test
        mirrors ``__hash__``, which also keys on the class, so equal objects
        always hash equally. ``NotImplemented`` is returned for any other
        object so Python can try the reflected comparison before falling
        back to "not equal".
        """
        if type(other) is not type(self):
            return NotImplemented
        return (
            self.position == other.position
            and str(self) == str(other)
            and self.type == other.type
        )

    def __hash__(self) -> int:
        """Enable use in sets and as dict keys"""
        return hash((self.__class__, self.position, str(self), self.type))
class AminoAcidMutation(BaseMutation):
    """Represents an amino acid mutation (e.g., A123V).

    Parameters
    ----------
    wild_type : str
        Wild-type amino acid code; stored upper-cased.
    position : int
        0-indexed position of the mutation.
    mutant_type : str
        Mutant amino acid code; stored upper-cased.
    alphabet : Optional[ProteinAlphabet], default=None
        Alphabet used for validation. Defaults to
        ``ProteinAlphabet(include_stop=True)``.
    metadata : Optional[Dict[str, Any]], default=None
        Free-form metadata attached to the mutation.

    Raises
    ------
    ValueError
        If the position is negative or either code is not in the alphabet.
    """

    # Three-letter form, e.g. "Ala123Val".
    _THREE_LETTER_PATTERN = re.compile(r"^([A-Za-z]{3})(\d+)([A-Za-z]{3})$")
    # One-letter form, e.g. "A123V"; "*" is accepted on either side.
    _ONE_LETTER_PATTERN = re.compile(r"^([A-Z\*])(\d+)([A-Z\*])$")

    def __init__(
        self,
        wild_type: str,
        position: int,
        mutant_type: str,
        alphabet: Optional[ProteinAlphabet] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(wild_type, mutant_type, position, alphabet, metadata)
        self.alphabet = alphabet or ProteinAlphabet(include_stop=True)
        self.wild_amino_acid = wild_type.upper()
        self.mutant_amino_acid = mutant_type.upper()

        if not self._is_valid():
            raise ValueError(f"Invalid amino acid mutation: {self}")

    @property
    def type(self) -> str:
        """Get the type of mutation"""
        return "amino_acid"

    @property
    def effect_type(self) -> Literal["synonymous", "nonsense", "missense"]:
        """Get the effect type of the mutation (synonymous, nonsense, or missense)"""
        return self.get_mutation_category()

    def __str__(self) -> str:
        # The stored (0-indexed) position is printed as-is.
        return f"{self.wild_amino_acid}{self.position}{self.mutant_amino_acid}"

    def _is_valid(self) -> bool:
        """Check if mutation uses valid amino acid codes"""
        return (
            self.wild_amino_acid in self.alphabet
            and self.mutant_amino_acid in self.alphabet
            and isinstance(self.position, int)
            and self.position >= 0
        )

    def is_synonymous(self) -> bool:
        """Check if mutation is synonymous (no change)"""
        return self.wild_amino_acid == self.mutant_amino_acid

    def is_nonsense(self) -> bool:
        """Check if mutation introduces stop codon"""
        return self.mutant_amino_acid == "*"

    def is_missense(self) -> bool:
        """Check if mutation is missense (changes amino acid)"""
        return not self.is_synonymous() and not self.is_nonsense()

    def get_mutation_category(self) -> Literal["synonymous", "nonsense", "missense"]:
        """Get mutation classification.

        Synonymous is tested first, so an unchanged stop (``*`` to ``*``)
        is reported as "synonymous" rather than "nonsense".
        """
        if self.is_synonymous():
            return "synonymous"
        if self.is_nonsense():
            return "nonsense"
        return "missense"

    @classmethod
    def from_string(
        cls,
        mutation_str: str,
        is_zero_based: bool = False,
        alphabet: Optional[ProteinAlphabet] = None,
    ) -> "AminoAcidMutation":
        """Parse mutation from string format like 'A123V' or 'Ala123Val'.

        Parameters
        ----------
        mutation_str : str
            Mutation text; surrounding whitespace is ignored.
        is_zero_based : bool, default=False
            Whether the position in ``mutation_str`` is already 0-indexed.
            When False the position is shifted down by one before storing.
            This applies to both the one-letter and three-letter formats.
        alphabet : Optional[ProteinAlphabet], default=None
            Alphabet used to translate three-letter codes and to validate
            the mutation.

        Returns
        -------
        AminoAcidMutation
            The parsed mutation with a 0-indexed position.

        Raises
        ------
        ValueError
            If the string matches neither format, a three-letter code is
            unknown to the alphabet, or the resulting mutation is invalid.
        """
        mutation_str = mutation_str.strip()
        # Stored positions are 0-indexed; 1-based input is shifted by one.
        offset = 0 if is_zero_based else 1

        # Handle three-letter codes first
        match = cls._THREE_LETTER_PATTERN.match(mutation_str)
        if match:
            alphabet = alphabet or ProteinAlphabet(include_stop=True)
            wild_three, position, mutant_three = match.groups()
            try:
                wild_amino_acid = alphabet.get_one_letter_code(wild_three)
                mutant_amino_acid = alphabet.get_one_letter_code(mutant_three)
            except KeyError as e:
                raise ValueError(f"Unknown three-letter amino acid code: {e}") from e
            return cls(
                wild_amino_acid, int(position) - offset, mutant_amino_acid, alphabet
            )

        # Handle one-letter codes
        match = cls._ONE_LETTER_PATTERN.match(mutation_str)
        if not match:
            raise ValueError(
                f"Invalid mutation format: {mutation_str}. "
                f"Expected formats: 'A123V' or 'Ala123Val'"
            )

        wild_amino_acid, position, mutant_amino_acid = match.groups()
        return cls(wild_amino_acid, int(position) - offset, mutant_amino_acid, alphabet)
class CodonMutation(BaseMutation):
    """Represents a codon mutation.

    The sequence type is detected from the codons themselves: "DNA" when a
    T is present, "RNA" when a U is present, and "Both" when neither occurs.
    When no alphabet is given, ``RNAAlphabet`` is used for RNA codons and
    ``DNAAlphabet`` otherwise (including the ambiguous "Both" case).

    Parameters
    ----------
    wild_type : str
        Wild-type codon; stored upper-cased.
    position : int
        0-indexed position of the mutation.
    mutant_type : str
        Mutant codon; stored upper-cased.
    alphabet : Optional[BaseAlphabet], default=None
        Alphabet used to validate both codons.
    metadata : Optional[Dict[str, Any]], default=None
        Free-form metadata attached to the mutation.

    Raises
    ------
    ValueError
        If the position is negative, the codons mix T and U, either codon is
        not three characters long, or either codon is rejected by the
        alphabet.
    """

    # Codon form, e.g. "ATG123TAA" or "AUG123UAA" (upper-case only).
    _CODON_PATTERN = re.compile(r"^([ATUCG]{3})(\d+)([ATUCG]{3})$")

    def __init__(
        self,
        wild_type: str,
        position: int,
        mutant_type: str,
        alphabet: Optional[BaseAlphabet] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(wild_type, mutant_type, position, alphabet, metadata)
        self.wild_codon = wild_type.upper()
        self.mutant_codon = mutant_type.upper()

        # Auto-detect sequence type based on presence of T/U
        self.seq_type: Literal["DNA", "RNA", "Both"] = self._detect_seq_type(
            self.wild_codon, self.mutant_codon
        )

        # Use appropriate alphabet based on detected sequence type
        if alphabet is not None:
            self.alphabet = alphabet
        elif self.seq_type == "RNA":
            self.alphabet = RNAAlphabet()
        else:
            self.alphabet = DNAAlphabet()

        if not self._is_valid():
            raise ValueError(f"Invalid codon mutation: {self}")

    @staticmethod
    def _detect_seq_type(
        wild_codon: str, mutant_codon: str
    ) -> Literal["DNA", "RNA", "Both"]:
        """Auto-detect sequence type based on T/U presence.

        Raises
        ------
        ValueError
            If the two codons together contain both T and U.
        """
        combined_sequence = (wild_codon + mutant_codon).upper()
        has_t = "T" in combined_sequence
        has_u = "U" in combined_sequence

        if has_t and has_u:
            raise ValueError("Codons cannot contain both T and U")
        if has_t:
            return "DNA"
        if has_u:
            return "RNA"
        return "Both"

    @property
    def type(self) -> str:
        """Get the type of mutation"""
        return f"codon_{self.seq_type.lower()}"

    def __str__(self) -> str:
        # The stored (0-indexed) position is printed as-is.
        return f"{self.wild_codon}{self.position}{self.mutant_codon}"

    def _is_valid(self) -> bool:
        """Check if codons are valid"""
        return (
            len(self.wild_codon) == 3
            and len(self.mutant_codon) == 3
            and self.alphabet.is_valid_sequence(self.wild_codon)
            and self.alphabet.is_valid_sequence(self.mutant_codon)
            and isinstance(self.position, int)
            and self.position >= 0
        )

    def get_mutation_category(self) -> str:
        """Get mutation category; same value as the ``type`` property."""
        return f"codon_{self.seq_type.lower()}"

    def to_amino_acid_mutation(
        self, codon_table: Optional[CodonTable] = None
    ) -> AminoAcidMutation:
        """Convert codon mutation to amino acid mutation.

        Parameters
        ----------
        codon_table : Optional[CodonTable], default=None
            Table used to translate both codons. When omitted, the standard
            table for the detected sequence type is used, with DNA chosen
            for the ambiguous "Both" case.

        Returns
        -------
        AminoAcidMutation
            Mutation at the same position, carrying a copy of the metadata.
        """
        if codon_table is None:
            # Default to DNA when ambiguous
            table_kind = "DNA" if self.seq_type == "Both" else self.seq_type
            codon_table = CodonTable.get_standard_table(table_kind)

        wild_aa = codon_table.translate_codon(self.wild_codon)
        mutant_aa = codon_table.translate_codon(self.mutant_codon)

        return AminoAcidMutation(
            wild_aa, self.position, mutant_aa, metadata=self.metadata.copy()
        )

    @classmethod
    def from_string(
        cls,
        mutation_str: str,
        is_zero_based: bool = False,
        alphabet: Optional[BaseAlphabet] = None,
    ) -> "CodonMutation":
        """Parse mutation from string format like 'ATG123TAA' or 'AUG123UAA'.

        Parameters
        ----------
        mutation_str : str
            Mutation text; surrounding whitespace is ignored.
        is_zero_based : bool, default=False
            Whether the position in ``mutation_str`` is already 0-indexed.
            When False the position is shifted down by one before storing.
        alphabet : Optional[BaseAlphabet], default=None
            Alphabet used for validation; chosen by the constructor from the
            detected sequence type when omitted.

        Returns
        -------
        CodonMutation
            The parsed mutation with a 0-indexed position.

        Raises
        ------
        ValueError
            If the string does not match the codon format or the resulting
            mutation is invalid.
        """
        mutation_str = mutation_str.strip()

        match = cls._CODON_PATTERN.match(mutation_str)
        if not match:
            raise ValueError(f"Invalid codon mutation format: {mutation_str}")

        wild_codon, position, mutant_codon = match.groups()
        # Stored positions are 0-indexed; 1-based input is shifted by one.
        offset = 0 if is_zero_based else 1
        # The constructor selects the default alphabet when none is given.
        return cls(wild_codon, int(position) - offset, mutant_codon, alphabet)
class MutationSet(Generic[MutationType]):
    """Represents a set of mutations of the same type.

    Mutations are kept sorted by position, and no two mutations may share a
    position.

    Parameters
    ----------
    mutations : Sequence[MutationType]
        Mutations to store; must not be empty.
    mutation_type : Optional[Type[MutationType]], default=None
        Class every mutation must be an instance of. When None, the class of
        the first mutation is used.
    name : Optional[str], default=None
        Optional name for the set.
    metadata : Optional[Dict[str, Any]], default=None
        Free-form metadata attached to the set.

    Raises
    ------
    ValueError
        If ``mutations`` is empty, contains instances of another class,
        mixes incompatible ``type`` values, or repeats a position.
    TypeError
        If ``mutation_type`` is not a subclass of ``BaseMutation``.
    """

    # ``type`` value of codons that contain neither T nor U.
    _AMBIGUOUS_SUBTYPE = "codon_both"
    # The one combination of different ``type`` values allowed in a set.
    _ALLOWED_MIXED_SUBTYPES = frozenset({"codon_dna", "codon_both"})

    def __init__(
        self,
        mutations: Sequence[MutationType],
        mutation_type: Optional[Type[MutationType]] = None,
        name: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        if not mutations:
            raise ValueError("MutationSet must contain at least one mutation")

        if mutation_type is None:
            # guess the mutation type based on the first mutation
            mutation_type = type(mutations[0])

        # Validate that all mutations are of the same type and mutation type
        self._validate_mutation_types(mutations, mutation_type)

        # Validate that all mutations have the same type property
        self._validate_mutation_type_consistency(mutations)

        # Check for duplicate positions
        self._validate_unique_positions(mutations)

        self.mutations: List = sorted(mutations, key=lambda m: m.position)
        self.mutation_type = mutation_type
        self.name = name
        self.metadata = metadata or {}

    def __str__(self) -> str:
        return ",".join(str(m) for m in self.mutations)

    def __repr__(self) -> str:
        # Use the runtime class name so subclasses identify themselves.
        return (
            f"{type(self).__name__}(mutations={self.mutations!r}, "
            f"mutation_type={self.mutation_type}, name={self.name!r})"
        )

    def __len__(self) -> int:
        """Return number of mutations"""
        return len(self.mutations)

    def __iter__(self):
        """Make the mutation set iterable"""
        return iter(self.mutations)

    @property
    def mutation_subtype(self) -> str:
        """Get the specific mutation subtype (e.g., 'amino_acid', 'codon_dna', 'codon_rna', 'codon_both').

        In a set that mixes 'codon_dna' with 'codon_both', the specific
        subtype is returned regardless of which mutation sorts first.
        Returns 'unknown' when the set has been emptied.
        """
        if not self.mutations:
            return "unknown"
        for mutation in self.mutations:
            if mutation.type != self._AMBIGUOUS_SUBTYPE:
                return mutation.type
        return self.mutations[0].type

    @classmethod
    def _subtypes_are_compatible(cls, subtypes: Set[str]) -> bool:
        """Return True when the given ``type`` values may share one set."""
        return len(subtypes) <= 1 or subtypes == cls._ALLOWED_MIXED_SUBTYPES

    def _validate_mutation_types(
        self, mutations: Sequence[MutationType], expected_type: Type[MutationType]
    ) -> None:
        """Validate that all mutations are of the expected class type"""
        # ``issubclass`` itself raises on non-class arguments, so check first.
        if not (
            isinstance(expected_type, type) and issubclass(expected_type, BaseMutation)
        ):
            raise TypeError(
                f"mutation_type must be a subclass of BaseMutation, "
                f"got {expected_type!r}"
            )

        invalid_mutations = [m for m in mutations if not isinstance(m, expected_type)]
        if invalid_mutations:
            raise ValueError(
                f"All mutations must be of type {expected_type.__name__}. "
                f"Found {len(invalid_mutations)} mutations of different types."
            )

    def _validate_mutation_type_consistency(
        self, mutations: Sequence[MutationType]
    ) -> None:
        """Validate that all mutations have the same type property.

        The single exception is a mix of 'codon_dna' and 'codon_both'.
        """
        types_found = {m.type for m in mutations}
        if not self._subtypes_are_compatible(types_found):
            raise ValueError(
                f"All mutations must have the same type property. "
                f"Found mixed types: {types_found}"
            )

    def _validate_unique_positions(self, mutations: Sequence[MutationType]) -> None:
        """Validate that mutations have unique positions"""
        seen = set()
        duplicates = set()
        for mutation in mutations:
            if mutation.position in seen:
                duplicates.add(mutation.position)
            else:
                seen.add(mutation.position)
        if duplicates:
            raise ValueError(f"Duplicate mutations at positions: {duplicates}")

    def add_mutation(self, mutation: MutationType) -> None:
        """Add a mutation to this set.

        The same rules as at construction apply, including acceptance of a
        'codon_dna' / 'codon_both' mix.

        Raises
        ------
        ValueError
            If the mutation is of the wrong class, has an incompatible
            ``type`` value, or its position is already occupied.
        """
        # Validate mutation class type
        if not isinstance(mutation, self.mutation_type):
            raise ValueError(
                f"Mutation must be of type {self.mutation_type.__name__}, "
                f"got {type(mutation).__name__}"
            )

        # Validate mutation type property consistency
        combined_types = {m.type for m in self.mutations} | {mutation.type}
        if not self._subtypes_are_compatible(combined_types):
            raise ValueError(
                f"Mutation type property must match existing mutations. "
                f"Expected '{self.mutation_subtype}', got '{mutation.type}'"
            )

        # Check for position conflict
        if self.has_mutation_at(mutation.position):
            raise ValueError(f"Mutation already exists at position {mutation.position}")

        self.mutations.append(mutation)
        self.mutations.sort(key=lambda m: m.position)

    def remove_mutation(self, position: int) -> bool:
        """Remove mutation at specified position, return True if removed"""
        original_length = len(self.mutations)
        self.mutations = [m for m in self.mutations if m.position != position]
        return len(self.mutations) < original_length

    def get_mutation_at(self, position: int) -> Optional[MutationType]:
        """Get mutation at specified position, or None if there is none"""
        for mutation in self.mutations:
            if mutation.position == position:
                return mutation
        return None

    def has_mutation_at(self, position: int) -> bool:
        """Check if there is a mutation at specified position"""
        return position in self.get_positions()

    def is_single_mutation(self) -> bool:
        """Check if this is a single mutation"""
        return len(self.mutations) == 1

    def is_multiple_mutations(self) -> bool:
        """Check if this contains multiple mutations"""
        return len(self.mutations) > 1

    def get_mutation_count(self) -> int:
        """Get number of mutations"""
        return len(self.mutations)

    def validate_all(self) -> bool:
        """Validate all mutations"""
        return all(mutation._is_valid() for mutation in self.mutations)

    def get_positions(self) -> List[int]:
        """Get all mutation positions"""
        return [mutation.position for mutation in self.mutations]

    def get_positions_set(self) -> Set[int]:
        """Get all mutation positions as a set"""
        return set(self.get_positions())

    def get_mutation_categories(self) -> Dict[str, int]:
        """Get mutation category statistics"""
        categories = {}
        for mutation in self.mutations:
            category = mutation.get_mutation_category()
            categories[category] = categories.get(category, 0) + 1
        return categories

    def filter_by_category(self, category: str) -> List[MutationType]:
        """Filter mutations by category"""
        return [m for m in self.mutations if m.get_mutation_category() == category]

    def sort_by_position(self) -> None:
        """Sort mutations by position in ascending order"""
        self.mutations.sort(key=lambda m: m.position)

    def get_sorted_by_position(self) -> List[MutationType]:
        """Get mutations sorted by position without modifying the original list"""
        return sorted(self.mutations, key=lambda m: m.position)

    @classmethod
    def from_string(
        cls,
        string: str,
        sep: Optional[str] = None,
        is_zero_based: bool = False,
        mutation_type: Optional[Type[MutationType]] = None,
        alphabet: Optional[BaseAlphabet] = None,
        name: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> "MutationSet":
        """
        Create a mutation set from a string

        Parameters
        ----------
        string : str
            String containing mutations separated by delimiter
        sep : Optional[str], default=None
            Separator to use. If None, will try to guess
        is_zero_based : bool, default=False
            Whether origin mutation positions are zero-based
        mutation_type : Type[MutationType], default=None
            The type of mutations to create. If None, will infer from first mutation
        alphabet : Optional[BaseAlphabet], default=None
            Alphabet to use for mutation parsing (if applicable)
        name : str, default=None
            Optional name for the mutation set
        metadata : Optional[Dict[str, Any]], default=None
            Optional metadata for the mutation set

        Returns
        -------
        MutationSet
            A MutationSet created from the input string. Returns AminoAcidMutationSet
            for amino acid mutations, CodonMutationSet for codon mutations, or
            generic MutationSet for others.

        Raises
        ------
        ValueError
            If string is empty, no valid mutations found, or mutations are inconsistent
        TypeError
            If mutation_type is not a subclass of BaseMutation
        """
        if not string or not string.strip():
            raise ValueError("Input string cannot be empty")

        mutation_parts = cls._split_mutation_string(string.strip(), sep)
        if not mutation_parts:
            raise ValueError("No valid mutation strings found after splitting")

        mutations = []
        errors = []

        for i, mutation_str in enumerate(mutation_parts):
            # Every failure is collected so the final error names all bad
            # parts at once; nothing is silently dropped.
            try:
                if mutation_type is None:
                    # Try to infer mutation type from the first mutation string
                    mutation = cls._infer_and_create_mutation(
                        mutation_str, is_zero_based, alphabet
                    )
                    mutation_type = type(mutation)
                elif hasattr(mutation_type, "from_string"):
                    # Use specified mutation type
                    mutation = mutation_type.from_string(
                        mutation_str, is_zero_based, alphabet
                    )
                else:
                    raise NotImplementedError(
                        f"Mutation type {mutation_type.__name__} does not have from_string method"
                    )
            except Exception as e:
                errors.append(
                    f"Error parsing mutation '{mutation_str}' at position {i}: {str(e)}"
                )
            else:
                mutations.append(mutation)

        if not mutations:
            error_msg = "No valid mutations could be parsed"
            if errors:
                error_msg += f". Errors encountered: {'; '.join(errors)}"
            raise ValueError(error_msg)

        if errors:
            # Raise an exception if we have some valid mutations but also errors
            raise ValueError(f"Some mutations could not be parsed: {'; '.join(errors)}")

        # Return appropriate mutation set type based on detected mutation type
        if mutation_type is AminoAcidMutation:
            return AminoAcidMutationSet(
                mutations=mutations,
                name=name,
                metadata=metadata,
            )
        if mutation_type is CodonMutation:
            return CodonMutationSet(
                mutations=mutations,
                name=name,
                metadata=metadata,
            )
        # For other mutation types or when called on subclasses, use the generic approach
        return cls(
            mutations=mutations,
            mutation_type=mutation_type,
            name=name,
            metadata=metadata,
        )

    @classmethod
    def _split_mutation_string(cls, string: str, sep: Optional[str]) -> List[str]:
        """Split ``string`` into stripped, non-empty mutation strings.

        When ``sep`` is None the separator is guessed; if none is found the
        whole string is treated as a single mutation.
        """
        if sep is None:
            sep = cls._guess_sep(string)
        parts = [string] if sep is None else string.split(sep)
        return [part.strip() for part in parts if part.strip()]

    @classmethod
    def _infer_and_create_mutation(
        cls,
        mutation_str: str,
        is_zero_based: bool = False,
        alphabet: Optional[BaseAlphabet] = None,
    ) -> MutationType:
        """
        Infer mutation type and create mutation from string

        This method tries different mutation types to see which one can parse the string.

        Parameters
        ----------
        mutation_str : str
            Mutation string to parse
        is_zero_based : bool, default=False
            Whether origin mutation positions are zero-based
        alphabet : Optional[BaseAlphabet]
            Optional alphabet to use for parsing

        Returns
        -------
        MutationType
            The mutation created by the first type able to parse the string

        Raises
        ------
        ValueError
            If no known mutation type can parse the string
        """
        # List of mutation types to try (order matters - most common first)
        mutation_types = [
            AminoAcidMutation,
            CodonMutation,
        ]

        last_error = None
        for mutation_type in mutation_types:
            try:
                return mutation_type.from_string(mutation_str, is_zero_based, alphabet)
            except Exception as e:
                # A failed attempt just means "try the next type".
                last_error = e

        # none of the mutation types could parse the string
        raise ValueError(
            f"Could not parse mutation string '{mutation_str}' with any known mutation type. "
            f"Last error: {last_error}"
        ) from last_error

    @classmethod
    def _create_mutation(
        cls,
        mutation_str: str,
        mutation_type: Type[BaseMutation],
        is_zero_based: bool = False,
        alphabet: Optional[BaseAlphabet] = None,
    ) -> BaseMutation:
        """Parse ``mutation_str`` with the given mutation type's ``from_string``."""
        return mutation_type.from_string(mutation_str, is_zero_based, alphabet)

    @staticmethod
    def _guess_sep(string: str) -> Optional[str]:
        """Guess the separator for a string of mutations.

        Returns the candidate that occurs most often, or None when no
        candidate occurs at all.
        """
        if not string:
            return None

        candidate_separators = [";", ",", "|", ":", "/", "\\", "\t"]

        # ``max`` returns the first maximal item, so ties are resolved in
        # favour of the earlier entry in ``candidate_separators``.
        best_sep = max(candidate_separators, key=string.count)
        return best_sep if string.count(best_sep) > 0 else None
class AminoAcidMutationSet(MutationSet[AminoAcidMutation]):
    """A mutation set whose members are all ``AminoAcidMutation`` objects.

    Parameters
    ----------
    mutations : Sequence[AminoAcidMutation]
        Mutations to store.
    name : Optional[str], default=None
        Optional name for the set.
    metadata : Optional[Dict[str, Any]], default=None
        Free-form metadata attached to the set.
    """

    def __init__(
        self,
        mutations: Sequence[AminoAcidMutation],
        name: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        # The element class is fixed; everything else is handled by the base.
        super().__init__(mutations, AminoAcidMutation, name, metadata)

    def get_synonymous_mutations(self) -> List[AminoAcidMutation]:
        """Return the mutations for which ``is_synonymous()`` is true."""
        selected = []
        for candidate in self.mutations:
            if candidate.is_synonymous():
                selected.append(candidate)
        return selected

    def get_missense_mutations(self) -> List[AminoAcidMutation]:
        """Return the mutations for which ``is_missense()`` is true."""
        selected = []
        for candidate in self.mutations:
            if candidate.is_missense():
                selected.append(candidate)
        return selected

    def get_nonsense_mutations(self) -> List[AminoAcidMutation]:
        """Return the mutations for which ``is_nonsense()`` is true."""
        selected = []
        for candidate in self.mutations:
            if candidate.is_nonsense():
                selected.append(candidate)
        return selected

    def has_stop_codon_mutations(self) -> bool:
        """Return True if at least one mutation is a nonsense mutation."""
        for candidate in self.mutations:
            if candidate.is_nonsense():
                return True
        return False

    def count_by_effect_type(self) -> Dict[str, int]:
        """Return how many mutations fall under each effect type.

        The keys are "synonymous", "missense" and "nonsense", in that order.
        """
        synonymous = self.get_synonymous_mutations()
        missense = self.get_missense_mutations()
        nonsense = self.get_nonsense_mutations()
        return {
            "synonymous": len(synonymous),
            "missense": len(missense),
            "nonsense": len(nonsense),
        }
class CodonMutationSet(MutationSet[CodonMutation]):
    """Represents a set of codon mutations.

    Parameters
    ----------
    mutations : Sequence[CodonMutation]
        Mutations to store.
    name : Optional[str], default=None
        Optional name for the set.
    metadata : Optional[Dict[str, Any]], default=None
        Free-form metadata attached to the set.
    """

    def __init__(
        self,
        mutations: Sequence[CodonMutation],
        name: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(mutations, CodonMutation, name, metadata)

    @property
    def seq_type(self) -> Literal["DNA", "RNA", "Both"]:
        """Get the sequence type (DNA, RNA, or Both) of the codon mutations.

        A set may combine codons detected as "DNA" with ambiguous "Both"
        codons, so every mutation is consulted: the first definite type
        found is returned, and "Both" only when no mutation is definite
        (or the set is empty).
        """
        for mutation in self.mutations:
            if mutation.seq_type != "Both":
                return mutation.seq_type
        return "Both"

    def to_amino_acid_mutation_set(
        self, codon_table: Optional[CodonTable] = None
    ) -> AminoAcidMutationSet:
        """Convert all codon mutations to amino acid mutations.

        Parameters
        ----------
        codon_table : Optional[CodonTable], default=None
            Table used to translate every codon. When omitted, the standard
            table for this set's sequence type is used, with DNA chosen for
            the ambiguous "Both" case.

        Returns
        -------
        AminoAcidMutationSet
            Set of translated mutations. Its name is this set's name with an
            ``_aa`` suffix (None when this set is unnamed) and its metadata
            is a copy of this set's metadata.
        """
        if codon_table is None:
            # Default to DNA when ambiguous
            seq_type = self.seq_type
            table_kind = "DNA" if seq_type == "Both" else seq_type
            codon_table = CodonTable.get_standard_table(table_kind)

        aa_mutations = [
            mutation.to_amino_acid_mutation(codon_table) for mutation in self.mutations
        ]

        return AminoAcidMutationSet(
            aa_mutations,
            name=f"{self.name}_aa" if self.name else None,
            metadata=self.metadata.copy(),
        )