Source code for tidymut.core.alphabet
# tidymut/core/alphabet.py
from __future__ import annotations
from abc import ABC
from typing import TYPE_CHECKING
from .constants import (
STANDARD_DNA_BASES,
AMBIGUOUSE_DNA_BASES,
STANDARD_RNA_BASES,
AMBIGUOUSE_RNA_BASES,
STANDARD_AMINO_ACIDS,
AMBIGUOUSE_AMINO_ACIDS,
AA1_TO_3,
AA3_TO_1,
)
if TYPE_CHECKING:
from typing import List, Set
__all__ = ["BaseAlphabet", "DNAAlphabet", "RNAAlphabet", "ProteinAlphabet"]
def __dir__() -> List[str]:
    """Return the module's public names, i.e. the ``__all__`` list itself."""
    return __all__
[docs]
class BaseAlphabet(ABC):
    """Base class for biological alphabets.

    Letters are stored upper-cased and every check upper-cases its input
    before comparing, so all validation is case-insensitive.

    Parameters
    ----------
    letters : Set[str]
        Symbols that make up the alphabet; each one is upper-cased on storage.
    name : str
        Label used in error messages and in ``str()`` output.
    """

    def __init__(self, letters: Set[str], name: str):
        self.letters = {symbol.upper() for symbol in letters}
        self.name = name

    def is_valid_char(self, char: str) -> bool:
        """Return True if ``char``, upper-cased, belongs to this alphabet."""
        return char.upper() in self.letters

    def is_valid_sequence(self, sequence: str) -> bool:
        """Return True if every character of ``sequence`` is valid.

        An empty sequence is considered valid.
        """
        for symbol in sequence:
            if not self.is_valid_char(symbol):
                return False
        return True

    def get_invalid_chars(self, sequence: str) -> Set[str]:
        """Return the upper-cased characters of ``sequence`` not in the alphabet."""
        seen = {symbol.upper() for symbol in sequence}
        return seen - self.letters

    def validate_sequence(self, sequence: str) -> str:
        """Return ``sequence`` upper-cased, or fail if it has invalid characters.

        Raises
        ------
        ValueError
            If any character of ``sequence`` is outside the alphabet; the
            message lists the offending (upper-cased) characters.
        """
        invalid = self.get_invalid_chars(sequence)
        if not invalid:
            return sequence.upper()
        raise ValueError(f"Invalid characters in {self.name} sequence: {invalid}")

    def __contains__(self, char: str) -> bool:
        # Enables ``char in alphabet`` with the same case-insensitive rule.
        return self.is_valid_char(char)

    def __str__(self) -> str:
        # Letters are sorted so the rendering does not depend on set order.
        return f"{self.name}Alphabet: {''.join(sorted(self.letters))}"
[docs]
class DNAAlphabet(BaseAlphabet):
    """DNA alphabet (A, T, C, G).

    Parameters
    ----------
    include_ambiguous : bool, default False
        When True, the alphabet is the union of ``STANDARD_DNA_BASES`` and
        ``AMBIGUOUSE_DNA_BASES`` (IUPAC ambiguous nucleotide codes);
        otherwise only ``STANDARD_DNA_BASES`` is used.
    """

    def __init__(self, include_ambiguous: bool = False):
        letters = STANDARD_DNA_BASES
        if include_ambiguous:
            # IUPAC ambiguous nucleotide codes; ``|`` builds a new set, so
            # the shared constants are left untouched.
            letters = letters | AMBIGUOUSE_DNA_BASES
        super().__init__(letters, "DNA")
        self.include_ambiguous = include_ambiguous
[docs]
class RNAAlphabet(BaseAlphabet):
    """RNA alphabet (A, U, C, G).

    Parameters
    ----------
    include_ambiguous : bool, default False
        When True, the alphabet is the union of ``STANDARD_RNA_BASES`` and
        ``AMBIGUOUSE_RNA_BASES``; otherwise only ``STANDARD_RNA_BASES`` is
        used.
    """

    def __init__(self, include_ambiguous: bool = False):
        letters = STANDARD_RNA_BASES
        if include_ambiguous:
            # ``|`` builds a new set, so the shared constants are untouched.
            letters = letters | AMBIGUOUSE_RNA_BASES
        super().__init__(letters, "RNA")
        self.include_ambiguous = include_ambiguous
[docs]
class ProteinAlphabet(BaseAlphabet):
    """Protein alphabet (20 standard amino acids + stop codon).

    Parameters
    ----------
    include_stop : bool, default True
        When True, ``"*"`` (stop codon) is part of the alphabet.
    include_ambiguous : bool, default False
        When True, the symbols in ``AMBIGUOUSE_AMINO_ACIDS`` are added.
    """

    def __init__(self, include_stop: bool = True, include_ambiguous: bool = False):
        # Build a fresh mutable set: the shared constant is never mutated,
        # and this also works if the constant is an immutable frozenset
        # (where ``.copy()`` followed by ``.add`` would fail).
        letters = set(STANDARD_AMINO_ACIDS)
        if include_stop:
            letters.add("*")  # Stop codon
        if include_ambiguous:
            # Ambiguous amino acids
            letters.update(AMBIGUOUSE_AMINO_ACIDS)
        super().__init__(letters, "Protein")
        self.include_stop = include_stop
        self.include_ambiguous = include_ambiguous

    def get_three_letter_code(self, one_letter: str, strict: bool = True) -> str:
        """Convert a one-letter amino acid code to its three-letter code.

        The lookup is case-insensitive in both modes: the input is
        upper-cased once and that same key is used for the strict check and
        for the lookup.

        Parameters
        ----------
        one_letter : str
            One-letter amino acid code, in any case.
        strict : bool, default True
            When True, an unknown code raises; when False, ``"Unk"`` is
            returned for an unknown code.

        Raises
        ------
        KeyError
            If ``strict`` is True and the upper-cased code is not a key of
            ``AA1_TO_3``.
        """
        # NOTE(review): assumes AA1_TO_3 keys are upper-case, as the
        # pre-existing ``.upper()`` lookup already did - confirm in constants.
        code = one_letter.upper()
        if strict and code not in AA1_TO_3:
            raise KeyError(f"Invalid character: {one_letter}")
        return AA1_TO_3.get(code, "Unk")

    def get_one_letter_code(self, three_letter: str, strict: bool = True) -> str:
        """Convert a three-letter amino acid code to its one-letter code.

        The lookup is case-insensitive in both modes: the input is
        upper-cased once and that same key is used for the strict check and
        for the lookup.

        Parameters
        ----------
        three_letter : str
            Three-letter amino acid code, in any case.
        strict : bool, default True
            When True, an unknown code raises; when False, ``"X"`` is
            returned for an unknown code.

        Raises
        ------
        KeyError
            If ``strict`` is True and the upper-cased code is not a key of
            ``AA3_TO_1``.
        """
        # NOTE(review): assumes AA3_TO_1 keys are upper-case, as the
        # pre-existing ``.upper()`` lookup already did - confirm in constants.
        code = three_letter.upper()
        if strict and code not in AA3_TO_1:
            raise KeyError(f"Invalid amino acid code: {three_letter}")
        return AA3_TO_1.get(code, "X")