# tidymut/cleaners/human_domainome_cleaner.py
from __future__ import annotations

from typing import TYPE_CHECKING
from dataclasses import dataclass, field
from pathlib import Path
import logging

import pandas as pd

from .base_config import BaseCleanerConfig
from .basic_cleaners import (
    read_dataset,
    merge_columns,
    extract_and_rename_columns,
    filter_and_clean_data,
    convert_data_types,
    apply_mutations_to_sequences,
    convert_to_mutation_dataset_format,
)
from .human_domainome_custom_cleaners import (
    process_domain_positions,
    add_sequences_to_dataset,
    extract_domain_sequences,
)
from ..core.dataset import MutationDataset
from ..core.pipeline import Pipeline, create_pipeline
from ..utils.sequence_io import load_sequences, parse_uniprot_header

if TYPE_CHECKING:
    from typing import Any, Callable, Dict, List, Optional, Tuple, Union

__all__ = [
    "HumanDomainomeCleanerConfig",
    "create_human_domainome_cleaner",
    "clean_human_domainome_dataset",
]


def __dir__() -> List[str]:
    return __all__


# Create module logger
logger = logging.getLogger(__name__)


@dataclass(kw_only=True)
class HumanDomainomeCleanerConfig(BaseCleanerConfig):
    """Configuration class for HumanDomainome dataset cleaner

    Inherits from BaseCleanerConfig and adds HumanDomainome-specific
    configuration options.

    Attributes
    ----------
    sequence_dict_path : Union[str, Path]
        Path to the file containing UniProt ID to sequence mapping
    header_parser : Callable[[str], Tuple[str, Dict[str, str]]]
        Parse headers in FASTA files and extract relevant information
    column_mapping : Dict[str, str]
        Mapping from source to target column names
    type_conversions : Dict[str, str]
        Data type conversion specifications
    drop_na_columns : List[str]
        List of column names where null values should be dropped
    is_zero_based : bool
        Whether mutation positions are zero-based
    process_workers : int
        Number of workers for parallel processing
    label_columns : List[str]
        List of score columns to process
    primary_label_column : str
        Primary score column for the dataset
    """

    # Path to sequence dictionary file
    sequence_dict_path: Union[str, Path]

    # Header parser function
    header_parser: Callable[[str], Tuple[str, Dict[str, str]]] = parse_uniprot_header

    # Column mapping configuration
    column_mapping: Dict[str, str] = field(
        default_factory=lambda: {
            "uniprot_ID": "name",
            "wt_aa": "wt_aa",
            "mut_aa": "mut_aa",
            "pos": "pos",
            "PFAM_entry": "PFAM_entry",
            "mean_kcalmol_scaled": "label_humanDomainome",
        }
    )

    # Type conversion configuration
    type_conversions: Dict[str, str] = field(
        default_factory=lambda: {"label_humanDomainome": "float"}
    )

    # Columns on which rows containing NA values are dropped
    drop_na_columns: List[str] = field(
        default_factory=lambda: ["name", "PFAM_entry", "pos", "wt_aa", "mut_aa"]
    )

    # Processing parameters
    is_zero_based: bool = False  # Human Domainome uses 1-based positions
    process_workers: int = 16

    # Score columns configuration
    label_columns: List[str] = field(default_factory=lambda: ["label_humanDomainome"])
    primary_label_column: str = "label_humanDomainome"

    # Override default pipeline name
    pipeline_name: str = "human_domainome_cleaner"

    def __post_init__(self):
        self.type_conversions.update({"pos": "int", "mut_rel_pos": "int"})
        return super().__post_init__()

    def validate(self) -> None:
        """Validate HumanDomainome-specific configuration parameters

        Raises
        ------
        ValueError
            If configuration is invalid
        """
        # Call parent validation
        super().validate()

        # Validate sequence dictionary path
        if self.sequence_dict_path is not None:
            seq_path = Path(self.sequence_dict_path)
            if not seq_path.exists():
                raise ValueError(
                    f"Sequence dictionary file not found: {self.sequence_dict_path}"
                )

        # Validate score columns
        if not self.label_columns:
            raise ValueError("label_columns cannot be empty")

        if self.primary_label_column not in self.label_columns:
            raise ValueError(
                f"primary_label_column '{self.primary_label_column}' "
                f"must be in label_columns {self.label_columns}"
            )

        # Validate column mapping
        required_mappings = {
            "uniprot_ID",
            "wt_aa",
            "mut_aa",
            "pos",
            "PFAM_entry",
            "mean_kcalmol_scaled",
        }
        missing = required_mappings - set(self.column_mapping.keys())
        if missing:
            raise ValueError(f"Missing required column mappings: {missing}")
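

# A minimal sketch of constructing and validating this config; the FASTA path
# below is a placeholder, not a file shipped with tidymut:
#
#     config = HumanDomainomeCleanerConfig(
#         sequence_dict_path="uniprot_sequences.fasta",
#         process_workers=8,
#     )
#     config.validate()  # raises ValueError if the FASTA file does not exist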


def create_human_domainome_cleaner(
    dataset_or_path: Union[pd.DataFrame, str, Path],
    sequence_dict_path: Union[str, Path],
    config: Optional[
        Union[HumanDomainomeCleanerConfig, Dict[str, Any], str, Path]
    ] = None,
) -> Pipeline:
    """Create HumanDomainome dataset cleaning pipeline

    Parameters
    ----------
    dataset_or_path : Union[pd.DataFrame, str, Path]
        Raw HumanDomainome dataset DataFrame or file path to the Human
        Domainome dataset
        - File: `SupplementaryTable4.txt` from the article
          'Site-saturation mutagenesis of 500 human protein domains'
    sequence_dict_path : Union[str, Path]
        Path to file containing UniProt ID to sequence mapping
    config : Optional[Union[HumanDomainomeCleanerConfig, Dict[str, Any], str, Path]]
        Configuration for the cleaning pipeline. Can be:
        - HumanDomainomeCleanerConfig object
        - Dictionary with configuration parameters (merged with defaults)
        - Path to JSON configuration file (str or Path)
        - None (uses default configuration)

    Returns
    -------
    Pipeline
        The cleaning pipeline

    Raises
    ------
    FileNotFoundError
        If data file or sequence dictionary file not found
    TypeError
        If config has invalid type
    ValueError
        If configuration validation fails

    Examples
    --------
    Basic usage:

    >>> pipeline = create_human_domainome_cleaner(
    ...     "human_domainome.csv",
    ...     "uniprot_sequences.fasta"
    ... )
    >>> pipeline, dataset = clean_human_domainome_dataset(pipeline)

    Custom configuration:

    >>> config = {
    ...     "process_workers": 8,
    ...     "type_conversions": {"label_humanDomainome": "float32"}
    ... }
    >>> pipeline = create_human_domainome_cleaner(
    ...     "human_domainome.csv",
    ...     "sequences.csv",
    ...     config=config
    ... )

    Load configuration from file:

    >>> pipeline = create_human_domainome_cleaner(
    ...     "data.csv",
    ...     "sequences.fasta",
    ...     config="config.json"
    ... )
    """
    seq_path_obj = Path(sequence_dict_path)
    if not seq_path_obj.exists():
        raise FileNotFoundError(
            f"Sequence dictionary file does not exist: {sequence_dict_path}"
        )

    # Handle configuration parameter
    if config is None:
        final_config = HumanDomainomeCleanerConfig(
            sequence_dict_path=sequence_dict_path
        )
    elif isinstance(config, HumanDomainomeCleanerConfig):
        final_config = config
        # Override sequence_dict_path if not set
        if final_config.sequence_dict_path is None:
            final_config.sequence_dict_path = sequence_dict_path
    elif isinstance(config, dict):
        # Partial configuration - merge with defaults
        default_config = HumanDomainomeCleanerConfig(
            sequence_dict_path=sequence_dict_path
        )
        final_config = default_config.merge(config)
    elif isinstance(config, (str, Path)):
        # Load from file
        final_config = HumanDomainomeCleanerConfig.from_json(config)
        # Override sequence_dict_path if not set
        if final_config.sequence_dict_path is None:
            final_config.sequence_dict_path = sequence_dict_path
    else:
        raise TypeError(
            f"config must be HumanDomainomeCleanerConfig, dict, str, Path or None, "
            f"got {type(config)}"
        )

    # Log configuration summary
    logger.info(
        f"HumanDomainome dataset will be cleaned with pipeline: "
        f"{final_config.pipeline_name}"
    )
    logger.debug(f"Configuration:\n{final_config.get_summary()}")

    # Load sequence dictionary
    seq_dict = _load_sequence_dict(
        final_config.sequence_dict_path, header_parser=final_config.header_parser
    )

    try:
        # Create pipeline
        pipeline = create_pipeline(dataset_or_path, final_config.pipeline_name)

        # Add cleaning steps
        pipeline = (
            pipeline.delayed_then(
                extract_and_rename_columns,
                column_mapping=final_config.column_mapping,
            )
            .delayed_then(
                filter_and_clean_data,
                drop_na_columns=final_config.drop_na_columns,
            )
            .delayed_then(
                convert_data_types,
                type_conversions=final_config.type_conversions,
            )
            .delayed_then(
                process_domain_positions,
            )
            .delayed_then(
                merge_columns,
                columns_to_merge=[
                    final_config.column_mapping.get("uniprot_ID", "uniprot_ID"),
                    "pos",
                ],
                new_column_name="protein_mut_id",
            )
            .delayed_then(
                add_sequences_to_dataset,
                sequence_dict=seq_dict,
                name_column=final_config.column_mapping.get(
                    "uniprot_ID", "uniprot_ID"
                ),
            )
            .delayed_then(
                extract_domain_sequences,
                sequence_column="sequence",
                start_pos_column="start_pos",
                end_pos_column="end_pos",
                num_workers=final_config.process_workers,
            )
            .delayed_then(
                apply_mutations_to_sequences,
                sequence_column="sequence",
                name_column=final_config.column_mapping.get(
                    "uniprot_ID", "uniprot_ID"
                ),
                mutation_column="mut_info",
                mutation_sep=",",
                is_zero_based=True,  # After process_domain_positions, positions are 0-based
                sequence_type="protein",
                num_workers=final_config.process_workers,
            )
            .delayed_then(
                convert_to_mutation_dataset_format,
                name_column="protein_mut_id",
                mutation_column="mut_info",
                sequence_column="sequence",
                label_column=final_config.primary_label_column,
                is_zero_based=True,  # After process_domain_positions, positions are 0-based
            )
        )

        # Prepend a read step when the input is a file path
        if isinstance(dataset_or_path, (str, Path)):
            pipeline.add_delayed_step(read_dataset, 0, file_format="tsv")
        elif not isinstance(dataset_or_path, pd.DataFrame):
            raise TypeError(
                f"dataset_or_path must be pd.DataFrame or str/Path, "
                f"got {type(dataset_or_path)}"
            )

        return pipeline
    except Exception as e:
        logger.error(f"Error in creating HumanDomainome cleaning pipeline: {str(e)}")
        raise RuntimeError(
            f"Error in creating HumanDomainome cleaning pipeline: {str(e)}"
        ) from e
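

# Usage sketch: a pre-loaded DataFrame can be passed instead of a path, in
# which case no read_dataset step is prepended (file names are illustrative):
#
#     import pandas as pd
#     df = pd.read_csv("SupplementaryTable4.txt", sep="\t")
#     pipeline = create_human_domainome_cleaner(df, "uniprot_sequences.fasta")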


def clean_human_domainome_dataset(
    pipeline: Pipeline,
) -> Tuple[Pipeline, MutationDataset]:
    """Clean HumanDomainome dataset using configurable pipeline

    Parameters
    ----------
    pipeline : Pipeline
        HumanDomainome dataset cleaning pipeline

    Returns
    -------
    Tuple[Pipeline, MutationDataset]
        - Pipeline: The executed pipeline
        - MutationDataset: The cleaned HumanDomainome dataset

    Raises
    ------
    RuntimeError
        If pipeline execution fails
    """
    try:
        # Run pipeline
        pipeline.execute()

        # Extract results
        dataset_df, ref_sequences = pipeline.data
        human_domainome_dataset = MutationDataset.from_dataframe(
            dataset_df, ref_sequences
        )

        logger.info(
            f"Successfully cleaned HumanDomainome dataset: "
            f"{len(dataset_df)} mutations from {len(ref_sequences)} proteins"
        )

        return pipeline, human_domainome_dataset
    except Exception as e:
        logger.error(f"Error in running HumanDomainome cleaning pipeline: {str(e)}")
        raise RuntimeError(
            f"Error in running HumanDomainome cleaning pipeline: {str(e)}"
        ) from e
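

# End-to-end sketch combining both public entry points; file names are
# placeholders for the supplementary table and a UniProt FASTA:
#
#     pipeline = create_human_domainome_cleaner(
#         "SupplementaryTable4.txt", "uniprot_sequences.fasta"
#     )
#     pipeline, dataset = clean_human_domainome_dataset(pipeline)
#     dataset_df, ref_sequences = pipeline.data  # post-execution results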


def _load_sequence_dict(
    seq_dict_path: Union[str, Path],
    header_parser: Optional[Callable[[str], Tuple[str, Dict[str, str]]]] = None,
) -> Dict[str, str]:
    """Load UniProt ID to sequence mapping from file

    Parameters
    ----------
    seq_dict_path : Union[str, Path]
        Path to sequence dictionary file (CSV, TSV, or FASTA format)
    header_parser : Optional[Callable], default=None
        Function to parse FASTA headers. If None, uses UniProt parser.
        Only used for FASTA files.

    Returns
    -------
    Dict[str, str]
        Dictionary mapping UniProt IDs to sequences
    """
    # Delegate to the shared load_sequences helper
    seq_dict = load_sequences(seq_dict_path, header_parser=header_parser)
    logger.info(f"Loaded {len(seq_dict)} sequences from {seq_dict_path}")
    return seq_dict
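

# A minimal sketch of supplying a custom header parser; simple_parser is a
# hypothetical example, and passing None falls back to the UniProt parser:
#
#     def simple_parser(header: str) -> Tuple[str, Dict[str, str]]:
#         # take the first whitespace-delimited token as the sequence ID
#         return header.split()[0], {}
#
#     seq_dict = _load_sequence_dict(
#         "sequences.fasta", header_parser=simple_parser
#     )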