Source code for tidymut.cleaners.k50_cleaner

# tidymut/cleaners/k50_cleaner.py
from __future__ import annotations

import pandas as pd
from typing import Tuple, Dict, Any, Optional, Union, Callable, List, Literal
from dataclasses import dataclass, field
from pathlib import Path
import logging

from .base_config import BaseCleanerConfig
from .basic_cleaners import (
    read_dataset,
    extract_and_rename_columns,
    filter_and_clean_data,
    convert_data_types,
    validate_mutations,
    infer_wildtype_sequences,
    convert_to_mutation_dataset_format,
)
from ..core.dataset import MutationDataset
from ..core.pipeline import Pipeline, create_pipeline

__all__ = ["K50CleanerConfig", "create_k50_cleaner", "clean_k50_dataset"]


def __dir__() -> List[str]:
    return __all__


# Create module logger
logger = logging.getLogger(__name__)


@dataclass
class K50CleanerConfig(BaseCleanerConfig):
    """Configuration class for K50 dataset cleaner

    Inherits from BaseCleanerConfig and adds K50-specific configuration options.

    Attributes
    ----------
    column_mapping : Dict[str, str]
        Mapping from source to target column names
    filters : Dict[str, Callable]
        Filter conditions for data cleaning
    type_conversions : Dict[str, str]
        Data type conversion specifications
    validation_workers : int
        Number of workers for mutation validation, set to -1 to use all
        available CPUs
    infer_wt_workers : int
        Number of workers for wildtype sequence inference, set to -1 to use
        all available CPUs
    handle_multiple_wt : Literal["error", "first", "separate"], default="error"
        Strategy for handling multiple wildtype sequences
        ('error', 'first', 'separate')
    label_columns : List[str]
        List of score columns to process
    primary_label_column : str
        Primary score column for the dataset
    """

    # Column mapping configuration
    column_mapping: Dict[str, str] = field(
        default_factory=lambda: {
            "WT_name": "name",
            "aa_seq": "mut_seq",
            "mut_type": "mut_info",
            "ddG_ML": "ddG",
        }
    )

    # Data filtering configuration
    filters: Dict[str, Callable] = field(
        default_factory=lambda: {"ddG": lambda x: x != "-"}
    )

    # Type conversion configuration
    type_conversions: Dict[str, str] = field(default_factory=lambda: {"ddG": "float"})

    # Mutation validation parameters
    validation_workers: int = 16

    # Wildtype inference parameters
    infer_wt_workers: int = 16
    handle_multiple_wt: Literal["error", "first", "separate"] = "error"

    # Score columns configuration
    label_columns: List[str] = field(default_factory=lambda: ["ddG"])
    primary_label_column: str = "ddG"

    # Override default pipeline name
    pipeline_name: str = "k50_cleaner"

    def validate(self) -> None:
        """Validate K50-specific configuration parameters

        Raises
        ------
        ValueError
            If configuration is invalid
        """
        # Call parent validation
        super().validate()

        # Validate handle_multiple_wt parameter
        valid_strategies = ["error", "first", "separate"]
        if self.handle_multiple_wt not in valid_strategies:
            raise ValueError(
                f"handle_multiple_wt must be one of {valid_strategies}, "
                f"got {self.handle_multiple_wt}"
            )

        # Validate score columns
        if not self.label_columns:
            raise ValueError("label_columns cannot be empty")

        if self.primary_label_column not in self.label_columns:
            raise ValueError(
                f"primary_label_column '{self.primary_label_column}' "
                f"must be in label_columns {self.label_columns}"
            )

        # Validate column mapping
        required_mappings = {"WT_name", "aa_seq", "mut_type"}
        missing = required_mappings - set(self.column_mapping.keys())
        if missing:
            raise ValueError(f"Missing required column mappings: {missing}")
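
# Illustrative configuration usage (a sketch, kept in comments so nothing runs
# on import). The merge() and from_json() helpers invoked in create_k50_cleaner
# below are assumed to be provided by BaseCleanerConfig:
#
#     config = K50CleanerConfig(validation_workers=8, handle_multiple_wt="first")
#     config.validate()  # raises ValueError for an invalid strategy or mapping
#
#     # Partial override via a plain dict, merged onto the defaults:
#     config = K50CleanerConfig().merge({"infer_wt_workers": 4})
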

def create_k50_cleaner(
    dataset_or_path: Union[pd.DataFrame, str, Path],
    config: Optional[Union[K50CleanerConfig, Dict[str, Any], str, Path]] = None,
) -> Pipeline:
    """Create K50 dataset cleaning pipeline

    Parameters
    ----------
    dataset_or_path : Union[pd.DataFrame, str, Path]
        Raw K50 dataset DataFrame or file path to K50 dataset

        - Download from: https://zenodo.org/records/799292
        - File: `Tsuboyama2023_Dataset2_Dataset3_20230416.csv` in
          `Processed_K50_dG_datasets.zip`
    config : Optional[Union[K50CleanerConfig, Dict[str, Any], str, Path]]
        Configuration for the cleaning pipeline. Can be:

        - K50CleanerConfig object
        - Dictionary with configuration parameters (merged with defaults)
        - Path to JSON configuration file (str or Path)
        - None (uses default configuration)

    Returns
    -------
    Pipeline
        The configured cleaning pipeline (steps are registered but not yet
        executed)

    Raises
    ------
    TypeError
        If config has invalid type
    ValueError
        If configuration validation fails
    RuntimeError
        If pipeline creation fails

    Examples
    --------
    Use default configuration:

    >>> pipeline = create_k50_cleaner(df)

    Use partial configuration:

    >>> pipeline = create_k50_cleaner(df, config={
    ...     "validation_workers": 8,
    ...     "handle_multiple_wt": "first"
    ... })

    Load configuration from file:

    >>> pipeline = create_k50_cleaner(df, config="config.json")
    """
    # Handle configuration parameter
    if config is None:
        final_config = K50CleanerConfig()
    elif isinstance(config, K50CleanerConfig):
        final_config = config
    elif isinstance(config, dict):
        # Partial configuration - merge with defaults
        default_config = K50CleanerConfig()
        final_config = default_config.merge(config)
    elif isinstance(config, (str, Path)):
        # Load from file
        final_config = K50CleanerConfig.from_json(config)
    else:
        raise TypeError(
            f"config must be K50CleanerConfig, dict, str, Path or None, "
            f"got {type(config)}"
        )

    # Log configuration summary
    logger.info(
        f"K50 dataset will be cleaned with pipeline: {final_config.pipeline_name}"
    )
    logger.debug(f"Configuration:\n{final_config.get_summary()}")

    try:
        # Create pipeline
        pipeline = create_pipeline(dataset_or_path, final_config.pipeline_name)

        # Add cleaning steps
        pipeline = (
            pipeline.delayed_then(
                extract_and_rename_columns,
                column_mapping=final_config.column_mapping,
            )
            .delayed_then(filter_and_clean_data, filters=final_config.filters)
            .delayed_then(
                convert_data_types, type_conversions=final_config.type_conversions
            )
            .delayed_then(
                validate_mutations,
                mutation_column=final_config.column_mapping.get(
                    "mut_type", "mut_type"
                ),
                mutation_sep="_",
                is_zero_based=False,
                num_workers=final_config.validation_workers,
            )
            .delayed_then(
                infer_wildtype_sequences,
                label_columns=final_config.label_columns,
                handle_multiple_wt=final_config.handle_multiple_wt,
                is_zero_based=True,  # Always True after validate_mutations
                num_workers=final_config.infer_wt_workers,
            )
            .delayed_then(
                convert_to_mutation_dataset_format,
                name_column=final_config.column_mapping.get("WT_name", "WT_name"),
                mutation_column=final_config.column_mapping.get(
                    "mut_type", "mut_type"
                ),
                mutated_sequence_column=final_config.column_mapping.get(
                    "aa_seq", "aa_seq"
                ),
                label_column=final_config.primary_label_column,
                is_zero_based=True,
            )
        )

        # Prepend a read step when a file path was provided instead of a DataFrame
        if isinstance(dataset_or_path, (str, Path)):
            pipeline.add_delayed_step(read_dataset, 0)
        elif not isinstance(dataset_or_path, pd.DataFrame):
            raise TypeError(
                f"dataset_or_path must be pd.DataFrame or str/Path, "
                f"got {type(dataset_or_path)}"
            )

        return pipeline

    except Exception as e:
        logger.error(f"Error in creating K50 cleaning pipeline: {str(e)}")
        raise RuntimeError(f"Error in creating K50 cleaning pipeline: {str(e)}")
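
# Example of building (but not running) the pipeline (a sketch; the CSV name
# matches the file listed in the docstring above and must exist locally):
#
#     pipeline = create_k50_cleaner(
#         "Tsuboyama2023_Dataset2_Dataset3_20230416.csv",
#         config={"validation_workers": 8},
#     )
#
# Steps added via delayed_then/add_delayed_step run only when the pipeline is
# executed, which clean_k50_dataset below takes care of.
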

def clean_k50_dataset(pipeline: Pipeline) -> Tuple[Pipeline, MutationDataset]:
    """Clean K50 dataset using configurable pipeline

    Parameters
    ----------
    pipeline : Pipeline
        K50 dataset cleaning pipeline

    Returns
    -------
    Tuple[Pipeline, MutationDataset]
        - Pipeline: The executed pipeline
        - MutationDataset: The cleaned K50 dataset
    """
    try:
        # Run pipeline
        pipeline.execute()

        # Extract results
        k50_dataset_df, k50_ref_seq = pipeline.data
        k50_dataset = MutationDataset.from_dataframe(k50_dataset_df, k50_ref_seq)

        logger.info(
            f"Successfully cleaned K50 dataset: "
            f"{len(k50_dataset_df)} mutations from {len(k50_ref_seq)} proteins"
        )

        return pipeline, k50_dataset

    except Exception as e:
        logger.error(f"Error in running K50 cleaning pipeline: {str(e)}")
        raise RuntimeError(f"Error in running K50 cleaning pipeline: {str(e)}")
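
# End-to-end usage (a minimal sketch, assuming the dataset CSV has been
# downloaded and extracted from the Zenodo archive referenced above):
#
#     from tidymut.cleaners.k50_cleaner import create_k50_cleaner, clean_k50_dataset
#
#     pipeline = create_k50_cleaner("Tsuboyama2023_Dataset2_Dataset3_20230416.csv")
#     pipeline, k50_dataset = clean_k50_dataset(pipeline)
#     # k50_dataset is a MutationDataset built from the cleaned DataFrame and
#     # the inferred wildtype reference sequences.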