# Source code for tidymut.cleaners.base_config
# tidymut/cleaners/base_config.py
from __future__ import annotations
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import Any, Dict, List, Type, Union
from ..core.types import CleanerConfigType
__all__ = ["BaseCleanerConfig"]
def __dir__() -> List[str]:
    """Return the module's public names, as listed in ``__all__``.

    Python calls a module-level ``__dir__`` to answer ``dir(module)``,
    so this limits that listing to the exported names.
    """
    public_names = __all__
    return public_names
@dataclass
class BaseCleanerConfig(ABC):
    """Base configuration class for all dataset cleaners

    This abstract base class provides common configuration functionality
    that can be inherited by specific cleaner configurations.

    Attributes
    ----------
    pipeline_name : str
        Name of the cleaning pipeline
    strict_mode : bool
        Whether to stop on errors (True) or continue with warnings (False)
    num_workers : int
        Default number of worker processes
    validate_config : bool
        Whether to validate configuration before use
    """

    # Common configuration options
    pipeline_name: str = field(default="base_cleaner")
    strict_mode: bool = field(default=True)
    num_workers: int = field(default=16)
    validate_config: bool = field(default=True)

    def __post_init__(self) -> None:
        """Run :meth:`validate` after construction when ``validate_config`` is true"""
        if self.validate_config:
            self.validate()

    @abstractmethod
    def validate(self) -> None:
        """Validate the configuration

        The method is abstract, so every concrete subclass must override it.
        An override can call ``super().validate()`` to run the common checks
        implemented here before adding its own.

        Raises
        ------
        ValueError
            If ``num_workers`` is smaller than 1
        """
        # Common validations
        if self.num_workers < 1:
            raise ValueError(f"num_workers must be at least 1, got {self.num_workers}")

    @classmethod
    def from_dict(
        cls: Type[CleanerConfigType], config_dict: Dict[str, Any]
    ) -> CleanerConfigType:
        """Create configuration object from dictionary

        Parameters
        ----------
        config_dict : Dict[str, Any]
            Dictionary containing configuration parameters; its items are
            passed to the class constructor as keyword arguments

        Returns
        -------
        BaseCleanerConfig
            Configuration object
        """
        return cls(**config_dict)

    @classmethod
    def from_json(
        cls: Type[CleanerConfigType], json_path: Union[str, Path]
    ) -> CleanerConfigType:
        """Load configuration from JSON file

        Parameters
        ----------
        json_path : Union[str, Path]
            Path to JSON configuration file

        Returns
        -------
        BaseCleanerConfig
            Configuration object built with :meth:`from_dict`

        Raises
        ------
        FileNotFoundError
            If configuration file does not exist
        """
        json_path = Path(json_path)
        if not json_path.exists():
            raise FileNotFoundError(f"Configuration file not found: {json_path}")

        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's locale encoding.
        with open(json_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)
        return cls.from_dict(config_dict)

    def to_dict(self, exclude_callables: bool = True) -> Dict[str, Any]:
        """Convert configuration to dictionary

        Parameters
        ----------
        exclude_callables : bool, optional
            Whether to exclude callable objects (functions, lambdas), by default True.
            When enabled, a field is dropped if its value is callable, or if
            its value is a dict that directly holds at least one callable
            value (the whole dict is dropped). Deeper nesting is not inspected.

        Returns
        -------
        Dict[str, Any]
            Dictionary representation of the configuration
        """
        data = asdict(self)
        if not exclude_callables:
            return data

        def holds_callable(value: Any) -> bool:
            # True for a callable, or a dict with a callable among its values.
            if callable(value):
                return True
            return isinstance(value, dict) and any(
                callable(item) for item in value.values()
            )

        # Remove any callable values that can't be serialized
        return {
            key: value for key, value in data.items() if not holds_callable(value)
        }

    def to_json(self, json_path: Union[str, Path], **json_kwargs) -> None:
        """Save configuration to JSON file

        Missing parent directories are created. Callable values are excluded
        from the output (see :meth:`to_dict`).

        Parameters
        ----------
        json_path : Union[str, Path]
            Path where to save the JSON file
        **json_kwargs
            Additional arguments passed to json.dump. ``indent`` defaults to
            2 and may be overridden here.
        """
        json_path = Path(json_path)
        json_path.parent.mkdir(parents=True, exist_ok=True)
        config_dict = self.to_dict(exclude_callables=True)

        # Apply the default indent only when the caller did not supply one;
        # passing it twice to json.dump would raise TypeError.
        json_kwargs.setdefault("indent", 2)
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(config_dict, f, **json_kwargs)

    def merge(
        self: CleanerConfigType, partial_config: Dict[str, Any]
    ) -> CleanerConfigType:
        """Merge partial configuration with current configuration

        Dict values present on both sides are merged recursively; any other
        value from ``partial_config`` replaces the current one. The current
        object is left unchanged.

        Parameters
        ----------
        partial_config : Dict[str, Any]
            Dictionary containing configuration values to update

        Returns
        -------
        BaseCleanerConfig
            New configuration object with merged values
        """
        current_dict = asdict(self)

        # Deep merge for nested dictionaries
        def deep_merge(base: dict, update: dict) -> dict:
            result = base.copy()
            for key, value in update.items():
                if (
                    key in result
                    and isinstance(result[key], dict)
                    and isinstance(value, dict)
                ):
                    result[key] = deep_merge(result[key], value)
                else:
                    result[key] = value
            return result

        merged_dict = deep_merge(current_dict, partial_config)
        return self.__class__.from_dict(merged_dict)

    def get_summary(self) -> str:
        """Get a human-readable summary of the configuration

        The summary is built from :meth:`to_dict` with its default arguments,
        so callable values are not listed. Dict-valued fields are expanded one
        level, with one line per item.

        Returns
        -------
        str
            String summary of the configuration
        """
        lines = [f"{self.__class__.__name__} Configuration:"]
        for key, value in self.to_dict().items():
            if isinstance(value, dict):
                lines.append(f" {key}:")
                for k, v in value.items():
                    lines.append(f" {k}: {v}")
            else:
                lines.append(f" {key}: {value}")
        return "\n".join(lines)