Source code for VmaxBuilder.protein.input_resolution

"""Generated: validation needed.

Description:
    Shared dataframe input resolution helpers for protein-stage submodules.
"""

from __future__ import annotations

from pathlib import Path

import pandas as pd

from VmaxBuilder.config.dataclasses import APIConfig
from VmaxBuilder.config.validation import ConfigurationError
from VmaxBuilder.core.protocols import Scaffold
from VmaxBuilder.utils.file_handling import load_existing_file_based_on_extension


def _load_dataframe_from_file(
    resolved_input_path: Path,
    *,
    input_key: str,
) -> pd.DataFrame:
    """Load one dataframe input from a resolved file and normalize its index.

    Description:
        Shared by the explicit-path and search-root branches of
        ``resolve_dataframe_input`` so both apply the same loader call, the
        same type check and the same index normalization.

    Args:
        resolved_input_path (Path): Concrete file path to load.
        input_key (str): Logical input key, used only in the error message.

    Returns:
        pd.DataFrame: Loaded dataframe after ``normalize_loaded_dataframe``.

    Raises:
        ConfigurationError: When the loader returns something other than a
            pandas DataFrame.
    """
    loaded_value = load_existing_file_based_on_extension(
        resolved_input_path,
        index_col=0,
    )
    if not isinstance(loaded_value, pd.DataFrame):
        raise ConfigurationError(
            f"Loaded input '{input_key}' from '{resolved_input_path}' is not "
            "pandas.DataFrame."
        )
    return normalize_loaded_dataframe(loaded_value)


def resolve_dataframe_input(
    scaffold: Scaffold,
    config: APIConfig,
    *,
    input_key: str,
) -> pd.DataFrame | None:
    """Resolve one dataframe input for a protein-stage submodule.

    Description:
        Sources are tried in priority order and the first hit wins:

        1. a value already stored under ``scaffold["inputs"][input_key]``;
        2. the effective in-memory inputs of ``config.loading``;
        3. the effective exact path configured for ``input_key``;
        4. each search root yielded by ``config.loading.iter_search_roots``.

    Args:
        scaffold (Scaffold): Shared pipeline scaffold.
        config (APIConfig): Root API configuration.
        input_key (str): Logical input key, e.g. expression, ptr, proteomics.

    Returns:
        pd.DataFrame | None: Resolved dataframe, or None when no source
        provides the input.

    Raises:
        ConfigurationError: When a scaffold, in-memory or loaded value is not
            a pandas DataFrame, or when an explicitly configured path cannot
            be resolved to a matching file.

    Modifies:
        scaffold["inputs"] is created when missing, and receives the
        dataframe under ``input_key`` when it is resolved from config or disk.
    """
    input_payload = scaffold.setdefault("inputs", {})

    # 1. Already present on the scaffold: validate and hand back untouched.
    if input_key in input_payload:
        input_value = input_payload[input_key]
        if not isinstance(input_value, pd.DataFrame):
            raise ConfigurationError(
                f"Input '{input_key}' must be pandas.DataFrame; got "
                f"{type(input_value).__name__}."
            )
        return input_value

    # 2. Supplied in memory through the configuration.
    in_memory_inputs = config.loading.get_effective_in_memory_inputs()
    if input_key in in_memory_inputs:
        input_value = in_memory_inputs[input_key]
        if not isinstance(input_value, pd.DataFrame):
            raise ConfigurationError(
                f"In-memory input '{input_key}' must be pandas.DataFrame; got "
                f"{type(input_value).__name__}."
            )
        input_payload[input_key] = input_value
        return input_value

    # 3. Explicit path: a resolution failure is a hard error here, because the
    #    user asked for this exact location.
    explicit_paths = config.loading.get_effective_exact_paths()
    input_path = explicit_paths.get(input_key)
    if input_path is not None:
        resolved_input_path = resolve_input_file_path(
            Path(input_path),
            input_key=input_key,
            config=config,
        )
        loaded_value = _load_dataframe_from_file(
            resolved_input_path,
            input_key=input_key,
        )
        input_payload[input_key] = loaded_value
        return loaded_value

    # 4. Search roots: a root without a matching file is skipped, but a file
    #    that is found and fails to load as a dataframe still raises.
    for search_root in config.loading.iter_search_roots(input_key):
        try:
            resolved_input_path = resolve_input_file_path(
                Path(search_root),
                input_key=input_key,
                config=config,
            )
        except ConfigurationError:
            continue
        loaded_value = _load_dataframe_from_file(
            resolved_input_path,
            input_key=input_key,
        )
        input_payload[input_key] = loaded_value
        return loaded_value

    return None


def resolve_input_file_path(
    candidate_path: Path,
    *,
    input_key: str,
    config: APIConfig,
) -> Path:
    """Turn a file-or-directory candidate into one concrete input file path.

    Description:
        A file candidate is accepted as-is when its suffix is one of the
        discovery extensions configured for ``input_key``. A directory
        candidate is scanned (non-recursively) for regular files whose
        lower-cased name starts with a configured discovery prefix and whose
        lower-cased suffix is an allowed extension; the smallest match in
        sorted path order is returned.

    Args:
        candidate_path (Path): File or directory candidate path.
        input_key (str): Logical input key.
        config (APIConfig): Root API configuration.

    Returns:
        Path: Resolved file path.

    Raises:
        ConfigurationError: When a file candidate has an unsupported
            extension, or when no matching input file exists.
    """
    # Extensions and prefixes are compared case-insensitively.
    extensions = {
        ext.lower() for ext in config.loading.get_discovery_extensions(input_key)
    }
    prefixes = tuple(
        item.lower() for item in config.loading.get_discovery_prefixes(input_key)
    )

    if candidate_path.is_file():
        if candidate_path.suffix.lower() in extensions:
            return candidate_path
        raise ConfigurationError(
            f"Input '{input_key}' has unsupported extension '{candidate_path.suffix}'."
        )

    if candidate_path.is_dir():
        matches = []
        for entry in candidate_path.iterdir():
            if not entry.is_file():
                continue
            if not entry.name.lower().startswith(prefixes):
                continue
            if entry.suffix.lower() not in extensions:
                continue
            matches.append(entry)
        if matches:
            # Sorting makes the pick deterministic regardless of listing order.
            matches.sort()
            return matches[0]

    raise ConfigurationError(
        f"Could not resolve input '{input_key}' from '{candidate_path}'."
    )


def normalize_loaded_dataframe(loaded_df: pd.DataFrame) -> pd.DataFrame:
    """Promote a leftover identifier column to the dataframe index.

    Description:
        Normalize loaded dataframe index when the loader leaves the identifier
        column in the values. The first column is promoted only when the
        frame still has a default ``RangeIndex``, the column's values are
        unique, and the column either has an ``Unnamed...`` header or holds
        object/string data. Any other frame is returned unchanged (the same
        object, not a copy).

    Args:
        loaded_df (pd.DataFrame): Loaded dataframe from file.

    Returns:
        pd.DataFrame: Normalized dataframe with identifier index when
        detectable; otherwise ``loaded_df`` itself. The input is never
        mutated.
    """
    if loaded_df.empty or not isinstance(loaded_df.index, pd.RangeIndex):
        return loaded_df

    first_column = loaded_df.columns[0]
    first_series = loaded_df[first_column]

    # Accept pandas' dedicated string dtype as well as plain object dtype:
    # a bare ``dtype == object`` test misses string-typed identifier columns.
    looks_like_identifier = (
        str(first_column).lower().startswith("unnamed")
        or pd.api.types.is_object_dtype(first_series)
        or pd.api.types.is_string_dtype(first_series)
    )
    # Uniqueness is checked last so it is only computed for plausible columns.
    if not (looks_like_identifier and first_series.is_unique):
        return loaded_df

    # set_index already returns a new frame, so no explicit copy is needed.
    return loaded_df.set_index(first_column)