# Source code for VmaxBuilder.protein.input_resolution
"""Generated: validation needed.
Description:
Shared dataframe input resolution helpers for protein-stage submodules.
"""
from __future__ import annotations
from pathlib import Path
import pandas as pd
from VmaxBuilder.config.dataclasses import APIConfig
from VmaxBuilder.config.validation import ConfigurationError
from VmaxBuilder.core.protocols import Scaffold
from VmaxBuilder.utils.file_handling import load_existing_file_based_on_extension
[docs]
def _load_dataframe_file(resolved_input_path: Path, *, input_key: str) -> pd.DataFrame:
    """Load one resolved input file and return it as a normalized dataframe.

    Args:
        resolved_input_path (Path): Concrete file path to load.
        input_key (str): Logical input key, used only in the error message.

    Returns:
        pd.DataFrame: Loaded dataframe after ``normalize_loaded_dataframe``.

    Raises:
        ConfigurationError: When the loader returns something other than a
            pandas DataFrame.
    """
    loaded_value = load_existing_file_based_on_extension(
        resolved_input_path,
        index_col=0,
    )
    if not isinstance(loaded_value, pd.DataFrame):
        raise ConfigurationError(
            f"Loaded input '{input_key}' from '{resolved_input_path}' is not "
            "pandas.DataFrame."
        )
    return normalize_loaded_dataframe(loaded_value)


def resolve_dataframe_input(
    scaffold: Scaffold,
    config: APIConfig,
    *,
    input_key: str,
) -> pd.DataFrame | None:
    """Generated: validation needed.

    Description:
        Resolve one dataframe input. Sources are tried in this order and the
        first hit wins:

        1. ``scaffold["inputs"][input_key]`` (returned as-is).
        2. The effective in-memory inputs from ``config.loading``.
        3. The effective exact path configured for ``input_key``.
        4. Each search root yielded by ``config.loading.iter_search_roots``.

    Args:
        scaffold (Scaffold): Shared pipeline scaffold.
        config (APIConfig): Root API configuration.
        input_key (str): Logical input key, e.g. expression, ptr, proteomics.

    Returns:
        pd.DataFrame | None: Resolved dataframe or None when absent.

    Raises:
        ConfigurationError: When a scaffold, in-memory or loaded value is not
            a dataframe, or when a configured exact path cannot be resolved
            to an input file.

    Modifies:
        scaffold["inputs"] is created (empty) when missing, and receives the
        dataframe under ``input_key`` when it is resolved from config or disk.
    """
    input_payload = scaffold.setdefault("inputs", {})

    # 1. Already present on the scaffold.
    if input_key in input_payload:
        input_value = input_payload[input_key]
        if not isinstance(input_value, pd.DataFrame):
            raise ConfigurationError(
                f"Input '{input_key}' must be pandas.DataFrame; got "
                f"{type(input_value).__name__}."
            )
        return input_value

    # 2. Supplied in memory through the configuration.
    in_memory_inputs = config.loading.get_effective_in_memory_inputs()
    if input_key in in_memory_inputs:
        input_value = in_memory_inputs[input_key]
        if not isinstance(input_value, pd.DataFrame):
            raise ConfigurationError(
                f"In-memory input '{input_key}' must be pandas.DataFrame; got "
                f"{type(input_value).__name__}."
            )
        input_payload[input_key] = input_value
        return input_value

    # 3. Explicitly configured path: a resolution failure is an error here,
    #    so the ConfigurationError is deliberately allowed to propagate.
    input_path = config.loading.get_effective_exact_paths().get(input_key)
    if input_path is not None:
        resolved_input_path = resolve_input_file_path(
            Path(input_path),
            input_key=input_key,
            config=config,
        )
        loaded_value = _load_dataframe_file(resolved_input_path, input_key=input_key)
        input_payload[input_key] = loaded_value
        return loaded_value

    # 4. Discovery: a root without a matching file is skipped, not fatal.
    for search_root in config.loading.iter_search_roots(input_key):
        try:
            resolved_input_path = resolve_input_file_path(
                Path(search_root),
                input_key=input_key,
                config=config,
            )
        except ConfigurationError:
            continue
        loaded_value = _load_dataframe_file(resolved_input_path, input_key=input_key)
        input_payload[input_key] = loaded_value
        return loaded_value

    return None
[docs]
def resolve_input_file_path(
    candidate_path: Path,
    *,
    input_key: str,
    config: APIConfig,
) -> Path:
    """Generated: validation needed.

    Description:
        Turn a file-or-directory candidate into the concrete file to load for
        one dataframe input.

        A file candidate is accepted when its suffix (lower-cased) is one of
        the discovery extensions for ``input_key``. A directory candidate is
        scanned non-recursively for regular files whose lower-cased name
        starts with one of the discovery prefixes and whose suffix is
        allowed; the smallest path in sort order is returned.

    Args:
        candidate_path (Path): File or directory candidate path.
        input_key (str): Logical input key.
        config (APIConfig): Root API configuration.

    Returns:
        Path: Resolved file path.

    Raises:
        ConfigurationError: When a file candidate has an unsupported
            extension, or when no matching input file exists.
    """
    loading = config.loading
    # Extensions and prefixes are compared case-insensitively.
    extensions = {ext.lower() for ext in loading.get_discovery_extensions(input_key)}
    prefixes = tuple(prefix.lower() for prefix in loading.get_discovery_prefixes(input_key))

    if candidate_path.is_file():
        if candidate_path.suffix.lower() in extensions:
            return candidate_path
        raise ConfigurationError(
            f"Input '{input_key}' has unsupported extension '{candidate_path.suffix}'."
        )

    if candidate_path.is_dir():
        matches = []
        for entry in candidate_path.iterdir():
            if not entry.is_file():
                continue
            if not entry.name.lower().startswith(prefixes):
                continue
            if entry.suffix.lower() not in extensions:
                continue
            matches.append(entry)
        if matches:
            # Lowest path in sort order keeps the choice deterministic.
            return min(matches)

    raise ConfigurationError(
        f"Could not resolve input '{input_key}' from '{candidate_path}'."
    )
[docs]
def normalize_loaded_dataframe(loaded_df: pd.DataFrame) -> pd.DataFrame:
    """Generated: validation needed.

    Description:
        Promote the first column to the index when a loader left the
        identifier column among the values, i.e. the frame still carries a
        default ``RangeIndex``.

        The first column is promoted only when its values are unique and
        either its label starts with "unnamed" (case-insensitive) or it holds
        object/string data. Empty frames and frames with any other index type
        are returned untouched.

    Args:
        loaded_df (pd.DataFrame): Loaded dataframe from file.

    Returns:
        pd.DataFrame: The input frame itself when no promotion applies,
        otherwise a new frame indexed by the former first column (the input
        is not mutated).
    """
    if loaded_df.empty or not isinstance(loaded_df.index, pd.RangeIndex):
        return loaded_df

    first_column = loaded_df.columns[0]
    # Positional access always yields a Series; label access would return a
    # DataFrame (and break ``.dtype``) if the first label is duplicated.
    first_series = loaded_df.iloc[:, 0]

    looks_unnamed = str(first_column).lower().startswith("unnamed")
    # Checking only ``dtype == object`` misses pandas string dtypes, so text
    # identifier columns stored that way would never be promoted.
    holds_text = first_series.dtype == object or pd.api.types.is_string_dtype(
        first_series.dtype
    )
    if not ((looks_unnamed or holds_text) and first_series.is_unique):
        return loaded_df

    # Drop the first column by position and reuse its values as the index.
    normalized_df = loaded_df.iloc[:, 1:].copy()
    normalized_df.index = pd.Index(first_series, name=first_column)
    return normalized_df