Source code for biobb_pytorch.mdae.explainability.LRP

import torch
import numpy as np
import os
from typing import Optional
from biobb_common.tools.file_utils import launchlogger
from biobb_common.tools import file_utils as fu
from biobb_pytorch.mdae.utils.log_utils import get_size
from biobb_common.generic.biobb_object import BiobbObject
from torch.utils.data import DataLoader
from mlcolvar.data import DictDataset
from biobb_pytorch.mdae.explainability.layerwise_relevance_prop import lrp_encoder



[docs]
class LRP(BiobbObject):
    """
    | biobb_pytorch LRP
    | Performs Layer-wise Relevance Propagation on a trained autoencoder encoder.

    Args:
        input_model_pth_path (str): Path to the trained model file whose encoder is analyzed. File type: input. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/reference/mdae/output_model.pth>`_. Accepted formats: pth (edam:format_2333).
        input_dataset_pt_path (str): Path to the input dataset file (.pt) used for computing relevance scores. File type: input. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/reference/mdae/output_model.pt>`_. Accepted formats: pt (edam:format_2333).
        output_results_npz_path (str) (Optional): Path to the output results file containing relevance scores (compressed NumPy archive). File type: output. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/reference/mdae/output_results.npz>`_. Accepted formats: npz (edam:format_2333).
        properties (dict - Python dictionary object containing the tool parameters, not input/output files):
            * **Dataset** (*dict*) - ({}) Dataset/DataLoader options (e.g. batch_size and optional indices to subset the dataset).

    Examples:
        This example shows how to use the LRP class to perform Layer-wise Relevance Propagation::

            from biobb_pytorch.mdae.explainability.LRP import LRP

            input_model_pth_path='input_model.pth'
            input_dataset_pt_path='input_dataset.pt'
            output_results_npz_path='output_results.npz'

            prop={
                'Dataset': {
                    'batch_size': 32
                }
            }

            LRP(input_model_pth_path=input_model_pth_path,
                input_dataset_pt_path=input_dataset_pt_path,
                output_results_npz_path=output_results_npz_path,
                properties=prop).launch()


    Info:
        * wrapped_software:
            * name: PyTorch
            * version: >=1.6.0
            * license: BSD 3-Clause
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl
    """

    def __init__(
        self,
        input_model_pth_path: str,
        input_dataset_pt_path: str,
        output_results_npz_path: Optional[str] = None,
        properties: Optional[dict] = None,
        **kwargs,
    ) -> None:
        """
        Store the input/output paths and the tool properties for an LRP run.

        Args:
            input_model_pth_path (str): Path of the model file read by :meth:`load_model`.
            input_dataset_pt_path (str): Path of the dataset file read by :meth:`load_dataset`.
            output_results_npz_path (str, optional): Path where :meth:`launch` saves the
                results. When falsy, no output file is registered in ``io_dict``.
            properties (dict, optional): Tool properties; ``None`` is treated as ``{}``.
                The ``Dataset`` entry is read here and stored as ``self.Dataset``.
            **kwargs: Accepted but not used inside this constructor.
        """

        # A missing (or empty) properties argument becomes an empty dict.
        properties = properties or {}

        super().__init__(properties)

        self.input_model_pth_path = input_model_pth_path
        self.input_dataset_pt_path = input_dataset_pt_path
        self.output_results_npz_path = output_results_npz_path
        self.properties = properties.copy()
        # Snapshot of the constructor's local variables at this point
        # (presumably consumed by the BiobbObject machinery -- verify).
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {
                "input_model_pth_path": input_model_pth_path,
                "input_dataset_pt_path": input_dataset_pt_path,
            },
            "out": {},
        }

        # The results file is optional: register it only when a path was given.
        if output_results_npz_path:
            self.io_dict["out"]["output_results_npz_path"] = output_results_npz_path

        # Dataset/DataLoader options; empty dict when the property is absent.
        self.Dataset = self.properties.get('Dataset', {})
        # Use CUDA when torch reports it as available, otherwise the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Filled in by launch() with the dictionary of relevance scores.
        self.results = None

        # Check the properties
        # (both checks are not defined in this class; presumably inherited
        # from BiobbObject -- verify their exact behaviour there).
        self.check_properties(properties)
        self.check_arguments()


[docs]
    def load_model(self):
        return torch.load(self.io_dict["in"]["input_model_pth_path"], weights_only=False)



[docs]
    def mask_idx(self, dataset: dict, indices: np.ndarray) -> dict:
        """
        Restrict every entry of ``dataset`` to the selected ``indices``.

        Each value is replaced by ``value[indices]``. The dictionary is
        modified in place and the same object is returned.

        Args:
            dataset (dict): Mapping whose values support indexing with ``indices``.
            indices (np.ndarray): Index array applied to every value.

        Returns:
            dict: The same ``dataset`` object, now holding the selected entries.
        """
        field_names = list(dataset.keys())
        for name in field_names:
            entries = dataset[name]
            dataset[name] = entries[indices]
        return dataset



[docs]
    def load_dataset(self):
        """
        Load the dataset from ``input_dataset_pt_path`` and optionally subset it.

        When the ``Dataset`` options contain a non-empty ``indices`` entry
        (a list, tuple, NumPy array or anything else ``np.atleast_1d``
        accepts), every entry of the loaded dictionary is restricted to those
        indices through :meth:`mask_idx`. A missing, ``None`` or empty
        ``indices`` leaves the dataset untouched.

        Returns:
            DictDataset: The (possibly subset) data wrapped in a ``DictDataset``.
        """
        # NOTE(security): weights_only=False unpickles arbitrary Python
        # objects; only load dataset files that come from a trusted source.
        dataset = torch.load(self.io_dict["in"]["input_dataset_pt_path"], weights_only=False)

        indices = self.Dataset.get('indices')
        if indices is not None:
            # Compare against None instead of truth-testing: the truth value
            # of a multi-element NumPy array raises, and np.array([0]) is
            # falsy. Converting up front also covers tuples and other
            # sequences, which would otherwise leave ``indices`` unbound.
            indices = np.atleast_1d(indices)
            if indices.size > 0:
                dataset = self.mask_idx(dataset, indices)

        return DictDataset(dataset)



[docs]
    def create_dataloader(self, dataset):
        ds_cfg = self.properties['Dataset']
        return DataLoader(
            dataset,
            batch_size=ds_cfg.get('batch_size', 16),
            shuffle=False
        )



[docs]
    def compute_global_importance(self, model, dataloader, latent_index=None):
        all_R0 = []
        for batch in dataloader:
            X_batch = batch['data'].to(self.device)  # Assuming DictDataset with 'data' key
            R0 = lrp_encoder(model, X_batch, latent_index=latent_index)
            all_R0.append(R0.cpu())  # Move to CPU to save GPU memory
        R0_all = torch.cat(all_R0, dim=0)  # [total_samples, in_dim]

        # Reshape assuming features grouped by 3 (e.g., coordinates); adjust if needed
        num_features = R0_all.size(1) // 3
        R0_all = R0_all.reshape(-1, num_features, 3)
        R0_mean = R0_all.mean(dim=2)  # [total_samples, num_features]

        global_importance = R0_mean.abs().mean(dim=0)  # [num_features]
        global_importance_raw = global_importance.detach().numpy()

        # Normalize
        min_val = global_importance_raw.min()
        max_val = global_importance_raw.max()
        global_range = max_val - min_val + 1e-10  # Avoid division by zero
        global_importance_norm = (global_importance_raw - min_val) / global_range

        return {
            "global_importance": global_importance_norm,
            "global_importance_raw": global_importance_raw,
        }



[docs]
    @launchlogger
    def launch(self) -> int:
        """
        Run the LRP workflow of the :class:`LRP` class.

        Loads the model and the dataset, builds the dataloader, computes the
        relevance scores (stored in ``self.results``) and, when an output
        path was given, saves them as a compressed NumPy archive.

        Returns:
            int: ``0`` in every visible code path.
        """

        fu.log('## BioBB Layer-wise Relevance Propagation ##', self.out_log)

        # Stop early when the restart check reports there is nothing to do.
        if self.check_restart():
            return 0

        self.stage_files()

        # Model
        model_path = self.io_dict["in"]["input_model_pth_path"]
        fu.log(f'Load model from {os.path.abspath(model_path)}', self.out_log)
        model = self.load_model()

        # Dataset
        dataset_path = self.io_dict["in"]["input_dataset_pt_path"]
        fu.log(f'Load dataset from {os.path.abspath(dataset_path)}', self.out_log)
        dataset = self.load_dataset()

        # Dataloader and relevance computation
        fu.log('Start LRP analysis...', self.out_log)
        dataloader = self.create_dataloader(dataset)
        self.results = self.compute_global_importance(model, dataloader, latent_index=None)

        # Persist the scores only when an output path was supplied.
        if self.output_results_npz_path:
            results_path = self.io_dict["out"]["output_results_npz_path"]
            np.savez_compressed(results_path, **self.results)
            fu.log(f'Results saved to {os.path.abspath(results_path)}', self.out_log)
            fu.log(f'File size: {get_size(results_path)}', self.out_log)

        # Hand the files back to the host and clean up temporary files.
        self.copy_to_host()
        self.remove_tmp_files()

        # Only report an output as created when one was requested.
        self.check_arguments(
            output_files_created=bool(self.output_results_npz_path),
            raise_exception=False,
        )

        return 0





[docs]
def relevance_propagation(
    properties: dict,
    input_model_pth_path: str,
    input_dataset_pt_path: str,
    output_results_npz_path: Optional[str] = None,
    **kwargs,
) -> int:
    """Create the :class:`LRP <LRP>` class and
    execute the :meth:`launch() <LRP.launch>` method.

    Every parameter is passed to :class:`LRP` by keyword; the extra keyword
    arguments are forwarded bundled under a single ``kwargs`` keyword.
    """
    launcher_args = {
        "properties": properties,
        "input_model_pth_path": input_model_pth_path,
        "input_dataset_pt_path": input_dataset_pt_path,
        "output_results_npz_path": output_results_npz_path,
        "kwargs": kwargs,
    }
    tool = LRP(**launcher_args)
    return tool.launch()



# Reuse the LRP class docstring as the documentation of the wrapper function.
relevance_propagation.__doc__ = LRP.__doc__
# Entry point built by ``LRP.get_main`` from the wrapper function and a short
# description (presumably a command-line entry point supplied by BiobbObject
# -- verify against biobb_common).
main = LRP.get_main(relevance_propagation, "Performs Layer-wise Relevance Propagation on a trained autoencoder encoder.")


# Run the entry point when the module is executed as a script.
if __name__ == "__main__":
    main()