Universe

A class to help with running the analysis of a single universe contained within a multiverse analysis.

Attributes:

Name	Type	Description
`run_no`		The run number of the multiverse analysis.
`universe_id`		The id of the universe.
`universe`		The universe settings.
`output_dir`		The directory to which the output should be written.
`metrics`		A dictionary containing the metrics to be computed.
`fairness_metrics`		A dictionary containing the fairness metrics to be computed.
`ts_start`		The timestamp of the start of the analysis.
`ts_end`		The timestamp of the end of the analysis.

Source code in multiversum/universe.py

class Universe:
    """
    A class to help with running the analysis of a single universe contained
    within a multiverse analysis.

    Attributes:
        run_no: The run number of the multiverse analysis.
        universe_id: The id of the universe.
        universe: The universe settings.
        output_dir: The directory to which the output should be written.
        metrics: A dictionary containing the metrics to be computed.
        fairness_metrics: A dictionary containing the fairness metrics to be
            computed.
        ts_start: The timestamp of the start of the analysis.
        ts_end: The timestamp of the end of the analysis.
    """

    metrics = None
    fairness_metrics = None
    ts_start = None
    ts_end = None

    def __init__(
        self,
        settings: Union[str, Dict[str, Any]],
        metrics: Optional[Dict[str, Callable]] = None,
        fairness_metrics: Optional[Dict[str, Callable]] = None,
        expand_dicts: bool = False,
        seed: Optional[Union[int, str]] = None,
        set_seed: bool = True,
    ) -> None:
        """
        Initialize the Universe class.

        The arguments should be passed in from the larger multiverse analysis.

        Args:
            settings: The settings for the universe analysis. This can usually
                just be passed along from the multiverse analysis. You only need
                to specify this yourself when developing / trying out an
                analysis. Possible keys in the dictionary are:
                - dimensions: The specified universe dimensions. This is the
                    only required information.
                - run_no: The run number of the multiverse analysis.
                - seed: The seed to use for analyses.
                - universe_id: The id of the universe.
                output_dir: The directory to which the output should be written.
            metrics: A dictionary containing the metrics to be computed.
                Pass an empty dictionary to not compute any.
            fairness_metrics: A dictionary containing the fairness metrics to be
                computed. (These are cumputed with awareness of groups.)
                Pass an empty dictionary to not compute any.
            expand_dicts: Whether to expand dictionaries in the dimensions i.e.
                if there are any dictionaries in the dimensions, expand them into
                separate dimensions of their own. Defaults to False.
            seed: An optional seed value to override the one from settings.
                This can be either:
                - an integer: directly used as seed value
                - a string: interpreted as a dimension name whose value will be used as seed
            set_seed: Whether to use the seed provided in the settings.
                Defaults to True. Please note, that this only sets the seed in
                the Python random module and numpy.
        """
        self.ts_start = time.time()

        # Check whether global overrides are present
        global_overwrite_settings = search_in_parent_frames(
            SCRIPT_GLOBAL_OVERWRITE_NAME
        )
        if global_overwrite_settings is not None:
            print(
                f"Detected {SCRIPT_GLOBAL_OVERWRITE_NAME}, the settings argument will be ignored."
            )
            settings = global_overwrite_settings

        # Extract settings
        parsed_settings = (
            json.loads(settings) if isinstance(settings, str) else settings
        )

        self.run_no = parsed_settings["run_no"] if "run_no" in parsed_settings else 0
        self.universe_id = (
            parsed_settings["universe_id"]
            if "universe_id" in parsed_settings
            else "no-universe-id-provided"
        )
        self.dimensions = parsed_settings["dimensions"]
        if expand_dicts:
            # Create a new dictionary to store expanded dimensions
            expanded_dimensions = {}
            # Process each key-value pair in dimensions
            for key, value in self.dimensions.items():
                if isinstance(value, dict):
                    # For dictionary values, add all key-value pairs to root level
                    # but don't expand nested dictionaries further
                    expanded_dimensions.update(value)
                else:
                    # For non-dictionary values, keep them as is
                    expanded_dimensions[key] = value
            self.dimensions = expanded_dimensions

        # Handle seed overriding
        settings_seed = parsed_settings["seed"] if "seed" in parsed_settings else 0

        if seed is not None:
            # If seed is a string, try to interpret it as a dimension name
            if isinstance(seed, str):
                if seed in self.dimensions:
                    seed_value = self.dimensions[seed]
                    if "seed" in parsed_settings:
                        warnings.warn(
                            f"Seed from dimension '{seed}' ({seed_value}) is overriding seed from settings ({settings_seed})."
                        )
                    self.seed = seed_value
                else:
                    warnings.warn(
                        f"Dimension '{seed}' not found in dimensions. Using settings seed ({settings_seed}) instead."
                    )
                    self.seed = settings_seed
            else:
                # Seed is a direct value
                if "seed" in parsed_settings:
                    warnings.warn(
                        f"Seed provided in constructor ({seed}) is overriding seed from settings ({settings_seed})."
                    )
                self.seed = seed
        else:
            self.seed = settings_seed

        self.output_dir = (
            Path(parsed_settings["output_dir"])
            if "output_dir" in parsed_settings
            else Path("./output")
        )

        self.metrics = metrics
        self.fairness_metrics = fairness_metrics

        if self.dimensions is None:
            warnings.warn("No dimensions specified for universe analysis.")

        if set_seed:
            print(f"Setting seed to {self.seed} (in: [random, numpy.random]).")
            random.seed(self.seed)
            np.random.seed(self.seed)

    def get_execution_time(self) -> float:
        """
        Gets the execution time of the universe analysis.

        Returns:
            float: The execution time in seconds.
        """
        if self.ts_end is None:
            print("Stopping execution_time clock.")
            self.ts_end = time.time()
        return self.ts_end - self.ts_start

    def _add_universe_info(
        self, data: pd.DataFrame, overwrite_dimensions: Optional[dict] = None
    ) -> pd.DataFrame:
        """
        Add general universe / run info to the dataframe.

        Args:
            data: The dataframe to which the info should be added.
            overwrite_dimensions: A dictionary containing dimensions to overwrite. (optional)

        Returns:
            The dataframe with the added info.
        """
        return add_universe_info_to_df(
            data=data,
            universe_id=self.universe_id,
            run_no=self.run_no,
            dimensions=self.dimensions
            if overwrite_dimensions is None
            else overwrite_dimensions,
            execution_time=self.get_execution_time(),
        )

    def save_data(
        self, data: pd.DataFrame, add_info: bool = True, multiple_rows: bool = False
    ) -> None:
        """
        Save the data to the appropriate file for this Universe.

        Args:
            data: The dataframe to be saved.
            add_info: Whether to add universe info to the dataframe. (optional)
            multiple_rows: Whether to allow multiple rows in data. (optional)

        Returns:
            None
        """
        if len(data) > 1 and not multiple_rows:
            raise ValueError(
                f"Saving {len(data)} rows for universe '{self.universe_id}'. "
                "Data is usually in wide format (multiple columns, one row) "
                "rather than multiple rows, as each row corresponds to a "
                "universe. Please verify that the output is correct."
                "You can disable this error with multiple_rows=True"
            )

        # Add universe data to the dataframe
        if add_info:
            data = self._add_universe_info(data=data)

        # Path management
        target_dir = (
            self.output_dir / "runs" / str(self.run_no) / UNIVERSE_DATA_DIR_NAME
        )
        # Make sure the directory exists
        target_dir.mkdir(parents=True, exist_ok=True)
        filename = f"d_{str(self.run_no)}_{self.universe_id}.csv"
        filepath = target_dir / filename
        if filepath.exists():
            warnings.warn(f"File {filepath} already exists. Overwriting it.")
        # Write the file
        data.to_csv(filepath, index=False)

    def get_export_file_path(self, filename: str, mkdir: bool = False) -> Path:
        """
        Get the file path for exporting a file from this Universe.

        Args:
            filename: The name of the file to export.
            mkdir: Whether to create the directory if it does not exist. (optional)

        Returns:
            Path: The full path where the file should be exported.
        """
        export_dir = (
            self.output_dir / "runs" / str(self.run_no) / "exports" / self.universe_id
        )
        if mkdir:
            export_dir.mkdir(parents=True, exist_ok=True)
        return export_dir / filename

    def export_dataframe(self, data: pd.DataFrame, filename: str, **kwargs) -> None:
        """
        Export a dataframe from this Universe with automatic format detection.

        Args:
            data: The dataframe to export.
            filename: The name of the file to export (incl. extension).
            **kwargs: Additional keyword arguments passed to the export function.

        Returns:
            None
        """
        filepath = self.get_export_file_path(filename, mkdir=True)

        if filepath.exists():
            warnings.warn(f"File {filepath} already exists. Overwriting it.")

        # Determine format based on file extension
        file_ext = filepath.suffix.lower()

        if file_ext == ".csv":
            # Default kwargs for CSV
            csv_kwargs = {"index": False}
            csv_kwargs.update(kwargs)
            data.to_csv(filepath, **csv_kwargs)
        elif file_ext == ".json":
            # Default kwargs for JSON
            json_kwargs = {"orient": "records", "indent": 2}
            json_kwargs.update(kwargs)
            data.to_json(filepath, **json_kwargs)
        elif file_ext in [".xlsx", ".xls"]:
            # Default kwargs for Excel
            excel_kwargs = {"index": False}
            excel_kwargs.update(kwargs)
            data.to_excel(filepath, **excel_kwargs)
        elif file_ext == ".parquet":
            # Default kwargs for Parquet
            parquet_kwargs = {"index": False}
            parquet_kwargs.update(kwargs)
            data.to_parquet(filepath, **parquet_kwargs)
        else:
            # Default to CSV for unknown extensions
            warnings.warn(
                f"Unknown file extension '{file_ext}'. Defaulting to CSV format."
            )
            csv_kwargs = {"index": False}
            csv_kwargs.update(kwargs)
            data.to_csv(filepath, **csv_kwargs)

    def compute_sub_universe_metrics(
        self,
        sub_universe: Dict,
        y_pred_prob: pd.Series,
        y_test: pd.Series,
        org_test: pd.DataFrame,
    ) -> Tuple[dict, dict]:
        """
        Computes a set of metrics for a given sub-universe.

        Args:
            sub_universe: A dictionary containing the parameters for the
                sub-universe.
            y_pred_prob: A pandas series containing the predicted
                probabilities.
            y_test: A pandas series containing the true labels.
            org_test: A pandas dataframe containing the test data, including
                variables that were not used as features.

        Returns:
            A tuple containing two dics: explicit fairness metrics and
                performance metrics split by fairness groups.
        """
        # Determine cutoff for predictions
        cutoff_type, cutoff_value = sub_universe["cutoff"].split("_")
        cutoff_value = float(cutoff_value)

        if cutoff_type == "raw":
            threshold = cutoff_value
        elif cutoff_type == "quantile":
            probabilities_true = y_pred_prob[:, 1]
            threshold = np.quantile(probabilities_true, cutoff_value)

        fairness_grouping = sub_universe["eval_fairness_grouping"]
        if fairness_grouping == "majority-minority":
            fairness_group_column = "majmin"
        elif fairness_grouping == "race-all":
            fairness_group_column = "RAC1P"

        y_pred = predict_w_threshold(y_pred_prob, threshold)

        try:
            from fairlearn.metrics import (
                MetricFrame,
                count,
                demographic_parity_difference,
                demographic_parity_ratio,
                equalized_odds_difference,
                equalized_odds_ratio,
                false_negative_rate,
                false_positive_rate,
                selection_rate,
            )
            from sklearn.metrics import (
                accuracy_score,
                balanced_accuracy_score,
                f1_score,
                precision_score,
            )

            metrics = (
                {
                    "accuracy": accuracy_score,
                    "balanced accuracy": balanced_accuracy_score,
                    "f1": f1_score,
                    "precision": precision_score,
                    "false positive rate": false_positive_rate,
                    "false negative rate": false_negative_rate,
                    "selection rate": selection_rate,
                    "count": count,
                }
                if self.metrics is None
                else self.metrics
            )

            fairness_metrics = (
                {
                    "equalized_odds_difference": equalized_odds_difference,
                    "equalized_odds_ratio": equalized_odds_ratio,
                    "demographic_parity_difference": demographic_parity_difference,
                    "demographic_parity_ratio": demographic_parity_ratio,
                }
                if self.fairness_metrics is None
                else self.fairness_metrics
            )

            # Compute fairness metrics
            fairness_dict = {
                name: metric(
                    y_true=y_test,
                    y_pred=y_pred,
                    sensitive_features=org_test[fairness_group_column],
                )
                for name, metric in fairness_metrics.items()
            }

            # Compute "normal" metrics (but split by fairness column)
            metric_frame = MetricFrame(
                metrics=metrics,
                y_true=y_test,
                y_pred=y_pred,
                sensitive_features=org_test[fairness_group_column],
            )

            return (fairness_dict, metric_frame)
        except ImportError:
            raise ImportError(
                "Packages fairlearn and scikit-learn are required for computing metrics."
            )

    def visit_sub_universe(
        self,
        sub_universe: Dict[str, Any],
        y_pred_prob: pd.Series,
        y_test: pd.Series,
        org_test: pd.Series,
        filter_data: Callable,
    ) -> pd.DataFrame:
        """
        Visit a sub-universe and compute the metrics for it.

        Sub-universes correspond to theoretically distinct universes of
        decisions, which can be computed without re-fitting a model. The
        distinction has only been made to improve performance by not having to
        compute these universes from scratch.

        Args:
            sub_universe: A dictionary containing the parameters for the
                sub-universe.
            y_pred_prob: A pandas series containing the predicted
                probabilities.
            y_test: A pandas series containing the true labels.
            org_test: A pandas dataframe containing the test data, including
                variables that were not used as features.
            filter_data: A function that filters data for each sub-universe.
                The function is called for each sub-universe with its
                respective settings and expected to return a pandas Series
                of booleans.

        Returns:
            A pandas dataframe containing the metrics for the sub-universe.
        """
        final_output = self._add_universe_info(
            data=pd.DataFrame(index=[self.universe_id]),
            overwrite_dimensions=sub_universe,
        )

        data_mask = filter_data(sub_universe=sub_universe, org_test=org_test)
        final_output["test_size_n"] = data_mask.sum()
        final_output["test_size_frac"] = data_mask.sum() / len(data_mask)

        # Compute metrics for majority-minority split
        fairness_dict, metric_frame = self.compute_sub_universe_metrics(
            sub_universe,
            y_pred_prob[data_mask],
            y_test[data_mask],
            org_test[data_mask],
        )

        # Add main fairness metrics to final_output
        final_output = add_dict_to_df(final_output, fairness_dict, prefix="fair_main_")
        final_output = add_dict_to_df(
            final_output, dict(metric_frame.overall), prefix="perf_ovrl_"
        )

        # Add group metrics to final output
        final_output = add_dict_to_df(
            final_output, flatten_dict(metric_frame.by_group), prefix="perf_grp_"
        )

        return final_output

    def generate_sub_universes(self) -> List[dict]:
        """
        Generate the sub-universes for the given universe settings.

        Returns:
            A list of dictionaries containing the sub-universes.
        """
        # Wrap all non-lists in the universe to make them work with generate_multiverse_grid
        universe_all_lists = {k: list_wrap(v) for k, v in self.dimensions.items()}

        # Within-Universe variation
        return generate_multiverse_grid(universe_all_lists)

    def compute_final_metrics(
        self,
        y_pred_prob: pd.Series,
        y_test: pd.Series,
        org_test: pd.Series,
        filter_data: Callable,
        save: bool = True,
    ) -> pd.DataFrame:
        """
        Generate the final output for the given universe settings.

        Args:
            y_pred_prob: A pandas series containing the predicted
                probabilities.
            y_test: A pandas series containing the true labels.
            org_test: A pandas dataframe containing the test data, including
                variables that were not used as features.
            filter_data: A function that filters data for each sub-universe.
                The function is called for each sub-universe with its
                respective settings and expected to return a pandas Series
                of booleans.
            save: Whether to save the output to a file. (optional)

        Returns:
            A pandas dataframe containing the final output.
        """
        # Within-Universe variation
        sub_universes = self.generate_sub_universes()

        final_outputs = list()
        for sub_universe in sub_universes:
            final_outputs.append(
                self.visit_sub_universe(
                    sub_universe=sub_universe,
                    y_pred_prob=y_pred_prob,
                    y_test=y_test,
                    org_test=org_test,
                    filter_data=filter_data,
                ).reset_index(drop=True)
            )
        final_output = pd.concat(final_outputs)

        # Write the final output file
        if save:
            self.save_data(final_output, add_info=False)

        return final_output

`init(settings, metrics=None, fairness_metrics=None, expand_dicts=False, seed=None, set_seed=True)`

Initialize the Universe class.

The arguments should be passed in from the larger multiverse analysis.

Parameters:

Name	Type	Description	Default
`settings`	`Union[str, Dict[str, Any]]`	The settings for the universe analysis. This can usually just be passed along from the multiverse analysis. You only need to specify this yourself when developing / trying out an analysis. Possible keys in the dictionary are: - dimensions: The specified universe dimensions. This is the only required information. - run_no: The run number of the multiverse analysis. - seed: The seed to use for analyses. - universe_id: The id of the universe. output_dir: The directory to which the output should be written.	required
`metrics`	`Optional[Dict[str, Callable]]`	A dictionary containing the metrics to be computed. Pass an empty dictionary to not compute any.	`None`
`fairness_metrics`	`Optional[Dict[str, Callable]]`	A dictionary containing the fairness metrics to be computed. (These are cumputed with awareness of groups.) Pass an empty dictionary to not compute any.	`None`
`expand_dicts`	`bool`	Whether to expand dictionaries in the dimensions i.e. if there are any dictionaries in the dimensions, expand them into separate dimensions of their own. Defaults to False.	`False`
`seed`	`Optional[Union[int, str]]`	An optional seed value to override the one from settings. This can be either: - an integer: directly used as seed value - a string: interpreted as a dimension name whose value will be used as seed	`None`
`set_seed`	`bool`	Whether to use the seed provided in the settings. Defaults to True. Please note, that this only sets the seed in the Python random module and numpy.	`True`

Source code in multiversum/universe.py

def __init__(
    self,
    settings: Union[str, Dict[str, Any]],
    metrics: Optional[Dict[str, Callable]] = None,
    fairness_metrics: Optional[Dict[str, Callable]] = None,
    expand_dicts: bool = False,
    seed: Optional[Union[int, str]] = None,
    set_seed: bool = True,
) -> None:
    """
    Initialize the Universe class.

    The arguments should be passed in from the larger multiverse analysis.

    Args:
        settings: The settings for the universe analysis. This can usually
            just be passed along from the multiverse analysis. You only need
            to specify this yourself when developing / trying out an
            analysis. Possible keys in the dictionary are:
            - dimensions: The specified universe dimensions. This is the
                only required information.
            - run_no: The run number of the multiverse analysis.
            - seed: The seed to use for analyses.
            - universe_id: The id of the universe.
            output_dir: The directory to which the output should be written.
        metrics: A dictionary containing the metrics to be computed.
            Pass an empty dictionary to not compute any.
        fairness_metrics: A dictionary containing the fairness metrics to be
            computed. (These are cumputed with awareness of groups.)
            Pass an empty dictionary to not compute any.
        expand_dicts: Whether to expand dictionaries in the dimensions i.e.
            if there are any dictionaries in the dimensions, expand them into
            separate dimensions of their own. Defaults to False.
        seed: An optional seed value to override the one from settings.
            This can be either:
            - an integer: directly used as seed value
            - a string: interpreted as a dimension name whose value will be used as seed
        set_seed: Whether to use the seed provided in the settings.
            Defaults to True. Please note, that this only sets the seed in
            the Python random module and numpy.
    """
    self.ts_start = time.time()

    # Check whether global overrides are present
    global_overwrite_settings = search_in_parent_frames(
        SCRIPT_GLOBAL_OVERWRITE_NAME
    )
    if global_overwrite_settings is not None:
        print(
            f"Detected {SCRIPT_GLOBAL_OVERWRITE_NAME}, the settings argument will be ignored."
        )
        settings = global_overwrite_settings

    # Extract settings
    parsed_settings = (
        json.loads(settings) if isinstance(settings, str) else settings
    )

    self.run_no = parsed_settings["run_no"] if "run_no" in parsed_settings else 0
    self.universe_id = (
        parsed_settings["universe_id"]
        if "universe_id" in parsed_settings
        else "no-universe-id-provided"
    )
    self.dimensions = parsed_settings["dimensions"]
    if expand_dicts:
        # Create a new dictionary to store expanded dimensions
        expanded_dimensions = {}
        # Process each key-value pair in dimensions
        for key, value in self.dimensions.items():
            if isinstance(value, dict):
                # For dictionary values, add all key-value pairs to root level
                # but don't expand nested dictionaries further
                expanded_dimensions.update(value)
            else:
                # For non-dictionary values, keep them as is
                expanded_dimensions[key] = value
        self.dimensions = expanded_dimensions

    # Handle seed overriding
    settings_seed = parsed_settings["seed"] if "seed" in parsed_settings else 0

    if seed is not None:
        # If seed is a string, try to interpret it as a dimension name
        if isinstance(seed, str):
            if seed in self.dimensions:
                seed_value = self.dimensions[seed]
                if "seed" in parsed_settings:
                    warnings.warn(
                        f"Seed from dimension '{seed}' ({seed_value}) is overriding seed from settings ({settings_seed})."
                    )
                self.seed = seed_value
            else:
                warnings.warn(
                    f"Dimension '{seed}' not found in dimensions. Using settings seed ({settings_seed}) instead."
                )
                self.seed = settings_seed
        else:
            # Seed is a direct value
            if "seed" in parsed_settings:
                warnings.warn(
                    f"Seed provided in constructor ({seed}) is overriding seed from settings ({settings_seed})."
                )
            self.seed = seed
    else:
        self.seed = settings_seed

    self.output_dir = (
        Path(parsed_settings["output_dir"])
        if "output_dir" in parsed_settings
        else Path("./output")
    )

    self.metrics = metrics
    self.fairness_metrics = fairness_metrics

    if self.dimensions is None:
        warnings.warn("No dimensions specified for universe analysis.")

    if set_seed:
        print(f"Setting seed to {self.seed} (in: [random, numpy.random]).")
        random.seed(self.seed)
        np.random.seed(self.seed)

`compute_final_metrics(y_pred_prob, y_test, org_test, filter_data, save=True)`

Generate the final output for the given universe settings.

Parameters:

Name	Type	Description	Default
`y_pred_prob`	`Series`	A pandas series containing the predicted probabilities.	required
`y_test`	`Series`	A pandas series containing the true labels.	required
`org_test`	`Series`	A pandas dataframe containing the test data, including variables that were not used as features.	required
`filter_data`	`Callable`	A function that filters data for each sub-universe. The function is called for each sub-universe with its respective settings and expected to return a pandas Series of booleans.	required
`save`	`bool`	Whether to save the output to a file. (optional)	`True`

Returns:

Type	Description
`DataFrame`	A pandas dataframe containing the final output.

Source code in multiversum/universe.py

def compute_final_metrics(
    self,
    y_pred_prob: pd.Series,
    y_test: pd.Series,
    org_test: pd.Series,
    filter_data: Callable,
    save: bool = True,
) -> pd.DataFrame:
    """
    Generate the final output for the given universe settings.

    Args:
        y_pred_prob: A pandas series containing the predicted
            probabilities.
        y_test: A pandas series containing the true labels.
        org_test: A pandas dataframe containing the test data, including
            variables that were not used as features.
        filter_data: A function that filters data for each sub-universe.
            The function is called for each sub-universe with its
            respective settings and expected to return a pandas Series
            of booleans.
        save: Whether to save the output to a file. (optional)

    Returns:
        A pandas dataframe containing the final output.
    """
    # Within-Universe variation
    sub_universes = self.generate_sub_universes()

    final_outputs = list()
    for sub_universe in sub_universes:
        final_outputs.append(
            self.visit_sub_universe(
                sub_universe=sub_universe,
                y_pred_prob=y_pred_prob,
                y_test=y_test,
                org_test=org_test,
                filter_data=filter_data,
            ).reset_index(drop=True)
        )
    final_output = pd.concat(final_outputs)

    # Write the final output file
    if save:
        self.save_data(final_output, add_info=False)

    return final_output

`compute_sub_universe_metrics(sub_universe, y_pred_prob, y_test, org_test)`

Computes a set of metrics for a given sub-universe.

Parameters:

Name	Type	Description	Default
`sub_universe`	`Dict`	A dictionary containing the parameters for the sub-universe.	required
`y_pred_prob`	`Series`	A pandas series containing the predicted probabilities.	required
`y_test`	`Series`	A pandas series containing the true labels.	required
`org_test`	`DataFrame`	A pandas dataframe containing the test data, including variables that were not used as features.	required

Returns:

Type	Description
`Tuple[dict, dict]`	A tuple containing two dics: explicit fairness metrics and performance metrics split by fairness groups.

Source code in multiversum/universe.py

def compute_sub_universe_metrics(
    self,
    sub_universe: Dict,
    y_pred_prob: pd.Series,
    y_test: pd.Series,
    org_test: pd.DataFrame,
) -> Tuple[dict, dict]:
    """
    Computes a set of metrics for a given sub-universe.

    Args:
        sub_universe: A dictionary containing the parameters for the
            sub-universe.
        y_pred_prob: A pandas series containing the predicted
            probabilities.
        y_test: A pandas series containing the true labels.
        org_test: A pandas dataframe containing the test data, including
            variables that were not used as features.

    Returns:
        A tuple containing two dics: explicit fairness metrics and
            performance metrics split by fairness groups.
    """
    # Determine cutoff for predictions
    cutoff_type, cutoff_value = sub_universe["cutoff"].split("_")
    cutoff_value = float(cutoff_value)

    if cutoff_type == "raw":
        threshold = cutoff_value
    elif cutoff_type == "quantile":
        probabilities_true = y_pred_prob[:, 1]
        threshold = np.quantile(probabilities_true, cutoff_value)

    fairness_grouping = sub_universe["eval_fairness_grouping"]
    if fairness_grouping == "majority-minority":
        fairness_group_column = "majmin"
    elif fairness_grouping == "race-all":
        fairness_group_column = "RAC1P"

    y_pred = predict_w_threshold(y_pred_prob, threshold)

    try:
        from fairlearn.metrics import (
            MetricFrame,
            count,
            demographic_parity_difference,
            demographic_parity_ratio,
            equalized_odds_difference,
            equalized_odds_ratio,
            false_negative_rate,
            false_positive_rate,
            selection_rate,
        )
        from sklearn.metrics import (
            accuracy_score,
            balanced_accuracy_score,
            f1_score,
            precision_score,
        )

        metrics = (
            {
                "accuracy": accuracy_score,
                "balanced accuracy": balanced_accuracy_score,
                "f1": f1_score,
                "precision": precision_score,
                "false positive rate": false_positive_rate,
                "false negative rate": false_negative_rate,
                "selection rate": selection_rate,
                "count": count,
            }
            if self.metrics is None
            else self.metrics
        )

        fairness_metrics = (
            {
                "equalized_odds_difference": equalized_odds_difference,
                "equalized_odds_ratio": equalized_odds_ratio,
                "demographic_parity_difference": demographic_parity_difference,
                "demographic_parity_ratio": demographic_parity_ratio,
            }
            if self.fairness_metrics is None
            else self.fairness_metrics
        )

        # Compute fairness metrics
        fairness_dict = {
            name: metric(
                y_true=y_test,
                y_pred=y_pred,
                sensitive_features=org_test[fairness_group_column],
            )
            for name, metric in fairness_metrics.items()
        }

        # Compute "normal" metrics (but split by fairness column)
        metric_frame = MetricFrame(
            metrics=metrics,
            y_true=y_test,
            y_pred=y_pred,
            sensitive_features=org_test[fairness_group_column],
        )

        return (fairness_dict, metric_frame)
    except ImportError:
        raise ImportError(
            "Packages fairlearn and scikit-learn are required for computing metrics."
        )

`export_dataframe(data, filename, **kwargs)`

Export a dataframe from this Universe with automatic format detection.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	The dataframe to export.	required
`filename`	`str`	The name of the file to export (incl. extension).	required
`**kwargs`		Additional keyword arguments passed to the export function.	`{}`

Returns:

Type	Description
`None`	None

Source code in multiversum/universe.py

def export_dataframe(self, data: pd.DataFrame, filename: str, **kwargs) -> None:
    """
    Export a dataframe from this Universe with automatic format detection.

    Args:
        data: The dataframe to export.
        filename: The name of the file to export (incl. extension).
        **kwargs: Additional keyword arguments passed to the export function.

    Returns:
        None
    """
    filepath = self.get_export_file_path(filename, mkdir=True)

    if filepath.exists():
        warnings.warn(f"File {filepath} already exists. Overwriting it.")

    # Determine format based on file extension
    file_ext = filepath.suffix.lower()

    if file_ext == ".csv":
        # Default kwargs for CSV
        csv_kwargs = {"index": False}
        csv_kwargs.update(kwargs)
        data.to_csv(filepath, **csv_kwargs)
    elif file_ext == ".json":
        # Default kwargs for JSON
        json_kwargs = {"orient": "records", "indent": 2}
        json_kwargs.update(kwargs)
        data.to_json(filepath, **json_kwargs)
    elif file_ext in [".xlsx", ".xls"]:
        # Default kwargs for Excel
        excel_kwargs = {"index": False}
        excel_kwargs.update(kwargs)
        data.to_excel(filepath, **excel_kwargs)
    elif file_ext == ".parquet":
        # Default kwargs for Parquet
        parquet_kwargs = {"index": False}
        parquet_kwargs.update(kwargs)
        data.to_parquet(filepath, **parquet_kwargs)
    else:
        # Default to CSV for unknown extensions
        warnings.warn(
            f"Unknown file extension '{file_ext}'. Defaulting to CSV format."
        )
        csv_kwargs = {"index": False}
        csv_kwargs.update(kwargs)
        data.to_csv(filepath, **csv_kwargs)

`generate_sub_universes()`

Generate the sub-universes for the given universe settings.

Returns:

Type	Description
`List[dict]`	A list of dictionaries containing the sub-universes.

Source code in multiversum/universe.py

def generate_sub_universes(self) -> List[dict]:
    """
    Generate the sub-universes for the given universe settings.

    Returns:
        A list of dictionaries containing the sub-universes.
    """
    # Wrap all non-lists in the universe to make them work with generate_multiverse_grid
    universe_all_lists = {k: list_wrap(v) for k, v in self.dimensions.items()}

    # Within-Universe variation
    return generate_multiverse_grid(universe_all_lists)

`get_execution_time()`

Gets the execution time of the universe analysis.

Returns:

Name	Type	Description
`float`	`float`	The execution time in seconds.

Source code in multiversum/universe.py

def get_execution_time(self) -> float:
    """
    Gets the execution time of the universe analysis.

    Returns:
        float: The execution time in seconds.
    """
    if self.ts_end is None:
        print("Stopping execution_time clock.")
        self.ts_end = time.time()
    return self.ts_end - self.ts_start

`get_export_file_path(filename, mkdir=False)`

Get the file path for exporting a file from this Universe.

Parameters:

Name	Type	Description	Default
`filename`	`str`	The name of the file to export.	required
`mkdir`	`bool`	Whether to create the directory if it does not exist. (optional)	`False`

Returns:

Name	Type	Description
`Path`	`Path`	The full path where the file should be exported.

Source code in multiversum/universe.py

def get_export_file_path(self, filename: str, mkdir: bool = False) -> Path:
    """
    Get the file path for exporting a file from this Universe.

    Args:
        filename: The name of the file to export.
        mkdir: Whether to create the directory if it does not exist. (optional)

    Returns:
        Path: The full path where the file should be exported.
    """
    export_dir = (
        self.output_dir / "runs" / str(self.run_no) / "exports" / self.universe_id
    )
    if mkdir:
        export_dir.mkdir(parents=True, exist_ok=True)
    return export_dir / filename

`save_data(data, add_info=True, multiple_rows=False)`

Save the data to the appropriate file for this Universe.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	The dataframe to be saved.	required
`add_info`	`bool`	Whether to add universe info to the dataframe. (optional)	`True`
`multiple_rows`	`bool`	Whether to allow multiple rows in data. (optional)	`False`

Returns:

Type	Description
`None`	None

Source code in multiversum/universe.py

def save_data(
    self, data: pd.DataFrame, add_info: bool = True, multiple_rows: bool = False
) -> None:
    """
    Save the data to the appropriate file for this Universe.

    Args:
        data: The dataframe to be saved.
        add_info: Whether to add universe info to the dataframe. (optional)
        multiple_rows: Whether to allow multiple rows in data. (optional)

    Returns:
        None
    """
    if len(data) > 1 and not multiple_rows:
        raise ValueError(
            f"Saving {len(data)} rows for universe '{self.universe_id}'. "
            "Data is usually in wide format (multiple columns, one row) "
            "rather than multiple rows, as each row corresponds to a "
            "universe. Please verify that the output is correct."
            "You can disable this error with multiple_rows=True"
        )

    # Add universe data to the dataframe
    if add_info:
        data = self._add_universe_info(data=data)

    # Path management
    target_dir = (
        self.output_dir / "runs" / str(self.run_no) / UNIVERSE_DATA_DIR_NAME
    )
    # Make sure the directory exists
    target_dir.mkdir(parents=True, exist_ok=True)
    filename = f"d_{str(self.run_no)}_{self.universe_id}.csv"
    filepath = target_dir / filename
    if filepath.exists():
        warnings.warn(f"File {filepath} already exists. Overwriting it.")
    # Write the file
    data.to_csv(filepath, index=False)

`visit_sub_universe(sub_universe, y_pred_prob, y_test, org_test, filter_data)`

Visit a sub-universe and compute the metrics for it.

Sub-universes correspond to theoretically distinct universes of decisions, which can be computed without re-fitting a model. The distinction has only been made to improve performance by not having to compute these universes from scratch.

Parameters:

Name	Type	Description	Default
`sub_universe`	`Dict[str, Any]`	A dictionary containing the parameters for the sub-universe.	required
`y_pred_prob`	`Series`	A pandas series containing the predicted probabilities.	required
`y_test`	`Series`	A pandas series containing the true labels.	required
`org_test`	`Series`	A pandas dataframe containing the test data, including variables that were not used as features.	required
`filter_data`	`Callable`	A function that filters data for each sub-universe. The function is called for each sub-universe with its respective settings and expected to return a pandas Series of booleans.	required

Returns:

Type	Description
`DataFrame`	A pandas dataframe containing the metrics for the sub-universe.

Source code in multiversum/universe.py

def visit_sub_universe(
    self,
    sub_universe: Dict[str, Any],
    y_pred_prob: pd.Series,
    y_test: pd.Series,
    org_test: pd.Series,
    filter_data: Callable,
) -> pd.DataFrame:
    """
    Visit a sub-universe and compute the metrics for it.

    Sub-universes correspond to theoretically distinct universes of
    decisions, which can be computed without re-fitting a model. The
    distinction has only been made to improve performance by not having to
    compute these universes from scratch.

    Args:
        sub_universe: A dictionary containing the parameters for the
            sub-universe.
        y_pred_prob: A pandas series containing the predicted
            probabilities.
        y_test: A pandas series containing the true labels.
        org_test: A pandas dataframe containing the test data, including
            variables that were not used as features.
        filter_data: A function that filters data for each sub-universe.
            The function is called for each sub-universe with its
            respective settings and expected to return a pandas Series
            of booleans.

    Returns:
        A pandas dataframe containing the metrics for the sub-universe.
    """
    final_output = self._add_universe_info(
        data=pd.DataFrame(index=[self.universe_id]),
        overwrite_dimensions=sub_universe,
    )

    data_mask = filter_data(sub_universe=sub_universe, org_test=org_test)
    final_output["test_size_n"] = data_mask.sum()
    final_output["test_size_frac"] = data_mask.sum() / len(data_mask)

    # Compute metrics for majority-minority split
    fairness_dict, metric_frame = self.compute_sub_universe_metrics(
        sub_universe,
        y_pred_prob[data_mask],
        y_test[data_mask],
        org_test[data_mask],
    )

    # Add main fairness metrics to final_output
    final_output = add_dict_to_df(final_output, fairness_dict, prefix="fair_main_")
    final_output = add_dict_to_df(
        final_output, dict(metric_frame.overall), prefix="perf_ovrl_"
    )

    # Add group metrics to final output
    final_output = add_dict_to_df(
        final_output, flatten_dict(metric_frame.by_group), prefix="perf_grp_"
    )

    return final_output

Universe

__init__(settings, metrics=None, fairness_metrics=None, expand_dicts=False, seed=None, set_seed=True)

compute_final_metrics(y_pred_prob, y_test, org_test, filter_data, save=True)

compute_sub_universe_metrics(sub_universe, y_pred_prob, y_test, org_test)

export_dataframe(data, filename, **kwargs)

generate_sub_universes()

get_execution_time()

get_export_file_path(filename, mkdir=False)

save_data(data, add_info=True, multiple_rows=False)

visit_sub_universe(sub_universe, y_pred_prob, y_test, org_test, filter_data)

`init(settings, metrics=None, fairness_metrics=None, expand_dicts=False, seed=None, set_seed=True)`

`compute_final_metrics(y_pred_prob, y_test, org_test, filter_data, save=True)`

`compute_sub_universe_metrics(sub_universe, y_pred_prob, y_test, org_test)`

`export_dataframe(data, filename, **kwargs)`

`generate_sub_universes()`

`get_execution_time()`

`get_export_file_path(filename, mkdir=False)`

`save_data(data, add_info=True, multiple_rows=False)`

`visit_sub_universe(sub_universe, y_pred_prob, y_test, org_test, filter_data)`