Skip to content

deep_structure

deep_structure

Classes:

Name Description
DeepStructure

Deep Structure Stability metric via joined PCA.

DeepStructure pydantic-model

Bases: Component

Deep Structure Stability metric via joined PCA.

Projects training and synthetic data into a shared principal-component space and scores the distributional similarity of the projections.

Config:

  • arbitrary_types_allowed: True

Fields:

training_pca = None pydantic-field

PCA-projected training dataframe.

synthetic_pca = None pydantic-field

PCA-projected synthetic dataframe.

jinja_context cached property

Template context with PCA scatter plot figure.

from_evaluation_datasets(evaluation_datasets, config=None) staticmethod

Compute PCA projections and the principal component stability score.

Parameters:

Name Type Description Default
evaluation_datasets EvaluationDatasets

Paired training/synthetic data.

required
config SafeSynthesizerParameters | None

Pipeline configuration (unused, reserved for future use).

None

Returns:

Type Description
DeepStructure

A DeepStructure with PCA dataframes and the stability score.

Source code in src/nemo_safe_synthesizer/evaluation/components/deep_structure.py
@staticmethod
def from_evaluation_datasets(
    evaluation_datasets: EvaluationDatasets,
    config: SafeSynthesizerParameters | None = None,
) -> DeepStructure:
    """Build a ``DeepStructure`` from paired training/synthetic datasets.

    Restricts both dataframes to the columns reported by
    ``get_tabular_columns(based_on="both")``, projects them with
    ``DeepStructure._calculate_pca`` and scores the projections with
    ``DeepStructure.get_principal_component_stability``.

    Args:
        evaluation_datasets: Paired training/synthetic data.
        config: Pipeline configuration (unused, reserved for future use).

    Returns:
        A ``DeepStructure`` carrying both PCA dataframes and the stability
        score. When no tabular columns are reported, only a score holding
        an explanatory note is set.
    """
    columns = evaluation_datasets.get_tabular_columns(based_on="both")

    # Guard clause: without any columns there is nothing to project.
    if not columns:
        empty_score = EvaluationScore(notes="No columns detected for PCA.")
        return DeepStructure(score=empty_score)

    # Align both frames on the same column set before projecting.
    training_frame = evaluation_datasets.training.reindex(columns=columns)
    synthetic_frame = evaluation_datasets.synthetic.reindex(columns=columns)
    projected_training, projected_synthetic = DeepStructure._calculate_pca(training_frame, synthetic_frame)

    stability = DeepStructure.get_principal_component_stability(projected_training, projected_synthetic)

    return DeepStructure(
        score=stability,
        training_pca=projected_training,
        synthetic_pca=projected_synthetic,
    )

get_principal_component_stability(training_pca, synthetic_pca) staticmethod

Score the distributional similarity of PCA projections.

Computes the Jensen-Shannon divergence for each principal component, averages the per-component values, and applies an exponential function to the average to produce a score on a 0-to-10 scale.

Source code in src/nemo_safe_synthesizer/evaluation/components/deep_structure.py
@staticmethod
def get_principal_component_stability(
    training_pca: pd.DataFrame | None,
    synthetic_pca: pd.DataFrame | None,
) -> EvaluationScore:
    """Score the distributional similarity of PCA projections.

    Computes per-component Jensen-Shannon divergence, averages it over all
    components, and applies an exponential function to produce a score on
    a 0-to-10 scale.

    Args:
        training_pca: PCA-projected training dataframe, or ``None``.
        synthetic_pca: PCA-projected synthetic dataframe, or ``None``. It is
            indexed with every column name of ``training_pca``.

    Returns:
        An ``EvaluationScore``. A score carrying only a note is returned
        when an input is missing, when ``training_pca`` has no columns, or
        when any exception is raised while scoring; an empty score is
        returned when the averaged distance is NaN.
    """
    if training_pca is None or synthetic_pca is None:
        return EvaluationScore(notes="Missing input Dataframe.")

    try:
        pca_df_fields = [
            EvaluationField.from_series(field, training=training_pca[field], synthetic=synthetic_pca[field])
            for field in training_pca.columns
        ]
        # Without this guard the average below divides by zero and the
        # failure is reported as an unexpected exception with a traceback.
        if not pca_df_fields:
            return EvaluationScore(notes="No principal components available to score.")

        sum_pca_distances = 0.0
        for field in pca_df_fields:
            # field.distribution_distance is None for highly unique fields
            if field.distribution_distance is not None:
                sum_pca_distances += field.distribution_distance

        # Fields without a distance still count in the denominator.
        raw_score = sum_pca_distances / len(pca_df_fields)

        if np.isnan(raw_score):
            return EvaluationScore()
        # Scale the raw score to between ~2 and 10
        # The factor of 1.6 is to ensure rough consistency with the legacy score
        score = 10 * np.exp(-1.6 * raw_score)
        return EvaluationScore.finalize_grade(raw_score, score)
    except Exception as e:
        # Best-effort boundary: a scoring failure is reported through the
        # score's notes instead of propagating to the caller.
        logger.exception("Failed to calculate Principal Component Stability SQS")
        return EvaluationScore(notes=str(e))