Skip to content

deep_structure

deep_structure

Classes:

Name Description
DeepStructure

Deep Structure Stability metric via joined PCA.

DeepStructure pydantic-model

Bases: Component

Deep Structure Stability metric via joined PCA.

Projects reference and output data into a shared principal-component space and scores the distributional similarity of the projections.

Config:

  • arbitrary_types_allowed: True

Fields:

reference_pca = None pydantic-field

PCA-projected reference dataframe.

output_pca = None pydantic-field

PCA-projected output dataframe.

jinja_context cached property

Template context with PCA scatter plot figure.

from_evaluation_dataset(evaluation_dataset, config=None) staticmethod

Compute PCA projections and the principal component stability score.

Parameters:

Name Type Description Default
evaluation_dataset EvaluationDataset

Paired reference/output data.

required
config SafeSynthesizerParameters | None

Pipeline configuration (unused, reserved for future use).

None

Returns:

Type Description
DeepStructure

A DeepStructure with PCA dataframes and the stability score.

Source code in src/nemo_safe_synthesizer/evaluation/components/deep_structure.py
@staticmethod
def from_evaluation_dataset(
    evaluation_dataset: EvaluationDataset, config: SafeSynthesizerParameters | None = None
) -> DeepStructure:
    """Build a ``DeepStructure`` component from paired reference/output data.

    The columns reported by ``get_tabular_columns(mode="both")`` are selected
    from both dataframes, projected with ``_calculate_pca``, and the resulting
    projections are scored with ``get_principal_component_stability``.

    Args:
        evaluation_dataset: Paired reference/output data.
        config: Pipeline configuration. Not read by this method; kept in the
            signature for future use.

    Returns:
        A ``DeepStructure`` holding both PCA dataframes and the stability
        score. When no tabular columns are reported, only a score carrying
        an explanatory note is set.
    """
    shared_columns = evaluation_dataset.get_tabular_columns(mode="both")

    if shared_columns:
        ref_projection, out_projection = DeepStructure._calculate_pca(
            evaluation_dataset.reference[shared_columns],  # ty: ignore[invalid-argument-type]
            evaluation_dataset.output[shared_columns],  # ty: ignore[invalid-argument-type]
        )
        stability = DeepStructure.get_principal_component_stability(ref_projection, out_projection)
        return DeepStructure(
            score=stability,
            reference_pca=ref_projection,
            output_pca=out_projection,
        )

    # Nothing to project: report the reason instead of attempting PCA.
    return DeepStructure(score=EvaluationScore(notes="No columns detected for PCA."))

get_principal_component_stability(reference_pca, output_pca) staticmethod

Score the distributional similarity of PCA projections.

Computes the Jensen-Shannon divergence for each principal component, averages the results, and applies an exponential function to produce a score on a 0–10 scale.

Source code in src/nemo_safe_synthesizer/evaluation/components/deep_structure.py
@staticmethod
def get_principal_component_stability(
    reference_pca: pd.DataFrame | None,
    output_pca: pd.DataFrame | None,
) -> EvaluationScore:
    """Score the distributional similarity of PCA projections.

    Builds one ``EvaluationField`` per column of ``reference_pca``, averages
    the per-component distribution distances (Jensen-Shannon divergence),
    and applies an exponential function, ``10 * exp(-1.6 * average)``, to
    produce the score.

    Args:
        reference_pca: PCA-projected reference dataframe, or ``None``.
        output_pca: PCA-projected output dataframe, or ``None``. It is
            indexed with every column name of ``reference_pca``.

    Returns:
        An ``EvaluationScore``. If an input is missing, there are no
        components to score, or scoring fails, the score carries only an
        explanatory note. If the averaged distance is NaN, an empty
        ``EvaluationScore`` is returned. Errors raised while scoring are
        logged and reported through ``notes`` rather than propagated.
    """
    if reference_pca is None or output_pca is None:
        return EvaluationScore(notes="Missing input Dataframe.")

    try:
        pca_df_fields = [
            EvaluationField.from_series(field, reference_pca[field], output_pca[field])
            for field in reference_pca.columns
        ]
        if not pca_df_fields:
            # Without this guard the average below divides by zero and the
            # caller only sees a logged traceback and a cryptic note.
            return EvaluationScore(notes="No principal components to score.")

        sum_pca_distances = 0.0
        for field in pca_df_fields:
            # field.distribution_distance is None for highly unique fields
            if field.distribution_distance is not None:
                sum_pca_distances += field.distribution_distance

        # Fields without a distance still count in the denominator, i.e.
        # they contribute a distance of zero to the average.
        raw_score = sum_pca_distances / len(pca_df_fields)

        if np.isnan(raw_score):
            return EvaluationScore()
        # Scale the raw score to between ~2 and 10
        # The factor of 1.6 is to ensure rough consistency with the legacy score
        score = 10 * np.exp(-1.6 * raw_score)
        return EvaluationScore.finalize_grade(raw_score, score)
    except Exception as e:
        logger.exception("Failed to calculate Principal Component Stability SQS")
        return EvaluationScore(notes=str(e))