Skip to content

text_semantic_similarity

text_semantic_similarity

Classes:

Name | Description
TextSemanticSimilarityDatum

Per-column text semantic similarity scores and PCA projections.

TextSemanticSimilarity

Text Semantic Similarity metric.

TextSemanticSimilarityDatum pydantic-model

Bases: BaseModel

Per-column text semantic similarity scores and PCA projections.

Config:

  • arbitrary_types_allowed: True

Fields:

text_semantic_similarity = EvaluationScore() pydantic-field

Overall semantic similarity score for this column.

text_semantic_similarity_underfitting_factor = EvaluationScore() pydantic-field

Underfitting factor score for this column.

text_semantic_similarity_overfitting_factor = EvaluationScore() pydantic-field

Overfitting factor score for this column.

training_pca = pd.DataFrame() pydantic-field

PCA-projected reference embeddings.

synthetic_pca = pd.DataFrame() pydantic-field

PCA-projected synthetic embeddings.

TextSemanticSimilarity pydantic-model

Bases: Component

Text Semantic Similarity metric.

Embeds text columns with a sentence transformer and compares the distribution of cosine similarities between reference-synthetic and reference-reference (or test-test) pairs using two-sided Kolmogorov-Smirnov tests to capture both underfitting and overfitting.

Fields:

text_semantic_similarity_dict = dict() pydantic-field

Per-column semantic similarity scores and PCA projections.

jinja_context cached property

Template context with per-column PCA scatter figures.

from_evaluation_dataset(evaluation_dataset, config=None) staticmethod

Compute text semantic similarity across all text columns.

Source code in src/nemo_safe_synthesizer/evaluation/components/text_semantic_similarity.py
@staticmethod
def from_evaluation_dataset(
    evaluation_dataset: EvaluationDataset, config: SafeSynthesizerParameters | None = None
) -> TextSemanticSimilarity:
    """Compute text semantic similarity across all text columns.

    For every TEXT-typed field, embeds the reference, synthetic, and holdout
    (test) values with a sentence transformer, scores the similarity
    distributions, and projects the embeddings with PCA for plotting.
    Columns that fail for any reason are logged and skipped; the overall
    score is the mean of the per-column scores that were computed.

    Args:
        evaluation_dataset: Bundle of reference, synthetic (``output``), and
            optional holdout (``test``) dataframes plus per-field metadata.
        config: Optional parameters; currently unused, kept for interface
            compatibility with other components.

    Returns:
        A ``TextSemanticSimilarity`` with per-column results and an aggregate
        score (an empty ``EvaluationScore`` when no holdout data was provided
        or no column could be evaluated).
    """
    # A holdout set is required: scoring compares reference-synthetic
    # similarity distributions against reference-test ones.
    if evaluation_dataset.test is None or evaluation_dataset.test.empty:
        return TextSemanticSimilarity(
            score=EvaluationScore(
                notes="Unable to calculate Text Semantic Similarity. No holdout dataframe provided."
            )
        )

    # Compare equal-sized samples from reference and synthetic data.
    # NOTE(review): min() of two lengths is always an int, so the former
    # `if nrows is None` fallback to DEFAULT_RECORD_COUNT was unreachable
    # and has been removed.
    nrows = min(
        len(evaluation_dataset.reference), len(evaluation_dataset.output)
    )  # MIN_RECORDS_FOR_TEXT_AND_PRIVACY_METRICS ?

    text_semantic_similarity_dict: dict[str, TextSemanticSimilarityDatum] = {}
    text_fields = [
        f.name
        for f in evaluation_dataset.evaluation_fields
        if f.reference_field_features.type == FieldType.TEXT
    ]

    for field in text_fields:
        training = evaluation_dataset.reference[field]
        synthetic = evaluation_dataset.output[field]
        test = evaluation_dataset.test[field]

        try:
            # Initialized inside the loop so a model failure is handled (and
            # logged) per column instead of aborting the whole component.
            stm = TextSemanticSimilarity._init_sentence_transformer_model()
            if stm is None:
                raise RuntimeError("Sentence Transformer Model is None, unable to continue.")

            # PLAT-914 Easter egg, 0 rows means use everything.
            if nrows == 0:
                nrows = max(len(training), len(synthetic))
                if test is not None:
                    nrows = max(nrows, len(test))

            training = TextSemanticSimilarity._preprocess_text_data(training, nrows)
            synthetic = TextSemanticSimilarity._preprocess_text_data(synthetic, nrows)

            training_embedding_vector = TextSemanticSimilarity._get_embedding_vectors(training, stm)
            synthetic_embedding_vector = TextSemanticSimilarity._get_embedding_vectors(synthetic, stm)

            test_embedding_vector = None
            warning_message = None
            if test is not None and not test.empty:
                # Only calculate the semantic similarity score if we have a
                # test set, and only if there are enough total input records.
                test = TextSemanticSimilarity._preprocess_text_data(test, nrows)
                total_input_records = len(training) + len(test)
                if total_input_records >= MIN_RECORDS_FOR_TEXT_AND_PRIVACY_METRICS:
                    test_embedding_vector = TextSemanticSimilarity._get_embedding_vectors(test, stm)
                    # TODO: Use dynamic calculation based on training/test/synthetic data sizes
                    # to determine if we should log this warning.
                    if total_input_records < MIN_RECORDS_FOR_TEXT_METRICS_WITHOUT_WARNING:
                        warning_message = f"Warning: Consider using at least {MIN_RECORDS_FOR_TEXT_METRICS_WITHOUT_WARNING} input records for a more accurate semantic similarity score."
                else:
                    warning_message = (
                        f"Not enough input records for text semantic similarity score. "
                        f"Need at least {MIN_RECORDS_FOR_TEXT_AND_PRIVACY_METRICS} non-empty records. Skipping text semantic similarity."
                    )
            else:
                warning_message = "No test data provided, skipping text semantic similarity."

            if test_embedding_vector is not None:
                (
                    text_semantic_similarity,
                    text_semantic_similarity_underfitting_factor,
                    text_semantic_similarity_overfitting_factor,
                ) = TextSemanticSimilarity._get_text_semantic_similarity(
                    real_embed=training_embedding_vector,
                    synth_embed=synthetic_embedding_vector,
                    test_embed=test_embedding_vector,
                    warning_message=warning_message,
                )
            else:
                # No usable test embeddings: emit empty scores carrying the reason.
                (
                    text_semantic_similarity,
                    text_semantic_similarity_underfitting_factor,
                    text_semantic_similarity_overfitting_factor,
                ) = (EvaluationScore(notes=warning_message),) * 3

            if warning_message:
                logger.info(warning_message)

            # PCA projections are produced even when scoring was skipped so the
            # report can still show the embedding scatter plots.
            # I'm PCA I've got nothing to prove pay attention my intention is to bust a move.
            training_pca, synthetic_pca = TextSemanticSimilarity._get_pca(
                pd.DataFrame(training_embedding_vector),
                pd.DataFrame(synthetic_embedding_vector),
            )

            text_semantic_similarity_dict[field] = TextSemanticSimilarityDatum(
                text_semantic_similarity=text_semantic_similarity,
                text_semantic_similarity_underfitting_factor=text_semantic_similarity_underfitting_factor,
                text_semantic_similarity_overfitting_factor=text_semantic_similarity_overfitting_factor,
                training_pca=training_pca,
                synthetic_pca=synthetic_pca,
            )
        except Exception:
            # Best-effort per column: any failure (model init, embedding,
            # scoring, PCA) skips this field but lets the rest be evaluated.
            logger.exception("Failed to compute text semantic similarity for column %r.", field)

    # Aggregate: mean of the per-column scores that were actually computed.
    scores = [
        v.text_semantic_similarity.score
        for v in text_semantic_similarity_dict.values()
        if v.text_semantic_similarity.score is not None
    ]
    if not scores:
        return TextSemanticSimilarity(
            score=EvaluationScore(), text_semantic_similarity_dict=text_semantic_similarity_dict
        )
    mean_score = sum(scores) / len(scores)
    return TextSemanticSimilarity(
        score=EvaluationScore.finalize_grade(mean_score, mean_score),
        text_semantic_similarity_dict=text_semantic_similarity_dict,
    )