evaluation_dataset

Classes:

  • EvaluationDataset: Paired reference and output dataframes prepared for evaluation.

EvaluationDataset pydantic-model

Bases: BaseModel

Paired reference and output dataframes prepared for evaluation.

On construction the validator computes per-column EvaluationField instances, counts memorized lines, and records dataset dimensions. Use from_dataframes to build an instance with optional column/row subsampling.
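
A minimal construction sketch (the import path is inferred from the source location cited below; the dataframe contents are illustrative):

import pandas as pd
from nemo_safe_synthesizer.evaluation.data_model.evaluation_dataset import EvaluationDataset

ds = EvaluationDataset.from_dataframes(
    reference=pd.DataFrame({"age": [34, 51, 29], "city": ["Austin", "Reno", "Boise"]}),
    output=pd.DataFrame({"age": [33, 50, 30], "city": ["Austin", "Reno", "Tulsa"]}),
)
print(ds.reference_rows, ds.reference_cols)  # dimensions recorded by the validator
print(ds.memorized_lines)                    # exact row matches between reference and output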

Config:

  • arbitrary_types_allowed: True

Fields:

  • reference
  • output
  • test
  • reference_rows
  • reference_cols
  • output_rows
  • output_cols
  • memorized_lines
  • column_statistics
  • evaluation_fields

Validators:

  • validate

reference = pd.DataFrame() pydantic-field

Training (reference) dataframe.

output = pd.DataFrame() pydantic-field

Synthetic (output) dataframe.

test = None pydantic-field

Optional holdout dataframe for text-similarity and privacy metrics.

reference_rows = 0 pydantic-field

Row count of the reference dataframe.

reference_cols = 0 pydantic-field

Column count of the reference dataframe.

output_rows = 0 pydantic-field

Row count of the output dataframe.

output_cols = 0 pydantic-field

Column count of the output dataframe.

memorized_lines = 0 pydantic-field

Number of exact row matches between reference and output.

column_statistics = None pydantic-field

Per-column PII entity counts and transform metadata.

evaluation_fields = list() pydantic-field

Per-column evaluation metadata and distribution scores.

check_dataframe(df, df_name) staticmethod

Raise ValueError if df is None or empty.

Source code in src/nemo_safe_synthesizer/evaluation/data_model/evaluation_dataset.py
@staticmethod
def check_dataframe(df: pd.DataFrame, df_name: str):
    """Raise ``ValueError`` if ``df`` is ``None`` or empty."""
    if df is None:
        raise ValueError(f"{df_name} is None!")
    if df.empty:
        raise ValueError(f"{df_name} is empty!")

get_columns_of_type(types, mode='reference')

Return column names whose FieldType is in types.

Parameters:

  • types (set[FieldType], required): Set of FieldType values to match.
  • mode (default 'reference'): Which dataframe's field features to inspect: "reference", "output", or "both" (intersection).

Returns:

  • list[str]: List of matching column names.

Source code in src/nemo_safe_synthesizer/evaluation/data_model/evaluation_dataset.py
def get_columns_of_type(self, types: set[FieldType], mode="reference") -> list[str]:
    """Return column names whose ``FieldType`` is in ``types``.

    Args:
        types: Set of ``FieldType`` values to match.
        mode: Which dataframe's field features to inspect --
            ``"reference"``, ``"output"``, or ``"both"`` (intersection).

    Returns:
        List of matching column names.
    """
    if mode == "reference":
        return [f.name for f in self.evaluation_fields if f.reference_field_features.type in types]
    elif mode == "output":
        return [f.name for f in self.evaluation_fields if f.output_field_features.type in types]
    elif mode == "both":
        return [
            f.name
            for f in self.evaluation_fields
            if f.reference_field_features.type in types and f.output_field_features.type in types
        ]
    else:
        return []
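
A hedged usage sketch; the FieldType import path is an assumption (only the evaluation_dataset module path is shown above), and the dataframes are illustrative:

import pandas as pd
from nemo_safe_synthesizer.evaluation.data_model.evaluation_dataset import EvaluationDataset
from nemo_safe_synthesizer.evaluation.data_model.field import FieldType  # assumed location of FieldType

ds = EvaluationDataset.from_dataframes(
    reference=pd.DataFrame({"age": [30, 41], "note": ["short text", "more text"]}),
    output=pd.DataFrame({"age": [33, 40], "note": ["some text", "other text"]}),
)
numeric_cols = ds.get_columns_of_type({FieldType.NUMERIC})         # classified as numeric in the reference frame
text_both = ds.get_columns_of_type({FieldType.TEXT}, mode="both")  # classified as text in both frames
unknown = ds.get_columns_of_type({FieldType.NUMERIC}, mode="typo") # unrecognized mode returns []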

get_tabular_columns(mode='reference')

Return columns classified as binary, categorical, or numeric.

Source code in src/nemo_safe_synthesizer/evaluation/data_model/evaluation_dataset.py
def get_tabular_columns(self, mode="reference") -> list[str]:
    """Return columns classified as binary, categorical, or numeric."""
    return self.get_columns_of_type({FieldType.BINARY, FieldType.CATEGORICAL, FieldType.NUMERIC}, mode)

get_nominal_columns(mode='reference')

Return columns classified as binary or categorical.

Source code in src/nemo_safe_synthesizer/evaluation/data_model/evaluation_dataset.py
def get_nominal_columns(self, mode="reference") -> list[str]:
    """Return columns classified as binary or categorical."""
    return self.get_columns_of_type({FieldType.BINARY, FieldType.CATEGORICAL}, mode)

get_text_columns(mode='reference')

Return columns classified as free text.

Source code in src/nemo_safe_synthesizer/evaluation/data_model/evaluation_dataset.py
def get_text_columns(self, mode="reference") -> list[str]:
    """Return columns classified as free text."""
    return self.get_columns_of_type({FieldType.TEXT}, mode)
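
Continuing the sketch above, the convenience wrappers cover the common groupings:

tabular = ds.get_tabular_columns()             # binary + categorical + numeric (reference)
nominal = ds.get_nominal_columns(mode="both")  # binary + categorical in both frames
text = ds.get_text_columns(mode="output")      # free-text columns in the output frame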

subsample_columns(reference, output, test=None, target_column_count=DEFAULT_SQS_REPORT_COLUMNS, mandatory_columns=None) staticmethod

Reduce dataframes to shared columns, optionally subsampling columns.

Mandatory columns are always included. A fixed random seed ensures reproducible column selection across evaluation components.

Parameters:

  • reference (DataFrame, required): Training dataframe.
  • output (DataFrame, required): Synthetic dataframe.
  • test (DataFrame | None, default None): Optional holdout dataframe.
  • target_column_count (int, default DEFAULT_SQS_REPORT_COLUMNS): Maximum number of columns to keep.
  • mandatory_columns (list[str] | None, default None): Columns that must be included regardless.

Returns:

  • tuple[DataFrame, DataFrame, DataFrame | None]: Tuple of (reference, output, test) dataframes restricted to the selected column set.

Raises:

  • ValueError: If reference and output share no columns.

Source code in src/nemo_safe_synthesizer/evaluation/data_model/evaluation_dataset.py
@staticmethod
def subsample_columns(
    reference: pd.DataFrame,
    output: pd.DataFrame,
    test: pd.DataFrame | None = None,
    target_column_count: int = DEFAULT_SQS_REPORT_COLUMNS,
    mandatory_columns: list[str] | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame | None]:
    """Reduce dataframes to shared columns, optionally subsampling columns.

    Mandatory columns are always included. A fixed random seed ensures
    reproducible column selection across evaluation components.

    Args:
        reference: Training dataframe.
        output: Synthetic dataframe.
        test: Optional holdout dataframe.
        target_column_count: Maximum number of columns to keep.
        mandatory_columns: Columns that must be included regardless.

    Returns:
        Tuple of (reference, output, test) dataframes restricted to the
        selected column set.

    Raises:
        ValueError: If reference and output share no columns.
    """
    if mandatory_columns is None:
        mandatory_columns = []
    # Check and subsample columns
    shared_columns = set(reference.columns).intersection(set(output.columns))
    if len(shared_columns) == 0:
        raise ValueError(
            "Reference and Output dataframes contain no columns in common. Please check dataframes for mismatch."
        )
    if target_column_count < len(shared_columns):
        # Really sample columns.
        logger.info(
            f"Found {len(shared_columns)} shared columns. Attempting to sample down to {target_column_count} columns. Will include {len(mandatory_columns)} mandatory columns."
        )
        col_set = set()
        col_set.update([col for col in mandatory_columns if col in shared_columns])

        if len(col_set) < target_column_count:
            # Use a fixed seed for reproducibility. In particular, we want to sample the
            # same columns for correlation and everything else.
            r = random.Random()
            r.seed(2112)
            shared_columns = shared_columns.difference(set(mandatory_columns))
            col_set.update(r.sample(list(shared_columns), k=(target_column_count - len(col_set))))
    else:
        # Even without sampling, we only want to use shared columns.
        col_set = shared_columns
    reference = reference[list(col_set)]  # ty: ignore[invalid-assignment]
    output = output[list(col_set)]  # ty: ignore[invalid-assignment]

    # Check and subsample test columns when a test set is provided.
    if test is not None and not test.empty:
        test_shared_columns = shared_columns.intersection(set(test.columns))
        if len(test_shared_columns) == 0:
            raise ValueError(
                "Test dataframe has no columns in common with Reference and Output dataframes. Please check dataframes for mismatch."
            )
        else:
            test = test[list(test_shared_columns)]  # ty: ignore[invalid-assignment]

    return reference, output, test
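
A hedged sketch of calling the static helper directly (column names and counts are illustrative; import path assumed as above):

import pandas as pd
from nemo_safe_synthesizer.evaluation.data_model.evaluation_dataset import EvaluationDataset

ref = pd.DataFrame({f"c{i}": range(5) for i in range(30)})
out = pd.DataFrame({f"c{i}": range(5) for i in range(30)})

# Keep at most 10 shared columns, always including "c0". The fixed seed means
# repeated calls select the same column subset.
ref_s, out_s, _ = EvaluationDataset.subsample_columns(
    ref, out, target_column_count=10, mandatory_columns=["c0"]
)
assert "c0" in ref_s.columns and ref_s.shape[1] == 10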

subsample_rows(reference, output, target_record_count=DEFAULT_RECORD_COUNT) staticmethod

Downsample both dataframes to at most target_record_count rows.

Source code in src/nemo_safe_synthesizer/evaluation/data_model/evaluation_dataset.py
@staticmethod
def subsample_rows(
    reference: pd.DataFrame,
    output: pd.DataFrame,
    target_record_count: int = DEFAULT_RECORD_COUNT,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Downsample both dataframes to at most ``target_record_count`` rows."""
    target_record_count = min(target_record_count, reference.shape[0], output.shape[0])
    if target_record_count < reference.shape[0]:
        logger.info(f"Subsampling reference data from {reference.shape[0]} records to {target_record_count}.")
        reference = reference.sample(target_record_count, ignore_index=True, random_state=424242)
    if target_record_count < output.shape[0]:
        logger.info(f"Subsampling output data from {output.shape[0]} records to {target_record_count}.")
        output = output.sample(target_record_count, ignore_index=True, random_state=424242)
    return reference, output
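
Continuing with the assumed import, a quick sketch of row subsampling:

import pandas as pd
from nemo_safe_synthesizer.evaluation.data_model.evaluation_dataset import EvaluationDataset

ref = pd.DataFrame({"x": range(10_000)})
out = pd.DataFrame({"x": range(8_000)})

# Both frames are capped at 5,000 rows; the fixed random_state makes repeated
# runs downsample to the same records.
ref_s, out_s = EvaluationDataset.subsample_rows(ref, out, target_record_count=5_000)
assert len(ref_s) == len(out_s) == 5_000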

from_dataframes(reference, output, test=None, column_statistics=None, rows=DEFAULT_RECORD_COUNT, cols=DEFAULT_SQS_REPORT_COLUMNS, mandatory_columns=None, enable_sampling=True) staticmethod

Build an EvaluationDataset with optional column/row subsampling.

This is the primary constructor for evaluation. It validates inputs, optionally subsamples columns and rows, then delegates to the Pydantic model validator which computes per-column evaluation fields.

Parameters:

  • reference (DataFrame, required): Training dataframe.
  • output (DataFrame, required): Synthetic dataframe.
  • test (DataFrame | None, default None): Optional holdout dataframe for text-similarity and privacy metrics.
  • column_statistics (dict[str, ColumnStatistics] | None, default None): Per-column PII entity metadata.
  • rows (int, default DEFAULT_RECORD_COUNT): Target row count for subsampling.
  • cols (int, default DEFAULT_SQS_REPORT_COLUMNS): Target column count for subsampling.
  • mandatory_columns (list[str] | None, default None): Columns to always include in subsampling.
  • enable_sampling (bool, default True): When False, skip all subsampling.

Returns:

  • EvaluationDataset: A fully initialized EvaluationDataset.

Source code in src/nemo_safe_synthesizer/evaluation/data_model/evaluation_dataset.py
@staticmethod
def from_dataframes(
    reference: pd.DataFrame,
    output: pd.DataFrame,
    test: pd.DataFrame | None = None,
    column_statistics: dict[str, ColumnStatistics] | None = None,
    rows: int = DEFAULT_RECORD_COUNT,
    cols: int = DEFAULT_SQS_REPORT_COLUMNS,
    mandatory_columns: list[str] | None = None,
    enable_sampling: bool = True,
) -> EvaluationDataset:
    """Build an ``EvaluationDataset`` with optional column/row subsampling.

    This is the primary constructor for evaluation. It validates inputs,
    optionally subsamples columns and rows, then delegates to the
    Pydantic model validator which computes per-column evaluation fields.

    Args:
        reference: Training dataframe.
        output: Synthetic dataframe.
        test: Optional holdout dataframe for text-similarity and privacy metrics.
        column_statistics: Per-column PII entity metadata.
        rows: Target row count for subsampling.
        cols: Target column count for subsampling.
        mandatory_columns: Columns to always include in subsampling.
        enable_sampling: When ``False``, skip all subsampling.

    Returns:
        A fully initialized ``EvaluationDataset``.
    """
    # Spot check df's before doing anything.
    EvaluationDataset.check_dataframe(reference, "Reference")
    EvaluationDataset.check_dataframe(output, "Output")

    # Sample while we have config params in hand.
    if enable_sampling:
        reference, output, test = EvaluationDataset.subsample_columns(
            reference, output, test, cols, mandatory_columns
        )
        reference, output = EvaluationDataset.subsample_rows(reference, output, rows)

    return EvaluationDataset(reference=reference, output=output, test=test, column_statistics=column_statistics)
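
An end-to-end sketch with the optional arguments; the file paths, column name, and import path are illustrative assumptions:

import pandas as pd
from nemo_safe_synthesizer.evaluation.data_model.evaluation_dataset import EvaluationDataset

reference = pd.read_csv("train.csv")   # training data (path assumed)
output = pd.read_csv("synthetic.csv")  # synthetic data (path assumed)
test = pd.read_csv("holdout.csv")      # optional holdout (path assumed)

ds = EvaluationDataset.from_dataframes(
    reference=reference,
    output=output,
    test=test,
    rows=5_000,                          # cap rows kept for evaluation
    cols=75,                             # cap columns kept for evaluation
    mandatory_columns=["customer_id"],   # always retained when columns are subsampled
    enable_sampling=True,                # set False to evaluate on the full data
)

print(ds.memorized_lines)        # exact row matches between reference and output
print(ds.get_tabular_columns())  # binary/categorical/numeric columns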