# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "data-designer>=0.5.6",
# ]
# ///
"""Long-Document Understanding Frontier Model QA Judge Recipe
Use a frontier VLM as an LLM-as-a-judge to evaluate the quality
of (question, answer) pairs generated by the upstream visual QA recipes. The
judge scores each example across five rubrics:
1. **Answer Correctness** – factual accuracy against the visible document
2. **Question Quality** – reasoning depth, ambiguity, specificity
3. **Visual Grounding** – reliance on visual elements vs. plain text
4. **Format Compliance** – answer format matches the question type
5. **Training Signal Strength** – overall value as VLM training data
A weighted composite score (0–1) is computed from the five rubric scores.
Prerequisites:
- A seed parquet file containing output from an upstream QA recipe
(e.g. 05-visual-qa-sdg.py, 06-single-page-qa-sdg.py, or
08-whole-document-qa-sdg.py) with at least:
* `png_images_base64` – JSON array of base64-encoded PNG(s) of
document pages.
* `question_type` – classification of the question.
* `question` – the generated question.
* `answer` – the generated answer.
- Access to a frontier model endpoint that exposes an OpenAI-compatible
API. Provide the model ID, endpoint URL, and the name of the
environment variable holding the API key via the CLI flags
``--model-id``, ``--endpoint``, and ``--api-key-env``.
Run:
# Basic usage (judges 5 records by default)
uv run 09-frontier-judge-sdg.py --seed-path my_qa_output.parquet \
--model-id <model-id> --endpoint <endpoint-url> --api-key-env <ENV_VAR>
# Custom record count
uv run 09-frontier-judge-sdg.py --seed-path my_qa_output.parquet \
--model-id <model-id> --endpoint <endpoint-url> --api-key-env <ENV_VAR> \
--num-records 100
# For help message and available options
uv run 09-frontier-judge-sdg.py --help
"""
from pathlib import Path
import data_designer.config as dd
from data_designer.interface import DataDesigner, DatasetCreationResults
PROVIDER_NAME = "frontier"
# =============================================================================
# Score weights for the weighted composite
# =============================================================================
FINAL_SCORE_WEIGHTS = {
"Answer Correctness": 0.35,
"Training Signal Strength": 0.30,
"Question Quality": 0.15,
"Visual Grounding": 0.10,
"Format Compliance": 0.10,
}
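# The weights sum to 1.0 and each rubric is scored on a 0-5 scale, so the composite
# computed below divides the weighted sum by 5 to normalize it to [0, 1].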
# =============================================================================
# Custom column: weighted composite score
# =============================================================================
@dd.custom_column_generator(required_columns=["qa_quality_judge"])
def compute_weighted_score(row: dict) -> dict:
"""Weighted composite score normalized to 0-1."""
judge = row["qa_quality_judge"]
raw = sum(float(judge[k]["score"]) * w for k, w in FINAL_SCORE_WEIGHTS.items())
row["weighted_quality_score"] = round(raw / 5.0, 2)
return row
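# Worked example of the composite (hypothetical rubric scores, for illustration only):
#   Answer Correctness = 5, Training Signal Strength = 4, Question Quality = 4,
#   Visual Grounding = 3, Format Compliance = 5
#   raw = 5*0.35 + 4*0.30 + 4*0.15 + 3*0.10 + 5*0.10 = 4.35
#   weighted_quality_score = round(4.35 / 5.0, 2) = 0.87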
# =============================================================================
# Judge prompt
# =============================================================================
PROMPT_JUDGE = """\
You are an expert evaluator of visual document question-answering (VQA) training data
for the MMLongBench-Doc benchmark.
Your task is to assess the quality of a (question, answer) pair generated from a PDF
document image. The goal is to determine how strong of a training signal this example
would provide for improving VLM performance.
You will be given:
- One or more images of document pages (with tables, charts, diagrams, etc.)
- A question type classification
- A question about the document
- An answer to the question
<question-type>
{{ question_type }}
</question-type>
<question>
{{ question }}
</question>
<answer>
{{ answer }}
</answer>
Evaluate the example across the following rubrics. For each rubric, provide a brief
reasoning and a score. Be objective and critical -- do not inflate scores.
{
"Answer Correctness": {
"reasoning": "Your brief analysis here",
"score": "X"
},
"Question Quality": {
"reasoning": "Your brief analysis here",
"score": "X"
},
"Visual Grounding": {
"reasoning": "Your brief analysis here",
"score": "X"
},
"Format Compliance": {
"reasoning": "Your brief analysis here",
"score": "X"
},
"Training Signal Strength": {
"reasoning": "Your brief analysis here",
"score": "X"
}
}
Provide your evaluation in the exact JSON format above with ALL 5 rubrics.
Keep your reasoning for each rubric short and to the point.
"""
# =============================================================================
# Score rubric definitions
# =============================================================================
answer_correctness_score = dd.Score(
name="Answer Correctness",
description=(
"Is the answer factually correct given the visible document content? "
"Verify by examining the image yourself. For calculations, redo the math. "
"For counts, recount. For lists, check completeness."
),
options={
"5": "Exactly correct: answer matches the visible content precisely, calculations are accurate, lists are complete",
"4": "Substantially correct: answer is right with minor imprecision (e.g., rounding differences within +/-5%, equivalent formats like '25%' vs '0.25')",
"3": "Partially correct: core answer is right but has notable issues (missing list items, slightly off calculation, incomplete but not wrong)",
"2": "Mostly incorrect: answer has the right idea but wrong values, wrong entity, or significant calculation errors",
"1": "Incorrect: answer contradicts the visible content, uses wrong data, or is completely off",
"0": "Not answerable or refused: answer is a refusal, 'Not answerable', or nonsensical when a real answer exists",
},
)
question_quality_score = dd.Score(
name="Question Quality",
description=(
"Is the question well-formed, unambiguous, and appropriately challenging? "
"Does it require genuine reasoning (comparison, calculation, counting) rather than trivial lookup? "
"Is it specific to the visual content and not generic?"
),
options={
"5": "Excellent: requires clear reasoning (comparison, calculation, or cross-element synthesis), unambiguous, has exactly one correct answer, well-matched to the visual element type",
"4": "Good: requires some reasoning, mostly unambiguous, well-grounded in the visual content with minor issues",
"3": "Adequate: reasonable question but either too easy (direct lookup), slightly ambiguous, or not well-matched to the visual element type",
"2": "Poor: trivial lookup, ambiguous wording, or asks about content not well-suited to the visual element type",
"1": "Very poor: unanswerable from the image, contains the answer, or is about irrelevant content",
"0": "Invalid: nonsensical, empty, or completely unrelated to the document",
},
)
visual_grounding_score = dd.Score(
name="Visual Grounding",
description=(
"Does the question target the actual visual elements (tables, charts, diagrams) in the image? "
"Does answering require examining the visual structure, not just reading plain text? "
"Is the question grounded in specific, identifiable elements?"
),
options={
"5": "Excellent: question directly targets specific visual elements (chart data, table cells, diagram nodes), answering requires visual perception and spatial understanding",
"4": "Good: question is grounded in visual content with clear references to identifiable elements, requires examining the visual structure",
"3": "Adequate: question relates to visual content but could partially be answered from text alone, or uses vague references ('the table' without specificity)",
"2": "Poor: question mostly targets plain text content, minimal visual grounding, could be answered without seeing the visual elements",
"1": "Very poor: question has no meaningful connection to the visual elements, purely text-based",
"0": "No grounding: question is about content not present in the image at all",
},
)
format_compliance_score = dd.Score(
name="Format Compliance",
description=(
"Does the answer match the expected format for its question type? "
"Check: multiple choice uses 'A. option' format, yes/no is exactly 'Yes'/'No', "
"percentages include '%', integers are digits only, lists are JSON arrays, "
"and the answer contains no reasoning traces or meta-commentary."
),
options={
"5": "Perfect compliance: answer format exactly matches the question type requirements, no extraneous content",
"4": "Good compliance: correct format with trivial deviations (e.g., extra whitespace, minor punctuation)",
"3": "Adequate: answer is usable but has format issues (e.g., missing units, prose instead of JSON array, includes 'Based on the image...')",
"2": "Poor: significant format violations (e.g., includes reasoning steps, wrong answer structure, contains <think> tags)",
"1": "Very poor: answer format is fundamentally wrong for the question type",
"0": "No compliance: answer is empty, garbled, or completely ignores format requirements",
},
)
training_signal_score = dd.Score(
name="Training Signal Strength",
description=(
"Overall, how valuable is this (question, answer) pair as training data for improving "
"VLM performance on document understanding? Consider: does it exercise visual perception, "
"require non-trivial reasoning, demand multi-page evidence gathering, and provide a clear learning signal?"
),
options={
"5": "Excellent: requires combining evidence from multiple pages, exercises visual perception + reasoning, non-trivial, clear correct answer. Would meaningfully improve a VLM on document QA benchmarks",
"4": "Strong: good training example with cross-page reasoning or strong single-page visual grounding and reasoning, minor issues don't significantly reduce value",
"3": "Moderate: decent training signal but answerable from a single page, or doesn't fully exercise multi-page or visual understanding",
"2": "Weak: limited training value -- trivial question, wrong answer, single-page lookup, or doesn't require visual reasoning",
"1": "Very weak: almost no training value -- incorrect, ambiguous, or completely text-based with no multi-page dependency",
"0": "No value: harmful to training -- wrong answer presented as correct, nonsensical, or would teach bad patterns",
},
)
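# Note: the Score names above must match the keys of FINAL_SCORE_WEIGHTS exactly,
# since compute_weighted_score() looks up each rubric by name in the judge output.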
# =============================================================================
# Config builder
# =============================================================================
def build_config(
seed_path: str = "seed.parquet",
model_alias: str = "frontier-judge-vlm",
model_id: str = "",
) -> dd.DataDesignerConfigBuilder:
"""Build the Data Designer config for frontier-model QA judging."""
config_builder = dd.DataDesignerConfigBuilder(
model_configs=[
dd.ModelConfig(
alias=model_alias,
model=model_id,
provider=PROVIDER_NAME,
inference_parameters=dd.ChatCompletionInferenceParams(
timeout=300,
max_tokens=40000,
max_parallel_requests=32,
),
),
]
)
config_builder.with_seed_dataset(
dd.LocalFileSeedSource(path=seed_path),
sampling_strategy=dd.SamplingStrategy.ORDERED,
)
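    # ORDERED sampling consumes seed rows in file order, so --num-records effectively
    # judges the first N rows of the upstream QA output.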
config_builder.add_column(
dd.LLMJudgeColumnConfig(
name="qa_quality_judge",
model_alias=model_alias,
prompt=PROMPT_JUDGE,
scores=[
answer_correctness_score,
question_quality_score,
visual_grounding_score,
format_compliance_score,
training_signal_score,
],
multi_modal_context=[
dd.ImageContext(
column_name="png_images_base64",
data_type=dd.ModalityDataType.BASE64,
image_format=dd.ImageFormat.PNG,
),
],
)
)
config_builder.add_column(
dd.CustomColumnConfig(
name="weighted_quality_score",
generator_function=compute_weighted_score,
)
)
return config_builder
# =============================================================================
# Dataset creation
# =============================================================================
def create_dataset(
config_builder: dd.DataDesignerConfigBuilder,
num_records: int,
endpoint: str = "",
api_key_env: str = "",
artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
"""Create the judged dataset."""
model_providers = [
dd.ModelProvider(
name=PROVIDER_NAME,
endpoint=endpoint,
provider_type="openai",
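            # Per --api-key-env, this is the *name* of the environment variable that
            # holds the API key, not the key value itself (see the module docstring).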
api_key=api_key_env,
),
]
data_designer = DataDesigner(
artifact_path=artifact_path,
model_providers=model_providers,
)
data_designer.set_run_config(
dd.RunConfig(disable_early_shutdown=True, progress_bar=True),
)
results = data_designer.create(config_builder, num_records=num_records, dataset_name="frontier_judge")
return results
# =============================================================================
# CLI entry point
# =============================================================================
if __name__ == "__main__":
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
parser.add_argument("--model-alias", type=str, default="frontier-judge-vlm")
parser.add_argument("--model-id", type=str, required=True, help="ID of the model to use for judging")
parser.add_argument("--endpoint", type=str, required=True, help="OpenAI-compatible API endpoint URL")
parser.add_argument(
"--api-key-env", type=str, required=True, help="Environment variable name containing the API key"
)
parser.add_argument("--num-records", type=int, default=5)
parser.add_argument("--artifact-path", type=str, default=None)
args = parser.parse_args()
config_builder = build_config(
seed_path=args.seed_path,
model_alias=args.model_alias,
model_id=args.model_id,
)
results = create_dataset(
config_builder,
num_records=args.num_records,
endpoint=args.endpoint,
api_key_env=args.api_key_env,
artifact_path=args.artifact_path,
)
print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")
results.load_analysis().to_report()
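    # Optional post-processing sketch (not part of this recipe): keep only rows whose
    # composite score clears a threshold. Assumes pandas is available in your
    # environment; the 0.8 cutoff is an arbitrary example, not a recommendation.
    #
    #   import pandas as pd
    #   df = pd.read_parquet(results.artifact_storage.final_dataset_path)
    #   df[df["weighted_quality_score"] >= 0.8].to_parquet("high_quality_qa.parquet")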