# Nemotron Super Text to SQL
# Dev Note: For a deep dive into the pipeline design, distractor injection
# strategy, quality waterfall analysis, and BIRD benchmark results, see
# "Engineering an Enterprise-Grade Text-to-SQL Dataset with NeMo Data Designer".
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "data-designer",
# ]
# ///
"""Nemotron Super Text-to-SQL Recipe: Distractors, Dirty Data, and Multi-Judge Scoring
Generate enterprise-grade text-to-SQL training data with dialect-specific SQL
(SQLite, MySQL, PostgreSQL), distractor table/column injection, dirty data
handling, conditional sampling, and multi-dimensional LLM judge scoring.
This recipe implements the pipeline used to produce 96.5k validated text-to-SQL
records for Nemotron Super v3 SFT training, which raised BIRD benchmark
execution accuracy from 26.77% to 41.80%.
Pipeline architecture:
┌─────────────────────────────────────────────────────────────────────────┐
│ STAGE 1: SEEDING & DIVERSIFICATION │
│ │
│ Domain Controls SQL Controls Prompt Controls │
│ ├─ industry_sector ├─ sql_complexity ├─ instruction_style │
│ ├─ topic (conditional) ├─ sql_concept ├─ linguistic_register │
│ ├─ data_quality_challenge├─ sql_task_type └─ politeness_level │
│ ├─ data_quality_concept │ (conditional) │
│ ├─ knowledge_dependency └─ sql_task_concept │
│ └─ knowledge_concept │
├─────────────────────────────────────────────────────────────────────────┤
│ STAGE 2: PROMPT GENERATION (LLM) │
│ Natural-language request grounded in metadata; no SQL jargon. │
│ Style adapts to instruction_style × register × politeness. │
├─────────────────────────────────────────────────────────────────────────┤
│ STAGE 3: SCHEMA + DATA GENERATION (LLM) │
│ Dialect-specific DDL + INSERT with 3-5 core tables, 1-2 distractor │
│ tables, 3-5 distractor columns per table, dirty data injection. │
├─────────────────────────────────────────────────────────────────────────┤
│ STAGE 4: SQL GENERATION (LLM) │
│ Dialect-specific SQL; ignores distractors; handles dirty data. │
├─────────────────────────────────────────────────────────────────────────┤
│ STAGE 5: VALIDATION + QUALITY SCORING │
│ │
│ Syntax Validator 5 LLM Judges (0-4 scores) │
│ ├─ SQL_SQLITE ├─ Prompt: naturalness, specificity, │
│ ├─ SQL_MYSQL │ absence of SQL jargon │
│ └─ SQL_POSTGRES ├─ SQL: relevance, readability, │
│ │ scalability, standards │
│ ├─ Context: relevance, readability, │
│ │ scalability, standards │
│ ├─ Data Quality: cleaning correctness, │
│ │ efficiency │
│ └─ Knowledge: application correctness, │
│ clarity of inference │
│ │
│ 15 score columns extracted for downstream filtering │
└─────────────────────────────────────────────────────────────────────────┘
Prerequisites:
- OPENAI_API_KEY environment variable for OpenAI provider model aliases (default model alias is "openai-text").
- NVIDIA_API_KEY environment variable for NVIDIA provider model aliases.
Run:
# Basic usage (generates 5 records by default, SQLite dialect)
uv run enterprise_text_to_sql.py
# Generate for a specific dialect
uv run enterprise_text_to_sql.py --dialect postgres
# For help message and available options
uv run enterprise_text_to_sql.py --help
"""
from pathlib import Path
import data_designer.config as dd
from data_designer.interface import DataDesigner, DatasetCreationResults
# Maps the CLI-facing dialect name to the data-designer code-language enum
# used for dialect-specific code generation and syntax validation.
SQL_DIALECTS = {
    "sqlite": dd.CodeLang.SQL_SQLITE,
    "mysql": dd.CodeLang.SQL_MYSQL,
    "postgres": dd.CodeLang.SQL_POSTGRES,
}
def build_config(model_alias: str, dialect: str = "sqlite") -> dd.DataDesignerConfigBuilder:
    """Build the full five-stage text-to-SQL Data Designer configuration.

    Stages: (1) categorical/subcategorical samplers for diversification,
    (2) natural-language prompt generation, (3) schema + sample-data generation
    with distractor injection, (4) dialect-specific SQL generation,
    (5) syntax validation plus five LLM judges, followed by extraction of
    15 flat score columns for downstream filtering.

    Args:
        model_alias: Model alias used by every LLM-backed column (generation and judging).
        dialect: One of the keys of ``SQL_DIALECTS`` ("sqlite", "mysql", "postgres").

    Returns:
        A ``DataDesignerConfigBuilder`` with all columns registered.

    Raises:
        KeyError: If ``dialect`` is not a supported SQL dialect.
    """
    code_lang = SQL_DIALECTS[dialect]
    config_builder = dd.DataDesignerConfigBuilder()

    # =========================================================================
    # Stage 1: Seeding & diversification
    # =========================================================================
    # Constant column recording which dialect this run targets.
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="sql_dialect",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(values=[dialect]),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="industry_sector",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=[
                    "Healthcare",
                    "Financial Services",
                    "Retail",
                    "Technology",
                    "Manufacturing",
                    "Aerospace",
                    "Energy",
                    "Telecommunications",
                    "Transportation",
                    "Education",
                ],
            ),
        )
    )
    # Topic is sampled conditionally on the industry sector drawn above.
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="topic",
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(
                category="industry_sector",
                values={
                    "Healthcare": [
                        "Electronic Health Records",
                        "Telemedicine Platforms",
                        "Clinical Trials",
                        "Patient Scheduling",
                        "Insurance Claims",
                    ],
                    "Financial Services": [
                        "Fraud Detection",
                        "Trading Systems",
                        "Risk Assessment",
                        "Portfolio Management",
                        "Regulatory Compliance",
                    ],
                    "Retail": [
                        "Inventory Management",
                        "Customer Segmentation",
                        "Pricing Optimization",
                        "Supply Chain",
                        "Returns Processing",
                    ],
                    "Technology": [
                        "Cloud Platforms",
                        "ML Pipelines",
                        "DevOps Tools",
                        "API Gateway Logs",
                        "User Analytics",
                    ],
                    "Manufacturing": [
                        "Quality Control",
                        "Production Scheduling",
                        "Equipment Maintenance",
                        "Supply Chain Optimization",
                        "Safety Compliance",
                    ],
                    "Aerospace": [
                        "Flight Operations",
                        "Satellite Systems",
                        "Parts Procurement",
                        "Maintenance Scheduling",
                        "Crew Management",
                    ],
                    "Energy": [
                        "Grid Management",
                        "Renewable Forecasting",
                        "Asset Monitoring",
                        "Trading and Markets",
                        "Regulatory Reporting",
                    ],
                    "Telecommunications": [
                        "Network Operations",
                        "Customer Billing",
                        "Service Provisioning",
                        "Call Detail Records",
                        "Churn Prediction",
                    ],
                    "Transportation": [
                        "Fleet Management",
                        "Route Optimization",
                        "Freight Tracking",
                        "Driver Scheduling",
                        "Maintenance Records",
                    ],
                    "Education": [
                        "Student Records",
                        "Course Enrollment",
                        "Learning Analytics",
                        "Financial Aid",
                        "Faculty Management",
                    ],
                },
            ),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="sql_complexity",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=["Beginner", "Intermediate", "Advanced"],
            ),
        )
    )
    # SQL concept is sampled conditionally on the complexity level.
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="sql_concept",
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(
                category="sql_complexity",
                values={
                    "Beginner": [
                        "Basic SELECT Statements",
                        "WHERE Clauses",
                        "Simple Aggregations",
                        "Basic JOINs",
                        "INSERT, UPDATE, DELETE",
                        "ORDER BY and LIMIT",
                    ],
                    "Intermediate": [
                        "Window Functions",
                        "Correlated Subqueries",
                        "Multiple JOINs with Aggregations",
                        "CASE Expressions",
                        "GROUP BY with HAVING",
                        "Set Operations (UNION, INTERSECT, EXCEPT)",
                    ],
                    "Advanced": [
                        "Recursive CTEs",
                        "Frame Clauses",
                        "Pivot/Unpivot Patterns",
                        "Complex Analytical Functions",
                        "Self-Joins for Hierarchies",
                        "Conditional Aggregation",
                    ],
                },
            ),
        )
    )
    # Task type is conditionally restricted so e.g. Beginner rows never draw
    # "Schema, DDL & Performance" tasks.
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="sql_task_type",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=[
                    "Foundational Queries & DML",
                    "Data Quality & Validation",
                    "Advanced Analytics & Windowing",
                    "Schema, DDL & Performance",
                ],
            ),
            conditional_params={
                "sql_complexity == 'Beginner'": dd.CategorySamplerParams(
                    values=["Foundational Queries & DML", "Data Quality & Validation"],
                ),
                "sql_complexity == 'Intermediate'": dd.CategorySamplerParams(
                    values=[
                        "Foundational Queries & DML",
                        "Data Quality & Validation",
                        "Advanced Analytics & Windowing",
                    ],
                ),
                "sql_complexity == 'Advanced'": dd.CategorySamplerParams(
                    values=[
                        "Advanced Analytics & Windowing",
                        "Schema, DDL & Performance",
                        "Data Quality & Validation",
                    ],
                ),
            },
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="sql_task_concept",
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(
                category="sql_task_type",
                values={
                    "Foundational Queries & DML": [
                        "Multi-table retrieval",
                        "Filtered aggregation",
                        "Conditional insert/update",
                        "Ranked retrieval",
                    ],
                    "Data Quality & Validation": [
                        "NULL detection and handling",
                        "Duplicate detection",
                        "Data type casting and cleanup",
                        "Referential integrity check",
                    ],
                    "Advanced Analytics & Windowing": [
                        "Running totals and moving averages",
                        "Ranking and percentile computation",
                        "Gap and island detection",
                        "Year-over-year comparison",
                    ],
                    "Schema, DDL & Performance": [
                        "Index-aware query optimization",
                        "Partitioned query design",
                        "Constraint-based validation",
                        "Schema migration pattern",
                    ],
                },
            ),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="data_quality_challenge",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=[
                    "Type Mismatches",
                    "Temporal Drift",
                    "Embedded Special Characters",
                    "Mixed Formats",
                    "NULL Handling",
                ],
            ),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="data_quality_concept",
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(
                category="data_quality_challenge",
                values={
                    "Type Mismatches": [
                        "Currency stored as text with symbols ($57,500)",
                        "Boolean stored as string (yes/no/true/false/1/0)",
                    ],
                    "Temporal Drift": [
                        "Dates stored as text in mixed formats (01-Jan-2023 vs 2023/01/01)",
                        "Timestamps with inconsistent timezone handling",
                    ],
                    "Embedded Special Characters": [
                        "Newlines or tabs inside text fields",
                        "Unicode or accented characters in names",
                    ],
                    "Mixed Formats": [
                        "Phone numbers in mixed formats (555-1234 vs (555) 123-4567)",
                        "Addresses with inconsistent abbreviations",
                    ],
                    "NULL Handling": [
                        "NULLs disguised as empty strings or sentinel values (-1, N/A)",
                        "Optional FKs with NULL references",
                    ],
                },
            ),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="knowledge_dependency",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=["Domain Knowledge", "Implicit Logic", "Common Sense"],
            ),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="knowledge_concept",
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(
                category="knowledge_dependency",
                values={
                    "Domain Knowledge": [
                        "Industry-specific business rules",
                        "Regulatory thresholds and compliance criteria",
                        "Domain-specific KPI definitions",
                    ],
                    "Implicit Logic": [
                        "Fiscal year vs calendar year reasoning",
                        "Business-day exclusion logic",
                        "Implied sort/filter criteria from context",
                    ],
                    "Common Sense": [
                        "Unit conversion (e.g., cents to dollars)",
                        "Age or duration calculation from dates",
                        "Geographic or hierarchical inference",
                    ],
                },
            ),
        )
    )
    # Prompt style controls: how the natural-language request is phrased.
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="instruction_style",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=["imperative", "declarative", "interrogative", "contextual", "abbreviated"],
            ),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="linguistic_register",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=["formal", "conversational", "technical", "academic", "direct"],
            ),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="politeness_level",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=["none", "minimal", "polite", "very polite"],
            ),
        )
    )

    # =========================================================================
    # Stage 2: Prompt generation
    # =========================================================================
    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="sql_prompt",
            model_alias=model_alias,
            system_prompt=PROMPT_GEN_SYSTEM_PROMPT,
            prompt=PROMPT_GEN_TEXT,
        )
    )

    # =========================================================================
    # Stage 3: Schema + data with distractor injection
    # =========================================================================
    config_builder.add_column(
        dd.LLMCodeColumnConfig(
            name="sql_context",
            model_alias=model_alias,
            system_prompt="You are an expert SQL database architect who designs well-structured, normalized schemas.",
            prompt=SCHEMA_GEN_PROMPTS[dialect],
            code_lang=code_lang,
        )
    )

    # =========================================================================
    # Stage 4: Dialect-specific SQL generation
    # =========================================================================
    config_builder.add_column(
        dd.LLMCodeColumnConfig(
            name="sql",
            model_alias=model_alias,
            system_prompt="You are an expert SQL programmer who solves problems with clean, efficient, and perfectly structured queries. Return only the final SQL.",
            prompt=SQL_GEN_PROMPTS[dialect],
            code_lang=code_lang,
        )
    )

    # =========================================================================
    # Stage 5: Validation + 5 LLM judges
    # =========================================================================
    config_builder.add_column(
        dd.ValidationColumnConfig(
            name="sql_validity_result",
            target_columns=["sql"],
            validator_type=dd.ValidatorType.CODE,
            validator_params=dd.CodeValidatorParams(code_lang=code_lang),
        )
    )
    config_builder.add_column(
        dd.LLMJudgeColumnConfig(
            name="sql_prompt_judge_result",
            model_alias=model_alias,
            prompt=PROMPT_JUDGE_TEXT,
            scores=PROMPT_SCORES,
        )
    )
    config_builder.add_column(
        dd.LLMJudgeColumnConfig(
            name="sql_judge_result",
            model_alias=model_alias,
            prompt=SQL_JUDGE_TEXT,
            scores=SQL_SCORES,
        )
    )
    config_builder.add_column(
        dd.LLMJudgeColumnConfig(
            name="sql_context_judge_result",
            model_alias=model_alias,
            prompt=CONTEXT_JUDGE_PROMPTS[dialect],
            scores=SQL_SCORES,
        )
    )
    config_builder.add_column(
        dd.LLMJudgeColumnConfig(
            name="sql_data_quality_judge_result",
            model_alias=model_alias,
            prompt=DATA_QUALITY_JUDGE_TEXT,
            scores=DATA_QUALITY_SCORES,
        )
    )
    config_builder.add_column(
        dd.LLMJudgeColumnConfig(
            name="sql_knowledge_judge_result",
            model_alias=model_alias,
            prompt=KNOWLEDGE_JUDGE_TEXT,
            scores=KNOWLEDGE_SCORES,
        )
    )

    # =========================================================================
    # Score extraction (15 flat columns for downstream filtering)
    # =========================================================================
    for judge_name, rubric_names in SCORE_EXTRACTION_MAP:
        # e.g. "sql_context_judge_result" -> "context"; "sql_judge_result" -> "sql".
        prefix = judge_name.replace("_judge_result", "").replace("sql_", "")
        for rubric in rubric_names:
            config_builder.add_column(
                dd.ExpressionColumnConfig(
                    name=f"{prefix}_{rubric}_score",
                    # BUGFIX: use an explicit `is not none` test instead of
                    # truthiness. Rubric scores range 0-4, and a legitimate
                    # score of 0 is falsy, so the old expression silently
                    # replaced it with ' ', corrupting downstream filtering.
                    expr=(
                        f"{{{{ {judge_name}.{rubric}.score"
                        f" if {judge_name}.{rubric}.score is not none else ' ' }}}}"
                    ),
                )
            )
    return config_builder
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the configured Data Designer pipeline.

    Args:
        config_builder: Fully-populated configuration (see ``build_config``).
        num_records: Number of records to generate.
        artifact_path: Optional directory for pipeline artifacts; the
            Data Designer default is used when ``None``.

    Returns:
        The ``DatasetCreationResults`` produced by the run.
    """
    designer = DataDesigner(artifact_path=artifact_path)
    return designer.create(config_builder, num_records=num_records)
# =============================================================================
# Prompt Templates
# =============================================================================
# Stage 2 system prompt: constrains the LLM to write human-like business
# requests, adapting to the sampled style/register/politeness controls.
PROMPT_GEN_SYSTEM_PROMPT = """\
You write natural-language requests to a data assistant. You adapt your writing \
style based on the specified instruction style, linguistic register, and politeness level. \
Requests avoid meta-instructions, code, or explicit SQL jargon unless it's common-sense terminology.\
"""

# Stage 2 user prompt (Jinja template). The {{ ... }} placeholders are filled
# from the Stage 1 sampler columns at generation time.
PROMPT_GEN_TEXT = """\
Write a single-sentence, natural-language request to a data assistant or agent to solve a specific SQL problem.
## Style Requirements
* **Instruction Style:** Use a {{ instruction_style }} style.
* **Linguistic Register:** Use a {{ linguistic_register }} register.
* **Politeness Level:** Apply {{ politeness_level }} politeness.
## Content Constraints
* Do NOT use explicit SQL keywords or technical jargon. Describe the **business problem**.
* Keep the intent specific; mention outputs, filters, and aggregations clearly.
* Do not include code, backticks, or any fenced blocks.
* Realistic Thresholds: The sample data is small (5-10 rows per table). Keep thresholds small.
* Relative Time: It's okay to say "recent", "last year", "past few months" instead of exact dates.
## Grounding Requirements
* The request must pertain to the {{ industry_sector }} sector and {{ topic }} topic.
* The request must implicitly require SQL at the {{ sql_complexity }} level involving {{ sql_concept }}.
* The request must require a {{ sql_task_type }} task, specifically "{{ sql_task_concept }}".
* The problem must implicitly require handling "{{ data_quality_concept }}".
* The problem must implicitly require "{{ knowledge_concept }}".\
"""
_SCHEMA_GEN_TEMPLATE = """\
Generate {dialect_label} DDL and sample data for tables relevant to the instruction.
Instruction: {{{{ sql_prompt }}}}
Requirements:
* Scope: Provide only CREATE TABLE and INSERT statements.
* Integrity: Define PRIMARY KEYs and FOREIGN KEYs with consistent data types. Use snake_case names.
* **Section Headers (REQUIRED):**
- `-- Core Tables`
- `-- Distractor Tables`
- `-- Sample Data for Core Tables`
- `-- Sample Data for Distractor Tables`
- Do NOT include any other comments.
* Coverage: Include 3-5 core tables for {{{{ industry_sector }}}}/{{{{ topic }}}} connected via FKs.
* **Distractor Tables:** Include 1-2 additional tables plausible for the domain but NOT needed \
for the instruction. Each with FK links to core tables and 5-10 rows of realistic data.
* Realism: Include 3-5 distractor columns per table (created_at, updated_by, description, is_active).
* **Dirty Data:** Introduce "{{{{ data_quality_concept }}}}" issues. Dirty columns MUST be {text_type}.
* Sample Data: 5-10 realistic rows per table. Mix clean and dirty rows.
* **No Data Comments:** Do NOT explain which rows are dirty.
* **Determinism:** No NOW()/CURRENT_DATE in INSERT statements. Use explicit literal dates.
* Executability: End each statement with a semicolon. Use {dialect_label} syntax.
* Do not include meta-instructions or reasoning traces.\
"""
SCHEMA_GEN_PROMPTS = {
"sqlite": _SCHEMA_GEN_TEMPLATE.format(dialect_label="SQLite", text_type="TEXT"),
"mysql": _SCHEMA_GEN_TEMPLATE.format(dialect_label="MySQL", text_type="VARCHAR or TEXT"),
"postgres": _SCHEMA_GEN_TEMPLATE.format(dialect_label="PostgreSQL", text_type="TEXT or VARCHAR"),
}
# Stage 4 base template. Like the schema template, it passes through
# str.format() (filling {dialect_label}), so Jinja placeholders use
# quadruple braces {{{{ ... }}}}.
_SQL_GEN_BASE = """\
Write {dialect_label} SQL for the instruction using only the provided database context.
Instruction: {{{{ sql_prompt }}}}
Database Context:
{{{{ sql_context }}}}
Requirements:
* Validity: You are strictly forbidden from referencing any table or column not in the context.
* Handle Data Quality: Correctly handle "{{{{ data_quality_concept }}}}" using appropriate cleaning functions.
* Apply Knowledge: Apply "{{{{ knowledge_concept }}}}" even if it requires inferring unstated logic.
* Grounding: The SQL must demonstrate {{{{ sql_concept }}}} and {{{{ sql_task_type }}}}.
* Precision: Avoid SELECT *. Explicitly list columns; alias computed columns descriptively.
* Alignment: Match the {{{{ sql_complexity }}}} level.
* **Relative Time Anchoring:** Do NOT use CURRENT_DATE or real-time functions. Anchor to max date in data.
* **No Unasked Joins:** Do NOT join distractor tables or select distractor columns.
* **Logic:** Prefer CTEs to clean/normalize first, then compute/aggregate.
* Comments: Do not include inline comments.
* Formatting: Terminate with semicolons.\
"""

# Dialect-specific constraints appended to the base prompt below.
_SQLITE_EXTRAS = """
* Use SQLite syntax: strftime for dates, LIMIT instead of TOP.
* Do NOT use LATERAL joins or REGEXP_REPLACE. Use REPLACE()/SUBSTR() for cleaning.
* Date Parsing: Normalize with REPLACE(date_col, '/', '-') inside date()/strftime().\
"""
_MYSQL_EXTRAS = """
* Use MySQL syntax: DATE_ADD, DATEDIFF for dates, LIMIT for pagination, backticks for identifiers.
* Do NOT use REGEXP_REPLACE or CONVERT_TZ. Use REPLACE(), TRIM(), SUBSTRING().\
"""
_POSTGRES_EXTRAS = """
* Use PostgreSQL syntax: :: for casting, ILIKE for case-insensitive matching, LIMIT and OFFSET.
* regexp_replace is available for cleaning.\
"""

# Pre-rendered per-dialect SQL-generation prompts, keyed like SQL_DIALECTS.
SQL_GEN_PROMPTS = {
    "sqlite": _SQL_GEN_BASE.format(dialect_label="SQLite") + _SQLITE_EXTRAS,
    "mysql": _SQL_GEN_BASE.format(dialect_label="MySQL") + _MYSQL_EXTRAS,
    "postgres": _SQL_GEN_BASE.format(dialect_label="PostgreSQL") + _POSTGRES_EXTRAS,
}
# =============================================================================
# Judge Prompts
# =============================================================================
# Judge 1: grades the natural-language prompt (paired with PROMPT_SCORES).
PROMPT_JUDGE_TEXT = """\
You are an expert product analyst who writes and reviews natural, human-like data requests.
Evaluate the **NL Prompt** quality.
## NL Prompt
{{ sql_prompt }}\
"""

# Judge 2: grades the generated SQL against prompt + context (SQL_SCORES).
SQL_JUDGE_TEXT = """\
You are a SQL data expert. Grade the quality of **Generated SQL** based on the prompt and context.
Natural Language Prompt:
{{ sql_prompt }}
Database Context:
{{ sql_context }}
Generated SQL:
{{ sql }}
When scoring, pay special attention to:
- **Minimal Table Usage:** Penalize queries that unnecessarily join distractor tables.
- **Minimal Column Usage:** Distractor columns should be ignored unless explicitly needed.
- **Correctness:** The query must produce the correct result.
- **Efficiency:** Prefer simple, readable solutions over unnecessarily complex ones.\
"""

# Judge 3 template: grades the generated schema/data. Rendered per dialect
# via str.format(), hence the quadruple-brace Jinja placeholders.
_CONTEXT_JUDGE_TEMPLATE = """\
You are a SQL database architect. Evaluate the **Generated Database Context** quality.
Natural Language Prompt:
{{{{ sql_prompt }}}}
Generated Database Context ({dialect}):
{{{{ sql_context }}}}
When scoring, verify:
- **Sufficient Tables:** 3-5 core tables plus 1-2 distractor tables. Penalize bare-minimum schemas.
- **Distractor Columns:** Each table should include realistic columns beyond those needed for the query.
- **Realistic Relationships:** Appropriate PK/FK relationships. Distractor tables should have logical FK links.
- **Sample Data Quality:** Realistic, varied INSERT data.
- **Executability:** Syntactically correct for {dialect}.\
"""
CONTEXT_JUDGE_PROMPTS = {
    "sqlite": _CONTEXT_JUDGE_TEMPLATE.format(dialect="SQLite"),
    "mysql": _CONTEXT_JUDGE_TEMPLATE.format(dialect="MySQL"),
    "postgres": _CONTEXT_JUDGE_TEMPLATE.format(dialect="PostgreSQL"),
}

# Judge 4: grades the SQL's dirty-data handling (DATA_QUALITY_SCORES).
DATA_QUALITY_JUDGE_TEXT = """\
You are an expert in data quality and validation. Score the SQL's handling of messy data.
## Natural Language Prompt
{{ sql_prompt }}
## Data Quality Challenge
{{ data_quality_challenge }} / {{ data_quality_concept }}
## Database Context
{{ sql_context }}
## Generated SQL
{{ sql }}\
"""

# Judge 5: grades the SQL's use of implicit business knowledge (KNOWLEDGE_SCORES).
KNOWLEDGE_JUDGE_TEXT = """\
You are an expert in business intelligence and semantic interpretation. \
Score the SQL's application of implicit business knowledge.
## Natural Language Prompt
{{ sql_prompt }}
## Knowledge Dependency
{{ knowledge_dependency }} / {{ knowledge_concept }}
## Database Context
{{ sql_context }}
## Generated SQL
{{ sql }}\
"""
# =============================================================================
# Scoring Rubrics (5 judges, 15 dimensions)
# =============================================================================
# Rubric for both the SQL judge and the context judge (4 dimensions, 0-4 each).
SQL_SCORES = [
    dd.Score(
        name="relevance",
        description="Uses only necessary tables/columns; ignores distractors",
        options={
            "4": "Perfectly meets all requirements; uses only strictly necessary tables and columns.",
            "3": "Meets most requirements with minor deviations; may include a slightly unnecessary column.",
            "2": "Moderate deviation; joins an unnecessary table or selects several irrelevant columns.",
            "1": "Significant deviations; multiple unnecessary table joins or largely irrelevant output.",
            "0": "Does not adhere to instructions; query is unrelated or joins many unnecessary tables.",
        },
    ),
    dd.Score(
        name="readability",
        description="Formatting, clarity, and maintainability",
        options={
            "4": "Excellently formatted, meaningful aliases, high readability and ease of maintenance.",
            "3": "Well-formatted, relatively easy to understand; uses aliases with some organization.",
            "2": "Somewhat readable with basic formatting but needs improvement.",
            "1": "Minimal formatting, hard to understand; lacks meaningful names.",
            "0": "Unreadable, no attempt at formatting.",
        },
    ),
    dd.Score(
        name="scalability",
        description="Scales well with larger datasets; avoids inefficient patterns",
        options={
            "4": "Highly scalable; avoids Cartesian joins and unnecessary table joins.",
            "3": "Scales well; minor areas for optimization such as an extra join.",
            "2": "Moderately scalable; includes unnecessary joins or suboptimal access patterns.",
            "1": "Poor scalability; joins multiple unnecessary tables or uses inefficient patterns.",
            "0": "Does not scale; overlooks fundamental scalability practices.",
        },
    ),
    dd.Score(
        name="standards",
        description="Compliance with SQL standards and best practices",
        options={
            "4": "Strictly adheres to SQL standards and best practices.",
            "3": "Closely follows SQL standards and many best practices.",
            "2": "Generally follows standards but has room for better alignment.",
            "1": "Loosely follows standards, with several deviations.",
            "0": "Does not follow standards; uses deprecated or non-standard syntax.",
        },
    ),
]

# Rubric for the NL-prompt judge (3 dimensions, 0-4 each).
PROMPT_SCORES = [
    dd.Score(
        name="naturalness_of_wording",
        description="How human-like, colloquial, and non-robotic the phrasing is",
        options={
            "4": "Reads like a native speaker; concise, fluent, and natural.",
            "3": "Generally natural; minor stiffness or formalism.",
            "2": "Somewhat stilted or templated; noticeable artifacts.",
            "1": "Robotic or awkward; obviously machine-generated.",
            "0": "Unnatural and hard to read.",
        },
    ),
    dd.Score(
        name="specificity_and_clarity",
        description="Is the request specific about outputs, filters, and operations?",
        options={
            "4": "Very specific and clear outputs/filters/aggregations; minimal ambiguity.",
            "3": "Mostly specific; minor ambiguity remains.",
            "2": "Partially specific; key details are missing.",
            "1": "Vague; unclear what should be returned or computed.",
            "0": "Completely ambiguous.",
        },
    ),
    dd.Score(
        name="absence_of_sql_jargon",
        description="Avoids explicit SQL terms, table/column names, or schema hints",
        options={
            "4": "No SQL jargon at all; entirely tool-agnostic phrasing.",
            "3": "Tiny hints but no explicit SQL or schema leakage.",
            "2": "Occasional SQL terms or schema leakage present.",
            "1": "Frequent SQL jargon and schema references.",
            "0": "Reads like a SQL spec; heavy jargon.",
        },
    ),
]

# Rubric for the data-quality judge (2 dimensions, 0-4 each).
DATA_QUALITY_SCORES = [
    dd.Score(
        name="correctness_of_cleaning_logic",
        description="Does the query correctly and fully clean the messy data?",
        options={
            "4": "Flawless cleaning logic; handles all transformations and edge cases perfectly.",
            "3": "Correctly cleans data for most cases but might miss minor edge cases.",
            "2": "Attempts to clean data, but logic is only partially correct.",
            "1": "Cleaning logic is fundamentally flawed.",
            "0": "No attempt to clean the data.",
        },
    ),
    dd.Score(
        name="efficiency_of_cleaning_method",
        description="Uses efficient, standard functions for cleaning?",
        options={
            "4": "Highly efficient, optimal SQL functions for the task.",
            "3": "Correct and standard functions, but a more performant approach exists.",
            "2": "Convoluted or inefficient method where a simpler one would suffice.",
            "1": "Very inefficient or non-standard method that would scale poorly.",
            "0": "Completely inappropriate or non-functional method.",
        },
    ),
]

# Rubric for the knowledge judge (2 dimensions, 0-4 each).
KNOWLEDGE_SCORES = [
    dd.Score(
        name="correctness_of_knowledge_application",
        description="Does the query correctly translate the implicit knowledge into SQL logic?",
        options={
            "4": "Flawlessly translates abstract concept into precise, correct SQL logic.",
            "3": "Logic correctly reflects the knowledge concept but could be expressed more directly.",
            "2": "Partially applies the logic, misinterpreting some nuances.",
            "1": "Fundamentally misinterprets the knowledge concept.",
            "0": "No attempt to apply the required knowledge.",
        },
    ),
    dd.Score(
        name="clarity_of_inference",
        description="Is the applied logic clear and self-explanatory within the query?",
        options={
            "4": "Logic is immediately obvious through well-chosen aliases, CTEs, or clear filtering.",
            "3": "Logic is correct but requires some inspection to understand.",
            "2": "Logic is technically correct but obscure, using magic numbers or hard-to-read conditions.",
            "1": "Logic is convoluted and appears incorrect.",
            "0": "Query is completely opaque, with no discernible link to the required knowledge.",
        },
    ),
]

# Drives the score-extraction loop in build_config: (judge column name,
# rubric names to flatten). 4 + 4 + 3 + 2 + 2 = 15 flat score columns total.
SCORE_EXTRACTION_MAP = [
    ("sql_judge_result", ["relevance", "readability", "scalability", "standards"]),
    ("sql_context_judge_result", ["relevance", "readability", "scalability", "standards"]),
    ("sql_prompt_judge_result", ["naturalness_of_wording", "specificity_and_clarity", "absence_of_sql_jargon"]),
    ("sql_data_quality_judge_result", ["correctness_of_cleaning_logic", "efficiency_of_cleaning_method"]),
    ("sql_knowledge_judge_result", ["correctness_of_knowledge_application", "clarity_of_inference"]),
]
if __name__ == "__main__":
    from argparse import ArgumentParser

    # CLI entry point: build the pipeline config, generate the requested
    # number of records, then print the output location and show the report.
    cli = ArgumentParser()
    cli.add_argument("--model-alias", type=str, default="openai-text")
    cli.add_argument("--num-records", type=int, default=5)
    cli.add_argument("--artifact-path", type=str, default=None)
    cli.add_argument(
        "--dialect",
        type=str,
        default="sqlite",
        choices=list(SQL_DIALECTS.keys()),
        help="SQL dialect to generate for (default: sqlite)",
    )
    opts = cli.parse_args()

    builder = build_config(model_alias=opts.model_alias, dialect=opts.dialect)
    run_results = create_dataset(
        builder, num_records=opts.num_records, artifact_path=opts.artifact_path
    )
    print(f"Dataset saved to: {run_results.artifact_storage.final_dataset_path}")
    run_results.load_analysis().to_report()