from pathlib import Path
from data_designer.essentials import (
CategorySamplerParams,
CodeLang,
CodeValidatorParams,
DataDesigner,
DataDesignerConfigBuilder,
LLMCodeColumnConfig,
LLMJudgeColumnConfig,
LLMTextColumnConfig,
SamplerColumnConfig,
SamplerType,
Score,
SubcategorySamplerParams,
ValidationColumnConfig,
ValidatorType,
)
from data_designer.interface.results import DatasetCreationResults

# Prompt template for the LLM judge column; Jinja placeholders are rendered
# per record before the judge model is called.
TEXT_TO_PYTHON_JUDGE_TEMPLATE = """\
You are an expert in Python programming, with specialized knowledge in software engineering, data science, and algorithmic problem-solving.
You think about potential flaws and errors in the code. You are a tough critic, but a fair one.
Take a deep breath and use the Python Code Quality Rubric below to score the **Generated Python Code** based on the INSTRUCTIONS.

#### INSTRUCTIONS
The Generated Python Code should be a valid response to the Natural Language Prompt below.

Natural Language Prompt:
{{ instruction }}

Generated Python Code:
{{ code_implementation }}
"""

# Rubric for the LLM judge; each Score defines a 0-4 scale.
python_scoring = [
    Score(
        name="Relevance",
        description="Adherence to the INSTRUCTIONS",
        options={
            4: "Perfectly meets all specified requirements.",
            3: "Meets most requirements with minor deviations.",
            2: "Moderate deviation from the instructions.",
            1: "Significant deviations from the instructions.",
            0: "Does not adhere to the instructions.",
        },
    ),
    Score(
        name="Pythonic",
        description="Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)",
        options={
            4: "The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.",
            3: "The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.",
            2: "The code generally follows Python conventions but has room for better alignment with Pythonic practices.",
            1: "The code loosely follows Python conventions, with several deviations from best practices.",
            0: "The code does not follow Python conventions or best practices, using non-Pythonic approaches.",
        },
    ),
    Score(
        name="Readability",
        description="Readability and Maintainability (Is the Python code easy to understand and maintain?)",
        options={
            4: (
                "The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, "
                "ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings follow the Google docstring format."
            ),
            3: "The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.",
            2: "The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.",
            1: "The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.",
            0: "The code is unreadable, with no attempt at formatting or description.",
        },
    ),
    Score(
        name="Efficiency",
        description="Efficiency and Performance (Is the code optimized for performance?)",
        options={
            4: "The solution is highly efficient, using appropriate data structures and algorithms; avoids unnecessary computations and optimizes for both time and space complexity.",
            3: "The solution is efficient, with good use of Python's built-in functions and libraries; minor areas for optimization.",
            2: "The solution is moderately efficient, but misses some opportunities for optimization; uses some inefficient patterns.",
            1: "The solution shows poor efficiency, with notable performance issues; lacks effective optimization techniques.",
            0: "The solution is highly inefficient; overlooks fundamental optimization practices, resulting in significant performance issues.",
        },
    ),
]

def build_config(model_alias: str) -> DataDesignerConfigBuilder:
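    """Build the Data Designer config for a text-to-Python code dataset.

    Samplers seed each record with a domain, topic, complexity, concept,
    and instruction phrasing; LLM columns then generate an instruction and
    its Python implementation, which are scored by an LLM judge and checked
    by a code validator.
    """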
config_builder = DataDesignerConfigBuilder()
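
    # Sampler columns draw values without calling an LLM; the prompts below
    # reference them through Jinja-style placeholders to diversify records.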
config_builder.add_column(
SamplerColumnConfig(
name="industry_sector",
sampler_type=SamplerType.CATEGORY,
params=CategorySamplerParams(
values=[
"Healthcare",
"Finance",
"Technology",
],
),
),
)
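
    # A subcategory sampler conditions its choices on a parent category
    # column (here, the "industry_sector" value sampled above).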
config_builder.add_column(
SamplerColumnConfig(
name="topic",
sampler_type=SamplerType.SUBCATEGORY,
params=SubcategorySamplerParams(
category="industry_sector",
values={
"Healthcare": [
"Electronic Health Records (EHR) Systems",
"Telemedicine Platforms",
"AI-Powered Diagnostic Tools",
],
"Finance": [
"Fraud Detection Software",
"Automated Trading Systems",
"Personal Finance Apps",
],
"Technology": [
"Cloud Computing Platforms",
"Artificial Intelligence and Machine Learning Platforms",
"DevOps and CI/CD Tools",
],
},
),
),
)
config_builder.add_column(
SamplerColumnConfig(
name="code_complexity",
sampler_type=SamplerType.CATEGORY,
params=CategorySamplerParams(
values=[
"Beginner",
"Intermediate",
"Advanced",
],
),
),
)
config_builder.add_column(
SamplerColumnConfig(
name="code_concept",
sampler_type=SamplerType.SUBCATEGORY,
params=SubcategorySamplerParams(
category="code_complexity",
values={
"Beginner": [
"Variables",
"Data Types",
"Functions",
"Loops",
"Classes",
],
"Intermediate": [
"List Comprehensions",
"Object-oriented programming",
"Lambda Functions",
"Web frameworks",
"Pandas",
],
"Advanced": [
"Multithreading",
"Context Managers",
"Generators",
],
},
),
),
)
config_builder.add_column(
SamplerColumnConfig(
name="instruction_phrase",
sampler_type=SamplerType.CATEGORY,
params=CategorySamplerParams(
values=[
"Write a function that",
"Create a class that",
"Implement a script",
"Can you create a function",
"Develop a module that",
],
),
),
)
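
    # LLM-generated columns: placeholders such as {{ industry_sector }} are
    # rendered from the sampled columns before each prompt is sent.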
config_builder.add_column(
LLMTextColumnConfig(
name="instruction",
model_alias=model_alias,
system_prompt=("You are an expert at generating clear and specific programming tasks."),
prompt=(
"Generate an instruction to create Python code that solves a specific problem.\n"
"Each instruction should begin with one of the following phrases: {{ instruction_phrase }}.\n\n"
"Important Guidelines:\n"
"* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n"
"* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n"
"* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to understand the requirements without being overly verbose.\n"
"* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n"
),
)
)
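
    # Generate the Python implementation conditioned on the instruction above.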
config_builder.add_column(
LLMCodeColumnConfig(
name="code_implementation",
model_alias=model_alias,
code_lang=CodeLang.PYTHON,
system_prompt=(
"You are an expert Python programmer who writes clean, efficient, and well-documented code."
),
prompt=(
"Write Python code for the following instruction:\n"
"Instruction: {{ instruction }}\n\n"
"Important Guidelines:\n"
"* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n"
"* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n"
"* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n"
"* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{code_concept}}.\n"
),
)
)
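
    # LLM-as-judge column: scores each instruction/code pair against the
    # python_scoring rubric defined at the top of the file.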
config_builder.add_column(
LLMJudgeColumnConfig(
name="code_judge_result",
model_alias=model_alias,
prompt=TEXT_TO_PYTHON_JUDGE_TEMPLATE,
scores=python_scoring,
)
)
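
    # Static validation column: checks the generated Python with the
    # library's code validator, independent of the judge's scores.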
config_builder.add_column(
ValidationColumnConfig(
name="code_validity_result",
validator_type=ValidatorType.CODE,
target_columns=["code_implementation"],
validator_params=CodeValidatorParams(
code_lang=CodeLang.PYTHON,
),
batch_size=100,
)
)
return config_builder


def create_dataset(
config_builder: DataDesignerConfigBuilder,
num_records: int,
artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
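    """Generate `num_records` records from the given configuration.

    The returned results expose the final dataset path and an analysis
    that can be rendered as a report (see the __main__ block below).
    """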
data_designer = DataDesigner(artifact_path=artifact_path)
results = data_designer.create(config_builder, num_records=num_records)
return results


if __name__ == "__main__":
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument("--model-alias", type=str, default="openai-text")
parser.add_argument("--num-records", type=int, default=5)
parser.add_argument("--artifact-path", type=str, default=None)
args = parser.parse_args()
config_builder = build_config(model_alias=args.model_alias)
results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path)
print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")
results.load_analysis().to_report()
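
    # Example invocation (filename and values are illustrative):
    #   python text_to_python_dataset.py --model-alias openai-text --num-records 100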