Model evaluation
Here are the commands you can run to reproduce our evaluation numbers.
Prepare evaluation data
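The benchmark data needs to be downloaded and converted before running the commands below. As a minimal sketch, assuming the standard ns prepare_data entrypoint from NeMo-Skills and that the dataset names match the benchmarks used in this section (check ns prepare_data --help for the names supported by your version):

# Assumed invocation: prepare the comp-math-24-25 and hle benchmark data.
ns prepare_data comp-math-24-25 hle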
For comp-math-24-25
Below are the evaluation commands for the three reasoning modes (low, medium, and high), with and without Python tool-integrated reasoning (TIR).
from nemo_skills.pipeline.cli import eval, wrap_arguments

cluster = 'slurm'
modes = ['low', 'medium', 'high']
max_length = 120000
model_path = '/workspace/final_sft_model'

for mode in modes:
    output_python = f"/workspace/final_sft_model/{mode}/with-python/"
    output_no_python = f"/workspace/final_sft_model/{mode}/no-python/"

    # With Python TIR evaluation
    eval(
        ctx=wrap_arguments(
            f"++inference.tokens_to_generate={max_length} "
            "++inference.temperature=1.0 "
            "++inference.top_p=1.0 "
            "++max_concurrent_requests=1024 "
            "++prompt_config=gpt-oss/math "
            "++code_tags=gpt-oss "
            "++code_execution=true "
            "++server.code_execution.max_code_executions=100 "
            "++server.enable_soft_fail=True "
            "++inference.endpoint_type=text "
            f"++chat_template_kwargs.reasoning_effort={mode} "
            "++chat_template_kwargs.builtin_tools=[python] "
        ),
        cluster=cluster,
        expname=f"with-python-comp-math-{mode}",
        model=model_path,
        server_type='vllm',
        server_gpus=8,
        benchmarks="comp-math-24-25:16",
        output_dir=output_python,
        server_args="--async-scheduling",
        with_sandbox=True,
    )

    # Without Python TIR evaluation
    eval(
        ctx=wrap_arguments(
            f"++inference.tokens_to_generate={max_length} "
            "++inference.temperature=1.0 "
            "++inference.top_p=1.0 "
            "++max_concurrent_requests=1024 "
            "++prompt_config=gpt-oss/math "
            "++inference.endpoint_type=text "
            "++server.enable_soft_fail=True "
            f"++chat_template_kwargs.reasoning_effort={mode} "
        ),
        cluster=cluster,
        expname=f"no-python-comp-math-{mode}",
        model=model_path,
        server_type='vllm',
        server_gpus=8,
        benchmarks="comp-math-24-25:16",
        output_dir=output_no_python,
        server_args="--async-scheduling",
    )
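After the jobs finish, the per-benchmark metrics can be aggregated from the output directories. The sketch below assumes the standard summarize_results entrypoint from NeMo-Skills; the directory corresponds to one of the runs above, and since each problem is sampled 16 times (the :16 suffix in the benchmark name) the reported metrics aggregate those samples.

# Assumed invocation: summarize_results scans the eval outputs under the given directory.
ns summarize_results /workspace/final_sft_model/high/with-python --cluster slurm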
For hle-math
For hle-math, it is necessary to run an LLM-as-a-judge step to get accurate evaluation results. We use the Qwen2.5-32B-Instruct model as the judge, which can be specified as follows.
from nemo_skills.pipeline.cli import eval, wrap_arguments

cluster = 'slurm'
num_gpus = 8
modes = ['low', 'medium', 'high']
num_chunks = {
    'high': 4,
    'medium': 0,
    'low': 0,
}
model_path = '/workspace/final_sft_model'

for mode in modes:
    output_python = f"/workspace/final_sft_model/{mode}/with-python/"
    output_no_python = f"/workspace/final_sft_model/{mode}/no-python/"

    # With Python TIR evaluation
    eval(
        ctx=wrap_arguments(
            "++inference.tokens_to_generate=120000 "
            "++inference.temperature=1.0 "
            "++inference.top_p=1.0 "
            "++max_concurrent_requests=1024 "
            "++prompt_config=gpt-oss/math "
            "++code_tags=gpt-oss "
            "++code_execution=true "
            "++server.code_execution.max_code_executions=100 "
            "++server.enable_soft_fail=True "
            "++inference.endpoint_type=text "
            f"++chat_template_kwargs.reasoning_effort={mode} "
            "++chat_template_kwargs.builtin_tools=[python] "
        ),
        cluster=cluster,
        expname=f"with-python-hle-math-{mode}",
        model=model_path,
        server_type='vllm',
        server_gpus=num_gpus,
        benchmarks="hle:4",
        num_chunks=num_chunks[mode],
        judge_model='/workspace/Qwen2.5-32B-Instruct',
        judge_server_type="sglang",
        judge_server_gpus=num_gpus,
        extra_judge_args="++inference.tokens_to_generate=4096 ++server.enable_soft_fail=True",
        split='math',
        output_dir=output_python,
        server_args="--async-scheduling",
        with_sandbox=True,
    )

    # Without Python TIR evaluation
    eval(
        ctx=wrap_arguments(
            "++inference.tokens_to_generate=120000 "
            "++inference.temperature=1.0 "
            "++inference.top_p=1.0 "
            "++max_concurrent_requests=1024 "
            "++prompt_config=gpt-oss/math "
            "++inference.endpoint_type=text "
            "++server.enable_soft_fail=True "
            f"++chat_template_kwargs.reasoning_effort={mode} "
        ),
        cluster=cluster,
        expname=f"no-python-hle-math-{mode}",
        model=model_path,
        server_type='vllm',
        server_gpus=num_gpus,
        benchmarks="hle:4",
        num_chunks=num_chunks[mode],
        judge_model='/workspace/Qwen2.5-32B-Instruct',
        judge_server_type="sglang",
        judge_server_gpus=num_gpus,
        extra_judge_args="++inference.tokens_to_generate=4096 ++server.enable_soft_fail=True",
        split='math',
        output_dir=output_no_python,
        server_args="--async-scheduling",
    )
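The aggregation step is the same as for comp-math-24-25, with one caveat: the hle numbers depend on the judge outputs, so the judge jobs have to finish before the metrics are meaningful. Assuming again the standard summarize_results entrypoint, pointing it at the same per-mode output directories should pick up the judged hle results alongside comp-math-24-25, for example:

# Assumed invocation; run only after the Qwen2.5-32B-Instruct judge jobs have completed.
ns summarize_results /workspace/final_sft_model/high/no-python --cluster slurm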