From e450a6fbc4aeca256ef7979d618bff43f7bbc153 Mon Sep 17 00:00:00 2001 From: Jingze Shi Date: Fri, 31 Jan 2025 19:41:53 +0800 Subject: [PATCH 001/137] Recipes for optimzing training scripts (#120) * Add recipe configs to optimize scripts (#73) * remove small models * Add README for recipes * Add README for recipes * Attempt to resolve conflicts * Optimize src scripts * Update recipe of DeepSeek-R1-Distill-Qwen-7B * Update recipe of Qwen2.5-1.5B * Updated recipe readme for qwen * Update training command for recipes * Update README.md Co-authored-by: Kashif Rasul * Update preprocessing_num_workers from 36 to 8 * Add small language model recipes for quickly verify R1 * Fix src code quality * Add back the Slurm job command * Remove recipe of doge * Fix torch_dtype is not used * fix grpo yaml * fix grpo yaml * fix deprecation warning * fix config folder location * Remove duplicate variables in grpo.py * Update README.md Co-authored-by: lewtun * Update README.md Co-authored-by: lewtun * Update recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml Co-authored-by: lewtun --------- Co-authored-by: Kashif Rasul Co-authored-by: lewtun --- README.md | 34 +---- recipes/README.md | 1 + .../accelerate_configs}/ddp.yaml | 0 .../accelerate_configs}/zero2.yaml | 0 .../accelerate_configs}/zero3.yaml | 0 .../grpo/config_full.yaml | 45 ++++++ .../grpo/confg_full.yaml | 45 ++++++ .../sft/config_full.yaml | 41 ++++++ recipes/qwen/README.md | 24 ++++ slurm/grpo.slurm | 2 +- slurm/sft.slurm | 2 +- src/open_r1/grpo.py | 106 +++++++++++++- src/open_r1/sft.py | 136 +++++++++++++++--- 13 files changed, 379 insertions(+), 57 deletions(-) create mode 100644 recipes/README.md rename {configs => recipes/accelerate_configs}/ddp.yaml (100%) rename {configs => recipes/accelerate_configs}/zero2.yaml (100%) rename {configs => recipes/accelerate_configs}/zero3.yaml (100%) create mode 100644 recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_full.yaml create mode 100644 recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml create mode 100644 recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml create mode 100644 recipes/qwen/README.md diff --git a/README.md b/README.md index 43deda9e4..ebb725e28 100644 --- a/README.md +++ b/README.md @@ -100,22 +100,7 @@ We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). To To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k), run: ```shell -accelerate launch --config_file=configs/zero3.yaml src/open_r1/sft.py \ - --model_name_or_path Qwen/Qwen2.5-Math-1.5B-Instruct \ - --dataset_name HuggingFaceH4/Bespoke-Stratos-17k \ - --learning_rate 2.0e-5 \ - --num_train_epochs 1 \ - --packing \ - --max_seq_length 4096 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --gradient_checkpointing \ - --bf16 \ - --logging_steps 5 \ - --eval_strategy steps \ - --eval_steps 100 \ - --output_dir data/Qwen2.5-1.5B-Open-R1-Distill +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml ``` To launch a Slurm job, run: @@ -128,22 +113,10 @@ Here `{model}` and `{dataset}` refer to the model and dataset IDs on the Hugging ### GRPO -To train via the GRPO trainer we will use the strategy of using one node to run vLLM for faster generation and the remaining nodes for training. 
Thus we will use the `configs/zero3.yaml` config and then overwrite the `num_processes=7` for the 8 GPU training scenario. Thus all we need to do is: +To train via the GRPO trainer, we use one GPU to run vLLM for faster generation and the remaining GPUs for training. For example, one a node with 8 GPUs, use the `recipes/accelerate_configs/zero3.yaml` config and then overwrite `num_processes` to run on 7 devices: ```shell -accelerate launch --config_file configs/zero3.yaml --num_processes=7 src/open_r1/grpo.py \ - --output_dir DeepSeek-R1-Distill-Qwen-7B-GRPO \ - --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ - --dataset_name AI-MO/NuminaMath-TIR \ - --max_prompt_length 512 \ - --max_completion_length 1024 \ - --per_device_train_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --logging_steps 10 \ - --bf16 \ - --use_vllm \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml --num_processes=7 src/open_r1/grpo.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml ``` To launch a Slurm job, run: @@ -152,6 +125,7 @@ To launch a Slurm job, run: sbatch --output=/path/to/logs/%x-%j.out --err=/path/to/logs/%x-%j.err slurm/grpo.slurm {model} {dataset} {accelerator} ``` +You can find more model configurations in the [recipes](./recipes). ## Evaluating models diff --git a/recipes/README.md b/recipes/README.md new file mode 100644 index 000000000..a42ab27a1 --- /dev/null +++ b/recipes/README.md @@ -0,0 +1 @@ +**TODO:** we will add more recipes in the future, just like alignment-handbook, this is the purpose of adding recipes to this project. \ No newline at end of file diff --git a/configs/ddp.yaml b/recipes/accelerate_configs/ddp.yaml similarity index 100% rename from configs/ddp.yaml rename to recipes/accelerate_configs/ddp.yaml diff --git a/configs/zero2.yaml b/recipes/accelerate_configs/zero2.yaml similarity index 100% rename from configs/zero2.yaml rename to recipes/accelerate_configs/zero2.yaml diff --git a/configs/zero3.yaml b/recipes/accelerate_configs/zero3.yaml similarity index 100% rename from configs/zero3.yaml rename to recipes/accelerate_configs/zero3.yaml diff --git a/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_full.yaml b/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_full.yaml new file mode 100644 index 000000000..a5cfcb484 --- /dev/null +++ b/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_full.yaml @@ -0,0 +1,45 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +model_revision: main +torch_dtype: bfloat16 + +# Data training arguments +dataset_name: AI-MO/NuminaMath-TIR +dataset_configs: +- all +# Num processes is less by 1 as vLLM is using 1 GPU +num_processes: 7 + +# GRPO trainer config +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: true +eval_strategy: steps +eval_steps: 100 +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: DeepSeek-R1-Distill-Qwen-7B-GRPO +hub_strategy: every_save +learning_rate: 2.0e-05 +log_level: info +logging_steps: 10 +logging_strategy: steps +lr_scheduler_type: cosine +max_prompt_length: 512 +max_completion_length: 1024 +max_steps: -1 +num_train_epochs: 1 +output_dir: data/DeepSeek-R1-Distill-Qwen-7B-GRPO +overwrite_output_dir: true +per_device_eval_batch_size: 4 +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb 
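+# Note: with save_strategy "no", no intermediate checkpoints are written during training;
+# the final model is saved to output_dir and pushed to the Hub by the training script.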
+save_strategy: "no" +seed: 42 +warmup_ratio: 0.1 diff --git a/recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml b/recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml new file mode 100644 index 000000000..3624c95ac --- /dev/null +++ b/recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml @@ -0,0 +1,45 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +model_revision: main +torch_dtype: bfloat16 + +# Data training arguments +dataset_name: AI-MO/NuminaMath-TIR +dataset_configs: +- all +# Num processes is less by 1 as vLLM is using 1 GPU +num_processes: 7 + +# GRPO trainer config +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: true +eval_strategy: steps +eval_steps: 100 +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO +hub_strategy: every_save +learning_rate: 2.0e-05 +log_level: info +logging_steps: 10 +logging_strategy: steps +lr_scheduler_type: cosine +max_prompt_length: 512 +max_completion_length: 1024 +max_steps: -1 +num_train_epochs: 1 +output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO +overwrite_output_dir: true +per_device_eval_batch_size: 4 +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +save_strategy: "no" +seed: 42 +warmup_ratio: 0.1 diff --git a/recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml b/recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml new file mode 100644 index 000000000..94e2225e7 --- /dev/null +++ b/recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml @@ -0,0 +1,41 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct +model_revision: main +torch_dtype: bfloat16 + +# Data training arguments +dataset_name: HuggingFaceH4/Bespoke-Stratos-17k +dataset_configs: +- all +preprocessing_num_workers: 8 + +# SFT trainer config +bf16: true +do_eval: true +eval_strategy: steps +eval_steps: 100 +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-1.5B-Open-R1-Distill +hub_strategy: every_save +learning_rate: 2.0e-05 +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +packing: true +max_seq_length: 4096 +max_steps: -1 +num_train_epochs: 1 +output_dir: data/Qwen2.5-1.5B-Open-R1-Distill +overwrite_output_dir: true +per_device_eval_batch_size: 4 +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +save_strategy: "no" +seed: 42 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/qwen/README.md b/recipes/qwen/README.md new file mode 100644 index 000000000..b7490c008 --- /dev/null +++ b/recipes/qwen/README.md @@ -0,0 +1,24 @@ +# Instructions to train Qwen-R1 + +We build the **Qwen-R1** by doing `SFT` on [Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k) and then `GRPO` on [NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR). + +## Setup + +Follow the installation instructions in https://github.com/huggingface/open-r1/tree/main?tab=readme-ov-file## Installation. + +## Training + +We support training models with either DDP or DeepSpeed ZeRO-2 and ZeRO-3. To switch between methods, simply change the path to the `recipes` YAML config in `accelerate_configs`. + +> [!NOTE] +> The training commands below are configured for a node of 8 x H100s (80GB). 
For different hardware and topologies, you may need to tune the batch size and number of gradient accumulation steps. + +You can find the configuration files for different model sizes in this folder and specify the path to the configuration file in the commands below. + +```shell +# SFT +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml + +# GRPO +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/grpo.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml +``` diff --git a/slurm/grpo.slurm b/slurm/grpo.slurm index 4965dde22..419af2373 100644 --- a/slurm/grpo.slurm +++ b/slurm/grpo.slurm @@ -52,7 +52,7 @@ export CMD=" \ " export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ - --config_file configs/$ACCELERATOR.yaml \ + --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \ --num_processes $(($WORLD_SIZE - 1)) \ --gradient_accumulation_steps 4 \ --num_machines $NUM_NODES \ diff --git a/slurm/sft.slurm b/slurm/sft.slurm index 60c1949ae..120f8e8a2 100644 --- a/slurm/sft.slurm +++ b/slurm/sft.slurm @@ -49,7 +49,7 @@ export CMD=" \ " export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ - --config_file configs/$ACCELERATOR.yaml \ + --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \ --gradient_accumulation_steps 4 \ --num_machines $NUM_NODES \ --num_processes $WORLD_SIZE \ diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 24d31a434..79046048d 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -12,10 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging +import os import re +import sys from dataclasses import dataclass, field +import datasets +import transformers from datasets import load_dataset +from transformers import set_seed +from transformers.trainer_utils import get_last_checkpoint from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify @@ -24,6 +31,9 @@ from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config +logger = logging.getLogger(__name__) + + @dataclass class GRPOScriptArguments(ScriptArguments): """ @@ -100,12 +110,47 @@ def format_reward(completions, **kwargs): def main(script_args, training_args, model_args): - # Get reward functions - reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs] + + # Set seed for reproducibility + set_seed(training_args.seed) + + ############### + # Setup logging + ############### + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process a small summary + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Model parameters {model_args}") + logger.info(f"Script parameters {script_args}") + logger.info(f"Data parameters {training_args}") + + # Check for last checkpoint + last_checkpoint = None + if os.path.isdir(training_args.output_dir): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") # Load the dataset dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) + # Get reward functions + reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs] + # Format into conversation def make_conversation(example): return { @@ -120,7 +165,9 @@ def make_conversation(example): if "messages" in dataset[split].column_names: dataset[split] = dataset[split].remove_columns("messages") + ############################# # Initialize the GRPO trainer + ############################# trainer = GRPOTrainer( model=model_args.model_name_or_path, reward_funcs=reward_funcs, @@ -131,13 +178,58 @@ def make_conversation(example): callbacks=get_callbacks(training_args, model_args), ) - # Train and push the model to the Hub - trainer.train() - - # Save and push to hub + ############### + # Training loop + ############### + logger.info("*** Train ***") + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + metrics["train_samples"] = len(dataset[script_args.dataset_train_split]) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + ################################## 
+ # Save model and create model card + ################################## + logger.info("*** Save model ***") trainer.save_model(training_args.output_dir) + logger.info(f"Model saved to {training_args.output_dir}") + + # Save everything else on main process + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": list(script_args.dataset_name), + "dataset_tags": list(script_args.dataset_name), + "tags": ["open-r1"], + } + if trainer.accelerator.is_main_process: + trainer.create_model_card(**kwargs) + # Restore k,v cache for fast inference + trainer.model.config.use_cache = True + trainer.model.config.save_pretrained(training_args.output_dir) + + ########## + # Evaluate + ########## + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + metrics["eval_samples"] = len(dataset[script_args.dataset_test_split]) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + ############# + # push to hub + ############# if training_args.push_to_hub: - trainer.push_to_hub(dataset_name=script_args.dataset_name) + logger.info("Pushing to hub...") + trainer.push_to_hub(**kwargs) if __name__ == "__main__": diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 56bcddb10..27695af05 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -35,8 +35,16 @@ --output_dir data/Qwen2.5-1.5B-Open-R1-Distill """ +import logging +import os +import sys + +import datasets +import torch +import transformers from datasets import load_dataset -from transformers import AutoTokenizer +from transformers import AutoTokenizer, set_seed +from transformers.trainer_utils import get_last_checkpoint from open_r1.configs import SFTConfig from open_r1.utils.callbacks import get_callbacks @@ -51,34 +59,80 @@ ) +logger = logging.getLogger(__name__) + + def main(script_args, training_args, model_args): + + # Set seed for reproducibility + set_seed(training_args.seed) + + ############### + # Setup logging + ############### + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process a small summary + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Model parameters {model_args}") + logger.info(f"Script parameters {script_args}") + logger.info(f"Data parameters {training_args}") + + # Check for last checkpoint + last_checkpoint = None + if os.path.isdir(training_args.output_dir): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") + + ################ + # Load datasets + ################ + dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) + ################ - # Model init kwargs & Tokenizer + # Load tokenizer ################ + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, 
trust_remote_code=model_args.trust_remote_code, use_fast=True + ) + tokenizer.pad_token = tokenizer.eos_token + + ################### + # Model init kwargs + ################### + logger.info("*** Initializing model kwargs ***") + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) quantization_config = get_quantization_config(model_args) model_kwargs = dict( revision=model_args.model_revision, trust_remote_code=model_args.trust_remote_code, attn_implementation=model_args.attn_implementation, - torch_dtype=model_args.torch_dtype, + torch_dtype=torch_dtype, use_cache=False if training_args.gradient_checkpointing else True, device_map=get_kbit_device_map() if quantization_config is not None else None, quantization_config=quantization_config, ) training_args.model_init_kwargs = model_kwargs - tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, use_fast=True - ) - tokenizer.pad_token = tokenizer.eos_token - - ################ - # Dataset - ################ - dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) - ################ - # Training - ################ + ############################ + # Initialize the SFT Trainer + ############################ trainer = SFTTrainer( model=model_args.model_name_or_path, args=training_args, @@ -89,12 +143,58 @@ def main(script_args, training_args, model_args): callbacks=get_callbacks(training_args, model_args), ) - trainer.train() + ############### + # Training loop + ############### + logger.info("*** Train ***") + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + metrics["train_samples"] = len(dataset[script_args.dataset_train_split]) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() - # Save and push to hub + ################################## + # Save model and create model card + ################################## + logger.info("*** Save model ***") trainer.save_model(training_args.output_dir) + logger.info(f"Model saved to {training_args.output_dir}") + + # Save everything else on main process + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": list(script_args.dataset_name), + "dataset_tags": list(script_args.dataset_name), + "tags": ["open-r1"], + } + if trainer.accelerator.is_main_process: + trainer.create_model_card(**kwargs) + # Restore k,v cache for fast inference + trainer.model.config.use_cache = True + trainer.model.config.save_pretrained(training_args.output_dir) + + ########## + # Evaluate + ########## + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + metrics["eval_samples"] = len(dataset[script_args.dataset_test_split]) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + ############# + # push to hub + ############# if training_args.push_to_hub: - trainer.push_to_hub(dataset_name=script_args.dataset_name) + logger.info("Pushing to hub...") + trainer.push_to_hub(**kwargs) if __name__ == "__main__": From a0d61ccece254ab7a24767d5a2ba36eea80ff78a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 31 Jan 2025 13:36:08 +0100 Subject: [PATCH 002/137] use ruff (#137) 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * use ruff * reformat * re-run * update deps * undo * Update src/open_r1/configs.py Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> * Update src/open_r1/configs.py Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> * Update src/open_r1/configs.py Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> * Update src/open_r1/configs.py Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> * fix help strings * fix ruff version * fix formatting --------- Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> --- Makefile | 4 ++-- scripts/run_benchmarks.py | 15 ++++++--------- setup.py | 4 ++-- src/open_r1/configs.py | 28 ++++++++++++---------------- src/open_r1/grpo.py | 1 - src/open_r1/sft.py | 1 - 6 files changed, 22 insertions(+), 31 deletions(-) diff --git a/Makefile b/Makefile index 200b01f34..1140e59e7 100644 --- a/Makefile +++ b/Makefile @@ -6,11 +6,11 @@ export PYTHONPATH = src check_dirs := src style: - black --line-length 119 --target-version py310 $(check_dirs) setup.py + ruff format --line-length 119 --target-version py310 $(check_dirs) setup.py isort $(check_dirs) setup.py quality: - black --check --line-length 119 --target-version py310 $(check_dirs) setup.py + ruff check --line-length 119 --target-version py310 $(check_dirs) setup.py isort --check-only $(check_dirs) setup.py flake8 --max-line-length 119 $(check_dirs) setup.py diff --git a/scripts/run_benchmarks.py b/scripts/run_benchmarks.py index 3de4cf41e..b7395947a 100644 --- a/scripts/run_benchmarks.py +++ b/scripts/run_benchmarks.py @@ -15,7 +15,7 @@ from typing import List, Optional from open_r1.utils.evaluation import SUPPORTED_BENCHMARKS, run_benchmark_jobs -from open_r1.configs import SFTConfig, GRPOConfig +from open_r1.configs import SFTConfig from trl import ModelConfig, TrlParser @@ -25,18 +25,14 @@ class ScriptArguments: default="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", metadata={"help": "The Hub model id to push the model to."}, ) - model_revision: str = field( - default="main", - metadata={"help": "The Hub model branch to push the model to."}, - ) + model_revision: str = field(default="main", metadata={"help": "The Hub model branch to push the model to."}) trust_remote_code: bool = field(default=False, metadata={"help": "Trust the remote code."}) benchmarks: List[str] = field( - default_factory=lambda: [], metadata={"help": ("The benchmarks to run after training.")} + default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."} ) list_benchmarks: bool = field(default=False, metadata={"help": "List all supported benchmarks."}) system_prompt: Optional[str] = field( - default=None, - metadata={"help": "The system prompt to use for the benchmark."}, + default=None, metadata={"help": "The system prompt to use for the benchmark."} ) @@ -56,7 +52,8 @@ def main(): system_prompt=args.system_prompt, ) run_benchmark_jobs( - benchmark_args, ModelConfig(model_name_or_path="", model_revision="", trust_remote_code=args.trust_remote_code) + benchmark_args, + ModelConfig(model_name_or_path="", model_revision="", trust_remote_code=args.trust_remote_code), ) diff --git a/setup.py b/setup.py index e1ea11903..b3b10694d 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ _deps = [ "accelerate>=1.2.1", "bitsandbytes>=0.43.0", - "black>=24.4.2", + "ruff>=0.9.0", 
"datasets>=3.2.0", "deepspeed==0.15.4", "distilabel[vllm,ray,openai]>=1.5.2", @@ -83,7 +83,7 @@ def deps_list(*pkgs): extras = {} extras["tests"] = deps_list("pytest", "parameterized") extras["torch"] = deps_list("torch") -extras["quality"] = deps_list("black", "isort", "flake8") +extras["quality"] = deps_list("ruff", "isort", "flake8") extras["eval"] = deps_list("lighteval", "math-verify") extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 8aa12f954..57968b4b4 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -27,22 +27,19 @@ class GRPOConfig(trl.GRPOConfig): """ benchmarks: list[str] = field( - default_factory=lambda: [], - metadata={"help": ("The benchmarks to run after training.")}, + default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."} ) callbacks: list[str] = field( - default_factory=lambda: [], metadata={"help": ("The callbacks to run during training.")} + default_factory=lambda: [], metadata={"help": "The callbacks to run during training."} ) system_prompt: Optional[str] = field( - default=None, - metadata={"help": ("The optional system prompt to use for benchmarking.")}, + default=None, metadata={"help": "The optional system prompt to use for benchmarking."} ) hub_model_revision: Optional[str] = field( - default="main", - metadata={"help": ("The Hub model branch to push the model to.")}, + default="main", metadata={"help": "The Hub model branch to push the model to."} ) - overwrite_hub_revision: bool = field(default=False, metadata={"help": ("Whether to overwrite the Hub revision.")}) - push_to_hub_revision: bool = field(default=False, metadata={"help": ("Whether to push to a Hub revision/branch.")}) + overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."}) + push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."}) @dataclass @@ -52,19 +49,18 @@ class SFTConfig(trl.SFTConfig): """ benchmarks: list[str] = field( - default_factory=lambda: [], - metadata={"help": ("The benchmarks to run after training.")}, + default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."} ) callbacks: list[str] = field( - default_factory=lambda: [], metadata={"help": ("The callbacks to run during training.")} + default_factory=lambda: [], metadata={"help": "The callbacks to run during training."} ) system_prompt: Optional[str] = field( default=None, - metadata={"help": ("The optional system prompt to use for benchmarking.")}, + metadata={"help": "The optional system prompt to use for benchmarking."}, ) hub_model_revision: Optional[str] = field( default="main", - metadata={"help": ("The Hub model branch to push the model to.")}, + metadata={"help": "The Hub model branch to push the model to."}, ) - overwrite_hub_revision: bool = field(default=False, metadata={"help": ("Whether to overwrite the Hub revision.")}) - push_to_hub_revision: bool = field(default=False, metadata={"help": ("Whether to push to a Hub revision/branch.")}) + overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."}) + push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."}) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 79046048d..f8b7f9267 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -110,7 +110,6 @@ def 
format_reward(completions, **kwargs): def main(script_args, training_args, model_args): - # Set seed for reproducibility set_seed(training_args.seed) diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 27695af05..d5369fc20 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -63,7 +63,6 @@ def main(script_args, training_args, model_args): - # Set seed for reproducibility set_seed(training_args.seed) From b2d7ba2f1d212c0193b4598476126ef99b8f27dd Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 4 Feb 2025 14:34:57 +0000 Subject: [PATCH 003/137] Add puzzles --- src/open_r1/puzzles/README.md | 27 ++++++++++ src/open_r1/puzzles/__init__.py | 15 ++++++ src/open_r1/puzzles/base_config.py | 7 +++ src/open_r1/puzzles/base_task.py | 49 +++++++++++++++++ .../tasks/math/algebra/linear_equations.py | 52 +++++++++++++++++++ 5 files changed, 150 insertions(+) create mode 100644 src/open_r1/puzzles/README.md create mode 100644 src/open_r1/puzzles/__init__.py create mode 100644 src/open_r1/puzzles/base_config.py create mode 100644 src/open_r1/puzzles/base_task.py create mode 100644 src/open_r1/puzzles/tasks/math/algebra/linear_equations.py diff --git a/src/open_r1/puzzles/README.md b/src/open_r1/puzzles/README.md new file mode 100644 index 000000000..22d675abe --- /dev/null +++ b/src/open_r1/puzzles/README.md @@ -0,0 +1,27 @@ +The puzzles module contains a simple and extensible system for generating and verifying reasoning tasks. +The focus is on tasks where infinite variants can be generated with automatic answer verification, like mathematics, logic puzzles or coding tasks, although +we highly encourage creativity - if you can come up with less STEM-y tasks that can still be rigorously validated, we'd love to see them! + +# Generating puzzles + +After `pip install`ing the open-r1 repo, you can very quickly get started + +```python +>>> from open_r1.puzzles import LinearEquationConfig, LinearEquationTask + +>>> task = LinearEquationTask() +>>> # Tasks are iterable, so you can iterate with "for question, answer in task:" +>>> question, answer = next(iter(task)) +>>> print(question) +'-2y - 4 = -16' + +# To score a model output, use task.validate() +>>> task.validate("y = 6", answer) +1.0 + +>>> # To control the task difficulty, you can use the task's associated config +>>> config = LinearEquationConfig() +>>> config.min_coefficient = -1000 +>>> config.max_coefficient = 1000 +>>> harder_task = LinearEquationTask(config) +``` \ No newline at end of file diff --git a/src/open_r1/puzzles/__init__.py b/src/open_r1/puzzles/__init__.py new file mode 100644 index 000000000..da83ac78d --- /dev/null +++ b/src/open_r1/puzzles/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .tasks.math.algebra.linear_equations import LinearEquationConfig, LinearEquationTask \ No newline at end of file diff --git a/src/open_r1/puzzles/base_config.py b/src/open_r1/puzzles/base_config.py new file mode 100644 index 000000000..77f6b527b --- /dev/null +++ b/src/open_r1/puzzles/base_config.py @@ -0,0 +1,7 @@ +from __future__ import annotations +from dataclasses import dataclass + +@dataclass +class BaseConfig: + num_tasks: int = 100 + seed: int | None = None \ No newline at end of file diff --git a/src/open_r1/puzzles/base_task.py b/src/open_r1/puzzles/base_task.py new file mode 100644 index 000000000..e40fef7da --- /dev/null +++ b/src/open_r1/puzzles/base_task.py @@ -0,0 +1,49 @@ +from __future__ import annotations +import numpy as np +from .base_config import BaseConfig +from random import randint +from abc import ABC, abstractmethod + +MAX_INT = np.iinfo(np.int64).max + +class BaseTask(ABC): + config_class = None + + def __init__(self, config: BaseConfig = None): + if config is not None: + self.config = config + elif self.config_class is not None: + self.config = self.config_class() # Instantiate the default config for this task + else: + raise ValueError("No config provided and no default config_class set for this task") + + # We generate individual sample rngs using seed + idx, so we scramble the seeds to large ints first + # to avoid sample overlap between datasets using common similar, small seeds (like 0 and 42 and 123) + self.seed = self.config.seed or randint(0, MAX_INT) + seed_scrambler = np.random.default_rng(self.seed) + self.scrambled_seed = seed_scrambler.integers(low=0, high=MAX_INT, size=None) + + def __len__(self): + return self.config.num_tasks + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + + def get_rng(self, idx) -> np.random.Generator: + return np.random.default_rng(self.scrambled_seed + idx) + + def __getitem__(self, item) -> tuple: + rng = self.get_rng(item) + return self.generate_sample(self.config, rng) + + @abstractmethod + def generate_sample(self, config: BaseConfig, rng: np.random.Generator) -> tuple: + # This should return a tuple of (output, answer) + raise NotImplementedError + + @abstractmethod + def validate(self, output, answer) -> float: + # This should return a score between 0. and 1. 
based on how well the output matches the answer + raise NotImplementedError \ No newline at end of file diff --git a/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py b/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py new file mode 100644 index 000000000..9b8279ff7 --- /dev/null +++ b/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py @@ -0,0 +1,52 @@ +from ....base_task import BaseTask +from ....base_config import BaseConfig +import numpy as np +import re + + +class LinearEquationConfig(BaseConfig): + min_coefficient: int = -10 + max_coefficient: int = 10 + min_var_value = -10 + max_var_value = 10 + +class LinearEquationTask(BaseTask): + config_class = LinearEquationConfig + + def generate_sample(self, config: LinearEquationConfig, rng: np.random.Generator): + variable_names = ('x', 'y', 'z', 'a', 'b', 'c') + var_name = rng.choice(variable_names) + var_coefficient = 0 + while var_coefficient == 0: + # We can't have the variable's coefficient be 0, so keep sampling until we get a non-zero one + var_coefficient = rng.integers(config.min_coefficient, config.max_coefficient, endpoint=True) + constant = rng.integers(config.min_coefficient, config.max_coefficient, endpoint=True) + while var_coefficient == 1 and constant == 0: + # We can't have the variable's coefficient be 1 and the constant be 0, as this is a trivial equation + # so keep rerolling until it isn't + constant = rng.integers(config.min_coefficient, config.max_coefficient, endpoint=True) + var_value = int(rng.integers(config.min_var_value, config.max_var_value, endpoint=True)) + rhs = var_coefficient * var_value + constant + + if constant < 0: + equation = f"{var_coefficient}{var_name} - {-constant} = {rhs}" + elif constant > 0: + equation = f"{var_coefficient}{var_name} + {constant} = {rhs}" + else: + equation = f"{var_coefficient}{var_name} = {rhs}" + + return equation, var_value + + def validate(self, output, answer): + # If there's only one number in the output, it's the answer + numbers = re.findall(r"\d+", output) + if len(numbers) == 1: + return float(int(numbers[0]) == answer) + # If not, look for a pattern like "x = 5" to disambiguate + numbers = re.findall(r"=\s+(\d+)", output) + if len(numbers) == 1: + return float(int(numbers[0].group(1)) == answer) + # Finally, maybe it gave the answer as a decimal, so check for that + numbers = re.findall(r"\d+\.\d+", output) + if len(numbers) == 1: + return float(float(numbers[0]) == answer) From 9ca4ed5a6a68e12f7df4814a95cceae0a9a3831c Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 4 Feb 2025 15:09:21 +0000 Subject: [PATCH 004/137] Add puzzles --- src/open_r1/puzzles/README.md | 13 ++++++++++++- src/open_r1/puzzles/base_task.py | 2 +- .../puzzles/tasks/math/algebra/linear_equations.py | 3 ++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/open_r1/puzzles/README.md b/src/open_r1/puzzles/README.md index 22d675abe..fc75587d3 100644 --- a/src/open_r1/puzzles/README.md +++ b/src/open_r1/puzzles/README.md @@ -24,4 +24,15 @@ After `pip install`ing the open-r1 repo, you can very quickly get started >>> config.min_coefficient = -1000 >>> config.max_coefficient = 1000 >>> harder_task = LinearEquationTask(config) -``` \ No newline at end of file +``` + +## Adding new puzzles + +[add puzzle guide goes here] + +## Coming soon: + +- Proper indexing of puzzles +- More puzzle types! 
+- Lazy loading (if the module gets very big) +- Gu \ No newline at end of file diff --git a/src/open_r1/puzzles/base_task.py b/src/open_r1/puzzles/base_task.py index e40fef7da..2474aa5e0 100644 --- a/src/open_r1/puzzles/base_task.py +++ b/src/open_r1/puzzles/base_task.py @@ -44,6 +44,6 @@ def generate_sample(self, config: BaseConfig, rng: np.random.Generator) -> tuple raise NotImplementedError @abstractmethod - def validate(self, output, answer) -> float: + def verify(self, output, answer) -> float: # This should return a score between 0. and 1. based on how well the output matches the answer raise NotImplementedError \ No newline at end of file diff --git a/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py b/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py index 9b8279ff7..056e13bb2 100644 --- a/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py +++ b/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py @@ -10,6 +10,7 @@ class LinearEquationConfig(BaseConfig): min_var_value = -10 max_var_value = 10 + class LinearEquationTask(BaseTask): config_class = LinearEquationConfig @@ -37,7 +38,7 @@ def generate_sample(self, config: LinearEquationConfig, rng: np.random.Generator return equation, var_value - def validate(self, output, answer): + def verify(self, output, answer): # If there's only one number in the output, it's the answer numbers = re.findall(r"\d+", output) if len(numbers) == 1: From b24ce903cb5268b9f4ce5c16fe4e2198428a3245 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 4 Feb 2025 15:18:23 +0000 Subject: [PATCH 005/137] Fix quality after adding puzzles --- src/open_r1/puzzles/README.md | 5 ++--- src/open_r1/puzzles/__init__.py | 2 +- src/open_r1/puzzles/base_config.py | 4 +++- src/open_r1/puzzles/base_task.py | 12 ++++++++---- .../puzzles/tasks/math/algebra/linear_equations.py | 10 ++++++---- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/open_r1/puzzles/README.md b/src/open_r1/puzzles/README.md index fc75587d3..0a14bd82a 100644 --- a/src/open_r1/puzzles/README.md +++ b/src/open_r1/puzzles/README.md @@ -16,7 +16,7 @@ After `pip install`ing the open-r1 repo, you can very quickly get started '-2y - 4 = -16' # To score a model output, use task.validate() ->>> task.validate("y = 6", answer) +>>> task.verify("y = 6", answer) 1.0 >>> # To control the task difficulty, you can use the task's associated config @@ -34,5 +34,4 @@ After `pip install`ing the open-r1 repo, you can very quickly get started - Proper indexing of puzzles - More puzzle types! -- Lazy loading (if the module gets very big) -- Gu \ No newline at end of file +- Lazy loading (if the module gets very big) \ No newline at end of file diff --git a/src/open_r1/puzzles/__init__.py b/src/open_r1/puzzles/__init__.py index da83ac78d..c59ec6a75 100644 --- a/src/open_r1/puzzles/__init__.py +++ b/src/open_r1/puzzles/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .tasks.math.algebra.linear_equations import LinearEquationConfig, LinearEquationTask \ No newline at end of file +from .tasks.math.algebra.linear_equations import LinearEquationConfig, LinearEquationTask diff --git a/src/open_r1/puzzles/base_config.py b/src/open_r1/puzzles/base_config.py index 77f6b527b..587af2e45 100644 --- a/src/open_r1/puzzles/base_config.py +++ b/src/open_r1/puzzles/base_config.py @@ -1,7 +1,9 @@ from __future__ import annotations + from dataclasses import dataclass + @dataclass class BaseConfig: num_tasks: int = 100 - seed: int | None = None \ No newline at end of file + seed: int | None = None diff --git a/src/open_r1/puzzles/base_task.py b/src/open_r1/puzzles/base_task.py index 2474aa5e0..f910b8411 100644 --- a/src/open_r1/puzzles/base_task.py +++ b/src/open_r1/puzzles/base_task.py @@ -1,11 +1,16 @@ from __future__ import annotations + +from abc import ABC, abstractmethod +from random import randint + import numpy as np + from .base_config import BaseConfig -from random import randint -from abc import ABC, abstractmethod + MAX_INT = np.iinfo(np.int64).max + class BaseTask(ABC): config_class = None @@ -30,7 +35,6 @@ def __iter__(self): for i in range(len(self)): yield self[i] - def get_rng(self, idx) -> np.random.Generator: return np.random.default_rng(self.scrambled_seed + idx) @@ -46,4 +50,4 @@ def generate_sample(self, config: BaseConfig, rng: np.random.Generator) -> tuple @abstractmethod def verify(self, output, answer) -> float: # This should return a score between 0. and 1. based on how well the output matches the answer - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py b/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py index 056e13bb2..21f2913f7 100644 --- a/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py +++ b/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py @@ -1,8 +1,10 @@ -from ....base_task import BaseTask -from ....base_config import BaseConfig -import numpy as np import re +import numpy as np + +from ....base_config import BaseConfig +from ....base_task import BaseTask + class LinearEquationConfig(BaseConfig): min_coefficient: int = -10 @@ -15,7 +17,7 @@ class LinearEquationTask(BaseTask): config_class = LinearEquationConfig def generate_sample(self, config: LinearEquationConfig, rng: np.random.Generator): - variable_names = ('x', 'y', 'z', 'a', 'b', 'c') + variable_names = ("x", "y", "z", "a", "b", "c") var_name = rng.choice(variable_names) var_coefficient = 0 while var_coefficient == 0: From 1fc8d425a995ddf8dbc6f8ef239d8161acdb7fc1 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 4 Feb 2025 15:30:25 +0000 Subject: [PATCH 006/137] Fix code quality after adding puzzles (#178) --- src/open_r1/puzzles/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/open_r1/puzzles/__init__.py b/src/open_r1/puzzles/__init__.py index c59ec6a75..760bc1ee2 100644 --- a/src/open_r1/puzzles/__init__.py +++ b/src/open_r1/puzzles/__init__.py @@ -13,3 +13,6 @@ # limitations under the License. 
from .tasks.math.algebra.linear_equations import LinearEquationConfig, LinearEquationTask + + +__all__ = [LinearEquationConfig, LinearEquationTask] From 5aff57c919dfcf11a26affe68d4c53276c3fe325 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Wed, 5 Feb 2025 09:09:45 +0100 Subject: [PATCH 007/137] GRPO training args fixes (#177) * grpo training args fixes * style --------- Co-authored-by: Kashif Rasul --- src/open_r1/grpo.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index f8b7f9267..6b19c0e24 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -19,6 +19,7 @@ from dataclasses import dataclass, field import datasets +import torch import transformers from datasets import load_dataset from transformers import set_seed @@ -164,6 +165,19 @@ def make_conversation(example): if "messages" in dataset[split].column_names: dataset[split] = dataset[split].remove_columns("messages") + logger.info("*** Initializing model kwargs ***") + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) + model_kwargs = dict( + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + ) + training_args.model_init_kwargs = model_kwargs + ############################# # Initialize the GRPO trainer ############################# From 138df0ca44d5799fb24db2b50ff8f029c37aeca0 Mon Sep 17 00:00:00 2001 From: Lewis <1657236+ctjlewis@users.noreply.github.com> Date: Wed, 5 Feb 2025 02:53:31 -0600 Subject: [PATCH 008/137] chore(setup.py): bump vllm>=0.7.1 (#181) See https://github.com/huggingface/trl/pull/2766. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b3b10694d..7ac019182 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ "torch>=2.5.1", "transformers @ git+https://github.com/huggingface/transformers.git@main", "trl @ git+https://github.com/huggingface/trl.git@main", - "vllm>=0.7.0", + "vllm>=0.7.1", "wandb>=0.19.1", ] From 3fd56dc7b439f8aa54cab764d325c677de0e8152 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Wed, 5 Feb 2025 23:59:25 +0100 Subject: [PATCH 009/137] fix uv env path + details (#188) * fix uv env path + details * Update slurm/grpo.slurm --------- Co-authored-by: lewtun --- slurm/eval_callback.slurm | 5 ++--- slurm/evaluate.slurm | 2 +- slurm/generate.slurm | 2 +- slurm/grpo.slurm | 2 +- slurm/sft.slurm | 2 +- src/open_r1/utils/upload_details.py | 2 +- 6 files changed, 7 insertions(+), 8 deletions(-) diff --git a/slurm/eval_callback.slurm b/slurm/eval_callback.slurm index 093c3d067..bec49ab96 100644 --- a/slurm/eval_callback.slurm +++ b/slurm/eval_callback.slurm @@ -8,8 +8,7 @@ set -x -e source ~/.bashrc -conda activate openr1 - +source openr1/bin/activate TASK_NAME=$1 TASKS=$2 MODEL_ID=$3 @@ -31,7 +30,7 @@ fi LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard" MODEL_NAME=$(echo $MODEL_ID | sed 's/\//_/g') # replaces / with _ -DETAILS_REPO_ID="open-r1//details-$MODEL_NAME" +DETAILS_REPO_ID="open-r1/details-$MODEL_NAME" OUTPUT_DIR="eval_results/$MODEL_ID/$MODEL_REVISION/$TASK_NAME" # We need this flag since we run this script from training jobs that use DeepSpeed and the env vars get progated which causes errors during evaluation ACCELERATE_USE_DEEPSPEED=false diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 5fe7f8e33..0ca4a8701 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -14,7 +14,7 @@ set -x -e source ~/.bashrc -conda activate openr1 +source openr1/bin/activate module load cuda/12.1 echo "START TIME: $(date)" echo "PYTHON ENV: $(which python)" diff --git a/slurm/generate.slurm b/slurm/generate.slurm index c154d64af..9cc9b1cce 100644 --- a/slurm/generate.slurm +++ b/slurm/generate.slurm @@ -129,7 +129,7 @@ export LD_LIBRARY_PATH=.venv/lib/python3.11/site-packages/nvidia/nvjitlink/lib echo "SLURM_JOB_ID: $SLURM_JOB_ID" echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -source .venv/bin/activate +source openr1/bin/activate # Getting the node names nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") diff --git a/slurm/grpo.slurm b/slurm/grpo.slurm index 419af2373..8a3d63a32 100644 --- a/slurm/grpo.slurm +++ b/slurm/grpo.slurm @@ -11,7 +11,7 @@ set -x -e source ~/.bashrc -conda activate openr1 +source openr1/bin/activate echo "START TIME: $(date)" echo "PYTHON ENV: $(which python)" diff --git a/slurm/sft.slurm b/slurm/sft.slurm index 120f8e8a2..31528cdcb 100644 --- a/slurm/sft.slurm +++ b/slurm/sft.slurm @@ -11,7 +11,7 @@ set -x -e source ~/.bashrc -conda activate openr1 +source openr1/bin/activate echo "START TIME: $(date)" echo "PYTHON ENV: $(which python)" diff --git a/src/open_r1/utils/upload_details.py b/src/open_r1/utils/upload_details.py index 273e48bdb..caa491cfa 100644 --- a/src/open_r1/utils/upload_details.py +++ b/src/open_r1/utils/upload_details.py @@ -39,7 +39,7 @@ class ScriptArguments: def main(): parser = HfArgumentParser(ScriptArguments) - args = parser.parse() + args = parser.parse_args_into_dataclasses()[0] if all(file.endswith(".json") for file in args.data_files): ds = load_dataset("json", data_files=args.data_files) From 736b59f9a31e0a91806b376d1cda39e531ae5665 
Mon Sep 17 00:00:00 2001 From: Dongwei Jiang Date: Wed, 5 Feb 2025 15:25:08 -0800 Subject: [PATCH 010/137] Update grpo.py (#171) --- src/open_r1/grpo.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 6b19c0e24..e8c1c556e 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -216,9 +216,7 @@ def make_conversation(example): # Save everything else on main process kwargs = { - "finetuned_from": model_args.model_name_or_path, - "dataset": list(script_args.dataset_name), - "dataset_tags": list(script_args.dataset_name), + "dataset_name": script_args.dataset_name, "tags": ["open-r1"], } if trainer.accelerator.is_main_process: From 571661a1e42f1482ebaa64bb6678d78508dd4ca3 Mon Sep 17 00:00:00 2001 From: Dongwei Jiang Date: Thu, 6 Feb 2025 02:43:42 -0800 Subject: [PATCH 011/137] Provide a minimal reproducible experiment using GRPO for mathematical reasoning on base model, referencing the approach from SimpleRL-Reason (#197) * Create config_base_math_smalllr.yaml * Update README.md * Update README.md --- README.md | 8 ++++ .../grpo/config_base_math_smalllr.yaml | 45 +++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml diff --git a/README.md b/README.md index ebb725e28..f02f8e095 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,14 @@ To train via the GRPO trainer, we use one GPU to run vLLM for faster generation ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml --num_processes=7 src/open_r1/grpo.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml ``` +We provide a minimal reproducible experiment using GRPO for mathematical reasoning, referencing the approach from [SimpleRL-Reason](https://hkust-nlp.notion.site/simplerl-reason) which uses a 7B model trained on 8K examples. Running this on 8 H100 80G GPU takes about 3 hours: + +```shell +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes=7 src/open_r1/grpo.py --config recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml +``` + +Our final [model](Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on Math_500, demonstrating a 17%+ improvement over the base model. 
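+
+As a rough sanity check, the resulting checkpoint can be scored on MATH-500 with the same `lighteval` setup described in the evaluation section below (a sketch, assuming the trained model was pushed to the Hub under the `Dongwei/Qwen-2.5-7B_Base_Math_smalllr` ID referenced above):
+
+```shell
+MODEL=Dongwei/Qwen-2.5-7B_Base_Math_smalllr
+MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8"
+OUTPUT_DIR=data/evals/$MODEL
+
+lighteval vllm $MODEL_ARGS "custom|math_500|0|0" \
+  --custom-tasks src/open_r1/evaluate.py \
+  --use-chat-template \
+  --output-dir $OUTPUT_DIR
+```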
+ To launch a Slurm job, run: ```shell diff --git a/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml b/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml new file mode 100644 index 000000000..a102c9f10 --- /dev/null +++ b/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml @@ -0,0 +1,45 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Math-7B +model_revision: main +torch_dtype: bfloat16 + +# Data training arguments +dataset_name: DigitalLearningGmbH/MATH-lighteval +dataset_configs: +- train +# Num processes is less by 1 as vLLM is using 1 GPU +num_processes: 7 + +# GRPO trainer config +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: true +eval_strategy: steps +eval_steps: 100 +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen-2.5-7B_Base_Math_smalllr +hub_strategy: every_save +learning_rate: 3.0e-06 +log_level: info +logging_steps: 10 +logging_strategy: steps +lr_scheduler_type: cosine +max_prompt_length: 512 +max_completion_length: 1024 +max_steps: -1 +num_train_epochs: 1 +output_dir: data/Qwen-2.5-7B_Base_Math_smalllr +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +save_strategy: "no" +seed: 42 +warmup_ratio: 0.1 From f8cbb98b0dfca2ae4462b68be87332abd6cb3161 Mon Sep 17 00:00:00 2001 From: westonbrown <47581657+westonbrown@users.noreply.github.com> Date: Thu, 6 Feb 2025 06:33:27 -0600 Subject: [PATCH 012/137] Update sft.py (#201) --- src/open_r1/sft.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index d5369fc20..6d2b18008 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -167,9 +167,7 @@ def main(script_args, training_args, model_args): # Save everything else on main process kwargs = { - "finetuned_from": model_args.model_name_or_path, - "dataset": list(script_args.dataset_name), - "dataset_tags": list(script_args.dataset_name), + "dataset_name": script_args.dataset_name, "tags": ["open-r1"], } if trainer.accelerator.is_main_process: From cec57f3a550a426801985ee15535e7fd459aafd2 Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 6 Feb 2025 15:24:52 +0100 Subject: [PATCH 013/137] Add GPQA Diamond and fix evaluation deps (#196) * Add GPQA Diamond * Add table * Fix README * Up * Fixes * Ignore logs * Fix * Pin deps * Fix GRPO * Add Llama 70B tabels * Restore dp * Pin lighteval * Use bfloat16 * Tune table * Add note --- Makefile | 2 +- README.md | 117 ++++++++++++++++++++++++-------- logs/.gitkeep | 0 setup.py | 8 +-- slurm/eval_callback.slurm | 75 -------------------- slurm/evaluate.slurm | 98 +++++++++++++++----------- src/open_r1/evaluate.py | 45 ++++++++++++ src/open_r1/grpo.py | 2 +- src/open_r1/utils/evaluation.py | 3 +- 9 files changed, 202 insertions(+), 148 deletions(-) create mode 100644 logs/.gitkeep delete mode 100644 slurm/eval_callback.slurm diff --git a/Makefile b/Makefile index 1140e59e7..17e2ddc11 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ evaluate: fi \ ),)) $(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \ - MODEL_ARGS="pretrained=$(MODEL),dtype=float16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilisation=0.8" && \ + MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilisation=0.8" && \ lighteval 
vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ diff --git a/README.md b/README.md index f02f8e095..0faffa2ec 100644 --- a/README.md +++ b/README.md @@ -50,23 +50,23 @@ To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/ge ```shell -uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip +uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip --link-mode=copy ``` Next, install vLLM: ```shell -uv pip install vllm>=0.7.0 +uv pip install vllm==0.7.1 # For CUDA 12.1 -pip install vllm>=0.7.0 --extra-index-url https://download.pytorch.org/whl/cu121 +uv pip install vllm==0.7.1 --extra-index-url https://download.pytorch.org/whl/cu121 --index-strategy unsafe-best-match --link-mode=copy export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0] + '/nvidia/nvjitlink/lib')"):$LD_LIBRARY_PATH ``` This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend: ```shell -pip install -e ".[dev]" +GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" --link-mode=copy ``` Next, log into your Hugging Face and Weights and Biases accounts as follows: @@ -141,30 +141,46 @@ We use `lighteval` to evaluate models, with custom tasks defined in `src/open_r1 ```shell MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=float16,max_model_length=32768,gpu_memory_utilisation=0.8" -TASK=aime24 +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8" OUTPUT_DIR=data/evals/$MODEL +# AIME 2024 +TASK=aime24 +lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ + --custom-tasks src/open_r1/evaluate.py \ + --use-chat-template \ + --output-dir $OUTPUT_DIR + +# MATH-500 +TASK=math_500 +lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ + --custom-tasks src/open_r1/evaluate.py \ + --use-chat-template \ + --output-dir $OUTPUT_DIR + +# GPQA Diamond +TASK=gpqa:diamond lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ - --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \ --output-dir $OUTPUT_DIR ``` +> [!IMPORTANT] +> You must set `max_model_length=32768` in the `vllm` command to align with the `generation_size` we define per eval. Without this, `lighteval` will throw an error. + To increase throughput across multiple GPUs, use _data parallel_ as follows: ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=float16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ - --system-prompt="Please reason step by step, and put your final answer within \boxed{}." 
\ --output-dir $OUTPUT_DIR ``` @@ -173,7 +189,7 @@ For large models which require sharding across GPUs, use _tensor parallel_ and r ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B -MODEL_ARGS="pretrained=$MODEL,dtype=float16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL @@ -181,50 +197,97 @@ export VLLM_WORKER_MULTIPROC_METHOD=spawn lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ - --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \ --output-dir $OUTPUT_DIR ``` You can also launch an evaluation with `make evaluate`, specifying the model, task, and optionally the parallelism technique and number of GPUs. To evaluate on a single GPU: + ```shell make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 ``` To use Data Parallelism: + ```shell make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=data NUM_GPUS=8 ``` To use Tensor Parallelism: + ```shell make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=tensor NUM_GPUS=8 ``` -## Reproducing Deepseek's evaluation results on MATH-500 -We are able to reproduce Deepseek's reported results on the MATH-500 Benchmark: -| Model | MATH-500 (HF lighteval) | MATH-500 (DeepSeek Reported) | -| :-------------------------- | :-------: | :----------------------------: | -| DeepSeek-R1-Distill-Qwen-1.5B | 81.6 | 83.9 | -| DeepSeek-R1-Distill-Qwen-7B | 91.8 | 92.8 | -| DeepSeek-R1-Distill-Qwen-14B | 94.2 | 93.9 | -| DeepSeek-R1-Distill-Qwen-32B | 95.0 | 94.3 | -| DeepSeek-R1-Distill-Llama-8B | 85.8 | 89.1 | -| DeepSeek-R1-Distill-Llama-70B | 93.4 | 94.5 | +## Reproducing Deepseek's evaluation results + +> [!NOTE] +> The DeepSeek-R1 paper uses sampling with a temperature of 0.6, a top-p value of 0.95, and 64 responses per query to estimate `pass@1`. Below, we report the results from greedy decoding, which likely explains the small 1-3σ discrepancies between our results and theirs. 
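For reference, `pass@1` as described in that note is typically computed with the unbiased pass@k estimator over the sampled responses rather than from a single greedy run. The snippet below is a minimal illustrative sketch of that calculation; it is not code from this repository, and the per-query counts are made-up numbers.

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate for one query: n sampled responses, c of them correct."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Hypothetical per-query (n, c) counts from 64 samples per query
# (temperature 0.6, top_p 0.95), averaged over the benchmark.
counts = [(64, 40), (64, 0), (64, 64)]
pass_at_1 = sum(pass_at_k(n, c, k=1) for n, c in counts) / len(counts)
print(f"Estimated pass@1: {pass_at_1:.3f}")
```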
+ +### MATH-500 + +We are able to reproduce Deepseek's reported results on the MATH-500 benchmark within ~1-3 standard deviations: +| Model | MATH-500 (🤗 LightEval) | MATH-500 (DeepSeek Reported) | +|:------------------------------|:-----------------------:|:----------------------------:| +| DeepSeek-R1-Distill-Qwen-1.5B | 81.2 | 83.9 | +| DeepSeek-R1-Distill-Qwen-7B | 91.8 | 92.8 | +| DeepSeek-R1-Distill-Qwen-14B | 94.2 | 93.9 | +| DeepSeek-R1-Distill-Qwen-32B | 95.0 | 94.3 | +| DeepSeek-R1-Distill-Llama-8B | 85.4 | 89.1 | +| DeepSeek-R1-Distill-Llama-70B | 93.4 | 94.5 | To reproduce these results use the following command: + +```shell +NUM_GPUS=1 # Set to 8 for 32B and 70B models +MODEL=deepseek-ai/{model_name} +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS" +OUTPUT_DIR=data/evals/$MODEL + +lighteval vllm $MODEL_ARGS "custom|math_500|0|0" \ + --custom-tasks src/open_r1/evaluate.py \ + --use-chat-template \ + --output-dir $OUTPUT_DIR +``` + +Alternatively, you can launch Slurm jobs as follows: + ```shell -sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B math_500 -sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-7B math_500 -sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-14B math_500 -sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-32B math_500 tp -sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Llama-8B math_500 -sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Llama-70B math_500 tp +python scripts/run_benchmarks.py --model-id={model_id} --benchmarks math_500 ``` +### GPQA Diamond + +We are able to reproduce Deepseek's reported results on the GPQA Diamond benchmark within ~1-3 standard deviations: + +| Model | GPQA Diamond (🤗 LightEval) | GPQA Diamond (DeepSeek Reported) | +|:------------------------------|:---------------------------:|:--------------------------------:| +| DeepSeek-R1-Distill-Qwen-1.5B | 33.3 | 33.8 | +| DeepSeek-R1-Distill-Qwen-7B | 48.4 | 49.1 | +| DeepSeek-R1-Distill-Qwen-14B | 55.6 | 59.1 | +| DeepSeek-R1-Distill-Qwen-32B | 58.6 | 62.1 | +| DeepSeek-R1-Distill-Llama-8B | 51.0 | 49.0 | +| DeepSeek-R1-Distill-Llama-70B | 65.2 | 65.2 | + +To reproduce these results use the following command: + +```shell +NUM_GPUS=1 # Set to 8 for 32B and 70B models +MODEL=deepseek-ai/{model_name} +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS" +OUTPUT_DIR=data/evals/$MODEL + +lighteval vllm $MODEL_ARGS "custom|gpqa:diamond|0|0" \ + --custom-tasks src/open_r1/evaluate.py \ + --use-chat-template \ + --output-dir $OUTPUT_DIR +``` +```shell +python scripts/run_benchmarks.py --model-id={model_id} --benchmarks gpqa +``` ## Data generation diff --git a/logs/.gitkeep b/logs/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/setup.py b/setup.py index 7ac019182..a2dd93add 100644 --- a/setup.py +++ b/setup.py @@ -53,17 +53,17 @@ "huggingface-hub[cli]>=0.19.2,<1.0", "isort>=5.12.0", "liger_kernel==0.5.2", - "lighteval @ git+https://github.com/huggingface/lighteval.git@0e462692436e1f0575bdb4c6ef63453ad9bde7d4#egg=lighteval[math]", - "math-verify>=0.3.3", # Used for math verification in grpo + "lighteval @ git+https://github.com/huggingface/lighteval.git@86f62259f105ae164f655e0b91c92a823a742724#egg=lighteval[math]", + "math-verify==0.5.2", # Used for math verification in grpo "packaging>=23.0", "parameterized>=0.9.0", "pytest", 
"safetensors>=0.3.3", "sentencepiece>=0.1.99", - "torch>=2.5.1", + "torch==2.5.1", "transformers @ git+https://github.com/huggingface/transformers.git@main", "trl @ git+https://github.com/huggingface/trl.git@main", - "vllm>=0.7.1", + "vllm==0.7.1", "wandb>=0.19.1", ] diff --git a/slurm/eval_callback.slurm b/slurm/eval_callback.slurm deleted file mode 100644 index bec49ab96..000000000 --- a/slurm/eval_callback.slurm +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -#SBATCH --ntasks-per-node=1 -#SBATCH --gres=gpu:8 -#SBATCH --partition=hopper-prod -#SBATCH --output=./logs/evaluate/%x-%j.out -#SBATCH --err=./logs/evaluate/%x-%j.err -#SBATCH --requeue - -set -x -e -source ~/.bashrc -source openr1/bin/activate -TASK_NAME=$1 -TASKS=$2 -MODEL_ID=$3 -MODEL_REVISION=$4 -# Optional args -[ -z "$5"] && TENSOR_PARALLEL=False || TENSOR_PARALLEL=$5 -[ -z "$6"] && TRUST_REMOTE_CODE=False || TRUST_REMOTE_CODE=$6 -# $7 is reserved for system_prompt, see line 51 -NUM_GPUS=$(nvidia-smi -L | wc -l) - -# Set Whether to use tensor parallelism or data parallelism -if [ "$TENSOR_PARALLEL" = "True" ]; then - # use TP to shard model across NUM_GPUS - export VLLM_WORKER_MULTIPROC_METHOD=spawn - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" -else - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" -fi - -LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard" -MODEL_NAME=$(echo $MODEL_ID | sed 's/\//_/g') # replaces / with _ -DETAILS_REPO_ID="open-r1/details-$MODEL_NAME" -OUTPUT_DIR="eval_results/$MODEL_ID/$MODEL_REVISION/$TASK_NAME" -# We need this flag since we run this script from training jobs that use DeepSpeed and the env vars get progated which causes errors during evaluation -ACCELERATE_USE_DEEPSPEED=false -# Enable fast downloads -HF_HUB_ENABLE_HF_TRANSFER=1 - -echo "Running lighteval script ..." -echo "Eval results will be saved to $OUTPUT_DIR" -# Check if "custom" is a substring of TASKS -if [[ $TASKS == *"custom"* ]]; then - echo "Custom task detected. Running custom task evaluation script ..." - lighteval vllm $MODEL_ARGS $TASKS \ - --custom-tasks "src/open_r1/evaluate.py" \ - --use-chat-template \ - --output-dir $OUTPUT_DIR \ - --save-details \ - ${7:+--system-prompt "$7"} -else - lighteval vllm $MODEL_ARGS $TASKS \ - --use-chat-template \ - --output-dir $OUTPUT_DIR \ - --save-details \ - ${7:+--system-prompt "$7"} -fi - -OUTPUT_FILEPATHS=$(find $OUTPUT_DIR/results/ -type f \( -name "*.json" \)) -for filepath in $OUTPUT_FILEPATHS; do - echo "Uploading $filepath to Hugging Face Hub..." - filename=$(basename -- "$filepath") - huggingface-cli upload --repo-type space --private $LM_EVAL_REPO_ID $filepath $OUTPUT_DIR/$filename -done - -echo "Uploading details to Hugging Face Hub..." -DETAILS_FILEPATHS=$(find $OUTPUT_DIR/details/ -type f \( -name "*.parquet" \)) -echo "DETAILS_FILEPATHS: $DETAILS_FILEPATHS" -TIMESTAMP=$(date +"%Y-%m-%dT%H-%M-%S") -python src/open_r1/utils/upload_details.py --data_files $DETAILS_FILEPATHS --hub_repo_id $DETAILS_REPO_ID --config_name $MODEL_REVISION.$TASK_NAME.$TIMESTAMP - -echo "Cleaning up ..." -rm -rf $OUTPUT_DIR - -echo "Done!" 
\ No newline at end of file diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 0ca4a8701..47659807b 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -1,55 +1,75 @@ #!/bin/bash -#SBATCH --job-name=open-r1-evaluate -#SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 -#SBATCH --exclusive #SBATCH --gres=gpu:8 -#SBATCH --partition=hopper-prod -#SBATCH --time=01:59:00 -#SBATCH --output=./logs/evaluate/%x-%j.out -#SBATCH --err=./logs/evaluate/%x-%j.err - -# Usage: sbatch slurm/evaluate.slurm deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B aime24 +#SBATCH --partition=hopper-prod +#SBATCH --output=./logs/%x-%j.out +#SBATCH --err=./logs/%x-%j.err +#SBATCH --requeue set -x -e - source ~/.bashrc source openr1/bin/activate -module load cuda/12.1 -echo "START TIME: $(date)" -echo "PYTHON ENV: $(which python)" - +TASK_NAME=$1 +TASKS=$2 +MODEL_ID=$3 +MODEL_REVISION=$4 +# Optional args +[ -z "$5"] && TENSOR_PARALLEL=False || TENSOR_PARALLEL=$5 +[ -z "$6"] && TRUST_REMOTE_CODE=False || TRUST_REMOTE_CODE=$6 +# $7 is reserved for system_prompt, see line 51 +NUM_GPUS=$(nvidia-smi -L | wc -l) -NUM_GPUS=8 -MODEL=$1 -TASK=$2 -# Check if a third argument is passed, if it is tp then eval with tensor parallelism. Required for larger models -if [ -n "$3" ] && [ "$3" == "tp" ]; then - MODEL_ARGS="pretrained=$MODEL,dtype=float16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" +# Set Whether to use tensor parallelism or data parallelism +if [ "$TENSOR_PARALLEL" = "True" ]; then + # use TP to shard model across NUM_GPUS + export VLLM_WORKER_MULTIPROC_METHOD=spawn + MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" else - MODEL_ARGS="pretrained=$MODEL,dtype=float16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" + MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" fi -OUTPUT_DIR=data/evals/$MODEL - - -# force crashing on nccl issues like hanging broadcast -export NCCL_ASYNC_ERROR_HANDLING=1 -# export NCCL_DEBUG=INFO -# export NCCL_DEBUG_SUBSYS=COLL -# export NCCL_SOCKET_NTHREADS=1 -# export NCCL_NSOCKS_PERTHREAD=1 -# export CUDA_LAUNCH_BLOCKING=1 -# Specific configuration optimized for the Hugging Face Compute Cluster -# Be ye warned this may not work on other clusters! -module load cuda/12.1 +LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard" +MODEL_NAME=$(echo $MODEL_ID | sed 's/\//_/g') # replaces / with _ +DETAILS_REPO_ID="open-r1/details-$MODEL_NAME" +OUTPUT_DIR="eval_results/$MODEL_ID/$MODEL_REVISION/$TASK_NAME" +# We need this flag since we run this script from training jobs that use DeepSpeed and the env vars get progated which causes errors during evaluation +ACCELERATE_USE_DEEPSPEED=false +# Enable fast downloads +HF_HUB_ENABLE_HF_TRANSFER=1 -lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ - --custom-tasks src/open_r1/evaluate.py \ +echo "Running lighteval script ..." +echo "Eval results will be saved to $OUTPUT_DIR" +# Check if "custom" is a substring of TASKS +if [[ $TASKS == *"custom"* ]]; then + echo "Custom task detected. Running custom task evaluation script ..." 
+ lighteval vllm $MODEL_ARGS $TASKS \ + --custom-tasks "src/open_r1/evaluate.py" \ --use-chat-template \ - --system-prompt="Please reason step by step, and put your final answer within \boxed{}." \ + --output-dir $OUTPUT_DIR \ --save-details \ - --output-dir $OUTPUT_DIR + ${7:+--system-prompt "$7"} +else + lighteval vllm $MODEL_ARGS $TASKS \ + --use-chat-template \ + --output-dir $OUTPUT_DIR \ + --save-details \ + ${7:+--system-prompt "$7"} +fi + +OUTPUT_FILEPATHS=$(find $OUTPUT_DIR/results/ -type f \( -name "*.json" \)) +for filepath in $OUTPUT_FILEPATHS; do + echo "Uploading $filepath to Hugging Face Hub..." + filename=$(basename -- "$filepath") + huggingface-cli upload --repo-type space --private $LM_EVAL_REPO_ID $filepath $OUTPUT_DIR/$filename +done +echo "Uploading details to Hugging Face Hub..." +DETAILS_FILEPATHS=$(find $OUTPUT_DIR/details/ -type f \( -name "*.parquet" \)) +echo "DETAILS_FILEPATHS: $DETAILS_FILEPATHS" +TIMESTAMP=$(date +"%Y-%m-%dT%H-%M-%S") +python src/open_r1/utils/upload_details.py --data_files $DETAILS_FILEPATHS --hub_repo_id $DETAILS_REPO_ID --config_name $MODEL_REVISION.$TASK_NAME.$TIMESTAMP + +echo "Cleaning up ..." +rm -rf $OUTPUT_DIR -echo "END TIME: $(date)" +echo "Done!" \ No newline at end of file diff --git a/src/open_r1/evaluate.py b/src/open_r1/evaluate.py index c800a889a..0447b266e 100644 --- a/src/open_r1/evaluate.py +++ b/src/open_r1/evaluate.py @@ -14,8 +14,11 @@ """Custom evaluation tasks for LightEval.""" +import random + from lighteval.metrics.dynamic_metrics import ( ExprExtractionConfig, + IndicesExtractionConfig, LatexExtractionConfig, multilingual_extractive_match_metric, ) @@ -44,6 +47,13 @@ aggregation_function=max, ) +gpqa_metric = multilingual_extractive_match_metric( + language=Language.ENGLISH, + gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], + precision=5, +) + def prompt_fn(line, task_name: str = None): """Assumes the model is either prompted to emit \\boxed{answer} or does so automatically""" @@ -64,6 +74,23 @@ def aime_prompt_fn(line, task_name: str = None): ) +def gpqa_prompt_fn(line, task_name: str = None): + """Prompt template adapted from simple-evals: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14""" + gold_index = random.randint(0, 3) + choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]] + choices.insert(gold_index, line["Correct Answer"]) + query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. 
Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}" + query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"]) + + return Doc( + task_name=task_name, + query=query, + choices=["A", "B", "C", "D"], + gold_index=gold_index, + instruction=query, + ) + + # Define tasks aime24 = LightevalTaskConfig( name="aime24", @@ -93,11 +120,29 @@ def aime_prompt_fn(line, task_name: str = None): metric=[latex_gold_metric], version=1, ) +gpqa_diamond = LightevalTaskConfig( + name="gpqa:diamond", + suite=["custom"], + prompt_function=gpqa_prompt_fn, + hf_repo="Idavidrein/gpqa", + hf_subset="gpqa_diamond", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=32768, # needed for reasoning models like R1 + metric=[gpqa_metric], + stop_sequence=[], # no stop sequence, will use eos token + trust_dataset=True, + version=1, +) + # Add tasks to the table TASKS_TABLE = [] TASKS_TABLE.append(aime24) TASKS_TABLE.append(math_500) +TASKS_TABLE.append(gpqa_diamond) # MODULE LOGIC if __name__ == "__main__": diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index e8c1c556e..4bdc335f7 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -68,7 +68,7 @@ def accuracy_reward(completions, solution, **kwargs): malformed_operators=False, basic_latex=True, equations=True, - boxed=True, + boxed="all", units=True, ), # Ensures that boxed is tried first diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 9cbac82d4..86de906d9 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -48,6 +48,7 @@ def register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "custom", "math_500", "math_500", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "custom", "gpqa", "gpqa:diamond", 0) def get_lighteval_tasks(): @@ -74,7 +75,7 @@ def run_lighteval_job( cmd_args = [ f"--gres=gpu:{num_gpus}", f"--job-name=or1_{benchmark}_{model_name.split('/')[-1]}_{model_revision}", - "slurm/eval_callback.slurm", + "slurm/evaluate.slurm", benchmark, f'"{task_list}"', model_name, From 3fbdeac96cb1ac8c9aa46bf5b005e35f8327d2af Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 6 Feb 2025 15:46:33 +0100 Subject: [PATCH 014/137] Fix slurm eval (#208) --- slurm/evaluate.slurm | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 47659807b..0a9354f54 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -9,6 +9,11 @@ set -x -e source ~/.bashrc source openr1/bin/activate + +# Specific configuration optimized for the Hugging Face Compute Cluster +# Be ye warned this may not work on other clusters! 
+module load cuda/12.1 + TASK_NAME=$1 TASKS=$2 MODEL_ID=$3 From a60b175aeb94abfa3e35506221b6cca5e360429e Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 6 Feb 2025 16:31:13 +0100 Subject: [PATCH 015/137] Update CUDA (#209) * Update CUDA * Fix * Remove module * Restore CUDA * Move cuda import --- README.md | 6 +----- slurm/evaluate.slurm | 9 +++++---- slurm/generate.slurm | 2 +- slurm/grpo.slurm | 2 +- slurm/sft.slurm | 2 +- 5 files changed, 9 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 0faffa2ec..303a371ae 100644 --- a/README.md +++ b/README.md @@ -56,11 +56,7 @@ uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --u Next, install vLLM: ```shell -uv pip install vllm==0.7.1 - -# For CUDA 12.1 -uv pip install vllm==0.7.1 --extra-index-url https://download.pytorch.org/whl/cu121 --index-strategy unsafe-best-match --link-mode=copy -export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0] + '/nvidia/nvjitlink/lib')"):$LD_LIBRARY_PATH +uv pip install vllm==0.7.1 --link-mode=copy ``` This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend: diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 0a9354f54..4883930ba 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -6,14 +6,15 @@ #SBATCH --err=./logs/%x-%j.err #SBATCH --requeue +# Specific configuration optimized for the Hugging Face Compute Cluster +# Be ye warned this may not work on other clusters! +module load cuda/12.4 + set -x -e + source ~/.bashrc source openr1/bin/activate -# Specific configuration optimized for the Hugging Face Compute Cluster -# Be ye warned this may not work on other clusters! -module load cuda/12.1 - TASK_NAME=$1 TASKS=$2 MODEL_ID=$3 diff --git a/slurm/generate.slurm b/slurm/generate.slurm index 9cc9b1cce..2fdf61795 100644 --- a/slurm/generate.slurm +++ b/slurm/generate.slurm @@ -122,7 +122,7 @@ echo "-------------------" set -ex -module load cuda/12.1 +module load cuda/12.4 export LD_LIBRARY_PATH=.venv/lib/python3.11/site-packages/nvidia/nvjitlink/lib diff --git a/slurm/grpo.slurm b/slurm/grpo.slurm index 8a3d63a32..96440491b 100644 --- a/slurm/grpo.slurm +++ b/slurm/grpo.slurm @@ -75,7 +75,7 @@ export NCCL_ASYNC_ERROR_HANDLING=1 # Specific configuration optimized for the Hugging Face Compute Cluster # Be ye warned this may not work on other clusters! -module load cuda/12.1 +module load cuda/12.4 # srun error handling: # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks diff --git a/slurm/sft.slurm b/slurm/sft.slurm index 31528cdcb..7815dcf37 100644 --- a/slurm/sft.slurm +++ b/slurm/sft.slurm @@ -72,7 +72,7 @@ export NCCL_ASYNC_ERROR_HANDLING=1 # Specific configuration optimized for the Hugging Face Compute Cluster # Be ye warned this may not work on other clusters! 
-module load cuda/12.1 +module load cuda/12.4 # srun error handling: # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks From c4227d62206dff607ebccf66a25252d76d773d5e Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 6 Feb 2025 16:40:09 +0100 Subject: [PATCH 016/137] Update README.md (#211) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 303a371ae..127314124 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ We will use the DeepSeek-R1 [tech report](https://github.com/deepseek-ai/DeepSee ## Installation -**Note: Libraries rely on CUDA 12.1. Double check your system if you get segmentation faults.** +**Note: Libraries rely on CUDA 12.4. Double check your system if you get segmentation faults.** To run the code in this project, first, create a Python virtual environment using e.g. `uv`. To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/getting-started/installation/). From e8c2673a155a98660d5206ff6e2e066335b89d1b Mon Sep 17 00:00:00 2001 From: Almaz Zinollayev <39913951+zeenolife@users.noreply.github.com> Date: Thu, 6 Feb 2025 19:10:05 +0000 Subject: [PATCH 017/137] Refactoring reward functions. Adding step by step reasoning reward. Adding test coverage for reward functions (#144) * Refactoring reward functions. Adding step by step reasoning reward. Adding test coverage for reward functions * [Refactoring reward functions] - Ruff error fix * [Refactoring reward functions] - Linting error fix --- src/open_r1/grpo.py | 63 +++-------------------------- src/open_r1/rewards.py | 80 +++++++++++++++++++++++++++++++++++++ tests/__init__.py | 0 tests/test_rewards.py | 91 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 177 insertions(+), 57 deletions(-) create mode 100644 src/open_r1/rewards.py create mode 100644 tests/__init__.py create mode 100644 tests/test_rewards.py diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 4bdc335f7..8394a43fe 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -14,7 +14,6 @@ import logging import os -import re import sys from dataclasses import dataclass, field @@ -25,9 +24,8 @@ from transformers import set_seed from transformers.trainer_utils import get_last_checkpoint -from latex2sympy2_extended import NormalizationConfig -from math_verify import LatexExtractionConfig, parse, verify from open_r1.configs import GRPOConfig +from open_r1.rewards import REWARD_FUNCS_REGISTRY from open_r1.utils.callbacks import get_callbacks from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config @@ -42,66 +40,17 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format'. + List of reward functions. Possible values are dynamically populated from REWARD_FUNCS_REGISTRY. """ reward_funcs: list[str] = field( default_factory=lambda: ["accuracy", "format"], - metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"}, + metadata={ + "help": f"List of reward functions. 
Possible values: {', '.join(REWARD_FUNCS_REGISTRY.keys())}" + }, ) -def accuracy_reward(completions, solution, **kwargs): - """Reward function that checks if the completion is the same as the ground truth.""" - contents = [completion[0]["content"] for completion in completions] - rewards = [] - for content, sol in zip(contents, solution): - gold_parsed = parse(sol, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()]) - if len(gold_parsed) != 0: - # We require the answer to be provided in correct latex (no malformed operators) - answer_parsed = parse( - content, - extraction_config=[ - LatexExtractionConfig( - normalization_config=NormalizationConfig( - nits=False, - malformed_operators=False, - basic_latex=True, - equations=True, - boxed="all", - units=True, - ), - # Ensures that boxed is tried first - boxed_match_priority=0, - try_extract_without_anchor=False, - ) - ], - extraction_mode="first_match", - ) - # Reward 1 if the content is the same as the ground truth, 0 otherwise - reward = float(verify(answer_parsed, gold_parsed)) - else: - # If the gold solution is not parseable, we reward 1 to skip this example - reward = 1.0 - print("Failed to parse gold solution: ", sol) - rewards.append(reward) - - return rewards - - -def format_reward(completions, **kwargs): - """Reward function that checks if the completion has a specific format.""" - pattern = r"^.*?.*?$" - completion_contents = [completion[0]["content"] for completion in completions] - matches = [re.match(pattern, content) for content in completion_contents] - return [1.0 if match else 0.0 for match in matches] - - -reward_funcs_registry = { - "accuracy": accuracy_reward, - "format": format_reward, -} - SYSTEM_PROMPT = ( "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant " "first thinks about the reasoning process in the mind and then provides the user with the answer. 
The reasoning " @@ -149,7 +98,7 @@ def main(script_args, training_args, model_args): dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) # Get reward functions - reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs] + reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] # Format into conversation def make_conversation(example): diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py new file mode 100644 index 000000000..9362d6eb5 --- /dev/null +++ b/src/open_r1/rewards.py @@ -0,0 +1,80 @@ +"""Reward functions for GRPO training.""" + +import re + +from latex2sympy2_extended import NormalizationConfig +from math_verify import LatexExtractionConfig, parse, verify + + +def accuracy_reward(completions, solution, **kwargs): + """Reward function that checks if the completion is the same as the ground truth.""" + contents = [completion[0]["content"] for completion in completions] + rewards = [] + for content, sol in zip(contents, solution): + gold_parsed = parse( + sol, + extraction_mode="first_match", + extraction_config=[LatexExtractionConfig()], + ) + if len(gold_parsed) != 0: + # We require the answer to be provided in correct latex (no malformed operators) + answer_parsed = parse( + content, + extraction_config=[ + LatexExtractionConfig( + normalization_config=NormalizationConfig( + nits=False, + malformed_operators=False, + basic_latex=True, + equations=True, + boxed="all", + units=True, + ), + # Ensures that boxed is tried first + boxed_match_priority=0, + try_extract_without_anchor=False, + ) + ], + extraction_mode="first_match", + ) + # Reward 1 if the content is the same as the ground truth, 0 otherwise + reward = float(verify(answer_parsed, gold_parsed)) + else: + # If the gold solution is not parseable, we reward 1 to skip this example + reward = 1.0 + print("Failed to parse gold solution: ", sol) + rewards.append(reward) + + return rewards + + +def format_reward(completions, **kwargs): + """Reward function that checks if the completion has a specific format.""" + pattern = r"^.*?.*?$" + completion_contents = [completion[0]["content"] for completion in completions] + matches = [re.match(pattern, content) for content in completion_contents] + return [1.0 if match else 0.0 for match in matches] + + +def reasoning_steps_reward(completions, **kwargs): + """Reward function that checks for clear step-by-step reasoning. + Regex pattern: + Step \d+: - matches "Step 1:", "Step 2:", etc. + ^\d+\. - matches numbered lists like "1.", "2.", etc. 
at start of line + \n- - matches bullet points with hyphens + \n\* - matches bullet points with asterisks + First,|Second,|Next,|Finally, - matches transition words + """ + pattern = r"(Step \d+:|^\d+\.|\n-|\n\*|First,|Second,|Next,|Finally,)" + completion_contents = [completion[0]["content"] for completion in completions] + matches = [len(re.findall(pattern, content)) for content in completion_contents] + + # Magic nubmer 3 to encourage 3 steps and more, otherwise partial reward + return [min(1.0, count / 3) for count in matches] + + +REWARD_FUNCS_REGISTRY = { + "accuracy": accuracy_reward, + "format": format_reward, + "reasoning_steps": reasoning_steps_reward, +} diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_rewards.py b/tests/test_rewards.py new file mode 100644 index 000000000..a9fb95915 --- /dev/null +++ b/tests/test_rewards.py @@ -0,0 +1,91 @@ +import unittest +from open_r1.rewards import accuracy_reward, format_reward, reasoning_steps_reward + + +class TestRewards(unittest.TestCase): + def test_accuracy_reward_correct_answer(self): + """Test accuracy_reward with a correct answer.""" + completion = [[{"content": r"\boxed{\frac{63}{400}}"}]] + solution = [r"\frac{63}{400}"] + + rewards = accuracy_reward(completion, solution) + self.assertEqual(rewards[0], 1.0) + + def test_accuracy_reward_wrong_answer(self): + """Test accuracy_reward with an incorrect answer.""" + completion = [[{"content": r"\boxed{\frac{64}{400}}"}]] + solution = [r"\frac{63}{400}"] + + rewards = accuracy_reward(completion, solution) + self.assertEqual(rewards[0], 0.0) + + def test_format_reward_correct(self): + """Test format_reward with correct format.""" + completion = [[{"content": "Some reasoningThe answer"}]] + rewards = format_reward(completion) + self.assertEqual(rewards[0], 1.0) + + def test_format_reward_incorrect(self): + """Test format_reward with incorrect format.""" + incorrect_formats = [ + "Only thinking", + "Only answer", + "No tags at all", + "Missing closingMissing closing", + "Wrong orderWrong order" + ] + + for fmt in incorrect_formats: + completion = [[{"content": fmt}]] + rewards = format_reward(completion) + self.assertEqual(rewards[0], 0.0) + + def test_reasoning_steps_reward(self): + """Test reasoning_steps_reward with various formats.""" + test_cases = [ + # Full credit cases (3 or more steps) + ( + "Step 1: First step\nStep 2: Second step\nStep 3: Third step", + 1.0 + ), + ( + "First, we do this.\nSecond, we do that.\nFinally, we conclude.", + 1.0 + ), + # Partial credit cases (less than 3 steps) + ( + "Step 1: Only step", + 1/3 + ), + ( + "First, we do this.\nFinally, we conclude.", + 2/3 + ), + # No credit case + ( + "Just plain text without any clear steps", + 0.0 + ) + ] + + for content, expected_reward in test_cases: + completion = [[{"content": content}]] + rewards = reasoning_steps_reward(completion) + self.assertAlmostEqual(rewards[0], expected_reward) + + def test_multiple_completions(self): + """Test handling multiple completions at once.""" + completions = [ + [{"content": r"\boxed{\frac{63}{400}}"}], + [{"content": r"\boxed{\frac{64}{400}}"}] + ] + solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] + + rewards = accuracy_reward(completions, solutions) + self.assertEqual(len(rewards), 2) + self.assertEqual(rewards[0], 1.0) + self.assertEqual(rewards[1], 0.0) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 250ab46ea165415226c20e2da9bbaa7af34edc63 Mon Sep 17 00:00:00 
2001 From: Kashif Rasul Date: Fri, 7 Feb 2025 08:10:48 +0100 Subject: [PATCH 018/137] [GRPO] add cosine reward (#206) * add cosine reward * fix merge * fix typo * fix check --- Makefile | 2 +- setup.py | 4 +++ src/open_r1/grpo.py | 52 +++++++++++++++++++++++++--- src/open_r1/rewards.py | 78 +++++++++++++++++++++++++++++++++++++++--- tests/test_rewards.py | 77 +++++++++++++++++++++++------------------ 5 files changed, 170 insertions(+), 43 deletions(-) diff --git a/Makefile b/Makefile index 17e2ddc11..ec757927e 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) export PYTHONPATH = src -check_dirs := src +check_dirs := src tests style: ruff format --line-length 119 --target-version py310 $(check_dirs) setup.py diff --git a/setup.py b/setup.py index a2dd93add..018eb9335 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,8 @@ "hf_transfer>=0.1.4", "huggingface-hub[cli]>=0.19.2,<1.0", "isort>=5.12.0", + "latex2sympy2_extended>=1.0.6", + "math-verify>=0.5.2", "liger_kernel==0.5.2", "lighteval @ git+https://github.com/huggingface/lighteval.git@86f62259f105ae164f655e0b91c92a823a742724#egg=lighteval[math]", "math-verify==0.5.2", # Used for math verification in grpo @@ -96,6 +98,8 @@ def deps_list(*pkgs): deps["deepspeed"], deps["hf_transfer"], deps["huggingface-hub"], + deps["latex2sympy2_extended"], + deps["math-verify"], deps["liger_kernel"], deps["packaging"], # utilities from PyPA to e.g., compare versions deps["safetensors"], diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 8394a43fe..3572a97e8 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -16,6 +16,7 @@ import os import sys from dataclasses import dataclass, field +from functools import partial import datasets import torch @@ -25,7 +26,7 @@ from transformers.trainer_utils import get_last_checkpoint from open_r1.configs import GRPOConfig -from open_r1.rewards import REWARD_FUNCS_REGISTRY +from open_r1.rewards import accuracy_reward, cosine_scaled_reward, format_reward, reasoning_steps_reward from open_r1.utils.callbacks import get_callbacks from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config @@ -40,15 +41,45 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values are dynamically populated from REWARD_FUNCS_REGISTRY. + List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine'. + cosine_min_value_wrong (`float`): + Minimum reward for cosine scaling for wrong answers. + cosine_max_value_wrong (`float`): + Maximum reward for cosine scaling for wrong answers. + cosine_min_value_correct (`float`): + Minimum reward for cosine scaling for correct answers. + cosine_max_value_correct (`float`): + Maximum reward for cosine scaling for correct answers. + cosine_max_len (`int`): + Maximum length for cosine scaling. """ reward_funcs: list[str] = field( - default_factory=lambda: ["accuracy", "format"], + default_factory=lambda: ["accuracy", "format", "reasoning_steps", "cosine"], metadata={ - "help": f"List of reward functions. Possible values: {', '.join(REWARD_FUNCS_REGISTRY.keys())}" + "help": "List of reward functions. 
Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine'" }, ) + cosine_min_value_wrong: float = field( + default=0.0, + metadata={"help": "Minimum reward for wrong answers"}, + ) + cosine_max_value_wrong: float = field( + default=-0.5, + metadata={"help": "Maximum reward for wrong answers"}, + ) + cosine_min_value_correct: float = field( + default=0.5, + metadata={"help": "Minimum reward for correct answers"}, + ) + cosine_max_value_correct: float = field( + default=1.0, + metadata={"help": "Maximum reward for correct answers"}, + ) + cosine_max_len: int = field( + default=1000, + metadata={"help": "Maximum length for scaling"}, + ) SYSTEM_PROMPT = ( @@ -98,6 +129,19 @@ def main(script_args, training_args, model_args): dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) # Get reward functions + REWARD_FUNCS_REGISTRY = { + "accuracy": accuracy_reward, + "format": format_reward, + "reasoning_steps": reasoning_steps_reward, + "cosine": partial( + cosine_scaled_reward, + min_value_wrong=script_args.cosine_min_value_wrong, + max_value_wrong=script_args.cosine_max_value_wrong, + min_value_correct=script_args.cosine_min_value_correct, + max_value_correct=script_args.cosine_max_value_correct, + max_len=script_args.cosine_max_len, + ), + } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] # Format into conversation diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 9362d6eb5..2aac7a563 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -1,5 +1,6 @@ """Reward functions for GRPO training.""" +import math import re from latex2sympy2_extended import NormalizationConfig @@ -73,8 +74,75 @@ def reasoning_steps_reward(completions, **kwargs): return [min(1.0, count / 3) for count in matches] -REWARD_FUNCS_REGISTRY = { - "accuracy": accuracy_reward, - "format": format_reward, - "reasoning_steps": reasoning_steps_reward, -} +def cosine_scaled_reward( + completions, + solution, + min_value_wrong: float = -1.0, + max_value_wrong: float = -0.5, + min_value_correct: float = 0.5, + max_value_correct: float = 1.0, + max_len: int = 1000, + **kwargs, +): + """Reward function that scales based on completion length using a cosine schedule. + + Shorter correct solutions are rewarded more than longer ones. + Longer incorrect solutions are penalized less than shorter ones. 
+ + Args: + completions: List of model completions + solution: List of ground truth solutions + min_value_wrong: Minimum reward for wrong answers + max_value_wrong: Maximum reward for wrong answers + min_value_correct: Minimum reward for correct answers + max_value_correct: Maximum reward for correct answers + max_len: Maximum length for scaling + """ + contents = [completion[0]["content"] for completion in completions] + rewards = [] + + for content, sol in zip(contents, solution): + gold_parsed = parse(sol, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()]) + if len(gold_parsed) == 0: + rewards.append(1.0) # Skip unparseable examples + print("Failed to parse gold solution: ", sol) + continue + + answer_parsed = parse( + content, + extraction_config=[ + LatexExtractionConfig( + normalization_config=NormalizationConfig( + nits=False, + malformed_operators=False, + basic_latex=True, + equations=True, + boxed=True, + units=True, + ), + boxed_match_priority=0, + try_extract_without_anchor=False, + ) + ], + extraction_mode="first_match", + ) + + is_correct = verify(answer_parsed, gold_parsed) + gen_len = len(content) + + # Apply cosine scaling based on length + progress = gen_len / max_len + cosine = math.cos(progress * math.pi) + + if is_correct: + min_value = min_value_correct + max_value = max_value_correct + else: + # Swap min/max for incorrect answers + min_value = max_value_wrong + max_value = min_value_wrong + + reward = min_value + 0.5 * (max_value - min_value) * (1.0 + cosine) + rewards.append(float(reward)) + + return rewards diff --git a/tests/test_rewards.py b/tests/test_rewards.py index a9fb95915..eb9569f02 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -1,5 +1,6 @@ import unittest -from open_r1.rewards import accuracy_reward, format_reward, reasoning_steps_reward + +from open_r1.rewards import accuracy_reward, cosine_scaled_reward, format_reward, reasoning_steps_reward class TestRewards(unittest.TestCase): @@ -7,7 +8,7 @@ def test_accuracy_reward_correct_answer(self): """Test accuracy_reward with a correct answer.""" completion = [[{"content": r"\boxed{\frac{63}{400}}"}]] solution = [r"\frac{63}{400}"] - + rewards = accuracy_reward(completion, solution) self.assertEqual(rewards[0], 1.0) @@ -15,7 +16,7 @@ def test_accuracy_reward_wrong_answer(self): """Test accuracy_reward with an incorrect answer.""" completion = [[{"content": r"\boxed{\frac{64}{400}}"}]] solution = [r"\frac{63}{400}"] - + rewards = accuracy_reward(completion, solution) self.assertEqual(rewards[0], 0.0) @@ -32,9 +33,9 @@ def test_format_reward_incorrect(self): "Only answer", "No tags at all", "Missing closingMissing closing", - "Wrong orderWrong order" + "Wrong orderWrong order", ] - + for fmt in incorrect_formats: completion = [[{"content": fmt}]] rewards = format_reward(completion) @@ -44,30 +45,15 @@ def test_reasoning_steps_reward(self): """Test reasoning_steps_reward with various formats.""" test_cases = [ # Full credit cases (3 or more steps) - ( - "Step 1: First step\nStep 2: Second step\nStep 3: Third step", - 1.0 - ), - ( - "First, we do this.\nSecond, we do that.\nFinally, we conclude.", - 1.0 - ), + ("Step 1: First step\nStep 2: Second step\nStep 3: Third step", 1.0), + ("First, we do this.\nSecond, we do that.\nFinally, we conclude.", 1.0), # Partial credit cases (less than 3 steps) - ( - "Step 1: Only step", - 1/3 - ), - ( - "First, we do this.\nFinally, we conclude.", - 2/3 - ), + ("Step 1: Only step", 1 / 3), + ("First, we do this.\nFinally, we 
conclude.", 2 / 3), # No credit case - ( - "Just plain text without any clear steps", - 0.0 - ) + ("Just plain text without any clear steps", 0.0), ] - + for content, expected_reward in test_cases: completion = [[{"content": content}]] rewards = reasoning_steps_reward(completion) @@ -75,17 +61,42 @@ def test_reasoning_steps_reward(self): def test_multiple_completions(self): """Test handling multiple completions at once.""" - completions = [ - [{"content": r"\boxed{\frac{63}{400}}"}], - [{"content": r"\boxed{\frac{64}{400}}"}] - ] + completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]] solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] - + rewards = accuracy_reward(completions, solutions) self.assertEqual(len(rewards), 2) self.assertEqual(rewards[0], 1.0) self.assertEqual(rewards[1], 0.0) + def test_cosine_scaled_reward(self): + """Test cosine_scaled_reward with various cases.""" + # Test parameters + test_params = { + "min_value_wrong": -1.0, + "max_value_wrong": -0.5, + "min_value_correct": 0.5, + "max_value_correct": 1.0, + "max_len": 100, + } + + test_cases = [ + # Correct answers with different lengths + (r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 20, 0.943), # Short correct answer + (r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 80, 0.547), # Long correct answer + # Wrong answers with different lengths + (r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 20, -0.942), # Short wrong answer + (r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 80, -0.547), # Long wrong answer + ] + + for content, solution, content_len, expected_reward in test_cases: + # Pad content to desired length + padded_content = content + " " * (content_len - len(content)) + completion = [[{"content": padded_content}]] + + rewards = cosine_scaled_reward(completion, [solution], **test_params) + self.assertAlmostEqual(rewards[0], expected_reward, places=2) + -if __name__ == '__main__': - unittest.main() \ No newline at end of file +if __name__ == "__main__": + unittest.main() From dba152a494ceeb5fd0437f805bb135b6702e4a35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Fri, 7 Feb 2025 14:34:46 +0100 Subject: [PATCH 019/137] fix config name (#222) --- README.md | 2 +- .../grpo/{confg_full.yaml => config_full.yaml} | 2 +- recipes/qwen/README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/{confg_full.yaml => config_full.yaml} (96%) diff --git a/README.md b/README.md index 127314124..22a3eb794 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ Here `{model}` and `{dataset}` refer to the model and dataset IDs on the Hugging To train via the GRPO trainer, we use one GPU to run vLLM for faster generation and the remaining GPUs for training. 
For example, one a node with 8 GPUs, use the `recipes/accelerate_configs/zero3.yaml` config and then overwrite `num_processes` to run on 7 devices: ```shell -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml --num_processes=7 src/open_r1/grpo.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml --num_processes=7 src/open_r1/grpo.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml ``` We provide a minimal reproducible experiment using GRPO for mathematical reasoning, referencing the approach from [SimpleRL-Reason](https://hkust-nlp.notion.site/simplerl-reason) which uses a 7B model trained on 8K examples. Running this on 8 H100 80G GPU takes about 3 hours: diff --git a/recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml b/recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml similarity index 96% rename from recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml rename to recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml index 3624c95ac..597de18b8 100644 --- a/recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml +++ b/recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml @@ -35,7 +35,7 @@ max_steps: -1 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_eval_batch_size: 4 +per_device_eval_batch_size: 4 per_device_train_batch_size: 1 push_to_hub: true report_to: diff --git a/recipes/qwen/README.md b/recipes/qwen/README.md index b7490c008..d88d6cfb7 100644 --- a/recipes/qwen/README.md +++ b/recipes/qwen/README.md @@ -20,5 +20,5 @@ You can find the configuration files for different model sizes in this folder an ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml # GRPO -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/grpo.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/confg_full.yaml +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/grpo.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml ``` From dd915f8483f79ccd40f65588c778c7f126bf021c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Fri, 7 Feb 2025 15:21:40 +0100 Subject: [PATCH 020/137] Fix `cosine_scaled_reward` compatibility with GRPO (#229) * Drop partial * Update src/open_r1/grpo.py * style --- src/open_r1/grpo.py | 6 +- src/open_r1/rewards.py | 122 +++++++++++++++++++++-------------------- tests/test_rewards.py | 4 +- 3 files changed, 66 insertions(+), 66 deletions(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 3572a97e8..55a8a849c 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -16,7 +16,6 @@ import os import sys from dataclasses import dataclass, field -from functools import partial import datasets import torch @@ -26,7 +25,7 @@ from transformers.trainer_utils import get_last_checkpoint from open_r1.configs import GRPOConfig -from open_r1.rewards import accuracy_reward, cosine_scaled_reward, format_reward, reasoning_steps_reward +from open_r1.rewards import accuracy_reward, format_reward, get_cosine_scaled_reward, reasoning_steps_reward from open_r1.utils.callbacks import get_callbacks from trl import GRPOTrainer, 
ModelConfig, ScriptArguments, TrlParser, get_peft_config @@ -133,8 +132,7 @@ def main(script_args, training_args, model_args): "accuracy": accuracy_reward, "format": format_reward, "reasoning_steps": reasoning_steps_reward, - "cosine": partial( - cosine_scaled_reward, + "cosine": get_cosine_scaled_reward( min_value_wrong=script_args.cosine_min_value_wrong, max_value_wrong=script_args.cosine_max_value_wrong, min_value_correct=script_args.cosine_min_value_correct, diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 2aac7a563..f92da19f9 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -74,75 +74,77 @@ def reasoning_steps_reward(completions, **kwargs): return [min(1.0, count / 3) for count in matches] -def cosine_scaled_reward( - completions, - solution, +def get_cosine_scaled_reward( min_value_wrong: float = -1.0, max_value_wrong: float = -0.5, min_value_correct: float = 0.5, max_value_correct: float = 1.0, max_len: int = 1000, - **kwargs, ): - """Reward function that scales based on completion length using a cosine schedule. - - Shorter correct solutions are rewarded more than longer ones. - Longer incorrect solutions are penalized less than shorter ones. - - Args: - completions: List of model completions - solution: List of ground truth solutions - min_value_wrong: Minimum reward for wrong answers - max_value_wrong: Maximum reward for wrong answers - min_value_correct: Minimum reward for correct answers - max_value_correct: Maximum reward for correct answers - max_len: Maximum length for scaling - """ - contents = [completion[0]["content"] for completion in completions] - rewards = [] + def cosine_scaled_reward(completions, solution, **kwargs): + """Reward function that scales based on completion length using a cosine schedule. + + Shorter correct solutions are rewarded more than longer ones. + Longer incorrect solutions are penalized less than shorter ones. 
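        Concretely, the reward computed below is
        min_value + 0.5 * (max_value - min_value) * (1 + cos(pi * len(content) / max_len)),
        with min/max swapped for incorrect answers so that short wrong answers are penalized hardest.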
+ + Args: + completions: List of model completions + solution: List of ground truth solutions + + This function is parameterized by the following arguments: + min_value_wrong: Minimum reward for wrong answers + max_value_wrong: Maximum reward for wrong answers + min_value_correct: Minimum reward for correct answers + max_value_correct: Maximum reward for correct answers + max_len: Maximum length for scaling + """ + contents = [completion[0]["content"] for completion in completions] + rewards = [] + + for content, sol in zip(contents, solution): + gold_parsed = parse(sol, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()]) + if len(gold_parsed) == 0: + rewards.append(1.0) # Skip unparseable examples + print("Failed to parse gold solution: ", sol) + continue - for content, sol in zip(contents, solution): - gold_parsed = parse(sol, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()]) - if len(gold_parsed) == 0: - rewards.append(1.0) # Skip unparseable examples - print("Failed to parse gold solution: ", sol) - continue - - answer_parsed = parse( - content, - extraction_config=[ - LatexExtractionConfig( - normalization_config=NormalizationConfig( - nits=False, - malformed_operators=False, - basic_latex=True, - equations=True, - boxed=True, - units=True, - ), - boxed_match_priority=0, - try_extract_without_anchor=False, - ) - ], - extraction_mode="first_match", - ) + answer_parsed = parse( + content, + extraction_config=[ + LatexExtractionConfig( + normalization_config=NormalizationConfig( + nits=False, + malformed_operators=False, + basic_latex=True, + equations=True, + boxed=True, + units=True, + ), + boxed_match_priority=0, + try_extract_without_anchor=False, + ) + ], + extraction_mode="first_match", + ) - is_correct = verify(answer_parsed, gold_parsed) - gen_len = len(content) + is_correct = verify(answer_parsed, gold_parsed) + gen_len = len(content) - # Apply cosine scaling based on length - progress = gen_len / max_len - cosine = math.cos(progress * math.pi) + # Apply cosine scaling based on length + progress = gen_len / max_len + cosine = math.cos(progress * math.pi) - if is_correct: - min_value = min_value_correct - max_value = max_value_correct - else: - # Swap min/max for incorrect answers - min_value = max_value_wrong - max_value = min_value_wrong + if is_correct: + min_value = min_value_correct + max_value = max_value_correct + else: + # Swap min/max for incorrect answers + min_value = max_value_wrong + max_value = min_value_wrong - reward = min_value + 0.5 * (max_value - min_value) * (1.0 + cosine) - rewards.append(float(reward)) + reward = min_value + 0.5 * (max_value - min_value) * (1.0 + cosine) + rewards.append(float(reward)) - return rewards + return rewards + + return cosine_scaled_reward diff --git a/tests/test_rewards.py b/tests/test_rewards.py index eb9569f02..0ff8a106c 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -1,6 +1,6 @@ import unittest -from open_r1.rewards import accuracy_reward, cosine_scaled_reward, format_reward, reasoning_steps_reward +from open_r1.rewards import accuracy_reward, format_reward, get_cosine_scaled_reward, reasoning_steps_reward class TestRewards(unittest.TestCase): @@ -94,7 +94,7 @@ def test_cosine_scaled_reward(self): padded_content = content + " " * (content_len - len(content)) completion = [[{"content": padded_content}]] - rewards = cosine_scaled_reward(completion, [solution], **test_params) + rewards = get_cosine_scaled_reward(**test_params)(completion, [solution]) 
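            # Note: padding is a no-op when content_len is smaller than len(content);
            # r"\boxed{\frac{63}{400}}" is 22 characters, so the "length 20" cases are scored
            # at gen_len=22, e.g. 0.5 + 0.25 * (1 + cos(0.22 * pi)) ≈ 0.943 for the short correct case.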
self.assertAlmostEqual(rewards[0], expected_reward, places=2) From 0da0f7cce21a46ff88d55a6715a9421de1ebff9d Mon Sep 17 00:00:00 2001 From: lewtun Date: Fri, 7 Feb 2025 15:56:43 +0100 Subject: [PATCH 021/137] Refactor training configs and unify Slurm for training SFT & GRPO (#231) * Refactor Slurm * Fix * FML * Nuke * Clean * Fix config * Fix deps * Fix logging --- README.md | 79 ++++++++++++---- .../grpo/config_demo.yaml} | 8 +- .../grpo/config_demo.yaml} | 8 +- .../sft/config_demo.yaml} | 6 +- .../grpo/config_simple_rl.yaml} | 14 +-- recipes/qwen/README.md | 24 ----- setup.py | 6 +- slurm/grpo.slurm | 90 ------------------- slurm/{sft.slurm => train.slurm} | 60 +++++++------ src/open_r1/sft.py | 6 +- 10 files changed, 122 insertions(+), 179 deletions(-) rename recipes/{deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_full.yaml => DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml} (88%) rename recipes/{qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml => Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml} (87%) rename recipes/{qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml => Qwen2.5-1.5B-Instruct/sft/config_demo.yaml} (87%) rename recipes/{deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml => Qwen2.5-Math-7B/grpo/config_simple_rl.yaml} (77%) delete mode 100644 recipes/qwen/README.md delete mode 100644 slurm/grpo.slurm rename slurm/{sft.slurm => train.slurm} (61%) diff --git a/README.md b/README.md index 22a3eb794..ac23f0ef0 100644 --- a/README.md +++ b/README.md @@ -86,50 +86,97 @@ sudo apt-get install git-lfs ## Training models -We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). To switch between methods, simply change the path to the `accelerate` YAML config in `configs`. +We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). For example, to run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k), run: -> [!NOTE] -> The training commands below are configured for a node of 8 x H100s (80GB). For different hardware and topologies, you may need to tune the batch size and number of gradient accumulation steps. +```shell +# Train via command line +accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ + --model_name_or_path Qwen/Qwen2.5-1.5B-Instruct \ + --dataset_name HuggingFaceH4/Bespoke-Stratos-17k \ + --learning_rate 2.0e-5 \ + --num_train_epochs 1 \ + --packing \ + --max_seq_length 4096 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 8 \ + --gradient_checkpointing \ + --bf16 \ + --output_dir data/Qwen2.5-1.5B-Open-R1-Distill + +# Train via YAML config +accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/openr1/sft.py \ + recipes/Qwen/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +``` -### SFT +Currently, the following tasks are supported: -To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k), run: +* Supervised Fine-Tuning `sft` +* Group Relative Policy Optimization `grpo` + +> [!TIP] +> If you scale up/down the number of GPUs, we recommend also scaling up the per-device batch size or number of gradient accumulation steps to keep the global batch size constant. + +By default, these scripts will push each model to your Hugging Face Hub username, i.e. `{username}/{model_name}-{task}`. 
You can override the parameters in each YAML config by appending them to the command as follows: ```shell -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml +# Change batch size, number of epochs etc +accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/openr1/sft.py \ + recipes/Qwen/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml + --per_device_train_batch_size=1 --num_train_epochs=5 ``` -To launch a Slurm job, run: +> [!NOTE] +> The training commands below are configured for a node of 8 x H100s (80GB). For different hardware and topologies, you may need to tune the batch size and number of gradient accumulation steps. + +### SFT + +To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k), run: ```shell -sbatch --output=/path/to/logs/%x-%j.out --err=/path/to/logs/%x-%j.err slurm/sft.slurm {model} {dataset} {accelerator} +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ + src/open_r1/sft.py \ + --config recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml ``` -Here `{model}` and `{dataset}` refer to the model and dataset IDs on the Hugging Face Hub, while `{accelerator}` refers to the choice of an 🤗 Accelerate config file in configs. - ### GRPO To train via the GRPO trainer, we use one GPU to run vLLM for faster generation and the remaining GPUs for training. For example, one a node with 8 GPUs, use the `recipes/accelerate_configs/zero3.yaml` config and then overwrite `num_processes` to run on 7 devices: ```shell -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml --num_processes=7 src/open_r1/grpo.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ + --num_processes=7 src/open_r1/grpo.py \ + --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml ``` We provide a minimal reproducible experiment using GRPO for mathematical reasoning, referencing the approach from [SimpleRL-Reason](https://hkust-nlp.notion.site/simplerl-reason) which uses a 7B model trained on 8K examples. Running this on 8 H100 80G GPU takes about 3 hours: ```shell -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes=7 src/open_r1/grpo.py --config recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ + --num_processes=7 src/open_r1/grpo.py \ + --config recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml ``` -Our final [model](Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on Math_500, demonstrating a 17%+ improvement over the base model. +Our final [model](https://huggingface.co/Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on MATH-500, demonstrating a 17%+ improvement over the base model. + +### Launching jobs on a Slurm cluster -To launch a Slurm job, run: +If you have access to a Slurm cluster, we provide a `slurm/train.slurm` script that will automatically queue training jobs for you. 
Here's how you can use it: ```shell -sbatch --output=/path/to/logs/%x-%j.out --err=/path/to/logs/%x-%j.err slurm/grpo.slurm {model} {dataset} {accelerator} +sbatch --job-name=open_r1 --nodes=1 slurm/train.slurm {model_name} {task} {config_suffix} {accelerator} ``` -You can find more model configurations in the [recipes](./recipes). +Here `{model_name}` and `{task}` are defined as above, while `{config_suffix}` refers to the specific config and `{accelerator}` refers to the choice of 🤗 Accelerate config in `recipes/accelerate_configs`. If you wish to override the default config parameters, you can provide them by appending a space-separated string like `'--arg1=value1 --arg2=value2'`. Here's a concrete example to run SFT on 1 node of 8 GPUs: + +```shell +# Launch on Slurm and override default hyperparameters +sbatch --job-name=open_r1 --nodes=1 slurm/train.slurm Qwen2.5-1.5B-Instruct sft demo zero3 '--per_device_train_batch_size=1 --num_train_epochs=5' +``` + +You can scale the number of nodes by increasing the `--nodes` flag. + +> [!NOTE] +> The configuration in `slurm/train.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes. ## Evaluating models diff --git a/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_full.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml similarity index 88% rename from recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_full.yaml rename to recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml index a5cfcb484..f1468283d 100644 --- a/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_full.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml @@ -2,13 +2,12 @@ model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B model_revision: main torch_dtype: bfloat16 +attn_implementation: flash_attention_2 # Data training arguments dataset_name: AI-MO/NuminaMath-TIR dataset_configs: - all -# Num processes is less by 1 as vLLM is using 1 GPU -num_processes: 7 # GRPO trainer config bf16: true @@ -26,17 +25,18 @@ hub_model_id: DeepSeek-R1-Distill-Qwen-7B-GRPO hub_strategy: every_save learning_rate: 2.0e-05 log_level: info -logging_steps: 10 +logging_steps: 5 logging_strategy: steps lr_scheduler_type: cosine max_prompt_length: 512 max_completion_length: 1024 max_steps: -1 +num_generations: 2 num_train_epochs: 1 output_dir: data/DeepSeek-R1-Distill-Qwen-7B-GRPO overwrite_output_dir: true per_device_eval_batch_size: 4 -per_device_train_batch_size: 1 +per_device_train_batch_size: 2 push_to_hub: true report_to: - wandb diff --git a/recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml similarity index 87% rename from recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml rename to recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 597de18b8..167140e99 100644 --- a/recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -2,6 +2,7 @@ model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B model_revision: main torch_dtype: bfloat16 +attn_implementation: flash_attention_2 # Data training arguments dataset_name: AI-MO/NuminaMath-TIR @@ -26,17 +27,18 @@ hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO hub_strategy: every_save learning_rate: 2.0e-05 log_level: info -logging_steps: 10 +logging_steps: 5 logging_strategy: steps lr_scheduler_type: cosine max_prompt_length: 512 max_completion_length: 1024 max_steps: -1 +num_generations: 2 
num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_eval_batch_size: 4 -per_device_train_batch_size: 1 +per_device_eval_batch_size: 4 +per_device_train_batch_size: 2 push_to_hub: true report_to: - wandb diff --git a/recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml similarity index 87% rename from recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml rename to recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index 94e2225e7..6781996d0 100644 --- a/recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -2,6 +2,7 @@ model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct model_revision: main torch_dtype: bfloat16 +attn_implementation: flash_attention_2 # Data training arguments dataset_name: HuggingFaceH4/Bespoke-Stratos-17k @@ -14,7 +15,7 @@ bf16: true do_eval: true eval_strategy: steps eval_steps: 100 -gradient_accumulation_steps: 4 +gradient_accumulation_steps: 8 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false @@ -31,8 +32,9 @@ max_steps: -1 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Distill overwrite_output_dir: true +packing: true per_device_eval_batch_size: 4 -per_device_train_batch_size: 4 +per_device_train_batch_size: 2 push_to_hub: true report_to: - wandb diff --git a/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml similarity index 77% rename from recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml rename to recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml index a102c9f10..636bfecbe 100644 --- a/recipes/deepseek/DeepSeek-R1-Distill-Qwen-7B/grpo/config_base_math_smalllr.yaml +++ b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml @@ -2,6 +2,7 @@ model_name_or_path: Qwen/Qwen2.5-Math-7B model_revision: main torch_dtype: bfloat16 +attn_implementation: flash_attention_2 # Data training arguments dataset_name: DigitalLearningGmbH/MATH-lighteval @@ -18,25 +19,26 @@ vllm_gpu_memory_utilization: 0.7 do_eval: true eval_strategy: steps eval_steps: 100 -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 8 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -hub_model_id: Qwen-2.5-7B_Base_Math_smalllr +hub_model_id: Qwen-2.5-7B-Simple-RL hub_strategy: every_save learning_rate: 3.0e-06 log_level: info -logging_steps: 10 +logging_steps: 5 logging_strategy: steps lr_scheduler_type: cosine max_prompt_length: 512 max_completion_length: 1024 max_steps: -1 +num_generations: 7 num_train_epochs: 1 -output_dir: data/Qwen-2.5-7B_Base_Math_smalllr +output_dir: data/Qwen-2.5-7B-Simple-RL overwrite_output_dir: true -per_device_eval_batch_size: 1 -per_device_train_batch_size: 1 +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 push_to_hub: true report_to: - wandb diff --git a/recipes/qwen/README.md b/recipes/qwen/README.md deleted file mode 100644 index d88d6cfb7..000000000 --- a/recipes/qwen/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Instructions to train Qwen-R1 - -We build the **Qwen-R1** by doing `SFT` on [Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k) and then `GRPO` on [NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR). - -## Setup - -Follow the installation instructions in https://github.com/huggingface/open-r1/tree/main?tab=readme-ov-file## Installation. 
- -## Training - -We support training models with either DDP or DeepSpeed ZeRO-2 and ZeRO-3. To switch between methods, simply change the path to the `recipes` YAML config in `accelerate_configs`. - -> [!NOTE] -> The training commands below are configured for a node of 8 x H100s (80GB). For different hardware and topologies, you may need to tune the batch size and number of gradient accumulation steps. - -You can find the configuration files for different model sizes in this folder and specify the path to the configuration file in the commands below. - -```shell -# SFT -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/sft/config_full.yaml - -# GRPO -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/grpo.py --config recipes/qwen/Qwen2.5-1.5B-Instruct/grpo/config_full.yaml -``` diff --git a/setup.py b/setup.py index 018eb9335..06b007c8e 100644 --- a/setup.py +++ b/setup.py @@ -43,12 +43,12 @@ _deps = [ "accelerate>=1.2.1", "bitsandbytes>=0.43.0", - "ruff>=0.9.0", "datasets>=3.2.0", "deepspeed==0.15.4", "distilabel[vllm,ray,openai]>=1.5.2", "einops>=0.8.0", "flake8>=6.0.0", + "flash_attn>=2.7.4.post1", "hf_transfer>=0.1.4", "huggingface-hub[cli]>=0.19.2,<1.0", "isort>=5.12.0", @@ -60,6 +60,7 @@ "packaging>=23.0", "parameterized>=0.9.0", "pytest", + "ruff>=0.9.0", "safetensors>=0.3.3", "sentencepiece>=0.1.99", "torch==2.5.1", @@ -86,8 +87,9 @@ def deps_list(*pkgs): extras["tests"] = deps_list("pytest", "parameterized") extras["torch"] = deps_list("torch") extras["quality"] = deps_list("ruff", "isort", "flake8") +extras["train"] = deps_list("flash_attn") extras["eval"] = deps_list("lighteval", "math-verify") -extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] +extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["train"] # core dependencies shared across the whole project - keep this to a bare minimum :) install_requires = [ diff --git a/slurm/grpo.slurm b/slurm/grpo.slurm deleted file mode 100644 index 96440491b..000000000 --- a/slurm/grpo.slurm +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=open-r1-grpo -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --exclusive -#SBATCH --gres=gpu:8 -#SBATCH --partition=hopper-prod -#SBATCH --output=./logs/%x-%j.out -#SBATCH --err=./logs/%x-%j.err - -set -x -e - -source ~/.bashrc -source openr1/bin/activate -echo "START TIME: $(date)" -echo "PYTHON ENV: $(which python)" - -MODEL_PATH=$1 -DATASET_PATH=$2 -ACCELERATOR=$3 - -# Training setup -NUM_NODES=$SLURM_NNODES -GPUS_PER_NODE=8 -WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) - - -# so processes know who to talk to -MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) -MASTER_PORT=6000 - -export CMD=" \ - src/open_r1/grpo.py \ - --model_name_or_path $MODEL_PATH \ - --dataset_name $DATASET_PATH \ - --learning_rate 2.0e-5 \ - --num_train_epochs 1 \ - --max_completion_length 1024 \ - --max_prompt_length 512 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --gradient_checkpointing \ - --bf16 \ - --use_vllm \ - --vllm_device auto \ - --vllm_gpu_memory_utilization 0.7 \ - --logging_steps 5 \ - --eval_strategy steps \ - --eval_steps 100 \ - --output_dir data/Qwen2.5-1.5B-Open-R1-GRPO - " - -export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ 
- --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \ - --num_processes $(($WORLD_SIZE - 1)) \ - --gradient_accumulation_steps 4 \ - --num_machines $NUM_NODES \ - --main_process_ip $MASTER_ADDR \ - --main_process_port $MASTER_PORT \ - --machine_rank \$SLURM_PROCID \ - --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \ - --max_restarts 1 \ - --role \$(hostname -s): \ - --tee 3 \ - " - -# force crashing on nccl issues like hanging broadcast -export NCCL_ASYNC_ERROR_HANDLING=1 -# export NCCL_DEBUG=INFO -# export NCCL_DEBUG_SUBSYS=COLL -# export NCCL_SOCKET_NTHREADS=1 -# export NCCL_NSOCKS_PERTHREAD=1 -# export CUDA_LAUNCH_BLOCKING=1 - -# Specific configuration optimized for the Hugging Face Compute Cluster -# Be ye warned this may not work on other clusters! -module load cuda/12.4 - -# srun error handling: -# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks -# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code -SRUN_ARGS=" \ - --wait=60 \ - --kill-on-bad-exit=1 \ - " - -clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1 - -echo "END TIME: $(date)" diff --git a/slurm/sft.slurm b/slurm/train.slurm similarity index 61% rename from slurm/sft.slurm rename to slurm/train.slurm index 7815dcf37..2c7708521 100644 --- a/slurm/sft.slurm +++ b/slurm/train.slurm @@ -1,56 +1,62 @@ #!/bin/bash #SBATCH --job-name=open-r1-sft -#SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --exclusive #SBATCH --gres=gpu:8 -#SBATCH --partition=hopper-prod +#SBATCH --partition=hopper-prod # Adjust this for your cluster #SBATCH --output=./logs/%x-%j.out #SBATCH --err=./logs/%x-%j.err +#SBATCH --requeue + +# Specific configuration optimized for the Hugging Face Compute Cluster +# Be ye warned this may not work on other clusters! 
+module load cuda/12.4 + set -x -e source ~/.bashrc source openr1/bin/activate echo "START TIME: $(date)" -echo "PYTHON ENV: $(which python)" -MODEL_PATH=$1 -DATASET_PATH=$2 -ACCELERATOR=$3 +MODEL=$1 +TASK=$2 +CONFIG_SUFFIX=$3 +ACCELERATOR=$4 +OPTIONAL_ARGS=$5 # Training setup NUM_NODES=$SLURM_NNODES GPUS_PER_NODE=8 WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) +# Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match +CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml +GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') + +# Split the string into individual arguments +IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS" +# Loop through the arguments and find the one with "--gradient_accumulation_steps" +for arg in "${ARGS[@]}"; do + if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then + # Extract the value after the equals sign + GRAD_ACC_STEPS="${arg#*=}" + break # Exit the loop once we find the desired argument + fi +done + +echo "Gradient accumulation steps: $GRAD_ACC_STEPS" # so processes know who to talk to MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) MASTER_PORT=6000 export CMD=" \ - src/open_r1/sft.py \ - --model_name_or_path $MODEL_PATH \ - --dataset_name $DATASET_PATH \ - --use_liger_kernel true \ - --learning_rate 2.0e-5 \ - --num_train_epochs 1 \ - --packing \ - --max_seq_length 4096 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --gradient_checkpointing \ - --bf16 \ - --logging_steps 5 \ - --eval_strategy steps \ - --eval_steps 100 \ - --output_dir data/Qwen2.5-1.5B-Open-R1-Distill + src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS " export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \ - --gradient_accumulation_steps 4 \ + --gradient_accumulation_steps $GRAD_ACC_STEPS \ --num_machines $NUM_NODES \ --num_processes $WORLD_SIZE \ --main_process_ip $MASTER_ADDR \ @@ -70,10 +76,6 @@ export NCCL_ASYNC_ERROR_HANDLING=1 # export NCCL_NSOCKS_PERTHREAD=1 # export CUDA_LAUNCH_BLOCKING=1 -# Specific configuration optimized for the Hugging Face Compute Cluster -# Be ye warned this may not work on other clusters! 
-module load cuda/12.4 - # srun error handling: # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code @@ -84,4 +86,4 @@ SRUN_ARGS=" \ clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1 -echo "END TIME: $(date)" +echo "END TIME: $(date)" \ No newline at end of file diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 6d2b18008..e8587d034 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -18,15 +18,15 @@ Usage: # One 1 node of 8 x H100s -accelerate launch --config_file=configs/zero3.yaml src/open_r1/sft.py \ +accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ --model_name_or_path Qwen/Qwen2.5-1.5B-Instruct \ --dataset_name HuggingFaceH4/Bespoke-Stratos-17k \ --learning_rate 2.0e-5 \ --num_train_epochs 1 \ --packing \ --max_seq_length 4096 \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 8 \ --gradient_checkpointing \ --bf16 \ --logging_steps 5 \ From 9c768d56b199d28692063bbbd6802f3899c0d817 Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 7 Feb 2025 16:52:43 +0000 Subject: [PATCH 022/137] Remove puzzles (#233) --- src/open_r1/puzzles/README.md | 37 ------------- src/open_r1/puzzles/__init__.py | 18 ------ src/open_r1/puzzles/base_config.py | 9 --- src/open_r1/puzzles/base_task.py | 53 ------------------ .../tasks/math/algebra/linear_equations.py | 55 ------------------- 5 files changed, 172 deletions(-) delete mode 100644 src/open_r1/puzzles/README.md delete mode 100644 src/open_r1/puzzles/__init__.py delete mode 100644 src/open_r1/puzzles/base_config.py delete mode 100644 src/open_r1/puzzles/base_task.py delete mode 100644 src/open_r1/puzzles/tasks/math/algebra/linear_equations.py diff --git a/src/open_r1/puzzles/README.md b/src/open_r1/puzzles/README.md deleted file mode 100644 index 0a14bd82a..000000000 --- a/src/open_r1/puzzles/README.md +++ /dev/null @@ -1,37 +0,0 @@ -The puzzles module contains a simple and extensible system for generating and verifying reasoning tasks. -The focus is on tasks where infinite variants can be generated with automatic answer verification, like mathematics, logic puzzles or coding tasks, although -we highly encourage creativity - if you can come up with less STEM-y tasks that can still be rigorously validated, we'd love to see them! - -# Generating puzzles - -After `pip install`ing the open-r1 repo, you can very quickly get started - -```python ->>> from open_r1.puzzles import LinearEquationConfig, LinearEquationTask - ->>> task = LinearEquationTask() ->>> # Tasks are iterable, so you can iterate with "for question, answer in task:" ->>> question, answer = next(iter(task)) ->>> print(question) -'-2y - 4 = -16' - -# To score a model output, use task.validate() ->>> task.verify("y = 6", answer) -1.0 - ->>> # To control the task difficulty, you can use the task's associated config ->>> config = LinearEquationConfig() ->>> config.min_coefficient = -1000 ->>> config.max_coefficient = 1000 ->>> harder_task = LinearEquationTask(config) -``` - -## Adding new puzzles - -[add puzzle guide goes here] - -## Coming soon: - -- Proper indexing of puzzles -- More puzzle types! 
-- Lazy loading (if the module gets very big) \ No newline at end of file diff --git a/src/open_r1/puzzles/__init__.py b/src/open_r1/puzzles/__init__.py deleted file mode 100644 index 760bc1ee2..000000000 --- a/src/open_r1/puzzles/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .tasks.math.algebra.linear_equations import LinearEquationConfig, LinearEquationTask - - -__all__ = [LinearEquationConfig, LinearEquationTask] diff --git a/src/open_r1/puzzles/base_config.py b/src/open_r1/puzzles/base_config.py deleted file mode 100644 index 587af2e45..000000000 --- a/src/open_r1/puzzles/base_config.py +++ /dev/null @@ -1,9 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass - - -@dataclass -class BaseConfig: - num_tasks: int = 100 - seed: int | None = None diff --git a/src/open_r1/puzzles/base_task.py b/src/open_r1/puzzles/base_task.py deleted file mode 100644 index f910b8411..000000000 --- a/src/open_r1/puzzles/base_task.py +++ /dev/null @@ -1,53 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from random import randint - -import numpy as np - -from .base_config import BaseConfig - - -MAX_INT = np.iinfo(np.int64).max - - -class BaseTask(ABC): - config_class = None - - def __init__(self, config: BaseConfig = None): - if config is not None: - self.config = config - elif self.config_class is not None: - self.config = self.config_class() # Instantiate the default config for this task - else: - raise ValueError("No config provided and no default config_class set for this task") - - # We generate individual sample rngs using seed + idx, so we scramble the seeds to large ints first - # to avoid sample overlap between datasets using common similar, small seeds (like 0 and 42 and 123) - self.seed = self.config.seed or randint(0, MAX_INT) - seed_scrambler = np.random.default_rng(self.seed) - self.scrambled_seed = seed_scrambler.integers(low=0, high=MAX_INT, size=None) - - def __len__(self): - return self.config.num_tasks - - def __iter__(self): - for i in range(len(self)): - yield self[i] - - def get_rng(self, idx) -> np.random.Generator: - return np.random.default_rng(self.scrambled_seed + idx) - - def __getitem__(self, item) -> tuple: - rng = self.get_rng(item) - return self.generate_sample(self.config, rng) - - @abstractmethod - def generate_sample(self, config: BaseConfig, rng: np.random.Generator) -> tuple: - # This should return a tuple of (output, answer) - raise NotImplementedError - - @abstractmethod - def verify(self, output, answer) -> float: - # This should return a score between 0. and 1. 
based on how well the output matches the answer - raise NotImplementedError diff --git a/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py b/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py deleted file mode 100644 index 21f2913f7..000000000 --- a/src/open_r1/puzzles/tasks/math/algebra/linear_equations.py +++ /dev/null @@ -1,55 +0,0 @@ -import re - -import numpy as np - -from ....base_config import BaseConfig -from ....base_task import BaseTask - - -class LinearEquationConfig(BaseConfig): - min_coefficient: int = -10 - max_coefficient: int = 10 - min_var_value = -10 - max_var_value = 10 - - -class LinearEquationTask(BaseTask): - config_class = LinearEquationConfig - - def generate_sample(self, config: LinearEquationConfig, rng: np.random.Generator): - variable_names = ("x", "y", "z", "a", "b", "c") - var_name = rng.choice(variable_names) - var_coefficient = 0 - while var_coefficient == 0: - # We can't have the variable's coefficient be 0, so keep sampling until we get a non-zero one - var_coefficient = rng.integers(config.min_coefficient, config.max_coefficient, endpoint=True) - constant = rng.integers(config.min_coefficient, config.max_coefficient, endpoint=True) - while var_coefficient == 1 and constant == 0: - # We can't have the variable's coefficient be 1 and the constant be 0, as this is a trivial equation - # so keep rerolling until it isn't - constant = rng.integers(config.min_coefficient, config.max_coefficient, endpoint=True) - var_value = int(rng.integers(config.min_var_value, config.max_var_value, endpoint=True)) - rhs = var_coefficient * var_value + constant - - if constant < 0: - equation = f"{var_coefficient}{var_name} - {-constant} = {rhs}" - elif constant > 0: - equation = f"{var_coefficient}{var_name} + {constant} = {rhs}" - else: - equation = f"{var_coefficient}{var_name} = {rhs}" - - return equation, var_value - - def verify(self, output, answer): - # If there's only one number in the output, it's the answer - numbers = re.findall(r"\d+", output) - if len(numbers) == 1: - return float(int(numbers[0]) == answer) - # If not, look for a pattern like "x = 5" to disambiguate - numbers = re.findall(r"=\s+(\d+)", output) - if len(numbers) == 1: - return float(int(numbers[0].group(1)) == answer) - # Finally, maybe it gave the answer as a decimal, so check for that - numbers = re.findall(r"\d+\.\d+", output) - if len(numbers) == 1: - return float(float(numbers[0]) == answer) From 3519a7fa3db303ecc16ea6b44ae5ef8bb5815c57 Mon Sep 17 00:00:00 2001 From: lewtun Date: Fri, 7 Feb 2025 20:01:54 +0100 Subject: [PATCH 023/137] Remove duplicate math-verify (#234) --- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 167140e99..6de40e023 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -1,5 +1,5 @@ # Model arguments -model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct model_revision: main torch_dtype: bfloat16 attn_implementation: flash_attention_2 diff --git a/setup.py b/setup.py index 06b007c8e..ef742eb00 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ "huggingface-hub[cli]>=0.19.2,<1.0", "isort>=5.12.0", "latex2sympy2_extended>=1.0.6", - "math-verify>=0.5.2", "liger_kernel==0.5.2", "lighteval @ 
git+https://github.com/huggingface/lighteval.git@86f62259f105ae164f655e0b91c92a823a742724#egg=lighteval[math]", "math-verify==0.5.2", # Used for math verification in grpo From f5f0b55dc4b0fcedf073ccb76bfb50f294c12bbe Mon Sep 17 00:00:00 2001 From: Xu Song Date: Sat, 8 Feb 2025 17:28:11 +0800 Subject: [PATCH 024/137] Fix typo (#241) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ac23f0ef0..434c5dedf 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r --output_dir data/Qwen2.5-1.5B-Open-R1-Distill # Train via YAML config -accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/openr1/sft.py \ +accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ recipes/Qwen/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml ``` @@ -120,7 +120,7 @@ By default, these scripts will push each model to your Hugging Face Hub username ```shell # Change batch size, number of epochs etc -accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/openr1/sft.py \ +accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ recipes/Qwen/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml --per_device_train_batch_size=1 --num_train_epochs=5 ``` From d12886da7fa7f84b8b78b94e3a54e10a5a98bd4d Mon Sep 17 00:00:00 2001 From: JamesHujy <48405323+JamesHujy@users.noreply.github.com> Date: Sat, 8 Feb 2025 09:46:44 -0500 Subject: [PATCH 025/137] fix format reward (#238) * fix format reward * failing test * add \s* between and tag to handle multilines --------- Co-authored-by: Kashif Rasul --- src/open_r1/rewards.py | 4 ++-- tests/test_rewards.py | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index f92da19f9..f33b5641e 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -51,9 +51,9 @@ def accuracy_reward(completions, solution, **kwargs): def format_reward(completions, **kwargs): """Reward function that checks if the completion has a specific format.""" - pattern = r"^.*?.*?$" + pattern = r"^.*?\s*.*?$" completion_contents = [completion[0]["content"] for completion in completions] - matches = [re.match(pattern, content) for content in completion_contents] + matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents] return [1.0 if match else 0.0 for match in matches] diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 0ff8a106c..473a07608 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -97,6 +97,13 @@ def test_cosine_scaled_reward(self): rewards = get_cosine_scaled_reward(**test_params)(completion, [solution]) self.assertAlmostEqual(rewards[0], expected_reward, places=2) + def test_format_reward_specific_multiline(self): + """Test format_reward with a specific multiline input.""" + inputs = "\nI will count each distinct object in the image:\n1. Purple scooter\n2. Red bicycle\n3. Green motorcycle\n4. Gray sedan\n5. Yellow school bus\n6. Small green double-decker bus\n7. Small red car\n8. Small purple car\n9. 
Small gray dirt bike\n\nThere are 9 distinct objects in total.\n\n9" + completion = [[{"content": inputs}]] + rewards = format_reward(completion) + self.assertEqual(rewards[0], 1.0) + if __name__ == "__main__": unittest.main() From 90c1bfe8292ef58f4d6d9aaed5c9a1ea6e7eea91 Mon Sep 17 00:00:00 2001 From: Ty Feng <42445931+tyfeng1997@users.noreply.github.com> Date: Sun, 9 Feb 2025 15:21:35 +0800 Subject: [PATCH 026/137] Fix README: Correct recipes path and missing --config option (#247) * Fix incorrect recipes path in README * Fix missing --config option and incorrect recipes path * Fix missing --config option and incorrect recipes path --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 434c5dedf..46283c7ae 100644 --- a/README.md +++ b/README.md @@ -105,7 +105,7 @@ accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r # Train via YAML config accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ - recipes/Qwen/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml + --config recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml ``` Currently, the following tasks are supported: @@ -121,7 +121,7 @@ By default, these scripts will push each model to your Hugging Face Hub username ```shell # Change batch size, number of epochs etc accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ - recipes/Qwen/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml + --config recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml --per_device_train_batch_size=1 --num_train_epochs=5 ``` From 9be2e9a859233cbee13f9346454e78f398a0a9b8 Mon Sep 17 00:00:00 2001 From: lewtun Date: Sun, 9 Feb 2025 09:44:35 +0100 Subject: [PATCH 027/137] Add retry mechanism for pushing eval results (#252) The Hub throws 403 errors if there are too many concurrent pushes to the same repo, so we need a retry mechanism when that happens. --- slurm/evaluate.slurm | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 4883930ba..c659c0b34 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -66,7 +66,15 @@ OUTPUT_FILEPATHS=$(find $OUTPUT_DIR/results/ -type f \( -name "*.json" \)) for filepath in $OUTPUT_FILEPATHS; do echo "Uploading $filepath to Hugging Face Hub..." filename=$(basename -- "$filepath") - huggingface-cli upload --repo-type space --private $LM_EVAL_REPO_ID $filepath $OUTPUT_DIR/$filename + for attempt in {1..20}; do + if huggingface-cli upload --repo-type space --private $LM_EVAL_REPO_ID $filepath $OUTPUT_DIR/$filename; then + echo "Upload succeeded for $filepath" + break + else + echo "Upload failed for $filepath. Attempt $attempt of 20. Retrying in 5 seconds..." + sleep 5 + fi + done done echo "Uploading details to Hugging Face Hub..." @@ -78,4 +86,4 @@ python src/open_r1/utils/upload_details.py --data_files $DETAILS_FILEPATHS --hub echo "Cleaning up ..." rm -rf $OUTPUT_DIR -echo "Done!" \ No newline at end of file +echo "Done!" 
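For reference, the retry loop added to `slurm/evaluate.slurm` above maps onto a small Python helper. The sketch below is illustrative only and is not part of the patches: the `upload_with_retries` name is hypothetical, it assumes `huggingface_hub` is installed and that a failed push (e.g. a 403 from concurrent pushes) surfaces as an exception, and the 20-attempt / 5-second values simply mirror the shell loop.

```python
import time

from huggingface_hub import HfApi


def upload_with_retries(
    local_path: str,
    path_in_repo: str,
    repo_id: str,
    repo_type: str = "space",
    max_attempts: int = 20,
    delay_seconds: float = 5.0,
) -> bool:
    """Push one file to the Hub, retrying on transient errors such as 403s from concurrent pushes."""
    api = HfApi()
    for attempt in range(1, max_attempts + 1):
        try:
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
            )
            print(f"Upload succeeded for {local_path}")
            return True
        except Exception as err:  # Hub rate-limit / permission errors are raised as HTTP exceptions
            print(f"Upload failed ({err}). Attempt {attempt} of {max_attempts}. Retrying in {delay_seconds}s...")
            time.sleep(delay_seconds)
    return False
```

Like the shell version, this retries with a fixed delay; exponential backoff would be a natural refinement if the cluster schedules many evaluation jobs at once.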
From db19392befbd0ad70cbcc34e7294a08c6ecca4e3 Mon Sep 17 00:00:00 2001 From: Lewis <1657236+ctjlewis@users.noreply.github.com> Date: Sun, 9 Feb 2025 02:45:38 -0600 Subject: [PATCH 028/137] chore(README): fix link, consistent formatting for CUDA warning (#248) low priority & cosmetic --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 46283c7ae..48b767b53 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ - [SFT](#sft) - [GRPO](#grpo) 5. [Evaluating models](#evaluating-models) -6. [Reproducing Deepseek's evaluation results on MATH-500](#reproducing-deepseeks-evaluation-results-on-math-500) +6. [Reproducing Deepseek's evaluation results](#reproducing-deepseeks-evaluation-results) 7. [Data generation](#data-generation) - [Generate data from a smol distilled R1 model](#generate-data-from-a-smol-distilled-r1-model) - [Generate data from DeepSeek-R1](#generate-data-from-deepseek-r1) @@ -43,7 +43,8 @@ We will use the DeepSeek-R1 [tech report](https://github.com/deepseek-ai/DeepSee ## Installation -**Note: Libraries rely on CUDA 12.4. Double check your system if you get segmentation faults.** +> [!CAUTION] +> Libraries rely on CUDA 12.4. If you see errors related to segmentation faults, double check the version your system is running with `nvcc --version`. To run the code in this project, first, create a Python virtual environment using e.g. `uv`. To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/getting-started/installation/). From cabf27560bb13f6909243b0364366e5f62aa8277 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Mon, 10 Feb 2025 11:43:16 +0100 Subject: [PATCH 029/137] hardcodes num_processes to 7 when using vllm (#264) * hardcodes num_processes to 7 when using vllm * nits --- slurm/train.slurm | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/slurm/train.slurm b/slurm/train.slurm index 2c7708521..c10a2a237 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -32,6 +32,11 @@ WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') +USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) + +if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) + WORLD_SIZE=$(($WORLD_SIZE-1)) +fi # Split the string into individual arguments IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS" From d57f2edbd4cb8c4654dab5f67dfbf892845f02a0 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Mon, 10 Feb 2025 12:21:08 +0100 Subject: [PATCH 030/137] Adds repetition penalty reward (#263) * Adds a Repetition Penalty Reward * style * adds option to configue in grpo * style * improve desciptions --- src/open_r1/grpo.py | 21 +++++- src/open_r1/rewards.py | 48 ++++++++++++++ tests/test_rewards.py | 147 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 214 insertions(+), 2 deletions(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 55a8a849c..ab9472676 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -25,7 +25,13 @@ from transformers.trainer_utils import get_last_checkpoint from open_r1.configs import GRPOConfig -from open_r1.rewards import accuracy_reward, format_reward, get_cosine_scaled_reward, 
reasoning_steps_reward +from open_r1.rewards import ( + accuracy_reward, + format_reward, + get_cosine_scaled_reward, + get_repetition_penalty_reward, + reasoning_steps_reward, +) from open_r1.utils.callbacks import get_callbacks from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config @@ -80,6 +86,15 @@ class GRPOScriptArguments(ScriptArguments): metadata={"help": "Maximum length for scaling"}, ) + repetition_n_grams: int = field( + default=3, + metadata={"help": "Number of n-grams for repetition penalty reward"}, + ) + repetition_max_penalty: float = field( + default=-1.0, + metadata={"help": "Maximum (negative) penalty for for repetition penalty reward"}, + ) + SYSTEM_PROMPT = ( "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant " @@ -139,6 +154,10 @@ def main(script_args, training_args, model_args): max_value_correct=script_args.cosine_max_value_correct, max_len=script_args.cosine_max_len, ), + "repetition_penalty": get_repetition_penalty_reward( + ngram_size=script_args.repetition_n_grams, + max_penalty=script_args.repetition_max_penalty, + ), } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index f33b5641e..5599d1f50 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -148,3 +148,51 @@ def cosine_scaled_reward(completions, solution, **kwargs): return rewards return cosine_scaled_reward + + +def get_repetition_penalty_reward(ngram_size: int, max_penalty: float): + if max_penalty > 0: + raise ValueError(f"max_penalty {max_penalty} should not be positive") + + if max_penalty == 0: + return 0 + + def zipngram(text: str, ngram_size: int): + words = text.lower().split() + return zip(*[words[i:] for i in range(ngram_size)]) + + def repetition_penalty_reward(completions, *args, **kwargs): + """ + reward function the penalizes repetitions + ref implementation: https://github.com/eddycmu/demystify-long-cot/blob/release/openrlhf/openrlhf/reward/repetition.py + + Args: + completions: List of model completions + solution: List of ground truth solutions + + This function is parameterized by the following arguments: + ngram_size: size of the n-grams + max_penalty: Maximum (negative) penalty for wrong answers + """ + + rewards = [] + for completion in completions: + if completion == "": + rewards.append(0.0) + continue + if len(completion.split()) < ngram_size: + rewards.append(0.0) + continue + + ngrams = set() + total = 0 + for ng in zipngram(completion, ngram_size): + ngrams.add(ng) + total += 1 + + scaling = 1 - len(ngrams) / total + reward = scaling * max_penalty + rewards.append(reward) + return rewards + + return repetition_penalty_reward diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 473a07608..435f356f0 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -1,6 +1,12 @@ import unittest -from open_r1.rewards import accuracy_reward, format_reward, get_cosine_scaled_reward, reasoning_steps_reward +from open_r1.rewards import ( + accuracy_reward, + format_reward, + get_cosine_scaled_reward, + get_repetition_penalty_reward, + reasoning_steps_reward, +) class TestRewards(unittest.TestCase): @@ -105,5 +111,144 @@ def test_format_reward_specific_multiline(self): self.assertEqual(rewards[0], 1.0) +class TestRepetitionPenaltyReward(unittest.TestCase): + def test_positive_max_penalty_raises_value_error(self): + with self.assertRaises(ValueError): + 
get_repetition_penalty_reward(ngram_size=2, max_penalty=1.0) + with self.assertRaisesRegex(ValueError, "max_penalty 1.5 should not be positive"): + get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5) + + def test_zero_max_penalty_returns_zero(self): + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=0.0) + self.assertEqual(reward_fn, 0) + + def test_no_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) + completions = ["this is a test sentence"] + solution = [] # Solution is not used in the reward calculation + rewards = reward_fn(completions, solution) + self.assertEqual(rewards, [0.0]) + + def test_full_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) + completions = ["this this this this this"] + solution = [] + rewards = reward_fn(completions, solution) + # (1 - 1/4) * -1 = -0.75 + self.assertEqual(rewards, [-0.75]) + + def test_partial_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) + completions = [ + "this is a this is a test" + ] # 2-grams: (this, is), (is, a), (a, this), (this, is), (is, a), (a, test) + solution = [] + rewards = reward_fn(completions, solution) + # Unique 2-grams: (this, is), (is, a), (a, this), (a, test). 4 unique out of 6 total + # (1 - 4/6) * -1 = -1/3 = -0.3333... + self.assertAlmostEqual(rewards[0], -1 / 3) + + def test_multiple_completions(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) + completions = ["this is a test", "test test test test"] + solution = [] + rewards = reward_fn(completions, solution) + # Completion 1: (this, is, a), (is, a, test) -> 2 unique / 2 total -> (1 - 2/2) * -0.5 = 0 + # Completion 2: (test, test, test) -> 1 unique / 2 total -> (1 - 1/2) * -0.5 = -0.25 + self.assertAlmostEqual(rewards[0], 0.0) + self.assertAlmostEqual(rewards[1], -0.25) + + def test_empty_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) + completions = [""] + solution = [] + rewards = reward_fn(completions, solution) + self.assertEqual(rewards, [0.0]) + + def test_different_ngram_size(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-2.0) + completions = [ + "this is a this is a test" + ] # 3-grams:(this, is, a) (is, a, this) (a, this, is) (this, is, a) (is, a, test) + solution = [] + rewards = reward_fn(completions, solution) + # Unique 3-grams: (this, is, a), (is, a, this), (a, this, is), (is, a, test) = 4. 
Total 3-grams: 5 + # (1 - 4/5) * -2 = -0.4 + self.assertAlmostEqual(rewards[0], -0.4) + + def test_mixed_case(self): + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) + completions = ["This is A Test", "this IS a test"] + solution = [] + rewards = reward_fn(completions, solution) + # both completions should produce the same reward, because the text gets lowercased + self.assertAlmostEqual(rewards[0], rewards[1]) + + def test_one_word_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["word"] + solutions = [] + rewards = reward_fn(completions, solutions) + self.assertEqual(rewards, [0.0]) + + def test_two_word_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["two words"] + solutions = [] + rewards = reward_fn(completions, solutions) + self.assertEqual(rewards, [0.0]) + + def test_three_word_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["three different words"] + solutions = [] + rewards = reward_fn(completions, solutions) + self.assertEqual(rewards, [0.0]) + + def test_three_word_repetition_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["word word word word"] + solutions = [] + rewards = reward_fn(completions, solutions) + self.assertEqual(rewards, [-0.5]) + + def test_four_word_completion_with_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["one two one two"] + solutions = [] + rewards = reward_fn(completions, solutions) + # ngrams are (one two one) (two one two). unique is 2 and count is 2, therefore (1-1) * -1. + self.assertEqual(rewards, [0.0]) + + def test_five_word_completion_with_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) + completions = ["A B C A B"] + solutions = [] + rewards = reward_fn(completions, solutions) + # (A B C) (B C A) (C A B). unique is 3. count is 3 (1-1) * -.5 = 0 + self.assertEqual(rewards, [0.0]) + + def test_six_word_completion_with_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["A B C A B C"] + solutions = [] + rewards = reward_fn(completions, solutions) + self.assertEqual(rewards, [-0.25]) + + def test_long_completion_with_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["A B C A B C E F G A B C A B C"] + solutions = [] + rewards = reward_fn(completions, solutions) + self.assertAlmostEqual(rewards[0], -0.3846, places=4) + + def test_long_completion_without_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["A B C D E F G H I J K L"] + solutions = [] + rewards = reward_fn(completions, solutions) + self.assertEqual(rewards, [0.0]) + + if __name__ == "__main__": unittest.main() From 486f7d48f53114b12121cbbb2a689165909deabd Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Mon, 10 Feb 2025 12:28:55 +0100 Subject: [PATCH 031/137] Revert "Adds repetition penalty reward (#263)" (#267) This reverts commit d57f2edbd4cb8c4654dab5f67dfbf892845f02a0. 
--- src/open_r1/grpo.py | 21 +----- src/open_r1/rewards.py | 48 -------------- tests/test_rewards.py | 147 +---------------------------------------- 3 files changed, 2 insertions(+), 214 deletions(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index ab9472676..55a8a849c 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -25,13 +25,7 @@ from transformers.trainer_utils import get_last_checkpoint from open_r1.configs import GRPOConfig -from open_r1.rewards import ( - accuracy_reward, - format_reward, - get_cosine_scaled_reward, - get_repetition_penalty_reward, - reasoning_steps_reward, -) +from open_r1.rewards import accuracy_reward, format_reward, get_cosine_scaled_reward, reasoning_steps_reward from open_r1.utils.callbacks import get_callbacks from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config @@ -86,15 +80,6 @@ class GRPOScriptArguments(ScriptArguments): metadata={"help": "Maximum length for scaling"}, ) - repetition_n_grams: int = field( - default=3, - metadata={"help": "Number of n-grams for repetition penalty reward"}, - ) - repetition_max_penalty: float = field( - default=-1.0, - metadata={"help": "Maximum (negative) penalty for for repetition penalty reward"}, - ) - SYSTEM_PROMPT = ( "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant " @@ -154,10 +139,6 @@ def main(script_args, training_args, model_args): max_value_correct=script_args.cosine_max_value_correct, max_len=script_args.cosine_max_len, ), - "repetition_penalty": get_repetition_penalty_reward( - ngram_size=script_args.repetition_n_grams, - max_penalty=script_args.repetition_max_penalty, - ), } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 5599d1f50..f33b5641e 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -148,51 +148,3 @@ def cosine_scaled_reward(completions, solution, **kwargs): return rewards return cosine_scaled_reward - - -def get_repetition_penalty_reward(ngram_size: int, max_penalty: float): - if max_penalty > 0: - raise ValueError(f"max_penalty {max_penalty} should not be positive") - - if max_penalty == 0: - return 0 - - def zipngram(text: str, ngram_size: int): - words = text.lower().split() - return zip(*[words[i:] for i in range(ngram_size)]) - - def repetition_penalty_reward(completions, *args, **kwargs): - """ - reward function the penalizes repetitions - ref implementation: https://github.com/eddycmu/demystify-long-cot/blob/release/openrlhf/openrlhf/reward/repetition.py - - Args: - completions: List of model completions - solution: List of ground truth solutions - - This function is parameterized by the following arguments: - ngram_size: size of the n-grams - max_penalty: Maximum (negative) penalty for wrong answers - """ - - rewards = [] - for completion in completions: - if completion == "": - rewards.append(0.0) - continue - if len(completion.split()) < ngram_size: - rewards.append(0.0) - continue - - ngrams = set() - total = 0 - for ng in zipngram(completion, ngram_size): - ngrams.add(ng) - total += 1 - - scaling = 1 - len(ngrams) / total - reward = scaling * max_penalty - rewards.append(reward) - return rewards - - return repetition_penalty_reward diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 435f356f0..473a07608 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -1,12 +1,6 @@ import unittest -from open_r1.rewards import ( - 
accuracy_reward, - format_reward, - get_cosine_scaled_reward, - get_repetition_penalty_reward, - reasoning_steps_reward, -) +from open_r1.rewards import accuracy_reward, format_reward, get_cosine_scaled_reward, reasoning_steps_reward class TestRewards(unittest.TestCase): @@ -111,144 +105,5 @@ def test_format_reward_specific_multiline(self): self.assertEqual(rewards[0], 1.0) -class TestRepetitionPenaltyReward(unittest.TestCase): - def test_positive_max_penalty_raises_value_error(self): - with self.assertRaises(ValueError): - get_repetition_penalty_reward(ngram_size=2, max_penalty=1.0) - with self.assertRaisesRegex(ValueError, "max_penalty 1.5 should not be positive"): - get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5) - - def test_zero_max_penalty_returns_zero(self): - reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=0.0) - self.assertEqual(reward_fn, 0) - - def test_no_repetition(self): - reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = ["this is a test sentence"] - solution = [] # Solution is not used in the reward calculation - rewards = reward_fn(completions, solution) - self.assertEqual(rewards, [0.0]) - - def test_full_repetition(self): - reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = ["this this this this this"] - solution = [] - rewards = reward_fn(completions, solution) - # (1 - 1/4) * -1 = -0.75 - self.assertEqual(rewards, [-0.75]) - - def test_partial_repetition(self): - reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = [ - "this is a this is a test" - ] # 2-grams: (this, is), (is, a), (a, this), (this, is), (is, a), (a, test) - solution = [] - rewards = reward_fn(completions, solution) - # Unique 2-grams: (this, is), (is, a), (a, this), (a, test). 4 unique out of 6 total - # (1 - 4/6) * -1 = -1/3 = -0.3333... - self.assertAlmostEqual(rewards[0], -1 / 3) - - def test_multiple_completions(self): - reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) - completions = ["this is a test", "test test test test"] - solution = [] - rewards = reward_fn(completions, solution) - # Completion 1: (this, is, a), (is, a, test) -> 2 unique / 2 total -> (1 - 2/2) * -0.5 = 0 - # Completion 2: (test, test, test) -> 1 unique / 2 total -> (1 - 1/2) * -0.5 = -0.25 - self.assertAlmostEqual(rewards[0], 0.0) - self.assertAlmostEqual(rewards[1], -0.25) - - def test_empty_completion(self): - reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = [""] - solution = [] - rewards = reward_fn(completions, solution) - self.assertEqual(rewards, [0.0]) - - def test_different_ngram_size(self): - reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-2.0) - completions = [ - "this is a this is a test" - ] # 3-grams:(this, is, a) (is, a, this) (a, this, is) (this, is, a) (is, a, test) - solution = [] - rewards = reward_fn(completions, solution) - # Unique 3-grams: (this, is, a), (is, a, this), (a, this, is), (is, a, test) = 4. 
Total 3-grams: 5 - # (1 - 4/5) * -2 = -0.4 - self.assertAlmostEqual(rewards[0], -0.4) - - def test_mixed_case(self): - reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = ["This is A Test", "this IS a test"] - solution = [] - rewards = reward_fn(completions, solution) - # both completions should produce the same reward, because the text gets lowercased - self.assertAlmostEqual(rewards[0], rewards[1]) - - def test_one_word_completion(self): - reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["word"] - solutions = [] - rewards = reward_fn(completions, solutions) - self.assertEqual(rewards, [0.0]) - - def test_two_word_completion(self): - reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["two words"] - solutions = [] - rewards = reward_fn(completions, solutions) - self.assertEqual(rewards, [0.0]) - - def test_three_word_completion(self): - reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["three different words"] - solutions = [] - rewards = reward_fn(completions, solutions) - self.assertEqual(rewards, [0.0]) - - def test_three_word_repetition_completion(self): - reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["word word word word"] - solutions = [] - rewards = reward_fn(completions, solutions) - self.assertEqual(rewards, [-0.5]) - - def test_four_word_completion_with_repetition(self): - reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["one two one two"] - solutions = [] - rewards = reward_fn(completions, solutions) - # ngrams are (one two one) (two one two). unique is 2 and count is 2, therefore (1-1) * -1. - self.assertEqual(rewards, [0.0]) - - def test_five_word_completion_with_repetition(self): - reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) - completions = ["A B C A B"] - solutions = [] - rewards = reward_fn(completions, solutions) - # (A B C) (B C A) (C A B). unique is 3. 
count is 3 (1-1) * -.5 = 0
-        self.assertEqual(rewards, [0.0])
-
-    def test_six_word_completion_with_repetition(self):
-        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
-        completions = ["A B C A B C"]
-        solutions = []
-        rewards = reward_fn(completions, solutions)
-        self.assertEqual(rewards, [-0.25])
-
-    def test_long_completion_with_repetition(self):
-        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
-        completions = ["A B C A B C E F G A B C A B C"]
-        solutions = []
-        rewards = reward_fn(completions, solutions)
-        self.assertAlmostEqual(rewards[0], -0.3846, places=4)
-
-    def test_long_completion_without_repetition(self):
-        reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0)
-        completions = ["A B C D E F G H I J K L"]
-        solutions = []
-        rewards = reward_fn(completions, solutions)
-        self.assertEqual(rewards, [0.0])
-
-
 if __name__ == "__main__":
     unittest.main()

From 88c51fe05d682543f9685e3dc821494bf253d231 Mon Sep 17 00:00:00 2001
From: Edward Beeching
Date: Mon, 10 Feb 2025 12:35:21 +0100
Subject: [PATCH 032/137] Repetition penalty hotfix (#266)

* Adds a Repetition Penalty Reward

* style

* adds option to configure in grpo

* style

* improve descriptions

* fix final changes

* fix docstring

* style
---
 src/open_r1/grpo.py    |  21 ++++++++++++++++++++-
 src/open_r1/rewards.py |  51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/test_rewards.py  | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 201 insertions(+), 2 deletions(-)

diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py
index 55a8a849c..ab9472676 100644
--- a/src/open_r1/grpo.py
+++ b/src/open_r1/grpo.py
@@ -25,7 +25,13 @@
 from transformers.trainer_utils import get_last_checkpoint
 
 from open_r1.configs import GRPOConfig
-from open_r1.rewards import accuracy_reward, format_reward, get_cosine_scaled_reward, reasoning_steps_reward
+from open_r1.rewards import (
+    accuracy_reward,
+    format_reward,
+    get_cosine_scaled_reward,
+    get_repetition_penalty_reward,
+    reasoning_steps_reward,
+)
 from open_r1.utils.callbacks import get_callbacks
 from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
 
@@ -80,6 +86,15 @@ class GRPOScriptArguments(ScriptArguments):
         metadata={"help": "Maximum length for scaling"},
     )
 
+    repetition_n_grams: int = field(
+        default=3,
+        metadata={"help": "Number of n-grams for repetition penalty reward"},
+    )
+    repetition_max_penalty: float = field(
+        default=-1.0,
+        metadata={"help": "Maximum (negative) penalty for repetition penalty reward"},
+    )
+
 
 SYSTEM_PROMPT = (
     "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
@@ -139,6 +154,10 @@ def main(script_args, training_args, model_args):
             max_value_correct=script_args.cosine_max_value_correct,
             max_len=script_args.cosine_max_len,
         ),
+        "repetition_penalty": get_repetition_penalty_reward(
+            ngram_size=script_args.repetition_n_grams,
+            max_penalty=script_args.repetition_max_penalty,
+        ),
     }
     reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs]
 
diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py
index f33b5641e..8fc15dbac 100644
--- a/src/open_r1/rewards.py
+++ b/src/open_r1/rewards.py
@@ -148,3 +148,54 @@ def cosine_scaled_reward(completions, solution, **kwargs):
         return rewards
 
     return cosine_scaled_reward
+
+
+def get_repetition_penalty_reward(ngram_size: int, max_penalty: float):
+    """
+    Computes N-gram repetition penalty as described in Appendix C.2 of https://arxiv.org/abs/2502.03373.
+    Reference implementation from: https://github.com/eddycmu/demystify-long-cot/blob/release/openrlhf/openrlhf/reward/repetition.py
+
+    Args:
+        ngram_size: size of the n-grams
+        max_penalty: Maximum (negative) penalty for wrong answers
+    """
+    if max_penalty > 0:
+        raise ValueError(f"max_penalty {max_penalty} should not be positive")
+
+    if max_penalty == 0:
+        return 0
+
+    def zipngram(text: str, ngram_size: int):
+        words = text.lower().split()
+        return zip(*[words[i:] for i in range(ngram_size)])
+
+    def repetition_penalty_reward(completions, **kwargs) -> float:
+        """
+        reward function that penalizes repetitions
+        ref implementation: https://github.com/eddycmu/demystify-long-cot/blob/release/openrlhf/openrlhf/reward/repetition.py
+
+        Args:
+            completions: List of model completions
+        """
+
+        rewards = []
+        for completion in completions:
+            if completion == "":
+                rewards.append(0.0)
+                continue
+            if len(completion.split()) < ngram_size:
+                rewards.append(0.0)
+                continue
+
+            ngrams = set()
+            total = 0
+            for ng in zipngram(completion, ngram_size):
+                ngrams.add(ng)
+                total += 1
+
+            scaling = 1 - len(ngrams) / total
+            reward = scaling * max_penalty
+            rewards.append(reward)
+        return rewards
+
+    return repetition_penalty_reward
diff --git a/tests/test_rewards.py b/tests/test_rewards.py
index 473a07608..30e937e1f 100644
--- a/tests/test_rewards.py
+++ b/tests/test_rewards.py
@@ -1,6 +1,12 @@
 import unittest
 
-from open_r1.rewards import accuracy_reward, format_reward, get_cosine_scaled_reward, reasoning_steps_reward
+from open_r1.rewards import (
+    accuracy_reward,
+    format_reward,
+    get_cosine_scaled_reward,
+    get_repetition_penalty_reward,
+    reasoning_steps_reward,
+)
 
 
 class TestRewards(unittest.TestCase):
@@ -105,5 +111,128 @@ def test_format_reward_specific_multiline(self):
         self.assertEqual(rewards[0], 1.0)
 
 
+class TestRepetitionPenaltyReward(unittest.TestCase):
+    def test_positive_max_penalty_raises_value_error(self):
+        with self.assertRaises(ValueError):
+            get_repetition_penalty_reward(ngram_size=2, max_penalty=1.0)
+        with self.assertRaisesRegex(ValueError, "max_penalty 1.5 should not be positive"):
+            get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5)
+
+    def test_zero_max_penalty_returns_zero(self):
+        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=0.0)
+        self.assertEqual(reward_fn, 0)
+
+    def test_no_repetition(self):
+        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
+        completions = ["this is a test sentence"]
+        rewards = reward_fn(completions)
+        self.assertEqual(rewards, [0.0])
+
+    def test_full_repetition(self):
+        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
+        completions = ["this this this this this"]
+        rewards = reward_fn(completions)
+        # (1 - 1/4) * -1 = -0.75
+        self.assertEqual(rewards, [-0.75])
+
+    def test_partial_repetition(self):
+        reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0)
+        completions = [
+            "this is a this is a test"
+        ]  # 2-grams: (this, is), (is, a), (a, this), (this, is), (is, a), (a, test)
+        rewards = reward_fn(completions)
+        # Unique 2-grams: (this, is), (is, a), (a, this), (a, test). 4 unique out of 6 total
+        # (1 - 4/6) * -1 = -1/3 = -0.3333...
+ self.assertAlmostEqual(rewards[0], -1 / 3) + + def test_multiple_completions(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) + completions = ["this is a test", "test test test test"] + rewards = reward_fn(completions) + # Completion 1: (this, is, a), (is, a, test) -> 2 unique / 2 total -> (1 - 2/2) * -0.5 = 0 + # Completion 2: (test, test, test) -> 1 unique / 2 total -> (1 - 1/2) * -0.5 = -0.25 + self.assertAlmostEqual(rewards[0], 0.0) + self.assertAlmostEqual(rewards[1], -0.25) + + def test_empty_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) + completions = [""] + rewards = reward_fn(completions) + self.assertEqual(rewards, [0.0]) + + def test_different_ngram_size(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-2.0) + completions = [ + "this is a this is a test" + ] # 3-grams:(this, is, a) (is, a, this) (a, this, is) (this, is, a) (is, a, test) + rewards = reward_fn(completions) + # Unique 3-grams: (this, is, a), (is, a, this), (a, this, is), (is, a, test) = 4. Total 3-grams: 5 + # (1 - 4/5) * -2 = -0.4 + self.assertAlmostEqual(rewards[0], -0.4) + + def test_mixed_case(self): + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) + completions = ["This is A Test", "this IS a test"] + rewards = reward_fn(completions) + # both completions should produce the same reward, because the text gets lowercased + self.assertAlmostEqual(rewards[0], rewards[1]) + + def test_one_word_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["word"] + rewards = reward_fn(completions) + self.assertEqual(rewards, [0.0]) + + def test_two_word_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["two words"] + rewards = reward_fn(completions) + self.assertEqual(rewards, [0.0]) + + def test_three_word_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["three different words"] + rewards = reward_fn(completions) + self.assertEqual(rewards, [0.0]) + + def test_three_word_repetition_completion(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["word word word word"] + rewards = reward_fn(completions) + self.assertEqual(rewards, [-0.5]) + + def test_four_word_completion_with_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["one two one two"] + rewards = reward_fn(completions) + # ngrams are (one two one) (two one two). unique is 2 and count is 2, therefore (1-1) * -1. + self.assertEqual(rewards, [0.0]) + + def test_five_word_completion_with_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) + completions = ["A B C A B"] + rewards = reward_fn(completions) + # (A B C) (B C A) (C A B). unique is 3. 
count is 3 (1-1) * -.5 = 0 + self.assertEqual(rewards, [0.0]) + + def test_six_word_completion_with_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["A B C A B C"] + rewards = reward_fn(completions) + self.assertEqual(rewards, [-0.25]) + + def test_long_completion_with_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["A B C A B C E F G A B C A B C"] + rewards = reward_fn(completions) + self.assertAlmostEqual(rewards[0], -0.3846, places=4) + + def test_long_completion_without_repetition(self): + reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) + completions = ["A B C D E F G H I J K L"] + rewards = reward_fn(completions) + self.assertEqual(rewards, [0.0]) + + if __name__ == "__main__": unittest.main() From 6c4c9e945bc2d274b0d6b3292271b1394567884c Mon Sep 17 00:00:00 2001 From: lewtun Date: Mon, 10 Feb 2025 13:23:26 +0100 Subject: [PATCH 033/137] Add aime25 (#265) * Add AIME25 * Fix col --- src/open_r1/evaluate.py | 15 +++++++++++++++ src/open_r1/utils/evaluation.py | 1 + 2 files changed, 16 insertions(+) diff --git a/src/open_r1/evaluate.py b/src/open_r1/evaluate.py index 0447b266e..b32389ac2 100644 --- a/src/open_r1/evaluate.py +++ b/src/open_r1/evaluate.py @@ -106,6 +106,20 @@ def gpqa_prompt_fn(line, task_name: str = None): metric=[expr_gold_metric], version=1, ) +aime25 = LightevalTaskConfig( + name="aime25", + suite=["custom"], + prompt_function=aime_prompt_fn, + hf_repo="open-r1/aime_2025_1", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=32768, + metric=[expr_gold_metric], + version=1, +) math_500 = LightevalTaskConfig( name="math_500", suite=["custom"], @@ -141,6 +155,7 @@ def gpqa_prompt_fn(line, task_name: str = None): # Add tasks to the table TASKS_TABLE = [] TASKS_TABLE.append(aime24) +TASKS_TABLE.append(aime25) TASKS_TABLE.append(math_500) TASKS_TABLE.append(gpqa_diamond) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 86de906d9..489971830 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -48,6 +48,7 @@ def register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "custom", "math_500", "math_500", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime25", "aime25", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "gpqa", "gpqa:diamond", 0) From baec330ef58550e8892ee37449e24bcc01bbe953 Mon Sep 17 00:00:00 2001 From: Anton Lozhkov Date: Mon, 10 Feb 2025 14:37:58 +0100 Subject: [PATCH 034/137] Add SGLang inference scripts (#268) * sglang inference server * add vllm * readme --- slurm/README.md | 17 ++++ slurm/experimental/serve_r1_vllm.slurm | 135 +++++++++++++++++++++++++ slurm/serve_r1.slurm | 112 ++++++++++++++++++++ slurm/serve_router.slurm | 46 +++++++++ 4 files changed, 310 insertions(+) create mode 100644 slurm/README.md create mode 100644 slurm/experimental/serve_r1_vllm.slurm create mode 100644 slurm/serve_r1.slurm create mode 100644 slurm/serve_router.slurm diff --git a/slurm/README.md b/slurm/README.md new file mode 100644 index 000000000..f81c583c4 --- /dev/null +++ b/slurm/README.md @@ -0,0 +1,17 @@ +## Serving DeepSeek-R1 on 2x8 H100 SLURM nodes with SGLang + +1. 
Set up the environment (adjust for your CUDA version):
+```bash
+conda create -n sglang124 python=3.11
+conda activate sglang124
+
+pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124
+
+pip install sgl-kernel --force-reinstall --no-deps
+pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/
+```
+
+2. Run the server:
+```bash
+sbatch serve_r1.slurm -m "/fsx/deepseek-r1-checkpoint" -e "sglang124"
+```
\ No newline at end of file
diff --git a/slurm/experimental/serve_r1_vllm.slurm b/slurm/experimental/serve_r1_vllm.slurm
new file mode 100644
index 000000000..7e0b0d597
--- /dev/null
+++ b/slurm/experimental/serve_r1_vllm.slurm
@@ -0,0 +1,135 @@
+#!/bin/bash
+#SBATCH --job-name=r1-vllm
+#SBATCH --partition=hopper-prod
+#SBATCH --qos=normal
+#SBATCH --nodes=4
+#SBATCH --gpus-per-node=8
+#SBATCH --exclusive
+#SBATCH --output=./logs/%x_%j_%n.out
+#SBATCH --error=./logs/%x_%j_%n.err
+#SBATCH --time=7-00:00:00
+#SBATCH --ntasks-per-node=1
+#SBATCH --requeue
+
+set -exuo pipefail
+
+MODEL_PATH="deepseek-ai/DeepSeek-R1"
+CONDA_ENV="vllm7"
+SERVER_PORT=8000
+RAY_PORT=6379
+RAY_DASHBOARD_PORT=8265
+
+trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1
+
+while getopts "m:e:h" opt; do
+    case $opt in
+        m) MODEL_PATH="$OPTARG" ;;
+        e) CONDA_ENV="$OPTARG" ;;
+        h|?) echo "Usage: sbatch $0 [-m MODEL_PATH] [-e CONDA_ENV]"; exit 1 ;;
+    esac
+done
+
+# Environment setup
+module load cuda/12.1
+source ~/.bashrc
+source "$CONDA_PREFIX/etc/profile.d/conda.sh"
+conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; }
+
+# Get nodes information
+NODES=($(scontrol show hostnames "$SLURM_JOB_NODELIST"))
+HEAD_NODE="${NODES[0]}"
+HEAD_NODE_IP=$(srun --nodes=1 --ntasks=1 -w "$HEAD_NODE" hostname --ip-address)
+
+echo "SLURM_JOB_ID: $SLURM_JOB_ID"
+echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
+echo "Head node: $HEAD_NODE ($HEAD_NODE_IP)"
+
+# Start Ray head node
+echo "Starting Ray head node at $HEAD_NODE"
+srun --nodes=1 --ntasks=1 -w "$HEAD_NODE" \
+    ray start --head \
+    --node-ip-address="$HEAD_NODE_IP" \
+    --port=$RAY_PORT \
+    --dashboard-host=0.0.0.0 \
+    --dashboard-port=$RAY_DASHBOARD_PORT \
+    --block &
+
+sleep 10
+
+# Start Ray worker nodes
+WORKER_COUNT=$((SLURM_JOB_NUM_NODES - 1))
+for ((i = 1; i <= WORKER_COUNT; i++)); do
+    WORKER_NODE="${NODES[$i]}"
+    echo "Starting Ray worker $i at $WORKER_NODE"
+    srun --nodes=1 --ntasks=1 -w "$WORKER_NODE" \
+        ray start --address "$HEAD_NODE_IP:$RAY_PORT" \
+        --block &
+    sleep 5
+done
+
+echo "Waiting for Ray cluster to initialize..."
+sleep 60
+
+# Start vLLM server
+echo "Starting vLLM server..."
+RAY_ADDRESS="http://$HEAD_NODE_IP:$RAY_DASHBOARD_PORT" ray job submit \
+    --working-dir src/open_r1 \
+    --no-wait \
+    --job-id vllm-server \
+    -- vllm serve "$MODEL_PATH" \
+    --tensor-parallel-size 8 \
+    --pipeline-parallel-size 4 \
+    --gpu-memory-utilization 0.90 \
+    --max-model-len 32768 \
+    --max-num-batched-tokens 262144 \
+    --max-num-seqs 128 \
+    --max-seq-len-to-capture 32768 \
+    --enable-chunked-prefill true \
+    --preemption-mode recompute \
+    --swap-space 128 \
+    --trust-remote-code \
+    --distributed-executor-backend ray
+
+# Wait for server with timeout
+TIMEOUT=3600 # 1h
+START_TIME=$(date +%s)
+echo "Waiting for vLLM server (http://$HEAD_NODE_IP:$SERVER_PORT)..."
+ +while true; do + if curl -s -o /dev/null -w "%{http_code}" "http://$HEAD_NODE_IP:$SERVER_PORT/health" >/dev/null 2>&1; then + echo "Server is ready at http://$HEAD_NODE_IP:$SERVER_PORT" + break + fi + + CURRENT_TIME=$(date +%s) + if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then + echo "Error: Server failed to start within $TIMEOUT seconds" + exit 1 + fi + + echo "Still waiting... ($(($CURRENT_TIME - $START_TIME)) seconds elapsed)" + sleep 60 +done + +echo "Checking available models..." +curl "http://$HEAD_NODE_IP:$SERVER_PORT/v1/models" +sleep 10 + +echo "Executing sanity check..." +curl "http://$HEAD_NODE_IP:$SERVER_PORT/v1/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"default\", + \"prompt\": \"<|begin▁of▁sentence|><|User|>hi, how are you?<|Assistant|>\", + \"max_tokens\": 2048, + \"temperature\": 0.6 + }" + +# Keep the job running with health checks +while true; do + if ! curl -s -o /dev/null "http://$HEAD_NODE_IP:$SERVER_PORT/health"; then + echo "Error: Server health check failed" + exit 1 + fi + sleep 300 +done \ No newline at end of file diff --git a/slurm/serve_r1.slurm b/slurm/serve_r1.slurm new file mode 100644 index 000000000..60a72aadf --- /dev/null +++ b/slurm/serve_r1.slurm @@ -0,0 +1,112 @@ +#!/bin/bash +#SBATCH --job-name=r1-server +#SBATCH --partition=hopper-prod +#SBATCH --qos=normal +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=8 +#SBATCH --exclusive +#SBATCH --output=./logs/%x_%j_%n.out +#SBATCH --error=./logs/%x_%j_%n.err +#SBATCH --time=7-00:00:00 +#SBATCH --ntasks-per-node=1 +#SBATCH --requeue + +set -exuo pipefail + +MODEL_PATH="deepseek-ai/DeepSeek-R1" +CONDA_ENV="sglang124" +ROUTER_ADDRESS="" +SERVER_PORT=39877 +DIST_PORT=45000 + +trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 + +# TODO: Adjust these variables to your cluster configuration +export OUTLINES_CACHE_DIR=/scratch/serve_r1/ocache/ +export TRITON_HOME=/scratch/serve_r1/triton/ +export GLOO_SOCKET_IFNAME="enp71s0" +export NCCL_SOCKET_IFNAME="enp71s0" + +while getopts "m:e:r:h" opt; do + case $opt in + m) MODEL_PATH="$OPTARG" ;; + e) CONDA_ENV="$OPTARG" ;; + r) ROUTER_ADDRESS="$OPTARG" ;; + h|?) echo "Usage: sbatch $0 [-m MODEL_PATH] [-e CONDA_ENV] [-r ROUTER_ADDRESS]"; exit 1 ;; + esac +done + +# TODO: Environment setup, adjust to your cluster configuration +module load cuda/12.4 +source ~/.bashrc +source "$CONDA_PREFIX/etc/profile.d/conda.sh" +conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; } + +FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1) +FIRST_NODE_IP=$(srun --nodes=1 --ntasks=1 -w "$FIRST_NODE" hostname --ip-address) + +# Launch servers synchronously across all nodes +# (--max-running-requests=56 is rough estimate to avoid too many evicted/preempted 16k-long requests) +srun --nodes=2 --ntasks=2 --ntasks-per-node=1 \ + bash -c "python -m sglang.launch_server \ + --model-path '$MODEL_PATH' \ + --tp 16 \ + --dist-init-addr '$FIRST_NODE_IP:$DIST_PORT' \ + --nnodes 2 \ + --node-rank \$SLURM_PROCID \ + --port '$SERVER_PORT' \ + --host 0.0.0.0 \ + --trust-remote-code \ + --max-running-requests 56 \ + --context-length 32768" & + +# Wait for server with timeout +TIMEOUT=3600 # 1h, but model loading should take ~30min +START_TIME=$(date +%s) +echo "Waiting for SGLang server (http://$FIRST_NODE_IP:$SERVER_PORT)..." 
+ +while true; do + if curl -s -o /dev/null -w "%{http_code}" "http://$FIRST_NODE_IP:$SERVER_PORT/health" >/dev/null 2>&1; then + echo "Server is ready at http://$FIRST_NODE_IP:$SERVER_PORT" + break + fi + + CURRENT_TIME=$(date +%s) + if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then + echo "Error: Server failed to start within $TIMEOUT seconds" + exit 1 + fi + + echo "Still waiting... ($(($CURRENT_TIME - $START_TIME)) seconds elapsed)" + sleep 60 +done + +# Register with router only if address was provided +if [ -n "$ROUTER_ADDRESS" ]; then + echo "Registering with router at $ROUTER_ADDRESS..." + curl -X POST "http://$ROUTER_ADDRESS/add_worker?url=http://$FIRST_NODE_IP:$SERVER_PORT" || true + sleep 10 +fi + +echo "Checking available models..." +curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/models" +sleep 10 + +echo "Executing sanity check..." +curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"default\", + \"prompt\": \"<|begin▁of▁sentence|><|User|>hi, how are you?<|Assistant|>\", + \"max_tokens\": 2048, + \"temperature\": 0.6 + }" + +# Keep the job running with health checks +while true; do + if ! curl -s -o /dev/null "http://$FIRST_NODE_IP:$SERVER_PORT/health"; then + echo "Error: Server health check failed" + exit 1 + fi + sleep 300 +done \ No newline at end of file diff --git a/slurm/serve_router.slurm b/slurm/serve_router.slurm new file mode 100644 index 000000000..b39ca66aa --- /dev/null +++ b/slurm/serve_router.slurm @@ -0,0 +1,46 @@ +#!/bin/bash +#SBATCH --job-name=r1-router +#SBATCH --partition=hopper-cpu +#SBATCH --qos=high +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem-per-cpu=1875m +#SBATCH --output=./logs/%x_%j_%n.out +#SBATCH --error=./logs/%x_%j_%n.err +#SBATCH --time=30-00:00:00 +#SBATCH --requeue + +set -exuo pipefail + +# TODO: Adjust these variables to your cluster configuration +CONDA_ENV="sglang124" +ROUTER_PORT=39876 + +trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 + +while getopts "e:h" opt; do + case $opt in + e) CONDA_ENV="$OPTARG" ;; + h|?) echo "Usage: sbatch $0 [-e CONDA_ENV]"; exit 1 ;; + esac +done + +# TODO: Environment setup, adjust to your cluster configuration +source ~/.bashrc +source "$CONDA_PREFIX/etc/profile.d/conda.sh" +conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; exit 1; } + +python -m sglang_router.launch_router \ + --port "$ROUTER_PORT" \ + --host 0.0.0.0 \ + --policy "round_robin" \ + --worker-startup-timeout-secs 300 + +# Keep the job running with health checks +while true; do + if ! 
curl -s -o /dev/null "http://localhost:$ROUTER_PORT/health"; then + echo "Error: Router health check failed" + exit 1 + fi + sleep 300 +done \ No newline at end of file From 37d988791e6d79de5c1c9a2e3b852535f71f6b09 Mon Sep 17 00:00:00 2001 From: lewtun Date: Mon, 10 Feb 2025 15:05:52 +0100 Subject: [PATCH 035/137] Fix AIME25 naming (#270) * Fix AIME25 naming * Fix --- src/open_r1/evaluate.py | 7 ++++--- src/open_r1/utils/evaluation.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/open_r1/evaluate.py b/src/open_r1/evaluate.py index b32389ac2..bd781b956 100644 --- a/src/open_r1/evaluate.py +++ b/src/open_r1/evaluate.py @@ -106,8 +106,9 @@ def gpqa_prompt_fn(line, task_name: str = None): metric=[expr_gold_metric], version=1, ) -aime25 = LightevalTaskConfig( - name="aime25", +# Part I from AIME 2025 exam: https://artofproblemsolving.com/wiki/index.php/2025_AIME_I?srsltid=AfmBOoof5gaaqlt3-l6LH7Tt6qmJZtl_2PQEDYlLFlMqhq9dLL8FMCRR +aime25_part1 = LightevalTaskConfig( + name="aime25:part1", suite=["custom"], prompt_function=aime_prompt_fn, hf_repo="open-r1/aime_2025_1", @@ -155,7 +156,7 @@ def gpqa_prompt_fn(line, task_name: str = None): # Add tasks to the table TASKS_TABLE = [] TASKS_TABLE.append(aime24) -TASKS_TABLE.append(aime25) +TASKS_TABLE.append(aime25_part1) TASKS_TABLE.append(math_500) TASKS_TABLE.append(gpqa_diamond) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 489971830..3be5daab0 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -48,7 +48,7 @@ def register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "custom", "math_500", "math_500", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0) -register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime25", "aime25", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime25_part1", "aime25:part1", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "gpqa", "gpqa:diamond", 0) From e4ac3ae070e43d7923f806c4e66c45e32ea52969 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Mon, 10 Feb 2025 15:50:47 +0100 Subject: [PATCH 036/137] Fix repetition reward + tests (#272) * fix rep penalty * fix tests * clean up, style --- src/open_r1/grpo.py | 6 ++--- src/open_r1/rewards.py | 6 ++--- tests/test_rewards.py | 57 ++++++++++++++++++++++++++---------------- 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index ab9472676..5cb645524 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -46,7 +46,7 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine'. + List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty'. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. cosine_max_value_wrong (`float`): @@ -60,9 +60,9 @@ class GRPOScriptArguments(ScriptArguments): """ reward_funcs: list[str] = field( - default_factory=lambda: ["accuracy", "format", "reasoning_steps", "cosine"], + default_factory=lambda: ["accuracy", "format"], metadata={ - "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine'" + "help": "List of reward functions. 
Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty'" }, ) cosine_min_value_wrong: float = field( diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 8fc15dbac..f7e270ef8 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -162,9 +162,6 @@ def get_repetition_penalty_reward(ngram_size: int, max_penalty: float): if max_penalty > 0: raise ValueError(f"max_penalty {max_penalty} should not be positive") - if max_penalty == 0: - return 0 - def zipngram(text: str, ngram_size: int): words = text.lower().split() return zip(*[words[i:] for i in range(ngram_size)]) @@ -178,8 +175,9 @@ def repetition_penalty_reward(completions, **kwargs) -> float: completions: List of model completions """ + contents = [completion[0]["content"] for completion in completions] rewards = [] - for completion in completions: + for completion in contents: if completion == "": rewards.append(0.0) continue diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 30e937e1f..0ae015d15 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -124,22 +124,22 @@ def test_zero_max_penalty_returns_zero(self): def test_no_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = ["this is a test sentence"] + completions = [[{"content": "this is a test sentence"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_full_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = ["this this this this this"] + completions = [[{"content": "this this this this this"}]] + rewards = reward_fn(completions) # (1 - 1/4) * -1 = -0.75 self.assertEqual(rewards, [-0.75]) def test_partial_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = [ - "this is a this is a test" - ] # 2-grams: (this, is), (is, a), (a, this), (this, is), (is, a), (a, test) + completions = [[{"content": "this is a this is a test"}]] + rewards = reward_fn(completions) # Unique 2-grams: (this, is), (is, a), (a, this), (a, test). 4 unique out of 6 total # (1 - 4/6) * -1 = -1/3 = -0.3333... @@ -147,7 +147,11 @@ def test_partial_repetition(self): def test_multiple_completions(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) - completions = ["this is a test", "test test test test"] + completions = [ + [{"content": "this is a test"}], + [{"content": "test test test test"}], + ] + rewards = reward_fn(completions) # Completion 1: (this, is, a), (is, a, test) -> 2 unique / 2 total -> (1 - 2/2) * -0.5 = 0 # Completion 2: (test, test, test) -> 1 unique / 2 total -> (1 - 1/2) * -0.5 = -0.25 @@ -156,80 +160,89 @@ def test_multiple_completions(self): def test_empty_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = [""] + completions = [[{"content": ""}]] rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_different_ngram_size(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-2.0) - completions = [ - "this is a this is a test" - ] # 3-grams:(this, is, a) (is, a, this) (a, this, is) (this, is, a) (is, a, test) + completions = [[{"content": "this is a this is a test"}]] + rewards = reward_fn(completions) - # Unique 3-grams: (this, is, a), (is, a, this), (a, this, is), (is, a, test) = 4. 
Total 3-grams: 5 - # (1 - 4/5) * -2 = -0.4 self.assertAlmostEqual(rewards[0], -0.4) def test_mixed_case(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) - completions = ["This is A Test", "this IS a test"] + completions = [ + [{"content": "This is A Test"}], + [{"content": "this IS a test"}], + ] + rewards = reward_fn(completions) # both completions should produce the same reward, because the text gets lowercased self.assertAlmostEqual(rewards[0], rewards[1]) def test_one_word_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["word"] + completions = [[{"content": "word"}]] + rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_two_word_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["two words"] + completions = [[{"content": "two words"}]] + rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_three_word_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["three different words"] + completions = [[{"content": "three different words"}]] + rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) def test_three_word_repetition_completion(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["word word word word"] + completions = [[{"content": "word word word word"}]] + rewards = reward_fn(completions) self.assertEqual(rewards, [-0.5]) def test_four_word_completion_with_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["one two one two"] + completions = [[{"content": "one two one two"}]] + rewards = reward_fn(completions) # ngrams are (one two one) (two one two). unique is 2 and count is 2, therefore (1-1) * -1. self.assertEqual(rewards, [0.0]) def test_five_word_completion_with_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) - completions = ["A B C A B"] + completions = [[{"content": "A B C A B"}]] + rewards = reward_fn(completions) # (A B C) (B C A) (C A B). unique is 3. 
count is 3 (1-1) * -.5 = 0 self.assertEqual(rewards, [0.0]) def test_six_word_completion_with_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["A B C A B C"] + completions = [[{"content": "A B C A B C"}]] + rewards = reward_fn(completions) self.assertEqual(rewards, [-0.25]) def test_long_completion_with_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["A B C A B C E F G A B C A B C"] + completions = [[{"content": "A B C A B C E F G A B C A B C"}]] rewards = reward_fn(completions) self.assertAlmostEqual(rewards[0], -0.3846, places=4) def test_long_completion_without_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-1.0) - completions = ["A B C D E F G H I J K L"] + completions = [[{"content": "A B C D E F G H I J K L"}]] + rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) From 440ae0b24e550bf4a72b5125e0b238f30ce1efd9 Mon Sep 17 00:00:00 2001 From: Anton Lozhkov Date: Mon, 10 Feb 2025 16:52:23 +0100 Subject: [PATCH 037/137] Add the actual async generation script (#273) * sglang inference server * add vllm * readme * add a generation script * ruff --- scripts/generate_reason_data.py | 170 +++++++++++++++++++++++++ slurm/README.md | 17 ++- slurm/experimental/serve_r1_vllm.slurm | 3 - slurm/serve_r1.slurm | 3 - slurm/serve_router.slurm | 1 - 5 files changed, 185 insertions(+), 9 deletions(-) create mode 100644 scripts/generate_reason_data.py diff --git a/scripts/generate_reason_data.py b/scripts/generate_reason_data.py new file mode 100644 index 000000000..01e6e7a73 --- /dev/null +++ b/scripts/generate_reason_data.py @@ -0,0 +1,170 @@ +import argparse +import asyncio +import json +import os +import random +from asyncio import Lock +from typing import Set + +from datasets import load_dataset +from tqdm.asyncio import tqdm + +import aiofiles +import aiohttp +import uvloop + + +file_lock = Lock() + + +async def generate_completion(session, prompt, args): + retry_budget = 10 + while retry_budget > 0: + try: + await asyncio.sleep(random.uniform(0.0, 0.1)) + async with session.post( + f"http://{args.api_addr}/v1/chat/completions", + json={ + "model": "default", + "messages": [{"role": "user", "content": prompt}], + "max_tokens": args.max_tokens, + "temperature": args.temperature, + "top_p": args.top_p, + }, + headers={"Authorization": "Bearer EMPTY"}, + ) as response: + return await response.json(content_type=None) + except Exception as e: + print(f"API error (will retry): {e}") + retry_budget -= 1 + await asyncio.sleep(10) + return None + + +async def process_example(example, session, args, output_file, pbar): + prompt = args.prompt_template.format(prompt=example[args.prompt_column]) + + try: + tasks = [generate_completion(session, prompt, args) for _ in range(args.num_generations)] + + completions = await asyncio.gather(*tasks) + + if any(completion is None for completion in completions): + print(f"Error processing example") + pbar.update(1) + return None + + generations = [] + finish_reasons = [] + api_metadata = [] + + for completion in completions: + generations.append(completion["choices"][0]["message"]["content"]) + finish_reasons.append(completion["choices"][0]["finish_reason"]) + api_metadata.append(completion["usage"]) + + # Combine original dataset fields with generations + result = { + **example, # Preserve all original dataset fields + "generations": generations, + "finish_reasons": finish_reasons, + "api_metadata": 
api_metadata, + } + + # Write to file with lock + async with file_lock: + async with aiofiles.open(output_file, mode="a") as f: + await f.write(json.dumps(result) + "\n") + await f.flush() + + pbar.set_postfix(active=len(pbar.active_tasks), refresh=False) + pbar.update(1) + + return result + except Exception as e: + print(f"Error processing example: {e}") + pbar.update(1) + return None + + +async def load_processed_uuids(output_file): + processed_uuids = set() + if os.path.exists(output_file): + async with aiofiles.open(output_file, mode="r") as f: + async for line in f: + try: + data = json.loads(line) + processed_uuids.add(data["uuid"]) + except json.JSONDecodeError: + continue + return processed_uuids + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--dataset-name", type=str, required=True) + parser.add_argument("--output-file", type=str, required=True) + parser.add_argument("--prompt-column", type=str, required=True) + parser.add_argument("--uuid-column", type=str, required=True) + parser.add_argument("--api-addr", type=str, default="localhost:39876") + parser.add_argument("--num-generations", type=int, default=4) + parser.add_argument( + "--prompt-template", + type=str, + default="You will be given a problem. Please reason step by step, and put your final answer within \\boxed{{}}:\n{prompt}", + ) + parser.add_argument("--temperature", type=float, default=0.6) + parser.add_argument("--top-p", type=float, default=0.95) + parser.add_argument("--max-tokens", type=int, default=16384) + parser.add_argument("--max-concurrent", type=int, default=1000) + args = parser.parse_args() + + dataset = load_dataset(args.dataset_name, split="train").shuffle() + processed_uuids = await load_processed_uuids(args.output_file) + + if not os.path.exists(args.output_file): + async with aiofiles.open(args.output_file, mode="w") as f: + await f.write("") + + active_tasks: Set[asyncio.Task] = set() + + pbar = tqdm( + total=len(dataset), + desc="Generating responses", + unit="row", + mininterval=2, + smoothing=0.0001, + ) + pbar.active_tasks = active_tasks + + async with aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=60 * 60), + connector=aiohttp.TCPConnector(limit=args.max_concurrent, ttl_dns_cache=300, keepalive_timeout=60 * 60), + ) as session: + for example in dataset: + if example["uuid"] not in processed_uuids: + # Wait if we've hit the concurrency limit + while len(active_tasks) >= args.max_concurrent: + done, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED) + for task in done: + try: + await task + except Exception as e: + print(f"Task failed: {e}") + + task = asyncio.create_task(process_example(example, session, args, args.output_file, pbar)) + active_tasks.add(task) + task.add_done_callback(active_tasks.discard) + + pbar.set_postfix(active=len(active_tasks), refresh=True) + + # Wait for remaining tasks + if active_tasks: + await asyncio.gather(*active_tasks, return_exceptions=True) + + pbar.close() + + +if __name__ == "__main__": + uvloop.install() + asyncio.run(main()) diff --git a/slurm/README.md b/slurm/README.md index f81c583c4..101064594 100644 --- a/slurm/README.md +++ b/slurm/README.md @@ -11,7 +11,20 @@ pip install sgl-kernel --force-reinstall --no-deps pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/ ``` -2. Run the server: +2. 
Run the server and wait for the model to load: ```bash -sbatch serve_r1.slurm -m "/fsx/deepseek-r1-checkpoint" -e "sglang124" +sbatch slurm/serve_r1.slurm -m "/fsx/deepseek-r1-checkpoint" -e "sglang124" +``` + +3. Run the data generation script: +```bash +python scripts/generate_reasoning.py \ + --dataset-name "AI-MO/NuminaMath-1.5" \ + --output-file "numinamath_r1_generations.jsonl" \ + --prompt-column "problem" \ + --uuid-column "problem" \ + --api-addr ":39877" \ + --num-generations 2 \ + --max-tokens 16384 \ + --max-concurrent 200 ``` \ No newline at end of file diff --git a/slurm/experimental/serve_r1_vllm.slurm b/slurm/experimental/serve_r1_vllm.slurm index 7e0b0d597..9f1ffd938 100644 --- a/slurm/experimental/serve_r1_vllm.slurm +++ b/slurm/experimental/serve_r1_vllm.slurm @@ -9,7 +9,6 @@ #SBATCH --error=./logs/%x_%j_%n.err #SBATCH --time=7-00:00:00 #SBATCH --ntasks-per-node=1 -#SBATCH --requeue set -exuo pipefail @@ -19,8 +18,6 @@ SERVER_PORT=8000 RAY_PORT=6379 RAY_DASHBOARD_PORT=8265 -trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 - while getopts "m:e:h" opt; do case $opt in m) MODEL_PATH="$OPTARG" ;; diff --git a/slurm/serve_r1.slurm b/slurm/serve_r1.slurm index 60a72aadf..6cb3719db 100644 --- a/slurm/serve_r1.slurm +++ b/slurm/serve_r1.slurm @@ -9,7 +9,6 @@ #SBATCH --error=./logs/%x_%j_%n.err #SBATCH --time=7-00:00:00 #SBATCH --ntasks-per-node=1 -#SBATCH --requeue set -exuo pipefail @@ -19,8 +18,6 @@ ROUTER_ADDRESS="" SERVER_PORT=39877 DIST_PORT=45000 -trap 'scontrol requeue ${SLURM_JOB_ID}; exit 15' SIGUSR1 - # TODO: Adjust these variables to your cluster configuration export OUTLINES_CACHE_DIR=/scratch/serve_r1/ocache/ export TRITON_HOME=/scratch/serve_r1/triton/ diff --git a/slurm/serve_router.slurm b/slurm/serve_router.slurm index b39ca66aa..0fe96177f 100644 --- a/slurm/serve_router.slurm +++ b/slurm/serve_router.slurm @@ -33,7 +33,6 @@ conda activate "$CONDA_ENV" || { echo "Failed to activate conda env $CONDA_ENV"; python -m sglang_router.launch_router \ --port "$ROUTER_PORT" \ --host 0.0.0.0 \ - --policy "round_robin" \ --worker-startup-timeout-secs 300 # Keep the job running with health checks From 3f630aaabba6d686d3df3406b80b235a64be7ceb Mon Sep 17 00:00:00 2001 From: Anton Lozhkov Date: Mon, 10 Feb 2025 16:53:53 +0100 Subject: [PATCH 038/137] Rename to generate_reasoning.py (#275) --- scripts/{generate_reason_data.py => generate_reasoning.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{generate_reason_data.py => generate_reasoning.py} (100%) diff --git a/scripts/generate_reason_data.py b/scripts/generate_reasoning.py similarity index 100% rename from scripts/generate_reason_data.py rename to scripts/generate_reasoning.py From 517adddae34ac943d3924c1b33b4391b70291d44 Mon Sep 17 00:00:00 2001 From: Almaz Zinollayev <39913951+zeenolife@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:28:35 +0000 Subject: [PATCH 039/137] [Testing Github workflow] Updating workflows and makefile (#214) * [Testing Github workflow] Updating workflows and makefile * [Testing Github workflow] - Refactoring workflow, fixing tests erorr, easier debugging * [Testing Github workflow] Converting docstring into raw string * [Testing Github workflow] - Fixing test_zero_max_penalty_returns_zero() test * [Testing Github workflow] Removing redundant test --- .github/workflows/{quality.yml => tests.yml} | 11 +++++++---- Makefile | 2 ++ setup.py | 2 +- src/open_r1/rewards.py | 2 +- tests/test_rewards.py | 4 ---- 5 files changed, 11 insertions(+), 10 deletions(-) rename 
.github/workflows/{quality.yml => tests.yml} (74%) diff --git a/.github/workflows/quality.yml b/.github/workflows/tests.yml similarity index 74% rename from .github/workflows/quality.yml rename to .github/workflows/tests.yml index 908377273..1d2885ab5 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: Quality +name: Tests on: push: @@ -11,8 +11,8 @@ on: jobs: - check_code_quality: - name: Check code quality + tests: + name: Run tests and quality checks runs-on: ubuntu-latest steps: - name: Checkout code @@ -24,8 +24,11 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install ".[quality]" + python -m pip install ".[quality,tests]" - name: Code quality run: | make quality + - name: Run tests + run: | + make test diff --git a/Makefile b/Makefile index ec757927e..7f28821b5 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,8 @@ quality: isort --check-only $(check_dirs) setup.py flake8 --max-line-length 119 $(check_dirs) setup.py +test: + pytest -sv tests/ # Evaluation diff --git a/setup.py b/setup.py index ef742eb00..6d7c74a74 100644 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ def deps_list(*pkgs): extras = {} -extras["tests"] = deps_list("pytest", "parameterized") +extras["tests"] = deps_list("pytest", "parameterized", "math-verify") extras["torch"] = deps_list("torch") extras["quality"] = deps_list("ruff", "isort", "flake8") extras["train"] = deps_list("flash_attn") diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index f7e270ef8..bec3d11c1 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -58,7 +58,7 @@ def format_reward(completions, **kwargs): def reasoning_steps_reward(completions, **kwargs): - """Reward function that checks for clear step-by-step reasoning. + r"""Reward function that checks for clear step-by-step reasoning. Regex pattern: Step \d+: - matches "Step 1:", "Step 2:", etc. ^\d+\. - matches numbered lists like "1.", "2.", etc. 
at start of line diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 0ae015d15..7f0cbfa94 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -118,10 +118,6 @@ def test_positive_max_penalty_raises_value_error(self): with self.assertRaisesRegex(ValueError, "max_penalty 1.5 should not be positive"): get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5) - def test_zero_max_penalty_returns_zero(self): - reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=0.0) - self.assertEqual(reward_fn, 0) - def test_no_repetition(self): reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0) completions = [[{"content": "this is a test sentence"}]] From 82b2a6525f1045b5ba7469e75daefd0abec1d063 Mon Sep 17 00:00:00 2001 From: Jinfeng Sun <86536994+Tendo33@users.noreply.github.com> Date: Tue, 11 Feb 2025 16:34:19 +0800 Subject: [PATCH 040/137] fix(sft recipes): remove duplicate packing option from config (#280) --- recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index 6781996d0..c7dd25bbd 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -32,7 +32,6 @@ max_steps: -1 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Distill overwrite_output_dir: true -packing: true per_device_eval_batch_size: 4 per_device_train_batch_size: 2 push_to_hub: true From 52aa8759a263151eb8fdcea6e819403f808c08cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Tue, 11 Feb 2025 09:35:06 +0100 Subject: [PATCH 041/137] new grpo logic (#274) --- recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml | 6 +++--- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml | 6 +++--- recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml index f1468283d..be1fa3f65 100644 --- a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml @@ -31,12 +31,12 @@ lr_scheduler_type: cosine max_prompt_length: 512 max_completion_length: 1024 max_steps: -1 -num_generations: 2 +num_generations: 7 num_train_epochs: 1 output_dir: data/DeepSeek-R1-Distill-Qwen-7B-GRPO overwrite_output_dir: true -per_device_eval_batch_size: 4 -per_device_train_batch_size: 2 +per_device_eval_batch_size: 32 +per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 6de40e023..817939390 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -33,12 +33,12 @@ lr_scheduler_type: cosine max_prompt_length: 512 max_completion_length: 1024 max_steps: -1 -num_generations: 2 +num_generations: 7 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_eval_batch_size: 4 -per_device_train_batch_size: 2 +per_device_eval_batch_size: 32 +per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb diff --git a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml index 
636bfecbe..6df7730b9 100644 --- a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml +++ b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml @@ -37,8 +37,8 @@ num_generations: 7 num_train_epochs: 1 output_dir: data/Qwen-2.5-7B-Simple-RL overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 +per_device_eval_batch_size: 16 +per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb From fa9b621cc9fc21cb67912dbb41b389974b7213ec Mon Sep 17 00:00:00 2001 From: Anton Lozhkov Date: Tue, 11 Feb 2025 14:08:46 +0100 Subject: [PATCH 042/137] Fix uuid in the data generator (#284) * fix uuid issues --- scripts/generate_reasoning.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/generate_reasoning.py b/scripts/generate_reasoning.py index 01e6e7a73..11bdb7e88 100644 --- a/scripts/generate_reasoning.py +++ b/scripts/generate_reasoning.py @@ -1,5 +1,6 @@ import argparse import asyncio +import hashlib import json import os import random @@ -87,14 +88,14 @@ async def process_example(example, session, args, output_file, pbar): return None -async def load_processed_uuids(output_file): +async def load_processed_uuids(output_file, uuid_column): processed_uuids = set() if os.path.exists(output_file): async with aiofiles.open(output_file, mode="r") as f: async for line in f: try: data = json.loads(line) - processed_uuids.add(data["uuid"]) + processed_uuids.add(hashlib.md5(str(data[uuid_column]).encode()).hexdigest()) except json.JSONDecodeError: continue return processed_uuids @@ -120,7 +121,9 @@ async def main(): args = parser.parse_args() dataset = load_dataset(args.dataset_name, split="train").shuffle() - processed_uuids = await load_processed_uuids(args.output_file) + processed_uuids = await load_processed_uuids(args.output_file, args.uuid_column) + if processed_uuids: + print(f"Found {len(processed_uuids)} already processed examples, resuming from there...") if not os.path.exists(args.output_file): async with aiofiles.open(args.output_file, mode="w") as f: @@ -129,7 +132,7 @@ async def main(): active_tasks: Set[asyncio.Task] = set() pbar = tqdm( - total=len(dataset), + total=len(dataset) - len(processed_uuids), desc="Generating responses", unit="row", mininterval=2, @@ -142,7 +145,8 @@ async def main(): connector=aiohttp.TCPConnector(limit=args.max_concurrent, ttl_dns_cache=300, keepalive_timeout=60 * 60), ) as session: for example in dataset: - if example["uuid"] not in processed_uuids: + uuid = hashlib.md5(str(example[args.uuid_column]).encode()).hexdigest() + if uuid not in processed_uuids: # Wait if we've hit the concurrency limit while len(active_tasks) >= args.max_concurrent: done, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED) From 96a6b0fa33308e69a9b0cf7c4ae09bbf3aa39d41 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 12 Feb 2025 13:01:07 +0100 Subject: [PATCH 043/137] Enable Weights & Biases defaults to be overridden in training (#294) * Enable WandB defaults to be set * Fix --- README.md | 12 ++++++++++-- src/open_r1/configs.py | 16 ++++++++++++++++ src/open_r1/grpo.py | 6 +++++- src/open_r1/sft.py | 6 +++++- src/open_r1/utils/logging.py | 9 +++++++++ 5 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 src/open_r1/utils/logging.py diff --git a/README.md b/README.md index 48b767b53..333c54f87 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,14 @@ accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r 
--per_device_train_batch_size=1 --num_train_epochs=5 ``` +If you also wish to override the Weights and Biases default settings, you can do so as follows: + +```shell +accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ + --config recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml + --wandb_entity huggingface --wandb_project open-r1 --run_name Qwen2.5-1.5B-GRPO +``` + > [!NOTE] > The training commands below are configured for a node of 8 x H100s (80GB). For different hardware and topologies, you may need to tune the batch size and number of gradient accumulation steps. @@ -141,10 +149,10 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con ### GRPO -To train via the GRPO trainer, we use one GPU to run vLLM for faster generation and the remaining GPUs for training. For example, one a node with 8 GPUs, use the `recipes/accelerate_configs/zero3.yaml` config and then overwrite `num_processes` to run on 7 devices: +To train via the GRPO trainer, we use one GPU to run vLLM for faster generation and the remaining GPUs for training. For example, one a node with 8 GPUs, use the `recipes/accelerate_configs/zero2.yaml` config and then overwrite `num_processes` to run on 7 devices: ```shell -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ --num_processes=7 src/open_r1/grpo.py \ --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml ``` diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 57968b4b4..3a6f68665 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -40,6 +40,14 @@ class GRPOConfig(trl.GRPOConfig): ) overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."}) push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."}) + wandb_entity: Optional[str] = field( + default=None, + metadata={"help": ("The entity to store runs under.")}, + ) + wandb_project: Optional[str] = field( + default=None, + metadata={"help": ("The project to store runs under.")}, + ) @dataclass @@ -64,3 +72,11 @@ class SFTConfig(trl.SFTConfig): ) overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."}) push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."}) + wandb_entity: Optional[str] = field( + default=None, + metadata={"help": ("The entity to store runs under.")}, + ) + wandb_project: Optional[str] = field( + default=None, + metadata={"help": ("The project to store runs under.")}, + ) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 5cb645524..128375db5 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -33,6 +33,7 @@ reasoning_steps_reward, ) from open_r1.utils.callbacks import get_callbacks +from open_r1.utils.logging import init_wandb_training from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config @@ -130,7 +131,7 @@ def main(script_args, training_args, model_args): ) logger.info(f"Model parameters {model_args}") logger.info(f"Script parameters {script_args}") - logger.info(f"Data parameters {training_args}") + logger.info(f"Training parameters {training_args}") # Check for last checkpoint last_checkpoint = None @@ -139,6 +140,9 @@ def main(script_args, training_args, model_args): if last_checkpoint 
is not None and training_args.resume_from_checkpoint is None: logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") + if "wandb" in training_args.report_to: + init_wandb_training(training_args) + # Load the dataset dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index e8587d034..16791cd4f 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -48,6 +48,7 @@ from open_r1.configs import SFTConfig from open_r1.utils.callbacks import get_callbacks +from open_r1.utils.logging import init_wandb_training from trl import ( ModelConfig, ScriptArguments, @@ -88,7 +89,7 @@ def main(script_args, training_args, model_args): ) logger.info(f"Model parameters {model_args}") logger.info(f"Script parameters {script_args}") - logger.info(f"Data parameters {training_args}") + logger.info(f"Training parameters {training_args}") # Check for last checkpoint last_checkpoint = None @@ -97,6 +98,9 @@ def main(script_args, training_args, model_args): if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") + if "wandb" in training_args.report_to: + init_wandb_training(training_args) + ################ # Load datasets ################ diff --git a/src/open_r1/utils/logging.py b/src/open_r1/utils/logging.py new file mode 100644 index 000000000..764f30f8a --- /dev/null +++ b/src/open_r1/utils/logging.py @@ -0,0 +1,9 @@ +import os + + +def init_wandb_training(training_args): + """ + Helper function for setting up Weights & Biases logging tools. + """ + os.environ["WANDB_ENTITY"] = training_args.wandb_entity + os.environ["WANDB_PROJECT"] = training_args.wandb_project From f987b3c8775943c87e69b15e704bc1935a31cfab Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Thu, 13 Feb 2025 10:48:11 +0100 Subject: [PATCH 044/137] bump vllm to version to 0.7.2 (#311) VLLM has made a number of throughput improvements in version 0.7.2, so it's worth bumping the version, particularly for GRPO training runs. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 333c54f87..07d2ddd36 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --u Next, install vLLM: ```shell -uv pip install vllm==0.7.1 --link-mode=copy +uv pip install vllm==0.7.2 --link-mode=copy ``` This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. 
For most contributors, we recommend: From 80e7e7b23c3e6694d94f1d719d69f2134504af66 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Thu, 13 Feb 2025 11:13:00 +0100 Subject: [PATCH 045/137] move details script and fix wandb logging (#314) --- {src/open_r1/utils => scripts}/upload_details.py | 0 slurm/evaluate.slurm | 2 +- src/open_r1/utils/logging.py | 9 --------- src/open_r1/utils/wandb_logging.py | 11 +++++++++++ 4 files changed, 12 insertions(+), 10 deletions(-) rename {src/open_r1/utils => scripts}/upload_details.py (100%) delete mode 100644 src/open_r1/utils/logging.py create mode 100644 src/open_r1/utils/wandb_logging.py diff --git a/src/open_r1/utils/upload_details.py b/scripts/upload_details.py similarity index 100% rename from src/open_r1/utils/upload_details.py rename to scripts/upload_details.py diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index c659c0b34..da106f6be 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -81,7 +81,7 @@ echo "Uploading details to Hugging Face Hub..." DETAILS_FILEPATHS=$(find $OUTPUT_DIR/details/ -type f \( -name "*.parquet" \)) echo "DETAILS_FILEPATHS: $DETAILS_FILEPATHS" TIMESTAMP=$(date +"%Y-%m-%dT%H-%M-%S") -python src/open_r1/utils/upload_details.py --data_files $DETAILS_FILEPATHS --hub_repo_id $DETAILS_REPO_ID --config_name $MODEL_REVISION.$TASK_NAME.$TIMESTAMP +python scripts/upload_details.py --data_files $DETAILS_FILEPATHS --hub_repo_id $DETAILS_REPO_ID --config_name $MODEL_REVISION.$TASK_NAME.$TIMESTAMP echo "Cleaning up ..." rm -rf $OUTPUT_DIR diff --git a/src/open_r1/utils/logging.py b/src/open_r1/utils/logging.py deleted file mode 100644 index 764f30f8a..000000000 --- a/src/open_r1/utils/logging.py +++ /dev/null @@ -1,9 +0,0 @@ -import os - - -def init_wandb_training(training_args): - """ - Helper function for setting up Weights & Biases logging tools. - """ - os.environ["WANDB_ENTITY"] = training_args.wandb_entity - os.environ["WANDB_PROJECT"] = training_args.wandb_project diff --git a/src/open_r1/utils/wandb_logging.py b/src/open_r1/utils/wandb_logging.py new file mode 100644 index 000000000..13b552766 --- /dev/null +++ b/src/open_r1/utils/wandb_logging.py @@ -0,0 +1,11 @@ +import os + + +def init_wandb_training(training_args): + """ + Helper function for setting up Weights & Biases logging tools. 
+ """ + if training_args.wandb_entity is not None: + os.environ["WANDB_ENTITY"] = training_args.wandb_entity + if training_args.wandb_project is not None: + os.environ["WANDB_PROJECT"] = training_args.wandb_project From 78322906872d7daafa002eaa49d89cbf0d7c43ad Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 13 Feb 2025 11:51:09 +0100 Subject: [PATCH 046/137] [Rewards] add kimi len_reward (#292) * add kimi len_reward * add to REWARD_FUNCS_REGISTRY * fix formatting * Update src/open_r1/grpo.py Co-authored-by: lewtun * Update src/open_r1/grpo.py Co-authored-by: lewtun * Update src/open_r1/grpo.py Co-authored-by: lewtun * Update src/open_r1/rewards.py Co-authored-by: lewtun * Update src/open_r1/rewards.py Co-authored-by: lewtun * Update src/open_r1/rewards.py Co-authored-by: lewtun * Update src/open_r1/rewards.py Co-authored-by: lewtun * Update src/open_r1/rewards.py Co-authored-by: lewtun * missing import --------- Co-authored-by: lewtun --- src/open_r1/grpo.py | 6 ++-- src/open_r1/rewards.py | 74 ++++++++++++++++++++++++++++++++++++++++++ tests/test_rewards.py | 70 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 2 deletions(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 128375db5..916be06e4 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -30,6 +30,7 @@ format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, + len_reward, reasoning_steps_reward, ) from open_r1.utils.callbacks import get_callbacks @@ -47,7 +48,7 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty'. + List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. cosine_max_value_wrong (`float`): @@ -63,7 +64,7 @@ class GRPOScriptArguments(ScriptArguments): reward_funcs: list[str] = field( default_factory=lambda: ["accuracy", "format"], metadata={ - "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty'" + "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'" }, ) cosine_min_value_wrong: float = field( @@ -162,6 +163,7 @@ def main(script_args, training_args, model_args): ngram_size=script_args.repetition_n_grams, max_penalty=script_args.repetition_max_penalty, ), + "length": len_reward, } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index bec3d11c1..279627846 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -2,6 +2,7 @@ import math import re +from typing import Dict from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify @@ -74,6 +75,79 @@ def reasoning_steps_reward(completions, **kwargs): return [min(1.0, count / 3) for count in matches] +def len_reward(completions: list[Dict[str, str]], solutions: list[str], **kwargs) -> float: + """Compute length-based rewards to discourage overthinking and promote token efficiency. 
+ + Taken from from the Kimi 1.5 tech report: https://arxiv.org/abs/2501.12599 + + Args: + completions: List of model completions + solutions: List of ground truth solutions + + Returns: + List of rewards where: + - For correct answers: reward = 0.5 - (len - min_len)/(max_len - min_len) + - For incorrect answers: reward = min(0, 0.5 - (len - min_len)/(max_len - min_len)) + """ + contents = [completion[0]["content"] for completion in completions] + + # First check correctness of answers + correctness = [] + for content, sol in zip(contents, solutions): + gold_parsed = parse( + sol, + extraction_mode="first_match", + extraction_config=[LatexExtractionConfig()], + ) + if len(gold_parsed) == 0: + # Skip unparseable examples + correctness.append(True) # Treat as correct to avoid penalizing + print("Failed to parse gold solution: ", sol) + continue + + answer_parsed = parse( + content, + extraction_config=[ + LatexExtractionConfig( + normalization_config=NormalizationConfig( + nits=False, + malformed_operators=False, + basic_latex=True, + equations=True, + boxed=True, + units=True, + ), + boxed_match_priority=0, + try_extract_without_anchor=False, + ) + ], + extraction_mode="first_match", + ) + correctness.append(verify(answer_parsed, gold_parsed)) + + # Calculate lengths + lengths = [len(content) for content in contents] + min_len = min(lengths) + max_len = max(lengths) + + # If all responses have the same length, return zero rewards + if max_len == min_len: + return [0.0] * len(completions) + + rewards = [] + for length, is_correct in zip(lengths, correctness): + lambda_val = 0.5 - (length - min_len) / (max_len - min_len) + + if is_correct: + reward = lambda_val + else: + reward = min(0, lambda_val) + + rewards.append(float(reward)) + + return rewards + + def get_cosine_scaled_reward( min_value_wrong: float = -1.0, max_value_wrong: float = -0.5, diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 7f0cbfa94..9e41bdb0d 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -5,6 +5,7 @@ format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, + len_reward, reasoning_steps_reward, ) @@ -110,6 +111,75 @@ def test_format_reward_specific_multiline(self): rewards = format_reward(completion) self.assertEqual(rewards[0], 1.0) + def test_same_length_responses(self): + """Test len_reward when all responses have the same length.""" + completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]] + solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] + + rewards = len_reward(completions, solutions) + self.assertEqual(rewards, [0.0, 0.0]) + + def test_different_lengths_correct_answers(self): + """Test len_reward with different length correct answers.""" + completions = [ + [{"content": r"\boxed{\frac{63}{400}}"}], # shorter + [{"content": r"\boxed{\frac{63}{400}} " + "x" * 10}], # longer + ] + solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] + + rewards = len_reward(completions, solutions) + self.assertGreater(rewards[0], rewards[1]) # shorter answer should get higher reward + self.assertAlmostEqual(rewards[0], 0.5) # shortest correct answer gets maximum reward + + def test_different_lengths_incorrect_answers(self): + """Test len_reward with different length incorrect answers.""" + completions = [ + [{"content": r"\boxed{\frac{64}{400}}"}], # shorter + [{"content": r"\boxed{\frac{64}{400}} " + "x" * 10}], # longer + ] + solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] + + rewards = len_reward(completions, solutions) + 
self.assertLessEqual(rewards[0], 0.0) # incorrect answers should get non-positive rewards + self.assertLessEqual(rewards[1], 0.0) + self.assertGreater(rewards[0], rewards[1]) # shorter answer should still be penalized less + + def test_mixed_correctness(self): + """Test len_reward with mix of correct and incorrect answers of different lengths.""" + completions = [ + [{"content": r"\boxed{\frac{63}{400}}"}], # correct, shorter + [{"content": r"\boxed{\frac{63}{400}} " + "x" * 10}], # correct, longer + [{"content": r"\boxed{\frac{64}{400}}"}], # incorrect, shorter + [{"content": r"\boxed{\frac{64}{400}} " + "x" * 10}], # incorrect, longer + ] + solutions = [r"\frac{63}{400}"] * 4 + + rewards = len_reward(completions, solutions) + + # Shortest correct answer should get positive reward + self.assertGreater(rewards[0], 0.0) + + # Longer correct answer might get negative reward: + self.assertGreater(rewards[2], rewards[1]) + self.assertGreaterEqual(rewards[1], rewards[3]) + + # Incorrect answers should get non-positive rewards + self.assertLessEqual(rewards[2], 0.0) + self.assertLessEqual(rewards[3], 0.0) + + # Shorter answers should get better rewards within their correctness category + self.assertGreater(rewards[0], rewards[1]) # correct answers + self.assertGreater(rewards[2], rewards[3]) # incorrect answers + + def test_unparseable_solution(self): + """Test len_reward with unparseable solution.""" + completions = [[{"content": r"\boxed{answer}"}], [{"content": r"\boxed{answer} " + "x" * 10}]] + solutions = ["unparseable_latex", "unparseable_latex"] + + rewards = len_reward(completions, solutions) + self.assertGreater(rewards[0], rewards[1]) # shorter answer should still get better reward + self.assertAlmostEqual(rewards[0], 0.5) # treated as correct, shortest gets maximum reward + class TestRepetitionPenaltyReward(unittest.TestCase): def test_positive_max_penalty_raises_value_error(self): From 272b648c03debab468293a3f0dc7493a6cd56662 Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 13 Feb 2025 12:01:09 +0100 Subject: [PATCH 047/137] Fix logging import (#316) --- .gitignore | 4 +++- src/open_r1/grpo.py | 2 +- src/open_r1/sft.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index d44c47f64..f4db78191 100644 --- a/.gitignore +++ b/.gitignore @@ -175,4 +175,6 @@ data/ wandb/ logs/ eval_results/ -results/ \ No newline at end of file +results/ + +.vscode/ \ No newline at end of file diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 916be06e4..1970f8ef4 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -34,7 +34,7 @@ reasoning_steps_reward, ) from open_r1.utils.callbacks import get_callbacks -from open_r1.utils.logging import init_wandb_training +from open_r1.utils.wandb_logging import init_wandb_training from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 16791cd4f..b6031d813 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -48,7 +48,7 @@ from open_r1.configs import SFTConfig from open_r1.utils.callbacks import get_callbacks -from open_r1.utils.logging import init_wandb_training +from open_r1.utils.wandb_logging import init_wandb_training from trl import ( ModelConfig, ScriptArguments, From fbea53267b9676fc89e92c9a24c83cb23e0884d0 Mon Sep 17 00:00:00 2001 From: Almaz Zinollayev <39913951+zeenolife@users.noreply.github.com> Date: Thu, 13 Feb 2025 13:08:27 +0000 Subject: [PATCH 048/137] Weighted reward functions (#213) 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Weighted reward functions] Adding functionality to weigh rewards. Tests. * [Weighted reward functions] Adding @wraps decorator to preserve reward function metadata * style * Changing grpo.py tests to run if cuda is available * style * Apply suggestions from code review Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> --------- Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> Co-authored-by: Quentin Gallouédec Co-authored-by: Kashif Rasul --- .../grpo/config_demo.yaml | 6 +++ .../grpo/config_demo.yaml | 6 +++ .../grpo/config_simple_rl.yaml | 6 +++ src/open_r1/grpo.py | 28 ++++++++++- src/open_r1/rewards.py | 20 ++++++++ tests/test_grpo.py | 47 +++++++++++++++++++ tests/test_rewards.py | 41 ++++++++++++++++ 7 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 tests/test_grpo.py diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml index be1fa3f65..2cedcb544 100644 --- a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml @@ -40,6 +40,12 @@ per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb +reward_funcs: + - accuracy + - format +reward_weights: + - 1.0 + - 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 817939390..2e35dd46e 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -42,6 +42,12 @@ per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb +reward_funcs: + - accuracy + - format +reward_weights: + - 1.0 + - 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml index 6df7730b9..60af677ed 100644 --- a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml +++ b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml @@ -42,6 +42,12 @@ per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb +reward_funcs: + - accuracy + - format +reward_weights: + - 1.0 + - 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 1970f8ef4..305374221 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -16,6 +16,7 @@ import os import sys from dataclasses import dataclass, field +from typing import Optional import datasets import torch @@ -27,6 +28,7 @@ from open_r1.configs import GRPOConfig from open_r1.rewards import ( accuracy_reward, + create_weighted_reward, format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, @@ -49,6 +51,8 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'. + reward_weights (`list[float]` or `None`, *optional*): + List of weights for each reward function. If not provided, defaults to 1.0 for each reward function. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. cosine_max_value_wrong (`float`): @@ -67,6 +71,12 @@ class GRPOScriptArguments(ScriptArguments): "help": "List of reward functions. 
Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'" }, ) + reward_weights: Optional[list[float]] = field( + default=None, + metadata={ + "help": "List of weights for each reward function. If not provided, defaults to 1.0 for each function." + }, + ) cosine_min_value_wrong: float = field( default=0.0, metadata={"help": "Minimum reward for wrong answers"}, @@ -88,6 +98,17 @@ class GRPOScriptArguments(ScriptArguments): metadata={"help": "Maximum length for scaling"}, ) + def __post_init__(self): + # If no weights were provided, default to 1.0 for each reward function + if self.reward_weights is None: + self.reward_weights = [1.0] * len(self.reward_funcs) + # If weights were provided, validate the length + elif len(self.reward_weights) != len(self.reward_funcs): + raise ValueError( + f"Number of reward weights ({len(self.reward_weights)}: {self.reward_weights}) must match " + f"number of reward functions ({len(self.reward_funcs)}: {self.reward_funcs})" + ) + repetition_n_grams: int = field( default=3, metadata={"help": "Number of n-grams for repetition penalty reward"}, @@ -147,7 +168,7 @@ def main(script_args, training_args, model_args): # Load the dataset dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) - # Get reward functions + # Create weighted reward functions REWARD_FUNCS_REGISTRY = { "accuracy": accuracy_reward, "format": format_reward, @@ -165,7 +186,10 @@ def main(script_args, training_args, model_args): ), "length": len_reward, } - reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] + reward_funcs = [ + create_weighted_reward(REWARD_FUNCS_REGISTRY[func], weight) + for func, weight in zip(script_args.reward_funcs, script_args.reward_weights) + ] # Format into conversation def make_conversation(example): diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 279627846..7da8866fc 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -2,6 +2,7 @@ import math import re +from functools import wraps from typing import Dict from latex2sympy2_extended import NormalizationConfig @@ -271,3 +272,22 @@ def repetition_penalty_reward(completions, **kwargs) -> float: return rewards return repetition_penalty_reward + + +def create_weighted_reward(func, weight): + """Create a weighted version of a reward function. 
+ + Args: + func: The reward function to weight + weight: The weight to apply to the reward + + Returns: + A new function that applies the weight to the reward + """ + + @wraps(func) + def weighted_reward(*args, **kwargs): + rewards = func(*args, **kwargs) + return [r * weight for r in rewards] + + return weighted_reward diff --git a/tests/test_grpo.py b/tests/test_grpo.py new file mode 100644 index 000000000..8cfb92904 --- /dev/null +++ b/tests/test_grpo.py @@ -0,0 +1,47 @@ +import unittest + +import torch + + +@unittest.skipUnless(torch.cuda.is_available(), "CUDA not available") +class TestGRPOScriptArguments(unittest.TestCase): + @classmethod + def setUpClass(cls): + from open_r1.grpo import GRPOScriptArguments + + cls.GRPOScriptArguments = GRPOScriptArguments + + def test_default_weights(self): + """Test that default weights are correctly set when not provided.""" + args = self.GRPOScriptArguments(dataset_name="ABC") + self.assertEqual(len(args.reward_funcs), len(args.reward_weights)) + self.assertEqual(args.reward_weights, [1.0] * len(args.reward_funcs)) + + def test_custom_weights_valid(self): + """Test that custom weights are accepted when matching reward_funcs length.""" + args = self.GRPOScriptArguments( + dataset_name="ABC", reward_funcs=["accuracy", "format", "reasoning_steps"], reward_weights=[0.5, 1.0, 2.0] + ) + self.assertEqual(args.reward_weights, [0.5, 1.0, 2.0]) + + def test_custom_weights_invalid(self): + """Test that mismatched weights raise ValueError.""" + with self.assertRaises(ValueError) as context: + self.GRPOScriptArguments( + dataset_name="ABC", reward_funcs=["accuracy", "format"], reward_weights=[1.0, 2.0, 3.0] + ) + self.assertIn("Number of reward weights", str(context.exception)) + self.assertIn("must match number of reward functions", str(context.exception)) + + def test_empty_weights_with_custom_funcs(self): + """Test that empty weights are filled with 1.0 for custom reward functions.""" + args = self.GRPOScriptArguments( + dataset_name="ABC", + reward_funcs=["accuracy", "format", "reasoning_steps"], + ) + self.assertEqual(len(args.reward_weights), 3) + self.assertEqual(args.reward_weights, [1.0, 1.0, 1.0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 9e41bdb0d..7956fdbc0 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -2,6 +2,7 @@ from open_r1.rewards import ( accuracy_reward, + create_weighted_reward, format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, @@ -76,6 +77,35 @@ def test_multiple_completions(self): self.assertEqual(rewards[0], 1.0) self.assertEqual(rewards[1], 0.0) + def test_weighted_reward(self): + """Test create_weighted_reward with different weights.""" + # Test with weight = 2.0 + completion = [[{"content": "Some reasoningThe answer"}]] + base_reward_func = format_reward + weighted_reward_func = create_weighted_reward(base_reward_func, 2.0) + + base_rewards = base_reward_func(completion) + weighted_rewards = weighted_reward_func(completion) + + self.assertEqual(base_rewards[0], 1.0) + self.assertEqual(weighted_rewards[0], 2.0) + + # Test with weight = 0.5 + weighted_reward_func = create_weighted_reward(base_reward_func, 0.5) + weighted_rewards = weighted_reward_func(completion) + self.assertEqual(weighted_rewards[0], 0.5) + + # Test with multiple completions + completions = [ + [{"content": "Some reasoningThe answer"}], + [{"content": "Invalid format"}], + ] + weighted_reward_func = create_weighted_reward(base_reward_func, 2.0) + 
weighted_rewards = weighted_reward_func(completions) + + self.assertEqual(weighted_rewards[0], 2.0) + self.assertEqual(weighted_rewards[1], 0.0) + def test_cosine_scaled_reward(self): """Test cosine_scaled_reward with various cases.""" # Test parameters @@ -111,6 +141,17 @@ def test_format_reward_specific_multiline(self): rewards = format_reward(completion) self.assertEqual(rewards[0], 1.0) + def test_weighted_reward_preserves_name(self): + """Test that create_weighted_reward preserves the original function name. Important for logging.""" + base_reward_func = format_reward + weighted_reward_func = create_weighted_reward(base_reward_func, 2.0) + + self.assertEqual( + weighted_reward_func.__name__, + base_reward_func.__name__, + "Weighted reward function should preserve the original function name", + ) + def test_same_length_responses(self): """Test len_reward when all responses have the same length.""" completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]] From 90a6de94c7e1cefb4045416907d79375443f2779 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 13 Feb 2025 15:00:05 +0100 Subject: [PATCH 049/137] Revert "Weighted reward functions (#213)" (#317) This reverts commit fbea53267b9676fc89e92c9a24c83cb23e0884d0. --- .../grpo/config_demo.yaml | 6 --- .../grpo/config_demo.yaml | 6 --- .../grpo/config_simple_rl.yaml | 6 --- src/open_r1/grpo.py | 28 +---------- src/open_r1/rewards.py | 20 -------- tests/test_grpo.py | 47 ------------------- tests/test_rewards.py | 41 ---------------- 7 files changed, 2 insertions(+), 152 deletions(-) delete mode 100644 tests/test_grpo.py diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml index 2cedcb544..be1fa3f65 100644 --- a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml @@ -40,12 +40,6 @@ per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb -reward_funcs: - - accuracy - - format -reward_weights: - - 1.0 - - 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 2e35dd46e..817939390 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -42,12 +42,6 @@ per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb -reward_funcs: - - accuracy - - format -reward_weights: - - 1.0 - - 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml index 60af677ed..6df7730b9 100644 --- a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml +++ b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml @@ -42,12 +42,6 @@ per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb -reward_funcs: - - accuracy - - format -reward_weights: - - 1.0 - - 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 305374221..1970f8ef4 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -16,7 +16,6 @@ import os import sys from dataclasses import dataclass, field -from typing import Optional import datasets import torch @@ -28,7 +27,6 @@ from open_r1.configs import GRPOConfig from open_r1.rewards import ( accuracy_reward, - create_weighted_reward, format_reward, get_cosine_scaled_reward, 
get_repetition_penalty_reward, @@ -51,8 +49,6 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'. - reward_weights (`list[float]` or `None`, *optional*): - List of weights for each reward function. If not provided, defaults to 1.0 for each reward function. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. cosine_max_value_wrong (`float`): @@ -71,12 +67,6 @@ class GRPOScriptArguments(ScriptArguments): "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'" }, ) - reward_weights: Optional[list[float]] = field( - default=None, - metadata={ - "help": "List of weights for each reward function. If not provided, defaults to 1.0 for each function." - }, - ) cosine_min_value_wrong: float = field( default=0.0, metadata={"help": "Minimum reward for wrong answers"}, @@ -98,17 +88,6 @@ class GRPOScriptArguments(ScriptArguments): metadata={"help": "Maximum length for scaling"}, ) - def __post_init__(self): - # If no weights were provided, default to 1.0 for each reward function - if self.reward_weights is None: - self.reward_weights = [1.0] * len(self.reward_funcs) - # If weights were provided, validate the length - elif len(self.reward_weights) != len(self.reward_funcs): - raise ValueError( - f"Number of reward weights ({len(self.reward_weights)}: {self.reward_weights}) must match " - f"number of reward functions ({len(self.reward_funcs)}: {self.reward_funcs})" - ) - repetition_n_grams: int = field( default=3, metadata={"help": "Number of n-grams for repetition penalty reward"}, @@ -168,7 +147,7 @@ def main(script_args, training_args, model_args): # Load the dataset dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) - # Create weighted reward functions + # Get reward functions REWARD_FUNCS_REGISTRY = { "accuracy": accuracy_reward, "format": format_reward, @@ -186,10 +165,7 @@ def main(script_args, training_args, model_args): ), "length": len_reward, } - reward_funcs = [ - create_weighted_reward(REWARD_FUNCS_REGISTRY[func], weight) - for func, weight in zip(script_args.reward_funcs, script_args.reward_weights) - ] + reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] # Format into conversation def make_conversation(example): diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 7da8866fc..279627846 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -2,7 +2,6 @@ import math import re -from functools import wraps from typing import Dict from latex2sympy2_extended import NormalizationConfig @@ -272,22 +271,3 @@ def repetition_penalty_reward(completions, **kwargs) -> float: return rewards return repetition_penalty_reward - - -def create_weighted_reward(func, weight): - """Create a weighted version of a reward function. 
- - Args: - func: The reward function to weight - weight: The weight to apply to the reward - - Returns: - A new function that applies the weight to the reward - """ - - @wraps(func) - def weighted_reward(*args, **kwargs): - rewards = func(*args, **kwargs) - return [r * weight for r in rewards] - - return weighted_reward diff --git a/tests/test_grpo.py b/tests/test_grpo.py deleted file mode 100644 index 8cfb92904..000000000 --- a/tests/test_grpo.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest - -import torch - - -@unittest.skipUnless(torch.cuda.is_available(), "CUDA not available") -class TestGRPOScriptArguments(unittest.TestCase): - @classmethod - def setUpClass(cls): - from open_r1.grpo import GRPOScriptArguments - - cls.GRPOScriptArguments = GRPOScriptArguments - - def test_default_weights(self): - """Test that default weights are correctly set when not provided.""" - args = self.GRPOScriptArguments(dataset_name="ABC") - self.assertEqual(len(args.reward_funcs), len(args.reward_weights)) - self.assertEqual(args.reward_weights, [1.0] * len(args.reward_funcs)) - - def test_custom_weights_valid(self): - """Test that custom weights are accepted when matching reward_funcs length.""" - args = self.GRPOScriptArguments( - dataset_name="ABC", reward_funcs=["accuracy", "format", "reasoning_steps"], reward_weights=[0.5, 1.0, 2.0] - ) - self.assertEqual(args.reward_weights, [0.5, 1.0, 2.0]) - - def test_custom_weights_invalid(self): - """Test that mismatched weights raise ValueError.""" - with self.assertRaises(ValueError) as context: - self.GRPOScriptArguments( - dataset_name="ABC", reward_funcs=["accuracy", "format"], reward_weights=[1.0, 2.0, 3.0] - ) - self.assertIn("Number of reward weights", str(context.exception)) - self.assertIn("must match number of reward functions", str(context.exception)) - - def test_empty_weights_with_custom_funcs(self): - """Test that empty weights are filled with 1.0 for custom reward functions.""" - args = self.GRPOScriptArguments( - dataset_name="ABC", - reward_funcs=["accuracy", "format", "reasoning_steps"], - ) - self.assertEqual(len(args.reward_weights), 3) - self.assertEqual(args.reward_weights, [1.0, 1.0, 1.0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 7956fdbc0..9e41bdb0d 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -2,7 +2,6 @@ from open_r1.rewards import ( accuracy_reward, - create_weighted_reward, format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, @@ -77,35 +76,6 @@ def test_multiple_completions(self): self.assertEqual(rewards[0], 1.0) self.assertEqual(rewards[1], 0.0) - def test_weighted_reward(self): - """Test create_weighted_reward with different weights.""" - # Test with weight = 2.0 - completion = [[{"content": "Some reasoningThe answer"}]] - base_reward_func = format_reward - weighted_reward_func = create_weighted_reward(base_reward_func, 2.0) - - base_rewards = base_reward_func(completion) - weighted_rewards = weighted_reward_func(completion) - - self.assertEqual(base_rewards[0], 1.0) - self.assertEqual(weighted_rewards[0], 2.0) - - # Test with weight = 0.5 - weighted_reward_func = create_weighted_reward(base_reward_func, 0.5) - weighted_rewards = weighted_reward_func(completion) - self.assertEqual(weighted_rewards[0], 0.5) - - # Test with multiple completions - completions = [ - [{"content": "Some reasoningThe answer"}], - [{"content": "Invalid format"}], - ] - weighted_reward_func = create_weighted_reward(base_reward_func, 2.0) - 
weighted_rewards = weighted_reward_func(completions) - - self.assertEqual(weighted_rewards[0], 2.0) - self.assertEqual(weighted_rewards[1], 0.0) - def test_cosine_scaled_reward(self): """Test cosine_scaled_reward with various cases.""" # Test parameters @@ -141,17 +111,6 @@ def test_format_reward_specific_multiline(self): rewards = format_reward(completion) self.assertEqual(rewards[0], 1.0) - def test_weighted_reward_preserves_name(self): - """Test that create_weighted_reward preserves the original function name. Important for logging.""" - base_reward_func = format_reward - weighted_reward_func = create_weighted_reward(base_reward_func, 2.0) - - self.assertEqual( - weighted_reward_func.__name__, - base_reward_func.__name__, - "Weighted reward function should preserve the original function name", - ) - def test_same_length_responses(self): """Test len_reward when all responses have the same length.""" completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]] From 7041fbc9d65b6f1832db727961e8282243f8f82a Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Thu, 13 Feb 2025 15:04:03 +0100 Subject: [PATCH 050/137] Update setup.py (#315) adds peft as a temp dep due to https://github.com/huggingface/trl/issues/2849 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 6d7c74a74..231de49a8 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ "math-verify==0.5.2", # Used for math verification in grpo "packaging>=23.0", "parameterized>=0.9.0", + "peft>=0.14.0", "pytest", "ruff>=0.9.0", "safetensors>=0.3.3", From d5b67f4fe52215fb1d237588d845efb306760d26 Mon Sep 17 00:00:00 2001 From: Yen-Ting Lin <31605305+adamlin120@users.noreply.github.com> Date: Tue, 18 Feb 2025 15:52:45 +0800 Subject: [PATCH 051/137] Add SFT configuration for Mistral-Small-24B-Instruct-2501 model (#348) * Add SFT configuration for Mistral-Small-24B-Instruct-2501 model * Rename config_numina.yaml to config_openr1_math.yaml --------- Co-authored-by: lewtun --- .../sft/config_openr1_math.yaml | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml diff --git a/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml b/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml new file mode 100644 index 000000000..72ccdb688 --- /dev/null +++ b/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml @@ -0,0 +1,44 @@ +# To start the training, run the following command: +# sbatch -N 4 --job-name=mistral_sft slurm/train.slurm Mistral-Small-24B-Instruct-2501 sft numina zero3 + +model_name_or_path: mistralai/Mistral-Small-24B-Instruct-2501 +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +# dataset_name: yentinglin/s1K-1.1-trl-format +dataset_name: yentinglin/OpenR1-Math-220k-trl-format +dataset_configs: +- all +preprocessing_num_workers: 8 + +# SFT trainer config +bf16: true +do_eval: true +eval_strategy: no +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Mistral-Small-24B-Instruct-2501-Open-R1-Distill +hub_strategy: every_save +learning_rate: 2.0e-05 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine +packing: true +max_seq_length: 32768 +max_steps: -1 +num_train_epochs: 5 +output_dir: data/Mistral-Small-24B-Instruct-2501-Open-R1-Distill 
+overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +save_strategy: epoch +seed: 42 +warmup_ratio: 0.1 From 698530484c34ce704d70ce97df12f790def7148f Mon Sep 17 00:00:00 2001 From: Almaz Zinollayev <39913951+zeenolife@users.noreply.github.com> Date: Tue, 18 Feb 2025 12:10:03 +0000 Subject: [PATCH 052/137] Adding grpo reward args into yaml files (#337) --- recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml | 6 ++++++ recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml | 6 ++++++ recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml index be1fa3f65..f853a3416 100644 --- a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml @@ -40,6 +40,12 @@ per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 817939390..458377539 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -42,6 +42,12 @@ per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml index 6df7730b9..15c8364ae 100644 --- a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml +++ b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml @@ -42,6 +42,12 @@ per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 1.0 save_strategy: "no" seed: 42 warmup_ratio: 0.1 From 78c197df513e718e4077b56101068a4efedf7a1b Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 18 Feb 2025 14:46:43 +0100 Subject: [PATCH 053/137] Enable chat template and system prompt to be configured during training (#349) * Enable chat template to be configured * Add notes to README * Handle None * Remove default system prompt * Fix ST * Tune hparams * Fix * Tune * Fix --- README.md | 21 ++++--- .../grpo/config_demo.yaml | 58 +++++++++++++++++++ .../grpo/config_demo.yaml | 51 ---------------- .../grpo/config_demo.yaml | 24 ++++---- .../sft/config_demo.yaml | 32 +++++----- .../grpo/config_simple_rl.yaml | 5 +- src/open_r1/configs.py | 5 +- src/open_r1/grpo.py | 34 +++++------ src/open_r1/rewards.py | 2 +- src/open_r1/sft.py | 12 +--- src/open_r1/utils/__init__.py | 4 ++ src/open_r1/utils/model_utils.py | 26 +++++++++ 12 files changed, 158 insertions(+), 116 deletions(-) create mode 100644 recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml delete mode 100644 recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml create mode 100644 src/open_r1/utils/model_utils.py diff --git a/README.md b/README.md index 07d2ddd36..a14f180f6 100644 --- a/README.md +++ b/README.md @@ -87,19 +87,18 @@ sudo apt-get install git-lfs ## Training models -We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). 
For example, to run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k), run: +We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). For example, to run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k), run: ```shell # Train via command line accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ --model_name_or_path Qwen/Qwen2.5-1.5B-Instruct \ - --dataset_name HuggingFaceH4/Bespoke-Stratos-17k \ - --learning_rate 2.0e-5 \ + --dataset_name open-r1/OpenR1-Math-220k \ + --learning_rate 1.0e-5 \ --num_train_epochs 1 \ --packing \ - --max_seq_length 4096 \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 8 \ + --max_seq_length 16384 \ + --per_device_train_batch_size 16 \ --gradient_checkpointing \ --bf16 \ --output_dir data/Qwen2.5-1.5B-Open-R1-Distill @@ -139,7 +138,7 @@ accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r ### SFT -To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [Bespoke-Stratos-17k](https://huggingface.co/datasets/bespokelabs/Bespoke-Stratos-17k), run: +To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k), run: ```shell ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ @@ -149,14 +148,18 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con ### GRPO -To train via the GRPO trainer, we use one GPU to run vLLM for faster generation and the remaining GPUs for training. For example, one a node with 8 GPUs, use the `recipes/accelerate_configs/zero2.yaml` config and then overwrite `num_processes` to run on 7 devices: +To train via the GRPO trainer, we use one GPU to run vLLM for faster generation and the remaining GPUs for training. For example, one a node with 8 GPUs, set `--num_processes` to override the default value in the `accelerate` configs: ```shell ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ --num_processes=7 src/open_r1/grpo.py \ - --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml + --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml ``` +> [!WARNING] +> The chat template used in the distilled DeepSeek models omits the contents of the reasoning block within the `` and `` tags. It also prefills the assistant response with `` which interferes with the format reward function. To handle that, it is important to override the chat template as done in e.g. [recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml](./recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml). + + We provide a minimal reproducible experiment using GRPO for mathematical reasoning, referencing the approach from [SimpleRL-Reason](https://hkust-nlp.notion.site/simplerl-reason) which uses a 7B model trained on 8K examples. 
Running this on 8 H100 80G GPU takes about 3 hours: ```shell diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml new file mode 100644 index 000000000..102966905 --- /dev/null +++ b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml @@ -0,0 +1,58 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +# We edit the DeepSeek chat template to ensure (a) the reasoning block within and is included in the completion and (b) the tag is not part of the prefill so that the format reward works +chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" +dataset_name: open-r1/OpenR1-Math-220k +dataset_configs: +- default +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +do_eval: false +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: DeepSeek-R1-Distill-Qwen-1.5B-GRPO +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_prompt_length: 512 +max_completion_length: 2048 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/DeepSeek-R1-Distill-Qwen-1.5B-GRPO +overwrite_output_dir: true +per_device_eval_batch_size: 16 +per_device_train_batch_size: 16 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 1.0 +save_strategy: "epoch" +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 diff --git a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml deleted file mode 100644 index f853a3416..000000000 --- a/recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_demo.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Model arguments -model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: AI-MO/NuminaMath-TIR -dataset_configs: -- all - -# GRPO trainer config -bf16: true -use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 -do_eval: true -eval_strategy: steps -eval_steps: 100 -gradient_accumulation_steps: 16 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: DeepSeek-R1-Distill-Qwen-7B-GRPO -hub_strategy: every_save -learning_rate: 2.0e-05 -log_level: info -logging_steps: 5 -logging_strategy: steps -lr_scheduler_type: cosine -max_prompt_length: 512 -max_completion_length: 1024 -max_steps: -1 -num_generations: 7 -num_train_epochs: 1 -output_dir: data/DeepSeek-R1-Distill-Qwen-7B-GRPO -overwrite_output_dir: true -per_device_eval_batch_size: 32 -per_device_train_batch_size: 16 -push_to_hub: true -report_to: -- wandb -reward_funcs: -- accuracy -- format -reward_weights: -- 1.0 -- 1.0 -save_strategy: "no" -seed: 42 -warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 458377539..f38add6ac 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -5,39 +5,38 @@ torch_dtype: bfloat16 attn_implementation: flash_attention_2 # Data training arguments -dataset_name: AI-MO/NuminaMath-TIR +dataset_name: open-r1/OpenR1-Math-220k dataset_configs: -- all -# Num processes is less by 1 as vLLM is using 1 GPU -num_processes: 7 +- default +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config bf16: true use_vllm: true vllm_device: auto vllm_gpu_memory_utilization: 0.7 -do_eval: true -eval_strategy: steps -eval_steps: 100 -gradient_accumulation_steps: 16 +do_eval: false +gradient_accumulation_steps: 4 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false hub_model_id: Qwen2.5-1.5B-Open-R1-GRPO hub_strategy: every_save learning_rate: 2.0e-05 +log_completions: true log_level: info -logging_steps: 5 +logging_first_step: true +logging_steps: 1 logging_strategy: steps lr_scheduler_type: cosine max_prompt_length: 512 max_completion_length: 1024 max_steps: -1 -num_generations: 7 +num_generations: 16 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-GRPO overwrite_output_dir: true -per_device_eval_batch_size: 32 +per_device_eval_batch_size: 16 per_device_train_batch_size: 16 push_to_hub: true report_to: @@ -48,6 +47,7 @@ reward_funcs: reward_weights: - 1.0 - 1.0 -save_strategy: "no" +save_strategy: "epoch" +save_total_limit: 1 seed: 42 warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index c7dd25bbd..30201bff7 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -5,38 +5,42 @@ torch_dtype: bfloat16 attn_implementation: flash_attention_2 # Data training arguments -dataset_name: HuggingFaceH4/Bespoke-Stratos-17k +dataset_name: open-r1/OpenR1-Math-220k dataset_configs: -- all -preprocessing_num_workers: 8 +- default +dataset_num_proc: 48 # SFT trainer config bf16: true -do_eval: true -eval_strategy: steps -eval_steps: 100 -gradient_accumulation_steps: 8 +do_eval: false +eval_strategy: 'no' +gradient_accumulation_steps: 1 gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false hub_model_id: Qwen2.5-1.5B-Open-R1-Distill hub_strategy: every_save -learning_rate: 2.0e-05 +learning_rate: 5.0e-05 log_level: info logging_steps: 5 logging_strategy: steps -lr_scheduler_type: cosine +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 packing: true -max_seq_length: 4096 +max_seq_length: 16384 max_steps: -1 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Distill overwrite_output_dir: true -per_device_eval_batch_size: 4 -per_device_train_batch_size: 2 +per_device_eval_batch_size: 16 +per_device_train_batch_size: 16 push_to_hub: true report_to: - wandb -save_strategy: "no" +save_strategy: "steps" +save_steps: 100 +save_total_limit: 1 seed: 42 -warmup_ratio: 0.1 \ No newline at end of file +use_liger: true +warmup_ratio: 0.05 \ No newline at end of file diff --git a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml index 15c8364ae..762f77164 100644 --- a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml +++ b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml @@ -8,8 +8,7 @@ attn_implementation: flash_attention_2 dataset_name: DigitalLearningGmbH/MATH-lighteval dataset_configs: - train -# Num processes is less by 1 as vLLM is using 1 GPU -num_processes: 7 +system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
# GRPO trainer config bf16: true @@ -26,7 +25,9 @@ gradient_checkpointing_kwargs: hub_model_id: Qwen-2.5-7B-Simple-RL hub_strategy: every_save learning_rate: 3.0e-06 +log_completions: true log_level: info +logging_first_step: true logging_steps: 5 logging_strategy: steps lr_scheduler_type: cosine diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 3a6f68665..98cd0d108 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -32,8 +32,10 @@ class GRPOConfig(trl.GRPOConfig): callbacks: list[str] = field( default_factory=lambda: [], metadata={"help": "The callbacks to run during training."} ) + chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."}) system_prompt: Optional[str] = field( - default=None, metadata={"help": "The optional system prompt to use for benchmarking."} + default=None, + metadata={"help": "The optional system prompt to use."}, ) hub_model_revision: Optional[str] = field( default="main", metadata={"help": "The Hub model branch to push the model to."} @@ -62,6 +64,7 @@ class SFTConfig(trl.SFTConfig): callbacks: list[str] = field( default_factory=lambda: [], metadata={"help": "The callbacks to run during training."} ) + chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."}) system_prompt: Optional[str] = field( default=None, metadata={"help": "The optional system prompt to use for benchmarking."}, diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 1970f8ef4..7032346b2 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -33,6 +33,7 @@ len_reward, reasoning_steps_reward, ) +from open_r1.utils import get_tokenizer from open_r1.utils.callbacks import get_callbacks from open_r1.utils.wandb_logging import init_wandb_training from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config @@ -48,7 +49,7 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'. + List of reward functions. Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. cosine_max_value_wrong (`float`): @@ -64,7 +65,7 @@ class GRPOScriptArguments(ScriptArguments): reward_funcs: list[str] = field( default_factory=lambda: ["accuracy", "format"], metadata={ - "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'" + "help": "List of reward functions. Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'" }, ) cosine_min_value_wrong: float = field( @@ -87,7 +88,6 @@ class GRPOScriptArguments(ScriptArguments): default=1000, metadata={"help": "Maximum length for scaling"}, ) - repetition_n_grams: int = field( default=3, metadata={"help": "Number of n-grams for repetition penalty reward"}, @@ -98,14 +98,6 @@ class GRPOScriptArguments(ScriptArguments): ) -SYSTEM_PROMPT = ( - "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant " - "first thinks about the reasoning process in the mind and then provides the user with the answer. 
The reasoning " - "process and answer are enclosed within and tags, respectively, i.e., " - " reasoning process here answer here " -) - - def main(script_args, training_args, model_args): # Set seed for reproducibility set_seed(training_args.seed) @@ -147,6 +139,11 @@ def main(script_args, training_args, model_args): # Load the dataset dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) + ################ + # Load tokenizer + ################ + tokenizer = get_tokenizer(model_args, training_args) + # Get reward functions REWARD_FUNCS_REGISTRY = { "accuracy": accuracy_reward, @@ -169,14 +166,16 @@ def main(script_args, training_args, model_args): # Format into conversation def make_conversation(example): - return { - "prompt": [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": example["problem"]}, - ], - } + prompt = [] + + if training_args.system_prompt is not None: + prompt.append({"role": "system", "content": training_args.system_prompt}) + + prompt.append({"role": "user", "content": example["problem"]}) + return {"prompt": prompt} dataset = dataset.map(make_conversation) + for split in dataset: if "messages" in dataset[split].column_names: dataset[split] = dataset[split].remove_columns("messages") @@ -205,6 +204,7 @@ def make_conversation(example): eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None, peft_config=get_peft_config(model_args), callbacks=get_callbacks(training_args, model_args), + processing_class=tokenizer, ) ############### diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 279627846..3deea1933 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -51,7 +51,7 @@ def accuracy_reward(completions, solution, **kwargs): def format_reward(completions, **kwargs): - """Reward function that checks if the completion has a specific format.""" + """Reward function that checks if the reasoning process is enclosed within and tags, while the final answer is enclosed within and tags.""" pattern = r"^.*?\s*.*?$" completion_contents = [completion[0]["content"] for completion in completions] matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents] diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index b6031d813..82964574a 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -43,10 +43,11 @@ import torch import transformers from datasets import load_dataset -from transformers import AutoTokenizer, set_seed +from transformers import set_seed from transformers.trainer_utils import get_last_checkpoint from open_r1.configs import SFTConfig +from open_r1.utils import get_tokenizer from open_r1.utils.callbacks import get_callbacks from open_r1.utils.wandb_logging import init_wandb_training from trl import ( @@ -82,11 +83,6 @@ def main(script_args, training_args, model_args): transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() - # Log on each process a small summary - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) logger.info(f"Model parameters {model_args}") logger.info(f"Script parameters {script_args}") logger.info(f"Training parameters {training_args}") @@ -109,9 +105,7 @@ def main(script_args, training_args, model_args): ################ # Load tokenizer 
################ - tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, use_fast=True - ) + tokenizer = get_tokenizer(model_args, training_args) tokenizer.pad_token = tokenizer.eos_token ################### diff --git a/src/open_r1/utils/__init__.py b/src/open_r1/utils/__init__.py index e69de29bb..b1de213d5 100644 --- a/src/open_r1/utils/__init__.py +++ b/src/open_r1/utils/__init__.py @@ -0,0 +1,4 @@ +from .model_utils import get_tokenizer + + +__all__ = ["get_tokenizer"] diff --git a/src/open_r1/utils/model_utils.py b/src/open_r1/utils/model_utils.py new file mode 100644 index 000000000..1312ed66d --- /dev/null +++ b/src/open_r1/utils/model_utils.py @@ -0,0 +1,26 @@ +from transformers import AutoTokenizer, PreTrainedTokenizer + +from trl import ModelConfig + +from ..configs import GRPOConfig, SFTConfig + + +DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + + +def get_tokenizer( + model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True +) -> PreTrainedTokenizer: + """Get the tokenizer for the model.""" + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + ) + + if training_args.chat_template is not None: + tokenizer.chat_template = training_args.chat_template + elif auto_set_chat_template and tokenizer.get_chat_template() is None: + tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE + + return tokenizer From 9cf270d605390775166cd55c76a14ffe6e7d7960 Mon Sep 17 00:00:00 2001 From: Yen-Ting Lin <31605305+adamlin120@users.noreply.github.com> Date: Tue, 18 Feb 2025 21:59:38 +0800 Subject: [PATCH 054/137] Update AIME25 task configuration and registration (#344) --- src/open_r1/evaluate.py | 9 ++++----- src/open_r1/utils/evaluation.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/open_r1/evaluate.py b/src/open_r1/evaluate.py index bd781b956..5d38b8608 100644 --- a/src/open_r1/evaluate.py +++ b/src/open_r1/evaluate.py @@ -106,12 +106,11 @@ def gpqa_prompt_fn(line, task_name: str = None): metric=[expr_gold_metric], version=1, ) -# Part I from AIME 2025 exam: https://artofproblemsolving.com/wiki/index.php/2025_AIME_I?srsltid=AfmBOoof5gaaqlt3-l6LH7Tt6qmJZtl_2PQEDYlLFlMqhq9dLL8FMCRR -aime25_part1 = LightevalTaskConfig( - name="aime25:part1", +aime25 = LightevalTaskConfig( + name="aime25", suite=["custom"], prompt_function=aime_prompt_fn, - hf_repo="open-r1/aime_2025_1", + hf_repo="yentinglin/aime_2025", hf_subset="default", hf_avail_splits=["train"], evaluation_splits=["train"], @@ -156,7 +155,7 @@ def gpqa_prompt_fn(line, task_name: str = None): # Add tasks to the table TASKS_TABLE = [] TASKS_TABLE.append(aime24) -TASKS_TABLE.append(aime25_part1) +TASKS_TABLE.append(aime25) TASKS_TABLE.append(math_500) TASKS_TABLE.append(gpqa_diamond) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 3be5daab0..489971830 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -48,7 +48,7 @@ def 
register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "custom", "math_500", "math_500", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0) -register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime25_part1", "aime25:part1", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime25", "aime25", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "gpqa", "gpqa:diamond", 0) From 740a7a4305cfc63838494f881bb774d687d6518a Mon Sep 17 00:00:00 2001 From: Agus Date: Wed, 19 Feb 2025 08:32:33 +0100 Subject: [PATCH 055/137] Add LiveCodeBench's codegeneration task from lighteval (#346) * Add lcb:codegeneration task from ligtheval * Add results from R1 Qwen 32B --- README.md | 30 ++++++++++++++++++++++++++++++ src/open_r1/utils/evaluation.py | 1 + 2 files changed, 31 insertions(+) diff --git a/README.md b/README.md index a14f180f6..76e5814d0 100644 --- a/README.md +++ b/README.md @@ -344,6 +344,36 @@ lighteval vllm $MODEL_ARGS "custom|gpqa:diamond|0|0" \ python scripts/run_benchmarks.py --model-id={model_id} --benchmarks gpqa ``` +### LiveCodeBench + +We are able to reproduce Deepseek's reported results on the LiveCodeBench code generation benchmark within ~1-3 standard deviations: + +| Model | LiveCodeBench (🤗 LightEval) | GPQA Diamond (DeepSeek Reported) | +|:------------------------------|:---------------------------:|:--------------------------------:| +| DeepSeek-R1-Distill-Qwen-1.5B | 16.3 | 16.9 | +| DeepSeek-R1-Distill-Qwen-7B | 36.6 | 37.6 | +| DeepSeek-R1-Distill-Qwen-14B | 51.5 | 53.1 | +| DeepSeek-R1-Distill-Qwen-32B | 56.6 | 57.2 | +| DeepSeek-R1-Distill-Llama-8B | 37.0 | 39.6 | +| DeepSeek-R1-Distill-Llama-70B | 54.5 | 57.5 | + +To reproduce these results use the following command: + +```shell +NUM_GPUS=1 # Set to 8 for 32B and 70B models, or data_parallel_size=8 with the smaller models for speed +MODEL=deepseek-ai/{model_name} +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS,generation_parameters={temperature:0.6,top_p:0.95}" +OUTPUT_DIR=data/evals/$MODEL + +lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \ + --use-chat-template \ + --output-dir $OUTPUT_DIR +``` + +```shell +python scripts/run_benchmarks.py --model-id={model_id} --benchmarks lcb +``` + ## Data generation ### Generate data from a smol distilled R1 model diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 489971830..dff5c9858 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -50,6 +50,7 @@ def register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime25", "aime25", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "gpqa", "gpqa:diamond", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "extended", "lcb", "lcb:codegeneration", 0) def get_lighteval_tasks(): From d76ecc12a23928b7a4355f19f508a44d6a624033 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 19 Feb 2025 11:26:46 +0100 Subject: [PATCH 056/137] Add E2B code interpreter reward function (#364) * Add stuff * Make it kind of work * Add more stuff * Add fix for parse * Fix * Refactor * Clean up * Fix config * Fix sys * Add SFT config * Use min rate * Fix eval * Add base model * Add s1k * Disable eval * Fix * Add import checker * Fix importer * Fix * Tune config * Tune * Fix * Fix save * Tuen beta * Remove configs * Fix vLLM * Fix * Add note * Add doc * doc * Fix * Tune lr * Add 
command --- README.md | 37 ++++++++ .../grpo/config_demo_code.yaml | 57 ++++++++++++ setup.py | 3 + src/open_r1/grpo.py | 2 + src/open_r1/rewards.py | 87 +++++++++++++++++++ src/open_r1/utils/__init__.py | 3 +- src/open_r1/utils/import_utils.py | 23 +++++ 7 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml create mode 100644 src/open_r1/utils/import_utils.py diff --git a/README.md b/README.md index 76e5814d0..7b8ff5a75 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,43 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con Our final [model](https://huggingface.co/Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on MATH-500, demonstrating a 17%+ improvement over the base model. +#### 👨‍💻 Training with a code interpreter + +We provide a `code` reward function for executing code generated by the policy during training. Currently, this reward function targets code contests like [Codeforces](https://codeforces.com), where solutions are executed against a set of test cases and the overall success rate is returned as the final reward. To ensure safe execution, we use [E2B](https://e2b.dev) sandboxes, which are fast and cheap to run. To use this reward function, first install the necessary dependencies: + +```shell +uv pip install -e '.[code]' +``` + +Then create a `.env` file and place an API token from E2B within it: + +``` +E2B_API_KEY="e2b_xxx" +``` + +Then make sure your dataset contains a `verification_info` column with the following schema (adopted from PrimeIntellect's excellent [datasets](https://huggingface.co/collections/PrimeIntellect/synthetic-1-67a2c399cfdd6c9f7fae0c37) of verifiable problems): + +```python +{ + "language": "python", + "test_cases": [ + { + "input": "4\n4\n0001\n1000\n0011\n0111\n3\n010\n101\n0\n2\n00000\n00001\n4\n01\n001\n0001\n00001\n", + "output": "1\n3 \n-1\n0\n\n2\n1 2 \n", + "type": "stdin_stdout", + } + ], +} +``` + +For example, to train a smol model on Python problems, run: + +```shell +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ + --num_processes=7 src/open_r1/grpo.py \ + --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +``` + ### Launching jobs on a Slurm cluster If you have access to a Slurm cluster, we provide a `slurm/train.slurm` script that will automatically queue training jobs for you. Here's how you can use it: diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml new file mode 100644 index 000000000..783a4d2a1 --- /dev/null +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -0,0 +1,57 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/verifiable-coding-problems-python-10k +dataset_configs: +- default +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.01 +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.9 +do_eval: false +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_strategy: every_save +learning_rate: 5.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_prompt_length: 1024 +max_completion_length: 2048 +max_steps: 500 +num_generations: 14 +num_train_epochs: 1 +output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO +overwrite_output_dir: true +per_device_train_batch_size: 16 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- code +- format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 50 +save_total_limit: 1 +seed: 42 +temperature: 1.0 +warmup_ratio: 0.03 \ No newline at end of file diff --git a/setup.py b/setup.py index 231de49a8..907269c20 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ "datasets>=3.2.0", "deepspeed==0.15.4", "distilabel[vllm,ray,openai]>=1.5.2", + "e2b-code-interpreter>=1.0.5", "einops>=0.8.0", "flake8>=6.0.0", "flash_attn>=2.7.4.post1", @@ -60,6 +61,7 @@ "parameterized>=0.9.0", "peft>=0.14.0", "pytest", + "python-dotenv", "ruff>=0.9.0", "safetensors>=0.3.3", "sentencepiece>=0.1.99", @@ -88,6 +90,7 @@ def deps_list(*pkgs): extras["torch"] = deps_list("torch") extras["quality"] = deps_list("ruff", "isort", "flake8") extras["train"] = deps_list("flash_attn") +extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv") extras["eval"] = deps_list("lighteval", "math-verify") extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["train"] diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 7032346b2..2ead27da6 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -27,6 +27,7 @@ from open_r1.configs import GRPOConfig from open_r1.rewards import ( accuracy_reward, + code_reward, format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, @@ -161,6 +162,7 @@ def main(script_args, training_args, model_args): max_penalty=script_args.repetition_max_penalty, ), "length": len_reward, + "code": code_reward, } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 3deea1933..c003f4d32 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -1,5 +1,6 @@ """Reward functions for GRPO training.""" +import json import math import re from typing import Dict @@ -7,6 +8,15 @@ from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify +from .utils import is_e2b_available + + +if is_e2b_available(): + from dotenv import load_dotenv + from e2b_code_interpreter import Sandbox + + load_dotenv() + def accuracy_reward(completions, solution, **kwargs): """Reward function that checks if the completion is the same as the ground truth.""" @@ -271,3 +281,80 @@ def repetition_penalty_reward(completions, **kwargs) -> float: return rewards return repetition_penalty_reward + + +def extract_code(completion: str) -> str: + pattern = re.compile(r"```python\n(.*?)```", re.DOTALL) + matches = pattern.findall(completion) + extracted_answer = matches[-1] if len(matches) >= 1 else "" + return extracted_answer + + +def 
code_reward(completions, **kwargs) -> list[float]: + """Reward function that evaluates code snippets using the E2B code interpreter. + + Assumes the dataset contains a `verification_info` column with test cases. + """ + if not is_e2b_available(): + raise ImportError( + "E2B is not available and required for this reward function. Please install E2B with " + "`pip install e2b-code-interpreter` and add an API key to a `.env` file." + ) + + rewards = [] + # TODO: add support for other languages in E2B: https://e2b.dev/docs/code-interpreting/supported-languages + try: + """Returns a reward function that evaluates code snippets in a sandbox.""" + evaluation_script_template = """ + import subprocess + import json + + def evaluate_code(code, test_cases): + passed = 0 + total = len(test_cases) + exec_timeout = 5 + + for case in test_cases: + process = subprocess.run( + ["python3", "-c", code], + input=case["input"], + text=True, + capture_output=True, + timeout=exec_timeout + ) + + if process.returncode != 0: # Error in execution + continue + + output = process.stdout.strip() + if output.strip() == case["output"].strip(): + passed += 1 + + success_rate = (passed / total) + return success_rate + + code_snippet = {code} + test_cases = json.loads({test_cases}) + + evaluate_code(code_snippet, test_cases) + """ + code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] + verification_info = kwargs["verification_info"] + scripts = [ + evaluation_script_template.format( + code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"])) + ) + for code, info in zip(code_snippets, verification_info) + ] + with Sandbox(timeout=30, request_timeout=3) as sbx: + for script in scripts: + execution = sbx.run_code(script, language=verification_info["language"]) + try: + output = float(execution.text) + except (TypeError, ValueError): + output = 0.0 + rewards.append(output) + except Exception as e: + print(f"Error from E2B executor: {e}") + rewards = [0.0] * len(completions) + return rewards diff --git a/src/open_r1/utils/__init__.py b/src/open_r1/utils/__init__.py index b1de213d5..5302463e1 100644 --- a/src/open_r1/utils/__init__.py +++ b/src/open_r1/utils/__init__.py @@ -1,4 +1,5 @@ +from .import_utils import is_e2b_available from .model_utils import get_tokenizer -__all__ = ["get_tokenizer"] +__all__ = ["get_tokenizer", "is_e2b_available"] diff --git a/src/open_r1/utils/import_utils.py b/src/open_r1/utils/import_utils.py new file mode 100644 index 000000000..8893264ae --- /dev/null +++ b/src/open_r1/utils/import_utils.py @@ -0,0 +1,23 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from transformers.utils.import_utils import _is_package_available + + +# Use same as transformers.utils.import_utils +_e2b_available = _is_package_available("e2b") + + +def is_e2b_available() -> bool: + return _e2b_available From 45a32eecc2854924ec644e2e31ae031ea05722d0 Mon Sep 17 00:00:00 2001 From: Jingze Shi <3314685395@qq.com> Date: Thu, 20 Feb 2025 22:50:50 +0800 Subject: [PATCH 057/137] Fix len reward (#385) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Rename solutions to solution for `len_reward` * Fix docstring for len_reward * Update src/open_r1/rewards.py Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> --------- Co-authored-by: Kashif Rasul Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> --- src/open_r1/rewards.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index c003f4d32..2960cdee7 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -85,14 +85,14 @@ def reasoning_steps_reward(completions, **kwargs): return [min(1.0, count / 3) for count in matches] -def len_reward(completions: list[Dict[str, str]], solutions: list[str], **kwargs) -> float: +def len_reward(completions: list[Dict[str, str]], solution: list[str], **kwargs) -> float: """Compute length-based rewards to discourage overthinking and promote token efficiency. Taken from from the Kimi 1.5 tech report: https://arxiv.org/abs/2501.12599 Args: completions: List of model completions - solutions: List of ground truth solutions + solution: List of ground truth solutions Returns: List of rewards where: @@ -103,7 +103,7 @@ def len_reward(completions: list[Dict[str, str]], solutions: list[str], **kwargs # First check correctness of answers correctness = [] - for content, sol in zip(contents, solutions): + for content, sol in zip(contents, solution): gold_parsed = parse( sol, extraction_mode="first_match", From 9fb45bede65dd6b7349892279ec069205bb70473 Mon Sep 17 00:00:00 2001 From: lewtun Date: Fri, 21 Feb 2025 14:52:45 +0100 Subject: [PATCH 058/137] Fix LightEval commands and dependencies (#386) * Fix lighteval cmd * Fix typo * Pin lighteval * Hacks to the max * Fix slurm * Fix * Pin lighteval * Pin l --------- Co-authored-by: lewis@huggingface.co --- Makefile | 2 +- README.md | 44 ++++++++++++++++++++++++-------------------- setup.py | 12 +++++++----- slurm/evaluate.slurm | 15 +++++++++++---- 4 files changed, 43 insertions(+), 30 deletions(-) diff --git a/Makefile b/Makefile index 7f28821b5..cdeba5511 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ evaluate: fi \ ),)) $(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \ - MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilisation=0.8" && \ + MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8" && \ lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \ --custom-tasks src/open_r1/evaluate.py \ --use-chat-template \ diff --git a/README.md b/README.md index 7b8ff5a75..41efba053 100644 --- a/README.md +++ b/README.md @@ -51,19 +51,23 @@ To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/ge ```shell -uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip --link-mode=copy +uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip ``` -Next, 
install vLLM: +> [!TIP] +> For Hugging Face cluster users, add `export UV_LINK_MODE=copy` to your `.bashrc` to suppress cache warnings from `uv` + +Next, install vLLM and FlashAttention: ```shell -uv pip install vllm==0.7.2 --link-mode=copy +uv pip install vllm==0.7.2 +uv pip install setuptools && uv pip install flash-attn --no-build-isolation ``` This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend: ```shell -GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" --link-mode=copy +GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" ``` Next, log into your Hugging Face and Weights and Biases accounts as follows: @@ -233,7 +237,7 @@ We use `lighteval` to evaluate models, with custom tasks defined in `src/open_r1 ```shell MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" OUTPUT_DIR=data/evals/$MODEL # AIME 2024 @@ -266,7 +270,7 @@ To increase throughput across multiple GPUs, use _data parallel_ as follows: ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL @@ -281,7 +285,7 @@ For large models which require sharding across GPUs, use _tensor parallel_ and r ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL @@ -335,7 +339,7 @@ To reproduce these results use the following command: ```shell NUM_GPUS=1 # Set to 8 for 32B and 70B models MODEL=deepseek-ai/{model_name} -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.0}" OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "custom|math_500|0|0" \ @@ -347,7 +351,7 @@ lighteval vllm $MODEL_ARGS "custom|math_500|0|0" \ Alternatively, you can launch Slurm jobs as follows: ```shell -python scripts/run_benchmarks.py --model-id={model_id} --benchmarks math_500 +python scripts/run_benchmarks.py --model-id {model_id} --benchmarks math_500 ``` ### GPQA Diamond @@ -368,7 +372,7 @@ To reproduce these results use the following command: ```shell NUM_GPUS=1 # Set to 8 for 32B and 70B models MODEL=deepseek-ai/{model_name} -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS" 
+MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.0}" OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "custom|gpqa:diamond|0|0" \ @@ -378,7 +382,7 @@ lighteval vllm $MODEL_ARGS "custom|gpqa:diamond|0|0" \ ``` ```shell -python scripts/run_benchmarks.py --model-id={model_id} --benchmarks gpqa +python scripts/run_benchmarks.py --model-id {model_id} --benchmarks gpqa ``` ### LiveCodeBench @@ -386,20 +390,20 @@ python scripts/run_benchmarks.py --model-id={model_id} --benchmarks gpqa We are able to reproduce Deepseek's reported results on the LiveCodeBench code generation benchmark within ~1-3 standard deviations: | Model | LiveCodeBench (🤗 LightEval) | GPQA Diamond (DeepSeek Reported) | -|:------------------------------|:---------------------------:|:--------------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 16.3 | 16.9 | -| DeepSeek-R1-Distill-Qwen-7B | 36.6 | 37.6 | -| DeepSeek-R1-Distill-Qwen-14B | 51.5 | 53.1 | -| DeepSeek-R1-Distill-Qwen-32B | 56.6 | 57.2 | -| DeepSeek-R1-Distill-Llama-8B | 37.0 | 39.6 | -| DeepSeek-R1-Distill-Llama-70B | 54.5 | 57.5 | +|:------------------------------|:----------------------------:|:--------------------------------:| +| DeepSeek-R1-Distill-Qwen-1.5B | 16.3 | 16.9 | +| DeepSeek-R1-Distill-Qwen-7B | 36.6 | 37.6 | +| DeepSeek-R1-Distill-Qwen-14B | 51.5 | 53.1 | +| DeepSeek-R1-Distill-Qwen-32B | 56.6 | 57.2 | +| DeepSeek-R1-Distill-Llama-8B | 37.0 | 39.6 | +| DeepSeek-R1-Distill-Llama-70B | 54.5 | 57.5 | To reproduce these results use the following command: ```shell NUM_GPUS=1 # Set to 8 for 32B and 70B models, or data_parallel_size=8 with the smaller models for speed MODEL=deepseek-ai/{model_name} -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilisation=0.8,tensor_parallel_size=$NUM_GPUS,generation_parameters={temperature:0.6,top_p:0.95}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \ @@ -408,7 +412,7 @@ lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \ ``` ```shell -python scripts/run_benchmarks.py --model-id={model_id} --benchmarks lcb +python scripts/run_benchmarks.py --model-id {model_id} --benchmarks lcb ``` ## Data generation diff --git a/setup.py b/setup.py index 907269c20..8cbb27832 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ # IMPORTANT: all dependencies should be listed here with their version requirements, if any. -# * If a dependency is fast-moving (e.g. transformers), pin to the exact version +# * If a dependency is fast-moving (e.g. 
trl), pin to the exact version _deps = [ "accelerate>=1.2.1", "bitsandbytes>=0.43.0", @@ -53,9 +53,10 @@ "hf_transfer>=0.1.4", "huggingface-hub[cli]>=0.19.2,<1.0", "isort>=5.12.0", + "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", "liger_kernel==0.5.2", - "lighteval @ git+https://github.com/huggingface/lighteval.git@86f62259f105ae164f655e0b91c92a823a742724#egg=lighteval[math]", + "lighteval @ git+https://github.com/huggingface/lighteval.git@ebb7377b39a48ab0691e6fbd9dea57e9fe290a7e", "math-verify==0.5.2", # Used for math verification in grpo "packaging>=23.0", "parameterized>=0.9.0", @@ -68,7 +69,7 @@ "torch==2.5.1", "transformers @ git+https://github.com/huggingface/transformers.git@main", "trl @ git+https://github.com/huggingface/trl.git@main", - "vllm==0.7.1", + "vllm==0.7.2", "wandb>=0.19.1", ] @@ -89,10 +90,9 @@ def deps_list(*pkgs): extras["tests"] = deps_list("pytest", "parameterized", "math-verify") extras["torch"] = deps_list("torch") extras["quality"] = deps_list("ruff", "isort", "flake8") -extras["train"] = deps_list("flash_attn") extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv") extras["eval"] = deps_list("lighteval", "math-verify") -extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["train"] +extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] # core dependencies shared across the whole project - keep this to a bare minimum :) install_requires = [ @@ -103,6 +103,7 @@ def deps_list(*pkgs): deps["deepspeed"], deps["hf_transfer"], deps["huggingface-hub"], + deps["langdetect"], deps["latex2sympy2_extended"], deps["math-verify"], deps["liger_kernel"], @@ -111,6 +112,7 @@ def deps_list(*pkgs): deps["sentencepiece"], deps["transformers"], deps["trl"], + deps["wandb"], ] setup( diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index da106f6be..5119daa18 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -29,9 +29,16 @@ NUM_GPUS=$(nvidia-smi -L | wc -l) if [ "$TENSOR_PARALLEL" = "True" ]; then # use TP to shard model across NUM_GPUS export VLLM_WORKER_MULTIPROC_METHOD=spawn - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" + # FIXME: lighteval is broken on `main`so we need to manually pass the generation params + MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" else - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilisation=0.8" + MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" +fi + +# FIXME: enable sampling for pass@1 (remove once this is fixed on lighteval side). 
We use the defaults from Qwen2.5-Coder: https://github.com/QwenLM/Qwen2.5-Coder/blob/main/qwencoder-eval/instruct/livecode_bench/lcb_runner/runner/parser.py#L8 +if [ "$TASK_NAME" = "lcb" ]; then + MODEL_ARGS="${MODEL_ARGS/temperature:0.0/temperature:0.2}" + MODEL_ARGS="${MODEL_ARGS/generation_parameters={/generation_parameters={top_p:0.95,}" fi LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard" @@ -48,14 +55,14 @@ echo "Eval results will be saved to $OUTPUT_DIR" # Check if "custom" is a substring of TASKS if [[ $TASKS == *"custom"* ]]; then echo "Custom task detected. Running custom task evaluation script ..." - lighteval vllm $MODEL_ARGS $TASKS \ + lighteval vllm "$MODEL_ARGS" $TASKS \ --custom-tasks "src/open_r1/evaluate.py" \ --use-chat-template \ --output-dir $OUTPUT_DIR \ --save-details \ ${7:+--system-prompt "$7"} else - lighteval vllm $MODEL_ARGS $TASKS \ + lighteval vllm "$MODEL_ARGS" $TASKS \ --use-chat-template \ --output-dir $OUTPUT_DIR \ --save-details \ From 8322b3173fed9974371d35a459607ec9da415f43 Mon Sep 17 00:00:00 2001 From: Almaz Zinollayev <39913951+zeenolife@users.noreply.github.com> Date: Fri, 21 Feb 2025 14:41:34 +0000 Subject: [PATCH 059/137] Language specific code format reward (#377) --- src/open_r1/grpo.py | 13 ++++++- src/open_r1/rewards.py | 16 +++++++++ tests/test_rewards.py | 79 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 1 deletion(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 2ead27da6..bbbe8d129 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -29,6 +29,7 @@ accuracy_reward, code_reward, format_reward, + get_code_format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, len_reward, @@ -61,12 +62,14 @@ class GRPOScriptArguments(ScriptArguments): Maximum reward for cosine scaling for correct answers. cosine_max_len (`int`): Maximum length for cosine scaling. + code_language (`str`): + Language for code format reward. """ reward_funcs: list[str] = field( default_factory=lambda: ["accuracy", "format"], metadata={ - "help": "List of reward functions. Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'" + "help": "List of reward functions. Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'code', 'code_format'" }, ) cosine_min_value_wrong: float = field( @@ -97,6 +100,13 @@ class GRPOScriptArguments(ScriptArguments): default=-1.0, metadata={"help": "Maximum (negative) penalty for for repetition penalty reward"}, ) + code_language: str = field( + default="python", + metadata={ + "help": "Language for code format reward. 
Based on E2B supported languages https://e2b.dev/docs/code-interpreting/supported-languages", + "choices": ["python", "javascript", "r", "java", "bash"], + }, + ) def main(script_args, training_args, model_args): @@ -163,6 +173,7 @@ def main(script_args, training_args, model_args): ), "length": len_reward, "code": code_reward, + "code_format": get_code_format_reward(language=script_args.code_language), } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 2960cdee7..d91c63ed2 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -358,3 +358,19 @@ def evaluate_code(code, test_cases): print(f"Error from E2B executor: {e}") rewards = [0.0] * len(completions) return rewards + + +def get_code_format_reward(language: str = "python"): + """Format reward function specifically for code responses. + + Args: + language: Programming language supported by E2B https://e2b.dev/docs/code-interpreting/supported-languages + """ + pattern = rf"^.*?\s*.*?```{language}\n.*?```.*?$" + + def code_format_reward(completions, **kwargs): + completion_contents = [completion[0]["content"] for completion in completions] + matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents] + return [1.0 if match else 0.0 for match in matches] + + return code_format_reward diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 9e41bdb0d..32d0f1137 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -3,6 +3,7 @@ from open_r1.rewards import ( accuracy_reward, format_reward, + get_code_format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, len_reward, @@ -313,5 +314,83 @@ def test_long_completion_without_repetition(self): self.assertEqual(rewards, [0.0]) +class TestCodeFormat(unittest.TestCase): + def test_correct_python_format(self): + """Test code format reward with correct Python format.""" + completion = [ + [ + { + "content": "Let's solve this\nStep 1: First step\n```python\ndef hello():\n print('world')\n```" + } + ] + ] + reward_fn = get_code_format_reward(language="python") + rewards = reward_fn(completion) + self.assertEqual(rewards[0], 1.0) + + def test_incorrect_formats(self): + """Test code format reward with various incorrect formats.""" + incorrect_formats = [ + # Missing think/answer tags + "```python\ndef hello():\n print('world')\n```", + # Missing code block + "Some thinkingJust plain text", + # Wrong language + "Analysis```javascript\nconsole.log('hello');\n```", + # Missing language identifier + "Analysis```\ndef hello(): pass\n```", + # Wrong order of tags + "```python\ndef hello(): pass\n```Analysis", + ] + + reward_fn = get_code_format_reward(language="python") + for fmt in incorrect_formats: + completion = [[{"content": fmt}]] + rewards = reward_fn(completion) + self.assertEqual(rewards[0], 0.0) + + def test_multiple_code_blocks(self): + """Test format reward with multiple code blocks in think and answer sections.""" + completion = [ + [ + { + "content": "Here's an example:\n```python\nx = 1\n```\nNow the solution:\n```python\ndef solution():\n return 42\n```" + } + ] + ] + reward_fn = get_code_format_reward(language="python") + rewards = reward_fn(completion) + self.assertEqual(rewards[0], 1.0) + + def test_different_languages(self): + """Test code format reward with different programming languages.""" + completion = [ + [{"content": "Analysis```javascript\nconsole.log('hello');\n```"}] + ] + + # Test with JavaScript + 
js_reward_fn = get_code_format_reward(language="javascript") + rewards = js_reward_fn(completion) + self.assertEqual(rewards[0], 1.0) + + # Same completion should fail for Python + py_reward_fn = get_code_format_reward(language="python") + rewards = py_reward_fn(completion) + self.assertEqual(rewards[0], 0.0) + + def test_multiline_code(self): + """Test format reward with complex multiline code blocks.""" + completion = [ + [ + { + "content": "Here's the analysis\n```python\nclass Solution:\n def __init__(self):\n self.value = 42\n \n def get_value(self):\n return self.value\n```" + } + ] + ] + reward_fn = get_code_format_reward(language="python") + rewards = reward_fn(completion) + self.assertEqual(rewards[0], 1.0) + + if __name__ == "__main__": unittest.main() From 49d9b741a50fcb3044c741a24a908d8e32620432 Mon Sep 17 00:00:00 2001 From: lewtun Date: Sat, 22 Feb 2025 14:46:09 +0100 Subject: [PATCH 060/137] Pin dependencies (#393) --- setup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 8cbb27832..be4c0871e 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ # IMPORTANT: all dependencies should be listed here with their version requirements, if any. # * If a dependency is fast-moving (e.g. trl), pin to the exact version _deps = [ - "accelerate>=1.2.1", + "accelerate==1.4.0", "bitsandbytes>=0.43.0", "datasets>=3.2.0", "deepspeed==0.15.4", @@ -49,7 +49,6 @@ "e2b-code-interpreter>=1.0.5", "einops>=0.8.0", "flake8>=6.0.0", - "flash_attn>=2.7.4.post1", "hf_transfer>=0.1.4", "huggingface-hub[cli]>=0.19.2,<1.0", "isort>=5.12.0", @@ -67,8 +66,8 @@ "safetensors>=0.3.3", "sentencepiece>=0.1.99", "torch==2.5.1", - "transformers @ git+https://github.com/huggingface/transformers.git@main", - "trl @ git+https://github.com/huggingface/trl.git@main", + "transformers==4.49.0", + "trl @ git+https://github.com/huggingface/trl.git@013d360b8f2703d3546786fa124f3204d6cd8018", "vllm==0.7.2", "wandb>=0.19.1", ] From eeca246b078457bc0f69ba2e8297b799df0e2bda Mon Sep 17 00:00:00 2001 From: lewtun Date: Sat, 22 Feb 2025 15:21:01 +0100 Subject: [PATCH 061/137] Update prompt template and sampling parameters for evaluation (#392) * Pin t * Pin t * Set top p * C * Tune math prompt * Improve math prompt * Update tables --- README.md | 76 ++++++++++++++++++++++++++++++----------- slurm/evaluate.slurm | 12 ++----- src/open_r1/evaluate.py | 38 ++++++++++++++++----- 3 files changed, 89 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 41efba053..2c01d1fa5 100644 --- a/README.md +++ b/README.md @@ -237,7 +237,7 @@ We use `lighteval` to evaluate models, with custom tasks defined in `src/open_r1 ```shell MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL # AIME 2024 @@ -258,19 +258,24 @@ lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ TASK=gpqa:diamond lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ --custom-tasks src/open_r1/evaluate.py \ + --use-chat-template \ + --output-dir $OUTPUT_DIR + +# LiveCodeBench +lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \ --use-chat-template \ --output-dir $OUTPUT_DIR ``` > [!IMPORTANT] -> You must set `max_model_length=32768` in the `vllm` command to align 
with the `generation_size` we define per eval. Without this, `lighteval` will throw an error. +> You must set `max_model_length=32768` in the `vllm` command to align with the `max_new_tokens` we define per eval. Without this, `lighteval` will throw an error. To increase throughput across multiple GPUs, use _data parallel_ as follows: ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL @@ -285,7 +290,7 @@ For large models which require sharding across GPUs, use _tensor parallel_ and r ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL @@ -319,7 +324,40 @@ make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLE ## Reproducing Deepseek's evaluation results > [!NOTE] -> The DeepSeek-R1 paper uses sampling with a temperature of 0.6, a top-p value of 0.95, and 64 responses per query to estimate `pass@1`. Below, we report the results from greedy decoding, which likely explains the small 1-3σ discrepancies between our results and theirs. +> The DeepSeek-R1 paper uses sampling with 64 responses per query to estimate `pass@1`. Below, we report the results from sampling 1 response per query, which likely explains the small 1-3σ discrepancies between our results and theirs. 
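For reference, `pass@1` estimated from `n` sampled responses is simply the per-query success rate averaged over queries. The sketch below only illustrates that estimate; the function name and data layout are invented for this example and are not part of this repository:

```python
def estimate_pass_at_1(results: list[list[bool]]) -> float:
    """Estimate pass@1 from n sampled responses per query.

    results[i] holds the correctness of each response sampled for query i.
    With n = 1 this reduces to plain accuracy, which is what the tables below report;
    a larger n (DeepSeek-R1 uses 64) gives a less noisy estimate of the same quantity.
    """
    per_query = [sum(samples) / len(samples) for samples in results]
    return sum(per_query) / len(per_query)


# Two queries with 4 samples each: pass@1 = (3/4 + 1/4) / 2 = 0.5
print(estimate_pass_at_1([[True, False, True, True], [False, True, False, False]]))
```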
+ +### AIME 2024 + +We are able to reproduce Deepseek's reported results on the AIME 2024 benchmark within ~1-3 standard deviations: + +| Model | AIME 2024 (🤗 LightEval) | AIME 2024 (DeepSeek Reported) | +|:------------------------------|:-----------------------:|:----------------------------:| +| DeepSeek-R1-Distill-Qwen-1.5B | 26.7 | 28.9 | +| DeepSeek-R1-Distill-Qwen-7B | 56.6 | 55.5 | +| DeepSeek-R1-Distill-Qwen-14B | 60.0 | 69.7 | +| DeepSeek-R1-Distill-Qwen-32B | 73.2 | 72.6 | +| DeepSeek-R1-Distill-Llama-8B | 43.3 | 50.4 | +| DeepSeek-R1-Distill-Llama-70B | 73.3 | 70.0 | + +To reproduce these results use the following command: + +```shell +NUM_GPUS=1 # Set to 8 for 32B and 70B models +MODEL=deepseek-ai/{model_name} +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +OUTPUT_DIR=data/evals/$MODEL + +lighteval vllm $MODEL_ARGS "custom|aime24|0|0" \ + --custom-tasks src/open_r1/evaluate.py \ + --use-chat-template \ + --output-dir $OUTPUT_DIR +``` + +Alternatively, you can launch Slurm jobs as follows: + +```shell +python scripts/run_benchmarks.py --model-id {model_id} --benchmarks aime24 +``` ### MATH-500 @@ -327,19 +365,19 @@ We are able to reproduce Deepseek's reported results on the MATH-500 benchmark w | Model | MATH-500 (🤗 LightEval) | MATH-500 (DeepSeek Reported) | |:------------------------------|:-----------------------:|:----------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 81.2 | 83.9 | -| DeepSeek-R1-Distill-Qwen-7B | 91.8 | 92.8 | -| DeepSeek-R1-Distill-Qwen-14B | 94.2 | 93.9 | -| DeepSeek-R1-Distill-Qwen-32B | 95.0 | 94.3 | -| DeepSeek-R1-Distill-Llama-8B | 85.4 | 89.1 | -| DeepSeek-R1-Distill-Llama-70B | 93.4 | 94.5 | +| DeepSeek-R1-Distill-Qwen-1.5B | 84.6 | 83.9 | +| DeepSeek-R1-Distill-Qwen-7B | 93.0 | 92.8 | +| DeepSeek-R1-Distill-Qwen-14B | 95.0 | 93.9 | +| DeepSeek-R1-Distill-Qwen-32B | 96.6 | 94.3 | +| DeepSeek-R1-Distill-Llama-8B | 88.6 | 89.1 | +| DeepSeek-R1-Distill-Llama-70B | 96.4 | 94.5 | To reproduce these results use the following command: ```shell NUM_GPUS=1 # Set to 8 for 32B and 70B models MODEL=deepseek-ai/{model_name} -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.0}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "custom|math_500|0|0" \ @@ -360,19 +398,19 @@ We are able to reproduce Deepseek's reported results on the GPQA Diamond benchma | Model | GPQA Diamond (🤗 LightEval) | GPQA Diamond (DeepSeek Reported) | |:------------------------------|:---------------------------:|:--------------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 33.3 | 33.8 | -| DeepSeek-R1-Distill-Qwen-7B | 48.4 | 49.1 | -| DeepSeek-R1-Distill-Qwen-14B | 55.6 | 59.1 | -| DeepSeek-R1-Distill-Qwen-32B | 58.6 | 62.1 | -| DeepSeek-R1-Distill-Llama-8B | 51.0 | 49.0 | -| DeepSeek-R1-Distill-Llama-70B | 65.2 | 65.2 | +| DeepSeek-R1-Distill-Qwen-1.5B | 34.3 | 33.8 | +| DeepSeek-R1-Distill-Qwen-7B | 50.5 | 49.1 | +| DeepSeek-R1-Distill-Qwen-14B | 59.6 | 59.1 | +| DeepSeek-R1-Distill-Qwen-32B | 63.6 | 62.1 | +| DeepSeek-R1-Distill-Llama-8B | 52.0 | 49.0 | +| DeepSeek-R1-Distill-Llama-70B | 
67.2 | 65.2 | To reproduce these results use the following command: ```shell NUM_GPUS=1 # Set to 8 for 32B and 70B models MODEL=deepseek-ai/{model_name} -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.0}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "custom|gpqa:diamond|0|0" \ diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 5119daa18..6c4631d0c 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -29,16 +29,10 @@ NUM_GPUS=$(nvidia-smi -L | wc -l) if [ "$TENSOR_PARALLEL" = "True" ]; then # use TP to shard model across NUM_GPUS export VLLM_WORKER_MULTIPROC_METHOD=spawn - # FIXME: lighteval is broken on `main`so we need to manually pass the generation params - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" + # FIXME: lighteval now requires us to manually pass the generation params + MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" else - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.0}" -fi - -# FIXME: enable sampling for pass@1 (remove once this is fixed on lighteval side). We use the defaults from Qwen2.5-Coder: https://github.com/QwenLM/Qwen2.5-Coder/blob/main/qwencoder-eval/instruct/livecode_bench/lcb_runner/runner/parser.py#L8 -if [ "$TASK_NAME" = "lcb" ]; then - MODEL_ARGS="${MODEL_ARGS/temperature:0.0/temperature:0.2}" - MODEL_ARGS="${MODEL_ARGS/generation_parameters={/generation_parameters={top_p:0.95,}" + MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" fi LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard" diff --git a/src/open_r1/evaluate.py b/src/open_r1/evaluate.py index 5d38b8608..699ed66d2 100644 --- a/src/open_r1/evaluate.py +++ b/src/open_r1/evaluate.py @@ -27,6 +27,28 @@ from lighteval.utils.language import Language +# Prompt template adapted from +# - simple-evals: https://github.com/openai/simple-evals/blob/6e84f4e2aed6b60f6a0c7b8f06bbbf4bfde72e58/math_eval.py#L17 +# - Llama 3: https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-Instruct-evals/viewer/Llama-3.2-1B-Instruct-evals__math__details?views%5B%5D=llama_32_1b_instruct_evals__math__details +# Note that it is important to have the final answer in a box for math-verify to work correctly +MATH_QUERY_TEMPLATE = """ +Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. 
I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering. + +{Question} +""".strip() + +# Prompt template from simple-evals: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14 +GPQA_QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. + +{Question} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + latex_gold_metric = multilingual_extractive_match_metric( language=Language.ENGLISH, fallback_mode="first_match", @@ -55,11 +77,10 @@ ) -def prompt_fn(line, task_name: str = None): - """Assumes the model is either prompted to emit \\boxed{answer} or does so automatically""" +def math_prompt_fn(line, task_name: str = None): return Doc( task_name=task_name, - query=line["problem"], + query=MATH_QUERY_TEMPLATE.format(Question=line["problem"]), choices=[line["solution"]], gold_index=0, ) @@ -68,20 +89,19 @@ def prompt_fn(line, task_name: str = None): def aime_prompt_fn(line, task_name: str = None): return Doc( task_name=task_name, - query=line["problem"], + query=MATH_QUERY_TEMPLATE.format(Question=line["problem"]), choices=[line["answer"]], gold_index=0, ) def gpqa_prompt_fn(line, task_name: str = None): - """Prompt template adapted from simple-evals: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14""" gold_index = random.randint(0, 3) choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]] choices.insert(gold_index, line["Correct Answer"]) - query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. 
Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}" - query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"]) - + query = GPQA_QUERY_TEMPLATE.format( + A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"] + ) return Doc( task_name=task_name, query=query, @@ -123,7 +143,7 @@ def gpqa_prompt_fn(line, task_name: str = None): math_500 = LightevalTaskConfig( name="math_500", suite=["custom"], - prompt_function=prompt_fn, + prompt_function=math_prompt_fn, hf_repo="HuggingFaceH4/MATH-500", hf_subset="default", hf_avail_splits=["test"], From 3f9d75a595c00577dddbf5707dcfe74ee8898adb Mon Sep 17 00:00:00 2001 From: lewtun Date: Sun, 23 Feb 2025 17:44:03 +0100 Subject: [PATCH 062/137] Bump Liger kernel (#399) Needed to enable SFT training via https://github.com/huggingface/trl/pull/2874 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index be4c0871e..9b9a1c0ed 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ "isort>=5.12.0", "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", - "liger_kernel==0.5.2", + "liger_kernel==0.5.3", "lighteval @ git+https://github.com/huggingface/lighteval.git@ebb7377b39a48ab0691e6fbd9dea57e9fe290a7e", "math-verify==0.5.2", # Used for math verification in grpo "packaging>=23.0", From 5355687e6c0f6f9f09d8806e6f0288aed282c1f3 Mon Sep 17 00:00:00 2001 From: elie <97572401+eliebak@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:43:12 +0100 Subject: [PATCH 063/137] add sft recipe (#415) --- recipes/OpenR1-Qwen-7B/sft/config.yaml | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 recipes/OpenR1-Qwen-7B/sft/config.yaml diff --git a/recipes/OpenR1-Qwen-7B/sft/config.yaml b/recipes/OpenR1-Qwen-7B/sft/config.yaml new file mode 100644 index 000000000..3955dc952 --- /dev/null +++ b/recipes/OpenR1-Qwen-7B/sft/config.yaml @@ -0,0 +1,50 @@ +# Model arguments +# You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 +# the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json +model_name_or_path: Qwen/Qwen2.5-Math-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: sdpa + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-220k +dataset_configs: +- default +dataset_num_proc: 48 + +#SFT hyperparam +max_seq_length: 32768 +weight_decay: 0.0001 +optim: adamw_torch +lr_scheduler_type: linear +warmup_ratio: 0.1 +learning_rate: 5.0e-05 +gradient_accumulation_steps: 2 +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 + +# SFT trainer config +max_steps: -1 +num_train_epochs: 3 +bf16: true +do_eval: false +use_liger_kernel: true +eval_strategy: 'no' +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: OpenR1-Qwen-7B-SFT +hub_strategy: every_save +log_level: info +logging_steps: 5 +logging_strategy: steps +packing: true +output_dir: data/OpenR1-Qwen-7B-SFT +overwrite_output_dir: true +push_to_hub: true +report_to: +- wandb +save_strategy: "steps" +save_steps: 500 +save_total_limit: 1 +seed: 42 \ No newline at end of file From 566cfd1a44f57d68567bd89283d5b348de3753a3 Mon Sep 17 00:00:00 2001 From: lewtun Date: Mon, 24 Feb 2025 17:16:40 +0100 Subject: [PATCH 064/137] Align format reward with R1 traces and add reward function to count think / answer tags (#418) MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix tests * Tune * Add reward * Apply suggestions from code review Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> --------- Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> --- .../grpo/config_demo.yaml | 2 + .../grpo/config_demo.yaml | 2 + src/open_r1/grpo.py | 8 +-- src/open_r1/rewards.py | 26 ++++++++- tests/test_rewards.py | 53 ++++++++++++++++--- 5 files changed, 80 insertions(+), 11 deletions(-) diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml index 102966905..ecd53336b 100644 --- a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml @@ -48,9 +48,11 @@ report_to: reward_funcs: - accuracy - format +- tag_count reward_weights: - 1.0 - 1.0 +- 1.0 save_strategy: "epoch" save_total_limit: 1 seed: 42 diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index f38add6ac..a6cb77d85 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -44,9 +44,11 @@ report_to: reward_funcs: - accuracy - format +- tag_count reward_weights: - 1.0 - 1.0 +- 1.0 save_strategy: "epoch" save_total_limit: 1 seed: 42 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index bbbe8d129..d5f023e02 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -34,6 +34,7 @@ get_repetition_penalty_reward, len_reward, reasoning_steps_reward, + tag_count_reward, ) from open_r1.utils import get_tokenizer from open_r1.utils.callbacks import get_callbacks @@ -51,7 +52,7 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length'. + List of reward functions. Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', tag_count', 'code', 'code_format'. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. cosine_max_value_wrong (`float`): @@ -67,9 +68,9 @@ class GRPOScriptArguments(ScriptArguments): """ reward_funcs: list[str] = field( - default_factory=lambda: ["accuracy", "format"], + default_factory=lambda: ["accuracy", "format", "tag_count"], metadata={ - "help": "List of reward functions. Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'code', 'code_format'" + "help": "List of reward functions. 
Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', tag_count', 'code', 'code_format'" }, ) cosine_min_value_wrong: float = field( @@ -174,6 +175,7 @@ def main(script_args, training_args, model_args): "length": len_reward, "code": code_reward, "code_format": get_code_format_reward(language=script_args.code_language), + "tag_count": tag_count_reward, } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index d91c63ed2..8771b3866 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -62,12 +62,34 @@ def accuracy_reward(completions, solution, **kwargs): def format_reward(completions, **kwargs): """Reward function that checks if the reasoning process is enclosed within and tags, while the final answer is enclosed within and tags.""" - pattern = r"^.*?\s*.*?$" + pattern = r"^\n.*?\n\n\n.*?\n$" completion_contents = [completion[0]["content"] for completion in completions] matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents] return [1.0 if match else 0.0 for match in matches] +def tag_count_reward(completions, **kwargs) -> list[float]: + """Reward function that checks if we produce the desired number of think and answer tags associated with `format_reward()`. + + Adapted from: https://gist.github.com/willccbb/4676755236bb08cab5f4e54a0475d6fb#file-grpo_demo-py-L90 + """ + + def count_tags(text: str) -> float: + count = 0.0 + if text.count("\n") == 1: + count += 0.25 + if text.count("\n\n") == 1: + count += 0.25 + if text.count("\n\n") == 1: + count += 0.25 + if text.count("\n") == 1: + count += 0.25 + return count + + contents = [completion[0]["content"] for completion in completions] + return [count_tags(c) for c in contents] + + def reasoning_steps_reward(completions, **kwargs): r"""Reward function that checks for clear step-by-step reasoning. Regex pattern: @@ -366,7 +388,7 @@ def get_code_format_reward(language: str = "python"): Args: language: Programming language supported by E2B https://e2b.dev/docs/code-interpreting/supported-languages """ - pattern = rf"^.*?\s*.*?```{language}\n.*?```.*?$" + pattern = rf"^\n.*?\n\n\n.*?```{language}.*?```.*?\n$" def code_format_reward(completions, **kwargs): completion_contents = [completion[0]["content"] for completion in completions] diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 32d0f1137..103b9981a 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -8,6 +8,7 @@ get_repetition_penalty_reward, len_reward, reasoning_steps_reward, + tag_count_reward, ) @@ -30,7 +31,7 @@ def test_accuracy_reward_wrong_answer(self): def test_format_reward_correct(self): """Test format_reward with correct format.""" - completion = [[{"content": "Some reasoningThe answer"}]] + completion = [[{"content": "\nSome reasoning\n\n\nThe answer\n"}]] rewards = format_reward(completion) self.assertEqual(rewards[0], 1.0) @@ -107,7 +108,7 @@ def test_cosine_scaled_reward(self): def test_format_reward_specific_multiline(self): """Test format_reward with a specific multiline input.""" - inputs = "\nI will count each distinct object in the image:\n1. Purple scooter\n2. Red bicycle\n3. Green motorcycle\n4. Gray sedan\n5. Yellow school bus\n6. Small green double-decker bus\n7. Small red car\n8. Small purple car\n9. 
Small gray dirt bike\n\nThere are 9 distinct objects in total.\n\n9" + inputs = "\nI will count each distinct object in the image:\n1. Purple scooter\n2. Red bicycle\n3. Green motorcycle\n4. Gray sedan\n5. Yellow school bus\n6. Small green double-decker bus\n7. Small red car\n8. Small purple car\n9. Small gray dirt bike\n\nThere are 9 distinct objects in total.\n\n\n9\n" completion = [[{"content": inputs}]] rewards = format_reward(completion) self.assertEqual(rewards[0], 1.0) @@ -313,6 +314,42 @@ def test_long_completion_without_repetition(self): rewards = reward_fn(completions) self.assertEqual(rewards, [0.0]) + def test_tag_count_rewards_all_correct(self): + """Test tag_count_reward with correct tags.""" + completion = [[{"content": "\nSome reasoning\n\n\nThe answer\n"}]] + rewards = tag_count_reward(completion) + self.assertEqual(rewards[0], 1.0) + + def test_tag_count_rewards_missing_think_begin(self): + """Test tag_count_reward with missing tag.""" + completion = [[{"content": "Some reasoning\n\n\nThe answer\n"}]] + rewards = tag_count_reward(completion) + self.assertEqual(rewards[0], 0.75) + + def test_tag_count_rewards_missing_think_end(self): + """Test tag_count_reward with missing tag.""" + completion = [[{"content": "\nSome reasoning\n\nThe answer\n"}]] + rewards = tag_count_reward(completion) + self.assertEqual(rewards[0], 0.75) + + def test_tag_count_rewards_missing_answer_begin(self): + """Test tag_count_reward with missing tag.""" + completion = [[{"content": "\nSome reasoning\n\nThe answer\n"}]] + rewards = tag_count_reward(completion) + self.assertEqual(rewards[0], 0.75) + + def test_tag_count_rewards_missing_answer_end(self): + """Test tag_count_reward with missing tag.""" + completion = [[{"content": "\nSome reasoning\n\n\nThe answer"}]] + rewards = tag_count_reward(completion) + self.assertEqual(rewards[0], 0.75) + + def test_tag_count_rewards_missing_all_tags(self): + """Test tag_count_reward with missing all tags.""" + completion = [[{"content": "Some reasoning\nThe answer"}]] + rewards = tag_count_reward(completion) + self.assertEqual(rewards[0], 0.0) + class TestCodeFormat(unittest.TestCase): def test_correct_python_format(self): @@ -320,7 +357,7 @@ def test_correct_python_format(self): completion = [ [ { - "content": "Let's solve this\nStep 1: First step\n```python\ndef hello():\n print('world')\n```" + "content": "\nLet's solve this\nStep 1: First step\n\n\n```python\ndef hello():\n print('world')\n```\n" } ] ] @@ -354,7 +391,7 @@ def test_multiple_code_blocks(self): completion = [ [ { - "content": "Here's an example:\n```python\nx = 1\n```\nNow the solution:\n```python\ndef solution():\n return 42\n```" + "content": "\nHere's an example:\n```python\nx = 1\n```\nNow the solution:\n\n\n```python\ndef solution():\n return 42\n```\n" } ] ] @@ -365,7 +402,11 @@ def test_multiple_code_blocks(self): def test_different_languages(self): """Test code format reward with different programming languages.""" completion = [ - [{"content": "Analysis```javascript\nconsole.log('hello');\n```"}] + [ + { + "content": "\nAnalysis\n\n\n```javascript\nconsole.log('hello');\n```\n" + } + ] ] # Test with JavaScript @@ -383,7 +424,7 @@ def test_multiline_code(self): completion = [ [ { - "content": "Here's the analysis\n```python\nclass Solution:\n def __init__(self):\n self.value = 42\n \n def get_value(self):\n return self.value\n```" + "content": "\nHere's the analysis\n\n\n```python\nclass Solution:\n def __init__(self):\n self.value = 42\n \n def get_value(self):\n return 
self.value\n```\n" } ] ] From 0c3ef8372ec595b1561596dbfa43e2cd6dfd3181 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Mon, 24 Feb 2025 17:27:56 +0100 Subject: [PATCH 065/137] updates max_seq_length to max length due to a bug in trl (#419) --- .../Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml | 2 +- recipes/OpenR1-Qwen-7B/sft/config.yaml | 2 +- recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml b/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml index 72ccdb688..4c52ab404 100644 --- a/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml +++ b/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml @@ -29,7 +29,7 @@ logging_steps: 1 logging_strategy: steps lr_scheduler_type: cosine packing: true -max_seq_length: 32768 +max_length: 32768 max_steps: -1 num_train_epochs: 5 output_dir: data/Mistral-Small-24B-Instruct-2501-Open-R1-Distill diff --git a/recipes/OpenR1-Qwen-7B/sft/config.yaml b/recipes/OpenR1-Qwen-7B/sft/config.yaml index 3955dc952..ddb77364f 100644 --- a/recipes/OpenR1-Qwen-7B/sft/config.yaml +++ b/recipes/OpenR1-Qwen-7B/sft/config.yaml @@ -13,7 +13,7 @@ dataset_configs: dataset_num_proc: 48 #SFT hyperparam -max_seq_length: 32768 +max_length: 32768 weight_decay: 0.0001 optim: adamw_torch lr_scheduler_type: linear diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index 30201bff7..2355a8305 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -28,7 +28,7 @@ lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 packing: true -max_seq_length: 16384 +max_length: 16384 max_steps: -1 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Distill From 71880012818d486cf12202c7d54f3e88727bb746 Mon Sep 17 00:00:00 2001 From: Agus Date: Mon, 24 Feb 2025 19:54:44 +0100 Subject: [PATCH 066/137] Add script to decontaminate datasets against benchmark datasets (#416) * Add script to decontaminate datasets against benchmark datasets * Add docs for the decontamination script * Update README.md Co-authored-by: lewtun * Update README.md Co-authored-by: lewtun * Update README.md Co-authored-by: lewtun * Update scripts/decontaminate.py Co-authored-by: lewtun * Update scripts/decontaminate.py Co-authored-by: lewtun * Update scripts/decontaminate.py Co-authored-by: lewtun * Update scripts/decontaminate.py Co-authored-by: lewtun * Update scripts/decontaminate.py Co-authored-by: lewtun * Add license header and attribution to the authors --------- Co-authored-by: lewtun --- README.md | 31 +++++++++ scripts/decontaminate.py | 142 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) create mode 100644 scripts/decontaminate.py diff --git a/README.md b/README.md index 2c01d1fa5..f778815b8 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,37 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml ``` +#### Data decontamination + +Following [s1: Simple test-time scaling](https://arxiv.org/abs/2501.19393) the data can be decontaminated using the script at: [scripts/decontaminate.py](./scripts/decontaminate.py), which decontaminates a dataset using 8-grams and deduplicate the data. 
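+Conceptually, the script builds a lookup of every 8-word n-gram that appears in the benchmark prompts and flags a training sample as contaminated if any of its own n-grams lands in that lookup. A minimal sketch of the idea (simplified from the full script added further down; the helper names here are illustrative):
+
+```python
+def word_ngrams(text: str, n: int = 8) -> list[str]:
+    # Lowercase and split on whitespace (this also collapses repeated spaces)
+    words = text.lower().split()
+    return [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)]
+
+
+def build_lookup(benchmark_prompts: list[str], n: int = 8) -> set[str]:
+    # Union of every n-gram seen in any benchmark prompt
+    lookup: set[str] = set()
+    for prompt in benchmark_prompts:
+        lookup.update(word_ngrams(prompt, n))
+    return lookup
+
+
+def is_contaminated(sample: str, lookup: set[str], n: int = 8) -> bool:
+    # A single shared n-gram is enough to flag the sample
+    return any(ngram in lookup for ngram in word_ngrams(sample, n))
+```
+
+The full script below additionally records which benchmark triggered each match in a per-benchmark `contaminated_*` column, so flagged rows can be inspected or dropped.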
Sample run: + +```shell +python scripts/decontaminate.py \ + --dataset "open-r1/verifiable-coding-problems-python" \ + --problem_column problem \ + --cleanup +``` + +It will decontaminate against the benchmark datasets, and remove the contaminated samples afterwards. If no argument `--new_dataset_name` is provided, the same dataset will be reused, adding a `_decontaminated`. It runs against the prompt, which for this dataset is the column `problem`, but a different one can be provided. + +Arguments for the script: + +```shell +usage: decontaminate.py [-h] --dataset DATASET [--split SPLIT] [--ngram_size NGRAM_SIZE] [--problem_column PROBLEM_COLUMN] [--cleanup] [--new_dataset_name NEW_DATASET_NAME] + +options: + -h, --help show this help message and exit + --dataset DATASET Name of the dataset to check for contamination. + --split SPLIT Split to check for contamination, defaults to `train`. + --ngram_size NGRAM_SIZE + Size of n-grams to build, defaults to 8. + --problem_column PROBLEM_COLUMN + Name of the column containing the problem (prompt). + --cleanup Whether to remove the contaminated rows before pushing the dataset. + --new_dataset_name NEW_DATASET_NAME + New name for the dataset. If not provided, will reuse the name and add a `_decontaminated` to the name. +``` + ### Launching jobs on a Slurm cluster If you have access to a Slurm cluster, we provide a `slurm/train.slurm` script that will automatically queue training jobs for you. Here's how you can use it: diff --git a/scripts/decontaminate.py b/scripts/decontaminate.py new file mode 100644 index 000000000..14feb5bd6 --- /dev/null +++ b/scripts/decontaminate.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script is used to decontaminate a dataset by checking for n-gram overlap with other datasets. 
+It uses the same approach presented in https://arxiv.org/abs/2501.19393, +as found in: https://github.com/simplescaling/s1/blob/main/data/decontaminate_util.py + +python scripts/decontaminate.py \ + --dataset "open-r1/verifiable-coding-problems-python" \ + --split train \ + --ngram_size 8 \ + --problem_column problem \ + --cleanup +""" + +import collections + +from tqdm import tqdm + + +def normalize_string(text: str) -> str: + """Basic string normalization.""" + # Convert to lowercase and normalize whitespace + text = text.lower().strip() + # Replace multiple spaces with single space + text = " ".join(text.split()) + return text + + +def word_ngrams(text: str, n: int) -> list: + """Generate word-level n-grams from text.""" + words = text.split() + return [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)] + + +def build_ngram_lookup(documents: list[str], ngram_size: int = 8) -> dict[str, set[int]]: + """Build ngram lookup for documents.""" + lookup = collections.defaultdict(set) + + for doc_id, document in enumerate(tqdm(documents)): + normalized_text = normalize_string(document) + ngrams = word_ngrams(normalized_text, ngram_size) + for ngram in ngrams: + lookup[ngram].add(doc_id) + + return lookup + + +def build_ngram_single(document: str, ngram_size: int = 8) -> set[str]: + normalized_text = normalize_string(document) + ngrams = word_ngrams(normalized_text, ngram_size) + + return set(ngrams) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to check for contamination.") + parser.add_argument("--split", type=str, default="train", help="Split to check for contamination, defaults to `train`.") + parser.add_argument("--ngram_size", type=int, default=8, help="Size of n-grams to build, defaults to 8.") + parser.add_argument( + "--problem_column", type=str, default="problem", help="Name of the column containing the problem (prompt)." + ) + parser.add_argument( + "--cleanup", + action="store_true", + help="Whether to remove the contaminated rows before pushing the dataset.", + ) + parser.add_argument( + "--new_dataset_name", + type=str, + default=None, + help="New name for the dataset. If not provided, will reuse the name and add a `_decontaminated` to the name." 
+ ) + args = parser.parse_args() + + from datasets import load_dataset, Dataset + + # Load the dataset to check for contamination + ds = load_dataset(args.dataset, split=args.split) + + eval_datasets = { + "aime_2024": (load_dataset("HuggingFaceH4/aime_2024", split="train"), "problem"), + "aime_2025": (load_dataset("yentinglin/aime_2025", split="train"), "problem"), + "math_500": (load_dataset("HuggingFaceH4/MATH-500", split="test"), "problem"), + "gpqa": (load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train", trust_remote_code=True), "Question"), + "lcb": ( + load_dataset( + "livecodebench/code_generation_lite", split="test", version_tag="v4_v5", trust_remote_code=True + ), + "question_content", + ), + } + ngram_lookups = {} + for ds_name, (eval_dataset, problem_col) in eval_datasets.items(): + ngram_lookups[ds_name] = build_ngram_lookup(eval_dataset[problem_col], ngram_size=args.ngram_size) + + for eval_name, ngram_lookup in ngram_lookups.items(): + # Update the ngram_lookup variable for each dataset + def find_contaminated(row): + # For each example we have to build the ngrams and check for all of them on each row + ngrams = build_ngram_single(row[args.problem_column], ngram_size=args.ngram_size) + row[f"contaminated_{eval_name}"] = any(set(ngram in ngram_lookup for ngram in ngrams)) + return row + + ds = ds.map(find_contaminated, num_proc=8) + + # Allow cleaning up via CLI args (removing the contaminated examples and dropping the columns) + def cleanup(dataset: Dataset) -> Dataset: + initial_size = len(dataset) + contamination_cols = [col for col in dataset.column_names if col.startswith("contaminated_")] + for col in contamination_cols: + if col.startswith("contaminated_"): + size_prior = len(dataset) + dataset = dataset.filter(lambda x: not x[col], num_proc=8) + if len(dataset) < size_prior: + print(f"Removed {size_prior - len(dataset)} samples from '{col.replace('contaminated_', '')}'") + dataset = dataset.remove_columns(contamination_cols) + print(f"Initial size: {initial_size}, Final size: {len(dataset)}") + return dataset + + if args.cleanup: + ds = cleanup(ds) + + new_ds_name = args.new_dataset_name or f"{args.dataset}_decontaminated" + ds.push_to_hub(new_ds_name, split="train", private=False) + print(f"Decontaminated dataset: {new_ds_name}") From 11beb9a4dcd002645fbef100686d0ffe7b9fa42d Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Tue, 25 Feb 2025 16:00:13 +0100 Subject: [PATCH 067/137] Updates evals to run with ddp=8 for small models (#428) Currently the logic for calculating num_gpus considers eval in the TP setting, for the Qwen 7b models this retuns 4. 
However for smaller models we can use DDP and fix the num_gpus at 8 --- src/open_r1/utils/evaluation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index dff5c9858..7c95812b3 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -71,6 +71,7 @@ def run_lighteval_job( if get_param_count_from_repo_id(model_name) >= 30_000_000_000: tensor_parallel = True else: + num_gpus = 8 tensor_parallel = False cmd = VLLM_SLURM_PREFIX.copy() From d036a1b341f527097172bea212b019afb5f96f66 Mon Sep 17 00:00:00 2001 From: Nile Zhou <22871947+NileZhou@users.noreply.github.com> Date: Tue, 25 Feb 2025 23:15:00 +0800 Subject: [PATCH 068/137] fix reward verify err (#430) --- src/open_r1/rewards.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 8771b3866..856d08956 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -50,7 +50,11 @@ def accuracy_reward(completions, solution, **kwargs): extraction_mode="first_match", ) # Reward 1 if the content is the same as the ground truth, 0 otherwise - reward = float(verify(answer_parsed, gold_parsed)) + try: + reward = float(verify(answer_parsed, gold_parsed)) + except Exception as e: + print(f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}") + reward = 0.0 else: # If the gold solution is not parseable, we reward 1 to skip this example reward = 1.0 From 3ba56c1c3d9d01dabdf9adf8f5efd6346f8cc36e Mon Sep 17 00:00:00 2001 From: elie <97572401+eliebak@users.noreply.github.com> Date: Tue, 25 Feb 2025 21:45:59 +0100 Subject: [PATCH 069/137] Add config sft smollm (#425) * add sft recipe * add smollm sft * max_length modif 1 * max_length modif 2 --- recipes/SmolLM2-1.7B-Instruct/sft/config.yaml | 48 +++++++++++++++++++ recipes/SmolLM2-1.7B/sft/config.yaml | 48 +++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 recipes/SmolLM2-1.7B-Instruct/sft/config.yaml create mode 100644 recipes/SmolLM2-1.7B/sft/config.yaml diff --git a/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml b/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml new file mode 100644 index 000000000..6e2242f3d --- /dev/null +++ b/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml @@ -0,0 +1,48 @@ +# Model arguments +# You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 +model_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: sdpa + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-220k +dataset_configs: +- default +dataset_num_proc: 48 + +#SFT hyperparam +max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file +weight_decay: 0.0001 +optim: adamw_torch +lr_scheduler_type: linear +warmup_ratio: 0.1 +learning_rate: 5.0e-05 +gradient_accumulation_steps: 2 +per_device_eval_batch_size: 4 +per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 
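+# For reference, assuming 8 GPUs and packed 8192-token sequences, this works out to
+# 4 (per-device) x 2 (grad accum) x 8 (GPUs) x 8192 (tokens) ≈ 0.5M tokens per optimizer step.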
+ +# SFT trainer config +max_steps: -1 +num_train_epochs: 3 +bf16: true +do_eval: false +eval_strategy: 'no' +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: OpenR1-Qwen-7B-SFT +hub_strategy: every_save +log_level: info +logging_steps: 5 +logging_strategy: steps +packing: true +output_dir: data/OpenR1-Qwen-7B-SFT +overwrite_output_dir: true +push_to_hub: true +report_to: +- wandb +save_strategy: "steps" +save_steps: 500 +save_total_limit: 1 +seed: 42 diff --git a/recipes/SmolLM2-1.7B/sft/config.yaml b/recipes/SmolLM2-1.7B/sft/config.yaml new file mode 100644 index 000000000..10df6694d --- /dev/null +++ b/recipes/SmolLM2-1.7B/sft/config.yaml @@ -0,0 +1,48 @@ +# Model arguments +# You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 +model_name_or_path: HuggingFaceTB/SmolLM2-1.7B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: sdpa + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-220k +dataset_configs: +- default +dataset_num_proc: 48 + +#SFT hyperparam +max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file +weight_decay: 0.0001 +optim: adamw_torch +lr_scheduler_type: linear +warmup_ratio: 0.1 +learning_rate: 5.0e-05 +gradient_accumulation_steps: 2 +per_device_eval_batch_size: 4 +per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. + +# SFT trainer config +max_steps: -1 +num_train_epochs: 3 +bf16: true +do_eval: false +eval_strategy: 'no' +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: OpenR1-Qwen-7B-SFT +hub_strategy: every_save +log_level: info +logging_steps: 5 +logging_strategy: steps +packing: true +output_dir: data/OpenR1-Qwen-7B-SFT +overwrite_output_dir: true +push_to_hub: true +report_to: +- wandb +save_strategy: "steps" +save_steps: 500 +save_total_limit: 1 +seed: 42 From a20666d5b57ccf64d4c48702b552204b98340568 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Wed, 26 Feb 2025 10:35:50 +0100 Subject: [PATCH 070/137] Bumps TRL (#437) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9b9a1c0ed..6346574ce 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ "sentencepiece>=0.1.99", "torch==2.5.1", "transformers==4.49.0", - "trl @ git+https://github.com/huggingface/trl.git@013d360b8f2703d3546786fa124f3204d6cd8018", + "trl @ git+https://github.com/huggingface/trl.git@69ad852e5654a77f1695eb4c608906fe0c7e8624", "vllm==0.7.2", "wandb>=0.19.1", ] From 8782fa6e9080cfdfe2bf83e321e48c42fff42f0c Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Wed, 26 Feb 2025 17:59:44 +0100 Subject: [PATCH 071/137] bump lighteval, expose the lcb_v4 benchmark (#441) --- setup.py | 2 +- src/open_r1/utils/evaluation.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6346574ce..00b83d811 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", "liger_kernel==0.5.3", - "lighteval @ git+https://github.com/huggingface/lighteval.git@ebb7377b39a48ab0691e6fbd9dea57e9fe290a7e", + "lighteval @ git+https://github.com/huggingface/lighteval.git@ed084813e0bd12d82a06d9f913291fdbee774905", "math-verify==0.5.2", # Used for math verification in grpo "packaging>=23.0", "parameterized>=0.9.0", diff --git a/src/open_r1/utils/evaluation.py 
b/src/open_r1/utils/evaluation.py index 7c95812b3..66a048ecf 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -51,6 +51,7 @@ def register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime25", "aime25", 0) register_lighteval_task(LIGHTEVAL_TASKS, "custom", "gpqa", "gpqa:diamond", 0) register_lighteval_task(LIGHTEVAL_TASKS, "extended", "lcb", "lcb:codegeneration", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "extended", "lcb_v4", "lcb:codegeneration_v4", 0) def get_lighteval_tasks(): From c7733d3fa45b1579d6114cff469d38c8bb48a999 Mon Sep 17 00:00:00 2001 From: Marco Z Date: Sat, 1 Mar 2025 15:08:30 +0100 Subject: [PATCH 072/137] update makefile and readme (#449) Co-authored-by: Marco Zocca --- Makefile | 8 ++++++++ README.md | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/Makefile b/Makefile index cdeba5511..c775ed66f 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,14 @@ export PYTHONPATH = src check_dirs := src tests + +# dev dependencies +install: + uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip + uv pip install vllm==0.7.2 + uv pip install setuptools + GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" + style: ruff format --line-length 119 --target-version py310 $(check_dirs) setup.py isort $(check_dirs) setup.py diff --git a/README.md b/README.md index f778815b8..b520ee022 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,10 @@ To run the code in this project, first, create a Python virtual environment usin To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/getting-started/installation/). +> [!NOTE] +> As a shortcut, run `make install` to setup development libraries (spelled out below). Afterwards if everything is setup correctly and you have a functioning CUDA, you can install `flash-attn` and try out the Open-R1 models. 
+ + ```shell uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip ``` From 45ccf60109974fa4674c712409ecd444f79a9222 Mon Sep 17 00:00:00 2001 From: lewtun Date: Mon, 3 Mar 2025 13:54:58 +0100 Subject: [PATCH 073/137] Remove dataset_configs from YAML recipes (#461) --- recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml | 2 -- .../sft/config_openr1_math.yaml | 2 -- recipes/OpenR1-Qwen-7B/sft/config.yaml | 2 -- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml | 2 -- recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 4 +--- recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml | 2 -- recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml | 3 +-- recipes/SmolLM2-1.7B-Instruct/sft/config.yaml | 4 +--- recipes/SmolLM2-1.7B/sft/config.yaml | 2 -- 9 files changed, 3 insertions(+), 20 deletions(-) diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml index ecd53336b..13c4f668b 100644 --- a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml @@ -8,8 +8,6 @@ attn_implementation: flash_attention_2 # We edit the DeepSeek chat template to ensure (a) the reasoning block within and is included in the completion and (b) the tag is not part of the prefill so that the format reward works chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" dataset_name: open-r1/OpenR1-Math-220k -dataset_configs: -- default system_prompt: "You are a helpful AI Assistant that provides well-reasoned 
and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config diff --git a/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml b/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml index 4c52ab404..417b86063 100644 --- a/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml +++ b/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml @@ -9,8 +9,6 @@ attn_implementation: flash_attention_2 # Data training arguments # dataset_name: yentinglin/s1K-1.1-trl-format dataset_name: yentinglin/OpenR1-Math-220k-trl-format -dataset_configs: -- all preprocessing_num_workers: 8 # SFT trainer config diff --git a/recipes/OpenR1-Qwen-7B/sft/config.yaml b/recipes/OpenR1-Qwen-7B/sft/config.yaml index ddb77364f..812469f00 100644 --- a/recipes/OpenR1-Qwen-7B/sft/config.yaml +++ b/recipes/OpenR1-Qwen-7B/sft/config.yaml @@ -8,8 +8,6 @@ attn_implementation: sdpa # Data training arguments dataset_name: open-r1/OpenR1-Math-220k -dataset_configs: -- default dataset_num_proc: 48 #SFT hyperparam diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index a6cb77d85..156ccf160 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -6,8 +6,6 @@ attn_implementation: flash_attention_2 # Data training arguments dataset_name: open-r1/OpenR1-Math-220k -dataset_configs: -- default system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 783a4d2a1..135f40aa1 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -5,9 +5,7 @@ torch_dtype: bfloat16 attn_implementation: flash_attention_2 # Data training arguments -dataset_name: open-r1/verifiable-coding-problems-python-10k -dataset_configs: -- default +dataset_name: open-r1/verifiable-coding-problems-python system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index 2355a8305..8f9af88a9 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -6,8 +6,6 @@ attn_implementation: flash_attention_2 # Data training arguments dataset_name: open-r1/OpenR1-Math-220k -dataset_configs: -- default dataset_num_proc: 48 # SFT trainer config diff --git a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml index 762f77164..d1c3d63c4 100644 --- a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml +++ b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml @@ -6,8 +6,7 @@ attn_implementation: flash_attention_2 # Data training arguments dataset_name: DigitalLearningGmbH/MATH-lighteval -dataset_configs: -- train +dataset_config: default system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." # GRPO trainer config diff --git a/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml b/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml index 6e2242f3d..6f0bc9498 100644 --- a/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml +++ b/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml @@ -6,9 +6,7 @@ torch_dtype: bfloat16 attn_implementation: sdpa # Data training arguments -dataset_name: open-r1/OpenR1-Math-220k -dataset_configs: -- default +dataset_name: open-r1/OpenR1-Math-220k dataset_num_proc: 48 #SFT hyperparam diff --git a/recipes/SmolLM2-1.7B/sft/config.yaml b/recipes/SmolLM2-1.7B/sft/config.yaml index 10df6694d..4a1f2d68c 100644 --- a/recipes/SmolLM2-1.7B/sft/config.yaml +++ b/recipes/SmolLM2-1.7B/sft/config.yaml @@ -7,8 +7,6 @@ attn_implementation: sdpa # Data training arguments dataset_name: open-r1/OpenR1-Math-220k -dataset_configs: -- default dataset_num_proc: 48 #SFT hyperparam From 4b4c377f27dacbf306fb41e1ce54c9076aa03f58 Mon Sep 17 00:00:00 2001 From: A-transformer Date: Mon, 3 Mar 2025 18:33:23 +0400 Subject: [PATCH 074/137] fix typo (#459) fix typo --- src/open_r1/rewards.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 856d08956..99d5d9b5a 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -107,14 +107,14 @@ def reasoning_steps_reward(completions, **kwargs): completion_contents = [completion[0]["content"] for completion in completions] matches = [len(re.findall(pattern, content)) for content in completion_contents] - # Magic nubmer 3 to encourage 3 steps and more, otherwise partial reward + # Magic number 3 to encourage 3 steps and more, otherwise partial reward return [min(1.0, count / 3) for count in matches] def len_reward(completions: list[Dict[str, str]], solution: list[str], **kwargs) -> float: """Compute length-based rewards to discourage overthinking and promote token efficiency. 
- Taken from from the Kimi 1.5 tech report: https://arxiv.org/abs/2501.12599 + Taken from the Kimi 1.5 tech report: https://arxiv.org/abs/2501.12599 Args: completions: List of model completions From 44cb13d4bab20e2f112f7398e186eb2d7e4049ef Mon Sep 17 00:00:00 2001 From: lewtun Date: Mon, 3 Mar 2025 17:25:30 +0100 Subject: [PATCH 075/137] Fix vLLM (#464) --- Makefile | 3 ++- README.md | 2 +- slurm/train.slurm | 12 +++++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index c775ed66f..2ca457a33 100644 --- a/Makefile +++ b/Makefile @@ -8,9 +8,10 @@ check_dirs := src tests # dev dependencies install: - uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --upgrade pip + uv venv openr1 --python 3.11 && . openr1/bin/activate && uv pip install --upgrade pip uv pip install vllm==0.7.2 uv pip install setuptools + uv pip install flash-attn --no-build-isolation GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" style: diff --git a/README.md b/README.md index b520ee022..264bf98c6 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ To install `uv`, follow the [UV Installation Guide](https://docs.astral.sh/uv/ge > [!NOTE] -> As a shortcut, run `make install` to setup development libraries (spelled out below). Afterwards if everything is setup correctly and you have a functioning CUDA, you can install `flash-attn` and try out the Open-R1 models. +> As a shortcut, run `make install` to setup development libraries (spelled out below). Afterwards, if everything is setup correctly you can try out the Open-R1 models. ```shell diff --git a/slurm/train.slurm b/slurm/train.slurm index c10a2a237..c1209e88a 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -32,10 +32,16 @@ WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) # Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') -USE_VLLM=$(grep 'use_vllm:\s*true' $CONFIG_FILE) # Match "use_vllm: true" (with optional whitespace) -if [ -n "$USE_VLLM" ]; then # Check if USE_VLLM is *not* empty (found) - WORLD_SIZE=$(($WORLD_SIZE-1)) +# Check if we are running vLLM during training to adjust the world size +if grep -q 'use_vllm:\s*true' "$CONFIG_FILE"; then + USE_VLLM="true" +else + USE_VLLM="false" +fi + +if [[ "$USE_VLLM" == "true" ]]; then + WORLD_SIZE=$(($WORLD_SIZE - 1)) fi # Split the string into individual arguments From 299446902da48ac567267d2b5a7a274267121622 Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 4 Mar 2025 09:22:01 +0100 Subject: [PATCH 076/137] Enable decontamination on dataset configs (#460) --- scripts/decontaminate.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/decontaminate.py b/scripts/decontaminate.py index 14feb5bd6..0cb2a7a93 100644 --- a/scripts/decontaminate.py +++ b/scripts/decontaminate.py @@ -18,8 +18,10 @@ It uses the same approach presented in https://arxiv.org/abs/2501.19393, as found in: https://github.com/simplescaling/s1/blob/main/data/decontaminate_util.py +Usage: + python scripts/decontaminate.py \ - --dataset "open-r1/verifiable-coding-problems-python" \ + --dataset open-r1/verifiable-coding-problems-python \ --split train \ --ngram_size 8 \ --problem_column problem \ @@ -71,6 +73,7 @@ def build_ngram_single(document: str, ngram_size: int = 8) -> 
set[str]: parser = argparse.ArgumentParser() parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to check for contamination.") + parser.add_argument("--config", type=str, default=None, help="Name of the dataset config to load.") parser.add_argument("--split", type=str, default="train", help="Split to check for contamination, defaults to `train`.") parser.add_argument("--ngram_size", type=int, default=8, help="Size of n-grams to build, defaults to 8.") parser.add_argument( @@ -92,7 +95,7 @@ def build_ngram_single(document: str, ngram_size: int = 8) -> set[str]: from datasets import load_dataset, Dataset # Load the dataset to check for contamination - ds = load_dataset(args.dataset, split=args.split) + ds = load_dataset(args.dataset, name=args.config, split=args.split) eval_datasets = { "aime_2024": (load_dataset("HuggingFaceH4/aime_2024", split="train"), "problem"), @@ -138,5 +141,6 @@ def cleanup(dataset: Dataset) -> Dataset: ds = cleanup(ds) new_ds_name = args.new_dataset_name or f"{args.dataset}_decontaminated" - ds.push_to_hub(new_ds_name, split="train", private=False) - print(f"Decontaminated dataset: {new_ds_name}") + config_name = args.config if args.config is not None else "default" + url = ds.push_to_hub(new_ds_name, config_name=config_name, split="train") + print(f"Decontaminated dataset: {url}") From a465641ec7c7c14e41da970e492b82d64f78e6c2 Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 4 Mar 2025 14:25:58 +0100 Subject: [PATCH 077/137] Fix make evaluate (#470) --- Makefile | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 2ca457a33..f6fba0bb7 100644 --- a/Makefile +++ b/Makefile @@ -37,17 +37,14 @@ evaluate: fi \ ),)) $(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \ - MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8" && \ - lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \ - --custom-tasks src/open_r1/evaluate.py \ - --use-chat-template \ - --system-prompt="Please reason step by step, and put your final answer within \boxed{}." 
\ - --output-dir data/evals/$(MODEL) - -# Example usage: -# Single GPU: -# make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 -# Data parallel: -# make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=data NUM_GPUS=8 -# Tensor parallel: -# make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLEL=tensor NUM_GPUS=8 + MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \ + if [ "$(TASK)" = "lcb" ]; then \ + lighteval vllm $$MODEL_ARGS "extended|lcb:codegeneration|0|0" \ + --use-chat-template \ + --output-dir data/evals/$(MODEL); \ + else \ + lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \ + --custom-tasks src/open_r1/evaluate.py \ + --use-chat-template \ + --output-dir data/evals/$(MODEL); \ + fi From 3b5d6603bf67e89b2e49c9dd61822af517ce9990 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 5 Mar 2025 20:23:57 +0100 Subject: [PATCH 078/137] Add citation and acknowledgements (#481) * Update README.md * Update README.md * Update README.md --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 264bf98c6..40d7a6b7b 100644 --- a/README.md +++ b/README.md @@ -578,3 +578,21 @@ sbatch slurm/generate.slurm \ ## Contributing Contributions are welcome. Please refer to https://github.com/huggingface/open-r1/issues/23. + +## Acknowledgements + +This project is built with the collective efforts of many groups and individuals in the open AI community. We are especially grateful to the vLLM and SGLang teams for creating high-performance tooling to scale the rollouts of GRPO. We also thank the teams at [OpenThoughts](https://www.open-thoughts.ai), [Prime Intellect](https://www.primeintellect.ai), and [General Reasoning](https://gr.inc) for creating and sharing high-quality datasets for reasoning. + +## Citation + +If you find this project is useful in your own work, please consider citing as follows: + +``` +@misc{openr1, + title = {Open R1: A fully open reproduction of DeepSeek-R1}, + url = {https://github.com/huggingface/open-r1}, + author = {Hugging Face}, + month = {January}, + year = {2025} +} +``` From 6660a477eca71bf8d94c59cd2e458cf0ff6e1f80 Mon Sep 17 00:00:00 2001 From: A-transformer Date: Thu, 6 Mar 2025 13:45:50 +0400 Subject: [PATCH 079/137] Remove unimplemented 'format_deepseek' from reward_funcs documentation (#480) Removed 'format_deepseek' from GRPOScriptArguments help string as it is not implemented in REWARD_FUNCS_REGISTRY and format_reward already covers DeepSeek-style formatting needs. --- src/open_r1/grpo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index d5f023e02..83b7e3e31 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -52,7 +52,7 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', tag_count', 'code', 'code_format'. + List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', tag_count', 'code', 'code_format'. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. 
cosine_max_value_wrong (`float`): @@ -70,7 +70,7 @@ class GRPOScriptArguments(ScriptArguments): reward_funcs: list[str] = field( default_factory=lambda: ["accuracy", "format", "tag_count"], metadata={ - "help": "List of reward functions. Possible values: 'accuracy', 'format', 'format_deepseek', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', tag_count', 'code', 'code_format'" + "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', tag_count', 'code', 'code_format'" }, ) cosine_min_value_wrong: float = field( From 88a1b002c120533f2a489e2f5f177a72b19872e4 Mon Sep 17 00:00:00 2001 From: A-transformer Date: Fri, 7 Mar 2025 18:44:15 +0400 Subject: [PATCH 080/137] missing ' (#479) missing ' --- src/open_r1/grpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 83b7e3e31..4ac788ff5 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -52,7 +52,7 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', tag_count', 'code', 'code_format'. + List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'tag_count', 'code', 'code_format'. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. cosine_max_value_wrong (`float`): From 9890a8d9921ecf27784a18896f3b974b357df903 Mon Sep 17 00:00:00 2001 From: Agus Date: Sat, 8 Mar 2025 15:41:15 +0100 Subject: [PATCH 081/137] Run e2b async sandbox by default (#484) * Run e2b async sandbox by default * Remove unnecessary rewards * Fix run_sync variable * Run linters * Let only async version * Remove unused Sandbox --------- Co-authored-by: agus --- src/open_r1/rewards.py | 129 ++++++++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 48 deletions(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 99d5d9b5a..54e072132 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -1,5 +1,6 @@ """Reward functions for GRPO training.""" +import asyncio import json import math import re @@ -13,7 +14,7 @@ if is_e2b_available(): from dotenv import load_dotenv - from e2b_code_interpreter import Sandbox + from e2b_code_interpreter import AsyncSandbox load_dotenv() @@ -327,62 +328,54 @@ def code_reward(completions, **kwargs) -> list[float]: "`pip install e2b-code-interpreter` and add an API key to a `.env` file." 
) - rewards = [] # TODO: add support for other languages in E2B: https://e2b.dev/docs/code-interpreting/supported-languages - try: - """Returns a reward function that evaluates code snippets in a sandbox.""" - evaluation_script_template = """ - import subprocess - import json - - def evaluate_code(code, test_cases): - passed = 0 - total = len(test_cases) - exec_timeout = 5 - - for case in test_cases: - process = subprocess.run( - ["python3", "-c", code], - input=case["input"], - text=True, - capture_output=True, - timeout=exec_timeout - ) + """Returns a reward function that evaluates code snippets in a sandbox.""" + evaluation_script_template = """ + import subprocess + import json + + def evaluate_code(code, test_cases): + passed = 0 + total = len(test_cases) + exec_timeout = 5 + + for case in test_cases: + process = subprocess.run( + ["python3", "-c", code], + input=case["input"], + text=True, + capture_output=True, + timeout=exec_timeout + ) - if process.returncode != 0: # Error in execution - continue + if process.returncode != 0: # Error in execution + continue - output = process.stdout.strip() - if output.strip() == case["output"].strip(): - passed += 1 + output = process.stdout.strip() + if output.strip() == case["output"].strip(): + passed += 1 - success_rate = (passed / total) - return success_rate + success_rate = (passed / total) + return success_rate - code_snippet = {code} - test_cases = json.loads({test_cases}) + code_snippet = {code} + test_cases = json.loads({test_cases}) + + evaluate_code(code_snippet, test_cases) + """ + code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] + verification_info = kwargs["verification_info"] + scripts = [ + evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) + for code, info in zip(code_snippets, verification_info) + ] + try: + rewards = run_async_from_sync(scripts, verification_info["language"]) - evaluate_code(code_snippet, test_cases) - """ - code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] - verification_info = kwargs["verification_info"] - scripts = [ - evaluation_script_template.format( - code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"])) - ) - for code, info in zip(code_snippets, verification_info) - ] - with Sandbox(timeout=30, request_timeout=3) as sbx: - for script in scripts: - execution = sbx.run_code(script, language=verification_info["language"]) - try: - output = float(execution.text) - except (TypeError, ValueError): - output = 0.0 - rewards.append(output) except Exception as e: print(f"Error from E2B executor: {e}") rewards = [0.0] * len(completions) + return rewards @@ -400,3 +393,43 @@ def code_format_reward(completions, **kwargs): return [1.0 if match else 0.0 for match in matches] return code_format_reward + + +def run_async_from_sync(scripts: list[str], language: str) -> list[float]: + """Function wrapping the `run_async` function.""" + # Create a new event loop and set it + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + # Run the async function and get the result + rewards = loop.run_until_complete(run_async(scripts, language)) + finally: + loop.close() + + return rewards + + +async def run_async(scripts: list[str], language: str) -> list[float]: + # Create the sandbox by hand, currently there's no context manager for this version + sbx = await AsyncSandbox.create(timeout=30, request_timeout=3) + + # Create a list of tasks for running 
scripts concurrently + tasks = [run_script(sbx, script) for script in scripts] + + # Wait for all tasks to complete and gather their results as they finish + results = await asyncio.gather(*tasks) + rewards = list(results) # collect results + + # Kill the sandbox after all the tasks are complete + await sbx.kill() + + return rewards + + +async def run_script(sbx, script: str, language: str) -> float: + execution = await sbx.run_code(script, language=language) + try: + return float(execution.text) + except (TypeError, ValueError): + return 0.0 From d5922af8ced85b20edb24b5b170186a1f6d30418 Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 13 Mar 2025 19:08:34 +0100 Subject: [PATCH 082/137] Add OlympicCoder recipes (#505) * Add OlympicCoder recipes * Fix configs * Add FSDP config --- README.md | 5 ++ .../OlympicCoder-32B/sft/config_v00.00.yaml | 49 +++++++++++++++++++ .../OlympicCoder-7B/sft/config_v00.00.yaml | 46 +++++++++++++++++ recipes/README.md | 16 +++++- recipes/accelerate_configs/fsdp.yaml | 27 ++++++++++ 5 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 recipes/OlympicCoder-32B/sft/config_v00.00.yaml create mode 100644 recipes/OlympicCoder-7B/sft/config_v00.00.yaml create mode 100644 recipes/accelerate_configs/fsdp.yaml diff --git a/README.md b/README.md index 40d7a6b7b..270e4cafd 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,11 @@ We will use the DeepSeek-R1 [tech report](https://github.com/deepseek-ai/DeepSee +## News 🗞️ + +* **⚡️ [2025/03/11] [(update #3)](https://huggingface.co/blog/open-r1/update-3):** We release the [**CodeForces-CoTs**](https://huggingface.co/datasets/open-r1/codeforces-cots) dataset of 10k competitive programming problems and 100k solutions distilled from R1. We also release IOI24: a new benchmark of _very_ hard problems from international olympiads. A 7B Qwen model trained on CodeForces-CoTs can outperform Claude 3.7 Sonnet on IOI24, while a 32B model can outperform R1 itself. +* **∞ [2025/02/10] [(update #2)](https://huggingface.co/blog/open-r1/update-2):** We release the [**OpenR1-Math-220k**](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) dataset of 220k traces distilled from R1 on a new version of NuminaMath. Models trained on this dataset match the performance of DeepSeek's distilled ones. +* **🔥 [2025/02/02] [(update #1)](https://huggingface.co/blog/open-r1/update-1):** We implement the first parts of the [training](https://github.com/huggingface/open-r1?tab=readme-ov-file#training-models), [inference](https://github.com/huggingface/open-r1?tab=readme-ov-file#data-generation), and [evaluation](https://github.com/huggingface/open-r1?tab=readme-ov-file#reproducing-deepseeks-evaluation-results) pipelines. Let's go! 
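+The asynchronous code reward introduced earlier in this series fans out one sandbox execution per completion and gathers the scores in input order. The pattern can be sketched without the E2B dependency by swapping in a hypothetical local stub for the sandbox call:
+
+```python
+import asyncio
+
+
+async def run_script(script: str) -> float:
+    # Stand-in for AsyncSandbox.run_code: pretend the last line of each script is its pass rate
+    await asyncio.sleep(0.1)  # simulate execution latency
+    try:
+        return float(script.strip().splitlines()[-1])
+    except (ValueError, IndexError):
+        return 0.0
+
+
+async def run_all(scripts: list[str]) -> list[float]:
+    # One task per script; asyncio.gather preserves the input order of the results
+    return list(await asyncio.gather(*(run_script(s) for s in scripts)))
+
+
+if __name__ == "__main__":
+    demo = ["print('check')\n0.5", "1.0", "not-a-number"]
+    print(asyncio.run(run_all(demo)))  # -> [0.5, 1.0, 0.0]
+```
+
+In the real reward function the stub is the sandbox's `run_code` call, and a failed or non-numeric execution simply scores 0.0.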
## Installation diff --git a/recipes/OlympicCoder-32B/sft/config_v00.00.yaml b/recipes/OlympicCoder-32B/sft/config_v00.00.yaml new file mode 100644 index 000000000..1d6a2ae69 --- /dev/null +++ b/recipes/OlympicCoder-32B/sft/config_v00.00.yaml @@ -0,0 +1,49 @@ +# Config for 16 nodes of 8 H100s with FSDP1 +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-32B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/codeforces-cots +dataset_config: solutions_decontaminated +dataset_num_proc: 12 + +# SFT trainer config +bf16: true +do_eval: false +eval_strategy: 'no' +gradient_accumulation_steps: 1 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_always_push: true +hub_model_id: OlympicCoder-32B +hub_strategy: every_save +learning_rate: 4.0e-05 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +packing: false +max_grad_norm: 0.2 +max_length: 22528 # we were unable to train at 32k due to OOM. See https://github.com/huggingface/transformers/issues/35983 for context parallelism support. +max_steps: -1 +num_train_epochs: 10 +optim: paged_adamw_8bit +output_dir: data/OlympicCoder-32B +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +push_to_hub: true +report_to: +- wandb +save_only_model: true # needed to bypass FSDP errors with saving paged optimizers +save_strategy: epoch +save_total_limit: 1 +seed: 42 +use_liger: false # fails on multi-node +warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml new file mode 100644 index 000000000..69e2676b4 --- /dev/null +++ b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml @@ -0,0 +1,46 @@ +# Config for 1 node of 8 H100s with DeepSpeed ZeRO-3 +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/codeforces-cots +dataset_config: solutions_decontaminated +dataset_num_proc: 48 + +# SFT trainer config +bf16: true +do_eval: false +eval_strategy: 'no' +gradient_accumulation_steps: 8 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/OlympicCoder-7B +hub_strategy: every_save +learning_rate: 1.0e-05 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +packing: false +max_grad_norm: 0.2 +max_length: 32768 +max_steps: -1 +num_train_epochs: 10 +output_dir: data/OlympicCoder-7B +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 2 +push_to_hub: true +report_to: +- wandb +save_strategy: epoch +save_total_limit: 1 +seed: 42 +use_liger: true +warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/README.md b/recipes/README.md index a42ab27a1..a9e97ca17 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -1 +1,15 @@ -**TODO:** we will add more recipes in the future, just like alignment-handbook, this is the purpose of adding recipes to this project. 
\ No newline at end of file +# Post-training recipes + +## OlympicCoder + +To train the OlympicCoder models, run: + +``` +# 7B +sbatch --nodes=1 slurm/train.slurm OlympicCoder-7B sft v00.00 zero3 + +# 32B +sbatch --nodes=16 slurm/train.slurm OlympicCoder-32B sft v00.00 fsdp +``` + +Note that we found it necessary to switch to FSDP1 and paged AdamW 8-bit for the 32B model in order to fit the largest possible context size. \ No newline at end of file diff --git a/recipes/accelerate_configs/fsdp.yaml b/recipes/accelerate_configs/fsdp.yaml new file mode 100644 index 000000000..938427c90 --- /dev/null +++ b/recipes/accelerate_configs/fsdp.yaml @@ -0,0 +1,27 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +enable_cpu_affinity: false +fsdp_config: + fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610 + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: true + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file From 5dcfae89795829c602495e2bc360a024212dcd92 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Thu, 13 Mar 2025 22:54:15 +0100 Subject: [PATCH 083/137] Fixes bug with async code reward (#504) * adds slow test for code reward * fixes bug in setting language and the output parsing * style * removed redundant comment * removed exeception as e * remove rewards * removed whitespace * more whitespace * remove need for loop with asyncio.run * nits * fix type error with e2n AsyncSandbox --- Makefile | 5 ++++- src/open_r1/rewards.py | 31 +++++++++++++++++++---------- tests/slow/test_code_reward.py | 36 ++++++++++++++++++++++++++++++++++ tests/test_rewards.py | 15 ++++++++++++++ 4 files changed, 76 insertions(+), 11 deletions(-) create mode 100644 tests/slow/test_code_reward.py diff --git a/Makefile b/Makefile index f6fba0bb7..848deb69f 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,10 @@ quality: flake8 --max-line-length 119 $(check_dirs) setup.py test: - pytest -sv tests/ + pytest -sv --ignore=tests/slow/ tests/ + +slow_test: + pytest -sv -vv tests/slow/ # Evaluation diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 54e072132..e77a49a3f 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -17,6 +17,8 @@ from e2b_code_interpreter import AsyncSandbox load_dotenv() +else: + AsyncSandbox = None def accuracy_reward(completions, solution, **kwargs): @@ -352,7 +354,13 @@ def evaluate_code(code, test_cases): continue output = process.stdout.strip() - if output.strip() == case["output"].strip(): + + # TODO: implement a proper validator to compare against ground truth. For now we just check for exact string match on each line of stdout. 
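+            # Note: zip() stops at the shorter of the two line lists, so extra or missing
+            # trailing lines in the program output are not penalized by this check.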
+ all_correct = True + for line1, line2 in zip(output.split('\\n'), case['output'].split('\\n')): + all_correct = all_correct and line1.strip() == line2.strip() + + if all_correct: passed += 1 success_rate = (passed / total) @@ -369,8 +377,13 @@ def evaluate_code(code, test_cases): evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) for code, info in zip(code_snippets, verification_info) ] + + language = verification_info[0]["language"] + + if not all(v["language"] == language for v in verification_info): + raise ValueError("All verification_info must have the same language", verification_info) try: - rewards = run_async_from_sync(scripts, verification_info["language"]) + rewards = run_async_from_sync(scripts, language) except Exception as e: print(f"Error from E2B executor: {e}") @@ -398,14 +411,12 @@ def code_format_reward(completions, **kwargs): def run_async_from_sync(scripts: list[str], language: str) -> list[float]: """Function wrapping the `run_async` function.""" # Create a new event loop and set it - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: # Run the async function and get the result - rewards = loop.run_until_complete(run_async(scripts, language)) - finally: - loop.close() + rewards = asyncio.run(run_async(scripts, language)) + except Exception as e: + print(f"Error from E2B executor async: {e}") + raise e return rewards @@ -415,7 +426,7 @@ async def run_async(scripts: list[str], language: str) -> list[float]: sbx = await AsyncSandbox.create(timeout=30, request_timeout=3) # Create a list of tasks for running scripts concurrently - tasks = [run_script(sbx, script) for script in scripts] + tasks = [run_script(sbx, script, language) for script in scripts] # Wait for all tasks to complete and gather their results as they finish results = await asyncio.gather(*tasks) @@ -427,7 +438,7 @@ async def run_async(scripts: list[str], language: str) -> list[float]: return rewards -async def run_script(sbx, script: str, language: str) -> float: +async def run_script(sbx: AsyncSandbox, script: str, language: str) -> float: execution = await sbx.run_code(script, language=language) try: return float(execution.text) diff --git a/tests/slow/test_code_reward.py b/tests/slow/test_code_reward.py new file mode 100644 index 000000000..16337ec0b --- /dev/null +++ b/tests/slow/test_code_reward.py @@ -0,0 +1,36 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
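The output-parsing fix in this patch replaces the old exact whole-string match with a per-line comparison of stdout against the expected output. As a standalone illustration (the helper name below is ours, not part of the patch; the real check runs inside the sandboxed evaluation script template):

```python
# Minimal sketch of the per-line stdout comparison introduced in this patch.
# Note that zip() stops at the shorter of the two outputs, mirroring the patched
# template; a stricter validator would also compare the number of lines.
def outputs_match(actual: str, expected: str) -> bool:
    return all(
        line1.strip() == line2.strip()
        for line1, line2 in zip(actual.split("\n"), expected.split("\n"))
    )


print(outputs_match("1 2 \n3", "1 2\n3\n"))  # True: trailing whitespace is ignored
print(outputs_match("1 2\n4", "1 2\n3"))     # False: second line differs
```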
+ + +import unittest + +from datasets import load_dataset + +from open_r1.rewards import code_reward + + +class TestCodeRewards(unittest.TestCase): + def test_code_reward(self): + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python-10k") + NUM_SAMPLES = 20 + samples = code_dataset["train"].select(range(NUM_SAMPLES)) + test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] + reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]} + rewards = code_reward(test_completions, **reward_kwargs) + print(rewards) + assert rewards == [1.0] * NUM_SAMPLES + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 103b9981a..5a77bf0d1 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -1,3 +1,18 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import unittest from open_r1.rewards import ( From d436b7b9c0e9205a2d329596273ca0600a794f54 Mon Sep 17 00:00:00 2001 From: koskotheim <98971550+koskotheim@users.noreply.github.com> Date: Sat, 15 Mar 2025 12:56:14 -0700 Subject: [PATCH 084/137] fix typo (#507) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 270e4cafd..0ee4b2afb 100644 --- a/README.md +++ b/README.md @@ -467,7 +467,7 @@ python scripts/run_benchmarks.py --model-id {model_id} --benchmarks gpqa We are able to reproduce Deepseek's reported results on the LiveCodeBench code generation benchmark within ~1-3 standard deviations: -| Model | LiveCodeBench (🤗 LightEval) | GPQA Diamond (DeepSeek Reported) | +| Model | LiveCodeBench (🤗 LightEval) | LiveCodeBench (DeepSeek Reported) | |:------------------------------|:----------------------------:|:--------------------------------:| | DeepSeek-R1-Distill-Qwen-1.5B | 16.3 | 16.9 | | DeepSeek-R1-Distill-Qwen-7B | 36.6 | 37.6 | From 7835979801de21a2132fffc6f082fca18b2b1db3 Mon Sep 17 00:00:00 2001 From: Guilherme Penedo Date: Fri, 21 Mar 2025 08:48:00 +0100 Subject: [PATCH 085/137] adds support for running GRPO on IOI problems (#495) * adds support for running GRPO on IOI problems * nit * bugfixes + recipe * added piston info and readme changes * readme updates * run isort to fix checks * Update src/open_r1/rewards.py Co-authored-by: Edward Beeching * adding ioi test * fix merge issues with python slow tests * style * generalize piston workers * generalize readme * fix extract code * finalize slow tests --------- Co-authored-by: Edward Beeching Co-authored-by: edbeeching --- README.md | 15 + .../grpo/config_demo_code_ioi.yaml | 62 ++++ slurm/README.md | 2 +- slurm/piston/README.md | 67 ++++ slurm/piston/launch_piston_workers.sh | 16 + slurm/piston/launch_single_piston.sh | 31 ++ src/open_r1/grpo.py | 15 +- src/open_r1/rewards.py | 52 ++- src/open_r1/utils/ioi/__init__.py | 12 + src/open_r1/utils/ioi/piston_client.py | 250 +++++++++++++++ src/open_r1/utils/ioi/scoring.py | 298 
++++++++++++++++++ src/open_r1/utils/ioi/utils.py | 52 +++ tests/slow/test_code_reward.py | 20 +- 13 files changed, 884 insertions(+), 8 deletions(-) create mode 100644 recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml create mode 100644 slurm/piston/README.md create mode 100755 slurm/piston/launch_piston_workers.sh create mode 100755 slurm/piston/launch_single_piston.sh create mode 100644 src/open_r1/utils/ioi/__init__.py create mode 100644 src/open_r1/utils/ioi/piston_client.py create mode 100644 src/open_r1/utils/ioi/scoring.py create mode 100644 src/open_r1/utils/ioi/utils.py diff --git a/README.md b/README.md index 0ee4b2afb..ce9bc5651 100644 --- a/README.md +++ b/README.md @@ -220,6 +220,21 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml ``` +#### IOI problems + +We provide a `ioi_code_reward` reward function for executing problems from [IOI](https://hf.co/datasets/open-r1/ioi) using [piston](https://github.com/engineer-man/piston). + +To get piston workers running, see [slurm/piston/README.md](./slurm/piston/README.md). +Set your environment variable `PISTON_ENDPOINTS` to `slurm` or to a list of piston worker endpoints. + +See the [example recipe](./recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml) for how to use the reward function: +```shell +ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ + --num_processes=7 src/open_r1/grpo.py \ + --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml +``` + + #### Data decontamination Following [s1: Simple test-time scaling](https://arxiv.org/abs/2501.19393) the data can be decontaminated using the script at: [scripts/decontaminate.py](./scripts/decontaminate.py), which decontaminates a dataset using 8-grams and deduplicate the data. Sample run: diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml new file mode 100644 index 000000000..f166cf053 --- /dev/null +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml @@ -0,0 +1,62 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/ioi +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.01 +bf16: true +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.9 +do_eval: false +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-1.5B-Open-R1-Code-GRPO +hub_strategy: every_save +learning_rate: 5.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +max_prompt_length: 1024 +max_completion_length: 2048 +max_steps: 500 +num_generations: 14 +num_train_epochs: 1 +output_dir: data/Qwen2.5-1.5B-Open-R1-Code-GRPO +overwrite_output_dir: true +per_device_train_batch_size: 16 +push_to_hub: true +report_to: +- wandb +save_strategy: "steps" +save_steps: 50 +save_total_limit: 1 +seed: 42 +temperature: 1.0 +warmup_ratio: 0.03 +# ioi specific config +code_language: cpp +reward_funcs: +- ioi_code +- code_format +- format +reward_weights: +- 1.0 +- 0.1 +- 0.1 +# for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating +# otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions +code_eval_test_batch_size: 3 \ No newline at end of file diff --git a/slurm/README.md b/slurm/README.md index 101064594..029d633d3 100644 --- a/slurm/README.md +++ b/slurm/README.md @@ -5,7 +5,7 @@ conda create -n sglang124 python=3.11 conda activate sglang124 -pip install torch=2.5.1 --index-url https://download.pytorch.org/whl/cu124 +pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124 pip install sgl-kernel --force-reinstall --no-deps pip install "sglang[all]>=0.4.2.post4" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/ diff --git a/slurm/piston/README.md b/slurm/piston/README.md new file mode 100644 index 000000000..0a45e1cff --- /dev/null +++ b/slurm/piston/README.md @@ -0,0 +1,67 @@ +# Piston workers (slurm) + +We have built a [piston](https://github.com/engineer-man/piston) package to run IOI problems. + +To launch a fleet of piston workers on a slurm cluster, you can adapt the paths in `launch_piston_workers.sh` and `launch_single_piston.sh` and run: +```bash +slurm/piston/launch_piston_workers.sh (number of workers to launch) +``` + +This command will launch a slurm job for each worker, which will be called `piston-worker-`, where `` is the port where the worker will be listening. + +## First time setup +You will need to install the [IOI package](https://github.com/guipenedo/piston/tree/master/packages/cms_ioi/1.0.0) in the workers. +1. Launch a single worker: +```bash +slurm/piston/launch_piston_workers.sh 1 +``` + +2. Assuming it's running on `ip-10-53-86-146:1234`, send the package install request: +```bash +curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: application/json" -d '{"language": "cms_ioi", "version": "1.0.0"}' +``` + +3. You can now launch more workers and due to the shared mounted packages directory, they should already have the package installed. + +To have the main script find the workers automatically, you can export the following environment variable: +```bash +export PISTON_ENDPOINTS=slurm +``` +Alternatively your can add `PISTON_ENDPOINTS=slurm` to your .env file. 
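As a quick sanity check after the install step above (illustrative; substitute your own worker's host and port for the example address), you can list a worker's runtimes to confirm the `cms_ioi` package was picked up:

```bash
# Should include a cms_ioi entry once the package install has completed
curl http://ip-10-53-86-146:1234/api/v2/runtimes
```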
+ +You can also change `PISTON_MAX_REQUESTS_PER_ENDPOINT`, which tries to limit how many simultaneous requests each worker will handle (1 by default). Keep in mind that this is a local limit and in distributed setups, as there is no global limit, workers might sometimes be overwhelmed when some processes hit the same worker. + +If you would like to adapt the code to run without piston, please see the [ioi repo](https://github.com/huggingface/ioi). + +# Piston workers (local docker) +This will launch a single worker in a docker container. Consider launching multiple workers for better scalability. Replace 2000 with the port you want to use. +Make sure to change `/path/to/local/packages` to the path you want to persist for package installs. + +```bash +docker run -d \ + --name piston_worker \ + -v /path/to/local/packages:/piston/packages \ + -e PORT=2000 \ + -e PISTON_COMPILE_TIMEOUT=60000 \ + -e PISTON_RUN_TIMEOUT=60000 \ + -e PISTON_OUTPUT_MAX_SIZE=1000000000 \ + -e PISTON_MAX_FILE_SIZE=1000000000 \ + -e PISTON_DISABLE_NETWORKING=true \ + -e PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index \ + -p 2000:2000 \ + --entrypoint /bin/bash \ + ghcr.io/engineer-man/piston@sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a \ + -c "sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js && \ + sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js && \ + node src" +``` + +Install the package: +```bash +curl -X POST http://localhost:2000/api/v2/packages -H "Content-Type: application/json" -d '{"language": "cms_ioi", "version": "1.0.0"}' +``` + +Remember to set `PISTON_ENDPOINTS`: +```bash +export PISTON_ENDPOINTS=http://localhost:2000/api/v2,http://localhost:2001/api/v2,http://localhost:2002/api/v2 +``` diff --git a/slurm/piston/launch_piston_workers.sh b/slurm/piston/launch_piston_workers.sh new file mode 100755 index 000000000..908efcc2e --- /dev/null +++ b/slurm/piston/launch_piston_workers.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# this simple script will launch a bunch of piston workers on the HF science cluster + +N_INSTANCES=${1:-5} # Default to 5 instances + +for i in $(seq 1 $N_INSTANCES); do + # Find random (hopefully) available port + PORT=$(comm -23 <(seq 2000 10000 | sort) <(ss -tan | awk '{print $4}' | cut -d':' -f2 | sort -u) | shuf | head -n1) + + # the job name format is important for the code to then be able to get a list of workers. 
`piston-worker-` + sbatch \ + --job-name="piston-worker-$PORT" \ + --export=ALL,PORT=$PORT \ + slurm/piston/launch_single_piston.sh +done \ No newline at end of file diff --git a/slurm/piston/launch_single_piston.sh b/slurm/piston/launch_single_piston.sh new file mode 100755 index 000000000..27bc65bf2 --- /dev/null +++ b/slurm/piston/launch_single_piston.sh @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --job-name=piston_worker +#SBATCH --output=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out +#SBATCH --error=/fsx/open-r1/logs/piston/worker-logs/%x-%j.out # Redirect error logs to .out +#SBATCH --cpus-per-task=2 +#SBATCH --mem-per-cpu=1950M +#SBATCH --partition=hopper-cpu +#SBATCH --time=48:00:00 + +# sometimes if a bunch of workers start at the same time pyxis dies +sleep $(( RANDOM % 20 )) + +# mounting the packages folder lets us not have to manually install the package on each instance +# we use 63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a as the latest image requires isolate, which does not work on the HF science cluster (cgroups incompatibility) +# feel free try with the latest image +# the code you see below increases the very constrained piston default limits, and sets the repo url to the one hosting our IOI package +srun --container-mounts=/fsx/guilherme/ioi2024/piston_files/packages:/piston/packages --container-image "ghcr.io#engineer-man/piston:sha256:63b5654156a89c5a2ad281aface21416615d62ec056d88efe8fcd307ce73575a" \ + bash -c " + export PISTON_COMPILE_TIMEOUT=60000 + export PISTON_RUN_TIMEOUT=60000 + export PISTON_OUTPUT_MAX_SIZE=1000000000 + export PISTON_MAX_FILE_SIZE=1000000000 + export PISTON_DISABLE_NETWORKING=true + export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index + + sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js + sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js + + # Start server in background + node src + " diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 4ac788ff5..9f2bbf091 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -16,6 +16,7 @@ import os import sys from dataclasses import dataclass, field +from functools import partial, update_wrapper import datasets import torch @@ -32,6 +33,7 @@ get_code_format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, + ioi_code_reward, len_reward, reasoning_steps_reward, tag_count_reward, @@ -52,7 +54,7 @@ class GRPOScriptArguments(ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'tag_count', 'code', 'code_format'. + List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'tag_count', 'code', 'ioi_code', 'code_format'. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. cosine_max_value_wrong (`float`): @@ -105,7 +107,13 @@ class GRPOScriptArguments(ScriptArguments): default="python", metadata={ "help": "Language for code format reward. 
Based on E2B supported languages https://e2b.dev/docs/code-interpreting/supported-languages", - "choices": ["python", "javascript", "r", "java", "bash"], + "choices": ["python", "javascript", "r", "java", "bash", "cpp"], + }, + ) + code_eval_test_batch_size: int = field( + default=1, + metadata={ + "help": "for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions" }, ) @@ -174,6 +182,9 @@ def main(script_args, training_args, model_args): ), "length": len_reward, "code": code_reward, + "ioi_code": update_wrapper( + partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward + ), "code_format": get_code_format_reward(language=script_args.code_language), "tag_count": tag_count_reward, } diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index e77a49a3f..37ba7b9c7 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -10,6 +10,7 @@ from math_verify import LatexExtractionConfig, parse, verify from .utils import is_e2b_available +from .utils.ioi import SubtaskResult, add_includes, get_piston_client_from_env, score_subtask if is_e2b_available(): @@ -312,8 +313,55 @@ def repetition_penalty_reward(completions, **kwargs) -> float: return repetition_penalty_reward -def extract_code(completion: str) -> str: - pattern = re.compile(r"```python\n(.*?)```", re.DOTALL) +def _init_event_loop(): + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + + +def ioi_code_reward(completions, test_batch_size: int = 1, **kwargs) -> list[float]: + """Reward function that evaluates IOI problems using Piston+our IOI package. + + Assumes the dataset has the same format as hf.co/datasets/open-r1/ioi + + test_batch_size: evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. + """ + # for info on setting up piston workers, see slurm/piston/README.md + piston_client = get_piston_client_from_env() + + code_snippets = [ + # note: grading is automatically skipped if no code is extracted + add_includes(extract_code(completion[-1]["content"], "cpp"), problem_id) + for completion, problem_id in zip(completions, kwargs["id"]) + ] + + async def run_catch_exceptions(task): + try: + return await task + except Exception as e: + print(f"Error from Piston worker: {e}") + return SubtaskResult() # score 0.0 + + # load problem data. 
undo separating kwargs by column + problems_data = [dict(zip(kwargs.keys(), values)) for values in zip(*kwargs.values())] + + loop = _init_event_loop() + evals = [ + loop.create_task( + run_catch_exceptions(score_subtask(piston_client, problem_data, code, test_batch_size=test_batch_size)) + ) + for problem_data, code in zip(problems_data, code_snippets) + ] + results = loop.run_until_complete(asyncio.gather(*evals)) + + return [result.score for result in results] + + +def extract_code(completion: str, language: str = "python") -> str: + pattern = re.compile(rf"```{language}\n(.*?)```", re.DOTALL) matches = pattern.findall(completion) extracted_answer = matches[-1] if len(matches) >= 1 else "" return extracted_answer diff --git a/src/open_r1/utils/ioi/__init__.py b/src/open_r1/utils/ioi/__init__.py new file mode 100644 index 000000000..c7f91dc1f --- /dev/null +++ b/src/open_r1/utils/ioi/__init__.py @@ -0,0 +1,12 @@ +from .piston_client import get_piston_client_from_env, get_slurm_piston_endpoints +from .scoring import SubtaskResult, score_subtask +from .utils import add_includes + + +__all__ = [ + "get_piston_client_from_env", + "get_slurm_piston_endpoints", + "score_subtask", + "add_includes", + "SubtaskResult", +] diff --git a/src/open_r1/utils/ioi/piston_client.py b/src/open_r1/utils/ioi/piston_client.py new file mode 100644 index 000000000..e625d40a1 --- /dev/null +++ b/src/open_r1/utils/ioi/piston_client.py @@ -0,0 +1,250 @@ +import asyncio +import os +import random +import re +import subprocess +from collections import Counter +from functools import lru_cache + +import aiohttp + + +class PistonError(Exception): + pass + + +@lru_cache(maxsize=1) +def get_piston_client_from_env(): + piston_endpoints = os.getenv("PISTON_ENDPOINTS") + if piston_endpoints is None: + raise ValueError( + "For IOI problems Piston endpoints running our IOI package are required. Please add a list of valid Piston endpoints to a PISTON_ENDPOINTS varialbe in a `.env` file." + ) + piston_endpoints = piston_endpoints.split(",") if piston_endpoints != "slurm" else get_slurm_piston_endpoints() + random.shuffle(piston_endpoints) + max_requests_per_endpoint = os.getenv("PISTON_MAX_REQUESTS_PER_ENDPOINT", "1") + return PistonClient(piston_endpoints, max_requests_per_endpoint=int(max_requests_per_endpoint)) + + +class PistonClient: + """ + A client that will automatically load balance across multiple Piston (https://github.com/engineer-man/piston) workers. 
+ This assumes piston is running our custom cms_ioi package: https://github.com/guipenedo/piston/releases/ + We recommend starting the instances with the following script as otherwise some IOI problems will hit default limits: + ``` + export PISTON_COMPILE_TIMEOUT=60000 + export PISTON_RUN_TIMEOUT=60000 + export PISTON_OUTPUT_MAX_SIZE=1000000000 + export PISTON_MAX_FILE_SIZE=1000000000 + export PISTON_DISABLE_NETWORKING=true + export PISTON_REPO_URL=https://github.com/guipenedo/piston/releases/download/pkgs/index + mkdir /piston + + sed -i '/app.use(body_parser.urlencoded/c\ app.use(body_parser.urlencoded({ extended: true, limit: \"512mb\" }));' src/index.js + sed -i '/app.use(body_parser.json/c\ app.use(body_parser.json({ limit: \"512mb\" }));' src/index.js + + # Start server in background + node src``` + + Piston docs for API usage: https://piston.readthedocs.io/en/latest/api-v2/ + """ + + def __init__( + self, + base_endpoint: str | list[str] = "http://ip-10-53-80-65:3223/api/v2", + session=None, + max_requests_per_endpoint=1, + ): + self.max_requests_per_endpoint = max_requests_per_endpoint + self.base_endpoints = [base_endpoint] if isinstance(base_endpoint, str) else base_endpoint + self.endpoint_ids = {endpoint: i for i, endpoint in enumerate(self.base_endpoints)} + + self._session = session + self.endpoint_tokens = asyncio.Queue(maxsize=max_requests_per_endpoint * len(self.base_endpoints)) + + for _ in range(max_requests_per_endpoint): + for base_endpoint in self.base_endpoints: + self.endpoint_tokens.put_nowait(base_endpoint) + self._endpoint_failures = Counter() + self._unhealthy_endpoints = set() + self._endpoint_failures_lock = asyncio.Lock() + + @property + def session(self): + if self._session is None: + self._session = aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(sock_read=10), + connector=aiohttp.TCPConnector( + limit=self.max_requests_per_endpoint * len(self.base_endpoints), + ttl_dns_cache=300, + keepalive_timeout=5 * 60, + ), + ) + return self._session + + async def _wait_for_endpoint(self): + endpoint = await self.endpoint_tokens.get() + return endpoint + + async def _release_endpoint(self, endpoint): + await self.endpoint_tokens.put(endpoint) + + async def _send_request(self, endpoint, route, data=None, method="post"): + async with self.session.request( + method, f"{endpoint.rstrip('/')}/{route}", json=data, headers={"Content-Type": "application/json"} + ) as response: + return await response.json(content_type=None) + + async def _send_to_all(self, route, data=None, method="post"): + return await asyncio.gather( + *[self._send_request(endpoint, route, data, method) for endpoint in self.base_endpoints] + ) + + async def _send_to_one(self, endpoint, route, data=None, method="post"): + return await self._send_request(endpoint, route, data, method) + + async def install_package(self, language, version): + return await self._send_to_all("packages", {"language": language, "version": version}, method="post") + + async def uninstall_package(self, language, version): + return await self._send_to_all("packages", {"language": language, "version": version}, method="delete") + + async def get_supported_runtimes(self): + return await self._send_to_all("runtimes", method="get") + + async def execute(self, data) -> tuple[str, str]: + """ + Requests to the IOI package return the score as a float in the stdout, as well as optional feedback/errors in stderr. + Returns a tuple of (score, feedback). 
+ """ + response = await self._send_execute(data) + + if "message" in response: + raise PistonError(response["message"]) + + if "compile" in response and response["compile"]["code"] != 0: + return "0", "Compilation error exit code " + str(response["compile"]["code"]) + "\n" + response["compile"][ + "stderr" + ] + + if "run" not in response: + raise PistonError(response) + + if response["run"]["code"] == 1 and "MemoryError" in response["run"]["stderr"]: + return "0", "Memory limit exceeded" + + # successful result + if response["run"]["stdout"]: + return response["run"]["stdout"], response["run"]["stderr"] + + if response["run"]["signal"] == "SIGKILL": + return "0", "Time limit exceeded" + + # other issues + if response["run"]["code"] != 0: + raise PistonError( + f"language={response['language']}, version={response['version']}, exit code={response['run']['code']}, stderr={response['run']['stderr']}, signal={response['run']['signal']}" + ) + return "0", "Unknown error" + + async def _check_failed_endpoint(self, endpoint): + async with self._endpoint_failures_lock: + if endpoint in self._unhealthy_endpoints: + return + try: + await asyncio.sleep(5) + await self.get_supported_runtimes() + except Exception as e: + print(f"Error checking endpoint {endpoint}, dropping it ({e})") + self._unhealthy_endpoints.add(endpoint) + + async def _send_execute(self, data): + data = data | { + "language": "cms_ioi", + "version": "*", + } + + max_retries = 5 + base_delay = 1.0 + + status = None + endpoint = None + + for attempt in range(max_retries + 1): + try: + endpoint = await self._wait_for_endpoint() + if attempt > 0: + await asyncio.sleep(1) + async with self.session.post( + f"{endpoint.rstrip('/')}/execute", json=data, headers={"Content-Type": "application/json"} + ) as response: + status = response.status + res_json = await response.json(content_type=None) + + if status != 200: + raise PistonError(f"Server error. status={status}") + if res_json is None: + raise PistonError(f"Empty response. status={status}") + # piston overloaded + if "run" in res_json and "Resource temporarily unavailable" in res_json["run"].get("stderr", ""): + raise PistonError(f"Piston overloaded: {res_json['run']['stderr']}") + return res_json + + except (PistonError, asyncio.TimeoutError, aiohttp.ClientConnectionError, RuntimeError) as e: + # Only retry if we haven't reached max retries yet + if attempt < max_retries: + # Calculate backoff with jitter + delay = min(base_delay * (2**attempt), 10) # Exponential backoff, capped at 10 seconds + jitter = delay * 0.2 * (2 * asyncio.get_event_loop().time() % 1 - 0.5) # Add ±10% jitter + retry_delay = delay + jitter + print(f"Retrying in {retry_delay} seconds [{self.endpoint_ids[endpoint]}] {endpoint}") + + # special case: worker died + if isinstance(e, aiohttp.ClientConnectionError) and "Connect call failed" in str(e): + await self._check_failed_endpoint(endpoint) + else: + # hopefully we won't get this one again + await self._release_endpoint(endpoint) + endpoint = None + + await asyncio.sleep(retry_delay) + else: + print(f"Giving up on retries. 
{e}") + raise e + except Exception as e: + print(f"Propagating exception {type(e)}: {e}") + raise e + finally: + # Ensure endpoint is always released, even if an exception occurs + if endpoint is not None: + try: + await self._release_endpoint(endpoint) + except Exception as e: + print(f"Error releasing endpoint {endpoint}: {e}") + endpoint = None + + +def get_slurm_piston_endpoints(): + """Get list of active piston worker endpoints from squeue output""" + # Run squeue command to get job name, hostname and status, filtering for RUNNING state + result = subprocess.run( + ["squeue", '--format="%j %N %T"', "--noheader", "--states=RUNNING"], capture_output=True, text=True + ) + + # Split output into lines and skip header + lines = result.stdout.strip().split("\n") + + endpoints = [] + for line in lines: + # Parse job name from squeue output + fields = line.split() + job_name = fields[0].strip('"') # Remove quotes + hostname = fields[1] + + # Extract port if job name matches pattern + match = re.match(r"piston-worker-(\d+)", job_name) + if match: + port = match.group(1) + endpoints.append(f"http://{hostname}:{port}/api/v2") + + return endpoints diff --git a/src/open_r1/utils/ioi/scoring.py b/src/open_r1/utils/ioi/scoring.py new file mode 100644 index 000000000..88045383a --- /dev/null +++ b/src/open_r1/utils/ioi/scoring.py @@ -0,0 +1,298 @@ +import asyncio +from dataclasses import asdict, dataclass, field +from typing import Union + +from .piston_client import PistonClient +from .utils import batched, load_ioi_tests + + +@dataclass +class TestResult: + """ + Represents the result of a single test case execution. + + Attributes: + test_name: Name of the test case + score: Score achieved for this test (0.0 to 1.0) + status: Status code of the test result (e.g., 'AC', 'WA', 'TLE') + feedback: Detailed feedback message from the judge or an error message + """ + + test_name: str + score: float = 0.0 + status: str = "SKIPPED" + feedback: str = None + + +@dataclass +class SubtaskResult: + """ + Represents the result of a subtask containing multiple test cases. + + Attributes: + problem: Problem identifier + subtask: Subtask identifier + points: Maximum points available for this subtask + score_precision: Number of decimal places for score rounding + test_results: List of individual test case results + """ + + problem: str = None + subtask: str = None + + points: float = 0.0 + score_precision: int = 2 + + test_results: list[TestResult] = field(default_factory=list) + + @property + def status(self): + """ + Determines the overall status of the subtask based on the worst status among test results. + Status priorities are ordered from worst to best. + + Returns: + str: The status with the highest priority (lowest value) + """ + status_prios = {"CE": -1, "RE": 0, "WA": 1, "MLE": 2, "TLE": 3, "PA": 4, "AC": 5, "SKIPPED": 999} + return min([x.status for x in self.test_results], key=lambda x: status_prios[x]) + + @property + def score(self): + """ + Calculates the raw score for the subtask as the minimum score across all test results. + + Returns: + float: The rounded minimum score + """ + return ( + 0 + if not self.test_results + else round(min([test_result.score for test_result in self.test_results]), self.score_precision) + ) + + @property + def weighted_score(self): + """ + Calculates the weighted score by multiplying the raw score by the available points. 
+ + Returns: + float: The rounded weighted score + """ + return ( + 0 + if not self.test_results + else round( + min([test_result.score for test_result in self.test_results]) * self.points, self.score_precision + ) + ) + + def to_dict(self): + """ + Converts the SubtaskResult to a dictionary representation. + + Returns: + dict: Dictionary containing all subtask result data + """ + return { + "problem": self.problem, + "subtask": self.subtask, + "score": self.score, + "weighted_score": self.weighted_score, + "points": self.points, + "score_precision": self.score_precision, + "status": self.status, + "test_results": [asdict(test_result) for test_result in self.test_results], + } + + +def _extract_single_status(score: float, feedback: str) -> str: + """ + Determines the status code based on the score and feedback message. + + Args: + score: The numeric score (0.0 to 1.0) + feedback: The feedback message from the execution + + Returns: + str: Status code ('CE', 'MLE', 'TLE', 'WA', 'RE', 'AC', or 'PA') + """ + if score == 0.0: + if "Compilation error" in feedback: + return "CE" + elif "Memory limit exceeded" in feedback: + return "MLE" + elif "Time limit exceeded" in feedback: + return "TLE" + elif "Output isn't correct" in feedback: + return "WA" + else: + return "RE" + elif score == 1.0: + return "AC" + else: + return "PA" + + +async def score_single_test_case( + client: PistonClient, subtask: dict, test_name: str, test_input: str, test_output: str, submission: str +) -> TestResult: + """ + Scores a single test case by running the submission against the provided input and output. + + Args: + client: PistonClient instance for executing code + subtask: Dictionary containing subtask configuration + test_name: Name of the test case + test_input: Input data for the test case + test_output: Expected output for the test case + submission: Source code of the submission + + Returns: + TestResult: Result of the test case execution + """ + # Run submission for this test case + score, feedback = await run_submission(client, subtask, test_input, submission, test_output) + score = float(score) + + return TestResult( + test_name=test_name, score=score, status=_extract_single_status(score, feedback), feedback=feedback + ) + + +async def score_subtask( + client: PistonClient, + subtask: dict, + submission: str, + test_case_run_cache: Union[dict, None] = None, + test_batch_size: int = 1, +) -> SubtaskResult: + """ + Scores all test cases in a subtask. + + Args: + client: PistonClient instance for executing code + subtask: Dictionary containing subtask configuration + test_cases: Dictionary mapping test names to (input, output) tuples + submission: Source code of the submission + test_case_run_cache: Optional cache of previously run test cases + test_batch_size: evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. 
+ -1 to evaluate all test cases in parallel + Returns: + SubtaskResult: Result of the subtask evaluation + """ + subtask_result = SubtaskResult( + problem=subtask["id"], + subtask=subtask["subtask"], + points=subtask["score"], + score_precision=subtask["score_precision"], + test_results=[], + ) + + # tests that are not cached + tests_to_run = [ + (ti, test_name) + for ti, test_name in enumerate(subtask["test_names"]) + if test_case_run_cache is None or test_name not in test_case_run_cache + ] + + # initialize test results with cached results or empty (SKIPPED) TestResult objects + subtask_result.test_results = [ + test_case_run_cache[test_name] + if test_case_run_cache is not None and test_name in test_case_run_cache + else TestResult(test_name=test_name) + for test_name in subtask["test_names"] + ] + + # we skip submissions where no code was extracted + # no need to do anything, as we have a failed cached result + if not submission or any( + test_result.status != "SKIPPED" and test_result.score == 0.0 for test_result in subtask_result.test_results + ): + return subtask_result + + if "test_cases" in subtask: + test_cases = subtask["test_cases"] + if isinstance(subtask["test_cases"], list): + test_cases = {test_name: test for test_name, test in zip(subtask["test_names"], subtask["test_cases"])} + else: + test_cases = load_ioi_tests(subtask["year"], subtask["id"]) + + # run one batch, check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. + for test_batch_to_run in batched(tests_to_run, test_batch_size): + results = await asyncio.gather( + *[ + asyncio.create_task( + score_single_test_case( + client, subtask, test_name, test_cases[test_name][0], test_cases[test_name][1], submission + ) + ) + for _, test_name in test_batch_to_run + ] + ) + for (ti, test_name), test_result in zip(test_batch_to_run, results): + if test_case_run_cache is not None: + test_case_run_cache[test_name] = test_result + subtask_result.test_results[ti] = test_result + + # Stop early if it failed + if any(test_result.score == 0.0 for test_result in results): + break + + return subtask_result + + +async def score_subtasks( + client: PistonClient, subtasks: list[dict], submission: str, skip_mode: bool = True +) -> list[SubtaskResult]: + """ + Scores multiple subtasks for a submission. + + Args: + client: PistonClient instance for executing code + subtasks: List of dictionaries containing subtask configurations + submission: Source code of the submission + skip_mode: If True, evaluates test by test and stops after the first failure. Otherwise, runs all tests in parallel. Should be True when evaluating a large number of submissions. + + Returns: + list[SubtaskResult]: Results for all subtasks + """ + # avoid rerunning tests present in multiple subtasks + test_case_run_cache = {} + + return [await score_subtask(client, subtask, submission, test_case_run_cache, skip_mode) for subtask in subtasks] + + +async def run_submission( + client: PistonClient, problem: dict, test_input: str, submission: str, test_output: str | None = None +) -> tuple[str, str]: + """ + Executes a submission against a test case using the Piston execution environment. 
+ + Args: + client: PistonClient instance for executing code + problem: Dictionary containing problem configuration + test_input: Input data for the test case + submission: Source code of the submission + test_output: Optional expected output for the test case + + Returns: + tuple[str, str]: A tuple containing (score, feedback) + """ + data = { + "files": [ + # the actual submission + {"name": f"graders/{problem['id'].lower()}.cpp", "content": submission}, + # pass the input + {"name": "input.txt", "content": test_input}, + # pass the expected output + *([{"name": "correct_output.txt", "content": test_output}] if test_output else []), + # grader files + *({"name": name, "content": content} for name, content in problem["grader_files"] if content), + ], + "run_timeout": round( + (problem["time_limit"] + 3) * 1000 + ), # +3 seconds hard limit. time limits are handled by the ioi script + "run_memory_limit": problem["memory_limit"], + } + return await client.execute(data) diff --git a/src/open_r1/utils/ioi/utils.py b/src/open_r1/utils/ioi/utils.py new file mode 100644 index 000000000..02c0aea59 --- /dev/null +++ b/src/open_r1/utils/ioi/utils.py @@ -0,0 +1,52 @@ +from collections import defaultdict +from functools import lru_cache +from itertools import islice + +from datasets import load_dataset + + +def add_includes(code: str, problem_id: str) -> str: + """ + Fix common compilation errors for IOI problems. + """ + if not code: + return code + # has most of the useful functions + code_header = "#include \n" + # include the problem header + problem_header_include = f'#include "{problem_id}.h"' + if problem_header_include not in code: + code_header += problem_header_include + "\n" + # use namespace std since models forget std:: often + if "using namespace std;" not in code and "std::" not in code: + code_header += "\nusing namespace std;\n\n" + return code_header + code + + +@lru_cache +def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: + """ + Load IOI tests for a given year. + """ + tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") + test_cases = defaultdict(dict) + for test_case in tests_dataset: + test_cases[test_case["problem_id"]][test_case["test_name"]] = test_case["test_input"], test_case["test_output"] + return test_cases + + +def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]: + """ + Load IOI tests for a given year and problem id. + """ + return load_ioi_tests_for_year(year)[problem_id] + + +def batched(iterable, n): + "Batch data into lists of length n. The last batch may be shorter." 
+ # batched('ABCDEFG', 3) --> ABC DEF G + if n < 1: + return iterable + it = iter(iterable) + while batch := list(islice(it, n)): + yield batch diff --git a/tests/slow/test_code_reward.py b/tests/slow/test_code_reward.py index 16337ec0b..f3f985a98 100644 --- a/tests/slow/test_code_reward.py +++ b/tests/slow/test_code_reward.py @@ -17,12 +17,13 @@ from datasets import load_dataset -from open_r1.rewards import code_reward +from open_r1.rewards import code_reward, ioi_code_reward class TestCodeRewards(unittest.TestCase): - def test_code_reward(self): - code_dataset = load_dataset("open-r1/verifiable-coding-problems-python-10k") + def test_python_code_reward(self): + # requires E2B, see the README.md file + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested") NUM_SAMPLES = 20 samples = code_dataset["train"].select(range(NUM_SAMPLES)) test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] @@ -31,6 +32,19 @@ def test_code_reward(self): print(rewards) assert rewards == [1.0] * NUM_SAMPLES + def test_ioi_code_reward(self): + # This slow test case requires spinning up a bunch (I tested with ~64) of piston workers, see docs here + # slurm/piston/README.md + code_dataset = load_dataset("open-r1/ioi-reward-test-dataset") + NUM_SAMPLES = 16 + samples = code_dataset["train"].select(range(NUM_SAMPLES)) + test_completions = [[{"content": f"```cpp\n{sample['sample_solution']}```"}] for sample in samples] + keys = [key for key in samples[0] if key not in ["prompt", "completion"]] + reward_kwargs = {key: [example[key] for example in samples] for key in keys} + rewards = ioi_code_reward(test_completions, **reward_kwargs) + print(rewards) + assert rewards == [1.0] * NUM_SAMPLES + if __name__ == "__main__": unittest.main() From af487204ca09005d12b4d9a48b4162a02e9b6a35 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Fri, 21 Mar 2025 12:53:38 +0100 Subject: [PATCH 086/137] Adds binary code reward (#528) * adds binary code reward, refactors grpo with get_reward_funcs * adds return type to the function * add get_reward_funcs test * remote type hint * move script args to another file * update test --- src/open_r1/configs.py | 71 +++++++++++++++++++++++++ src/open_r1/grpo.py | 117 ++--------------------------------------- src/open_r1/rewards.py | 39 +++++++++++++- tests/test_rewards.py | 44 +++++++++++++++- 4 files changed, 157 insertions(+), 114 deletions(-) diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 98cd0d108..b341e0bdd 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -83,3 +83,74 @@ class SFTConfig(trl.SFTConfig): default=None, metadata={"help": ("The project to store runs under.")}, ) + + +@dataclass +class GRPOScriptArguments(trl.ScriptArguments): + """ + Script arguments for the GRPO training script. + + Args: + reward_funcs (`list[str]`): + List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'tag_count', 'code', 'ioi_code', 'code_format'. + cosine_min_value_wrong (`float`): + Minimum reward for cosine scaling for wrong answers. + cosine_max_value_wrong (`float`): + Maximum reward for cosine scaling for wrong answers. + cosine_min_value_correct (`float`): + Minimum reward for cosine scaling for correct answers. + cosine_max_value_correct (`float`): + Maximum reward for cosine scaling for correct answers. + cosine_max_len (`int`): + Maximum length for cosine scaling. 
+ code_language (`str`): + Language for code format reward. + """ + + reward_funcs: list[str] = field( + default_factory=lambda: ["accuracy", "format", "tag_count"], + metadata={ + "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', tag_count', 'code', 'code_format'" + }, + ) + cosine_min_value_wrong: float = field( + default=0.0, + metadata={"help": "Minimum reward for wrong answers"}, + ) + cosine_max_value_wrong: float = field( + default=-0.5, + metadata={"help": "Maximum reward for wrong answers"}, + ) + cosine_min_value_correct: float = field( + default=0.5, + metadata={"help": "Minimum reward for correct answers"}, + ) + cosine_max_value_correct: float = field( + default=1.0, + metadata={"help": "Maximum reward for correct answers"}, + ) + cosine_max_len: int = field( + default=1000, + metadata={"help": "Maximum length for scaling"}, + ) + repetition_n_grams: int = field( + default=3, + metadata={"help": "Number of n-grams for repetition penalty reward"}, + ) + repetition_max_penalty: float = field( + default=-1.0, + metadata={"help": "Maximum (negative) penalty for for repetition penalty reward"}, + ) + code_language: str = field( + default="python", + metadata={ + "help": "Language for code format reward. Based on E2B supported languages https://e2b.dev/docs/code-interpreting/supported-languages", + "choices": ["python", "javascript", "r", "java", "bash", "cpp"], + }, + ) + code_eval_test_batch_size: int = field( + default=1, + metadata={ + "help": "for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions" + }, + ) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 9f2bbf091..35f32b422 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -15,8 +15,6 @@ import logging import os import sys -from dataclasses import dataclass, field -from functools import partial, update_wrapper import datasets import torch @@ -25,99 +23,17 @@ from transformers import set_seed from transformers.trainer_utils import get_last_checkpoint -from open_r1.configs import GRPOConfig -from open_r1.rewards import ( - accuracy_reward, - code_reward, - format_reward, - get_code_format_reward, - get_cosine_scaled_reward, - get_repetition_penalty_reward, - ioi_code_reward, - len_reward, - reasoning_steps_reward, - tag_count_reward, -) +from open_r1.configs import GRPOConfig, GRPOScriptArguments +from open_r1.rewards import get_reward_funcs from open_r1.utils import get_tokenizer from open_r1.utils.callbacks import get_callbacks from open_r1.utils.wandb_logging import init_wandb_training -from trl import GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config +from trl import GRPOTrainer, ModelConfig, TrlParser, get_peft_config logger = logging.getLogger(__name__) -@dataclass -class GRPOScriptArguments(ScriptArguments): - """ - Script arguments for the GRPO training script. - - Args: - reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'tag_count', 'code', 'ioi_code', 'code_format'. - cosine_min_value_wrong (`float`): - Minimum reward for cosine scaling for wrong answers. - cosine_max_value_wrong (`float`): - Maximum reward for cosine scaling for wrong answers. 
- cosine_min_value_correct (`float`): - Minimum reward for cosine scaling for correct answers. - cosine_max_value_correct (`float`): - Maximum reward for cosine scaling for correct answers. - cosine_max_len (`int`): - Maximum length for cosine scaling. - code_language (`str`): - Language for code format reward. - """ - - reward_funcs: list[str] = field( - default_factory=lambda: ["accuracy", "format", "tag_count"], - metadata={ - "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', tag_count', 'code', 'code_format'" - }, - ) - cosine_min_value_wrong: float = field( - default=0.0, - metadata={"help": "Minimum reward for wrong answers"}, - ) - cosine_max_value_wrong: float = field( - default=-0.5, - metadata={"help": "Maximum reward for wrong answers"}, - ) - cosine_min_value_correct: float = field( - default=0.5, - metadata={"help": "Minimum reward for correct answers"}, - ) - cosine_max_value_correct: float = field( - default=1.0, - metadata={"help": "Maximum reward for correct answers"}, - ) - cosine_max_len: int = field( - default=1000, - metadata={"help": "Maximum length for scaling"}, - ) - repetition_n_grams: int = field( - default=3, - metadata={"help": "Number of n-grams for repetition penalty reward"}, - ) - repetition_max_penalty: float = field( - default=-1.0, - metadata={"help": "Maximum (negative) penalty for for repetition penalty reward"}, - ) - code_language: str = field( - default="python", - metadata={ - "help": "Language for code format reward. Based on E2B supported languages https://e2b.dev/docs/code-interpreting/supported-languages", - "choices": ["python", "javascript", "r", "java", "bash", "cpp"], - }, - ) - code_eval_test_batch_size: int = field( - default=1, - metadata={ - "help": "for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. 
Useful to avoid overloading the eval server + save time on wrong solutions" - }, - ) - - def main(script_args, training_args, model_args): # Set seed for reproducibility set_seed(training_args.seed) @@ -164,31 +80,8 @@ def main(script_args, training_args, model_args): ################ tokenizer = get_tokenizer(model_args, training_args) - # Get reward functions - REWARD_FUNCS_REGISTRY = { - "accuracy": accuracy_reward, - "format": format_reward, - "reasoning_steps": reasoning_steps_reward, - "cosine": get_cosine_scaled_reward( - min_value_wrong=script_args.cosine_min_value_wrong, - max_value_wrong=script_args.cosine_max_value_wrong, - min_value_correct=script_args.cosine_min_value_correct, - max_value_correct=script_args.cosine_max_value_correct, - max_len=script_args.cosine_max_len, - ), - "repetition_penalty": get_repetition_penalty_reward( - ngram_size=script_args.repetition_n_grams, - max_penalty=script_args.repetition_max_penalty, - ), - "length": len_reward, - "code": code_reward, - "ioi_code": update_wrapper( - partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward - ), - "code_format": get_code_format_reward(language=script_args.code_language), - "tag_count": tag_count_reward, - } - reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] + # Get reward functions from the registry + reward_funcs = get_reward_funcs(script_args.reward_funcs) # Format into conversation def make_conversation(example): diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 37ba7b9c7..32e731cd0 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -4,7 +4,8 @@ import json import math import re -from typing import Dict +from functools import partial, update_wrapper +from typing import Callable, Dict from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify @@ -367,6 +368,12 @@ def extract_code(completion: str, language: str = "python") -> str: return extracted_answer +def binary_code_reward(completions, **kwargs) -> list[float]: + rewards = code_reward(completions, **kwargs) + BINARY_THRESHOLD = 0.99 + return [1.0 if reward > BINARY_THRESHOLD else 0.0 for reward in rewards] + + def code_reward(completions, **kwargs) -> list[float]: """Reward function that evaluates code snippets using the E2B code interpreter. 
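To make the thresholding in the new `binary_code_reward` concrete, here is what it does to a batch of fractional pass rates (the numbers below are made up for illustration; in practice they come from `code_reward` via the E2B sandbox):

```python
# Only a pass rate strictly above 0.99 (i.e. effectively all test cases passing)
# is mapped to a reward of 1.0; partial credit is discarded.
pass_rates = [1.0, 0.99, 0.75, 0.0]
BINARY_THRESHOLD = 0.99
binary_rewards = [1.0 if rate > BINARY_THRESHOLD else 0.0 for rate in pass_rates]
print(binary_rewards)  # [1.0, 0.0, 0.0, 0.0]
```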
@@ -492,3 +499,33 @@ async def run_script(sbx: AsyncSandbox, script: str, language: str) -> float: return float(execution.text) except (TypeError, ValueError): return 0.0 + + +def get_reward_funcs(script_args) -> list[Callable]: + REWARD_FUNCS_REGISTRY = { + "accuracy": accuracy_reward, + "format": format_reward, + "reasoning_steps": reasoning_steps_reward, + "cosine": get_cosine_scaled_reward( + min_value_wrong=script_args.cosine_min_value_wrong, + max_value_wrong=script_args.cosine_max_value_wrong, + min_value_correct=script_args.cosine_min_value_correct, + max_value_correct=script_args.cosine_max_value_correct, + max_len=script_args.cosine_max_len, + ), + "repetition_penalty": get_repetition_penalty_reward( + ngram_size=script_args.repetition_n_grams, + max_penalty=script_args.repetition_max_penalty, + ), + "length": len_reward, + "code": code_reward, + "binary_code": binary_code_reward, + "ioi_code": update_wrapper( + partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward + ), + "code_format": get_code_format_reward(language=script_args.code_language), + "tag_count": tag_count_reward, + } + reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] + + return reward_funcs diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 5a77bf0d1..51b544f82 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -15,24 +15,66 @@ import unittest +from open_r1.configs import GRPOScriptArguments from open_r1.rewards import ( accuracy_reward, format_reward, get_code_format_reward, get_cosine_scaled_reward, get_repetition_penalty_reward, + get_reward_funcs, len_reward, reasoning_steps_reward, tag_count_reward, ) +class TestGetRewardFuncs(unittest.TestCase): + def test_get_reward_funcs(self): + """Test get_reward_funcs with various reward functions.""" + reward_names = [ + "accuracy", + "format", + "reasoning_steps", + "cosine", + "repetition_penalty", + "length", + "tag_count", + "code", + "ioi_code", + "code_format", + "binary_code", + ] + reward_func_names = [ + "accuracy_reward", + "format_reward", + "reasoning_steps_reward", + "cosine_scaled_reward", + "repetition_penalty_reward", + "len_reward", + "tag_count_reward", + "code_reward", + "ioi_code_reward", + "code_format_reward", + "binary_code_reward", + ] + + args = GRPOScriptArguments( + dataset_name="dummy", + reward_funcs=reward_names, + ) + + reward_funcs = get_reward_funcs(args) + self.assertEqual(len(reward_funcs), 11) + for func_name, func in zip(reward_func_names, reward_funcs): + self.assertEqual(func_name, func.__name__) + + class TestRewards(unittest.TestCase): def test_accuracy_reward_correct_answer(self): """Test accuracy_reward with a correct answer.""" completion = [[{"content": r"\boxed{\frac{63}{400}}"}]] solution = [r"\frac{63}{400}"] - rewards = accuracy_reward(completion, solution) self.assertEqual(rewards[0], 1.0) From 9409dca6758b560995fcf153cdac2bda50f20633 Mon Sep 17 00:00:00 2001 From: Zhou Shao Date: Sat, 22 Mar 2025 22:33:21 +0800 Subject: [PATCH 087/137] fix get_reward_funcs bug (#535) change the input from `script_args.reward_funcs` to `script_args` --- src/open_r1/grpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 35f32b422..c681b8194 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -81,7 +81,7 @@ def main(script_args, training_args, model_args): tokenizer = get_tokenizer(model_args, training_args) # Get reward functions from the registry - 
reward_funcs = get_reward_funcs(script_args.reward_funcs) + reward_funcs = get_reward_funcs(script_args) # Format into conversation def make_conversation(example): From 86f9471f8ecdf670aa8a8d129506a82ec89d568f Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Mon, 24 Mar 2025 13:36:48 +0100 Subject: [PATCH 088/137] Fixes missing exception in run_script (#532) * adds binary code reward, refactors grpo with get_reward_funcs * adds return type to the function * fix exception in run_script causes batch of rewards to be zero * style --- src/open_r1/rewards.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 32e731cd0..6720b385d 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -499,6 +499,9 @@ async def run_script(sbx: AsyncSandbox, script: str, language: str) -> float: return float(execution.text) except (TypeError, ValueError): return 0.0 + except Exception as e: + print(f"Error from E2B executor run_script: {e}") + return 0.0 def get_reward_funcs(script_args) -> list[Callable]: From 8000dd2384853826882e5d42a385dd230573925e Mon Sep 17 00:00:00 2001 From: lewtun Date: Mon, 24 Mar 2025 15:15:02 +0100 Subject: [PATCH 089/137] [WIP] RL goes brrr (#533) * Fix vLLM recipes * Add vllm server to Slurm * Add overlap across srun * Fix NUM_NODES * Refactor TP to script * fix train script to work withnew GRPO * lewis nits * bump trl, transformers --------- Co-authored-by: edbeeching --- README.md | 20 +---- .../grpo/config_demo.yaml | 2 - .../grpo/config_demo.yaml | 2 - .../grpo/config_demo_code.yaml | 2 - .../grpo/config_demo_code_ioi.yaml | 2 - .../Qwen2.5-7B-Instruct/grpo/config_demo.yaml | 54 ++++++++++++++ .../grpo/config_simple_rl.yaml | 2 - scripts/get_tensor_parallel_size.py | 28 +++++++ setup.py | 4 +- slurm/train.slurm | 73 ++++++++----------- 10 files changed, 119 insertions(+), 70 deletions(-) create mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml create mode 100644 scripts/get_tensor_parallel_size.py diff --git a/README.md b/README.md index ce9bc5651..9e1ac3d75 100644 --- a/README.md +++ b/README.md @@ -160,26 +160,12 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con ``` ### GRPO - -To train via the GRPO trainer, we use one GPU to run vLLM for faster generation and the remaining GPUs for training. For example, one a node with 8 GPUs, set `--num_processes` to override the default value in the `accelerate` configs: - +We use TRL's new distributed vLLM server and GRPOTraining in order to scale to larger >7B models. We provide an example slurm script: ```shell -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ - --num_processes=7 src/open_r1/grpo.py \ - --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml +sbatch --job-name=trl-Qwen2.5-Math-7B-config_simple_rl --nodes=2 slurm/train.slurm Qwen2.5-Math-7B grpo config_simple_rl zero3 ``` -> [!WARNING] -> The chat template used in the distilled DeepSeek models omits the contents of the reasoning block within the `` and `` tags. It also prefills the assistant response with `` which interferes with the format reward function. To handle that, it is important to override the chat template as done in e.g. [recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml](./recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml). 
- - -We provide a minimal reproducible experiment using GRPO for mathematical reasoning, referencing the approach from [SimpleRL-Reason](https://hkust-nlp.notion.site/simplerl-reason) which uses a 7B model trained on 8K examples. Running this on 8 H100 80G GPU takes about 3 hours: - -```shell -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ - --num_processes=7 src/open_r1/grpo.py \ - --config recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml -``` +You will need to adapt the `slurm/train.slurm` script to match your cluster. Our final [model](https://huggingface.co/Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on MATH-500, demonstrating a 17%+ improvement over the base model. diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml index 13c4f668b..8eaf32ca8 100644 --- a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml @@ -13,8 +13,6 @@ system_prompt: "You are a helpful AI Assistant that provides well-reasoned and d # GRPO trainer config bf16: true use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 do_eval: false gradient_accumulation_steps: 4 gradient_checkpointing: true diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 156ccf160..5f3b69222 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -11,8 +11,6 @@ system_prompt: "You are a helpful AI Assistant that provides well-reasoned and d # GRPO trainer config bf16: true use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 do_eval: false gradient_accumulation_steps: 4 gradient_checkpointing: true diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 135f40aa1..768504960 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -12,8 +12,6 @@ system_prompt: "You are a helpful AI Assistant that provides well-reasoned and d beta: 0.01 bf16: true use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.9 do_eval: false gradient_accumulation_steps: 4 gradient_checkpointing: true diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml index f166cf053..c032b1641 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml @@ -12,8 +12,6 @@ system_prompt: "You are a helpful AI Assistant that provides well-reasoned and d beta: 0.01 bf16: true use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.9 do_eval: false gradient_accumulation_steps: 4 gradient_checkpointing: true diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml new file mode 100644 index 000000000..886e1f270 --- /dev/null +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml @@ -0,0 +1,54 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_name: open-r1/OpenR1-Math-cn_k12-86k 
+system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# GRPO trainer config +beta: 0.001 +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +do_eval: false +gradient_accumulation_steps: 16 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: Qwen2.5-7B-Instruct-GRPO +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 1024 +max_completion_length: 4096 +max_steps: -1 +num_generations: 16 +num_train_epochs: 1 +output_dir: data/Qwen2.5-7B-Instruct-GRPO +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- accuracy +- format +reward_weights: +- 1.0 +- 0.2 +save_strategy: "steps" +save_steps: 0.1 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml index d1c3d63c4..280e72167 100644 --- a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml +++ b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml @@ -12,8 +12,6 @@ system_prompt: "You are a helpful AI Assistant, designed to provided well-reason # GRPO trainer config bf16: true use_vllm: true -vllm_device: auto -vllm_gpu_memory_utilization: 0.7 do_eval: true eval_strategy: steps eval_steps: 100 diff --git a/scripts/get_tensor_parallel_size.py b/scripts/get_tensor_parallel_size.py new file mode 100644 index 000000000..d6c61154c --- /dev/null +++ b/scripts/get_tensor_parallel_size.py @@ -0,0 +1,28 @@ +import argparse +from transformers import AutoConfig +from math import gcd + +def get_tensor_parallel_size(model_name: str, revision: str = None, default_tp: int = 8) -> int: + try: + config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) + num_heads = getattr(config, 'num_attention_heads', None) + + if num_heads is not None and num_heads % default_tp != 0: + tp = gcd(num_heads, default_tp) + return max(tp, 1) + else: + return default_tp + except Exception as e: + print(f"Warning: Failed to fetch config for {model_name}@{revision}: {e}") + return default_tp + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", type=str, required=True, help="Hugging Face model name or path") + parser.add_argument("--revision", type=str, default=None, help="Model revision if applicable") + parser.add_argument("--default_tp", type=int, default=8, help="Default TP size (usually GPUs per node)") + + args = parser.parse_args() + + tp = get_tensor_parallel_size(args.model_name, args.revision, args.default_tp) + print(tp) diff --git a/setup.py b/setup.py index 00b83d811..a9f46b6fb 100644 --- a/setup.py +++ b/setup.py @@ -66,8 +66,8 @@ "safetensors>=0.3.3", "sentencepiece>=0.1.99", "torch==2.5.1", - "transformers==4.49.0", - "trl @ git+https://github.com/huggingface/trl.git@69ad852e5654a77f1695eb4c608906fe0c7e8624", + "transformers==4.50.0", + "trl==0.16.0", "vllm==0.7.2", "wandb>=0.19.1", ] diff --git a/slurm/train.slurm b/slurm/train.slurm index c1209e88a..dc51f9045 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm 
@@ -9,10 +9,7 @@ #SBATCH --requeue # Specific configuration optimized for the Hugging Face Compute Cluster -# Be ye warned this may not work on other clusters! module load cuda/12.4 - - set -x -e source ~/.bashrc @@ -24,42 +21,43 @@ TASK=$2 CONFIG_SUFFIX=$3 ACCELERATOR=$4 OPTIONAL_ARGS=$5 +CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml +GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') +MODEL=$(grep 'model_name_or_path:' $CONFIG_FILE | awk '{print $2}') +REVISION=$(grep 'model_revision:' $CONFIG_FILE | head -n 1 | awk '{print $2}') -# Training setup +# Distributed configuration NUM_NODES=$SLURM_NNODES GPUS_PER_NODE=8 WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) -# Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match -CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml -GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') +NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST)) +MASTER_ADDR=${NODELIST[0]} # First node for main process +MASTER_PORT=6000 +TRAIN_NODES=("${NODELIST[@]}") -# Check if we are running vLLM during training to adjust the world size -if grep -q 'use_vllm:\s*true' "$CONFIG_FILE"; then +USE_VLLM="false" +if [[ -f "$CONFIG_FILE" ]] && grep -qE '^\s*use_vllm:\s*true' "$CONFIG_FILE"; then USE_VLLM="true" -else - USE_VLLM="false" fi - +# if using vllm if [[ "$USE_VLLM" == "true" ]]; then - WORLD_SIZE=$(($WORLD_SIZE - 1)) + TRAIN_NODES=("${NODELIST[@]:0:$((NUM_NODES - 1))}") + VLLM_NODE=${NODELIST[-1]} # Last node + TP=$(python scripts/get_tensor_parallel_size.py --model_name $MODEL --revision $REVISION --default_tp $GPUS_PER_NODE) + WORLD_SIZE=$((WORLD_SIZE - GPUS_PER_NODE)) + NUM_NODES=$((NUM_NODES - 1)) + srun --nodes=1 --ntasks=1 --nodelist=$VLLM_NODE trl vllm-serve --model $MODEL --revision $REVISION --tensor_parallel_size $TP & + + OPTIONAL_ARGS="$OPTIONAL_ARGS --vllm_server_host=$VLLM_NODE" fi -# Split the string into individual arguments -IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS" - -# Loop through the arguments and find the one with "--gradient_accumulation_steps" -for arg in "${ARGS[@]}"; do - if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then - # Extract the value after the equals sign - GRAD_ACC_STEPS="${arg#*=}" - break # Exit the loop once we find the desired argument - fi -done - -echo "Gradient accumulation steps: $GRAD_ACC_STEPS" -# so processes know who to talk to -MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) -MASTER_PORT=6000 +# force crashing on nccl issues like hanging broadcast +export NCCL_ASYNC_ERROR_HANDLING=1 +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=COLL +# export NCCL_SOCKET_NTHREADS=1 +# export NCCL_NSOCKS_PERTHREAD=1 +# export CUDA_LAUNCH_BLOCKING=1 export CMD=" \ src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS @@ -72,29 +70,22 @@ export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORM --num_processes $WORLD_SIZE \ --main_process_ip $MASTER_ADDR \ --main_process_port $MASTER_PORT \ - --machine_rank \$SLURM_PROCID \ - --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \ + --machine_rank $SLURM_PROCID \ + --rdzv_backend=c10d \ --max_restarts 1 \ --role \$(hostname -s): \ --tee 3 \ " - -# force crashing on nccl issues like hanging broadcast -export NCCL_ASYNC_ERROR_HANDLING=1 -# export NCCL_DEBUG=INFO -# export NCCL_DEBUG_SUBSYS=COLL 
-# export NCCL_SOCKET_NTHREADS=1 -# export NCCL_NSOCKS_PERTHREAD=1 -# export CUDA_LAUNCH_BLOCKING=1 - # srun error handling: # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code SRUN_ARGS=" \ --wait=60 \ --kill-on-bad-exit=1 \ + --nodes=$NUM_NODES \ + --ntasks=$NUM_NODES \ + --nodelist=$TRAIN_NODES " - clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1 echo "END TIME: $(date)" \ No newline at end of file From 4ec555b0c87b3c6734733c17486bfe8da1e0d595 Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 27 Mar 2025 10:29:07 +0100 Subject: [PATCH 090/137] Restore single-node instructions to run GRPO (#549) --- README.md | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 9e1ac3d75..756e1a2a0 100644 --- a/README.md +++ b/README.md @@ -160,14 +160,31 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con ``` ### GRPO -We use TRL's new distributed vLLM server and GRPOTraining in order to scale to larger >7B models. We provide an example slurm script: + +We use TRL's [vLLM backend](https://huggingface.co/docs/trl/speeding_up_training?vllm+examples=GRPO#vllm-for-fast-generation-in-online-methods) to scale training to large models across multiple nodes. For single-node training of smol models across 8 GPUs, first spin up the vLLM server to run on e.g. 1 GPU as follows: + ```shell -sbatch --job-name=trl-Qwen2.5-Math-7B-config_simple_rl --nodes=2 slurm/train.slurm Qwen2.5-Math-7B grpo config_simple_rl zero3 +CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B ``` -You will need to adapt the `slurm/train.slurm` script to match your cluster. +Once the server is up, run training on the remaining GPUs as follows: + +```shell +CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ + accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes 7 \ + src/open_r1/grpo.py --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml +``` + +> [!WARNING] +> The chat template used in the distilled DeepSeek models omits the contents of the reasoning block within the `` and `` tags. It also prefills the assistant response with `` which interferes with the format reward function. To handle that, it is important to override the chat template as done in e.g. [recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml](./recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml). + +For multi-node training, we provide an example Slurm script: -Our final [model](https://huggingface.co/Dongwei/Qwen-2.5-7B_Base_Math_smalllr), while using different learning rates, loss functions and reward structures, achieves 69.4% accuracy on MATH-500, demonstrating a 17%+ improvement over the base model. +```shell +sbatch --nodes=2 slurm/train.slurm Qwen2.5-Math-7B grpo config_simple_rl zero3 +``` + +You will need to adapt the `slurm/train.slurm` script to match your cluster. 
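For reference, a standalone sketch of the head-count check performed by `scripts/get_tensor_parallel_size.py` (added earlier in this patch) when picking the vLLM tensor-parallel size; the example head counts are hypothetical:

```python
# Sketch of the TP-size fallback: if the model's attention-head count does not
# divide evenly by the default TP size (GPUs per node), fall back to
# gcd(num_heads, default_tp), as the script above does.
from math import gcd

def pick_tp(num_heads: int, default_tp: int = 8) -> int:
    if num_heads % default_tp != 0:
        return max(gcd(num_heads, default_tp), 1)
    return default_tp

print(pick_tp(28))  # 4: a hypothetical model with 28 heads on an 8-GPU node
print(pick_tp(32))  # 8: divisible, so the full node is used
```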
#### 👨‍💻 Training with a code interpreter @@ -198,12 +215,18 @@ Then make sure your dataset contains a `verification_info` column with the follo } ``` -For example, to train a smol model on Python problems, run: +For example, to train a smol model on Python problems, start the vLLM server: ```shell -ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ - --num_processes=7 src/open_r1/grpo.py \ - --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-1.5B-Instruct +``` + +Then run training with: + +```shell +CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ + accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes=7 + src/open_r1/grpo.py --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml ``` #### IOI problems @@ -214,6 +237,7 @@ To get piston workers running, see [slurm/piston/README.md](./slurm/piston/READM Set your environment variable `PISTON_ENDPOINTS` to `slurm` or to a list of piston worker endpoints. See the [example recipe](./recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml) for how to use the reward function: + ```shell ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ --num_processes=7 src/open_r1/grpo.py \ From 1802bec75f4342d3b9313ba72b2ba45bab2b918d Mon Sep 17 00:00:00 2001 From: Zhou Shao Date: Fri, 28 Mar 2025 20:17:04 +0800 Subject: [PATCH 091/137] fix dataset parsing error (#540) * fix dataset parsing error support defined question field to fix errors when datasets' question field is not 'problem' * add question field config add script_args: question field * refactor: datasets prompt column --------- Co-authored-by: lewtun --- .gitignore | 3 ++- src/open_r1/configs.py | 4 ++++ src/open_r1/grpo.py | 7 +++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index f4db78191..fe9af19a2 100644 --- a/.gitignore +++ b/.gitignore @@ -177,4 +177,5 @@ logs/ eval_results/ results/ -.vscode/ \ No newline at end of file +.vscode/ +.python-version \ No newline at end of file diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index b341e0bdd..bf6f03b3c 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -154,3 +154,7 @@ class GRPOScriptArguments(trl.ScriptArguments): "help": "for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. 
Useful to avoid overloading the eval server + save time on wrong solutions" }, ) + dataset_prompt_column: str = field( + default="prompt", + metadata={"help": "Column to use as prompts for training."}, + ) diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index c681b8194..af394c26c 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -84,13 +84,16 @@ def main(script_args, training_args, model_args): reward_funcs = get_reward_funcs(script_args) # Format into conversation - def make_conversation(example): + def make_conversation(example, prompt_column: str = script_args.dataset_prompt_column): prompt = [] if training_args.system_prompt is not None: prompt.append({"role": "system", "content": training_args.system_prompt}) - prompt.append({"role": "user", "content": example["problem"]}) + if prompt_column not in example: + raise ValueError(f"Dataset Question Field Error: {prompt_column} is not supported.") + + prompt.append({"role": "user", "content": example[prompt_column]}) return {"prompt": prompt} dataset = dataset.map(make_conversation) From 9915e06f1e2e5cdf580b706110734e2a24146a4d Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Fri, 28 Mar 2025 14:08:15 +0100 Subject: [PATCH 092/137] Async code reward fixes (#546) * expose num parallel code executions * add e2b benchmarking script * adds new parallel code execution with better execption handling * style * update default * increase sandbox timeout * Add pretty table and Sandbox IDs * Add Sandbox ID * fix merge --------- Co-authored-by: Lewis Tunstall --- scripts/benchmark_e2b.py | 84 ++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- src/open_r1/configs.py | 7 ++++ src/open_r1/rewards.py | 84 ++++++++++++++++++++++++++++------------ 4 files changed, 152 insertions(+), 25 deletions(-) create mode 100644 scripts/benchmark_e2b.py diff --git a/scripts/benchmark_e2b.py b/scripts/benchmark_e2b.py new file mode 100644 index 000000000..cbaca41de --- /dev/null +++ b/scripts/benchmark_e2b.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Benchmark script for the code_reward function with E2B. + +This script measures the performance of the code_reward function with varying numbers +of samples and parallelization levels. + +Each sample is a CodeForces problem with a gold standard solution that is executed against a set of public test cases. 
+""" + +from datasets import load_dataset +from open_r1.rewards import code_reward +import time +from tqdm.auto import tqdm + +from dotenv import load_dotenv +load_dotenv() + +def benchmark_code_reward(example): + start_time = time.time() + test_completions = [[{"content": example["gold_standard_solution"]}]] + reward_kwargs = {"verification_info": [example["verification_info"]]} + rewards = code_reward(test_completions, **reward_kwargs) + end_time = time.time() + example["test_reward"] = rewards[0] + example["reward_time"] = end_time - start_time + return example + +if __name__ == "__main__": + parallel_dict = { + 16:[1,4,16], + 64:[4,16, 64], + 256:[16, 64, 96], # cap at 96 as PRO account is limited to 100 + } + # Store results for table formatting + results = [] + + for num_samples in tqdm([16, 64,256], desc="Benchmarking samples"): + for num_parallel in parallel_dict[num_samples]: + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated") + code_dataset = code_dataset["train"].shuffle(seed=42).select(range(num_samples)) + + test_completions = [[{"content": example["gold_standard_solution"]}] for example in code_dataset] + reward_kwargs = {"verification_info": [example["verification_info"] for example in code_dataset]} + + start_time = time.time() + rewards = code_reward(test_completions, num_parallel=num_parallel, **reward_kwargs) + execution_time = time.time() - start_time + + # Calculate some statistics about rewards + mean_reward = sum(rewards) / len(rewards) + min_reward = min(rewards) + max_reward = max(rewards) + + # Store results + results.append({ + "num_samples": num_samples, + "num_parallel": num_parallel, + "execution_time": execution_time, + "mean_reward": mean_reward, + "min_reward": min_reward, + "max_reward": max_reward + }) + + print("\n## Benchmark Results\n") + print("| Sample Size | Parallelization | Execution Time (s) | Mean Reward | Min Reward | Max Reward |") + print("|:-----------:|:---------------:|------------------:|:-----------:|:-----------:|:-----------:|") + + for result in results: + print(f"| {result['num_samples']:^11} | {result['num_parallel']:^15} | {result['execution_time']:17.2f} | {result['mean_reward']:^11.4f} | {result['min_reward']:^11.4f} | {result['max_reward']:^11.4f} |") + \ No newline at end of file diff --git a/setup.py b/setup.py index a9f46b6fb..920697277 100644 --- a/setup.py +++ b/setup.py @@ -91,7 +91,7 @@ def deps_list(*pkgs): extras["quality"] = deps_list("ruff", "isort", "flake8") extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv") extras["eval"] = deps_list("lighteval", "math-verify") -extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] +extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["code"] # core dependencies shared across the whole project - keep this to a bare minimum :) install_requires = [ diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index bf6f03b3c..7d304c536 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -154,6 +154,13 @@ class GRPOScriptArguments(trl.ScriptArguments): "help": "for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions" }, ) + parallel_code_exec_per_proc: int = field( + default=2, + metadata={ + "help": "Number of parallel E2B code executions per process. 
Default of 2 is suitable for the Free Hobby tier of E2B with 8 GPUs used for training." + }, + ) + dataset_prompt_column: str = field( default="prompt", metadata={"help": "Column to use as prompts for training."}, diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 6720b385d..4a4e4b4d6 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Reward functions for GRPO training.""" import asyncio @@ -368,13 +383,13 @@ def extract_code(completion: str, language: str = "python") -> str: return extracted_answer -def binary_code_reward(completions, **kwargs) -> list[float]: - rewards = code_reward(completions, **kwargs) +def binary_code_reward(completions, num_parallel: int = 2, **kwargs) -> list[float]: + rewards = code_reward(completions, num_parallel=num_parallel, **kwargs) BINARY_THRESHOLD = 0.99 return [1.0 if reward > BINARY_THRESHOLD else 0.0 for reward in rewards] -def code_reward(completions, **kwargs) -> list[float]: +def code_reward(completions, num_parallel: int = 2, **kwargs) -> list[float]: """Reward function that evaluates code snippets using the E2B code interpreter. Assumes the dataset contains a `verification_info` column with test cases. 
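As a rough illustration (not part of the patch) of the concurrency pattern introduced in the hunks below: a semaphore caps how many E2B sandboxes run at once (`num_parallel`), while `asyncio.gather` still returns one reward per script, in order. The sandbox call is replaced by a sleep here:

```python
# Sketch only: semaphore-limited async execution, standing in for per-script sandboxes.
import asyncio

async def run_one(script: str, sem: asyncio.Semaphore) -> float:
    async with sem:
        await asyncio.sleep(0.1)  # stand-in for creating a sandbox and running `script`
        return 1.0

async def run_all(scripts: list[str], num_parallel: int = 2) -> list[float]:
    sem = asyncio.Semaphore(num_parallel)
    return await asyncio.gather(*(run_one(s, sem) for s in scripts))

print(asyncio.run(run_all(["print(1)"] * 8)))  # eight rewards, at most two sandboxes at a time
```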
@@ -438,7 +453,7 @@ def evaluate_code(code, test_cases): if not all(v["language"] == language for v in verification_info): raise ValueError("All verification_info must have the same language", verification_info) try: - rewards = run_async_from_sync(scripts, language) + rewards = run_async_from_sync(scripts, language, num_parallel) except Exception as e: print(f"Error from E2B executor: {e}") @@ -463,12 +478,12 @@ def code_format_reward(completions, **kwargs): return code_format_reward -def run_async_from_sync(scripts: list[str], language: str) -> list[float]: +def run_async_from_sync(scripts: list[str], language: str, num_parallel: int) -> list[float]: """Function wrapping the `run_async` function.""" # Create a new event loop and set it try: # Run the async function and get the result - rewards = asyncio.run(run_async(scripts, language)) + rewards = asyncio.run(run_async(scripts, language, num_parallel)) except Exception as e: print(f"Error from E2B executor async: {e}") raise e @@ -476,32 +491,49 @@ def run_async_from_sync(scripts: list[str], language: str) -> list[float]: return rewards -async def run_async(scripts: list[str], language: str) -> list[float]: - # Create the sandbox by hand, currently there's no context manager for this version - sbx = await AsyncSandbox.create(timeout=30, request_timeout=3) +async def run_async(scripts: list[str], language: str, num_parallel: int) -> list[float]: + # Limit the number of concurrent tasks + semaphore = asyncio.Semaphore(num_parallel) # Create a list of tasks for running scripts concurrently - tasks = [run_script(sbx, script, language) for script in scripts] + tasks = [run_script(script, language, semaphore) for script in scripts] # Wait for all tasks to complete and gather their results as they finish results = await asyncio.gather(*tasks) rewards = list(results) # collect results - # Kill the sandbox after all the tasks are complete - await sbx.kill() - return rewards -async def run_script(sbx: AsyncSandbox, script: str, language: str) -> float: - execution = await sbx.run_code(script, language=language) - try: - return float(execution.text) - except (TypeError, ValueError): - return 0.0 - except Exception as e: - print(f"Error from E2B executor run_script: {e}") - return 0.0 +async def run_script(script: str, language: str, semaphore: asyncio.Semaphore) -> float: + # We set a timeout margin, as the AsyncSandbox timeout does not seem to work + # These values are based on running 256 examples with the gold solution + # from open-r1/verifiable-coding-problems-python_decontaminated + # see scripts/benchmark_e2b.py + + SANDBOX_TIMEOUT = 30 + MARGIN = 2 + REQUEST_TIMEOUT = SANDBOX_TIMEOUT - MARGIN + ASYNCIO_TIMEOUT = SANDBOX_TIMEOUT + MARGIN + + async with semaphore: + try: + sandbox = await AsyncSandbox.create(timeout=SANDBOX_TIMEOUT, request_timeout=REQUEST_TIMEOUT) + execution = await asyncio.wait_for(sandbox.run_code(script, language=language), timeout=ASYNCIO_TIMEOUT) + return float(execution.text) + except (TypeError, ValueError): + return 0.0 + except asyncio.TimeoutError: + print("Operation timed out") + return 0.0 + except Exception as e: + print(f"Error in `run_script` from E2B sandbox ID {sandbox.sandbox_id} : {e}") + return 0.0 + finally: + try: + await sandbox.kill() + except Exception as e: + print(f"Error from E2B executor kill with sandbox ID {sandbox.sandbox_id} : {e}") def get_reward_funcs(script_args) -> list[Callable]: @@ -521,8 +553,12 @@ def get_reward_funcs(script_args) -> list[Callable]: 
max_penalty=script_args.repetition_max_penalty, ), "length": len_reward, - "code": code_reward, - "binary_code": binary_code_reward, + "code": update_wrapper( + partial(code_reward, num_parallel=script_args.parallel_code_exec_per_proc), code_reward + ), + "binary_code": update_wrapper( + partial(binary_code_reward, num_parallel=script_args.parallel_code_exec_per_proc), binary_code_reward + ), "ioi_code": update_wrapper( partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward ), From 4f5b21e21dec473af9729bce8e084deb16223ae4 Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 1 Apr 2025 12:04:26 +0200 Subject: [PATCH 093/137] Fix accuracy reward for math (#566) * Fix accuracy reward for math * Add typing * Add unit test * Return None for invalid samples * Fix order of answers * Fix type * Use None for non-verifiable answers --- src/open_r1/rewards.py | 15 +++++++-------- tests/test_rewards.py | 6 ++++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 4a4e4b4d6..76797868c 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -20,7 +20,7 @@ import math import re from functools import partial, update_wrapper -from typing import Callable, Dict +from typing import Callable, Dict, Optional from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify @@ -38,7 +38,7 @@ AsyncSandbox = None -def accuracy_reward(completions, solution, **kwargs): +def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str], **kwargs) -> list[Optional[float]]: """Reward function that checks if the completion is the same as the ground truth.""" contents = [completion[0]["content"] for completion in completions] rewards = [] @@ -46,7 +46,6 @@ def accuracy_reward(completions, solution, **kwargs): gold_parsed = parse( sol, extraction_mode="first_match", - extraction_config=[LatexExtractionConfig()], ) if len(gold_parsed) != 0: # We require the answer to be provided in correct latex (no malformed operators) @@ -69,15 +68,15 @@ def accuracy_reward(completions, solution, **kwargs): ], extraction_mode="first_match", ) - # Reward 1 if the content is the same as the ground truth, 0 otherwise + # Compute binary rewards if verifiable, `None` otherwise to skip this example try: - reward = float(verify(answer_parsed, gold_parsed)) + reward = float(verify(gold_parsed, answer_parsed)) except Exception as e: print(f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}") - reward = 0.0 + reward = None else: - # If the gold solution is not parseable, we reward 1 to skip this example - reward = 1.0 + # If the gold solution is not parseable, we assign `None` to skip this example + reward = None print("Failed to parse gold solution: ", sol) rewards.append(reward) diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 51b544f82..3a9df6100 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -82,7 +82,13 @@ def test_accuracy_reward_wrong_answer(self): """Test accuracy_reward with an incorrect answer.""" completion = [[{"content": r"\boxed{\frac{64}{400}}"}]] solution = [r"\frac{63}{400}"] + rewards = accuracy_reward(completion, solution) + self.assertEqual(rewards[0], 0.0) + def test_accuracy_reward_wrong_answer_no_latex(self): + """Test accuracy_reward with an incorrect answer and gold solution with no latex.""" + completion = [[{"content": r"\boxed{3}"}]] + solution = ["6"] rewards = accuracy_reward(completion, 
solution) self.assertEqual(rewards[0], 0.0) From ca8664df1c57753f067e92fc4f48bb9819b53421 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 2 Apr 2025 15:48:48 +0200 Subject: [PATCH 094/137] Fix missing prompt columns in recipes (#574) --- recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml | 1 + recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml | 1 + recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml | 1 + recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml | 1 + recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml | 1 + recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml | 1 + 6 files changed, 6 insertions(+) diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml index 8eaf32ca8..ee416a848 100644 --- a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml @@ -8,6 +8,7 @@ attn_implementation: flash_attention_2 # We edit the DeepSeek chat template to ensure (a) the reasoning block within and is included in the completion and (b) the tag is not part of the prefill so that the format reward works chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" dataset_name: open-r1/OpenR1-Math-220k +dataset_prompt_column: problem system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml index 5f3b69222..d1a2a6bce 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml @@ -6,6 +6,7 @@ attn_implementation: flash_attention_2 # Data training arguments dataset_name: open-r1/OpenR1-Math-220k +dataset_prompt_column: problem system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml index 768504960..1c694b1cc 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml @@ -6,6 +6,7 @@ attn_implementation: flash_attention_2 # Data training arguments dataset_name: open-r1/verifiable-coding-problems-python +dataset_prompt_column: problem_statement system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config diff --git a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml index c032b1641..7ec23c6f1 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml @@ -6,6 +6,7 @@ attn_implementation: flash_attention_2 # Data training arguments dataset_name: open-r1/ioi +dataset_prompt_column: problem system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml index 886e1f270..be6f06d79 100644 --- a/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml +++ b/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml @@ -6,6 +6,7 @@ attn_implementation: flash_attention_2 # Data training arguments dataset_name: open-r1/OpenR1-Math-cn_k12-86k +dataset_prompt_column: problem system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" # GRPO trainer config diff --git a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml index 280e72167..d707693d3 100644 --- a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml +++ b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml @@ -7,6 +7,7 @@ attn_implementation: flash_attention_2 # Data training arguments dataset_name: DigitalLearningGmbH/MATH-lighteval dataset_config: default +dataset_prompt_column: problem system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. 
You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." # GRPO trainer config From 2636a2130fac92cec79b0781101dedb888a9aa18 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 2 Apr 2025 15:48:59 +0200 Subject: [PATCH 095/137] Add WandB groups to logging (#573) --- src/open_r1/configs.py | 8 ++++++++ src/open_r1/utils/wandb_logging.py | 2 ++ 2 files changed, 10 insertions(+) diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 7d304c536..0750e2a6d 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -50,6 +50,10 @@ class GRPOConfig(trl.GRPOConfig): default=None, metadata={"help": ("The project to store runs under.")}, ) + wandb_run_group: Optional[str] = field( + default=None, + metadata={"help": ("The group to store runs under.")}, + ) @dataclass @@ -83,6 +87,10 @@ class SFTConfig(trl.SFTConfig): default=None, metadata={"help": ("The project to store runs under.")}, ) + wandb_run_group: Optional[str] = field( + default=None, + metadata={"help": ("The group to store runs under.")}, + ) @dataclass diff --git a/src/open_r1/utils/wandb_logging.py b/src/open_r1/utils/wandb_logging.py index 13b552766..e52f911c8 100644 --- a/src/open_r1/utils/wandb_logging.py +++ b/src/open_r1/utils/wandb_logging.py @@ -9,3 +9,5 @@ def init_wandb_training(training_args): os.environ["WANDB_ENTITY"] = training_args.wandb_entity if training_args.wandb_project is not None: os.environ["WANDB_PROJECT"] = training_args.wandb_project + if training_args.wandb_run_group is not None: + os.environ["WANDB_RUN_GROUP"] = training_args.wandb_run_group From 1b3bf043dc24693d1213843e16efcfb8d0ce4bd2 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Mon, 7 Apr 2025 21:01:06 +0200 Subject: [PATCH 096/137] Adds a E2B router server that executes batches of scripts (#561) * adds a dedicated e2b server to handle batches of requests * fix reward tests * update slow reward * style * updates e2b router to be more generic * refactor * refactoring * licence, cleanup * update tests * style * fix import when e2b not present * style * rename sandbox file * rename to RoutedSandbox * update readme * nits * nits2 * unlimited max time * update logs path --- README.md | 16 +++ scripts/e2b_router.py | 162 ++++++++++++++++++++++++++++ slurm/e2b_router.slurm | 16 +++ src/open_r1/configs.py | 5 + src/open_r1/rewards.py | 44 ++++++-- src/open_r1/utils/routed_sandbox.py | 100 +++++++++++++++++ tests/slow/test_code_reward.py | 68 ++++++++++++ 7 files changed, 404 insertions(+), 7 deletions(-) create mode 100644 scripts/e2b_router.py create mode 100644 slurm/e2b_router.slurm create mode 100644 src/open_r1/utils/routed_sandbox.py diff --git a/README.md b/README.md index 756e1a2a0..c3e5c5dcc 100644 --- a/README.md +++ b/README.md @@ -229,6 +229,22 @@ CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ src/open_r1/grpo.py --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml ``` +It is possible to be rate limited when too many scripts are executed on E2B Sandboxes, so we provide an E2B router script that can be launched on a CPU node on your cluster: + +For GRPO training: +First start the router and get its IP + +```shell +sbatch slurm/e2b_router.slurm +``` + +Then add this line in your training YAML config: (for example) +``` +e2b_router_url: 1.2.3.4:8000 +``` +The port here should match the one used when launching the router. 
+All training jobs can share the same router IP which will ensure there are at most 20 parallel executions. + #### IOI problems We provide a `ioi_code_reward` reward function for executing problems from [IOI](https://hf.co/datasets/open-r1/ioi) using [piston](https://github.com/engineer-man/piston). diff --git a/scripts/e2b_router.py b/scripts/e2b_router.py new file mode 100644 index 000000000..75bddfbb8 --- /dev/null +++ b/scripts/e2b_router.py @@ -0,0 +1,162 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import asyncio +from fastapi import FastAPI +from pydantic import BaseModel, ConfigDict +from typing import Optional +from fastapi import FastAPI, Request +import argparse +import asyncio +from fastapi import FastAPI +import uvicorn +from e2b_code_interpreter.models import Execution + +from dotenv import load_dotenv +from e2b_code_interpreter import AsyncSandbox + +load_dotenv() + +class BatchRequest(BaseModel): + """ + BatchRequest is a data model representing a batch processing request. + + Attributes: + scripts (list[str]): A list of script names or paths to be executed. + language (str): The programming language in which the scripts are written. + timeout (int): The maximum allowed execution time for each script in seconds. + request_timeout (int): The maximum allowed time for the entire batch request in seconds. + """ + scripts: list[str] + language: str + timeout: int + request_timeout: int + +class ScriptResult(BaseModel): + """ + ScriptResult is a Pydantic model that represents the result of a script execution. + Attributes: + execution (Optional[Execution]): An optional instance of the `Execution` class + that contains details about the script's execution, such as status, output, + or any other relevant metadata. + exception_str (Optional[str]): An optional string that captures the exception + message or details if an error occurred during the script's execution. + model_config (ConfigDict): A configuration dictionary that allows arbitrary + types to be used within the Pydantic model. This is necessary to support + custom types like `Execution` within the model. + """ + execution: Optional[Execution] + exception_str: Optional[str] + + # required to allow arbitrary types in pydantic models such as Execution + model_config = ConfigDict(arbitrary_types_allowed=True) + +def create_app(args): + """ + Creates and configures a FastAPI application instance. + Args: + args: An object containing configuration parameters for the application. + - num_sandboxes (int): The maximum number of concurrent sandboxes allowed. + Returns: + FastAPI: A configured FastAPI application instance. + The application includes the following endpoints: + 1. GET /health: + - Returns the health status of the application. + - Response: {"status": "ok"} + 2. POST /execute_batch: + - Executes a batch of scripts in an isolated sandbox environment. 
+ - Request Body: BatchRequest object containing: + - language (str): The programming language of the scripts (python or javascript). + - timeout (int): The maximum execution time for each script. + - request_timeout (int): The timeout for the request itself. + - scripts (List[str]): A list of scripts to execute. + - Response: A list of ScriptResult objects for each script, containing: + - execution: The result of the script execution. + - exception_str: Any exception encountered during execution. + Notes: + - A semaphore is used to limit the number of concurrent sandboxes. + - Each script execution is wrapped in a timeout to prevent hanging. + - Sandboxes are cleaned up after execution, even in case of errors. + """ + app = FastAPI() + + # Instantiate semaphore and attach it to app state + app.state.sandbox_semaphore = asyncio.Semaphore(args.max_num_sandboxes) + + @app.get("/health") + async def health(): + return {"status": "ok"} + + @app.post("/execute_batch") + async def execute_batch(batch: BatchRequest, request: Request): + semaphore = request.app.state.sandbox_semaphore + language = batch.language + timeout = batch.timeout + request_timeout = batch.request_timeout + asyncio_timeout = batch.timeout + 1 + + async def run_script(script: str) -> ScriptResult: + try: + async with semaphore: + sandbox = await AsyncSandbox.create( + timeout=timeout, + request_timeout=request_timeout, + ) + execution = await asyncio.wait_for( + sandbox.run_code(script, language=language), + timeout=asyncio_timeout, + ) + # note that execution.to_json() exists but does not serialize Result.is_main_result + return ScriptResult(execution=execution, exception_str=None) + except Exception as e: + return ScriptResult(execution=None, exception_str=str(e)) + + finally: + try: + await sandbox.kill() + except Exception as e: + # do nothing + pass + + tasks = [run_script(script) for script in batch.scripts] + return await asyncio.gather(*tasks) + + return app + + +def parse_args(): + """ + Parse command-line arguments for the e2b_router script. + + Arguments: + --host (str): The hostname or IP address to bind the server to. Defaults to "0.0.0.0" (binds to all interfaces). + --port (int): The port number on which the server will listen. Defaults to 8000. + --max_num_sandboxes (int): The maximum number of sandboxes that can be created or managed simultaneously. Defaults to 20. + + Returns: + argparse.Namespace: Parsed command-line arguments as an object. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument("--host", default="0.0.0.0") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--max_num_sandboxes", type=int, default=20) + return parser.parse_args() + +if __name__ == "__main__": + args = parse_args() + app = create_app(args) + + uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file diff --git a/slurm/e2b_router.slurm b/slurm/e2b_router.slurm new file mode 100644 index 000000000..5be4bfebc --- /dev/null +++ b/slurm/e2b_router.slurm @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --partition=hopper-cpu +#SBATCH --mem=16g +#SBATCH --cpus-per-task=16 +#SBATCH --output=/fsx/open-r1/logs/e2b_router/%x-%j.out +#SBATCH --err=/fsx/open-r1/logs/e2b_router/%x-%j.err +#SBATCH --requeue + +echo "Starting job" +set -x -e + +source ~/.bashrc +source openr1/bin/activate + +srun python scripts/e2b_router.py \ No newline at end of file diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 0750e2a6d..b24dd305e 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -173,3 +173,8 @@ class GRPOScriptArguments(trl.ScriptArguments): default="prompt", metadata={"help": "Column to use as prompts for training."}, ) + + e2b_router_url: Optional[str] = field( + default=None, + metadata={"help": "URL for the E2B route. See scripts/e2b_router.py"}, + ) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 76797868c..49e25f42e 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -33,6 +33,8 @@ from dotenv import load_dotenv from e2b_code_interpreter import AsyncSandbox + from .utils.routed_sandbox import RoutedSandbox + load_dotenv() else: AsyncSandbox = None @@ -382,13 +384,13 @@ def extract_code(completion: str, language: str = "python") -> str: return extracted_answer -def binary_code_reward(completions, num_parallel: int = 2, **kwargs) -> list[float]: - rewards = code_reward(completions, num_parallel=num_parallel, **kwargs) +def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: + rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs) BINARY_THRESHOLD = 0.99 return [1.0 if reward > BINARY_THRESHOLD else 0.0 for reward in rewards] -def code_reward(completions, num_parallel: int = 2, **kwargs) -> list[float]: +def code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: """Reward function that evaluates code snippets using the E2B code interpreter. Assumes the dataset contains a `verification_info` column with test cases. 
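For reference, a sketch of the raw request that the `RoutedSandbox` client (added in the hunks below) issues against the router started via `scripts/e2b_router.py`; the host and port are the script's defaults, and the timeouts match the values used by `code_reward`:

```python
# Sketch only: calling the router's /execute_batch endpoint directly.
import requests

payload = {
    "scripts": ["print('hello world')"],
    "language": "python",
    "timeout": 30,
    "request_timeout": 28,
}
response = requests.post("http://0.0.0.0:8000/execute_batch", json=payload)
for result in response.json():
    # each entry mirrors ScriptResult: an `execution` payload or an `exception_str`
    print(result["execution"], result["exception_str"])
```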
@@ -448,12 +450,30 @@ def evaluate_code(code, test_cases): ] language = verification_info[0]["language"] - if not all(v["language"] == language for v in verification_info): raise ValueError("All verification_info must have the same language", verification_info) + + if e2b_router_url is not None: + router_sandbox = RoutedSandbox(router_url=e2b_router_url) + + executions = router_sandbox.run_code( + scripts=scripts, + language=language, + timeout=30, + request_timeout=28, + ) + + rewards = [] + for execution in executions: + try: + reward = float(execution.text) + rewards.append(reward) + except Exception: + rewards.append(None) + return rewards + try: rewards = run_async_from_sync(scripts, language, num_parallel) - except Exception as e: print(f"Error from E2B executor: {e}") rewards = [0.0] * len(completions) @@ -553,10 +573,20 @@ def get_reward_funcs(script_args) -> list[Callable]: ), "length": len_reward, "code": update_wrapper( - partial(code_reward, num_parallel=script_args.parallel_code_exec_per_proc), code_reward + partial( + code_reward, + num_parallel=script_args.parallel_code_exec_per_proc, + e2b_router_url=script_args.e2b_router_url, + ), + code_reward, ), "binary_code": update_wrapper( - partial(binary_code_reward, num_parallel=script_args.parallel_code_exec_per_proc), binary_code_reward + partial( + binary_code_reward, + num_parallel=script_args.parallel_code_exec_per_proc, + e2b_router_url=script_args.e2b_router_url, + ), + binary_code_reward, ), "ioi_code": update_wrapper( partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward diff --git a/src/open_r1/utils/routed_sandbox.py b/src/open_r1/utils/routed_sandbox.py new file mode 100644 index 000000000..c03904691 --- /dev/null +++ b/src/open_r1/utils/routed_sandbox.py @@ -0,0 +1,100 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import requests +from e2b_code_interpreter.models import Execution, ExecutionError, Result + + +class RoutedSandbox: + """ + A sandbox environment that routes code execution requests to the E2B Router. + This class is designed for batched execution of scripts, primarily for Python code. + It mimics the usage of 'Sandbox' from 'e2b_code_interpreter', but adds support for batch processing. + + Attributes: + router_url (str): The URL of the E2B Router to which code execution requests are sent. + """ + + def __init__(self, router_url: str): + """ + Initializes the RoutedSandbox with the specified router URL. + + Args: + router_url (str): The URL of the E2B Router. + """ + self.router_url = router_url + + def run_code( + self, + scripts: list[str], + language: str = "python", + timeout: Optional[int] = None, + request_timeout: Optional[int] = None, + ) -> list[Execution]: + """ + Executes a batch of scripts in the sandbox environment. + + Args: + scripts (list[str]): A list of code scripts to execute. 
+ language (str, optional): The programming language of the scripts. Defaults to "python". + timeout (Optional[int], optional): The maximum execution time for each script in seconds. Defaults to 300 seconds. + request_timeout (Optional[int], optional): The timeout for the HTTP request in seconds. Defaults to 30 seconds. + + Returns: + list[Execution]: A list of Execution objects containing the results, logs, and errors (if any) for each script. + """ + # Set default values for timeouts if not provided + if timeout is None: + timeout = 300 # Default to 5 minutes + if request_timeout is None: + request_timeout = 30 # Default to 30 seconds + + # Prepare the payload for the HTTP POST request + payload = { + "scripts": scripts, + "language": language, + "timeout": timeout, + "request_timeout": request_timeout, + } + + # Send the request to the E2B Router + response = requests.post(f"http://{self.router_url}/execute_batch", json=payload) + if not response.ok: + print(f"Request failed with status code: {response.status_code}") + + # Parse the response and construct Execution objects + results = response.json() + output = [] + for result in results: + execution = Execution( + results=[Result(**r) for r in result["execution"]["results"]], + logs=result["execution"]["logs"], + error=ExecutionError(**result["execution"]["error"]) if result["execution"]["error"] else None, + execution_count=result["execution"]["execution_count"], + ) + output.append(execution) + + return output + + +if __name__ == "__main__": + # for local testing launch an E2B router with: python scripts/e2b_router.py + sbx = RoutedSandbox(router_url="0.0.0.0:8000") + codes = ["print('hello world')", "print('hello world)"] + executions = sbx.run_code(codes) # Execute Python inside the sandbox + + print(executions) diff --git a/tests/slow/test_code_reward.py b/tests/slow/test_code_reward.py index f3f985a98..c225c8c13 100644 --- a/tests/slow/test_code_reward.py +++ b/tests/slow/test_code_reward.py @@ -17,7 +17,9 @@ from datasets import load_dataset +from e2b_code_interpreter.models import Execution, ExecutionError from open_r1.rewards import code_reward, ioi_code_reward +from open_r1.utils.routed_sandbox import RoutedSandbox class TestCodeRewards(unittest.TestCase): @@ -32,6 +34,36 @@ def test_python_code_reward(self): print(rewards) assert rewards == [1.0] * NUM_SAMPLES + def test_e2b_router(self): + # run router locally: python scripts/e2b_router.py + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested") + NUM_SAMPLES = 128 + samples = code_dataset["train"].select(range(NUM_SAMPLES)) + test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] + reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]} + rewards = code_reward(test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs) + print(rewards) + assert rewards == [1.0] * NUM_SAMPLES + + def test_e2b_router_parallel(self): + # run router locally: python scripts/e2b_router.py + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested") + + BATCH_SIZE = 32 + NUM_SAMPLES = 256 + + def batch_code_reward(examples): + test_completions = [[{"content": solution}] for solution in examples["gold_standard_solution"]] + reward_kwargs = { + "verification_info": [verification_info for verification_info in examples["verification_info"]] + } + rewards = code_reward(test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs) + assert rewards 
== [1.0] * BATCH_SIZE + return examples + + code_dataset = code_dataset["train"].select(range(NUM_SAMPLES)) + code_dataset = code_dataset.map(batch_code_reward, batched=True, batch_size=BATCH_SIZE, num_proc=4) + def test_ioi_code_reward(self): # This slow test case requires spinning up a bunch (I tested with ~64) of piston workers, see docs here # slurm/piston/README.md @@ -45,6 +77,42 @@ def test_ioi_code_reward(self): print(rewards) assert rewards == [1.0] * NUM_SAMPLES + def test_e2b_router_run_code_success(): + # run router locally: python scripts/e2b_router.py + routed_sandbox = RoutedSandbox(router_url="localhost:8000") + scripts = ["print('hello from integration test')", "result = 2 + 2\nprint(result)"] + + results = routed_sandbox.run_code(scripts) + + assert len(results) == 2 + + for result in results: + assert isinstance(result, Execution) + assert result.exit_code == 0 + assert result.error is None + assert "hello" in result.stdout or "4" in result.stdout + + def test_e2b_router_run_code_with_error(sandbox): + # run router locally: python scripts/e2b_router.py + + routed_sandbox = RoutedSandbox(router_url="localhost:8000") + scripts = ["print('this is fine')", "print('unterminated string"] + + results = routed_sandbox.run_code(scripts) + + assert len(results) == 2 + + # First one should be okay + assert results[0].exit_code == 0 + assert results[0].error is None + assert "this is fine" in results[0].stdout + + # Second one should have a syntax error + assert results[1].exit_code != 0 + assert results[1].error is not None + assert isinstance(results[1].error, ExecutionError) + assert "SyntaxError" in results[1].error.type + if __name__ == "__main__": unittest.main() From bf08f5684990897ee2d283b9f376b2d6075b1218 Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 8 Apr 2025 20:53:34 +0200 Subject: [PATCH 097/137] [WIP] Bump lighteval with proper pass@1 (#584) * Bump lighteval with proper pass@1 * Bump lighteval * Update AIME24 --- README.md | 60 +++++------ setup.py | 4 +- slurm/evaluate.slurm | 17 +-- src/open_r1/evaluate.py | 185 -------------------------------- src/open_r1/utils/evaluation.py | 8 +- 5 files changed, 35 insertions(+), 239 deletions(-) delete mode 100644 src/open_r1/evaluate.py diff --git a/README.md b/README.md index c3e5c5dcc..0dc83fa4b 100644 --- a/README.md +++ b/README.md @@ -314,31 +314,28 @@ You can scale the number of nodes by increasing the `--nodes` flag. ## Evaluating models -We use `lighteval` to evaluate models, with custom tasks defined in `src/open_r1/evaluate.py`. For models which fit on a single GPU, run: +We use `lighteval` to evaluate models. 
For models which fit on a single GPU, run: ```shell MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL # AIME 2024 TASK=aime24 -lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ - --custom-tasks src/open_r1/evaluate.py \ +lighteval vllm $MODEL_ARGS "lighteval|$TASK|0|0" \ --use-chat-template \ --output-dir $OUTPUT_DIR # MATH-500 TASK=math_500 -lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ - --custom-tasks src/open_r1/evaluate.py \ +lighteval vllm $MODEL_ARGS "lighteval|$TASK|0|0" \ --use-chat-template \ --output-dir $OUTPUT_DIR # GPQA Diamond TASK=gpqa:diamond -lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ - --custom-tasks src/open_r1/evaluate.py \ +lighteval vllm $MODEL_ARGS "lighteval|$TASK|0|0" \ --use-chat-template \ --output-dir $OUTPUT_DIR @@ -349,21 +346,20 @@ lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \ ``` > [!IMPORTANT] -> You must set `max_model_length=32768` in the `vllm` command to align with the `max_new_tokens` we define per eval. Without this, `lighteval` will throw an error. +> You must set `max_model_length=32768` and `max_num_batched_tokens=32768` in the `vllm` command to align with the `max_new_tokens` we define per eval. Without this, `lighteval` will throw an error. To increase throughput across multiple GPUs, use _data parallel_ as follows: ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL -lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ - --custom-tasks src/open_r1/evaluate.py \ +lighteval vllm $MODEL_ARGS "lighteval|$TASK|0|0" \ --use-chat-template \ - --output-dir $OUTPUT_DIR + --output-dir $OUTPUT_DIR ``` For large models which require sharding across GPUs, use _tensor parallel_ and run: @@ -371,15 +367,14 @@ For large models which require sharding across GPUs, use _tensor parallel_ and r ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL export VLLM_WORKER_MULTIPROC_METHOD=spawn -lighteval vllm $MODEL_ARGS "custom|$TASK|0|0" \ - --custom-tasks src/open_r1/evaluate.py \ +lighteval vllm $MODEL_ARGS "lighteval|$TASK|0|0" \ --use-chat-template \ - --output-dir $OUTPUT_DIR + --output-dir $OUTPUT_DIR ``` You can also launch an evaluation with `make evaluate`, specifying the model, task, and optionally the 
parallelism technique and number of GPUs.
@@ -405,7 +400,7 @@ make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLE
 ## Reproducing Deepseek's evaluation results

 > [!NOTE]
-> The DeepSeek-R1 paper uses sampling with 64 responses per query to estimate `pass@1`. Below, we report the results from sampling 1 response per query, which likely explains the small 1-3σ discrepancies between our results and theirs.
+> The DeepSeek-R1 paper uses sampling with 4-64 responses per query to estimate `pass@1` accuracy, but does not specify the exact number of responses per benchmark. For AIME 2024, we report the results from sampling 32 responses per query, while for all others we report the accuracy from sampling 1 response. These choices likely explain the small 1-3σ discrepancies between our results and DeepSeek's.

 ### AIME 2024

We are able to reproduce Deepseek's reported results on the AIME 2024 benchmark

 | Model                         | AIME 2024 (🤗 LightEval) | AIME 2024 (DeepSeek Reported) |
 |:------------------------------|:-----------------------:|:----------------------------:|
-| DeepSeek-R1-Distill-Qwen-1.5B | 26.7 | 28.9 |
-| DeepSeek-R1-Distill-Qwen-7B | 56.6 | 55.5 |
-| DeepSeek-R1-Distill-Qwen-14B | 60.0 | 69.7 |
-| DeepSeek-R1-Distill-Qwen-32B | 73.2 | 72.6 |
-| DeepSeek-R1-Distill-Llama-8B | 43.3 | 50.4 |
-| DeepSeek-R1-Distill-Llama-70B | 73.3 | 70.0 |
+| DeepSeek-R1-Distill-Qwen-1.5B | 31.8 | 28.9 |
+| DeepSeek-R1-Distill-Qwen-7B | 52.2 | 55.5 |
+| DeepSeek-R1-Distill-Qwen-14B | 66.5 | 69.7 |
+| DeepSeek-R1-Distill-Qwen-32B | 68.0 | 72.6 |
+| DeepSeek-R1-Distill-Llama-8B | 43.9 | 41.7 |
+| DeepSeek-R1-Distill-Llama-70B | 65.3 | 70.0 |

 To reproduce these results use the following command:

 ```shell
 NUM_GPUS=1 # Set to 8 for 32B and 70B models
 MODEL=deepseek-ai/{model_name}
-MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}"
+MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}"
 OUTPUT_DIR=data/evals/$MODEL

-lighteval vllm $MODEL_ARGS "custom|aime24|0|0" \
-    --custom-tasks src/open_r1/evaluate.py \
+lighteval vllm $MODEL_ARGS "lighteval|aime24|0|0" \
     --use-chat-template \
     --output-dir $OUTPUT_DIR
 ```
@@ -458,11 +452,10 @@ To reproduce these results use the following command:
 ```shell
 NUM_GPUS=1 # Set to 8 for 32B and 70B models
 MODEL=deepseek-ai/{model_name}
-MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}"
+MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}"
 OUTPUT_DIR=data/evals/$MODEL

-lighteval vllm $MODEL_ARGS "custom|math_500|0|0" \
-    --custom-tasks src/open_r1/evaluate.py \
+lighteval vllm $MODEL_ARGS "lighteval|math_500|0|0" \
     --use-chat-template \
     --output-dir $OUTPUT_DIR
 ```
@@ -491,11 +484,10 @@ To reproduce these results use the following command:
 ```shell
 NUM_GPUS=1 # Set to 8 for 32B and 70B models
 MODEL=deepseek-ai/{model_name}
-MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL -lighteval vllm $MODEL_ARGS "custom|gpqa:diamond|0|0" \ - --custom-tasks src/open_r1/evaluate.py \ +lighteval vllm $MODEL_ARGS "lighteval|gpqa:diamond|0|0" \ --use-chat-template \ --output-dir $OUTPUT_DIR ``` @@ -522,7 +514,7 @@ To reproduce these results use the following command: ```shell NUM_GPUS=1 # Set to 8 for 32B and 70B models, or data_parallel_size=8 with the smaller models for speed MODEL=deepseek-ai/{model_name} -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \ diff --git a/setup.py b/setup.py index 920697277..c3bf532a6 100644 --- a/setup.py +++ b/setup.py @@ -50,12 +50,12 @@ "einops>=0.8.0", "flake8>=6.0.0", "hf_transfer>=0.1.4", - "huggingface-hub[cli]>=0.19.2,<1.0", + "huggingface-hub[cli,hf_xet]>=0.19.2,<1.0", "isort>=5.12.0", "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", "liger_kernel==0.5.3", - "lighteval @ git+https://github.com/huggingface/lighteval.git@ed084813e0bd12d82a06d9f913291fdbee774905", + "lighteval @ git+https://github.com/huggingface/lighteval.git@bb14995c4eccab5cabd450b1e509c3c898a16921", # pass@1 for AIME with n=32 samples per prompt "math-verify==0.5.2", # Used for math verification in grpo "packaging>=23.0", "parameterized>=0.9.0", diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 6c4631d0c..f3c59044d 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -30,9 +30,9 @@ if [ "$TENSOR_PARALLEL" = "True" ]; then # use TP to shard model across NUM_GPUS export VLLM_WORKER_MULTIPROC_METHOD=spawn # FIXME: lighteval now requires us to manually pass the generation params - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" + MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" else - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" + 
MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" fi LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard" @@ -46,22 +46,11 @@ HF_HUB_ENABLE_HF_TRANSFER=1 echo "Running lighteval script ..." echo "Eval results will be saved to $OUTPUT_DIR" -# Check if "custom" is a substring of TASKS -if [[ $TASKS == *"custom"* ]]; then - echo "Custom task detected. Running custom task evaluation script ..." - lighteval vllm "$MODEL_ARGS" $TASKS \ - --custom-tasks "src/open_r1/evaluate.py" \ +lighteval vllm "$MODEL_ARGS" $TASKS \ --use-chat-template \ --output-dir $OUTPUT_DIR \ --save-details \ ${7:+--system-prompt "$7"} -else - lighteval vllm "$MODEL_ARGS" $TASKS \ - --use-chat-template \ - --output-dir $OUTPUT_DIR \ - --save-details \ - ${7:+--system-prompt "$7"} -fi OUTPUT_FILEPATHS=$(find $OUTPUT_DIR/results/ -type f \( -name "*.json" \)) for filepath in $OUTPUT_FILEPATHS; do diff --git a/src/open_r1/evaluate.py b/src/open_r1/evaluate.py deleted file mode 100644 index 699ed66d2..000000000 --- a/src/open_r1/evaluate.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Custom evaluation tasks for LightEval.""" - -import random - -from lighteval.metrics.dynamic_metrics import ( - ExprExtractionConfig, - IndicesExtractionConfig, - LatexExtractionConfig, - multilingual_extractive_match_metric, -) -from lighteval.tasks.lighteval_task import LightevalTaskConfig -from lighteval.tasks.requests import Doc -from lighteval.utils.language import Language - - -# Prompt template adapted from -# - simple-evals: https://github.com/openai/simple-evals/blob/6e84f4e2aed6b60f6a0c7b8f06bbbf4bfde72e58/math_eval.py#L17 -# - Llama 3: https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-Instruct-evals/viewer/Llama-3.2-1B-Instruct-evals__math__details?views%5B%5D=llama_32_1b_instruct_evals__math__details -# Note that it is important to have the final answer in a box for math-verify to work correctly -MATH_QUERY_TEMPLATE = """ -Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering. - -{Question} -""".strip() - -# Prompt template from simple-evals: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14 -GPQA_QUERY_TEMPLATE = """ -Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. 
- -{Question} - -A) {A} -B) {B} -C) {C} -D) {D} -""".strip() - -latex_gold_metric = multilingual_extractive_match_metric( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - gold_extraction_target=(LatexExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, -) - -expr_gold_metric = multilingual_extractive_match_metric( - language=Language.ENGLISH, - fallback_mode="first_match", - precision=5, - gold_extraction_target=(ExprExtractionConfig(),), - # Match boxed first before trying other regexes - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)), - aggregation_function=max, -) - -gpqa_metric = multilingual_extractive_match_metric( - language=Language.ENGLISH, - gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")], - precision=5, -) - - -def math_prompt_fn(line, task_name: str = None): - return Doc( - task_name=task_name, - query=MATH_QUERY_TEMPLATE.format(Question=line["problem"]), - choices=[line["solution"]], - gold_index=0, - ) - - -def aime_prompt_fn(line, task_name: str = None): - return Doc( - task_name=task_name, - query=MATH_QUERY_TEMPLATE.format(Question=line["problem"]), - choices=[line["answer"]], - gold_index=0, - ) - - -def gpqa_prompt_fn(line, task_name: str = None): - gold_index = random.randint(0, 3) - choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]] - choices.insert(gold_index, line["Correct Answer"]) - query = GPQA_QUERY_TEMPLATE.format( - A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"] - ) - return Doc( - task_name=task_name, - query=query, - choices=["A", "B", "C", "D"], - gold_index=gold_index, - instruction=query, - ) - - -# Define tasks -aime24 = LightevalTaskConfig( - name="aime24", - suite=["custom"], - prompt_function=aime_prompt_fn, - hf_repo="HuggingFaceH4/aime_2024", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=32768, - metric=[expr_gold_metric], - version=1, -) -aime25 = LightevalTaskConfig( - name="aime25", - suite=["custom"], - prompt_function=aime_prompt_fn, - hf_repo="yentinglin/aime_2025", - hf_subset="default", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=32768, - metric=[expr_gold_metric], - version=1, -) -math_500 = LightevalTaskConfig( - name="math_500", - suite=["custom"], - prompt_function=math_prompt_fn, - hf_repo="HuggingFaceH4/MATH-500", - hf_subset="default", - hf_avail_splits=["test"], - evaluation_splits=["test"], - few_shots_split=None, - few_shots_select=None, - generation_size=32768, - metric=[latex_gold_metric], - version=1, -) -gpqa_diamond = LightevalTaskConfig( - name="gpqa:diamond", - suite=["custom"], - prompt_function=gpqa_prompt_fn, - hf_repo="Idavidrein/gpqa", - hf_subset="gpqa_diamond", - hf_avail_splits=["train"], - evaluation_splits=["train"], - few_shots_split=None, - few_shots_select=None, - generation_size=32768, # needed for reasoning models like R1 - metric=[gpqa_metric], - stop_sequence=[], # no stop sequence, will use eos token - trust_dataset=True, - version=1, -) - - -# Add tasks to the table -TASKS_TABLE = [] 
-TASKS_TABLE.append(aime24) -TASKS_TABLE.append(aime25) -TASKS_TABLE.append(math_500) -TASKS_TABLE.append(gpqa_diamond) - -# MODULE LOGIC -if __name__ == "__main__": - print([t["name"] for t in TASKS_TABLE]) - print(len(TASKS_TABLE)) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 66a048ecf..5bb652155 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -46,10 +46,10 @@ def register_lighteval_task( LIGHTEVAL_TASKS = {} -register_lighteval_task(LIGHTEVAL_TASKS, "custom", "math_500", "math_500", 0) -register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0) -register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime25", "aime25", 0) -register_lighteval_task(LIGHTEVAL_TASKS, "custom", "gpqa", "gpqa:diamond", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "lighteval", "math_500", "math_500", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "lighteval", "aime24", "aime24", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "lighteval", "aime25", "aime25", 0) +register_lighteval_task(LIGHTEVAL_TASKS, "lighteval", "gpqa", "gpqa:diamond", 0) register_lighteval_task(LIGHTEVAL_TASKS, "extended", "lcb", "lcb:codegeneration", 0) register_lighteval_task(LIGHTEVAL_TASKS, "extended", "lcb_v4", "lcb:codegeneration_v4", 0) From 2a7bb45f05a7a40cea32ca25fdb85ba4ab06c613 Mon Sep 17 00:00:00 2001 From: Shenghang Tsai Date: Thu, 10 Apr 2025 19:11:35 +0800 Subject: [PATCH 098/137] Update README.md (#590) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0dc83fa4b..7fbc29941 100644 --- a/README.md +++ b/README.md @@ -224,8 +224,8 @@ CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-1.5B-Instruct Then run training with: ```shell -CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ - accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes=7 +CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ + accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes=7 \ src/open_r1/grpo.py --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml ``` From 3a0e89678c1a84e1dbb90c8a9a62c675e0915ffc Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Fri, 11 Apr 2025 11:23:06 +0200 Subject: [PATCH 099/137] Fix eval system prompt (#591) * fix eval system prompt * style --- slurm/evaluate.slurm | 2 +- src/open_r1/utils/evaluation.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index f3c59044d..93c77de20 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -50,7 +50,7 @@ lighteval vllm "$MODEL_ARGS" $TASKS \ --use-chat-template \ --output-dir $OUTPUT_DIR \ --save-details \ - ${7:+--system-prompt "$7"} + ${7:+--system-prompt "$(echo "$7" | base64 --decode)"} OUTPUT_FILEPATHS=$(find $OUTPUT_DIR/results/ -type f \( -name "*.json" \)) for filepath in $OUTPUT_FILEPATHS; do diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 5bb652155..5719350fb 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -7,6 +7,7 @@ if TYPE_CHECKING: from trl import GRPOConfig, SFTConfig, ModelConfig +import base64 import os @@ -88,7 +89,10 @@ def run_lighteval_job( f"{model_args.trust_remote_code}", ] if training_args.system_prompt is not None: - cmd_args.append(f"--system_prompt={training_args.system_prompt}") + # encode to base64 to avoid issues with special characters + # we decode in the 
sbatch script + prompt_encoded = base64.b64encode(training_args.system_prompt.encode()).decode() + cmd_args.append(prompt_encoded) cmd[-1] += " " + " ".join(cmd_args) subprocess.run(cmd, check=True) From c1eadaa097afea5729a1b6109803c7d8a73b0b89 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Fri, 11 Apr 2025 14:04:59 +0200 Subject: [PATCH 100/137] E2B Router bug fixes (#592) * fix eval system prompt * style * fix a rare issue where the execution is None * fixes a bug in the e2b router --- scripts/e2b_router.py | 25 ++++++++++++------------- src/open_r1/rewards.py | 4 ++-- src/open_r1/utils/routed_sandbox.py | 17 +++++++++++------ tests/slow/test_code_reward.py | 4 +++- 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/scripts/e2b_router.py b/scripts/e2b_router.py index 75bddfbb8..237ad8f90 100644 --- a/scripts/e2b_router.py +++ b/scripts/e2b_router.py @@ -24,7 +24,6 @@ from fastapi import FastAPI import uvicorn from e2b_code_interpreter.models import Execution - from dotenv import load_dotenv from e2b_code_interpreter import AsyncSandbox @@ -107,10 +106,11 @@ async def execute_batch(batch: BatchRequest, request: Request): timeout = batch.timeout request_timeout = batch.request_timeout asyncio_timeout = batch.timeout + 1 - + async def run_script(script: str) -> ScriptResult: - try: - async with semaphore: + + async with semaphore: + try: sandbox = await AsyncSandbox.create( timeout=timeout, request_timeout=request_timeout, @@ -119,17 +119,16 @@ async def run_script(script: str) -> ScriptResult: sandbox.run_code(script, language=language), timeout=asyncio_timeout, ) - # note that execution.to_json() exists but does not serialize Result.is_main_result return ScriptResult(execution=execution, exception_str=None) - except Exception as e: - return ScriptResult(execution=None, exception_str=str(e)) - - finally: - try: - await sandbox.kill() + except Exception as e: - # do nothing - pass + return ScriptResult(execution=None, exception_str=str(e)) + + finally: + try: + await sandbox.kill() + except Exception: + pass tasks = [run_script(script) for script in batch.scripts] return await asyncio.gather(*tasks) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 49e25f42e..b62a81a81 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -454,9 +454,9 @@ def evaluate_code(code, test_cases): raise ValueError("All verification_info must have the same language", verification_info) if e2b_router_url is not None: - router_sandbox = RoutedSandbox(router_url=e2b_router_url) + routed_sandbox = RoutedSandbox(router_url=e2b_router_url) - executions = router_sandbox.run_code( + executions = routed_sandbox.run_code( scripts=scripts, language=language, timeout=30, diff --git a/src/open_r1/utils/routed_sandbox.py b/src/open_r1/utils/routed_sandbox.py index c03904691..950175950 100644 --- a/src/open_r1/utils/routed_sandbox.py +++ b/src/open_r1/utils/routed_sandbox.py @@ -80,12 +80,17 @@ def run_code( results = response.json() output = [] for result in results: - execution = Execution( - results=[Result(**r) for r in result["execution"]["results"]], - logs=result["execution"]["logs"], - error=ExecutionError(**result["execution"]["error"]) if result["execution"]["error"] else None, - execution_count=result["execution"]["execution_count"], - ) + if result["execution"] is None: + # If execution is None, create an empty Execution object + # This can happen when a script times out or fails to execute + execution = Execution() + else: + execution = Execution( + 
results=[Result(**r) for r in result["execution"]["results"]], + logs=result["execution"]["logs"], + error=ExecutionError(**result["execution"]["error"]) if result["execution"]["error"] else None, + execution_count=result["execution"]["execution_count"], + ) output.append(execution) return output diff --git a/tests/slow/test_code_reward.py b/tests/slow/test_code_reward.py index c225c8c13..06827828c 100644 --- a/tests/slow/test_code_reward.py +++ b/tests/slow/test_code_reward.py @@ -62,7 +62,9 @@ def batch_code_reward(examples): return examples code_dataset = code_dataset["train"].select(range(NUM_SAMPLES)) - code_dataset = code_dataset.map(batch_code_reward, batched=True, batch_size=BATCH_SIZE, num_proc=4) + code_dataset = code_dataset.map( + batch_code_reward, batched=True, batch_size=BATCH_SIZE, num_proc=4, load_from_cache_file=False + ) def test_ioi_code_reward(self): # This slow test case requires spinning up a bunch (I tested with ~64) of piston workers, see docs here From 04dbf21989c101c077deb437a851f6fa963a84d5 Mon Sep 17 00:00:00 2001 From: lewtun Date: Fri, 11 Apr 2025 16:32:33 +0200 Subject: [PATCH 101/137] Bump TRL and vLLM (#595) * Bump TRL and vLLM * Fix style * Bump liger * Add liger --- Makefile | 2 +- README.md | 11 ++++++----- .../grpo/config_demo.yaml | 1 + recipes/OlympicCoder-32B/sft/config_v00.00.yaml | 2 +- recipes/OlympicCoder-7B/sft/config_v00.00.yaml | 2 +- recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml | 2 +- setup.py | 12 ++++++------ 7 files changed, 17 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 848deb69f..74994db6e 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ check_dirs := src tests # dev dependencies install: uv venv openr1 --python 3.11 && . openr1/bin/activate && uv pip install --upgrade pip - uv pip install vllm==0.7.2 + uv pip install vllm==0.8.3 uv pip install setuptools uv pip install flash-attn --no-build-isolation GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" diff --git a/README.md b/README.md index 7fbc29941..56595cf1a 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --u Next, install vLLM and FlashAttention: ```shell -uv pip install vllm==0.7.2 +uv pip install vllm==0.8.3 uv pip install setuptools && uv pip install flash-attn --no-build-isolation ``` @@ -100,6 +100,9 @@ sudo apt-get install git-lfs ## Training models +> [!NOTE] +> The training commands below are configured for a node of 8 x H100s (80GB). For different hardware and topologies, you may need to tune the batch size and number of gradient accumulation steps. + We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). For example, to run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k), run: ```shell @@ -107,13 +110,14 @@ We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). 
For accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ --model_name_or_path Qwen/Qwen2.5-1.5B-Instruct \ --dataset_name open-r1/OpenR1-Math-220k \ - --learning_rate 1.0e-5 \ + --learning_rate 5.0e-5 \ --num_train_epochs 1 \ --packing \ --max_seq_length 16384 \ --per_device_train_batch_size 16 \ --gradient_checkpointing \ --bf16 \ + --use_liger_kernel \ --output_dir data/Qwen2.5-1.5B-Open-R1-Distill # Train via YAML config @@ -146,9 +150,6 @@ accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r --wandb_entity huggingface --wandb_project open-r1 --run_name Qwen2.5-1.5B-GRPO ``` -> [!NOTE] -> The training commands below are configured for a node of 8 x H100s (80GB). For different hardware and topologies, you may need to tune the batch size and number of gradient accumulation steps. - ### SFT To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k), run: diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml index ee416a848..639389cc2 100644 --- a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml +++ b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml @@ -54,4 +54,5 @@ save_strategy: "epoch" save_total_limit: 1 seed: 42 temperature: 0.7 +use_liger_kernel: true warmup_ratio: 0.1 diff --git a/recipes/OlympicCoder-32B/sft/config_v00.00.yaml b/recipes/OlympicCoder-32B/sft/config_v00.00.yaml index 1d6a2ae69..754b78f6d 100644 --- a/recipes/OlympicCoder-32B/sft/config_v00.00.yaml +++ b/recipes/OlympicCoder-32B/sft/config_v00.00.yaml @@ -45,5 +45,5 @@ save_only_model: true # needed to bypass FSDP errors with saving paged optimizer save_strategy: epoch save_total_limit: 1 seed: 42 -use_liger: false # fails on multi-node +use_liger_kernel: false # fails on multi-node warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml index 69e2676b4..dd0be5d96 100644 --- a/recipes/OlympicCoder-7B/sft/config_v00.00.yaml +++ b/recipes/OlympicCoder-7B/sft/config_v00.00.yaml @@ -42,5 +42,5 @@ report_to: save_strategy: epoch save_total_limit: 1 seed: 42 -use_liger: true +use_liger_kernel: true warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index 8f9af88a9..9274abb12 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -40,5 +40,5 @@ save_strategy: "steps" save_steps: 100 save_total_limit: 1 seed: 42 -use_liger: true +use_liger_kernel: true warmup_ratio: 0.05 \ No newline at end of file diff --git a/setup.py b/setup.py index c3bf532a6..74d7e0d77 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ "isort>=5.12.0", "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", - "liger_kernel==0.5.3", + "liger-kernel>=0.5.6", "lighteval @ git+https://github.com/huggingface/lighteval.git@bb14995c4eccab5cabd450b1e509c3c898a16921", # pass@1 for AIME with n=32 samples per prompt "math-verify==0.5.2", # Used for math verification in grpo "packaging>=23.0", @@ -65,10 +65,10 @@ "ruff>=0.9.0", "safetensors>=0.3.3", "sentencepiece>=0.1.99", - "torch==2.5.1", - "transformers==4.50.0", - "trl==0.16.0", - "vllm==0.7.2", + "torch==2.6.0", + 
"transformers==4.51.2", + "trl @ git+https://github.com/huggingface/trl.git@d625c5533a6b1c84d3565c8080857f6bb81c538a", # Bump for vLLM and 2x faster throughput: https://github.com/huggingface/trl/pull/3276 + "vllm==0.8.3", "wandb>=0.19.1", ] @@ -105,7 +105,7 @@ def deps_list(*pkgs): deps["langdetect"], deps["latex2sympy2_extended"], deps["math-verify"], - deps["liger_kernel"], + deps["liger-kernel"], deps["packaging"], # utilities from PyPA to e.g., compare versions deps["safetensors"], deps["sentencepiece"], From 068f13f236e4fe70e0881efa25c69e1c1df2607a Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Fri, 11 Apr 2025 17:45:38 +0200 Subject: [PATCH 102/137] Hotfix bin reward (#597) * add WIP code GRPO configs * hotfix bin reward * remove unwanted files * remote configs --- src/open_r1/rewards.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index b62a81a81..daa2f3252 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -387,7 +387,15 @@ def extract_code(completion: str, language: str = "python") -> str: def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs) BINARY_THRESHOLD = 0.99 - return [1.0 if reward > BINARY_THRESHOLD else 0.0 for reward in rewards] + + output = [] + for reward in rewards: + if reward is None: + output.append(None) + else: + output.append(1.0 if reward > BINARY_THRESHOLD else 0.0) + + return output def code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: From 8cf42663fdc09b82d2593ac86e4748fa7d1b38fb Mon Sep 17 00:00:00 2001 From: lewtun Date: Fri, 11 Apr 2025 20:09:15 +0200 Subject: [PATCH 103/137] Clean up recipes (#596) --- Makefile | 5 +- .../sft/config_openr1_math.yaml | 42 -------------- .../Qwen2.5-7B-Instruct/grpo/config_demo.yaml | 55 ------------------- recipes/SmolLM2-1.7B-Instruct/sft/config.yaml | 46 ---------------- recipes/SmolLM2-1.7B/sft/config.yaml | 46 ---------------- 5 files changed, 2 insertions(+), 192 deletions(-) delete mode 100644 recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml delete mode 100644 recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml delete mode 100644 recipes/SmolLM2-1.7B-Instruct/sft/config.yaml delete mode 100644 recipes/SmolLM2-1.7B/sft/config.yaml diff --git a/Makefile b/Makefile index 74994db6e..3b307267f 100644 --- a/Makefile +++ b/Makefile @@ -40,14 +40,13 @@ evaluate: fi \ ),)) $(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \ - MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \ + MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \ if [ "$(TASK)" = "lcb" ]; then \ lighteval vllm $$MODEL_ARGS "extended|lcb:codegeneration|0|0" \ --use-chat-template \ --output-dir data/evals/$(MODEL); \ else \ - lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \ - --custom-tasks src/open_r1/evaluate.py \ + lighteval vllm $$MODEL_ARGS "lighteval|$(TASK)|0|0" \ --use-chat-template \ --output-dir data/evals/$(MODEL); \ fi diff --git a/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml 
b/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml deleted file mode 100644 index 417b86063..000000000 --- a/recipes/Mistral-Small-24B-Instruct-2501/sft/config_openr1_math.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# To start the training, run the following command: -# sbatch -N 4 --job-name=mistral_sft slurm/train.slurm Mistral-Small-24B-Instruct-2501 sft numina zero3 - -model_name_or_path: mistralai/Mistral-Small-24B-Instruct-2501 -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -# dataset_name: yentinglin/s1K-1.1-trl-format -dataset_name: yentinglin/OpenR1-Math-220k-trl-format -preprocessing_num_workers: 8 - -# SFT trainer config -bf16: true -do_eval: true -eval_strategy: no -gradient_accumulation_steps: 4 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: Mistral-Small-24B-Instruct-2501-Open-R1-Distill -hub_strategy: every_save -learning_rate: 2.0e-05 -log_level: info -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: cosine -packing: true -max_length: 32768 -max_steps: -1 -num_train_epochs: 5 -output_dir: data/Mistral-Small-24B-Instruct-2501-Open-R1-Distill -overwrite_output_dir: true -per_device_eval_batch_size: 1 -per_device_train_batch_size: 1 -push_to_hub: true -report_to: -- wandb -save_strategy: epoch -seed: 42 -warmup_ratio: 0.1 diff --git a/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml b/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml deleted file mode 100644 index be6f06d79..000000000 --- a/recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-7B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/OpenR1-Math-cn_k12-86k -dataset_prompt_column: problem -system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: \n...\n\n\n...\n" - -# GRPO trainer config -beta: 0.001 -bf16: true -do_eval: false -eval_strategy: "no" -use_vllm: true -do_eval: false -gradient_accumulation_steps: 16 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: Qwen2.5-7B-Instruct-GRPO -hub_strategy: every_save -learning_rate: 1.0e-06 -log_completions: true -log_level: info -logging_first_step: true -logging_steps: 1 -logging_strategy: steps -lr_scheduler_type: constant_with_warmup -max_grad_norm: 0.2 -max_prompt_length: 1024 -max_completion_length: 4096 -max_steps: -1 -num_generations: 16 -num_train_epochs: 1 -output_dir: data/Qwen2.5-7B-Instruct-GRPO -overwrite_output_dir: true -per_device_train_batch_size: 4 -push_to_hub: true -report_to: -- wandb -reward_funcs: -- accuracy -- format -reward_weights: -- 1.0 -- 0.2 -save_strategy: "steps" -save_steps: 0.1 -save_total_limit: 1 -seed: 42 -temperature: 0.7 -warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml b/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml deleted file mode 100644 index 6f0bc9498..000000000 --- a/recipes/SmolLM2-1.7B-Instruct/sft/config.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Model arguments -# You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 -model_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: sdpa - -# Data training arguments -dataset_name: open-r1/OpenR1-Math-220k -dataset_num_proc: 48 - -#SFT hyperparam -max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file -weight_decay: 0.0001 -optim: adamw_torch -lr_scheduler_type: linear -warmup_ratio: 0.1 -learning_rate: 5.0e-05 -gradient_accumulation_steps: 2 -per_device_eval_batch_size: 4 -per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. - -# SFT trainer config -max_steps: -1 -num_train_epochs: 3 -bf16: true -do_eval: false -eval_strategy: 'no' -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: OpenR1-Qwen-7B-SFT -hub_strategy: every_save -log_level: info -logging_steps: 5 -logging_strategy: steps -packing: true -output_dir: data/OpenR1-Qwen-7B-SFT -overwrite_output_dir: true -push_to_hub: true -report_to: -- wandb -save_strategy: "steps" -save_steps: 500 -save_total_limit: 1 -seed: 42 diff --git a/recipes/SmolLM2-1.7B/sft/config.yaml b/recipes/SmolLM2-1.7B/sft/config.yaml deleted file mode 100644 index 4a1f2d68c..000000000 --- a/recipes/SmolLM2-1.7B/sft/config.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Model arguments -# You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768 -model_name_or_path: HuggingFaceTB/SmolLM2-1.7B -model_revision: main -torch_dtype: bfloat16 -attn_implementation: sdpa - -# Data training arguments -dataset_name: open-r1/OpenR1-Math-220k -dataset_num_proc: 48 - -#SFT hyperparam -max_length: 8192 # You can set this to 32768 if you change the rope, but you need to change the config.json file -weight_decay: 0.0001 -optim: adamw_torch -lr_scheduler_type: linear -warmup_ratio: 0.1 -learning_rate: 5.0e-05 -gradient_accumulation_steps: 2 -per_device_eval_batch_size: 4 -per_device_train_batch_size: 4 # Change this depending on the context length of the model to keep a 500M GBS. 
- -# SFT trainer config -max_steps: -1 -num_train_epochs: 3 -bf16: true -do_eval: false -eval_strategy: 'no' -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: OpenR1-Qwen-7B-SFT -hub_strategy: every_save -log_level: info -logging_steps: 5 -logging_strategy: steps -packing: true -output_dir: data/OpenR1-Qwen-7B-SFT -overwrite_output_dir: true -push_to_hub: true -report_to: -- wandb -save_strategy: "steps" -save_steps: 500 -save_total_limit: 1 -seed: 42 From 8eb1b7860a727968cdb865b04749eb1654486bbe Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 16 Apr 2025 10:24:33 +0200 Subject: [PATCH 104/137] Set DP=1 due to vLLM <> LightEval hanging (#600) * Update evaluate.slurm * Disable DP * Fix --- src/open_r1/utils/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 5719350fb..04c80e052 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -73,7 +73,7 @@ def run_lighteval_job( if get_param_count_from_repo_id(model_name) >= 30_000_000_000: tensor_parallel = True else: - num_gpus = 8 + num_gpus = 1 # FIXME: vLLM 0.8.3 hangs with lighteval and DP > 1, so we disable it for now. See https://github.com/huggingface/lighteval/issues/670 tensor_parallel = False cmd = VLLM_SLURM_PREFIX.copy() From bcbb1da40187718c109d28815e5c50f7d9740544 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 16 Apr 2025 10:37:38 +0200 Subject: [PATCH 105/137] Update evaluation.py (#608) --- src/open_r1/utils/evaluation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 04c80e052..bde88c0c9 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -73,9 +73,12 @@ def run_lighteval_job( if get_param_count_from_repo_id(model_name) >= 30_000_000_000: tensor_parallel = True else: - num_gpus = 1 # FIXME: vLLM 0.8.3 hangs with lighteval and DP > 1, so we disable it for now. See https://github.com/huggingface/lighteval/issues/670 + num_gpus = 8 tensor_parallel = False + # FIXME: vLLM 0.8.3 hangs with lighteval and DP > 1, so we disable it for now and use TP for all evals. 
See https://github.com/huggingface/lighteval/issues/670 + tensor_parallel = True + cmd = VLLM_SLURM_PREFIX.copy() cmd_args = [ f"--gres=gpu:{num_gpus}", From 5112bfc4012d8bb4f919c79c900064a9c0d61430 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 16 Apr 2025 11:45:50 +0200 Subject: [PATCH 106/137] Fix SFT for base models (#604) * Fix pad token bug in SFT * Add ChatML default * Clean up * Refactor grpo model load * Add doc * Bump deepspeed --- README.md | 40 +++++++++++++++++- recipes/OpenR1-Qwen-7B/sft/config.yaml | 2 +- .../sft/config_demo.yaml | 2 +- setup.py | 4 +- src/open_r1/grpo.py | 24 ++++------- src/open_r1/sft.py | 41 +++++-------------- src/open_r1/utils/__init__.py | 4 +- src/open_r1/utils/model_utils.py | 36 +++++++++++----- 8 files changed, 90 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index 56595cf1a..e36c4815e 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,6 @@ accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r --dataset_name open-r1/OpenR1-Math-220k \ --learning_rate 5.0e-5 \ --num_train_epochs 1 \ - --packing \ --max_seq_length 16384 \ --per_device_train_batch_size 16 \ --gradient_checkpointing \ @@ -150,6 +149,45 @@ accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r --wandb_entity huggingface --wandb_project open-r1 --run_name Qwen2.5-1.5B-GRPO ``` +**🚨 WARNING 🚨** + +Most base models like `meta-llama/Llama-3.2-1B` do not have a chat template, so we set ChatML as the default during training. However, for Qwen base models like `Qwen/Qwen2.5-1.5B`, a chat template is pre-defined in the tokenizer, so the EOS token must be set accordingly, e.g. + +```diff +# Align EOS token with chat template for Qwen base models +accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ + --model_name_or_path Qwen/Qwen2.5-1.5B \ ++ --eos_token '<|im_end|>' + --dataset_name open-r1/OpenR1-Math-220k \ + --learning_rate 5.0e-5 \ + --num_train_epochs 1 \ + --max_seq_length 16384 \ + --per_device_train_batch_size 16 \ + --gradient_checkpointing \ + --bf16 \ + --use_liger_kernel \ + --output_dir data/Qwen2.5-1.5B-Open-R1-Distill +``` + +If you wish to use a custom chat template (e.g. 
Llama or Gemma), then the chat template and associated EOS token must be provided: + +```diff +# Align EOS token with custom chat template +accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ + --model_name_or_path meta-llama/Llama-3.2-1B \ ++ --chat_template "$(cat llama_chat_template.jinja)" \ ++ --eos_token '<|eot_id|>' \ + --dataset_name open-r1/OpenR1-Math-220k \ + --learning_rate 5.0e-5 \ + --num_train_epochs 1 \ + --max_seq_length 16384 \ + --per_device_train_batch_size 16 \ + --gradient_checkpointing \ + --bf16 \ + --use_liger_kernel \ + --output_dir data/Llama-3.2-1B-Open-R1-Distill +``` + ### SFT To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k), run: diff --git a/recipes/OpenR1-Qwen-7B/sft/config.yaml b/recipes/OpenR1-Qwen-7B/sft/config.yaml index 812469f00..9cc06c9f7 100644 --- a/recipes/OpenR1-Qwen-7B/sft/config.yaml +++ b/recipes/OpenR1-Qwen-7B/sft/config.yaml @@ -36,7 +36,7 @@ hub_strategy: every_save log_level: info logging_steps: 5 logging_strategy: steps -packing: true +packing: false output_dir: data/OpenR1-Qwen-7B-SFT overwrite_output_dir: true push_to_hub: true diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index 9274abb12..fabe9ed7f 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -25,7 +25,7 @@ logging_strategy: steps lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 -packing: true +packing: false max_length: 16384 max_steps: -1 num_train_epochs: 1 diff --git a/setup.py b/setup.py index 74d7e0d77..ef3161218 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "accelerate==1.4.0", "bitsandbytes>=0.43.0", "datasets>=3.2.0", - "deepspeed==0.15.4", + "deepspeed==0.16.4", "distilabel[vllm,ray,openai]>=1.5.2", "e2b-code-interpreter>=1.0.5", "einops>=0.8.0", @@ -67,7 +67,7 @@ "sentencepiece>=0.1.99", "torch==2.6.0", "transformers==4.51.2", - "trl @ git+https://github.com/huggingface/trl.git@d625c5533a6b1c84d3565c8080857f6bb81c538a", # Bump for vLLM and 2x faster throughput: https://github.com/huggingface/trl/pull/3276 + "trl @ git+https://github.com/huggingface/trl.git@c04e84c4545acfaecdf7e0631ad07a86ab0fb2f6", # Fix EOS token for SFT on base models: https://github.com/huggingface/trl/pull/3299 "vllm==0.8.3", "wandb>=0.19.1", ] diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index af394c26c..80f5e9e71 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -17,7 +17,6 @@ import sys import datasets -import torch import transformers from datasets import load_dataset from transformers import set_seed @@ -25,7 +24,7 @@ from open_r1.configs import GRPOConfig, GRPOScriptArguments from open_r1.rewards import get_reward_funcs -from open_r1.utils import get_tokenizer +from open_r1.utils import get_model, get_tokenizer from open_r1.utils.callbacks import get_callbacks from open_r1.utils.wandb_logging import init_wandb_training from trl import GRPOTrainer, ModelConfig, TrlParser, get_peft_config @@ -80,6 +79,12 @@ def main(script_args, training_args, model_args): ################ tokenizer = get_tokenizer(model_args, training_args) + ############## + # Load model # + ############## + logger.info("*** Loading model ***") + model = get_model(model_args, training_args) + # Get reward functions from the registry reward_funcs = 
get_reward_funcs(script_args) @@ -102,24 +107,11 @@ def make_conversation(example, prompt_column: str = script_args.dataset_prompt_c if "messages" in dataset[split].column_names: dataset[split] = dataset[split].remove_columns("messages") - logger.info("*** Initializing model kwargs ***") - torch_dtype = ( - model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) - ) - model_kwargs = dict( - revision=model_args.model_revision, - trust_remote_code=model_args.trust_remote_code, - attn_implementation=model_args.attn_implementation, - torch_dtype=torch_dtype, - use_cache=False if training_args.gradient_checkpointing else True, - ) - training_args.model_init_kwargs = model_kwargs - ############################# # Initialize the GRPO trainer ############################# trainer = GRPOTrainer( - model=model_args.model_name_or_path, + model=model, reward_funcs=reward_funcs, args=training_args, train_dataset=dataset[script_args.dataset_train_split], diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 82964574a..9566d0530 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -20,7 +20,7 @@ # One 1 node of 8 x H100s accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ --model_name_or_path Qwen/Qwen2.5-1.5B-Instruct \ - --dataset_name HuggingFaceH4/Bespoke-Stratos-17k \ + --dataset_name open-r1/OpenR1-Math-220k \ --learning_rate 2.0e-5 \ --num_train_epochs 1 \ --packing \ @@ -40,25 +40,16 @@ import sys import datasets -import torch import transformers from datasets import load_dataset from transformers import set_seed from transformers.trainer_utils import get_last_checkpoint from open_r1.configs import SFTConfig -from open_r1.utils import get_tokenizer +from open_r1.utils import get_model, get_tokenizer from open_r1.utils.callbacks import get_callbacks from open_r1.utils.wandb_logging import init_wandb_training -from trl import ( - ModelConfig, - ScriptArguments, - SFTTrainer, - TrlParser, - get_kbit_device_map, - get_peft_config, - get_quantization_config, -) +from trl import ModelConfig, ScriptArguments, SFTTrainer, TrlParser, get_peft_config, setup_chat_format logger = logging.getLogger(__name__) @@ -106,32 +97,22 @@ def main(script_args, training_args, model_args): # Load tokenizer ################ tokenizer = get_tokenizer(model_args, training_args) - tokenizer.pad_token = tokenizer.eos_token ################### - # Model init kwargs + # Load model ################### - logger.info("*** Initializing model kwargs ***") - torch_dtype = ( - model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) - ) - quantization_config = get_quantization_config(model_args) - model_kwargs = dict( - revision=model_args.model_revision, - trust_remote_code=model_args.trust_remote_code, - attn_implementation=model_args.attn_implementation, - torch_dtype=torch_dtype, - use_cache=False if training_args.gradient_checkpointing else True, - device_map=get_kbit_device_map() if quantization_config is not None else None, - quantization_config=quantization_config, - ) - training_args.model_init_kwargs = model_kwargs + logger.info("*** Loading model ***") + model = get_model(model_args, training_args) + + if tokenizer.chat_template is None: + logger.info("No chat template provided, using ChatML.") + model, tokenizer = setup_chat_format(model, tokenizer, format="chatml") ############################ # Initialize the SFT Trainer ############################ trainer = 
SFTTrainer( - model=model_args.model_name_or_path, + model=model, args=training_args, train_dataset=dataset[script_args.dataset_train_split], eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None, diff --git a/src/open_r1/utils/__init__.py b/src/open_r1/utils/__init__.py index 5302463e1..5ada1c6e2 100644 --- a/src/open_r1/utils/__init__.py +++ b/src/open_r1/utils/__init__.py @@ -1,5 +1,5 @@ from .import_utils import is_e2b_available -from .model_utils import get_tokenizer +from .model_utils import get_model, get_tokenizer -__all__ = ["get_tokenizer", "is_e2b_available"] +__all__ = ["get_tokenizer", "is_e2b_available", "get_model"] diff --git a/src/open_r1/utils/model_utils.py b/src/open_r1/utils/model_utils.py index 1312ed66d..8191c17ea 100644 --- a/src/open_r1/utils/model_utils.py +++ b/src/open_r1/utils/model_utils.py @@ -1,16 +1,12 @@ -from transformers import AutoTokenizer, PreTrainedTokenizer +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer -from trl import ModelConfig +from trl import ModelConfig, get_kbit_device_map, get_quantization_config from ..configs import GRPOConfig, SFTConfig -DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" - - -def get_tokenizer( - model_args: ModelConfig, training_args: SFTConfig | GRPOConfig, auto_set_chat_template: bool = True -) -> PreTrainedTokenizer: +def get_tokenizer(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> PreTrainedTokenizer: """Get the tokenizer for the model.""" tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, @@ -20,7 +16,27 @@ def get_tokenizer( if training_args.chat_template is not None: tokenizer.chat_template = training_args.chat_template - elif auto_set_chat_template and tokenizer.get_chat_template() is None: - tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE return tokenizer + + +def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> AutoModelForCausalLM: + """Get the model""" + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) + quantization_config = get_quantization_config(model_args) + model_kwargs = dict( + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + **model_kwargs, + ) + return model From 14a81d2bd4605a7af24d2e8848579fb0e72bab09 Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 17 Apr 2025 11:11:49 +0200 Subject: [PATCH 107/137] Update evaluation.py (#611) --- src/open_r1/utils/evaluation.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index bde88c0c9..0f1af3406 100644 --- 
a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -72,12 +72,10 @@ def run_lighteval_job( num_gpus = get_gpu_count_for_vllm(model_name, model_revision) if get_param_count_from_repo_id(model_name) >= 30_000_000_000: tensor_parallel = True - else: - num_gpus = 8 - tensor_parallel = False - # FIXME: vLLM 0.8.3 hangs with lighteval and DP > 1, so we disable it for now and use TP for all evals. See https://github.com/huggingface/lighteval/issues/670 - tensor_parallel = True + # else: + # num_gpus = 8 + # tensor_parallel = False cmd = VLLM_SLURM_PREFIX.copy() cmd_args = [ From 4c9b0f25d9798984704d11c1c79823c7fb5f5f82 Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 17 Apr 2025 15:25:59 +0200 Subject: [PATCH 108/137] Fix TP once and for all :) (#613) * Update evaluation.py * Fix import --- src/open_r1/utils/evaluation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 0f1af3406..af149e348 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -1,7 +1,7 @@ import subprocess from typing import TYPE_CHECKING, Dict, Union -from .hub import get_gpu_count_for_vllm, get_param_count_from_repo_id +from .hub import get_gpu_count_for_vllm if TYPE_CHECKING: @@ -70,12 +70,13 @@ def run_lighteval_job( model_revision = training_args.hub_model_revision # For large models >= 30b params or those running the MATH benchmark, we need to shard them across the GPUs to avoid OOM num_gpus = get_gpu_count_for_vllm(model_name, model_revision) - if get_param_count_from_repo_id(model_name) >= 30_000_000_000: - tensor_parallel = True # FIXME: vLLM 0.8.3 hangs with lighteval and DP > 1, so we disable it for now and use TP for all evals. 
See https://github.com/huggingface/lighteval/issues/670 + # if get_param_count_from_repo_id(model_name) >= 30_000_000_000: + # tensor_parallel = True # else: # num_gpus = 8 # tensor_parallel = False + tensor_parallel = True cmd = VLLM_SLURM_PREFIX.copy() cmd_args = [ From 715c8787fb929e20e5733b025f20c3e4252d11e7 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Thu, 17 Apr 2025 16:41:39 +0200 Subject: [PATCH 109/137] add back grad accumulations steps (#612) --- slurm/train.slurm | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/slurm/train.slurm b/slurm/train.slurm index dc51f9045..a5c00d707 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -23,6 +23,20 @@ ACCELERATOR=$4 OPTIONAL_ARGS=$5 CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') + +# Split the string into individual arguments +IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS" +# Loop through the arguments and find the one with "--gradient_accumulation_steps" +for arg in "${ARGS[@]}"; do + if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then + # Extract the value after the equals sign + GRAD_ACC_STEPS="${arg#*=}" + break # Exit the loop once we find the desired argument + fi +done + +echo "Gradient accumulation steps: $GRAD_ACC_STEPS" + MODEL=$(grep 'model_name_or_path:' $CONFIG_FILE | awk '{print $2}') REVISION=$(grep 'model_revision:' $CONFIG_FILE | head -n 1 | awk '{print $2}') From 50590a41b9c3c97dcca9018a1778f8dd1645f525 Mon Sep 17 00:00:00 2001 From: lewtun Date: Sat, 26 Apr 2025 11:50:08 +0200 Subject: [PATCH 110/137] Enable data and tensor parallelism for GRPO (#626) * Bump deps * Fix SLurm * Fix --- Makefile | 2 +- README.md | 105 ++++++++++++++++++++------------ recipes/README.md | 4 +- setup.py | 9 ++- slurm/evaluate.slurm | 2 - slurm/train.slurm | 99 +++++++++++++++++++++++++----- src/open_r1/utils/evaluation.py | 2 +- 7 files changed, 158 insertions(+), 65 deletions(-) diff --git a/Makefile b/Makefile index 3b307267f..ee5a514dd 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ check_dirs := src tests # dev dependencies install: uv venv openr1 --python 3.11 && . openr1/bin/activate && uv pip install --upgrade pip - uv pip install vllm==0.8.3 + uv pip install vllm==0.8.4 uv pip install setuptools uv pip install flash-attn --no-build-isolation GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" diff --git a/README.md b/README.md index e36c4815e..d0f8dcda4 100644 --- a/README.md +++ b/README.md @@ -69,11 +69,11 @@ uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --u Next, install vLLM and FlashAttention: ```shell -uv pip install vllm==0.8.3 +uv pip install vllm==0.8.4 uv pip install setuptools && uv pip install flash-attn --no-build-isolation ``` -This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend: +This will also install PyTorch `v2.6.0` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. 
For most contributors, we recommend: ```shell GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" @@ -217,13 +217,33 @@ CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ > [!WARNING] > The chat template used in the distilled DeepSeek models omits the contents of the reasoning block within the `` and `` tags. It also prefills the assistant response with `` which interferes with the format reward function. To handle that, it is important to override the chat template as done in e.g. [recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml](./recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml). -For multi-node training, we provide an example Slurm script: +To increase the throughput with data parallel on e.g. 2 GPUs, run: ```shell -sbatch --nodes=2 slurm/train.slurm Qwen2.5-Math-7B grpo config_simple_rl zero3 +CUDA_VISIBLE_DEVICES=0,1 trl vllm-serve --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --data_parallel_size 2 ``` -You will need to adapt the `slurm/train.slurm` script to match your cluster. +Then run training on the remaining GPUs as follows: + +```shell +CUDA_VISIBLE_DEVICES=2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ + accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes 6 \ + src/open_r1/grpo.py --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml +``` + +For larger models, use tensor parallelism: + +```shell +CUDA_VISIBLE_DEVICES=0,1 trl vllm-serve --model deepseek-ai/DeepSeek-R1-Distill-Qwen-14B --tensor_parallel_size 2 +``` + +For multi-node training on N+1 nodes, with 1 node running the vLLM server and N nodes running training, we provide an example Slurm script. For example, to run the above example on 1+1 nodes with data parallelism, run: + +```shell +sbatch --nodes=2 slurm/train.slurm --model Qwen2.5-1.5B-Instruct --task grpo --config demo --accelerator zero2 --dp 8 --tp 1 +``` + +See the [Launching jobs on a Slurm cluster](#launching-jobs-on-a-slurm-cluster) section for more details. #### 👨‍💻 Training with a code interpreter @@ -299,55 +319,28 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml ``` - -#### Data decontamination - -Following [s1: Simple test-time scaling](https://arxiv.org/abs/2501.19393) the data can be decontaminated using the script at: [scripts/decontaminate.py](./scripts/decontaminate.py), which decontaminates a dataset using 8-grams and deduplicate the data. Sample run: - -```shell -python scripts/decontaminate.py \ - --dataset "open-r1/verifiable-coding-problems-python" \ - --problem_column problem \ - --cleanup -``` - -It will decontaminate against the benchmark datasets, and remove the contaminated samples afterwards. If no argument `--new_dataset_name` is provided, the same dataset will be reused, adding a `_decontaminated`. It runs against the prompt, which for this dataset is the column `problem`, but a different one can be provided. - -Arguments for the script: - -```shell -usage: decontaminate.py [-h] --dataset DATASET [--split SPLIT] [--ngram_size NGRAM_SIZE] [--problem_column PROBLEM_COLUMN] [--cleanup] [--new_dataset_name NEW_DATASET_NAME] - -options: - -h, --help show this help message and exit - --dataset DATASET Name of the dataset to check for contamination. - --split SPLIT Split to check for contamination, defaults to `train`. - --ngram_size NGRAM_SIZE - Size of n-grams to build, defaults to 8. 
- --problem_column PROBLEM_COLUMN - Name of the column containing the problem (prompt). - --cleanup Whether to remove the contaminated rows before pushing the dataset. - --new_dataset_name NEW_DATASET_NAME - New name for the dataset. If not provided, will reuse the name and add a `_decontaminated` to the name. -``` - ### Launching jobs on a Slurm cluster If you have access to a Slurm cluster, we provide a `slurm/train.slurm` script that will automatically queue training jobs for you. Here's how you can use it: ```shell -sbatch --job-name=open_r1 --nodes=1 slurm/train.slurm {model_name} {task} {config_suffix} {accelerator} +sbatch --job-name=open_r1 --nodes=1 slurm/train.slurm --model {model_name} --task {task} --config {config_suffix} --accelerator {accelerator} ``` Here `{model_name}` and `{task}` are defined as above, while `{config_suffix}` refers to the specific config and `{accelerator}` refers to the choice of 🤗 Accelerate config in `recipes/accelerate_configs`. If you wish to override the default config parameters, you can provide them by appending a space-separated string like `'--arg1=value1 --arg2=value2'`. Here's a concrete example to run SFT on 1 node of 8 GPUs: ```shell -# Launch on Slurm and override default hyperparameters -sbatch --job-name=open_r1 --nodes=1 slurm/train.slurm Qwen2.5-1.5B-Instruct sft demo zero3 '--per_device_train_batch_size=1 --num_train_epochs=5' +sbatch --job-name=open_r1 --nodes=1 slurm/train.slurm --model Qwen2.5-1.5B-Instruct --task sft --config demo --accelerator zero3 ``` You can scale the number of nodes by increasing the `--nodes` flag. +For GRPO, we use 1 node for the vLLM server and N nodes for training. For example, to run GRPO on 1+1 nodes with mixed data and tensor parallelism, run: + +```shell +sbatch --job-name=open_r1 --nodes=2 slurm/train.slurm --model Qwen2.5-1.5B-Instruct --task grpo --config demo --accelerator zero2 --dp 4 --tp 2 +``` + > [!NOTE] > The configuration in `slurm/train.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes. @@ -652,6 +645,38 @@ sbatch slurm/generate.slurm \ > [!NOTE] > While the job is running, you can setup an SSH tunnel through the cluster login node to access the Ray dashboard from your computer running `ssh -L 8265:ray_ip_head_node:8265 `, then browsing `http://localhost:8265` + +### Data decontamination + +Following [s1: Simple test-time scaling](https://arxiv.org/abs/2501.19393) the data can be decontaminated using the script at: [scripts/decontaminate.py](./scripts/decontaminate.py), which decontaminates a dataset using 8-grams and deduplicate the data. Sample run: + +```shell +python scripts/decontaminate.py \ + --dataset "open-r1/verifiable-coding-problems-python" \ + --problem_column problem \ + --cleanup +``` + +It will decontaminate against the benchmark datasets, and remove the contaminated samples afterwards. If no argument `--new_dataset_name` is provided, the same dataset will be reused, adding a `_decontaminated`. It runs against the prompt, which for this dataset is the column `problem`, but a different one can be provided. + +Arguments for the script: + +```shell +usage: decontaminate.py [-h] --dataset DATASET [--split SPLIT] [--ngram_size NGRAM_SIZE] [--problem_column PROBLEM_COLUMN] [--cleanup] [--new_dataset_name NEW_DATASET_NAME] + +options: + -h, --help show this help message and exit + --dataset DATASET Name of the dataset to check for contamination. 
+ --split SPLIT Split to check for contamination, defaults to `train`. + --ngram_size NGRAM_SIZE + Size of n-grams to build, defaults to 8. + --problem_column PROBLEM_COLUMN + Name of the column containing the problem (prompt). + --cleanup Whether to remove the contaminated rows before pushing the dataset. + --new_dataset_name NEW_DATASET_NAME + New name for the dataset. If not provided, will reuse the name and add a `_decontaminated` to the name. +``` + ## Contributing Contributions are welcome. Please refer to https://github.com/huggingface/open-r1/issues/23. diff --git a/recipes/README.md b/recipes/README.md index a9e97ca17..445eb78d0 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -6,10 +6,10 @@ To train the OlympicCoder models, run: ``` # 7B -sbatch --nodes=1 slurm/train.slurm OlympicCoder-7B sft v00.00 zero3 +sbatch --nodes=1 slurm/train.slurm --model OlympicCoder-7B --task sft --config v00.00 --accelerator zero3 # 32B -sbatch --nodes=16 slurm/train.slurm OlympicCoder-32B sft v00.00 fsdp +sbatch --nodes=16 slurm/train.slurm --model OlympicCoder-32B --task sft --config v00.00 --accelerator fsdp ``` Note that we found it necessary to switch to FSDP1 and paged AdamW 8-bit for the 32B model in order to fit the largest possible context size. \ No newline at end of file diff --git a/setup.py b/setup.py index ef3161218..d5800f421 100644 --- a/setup.py +++ b/setup.py @@ -44,13 +44,13 @@ "accelerate==1.4.0", "bitsandbytes>=0.43.0", "datasets>=3.2.0", - "deepspeed==0.16.4", + "deepspeed==0.16.7", "distilabel[vllm,ray,openai]>=1.5.2", "e2b-code-interpreter>=1.0.5", "einops>=0.8.0", "flake8>=6.0.0", "hf_transfer>=0.1.4", - "huggingface-hub[cli,hf_xet]>=0.19.2,<1.0", + "huggingface-hub[cli,hf_xet]>=0.30.2,<1.0", "isort>=5.12.0", "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", @@ -66,9 +66,8 @@ "safetensors>=0.3.3", "sentencepiece>=0.1.99", "torch==2.6.0", - "transformers==4.51.2", - "trl @ git+https://github.com/huggingface/trl.git@c04e84c4545acfaecdf7e0631ad07a86ab0fb2f6", # Fix EOS token for SFT on base models: https://github.com/huggingface/trl/pull/3299 - "vllm==0.8.3", + "transformers @ git+https://github.com/huggingface/transformers.git@acdbe627e323dbc822f21499fead789b439cf45b", # Fix DeepSpeed x vLLM conflict: https://github.com/huggingface/transformers/pull/37755 + "trl[vllm] @ git+https://github.com/huggingface/trl.git@1bca49515ecd5b85d16e68c42c76670e252e19f1", # Fix DeepSpeed x vLLM conflict: https://github.com/huggingface/trl/pull/3351 "wandb>=0.19.1", ] diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 93c77de20..abe1b11df 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -41,8 +41,6 @@ DETAILS_REPO_ID="open-r1/details-$MODEL_NAME" OUTPUT_DIR="eval_results/$MODEL_ID/$MODEL_REVISION/$TASK_NAME" # We need this flag since we run this script from training jobs that use DeepSpeed and the env vars get progated which causes errors during evaluation ACCELERATE_USE_DEEPSPEED=false -# Enable fast downloads -HF_HUB_ENABLE_HF_TRANSFER=1 echo "Running lighteval script ..." 
echo "Eval results will be saved to $OUTPUT_DIR" diff --git a/slurm/train.slurm b/slurm/train.slurm index a5c00d707..61aec484c 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -1,26 +1,91 @@ #!/bin/bash -#SBATCH --job-name=open-r1-sft +#SBATCH --job-name=open_r1 #SBATCH --ntasks-per-node=1 #SBATCH --exclusive #SBATCH --gres=gpu:8 #SBATCH --partition=hopper-prod # Adjust this for your cluster #SBATCH --output=./logs/%x-%j.out -#SBATCH --err=./logs/%x-%j.err +#SBATCH --error=./logs/%x-%j.err #SBATCH --requeue +if [[ "$*" == *"--help"* ]]; then + echo "Usage: sbatch slurm/train.slurm [options]" + echo "Options:" + echo " --model MODEL Model name" + echo " --task TASK Task name (e.g. sft, grpo)" + echo " --config SUFFIX Configuration suffix (e.g. demo, v00.00)" + echo " --accelerator CONFIG Accelerator configuration name (e.g. zero3)" + echo " --dp N Data parallelism for vLLM server (default: 1)" + echo " --tp N Tensor parallelism for vLLM server (default: 1)" + echo " --args \"ARGS\" Optional arguments to pass to the training script" + exit 0 +fi + # Specific configuration optimized for the Hugging Face Compute Cluster module load cuda/12.4 set -x -e source ~/.bashrc source openr1/bin/activate +START_TIME=$(date +%s) echo "START TIME: $(date)" -MODEL=$1 -TASK=$2 -CONFIG_SUFFIX=$3 -ACCELERATOR=$4 -OPTIONAL_ARGS=$5 +# Default values +MODEL="" +TASK="" +CONFIG_SUFFIX="" +ACCELERATOR="" +DP=1 +TP=1 +OPTIONAL_ARGS="" + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --model) + MODEL="$2" + shift 2 + ;; + --task) + TASK="$2" + shift 2 + ;; + --config) + CONFIG_SUFFIX="$2" + shift 2 + ;; + --accelerator) + ACCELERATOR="$2" + shift 2 + ;; + --dp) + DP="$2" + shift 2 + ;; + --tp) + TP="$2" + shift 2 + ;; + --args) + OPTIONAL_ARGS="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + echo "Use --help for usage information" + exit 1 + ;; + esac +done + +# Validate required arguments +if [[ -z "$MODEL" || -z "$TASK" || -z "$CONFIG_SUFFIX" || -z "$ACCELERATOR" ]]; then + echo "Error: Missing required arguments" + echo "Run with --help for usage information" + exit 1 +fi + + CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') @@ -57,10 +122,9 @@ fi if [[ "$USE_VLLM" == "true" ]]; then TRAIN_NODES=("${NODELIST[@]:0:$((NUM_NODES - 1))}") VLLM_NODE=${NODELIST[-1]} # Last node - TP=$(python scripts/get_tensor_parallel_size.py --model_name $MODEL --revision $REVISION --default_tp $GPUS_PER_NODE) WORLD_SIZE=$((WORLD_SIZE - GPUS_PER_NODE)) NUM_NODES=$((NUM_NODES - 1)) - srun --nodes=1 --ntasks=1 --nodelist=$VLLM_NODE trl vllm-serve --model $MODEL --revision $REVISION --tensor_parallel_size $TP & + srun --nodes=1 --ntasks=1 --nodelist=$VLLM_NODE trl vllm-serve --model $MODEL --revision $REVISION --tensor_parallel_size $TP --data_parallel_size $DP & OPTIONAL_ARGS="$OPTIONAL_ARGS --vllm_server_host=$VLLM_NODE" fi @@ -77,7 +141,7 @@ export CMD=" \ src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS " -export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ +export LAUNCHER="ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \ --gradient_accumulation_steps $GRAD_ACC_STEPS \ --num_machines $NUM_NODES \ @@ -87,19 +151,26 @@ export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORM --machine_rank $SLURM_PROCID \ 
--rdzv_backend=c10d \ --max_restarts 1 \ - --role \$(hostname -s): \ --tee 3 \ " # srun error handling: # --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks # --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code +NODELIST=$(IFS=,; echo "${TRAIN_NODES[*]}") + SRUN_ARGS=" \ --wait=60 \ --kill-on-bad-exit=1 \ --nodes=$NUM_NODES \ --ntasks=$NUM_NODES \ - --nodelist=$TRAIN_NODES + --nodelist=$NODELIST " -clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1 +clear; srun $SRUN_ARGS bash -c "$LAUNCHER $CMD" 2>&1 -echo "END TIME: $(date)" \ No newline at end of file +END_TIME=$(date +%s) +echo "END TIME: $(date)" +ELAPSED_SECONDS=$((END_TIME - START_TIME)) +HOURS=$((ELAPSED_SECONDS / 3600)) +MINUTES=$(( (ELAPSED_SECONDS % 3600) / 60 )) +SECONDS=$((ELAPSED_SECONDS % 60)) +echo "TOTAL JOB TIME: ${HOURS}h ${MINUTES}m ${SECONDS}s (${ELAPSED_SECONDS} seconds)" \ No newline at end of file diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index af149e348..e12081efb 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -70,7 +70,7 @@ def run_lighteval_job( model_revision = training_args.hub_model_revision # For large models >= 30b params or those running the MATH benchmark, we need to shard them across the GPUs to avoid OOM num_gpus = get_gpu_count_for_vllm(model_name, model_revision) - # FIXME: vLLM 0.8.3 hangs with lighteval and DP > 1, so we disable it for now and use TP for all evals. See https://github.com/huggingface/lighteval/issues/670 + # FIXME: vLLM 0.8.4 hangs with lighteval and DP > 1, so we disable it for now and use TP for all evals. See https://github.com/huggingface/lighteval/issues/670 # if get_param_count_from_repo_id(model_name) >= 30_000_000_000: # tensor_parallel = True # else: From 75c3999180a7293af74f790919f3d4a28864e0ef Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 30 Apr 2025 22:02:20 +0200 Subject: [PATCH 111/137] Bump LightEval to enable DP>1 (#629) * Bump LightEval to enable DP>1 * Remove redundant arg * Update eval scores * Fix slurm --- Makefile | 2 +- README.md | 56 ++++++++++++++++----------------- setup.py | 2 +- slurm/e2b_router.slurm | 2 +- slurm/evaluate.slurm | 14 ++++----- slurm/generate.slurm | 2 +- src/open_r1/utils/evaluation.py | 14 ++++----- 7 files changed, 45 insertions(+), 47 deletions(-) diff --git a/Makefile b/Makefile index ee5a514dd..3bf653fb0 100644 --- a/Makefile +++ b/Makefile @@ -40,7 +40,7 @@ evaluate: fi \ ),)) $(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \ - MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \ + MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \ if [ "$(TASK)" = "lcb" ]; then \ lighteval vllm $$MODEL_ARGS "extended|lcb:codegeneration|0|0" \ --use-chat-template \ diff --git a/README.md b/README.md index d0f8dcda4..80b7b5fb5 100644 --- a/README.md +++ b/README.md @@ -349,8 +349,9 @@ sbatch --job-name=open_r1 --nodes=2 slurm/train.slurm --model Qwen2.5-1.5B-Instr We use `lighteval` to evaluate models. 
For models which fit on a single GPU, run: ```shell +export VLLM_WORKER_MULTIPROC_METHOD=spawn # Required for vLLM MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL # AIME 2024 @@ -377,15 +378,12 @@ lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \ --output-dir $OUTPUT_DIR ``` -> [!IMPORTANT] -> You must set `max_model_length=32768` and `max_num_batched_tokens=32768` in the `vllm` command to align with the `max_new_tokens` we define per eval. Without this, `lighteval` will throw an error. - To increase throughput across multiple GPUs, use _data parallel_ as follows: ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL @@ -399,7 +397,7 @@ For large models which require sharding across GPUs, use _tensor parallel_ and r ```shell NUM_GPUS=8 MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" TASK=aime24 OUTPUT_DIR=data/evals/$MODEL @@ -440,19 +438,19 @@ We are able to reproduce Deepseek's reported results on the AIME 2024 benchmark | Model | AIME 2024 (🤗 LightEval) | AIME 2024 (DeepSeek Reported) | |:------------------------------|:-----------------------:|:----------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 31.8 | 28.9 | -| DeepSeek-R1-Distill-Qwen-7B | 52.2 | 55.5 | -| DeepSeek-R1-Distill-Qwen-14B | 66.5 | 69.7 | -| DeepSeek-R1-Distill-Qwen-32B | 68.0 | 72.6 | -| DeepSeek-R1-Distill-Llama-8B | 43.9 | 41.7 | -| DeepSeek-R1-Distill-Llama-70B | 65.3 | 70.0 | +| DeepSeek-R1-Distill-Qwen-1.5B | 30.6 | 28.9 | +| DeepSeek-R1-Distill-Qwen-7B | 52.8 | 55.5 | +| DeepSeek-R1-Distill-Qwen-14B | 65.6 | 69.7 | +| DeepSeek-R1-Distill-Qwen-32B | 71.0 | 72.6 | +| DeepSeek-R1-Distill-Llama-8B | 44.8 | 41.7 | +| DeepSeek-R1-Distill-Llama-70B | 63.0 | 70.0 | To reproduce these results use the following command: ```shell NUM_GPUS=1 # Set to 8 for 32B and 70B models MODEL=deepseek-ai/{model_name} -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" 
OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "lighteval|aime24|0|0" \ @@ -472,19 +470,20 @@ We are able to reproduce Deepseek's reported results on the MATH-500 benchmark w | Model | MATH-500 (🤗 LightEval) | MATH-500 (DeepSeek Reported) | |:------------------------------|:-----------------------:|:----------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 84.6 | 83.9 | -| DeepSeek-R1-Distill-Qwen-7B | 93.0 | 92.8 | -| DeepSeek-R1-Distill-Qwen-14B | 95.0 | 93.9 | -| DeepSeek-R1-Distill-Qwen-32B | 96.6 | 94.3 | -| DeepSeek-R1-Distill-Llama-8B | 88.6 | 89.1 | -| DeepSeek-R1-Distill-Llama-70B | 96.4 | 94.5 | +| DeepSeek-R1-Distill-Qwen-1.5B | 84.4 | 83.9 | +| DeepSeek-R1-Distill-Qwen-7B | 94.4 | 92.8 | +| DeepSeek-R1-Distill-Qwen-14B | 94.2 | 93.9 | +| DeepSeek-R1-Distill-Qwen-32B | 95.8 | 94.3 | +| DeepSeek-R1-Distill-Llama-8B | 88.4 | 89.1 | +| DeepSeek-R1-Distill-Llama-70B | 96.0 | 94.5 | To reproduce these results use the following command: ```shell +export VLLM_WORKER_MULTIPROC_METHOD=spawn NUM_GPUS=1 # Set to 8 for 32B and 70B models MODEL=deepseek-ai/{model_name} -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "lighteval|math_500|0|0" \ @@ -504,19 +503,20 @@ We are able to reproduce Deepseek's reported results on the GPQA Diamond benchma | Model | GPQA Diamond (🤗 LightEval) | GPQA Diamond (DeepSeek Reported) | |:------------------------------|:---------------------------:|:--------------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 34.3 | 33.8 | -| DeepSeek-R1-Distill-Qwen-7B | 50.5 | 49.1 | +| DeepSeek-R1-Distill-Qwen-1.5B | 36.9 | 33.8 | +| DeepSeek-R1-Distill-Qwen-7B | 51.6 | 49.1 | | DeepSeek-R1-Distill-Qwen-14B | 59.6 | 59.1 | -| DeepSeek-R1-Distill-Qwen-32B | 63.6 | 62.1 | -| DeepSeek-R1-Distill-Llama-8B | 52.0 | 49.0 | -| DeepSeek-R1-Distill-Llama-70B | 67.2 | 65.2 | +| DeepSeek-R1-Distill-Qwen-32B | 63.1 | 62.1 | +| DeepSeek-R1-Distill-Llama-8B | 54.0 | 49.0 | +| DeepSeek-R1-Distill-Llama-70B | 68.2 | 65.2 | To reproduce these results use the following command: ```shell +export VLLM_WORKER_MULTIPROC_METHOD=spawn NUM_GPUS=1 # Set to 8 for 32B and 70B models -MODEL=deepseek-ai/{model_name} -MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "lighteval|gpqa:diamond|0|0" \ @@ -546,7 +546,7 @@ To reproduce these results use the following command: ```shell NUM_GPUS=1 # Set to 8 for 32B and 70B models, or data_parallel_size=8 with the smaller models for speed MODEL=deepseek-ai/{model_name} 
-MODEL_ARGS="pretrained=$MODEL,dtype=bfloat16,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" +MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,max_model_length=32768,gpu_memory_utilization=0.8,data_parallel_size=$NUM_GPUS,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" OUTPUT_DIR=data/evals/$MODEL lighteval vllm $MODEL_ARGS "extended|lcb:codegeneration|0|0" \ diff --git a/setup.py b/setup.py index d5800f421..1fafd4121 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", "liger-kernel>=0.5.6", - "lighteval @ git+https://github.com/huggingface/lighteval.git@bb14995c4eccab5cabd450b1e509c3c898a16921", # pass@1 for AIME with n=32 samples per prompt + "lighteval @ git+https://github.com/huggingface/lighteval.git@989f5f5586de1ddfeceb0dfa5076bd0740d376fa", "math-verify==0.5.2", # Used for math verification in grpo "packaging>=23.0", "parameterized>=0.9.0", diff --git a/slurm/e2b_router.slurm b/slurm/e2b_router.slurm index 5be4bfebc..254130594 100644 --- a/slurm/e2b_router.slurm +++ b/slurm/e2b_router.slurm @@ -4,7 +4,7 @@ #SBATCH --mem=16g #SBATCH --cpus-per-task=16 #SBATCH --output=/fsx/open-r1/logs/e2b_router/%x-%j.out -#SBATCH --err=/fsx/open-r1/logs/e2b_router/%x-%j.err +#SBATCH --error=/fsx/open-r1/logs/e2b_router/%x-%j.err #SBATCH --requeue echo "Starting job" diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index abe1b11df..4b81594b7 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -3,13 +3,16 @@ #SBATCH --gres=gpu:8 #SBATCH --partition=hopper-prod #SBATCH --output=./logs/%x-%j.out -#SBATCH --err=./logs/%x-%j.err +#SBATCH --error=./logs/%x-%j.err #SBATCH --requeue # Specific configuration optimized for the Hugging Face Compute Cluster # Be ye warned this may not work on other clusters! 
module load cuda/12.4 +# Needed for vLLM +export VLLM_WORKER_MULTIPROC_METHOD=spawn + set -x -e source ~/.bashrc @@ -25,14 +28,11 @@ MODEL_REVISION=$4 # $7 is reserved for system_prompt, see line 51 NUM_GPUS=$(nvidia-smi -L | wc -l) -# Set Whether to use tensor parallelism or data parallelism +# Use TP to shard model across GPUs if [ "$TENSOR_PARALLEL" = "True" ]; then - # use TP to shard model across NUM_GPUS - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # FIXME: lighteval now requires us to manually pass the generation params - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" + MODEL_ARGS="model_name=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" else - MODEL_ARGS="pretrained=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,max_num_batched_tokens=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" + MODEL_ARGS="model_name=$MODEL_ID,revision=$MODEL_REVISION,trust_remote_code=$TRUST_REMOTE_CODE,dtype=bfloat16,data_parallel_size=$NUM_GPUS,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" fi LM_EVAL_REPO_ID="open-r1/open-r1-eval-leaderboard" diff --git a/slurm/generate.slurm b/slurm/generate.slurm index 2fdf61795..0935d6245 100644 --- a/slurm/generate.slurm +++ b/slurm/generate.slurm @@ -6,7 +6,7 @@ #SBATCH --exclusive #SBATCH --gpus-per-node=8 #SBATCH --output=./logs/%x-%j.out -#SBATCH --err=./logs/%x-%j.err +#SBATCH --error=./logs/%x-%j.err #SBATCH --time=04-00:00:00 # Parse command line arguments diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index e12081efb..5719350fb 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -1,7 +1,7 @@ import subprocess from typing import TYPE_CHECKING, Dict, Union -from .hub import get_gpu_count_for_vllm +from .hub import get_gpu_count_for_vllm, get_param_count_from_repo_id if TYPE_CHECKING: @@ -70,13 +70,11 @@ def run_lighteval_job( model_revision = training_args.hub_model_revision # For large models >= 30b params or those running the MATH benchmark, we need to shard them across the GPUs to avoid OOM num_gpus = get_gpu_count_for_vllm(model_name, model_revision) - # FIXME: vLLM 0.8.4 hangs with lighteval and DP > 1, so we disable it for now and use TP for all evals. 
See https://github.com/huggingface/lighteval/issues/670 - # if get_param_count_from_repo_id(model_name) >= 30_000_000_000: - # tensor_parallel = True - # else: - # num_gpus = 8 - # tensor_parallel = False - tensor_parallel = True + if get_param_count_from_repo_id(model_name) >= 30_000_000_000: + tensor_parallel = True + else: + num_gpus = 8 + tensor_parallel = False cmd = VLLM_SLURM_PREFIX.copy() cmd_args = [ From 65211f4824f699795f6d13bc445d3862155b5055 Mon Sep 17 00:00:00 2001 From: binary-husky <96192199+binary-husky@users.noreply.github.com> Date: Thu, 1 May 2025 04:02:59 +0800 Subject: [PATCH 112/137] =?UTF-8?q?=F0=9F=A6=9CEnhance=20repetition=20pena?= =?UTF-8?q?lty=20reward=20for=20language=20that=20cannot=20be=20split=20by?= =?UTF-8?q?=20whitespace=20(#516)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update rewards.py * add test for repetition reward with language * Update src/open_r1/rewards.py Co-authored-by: lewtun * Update src/open_r1/rewards.py Co-authored-by: lewtun --------- Co-authored-by: lewtun --- src/open_r1/rewards.py | 31 +++++++++++++++++++++++-------- tests/test_rewards.py | 12 ++++++++++++ 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index daa2f3252..17f001f26 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -281,7 +281,7 @@ def cosine_scaled_reward(completions, solution, **kwargs): return cosine_scaled_reward -def get_repetition_penalty_reward(ngram_size: int, max_penalty: float): +def get_repetition_penalty_reward(ngram_size: int, max_penalty: float, language: str = "en"): """ Computes N-gram repetition penalty as described in Appendix C.2 of https://arxiv.org/abs/2502.03373. Reference implementation from: https://github.com/eddycmu/demystify-long-cot/blob/release/openrlhf/openrlhf/reward/repetition.py @@ -289,13 +289,25 @@ def get_repetition_penalty_reward(ngram_size: int, max_penalty: float): Args: ngram_size: size of the n-grams max_penalty: Maximum (negative) penalty for wrong answers + language: Language of the text, defaults to `en`. Used to choose the way to split the text into n-grams. """ if max_penalty > 0: raise ValueError(f"max_penalty {max_penalty} should not be positive") - def zipngram(text: str, ngram_size: int): - words = text.lower().split() - return zip(*[words[i:] for i in range(ngram_size)]) + if language == "en": + def zipngram(text: str, ngram_size: int): + words = text.lower().split() + return zip(*[words[i:] for i in range(ngram_size)]), words + elif language == "zh": + from transformers.utils.import_utils import _is_package_available + if not _is_package_available("jieba"): + raise ValueError("Please install jieba to use Chinese language") + def zipngram(text: str, ngram_size: int): + import jieba + seg_list = list(jieba.cut(text)) + return zip(*[seg_list[i:] for i in range(ngram_size)]), seg_list + else: + raise ValueError(f"Word splitting for language `{language}` is not yet implemented. 
Please implement your own zip-ngram function.") def repetition_penalty_reward(completions, **kwargs) -> float: """ @@ -312,13 +324,16 @@ def repetition_penalty_reward(completions, **kwargs) -> float: if completion == "": rewards.append(0.0) continue - if len(completion.split()) < ngram_size: - rewards.append(0.0) - continue ngrams = set() total = 0 - for ng in zipngram(completion, ngram_size): + ngram_array, words = zipngram(completion, ngram_size) + + if len(words) < ngram_size: + rewards.append(0.0) + continue + + for ng in ngram_array: ngrams.add(ng) total += 1 diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 3a9df6100..a5608cac5 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -413,6 +413,18 @@ def test_tag_count_rewards_missing_all_tags(self): rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.0) + def test_full_repetition_with_language(self): + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="en") + completions = [[{"content": "that that that that that"}]] + rewards = reward_fn(completions) + self.assertEqual(rewards, [-0.75]) + # begin test for zh language + try: import jieba + except: self.skipTest("jieba is not installed") + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="zh") + completions = [[{"content": "这个这个这个这个这个"}]] + rewards = reward_fn(completions) + self.assertEqual(rewards, [-0.75]) class TestCodeFormat(unittest.TestCase): def test_correct_python_format(self): From 9373ad3055e7a4095a263c8d9d810630077fe29c Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 30 Apr 2025 22:16:18 +0200 Subject: [PATCH 113/137] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 80b7b5fb5..1243ac5f8 100644 --- a/README.md +++ b/README.md @@ -534,12 +534,12 @@ We are able to reproduce Deepseek's reported results on the LiveCodeBench code g | Model | LiveCodeBench (🤗 LightEval) | LiveCodeBench (DeepSeek Reported) | |:------------------------------|:----------------------------:|:--------------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 16.3 | 16.9 | -| DeepSeek-R1-Distill-Qwen-7B | 36.6 | 37.6 | -| DeepSeek-R1-Distill-Qwen-14B | 51.5 | 53.1 | -| DeepSeek-R1-Distill-Qwen-32B | 56.6 | 57.2 | -| DeepSeek-R1-Distill-Llama-8B | 37.0 | 39.6 | -| DeepSeek-R1-Distill-Llama-70B | 54.5 | 57.5 | +| DeepSeek-R1-Distill-Qwen-1.5B | 16.1 | 16.9 | +| DeepSeek-R1-Distill-Qwen-7B | 37.4 | 37.6 | +| DeepSeek-R1-Distill-Qwen-14B | 51.3 | 53.1 | +| DeepSeek-R1-Distill-Qwen-32B | 56.0 | 57.2 | +| DeepSeek-R1-Distill-Llama-8B | 37.4 | 39.6 | +| DeepSeek-R1-Distill-Llama-70B | 55.9 | 57.5 | To reproduce these results use the following command: From c8b989109d6b3b87d8ffcbf5e5f1c3a486799661 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Fri, 2 May 2025 14:45:17 +0000 Subject: [PATCH 114/137] Fix style --- src/open_r1/rewards.py | 8 +++++++- tests/test_rewards.py | 7 +++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 17f001f26..df1eed3f9 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -295,19 +295,25 @@ def get_repetition_penalty_reward(ngram_size: int, max_penalty: float, language: raise ValueError(f"max_penalty {max_penalty} should not be positive") if language == "en": + def zipngram(text: str, ngram_size: int): words = text.lower().split() return zip(*[words[i:] for i in range(ngram_size)]), words elif 
language == "zh": from transformers.utils.import_utils import _is_package_available + if not _is_package_available("jieba"): raise ValueError("Please install jieba to use Chinese language") + def zipngram(text: str, ngram_size: int): import jieba + seg_list = list(jieba.cut(text)) return zip(*[seg_list[i:] for i in range(ngram_size)]), seg_list else: - raise ValueError(f"Word splitting for language `{language}` is not yet implemented. Please implement your own zip-ngram function.") + raise ValueError( + f"Word splitting for language `{language}` is not yet implemented. Please implement your own zip-ngram function." + ) def repetition_penalty_reward(completions, **kwargs) -> float: """ diff --git a/tests/test_rewards.py b/tests/test_rewards.py index a5608cac5..b13e1bd01 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -419,13 +419,16 @@ def test_full_repetition_with_language(self): rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) # begin test for zh language - try: import jieba - except: self.skipTest("jieba is not installed") + try: + import jieba + except: + self.skipTest("jieba is not installed") reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="zh") completions = [[{"content": "这个这个这个这个这个"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) + class TestCodeFormat(unittest.TestCase): def test_correct_python_format(self): """Test code format reward with correct Python format.""" From 52520a6713f8ebe03637cd0b75b9308946a33b7f Mon Sep 17 00:00:00 2001 From: lewtun Date: Mon, 5 May 2025 15:49:10 +0200 Subject: [PATCH 115/137] Fix style (#631) * Fix style * Fix * Add jieba --- setup.py | 5 +++-- tests/test_rewards.py | 4 ---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 1fafd4121..5ee173c0e 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ "hf_transfer>=0.1.4", "huggingface-hub[cli,hf_xet]>=0.30.2,<1.0", "isort>=5.12.0", + "jieba", # Needed for Chinese language support "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", "liger-kernel>=0.5.6", @@ -85,10 +86,10 @@ def deps_list(*pkgs): extras = {} -extras["tests"] = deps_list("pytest", "parameterized", "math-verify") +extras["tests"] = deps_list("pytest", "parameterized", "math-verify", "jieba") extras["torch"] = deps_list("torch") extras["quality"] = deps_list("ruff", "isort", "flake8") -extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv") +extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv", "jieba") extras["eval"] = deps_list("lighteval", "math-verify") extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["code"] diff --git a/tests/test_rewards.py b/tests/test_rewards.py index b13e1bd01..37c17d6d3 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -419,10 +419,6 @@ def test_full_repetition_with_language(self): rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) # begin test for zh language - try: - import jieba - except: - self.skipTest("jieba is not installed") reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="zh") completions = [[{"content": "这个这个这个这个这个"}]] rewards = reward_fn(completions) From af81114044e3f26166d1af058e9f8f6de382beef Mon Sep 17 00:00:00 2001 From: Andrei Date: Wed, 7 May 2025 23:59:54 -0700 Subject: [PATCH 116/137] Code Execution using Morph Cloud (#614) * initial commit for morphcloud sandbox support * initial * fixed prints in morph 
client for ioi * updated import * context manager * removed unnecessary comments * more intelligent instance/snapshot management * update * Add documentation for Morph integration * Delete MORPH_INTEGRATION.md * added retry and modularity to morph client * updates to kwargs and setup.py * Update setup.py * added languages codepath + fixed slurm + added m orph tests * make quality formatting fixes * conditional imports for morph --------- Co-authored-by: arb8020 --- README.md | 81 ++- scripts/benchmark_e2b.py | 5 +- scripts/e2b_router.py | 12 +- scripts/morph_router.py | 173 ++++++ setup.py | 3 +- slurm/morph_router.slurm | 16 + src/open_r1/configs.py | 63 ++- src/open_r1/generate.py | 16 +- src/open_r1/grpo.py | 14 +- src/open_r1/rewards.py | 269 ++++----- src/open_r1/sft.py | 6 +- src/open_r1/utils/__init__.py | 4 +- src/open_r1/utils/callbacks.py | 12 +- src/open_r1/utils/code_providers.py | 366 ++++++++++++ src/open_r1/utils/evaluation.py | 22 +- src/open_r1/utils/hub.py | 36 +- src/open_r1/utils/import_utils.py | 7 + src/open_r1/utils/ioi/__init__.py | 5 +- src/open_r1/utils/ioi/morph_client.py | 744 +++++++++++++++++++++++++ src/open_r1/utils/ioi/piston_client.py | 91 ++- src/open_r1/utils/ioi/scoring.py | 82 ++- src/open_r1/utils/ioi/utils.py | 9 +- src/open_r1/utils/model_utils.py | 12 +- src/open_r1/utils/routed_morph.py | 130 +++++ src/open_r1/utils/routed_sandbox.py | 22 +- tests/slow/test_code_reward.py | 189 ++++++- tests/test_rewards.py | 115 +++- 27 files changed, 2216 insertions(+), 288 deletions(-) create mode 100644 scripts/morph_router.py create mode 100644 slurm/morph_router.slurm create mode 100644 src/open_r1/utils/code_providers.py create mode 100644 src/open_r1/utils/ioi/morph_client.py create mode 100644 src/open_r1/utils/routed_morph.py diff --git a/README.md b/README.md index 1243ac5f8..d8f80f3c9 100644 --- a/README.md +++ b/README.md @@ -247,23 +247,56 @@ See the [Launching jobs on a Slurm cluster](#launching-jobs-on-a-slurm-cluster) #### 👨‍💻 Training with a code interpreter -We provide a `code` reward function for executing code generated by the policy during training. Currently, this reward function targets code contests like [Codeforces](https://codeforces.com), where solutions are executed against a set of test cases and the overall success rate is returned as the final reward. To ensure safe execution, we use [E2B](https://e2b.dev) sandboxes, which are fast and cheap to run. To use this reward function, first install the necessary dependencies: +We provide a `code` reward function for executing code generated by the policy during training. Currently, this reward function targets code contests like [Codeforces](https://codeforces.com), where solutions are executed against a set of test cases and the overall success rate is returned as the final reward. To ensure safe execution, we support multiple sandbox providers: + +1. [E2B](https://e2b.dev) - Fast, cloud-based sandboxes with focus on Python execution +2. 
[Morph](https://cloud.morph.so/web/) - Cloud-based sandboxes with broader language support - Python/JS/C++/Rust + +To use the code reward function, first install the necessary dependencies: ```shell uv pip install -e '.[code]' ``` -Then create a `.env` file and place an API token from E2B within it: +##### E2B Provider + +To use E2B sandboxes, create a `.env` file and add your E2B API token: ``` E2B_API_KEY="e2b_xxx" ``` -Then make sure your dataset contains a `verification_info` column with the following schema (adopted from PrimeIntellect's excellent [datasets](https://huggingface.co/collections/PrimeIntellect/synthetic-1-67a2c399cfdd6c9f7fae0c37) of verifiable problems): +##### Morph Provider + +To use Morph, first install the morphcloud package: + +```shell +pip install morphcloud +``` + +Then add your Morph API token to the `.env` file: + +``` +MORPH_API_KEY="YOUR_MORPH_API_KEY" +``` + +To specify which provider to use, add the `provider_type` parameter in your configuration: + +```yaml +# For E2B +provider_type: e2b + +# For Morph +provider_type: morph +``` + +##### Dataset Requirements + +Make sure your dataset contains a `verification_info` column with the following schema (adopted from PrimeIntellect's excellent [datasets](https://huggingface.co/collections/PrimeIntellect/synthetic-1-67a2c399cfdd6c9f7fae0c37) of verifiable problems): ```python { - "language": "python", + "language": "python", # Morph supports more languages including C++, Java, etc. "test_cases": [ { "input": "4\n4\n0001\n1000\n0011\n0111\n3\n010\n101\n0\n2\n00000\n00001\n4\n01\n001\n0001\n00001\n", @@ -272,7 +305,6 @@ Then make sure your dataset contains a `verification_info` column with the follo } ], } -``` For example, to train a smol model on Python problems, start the vLLM server: @@ -288,28 +320,49 @@ CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \ src/open_r1/grpo.py --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml ``` -It is possible to be rate limited when too many scripts are executed on E2B Sandboxes, so we provide an E2B router script that can be launched on a CPU node on your cluster: +##### Using Router Services -For GRPO training: -First start the router and get its IP +It is possible to be rate limited when too many scripts are executed on sandbox services. For both providers, we offer router scripts that can be launched on a CPU node: +For E2B: ```shell sbatch slurm/e2b_router.slurm ``` -Then add this line in your training YAML config: (for example) +For Morph: +```shell +sbatch slurm/morph_router.slurm ``` + +Then add the router URL in your training YAML config: +```yaml +# For E2B e2b_router_url: 1.2.3.4:8000 + +# For Morph +morph_router_url: 1.2.3.4:8000 ``` -The port here should match the one used when launching the router. -All training jobs can share the same router IP which will ensure there are at most 20 parallel executions. + +The port should match the one used when launching the router. +All training jobs can share the same router IP which will ensure parallel executions are properly managed. #### IOI problems -We provide a `ioi_code_reward` reward function for executing problems from [IOI](https://hf.co/datasets/open-r1/ioi) using [piston](https://github.com/engineer-man/piston). +We provide a `ioi_code_reward` reward function for executing problems from [IOI](https://hf.co/datasets/open-r1/ioi). You can use either [piston](https://github.com/engineer-man/piston) or Morph as your execution provider. + +##### Piston + +To use Piston: +1. 
Get piston workers running, see [slurm/piston/README.md](./slurm/piston/README.md) +2. Set your environment variable `PISTON_ENDPOINTS` to `slurm` or to a list of piston worker endpoints +3. In your configuration, use `ioi_provider: "piston"` + +##### Morph -To get piston workers running, see [slurm/piston/README.md](./slurm/piston/README.md). -Set your environment variable `PISTON_ENDPOINTS` to `slurm` or to a list of piston worker endpoints. +Morph is a cloud-based solution that provides sandboxed environments for running code. To use it: +1. Install the Morph client: `pip install morphcloud` +2. Add your Morph API key to the `.env` file: `MORPH_API_KEY="your_key_here"` +3. In your configuration, use `ioi_provider: "morph"` See the [example recipe](./recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml) for how to use the reward function: diff --git a/scripts/benchmark_e2b.py b/scripts/benchmark_e2b.py index cbaca41de..ac2fb1835 100644 --- a/scripts/benchmark_e2b.py +++ b/scripts/benchmark_e2b.py @@ -22,13 +22,14 @@ """ from datasets import load_dataset -from open_r1.rewards import code_reward import time from tqdm.auto import tqdm from dotenv import load_dotenv load_dotenv() +from open_r1.rewards import code_reward + def benchmark_code_reward(example): start_time = time.time() test_completions = [[{"content": example["gold_standard_solution"]}]] @@ -81,4 +82,4 @@ def benchmark_code_reward(example): for result in results: print(f"| {result['num_samples']:^11} | {result['num_parallel']:^15} | {result['execution_time']:17.2f} | {result['mean_reward']:^11.4f} | {result['min_reward']:^11.4f} | {result['max_reward']:^11.4f} |") - \ No newline at end of file + diff --git a/scripts/e2b_router.py b/scripts/e2b_router.py index 237ad8f90..9cc25603c 100644 --- a/scripts/e2b_router.py +++ b/scripts/e2b_router.py @@ -35,12 +35,12 @@ class BatchRequest(BaseModel): Attributes: scripts (list[str]): A list of script names or paths to be executed. - language (str): The programming language in which the scripts are written. + languages (list[str]): The programming languages for each script in the list. timeout (int): The maximum allowed execution time for each script in seconds. request_timeout (int): The maximum allowed time for the entire batch request in seconds. """ scripts: list[str] - language: str + languages: list[str] timeout: int request_timeout: int @@ -78,7 +78,7 @@ def create_app(args): 2. POST /execute_batch: - Executes a batch of scripts in an isolated sandbox environment. - Request Body: BatchRequest object containing: - - language (str): The programming language of the scripts (python or javascript). + - languages (list[str]): The programming languages of the scripts (python or javascript). - timeout (int): The maximum execution time for each script. - request_timeout (int): The timeout for the request itself. - scripts (List[str]): A list of scripts to execute. 
@@ -102,12 +102,12 @@ async def health(): @app.post("/execute_batch") async def execute_batch(batch: BatchRequest, request: Request): semaphore = request.app.state.sandbox_semaphore - language = batch.language + languages = batch.languages timeout = batch.timeout request_timeout = batch.request_timeout asyncio_timeout = batch.timeout + 1 - async def run_script(script: str) -> ScriptResult: + async def run_script(script: str, language: str) -> ScriptResult: async with semaphore: try: @@ -130,7 +130,7 @@ async def run_script(script: str) -> ScriptResult: except Exception: pass - tasks = [run_script(script) for script in batch.scripts] + tasks = [run_script(script, lang) for script, lang in zip(batch.scripts, batch.languages)] return await asyncio.gather(*tasks) return app diff --git a/scripts/morph_router.py b/scripts/morph_router.py new file mode 100644 index 000000000..166b9cce1 --- /dev/null +++ b/scripts/morph_router.py @@ -0,0 +1,173 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import asyncio +from fastapi import FastAPI +from pydantic import BaseModel, ConfigDict +from typing import Optional, List +from fastapi import FastAPI, Request +import uvicorn +from dotenv import load_dotenv +import os + +load_dotenv() + +class BatchRequest(BaseModel): + """ + BatchRequest is a data model representing a batch processing request. + + Attributes: + scripts (list[str]): A list of script names or paths to be executed. + languages (List[str]): The programming languages for each script in the list. + timeout (int): The maximum allowed execution time for each script in seconds. + request_timeout (int): The maximum allowed time for the entire batch request in seconds. + """ + scripts: List[str] + languages: List[str] + timeout: int + request_timeout: int + +class ScriptResult(BaseModel): + """ + ScriptResult is a Pydantic model that represents the result of a script execution. + Attributes: + text (Optional[str]): The output text from the script execution. + exception_str (Optional[str]): An optional string that captures the exception + message or details if an error occurred during the script's execution. + model_config (ConfigDict): A configuration dictionary that allows arbitrary + types to be used within the Pydantic model. + """ + text: Optional[str] + exception_str: Optional[str] + + + model_config = ConfigDict(arbitrary_types_allowed=True) + +def create_app(args): + """ + Creates and configures a FastAPI application instance for the MorphCloud router. + + Args: + args: An object containing configuration parameters for the application. + - max_num_sandboxes (int): The maximum number of concurrent sandboxes allowed. + - api_key (str): The MorphCloud API key to use. + + Returns: + FastAPI: A configured FastAPI application instance. 
+ """ + app = FastAPI() + + from morphcloud.api import MorphCloudClient + from morphcloud.sandbox import Sandbox + + app.state.client = MorphCloudClient(api_key=args.api_key) + app.state.Sandbox = Sandbox + + app.state.sandbox_semaphore = asyncio.Semaphore(args.max_num_sandboxes) + + @app.get("/health") + async def health(): + return {"status": "ok"} + + @app.post("/execute_batch") + async def execute_batch(batch: BatchRequest, request: Request): + semaphore = request.app.state.sandbox_semaphore + client = request.app.state.client + Sandbox = request.app.state.Sandbox + + languages = batch.languages + timeout = batch.timeout + request_timeout = batch.request_timeout + asyncio_timeout = batch.timeout + 1 + + async def run_script(script: str, language: str) -> ScriptResult: + sandbox = None + sandbox_id = "unknown" + + async with semaphore: + try: + sandbox = await asyncio.to_thread( + Sandbox.new, + client=client, + ttl_seconds=timeout + ) + + sandbox_id = getattr(sandbox, 'id', None) or getattr(sandbox._instance, 'id', 'unknown') + + execution = await asyncio.wait_for( + asyncio.to_thread( + sandbox.run_code, + script, + language=language, + timeout=timeout * 1000 + ), + timeout=asyncio_timeout, + ) + + if hasattr(execution, 'text') and execution.text: + return ScriptResult(text=execution.text, exception_str=None) + elif hasattr(execution, 'stdout') and execution.stdout: + return ScriptResult(text=execution.stdout, exception_str=None) + else: + return ScriptResult(text="", exception_str="No output from execution") + + except Exception as e: + return ScriptResult(text=None, exception_str=str(e)) + + finally: + if sandbox: + try: + await asyncio.to_thread(sandbox.close) + await asyncio.to_thread(sandbox.shutdown) + except Exception: + pass + + tasks = [run_script(script, lang) for script, lang in zip(batch.scripts, batch.languages)] + return await asyncio.gather(*tasks) + + return app + +def parse_args(): + """ + Parse command-line arguments for the morph_router script. + + Arguments: + --host (str): The hostname or IP address to bind the server to. Defaults to "0.0.0.0". + --port (int): The port number on which the server will listen. Defaults to 8001. + --max_num_sandboxes (int): The maximum number of sandboxes that can be created simultaneously. Defaults to 20. + --api_key (str): The MorphCloud API key. If not provided, it will be read from the MORPH_API_KEY environment variable. + + Returns: + argparse.Namespace: Parsed command-line arguments as an object. + """ + parser = argparse.ArgumentParser() + parser.add_argument("--host", default="0.0.0.0") + parser.add_argument("--port", type=int, default=8001) + parser.add_argument("--max_num_sandboxes", type=int, default=20) + parser.add_argument("--api_key", default=os.getenv("MORPH_API_KEY")) + args = parser.parse_args() + + if not args.api_key: + raise ValueError("MorphCloud API key not provided. 
Please set MORPH_API_KEY environment variable or use --api_key.") + + return args + +if __name__ == "__main__": + args = parse_args() + app = create_app(args) + + print(f"Starting MorphCloud Router on {args.host}:{args.port}") + uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file diff --git a/setup.py b/setup.py index 5ee173c0e..ae2bcda7b 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ "liger-kernel>=0.5.6", "lighteval @ git+https://github.com/huggingface/lighteval.git@989f5f5586de1ddfeceb0dfa5076bd0740d376fa", "math-verify==0.5.2", # Used for math verification in grpo + "morphcloud==0.1.67", "packaging>=23.0", "parameterized>=0.9.0", "peft>=0.14.0", @@ -89,7 +90,7 @@ def deps_list(*pkgs): extras["tests"] = deps_list("pytest", "parameterized", "math-verify", "jieba") extras["torch"] = deps_list("torch") extras["quality"] = deps_list("ruff", "isort", "flake8") -extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv", "jieba") +extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv", "morphcloud", "jieba") extras["eval"] = deps_list("lighteval", "math-verify") extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["code"] diff --git a/slurm/morph_router.slurm b/slurm/morph_router.slurm new file mode 100644 index 000000000..9bb3e79f1 --- /dev/null +++ b/slurm/morph_router.slurm @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --partition=hopper-cpu +#SBATCH --mem=16g +#SBATCH --cpus-per-task=16 +#SBATCH --output=/fsx/open-r1/logs/morph_router/%x-%j.out +#SBATCH --err=/fsx/open-r1/logs/morph_router/%x-%j.err +#SBATCH --requeue + +echo "Starting job" +set -x -e + +source ~/.bashrc +source openr1/bin/activate + +srun python scripts/morph_router.py --port 8001 --max_num_sandboxes 20 diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index b24dd305e..17b4cb89e 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -27,12 +27,16 @@ class GRPOConfig(trl.GRPOConfig): """ benchmarks: list[str] = field( - default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."} + default_factory=lambda: [], + metadata={"help": "The benchmarks to run after training."}, ) callbacks: list[str] = field( - default_factory=lambda: [], metadata={"help": "The callbacks to run during training."} + default_factory=lambda: [], + metadata={"help": "The callbacks to run during training."}, + ) + chat_template: Optional[str] = field( + default=None, metadata={"help": "The chat template to use."} ) - chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."}) system_prompt: Optional[str] = field( default=None, metadata={"help": "The optional system prompt to use."}, @@ -40,8 +44,12 @@ class GRPOConfig(trl.GRPOConfig): hub_model_revision: Optional[str] = field( default="main", metadata={"help": "The Hub model branch to push the model to."} ) - overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."}) - push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."}) + overwrite_hub_revision: bool = field( + default=False, metadata={"help": "Whether to overwrite the Hub revision."} + ) + push_to_hub_revision: bool = field( + default=False, metadata={"help": "Whether to push to a Hub revision/branch."} + ) wandb_entity: Optional[str] = field( default=None, metadata={"help": ("The entity to store runs under.")}, @@ -63,12 +71,16 @@ class SFTConfig(trl.SFTConfig): """ 
benchmarks: list[str] = field( - default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."} + default_factory=lambda: [], + metadata={"help": "The benchmarks to run after training."}, ) callbacks: list[str] = field( - default_factory=lambda: [], metadata={"help": "The callbacks to run during training."} + default_factory=lambda: [], + metadata={"help": "The callbacks to run during training."}, + ) + chat_template: Optional[str] = field( + default=None, metadata={"help": "The chat template to use."} ) - chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."}) system_prompt: Optional[str] = field( default=None, metadata={"help": "The optional system prompt to use for benchmarking."}, @@ -77,8 +89,12 @@ class SFTConfig(trl.SFTConfig): default="main", metadata={"help": "The Hub model branch to push the model to."}, ) - overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."}) - push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."}) + overwrite_hub_revision: bool = field( + default=False, metadata={"help": "Whether to overwrite the Hub revision."} + ) + push_to_hub_revision: bool = field( + default=False, metadata={"help": "Whether to push to a Hub revision/branch."} + ) wandb_entity: Optional[str] = field( default=None, metadata={"help": ("The entity to store runs under.")}, @@ -147,7 +163,9 @@ class GRPOScriptArguments(trl.ScriptArguments): ) repetition_max_penalty: float = field( default=-1.0, - metadata={"help": "Maximum (negative) penalty for for repetition penalty reward"}, + metadata={ + "help": "Maximum (negative) penalty for for repetition penalty reward" + }, ) code_language: str = field( default="python", @@ -176,5 +194,26 @@ class GRPOScriptArguments(trl.ScriptArguments): e2b_router_url: Optional[str] = field( default=None, - metadata={"help": "URL for the E2B route. See scripts/e2b_router.py"}, + metadata={"help": "URL for the E2B router. See scripts/e2b_router.py"}, + ) + + morph_router_url: Optional[str] = field( + default=None, + metadata={"help": "URL for the MorphCloud router. See scripts/morph_router.py"}, + ) + + code_provider: Optional[str] = field( + default="e2b", + metadata={ + "help": "Provider for code execution. Options: 'e2b', 'local', 'morph'.", + "choices": ["e2b", "local", "morph"], + }, + ) + + ioi_provider: Optional[str] = field( + default="piston", + metadata={ + "help": "Provider for IOI code execution. 
Options: 'piston', 'morph'.", + "choices": ["piston", "morph"], + }, ) diff --git a/src/open_r1/generate.py b/src/open_r1/generate.py index 40ff3b39f..a002632cf 100644 --- a/src/open_r1/generate.py +++ b/src/open_r1/generate.py @@ -53,7 +53,9 @@ def build_distilabel_pipeline( generation_kwargs=generation_kwargs, ), template=prompt_template, - input_mappings={"instruction": prompt_column} if prompt_column is not None else {}, + input_mappings=( + {"instruction": prompt_column} if prompt_column is not None else {} + ), input_batch_size=input_batch_size, num_generations=num_generations, group_generations=True, @@ -68,7 +70,9 @@ def build_distilabel_pipeline( from datasets import load_dataset - parser = argparse.ArgumentParser(description="Run distilabel pipeline for generating responses with DeepSeek R1") + parser = argparse.ArgumentParser( + description="Run distilabel pipeline for generating responses with DeepSeek R1" + ) parser.add_argument( "--hf-dataset", type=str, @@ -175,8 +179,12 @@ def build_distilabel_pipeline( print(f" {arg}: {value}") print() - print(f"Loading '{args.hf_dataset}' (config: {args.hf_dataset_config}, split: {args.hf_dataset_split}) dataset...") - dataset = load_dataset(args.hf_dataset, args.hf_dataset_config, split=args.hf_dataset_split) + print( + f"Loading '{args.hf_dataset}' (config: {args.hf_dataset_config}, split: {args.hf_dataset_split}) dataset..." + ) + dataset = load_dataset( + args.hf_dataset, args.hf_dataset_config, split=args.hf_dataset_split + ) print("Dataset loaded!") pipeline = build_distilabel_pipeline( diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index 80f5e9e71..bbec2e5e9 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -89,14 +89,18 @@ def main(script_args, training_args, model_args): reward_funcs = get_reward_funcs(script_args) # Format into conversation - def make_conversation(example, prompt_column: str = script_args.dataset_prompt_column): + def make_conversation( + example, prompt_column: str = script_args.dataset_prompt_column + ): prompt = [] if training_args.system_prompt is not None: prompt.append({"role": "system", "content": training_args.system_prompt}) if prompt_column not in example: - raise ValueError(f"Dataset Question Field Error: {prompt_column} is not supported.") + raise ValueError( + f"Dataset Question Field Error: {prompt_column} is not supported." 
+ ) prompt.append({"role": "user", "content": example[prompt_column]}) return {"prompt": prompt} @@ -115,7 +119,11 @@ def make_conversation(example, prompt_column: str = script_args.dataset_prompt_c reward_funcs=reward_funcs, args=training_args, train_dataset=dataset[script_args.dataset_train_split], - eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None, + eval_dataset=( + dataset[script_args.dataset_test_split] + if training_args.eval_strategy != "no" + else None + ), peft_config=get_peft_config(model_args), callbacks=get_callbacks(training_args, model_args), processing_class=tokenizer, diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index df1eed3f9..dfbb3fb47 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -25,22 +25,19 @@ from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify -from .utils import is_e2b_available -from .utils.ioi import SubtaskResult, add_includes, get_piston_client_from_env, score_subtask - - -if is_e2b_available(): - from dotenv import load_dotenv - from e2b_code_interpreter import AsyncSandbox - - from .utils.routed_sandbox import RoutedSandbox - - load_dotenv() -else: - AsyncSandbox = None - - -def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str], **kwargs) -> list[Optional[float]]: +from .utils.code_providers import get_provider +from .utils.ioi import ( + SubtaskResult, + add_includes, + get_morph_client_from_env, + get_piston_client_from_env, + score_subtask, +) + + +def accuracy_reward( + completions: list[list[dict[str, str]]], solution: list[str], **kwargs +) -> list[Optional[float]]: """Reward function that checks if the completion is the same as the ground truth.""" contents = [completion[0]["content"] for completion in completions] rewards = [] @@ -74,7 +71,9 @@ def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str] try: reward = float(verify(gold_parsed, answer_parsed)) except Exception as e: - print(f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}") + print( + f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}" + ) reward = None else: # If the gold solution is not parseable, we assign `None` to skip this example @@ -89,7 +88,10 @@ def format_reward(completions, **kwargs): """Reward function that checks if the reasoning process is enclosed within and tags, while the final answer is enclosed within and tags.""" pattern = r"^\n.*?\n\n\n.*?\n$" completion_contents = [completion[0]["content"] for completion in completions] - matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents] + matches = [ + re.match(pattern, content, re.DOTALL | re.MULTILINE) + for content in completion_contents + ] return [1.0 if match else 0.0 for match in matches] @@ -132,7 +134,9 @@ def reasoning_steps_reward(completions, **kwargs): return [min(1.0, count / 3) for count in matches] -def len_reward(completions: list[Dict[str, str]], solution: list[str], **kwargs) -> float: +def len_reward( + completions: list[Dict[str, str]], solution: list[str], **kwargs +) -> float: """Compute length-based rewards to discourage overthinking and promote token efficiency. 
Taken from the Kimi 1.5 tech report: https://arxiv.org/abs/2501.12599 @@ -233,7 +237,11 @@ def cosine_scaled_reward(completions, solution, **kwargs): rewards = [] for content, sol in zip(contents, solution): - gold_parsed = parse(sol, extraction_mode="first_match", extraction_config=[LatexExtractionConfig()]) + gold_parsed = parse( + sol, + extraction_mode="first_match", + extraction_config=[LatexExtractionConfig()], + ) if len(gold_parsed) == 0: rewards.append(1.0) # Skip unparseable examples print("Failed to parse gold solution: ", sol) @@ -281,7 +289,9 @@ def cosine_scaled_reward(completions, solution, **kwargs): return cosine_scaled_reward -def get_repetition_penalty_reward(ngram_size: int, max_penalty: float, language: str = "en"): +def get_repetition_penalty_reward( + ngram_size: int, max_penalty: float, language: str = "en" +): """ Computes N-gram repetition penalty as described in Appendix C.2 of https://arxiv.org/abs/2502.03373. Reference implementation from: https://github.com/eddycmu/demystify-long-cot/blob/release/openrlhf/openrlhf/reward/repetition.py @@ -299,6 +309,7 @@ def get_repetition_penalty_reward(ngram_size: int, max_penalty: float, language: def zipngram(text: str, ngram_size: int): words = text.lower().split() return zip(*[words[i:] for i in range(ngram_size)]), words + elif language == "zh": from transformers.utils.import_utils import _is_package_available @@ -310,6 +321,7 @@ def zipngram(text: str, ngram_size: int): seg_list = list(jieba.cut(text)) return zip(*[seg_list[i:] for i in range(ngram_size)]), seg_list + else: raise ValueError( f"Word splitting for language `{language}` is not yet implemented. Please implement your own zip-ngram function." @@ -352,6 +364,7 @@ def repetition_penalty_reward(completions, **kwargs) -> float: def _init_event_loop(): + """Initialize or get the current event loop.""" try: loop = asyncio.get_event_loop() except RuntimeError: @@ -360,15 +373,26 @@ def _init_event_loop(): return loop -def ioi_code_reward(completions, test_batch_size: int = 1, **kwargs) -> list[float]: - """Reward function that evaluates IOI problems using Piston+our IOI package. +def ioi_code_reward( + completions, test_batch_size: int = 1, provider_type: str = "piston", **kwargs +) -> list[float]: + """Reward function that evaluates IOI problems using a specified execution client. Assumes the dataset has the same format as hf.co/datasets/open-r1/ioi - test_batch_size: evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. + Args: + completions: List of model completions to evaluate + test_batch_size: Evaluate these many test cases in parallel, then check if any of them failed (0 score): + if so stop evaluating; otherwise continue with the next batch of test cases. + provider_type: The execution provider to use (default: "piston"). 
Supported values: "piston", "morph" + **kwargs: Additional arguments passed from the dataset """ - # for info on setting up piston workers, see slurm/piston/README.md - piston_client = get_piston_client_from_env() + # Get the appropriate client based on provider_type + if provider_type == "morph": + execution_client = get_morph_client_from_env() + else: + # for info on setting up piston workers, see slurm/piston/README.md + execution_client = get_piston_client_from_env() code_snippets = [ # note: grading is automatically skipped if no code is extracted @@ -380,16 +404,24 @@ async def run_catch_exceptions(task): try: return await task except Exception as e: - print(f"Error from Piston worker: {e}") - return SubtaskResult() # score 0.0 + print(f"Error from {provider_type} worker: {e}") + return SubtaskResult() - # load problem data. undo separating kwargs by column - problems_data = [dict(zip(kwargs.keys(), values)) for values in zip(*kwargs.values())] + problems_data = [ + dict(zip(kwargs.keys(), values)) for values in zip(*kwargs.values()) + ] loop = _init_event_loop() evals = [ loop.create_task( - run_catch_exceptions(score_subtask(piston_client, problem_data, code, test_batch_size=test_batch_size)) + run_catch_exceptions( + score_subtask( + execution_client, + problem_data, + code, + test_batch_size=test_batch_size, + ) + ) ) for problem_data, code in zip(problems_data, code_snippets) ] @@ -405,8 +437,20 @@ def extract_code(completion: str, language: str = "python") -> str: return extracted_answer -def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: - rewards = code_reward(completions, num_parallel=num_parallel, e2b_router_url=e2b_router_url, **kwargs) +def binary_code_reward( + completions, + num_parallel: int = 2, + provider_type: str = "e2b", + enforce_same_language: bool = False, + **kwargs, +) -> list[float]: + rewards = code_reward( + completions, + num_parallel=num_parallel, + provider_type=provider_type, + enforce_same_language=enforce_same_language, + **kwargs, + ) BINARY_THRESHOLD = 0.99 output = [] @@ -419,19 +463,24 @@ def binary_code_reward(completions, num_parallel: int = 2, e2b_router_url=None, return output -def code_reward(completions, num_parallel: int = 2, e2b_router_url=None, **kwargs) -> list[float]: - """Reward function that evaluates code snippets using the E2B code interpreter. +def code_reward( + completions, + num_parallel: int = 2, + provider_type: str = "e2b", + enforce_same_language: bool = False, + **kwargs, +) -> list[float]: + """Reward function that evaluates code snippets using a code execution provider. Assumes the dataset contains a `verification_info` column with test cases. - """ - if not is_e2b_available(): - raise ImportError( - "E2B is not available and required for this reward function. Please install E2B with " - "`pip install e2b-code-interpreter` and add an API key to a `.env` file." 
- ) - # TODO: add support for other languages in E2B: https://e2b.dev/docs/code-interpreting/supported-languages - """Returns a reward function that evaluates code snippets in a sandbox.""" + Args: + completions: List of model completions to evaluate + num_parallel: Number of parallel code executions (default: 2) + provider_type: Which code execution provider to use (default: "e2b") + enforce_same_language: If True, verify all problems use the same language (default: False) + **kwargs: Additional arguments passed to the verification + """ evaluation_script_template = """ import subprocess import json @@ -471,43 +520,37 @@ def evaluate_code(code, test_cases): evaluate_code(code_snippet, test_cases) """ - code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] + + code_snippets = [ + extract_code(completion[-1]["content"]) for completion in completions + ] verification_info = kwargs["verification_info"] + + template = evaluation_script_template + scripts = [ - evaluation_script_template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) + template.format( + code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"])) + ) for code, info in zip(code_snippets, verification_info) ] language = verification_info[0]["language"] - if not all(v["language"] == language for v in verification_info): - raise ValueError("All verification_info must have the same language", verification_info) - - if e2b_router_url is not None: - routed_sandbox = RoutedSandbox(router_url=e2b_router_url) - executions = routed_sandbox.run_code( - scripts=scripts, - language=language, - timeout=30, - request_timeout=28, - ) - - rewards = [] - for execution in executions: - try: - reward = float(execution.text) - rewards.append(reward) - except Exception: - rewards.append(None) - return rewards + if enforce_same_language: + all_same_language = all(v["language"] == language for v in verification_info) + if not all_same_language: + raise ValueError( + "All verification_info must have the same language", verification_info + ) - try: - rewards = run_async_from_sync(scripts, language, num_parallel) - except Exception as e: - print(f"Error from E2B executor: {e}") - rewards = [0.0] * len(completions) + execution_provider = get_provider( + provider_type=provider_type, + num_parallel=num_parallel, + **kwargs, + ) - return rewards + return execution_provider.execute_scripts(scripts, ["python"] * len(scripts)) def get_code_format_reward(language: str = "python"): @@ -516,74 +559,21 @@ def get_code_format_reward(language: str = "python"): Args: language: Programming language supported by E2B https://e2b.dev/docs/code-interpreting/supported-languages """ - pattern = rf"^\n.*?\n\n\n.*?```{language}.*?```.*?\n$" + pattern = ( + rf"^\n.*?\n\n\n.*?```{language}.*?```.*?\n$" + ) def code_format_reward(completions, **kwargs): completion_contents = [completion[0]["content"] for completion in completions] - matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents] + matches = [ + re.match(pattern, content, re.DOTALL | re.MULTILINE) + for content in completion_contents + ] return [1.0 if match else 0.0 for match in matches] return code_format_reward -def run_async_from_sync(scripts: list[str], language: str, num_parallel: int) -> list[float]: - """Function wrapping the `run_async` function.""" - # Create a new event loop and set it - try: - # Run the async function and get the result - rewards = asyncio.run(run_async(scripts, 
language, num_parallel)) - except Exception as e: - print(f"Error from E2B executor async: {e}") - raise e - - return rewards - - -async def run_async(scripts: list[str], language: str, num_parallel: int) -> list[float]: - # Limit the number of concurrent tasks - semaphore = asyncio.Semaphore(num_parallel) - - # Create a list of tasks for running scripts concurrently - tasks = [run_script(script, language, semaphore) for script in scripts] - - # Wait for all tasks to complete and gather their results as they finish - results = await asyncio.gather(*tasks) - rewards = list(results) # collect results - - return rewards - - -async def run_script(script: str, language: str, semaphore: asyncio.Semaphore) -> float: - # We set a timeout margin, as the AsyncSandbox timeout does not seem to work - # These values are based on running 256 examples with the gold solution - # from open-r1/verifiable-coding-problems-python_decontaminated - # see scripts/benchmark_e2b.py - - SANDBOX_TIMEOUT = 30 - MARGIN = 2 - REQUEST_TIMEOUT = SANDBOX_TIMEOUT - MARGIN - ASYNCIO_TIMEOUT = SANDBOX_TIMEOUT + MARGIN - - async with semaphore: - try: - sandbox = await AsyncSandbox.create(timeout=SANDBOX_TIMEOUT, request_timeout=REQUEST_TIMEOUT) - execution = await asyncio.wait_for(sandbox.run_code(script, language=language), timeout=ASYNCIO_TIMEOUT) - return float(execution.text) - except (TypeError, ValueError): - return 0.0 - except asyncio.TimeoutError: - print("Operation timed out") - return 0.0 - except Exception as e: - print(f"Error in `run_script` from E2B sandbox ID {sandbox.sandbox_id} : {e}") - return 0.0 - finally: - try: - await sandbox.kill() - except Exception as e: - print(f"Error from E2B executor kill with sandbox ID {sandbox.sandbox_id} : {e}") - - def get_reward_funcs(script_args) -> list[Callable]: REWARD_FUNCS_REGISTRY = { "accuracy": accuracy_reward, @@ -605,7 +595,10 @@ def get_reward_funcs(script_args) -> list[Callable]: partial( code_reward, num_parallel=script_args.parallel_code_exec_per_proc, - e2b_router_url=script_args.e2b_router_url, + provider_type=script_args.code_provider, + enforce_same_language=getattr( + script_args, "enforce_same_language", False + ), ), code_reward, ), @@ -613,12 +606,20 @@ def get_reward_funcs(script_args) -> list[Callable]: partial( binary_code_reward, num_parallel=script_args.parallel_code_exec_per_proc, - e2b_router_url=script_args.e2b_router_url, + provider_type=script_args.code_provider, + enforce_same_language=getattr( + script_args, "enforce_same_language", False + ), ), binary_code_reward, ), "ioi_code": update_wrapper( - partial(ioi_code_reward, test_batch_size=script_args.code_eval_test_batch_size), ioi_code_reward + partial( + ioi_code_reward, + test_batch_size=script_args.code_eval_test_batch_size, + provider_type=getattr(script_args, "ioi_provider", "piston"), + ), + ioi_code_reward, ), "code_format": get_code_format_reward(language=script_args.code_language), "tag_count": tag_count_reward, diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 9566d0530..6110feded 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -115,7 +115,11 @@ def main(script_args, training_args, model_args): model=model, args=training_args, train_dataset=dataset[script_args.dataset_train_split], - eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None, + eval_dataset=( + dataset[script_args.dataset_test_split] + if training_args.eval_strategy != "no" + else None + ), processing_class=tokenizer, 
peft_config=get_peft_config(model_args), callbacks=get_callbacks(training_args, model_args), diff --git a/src/open_r1/utils/__init__.py b/src/open_r1/utils/__init__.py index 5ada1c6e2..7b1449731 100644 --- a/src/open_r1/utils/__init__.py +++ b/src/open_r1/utils/__init__.py @@ -1,5 +1,5 @@ -from .import_utils import is_e2b_available +from .import_utils import is_e2b_available, is_morph_available from .model_utils import get_model, get_tokenizer -__all__ = ["get_tokenizer", "is_e2b_available", "get_model"] +__all__ = ["get_tokenizer", "is_e2b_available", "is_morph_available", "get_model"] diff --git a/src/open_r1/utils/callbacks.py b/src/open_r1/utils/callbacks.py index c1b0ac5dc..323966b64 100644 --- a/src/open_r1/utils/callbacks.py +++ b/src/open_r1/utils/callbacks.py @@ -28,7 +28,9 @@ def is_slurm_available() -> bool: # returns true if a slurm queueing system is available try: - subprocess.run(["sinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + subprocess.run( + ["sinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) return True except FileNotFoundError: return False @@ -44,7 +46,13 @@ class PushToHubRevisionCallback(TrainerCallback): def __init__(self, model_config) -> None: self.model_config = model_config - def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + def on_save( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): if state.is_world_process_zero: global_step = state.global_step diff --git a/src/open_r1/utils/code_providers.py b/src/open_r1/utils/code_providers.py new file mode 100644 index 000000000..71830e6ae --- /dev/null +++ b/src/open_r1/utils/code_providers.py @@ -0,0 +1,366 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Code execution providers for executing and evaluating code snippets.""" + +import abc +import asyncio +from typing import List, Optional + +from ..utils import is_e2b_available, is_morph_available + + +if is_e2b_available(): + from e2b_code_interpreter import AsyncSandbox + from e2b_code_interpreter.models import Execution + + from .routed_sandbox import RoutedSandbox +else: + AsyncSandbox = None + Execution = None + RoutedSandbox = None + +if is_morph_available(): + from morphcloud.api import MorphCloudClient + from morphcloud.sandbox import Sandbox + + from .routed_morph import RoutedMorphSandbox +else: + MorphCloudClient = None + Sandbox = None + RoutedMorphSandbox = None + + +class CodeExecutionProvider(abc.ABC): + """Abstract base class for code execution providers.""" + + @abc.abstractmethod + def execute_scripts(self, scripts: List[str], languages: List[str]) -> List[float]: + """Execute multiple scripts and return their reward values. 
+ + Args: + scripts: List of code scripts to execute + language: The programming language of the scripts + + Returns: + List of float rewards (one per script) + """ + pass + + +class E2BProvider(CodeExecutionProvider): + """Provider that executes code using E2B sandboxes.""" + + def __init__(self, num_parallel: int = 2, e2b_router_url: Optional[str] = None): + """Initialize the E2B provider. + + Args: + num_parallel: Number of parallel sandboxes to use + e2b_router_url: URL for the E2B router (if using router mode) + """ + if not is_e2b_available(): + raise ImportError( + "E2B is not available and required for this provider. Please install E2B with " + "`pip install e2b-code-interpreter` and add an API key to a `.env` file." + ) + + self.num_parallel = num_parallel + self.e2b_router_url = e2b_router_url + + def execute_scripts(self, scripts: List[str], languages: List[str]) -> List[float]: + """Execute scripts using E2B sandboxes. + + If e2b_router_url is provided, uses the RoutedSandbox for batch processing. + Otherwise, uses direct AsyncSandbox with parallelization. + """ + if self.e2b_router_url is not None: + routed_sandbox = RoutedSandbox(router_url=self.e2b_router_url) + + executions = routed_sandbox.run_code( + scripts=scripts, + languages=languages, + timeout=30, + request_timeout=28, + ) + + rewards = [] + for execution in executions: + try: + reward = float(execution.text) + rewards.append(reward) + except Exception: + rewards.append(None) + return rewards + + try: + rewards = self._run_async_from_sync(scripts, languages, self.num_parallel) + except Exception as e: + print(f"Error from E2B executor: {e}") + rewards = [0.0] * len(scripts) + + return rewards + + def _run_async_from_sync(self, scripts: List[str], languages: List[str], num_parallel: int) -> List[float]: + """Function wrapping the `_run_async` function.""" + try: + rewards = asyncio.run(self._run_async(scripts, languages, num_parallel)) + except Exception as e: + print(f"Error from E2B executor async: {e}") + raise e + + return rewards + + async def _run_async(self, scripts: List[str], languages: List[str], num_parallel: int) -> List[float]: + semaphore = asyncio.Semaphore(num_parallel) + + tasks = [self._run_script(script, languages, semaphore) for script in scripts] + + results = await asyncio.gather(*tasks) + rewards = list(results) + + return rewards + + async def _run_script(self, script: str, languages: List[str], semaphore: asyncio.Semaphore) -> float: + # We set a timeout margin, as the AsyncSandbox timeout does not seem to work + # These values are based on running 256 examples with the gold solution + # from open-r1/verifiable-coding-problems-python_decontaminated + # see scripts/benchmark_e2b.py + + SANDBOX_TIMEOUT = 30 + MARGIN = 2 + REQUEST_TIMEOUT = SANDBOX_TIMEOUT - MARGIN + ASYNCIO_TIMEOUT = SANDBOX_TIMEOUT + MARGIN + + async with semaphore: + try: + sandbox = await AsyncSandbox.create(timeout=SANDBOX_TIMEOUT, request_timeout=REQUEST_TIMEOUT) + execution = await asyncio.wait_for( + sandbox.run_code(script, languages=languages), + timeout=ASYNCIO_TIMEOUT, + ) + return float(execution.text) + except (TypeError, ValueError): + return 0.0 + except asyncio.TimeoutError: + print("Operation timed out") + return 0.0 + except Exception as e: + print(f"Error in `_run_script` from E2B sandbox ID {sandbox.sandbox_id} : {e}") + return 0.0 + finally: + try: + await sandbox.kill() + except Exception as e: + print(f"Error from E2B executor kill with sandbox ID {sandbox.sandbox_id} : {e}") + + +class 
MorphProvider(CodeExecutionProvider): + """Provider that executes code using MorphCloud's Sandbox API.""" + + def __init__(self, num_parallel: int = 2, morph_router_url: Optional[str] = None): + """Initialize the Morph provider. + + Args: + num_parallel: Number of parallel executions to use + morph_router_url: URL for the MorphCloud router (if using router mode) + """ + if not is_morph_available(): + raise ImportError( + "MorphCloud is not available and required for this provider. Please install MorphCloud with " + "`pip install morphcloud` and add an API key to a `.env` file." + ) + + try: + from dotenv import load_dotenv + + load_dotenv() + except ImportError: + print("Warning: python-dotenv not installed. Environment variables must be set directly.") + + self.num_parallel = num_parallel + self.morph_router_url = morph_router_url + + if self.morph_router_url is not None: + self.routed_sandbox = RoutedMorphSandbox(router_url=self.morph_router_url) + return + + import os + + self.api_key = os.getenv("MORPH_API_KEY") + if not self.api_key: + raise ValueError("MorphCloud API key not found. Please set the MORPH_API_KEY environment variable.") + + try: + self.client = MorphCloudClient(api_key=self.api_key) + self.Sandbox = Sandbox + except ImportError as e: + raise ImportError(f"Required MorphCloud dependencies not installed: {e}") + + def execute_scripts(self, scripts: List[str], languages: List[str]) -> List[float]: + """Execute scripts using MorphCloud Sandbox API. + + Args: + scripts: List of Python scripts to execute + language: Programming language + + Returns: + List of float rewards (one per script) + """ + + if hasattr(self, "routed_sandbox"): + try: + results = self.routed_sandbox.run_code( + scripts=scripts, + languages=languages, + timeout=90, + request_timeout=96, + ) + + rewards = [] + for result in results: + try: + reward = float(result.text) + rewards.append(reward) + except (ValueError, AttributeError): + rewards.append(0.0) + return rewards + except Exception as e: + print(f"Error from MorphCloud router: {e}") + return [0.0] * len(scripts) + + import asyncio + + try: + rewards = asyncio.run(self._run_async(scripts, languages, self.num_parallel)) + except Exception as e: + print(f"Error from MorphCloud executor: {e}") + rewards = [0.0] * len(scripts) + + return rewards + + async def _run_async(self, scripts: List[str], languages: List[str], num_parallel: int) -> List[float]: + """Run multiple scripts concurrently with limited parallelism. + + Args: + scripts: List of scripts to execute + language: Programming language + num_parallel: Maximum number of concurrent executions + + Returns: + List of rewards + """ + + semaphore = asyncio.Semaphore(num_parallel) + + tasks = [self._run_script(script, languages, semaphore) for script in scripts] + + results = await asyncio.gather(*tasks) + + return list(results) + + async def _run_script(self, script: str, languages: List[str], semaphore: asyncio.Semaphore) -> float: + """Execute a single script in a MorphCloud Sandbox. 
+ + Args: + script: The script to execute + language: Programming language + semaphore: Semaphore to limit concurrency + + Returns: + Float reward from script execution + """ + SANDBOX_TIMEOUT = 90 + MARGIN = 6 + ASYNCIO_TIMEOUT = SANDBOX_TIMEOUT + MARGIN + + sandbox = None + async with semaphore: + try: + sandbox = await asyncio.to_thread(self.Sandbox.new, client=self.client, ttl_seconds=SANDBOX_TIMEOUT) + result = await asyncio.wait_for( + asyncio.to_thread( + sandbox.run_code, + script, + languages=languages, + timeout=SANDBOX_TIMEOUT, + ), + timeout=ASYNCIO_TIMEOUT, + ) + + reward = 0.0 + try: + if hasattr(result, "text") and result.text: + lines = result.text.strip().split("\n") + if lines: + try: + reward = float(lines[-1]) + except ValueError: + try: + reward = float(result.text.strip()) + except ValueError: + pass + elif hasattr(result, "stdout") and result.stdout: + lines = result.stdout.strip().split("\n") + if lines: + try: + reward = float(lines[-1]) + except ValueError: + pass + except (ValueError, AttributeError): + pass + + return reward + + except asyncio.TimeoutError: + return 0.0 + except Exception: + return 0.0 + finally: + if sandbox: + try: + await asyncio.to_thread(sandbox.close) + await asyncio.to_thread(sandbox.shutdown) + except Exception: + pass + + +def get_provider(provider_type: str = "e2b", **kwargs) -> CodeExecutionProvider: + """Factory function to get the appropriate code execution provider. + + Args: + provider_type: Type of provider to use ("e2b", "morph") + **kwargs: Additional arguments to pass to the provider + + Returns: + An instance of CodeExecutionProvider + """ + num_parallel = kwargs.pop("num_parallel", 2) + + if provider_type == "e2b": + # Extract E2B-specific arguments + e2b_router_url = kwargs.pop("e2b_router_url", None) + return E2BProvider( + num_parallel=num_parallel, + e2b_router_url=e2b_router_url, + ) + elif provider_type == "morph": + # Extract Morph-specific arguments + morph_router_url = kwargs.pop("morph_router_url", None) + return MorphProvider( + num_parallel=num_parallel, + morph_router_url=morph_router_url, + ) + else: + raise ValueError(f"Unknown provider type: {provider_type}") diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 5719350fb..7eeccb2b7 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -25,7 +25,11 @@ def register_lighteval_task( - configs: Dict[str, str], eval_suite: str, task_name: str, task_list: str, num_fewshot: int = 0 + configs: Dict[str, str], + eval_suite: str, + task_name: str, + task_list: str, + num_fewshot: int = 0, ): """Registers a LightEval task configuration. @@ -41,7 +45,9 @@ def register_lighteval_task( is_custom_task (bool, optional): Whether the task is a custom task. Defaults to False. 
""" # Format task list in lighteval format - task_list = ",".join(f"{eval_suite}|{task}|{num_fewshot}|0" for task in task_list.split(",")) + task_list = ",".join( + f"{eval_suite}|{task}|{num_fewshot}|0" for task in task_list.split(",") + ) configs[task_name] = task_list @@ -52,7 +58,9 @@ def register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "lighteval", "aime25", "aime25", 0) register_lighteval_task(LIGHTEVAL_TASKS, "lighteval", "gpqa", "gpqa:diamond", 0) register_lighteval_task(LIGHTEVAL_TASKS, "extended", "lcb", "lcb:codegeneration", 0) -register_lighteval_task(LIGHTEVAL_TASKS, "extended", "lcb_v4", "lcb:codegeneration_v4", 0) +register_lighteval_task( + LIGHTEVAL_TASKS, "extended", "lcb_v4", "lcb:codegeneration_v4", 0 +) def get_lighteval_tasks(): @@ -63,7 +71,9 @@ def get_lighteval_tasks(): def run_lighteval_job( - benchmark: str, training_args: Union["SFTConfig", "GRPOConfig"], model_args: "ModelConfig" + benchmark: str, + training_args: Union["SFTConfig", "GRPOConfig"], + model_args: "ModelConfig", ) -> None: task_list = LIGHTEVAL_TASKS[benchmark] model_name = training_args.hub_model_id @@ -97,7 +107,9 @@ def run_lighteval_job( subprocess.run(cmd, check=True) -def run_benchmark_jobs(training_args: Union["SFTConfig", "GRPOConfig"], model_args: "ModelConfig") -> None: +def run_benchmark_jobs( + training_args: Union["SFTConfig", "GRPOConfig"], model_args: "ModelConfig" +) -> None: benchmarks = training_args.benchmarks if len(benchmarks) == 1 and benchmarks[0] == "all": benchmarks = get_lighteval_tasks() diff --git a/src/open_r1/utils/hub.py b/src/open_r1/utils/hub.py index 8ac53a384..086d96928 100644 --- a/src/open_r1/utils/hub.py +++ b/src/open_r1/utils/hub.py @@ -36,11 +36,15 @@ logger = logging.getLogger(__name__) -def push_to_hub_revision(training_args: SFTConfig | GRPOConfig, extra_ignore_patterns=[]) -> Future: +def push_to_hub_revision( + training_args: SFTConfig | GRPOConfig, extra_ignore_patterns=[] +) -> Future: """Pushes the model to branch on a Hub repo.""" # Create a repo if it doesn't exist yet - repo_url = create_repo(repo_id=training_args.hub_model_id, private=True, exist_ok=True) + repo_url = create_repo( + repo_id=training_args.hub_model_id, private=True, exist_ok=True + ) # Get initial commit to branch from initial_commit = list_repo_commits(training_args.hub_model_id)[-1] # Now create the branch we'll be pushing to @@ -62,7 +66,9 @@ def push_to_hub_revision(training_args: SFTConfig | GRPOConfig, extra_ignore_pat ignore_patterns=ignore_patterns, run_as_future=True, ) - logger.info(f"Pushed to {repo_url} revision {training_args.hub_model_revision} successfully!") + logger.info( + f"Pushed to {repo_url} revision {training_args.hub_model_revision} successfully!" 
+ ) return future @@ -72,13 +78,19 @@ def check_hub_revision_exists(training_args: SFTConfig | GRPOConfig): if repo_exists(training_args.hub_model_id): if training_args.push_to_hub_revision is True: # First check if the revision exists - revisions = [rev.name for rev in list_repo_refs(training_args.hub_model_id).branches] + revisions = [ + rev.name for rev in list_repo_refs(training_args.hub_model_id).branches + ] # If the revision exists, we next check it has a README file if training_args.hub_model_revision in revisions: repo_files = list_repo_files( - repo_id=training_args.hub_model_id, revision=training_args.hub_model_revision + repo_id=training_args.hub_model_id, + revision=training_args.hub_model_revision, ) - if "README.md" in repo_files and training_args.overwrite_hub_revision is False: + if ( + "README.md" in repo_files + and training_args.overwrite_hub_revision is False + ): raise ValueError( f"Revision {training_args.hub_model_revision} already exists. " "Use --overwrite_hub_revision to overwrite it." @@ -117,15 +129,21 @@ def get_param_count_from_repo_id(repo_id: str) -> int: return -1 -def get_gpu_count_for_vllm(model_name: str, revision: str = "main", num_gpus: int = 8) -> int: +def get_gpu_count_for_vllm( + model_name: str, revision: str = "main", num_gpus: int = 8 +) -> int: """vLLM enforces a constraint that the number of attention heads must be divisible by the number of GPUs and 64 must be divisible by the number of GPUs. This function calculates the number of GPUs to use for decoding based on the number of attention heads in the model. """ - config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) + config = AutoConfig.from_pretrained( + model_name, revision=revision, trust_remote_code=True + ) # Get number of attention heads num_heads = config.num_attention_heads # Reduce num_gpus so that num_heads is divisible by num_gpus and 64 is divisible by num_gpus while num_heads % num_gpus != 0 or 64 % num_gpus != 0: - logger.info(f"Reducing num_gpus from {num_gpus} to {num_gpus - 1} to make num_heads divisible by num_gpus") + logger.info( + f"Reducing num_gpus from {num_gpus} to {num_gpus - 1} to make num_heads divisible by num_gpus" + ) num_gpus -= 1 return num_gpus diff --git a/src/open_r1/utils/import_utils.py b/src/open_r1/utils/import_utils.py index 8893264ae..5d6624302 100644 --- a/src/open_r1/utils/import_utils.py +++ b/src/open_r1/utils/import_utils.py @@ -21,3 +21,10 @@ def is_e2b_available() -> bool: return _e2b_available + + +_morph_available = _is_package_available("morphcloud") + + +def is_morph_available() -> bool: + return _morph_available diff --git a/src/open_r1/utils/ioi/__init__.py b/src/open_r1/utils/ioi/__init__.py index c7f91dc1f..f4c5468c0 100644 --- a/src/open_r1/utils/ioi/__init__.py +++ b/src/open_r1/utils/ioi/__init__.py @@ -1,12 +1,15 @@ +from .morph_client import get_morph_client_from_env from .piston_client import get_piston_client_from_env, get_slurm_piston_endpoints -from .scoring import SubtaskResult, score_subtask +from .scoring import SubtaskResult, score_subtask, score_subtasks from .utils import add_includes __all__ = [ "get_piston_client_from_env", "get_slurm_piston_endpoints", + "get_morph_client_from_env", "score_subtask", + "score_subtasks", "add_includes", "SubtaskResult", ] diff --git a/src/open_r1/utils/ioi/morph_client.py b/src/open_r1/utils/ioi/morph_client.py new file mode 100644 index 000000000..847e7b4e7 --- /dev/null +++ b/src/open_r1/utils/ioi/morph_client.py @@ -0,0 +1,744 @@ +import asyncio 
+import json +import logging +import os +import tempfile +from typing import Any, Dict, Optional, Tuple + +from dotenv import load_dotenv +from open_r1.utils.import_utils import is_morph_available + + +# Replace direct imports with conditional imports +if is_morph_available(): + from morphcloud.api import Instance, InstanceExecResponse, MorphCloudClient +else: + Instance = None + InstanceExecResponse = None + MorphCloudClient = None + + +# Silence verbose logs from dependencies +logging.getLogger("paramiko").setLevel(logging.ERROR) +logging.getLogger("httpx").setLevel(logging.ERROR) + + +class MorphCloudError(Exception): + pass + + +class MorphCloudExecutionClient: + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + spans_log_path: Optional[str] = None, + ): + """ + Initialize the MorphCloud execution client. + + Args: + api_key: Optional API key for MorphCloud. If not provided, will use MORPH_API_KEY env var. + base_url: Optional base URL for MorphCloud API. If not provided, will use default. + spans_log_path: Path to log API call spans to. Defaults to 'logs/morph_api_spans.jsonl'. + """ + + self.client = MorphCloudClient(api_key=api_key, base_url=base_url) + self._snapshot_lock = asyncio.Lock() + + async def _prepare_instance(self, snapshot_id=None) -> Instance: + """ + Prepare and start a MorphCloud instance. + + Args: + snapshot_id: Optional snapshot ID to use. If None, will get or create base snapshot. + + Returns: + Instance: The ready-to-use MorphCloud instance + + Raises: + TimeoutError: If instance fails to start or become ready + """ + + if not snapshot_id: + snapshot = await self._get_or_create_base_snapshot() + snapshot_id = snapshot.id + + try: + instance = await self.client.instances.astart( + snapshot_id, ttl_seconds=600 + ) # Auto-terminate after 10 minutes + await instance.await_until_ready(timeout=300) + return instance + except asyncio.TimeoutError as e: + print(f"Timeout while preparing instance: {str(e)}") + if instance: + try: + await instance.astop() + except Exception: + pass + raise + + async def _prepare_files(self, data: Dict[str, Any], temp_dir: str) -> Tuple[str, Dict[str, Any], Dict[str, str]]: + """ + Process files, determine problem ID, and prepare configuration. 
+ + Args: + data: Dictionary containing file information + temp_dir: Local temporary directory for file operations + + Returns: + tuple: (problem_id, grader_config, local_files) + + Raises: + ValueError: If problem ID cannot be determined + """ + # Extract problem ID + problem_id = None + graders_files = [] + for file in data["files"]: + if file["name"].startswith("graders/") and file["name"].endswith(".cpp"): + potential_id = os.path.basename(file["name"]).split(".")[0] + if potential_id not in ["grader", "manager", "stub"]: + problem_id = potential_id + + if file["name"].startswith("graders/"): + graders_files.append(file) + + if not problem_id: + raise ValueError("Could not determine problem ID from files") + + grader_config = { + "task_type": "Batch", + "code": problem_id, + "time_limit": data["run_timeout"] / 1000, + "memory_limit": data["run_memory_limit"] * 1024 * 1024, + } + + for file in graders_files: + if "manager.cpp" in file["name"]: + grader_config["task_type"] = "Communication" + grader_config["task_type_parameters_Communication_num_processes"] = 1 + grader_config["task_type_parameters_Communication_user_io"] = "std_io" + break + + config_path = os.path.join(temp_dir, "grader_config.json") + with open(config_path, "w") as f: + json.dump(grader_config, f) + + local_files = {"grader_config.json": config_path} + + for file in data["files"]: + local_path = os.path.join(temp_dir, os.path.basename(file["name"])) + with open(local_path, "w") as f: + f.write(file["content"]) + local_files[file["name"]] = local_path + + return problem_id, grader_config, local_files + + async def _upload_files(self, instance: Instance, local_files: Dict[str, str]) -> bool: + """ + Upload all necessary files to the instance. + + Args: + instance: The MorphCloud instance + local_files: Dictionary mapping remote paths to local file paths + + Returns: + bool: True if all uploads were successful + + Raises: + TimeoutError: If uploads time out + """ + for remote_name, local_path in local_files.items(): + target_path = f"/workspace/{remote_name}" + dir_path = os.path.dirname(target_path) + + if dir_path != "/workspace": + await instance.aexec(f"mkdir -p {dir_path}") + + await instance.aupload(local_path, target_path) + + await instance.aupload(local_files["grader_config.json"], "/workspace/graders/grader_config.json") + + return True + + async def _compile_code(self, instance: Instance) -> InstanceExecResponse: + """ + Compile the code on the instance. + + Args: + instance: The MorphCloud instance + + Returns: + InstanceExecResponse: Result of compilation + + Raises: + RuntimeError: If compilation fails + """ + compile_result = await instance.aexec("cd /workspace && ./compile") + + if compile_result.exit_code != 0: + raise RuntimeError(f"Compilation error exit code {compile_result.exit_code}\n{compile_result.stderr}") + + return compile_result + + async def _run_tests(self, instance: Instance, data: Dict[str, Any]) -> Tuple[str, str]: + """ + Run tests and evaluate results. 
+ + Args: + instance: The MorphCloud instance + data: Dictionary containing runtime parameters + + Returns: + tuple: (score, feedback) + + Raises: + TimeoutError: If test execution times out + """ + hard_timeout = data["run_timeout"] / 1000 + 3 + run_command = f"cd /workspace && timeout {hard_timeout}s ./run" + + run_result = await instance.aexec(run_command) + + if run_result.exit_code == 124 or run_result.exit_code == 137 or run_result.exit_code == 143: + return "0", "Time limit exceeded" + + if run_result.exit_code != 0 and "Memory limit exceeded" in run_result.stderr: + return "0", "Memory limit exceeded" + + if run_result.stdout: + return run_result.stdout.strip(), run_result.stderr.strip() + + if run_result.exit_code != 0: + return ( + "0", + f"Runtime error with exit code {run_result.exit_code}\n{run_result.stderr}", + ) + + return "0", "Unknown error" + + async def _execute_with_instance(self, instance: Instance, data: Dict[str, Any], temp_dir: str) -> Tuple[str, str]: + """Execute code using a prepared instance. + + Args: + instance: Ready MorphCloud instance + data: Execution data + temp_dir: Temporary directory for file operations + + Returns: + Tuple of (score, feedback) + + Raises: + Exception: Passes through exceptions for retry handling + """ + await instance.await_until_ready(timeout=300) + + problem_id, grader_config, local_files = await self._prepare_files(data, temp_dir) + + await self._upload_files(instance, local_files) + + try: + await self._compile_code(instance) + except RuntimeError as e: + return "0", str(e) + + score, feedback = await self._run_tests(instance, data) + return score, feedback + + async def _execute(self, data: Dict[str, Any]) -> Tuple[str, str]: + """ + Internal implementation of execute with no retry logic. + + Args: + data: Dictionary containing execution data + + Returns: + Tuple of (score, feedback) + + Raises: + Exception: If execution fails + """ + instance = None + + # Set timeouts to ensure we don't block indefinitely + # INSTANCE_TIMEOUT = 300 # 5 minutes for instance operations + TOTAL_EXECUTION_TIMEOUT = 600 # 10 minutes total execution time + + with tempfile.TemporaryDirectory(prefix="morph_exec_") as temp_dir: + snapshot = await self._get_or_create_base_snapshot() + instance = await self.client.instances.astart( + snapshot.id, ttl_seconds=600 + ) # Auto-terminate after 10 minutes + + async with instance: + # Use asyncio.wait_for to add overall timeout to the execution process + return await asyncio.wait_for( + self._execute_with_instance(instance, data, temp_dir), + timeout=TOTAL_EXECUTION_TIMEOUT, + ) + + async def execute(self, data: Dict[str, Any]) -> Tuple[str, str]: + """ + Execute code on MorphCloud based on the provided data with enhanced debugging and recovery. + + Orchestrates the following steps with proper error handling and retries: + 1. Prepare an instance (with retry) + 2. Set up workspace (with retry) + 3. Prepare and upload files (with retry) + 4. Compile code (with retry) + 5. 
Run tests (with retry) + + Args: + data: Dictionary containing: + - files: List of file objects with name and content fields + - run_timeout: Timeout in milliseconds + - run_memory_limit: Memory limit in MB + + Returns: + Tuple of (score, feedback) where: + - score is a string representation of a float between 0.0 and 1.0 + - feedback is a string with execution details + """ + # TODO: would be faster to pass info about the subtask as well to create a snapshot per subtask + # would cache the uploads of all files other than the submission: input.txt, correct_output.txt, grader files + # rather than reusing the snapshot that only has the compile/run scripts on it + # currently, run_submission -> client.execute(data) does not easily pass subtask info + + # Retry configuration + max_retries = 4 + base_delay = 1.0 + + # Try execution with retries and exponential backoff + for attempt in range(max_retries + 1): + try: + return await self._execute(data) + + except asyncio.TimeoutError: + if attempt < max_retries: + print(f"Execution timed out, retrying ({attempt+1}/{max_retries})") + else: + return "0", "Execution timed out after multiple retries" + + except Exception as e: + # Calculate exponential backoff + if attempt < max_retries: + retry_delay = min(base_delay * (2**attempt), 30) # Exponential backoff, capped at 30 seconds + + print( + f"Execution failed with {type(e).__name__}: {str(e)}, retrying in {retry_delay:.2f}s ({attempt+1}/{max_retries})" + ) + await asyncio.sleep(retry_delay) + else: + print(f"Execution failed after {max_retries} retries: {type(e).__name__}: {str(e)}") + return "0", f"Execution failed after multiple retries: {str(e)}" + + async def _get_or_create_base_snapshot(self): + """Get or create a snapshot with the necessary dependencies and scripts for evaluation.""" + + async with self._snapshot_lock: + base_snapshots = await self.client.snapshots.alist(digest="ioi-evaluation-morph") + + if not base_snapshots: + print("Creating base snapshot with build-essential cmake and g++") + + # Create base snapshot with minimal specs + base_snapshot = await self.client.snapshots.acreate( + vcpus=2, + memory=4096, + disk_size=10240, + metadata={"purpose": "ioi_evaluation"}, + ) + + # Start a temporary instance from the base snapshot + temp_instance = await self.client.instances.astart( + base_snapshot.id, ttl_seconds=900 + ) # Auto-terminate after 15 minutes + + try: + # Wait for the instance to be ready + await temp_instance.await_until_ready(timeout=300) + + # Get script contents + compile_script = await self._get_compile_script() + run_script = await self._get_run_script() + + # Use temporary directory to store scripts + with tempfile.TemporaryDirectory(prefix="morph_setup_") as temp_dir: + # Create paths for script files + compile_path = os.path.join(temp_dir, "compile.sh") + run_path = os.path.join(temp_dir, "run.sh") + + # Write scripts to temp files + with open(compile_path, "w") as f: + f.write(compile_script) + + with open(run_path, "w") as f: + f.write(run_script) + + async with temp_instance: + # Install dependencies + await temp_instance.aexec( + "apt-get update && " "apt-get install -y build-essential cmake g++" + ) + + # Create workspace directory + await temp_instance.aexec( + "mkdir -p /workspace && mkdir -p /workspace/graders && chmod 777 /workspace" + ) + + # Upload scripts to instance + await temp_instance.aupload(compile_path, "/workspace/compile") + await temp_instance.aupload(run_path, "/workspace/run") + + # Make scripts executable + await 
temp_instance.aexec("chmod +x /workspace/compile /workspace/run") + + # Create snapshot from the prepared instance + final_snapshot = await temp_instance.asnapshot(digest="ioi-evaluation-morph") + + except Exception as e: + # Ensure instance is stopped if anything fails + await temp_instance.astop() + raise e + else: + final_snapshot = base_snapshots[0] + + return final_snapshot + + async def _get_compile_script(self): + """Get the compile script content.""" + return """#!/bin/bash + +manager_files=() # Array to store manager filenames +current_dir="$(pwd)" + +# Checker compilation path +checker_dir="$current_dir/checker" +checker_src="$checker_dir/checker.cpp" + +if [ -e "$checker_src" ]; then + echo "Compiling checker" + checker_exe="$checker_dir/checker" + g++ -x c++ -std=gnu++17 -O2 -o "$checker_exe" "$checker_src" + chmod +x "$checker_exe" + if [ $? -ne 0 ]; then + echo "Could not compile checker" >&2 + exit 1 + fi + echo "Compiled checker" +else + echo "No checker found at $checker_src" +fi + +# Graders path +graders_dir="$current_dir/graders" +if [ ! -e "$graders_dir" ]; then + echo "Grader folder was not found" >&2 + exit 1 +fi + +# Find and compile manager if it exists +manager_src="$graders_dir/manager.cpp" +if [ -e "$manager_src" ]; then + echo "Compiling manager" + manager_exe="$graders_dir/manager" + g++ -x c++ -std=gnu++17 -O2 -o "$manager_exe" "$manager_src" + chmod +x "$manager_exe" + if [ $? -ne 0 ]; then + echo "Could not compile manager" >&2 + exit 1 + fi + manager_files+=("manager") +fi + +# Process other graders +graders_list=($(ls "$graders_dir" | grep -v 'manager.cpp')) +for grader_name in "${graders_list[@]}"; do + manager_files+=("$grader_name") +done + +# Extract problem name and compile necessary files +problem_name='?' +for file in "${manager_files[@]}"; do + if [[ "$file" == *.h && "$file" != "testlib.h" ]]; then + problem_name="${file%.h}" + echo "Problem name: $problem_name" + break + fi +done + +files_to_compile=("graders/$problem_name.cpp") +[ -e graders/grader.cpp ] && files_to_compile+=("graders/grader.cpp") +[ -e graders/stub.cpp ] && files_to_compile+=("graders/stub.cpp") + +g++ -DEVAL -std=gnu++17 -O2 -pipe -s -o graders/"$problem_name" "${files_to_compile[@]}" +if [ $? -ne 0 ]; then + echo "Failed to compile $problem_name" >&2 + exit 1 +fi +chmod +x graders/"$problem_name" +echo "Compiled $problem_name from ${files_to_compile[@]} successfully" + +echo "Manager files: ${manager_files[@]}" +""" + + async def _get_run_script(self): + """Get the run script content.""" + return """#!/usr/bin/env bash +# disable stack limit so you don't get RE with recursion +ulimit -s unlimited +# some problems have 10MB+ input/output files in their test cases and you might get RE. uncomment if needed +# ulimit -f 2097152 + +# Check if grader_config.json exists +if [ ! -f "graders/grader_config.json" ]; then + echo "Error: graders/grader_config.json not found" >&2 + echo "Current directory contents:" >&2 + find . 
-type f -o -type d | sed -e 's/[^-][^\/]*\// |/g' -e 's/|\([^ ]\)/|-\1/' >&2 + exit 1 +fi + +# Read task type, code, and time limit from grader_config.json using grep and sed +TASK_TYPE=$(grep -o '"task_type":[^,}]*' graders/grader_config.json | sed 's/"task_type":\\s*"\\([^"]*\\)"/\\1/') +TASK_NAME=$(grep -o '"code":[^,}]*' graders/grader_config.json | sed 's/"code":\\s*"\\([^"]*\\)"/\\1/') +TIME_LIMIT=$(grep -o '"time_limit":[^,}]*' graders/grader_config.json | sed 's/"time_limit":\\s*\\([^,}]*\\)/\\1/') +MEMORY_LIMIT=$(grep -o '"memory_limit":[^,}]*' graders/grader_config.json | sed 's/"memory_limit":\\s*\\([^,}]*\\)/\\1/') +TASK_EXECUTABLE="graders/$TASK_NAME" + +# Set memory limit in KB (convert from bytes) +MEMORY_LIMIT_KB=0 +if [ -n "$MEMORY_LIMIT" ]; then + MEMORY_LIMIT_KB=$(($MEMORY_LIMIT / 1024)) + # Set the memory limit for the entire script and all child processes + ulimit -v $MEMORY_LIMIT_KB +fi + +# "Securely" handle the correct output file +CORRECT_OUTPUT="" +if [ -f "correct_output.txt" ]; then + # Read the content and immediately remove the file + CORRECT_OUTPUT=$(cat correct_output.txt) + rm -f correct_output.txt +fi + +# Create a temporary file for solution output +SOLUTION_OUTPUT=$(mktemp) + +# Global variables for process tracking +declare -a ALL_PIDS +declare -a FIFO_DIRS + +# Define cleanup function - simplified assuming timeout exists +function cleanup { + # Kill all tracked processes silently + exec 2>/dev/null + for pid in "${ALL_PIDS[@]:-}"; do + kill -9 "$pid" 2>/dev/null || true + done + + # Clean up FIFO directories + for dir in "${FIFO_DIRS[@]:-}"; do + [ -d "$dir" ] && rm -rf "$dir" + done + + # Clean up temporary files + rm -f "$SOLUTION_OUTPUT" || true + exec 2>&2 +} + +# Set up signal handling +trap cleanup EXIT INT TERM + +# Function to handle exit codes consistently across task types +function handle_exit_code { + local exit_code=$1 + + # Check for known timeout exit codes: + # - 124: standard timeout exit code + # - 137: SIGKILL (128+9), used for hard timeouts + # - 143: SIGTERM (128+15), can also be used for timeouts + if [ $exit_code -eq 124 ] || [ $exit_code -eq 137 ] || [ $exit_code -eq 143 ]; then + echo "0" + echo "Time limit exceeded (${TIME_LIMIT}s)" >&2 + return 124 + # All other non-zero exit codes should be treated as runtime errors + elif [ $exit_code -ne 0 ]; then + echo "0" + echo "Runtime error with exit code $exit_code" >&2 + return $exit_code + fi + + # Success case - return 0 + return 0 +} + +# Function to run a command with timeout (simplified assuming timeout exists) +function run_with_timeout { + local soft_limit=$1; shift + local command_to_run="$@" + + timeout --preserve-status "$soft_limit" "$@" + return $? +} + +case "$TASK_TYPE" in + "Batch") + # Simple batch execution with timeout + run_with_timeout "$TIME_LIMIT" ./$TASK_EXECUTABLE < input.txt > "$SOLUTION_OUTPUT" + exit_code=$? + + # Handle non-zero exit codes + handle_exit_code $exit_code + if [ $? -ne 0 ]; then + exit $? + fi + + # Check the output if we have a correct output + if [ -n "$CORRECT_OUTPUT" ]; then + # Restore the correct output file + echo "$CORRECT_OUTPUT" > correct_output.txt + + # Check if there's a custom checker + if [ -f "checker/checker" ]; then + # Let the checker handle everything + ./checker/checker input.txt correct_output.txt "$SOLUTION_OUTPUT" + exit $? 
+ else + # Simple diff-based checking + if diff -bq <(echo "$CORRECT_OUTPUT") "$SOLUTION_OUTPUT" >/dev/null; then + echo "1" + echo "Output is correct (diff)" >&2 + else + echo "0" + echo "Output isn't correct (diff)" >&2 + exit 0 + fi + fi + else + # If no correct output was provided, just output the solution's output + cat "$SOLUTION_OUTPUT" + fi + ;; + + "Communication") + # Read Communication-specific parameters + NUM_PROCESSES=$(grep -o '"task_type_parameters_Communication_num_processes":[^,}]*' graders/grader_config.json | sed 's/.*:\\s*\\([0-9]*\\)/\\1/' || true) + if [ -z "$NUM_PROCESSES" ]; then + NUM_PROCESSES=1 + fi + USER_IO=$(grep -o '"task_type_parameters_Communication_user_io":[^,}]*' graders/grader_config.json | sed 's/.*:\\s*"\\([^"]*\\)"/\\1/' || echo "std_io") + + # Read custom manager arguments if they exist + MANAGER_CUSTOM_ARGS="" + if grep -q '"task_type_parameters_Communication_manager_args"' graders/grader_config.json; then + MANAGER_CUSTOM_ARGS=$(grep -o '"task_type_parameters_Communication_manager_args":[^,}]*' graders/grader_config.json | sed 's/.*:\\s*"\\([^"]*\\)"/\\1/') + fi + + # Create temporary directories for FIFOs + for i in $(seq 0 $((NUM_PROCESSES-1))); do + FIFO_DIRS[$i]=$(mktemp -d) + + # Create FIFOs for this process + mkfifo "${FIFO_DIRS[$i]}/u${i}_to_m" + mkfifo "${FIFO_DIRS[$i]}/m_to_u${i}" + chmod 755 "${FIFO_DIRS[$i]}" + chmod 666 "${FIFO_DIRS[$i]}/u${i}_to_m" "${FIFO_DIRS[$i]}/m_to_u${i}" + done + + # Prepare manager arguments + MANAGER_ARGS="" + for i in $(seq 0 $((NUM_PROCESSES-1))); do + MANAGER_ARGS="$MANAGER_ARGS ${FIFO_DIRS[$i]}/u${i}_to_m ${FIFO_DIRS[$i]}/m_to_u${i}" + done + + # Add custom manager arguments if specified + if [ -n "$MANAGER_CUSTOM_ARGS" ]; then + MANAGER_ARGS="$MANAGER_ARGS $MANAGER_CUSTOM_ARGS" + fi + + # Start all user processes first + for i in $(seq 0 $((NUM_PROCESSES-1))); do + if [ "$USER_IO" = "fifo_io" ]; then + # Pass FIFOs as arguments + ARGS="${FIFO_DIRS[$i]}/m_to_u${i} ${FIFO_DIRS[$i]}/u${i}_to_m" + if [ "$NUM_PROCESSES" -ne 1 ]; then + ARGS="$ARGS $i" + fi + ./$TASK_EXECUTABLE $ARGS & + ALL_PIDS+=($!) + else + # Use stdin/stdout redirection + if [ "$NUM_PROCESSES" -ne 1 ]; then + ./$TASK_EXECUTABLE "$i" < "${FIFO_DIRS[$i]}/m_to_u${i}" > "${FIFO_DIRS[$i]}/u${i}_to_m" 2>/dev/null & + ALL_PIDS+=($!) + else + ./$TASK_EXECUTABLE < "${FIFO_DIRS[$i]}/m_to_u${i}" > "${FIFO_DIRS[$i]}/u${i}_to_m" 2>/dev/null & + ALL_PIDS+=($!) + fi + fi + done + + # Run the manager with timeout using direct pipe from input.txt + run_with_timeout "$TIME_LIMIT" ./graders/manager $MANAGER_ARGS < input.txt > "$SOLUTION_OUTPUT" + + exit_code=$? + + # Handle non-zero exit codes + handle_exit_code $exit_code + if [ $? -ne 0 ]; then + exit $? + fi + + # Check the output if we have a correct output AND there's a checker (otherwise we assume the manager handles everything) + if [ -n "$CORRECT_OUTPUT" ] && [ -f "checker/checker" ]; then + # Restore the correct output file + echo "$CORRECT_OUTPUT" > correct_output.txt + + # Let the checker handle it + ./checker/checker input.txt correct_output.txt "$SOLUTION_OUTPUT" + exit $? + else + # we assume the manager handles it + cat "$SOLUTION_OUTPUT" + fi + ;; + + *) + echo "0" + echo "Unsupported task type \"$TASK_TYPE\"" >&2 + exit 1 + ;; +esac +""" + + +def get_morph_client_from_env(session=None) -> MorphCloudExecutionClient: + """ + Creates a MorphCloudExecutionClient instance using environment variables. 
+ + Environment variables: + MORPH_API_KEY: API key for MorphCloud + + Args: + session: Optional aiohttp.ClientSession to use for HTTP requests + + Returns: + MorphCloudExecutionClient: A configured MorphCloud execution client + """ + if not is_morph_available(): + raise ImportError( + "MorphCloud is not available and required for this function. Please install MorphCloud with " + "`pip install morphcloud` and add an API key to a `.env` file." + ) + + load_dotenv() + api_key = os.environ.get("MORPH_API_KEY") + if not api_key: + raise ValueError("MORPH_API_KEY environment variable is required") + + return MorphCloudExecutionClient(api_key=api_key) + + +# noqa: W293 diff --git a/src/open_r1/utils/ioi/piston_client.py b/src/open_r1/utils/ioi/piston_client.py index e625d40a1..fc281ec62 100644 --- a/src/open_r1/utils/ioi/piston_client.py +++ b/src/open_r1/utils/ioi/piston_client.py @@ -20,10 +20,16 @@ def get_piston_client_from_env(): raise ValueError( "For IOI problems Piston endpoints running our IOI package are required. Please add a list of valid Piston endpoints to a PISTON_ENDPOINTS varialbe in a `.env` file." ) - piston_endpoints = piston_endpoints.split(",") if piston_endpoints != "slurm" else get_slurm_piston_endpoints() + piston_endpoints = ( + piston_endpoints.split(",") + if piston_endpoints != "slurm" + else get_slurm_piston_endpoints() + ) random.shuffle(piston_endpoints) max_requests_per_endpoint = os.getenv("PISTON_MAX_REQUESTS_PER_ENDPOINT", "1") - return PistonClient(piston_endpoints, max_requests_per_endpoint=int(max_requests_per_endpoint)) + return PistonClient( + piston_endpoints, max_requests_per_endpoint=int(max_requests_per_endpoint) + ) class PistonClient: @@ -56,11 +62,17 @@ def __init__( max_requests_per_endpoint=1, ): self.max_requests_per_endpoint = max_requests_per_endpoint - self.base_endpoints = [base_endpoint] if isinstance(base_endpoint, str) else base_endpoint - self.endpoint_ids = {endpoint: i for i, endpoint in enumerate(self.base_endpoints)} + self.base_endpoints = ( + [base_endpoint] if isinstance(base_endpoint, str) else base_endpoint + ) + self.endpoint_ids = { + endpoint: i for i, endpoint in enumerate(self.base_endpoints) + } self._session = session - self.endpoint_tokens = asyncio.Queue(maxsize=max_requests_per_endpoint * len(self.base_endpoints)) + self.endpoint_tokens = asyncio.Queue( + maxsize=max_requests_per_endpoint * len(self.base_endpoints) + ) for _ in range(max_requests_per_endpoint): for base_endpoint in self.base_endpoints: @@ -91,23 +103,33 @@ async def _release_endpoint(self, endpoint): async def _send_request(self, endpoint, route, data=None, method="post"): async with self.session.request( - method, f"{endpoint.rstrip('/')}/{route}", json=data, headers={"Content-Type": "application/json"} + method, + f"{endpoint.rstrip('/')}/{route}", + json=data, + headers={"Content-Type": "application/json"}, ) as response: return await response.json(content_type=None) async def _send_to_all(self, route, data=None, method="post"): return await asyncio.gather( - *[self._send_request(endpoint, route, data, method) for endpoint in self.base_endpoints] + *[ + self._send_request(endpoint, route, data, method) + for endpoint in self.base_endpoints + ] ) async def _send_to_one(self, endpoint, route, data=None, method="post"): return await self._send_request(endpoint, route, data, method) async def install_package(self, language, version): - return await self._send_to_all("packages", {"language": language, "version": version}, method="post") + return await 
self._send_to_all( + "packages", {"language": language, "version": version}, method="post" + ) async def uninstall_package(self, language, version): - return await self._send_to_all("packages", {"language": language, "version": version}, method="delete") + return await self._send_to_all( + "packages", {"language": language, "version": version}, method="delete" + ) async def get_supported_runtimes(self): return await self._send_to_all("runtimes", method="get") @@ -123,9 +145,13 @@ async def execute(self, data) -> tuple[str, str]: raise PistonError(response["message"]) if "compile" in response and response["compile"]["code"] != 0: - return "0", "Compilation error exit code " + str(response["compile"]["code"]) + "\n" + response["compile"][ - "stderr" - ] + return ( + "0", + "Compilation error exit code " + + str(response["compile"]["code"]) + + "\n" + + response["compile"]["stderr"], + ) if "run" not in response: raise PistonError(response) @@ -176,7 +202,9 @@ async def _send_execute(self, data): if attempt > 0: await asyncio.sleep(1) async with self.session.post( - f"{endpoint.rstrip('/')}/execute", json=data, headers={"Content-Type": "application/json"} + f"{endpoint.rstrip('/')}/execute", + json=data, + headers={"Content-Type": "application/json"}, ) as response: status = response.status res_json = await response.json(content_type=None) @@ -186,21 +214,40 @@ async def _send_execute(self, data): if res_json is None: raise PistonError(f"Empty response. status={status}") # piston overloaded - if "run" in res_json and "Resource temporarily unavailable" in res_json["run"].get("stderr", ""): - raise PistonError(f"Piston overloaded: {res_json['run']['stderr']}") + if ( + "run" in res_json + and "Resource temporarily unavailable" + in res_json["run"].get("stderr", "") + ): + raise PistonError( + f"Piston overloaded: {res_json['run']['stderr']}" + ) return res_json - except (PistonError, asyncio.TimeoutError, aiohttp.ClientConnectionError, RuntimeError) as e: + except ( + PistonError, + asyncio.TimeoutError, + aiohttp.ClientConnectionError, + RuntimeError, + ) as e: # Only retry if we haven't reached max retries yet if attempt < max_retries: # Calculate backoff with jitter - delay = min(base_delay * (2**attempt), 10) # Exponential backoff, capped at 10 seconds - jitter = delay * 0.2 * (2 * asyncio.get_event_loop().time() % 1 - 0.5) # Add ±10% jitter + delay = min( + base_delay * (2**attempt), 10 + ) # Exponential backoff, capped at 10 seconds + jitter = ( + delay * 0.2 * (2 * asyncio.get_event_loop().time() % 1 - 0.5) + ) # Add ±10% jitter retry_delay = delay + jitter - print(f"Retrying in {retry_delay} seconds [{self.endpoint_ids[endpoint]}] {endpoint}") + print( + f"Retrying in {retry_delay} seconds [{self.endpoint_ids[endpoint]}] {endpoint}" + ) # special case: worker died - if isinstance(e, aiohttp.ClientConnectionError) and "Connect call failed" in str(e): + if isinstance( + e, aiohttp.ClientConnectionError + ) and "Connect call failed" in str(e): await self._check_failed_endpoint(endpoint) else: # hopefully we won't get this one again @@ -228,7 +275,9 @@ def get_slurm_piston_endpoints(): """Get list of active piston worker endpoints from squeue output""" # Run squeue command to get job name, hostname and status, filtering for RUNNING state result = subprocess.run( - ["squeue", '--format="%j %N %T"', "--noheader", "--states=RUNNING"], capture_output=True, text=True + ["squeue", '--format="%j %N %T"', "--noheader", "--states=RUNNING"], + capture_output=True, + text=True, ) # Split output into 
lines and skip header diff --git a/src/open_r1/utils/ioi/scoring.py b/src/open_r1/utils/ioi/scoring.py index 88045383a..a93eb88ce 100644 --- a/src/open_r1/utils/ioi/scoring.py +++ b/src/open_r1/utils/ioi/scoring.py @@ -54,7 +54,16 @@ def status(self): Returns: str: The status with the highest priority (lowest value) """ - status_prios = {"CE": -1, "RE": 0, "WA": 1, "MLE": 2, "TLE": 3, "PA": 4, "AC": 5, "SKIPPED": 999} + status_prios = { + "CE": -1, + "RE": 0, + "WA": 1, + "MLE": 2, + "TLE": 3, + "PA": 4, + "AC": 5, + "SKIPPED": 999, + } return min([x.status for x in self.test_results], key=lambda x: status_prios[x]) @property @@ -68,7 +77,10 @@ def score(self): return ( 0 if not self.test_results - else round(min([test_result.score for test_result in self.test_results]), self.score_precision) + else round( + min([test_result.score for test_result in self.test_results]), + self.score_precision, + ) ) @property @@ -83,7 +95,9 @@ def weighted_score(self): 0 if not self.test_results else round( - min([test_result.score for test_result in self.test_results]) * self.points, self.score_precision + min([test_result.score for test_result in self.test_results]) + * self.points, + self.score_precision, ) ) @@ -135,7 +149,12 @@ def _extract_single_status(score: float, feedback: str) -> str: async def score_single_test_case( - client: PistonClient, subtask: dict, test_name: str, test_input: str, test_output: str, submission: str + client: PistonClient, + subtask: dict, + test_name: str, + test_input: str, + test_output: str, + submission: str, ) -> TestResult: """ Scores a single test case by running the submission against the provided input and output. @@ -152,11 +171,16 @@ async def score_single_test_case( TestResult: Result of the test case execution """ # Run submission for this test case - score, feedback = await run_submission(client, subtask, test_input, submission, test_output) + score, feedback = await run_submission( + client, subtask, test_input, submission, test_output + ) score = float(score) return TestResult( - test_name=test_name, score=score, status=_extract_single_status(score, feedback), feedback=feedback + test_name=test_name, + score=score, + status=_extract_single_status(score, feedback), + feedback=feedback, ) @@ -198,23 +222,29 @@ async def score_subtask( # initialize test results with cached results or empty (SKIPPED) TestResult objects subtask_result.test_results = [ - test_case_run_cache[test_name] - if test_case_run_cache is not None and test_name in test_case_run_cache - else TestResult(test_name=test_name) + ( + test_case_run_cache[test_name] + if test_case_run_cache is not None and test_name in test_case_run_cache + else TestResult(test_name=test_name) + ) for test_name in subtask["test_names"] ] # we skip submissions where no code was extracted # no need to do anything, as we have a failed cached result if not submission or any( - test_result.status != "SKIPPED" and test_result.score == 0.0 for test_result in subtask_result.test_results + test_result.status != "SKIPPED" and test_result.score == 0.0 + for test_result in subtask_result.test_results ): return subtask_result if "test_cases" in subtask: test_cases = subtask["test_cases"] if isinstance(subtask["test_cases"], list): - test_cases = {test_name: test for test_name, test in zip(subtask["test_names"], subtask["test_cases"])} + test_cases = { + test_name: test + for test_name, test in zip(subtask["test_names"], subtask["test_cases"]) + } else: test_cases = load_ioi_tests(subtask["year"], subtask["id"]) @@ -224,7 
+254,12 @@ async def score_subtask( *[ asyncio.create_task( score_single_test_case( - client, subtask, test_name, test_cases[test_name][0], test_cases[test_name][1], submission + client, + subtask, + test_name, + test_cases[test_name][0], + test_cases[test_name][1], + submission, ) ) for _, test_name in test_batch_to_run @@ -260,11 +295,18 @@ async def score_subtasks( # avoid rerunning tests present in multiple subtasks test_case_run_cache = {} - return [await score_subtask(client, subtask, submission, test_case_run_cache, skip_mode) for subtask in subtasks] + return [ + await score_subtask(client, subtask, submission, test_case_run_cache, skip_mode) + for subtask in subtasks + ] async def run_submission( - client: PistonClient, problem: dict, test_input: str, submission: str, test_output: str | None = None + client: PistonClient, + problem: dict, + test_input: str, + submission: str, + test_output: str | None = None, ) -> tuple[str, str]: """ Executes a submission against a test case using the Piston execution environment. @@ -286,9 +328,17 @@ async def run_submission( # pass the input {"name": "input.txt", "content": test_input}, # pass the expected output - *([{"name": "correct_output.txt", "content": test_output}] if test_output else []), + *( + [{"name": "correct_output.txt", "content": test_output}] + if test_output + else [] + ), # grader files - *({"name": name, "content": content} for name, content in problem["grader_files"] if content), + *( + {"name": name, "content": content} + for name, content in problem["grader_files"] + if content + ), ], "run_timeout": round( (problem["time_limit"] + 3) * 1000 diff --git a/src/open_r1/utils/ioi/utils.py b/src/open_r1/utils/ioi/utils.py index 02c0aea59..2f9c392c5 100644 --- a/src/open_r1/utils/ioi/utils.py +++ b/src/open_r1/utils/ioi/utils.py @@ -28,10 +28,15 @@ def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: """ Load IOI tests for a given year. 
""" - tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") + tests_dataset = load_dataset( + "open-r1/ioi-test-cases", name=f"{year}", split="train" + ) test_cases = defaultdict(dict) for test_case in tests_dataset: - test_cases[test_case["problem_id"]][test_case["test_name"]] = test_case["test_input"], test_case["test_output"] + test_cases[test_case["problem_id"]][test_case["test_name"]] = ( + test_case["test_input"], + test_case["test_output"], + ) return test_cases diff --git a/src/open_r1/utils/model_utils.py b/src/open_r1/utils/model_utils.py index 8191c17ea..9ae30415b 100644 --- a/src/open_r1/utils/model_utils.py +++ b/src/open_r1/utils/model_utils.py @@ -6,7 +6,9 @@ from ..configs import GRPOConfig, SFTConfig -def get_tokenizer(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> PreTrainedTokenizer: +def get_tokenizer( + model_args: ModelConfig, training_args: SFTConfig | GRPOConfig +) -> PreTrainedTokenizer: """Get the tokenizer for the model.""" tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, @@ -20,10 +22,14 @@ def get_tokenizer(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig return tokenizer -def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> AutoModelForCausalLM: +def get_model( + model_args: ModelConfig, training_args: SFTConfig | GRPOConfig +) -> AutoModelForCausalLM: """Get the model""" torch_dtype = ( - model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + model_args.torch_dtype + if model_args.torch_dtype in ["auto", None] + else getattr(torch, model_args.torch_dtype) ) quantization_config = get_quantization_config(model_args) model_kwargs = dict( diff --git a/src/open_r1/utils/routed_morph.py b/src/open_r1/utils/routed_morph.py new file mode 100644 index 000000000..e179e5ede --- /dev/null +++ b/src/open_r1/utils/routed_morph.py @@ -0,0 +1,130 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +import requests + + +class RoutedMorphSandbox: + """ + Client for the MorphCloud router service that mimics the API of MorphCloud's Sandbox. + + This class provides a simple interface to execute code via a central MorphCloud router, + which manages sandbox creation and cleanup. It allows batch processing of multiple scripts + in a single request for improved efficiency. + + Attributes: + router_url (str): The URL of the MorphCloud router service. + timeout (int): Execution timeout in seconds. + request_timeout (int): HTTP request timeout in seconds. + """ + + def __init__(self, router_url: str, timeout: int = 300, request_timeout: int = 60): + """ + Initialize the routed MorphCloud sandbox client. + + Args: + router_url: The URL of the MorphCloud router, including host and port. + timeout: Default execution timeout in seconds. + request_timeout: Default HTTP request timeout in seconds. 
+ """ + self.router_url = router_url + self.timeout = timeout + self.request_timeout = request_timeout + + def run_code( + self, + scripts: List[str], + languages: Optional[List[str]] = None, + timeout: Optional[int] = None, + request_timeout: Optional[int] = None, + ) -> List: + """ + Execute multiple scripts using MorphCloud via the router. + + Args: + scripts: List of code scripts to execute. + languages: List of programming languages for each script. If None, defaults to Python for all scripts. + timeout: Execution timeout in seconds. If None, uses the instance timeout. + request_timeout: HTTP request timeout in seconds. If None, uses the instance request_timeout. + + Returns: + List of execution results with text and exception_str properties. + """ + + actual_timeout = timeout if timeout is not None else self.timeout + actual_request_timeout = ( + request_timeout if request_timeout is not None else self.request_timeout + ) + + # Default to Python for all scripts if languages is not provided + if languages is None: + languages = ["python"] * len(scripts) + + payload = { + "scripts": scripts, + "languages": languages, + "timeout": actual_timeout, + "request_timeout": actual_request_timeout, + } + + try: + + endpoint = f"http://{self.router_url}/execute_batch" + response = requests.post( + endpoint, json=payload, timeout=actual_request_timeout + ) + + if response.status_code != 200: + + error = f"Request to MorphCloud router failed with status code: {response.status_code}" + print(error) + + results = [] + for _ in scripts: + results.append( + type("obj", (object,), {"text": None, "exception_str": error}) + ) + return results + + response_data = response.json() + results = [] + + for item in response_data: + # Log the response data to see what we're getting + # print(f"RoutedMorphSandbox: Got response item: {item}") + result = type( + "obj", + (object,), + { + "text": item.get("text"), + "exception_str": item.get("exception_str"), + }, + ) + results.append(result) + + return results + + except Exception as e: + error = f"Error communicating with MorphCloud router: {str(e)}" + print(error) + + results = [] + for _ in scripts: + results.append( + type("obj", (object,), {"text": None, "exception_str": error}) + ) + return results diff --git a/src/open_r1/utils/routed_sandbox.py b/src/open_r1/utils/routed_sandbox.py index 950175950..24737621c 100644 --- a/src/open_r1/utils/routed_sandbox.py +++ b/src/open_r1/utils/routed_sandbox.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import List, Optional import requests from e2b_code_interpreter.models import Execution, ExecutionError, Result @@ -41,7 +41,7 @@ def __init__(self, router_url: str): def run_code( self, scripts: list[str], - language: str = "python", + languages: Optional[List[str]] = None, timeout: Optional[int] = None, request_timeout: Optional[int] = None, ) -> list[Execution]: @@ -50,7 +50,7 @@ def run_code( Args: scripts (list[str]): A list of code scripts to execute. - language (str, optional): The programming language of the scripts. Defaults to "python". + languages (list[str], optional): List of programming languages for each script. If None, defaults to Python for all scripts. timeout (Optional[int], optional): The maximum execution time for each script in seconds. Defaults to 300 seconds. request_timeout (Optional[int], optional): The timeout for the HTTP request in seconds. Defaults to 30 seconds. 
@@ -63,16 +63,22 @@ def run_code( if request_timeout is None: request_timeout = 30 # Default to 30 seconds + # Default to Python for all scripts if languages is not provided + if languages is None: + languages = ["python"] * len(scripts) + # Prepare the payload for the HTTP POST request payload = { "scripts": scripts, - "language": language, + "languages": languages, "timeout": timeout, "request_timeout": request_timeout, } # Send the request to the E2B Router - response = requests.post(f"http://{self.router_url}/execute_batch", json=payload) + response = requests.post( + f"http://{self.router_url}/execute_batch", json=payload + ) if not response.ok: print(f"Request failed with status code: {response.status_code}") @@ -88,7 +94,11 @@ def run_code( execution = Execution( results=[Result(**r) for r in result["execution"]["results"]], logs=result["execution"]["logs"], - error=ExecutionError(**result["execution"]["error"]) if result["execution"]["error"] else None, + error=( + ExecutionError(**result["execution"]["error"]) + if result["execution"]["error"] + else None + ), execution_count=result["execution"]["execution_count"], ) output.append(execution) diff --git a/tests/slow/test_code_reward.py b/tests/slow/test_code_reward.py index 06827828c..d19acd06a 100644 --- a/tests/slow/test_code_reward.py +++ b/tests/slow/test_code_reward.py @@ -19,51 +19,81 @@ from e2b_code_interpreter.models import Execution, ExecutionError from open_r1.rewards import code_reward, ioi_code_reward +from open_r1.utils.routed_morph import RoutedMorphSandbox from open_r1.utils.routed_sandbox import RoutedSandbox class TestCodeRewards(unittest.TestCase): + def test_python_code_reward(self): # requires E2B, see the README.md file - code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested") + code_dataset = load_dataset( + "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" + ) NUM_SAMPLES = 20 samples = code_dataset["train"].select(range(NUM_SAMPLES)) - test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] - reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]} + test_completions = [ + [{"content": sample["gold_standard_solution"]}] for sample in samples + ] + reward_kwargs = { + "verification_info": [sample["verification_info"] for sample in samples] + } rewards = code_reward(test_completions, **reward_kwargs) print(rewards) assert rewards == [1.0] * NUM_SAMPLES def test_e2b_router(self): # run router locally: python scripts/e2b_router.py - code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested") + code_dataset = load_dataset( + "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" + ) NUM_SAMPLES = 128 samples = code_dataset["train"].select(range(NUM_SAMPLES)) - test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] - reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]} - rewards = code_reward(test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs) + test_completions = [ + [{"content": sample["gold_standard_solution"]}] for sample in samples + ] + reward_kwargs = { + "verification_info": [sample["verification_info"] for sample in samples] + } + rewards = code_reward( + test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs + ) print(rewards) assert rewards == [1.0] * NUM_SAMPLES def test_e2b_router_parallel(self): # run router locally: 
python scripts/e2b_router.py - code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested") + code_dataset = load_dataset( + "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" + ) BATCH_SIZE = 32 NUM_SAMPLES = 256 def batch_code_reward(examples): - test_completions = [[{"content": solution}] for solution in examples["gold_standard_solution"]] + test_completions = [ + [{"content": solution}] + for solution in examples["gold_standard_solution"] + ] reward_kwargs = { - "verification_info": [verification_info for verification_info in examples["verification_info"]] + "verification_info": [ + verification_info + for verification_info in examples["verification_info"] + ] } - rewards = code_reward(test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs) + rewards = code_reward( + test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs + ) assert rewards == [1.0] * BATCH_SIZE return examples code_dataset = code_dataset["train"].select(range(NUM_SAMPLES)) code_dataset = code_dataset.map( - batch_code_reward, batched=True, batch_size=BATCH_SIZE, num_proc=4, load_from_cache_file=False + batch_code_reward, + batched=True, + batch_size=BATCH_SIZE, + num_proc=4, + load_from_cache_file=False, ) def test_ioi_code_reward(self): @@ -72,17 +102,23 @@ def test_ioi_code_reward(self): code_dataset = load_dataset("open-r1/ioi-reward-test-dataset") NUM_SAMPLES = 16 samples = code_dataset["train"].select(range(NUM_SAMPLES)) - test_completions = [[{"content": f"```cpp\n{sample['sample_solution']}```"}] for sample in samples] + test_completions = [ + [{"content": f"```cpp\n{sample['sample_solution']}```"}] + for sample in samples + ] keys = [key for key in samples[0] if key not in ["prompt", "completion"]] reward_kwargs = {key: [example[key] for example in samples] for key in keys} rewards = ioi_code_reward(test_completions, **reward_kwargs) print(rewards) assert rewards == [1.0] * NUM_SAMPLES - def test_e2b_router_run_code_success(): + def test_e2b_router_run_code_success(self): # run router locally: python scripts/e2b_router.py routed_sandbox = RoutedSandbox(router_url="localhost:8000") - scripts = ["print('hello from integration test')", "result = 2 + 2\nprint(result)"] + scripts = [ + "print('hello from integration test')", + "result = 2 + 2\nprint(result)", + ] results = routed_sandbox.run_code(scripts) @@ -90,11 +126,13 @@ def test_e2b_router_run_code_success(): for result in results: assert isinstance(result, Execution) - assert result.exit_code == 0 + # assert result.exit_code == 0 assert result.error is None - assert "hello" in result.stdout or "4" in result.stdout + assert ( + "hello" in result.logs["stdout"][0] or "4" in result.logs["stdout"][0] + ) - def test_e2b_router_run_code_with_error(sandbox): + def test_e2b_router_run_code_with_error(self): # run router locally: python scripts/e2b_router.py routed_sandbox = RoutedSandbox(router_url="localhost:8000") @@ -105,15 +143,122 @@ def test_e2b_router_run_code_with_error(sandbox): assert len(results) == 2 # First one should be okay - assert results[0].exit_code == 0 + # assert results[0].exit_code == 0 # Execution object has no attribute 'exit_code' assert results[0].error is None - assert "this is fine" in results[0].stdout + assert "this is fine" in results[0].logs["stdout"][0] # Second one should have a syntax error - assert results[1].exit_code != 0 + + # assert results[1].exit_code != 0 # Execution object has no attribute 'exit_code' assert results[1].error is not None assert 
isinstance(results[1].error, ExecutionError) - assert "SyntaxError" in results[1].error.type + assert "SyntaxError" in results[1].error.name + + def test_python_code_reward_morph(self): + # requires MorphCloud, see the README.md file + code_dataset = load_dataset( + "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" + ) + NUM_SAMPLES = 20 + samples = code_dataset["train"].select(range(NUM_SAMPLES)) + test_completions = [ + [{"content": sample["gold_standard_solution"]}] for sample in samples + ] + reward_kwargs = { + "verification_info": [sample["verification_info"] for sample in samples], + "provider_type": "morph", + } + rewards = code_reward(test_completions, **reward_kwargs) + print(rewards) + assert rewards == [1.0] * NUM_SAMPLES + + def test_morph_router(self): + # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20 + code_dataset = load_dataset( + "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" + ) + NUM_SAMPLES = 32 + samples = code_dataset["train"].select(range(NUM_SAMPLES)) + test_completions = [ + [{"content": sample["gold_standard_solution"]}] for sample in samples + ] + reward_kwargs = { + "verification_info": [sample["verification_info"] for sample in samples], + "provider_type": "morph", + "morph_router_url": "0.0.0.0:8001", + } + rewards = code_reward(test_completions, **reward_kwargs) + print(rewards) + assert rewards == [1.0] * NUM_SAMPLES + + def test_morph_router_parallel(self): + # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20 + code_dataset = load_dataset( + "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" + ) + + BATCH_SIZE = 32 + NUM_SAMPLES = 256 + + def batch_code_reward(examples): + test_completions = [ + [{"content": solution}] + for solution in examples["gold_standard_solution"] + ] + reward_kwargs = { + "verification_info": [ + verification_info + for verification_info in examples["verification_info"] + ], + "provider_type": "morph", + "morph_router_url": "0.0.0.0:8001", + } + rewards = code_reward(test_completions, **reward_kwargs) + assert rewards == [1.0] * BATCH_SIZE + return examples + + code_dataset = code_dataset["train"].select(range(NUM_SAMPLES)) + code_dataset = code_dataset.map( + batch_code_reward, + batched=True, + batch_size=BATCH_SIZE, + num_proc=4, + load_from_cache_file=False, + ) + + def test_morph_router_run_code_success(self): + # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20 + + routed_sandbox = RoutedMorphSandbox(router_url="localhost:8001") + scripts = [ + "print('hello from morph integration test')", + "result = 2 + 2\nprint(result)", + ] + + results = routed_sandbox.run_code(scripts) + + assert len(results) == 2 + + for result in results: + assert result.exception_str is None + assert "hello" in result.text or "4" in result.text + + def test_morph_router_run_code_with_error(self): + # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20 + + routed_sandbox = RoutedMorphSandbox(router_url="localhost:8001") + scripts = ["print('this is fine with morph')", "print('unterminated string"] + + results = routed_sandbox.run_code(scripts) + + assert len(results) == 2 + + # First one should be okay + assert results[0].exception_str is None + assert "this is fine with morph" in results[0].text + + # Second one should have a syntax error + assert "SyntaxError" in results[1].text if __name__ == "__main__": diff --git 
a/tests/test_rewards.py b/tests/test_rewards.py index 37c17d6d3..7ea6f350e 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -15,6 +15,7 @@ import unittest +from dotenv import load_dotenv from open_r1.configs import GRPOScriptArguments from open_r1.rewards import ( accuracy_reward, @@ -29,6 +30,9 @@ ) +load_dotenv() + + class TestGetRewardFuncs(unittest.TestCase): def test_get_reward_funcs(self): """Test get_reward_funcs with various reward functions.""" @@ -94,7 +98,13 @@ def test_accuracy_reward_wrong_answer_no_latex(self): def test_format_reward_correct(self): """Test format_reward with correct format.""" - completion = [[{"content": "\nSome reasoning\n\n\nThe answer\n"}]] + completion = [ + [ + { + "content": "\nSome reasoning\n\n\nThe answer\n" + } + ] + ] rewards = format_reward(completion) self.assertEqual(rewards[0], 1.0) @@ -133,7 +143,10 @@ def test_reasoning_steps_reward(self): def test_multiple_completions(self): """Test handling multiple completions at once.""" - completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]] + completions = [ + [{"content": r"\boxed{\frac{63}{400}}"}], + [{"content": r"\boxed{\frac{64}{400}}"}], + ] solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = accuracy_reward(completions, solutions) @@ -154,11 +167,31 @@ def test_cosine_scaled_reward(self): test_cases = [ # Correct answers with different lengths - (r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 20, 0.943), # Short correct answer - (r"\boxed{\frac{63}{400}}", r"\frac{63}{400}", 80, 0.547), # Long correct answer + ( + r"\boxed{\frac{63}{400}}", + r"\frac{63}{400}", + 20, + 0.943, + ), # Short correct answer + ( + r"\boxed{\frac{63}{400}}", + r"\frac{63}{400}", + 80, + 0.547, + ), # Long correct answer # Wrong answers with different lengths - (r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 20, -0.942), # Short wrong answer - (r"\boxed{\frac{64}{400}}", r"\frac{63}{400}", 80, -0.547), # Long wrong answer + ( + r"\boxed{\frac{64}{400}}", + r"\frac{63}{400}", + 20, + -0.942, + ), # Short wrong answer + ( + r"\boxed{\frac{64}{400}}", + r"\frac{63}{400}", + 80, + -0.547, + ), # Long wrong answer ] for content, solution, content_len, expected_reward in test_cases: @@ -178,7 +211,10 @@ def test_format_reward_specific_multiline(self): def test_same_length_responses(self): """Test len_reward when all responses have the same length.""" - completions = [[{"content": r"\boxed{\frac{63}{400}}"}], [{"content": r"\boxed{\frac{64}{400}}"}]] + completions = [ + [{"content": r"\boxed{\frac{63}{400}}"}], + [{"content": r"\boxed{\frac{64}{400}}"}], + ] solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = len_reward(completions, solutions) @@ -193,8 +229,12 @@ def test_different_lengths_correct_answers(self): solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = len_reward(completions, solutions) - self.assertGreater(rewards[0], rewards[1]) # shorter answer should get higher reward - self.assertAlmostEqual(rewards[0], 0.5) # shortest correct answer gets maximum reward + self.assertGreater( + rewards[0], rewards[1] + ) # shorter answer should get higher reward + self.assertAlmostEqual( + rewards[0], 0.5 + ) # shortest correct answer gets maximum reward def test_different_lengths_incorrect_answers(self): """Test len_reward with different length incorrect answers.""" @@ -205,9 +245,13 @@ def test_different_lengths_incorrect_answers(self): solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = len_reward(completions, solutions) - 
self.assertLessEqual(rewards[0], 0.0) # incorrect answers should get non-positive rewards + self.assertLessEqual( + rewards[0], 0.0 + ) # incorrect answers should get non-positive rewards self.assertLessEqual(rewards[1], 0.0) - self.assertGreater(rewards[0], rewards[1]) # shorter answer should still be penalized less + self.assertGreater( + rewards[0], rewards[1] + ) # shorter answer should still be penalized less def test_mixed_correctness(self): """Test len_reward with mix of correct and incorrect answers of different lengths.""" @@ -238,19 +282,28 @@ def test_mixed_correctness(self): def test_unparseable_solution(self): """Test len_reward with unparseable solution.""" - completions = [[{"content": r"\boxed{answer}"}], [{"content": r"\boxed{answer} " + "x" * 10}]] + completions = [ + [{"content": r"\boxed{answer}"}], + [{"content": r"\boxed{answer} " + "x" * 10}], + ] solutions = ["unparseable_latex", "unparseable_latex"] rewards = len_reward(completions, solutions) - self.assertGreater(rewards[0], rewards[1]) # shorter answer should still get better reward - self.assertAlmostEqual(rewards[0], 0.5) # treated as correct, shortest gets maximum reward + self.assertGreater( + rewards[0], rewards[1] + ) # shorter answer should still get better reward + self.assertAlmostEqual( + rewards[0], 0.5 + ) # treated as correct, shortest gets maximum reward class TestRepetitionPenaltyReward(unittest.TestCase): def test_positive_max_penalty_raises_value_error(self): with self.assertRaises(ValueError): get_repetition_penalty_reward(ngram_size=2, max_penalty=1.0) - with self.assertRaisesRegex(ValueError, "max_penalty 1.5 should not be positive"): + with self.assertRaisesRegex( + ValueError, "max_penalty 1.5 should not be positive" + ): get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5) def test_no_repetition(self): @@ -379,31 +432,45 @@ def test_long_completion_without_repetition(self): def test_tag_count_rewards_all_correct(self): """Test tag_count_reward with correct tags.""" - completion = [[{"content": "\nSome reasoning\n\n\nThe answer\n"}]] + completion = [ + [ + { + "content": "\nSome reasoning\n\n\nThe answer\n" + } + ] + ] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 1.0) def test_tag_count_rewards_missing_think_begin(self): """Test tag_count_reward with missing tag.""" - completion = [[{"content": "Some reasoning\n\n\nThe answer\n"}]] + completion = [ + [{"content": "Some reasoning\n\n\nThe answer\n"}] + ] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_think_end(self): """Test tag_count_reward with missing tag.""" - completion = [[{"content": "\nSome reasoning\n\nThe answer\n"}]] + completion = [ + [{"content": "\nSome reasoning\n\nThe answer\n"}] + ] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_answer_begin(self): """Test tag_count_reward with missing tag.""" - completion = [[{"content": "\nSome reasoning\n\nThe answer\n"}]] + completion = [ + [{"content": "\nSome reasoning\n\nThe answer\n"}] + ] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_answer_end(self): """Test tag_count_reward with missing tag.""" - completion = [[{"content": "\nSome reasoning\n\n\nThe answer"}]] + completion = [ + [{"content": "\nSome reasoning\n\n\nThe answer"}] + ] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) @@ -414,12 +481,16 @@ def 
test_tag_count_rewards_missing_all_tags(self): self.assertEqual(rewards[0], 0.0) def test_full_repetition_with_language(self): - reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="en") + reward_fn = get_repetition_penalty_reward( + ngram_size=2, max_penalty=-1.0, language="en" + ) completions = [[{"content": "that that that that that"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) # begin test for zh language - reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="zh") + reward_fn = get_repetition_penalty_reward( + ngram_size=2, max_penalty=-1.0, language="zh" + ) completions = [[{"content": "这个这个这个这个这个"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) From 6a0cd5c8ad031fc75118a4ce7f42a4860c3d8dea Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 8 May 2025 16:29:01 +0200 Subject: [PATCH 117/137] Fix style again :) (#636) --- src/open_r1/configs.py | 28 +++------ src/open_r1/generate.py | 16 ++--- src/open_r1/grpo.py | 14 +---- src/open_r1/rewards.py | 58 +++++------------ src/open_r1/sft.py | 6 +- src/open_r1/utils/callbacks.py | 4 +- src/open_r1/utils/evaluation.py | 12 +--- src/open_r1/utils/hub.py | 33 +++------- src/open_r1/utils/ioi/morph_client.py | 8 +-- src/open_r1/utils/ioi/piston_client.py | 61 +++++------------- src/open_r1/utils/ioi/scoring.py | 32 +++------- src/open_r1/utils/ioi/utils.py | 4 +- src/open_r1/utils/model_utils.py | 12 +--- src/open_r1/utils/routed_morph.py | 18 ++---- src/open_r1/utils/routed_sandbox.py | 10 +-- tests/slow/test_code_reward.py | 86 ++++++-------------------- tests/test_rewards.py | 68 +++++--------------- 17 files changed, 110 insertions(+), 360 deletions(-) diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 17b4cb89e..642089dd3 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -34,9 +34,7 @@ class GRPOConfig(trl.GRPOConfig): default_factory=lambda: [], metadata={"help": "The callbacks to run during training."}, ) - chat_template: Optional[str] = field( - default=None, metadata={"help": "The chat template to use."} - ) + chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."}) system_prompt: Optional[str] = field( default=None, metadata={"help": "The optional system prompt to use."}, @@ -44,12 +42,8 @@ class GRPOConfig(trl.GRPOConfig): hub_model_revision: Optional[str] = field( default="main", metadata={"help": "The Hub model branch to push the model to."} ) - overwrite_hub_revision: bool = field( - default=False, metadata={"help": "Whether to overwrite the Hub revision."} - ) - push_to_hub_revision: bool = field( - default=False, metadata={"help": "Whether to push to a Hub revision/branch."} - ) + overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."}) + push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."}) wandb_entity: Optional[str] = field( default=None, metadata={"help": ("The entity to store runs under.")}, @@ -78,9 +72,7 @@ class SFTConfig(trl.SFTConfig): default_factory=lambda: [], metadata={"help": "The callbacks to run during training."}, ) - chat_template: Optional[str] = field( - default=None, metadata={"help": "The chat template to use."} - ) + chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."}) system_prompt: Optional[str] = field( default=None, metadata={"help": "The optional system prompt 
to use for benchmarking."}, @@ -89,12 +81,8 @@ class SFTConfig(trl.SFTConfig): default="main", metadata={"help": "The Hub model branch to push the model to."}, ) - overwrite_hub_revision: bool = field( - default=False, metadata={"help": "Whether to overwrite the Hub revision."} - ) - push_to_hub_revision: bool = field( - default=False, metadata={"help": "Whether to push to a Hub revision/branch."} - ) + overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."}) + push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."}) wandb_entity: Optional[str] = field( default=None, metadata={"help": ("The entity to store runs under.")}, @@ -163,9 +151,7 @@ class GRPOScriptArguments(trl.ScriptArguments): ) repetition_max_penalty: float = field( default=-1.0, - metadata={ - "help": "Maximum (negative) penalty for for repetition penalty reward" - }, + metadata={"help": "Maximum (negative) penalty for for repetition penalty reward"}, ) code_language: str = field( default="python", diff --git a/src/open_r1/generate.py b/src/open_r1/generate.py index a002632cf..564dca071 100644 --- a/src/open_r1/generate.py +++ b/src/open_r1/generate.py @@ -53,9 +53,7 @@ def build_distilabel_pipeline( generation_kwargs=generation_kwargs, ), template=prompt_template, - input_mappings=( - {"instruction": prompt_column} if prompt_column is not None else {} - ), + input_mappings=({"instruction": prompt_column} if prompt_column is not None else {}), input_batch_size=input_batch_size, num_generations=num_generations, group_generations=True, @@ -70,9 +68,7 @@ def build_distilabel_pipeline( from datasets import load_dataset - parser = argparse.ArgumentParser( - description="Run distilabel pipeline for generating responses with DeepSeek R1" - ) + parser = argparse.ArgumentParser(description="Run distilabel pipeline for generating responses with DeepSeek R1") parser.add_argument( "--hf-dataset", type=str, @@ -179,12 +175,8 @@ def build_distilabel_pipeline( print(f" {arg}: {value}") print() - print( - f"Loading '{args.hf_dataset}' (config: {args.hf_dataset_config}, split: {args.hf_dataset_split}) dataset..." - ) - dataset = load_dataset( - args.hf_dataset, args.hf_dataset_config, split=args.hf_dataset_split - ) + print(f"Loading '{args.hf_dataset}' (config: {args.hf_dataset_config}, split: {args.hf_dataset_split}) dataset...") + dataset = load_dataset(args.hf_dataset, args.hf_dataset_config, split=args.hf_dataset_split) print("Dataset loaded!") pipeline = build_distilabel_pipeline( diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index bbec2e5e9..b2d6aa1f3 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -89,18 +89,14 @@ def main(script_args, training_args, model_args): reward_funcs = get_reward_funcs(script_args) # Format into conversation - def make_conversation( - example, prompt_column: str = script_args.dataset_prompt_column - ): + def make_conversation(example, prompt_column: str = script_args.dataset_prompt_column): prompt = [] if training_args.system_prompt is not None: prompt.append({"role": "system", "content": training_args.system_prompt}) if prompt_column not in example: - raise ValueError( - f"Dataset Question Field Error: {prompt_column} is not supported." 
- ) + raise ValueError(f"Dataset Question Field Error: {prompt_column} is not supported.") prompt.append({"role": "user", "content": example[prompt_column]}) return {"prompt": prompt} @@ -119,11 +115,7 @@ def make_conversation( reward_funcs=reward_funcs, args=training_args, train_dataset=dataset[script_args.dataset_train_split], - eval_dataset=( - dataset[script_args.dataset_test_split] - if training_args.eval_strategy != "no" - else None - ), + eval_dataset=(dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None), peft_config=get_peft_config(model_args), callbacks=get_callbacks(training_args, model_args), processing_class=tokenizer, diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index dfbb3fb47..33d735dc2 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -35,9 +35,7 @@ ) -def accuracy_reward( - completions: list[list[dict[str, str]]], solution: list[str], **kwargs -) -> list[Optional[float]]: +def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str], **kwargs) -> list[Optional[float]]: """Reward function that checks if the completion is the same as the ground truth.""" contents = [completion[0]["content"] for completion in completions] rewards = [] @@ -71,9 +69,7 @@ def accuracy_reward( try: reward = float(verify(gold_parsed, answer_parsed)) except Exception as e: - print( - f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}" - ) + print(f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}") reward = None else: # If the gold solution is not parseable, we assign `None` to skip this example @@ -88,10 +84,7 @@ def format_reward(completions, **kwargs): """Reward function that checks if the reasoning process is enclosed within and tags, while the final answer is enclosed within and tags.""" pattern = r"^\n.*?\n\n\n.*?\n$" completion_contents = [completion[0]["content"] for completion in completions] - matches = [ - re.match(pattern, content, re.DOTALL | re.MULTILINE) - for content in completion_contents - ] + matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents] return [1.0 if match else 0.0 for match in matches] @@ -134,9 +127,7 @@ def reasoning_steps_reward(completions, **kwargs): return [min(1.0, count / 3) for count in matches] -def len_reward( - completions: list[Dict[str, str]], solution: list[str], **kwargs -) -> float: +def len_reward(completions: list[Dict[str, str]], solution: list[str], **kwargs) -> float: """Compute length-based rewards to discourage overthinking and promote token efficiency. Taken from the Kimi 1.5 tech report: https://arxiv.org/abs/2501.12599 @@ -289,9 +280,7 @@ def cosine_scaled_reward(completions, solution, **kwargs): return cosine_scaled_reward -def get_repetition_penalty_reward( - ngram_size: int, max_penalty: float, language: str = "en" -): +def get_repetition_penalty_reward(ngram_size: int, max_penalty: float, language: str = "en"): """ Computes N-gram repetition penalty as described in Appendix C.2 of https://arxiv.org/abs/2502.03373. 
Reference implementation from: https://github.com/eddycmu/demystify-long-cot/blob/release/openrlhf/openrlhf/reward/repetition.py @@ -373,9 +362,7 @@ def _init_event_loop(): return loop -def ioi_code_reward( - completions, test_batch_size: int = 1, provider_type: str = "piston", **kwargs -) -> list[float]: +def ioi_code_reward(completions, test_batch_size: int = 1, provider_type: str = "piston", **kwargs) -> list[float]: """Reward function that evaluates IOI problems using a specified execution client. Assumes the dataset has the same format as hf.co/datasets/open-r1/ioi @@ -407,9 +394,7 @@ async def run_catch_exceptions(task): print(f"Error from {provider_type} worker: {e}") return SubtaskResult() - problems_data = [ - dict(zip(kwargs.keys(), values)) for values in zip(*kwargs.values()) - ] + problems_data = [dict(zip(kwargs.keys(), values)) for values in zip(*kwargs.values())] loop = _init_event_loop() evals = [ @@ -521,17 +506,13 @@ def evaluate_code(code, test_cases): evaluate_code(code_snippet, test_cases) """ - code_snippets = [ - extract_code(completion[-1]["content"]) for completion in completions - ] + code_snippets = [extract_code(completion[-1]["content"]) for completion in completions] verification_info = kwargs["verification_info"] template = evaluation_script_template scripts = [ - template.format( - code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"])) - ) + template.format(code=json.dumps(code), test_cases=json.dumps(json.dumps(info["test_cases"]))) for code, info in zip(code_snippets, verification_info) ] @@ -540,9 +521,7 @@ def evaluate_code(code, test_cases): if enforce_same_language: all_same_language = all(v["language"] == language for v in verification_info) if not all_same_language: - raise ValueError( - "All verification_info must have the same language", verification_info - ) + raise ValueError("All verification_info must have the same language", verification_info) execution_provider = get_provider( provider_type=provider_type, @@ -559,16 +538,11 @@ def get_code_format_reward(language: str = "python"): Args: language: Programming language supported by E2B https://e2b.dev/docs/code-interpreting/supported-languages """ - pattern = ( - rf"^\n.*?\n\n\n.*?```{language}.*?```.*?\n$" - ) + pattern = rf"^\n.*?\n\n\n.*?```{language}.*?```.*?\n$" def code_format_reward(completions, **kwargs): completion_contents = [completion[0]["content"] for completion in completions] - matches = [ - re.match(pattern, content, re.DOTALL | re.MULTILINE) - for content in completion_contents - ] + matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents] return [1.0 if match else 0.0 for match in matches] return code_format_reward @@ -596,9 +570,7 @@ def get_reward_funcs(script_args) -> list[Callable]: code_reward, num_parallel=script_args.parallel_code_exec_per_proc, provider_type=script_args.code_provider, - enforce_same_language=getattr( - script_args, "enforce_same_language", False - ), + enforce_same_language=getattr(script_args, "enforce_same_language", False), ), code_reward, ), @@ -607,9 +579,7 @@ def get_reward_funcs(script_args) -> list[Callable]: binary_code_reward, num_parallel=script_args.parallel_code_exec_per_proc, provider_type=script_args.code_provider, - enforce_same_language=getattr( - script_args, "enforce_same_language", False - ), + enforce_same_language=getattr(script_args, "enforce_same_language", False), ), binary_code_reward, ), diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 
6110feded..7589fa778 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -115,11 +115,7 @@ def main(script_args, training_args, model_args): model=model, args=training_args, train_dataset=dataset[script_args.dataset_train_split], - eval_dataset=( - dataset[script_args.dataset_test_split] - if training_args.eval_strategy != "no" - else None - ), + eval_dataset=(dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None), processing_class=tokenizer, peft_config=get_peft_config(model_args), callbacks=get_callbacks(training_args, model_args), diff --git a/src/open_r1/utils/callbacks.py b/src/open_r1/utils/callbacks.py index 323966b64..88e656243 100644 --- a/src/open_r1/utils/callbacks.py +++ b/src/open_r1/utils/callbacks.py @@ -28,9 +28,7 @@ def is_slurm_available() -> bool: # returns true if a slurm queueing system is available try: - subprocess.run( - ["sinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) + subprocess.run(["sinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) return True except FileNotFoundError: return False diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index 7eeccb2b7..b985d9a71 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -45,9 +45,7 @@ def register_lighteval_task( is_custom_task (bool, optional): Whether the task is a custom task. Defaults to False. """ # Format task list in lighteval format - task_list = ",".join( - f"{eval_suite}|{task}|{num_fewshot}|0" for task in task_list.split(",") - ) + task_list = ",".join(f"{eval_suite}|{task}|{num_fewshot}|0" for task in task_list.split(",")) configs[task_name] = task_list @@ -58,9 +56,7 @@ def register_lighteval_task( register_lighteval_task(LIGHTEVAL_TASKS, "lighteval", "aime25", "aime25", 0) register_lighteval_task(LIGHTEVAL_TASKS, "lighteval", "gpqa", "gpqa:diamond", 0) register_lighteval_task(LIGHTEVAL_TASKS, "extended", "lcb", "lcb:codegeneration", 0) -register_lighteval_task( - LIGHTEVAL_TASKS, "extended", "lcb_v4", "lcb:codegeneration_v4", 0 -) +register_lighteval_task(LIGHTEVAL_TASKS, "extended", "lcb_v4", "lcb:codegeneration_v4", 0) def get_lighteval_tasks(): @@ -107,9 +103,7 @@ def run_lighteval_job( subprocess.run(cmd, check=True) -def run_benchmark_jobs( - training_args: Union["SFTConfig", "GRPOConfig"], model_args: "ModelConfig" -) -> None: +def run_benchmark_jobs(training_args: Union["SFTConfig", "GRPOConfig"], model_args: "ModelConfig") -> None: benchmarks = training_args.benchmarks if len(benchmarks) == 1 and benchmarks[0] == "all": benchmarks = get_lighteval_tasks() diff --git a/src/open_r1/utils/hub.py b/src/open_r1/utils/hub.py index 086d96928..25c4311c7 100644 --- a/src/open_r1/utils/hub.py +++ b/src/open_r1/utils/hub.py @@ -36,15 +36,11 @@ logger = logging.getLogger(__name__) -def push_to_hub_revision( - training_args: SFTConfig | GRPOConfig, extra_ignore_patterns=[] -) -> Future: +def push_to_hub_revision(training_args: SFTConfig | GRPOConfig, extra_ignore_patterns=[]) -> Future: """Pushes the model to branch on a Hub repo.""" # Create a repo if it doesn't exist yet - repo_url = create_repo( - repo_id=training_args.hub_model_id, private=True, exist_ok=True - ) + repo_url = create_repo(repo_id=training_args.hub_model_id, private=True, exist_ok=True) # Get initial commit to branch from initial_commit = list_repo_commits(training_args.hub_model_id)[-1] # Now create the branch we'll be pushing to @@ -66,9 +62,7 @@ def push_to_hub_revision( 
ignore_patterns=ignore_patterns, run_as_future=True, ) - logger.info( - f"Pushed to {repo_url} revision {training_args.hub_model_revision} successfully!" - ) + logger.info(f"Pushed to {repo_url} revision {training_args.hub_model_revision} successfully!") return future @@ -78,19 +72,14 @@ def check_hub_revision_exists(training_args: SFTConfig | GRPOConfig): if repo_exists(training_args.hub_model_id): if training_args.push_to_hub_revision is True: # First check if the revision exists - revisions = [ - rev.name for rev in list_repo_refs(training_args.hub_model_id).branches - ] + revisions = [rev.name for rev in list_repo_refs(training_args.hub_model_id).branches] # If the revision exists, we next check it has a README file if training_args.hub_model_revision in revisions: repo_files = list_repo_files( repo_id=training_args.hub_model_id, revision=training_args.hub_model_revision, ) - if ( - "README.md" in repo_files - and training_args.overwrite_hub_revision is False - ): + if "README.md" in repo_files and training_args.overwrite_hub_revision is False: raise ValueError( f"Revision {training_args.hub_model_revision} already exists. " "Use --overwrite_hub_revision to overwrite it." @@ -129,21 +118,15 @@ def get_param_count_from_repo_id(repo_id: str) -> int: return -1 -def get_gpu_count_for_vllm( - model_name: str, revision: str = "main", num_gpus: int = 8 -) -> int: +def get_gpu_count_for_vllm(model_name: str, revision: str = "main", num_gpus: int = 8) -> int: """vLLM enforces a constraint that the number of attention heads must be divisible by the number of GPUs and 64 must be divisible by the number of GPUs. This function calculates the number of GPUs to use for decoding based on the number of attention heads in the model. """ - config = AutoConfig.from_pretrained( - model_name, revision=revision, trust_remote_code=True - ) + config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) # Get number of attention heads num_heads = config.num_attention_heads # Reduce num_gpus so that num_heads is divisible by num_gpus and 64 is divisible by num_gpus while num_heads % num_gpus != 0 or 64 % num_gpus != 0: - logger.info( - f"Reducing num_gpus from {num_gpus} to {num_gpus - 1} to make num_heads divisible by num_gpus" - ) + logger.info(f"Reducing num_gpus from {num_gpus} to {num_gpus - 1} to make num_heads divisible by num_gpus") num_gpus -= 1 return num_gpus diff --git a/src/open_r1/utils/ioi/morph_client.py b/src/open_r1/utils/ioi/morph_client.py index 847e7b4e7..559b7f8a2 100644 --- a/src/open_r1/utils/ioi/morph_client.py +++ b/src/open_r1/utils/ioi/morph_client.py @@ -317,7 +317,7 @@ async def execute(self, data: Dict[str, Any]) -> Tuple[str, str]: except asyncio.TimeoutError: if attempt < max_retries: - print(f"Execution timed out, retrying ({attempt+1}/{max_retries})") + print(f"Execution timed out, retrying ({attempt + 1}/{max_retries})") else: return "0", "Execution timed out after multiple retries" @@ -327,7 +327,7 @@ async def execute(self, data: Dict[str, Any]) -> Tuple[str, str]: retry_delay = min(base_delay * (2**attempt), 30) # Exponential backoff, capped at 30 seconds print( - f"Execution failed with {type(e).__name__}: {str(e)}, retrying in {retry_delay:.2f}s ({attempt+1}/{max_retries})" + f"Execution failed with {type(e).__name__}: {str(e)}, retrying in {retry_delay:.2f}s ({attempt + 1}/{max_retries})" ) await asyncio.sleep(retry_delay) else: @@ -379,9 +379,7 @@ async def _get_or_create_base_snapshot(self): async with temp_instance: # Install dependencies 
- await temp_instance.aexec( - "apt-get update && " "apt-get install -y build-essential cmake g++" - ) + await temp_instance.aexec("apt-get update && apt-get install -y build-essential cmake g++") # Create workspace directory await temp_instance.aexec( diff --git a/src/open_r1/utils/ioi/piston_client.py b/src/open_r1/utils/ioi/piston_client.py index fc281ec62..86ebe9d13 100644 --- a/src/open_r1/utils/ioi/piston_client.py +++ b/src/open_r1/utils/ioi/piston_client.py @@ -20,16 +20,10 @@ def get_piston_client_from_env(): raise ValueError( "For IOI problems Piston endpoints running our IOI package are required. Please add a list of valid Piston endpoints to a PISTON_ENDPOINTS varialbe in a `.env` file." ) - piston_endpoints = ( - piston_endpoints.split(",") - if piston_endpoints != "slurm" - else get_slurm_piston_endpoints() - ) + piston_endpoints = piston_endpoints.split(",") if piston_endpoints != "slurm" else get_slurm_piston_endpoints() random.shuffle(piston_endpoints) max_requests_per_endpoint = os.getenv("PISTON_MAX_REQUESTS_PER_ENDPOINT", "1") - return PistonClient( - piston_endpoints, max_requests_per_endpoint=int(max_requests_per_endpoint) - ) + return PistonClient(piston_endpoints, max_requests_per_endpoint=int(max_requests_per_endpoint)) class PistonClient: @@ -62,17 +56,11 @@ def __init__( max_requests_per_endpoint=1, ): self.max_requests_per_endpoint = max_requests_per_endpoint - self.base_endpoints = ( - [base_endpoint] if isinstance(base_endpoint, str) else base_endpoint - ) - self.endpoint_ids = { - endpoint: i for i, endpoint in enumerate(self.base_endpoints) - } + self.base_endpoints = [base_endpoint] if isinstance(base_endpoint, str) else base_endpoint + self.endpoint_ids = {endpoint: i for i, endpoint in enumerate(self.base_endpoints)} self._session = session - self.endpoint_tokens = asyncio.Queue( - maxsize=max_requests_per_endpoint * len(self.base_endpoints) - ) + self.endpoint_tokens = asyncio.Queue(maxsize=max_requests_per_endpoint * len(self.base_endpoints)) for _ in range(max_requests_per_endpoint): for base_endpoint in self.base_endpoints: @@ -112,24 +100,17 @@ async def _send_request(self, endpoint, route, data=None, method="post"): async def _send_to_all(self, route, data=None, method="post"): return await asyncio.gather( - *[ - self._send_request(endpoint, route, data, method) - for endpoint in self.base_endpoints - ] + *[self._send_request(endpoint, route, data, method) for endpoint in self.base_endpoints] ) async def _send_to_one(self, endpoint, route, data=None, method="post"): return await self._send_request(endpoint, route, data, method) async def install_package(self, language, version): - return await self._send_to_all( - "packages", {"language": language, "version": version}, method="post" - ) + return await self._send_to_all("packages", {"language": language, "version": version}, method="post") async def uninstall_package(self, language, version): - return await self._send_to_all( - "packages", {"language": language, "version": version}, method="delete" - ) + return await self._send_to_all("packages", {"language": language, "version": version}, method="delete") async def get_supported_runtimes(self): return await self._send_to_all("runtimes", method="get") @@ -214,14 +195,8 @@ async def _send_execute(self, data): if res_json is None: raise PistonError(f"Empty response. 
status={status}") # piston overloaded - if ( - "run" in res_json - and "Resource temporarily unavailable" - in res_json["run"].get("stderr", "") - ): - raise PistonError( - f"Piston overloaded: {res_json['run']['stderr']}" - ) + if "run" in res_json and "Resource temporarily unavailable" in res_json["run"].get("stderr", ""): + raise PistonError(f"Piston overloaded: {res_json['run']['stderr']}") return res_json except ( @@ -233,21 +208,13 @@ async def _send_execute(self, data): # Only retry if we haven't reached max retries yet if attempt < max_retries: # Calculate backoff with jitter - delay = min( - base_delay * (2**attempt), 10 - ) # Exponential backoff, capped at 10 seconds - jitter = ( - delay * 0.2 * (2 * asyncio.get_event_loop().time() % 1 - 0.5) - ) # Add ±10% jitter + delay = min(base_delay * (2**attempt), 10) # Exponential backoff, capped at 10 seconds + jitter = delay * 0.2 * (2 * asyncio.get_event_loop().time() % 1 - 0.5) # Add ±10% jitter retry_delay = delay + jitter - print( - f"Retrying in {retry_delay} seconds [{self.endpoint_ids[endpoint]}] {endpoint}" - ) + print(f"Retrying in {retry_delay} seconds [{self.endpoint_ids[endpoint]}] {endpoint}") # special case: worker died - if isinstance( - e, aiohttp.ClientConnectionError - ) and "Connect call failed" in str(e): + if isinstance(e, aiohttp.ClientConnectionError) and "Connect call failed" in str(e): await self._check_failed_endpoint(endpoint) else: # hopefully we won't get this one again diff --git a/src/open_r1/utils/ioi/scoring.py b/src/open_r1/utils/ioi/scoring.py index a93eb88ce..1595fc602 100644 --- a/src/open_r1/utils/ioi/scoring.py +++ b/src/open_r1/utils/ioi/scoring.py @@ -95,8 +95,7 @@ def weighted_score(self): 0 if not self.test_results else round( - min([test_result.score for test_result in self.test_results]) - * self.points, + min([test_result.score for test_result in self.test_results]) * self.points, self.score_precision, ) ) @@ -171,9 +170,7 @@ async def score_single_test_case( TestResult: Result of the test case execution """ # Run submission for this test case - score, feedback = await run_submission( - client, subtask, test_input, submission, test_output - ) + score, feedback = await run_submission(client, subtask, test_input, submission, test_output) score = float(score) return TestResult( @@ -233,18 +230,14 @@ async def score_subtask( # we skip submissions where no code was extracted # no need to do anything, as we have a failed cached result if not submission or any( - test_result.status != "SKIPPED" and test_result.score == 0.0 - for test_result in subtask_result.test_results + test_result.status != "SKIPPED" and test_result.score == 0.0 for test_result in subtask_result.test_results ): return subtask_result if "test_cases" in subtask: test_cases = subtask["test_cases"] if isinstance(subtask["test_cases"], list): - test_cases = { - test_name: test - for test_name, test in zip(subtask["test_names"], subtask["test_cases"]) - } + test_cases = {test_name: test for test_name, test in zip(subtask["test_names"], subtask["test_cases"])} else: test_cases = load_ioi_tests(subtask["year"], subtask["id"]) @@ -295,10 +288,7 @@ async def score_subtasks( # avoid rerunning tests present in multiple subtasks test_case_run_cache = {} - return [ - await score_subtask(client, subtask, submission, test_case_run_cache, skip_mode) - for subtask in subtasks - ] + return [await score_subtask(client, subtask, submission, test_case_run_cache, skip_mode) for subtask in subtasks] async def run_submission( @@ -328,17 +318,9 @@ async 
def run_submission( # pass the input {"name": "input.txt", "content": test_input}, # pass the expected output - *( - [{"name": "correct_output.txt", "content": test_output}] - if test_output - else [] - ), + *([{"name": "correct_output.txt", "content": test_output}] if test_output else []), # grader files - *( - {"name": name, "content": content} - for name, content in problem["grader_files"] - if content - ), + *({"name": name, "content": content} for name, content in problem["grader_files"] if content), ], "run_timeout": round( (problem["time_limit"] + 3) * 1000 diff --git a/src/open_r1/utils/ioi/utils.py b/src/open_r1/utils/ioi/utils.py index 2f9c392c5..0719f4a3f 100644 --- a/src/open_r1/utils/ioi/utils.py +++ b/src/open_r1/utils/ioi/utils.py @@ -28,9 +28,7 @@ def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: """ Load IOI tests for a given year. """ - tests_dataset = load_dataset( - "open-r1/ioi-test-cases", name=f"{year}", split="train" - ) + tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") test_cases = defaultdict(dict) for test_case in tests_dataset: test_cases[test_case["problem_id"]][test_case["test_name"]] = ( diff --git a/src/open_r1/utils/model_utils.py b/src/open_r1/utils/model_utils.py index 9ae30415b..8191c17ea 100644 --- a/src/open_r1/utils/model_utils.py +++ b/src/open_r1/utils/model_utils.py @@ -6,9 +6,7 @@ from ..configs import GRPOConfig, SFTConfig -def get_tokenizer( - model_args: ModelConfig, training_args: SFTConfig | GRPOConfig -) -> PreTrainedTokenizer: +def get_tokenizer(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> PreTrainedTokenizer: """Get the tokenizer for the model.""" tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, @@ -22,14 +20,10 @@ def get_tokenizer( return tokenizer -def get_model( - model_args: ModelConfig, training_args: SFTConfig | GRPOConfig -) -> AutoModelForCausalLM: +def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> AutoModelForCausalLM: """Get the model""" torch_dtype = ( - model_args.torch_dtype - if model_args.torch_dtype in ["auto", None] - else getattr(torch, model_args.torch_dtype) + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) ) quantization_config = get_quantization_config(model_args) model_kwargs = dict( diff --git a/src/open_r1/utils/routed_morph.py b/src/open_r1/utils/routed_morph.py index e179e5ede..835c784af 100644 --- a/src/open_r1/utils/routed_morph.py +++ b/src/open_r1/utils/routed_morph.py @@ -66,9 +66,7 @@ def run_code( """ actual_timeout = timeout if timeout is not None else self.timeout - actual_request_timeout = ( - request_timeout if request_timeout is not None else self.request_timeout - ) + actual_request_timeout = request_timeout if request_timeout is not None else self.request_timeout # Default to Python for all scripts if languages is not provided if languages is None: @@ -82,22 +80,16 @@ def run_code( } try: - endpoint = f"http://{self.router_url}/execute_batch" - response = requests.post( - endpoint, json=payload, timeout=actual_request_timeout - ) + response = requests.post(endpoint, json=payload, timeout=actual_request_timeout) if response.status_code != 200: - error = f"Request to MorphCloud router failed with status code: {response.status_code}" print(error) results = [] for _ in scripts: - results.append( - type("obj", (object,), {"text": None, "exception_str": error}) - ) + results.append(type("obj", 
(object,), {"text": None, "exception_str": error})) return results response_data = response.json() @@ -124,7 +116,5 @@ def run_code( results = [] for _ in scripts: - results.append( - type("obj", (object,), {"text": None, "exception_str": error}) - ) + results.append(type("obj", (object,), {"text": None, "exception_str": error})) return results diff --git a/src/open_r1/utils/routed_sandbox.py b/src/open_r1/utils/routed_sandbox.py index 24737621c..97bb65cf4 100644 --- a/src/open_r1/utils/routed_sandbox.py +++ b/src/open_r1/utils/routed_sandbox.py @@ -76,9 +76,7 @@ def run_code( } # Send the request to the E2B Router - response = requests.post( - f"http://{self.router_url}/execute_batch", json=payload - ) + response = requests.post(f"http://{self.router_url}/execute_batch", json=payload) if not response.ok: print(f"Request failed with status code: {response.status_code}") @@ -94,11 +92,7 @@ def run_code( execution = Execution( results=[Result(**r) for r in result["execution"]["results"]], logs=result["execution"]["logs"], - error=( - ExecutionError(**result["execution"]["error"]) - if result["execution"]["error"] - else None - ), + error=(ExecutionError(**result["execution"]["error"]) if result["execution"]["error"] else None), execution_count=result["execution"]["execution_count"], ) output.append(execution) diff --git a/tests/slow/test_code_reward.py b/tests/slow/test_code_reward.py index d19acd06a..8718eb35a 100644 --- a/tests/slow/test_code_reward.py +++ b/tests/slow/test_code_reward.py @@ -24,66 +24,41 @@ class TestCodeRewards(unittest.TestCase): - def test_python_code_reward(self): # requires E2B, see the README.md file - code_dataset = load_dataset( - "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" - ) + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled") NUM_SAMPLES = 20 samples = code_dataset["train"].select(range(NUM_SAMPLES)) - test_completions = [ - [{"content": sample["gold_standard_solution"]}] for sample in samples - ] - reward_kwargs = { - "verification_info": [sample["verification_info"] for sample in samples] - } + test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] + reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]} rewards = code_reward(test_completions, **reward_kwargs) print(rewards) assert rewards == [1.0] * NUM_SAMPLES def test_e2b_router(self): # run router locally: python scripts/e2b_router.py - code_dataset = load_dataset( - "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" - ) + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled") NUM_SAMPLES = 128 samples = code_dataset["train"].select(range(NUM_SAMPLES)) - test_completions = [ - [{"content": sample["gold_standard_solution"]}] for sample in samples - ] - reward_kwargs = { - "verification_info": [sample["verification_info"] for sample in samples] - } - rewards = code_reward( - test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs - ) + test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] + reward_kwargs = {"verification_info": [sample["verification_info"] for sample in samples]} + rewards = code_reward(test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs) print(rewards) assert rewards == [1.0] * NUM_SAMPLES def test_e2b_router_parallel(self): # run router locally: python scripts/e2b_router.py - code_dataset = load_dataset( 
- "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" - ) + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled") BATCH_SIZE = 32 NUM_SAMPLES = 256 def batch_code_reward(examples): - test_completions = [ - [{"content": solution}] - for solution in examples["gold_standard_solution"] - ] + test_completions = [[{"content": solution}] for solution in examples["gold_standard_solution"]] reward_kwargs = { - "verification_info": [ - verification_info - for verification_info in examples["verification_info"] - ] + "verification_info": [verification_info for verification_info in examples["verification_info"]] } - rewards = code_reward( - test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs - ) + rewards = code_reward(test_completions, e2b_router_url="0.0.0.0:8000", **reward_kwargs) assert rewards == [1.0] * BATCH_SIZE return examples @@ -102,10 +77,7 @@ def test_ioi_code_reward(self): code_dataset = load_dataset("open-r1/ioi-reward-test-dataset") NUM_SAMPLES = 16 samples = code_dataset["train"].select(range(NUM_SAMPLES)) - test_completions = [ - [{"content": f"```cpp\n{sample['sample_solution']}```"}] - for sample in samples - ] + test_completions = [[{"content": f"```cpp\n{sample['sample_solution']}```"}] for sample in samples] keys = [key for key in samples[0] if key not in ["prompt", "completion"]] reward_kwargs = {key: [example[key] for example in samples] for key in keys} rewards = ioi_code_reward(test_completions, **reward_kwargs) @@ -128,9 +100,7 @@ def test_e2b_router_run_code_success(self): assert isinstance(result, Execution) # assert result.exit_code == 0 assert result.error is None - assert ( - "hello" in result.logs["stdout"][0] or "4" in result.logs["stdout"][0] - ) + assert "hello" in result.logs["stdout"][0] or "4" in result.logs["stdout"][0] def test_e2b_router_run_code_with_error(self): # run router locally: python scripts/e2b_router.py @@ -156,14 +126,10 @@ def test_e2b_router_run_code_with_error(self): def test_python_code_reward_morph(self): # requires MorphCloud, see the README.md file - code_dataset = load_dataset( - "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" - ) + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled") NUM_SAMPLES = 20 samples = code_dataset["train"].select(range(NUM_SAMPLES)) - test_completions = [ - [{"content": sample["gold_standard_solution"]}] for sample in samples - ] + test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] reward_kwargs = { "verification_info": [sample["verification_info"] for sample in samples], "provider_type": "morph", @@ -174,14 +140,10 @@ def test_python_code_reward_morph(self): def test_morph_router(self): # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20 - code_dataset = load_dataset( - "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" - ) + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled") NUM_SAMPLES = 32 samples = code_dataset["train"].select(range(NUM_SAMPLES)) - test_completions = [ - [{"content": sample["gold_standard_solution"]}] for sample in samples - ] + test_completions = [[{"content": sample["gold_standard_solution"]}] for sample in samples] reward_kwargs = { "verification_info": [sample["verification_info"] for sample in samples], "provider_type": "morph", @@ -193,23 +155,15 @@ def 
test_morph_router(self): def test_morph_router_parallel(self): # run router locally: python scripts/morph_router.py --port 8001 --max_num_sandboxes 20 - code_dataset = load_dataset( - "open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled" - ) + code_dataset = load_dataset("open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled") BATCH_SIZE = 32 NUM_SAMPLES = 256 def batch_code_reward(examples): - test_completions = [ - [{"content": solution}] - for solution in examples["gold_standard_solution"] - ] + test_completions = [[{"content": solution}] for solution in examples["gold_standard_solution"]] reward_kwargs = { - "verification_info": [ - verification_info - for verification_info in examples["verification_info"] - ], + "verification_info": [verification_info for verification_info in examples["verification_info"]], "provider_type": "morph", "morph_router_url": "0.0.0.0:8001", } diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 7ea6f350e..74ddc0907 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -98,13 +98,7 @@ def test_accuracy_reward_wrong_answer_no_latex(self): def test_format_reward_correct(self): """Test format_reward with correct format.""" - completion = [ - [ - { - "content": "\nSome reasoning\n\n\nThe answer\n" - } - ] - ] + completion = [[{"content": "\nSome reasoning\n\n\nThe answer\n"}]] rewards = format_reward(completion) self.assertEqual(rewards[0], 1.0) @@ -229,12 +223,8 @@ def test_different_lengths_correct_answers(self): solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = len_reward(completions, solutions) - self.assertGreater( - rewards[0], rewards[1] - ) # shorter answer should get higher reward - self.assertAlmostEqual( - rewards[0], 0.5 - ) # shortest correct answer gets maximum reward + self.assertGreater(rewards[0], rewards[1]) # shorter answer should get higher reward + self.assertAlmostEqual(rewards[0], 0.5) # shortest correct answer gets maximum reward def test_different_lengths_incorrect_answers(self): """Test len_reward with different length incorrect answers.""" @@ -245,13 +235,9 @@ def test_different_lengths_incorrect_answers(self): solutions = [r"\frac{63}{400}", r"\frac{63}{400}"] rewards = len_reward(completions, solutions) - self.assertLessEqual( - rewards[0], 0.0 - ) # incorrect answers should get non-positive rewards + self.assertLessEqual(rewards[0], 0.0) # incorrect answers should get non-positive rewards self.assertLessEqual(rewards[1], 0.0) - self.assertGreater( - rewards[0], rewards[1] - ) # shorter answer should still be penalized less + self.assertGreater(rewards[0], rewards[1]) # shorter answer should still be penalized less def test_mixed_correctness(self): """Test len_reward with mix of correct and incorrect answers of different lengths.""" @@ -289,21 +275,15 @@ def test_unparseable_solution(self): solutions = ["unparseable_latex", "unparseable_latex"] rewards = len_reward(completions, solutions) - self.assertGreater( - rewards[0], rewards[1] - ) # shorter answer should still get better reward - self.assertAlmostEqual( - rewards[0], 0.5 - ) # treated as correct, shortest gets maximum reward + self.assertGreater(rewards[0], rewards[1]) # shorter answer should still get better reward + self.assertAlmostEqual(rewards[0], 0.5) # treated as correct, shortest gets maximum reward class TestRepetitionPenaltyReward(unittest.TestCase): def test_positive_max_penalty_raises_value_error(self): with self.assertRaises(ValueError): get_repetition_penalty_reward(ngram_size=2, 
max_penalty=1.0) - with self.assertRaisesRegex( - ValueError, "max_penalty 1.5 should not be positive" - ): + with self.assertRaisesRegex(ValueError, "max_penalty 1.5 should not be positive"): get_repetition_penalty_reward(ngram_size=2, max_penalty=1.5) def test_no_repetition(self): @@ -432,45 +412,31 @@ def test_long_completion_without_repetition(self): def test_tag_count_rewards_all_correct(self): """Test tag_count_reward with correct tags.""" - completion = [ - [ - { - "content": "\nSome reasoning\n\n\nThe answer\n" - } - ] - ] + completion = [[{"content": "\nSome reasoning\n\n\nThe answer\n"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 1.0) def test_tag_count_rewards_missing_think_begin(self): """Test tag_count_reward with missing tag.""" - completion = [ - [{"content": "Some reasoning\n\n\nThe answer\n"}] - ] + completion = [[{"content": "Some reasoning\n\n\nThe answer\n"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_think_end(self): """Test tag_count_reward with missing tag.""" - completion = [ - [{"content": "\nSome reasoning\n\nThe answer\n"}] - ] + completion = [[{"content": "\nSome reasoning\n\nThe answer\n"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_answer_begin(self): """Test tag_count_reward with missing tag.""" - completion = [ - [{"content": "\nSome reasoning\n\nThe answer\n"}] - ] + completion = [[{"content": "\nSome reasoning\n\nThe answer\n"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) def test_tag_count_rewards_missing_answer_end(self): """Test tag_count_reward with missing tag.""" - completion = [ - [{"content": "\nSome reasoning\n\n\nThe answer"}] - ] + completion = [[{"content": "\nSome reasoning\n\n\nThe answer"}]] rewards = tag_count_reward(completion) self.assertEqual(rewards[0], 0.75) @@ -481,16 +447,12 @@ def test_tag_count_rewards_missing_all_tags(self): self.assertEqual(rewards[0], 0.0) def test_full_repetition_with_language(self): - reward_fn = get_repetition_penalty_reward( - ngram_size=2, max_penalty=-1.0, language="en" - ) + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="en") completions = [[{"content": "that that that that that"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) # begin test for zh language - reward_fn = get_repetition_penalty_reward( - ngram_size=2, max_penalty=-1.0, language="zh" - ) + reward_fn = get_repetition_penalty_reward(ngram_size=2, max_penalty=-1.0, language="zh") completions = [[{"content": "这个这个这个这个这个"}]] rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) From 21b48fbe4604dc875493d716de83476adb11e248 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Fri, 9 May 2025 17:26:34 +0200 Subject: [PATCH 118/137] soft_overlong_punishment from DAPO paper (#638) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * soft_overlong_punishment_reward * tests * doc string updated * style * non-sensical import removed * Update src/open_r1/rewards.py Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> * Update src/open_r1/rewards.py Co-authored-by: lewtun * max_completion_length set to 3.6 * style * quality * test case added for Co-authored-by: Shirin Yamani <75791599+shirinyamani@users.noreply.github.com> Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> Co-authored-by: lewtun 
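
For intuition, the reward added in this patch follows Eq. (13) of the DAPO paper: completions up to `max_completion_len - soft_punish_cache` tokens receive no penalty, the penalty then ramps linearly from 0 to -1 over the final `soft_punish_cache` tokens, and anything longer than `max_completion_len` receives the full -1. A minimal standalone sketch of that behaviour (the lengths used below are illustrative, not defaults):

```python
# Standalone sketch of the soft overlong punishment (DAPO Eq. 13), mirroring
# the logic added to src/open_r1/rewards.py in the diff below.
def soft_overlong_punishment(completion_length: int, max_completion_len: int, soft_punish_cache: int) -> float:
    if completion_length <= max_completion_len - soft_punish_cache:
        return 0.0  # within budget: no penalty
    if completion_length <= max_completion_len:
        # linear ramp from 0 to -1 over the last `soft_punish_cache` tokens
        return (max_completion_len - soft_punish_cache - completion_length) / soft_punish_cache
    return -1.0  # over the hard limit: full penalty


for length in (50, 80, 90, 100, 110):
    print(length, soft_overlong_punishment(length, max_completion_len=100, soft_punish_cache=20))
# 50 -> 0.0, 80 -> 0.0, 90 -> -0.5, 100 -> -1.0, 110 -> -1.0
```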
--- src/open_r1/configs.py | 15 ++++++++++++++- src/open_r1/rewards.py | 30 ++++++++++++++++++++++++++++++ tests/test_rewards.py | 24 ++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 642089dd3..c16af30a3 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -104,7 +104,7 @@ class GRPOScriptArguments(trl.ScriptArguments): Args: reward_funcs (`list[str]`): - List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'tag_count', 'code', 'ioi_code', 'code_format'. + List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'cosine', 'repetition_penalty', 'length', 'tag_count', 'code', 'ioi_code', 'code_format', 'soft_overlong_punishment'. cosine_min_value_wrong (`float`): Minimum reward for cosine scaling for wrong answers. cosine_max_value_wrong (`float`): @@ -117,6 +117,10 @@ class GRPOScriptArguments(trl.ScriptArguments): Maximum length for cosine scaling. code_language (`str`): Language for code format reward. + max_completion_len (`int`): + Maximum number of tokens in completion. + soft_punish_cache (`int`): + Minimum number of tokens in completion. """ reward_funcs: list[str] = field( @@ -203,3 +207,12 @@ class GRPOScriptArguments(trl.ScriptArguments): "choices": ["piston", "morph"], }, ) + + max_completion_len: int = field( + default=16384, + metadata={"help": "Maximum number of characters in completion."}, + ) + soft_punish_cache: int = field( + default=4096, + metadata={"help": "Minimum number of characters in completion."}, + ) diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 33d735dc2..1f99ecd14 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -548,6 +548,32 @@ def code_format_reward(completions, **kwargs): return code_format_reward +def get_soft_overlong_punishment(max_completion_len, soft_punish_cache): + """ + Reward function that penalizes overlong completions. It is used to penalize overlong completions, + but not to reward shorter completions. Reference: Eq. (13) from the DAPO paper (https://huggingface.co/papers/2503.14476) + + Args: + max_completion_len: Maximum length of the completion + soft_punish_cache: Minimum length of the completion. If set to 0, no minimum length is applied. 
+ """ + + def soft_overlong_punishment_reward(completion_ids: list[list[int]], **kwargs) -> list[float]: + """Reward function that penalizes overlong completions.""" + rewards = [] + for ids in completion_ids: + completion_length = len(ids) + if completion_length <= max_completion_len - soft_punish_cache: + rewards.append(0.0) + elif max_completion_len - soft_punish_cache < completion_length <= max_completion_len: + rewards.append((max_completion_len - soft_punish_cache - completion_length) / soft_punish_cache) + else: + rewards.append(-1.0) + return rewards + + return soft_overlong_punishment_reward + + def get_reward_funcs(script_args) -> list[Callable]: REWARD_FUNCS_REGISTRY = { "accuracy": accuracy_reward, @@ -593,6 +619,10 @@ def get_reward_funcs(script_args) -> list[Callable]: ), "code_format": get_code_format_reward(language=script_args.code_language), "tag_count": tag_count_reward, + "soft_overlong_punishment": get_soft_overlong_punishment( + max_completion_len=script_args.max_completion_len, + soft_punish_cache=script_args.soft_punish_cache, + ), } reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs] diff --git a/tests/test_rewards.py b/tests/test_rewards.py index 74ddc0907..03ac517c9 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -24,6 +24,7 @@ get_cosine_scaled_reward, get_repetition_penalty_reward, get_reward_funcs, + get_soft_overlong_punishment, len_reward, reasoning_steps_reward, tag_count_reward, @@ -457,6 +458,29 @@ def test_full_repetition_with_language(self): rewards = reward_fn(completions) self.assertEqual(rewards, [-0.75]) + def test_soft_overlong_punishment_short_completion(self): + """Test soft overlong punishment reward function with a short completion.""" + # length 50, with max=100 and soft cache=20, reward should be 0. + reward_fn = get_soft_overlong_punishment(max_completion_len=100, soft_punish_cache=20) + completion_ids = [[1] * 50] # 50 <= 80 + rewards = reward_fn(completion_ids=completion_ids) + self.assertEqual(rewards, [0]) + + def test_soft_overlong_punishment_long_completion(self): + """Test soft overlong punishment reward function with a longer than max completion.""" + # 110 > 100, reward should be -1. 
+ reward_fn = get_soft_overlong_punishment(max_completion_len=100, soft_punish_cache=20) + completion_ids = [[1] * 110] + rewards = reward_fn(completion_ids) + self.assertEqual(rewards, [-1]) + + def test_soft_overlong_punishment_intermediate_completion(self): + """Test soft overlong punishment reward function for intermediate length completion.""" + reward_fn = get_soft_overlong_punishment(max_completion_len=100, soft_punish_cache=20) + completion_ids = [[1] * 90] # 90 is between 80 and 100 + rewards = reward_fn(completion_ids) + self.assertAlmostEqual(rewards[0], -0.5, places=4) + class TestCodeFormat(unittest.TestCase): def test_correct_python_format(self): From c802f00512fe5474d27bab9c0a55d0a6969e32d8 Mon Sep 17 00:00:00 2001 From: lewtun Date: Fri, 9 May 2025 17:42:36 +0200 Subject: [PATCH 119/137] Use pass@1 for all evals (#633) * Use pass@1 for all evals * Update scores --- README.md | 63 +++++++++++++++++++++++++++++++------------------------ setup.py | 2 +- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index d8f80f3c9..b270e074c 100644 --- a/README.md +++ b/README.md @@ -482,21 +482,30 @@ make evaluate MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B TASK=aime24 PARALLE ## Reproducing Deepseek's evaluation results -> [!NOTE] -> The DeepSeek-R1 paper uses sampling with 4-64 responses per query to estimate `pass@1` accuracy, but does not specify the specific number of responses per benchmark. For AIME 2024, we report the results from sampling 32 response per query, while for all others we report the accuracy from sampling 1 response. These choices likely explains the small 1-3σ discrepancies between our results and DeepSeek's. +The DeepSeek-R1 paper uses sampling with 4-64 responses per query to estimate `pass@1` accuracy, but does not specify the specific number of responses per benchmark. In the tables below, we estimate `pass@1` accuracy with the following number of responses per query: + +| Benchmark | Number of responses per query | +|:-------------:|:-----------------------------:| +| AIME 2024 | 64 | +| MATH-500 | 4 | +| GPQA Diamond | 8 | +| LiveCodeBench | 16 | + + +Note that for benchmarks like AIME24, it is important to sample many responses as there are only 30 problems and this can introduce high variance across repeated runs. The choice of how many responses to sample per prompt likely explains the small differences between our evaluation results and those reported by DeepSeek. 
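
Concretely, `pass@1` with n responses is just the average pass rate: each of the n sampled responses per problem is scored independently and the per-problem means are averaged over the benchmark. A minimal sketch of the estimator, assuming a benchmark-specific verifier that is not shown here (the nested-list input format is illustrative, not the exact lighteval data structure):

```python
# Sketch of the pass@1 estimator used for the tables below: score n sampled
# responses per problem with the benchmark's verifier, then average the
# per-problem pass rates over all problems.
from statistics import mean


def estimate_pass_at_1(per_problem_scores: list[list[int]]) -> float:
    """per_problem_scores[i][j] is 1 if response j to problem i is correct, else 0."""
    return mean(mean(scores) for scores in per_problem_scores)


# Example: 2 problems, 4 responses each -> (0.75 + 0.25) / 2 = 0.5
print(estimate_pass_at_1([[1, 0, 1, 1], [0, 0, 1, 0]]))
```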
### AIME 2024 We are able to reproduce Deepseek's reported results on the AIME 2024 benchmark within ~1-3 standard deviations: | Model | AIME 2024 (🤗 LightEval) | AIME 2024 (DeepSeek Reported) | -|:------------------------------|:-----------------------:|:----------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 30.6 | 28.9 | -| DeepSeek-R1-Distill-Qwen-7B | 52.8 | 55.5 | -| DeepSeek-R1-Distill-Qwen-14B | 65.6 | 69.7 | -| DeepSeek-R1-Distill-Qwen-32B | 71.0 | 72.6 | -| DeepSeek-R1-Distill-Llama-8B | 44.8 | 41.7 | -| DeepSeek-R1-Distill-Llama-70B | 63.0 | 70.0 | +|:------------------------------|:------------------------:|:-----------------------------:| +| DeepSeek-R1-Distill-Qwen-1.5B | 30.7 | 28.9 | +| DeepSeek-R1-Distill-Qwen-7B | 50.8 | 55.5 | +| DeepSeek-R1-Distill-Qwen-14B | 65.9 | 69.7 | +| DeepSeek-R1-Distill-Qwen-32B | 69.7 | 72.6 | +| DeepSeek-R1-Distill-Llama-8B | 43.9 | 41.7 | +| DeepSeek-R1-Distill-Llama-70B | 63.0 | 70.0 | To reproduce these results use the following command: @@ -523,12 +532,12 @@ We are able to reproduce Deepseek's reported results on the MATH-500 benchmark w | Model | MATH-500 (🤗 LightEval) | MATH-500 (DeepSeek Reported) | |:------------------------------|:-----------------------:|:----------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 84.4 | 83.9 | -| DeepSeek-R1-Distill-Qwen-7B | 94.4 | 92.8 | -| DeepSeek-R1-Distill-Qwen-14B | 94.2 | 93.9 | -| DeepSeek-R1-Distill-Qwen-32B | 95.8 | 94.3 | -| DeepSeek-R1-Distill-Llama-8B | 88.4 | 89.1 | -| DeepSeek-R1-Distill-Llama-70B | 96.0 | 94.5 | +| DeepSeek-R1-Distill-Qwen-1.5B | 83.1 | 83.9 | +| DeepSeek-R1-Distill-Qwen-7B | 94.5 | 92.8 | +| DeepSeek-R1-Distill-Qwen-14B | 94.1 | 93.9 | +| DeepSeek-R1-Distill-Qwen-32B | 95.6 | 94.3 | +| DeepSeek-R1-Distill-Llama-8B | 88.6 | 89.1 | +| DeepSeek-R1-Distill-Llama-70B | 95.1 | 94.5 | To reproduce these results use the following command: @@ -556,12 +565,12 @@ We are able to reproduce Deepseek's reported results on the GPQA Diamond benchma | Model | GPQA Diamond (🤗 LightEval) | GPQA Diamond (DeepSeek Reported) | |:------------------------------|:---------------------------:|:--------------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 36.9 | 33.8 | -| DeepSeek-R1-Distill-Qwen-7B | 51.6 | 49.1 | -| DeepSeek-R1-Distill-Qwen-14B | 59.6 | 59.1 | +| DeepSeek-R1-Distill-Qwen-1.5B | 35.8 | 33.8 | +| DeepSeek-R1-Distill-Qwen-7B | 50.5 | 49.1 | +| DeepSeek-R1-Distill-Qwen-14B | 61.5 | 59.1 | | DeepSeek-R1-Distill-Qwen-32B | 63.1 | 62.1 | -| DeepSeek-R1-Distill-Llama-8B | 54.0 | 49.0 | -| DeepSeek-R1-Distill-Llama-70B | 68.2 | 65.2 | +| DeepSeek-R1-Distill-Llama-8B | 46.7 | 49.0 | +| DeepSeek-R1-Distill-Llama-70B | 67.4 | 65.2 | To reproduce these results use the following command: @@ -586,13 +595,13 @@ python scripts/run_benchmarks.py --model-id {model_id} --benchmarks gpqa We are able to reproduce Deepseek's reported results on the LiveCodeBench code generation benchmark within ~1-3 standard deviations: | Model | LiveCodeBench (🤗 LightEval) | LiveCodeBench (DeepSeek Reported) | -|:------------------------------|:----------------------------:|:--------------------------------:| -| DeepSeek-R1-Distill-Qwen-1.5B | 16.1 | 16.9 | -| DeepSeek-R1-Distill-Qwen-7B | 37.4 | 37.6 | -| DeepSeek-R1-Distill-Qwen-14B | 51.3 | 53.1 | -| DeepSeek-R1-Distill-Qwen-32B | 56.0 | 57.2 | -| DeepSeek-R1-Distill-Llama-8B | 37.4 | 39.6 | -| DeepSeek-R1-Distill-Llama-70B | 55.9 | 57.5 | 
+|:------------------------------|:----------------------------:|:---------------------------------:| +| DeepSeek-R1-Distill-Qwen-1.5B | 16.1 | 16.9 | +| DeepSeek-R1-Distill-Qwen-7B | 37.4 | 37.6 | +| DeepSeek-R1-Distill-Qwen-14B | 51.3 | 53.1 | +| DeepSeek-R1-Distill-Qwen-32B | 56.0 | 57.2 | +| DeepSeek-R1-Distill-Llama-8B | 37.4 | 39.6 | +| DeepSeek-R1-Distill-Llama-70B | 55.9 | 57.5 | To reproduce these results use the following command: diff --git a/setup.py b/setup.py index ae2bcda7b..0026fb1c6 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", "liger-kernel>=0.5.6", - "lighteval @ git+https://github.com/huggingface/lighteval.git@989f5f5586de1ddfeceb0dfa5076bd0740d376fa", + "lighteval @ git+https://github.com/huggingface/lighteval.git@d50bc3072b8814656633400a1850c500c6aa2e39", "math-verify==0.5.2", # Used for math verification in grpo "morphcloud==0.1.67", "packaging>=23.0", From 4fc2a3ff82c6f652b20370f1977b74bf8f74336d Mon Sep 17 00:00:00 2001 From: lewtun Date: Fri, 9 May 2025 19:19:51 +0200 Subject: [PATCH 120/137] Add time to Slurm (#639) --- slurm/e2b_router.slurm | 1 + slurm/evaluate.slurm | 2 ++ slurm/morph_router.slurm | 2 ++ slurm/train.slurm | 2 ++ 4 files changed, 7 insertions(+) diff --git a/slurm/e2b_router.slurm b/slurm/e2b_router.slurm index 254130594..5f1e2a673 100644 --- a/slurm/e2b_router.slurm +++ b/slurm/e2b_router.slurm @@ -6,6 +6,7 @@ #SBATCH --output=/fsx/open-r1/logs/e2b_router/%x-%j.out #SBATCH --error=/fsx/open-r1/logs/e2b_router/%x-%j.err #SBATCH --requeue +#SBATCH --time=7-00:00:00 echo "Starting job" set -x -e diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 4b81594b7..d10a70a7d 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -5,6 +5,8 @@ #SBATCH --output=./logs/%x-%j.out #SBATCH --error=./logs/%x-%j.err #SBATCH --requeue +#SBATCH --time=1-00:00:00 + # Specific configuration optimized for the Hugging Face Compute Cluster # Be ye warned this may not work on other clusters! 
diff --git a/slurm/morph_router.slurm b/slurm/morph_router.slurm index 9bb3e79f1..6e0001b3d 100644 --- a/slurm/morph_router.slurm +++ b/slurm/morph_router.slurm @@ -6,6 +6,8 @@ #SBATCH --output=/fsx/open-r1/logs/morph_router/%x-%j.out #SBATCH --err=/fsx/open-r1/logs/morph_router/%x-%j.err #SBATCH --requeue +#SBATCH --time=7-00:00:00 + echo "Starting job" set -x -e diff --git a/slurm/train.slurm b/slurm/train.slurm index 61aec484c..3eec89564 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -7,6 +7,8 @@ #SBATCH --output=./logs/%x-%j.out #SBATCH --error=./logs/%x-%j.err #SBATCH --requeue +#SBATCH --time=3-00:00:00 + if [[ "$*" == *"--help"* ]]; then echo "Usage: sbatch slurm/train.slurm [options]" From ea5b7edf22ea573092eb211864e41b81b67cb6f0 Mon Sep 17 00:00:00 2001 From: Edward Beeching Date: Fri, 16 May 2025 10:26:49 +0200 Subject: [PATCH 121/137] Add dataset filtering script (#637) * add dataset filtering script * remove subset selection * save wip * save wip * update filter script * refactor to run on chunks * rename script * cleanup * update dapo filtering * fixes * dapo filt config * udpate compute pass rate * clean * update readme and config * add merging snippet --- README.md | 3 + recipes/dataset_filtering/config_demo.yaml | 28 +++ recipes/dataset_filtering/filter_dapo.yaml | 28 +++ recipes/dataset_filtering/filter_python.yaml | 26 +++ scripts/pass_rate_filtering/README.md | 36 +++ .../pass_rate_filtering/compute_pass_rate.py | 205 ++++++++++++++++++ .../pass_rate_filtering/launch_filtering.sh | 15 ++ slurm/compute_pass_rate.slurm | 20 ++ 8 files changed, 361 insertions(+) create mode 100644 recipes/dataset_filtering/config_demo.yaml create mode 100644 recipes/dataset_filtering/filter_dapo.yaml create mode 100644 recipes/dataset_filtering/filter_python.yaml create mode 100644 scripts/pass_rate_filtering/README.md create mode 100644 scripts/pass_rate_filtering/compute_pass_rate.py create mode 100644 scripts/pass_rate_filtering/launch_filtering.sh create mode 100644 slurm/compute_pass_rate.slurm diff --git a/README.md b/README.md index b270e074c..38c22c451 100644 --- a/README.md +++ b/README.md @@ -245,6 +245,9 @@ sbatch --nodes=2 slurm/train.slurm --model Qwen2.5-1.5B-Instruct --task grpo --c See the [Launching jobs on a Slurm cluster](#launching-jobs-on-a-slurm-cluster) section for more details. +### GRPO dataset filtering +We provide support to filter datasets by generating and computing pass rate on veriable tasks, see this [README](scripts/pass_rate_filtering/README.md) + #### 👨‍💻 Training with a code interpreter We provide a `code` reward function for executing code generated by the policy during training. Currently, this reward function targets code contests like [Codeforces](https://codeforces.com), where solutions are executed against a set of test cases and the overall success rate is returned as the final reward. 
To ensure safe execution, we support multiple sandbox providers: diff --git a/recipes/dataset_filtering/config_demo.yaml b/recipes/dataset_filtering/config_demo.yaml new file mode 100644 index 000000000..a1168512e --- /dev/null +++ b/recipes/dataset_filtering/config_demo.yaml @@ -0,0 +1,28 @@ +# Model arguments +model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +# We edit the DeepSeek chat template to ensure (a) the reasoning block within and is included in the completion and (b) the tag is not part of the prefill so that the format reward works +chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" +dataset_name: open-r1/OpenR1-Math-220k +dataset_prompt_column: problem +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: \n...\n\n\n...\n" + +# Generation arguments +max_completion_length: 2048 +num_generations: 8 +temperature: 0.7 +top_p: 0.95 + +# Reward func arguments +reward_funcs: +- accuracy +reward_weights: +- 1.0 + +# Filtering arguments. Samples with a pass rate outside the interval `pass_rate_min < x < pass_rate_max` will be filtered. 
+pass_rate_min: 0.2 +pass_rate_max: 0.8 diff --git a/recipes/dataset_filtering/filter_dapo.yaml b/recipes/dataset_filtering/filter_dapo.yaml new file mode 100644 index 000000000..8c8e68c93 --- /dev/null +++ b/recipes/dataset_filtering/filter_dapo.yaml @@ -0,0 +1,28 @@ +# Model arguments +model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B +model_revision: v03.00-step-000008190 +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +# We edit the DeepSeek chat template to ensure (a) the reasoning block within and is included in the completion and (b) the tag is not part of the prefill so that the format reward works +dataset_name: open-r1/DAPO-Math-17k-Processed +dataset_config: all +dataset_split: train + +# Generation arguments +max_completion_length: 32000 +num_generations: 8 +temperature: 1.0 + +# Reward func arguments +reward_funcs: +- accuracy +reward_weights: +- 1.0 + +# Filtering arguments. Samples with mean reward outside of low / high will be filtered +pass_rate_min: 0.1 +pass_rate_max: 0.6 + +output_dataset_name: open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-v03.00-step-000008190-filter diff --git a/recipes/dataset_filtering/filter_python.yaml b/recipes/dataset_filtering/filter_python.yaml new file mode 100644 index 000000000..ce699dcec --- /dev/null +++ b/recipes/dataset_filtering/filter_python.yaml @@ -0,0 +1,26 @@ +# Model arguments +model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges +model_revision: v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50 +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +# We edit the DeepSeek chat template to ensure (a) the reasoning block within and is included in the completion and (b) the tag is not part of the prefill so that the format reward works +dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled +dataset_prompt_column: problem + +# Generation arguments +max_completion_length: 16000 +num_generations: 8 +temperature: 0.7 + +# Reward func arguments +reward_funcs: +- binary_code +reward_weights: +- 1.0 +e2b_router_url: ip-10-53-85-92:8000 + +# Filtering arguments. 
Samples with mean reward outside of low / high will be filtered +pass_rate_min: 0.1 +pass_rate_max: 0.6 diff --git a/scripts/pass_rate_filtering/README.md b/scripts/pass_rate_filtering/README.md new file mode 100644 index 000000000..0c2fd88fb --- /dev/null +++ b/scripts/pass_rate_filtering/README.md @@ -0,0 +1,36 @@ +# Pass rate filtering + +We provide support to filter datasets by generating completions and computing the pass rate on verifiable tasks. + +See `scripts/pass_rate_filtering/compute_pass_rate.py` and `scripts/pass_rate_filtering/launch_filtering.sh` (hardcoded for DAPO at the moment). + +By default the script chunks the dataset; the chunks can be merged using the following snippet (example for DAPO): + +```python +from datasets import load_dataset, concatenate_datasets + +name = "open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-Merges-v00.02-v01.02-0.3-0.7-filter" + +gen_datasets = [] +filt_datasets = [] +for start in range(0,17400,200): + end = start + 200 + if start == 17200: + end = 17398 + gen_config_name = f"gen-{start}-{end}" + gen_dataset = load_dataset(name, gen_config_name, revision="gen", split="train") + gen_datasets.append(gen_dataset) + + filt_config_name = f"filt-0.1-0.6-{start}-{end}" + filt_dataset = load_dataset(name, filt_config_name, revision="pass_rate", split="train") + filt_datasets.append(filt_dataset) + +gen_dataset = concatenate_datasets(gen_datasets) +gen_dataset.push_to_hub(name, config_name="gen", split="train") +print(gen_dataset) + +filt_dataset = concatenate_datasets(filt_datasets) +filt_dataset.push_to_hub(name, config_name="default", split="train") + +print(filt_dataset) +``` \ No newline at end of file diff --git a/scripts/pass_rate_filtering/compute_pass_rate.py b/scripts/pass_rate_filtering/compute_pass_rate.py new file mode 100644 index 000000000..dcc5286d3 --- /dev/null +++ b/scripts/pass_rate_filtering/compute_pass_rate.py @@ -0,0 +1,205 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +# example usage python scripts/filter_dataset.py --config recipes/dataset_filtering/config_demo.yaml + +import logging +from dataclasses import dataclass +from git import Optional +import torch +import sys + +import datasets +import transformers +from datasets import load_dataset +from transformers import set_seed + +from open_r1.configs import GRPOConfig, GRPOScriptArguments +from open_r1.rewards import get_reward_funcs +from open_r1.utils import get_tokenizer +from trl import ModelConfig, TrlParser +from trl.data_utils import apply_chat_template +from vllm import LLM, SamplingParams + +logger = logging.getLogger(__name__) + +@dataclass +class PassRateScriptArguments(GRPOScriptArguments): + # we can be lazy and just use the same script args as GRPO + output_dataset_name: Optional[str] = None + pass_rate_min: float = 0.1 + pass_rate_max: float = 0.9 + dataset_start_index: Optional[int] = None + dataset_end_index: Optional[int] = None + dataset_split: str = "train" + + +def main(script_args, training_args, model_args): + # Set seed for reproducibility + set_seed(training_args.seed) + + ############### + # Setup logging + ############### + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + logger.info(f"Model parameters {model_args}") + logger.info(f"Script parameters {script_args}") + logger.info(f"Training parameters {training_args}") + + # Load the dataset + dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config, split=script_args.dataset_split) + if script_args.dataset_start_index is not None and script_args.dataset_end_index is not None: + dataset = dataset.select(range(script_args.dataset_start_index, script_args.dataset_end_index)) + + # Get reward functions from the registry + reward_funcs = get_reward_funcs(script_args) + + # Format into conversation + def make_conversation(example, prompt_column: str = script_args.dataset_prompt_column): + example["prompt_backup"] = example[prompt_column] + + prompt = [] + + if training_args.system_prompt is not None: + prompt.append({"role": "system", "content": training_args.system_prompt}) + + if prompt_column not in example: + raise ValueError(f"Dataset Question Field Error: {prompt_column} is not supported.") + + prompt.append({"role": "user", "content": example[prompt_column]}) + return {"prompt": prompt} + + dataset = dataset.map(make_conversation) + tokenizer = get_tokenizer(model_args, training_args) + + if "messages" in dataset.column_names: + dataset = dataset.remove_columns("messages") + + dataset = dataset.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer}) + llm = LLM( + model=model_args.model_name_or_path, + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + ) + + sampling_params=SamplingParams( + temperature=training_args.temperature, + top_p=training_args.top_p, + top_k=training_args.top_k, + n=training_args.num_generations, + max_tokens=training_args.max_completion_length, + ) + + def batch_score(examples): + prompts = examples["prompt"] + + outputs = llm.generate( + prompts, + sampling_params=sampling_params, + use_tqdm=False, + ) + repeated_prompts 
= [] + reward_completions = [] + grouped_completions = [] + for output in outputs: + prompt = output.prompt + group = [] + for completion in output.outputs: + text = completion.text + group.append(text) + message = [{"role": "assistant", "content": text}] + repeated_prompts.append(prompt) + reward_completions.append(message) + grouped_completions.append(group) + + def repeat_each_element_k_times(list_to_repeat: list, k: int) -> list: + return [element for item in list_to_repeat for element in [item] * k] + + rewards_per_func = torch.zeros(len(repeated_prompts), len(reward_funcs)) + for i, reward_func in enumerate(reward_funcs): + keys = [key for key in examples.data.keys() if key not in ["prompt", "completion"]] + reward_kwargs = {key: repeat_each_element_k_times(examples[key], training_args.num_generations) for key in keys} + output_reward_func = reward_func(prompts=repeated_prompts, completions=reward_completions, **reward_kwargs) + # Convert None values to NaN + output_reward_func = [reward if reward is not None else torch.nan for reward in output_reward_func] + + rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32) + + reshaped_rewards = rewards_per_func.view(-1, training_args.num_generations) + + examples["pass_rate_generations"] = grouped_completions + examples["pass_rate_rewards"] = reshaped_rewards.tolist() + + + return examples + + dataset = dataset.map(batch_score, batched=True, batch_size=64) + + # we need to restore the prompt for the final dataset + def restore_prompt(example): + example["prompt"] = example["prompt_backup"] + return example + + dataset = dataset.map(restore_prompt) + dataset = dataset.remove_columns("prompt_backup") + + if script_args.output_dataset_name is not None: + output_dataset_name = script_args.output_dataset_name + else: + model_name = model_args.model_name_or_path + if "/" in model_name: + model_name = model_name.split("/")[-1] + model_revision = model_args.model_revision + + output_dataset_name = f"{script_args.dataset_name}-{model_name}-{model_revision}-gen" + + config_name="default" + filtered_config_name = f"filt-{script_args.pass_rate_min}-{script_args.pass_rate_max}" + + if script_args.dataset_start_index is not None and script_args.dataset_end_index is not None: + config_name = f"gen-{script_args.dataset_start_index}-{script_args.dataset_end_index}" + filtered_config_name = f"{filtered_config_name}-{script_args.dataset_start_index}-{script_args.dataset_end_index}" + + dataset.push_to_hub(output_dataset_name, config_name=config_name, revision="gen") + + def filter_func(example): + rewards = example["pass_rate_rewards"] + # get the mean of the rewards that are not None + mean_reward = torch.nanmean(torch.tensor(rewards, dtype=torch.float32)) + + return script_args.pass_rate_min < mean_reward < script_args.pass_rate_max + + logger.info(f"Filtering dataset with low reward threshold {script_args.pass_rate_min} and high reward threshold {script_args.pass_rate_max}") + logger.info(f"Dataset size before filtering: {dataset}") + dataset = dataset.filter(filter_func) + logger.info(f"Dataset size after filtering: {dataset}") + dataset.push_to_hub(output_dataset_name, config_name=filtered_config_name, revision="pass_rate") + + + +if __name__ == "__main__": + parser = TrlParser((PassRateScriptArguments, GRPOConfig, ModelConfig)) + script_args, training_args, model_args = parser.parse_args_and_config() + main(script_args, training_args, model_args) diff --git a/scripts/pass_rate_filtering/launch_filtering.sh 
b/scripts/pass_rate_filtering/launch_filtering.sh new file mode 100644 index 000000000..be357d0a6 --- /dev/null +++ b/scripts/pass_rate_filtering/launch_filtering.sh @@ -0,0 +1,15 @@ + + +# a bash for loop from 0 to 17,400 in chunks of 200 + +for i in {0..17000..200} +do + START=$i + END=$((i + 200)) + echo "Processing chunk from $START to $END" + + # Submit the job to SLURM + sbatch slurm/compute_pass_rate.slurm recipes/dataset_filtering/filter_dapo.yaml $START $END +done + +sbatch slurm/compute_pass_rate.slurm recipes/dataset_filtering/filter_dapo.yaml 17200 17398 diff --git a/slurm/compute_pass_rate.slurm b/slurm/compute_pass_rate.slurm new file mode 100644 index 000000000..2c1cc54e7 --- /dev/null +++ b/slurm/compute_pass_rate.slurm @@ -0,0 +1,20 @@ +#!/bin/bash + +#SBATCH --job-name=open-r1-compute-pass-rate +#SBATCH --partition=hopper-prod +#SBATCH --qos=normal +#SBATCH --nodes=1 +#SBATCH --gpus-per-node=1 +#SBATCH --output=./logs/%x-%j.out +#SBATCH --error=./logs/%x-%j.err +#SBATCH --time=01-00:00:00 +#SBATCH --requeue + +# example usage: sbatch slurm/compute_pass_rate.slurm recipes/dataset_filtering/filter_dapo.yaml 0 500 + +set -x -e + +source ~/.bashrc +source openr1/bin/activate + +python scripts/pass_rate_filtering/compute_pass_rate.py --config $1 --dataset_start_index $2 --dataset_end_index $3 \ No newline at end of file From ebd5913a85d60819d68a209a030583740fdf98a8 Mon Sep 17 00:00:00 2001 From: lewtun Date: Fri, 16 May 2025 10:52:05 +0200 Subject: [PATCH 122/137] Bump LightEval (#643) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0026fb1c6..7189502cc 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", "liger-kernel>=0.5.6", - "lighteval @ git+https://github.com/huggingface/lighteval.git@d50bc3072b8814656633400a1850c500c6aa2e39", + "lighteval @ git+https://github.com/huggingface/lighteval.git@d3da6b9bbf38104c8b5e1acc86f83541f9a502d1", # Critical bug fix for tokenizer revisions: https://github.com/huggingface/lighteval/pull/721 "math-verify==0.5.2", # Used for math verification in grpo "morphcloud==0.1.67", "packaging>=23.0", From 5e0c210f9c1c8d271ec6b461cddd5656f0163d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Mon, 19 May 2025 04:48:14 -0700 Subject: [PATCH 123/137] use hf papers (#646) --- README.md | 2 +- scripts/decontaminate.py | 2 +- src/open_r1/rewards.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 38c22c451..514e93455 100644 --- a/README.md +++ b/README.md @@ -713,7 +713,7 @@ sbatch slurm/generate.slurm \ ### Data decontamination -Following [s1: Simple test-time scaling](https://arxiv.org/abs/2501.19393) the data can be decontaminated using the script at: [scripts/decontaminate.py](./scripts/decontaminate.py), which decontaminates a dataset using 8-grams and deduplicate the data. Sample run: +Following [s1: Simple test-time scaling](https://huggingface.co/papers/2501.19393) the data can be decontaminated using the script at: [scripts/decontaminate.py](./scripts/decontaminate.py), which decontaminates a dataset using 8-grams and deduplicates the data.
Sample run: ```shell python scripts/decontaminate.py \ diff --git a/scripts/decontaminate.py b/scripts/decontaminate.py index 0cb2a7a93..0ef13df3a 100644 --- a/scripts/decontaminate.py +++ b/scripts/decontaminate.py @@ -15,7 +15,7 @@ # limitations under the License. """ This script is used to decontaminate a dataset by checking for n-gram overlap with other datasets. -It uses the same approach presented in https://arxiv.org/abs/2501.19393, +It uses the same approach presented in https://huggingface.co/papers/2501.19393, as found in: https://github.com/simplescaling/s1/blob/main/data/decontaminate_util.py Usage: diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 1f99ecd14..9818b5414 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -130,7 +130,7 @@ def reasoning_steps_reward(completions, **kwargs): def len_reward(completions: list[Dict[str, str]], solution: list[str], **kwargs) -> float: """Compute length-based rewards to discourage overthinking and promote token efficiency. - Taken from the Kimi 1.5 tech report: https://arxiv.org/abs/2501.12599 + Taken from the Kimi 1.5 tech report: https://huggingface.co/papers/2501.12599 Args: completions: List of model completions @@ -282,7 +282,7 @@ def cosine_scaled_reward(completions, solution, **kwargs): def get_repetition_penalty_reward(ngram_size: int, max_penalty: float, language: str = "en"): """ - Computes N-gram repetition penalty as described in Appendix C.2 of https://arxiv.org/abs/2502.03373. + Computes N-gram repetition penalty as described in Appendix C.2 of https://huggingface.co/papers/2502.03373. Reference implementation from: https://github.com/eddycmu/demystify-long-cot/blob/release/openrlhf/openrlhf/reward/repetition.py Args: From 9366aa2df302debc20dedf46935f00e32b660f55 Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 20 May 2025 11:40:42 +0200 Subject: [PATCH 124/137] Add dataset mixer (#647) * Prototype * Clean up * Refactor * Add tests * Add doc and make scripts work * Tune doc * Up * Tune * Add column verification * Fix types * Fix YAML * Fix types * Fix doc * f * f --- README.md | 28 +++++++- src/open_r1/configs.py | 105 ++++++++++++++++++++++++++- src/open_r1/grpo.py | 5 +- src/open_r1/sft.py | 9 ++- src/open_r1/utils/__init__.py | 3 +- src/open_r1/utils/data.py | 65 +++++++++++++++++ tests/utils/test_data.py | 129 ++++++++++++++++++++++++++++++++++ 7 files changed, 332 insertions(+), 12 deletions(-) create mode 100644 src/open_r1/utils/data.py create mode 100644 tests/utils/test_data.py diff --git a/README.md b/README.md index 514e93455..5e9039d13 100644 --- a/README.md +++ b/README.md @@ -245,7 +245,8 @@ sbatch --nodes=2 slurm/train.slurm --model Qwen2.5-1.5B-Instruct --task grpo --c See the [Launching jobs on a Slurm cluster](#launching-jobs-on-a-slurm-cluster) section for more details. -### GRPO dataset filtering +#### GRPO dataset filtering + We provide support to filter datasets by generating and computing pass rate on veriable tasks, see this [README](scripts/pass_rate_filtering/README.md) #### 👨‍💻 Training with a code interpreter @@ -400,6 +401,31 @@ sbatch --job-name=open_r1 --nodes=2 slurm/train.slurm --model Qwen2.5-1.5B-Instr > [!NOTE] > The configuration in `slurm/train.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes. +### Customising the dataset mixture + +To combine multiple datasets as a single training mixture, you can specify the `dataset_mixture` parameter in the YAML config file. 
Here's a template for how to do this: + +```yaml +dataset_mixture: + datasets: # List of datasets to include in the mixture + - id: dataset_1 # Hub dataset ID + config: config_name_1 # Name of the dataset config + split: split_1 # Split to use from the dataset + columns: # Columns to keep + - column_1 + - column_2 + weight: 0.25 # Fraction of dataset to use + - id: dataset_2 + config: config_name_2 + split: split_2 + columns: + - column_1 + - column_2 + weight: 0.5 + seed: 42 # Seed for shuffling the combined dataset + test_split_size: 0.1 # Fraction of mixture to use for a test split +``` + ## Evaluating models We use `lighteval` to evaluate models. For models which fit on a single GPU, run: diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index c16af30a3..6cbace96c 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -14,11 +14,112 @@ # limitations under the License. from dataclasses import dataclass, field -from typing import Optional +from typing import Any, Optional import trl +@dataclass +class DatasetConfig: + """Configuration for a dataset in a mixture.""" + + id: str + config: Optional[str] = None + split: str = "train" + columns: Optional[list[str]] = None + weight: Optional[float] = None + + +@dataclass +class DatasetMixtureConfig: + """Configuration for a mixture of datasets.""" + + datasets: list[DatasetConfig] + seed: int = 0 + test_split_size: Optional[float] = None + + +@dataclass +class ScriptArguments(trl.ScriptArguments): + """ + Extended version of ScriptArguments with support for dataset mixtures. + + Args: + dataset_mixture (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Configuration for creating dataset mixtures with advanced options. + Format: + dataset_mixture: + datasets: + - id: dataset_id1 + config: config_name + columns: + - col1 + - col2 + weight: 0.5 + - id: dataset_id2 + config: config_name + columns: + - col1 + - col2 + weight: 0.5 + seed: 42 + test_split_size: 0.1 + """ + + # Override the dataset_name to make it optional + dataset_name: Optional[str] = field( + default=None, metadata={"help": "Dataset name. Can be omitted if using dataset_mixture."} + ) + dataset_mixture: Optional[dict[str, Any]] = field( + default=None, + metadata={"help": "Configuration for creating dataset mixtures with advanced options like shuffling."}, + ) + + def __post_init__(self): + if self.dataset_name is None and self.dataset_mixture is None: + raise ValueError("Either `dataset_name` or `dataset_mixture` must be provided") + + if self.dataset_mixture is not None: + if not isinstance(self.dataset_mixture, dict) or "datasets" not in self.dataset_mixture: + raise ValueError( + "dataset_mixture must be a dictionary with a 'datasets' key. 
" + "Expected format: {'datasets': [...], 'seed': int}" + ) + + datasets_list = [] + datasets_data = self.dataset_mixture.get("datasets", []) + + if isinstance(datasets_data, list): + for dataset_config in datasets_data: + datasets_list.append( + DatasetConfig( + id=dataset_config.get("id"), + config=dataset_config.get("config"), + split=dataset_config.get("split", "train"), + columns=dataset_config.get("columns"), + weight=dataset_config.get("weight", 1.0), + ) + ) + else: + raise ValueError("'datasets' must be a list of dataset configurations") + + self.dataset_mixture = DatasetMixtureConfig( + datasets=datasets_list, + seed=self.dataset_mixture.get("seed", 0), + test_split_size=self.dataset_mixture.get("test_split_size", None), + ) + + # Check that column names are consistent across all dataset configs + columns_sets = [set(dataset.columns) for dataset in datasets_list if dataset.columns is not None] + if columns_sets: + first_columns = columns_sets[0] + if not all(columns == first_columns for columns in columns_sets): + raise ValueError( + "Column names must be consistent across all dataset configurations in a mixture. " + f"Found different column sets: {[list(cols) for cols in columns_sets]}" + ) + + # TODO: add the shared options with a mixin to reduce code duplication @dataclass class GRPOConfig(trl.GRPOConfig): @@ -98,7 +199,7 @@ class SFTConfig(trl.SFTConfig): @dataclass -class GRPOScriptArguments(trl.ScriptArguments): +class GRPOScriptArguments(ScriptArguments): """ Script arguments for the GRPO training script. diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index b2d6aa1f3..a7385361c 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -18,13 +18,12 @@ import datasets import transformers -from datasets import load_dataset from transformers import set_seed from transformers.trainer_utils import get_last_checkpoint from open_r1.configs import GRPOConfig, GRPOScriptArguments from open_r1.rewards import get_reward_funcs -from open_r1.utils import get_model, get_tokenizer +from open_r1.utils import get_dataset, get_model, get_tokenizer from open_r1.utils.callbacks import get_callbacks from open_r1.utils.wandb_logging import init_wandb_training from trl import GRPOTrainer, ModelConfig, TrlParser, get_peft_config @@ -72,7 +71,7 @@ def main(script_args, training_args, model_args): init_wandb_training(training_args) # Load the dataset - dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config) + dataset = get_dataset(script_args) ################ # Load tokenizer diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index 7589fa778..e257cd936 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -41,15 +41,14 @@ import datasets import transformers -from datasets import load_dataset from transformers import set_seed from transformers.trainer_utils import get_last_checkpoint -from open_r1.configs import SFTConfig -from open_r1.utils import get_model, get_tokenizer +from open_r1.configs import ScriptArguments, SFTConfig +from open_r1.utils import get_dataset, get_model, get_tokenizer from open_r1.utils.callbacks import get_callbacks from open_r1.utils.wandb_logging import init_wandb_training -from trl import ModelConfig, ScriptArguments, SFTTrainer, TrlParser, get_peft_config, setup_chat_format +from trl import ModelConfig, SFTTrainer, TrlParser, get_peft_config, setup_chat_format logger = logging.getLogger(__name__) @@ -91,7 +90,7 @@ def main(script_args, training_args, model_args): ################ # Load datasets ################ - dataset = 
load_dataset(script_args.dataset_name, name=script_args.dataset_config) + dataset = get_dataset(script_args) ################ # Load tokenizer diff --git a/src/open_r1/utils/__init__.py b/src/open_r1/utils/__init__.py index 7b1449731..d3b84a99d 100644 --- a/src/open_r1/utils/__init__.py +++ b/src/open_r1/utils/__init__.py @@ -1,5 +1,6 @@ +from .data import get_dataset from .import_utils import is_e2b_available, is_morph_available from .model_utils import get_model, get_tokenizer -__all__ = ["get_tokenizer", "is_e2b_available", "is_morph_available", "get_model"] +__all__ = ["get_tokenizer", "is_e2b_available", "is_morph_available", "get_model", "get_dataset"] diff --git a/src/open_r1/utils/data.py b/src/open_r1/utils/data.py new file mode 100644 index 000000000..b151a8a7f --- /dev/null +++ b/src/open_r1/utils/data.py @@ -0,0 +1,65 @@ +import logging + +import datasets +from datasets import DatasetDict, concatenate_datasets + +from ..configs import ScriptArguments + + +logger = logging.getLogger(__name__) + + +def get_dataset(args: ScriptArguments) -> DatasetDict: + """Load a dataset or a mixture of datasets based on the configuration. + + Args: + args (ScriptArguments): Script arguments containing dataset configuration. + + Returns: + DatasetDict: The loaded datasets. + """ + if args.dataset_name and not args.dataset_mixture: + logger.info(f"Loading dataset: {args.dataset_name}") + return datasets.load_dataset(args.dataset_name, args.dataset_config) + elif args.dataset_mixture: + logger.info(f"Creating dataset mixture with {len(args.dataset_mixture.datasets)} datasets") + seed = args.dataset_mixture.seed + datasets_list = [] + + for dataset_config in args.dataset_mixture.datasets: + logger.info(f"Loading dataset for mixture: {dataset_config.id} (config: {dataset_config.config})") + ds = datasets.load_dataset( + dataset_config.id, + dataset_config.config, + split=dataset_config.split, + ) + if dataset_config.columns is not None: + ds = ds.select_columns(dataset_config.columns) + if dataset_config.weight is not None: + ds = ds.shuffle(seed=seed).select(range(int(len(ds) * dataset_config.weight))) + logger.info( + f"Subsampled dataset '{dataset_config.id}' (config: {dataset_config.config}) with weight={dataset_config.weight} to {len(ds)} examples" + ) + + datasets_list.append(ds) + + if datasets_list: + combined_dataset = concatenate_datasets(datasets_list) + combined_dataset = combined_dataset.shuffle(seed=seed) + logger.info(f"Created dataset mixture with {len(combined_dataset)} examples") + + if args.dataset_mixture.test_split_size is not None: + combined_dataset = combined_dataset.train_test_split( + test_size=args.dataset_mixture.test_split_size, seed=seed + ) + logger.info( + f"Split dataset into train and test sets with test size: {args.dataset_mixture.test_split_size}" + ) + return combined_dataset + else: + return DatasetDict({"train": combined_dataset}) + else: + raise ValueError("No datasets were loaded from the mixture configuration") + + else: + raise ValueError("Either `dataset_name` or `dataset_mixture` must be provided") diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py new file mode 100644 index 000000000..669057e78 --- /dev/null +++ b/tests/utils/test_data.py @@ -0,0 +1,129 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from dataclasses import asdict + +from datasets import DatasetDict, load_dataset + +from open_r1.configs import DatasetConfig, DatasetMixtureConfig, ScriptArguments +from open_r1.utils.data import get_dataset + + +class TestGetDataset(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.dataset_name = "trl-internal-testing/zen" + cls.dataset_config = "conversational_preference" + cls.ref_dataset = load_dataset(cls.dataset_name, cls.dataset_config) + + def test_dataset_and_config_name(self): + args = ScriptArguments(dataset_name=self.dataset_name, dataset_config=self.dataset_config) + dataset = get_dataset(args) + self.assertIsInstance(dataset, DatasetDict) + self.assertIn("train", dataset) + self.assertEqual(len(dataset["train"]), len(self.ref_dataset["train"])) + + def test_unweighted_mixture(self): + """Mix train and test splits of the same dataset.""" + dataset_configs = [ + DatasetConfig(id=self.dataset_name, config=self.dataset_config, split="train", columns=None, weight=None), + DatasetConfig(id=self.dataset_name, config=self.dataset_config, split="test", columns=None, weight=None), + ] + dataset_mixture = DatasetMixtureConfig( + datasets=dataset_configs, + ) + args = ScriptArguments(dataset_mixture=asdict(dataset_mixture)) + dataset = get_dataset(args) + self.assertIsInstance(dataset, DatasetDict) + self.assertIn("train", dataset) + self.assertEqual(len(dataset["train"]), len(self.ref_dataset["train"]) + len(self.ref_dataset["test"])) + + def test_weighted_mixture(self): + """Test loading a dataset mixture with weights.""" + dataset_configs = [ + DatasetConfig(id=self.dataset_name, config=self.dataset_config, split="train", columns=None, weight=0.25), + DatasetConfig(id=self.dataset_name, config=self.dataset_config, split="test", columns=None, weight=0.5), + ] + dataset_mixture = DatasetMixtureConfig( + datasets=dataset_configs, + ) + args = ScriptArguments(dataset_mixture=asdict(dataset_mixture)) + dataset = get_dataset(args) + self.assertIsInstance(dataset, DatasetDict) + self.assertIn("train", dataset) + self.assertEqual( + len(dataset["train"]), len(self.ref_dataset["train"]) // 4 + len(self.ref_dataset["test"]) // 2 + ) + + def test_mixture_and_test_split(self): + """Test loading a dataset mixture with test split.""" + dataset_configs = [ + DatasetConfig( + id=self.dataset_name, config=self.dataset_config, split="train[:10]", columns=None, weight=None + ), + ] + dataset_mixture = DatasetMixtureConfig(datasets=dataset_configs, test_split_size=0.2) + args = ScriptArguments(dataset_name=None, dataset_mixture=asdict(dataset_mixture)) + dataset = get_dataset(args) + self.assertIsInstance(dataset, DatasetDict) + self.assertIn("train", dataset) + self.assertIn("test", dataset) + self.assertEqual(len(dataset["train"]), 8) + self.assertEqual(len(dataset["test"]), 2) + + def test_mixture_column_selection(self): + """Test loading a dataset mixture with column selection.""" + dataset_configs = [ + DatasetConfig( + id=self.dataset_name, + config=self.dataset_config, + split="train", + columns=["prompt", "chosen"], + weight=None, + ), + ] + 
dataset_mixture = DatasetMixtureConfig( + datasets=dataset_configs, + ) + args = ScriptArguments(dataset_mixture=asdict(dataset_mixture)) + dataset = get_dataset(args) + self.assertIsInstance(dataset, DatasetDict) + self.assertIn("train", dataset) + self.assertIn("prompt", dataset["train"].column_names) + self.assertIn("chosen", dataset["train"].column_names) + + def test_mixture_with_mismatched_columns(self): + dataset_configs = [ + DatasetConfig( + id=self.dataset_name, config=self.dataset_config, split="train", columns=["prompt"], weight=None + ), + DatasetConfig( + id=self.dataset_name, config=self.dataset_config, split="train", columns=["chosen"], weight=None + ), + ] + dataset_mixture = DatasetMixtureConfig( + datasets=dataset_configs, + ) + with self.assertRaises(ValueError) as context: + _ = ScriptArguments(dataset_mixture=asdict(dataset_mixture)) + self.assertIn("Column names must be consistent", str(context.exception)) + + def test_no_dataset_name_or_mixture(self): + with self.assertRaises(ValueError) as context: + _ = ScriptArguments(dataset_name=None, dataset_mixture=None) + self.assertIn("Either `dataset_name` or `dataset_mixture` must be provided", str(context.exception)) + + +if __name__ == "__main__": + unittest.main() From 8067149e90738629f9f98994bf4d91736a7d0a30 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 21 May 2025 22:25:57 +0200 Subject: [PATCH 125/137] Bump DeepSpeed to 0.16.8 to fix OOM on Qwen3 (#653) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7189502cc..a7bb71436 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "accelerate==1.4.0", "bitsandbytes>=0.43.0", "datasets>=3.2.0", - "deepspeed==0.16.7", + "deepspeed==0.16.8", "distilabel[vllm,ray,openai]>=1.5.2", "e2b-code-interpreter>=1.0.5", "einops>=0.8.0", From db2d9b011a39b09f50b3a9859f131420f5d9a6f7 Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 22 May 2025 08:44:13 +0200 Subject: [PATCH 126/137] Bump lower bound on liger-kernel (#654) Related to https://github.com/huggingface/open-r1/pull/653 (I forgot to include this in that PR) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a7bb71436..c3db3d991 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ "jieba", # Needed for Chinese language support "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", - "liger-kernel>=0.5.6", + "liger-kernel>=0.5.9", "lighteval @ git+https://github.com/huggingface/lighteval.git@d3da6b9bbf38104c8b5e1acc86f83541f9a502d1", # Critical bug fix for tokenizer revisions: https://github.com/huggingface/lighteval/pull/721 "math-verify==0.5.2", # Used for math verification in grpo "morphcloud==0.1.67", From c1e11922945753881e1992da765549098e6af6d4 Mon Sep 17 00:00:00 2001 From: Guilherme Penedo Date: Sun, 25 May 2025 10:55:27 +0100 Subject: [PATCH 127/137] GRPO with codeforces problems (#627) * add * update * updates * updates #2 * weighted_sum and python fixes * bugfix * merging ioi/cf setups * integrating the morph changes * move morph_client * run style * small changes for mixed languages training * revert grpo.py changes * piston readme * local test fetching * bug fixes * updated readme * style fixes * style fixes 2 * deps changes * import sorting * fix tests * Update README.md Co-authored-by: lewtun * Update README.md Co-authored-by: lewtun * Update src/open_r1/rewards.py Co-authored-by: lewtun --------- Co-authored-by: lewtun --- README.md | 30 +++- 
.../grpo/config_codeforces.yaml | 80 ++++++++++ setup.py | 6 +- slurm/piston/README.md | 14 ++ slurm/train.slurm | 2 +- src/open_r1/configs.py | 7 +- src/open_r1/rewards.py | 89 ++++++++++- .../__init__.py | 8 +- .../competitive_programming/cf_scoring.py | 145 ++++++++++++++++++ .../competitive_programming/code_patcher.py | 123 +++++++++++++++ .../ioi_scoring.py} | 95 ++++++------ .../ioi_utils.py} | 16 +- .../morph_client.py | 0 .../piston_client.py | 92 +++-------- .../utils/competitive_programming/utils.py | 11 ++ 15 files changed, 577 insertions(+), 141 deletions(-) create mode 100644 recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_codeforces.yaml rename src/open_r1/utils/{ioi => competitive_programming}/__init__.py (59%) create mode 100644 src/open_r1/utils/competitive_programming/cf_scoring.py create mode 100644 src/open_r1/utils/competitive_programming/code_patcher.py rename src/open_r1/utils/{ioi/scoring.py => competitive_programming/ioi_scoring.py} (80%) rename src/open_r1/utils/{ioi/utils.py => competitive_programming/ioi_utils.py} (77%) rename src/open_r1/utils/{ioi => competitive_programming}/morph_client.py (100%) rename src/open_r1/utils/{ioi => competitive_programming}/piston_client.py (73%) create mode 100644 src/open_r1/utils/competitive_programming/utils.py diff --git a/README.md b/README.md index 5e9039d13..a34cd32b9 100644 --- a/README.md +++ b/README.md @@ -350,17 +350,32 @@ morph_router_url: 1.2.3.4:8000 The port should match the one used when launching the router. All training jobs can share the same router IP which will ensure parallel executions are properly managed. -#### IOI problems +#### Competitive Programming problems: IOI & CodeForces -We provide a `ioi_code_reward` reward function for executing problems from [IOI](https://hf.co/datasets/open-r1/ioi). You can use either [piston](https://github.com/engineer-man/piston) or Morph as your execution provider. +We provide `ioi_code_reward` and `cf_code_reward` reward functions for executing problems from [IOI](https://hf.co/datasets/open-r1/ioi) and [CodeForces](https://huggingface.co/datasets/open-r1/codeforces), respectively. You can use either [piston](https://github.com/engineer-man/piston) or Morph (currently IOI only) as your execution provider. ##### Piston To use Piston: 1. Get piston workers running, see [slurm/piston/README.md](./slurm/piston/README.md) 2. Set your environment variable `PISTON_ENDPOINTS` to `slurm` or to a list of piston worker endpoints + +For IOI: + 3. In your configuration, use `ioi_provider: "piston"` +For CodeForces: + +3. Download the generated (hard) test cases: +``` +# change PATH_TO_SAVE_TESTCASES. Increase --max-workers according to your machine's capacity +huggingface-cli download open-r1/codeforces --repo-type=dataset --include='generated_tests/*.parquet' --max-workers=8 --local-dir PATH_TO_SAVE_TESTCASES +``` +4. Save the path in .env: +``` +CF_TESTS_FOLDER=PATH_TO_SAVE_TESTCASES +``` + ##### Morph Morph is a cloud-based solution that provides sandboxed environments for running code. To use it: @@ -368,7 +383,10 @@ Morph is a cloud-based solution that provides sandboxed environments for running 2. Add your Morph API key to the `.env` file: `MORPH_API_KEY="your_key_here"` 3. 
In your configuration, use `ioi_provider: "morph"` -See the [example recipe](./recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml) for how to use the reward function: +##### Example recipes +For IOI: + +See the [example recipe](./recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml) for how to use the IOI reward function: ```shell ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \ @@ -376,6 +394,12 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code_ioi.yaml ``` +For CodeForces: + +```shell +sbatch --job-name=cf-grpo --nodes=2 slurm/train.slurm --model Qwen2.5-Coder-7B-Instruct --task grpo --config codeforces --accelerator zero3 --dp 8 --tp 1 +``` + ### Launching jobs on a Slurm cluster If you have access to a Slurm cluster, we provide a `slurm/train.slurm` script that will automatically queue training jobs for you. Here's how you can use it: diff --git a/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_codeforces.yaml b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_codeforces.yaml new file mode 100644 index 000000000..cc6f95c64 --- /dev/null +++ b/recipes/Qwen2.5-Coder-7B-Instruct/grpo/config_codeforces.yaml @@ -0,0 +1,80 @@ +# Model arguments +model_name_or_path: Qwen/Qwen2.5-Coder-7B-Instruct +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 +# Data training arguments +dataset_name: open-r1/codeforces +dataset_prompt_column: prompt +dataset_config: verifiable-prompts +dataset_test_split: test +dataset_train_split: train + +system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>" + # GRPO trainer config +callbacks: +- push_to_hub_revision +benchmarks: +- lcb_v4 +beta: 0.0 +loss_type: dr_grpo +scale_rewards: false +bf16: true +do_eval: false +eval_strategy: "no" +use_vllm: true +vllm_device: auto +vllm_gpu_memory_utilization: 0.7 +gradient_accumulation_steps: 32 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: open-r1/Qwen2.5-Coder-7B-Instruct-Codeforces-GRPO +hub_model_revision: v01.00 +hub_strategy: every_save +learning_rate: 1.0e-06 +log_completions: true +log_level: info +logging_first_step: true +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: constant_with_warmup +max_grad_norm: 0.2 +max_prompt_length: 2000 +max_completion_length: 8192 +max_steps: -1 +num_generations: 16 +# aiming for 1k optimization steps +# total_samples_per_batch = num_gpus * grad_accumulation_steps * per_device_batch_size = 8 * 32 * 4 = 1024 +# unique_prompts_per_batch = total_samples_per_batch / num_generations = 1024 / 16 = 64 +# #dataset ~= 16k (8k * 2, for python and cpp) +# global_steps_per_epoch = #dataset / unique_prompts_per_batch = 16k / 64 ~= 250 +# epochs_for_1k_steps = 1000/250 = 4 epochs +num_train_epochs: 4 +output_dir: data/Qwen2.5-Coder-7B-Instruct-Codeforces-GRPO_v01.00 +overwrite_output_dir: true +per_device_train_batch_size: 4 +push_to_hub: true +report_to: +- wandb +reward_funcs: +- cf_code +- code_format +reward_weights: +- 1.0 +- 0.1 +save_strategy: "steps" +save_steps: 0.05 +save_total_limit: 1 +seed: 42 +temperature: 0.7 +wandb_entity: huggingface +wandb_project: open-r1 +warmup_ratio: 0.1 + +mask_truncated_completions: true +# for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating +# otherwise continue with the next batch of test cases.
Useful to avoid overloading the eval server + save time on wrong solutions +code_eval_test_batch_size: -1 +code_eval_scoring_mode: weighted_sum \ No newline at end of file diff --git a/setup.py b/setup.py index c3db3d991..531a9e502 100644 --- a/setup.py +++ b/setup.py @@ -71,6 +71,9 @@ "transformers @ git+https://github.com/huggingface/transformers.git@acdbe627e323dbc822f21499fead789b439cf45b", # Fix DeepSpeed x vLLM conflict: https://github.com/huggingface/transformers/pull/37755 "trl[vllm] @ git+https://github.com/huggingface/trl.git@1bca49515ecd5b85d16e68c42c76670e252e19f1", # Fix DeepSpeed x vLLM conflict: https://github.com/huggingface/trl/pull/3351 "wandb>=0.19.1", + "async-lru>=2.0.5", + "aiofiles>=24.1.0", + "pandas>=2.2.3", ] # this is a lookup table with items like: @@ -90,7 +93,7 @@ def deps_list(*pkgs): extras["tests"] = deps_list("pytest", "parameterized", "math-verify", "jieba") extras["torch"] = deps_list("torch") extras["quality"] = deps_list("ruff", "isort", "flake8") -extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv", "morphcloud", "jieba") +extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv", "morphcloud", "jieba", "pandas", "aiofiles") extras["eval"] = deps_list("lighteval", "math-verify") extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"] + extras["code"] @@ -113,6 +116,7 @@ def deps_list(*pkgs): deps["transformers"], deps["trl"], deps["wandb"], + deps["async-lru"] ] setup( diff --git a/slurm/piston/README.md b/slurm/piston/README.md index 0a45e1cff..94699cff5 100644 --- a/slurm/piston/README.md +++ b/slurm/piston/README.md @@ -17,10 +17,17 @@ slurm/piston/launch_piston_workers.sh 1 ``` 2. Assuming it's running on `ip-10-53-86-146:1234`, send the package install request: + +For IOI: ```bash curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: application/json" -d '{"language": "cms_ioi", "version": "1.0.0"}' ``` +For CodeForces: +```bash +curl -X POST http://ip-10-53-86-146:1234/api/v2/packages -H "Content-Type: application/json" -d '{"language": "codeforces", "version": "1.0.0"}' +``` + 3. You can now launch more workers and due to the shared mounted packages directory, they should already have the package installed. To have the main script find the workers automatically, you can export the following environment variable: @@ -32,6 +39,7 @@ Alternatively your can add `PISTON_ENDPOINTS=slurm` to your .env file. You can also change `PISTON_MAX_REQUESTS_PER_ENDPOINT`, which tries to limit how many simultaneous requests each worker will handle (1 by default). Keep in mind that this is a local limit and in distributed setups, as there is no global limit, workers might sometimes be overwhelmed when some processes hit the same worker. If you would like to adapt the code to run without piston, please see the [ioi repo](https://github.com/huggingface/ioi). +For CodeForces, you should implement the [`run`](https://github.com/guipenedo/piston/blob/master/packages/codeforces/1.0.0/run) and [`compile`](https://github.com/guipenedo/piston/blob/master/packages/codeforces/1.0.0/compile) scripts. # Piston workers (local docker) This will launch a single worker in a docker container. Consider launching multiple workers for better scalability. Replace 2000 with the port you want to use. 
@@ -57,10 +65,16 @@ docker run -d \ ``` Install the package: +For IOI: ```bash curl -X POST http://localhost:2000/api/v2/packages -H "Content-Type: application/json" -d '{"language": "cms_ioi", "version": "1.0.0"}' ``` +For CodeForces: +```bash +curl -X POST http://localhost:2000/api/v2/packages -H "Content-Type: application/json" -d '{"language": "codeforces", "version": "1.0.0"}' +``` + Remember to set `PISTON_ENDPOINTS`: ```bash export PISTON_ENDPOINTS=http://localhost:2000/api/v2,http://localhost:2001/api/v2,http://localhost:2002/api/v2 diff --git a/slurm/train.slurm b/slurm/train.slurm index 3eec89564..31b0601d6 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -167,7 +167,7 @@ SRUN_ARGS=" \ --ntasks=$NUM_NODES \ --nodelist=$NODELIST " -clear; srun $SRUN_ARGS bash -c "$LAUNCHER $CMD" 2>&1 +srun $SRUN_ARGS bash -c "$LAUNCHER $CMD" 2>&1 END_TIME=$(date +%s) echo "END TIME: $(date)" diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 6cbace96c..86a3d0524 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -14,7 +14,7 @@ # limitations under the License. from dataclasses import dataclass, field -from typing import Any, Optional +from typing import Any, Literal, Optional import trl @@ -260,6 +260,7 @@ class GRPOScriptArguments(ScriptArguments): ) code_language: str = field( default="python", + # '(?:python|cpp)' metadata={ "help": "Language for code format reward. Based on E2B supported languages https://e2b.dev/docs/code-interpreting/supported-languages", "choices": ["python", "javascript", "r", "java", "bash", "cpp"], @@ -271,6 +272,10 @@ class GRPOScriptArguments(ScriptArguments): "help": "for each generation, evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. Useful to avoid overloading the eval server + save time on wrong solutions" }, ) + code_eval_scoring_mode: Literal["pass_fail", "partial", "weighted_sum"] = field( + default="weighted_sum", + metadata={"help": "use fraction of passed test cases as reward. 
If false, use 0/1 scoring."}, + ) parallel_code_exec_per_proc: int = field( default=2, metadata={ diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py index 9818b5414..0b3662841 100644 --- a/src/open_r1/rewards.py +++ b/src/open_r1/rewards.py @@ -20,19 +20,21 @@ import math import re from functools import partial, update_wrapper -from typing import Callable, Dict, Optional +from typing import Callable, Dict, Literal, Optional from latex2sympy2_extended import NormalizationConfig from math_verify import LatexExtractionConfig, parse, verify from .utils.code_providers import get_provider -from .utils.ioi import ( +from .utils.competitive_programming import ( SubtaskResult, add_includes, get_morph_client_from_env, get_piston_client_from_env, - score_subtask, ) +from .utils.competitive_programming import patch_code as cf_patch_code +from .utils.competitive_programming import score_submission as cf_score_submission +from .utils.competitive_programming import score_subtask def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str], **kwargs) -> list[Optional[float]]: @@ -415,7 +417,65 @@ async def run_catch_exceptions(task): return [result.score for result in results] -def extract_code(completion: str, language: str = "python") -> str: +def cf_code_reward( + completions, + test_batch_size: int = 1, + patch_code: bool = False, + scoring_mode: Literal["pass_fail", "partial", "weighted_sum"] = "weighted_sum", + **kwargs, +) -> list[float]: + """Reward function that evaluates Codeforces problems using Piston+our CF package. + + Assumes the dataset has the same format as hf.co/datasets/open-r1/codeforces (verifiable-prompts subset) + + test_batch_size: evaluate these many test cases in parallel, then check if any of them failed (0 score): if so stop evaluating; otherwise continue with the next batch of test cases. + """ + # for info on setting up piston workers, see slurm/piston/README.md + piston_client = get_piston_client_from_env() + + languages = kwargs["language"] if "language" in kwargs else [None] * len(completions) + code_snippets = [ + # note: grading is automatically skipped if a problem has no tests + cf_patch_code(extract_code(completion[-1]["content"], language), language) + if patch_code + else extract_code(completion[-1]["content"], language) + for completion, language in zip(completions, languages) + ] + + async def run_catch_exceptions(task): + try: + return await task + except Exception as e: + print(f"Error from Piston worker: {e}") + return None + + # load problem data. 
undo separating kwargs by column + problems_data = [dict(zip(kwargs.keys(), values)) for values in zip(*kwargs.values())] + + loop = _init_event_loop() + evals = [ + loop.create_task( + run_catch_exceptions( + cf_score_submission( + piston_client, + problem_data, + code, + test_batch_size=test_batch_size, + scoring_mode=scoring_mode, + submission_language=problem_data.get("language", None), + ) + ) + ) + for problem_data, code in zip(problems_data, code_snippets) + ] + results = loop.run_until_complete(asyncio.gather(*evals)) + + return results + + +def extract_code(completion: str, language: str | None = "python") -> str: + if language is None: + return "" pattern = re.compile(rf"```{language}\n(.*?)```", re.DOTALL) matches = pattern.findall(completion) extracted_answer = matches[-1] if len(matches) >= 1 else "" @@ -538,11 +598,20 @@ def get_code_format_reward(language: str = "python"): Args: language: Programming language supported by E2B https://e2b.dev/docs/code-interpreting/supported-languages """ - pattern = rf"^\n.*?\n\n\n.*?```{language}.*?```.*?\n$" def code_format_reward(completions, **kwargs): + # if there is a language field, use it instead of the default language. This way we can have mixed language training. + languages = kwargs["language"] if "language" in kwargs else [language] * len(completions) + completion_contents = [completion[0]["content"] for completion in completions] - matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents] + matches = [ + re.match( + rf"^\n.*?\n\n\n.*?```{sample_language}.*?```.*?\n$", + content, + re.DOTALL | re.MULTILINE, + ) + for content, sample_language in zip(completion_contents, languages) + ] return [1.0 if match else 0.0 for match in matches] return code_format_reward @@ -617,6 +686,14 @@ def get_reward_funcs(script_args) -> list[Callable]: ), ioi_code_reward, ), + "cf_code": update_wrapper( + partial( + cf_code_reward, + test_batch_size=script_args.code_eval_test_batch_size, + scoring_mode=script_args.code_eval_scoring_mode, + ), + cf_code_reward, + ), "code_format": get_code_format_reward(language=script_args.code_language), "tag_count": tag_count_reward, "soft_overlong_punishment": get_soft_overlong_punishment( diff --git a/src/open_r1/utils/ioi/__init__.py b/src/open_r1/utils/competitive_programming/__init__.py similarity index 59% rename from src/open_r1/utils/ioi/__init__.py rename to src/open_r1/utils/competitive_programming/__init__.py index f4c5468c0..081e16fea 100644 --- a/src/open_r1/utils/ioi/__init__.py +++ b/src/open_r1/utils/competitive_programming/__init__.py @@ -1,13 +1,17 @@ +from .cf_scoring import score_submission +from .code_patcher import patch_code +from .ioi_scoring import SubtaskResult, score_subtask, score_subtasks +from .ioi_utils import add_includes from .morph_client import get_morph_client_from_env from .piston_client import get_piston_client_from_env, get_slurm_piston_endpoints -from .scoring import SubtaskResult, score_subtask, score_subtasks -from .utils import add_includes __all__ = [ "get_piston_client_from_env", "get_slurm_piston_endpoints", "get_morph_client_from_env", + "patch_code", + "score_submission", "score_subtask", "score_subtasks", "add_includes", diff --git a/src/open_r1/utils/competitive_programming/cf_scoring.py b/src/open_r1/utils/competitive_programming/cf_scoring.py new file mode 100644 index 000000000..ae1bbe5e0 --- /dev/null +++ b/src/open_r1/utils/competitive_programming/cf_scoring.py @@ -0,0 +1,145 @@ +import asyncio +import os +from io 
import BytesIO +from typing import Literal + +from async_lru import alru_cache + +from .piston_client import PistonClient +from .utils import batched + + +async def score_single_test_case( + client: PistonClient, + problem_data: dict, + test_input: str, + test_output: str, + submission: str, + submission_language: str = "cpp", +) -> tuple[str, str]: + if submission_language not in ["python", "cpp"]: + raise ValueError(f"Invalid submission language: {submission_language}") + try: + result = await client.send_execute( + { + "files": [ + {"name": f"main.{submission_language}", "content": submission}, + *( + [{"name": "checker.py", "content": problem_data["generated_checker"]}] + if problem_data["generated_checker"] + else [] + ), + {"name": "input.txt", "content": test_input}, + {"name": "correct_output.txt", "content": test_output}, + { + "name": "grader_config", + "content": "\n".join( + f"{key}={value}" + for key, value in { + "TIME_LIMIT": problem_data["time_limit"], + "MEMORY_LIMIT": problem_data["memory_limit"], + "INPUT_MODE": problem_data["input_mode"], + }.items() + ), + }, + ], + "run_timeout": (problem_data["time_limit"] + 10) * 1000, + # +10 seconds hard limit. time limits are handled by the codeforces script + }, + language="cf_python3" if submission_language == "python" else "c++17", + ) + except Exception as e: + print(f"Error scoring submission: {e}") + return False + + return result + + +@alru_cache(maxsize=32) # TODO make this configurable +async def get_generated_contest_tests(contest_id: str) -> list[dict]: + import pandas as pd + + import aiofiles + import aiofiles.os + tests_folder = os.environ.get("CF_TESTS_FOLDER", None) + if not tests_folder: + raise ValueError( + "CF_TESTS_FOLDER environment variable not set! Please download the codeforces generated tests and set CF_TESTS_FOLDER to the folder path. See https://huggingface.co/datasets/open-r1/codeforces for more information." + ) + if not await aiofiles.os.path.exists(tests_folder): + raise ValueError( + f"CF_TESTS_FOLDER path '{tests_folder}' does not exist! Please download the codeforces generated tests and set CF_TESTS_FOLDER to the folder path. See https://huggingface.co/datasets/open-r1/codeforces for more information." 
+ ) + parquet_path = os.path.join(tests_folder, f"test_cases_{int(contest_id):04d}.parquet") + if not await aiofiles.os.path.exists(parquet_path): + return {} + + # Read parquet file asynchronously + async with aiofiles.open(parquet_path, "rb") as f: + content = await f.read() + df = pd.read_parquet(BytesIO(content)) + + # Group by problem_id and convert to dictionary of lists + grouped_tests = df.groupby("problem_id").apply(lambda x: x[["input", "output"]].to_dict("records")).to_dict() + + return grouped_tests + + +async def get_generated_tests(problem_id: str) -> list[dict]: + contest_id = problem_id.split("/")[0] + return (await get_generated_contest_tests(contest_id)).get(problem_id, []) + + +async def score_submission( + client: PistonClient, + problem_data: dict, + submission: str, + test_batch_size: int = 1, + scoring_mode: Literal["pass_fail", "partial", "weighted_sum"] = "weighted_sum", + no_compile_reward: float = -0.1, + no_submission_reward: float = -1.0, + submission_language: str = "cpp", +) -> float: + if submission_language not in ["python", "cpp"]: + raise ValueError(f"Invalid submission language: {submission_language}") + test_cases = problem_data["official_tests"] + (await get_generated_tests(problem_data["id"])) + # invalid/not a coding problem + if test_cases is None or len(test_cases) == 0: + return None + # no code extracted + if not submission: + return no_submission_reward + + passed_test_cases = 0 + # run one batch, check if any of them failed (0 score): if so stop evaluating (assuming non partial score); otherwise continue with the next batch of test cases. + for test_batch_to_run in batched(test_cases, test_batch_size) if test_batch_size >= 1 else [test_cases]: + results = await asyncio.gather( + *[ + asyncio.create_task( + score_single_test_case( + client, problem_data, test_case["input"], test_case["output"], submission, submission_language + ) + ) + for test_case in test_batch_to_run + ] + ) + if any(result and result["compile"]["code"] != 0 for result in results): + return no_compile_reward + + tests_passed_results = [ + result and result["run"]["code"] == 0 and result["run"]["stdout"].strip() == "1" for result in results + ] + if scoring_mode == "pass_fail" and any(not test_passed for test_passed in tests_passed_results): + break + passed_test_cases += sum(1 for test_passed in tests_passed_results if test_passed) + + pass_fail_score = 1.0 if passed_test_cases == len(test_cases) else 0.0 + + if scoring_mode == "pass_fail": + return pass_fail_score + elif scoring_mode == "partial": + return passed_test_cases / len(test_cases) + elif scoring_mode == "weighted_sum": + return pass_fail_score + 0.1 * (passed_test_cases / len(test_cases)) + else: + raise ValueError(f"Invalid scoring mode: {scoring_mode}") diff --git a/src/open_r1/utils/competitive_programming/code_patcher.py b/src/open_r1/utils/competitive_programming/code_patcher.py new file mode 100644 index 000000000..4d5536020 --- /dev/null +++ b/src/open_r1/utils/competitive_programming/code_patcher.py @@ -0,0 +1,123 @@ +import re + + +def fix_python3_imports(source_code): + """ + Fix common import and function changes between Python 3 versions + + Args: + source_code (str): The Python source code to update + + Returns: + str: The updated source code + """ + # Dictionary of patterns to replacements + replacements = [ + # Fix collections.abc imports (changed in Python 3.3+) + ( + r"from collections import (Mapping|Sequence|Set|Container|MutableMapping|MutableSet|MutableSequence)", + r"from collections.abc 
import \1",
+        ),
+        # Fix imp module deprecation (deprecated in 3.4)
+        (r"import imp", r"import importlib"),
+        # Fix asyncio.async() to asyncio.ensure_future() (renamed in 3.4.4)
+        (r"asyncio\.async\(", r"asyncio.ensure_future("),
+        # Fix inspect.getargspec to inspect.getfullargspec (deprecated in 3.5)
+        (r"inspect\.getargspec", r"inspect.getfullargspec"),
+        # Fix array.array 'c' type code to 'b' (removed in 3.9)
+        (r"array\.array\('c'", r"array.array('b'"),
+        # Fix backslash line continuation with multiple newlines (Python-specific issue)
+        (r"\\(\r\n|\r|\n)+", "\\\n"),
+        # some solutions use getlogin() to check if they are debugging or on an actual submission
+        (r"(?:os\s*\.\s*)?getlogin\s*\(\s*\)", "False"),
+        # Fix usage of fractions.gcd (moved to math in 3.5)
+        # 1. Fix direct usage: fractions.gcd -> math.gcd
+        (r"\bfractions\.gcd\b", r"math.gcd"),
+        # 2. Fix 'from fractions import gcd, X' -> 'from fractions import X' (start/middle)
+        (r"(from\s+fractions\s+import\s+(?:\([^)]*)?)\bgcd\s*,\s*", r"\1"),
+        # 3. Fix 'from fractions import X, gcd' -> 'from fractions import X' (end)
+        (r"(from\s+fractions\s+import\s+.*?\S)\s*,\s*\bgcd(\s*\)?\s*(?:#.*)?)", r"\1\2"),
+        # 4. Fix standalone 'from fractions import gcd' -> 'from math import gcd'
+        (r"from\s+fractions\s+import\s+\(?\s*gcd\s*\)?", r""),
+        # --- End: Replacement for the faulty line ---
+    ]
+
+    lines = source_code.splitlines()
+    last_import = max(
+        [
+            i
+            for i, line in enumerate(lines)
+            if line.strip().startswith("import") or (line.strip().startswith("from") and "import" in line)
+        ],
+        default=0,
+    )
+    import_section = "\n".join(lines[: last_import + 1])
+    main_source = "\n".join(lines[last_import:])
+
+    if "fractions.gcd" in source_code and "import math" not in source_code:
+        import_section += "\nimport math"
+    elif "gcd" in source_code and "from math import gcd" not in source_code:
+        import_section += "\nfrom math import gcd"
+
+    if "set_int_max_str_digits" not in source_code:
+        import_section += "\nimport sys\nsys.set_int_max_str_digits(0)"
+
+    source_code = import_section + "\n" + main_source
+
+    # Apply each replacement
+    for pattern, replacement in replacements:
+        source_code = re.sub(pattern, replacement, source_code)
+
+    source_code = source_code.rstrip("\\")
+
+    return source_code
+
+
+def fix_cpp_includes(source_code):
+    # has most of the useful functions
+    code_header = "#include <bits/stdc++.h>\n"
+    # use namespace std since models forget std:: often
+    if "using namespace std;" not in source_code and "std::" not in source_code:
+        code_header += "\nusing namespace std;\n\n"
+    return code_header + source_code
+
+
+def is_patchable(lang):
+    return lang in ("python", "python3", "Python 3", "PyPy 3", "PyPy 3-64", "cpp") or "C++" in lang
+
+
+def patch_code(text, lang):
+    if not text:
+        return text
+    if lang in ("python", "python3", "Python 3", "PyPy 3", "PyPy 3-64"):
+        return fix_python3_imports(text)
+    elif "cpp" in lang or "C++" in lang:
+        return fix_cpp_includes(text)
+    return text
+
+
+tests = [
+    """read = lambda: map(int, input().split())
+n, m, z = read()
+from fractions import gcd
+ans = z // (n * m // gcd(n, m))
+print(ans)""",
+    """from fractions import Fraction,gcd
+
+a,b,c,d = [int(x) for x in input().split()]
+
+if a*d > b*c:
+    num = a*d-b*c
+    denom = a*d
+else:
+    num = b*c-a*d
+    denom = b*c
+div = gcd(num,denom)
+print('%d/%d'%(num//div,denom//div))""",
+]
+
+if __name__ == "__main__":
+    for test in tests:
+        print("ORIGINAL:", test, sep="\n\n")
+        print("PATCHED:", patch_code(test, "Python 3"), sep="\n\n")
+        print("=" * 50)
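For reference, here is a minimal usage sketch of the patcher above. It assumes the package from this repo is installed so that `open_r1.utils.competitive_programming.code_patcher` is importable; the sample submission and the assertions are illustrative only.

```python
# Illustrative only: patch a legacy Python 3 submission that still imports gcd
# from fractions (removed in Python 3.9) before sending it to the sandbox.
from open_r1.utils.competitive_programming.code_patcher import patch_code

legacy_submission = "from fractions import gcd\nprint(gcd(12, 18))\n"
patched = patch_code(legacy_submission, "Python 3")

# The deprecated import is rewritten to use math.gcd, and the int-to-str
# digit limit is lifted so huge integer outputs do not raise at print time.
assert "from math import gcd" in patched
assert "sys.set_int_max_str_digits(0)" in patched
print(patched)
```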
diff --git a/src/open_r1/utils/ioi/scoring.py b/src/open_r1/utils/competitive_programming/ioi_scoring.py similarity index 80% rename from src/open_r1/utils/ioi/scoring.py rename to src/open_r1/utils/competitive_programming/ioi_scoring.py index 1595fc602..357156c89 100644 --- a/src/open_r1/utils/ioi/scoring.py +++ b/src/open_r1/utils/competitive_programming/ioi_scoring.py @@ -2,8 +2,9 @@ from dataclasses import asdict, dataclass, field from typing import Union -from .piston_client import PistonClient -from .utils import batched, load_ioi_tests +from .ioi_utils import load_ioi_tests +from .piston_client import PistonClient, PistonError +from .utils import batched @dataclass @@ -54,16 +55,7 @@ def status(self): Returns: str: The status with the highest priority (lowest value) """ - status_prios = { - "CE": -1, - "RE": 0, - "WA": 1, - "MLE": 2, - "TLE": 3, - "PA": 4, - "AC": 5, - "SKIPPED": 999, - } + status_prios = {"CE": -1, "RE": 0, "WA": 1, "MLE": 2, "TLE": 3, "PA": 4, "AC": 5, "SKIPPED": 999} return min([x.status for x in self.test_results], key=lambda x: status_prios[x]) @property @@ -77,10 +69,7 @@ def score(self): return ( 0 if not self.test_results - else round( - min([test_result.score for test_result in self.test_results]), - self.score_precision, - ) + else round(min([test_result.score for test_result in self.test_results]), self.score_precision) ) @property @@ -95,8 +84,7 @@ def weighted_score(self): 0 if not self.test_results else round( - min([test_result.score for test_result in self.test_results]) * self.points, - self.score_precision, + min([test_result.score for test_result in self.test_results]) * self.points, self.score_precision ) ) @@ -148,12 +136,7 @@ def _extract_single_status(score: float, feedback: str) -> str: async def score_single_test_case( - client: PistonClient, - subtask: dict, - test_name: str, - test_input: str, - test_output: str, - submission: str, + client: PistonClient, subtask: dict, test_name: str, test_input: str, test_output: str, submission: str ) -> TestResult: """ Scores a single test case by running the submission against the provided input and output. 
@@ -174,10 +157,7 @@ async def score_single_test_case( score = float(score) return TestResult( - test_name=test_name, - score=score, - status=_extract_single_status(score, feedback), - feedback=feedback, + test_name=test_name, score=score, status=_extract_single_status(score, feedback), feedback=feedback ) @@ -219,11 +199,9 @@ async def score_subtask( # initialize test results with cached results or empty (SKIPPED) TestResult objects subtask_result.test_results = [ - ( - test_case_run_cache[test_name] - if test_case_run_cache is not None and test_name in test_case_run_cache - else TestResult(test_name=test_name) - ) + test_case_run_cache[test_name] + if test_case_run_cache is not None and test_name in test_case_run_cache + else TestResult(test_name=test_name) for test_name in subtask["test_names"] ] @@ -247,12 +225,7 @@ async def score_subtask( *[ asyncio.create_task( score_single_test_case( - client, - subtask, - test_name, - test_cases[test_name][0], - test_cases[test_name][1], - submission, + client, subtask, test_name, test_cases[test_name][0], test_cases[test_name][1], submission ) ) for _, test_name in test_batch_to_run @@ -292,11 +265,7 @@ async def score_subtasks( async def run_submission( - client: PistonClient, - problem: dict, - test_input: str, - submission: str, - test_output: str | None = None, + client: PistonClient, problem: dict, test_input: str, submission: str, test_output: str | None = None ) -> tuple[str, str]: """ Executes a submission against a test case using the Piston execution environment. @@ -327,4 +296,40 @@ async def run_submission( ), # +3 seconds hard limit. time limits are handled by the ioi script "run_memory_limit": problem["memory_limit"], } - return await client.execute(data) + return await execute_ioi(client, data) + + +async def execute_ioi(client, data) -> tuple[str, str]: + """ + Requests to the IOI package return the score as a float in the stdout, as well as optional feedback/errors in stderr. + Returns a tuple of (score, feedback). 
+ """ + response = await client.send_execute(data) + + if "message" in response: + raise PistonError(response["message"]) + + if "compile" in response and response["compile"]["code"] != 0: + return "0", "Compilation error exit code " + str(response["compile"]["code"]) + "\n" + response["compile"][ + "stderr" + ] + + if "run" not in response: + raise PistonError(response) + + if response["run"]["code"] == 1 and "MemoryError" in response["run"]["stderr"]: + return "0", "Memory limit exceeded" + + # successful result + if response["run"]["stdout"]: + return response["run"]["stdout"], response["run"]["stderr"] + + if response["run"]["signal"] == "SIGKILL": + return "0", "Time limit exceeded" + + # other issues + if response["run"]["code"] != 0: + raise PistonError( + f"language={response['language']}, version={response['version']}, exit code={response['run']['code']}, stderr={response['run']['stderr']}, signal={response['run']['signal']}" + ) + return "0", "Unknown error" diff --git a/src/open_r1/utils/ioi/utils.py b/src/open_r1/utils/competitive_programming/ioi_utils.py similarity index 77% rename from src/open_r1/utils/ioi/utils.py rename to src/open_r1/utils/competitive_programming/ioi_utils.py index 0719f4a3f..02fe2b39b 100644 --- a/src/open_r1/utils/ioi/utils.py +++ b/src/open_r1/utils/competitive_programming/ioi_utils.py @@ -1,6 +1,5 @@ from collections import defaultdict from functools import lru_cache -from itertools import islice from datasets import load_dataset @@ -31,10 +30,7 @@ def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") test_cases = defaultdict(dict) for test_case in tests_dataset: - test_cases[test_case["problem_id"]][test_case["test_name"]] = ( - test_case["test_input"], - test_case["test_output"], - ) + test_cases[test_case["problem_id"]][test_case["test_name"]] = test_case["test_input"], test_case["test_output"] return test_cases @@ -43,13 +39,3 @@ def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]: Load IOI tests for a given year and problem id. """ return load_ioi_tests_for_year(year)[problem_id] - - -def batched(iterable, n): - "Batch data into lists of length n. The last batch may be shorter." - # batched('ABCDEFG', 3) --> ABC DEF G - if n < 1: - return iterable - it = iter(iterable) - while batch := list(islice(it, n)): - yield batch diff --git a/src/open_r1/utils/ioi/morph_client.py b/src/open_r1/utils/competitive_programming/morph_client.py similarity index 100% rename from src/open_r1/utils/ioi/morph_client.py rename to src/open_r1/utils/competitive_programming/morph_client.py diff --git a/src/open_r1/utils/ioi/piston_client.py b/src/open_r1/utils/competitive_programming/piston_client.py similarity index 73% rename from src/open_r1/utils/ioi/piston_client.py rename to src/open_r1/utils/competitive_programming/piston_client.py index 86ebe9d13..7dfc9a5ec 100644 --- a/src/open_r1/utils/ioi/piston_client.py +++ b/src/open_r1/utils/competitive_programming/piston_client.py @@ -14,16 +14,23 @@ class PistonError(Exception): @lru_cache(maxsize=1) -def get_piston_client_from_env(): +def get_piston_client_from_env(session=None): piston_endpoints = os.getenv("PISTON_ENDPOINTS") if piston_endpoints is None: raise ValueError( - "For IOI problems Piston endpoints running our IOI package are required. Please add a list of valid Piston endpoints to a PISTON_ENDPOINTS varialbe in a `.env` file." 
+ "For IOI/CF problems Piston endpoints running our IOI package are required. Please add a list of valid Piston endpoints to a PISTON_ENDPOINTS variable in a `.env` file." ) - piston_endpoints = piston_endpoints.split(",") if piston_endpoints != "slurm" else get_slurm_piston_endpoints() + piston_endpoints = sorted( + piston_endpoints.split(",") if piston_endpoints != "slurm" else get_slurm_piston_endpoints() + ) + gpu_nb = int(os.getenv("LOCAL_RANK", 0)) # per‑GPU index + world = int(os.getenv("WORLD_SIZE", 1)) # total GPUs + if world > 1: + print(f"Using a subset of piston endpoints for GPU#{gpu_nb}") + piston_endpoints = piston_endpoints[gpu_nb::world] random.shuffle(piston_endpoints) max_requests_per_endpoint = os.getenv("PISTON_MAX_REQUESTS_PER_ENDPOINT", "1") - return PistonClient(piston_endpoints, max_requests_per_endpoint=int(max_requests_per_endpoint)) + return PistonClient(piston_endpoints, session, max_requests_per_endpoint=int(max_requests_per_endpoint)) class PistonClient: @@ -57,6 +64,8 @@ def __init__( ): self.max_requests_per_endpoint = max_requests_per_endpoint self.base_endpoints = [base_endpoint] if isinstance(base_endpoint, str) else base_endpoint + if len(self.base_endpoints) == 0: + raise ValueError("No Piston endpoints provided. Please check your PISTON_ENDPOINTS environment variable.") self.endpoint_ids = {endpoint: i for i, endpoint in enumerate(self.base_endpoints)} self._session = session @@ -73,7 +82,7 @@ def __init__( def session(self): if self._session is None: self._session = aiohttp.ClientSession( - timeout=aiohttp.ClientTimeout(sock_read=10), + timeout=aiohttp.ClientTimeout(sock_read=30), connector=aiohttp.TCPConnector( limit=self.max_requests_per_endpoint * len(self.base_endpoints), ttl_dns_cache=300, @@ -91,10 +100,7 @@ async def _release_endpoint(self, endpoint): async def _send_request(self, endpoint, route, data=None, method="post"): async with self.session.request( - method, - f"{endpoint.rstrip('/')}/{route}", - json=data, - headers={"Content-Type": "application/json"}, + method, f"{endpoint.rstrip('/')}/{route}", json=data, headers={"Content-Type": "application/json"} ) as response: return await response.json(content_type=None) @@ -115,45 +121,6 @@ async def uninstall_package(self, language, version): async def get_supported_runtimes(self): return await self._send_to_all("runtimes", method="get") - async def execute(self, data) -> tuple[str, str]: - """ - Requests to the IOI package return the score as a float in the stdout, as well as optional feedback/errors in stderr. - Returns a tuple of (score, feedback). 
- """ - response = await self._send_execute(data) - - if "message" in response: - raise PistonError(response["message"]) - - if "compile" in response and response["compile"]["code"] != 0: - return ( - "0", - "Compilation error exit code " - + str(response["compile"]["code"]) - + "\n" - + response["compile"]["stderr"], - ) - - if "run" not in response: - raise PistonError(response) - - if response["run"]["code"] == 1 and "MemoryError" in response["run"]["stderr"]: - return "0", "Memory limit exceeded" - - # successful result - if response["run"]["stdout"]: - return response["run"]["stdout"], response["run"]["stderr"] - - if response["run"]["signal"] == "SIGKILL": - return "0", "Time limit exceeded" - - # other issues - if response["run"]["code"] != 0: - raise PistonError( - f"language={response['language']}, version={response['version']}, exit code={response['run']['code']}, stderr={response['run']['stderr']}, signal={response['run']['signal']}" - ) - return "0", "Unknown error" - async def _check_failed_endpoint(self, endpoint): async with self._endpoint_failures_lock: if endpoint in self._unhealthy_endpoints: @@ -164,14 +131,15 @@ async def _check_failed_endpoint(self, endpoint): except Exception as e: print(f"Error checking endpoint {endpoint}, dropping it ({e})") self._unhealthy_endpoints.add(endpoint) + if len(self._unhealthy_endpoints) >= len(self.base_endpoints): + raise PistonError("All endpoints are unhealthy. Please check your Piston workers.") - async def _send_execute(self, data): + async def send_execute(self, data, language="cms_ioi", max_retries=5): data = data | { - "language": "cms_ioi", + "language": language, "version": "*", } - max_retries = 5 base_delay = 1.0 status = None @@ -183,15 +151,13 @@ async def _send_execute(self, data): if attempt > 0: await asyncio.sleep(1) async with self.session.post( - f"{endpoint.rstrip('/')}/execute", - json=data, - headers={"Content-Type": "application/json"}, + f"{endpoint.rstrip('/')}/execute", json=data, headers={"Content-Type": "application/json"} ) as response: status = response.status res_json = await response.json(content_type=None) if status != 200: - raise PistonError(f"Server error. status={status}") + raise PistonError(f"Server error. status={status}. {res_json}") if res_json is None: raise PistonError(f"Empty response. status={status}") # piston overloaded @@ -199,19 +165,14 @@ async def _send_execute(self, data): raise PistonError(f"Piston overloaded: {res_json['run']['stderr']}") return res_json - except ( - PistonError, - asyncio.TimeoutError, - aiohttp.ClientConnectionError, - RuntimeError, - ) as e: + except (PistonError, asyncio.TimeoutError, aiohttp.ClientConnectionError, RuntimeError) as e: # Only retry if we haven't reached max retries yet if attempt < max_retries: # Calculate backoff with jitter delay = min(base_delay * (2**attempt), 10) # Exponential backoff, capped at 10 seconds jitter = delay * 0.2 * (2 * asyncio.get_event_loop().time() % 1 - 0.5) # Add ±10% jitter retry_delay = delay + jitter - print(f"Retrying in {retry_delay} seconds [{self.endpoint_ids[endpoint]}] {endpoint}") + print(f"Retrying in {retry_delay:.2f} seconds [{self.endpoint_ids[endpoint]}] {endpoint} - {e}") # special case: worker died if isinstance(e, aiohttp.ClientConnectionError) and "Connect call failed" in str(e): @@ -223,8 +184,7 @@ async def _send_execute(self, data): await asyncio.sleep(retry_delay) else: - print(f"Giving up on retries. 
{e}") - raise e + await self._check_failed_endpoint(endpoint) except Exception as e: print(f"Propagating exception {type(e)}: {e}") raise e @@ -242,9 +202,7 @@ def get_slurm_piston_endpoints(): """Get list of active piston worker endpoints from squeue output""" # Run squeue command to get job name, hostname and status, filtering for RUNNING state result = subprocess.run( - ["squeue", '--format="%j %N %T"', "--noheader", "--states=RUNNING"], - capture_output=True, - text=True, + ["squeue", '--format="%j %N %T"', "--noheader", "--states=RUNNING"], capture_output=True, text=True ) # Split output into lines and skip header diff --git a/src/open_r1/utils/competitive_programming/utils.py b/src/open_r1/utils/competitive_programming/utils.py new file mode 100644 index 000000000..7e1bf730f --- /dev/null +++ b/src/open_r1/utils/competitive_programming/utils.py @@ -0,0 +1,11 @@ +from itertools import islice + + +def batched(iterable, n): + "Batch data into lists of length n. The last batch may be shorter." + # batched('ABCDEFG', 3) --> ABC DEF G + if n < 1: + return iterable + it = iter(iterable) + while batch := list(islice(it, n)): + yield batch From 57e85b522f58c5abdb750495f49679cf338725b3 Mon Sep 17 00:00:00 2001 From: lewtun Date: Sun, 25 May 2025 13:24:52 +0200 Subject: [PATCH 128/137] Add better logging defaults for GRPO (#657) --- setup.py | 2 +- src/open_r1/configs.py | 15 +++++++++++---- .../utils/competitive_programming/cf_scoring.py | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 531a9e502..500c3c2f5 100644 --- a/setup.py +++ b/setup.py @@ -116,7 +116,7 @@ def deps_list(*pkgs): deps["transformers"], deps["trl"], deps["wandb"], - deps["async-lru"] + deps["async-lru"], ] setup( diff --git a/src/open_r1/configs.py b/src/open_r1/configs.py index 86a3d0524..ddb6e53b0 100644 --- a/src/open_r1/configs.py +++ b/src/open_r1/configs.py @@ -136,15 +136,22 @@ class GRPOConfig(trl.GRPOConfig): metadata={"help": "The callbacks to run during training."}, ) chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."}) - system_prompt: Optional[str] = field( - default=None, - metadata={"help": "The optional system prompt to use."}, - ) hub_model_revision: Optional[str] = field( default="main", metadata={"help": "The Hub model branch to push the model to."} ) + num_completions_to_print: int = field(default=0, metadata={"help": "Number of completions to print."}) overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."}) push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."}) + system_prompt: Optional[str] = field( + default=None, + metadata={"help": "The optional system prompt to use."}, + ) + wandb_log_unique_prompts: bool = field( + default=True, + metadata={ + "help": ("Whether to log the unique prompts to wandb. 
This will create a new run for each unique prompt.") + }, + ) wandb_entity: Optional[str] = field( default=None, metadata={"help": ("The entity to store runs under.")}, diff --git a/src/open_r1/utils/competitive_programming/cf_scoring.py b/src/open_r1/utils/competitive_programming/cf_scoring.py index ae1bbe5e0..d3ede4f7e 100644 --- a/src/open_r1/utils/competitive_programming/cf_scoring.py +++ b/src/open_r1/utils/competitive_programming/cf_scoring.py @@ -61,6 +61,7 @@ async def get_generated_contest_tests(contest_id: str) -> list[dict]: import aiofiles import aiofiles.os + tests_folder = os.environ.get("CF_TESTS_FOLDER", None) if not tests_folder: raise ValueError( From 5ac5971ea5603c72a2722978424ec05f3dd49558 Mon Sep 17 00:00:00 2001 From: lewtun Date: Mon, 26 May 2025 17:57:44 +0200 Subject: [PATCH 129/137] Add OpenR1-Distill recipe (#661) --- README.md | 26 +++++----- .../OpenR1-Distill-7B/sft/config_distill.yaml | 48 +++++++++++++++++++ recipes/OpenR1-Qwen-7B/sft/config.yaml | 48 ------------------- .../sft/config_demo.yaml | 7 +-- recipes/README.md | 8 ++++ 5 files changed, 75 insertions(+), 62 deletions(-) create mode 100644 recipes/OpenR1-Distill-7B/sft/config_distill.yaml delete mode 100644 recipes/OpenR1-Qwen-7B/sft/config.yaml diff --git a/README.md b/README.md index a34cd32b9..3802efbcf 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ We will use the DeepSeek-R1 [tech report](https://github.com/deepseek-ai/DeepSee ## News 🗞️ +* **🧑‍🍳 [2025/05/26] (Step 1 completed!)** We release [**Mixture-of-Thoughts**](https://huggingface.co/datasets/open-r1/Mixture-of-Thoughts)--a curated reasoning dataset of 350k verified traces distilled from R1. The dataset spans tasks in mathematics, coding, and science, and is designed to teach language models to reason step-by-step. We also provide a recipe to train [OpenR1-Distill-7B](https://huggingface.co/open-r1/OpenR1-Distill-7B), which replicates the reasoning capabilities of [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) and marks the completion of step 1 in the Open R1 project. * **⚡️ [2025/03/11] [(update #3)](https://huggingface.co/blog/open-r1/update-3):** We release the [**CodeForces-CoTs**](https://huggingface.co/datasets/open-r1/codeforces-cots) dataset of 10k competitive programming problems and 100k solutions distilled from R1. We also release IOI24: a new benchmark of _very_ hard problems from international olympiads. A 7B Qwen model trained on CodeForces-CoTs can outperform Claude 3.7 Sonnet on IOI24, while a 32B model can outperform R1 itself. * **∞ [2025/02/10] [(update #2)](https://huggingface.co/blog/open-r1/update-2):** We release the [**OpenR1-Math-220k**](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) dataset of 220k traces distilled from R1 on a new version of NuminaMath. Models trained on this dataset match the performance of DeepSeek's distilled ones. * **🔥 [2025/02/02] [(update #1)](https://huggingface.co/blog/open-r1/update-1):** We implement the first parts of the [training](https://github.com/huggingface/open-r1?tab=readme-ov-file#training-models), [inference](https://github.com/huggingface/open-r1?tab=readme-ov-file#data-generation), and [evaluation](https://github.com/huggingface/open-r1?tab=readme-ov-file#reproducing-deepseeks-evaluation-results) pipelines. Let's go! @@ -103,14 +104,15 @@ sudo apt-get install git-lfs > [!NOTE] > The training commands below are configured for a node of 8 x H100s (80GB). 
For different hardware and topologies, you may need to tune the batch size and number of gradient accumulation steps. -We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). For example, to run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k), run: +We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). For example, to run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/Mixture-of-Thoughts](https://huggingface.co/datasets/open-r1/Mixture-of-Thoughts), run: ```shell # Train via command line accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ --model_name_or_path Qwen/Qwen2.5-1.5B-Instruct \ - --dataset_name open-r1/OpenR1-Math-220k \ - --learning_rate 5.0e-5 \ + --dataset_name open-r1/Mixture-of-Thoughts \ + --dataset_config all \ + --learning_rate 4.0e-5 \ --num_train_epochs 1 \ --max_seq_length 16384 \ --per_device_train_batch_size 16 \ @@ -158,10 +160,11 @@ Most base models like `meta-llama/Llama-3.2-1B` do not have a chat template, so accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ --model_name_or_path Qwen/Qwen2.5-1.5B \ + --eos_token '<|im_end|>' - --dataset_name open-r1/OpenR1-Math-220k \ - --learning_rate 5.0e-5 \ + --dataset_name open-r1/Mixture-of-Thoughts \ + --dataset_config all \ + --learning_rate 4.0e-5 \ --num_train_epochs 1 \ - --max_seq_length 16384 \ + --max_seq_length 32768 \ --per_device_train_batch_size 16 \ --gradient_checkpointing \ --bf16 \ @@ -177,10 +180,11 @@ accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r --model_name_or_path meta-llama/Llama-3.2-1B \ + --chat_template "$(cat llama_chat_template.jinja)" \ + --eos_token '<|eot_id|>' \ - --dataset_name open-r1/OpenR1-Math-220k \ - --learning_rate 5.0e-5 \ + --dataset_name open-r1/Mixture-of-Thoughts \ + --dataset_config all \ + --learning_rate 4.0e-5 \ --num_train_epochs 1 \ - --max_seq_length 16384 \ + --max_seq_length 32768 \ --per_device_train_batch_size 16 \ --gradient_checkpointing \ --bf16 \ @@ -190,12 +194,12 @@ accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r ### SFT -To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k), run: +To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/Mixture-of-Thoughts](https://huggingface.co/datasets/open-r1/Mixture-of-Thoughts), run: ```shell ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ src/open_r1/sft.py \ - --config recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml + --config recipes/OpenR1-Distill-7B/sft/config_distill.yaml ``` ### GRPO diff --git a/recipes/OpenR1-Distill-7B/sft/config_distill.yaml b/recipes/OpenR1-Distill-7B/sft/config_distill.yaml new file mode 100644 index 000000000..a11e8a257 --- /dev/null +++ b/recipes/OpenR1-Distill-7B/sft/config_distill.yaml @@ -0,0 +1,48 @@ +# Config for 1 node of 8 x H100s (80GB) +# Model arguments +model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +chat_template: "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] 
}}\n {%- else %}\n {{- 'You are Open-R1, a language model trained by Hugging Face to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: Thought section Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Open-R1, a language model trained by Hugging Face to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: Thought section Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. 
Now, try to solve the following question through the above guidelines.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n" +dataset_name: open-r1/Mixture-of-Thoughts +dataset_config: all +dataset_num_proc: 12 +eos_token: <|im_end|> + +# SFT trainer config +bf16: true +do_eval: false +eval_strategy: 'no' +gradient_accumulation_steps: 8 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: OpenR1-Distill-7B +hub_strategy: every_save +learning_rate: 4.0e-05 +log_level: info +logging_steps: 1 +logging_strategy: steps +lr_scheduler_type: cosine_with_min_lr +lr_scheduler_kwargs: + min_lr_rate: 0.1 +packing: false +max_grad_norm: 0.2 +max_length: 32768 +max_steps: -1 +num_train_epochs: 5 +output_dir: data/OpenR1-Distill-7B +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 2 +push_to_hub: true +report_to: +- wandb +save_strategy: epoch +save_total_limit: 1 +seed: 42 +use_liger: true +warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/OpenR1-Qwen-7B/sft/config.yaml b/recipes/OpenR1-Qwen-7B/sft/config.yaml deleted file mode 100644 index 9cc06c9f7..000000000 --- a/recipes/OpenR1-Qwen-7B/sft/config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# Model arguments -# You need to download the model and manually change the rope to 300k and max_position_embeddings to 32768 -# the config file should match https://huggingface.co/open-r1/OpenR1-Qwen-7B/blob/main/config.json -model_name_or_path: Qwen/Qwen2.5-Math-7B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: sdpa - -# Data training arguments -dataset_name: open-r1/OpenR1-Math-220k -dataset_num_proc: 48 - -#SFT hyperparam -max_length: 32768 -weight_decay: 0.0001 -optim: adamw_torch -lr_scheduler_type: linear -warmup_ratio: 0.1 -learning_rate: 5.0e-05 -gradient_accumulation_steps: 2 -per_device_eval_batch_size: 1 -per_device_train_batch_size: 1 - -# SFT trainer config -max_steps: -1 -num_train_epochs: 3 -bf16: true -do_eval: false -use_liger_kernel: true -eval_strategy: 'no' -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: OpenR1-Qwen-7B-SFT -hub_strategy: every_save -log_level: info -logging_steps: 5 -logging_strategy: steps -packing: false -output_dir: data/OpenR1-Qwen-7B-SFT -overwrite_output_dir: true 
-push_to_hub: true -report_to: -- wandb -save_strategy: "steps" -save_steps: 500 -save_total_limit: 1 -seed: 42 \ No newline at end of file diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml index fabe9ed7f..c31e41e63 100644 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml @@ -5,7 +5,8 @@ torch_dtype: bfloat16 attn_implementation: flash_attention_2 # Data training arguments -dataset_name: open-r1/OpenR1-Math-220k +dataset_name: open-r1/Mixture-of-Thoughts +dataset_config: all dataset_num_proc: 48 # SFT trainer config @@ -18,7 +19,7 @@ gradient_checkpointing_kwargs: use_reentrant: false hub_model_id: Qwen2.5-1.5B-Open-R1-Distill hub_strategy: every_save -learning_rate: 5.0e-05 +learning_rate: 4.0e-05 log_level: info logging_steps: 5 logging_strategy: steps @@ -26,7 +27,7 @@ lr_scheduler_type: cosine_with_min_lr lr_scheduler_kwargs: min_lr_rate: 0.1 packing: false -max_length: 16384 +max_length: 32768 max_steps: -1 num_train_epochs: 1 output_dir: data/Qwen2.5-1.5B-Open-R1-Distill diff --git a/recipes/README.md b/recipes/README.md index 445eb78d0..4301fd66e 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -1,5 +1,13 @@ # Post-training recipes +## OpenR1 Distill 7B + +To train the OpenR1 Distill 7B model, run: + +``` +sbatch --nodes=1 slurm/train.slurm --model OpenR1-Distill-7B --task sft --config distill --accelerator zero3 +``` + ## OlympicCoder To train the OlympicCoder models, run: From 9eef995b4d9a3df5cbf98641efae12609d0267c2 Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 27 May 2025 15:38:21 +0200 Subject: [PATCH 130/137] Bump deps (#656) --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 500c3c2f5..27ac9eed6 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ "jieba", # Needed for Chinese language support "langdetect", # Needed for LightEval's extended tasks "latex2sympy2_extended>=1.0.6", - "liger-kernel>=0.5.9", + "liger-kernel>=0.5.10", "lighteval @ git+https://github.com/huggingface/lighteval.git@d3da6b9bbf38104c8b5e1acc86f83541f9a502d1", # Critical bug fix for tokenizer revisions: https://github.com/huggingface/lighteval/pull/721 "math-verify==0.5.2", # Used for math verification in grpo "morphcloud==0.1.67", @@ -68,8 +68,8 @@ "safetensors>=0.3.3", "sentencepiece>=0.1.99", "torch==2.6.0", - "transformers @ git+https://github.com/huggingface/transformers.git@acdbe627e323dbc822f21499fead789b439cf45b", # Fix DeepSpeed x vLLM conflict: https://github.com/huggingface/transformers/pull/37755 - "trl[vllm] @ git+https://github.com/huggingface/trl.git@1bca49515ecd5b85d16e68c42c76670e252e19f1", # Fix DeepSpeed x vLLM conflict: https://github.com/huggingface/trl/pull/3351 + "transformers==4.52.3", + "trl[vllm] @ git+https://github.com/huggingface/trl.git@9ac614fb081e17805f7f62ab3f5f7036bdefe7b0", # Support for activation offload: https://github.com/huggingface/trl/pull/2954 "wandb>=0.19.1", "async-lru>=2.0.5", "aiofiles>=24.1.0", From 33f84def0d20f5abc5281804f7fc91fbf276850c Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 27 May 2025 17:20:13 +0200 Subject: [PATCH 131/137] Align EOS token ID between tokenizer and generation config (#663) * Align EOS token ID between tokenizer and generation config * Fix --- README.md | 42 +++++++++------ .../OpenR1-Distill-7B/sft/config_distill.yaml | 2 +- .../sft/config_demo.yaml | 45 ---------------- .../grpo/config_simple_rl.yaml | 52 ------------------- 
src/open_r1/grpo.py | 3 ++ src/open_r1/sft.py | 41 ++++++--------- 6 files changed, 46 insertions(+), 139 deletions(-) delete mode 100644 recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml delete mode 100644 recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml diff --git a/README.md b/README.md index 3802efbcf..e35a58cc9 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,9 @@ The goal of this repo is to build the missing pieces of the R1 pipeline such that everybody can reproduce and build on top of it. The project is simple by design and mostly consists of: -- `src/open_r1`: contains the scripts to train and evaluate models as well as generate synthetic data: +- `src/open_r1`: contains the scripts to train models as well as generate synthetic data: - `grpo.py`: trains a model with GRPO on a given dataset. - `sft.py`: performs a simple SFT of a model on a dataset. - - `evaluate.py`: evaluates a model on the R1 benchmarks. - `generate.py`: generates synthetic data from a model using [Distilabel](https://github.com/argilla-io/distilabel). - `Makefile`: contains easy-to-run commands for each step in the R1 pipeline leveraging the scripts above. @@ -104,26 +103,27 @@ sudo apt-get install git-lfs > [!NOTE] > The training commands below are configured for a node of 8 x H100s (80GB). For different hardware and topologies, you may need to tune the batch size and number of gradient accumulation steps. -We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). For example, to run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/Mixture-of-Thoughts](https://huggingface.co/datasets/open-r1/Mixture-of-Thoughts), run: +We support training models with either DDP or DeepSpeed (ZeRO-2 and ZeRO-3). For example, to perform SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/Mixture-of-Thoughts](https://huggingface.co/datasets/open-r1/Mixture-of-Thoughts), run: ```shell # Train via command line accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ - --model_name_or_path Qwen/Qwen2.5-1.5B-Instruct \ + --model_name_or_path open-r1/Qwen2.5-Math-7B-RoPE-300k \ --dataset_name open-r1/Mixture-of-Thoughts \ --dataset_config all \ + --eos_token '<|im_end|>' \ --learning_rate 4.0e-5 \ - --num_train_epochs 1 \ - --max_seq_length 16384 \ - --per_device_train_batch_size 16 \ + --num_train_epochs 5 \ + --max_seq_length 32768 \ + --per_device_train_batch_size 2 \ --gradient_checkpointing \ --bf16 \ --use_liger_kernel \ - --output_dir data/Qwen2.5-1.5B-Open-R1-Distill + --output_dir data/OpenR1-Distill-7B # Train via YAML config accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ - --config recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml + --config recipes/OpenR1-Distill-7B/sft/config_distill.yaml ``` Currently, the following tasks are supported: @@ -137,17 +137,18 @@ Currently, the following tasks are supported: By default, these scripts will push each model to your Hugging Face Hub username, i.e. `{username}/{model_name}-{task}`. 
You can override the parameters in each YAML config by appending them to the command as follows: ```shell -# Change batch size, number of epochs etc +# Change the base model to a smaller variant accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ - --config recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml - --per_device_train_batch_size=1 --num_train_epochs=5 + --config recipes/OpenR1-Distill-7B/sft/config_distill.yaml \ + --model_name_or_path Qwen/Qwen3-0.6-Base \ + --hub_model_id OpenR1-Distill-0.6B ``` If you also wish to override the Weights and Biases default settings, you can do so as follows: ```shell accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ - --config recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml + --config recipes/OpenR1-Distill-7B/sft/config_distill.yaml --wandb_entity huggingface --wandb_project open-r1 --run_name Qwen2.5-1.5B-GRPO ``` @@ -192,9 +193,9 @@ accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r --output_dir data/Llama-3.2-1B-Open-R1-Distill ``` -### SFT +### SFT distillation -To run SFT on a dataset distilled from DeepSeek-R1 with reasoning traces such as [open-r1/Mixture-of-Thoughts](https://huggingface.co/datasets/open-r1/Mixture-of-Thoughts), run: +We provide a recipe to reproduce the reasoning capabilities of [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B), starting from the same base model. To do so, run: ```shell ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \ @@ -202,6 +203,15 @@ ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_con --config recipes/OpenR1-Distill-7B/sft/config_distill.yaml ``` +The result will be a model like [open-r1/OpenR1-Distill-7B](https://huggingface.co/open-r1/OpenR1-Distill-7B), with the following downstream performance: + +| Model | AIME 2024 | MATH-500 | GPQA Diamond | LiveCodeBench v5 | +|-----------------------------|-----------|----------|--------------|------------------| +| OpenR1-Distill-7B | 52.7 | 89.0 | 52.8 | 39.4 | +| DeepSeek-R1-Distill-Qwen-7B | 51.3 | 93.5 | 52.4 | 37.4 | + +You can adjust the YAML config to train on a different base model or dataset. + ### GRPO We use TRL's [vLLM backend](https://huggingface.co/docs/trl/speeding_up_training?vllm+examples=GRPO#vllm-for-fast-generation-in-online-methods) to scale training to large models across multiple nodes. For single-node training of smol models across 8 GPUs, first spin up the vLLM server to run on e.g. 1 GPU as follows: @@ -415,7 +425,7 @@ sbatch --job-name=open_r1 --nodes=1 slurm/train.slurm --model {model_name} --tas Here `{model_name}` and `{task}` are defined as above, while `{config_suffix}` refers to the specific config and `{accelerator}` refers to the choice of 🤗 Accelerate config in `recipes/accelerate_configs`. If you wish to override the default config parameters, you can provide them by appending a space-separated string like `'--arg1=value1 --arg2=value2'`. Here's a concrete example to run SFT on 1 node of 8 GPUs: ```shell -sbatch --job-name=open_r1 --nodes=1 slurm/train.slurm --model Qwen2.5-1.5B-Instruct --task sft --config demo --accelerator zero3 +sbatch --job-name=open_r1 --nodes=1 slurm/train.slurm --model OpenR1-Distill-7B --task sft --config distill --accelerator zero3 ``` You can scale the number of nodes by increasing the `--nodes` flag. 
diff --git a/recipes/OpenR1-Distill-7B/sft/config_distill.yaml b/recipes/OpenR1-Distill-7B/sft/config_distill.yaml index a11e8a257..44d9c09f6 100644 --- a/recipes/OpenR1-Distill-7B/sft/config_distill.yaml +++ b/recipes/OpenR1-Distill-7B/sft/config_distill.yaml @@ -44,5 +44,5 @@ report_to: save_strategy: epoch save_total_limit: 1 seed: 42 -use_liger: true +use_liger_kernel: true warmup_ratio: 0.03 \ No newline at end of file diff --git a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml b/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml deleted file mode 100644 index c31e41e63..000000000 --- a/recipes/Qwen2.5-1.5B-Instruct/sft/config_demo.yaml +++ /dev/null @@ -1,45 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: open-r1/Mixture-of-Thoughts -dataset_config: all -dataset_num_proc: 48 - -# SFT trainer config -bf16: true -do_eval: false -eval_strategy: 'no' -gradient_accumulation_steps: 1 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: Qwen2.5-1.5B-Open-R1-Distill -hub_strategy: every_save -learning_rate: 4.0e-05 -log_level: info -logging_steps: 5 -logging_strategy: steps -lr_scheduler_type: cosine_with_min_lr -lr_scheduler_kwargs: - min_lr_rate: 0.1 -packing: false -max_length: 32768 -max_steps: -1 -num_train_epochs: 1 -output_dir: data/Qwen2.5-1.5B-Open-R1-Distill -overwrite_output_dir: true -per_device_eval_batch_size: 16 -per_device_train_batch_size: 16 -push_to_hub: true -report_to: -- wandb -save_strategy: "steps" -save_steps: 100 -save_total_limit: 1 -seed: 42 -use_liger_kernel: true -warmup_ratio: 0.05 \ No newline at end of file diff --git a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml b/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml deleted file mode 100644 index d707693d3..000000000 --- a/recipes/Qwen2.5-Math-7B/grpo/config_simple_rl.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# Model arguments -model_name_or_path: Qwen/Qwen2.5-Math-7B -model_revision: main -torch_dtype: bfloat16 -attn_implementation: flash_attention_2 - -# Data training arguments -dataset_name: DigitalLearningGmbH/MATH-lighteval -dataset_config: default -dataset_prompt_column: problem -system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within and tags." 
- -# GRPO trainer config -bf16: true -use_vllm: true -do_eval: true -eval_strategy: steps -eval_steps: 100 -gradient_accumulation_steps: 8 -gradient_checkpointing: true -gradient_checkpointing_kwargs: - use_reentrant: false -hub_model_id: Qwen-2.5-7B-Simple-RL -hub_strategy: every_save -learning_rate: 3.0e-06 -log_completions: true -log_level: info -logging_first_step: true -logging_steps: 5 -logging_strategy: steps -lr_scheduler_type: cosine -max_prompt_length: 512 -max_completion_length: 1024 -max_steps: -1 -num_generations: 7 -num_train_epochs: 1 -output_dir: data/Qwen-2.5-7B-Simple-RL -overwrite_output_dir: true -per_device_eval_batch_size: 16 -per_device_train_batch_size: 16 -push_to_hub: true -report_to: -- wandb -reward_funcs: -- accuracy -- format -reward_weights: -- 1.0 -- 1.0 -save_strategy: "no" -seed: 42 -warmup_ratio: 0.1 diff --git a/src/open_r1/grpo.py b/src/open_r1/grpo.py index a7385361c..44865b848 100644 --- a/src/open_r1/grpo.py +++ b/src/open_r1/grpo.py @@ -140,6 +140,9 @@ def make_conversation(example, prompt_column: str = script_args.dataset_prompt_c # Save model and create model card ################################## logger.info("*** Save model ***") + # Align the model's generation config with the tokenizer's eos token + # to avoid unbounded generation in the transformers `pipeline()` function + trainer.model.generation_config.eos_token_id = tokenizer.eos_token_id trainer.save_model(training_args.output_dir) logger.info(f"Model saved to {training_args.output_dir}") diff --git a/src/open_r1/sft.py b/src/open_r1/sft.py index e257cd936..c11c023ca 100644 --- a/src/open_r1/sft.py +++ b/src/open_r1/sft.py @@ -19,20 +19,18 @@ # One 1 node of 8 x H100s accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ - --model_name_or_path Qwen/Qwen2.5-1.5B-Instruct \ - --dataset_name open-r1/OpenR1-Math-220k \ - --learning_rate 2.0e-5 \ - --num_train_epochs 1 \ - --packing \ - --max_seq_length 4096 \ + --model_name_or_path open-r1/Qwen2.5-Math-7B-RoPE-300k \ + --dataset_name open-r1/Mixture-of-Thoughts \ + --dataset_config all \ + --eos_token '<|im_end|>' \ + --learning_rate 4.0e-5 \ + --num_train_epochs 5 \ + --max_seq_length 32768 \ --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 8 \ --gradient_checkpointing \ --bf16 \ - --logging_steps 5 \ - --eval_strategy steps \ - --eval_steps 100 \ - --output_dir data/Qwen2.5-1.5B-Open-R1-Distill + --use_liger_kernel \ + --output_dir data/OpenR1-Distill-7B """ import logging @@ -55,7 +53,6 @@ def main(script_args, training_args, model_args): - # Set seed for reproducibility set_seed(training_args.seed) ############### @@ -87,24 +84,15 @@ def main(script_args, training_args, model_args): if "wandb" in training_args.report_to: init_wandb_training(training_args) - ################ - # Load datasets - ################ + ###################################### + # Load dataset, tokenizer, and model # + ###################################### dataset = get_dataset(script_args) - - ################ - # Load tokenizer - ################ tokenizer = get_tokenizer(model_args, training_args) - - ################### - # Load model - ################### - logger.info("*** Loading model ***") model = get_model(model_args, training_args) if tokenizer.chat_template is None: - logger.info("No chat template provided, using ChatML.") + logger.info("No chat template provided, defaulting to ChatML.") model, tokenizer = setup_chat_format(model, tokenizer, format="chatml") ############################ @@ -140,6 
+128,9 @@ def main(script_args, training_args, model_args): # Save model and create model card ################################## logger.info("*** Save model ***") + # Align the model's generation config with the tokenizer's eos token + # to avoid unbounded generation in the transformers `pipeline()` function + trainer.model.generation_config.eos_token_id = tokenizer.eos_token_id trainer.save_model(training_args.output_dir) logger.info(f"Model saved to {training_args.output_dir}") From 722f144d21f4ea19b169221e94e61ee88a85e3ca Mon Sep 17 00:00:00 2001 From: lewtun Date: Tue, 27 May 2025 19:21:15 +0200 Subject: [PATCH 132/137] Refresh Weka on Slurm (#662) * Refresh Weka on Slurm * Include current working dir --- slurm/evaluate.slurm | 5 +++++ slurm/train.slurm | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index d10a70a7d..6484a2c9e 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -12,6 +12,11 @@ # Be ye warned this may not work on other clusters! module load cuda/12.4 +# Refresh Weka on h4 cache & current working directory +echo "Refreshing Weka filesystem..." +find -L /fsx/h4/ -type f | xargs -d '\n' -r -n512 -P64 weka fs tier fetch +find -L . -type f -not -path "./data/*" | xargs -d '\n' -r -n512 -P64 weka fs tier fetch + # Needed for vLLM export VLLM_WORKER_MULTIPROC_METHOD=spawn diff --git a/slurm/train.slurm b/slurm/train.slurm index 31b0601d6..f14a4827c 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -32,6 +32,11 @@ source openr1/bin/activate START_TIME=$(date +%s) echo "START TIME: $(date)" +# Refresh Weka on h4 cache & current working directory +echo "Refreshing Weka filesystem..." +find -L /fsx/h4/ -type f | xargs -d '\n' -r -n512 -P64 weka fs tier fetch +find -L . -type f -not -path "./data/*" | xargs -d '\n' -r -n512 -P64 weka fs tier fetch + # Default values MODEL="" TASK="" From 01b4351c4549887d0311f02993402d5577b88051 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 28 May 2025 09:23:12 +0200 Subject: [PATCH 133/137] Set DP=2 for smol model evals (#664) * Set DP=2 for smol model evals Temporary hack while the HF cluster is at max capacity :) * Style --- src/open_r1/utils/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py index b985d9a71..e79cd2972 100644 --- a/src/open_r1/utils/evaluation.py +++ b/src/open_r1/utils/evaluation.py @@ -79,7 +79,7 @@ def run_lighteval_job( if get_param_count_from_repo_id(model_name) >= 30_000_000_000: tensor_parallel = True else: - num_gpus = 8 + num_gpus = 2 # Hack while cluster is full tensor_parallel = False cmd = VLLM_SLURM_PREFIX.copy() From a6b4f668fb990f0341f0fdf942e73132de7fcec5 Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 28 May 2025 13:45:48 +0200 Subject: [PATCH 134/137] Fix Weka refresh (#666) * Fix Weka refresh * Update evaluate.slurm --- slurm/evaluate.slurm | 3 +-- slurm/train.slurm | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/slurm/evaluate.slurm b/slurm/evaluate.slurm index 6484a2c9e..dbfbb33f6 100644 --- a/slurm/evaluate.slurm +++ b/slurm/evaluate.slurm @@ -12,10 +12,9 @@ # Be ye warned this may not work on other clusters! module load cuda/12.4 -# Refresh Weka on h4 cache & current working directory +# Refresh Weka on h4 cache echo "Refreshing Weka filesystem..." find -L /fsx/h4/ -type f | xargs -d '\n' -r -n512 -P64 weka fs tier fetch -find -L . 
-type f -not -path "./data/*" | xargs -d '\n' -r -n512 -P64 weka fs tier fetch # Needed for vLLM export VLLM_WORKER_MULTIPROC_METHOD=spawn diff --git a/slurm/train.slurm b/slurm/train.slurm index f14a4827c..15a70d62c 100644 --- a/slurm/train.slurm +++ b/slurm/train.slurm @@ -32,10 +32,9 @@ source openr1/bin/activate START_TIME=$(date +%s) echo "START TIME: $(date)" -# Refresh Weka on h4 cache & current working directory +# Refresh Weka on h4 cache echo "Refreshing Weka filesystem..." find -L /fsx/h4/ -type f | xargs -d '\n' -r -n512 -P64 weka fs tier fetch -find -L . -type f -not -path "./data/*" | xargs -d '\n' -r -n512 -P64 weka fs tier fetch # Default values MODEL="" @@ -180,4 +179,4 @@ ELAPSED_SECONDS=$((END_TIME - START_TIME)) HOURS=$((ELAPSED_SECONDS / 3600)) MINUTES=$(( (ELAPSED_SECONDS % 3600) / 60 )) SECONDS=$((ELAPSED_SECONDS % 60)) -echo "TOTAL JOB TIME: ${HOURS}h ${MINUTES}m ${SECONDS}s (${ELAPSED_SECONDS} seconds)" \ No newline at end of file +echo "TOTAL JOB TIME: ${HOURS}h ${MINUTES}m ${SECONDS}s (${ELAPSED_SECONDS} seconds)" From b806e1092ad490138027e6049e82ef2bb2497a7f Mon Sep 17 00:00:00 2001 From: lewtun Date: Wed, 28 May 2025 13:47:25 +0200 Subject: [PATCH 135/137] Bump vLLM and TRL (#665) * Bump vLLM and TRL * Fix Makefile --- Makefile | 9 +++++---- README.md | 42 +++++++++--------------------------------- setup.py | 2 +- 3 files changed, 15 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 3bf653fb0..112528a19 100644 --- a/Makefile +++ b/Makefile @@ -8,10 +8,11 @@ check_dirs := src tests # dev dependencies install: - uv venv openr1 --python 3.11 && . openr1/bin/activate && uv pip install --upgrade pip - uv pip install vllm==0.8.4 - uv pip install setuptools - uv pip install flash-attn --no-build-isolation + uv venv openr1 --python 3.11 + . openr1/bin/activate && uv pip install --upgrade pip && \ + uv pip install vllm==0.8.5.post1 && \ + uv pip install setuptools && \ + uv pip install flash-attn --no-build-isolation && \ GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]" style: diff --git a/README.md b/README.md index e35a58cc9..538a9248f 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ uv venv openr1 --python 3.11 && source openr1/bin/activate && uv pip install --u Next, install vLLM and FlashAttention: ```shell -uv pip install vllm==0.8.4 +uv pip install vllm==0.8.5.post1 uv pip install setuptools && uv pip install flash-attn --no-build-isolation ``` @@ -140,8 +140,9 @@ By default, these scripts will push each model to your Hugging Face Hub username # Change the base model to a smaller variant accelerate launch --config_file recipes/accelerate_configs/zero3.yaml src/open_r1/sft.py \ --config recipes/OpenR1-Distill-7B/sft/config_distill.yaml \ - --model_name_or_path Qwen/Qwen3-0.6-Base \ - --hub_model_id OpenR1-Distill-0.6B + --model_name_or_path Qwen/Qwen3-0.6B-Base \ + --hub_model_id OpenR1-Distill-0.6B \ + --output_dir data/OpenR1-Distill-0.6B ``` If you also wish to override the Weights and Biases default settings, you can do so as follows: @@ -214,43 +215,18 @@ You can adjust the YAML config to train on a different base model or dataset. ### GRPO -We use TRL's [vLLM backend](https://huggingface.co/docs/trl/speeding_up_training?vllm+examples=GRPO#vllm-for-fast-generation-in-online-methods) to scale training to large models across multiple nodes. For single-node training of smol models across 8 GPUs, first spin up the vLLM server to run on e.g. 
1 GPU as follows:
+We use TRL's [vLLM backend](https://huggingface.co/docs/trl/speeding_up_training?vllm+examples=GRPO#vllm-for-fast-generation-in-online-methods) to scale training to large models across multiple nodes. For single-node training of smol models across 8 GPUs, use `vllm_mode="colocate"` to run vLLM in the same process as the training script:
 
 ```shell
-CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
-```
-
-Once the server is up, run training on the remaining GPUs as follows:
-
-```shell
-CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \
-    accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes 7 \
-    src/open_r1/grpo.py --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml
+ACCELERATE_LOG_LEVEL=info \
+    accelerate launch --config_file recipes/accelerate_configs/zero3.yaml \
+    src/open_r1/grpo.py --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml \
+    --vllm_mode colocate
 ```
 
 > [!WARNING]
 > The chat template used in the distilled DeepSeek models omits the contents of the reasoning block within the `<think>` and `</think>` tags. It also prefills the assistant response with `<think>` which interferes with the format reward function. To handle that, it is important to override the chat template as done in e.g. [recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml](./recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml).
 
-To increase the throughput with data parallel on e.g. 2 GPUs, run:
-
-```shell
-CUDA_VISIBLE_DEVICES=0,1 trl vllm-serve --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --data_parallel_size 2
-```
-
-Then run training on the remaining GPUs as follows:
-
-```shell
-CUDA_VISIBLE_DEVICES=2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \
-    accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes 6 \
-    src/open_r1/grpo.py --config recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_demo.yaml
-```
-
-For larger models, use tensor parallelism:
-
-```shell
-CUDA_VISIBLE_DEVICES=0,1 trl vllm-serve --model deepseek-ai/DeepSeek-R1-Distill-Qwen-14B --tensor_parallel_size 2
-```
-
 For multi-node training on N+1 nodes, with 1 node running the vLLM server and N nodes running training, we provide an example Slurm script. 
For example, to run the above example on 1+1 nodes with data parallelism, run: ```shell diff --git a/setup.py b/setup.py index 27ac9eed6..a88508b94 100644 --- a/setup.py +++ b/setup.py @@ -69,7 +69,7 @@ "sentencepiece>=0.1.99", "torch==2.6.0", "transformers==4.52.3", - "trl[vllm] @ git+https://github.com/huggingface/trl.git@9ac614fb081e17805f7f62ab3f5f7036bdefe7b0", # Support for activation offload: https://github.com/huggingface/trl/pull/2954 + "trl[vllm]==0.18.0", "wandb>=0.19.1", "async-lru>=2.0.5", "aiofiles>=24.1.0", From 7e700c6218e8105af730785f070f4aa6506b1f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Mon, 7 Jul 2025 10:23:08 -0700 Subject: [PATCH 136/137] Update citation (#688) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 538a9248f..4cc00f79d 100644 --- a/README.md +++ b/README.md @@ -798,7 +798,7 @@ If you find this project is useful in your own work, please consider citing as f @misc{openr1, title = {Open R1: A fully open reproduction of DeepSeek-R1}, url = {https://github.com/huggingface/open-r1}, - author = {Hugging Face}, + author = {{Hugging Face}}, month = {January}, year = {2025} } From 0e06249d1caa3c1d27a93e590929531291c9c493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Thu, 17 Jul 2025 13:20:00 -0700 Subject: [PATCH 137/137] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4cc00f79d..adc049bfd 100644 --- a/README.md +++ b/README.md @@ -299,6 +299,7 @@ Make sure your dataset contains a `verification_info` column with the following } ], } +``` For example, to train a smol model on Python problems, start the vLLM server:
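Before the server and training job can make use of code rewards, the training dataset needs the `verification_info` column described in the README. The snippet below is a minimal, illustrative sketch of one way to build such a column with the `datasets` library; the sample problem, the test-case fields (`type`, `input`, `output`), the `language` key, and the `add_verification_info` helper are assumptions for demonstration only and should be checked against the full schema documented in the README.

```python
# Illustrative sketch only: field names and sample data below are assumptions,
# not the canonical schema -- check them against the README before use.
import json

from datasets import Dataset

# A hypothetical code problem with stdin/stdout test cases.
samples = [
    {
        "problem": "Read an integer n from stdin and print n * 2.",
        "test_cases": [
            {"type": "stdin_stdout", "input": "3\n", "output": "6\n"},
            {"type": "stdin_stdout", "input": "10\n", "output": "20\n"},
        ],
    }
]


def add_verification_info(example):
    # Serialize the verification payload as a JSON string, one entry per row.
    example["verification_info"] = json.dumps(
        {"language": "python", "test_cases": example["test_cases"]}
    )
    return example


dataset = Dataset.from_list(samples).map(add_verification_info)
print(dataset[0]["verification_info"])
```

Keeping the payload as a JSON string leaves `verification_info` a plain string feature, which is straightforward to pass through to a verification reward function.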