
configurable actor, environment, data loader #30


Merged
37 commits merged on May 29, 2025
Commits
5286210
debug mode
ollmer May 8, 2025
b7f2d91
remove mount usage
ollmer May 9, 2025
7baccf0
adjust for new sample lengths
ollmer May 12, 2025
f93ecf5
error message
ollmer May 12, 2025
fd23d60
debug config
ollmer May 12, 2025
d6e7dd3
pyproject toml to build package
ollmer May 12, 2025
5def616
move deps to pyproject
ollmer May 12, 2025
3542f51
get size of the stream
ollmer May 13, 2025
e3c9309
log queue and stream sizes to wandb
ollmer May 13, 2025
247c74f
little more logging of tuning steps
ollmer May 13, 2025
0fb9c16
save launcher commands to respective stage folders
ollmer May 13, 2025
547e6ca
auto names for debug runs, limit seq lengths
ollmer May 13, 2025
3e04427
tapeagents rollout generator, non async
ollmer May 13, 2025
a8f81c4
Merge branch 'main' into oleh_exps
ollmer May 14, 2025
8aef881
Merge branch 'shared_memory_array_rizar' into oleh_exps
ollmer May 14, 2025
bd01bc7
spend less time on logging
ollmer May 14, 2025
2db0f31
configurable rollouts actor
ollmer May 14, 2025
4b5bac9
more straightforward way to set rollout function
ollmer May 14, 2025
7844e30
fix
ollmer May 15, 2025
6de3ecc
revert seq length
ollmer May 15, 2025
7ece4d5
Merge branch 'main' into configurable_rollouts
rizar May 27, 2025
21d3072
move all math to math folder
rizar May 27, 2025
6dc51b7
configurable environment endpoint
rizar May 28, 2025
8a74fa8
many environments hell yeah
rizar May 28, 2025
7271420
better log folder for the environment
rizar May 28, 2025
a4d655f
fix sort of typo
rizar May 28, 2025
a3a1605
improve environment logging
rizar May 29, 2025
cd4d610
fix a couple bugs
rizar May 29, 2025
644aef8
less logging
rizar May 29, 2025
2d744f3
colocate actor and preprocessor
rizar May 29, 2025
2223063
update deps
rizar May 29, 2025
79a6923
delete old code
rizar May 29, 2025
488a7eb
counting examples
rizar May 29, 2025
d267f0a
cleanup
rizar May 29, 2025
d0f9619
move math dataset load to math folder
rizar May 29, 2025
8dd4abd
these changes don't seem to be needed
rizar May 29, 2025
79ed98a
fix typo
rizar May 29, 2025
4 changes: 2 additions & 2 deletions README.md
@@ -34,8 +34,8 @@ cd pipelinerl
 Create the environments with dependencies.
 ```bash
 conda create -n pipeline-rl -y python=3.11
-conda run --no-capture-output -n pipeline-rl pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu121
-conda run --no-capture-output -n pipeline-rl pip install -r requirements.txt --no-build-isolation
+conda run --no-capture-output -n pipeline-rl pip install torch==2.6.0
+conda run --no-capture-output -n pipeline-rl pip install -e . --no-build-isolation
 ```

By default Pipeline-RL will use the file system as the medium for streaming the generated data to the trainer processes. This works on one node, but the files can get quite large. To use Redis instead you will need to install the Redis server in the same conda environment:
109 changes: 109 additions & 0 deletions conf/actor/web.yaml
@@ -0,0 +1,109 @@
log_each_n_secs: 10
llm_max_rollouts: 128
rollout_workers: 1
rollout_policy: pipelinerl.tapeagents_rollouts.generate_rollout

environment:
  _target_: tapeagents.mcp.MCPEnvironment
  config_path: conf/mcp/web.json

llm:
  _target_: tapeagents.llms.LiteLLM
  model_name: o4-mini-2025-04-16
  use_cache: true
  context_size: 200000
  parameters:
    temperature: 1
    max_completion_tokens: 16000

agent:
  _target_: tapeagents.agent.Agent
  name: web_agent
  llms:
    default: ${llm}
  templates:
    system_prompt: |
      You are an expert AI Agent trained to assist users with complex information processing tasks.
      Your role is to understand user queries and respond in a helpful and accurate manner.
      Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
      Do not express emotions or opinions about user questions.
    allowed_tools: |
      You have access to the following tools:
      {tools_description}
    thought_format: |
      Important! Respond with plain text; do not include any JSON or code.
      Do not output anything besides what I asked in this message.
    allowed_steps: |
      You have access to the following tools:
      {tools_description}
      You are allowed to produce ONLY steps with the following JSON schemas:
      {allowed_steps}
      Do not reproduce the schema when producing steps; use it as a reference.
    format: >
      Output only a single JSON dict or a single JSON list.
      DO NOT OUTPUT ANYTHING BESIDES THE JSON! DO NOT PLACE ANY COMMENTS INSIDE THE JSON.
      It will break the system that processes the output.

nodes:
  - _target_: tapeagents.nodes.StandardNode
    name: plan
    system_prompt: ${agent.templates.system_prompt}
    guidance: |
      Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task.
      Be specific about how each step should be performed. Only describe the intended actions here; do not perform them yet.
      Consider that next steps may depend on the results of previous steps, so include conditional branching using "if" statements where needed.
      Start with the title "Plan". Every step should have a short name and description.
      ${agent.templates.thought_format}
    steps_prompt: ${agent.templates.allowed_tools}

  - _target_: tapeagents.nodes.StandardNode
    name: reflect
    system_prompt: ${agent.templates.system_prompt}
    guidance: |
      Observe the current state of the task and produce the reflection text strictly following these rules:
      1. Evaluate the action's success and explain its impact on the task and our plan.
      2. If the last action was not successful, describe the errors and the possible reasons for failure.
      3. List the next steps to accomplish the current plan step and propose a single next immediate action.
      4. When proposing webpage interactions:
         - Always accept cookies and close popups first before interacting.
         - If the last action was not successful, check whether the target element is visible, and use scrolling if it is not.
      5. Describe the expected effect of the proposed action.
      ${agent.templates.thought_format}
    steps_prompt: ${agent.templates.allowed_tools}

  - _target_: tapeagents.nodes.StandardNode
    name: act
    system_prompt: ${agent.templates.system_prompt}
    guidance: Then produce a single function call for the next step. If the answer is ready, call the FinalStep function.
    steps:
      - tapeagents.steps.ReasoningThought
      - tapeagents.core.FinalStep
    use_known_actions: true
    use_function_calls: true
    next_node: act

  - _target_: tapeagents.nodes.StandardNode
    name: summarize
    system_prompt: ${agent.templates.system_prompt}
    guidance: |
      Summarize the last observation. If it's an image, thoroughly describe it with all details.
      Describe the results of the last action and the observed changes. Discuss their impact on the task and our plan.
      Do not hallucinate or make up any information; only describe what you see in the observation.
      Do not guess or assume action effects; describe only visible changes.
      ${agent.templates.thought_format}
    steps_prompt: ${agent.templates.allowed_tools}
    next_node: reflect

split: validation
batch: 2
retry_unsolved: true

only_tasks: #[] # list of (level, task_num)
  - [1, 0]
  - [1, 1]
  - [1, 2]
  - [1, 3]
  - [1, 4]
  - [1, 5]
  - [1, 6]
  - [1, 7]
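The `rollout_policy` key above selects the rollout function by dotted import path rather than by hard-coded import. A minimal sketch of how such a path can be resolved into a callable; the helper name is hypothetical, and PipelineRL's actual loader may differ:

```python
import importlib


def resolve_dotted_path(path: str):
    """Resolve a dotted path like "pkg.module.attr" into the object it names.

    Illustrates how a config value such as
    `rollout_policy: pipelinerl.tapeagents_rollouts.generate_rollout`
    could be turned into a callable; not PipelineRL's actual code.
    """
    module_name, _, attr_name = path.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, attr_name)


# Demonstrate with a stdlib function instead of a pipelinerl module:
join = resolve_dotted_path("posixpath.join")
print(join("a", "b"))  # → a/b
```

The same mechanism would cover the new `dataset_loader` key, which is also specified as a dotted path in the diffs below.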
31 changes: 20 additions & 11 deletions conf/base.yaml
@@ -38,11 +38,16 @@ finetune:
  weight_update_interval: 1
  pop_old_data: ${..pop_old_data}
actor:
  log_each_n_secs: 0
  llm_max_rollouts: 128
  rollout_workers: 1
  verifier:
    host: localhost
    port: 7777
  rollout_policy: pipelinerl.math.rollouts.generate_math_rollout
  discount_factor: 1
  system_prompt: Please reason step by step, and put your final answer within \boxed{}.
  task_template: |-
    {task}
  environment:
    _target_: pipelinerl.math.verifier_api.MathEnvironment
preprocess:
  input: actor
  output: training_data
@@ -81,19 +86,18 @@ vllm_config:
  generation-config: vllm

world:
  actors: 1
  preprocessors: 1
  replicas: 1

  actor_fraction: 4
  preprocessor_fraction: 0
  finetune_fraction: 4

  actor_group_port: 9000
  env_replicas: 2

# changed
system_prompt: Please reason step by step, and put your final answer within \boxed{}.
task_template: |-
  {task}
actor_group_port: 9000
environment_start_port: 7777
# this will be autocreated based on the config
jobs: []

eval_every_n_versions: 78000

@@ -115,7 +119,8 @@ force_restart: false
pop_old_data: true
max_lag: null
attempts: 8
discount_factor: 1

dataset_loader: pipelinerl.math.load_datasets.load_datasets
train_dataset_names:
  - open_reasoner_zero_57k
  - open_reasoner_zero_extended_72k
@@ -130,6 +135,10 @@ debug:
  streams_from: null
  place_inference_workers: true

me:
  # Which job is this one? This will be autopopulated
  job_idx: null

hydra:
  run:
    dir: ${output_dir}
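The new `environment_start_port` and `env_replicas` settings suggest that each environment replica listens on its own consecutive port. A hypothetical sketch of deriving the replica endpoints; this is an assumption for illustration, not PipelineRL's actual code:

```python
def environment_endpoints(start_port: int, replicas: int, host: str = "localhost") -> list[str]:
    """Build one HTTP endpoint per environment replica.

    Assumes replicas listen on consecutive ports starting at
    `environment_start_port`; the real port-assignment logic may differ.
    """
    return [f"http://{host}:{start_port + i}" for i in range(replicas)]


# With the base.yaml values environment_start_port=7777 and env_replicas=2:
print(environment_endpoints(7777, 2))
# → ['http://localhost:7777', 'http://localhost:7778']
```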
19 changes: 19 additions & 0 deletions conf/counting.yaml
@@ -0,0 +1,19 @@
defaults:
  - base
finetune:
  seq_length: 4000
  gradient_accumulation_passes: 1024
llm:
  parameters:
    max_tokens: 1000
test_llm:
  parameters:
    max_tokens: 1000
actor:
  rollout_policy: pipelinerl.counting.counting.generate_counting_rollout
  environment: null
dataset_loader: pipelinerl.counting.counting.load_problems
train_dataset_names:
  - train_counting_problems
test_dataset_names:
  - test_counting_problems
27 changes: 27 additions & 0 deletions conf/debug.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
defaults:
  - base
  - override streams: redis
  - _self_

finetune:
  seq_length: 5000
  gradient_accumulation_passes: 1024

llm:
  parameters:
    max_tokens: 4096

test_llm:
  parameters:
    max_tokens: 4096

# debug:
#   mode: open_loop

output_dir: results/debug_4gpu_7b/${now:%Y_%m_%d}/${now:start_at_%H_%M_%S}

# model_path: Qwen/Qwen2.5-0.5B

# vllm_config:
#   vllm_kwargs:
#     enforce_eager: ""
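Configs like `conf/debug.yaml` and `conf/counting.yaml` compose on top of `base` via the Hydra `defaults` list, with the later config's values overriding the base. A simplified stand-in for that merge behavior, using plain dicts instead of Hydra/OmegaConf:

```python
def deep_merge(base: dict, override: dict) -> dict:
    """Recursively merge `override` into `base`, returning a new dict.

    A rough sketch of Hydra/OmegaConf-style config composition: nested
    dicts merge key by key, scalars in the override win.
    """
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged


# debug.yaml overrides only seq_length; the other finetune keys survive:
base = {"finetune": {"seq_length": 4000, "gradient_accumulation_passes": 1024}}
debug = {"finetune": {"seq_length": 5000}}
print(deep_merge(base, debug)["finetune"])
# → {'seq_length': 5000, 'gradient_accumulation_passes': 1024}
```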
23 changes: 23 additions & 0 deletions conf/mcp/web.json
@@ -0,0 +1,23 @@
{
  "mcpServers": {
    "serper-search": {
      "command": "uv",
      "args": ["run", "tapeagents/tools/mcp_servers/web_search.py"],
      "env": {"SERPER_API_KEY": ""}
    },
    "fetch": {
      "command": "uvx",
      "args": ["mcp-server-fetch"]
    },
    "python_exec": {
      "command": "npx",
      "args": ["-y", "@pydantic/mcp-run-python", "stdio"]
    }
  }
}
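A config like `conf/mcp/web.json` maps each MCP server name to a launch command. A small sketch of reading such a config and assembling the command lines (trimmed to one server for brevity; the helper is illustrative, not `tapeagents.mcp`'s actual parser):

```python
import json

# Trimmed copy of the conf/mcp/web.json shape:
MCP_CONFIG = """
{
  "mcpServers": {
    "fetch": {"command": "uvx", "args": ["mcp-server-fetch"]}
  }
}
"""


def server_commands(config_text: str) -> dict[str, list[str]]:
    """Return the full launch command line for each configured MCP server."""
    servers = json.loads(config_text)["mcpServers"]
    return {name: [spec["command"], *spec.get("args", [])] for name, spec in servers.items()}


print(server_commands(MCP_CONFIG))
# → {'fetch': ['uvx', 'mcp-server-fetch']}
```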
2 changes: 1 addition & 1 deletion pipelinerl/async_llm.py
@@ -1,7 +1,7 @@
 import logging
 import aiohttp

-from tapeagents.core import LLMCall, LLMOutput, Prompt, TokenLogprob
+from tapeagents.core import LLMCall, LLMOutput, Prompt
 from tapeagents.llms.trainable import TrainableLLM


105 changes: 0 additions & 105 deletions pipelinerl/cot_math_agent.py

This file was deleted.
