Skip to content

Extract code blocks only after Code marker #1223

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/smolagents/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def parse_json_blob(json_blob: str) -> Tuple[Dict[str, str], str]:


def parse_code_blobs(text: str) -> str:
"""Extract code blocs from the LLM's output.
"""Extract code blocks from the LLM's output.

If a valid code block is passed, it returns it directly.

Expand All @@ -187,6 +187,7 @@ def parse_code_blobs(text: str) -> str:
ValueError: If no valid code block is found in the text.
"""
pattern = r"```(?:py|python)?\s*\n(.*?)\n```"
text = text.split("Code:")[-1]
Copy link
Member Author

@albertvillanova albertvillanova Apr 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change assumes that only one "Code:" marker appears in the model output.

@aymeric-roucher do you think this is a sensible assumption? Alternative assumptions?

matches = re.findall(pattern, text, re.DOTALL)
if matches:
return "\n\n".join(match.strip() for match in matches)
Expand Down
125 changes: 96 additions & 29 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import inspect
import os
import textwrap
import unittest

import pytest
from IPython.core.interactiveshell import InteractiveShell
Expand Down Expand Up @@ -93,34 +92,102 @@ def valid_tool_function(input: str) -> str:
'''


class AgentTextTests(unittest.TestCase):
    """Checks ``parse_code_blobs`` on single and multiple code blobs."""

    def test_parse_code_blobs(self):
        # Text that contains no code blob at all is rejected.
        with pytest.raises(ValueError):
            parse_code_blobs("Wrong blob!")

        # A fenced block embedded in markdown is extracted without its fence.
        markdown_text = """
Here is how to solve the problem:
Code:
```py
import numpy as np
```<end_code>
"""
        assert parse_code_blobs(markdown_text) == "import numpy as np"

        # Bare code (no fence) comes back unchanged.
        bare_code = "import numpy as np"
        assert parse_code_blobs(bare_code) == bare_code

        # Whitespace between the fence header and the newline is tolerated.
        assert parse_code_blobs("```py \ncode_a\n````") == "code_a"

    def test_multiple_code_blobs(self):
        # Every fenced block is returned, joined with a blank line in between.
        fenced_blocks = "```\nFoo\n```\n\n```py\ncode_a\n````\n\n```python\ncode_b\n```"
        assert parse_code_blobs(fenced_blocks) == "Foo\n\ncode_a\n\ncode_b"
class TestParseCodeBlobs:
    """Parametrized checks for ``parse_code_blobs``.

    Each case is a ``(input_text, expected_output, should_raise)`` tuple.
    When ``should_raise`` is true the call must raise ``ValueError`` and
    ``expected_output`` is not inspected.
    """

    @pytest.mark.parametrize(
        "input_text, expected_output, should_raise",
        [
            # ---- Valid cases ----
            # Fenced block following a "Code:" marker
            (
                textwrap.dedent(
                    """\
                    Here is how to solve the problem:
                    Code:
                    ```py
                    import numpy as np
                    ```<end_code>"""
                ),
                "import numpy as np",
                False,
            ),
            # Bare code without "Code:" marker or code block
            ("import numpy as np", "import numpy as np", False),
            # Code blocks under both "Thought:" and "Code:": only the one
            # after "Code:" should be matched
            (
                textwrap.dedent(
                    """\
                    Thought:
                    ```
                    # this code should not be matched
                    invalid code
                    ```

                    Code:
                    ```
                    # this code should be matched
                    print("valid code")
                    ```"""
                ),
                '# this code should be matched\nprint("valid code")',
                False,
            ),
            # A code block placed before the "Code:" marker is ignored
            (
                textwrap.dedent(
                    """\
                    ```
                    code before marker
                    ```
                    Code:
                    ```
                    code after marker
                    ```"""
                ),
                "code after marker",
                False,
            ),
            # Whitespace after the fence header. Written as plain literals:
            # escaped newlines inside an indented triple-quoted string yield
            # unindented lines, which would turn textwrap.dedent into a no-op.
            ("Code:\n```py \ncode_a\n```\n", "code_a", False),
            ("```py \ncode_a\n````", "code_a", False),
            # Multiple code blocks after "Code:" are joined by a blank line;
            # indentation inside each block is preserved
            (
                textwrap.dedent(
                    """\
                    Code:
                    ```py
                    def func1():
                        pass
                    ```

                    ```python
                    def func2():
                        pass
                    ```
                    """
                ),
                "def func1():\n    pass\n\ndef func2():\n    pass",
                False,
            ),
            # Multiple code blocks without any "Code:" marker
            ("```\nFoo\n```\n\n```py\ncode_a\n````\n\n```python\ncode_b\n```", "Foo\n\ncode_a\n\ncode_b", False),
            # ---- Invalid cases ----
            ("Wrong blob!", None, True),  # No code blob
        ],
    )
    def test_parse_code_blobs(self, input_text, expected_output, should_raise):
        if should_raise:
            # Invalid input must be reported as ValueError, not returned.
            with pytest.raises(ValueError):
                parse_code_blobs(input_text)
        else:
            assert parse_code_blobs(input_text) == expected_output


@pytest.fixture(scope="function")
Expand Down