Evaluation
Testing tools

Testing tool calling and multi-step calls

One scenario that can be tricky to test is a task that takes several tool-calling steps to complete. This could be a chatbot that can access storage, search the internet, or compile JavaScript code.

In this example, we will test a calculator that computes the result of a complex mathematical expression step by step, using two tools: a simple calculator that applies a single operator per call, and a tool to indicate the final answer.

Defining available tools

The first thing we do is define the tools that are available to the LLM.

@pytest.fixture()
def tools():
    """Provide the tool schemas that are offered to the model.

    Returns a list with two function tools: ``calc``, which takes an
    ``operand`` string plus numeric ``arg1`` and ``arg2``, and
    ``final_answer``, which takes a single numeric ``answer``.
    """
    # Arguments accepted by the calculator; all three are listed as required.
    calc_properties = {
        "operand": {
            "type": "string",
            "description": "The operator to use in the calculation. For example '+', '-', '*', or '/' or more advanced once like 'sqrt' and 'log10'.",
        },
        "arg1": {
            "type": "number",
            "description": "The first argument to use in the calculation.",
        },
        "arg2": {
            "type": "number",
            "description": "The second argument to use in the calculation.",
        },
    }
    calc_tool = {
        "type": "function",
        "function": {
            "name": "calc",
            "description": "Calculate the result of a mathematical expression with one operand and two arguments.",
            "parameters": {
                "type": "object",
                "properties": calc_properties,
                "required": ["operand", "arg1", "arg2"],
                "additionalProperties": False,
            },
        },
    }

    # Single-argument tool the model calls to report its conclusion.
    final_answer_properties = {
        "answer": {
            "type": "number",
            "description": "The final answer to the calculation.",
        }
    }
    final_answer_tool = {
        "type": "function",
        "function": {
            "name": "final_answer",
            "description": "Indicate the final answer of a calculation.",
            "parameters": {
                "type": "object",
                "properties": final_answer_properties,
                "required": ["answer"],
                "additionalProperties": False,
            },
        },
    }

    return [calc_tool, final_answer_tool]

Calling the LLM with the natural language query

Next, we call the LLM with a natural language query.

def test_calculator(tools):
    """
    Tests multi step tool calling.

    Opens a conversation asking for the logarithm of 1000 multiplied by the
    square root of 2, and sends it to the model together with the tool
    schemas supplied by the ``tools`` fixture.
    """

    client = OpenAI()
    # Conversation history; later steps append the model's replies and the
    # tool results to this same list.
    messages = [
        {
            "role": "system",
            "content": "You help users with their math work. You have a tool to call to compute numbers, and a tool to indicate the final answer.",
        },
        {
            "role": "user",
            "content": "What is the logarithm of 1000, multiplied by square root of 2?",
        },
    ]

    #
    # This should produce the first calls to the calculator: log10(1000) and sqrt(2)
    #
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        tools=tools,
        messages=messages,
    )

The first call should return two tool calls that we need to process.

    # The first tool call is expected to be the calculator computing log10(1000).
    assert response.choices[0].message.tool_calls[0].function.name == "calc"
    # Tool arguments arrive as a JSON string, so decode them before inspecting.
    log10_arguments = json.loads(
        response.choices[0].message.tool_calls[0].function.arguments
    )
    assert log10_arguments["operand"] == "log10"
    assert log10_arguments["arg1"] == 1000
    # log10 needs only one input, but the calc schema lists arg2 as required;
    # the test expects the model to fill it with 0.
    assert log10_arguments["arg2"] == 0

    # The second tool call is expected to be the calculator computing sqrt(2).
    assert response.choices[0].message.tool_calls[1].function.name == "calc"
    sqrt_arguments = json.loads(
        response.choices[0].message.tool_calls[1].function.arguments
    )
    assert sqrt_arguments["operand"] == "sqrt"
    assert sqrt_arguments["arg1"] == 2
    assert sqrt_arguments["arg2"] == 0

    # Add the assistant message carrying the tool calls to the conversation history.
    messages.append(response.choices[0].message)

We then call back into the LLM with those two tool calls resolved:

    #
    # Resolve both tool calls locally: compute log10(1000) and sqrt(2) and
    # report each result under the id of the tool call that requested it.
    # The assistant message carrying the tool calls has already been appended
    # to `messages` at the end of the previous step, so it is not appended
    # again here (doing so would duplicate it in the conversation).
    #
    log10_value = math.log10(log10_arguments["arg1"])
    log10_result = {
        "role": "tool",
        "content": json.dumps(
            {
                "operand": log10_arguments["operand"],
                "arg1": log10_arguments["arg1"],
                "arg2": log10_arguments["arg2"],
                "calc": log10_value,
            }
        ),
        "tool_call_id": response.choices[0].message.tool_calls[0].id,
    }

    sqrt_value = math.sqrt(sqrt_arguments["arg1"])
    sqrt_result = {
        "role": "tool",
        "content": json.dumps(
            {
                "operand": sqrt_arguments["operand"],
                "arg1": sqrt_arguments["arg1"],
                "arg2": sqrt_arguments["arg2"],
                "calc": sqrt_value,
            }
        ),
        "tool_call_id": response.choices[0].message.tool_calls[1].id,
    }

    # Tool results go into the history in the same order as the tool calls.
    messages.extend([log10_result, sqrt_result])

    #
    # We should now expect to receive another step to calculate the final answer
    #
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        tools=tools,
        messages=messages,
    )

Then, we check that the final calculation was requested, resolve that tool call, and call the LLM again.

    # The model should now request the multiplication of the two results.
    assert response.choices[0].message.tool_calls[0].function.name == "calc"
    arguments = json.loads(response.choices[0].message.tool_calls[0].function.arguments)
    assert arguments["operand"] == "*"
    # Compare with a tolerance: these floats were sent to the model as JSON
    # and come back through it, so exact equality (notably for sqrt(2)) is
    # brittle.
    assert math.isclose(arguments["arg1"], log10_value, rel_tol=1e-6)
    assert math.isclose(arguments["arg2"], sqrt_value, rel_tol=1e-6)

    # Add the assistant message carrying the tool call to the conversation history.
    messages.append(response.choices[0].message)

    #
    # Resolve the multiplication tool call: log10(1000) * sqrt(2)
    #
    final_value = log10_value * sqrt_value
    final_result = {
        "role": "tool",
        "content": json.dumps(
            {
                "operand": arguments["operand"],
                "arg1": arguments["arg1"],
                "arg2": arguments["arg2"],
                "calc": final_value,
            }
        ),
        "tool_call_id": response.choices[0].message.tool_calls[0].id,
    }

    messages.append(final_result)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        tools=tools,
        messages=messages,
    )

The final step should now be to conclude, and call our final answer tool.

    # With every calculation resolved, the model should report its conclusion
    # through the final_answer tool.
    assert response.choices[0].message.tool_calls[0].function.name == "final_answer"
    final_arguments = json.loads(
        response.choices[0].message.tool_calls[0].function.arguments
    )
    # Compare with a tolerance: the product was sent to the model as JSON and
    # comes back through it, so exact float equality is brittle.
    assert math.isclose(final_arguments["answer"], final_value, rel_tol=1e-6)

Putting it all together

from log10.load import OpenAI
 
import pytest
import json
import math
 
 
@pytest.fixture()
def tools():
    """Provide the tool schemas that are offered to the model.

    Returns a list with two function tools: ``calc``, which takes an
    ``operand`` string plus numeric ``arg1`` and ``arg2``, and
    ``final_answer``, which takes a single numeric ``answer``.
    """
    # Arguments accepted by the calculator; all three are listed as required.
    calc_properties = {
        "operand": {
            "type": "string",
            "description": "The operator to use in the calculation. For example '+', '-', '*', or '/' or more advanced once like 'sqrt' and 'log10'.",
        },
        "arg1": {
            "type": "number",
            "description": "The first argument to use in the calculation.",
        },
        "arg2": {
            "type": "number",
            "description": "The second argument to use in the calculation.",
        },
    }
    calc_tool = {
        "type": "function",
        "function": {
            "name": "calc",
            "description": "Calculate the result of a mathematical expression with one operand and two arguments.",
            "parameters": {
                "type": "object",
                "properties": calc_properties,
                "required": ["operand", "arg1", "arg2"],
                "additionalProperties": False,
            },
        },
    }

    # Single-argument tool the model calls to report its conclusion.
    final_answer_properties = {
        "answer": {
            "type": "number",
            "description": "The final answer to the calculation.",
        }
    }
    final_answer_tool = {
        "type": "function",
        "function": {
            "name": "final_answer",
            "description": "Indicate the final answer of a calculation.",
            "parameters": {
                "type": "object",
                "properties": final_answer_properties,
                "required": ["answer"],
                "additionalProperties": False,
            },
        },
    }

    return [calc_tool, final_answer_tool]
 
 
def test_calculator(tools):
    """
    Tests multi step tool calling.

    Asks the model for the logarithm of 1000 multiplied by the square root
    of 2 and walks through the expected conversation:

    1. the model requests ``calc`` twice (log10(1000) and sqrt(2));
    2. both results are sent back and the model requests the multiplication;
    3. that result is sent back and the model calls ``final_answer``.

    Each step calls ``client.chat.completions.create`` with the tool schemas
    from the ``tools`` fixture. Float values returned by the model are
    compared with a tolerance rather than exact equality.
    """

    client = OpenAI()
    # Conversation history; the model's replies and the tool results are
    # appended to this same list as the test proceeds.
    messages = [
        {
            "role": "system",
            "content": "You help users with their math work. You have a tool to call to compute numbers, and a tool to indicate the final answer.",
        },
        {
            "role": "user",
            "content": "What is the logarithm of 1000, multiplied by square root of 2?",
        },
    ]

    #
    # This should produce the first calls to the calculator: log10(1000) and sqrt(2)
    #
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        tools=tools,
        messages=messages,
    )

    # The first tool call is expected to be the calculator computing log10(1000).
    assert response.choices[0].message.tool_calls[0].function.name == "calc"
    # Tool arguments arrive as a JSON string, so decode them before inspecting.
    log10_arguments = json.loads(
        response.choices[0].message.tool_calls[0].function.arguments
    )
    assert log10_arguments["operand"] == "log10"
    assert log10_arguments["arg1"] == 1000
    # log10 needs only one input, but the calc schema lists arg2 as required;
    # the test expects the model to fill it with 0.
    assert log10_arguments["arg2"] == 0

    # The second tool call is expected to be the calculator computing sqrt(2).
    assert response.choices[0].message.tool_calls[1].function.name == "calc"
    sqrt_arguments = json.loads(
        response.choices[0].message.tool_calls[1].function.arguments
    )
    assert sqrt_arguments["operand"] == "sqrt"
    assert sqrt_arguments["arg1"] == 2
    assert sqrt_arguments["arg2"] == 0

    # Add the assistant message carrying the tool calls to the conversation history.
    messages.append(response.choices[0].message)

    #
    # Resolve both tool calls locally: compute log10(1000) and sqrt(2) and
    # report each result under the id of the tool call that requested it.
    #
    log10_value = math.log10(log10_arguments["arg1"])
    log10_result = {
        "role": "tool",
        "content": json.dumps(
            {
                "operand": log10_arguments["operand"],
                "arg1": log10_arguments["arg1"],
                "arg2": log10_arguments["arg2"],
                "calc": log10_value,
            }
        ),
        "tool_call_id": response.choices[0].message.tool_calls[0].id,
    }

    sqrt_value = math.sqrt(sqrt_arguments["arg1"])
    sqrt_result = {
        "role": "tool",
        "content": json.dumps(
            {
                "operand": sqrt_arguments["operand"],
                "arg1": sqrt_arguments["arg1"],
                "arg2": sqrt_arguments["arg2"],
                "calc": sqrt_value,
            }
        ),
        "tool_call_id": response.choices[0].message.tool_calls[1].id,
    }

    # Tool results go into the history in the same order as the tool calls.
    messages.extend([log10_result, sqrt_result])

    #
    # We should now expect to receive another step to calculate the final answer
    #
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        tools=tools,
        messages=messages,
    )

    # The model should now request the multiplication of the two results.
    assert response.choices[0].message.tool_calls[0].function.name == "calc"
    arguments = json.loads(response.choices[0].message.tool_calls[0].function.arguments)
    assert arguments["operand"] == "*"
    # Compare with a tolerance: these floats were sent to the model as JSON
    # and come back through it, so exact equality (notably for sqrt(2)) is
    # brittle.
    assert math.isclose(arguments["arg1"], log10_value, rel_tol=1e-6)
    assert math.isclose(arguments["arg2"], sqrt_value, rel_tol=1e-6)

    messages.append(response.choices[0].message)

    #
    # Resolve the multiplication tool call: log10(1000) * sqrt(2)
    #
    final_value = log10_value * sqrt_value
    final_result = {
        "role": "tool",
        "content": json.dumps(
            {
                "operand": arguments["operand"],
                "arg1": arguments["arg1"],
                "arg2": arguments["arg2"],
                "calc": final_value,
            }
        ),
        "tool_call_id": response.choices[0].message.tool_calls[0].id,
    }

    messages.append(final_result)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        tools=tools,
        messages=messages,
    )

    # With every calculation resolved, the model should report its conclusion
    # through the final_answer tool.
    assert response.choices[0].message.tool_calls[0].function.name == "final_answer"
    final_arguments = json.loads(
        response.choices[0].message.tool_calls[0].function.arguments
    )
    assert math.isclose(final_arguments["answer"], final_value, rel_tol=1e-6)