Few-Shot Experiments on Multiverse Math#

This notebook walks through how to run few-shot experiments on the multiverse math dataset, which you can find here.


First, let's set up our notebook by collecting the correct imports:

import datetime
import uuid

from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
from langchain_core.messages.utils import convert_to_messages
from langsmith.client import Client
from langsmith.evaluation import evaluate

from langchain_benchmarks import __version__, registry
from langchain_benchmarks.tool_usage.tasks.multiverse_math import *

# Define our LangSmith client so we have access to it throughout the rest of the notebook.
# It is used further down to list the few-shot examples, to list the experiment
# projects, and to fetch the run statistics that feed the results plot.
client = Client()

List tests#

Next, we can list the models that we will be running experiments over. Due to the new universal configurable model, this is easier than ever. All we need to do is save a list of model names and providers, like so:

# (model name, provider) pairs. The experiment loop unpacks each pair and passes
# both values to init_chat_model to build the chat model under test.
models = [
    ("gpt-3.5-turbo-0125", "openai"),
    ("gpt-4o-mini", "openai"),
]

Feel free to add/remove any models that you personally want to test.

IMPORTANT: Make sure you have installed the correct packages (e.g. langchain_openai if using "openai"-provided models) and set the correct environment variables (e.g. OPENAI_API_KEY if using "openai"-provided models). If you do not, the code will not work!

Defining Helper Functions#

Extracting few-shot examples from our dataset#

First, we need to define a function that takes the examples from the dataset multiverse-math-examples-for-few-shot and turns them into something we can use for our model. In our example we are going to compare how passing in all 9 few-shot examples differs from just passing in the first 3.

def get_few_shot_messages():
    """Load the few-shot dataset and reshape it for the different prompt styles.

    Reads every example of the ``multiverse-math-examples-for-few-shot``
    dataset through the module-level LangSmith ``client``.

    Returns:
        A 3-tuple ``(examples, few_shot_messages, few_shot_first_three_messages)``:

        * ``examples``: one dict per dataset example with the keys
          ``"question"`` (content of the human message) and ``"messages"``
          (that example's messages without the system prompt).
        * ``few_shot_messages``: the messages of all examples flattened into
          a single list, system prompts removed.
        * ``few_shot_first_three_messages``: the same, restricted to the first
          three examples.
    """
    few_shot_messages = []
    few_shot_first_three_messages = []
    examples = []
    dataset_examples = client.list_examples(
        dataset_name="multiverse-math-examples-for-few-shot"
    )
    for i, example in enumerate(dataset_examples):
        # NOTE(review): assumes each example stores its conversation under
        # outputs["output"] -- confirm against the dataset schema.
        converted_messages = convert_to_messages(example.outputs["output"])
        # The system prompt should only appear once at the beginning, so we
        # remove it from few-shot examples.
        without_system = [
            m for m in converted_messages if not isinstance(m, SystemMessage)
        ]
        examples.append(
            {
                # The message at index 1 is the human message asking the actual
                # math question (0th message is system prompt).
                "question": converted_messages[1].content,
                "messages": without_system,
            }
        )
        few_shot_messages += without_system
        if i < 3:
            few_shot_first_three_messages += without_system

    return examples, few_shot_messages, few_shot_first_three_messages

In this experiment we are also comparing passing in the few-shot examples as strings instead of passing them in as a list of messages, so we define a few more helper functions to allow us to do this:

def turn_messages_to_str(few_shot_messages):
    """Render a list of chat messages as one string of role-tagged turns.

    Every message becomes a turn delimited by ``<|im_start|>`` and
    ``<|im_end|>`` markers and followed by a newline:

    * A message whose ``content`` is a list is rendered as an assistant turn,
      one line per content part. Parts with a ``"name"`` key are written as
      ``Use tool <name>, input: <k>:<v>, ...``; all other parts contribute
      their ``"text"`` value.
    * ``HumanMessage`` becomes a ``user`` turn and ``ToolMessage`` a ``tool``
      turn; any other message is treated as an ``assistant`` turn.

    Args:
        few_shot_messages: Iterable of message objects exposing ``content``.

    Returns:
        The concatenated turns, or ``""`` when there are no messages.
    """
    turns = []
    for m in few_shot_messages:
        if isinstance(m.content, list):
            part_lines = []
            for tool_use in m.content:
                if "name" in tool_use:
                    arguments = ", ".join(
                        f"{k}:{v}" for k, v in tool_use["input"].items()
                    )
                    part_lines.append(
                        f"Use tool {tool_use['name']}, input: {arguments}"
                    )
                else:
                    # Plain text part of a mixed text / tool-call message.
                    part_lines.append(tool_use["text"])
            body = "".join(line + "\n" for line in part_lines)
            turn = "<|im_start|>assistant" + body + "\n<|im_end|>"
        elif isinstance(m, HumanMessage):
            turn = f"<|im_start|>user\n{m.content}\n<|im_end|>"
        elif isinstance(m, ToolMessage):
            turn = f"<|im_start|>tool\n{m.content}\n<|im_end|>"
        else:
            turn = f"<|im_start|>assistant\n{m.content}\n<|im_end|>"

        turns.append(turn + "\n")
    return "".join(turns)

def get_few_shot_str_from_messages(few_shot_messages, few_shot_first_three_messages):
    """Convert both few-shot message lists to their string form.

    Args:
        few_shot_messages: Messages of all few-shot examples.
        few_shot_first_three_messages: Messages of the first three examples.

    Returns:
        A pair of strings, in the same order as the arguments, each produced
        by ``turn_messages_to_str``.
    """
    all_examples_str, first_three_str = (
        turn_messages_to_str(message_list)
        for message_list in (few_shot_messages, few_shot_first_three_messages)
    )
    return all_examples_str, first_three_str

Lastly, we will define a function to return the different few-shot prompts we are going to use:

def get_prompts():
    """Return the few-shot prompt variants to benchmark.

    The experiment loop unpacks every entry of the returned list as
    ``(prompt, prompt_name)`` and passes ``prompt`` to
    ``create_tool_calling_agent``.
    """
    # NOTE(review): the entries of this list are missing from this copy of the
    # file, which also leaves the bracket unclosed. Restore the
    # (prompt, prompt_name) pairs from the original notebook before running.
    return [

Now we are ready to actually run our experiment!

Running Experiment#

To run our experiment, we can iterate over our models and prompts and use the evaluate function to send our results to LangSmith.

def predict_from_callable(callable, instructions):
    """Adapt an object with an ``invoke`` method into a one-argument predictor.

    Args:
        callable: Object exposing ``invoke(payload)``; in this notebook an
            ``AgentExecutor``.
        instructions: Value sent under the ``"instructions"`` key on every call.

    Returns:
        A function ``predict(run)`` that reads ``run["question"]``, calls
        ``callable.invoke`` with the question and the fixed instructions, and
        returns whatever ``invoke`` returns.
    """

    def predict(run):
        return callable.invoke(
            {"question": run["question"], "instructions": instructions}
        )

    return predict

# Short random id written into every experiment's metadata; the results section
# filters projects on it to pick out the runs of this session.
experiment_uuid = uuid.uuid4().hex[:4]
today = datetime.date.today().isoformat()
# NOTE(review): `task` is not defined in the visible part of this file --
# presumably the Multiverse Math task taken from the benchmark registry; confirm.
dataset_name = task.name
examples, few_shot_messages, few_shot_three_messages = get_few_shot_messages()
few_shot_str, few_shot_three_str = get_few_shot_str_from_messages(
    few_shot_messages, few_shot_three_messages
)
prompts = get_prompts()

# Run one LangSmith experiment per (model, prompt variant) combination.
for model_name, model_provider in models:
    model = init_chat_model(model_name, model_provider=model_provider, temperature=0)

    print(f"Benchmarking {task.name} with model: {model_name}")
    eval_config = task.get_eval_config()

    # NOTE(review): the last prompt variant is skipped by the [:-1] slice --
    # confirm this is intentional.
    for prompt, prompt_name in prompts[:-1]:
        # A fresh environment (and tool set) is created for every experiment.
        tools = task.create_environment().tools
        agent = create_tool_calling_agent(model, tools, prompt)
        agent_executor = AgentExecutor(
            agent=agent, tools=tools, return_intermediate_steps=True
        )

        evaluate(
            predict_from_callable(agent_executor, task.instructions),
            data=dataset_name,
            # NOTE(review): assumes the task's eval config exposes its
            # evaluators as `custom_evaluators` -- confirm.
            evaluators=eval_config.custom_evaluators,
            # The results section recovers the model and the few-shot type by
            # splitting project names on "-Multiverse" and "Math-", so the
            # prefix must keep this "<model>-<task>-<prompt name>" layout.
            experiment_prefix=f"{model_name}-{task.name}-{prompt_name}",
            metadata={
                "model": model_name,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
            },
        )

Visualize Results#

Now that we have run the evaluation, we can visualize our results.

# Keep only the experiment projects created by this session (matched on the
# random id stored in their metadata).
projects = [
    p
    for p in client.list_projects(reference_dataset_name="Multiverse Math")
    if p.metadata["id"] == experiment_uuid
]
# Project names look like "<model>-Multiverse Math-<few-shot type>-<suffix>":
# the text before "-Multiverse" is the model, and the text after "Math-" with
# its final "-<suffix>" removed is the few-shot type.
models = {p.name.split("-Multiverse")[0] for p in projects}
few_shot_type = {p.name.split("Math-")[1] for p in projects}
few_shot_type = {t[: t.rfind("-")] for t in few_shot_type}

# results_dic[model] holds one score per few-shot type, in the iteration order
# of `few_shot_type`.
results_dic = {}
for model in models:
    model_results = []
    for few_shot in few_shot_type:
        # First project whose name mentions both this model and this few-shot type.
        experiment = [
            p
            for p in projects
            if model + "-Multiverse" in p.name and few_shot in p.name
        ][0]
        experiment_stats = client.get_run_stats(
            project_ids=[experiment.id], is_root=True
        )
        # Average correctness, discounted by the share of runs that errored.
        # NOTE(review): the factor 100 assumes the plot's "Percent Correct"
        # axis expects percentages -- confirm.
        model_results.append(
            100
            * experiment_stats["feedback_stats"]["correctness"]["avg"]
            * (1 - experiment_stats["error_rate"])
        )
    results_dic[model] = model_results
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Display name for each few-shot type. The key order also decides the order in
# which the few-shot types are drawn within each model's group of bars.
legend_labels = {
    "no-few-shot": "zero-shot",
    "few-shot-messages": "few-shot-msgs, k=9",
    "few-shot-string": "few-shot-str, k=9",
    "few-shot-three-messages": "few-shot-msgs, k=3",
    "few-shot-three-strings": "few-shot-str, k=3",
}

# `models` is a set, so fix the column order explicitly to get the same plot on
# every run.
model_order = sorted(models)
results = np.array([results_dic[model] for model in model_order])

# Create a DataFrame for Seaborn plotting: one row per few-shot type, one
# column per model.
df = pd.DataFrame(results.T, columns=model_order)
# Each results_dic entry was filled while iterating `few_shot_type`, so
# iterating the unchanged set again lines the labels up with the rows.
df["Few Shot Type"] = list(few_shot_type)

# Order the rows by name rather than by hard-coded positions, so the plot works
# for any number of models and few-shot types. Types without a legend entry
# are kept and placed last.
df = df.set_index("Few Shot Type")
row_order = [name for name in legend_labels if name in df.index]
row_order += [name for name in df.index if name not in legend_labels]
df = df.loc[row_order].reset_index()

# Melt the DataFrame to long format for Seaborn's barplot
df_melted = df.melt(
    id_vars="Few Shot Type", var_name="Model", value_name="Percent Correct"
)

# Set up Seaborn parameters
plt.figure(figsize=(12, 6))

# Plot using Seaborn
sns.barplot(
    x="Model",
    y="Percent Correct",
    hue="Few Shot Type",
    data=df_melted,
)

# Add labels and title
plt.xlabel("Model")
plt.ylabel("Percent Correct")
plt.title("Multiverse Math Performance")

# Get the current handles and labels from the plot
handles, labels = plt.gca().get_legend_handles_labels()
labels = [legend_labels.get(label, label) for label in labels]

# Update the legend with the new labels
plt.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1))