{ "cells": [ { "cell_type": "markdown", "id": "60bb467d-861d-4b07-a48d-8e5aa177c969", "metadata": { "tags": [] }, "source": [ "# Running Locally\n", "\n", "The LangChain benchmarks package is best used with LangSmith. You can create a free account [here](https://smith.langchain.com/) and read the [docs here](https://docs.smith.langchain.com/).\n", "\n", "\n", "If you are unable to create an account, you can still run these benchmarks locally without one.\n", "\n", "Below is an example." ] }, { "cell_type": "code", "execution_count": 1, "id": "a00a1a5f-43ef-4445-a792-8bf6a5f74643", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Prove that we can run without LangSmith\n", "import os\n", "\n", "_ = [\n", " os.environ.pop(key)\n", " for key in list(os.environ.keys())\n", " if key.startswith(\"LANGCHAIN_\")\n", "]" ] }, { "cell_type": "code", "execution_count": 2, "id": "b39159d0-9ea1-414f-a9d8-4a7b22b3d2cc", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Name Multiverse Math
Type ToolUsageTask
Dataset ID 594f9f60-30a0-49bf-b075-f44beabf546a
DescriptionAn environment that contains a few basic math operations, but with altered results.\n", "\n", "For example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\n", "\n", "The objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.
" ], "text/plain": [ "ToolUsageTask(name='Multiverse Math', dataset_id='https://smith.langchain.com/public/594f9f60-30a0-49bf-b075-f44beabf546a/d', description='An environment that contains a few basic math operations, but with altered results.\\n\\nFor example, multiplication of 5*3 will be re-interpreted as 5*3*1.1. The basic operations retain some basic properties, such as commutativity, associativity, and distributivity; however, the results are different than expected.\\n\\nThe objective of this task is to evaluate the ability to use the provided tools to solve simple math questions and ignore any innate knowledge about math.\\n', create_environment=, instructions='You are requested to solve math questions in an alternate mathematical universe. The operations have been altered to yield different results than expected. Do not guess the answer or rely on your innate knowledge of math. Use the provided tools to answer the question. While associativity and commutativity apply, distributivity does not. Answer the question using the fewest possible tools. Only include the numeric response without any clarifications.', eval_params={'output_evaluation': 'qa_math'})" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from langchain_benchmarks import registry\n", "\n", "task = registry[\"Multiverse Math\"]\n", "task" ] }, { "cell_type": "markdown", "id": "3821e4b0-8e67-418a-840c-470fcde42df0", "metadata": {}, "source": [ "## Eval\n", "\n", "Let's evaluate an agent now. Nothing will be saved to LangSmith, so be sure to save the test results to your file system if you want to use them later." 
] }, { "cell_type": "code", "execution_count": 3, "id": "fb32763c-79ab-426a-8fc6-bf8ebb0dd432", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bb6a27e067fa4887beaa78a28d8d431d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Running Evaluation: 0%| | 0/10 [00:00Experiment Results:" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
inputs.questionoutputs.inputoutputs.outputoutputs.intermediate_stepsfeedback.Intermediate steps correctnessfeedback.# steps / # expected stepsfeedback.correctnesserrorexecution_time
count1010101010.010.010.0010.000000
unique101011NaNNaNNaN0NaN
topmultiply the result of (log of 100 to base 10)...multiply the result of (log of 100 to base 10)...[]NaNNaNNaNNaNNaN
freq111010NaNNaNNaNNaNNaN
meanNaNNaNNaNNaN0.00.00.0NaN1.453172
stdNaNNaNNaNNaN0.00.00.0NaN0.496547
minNaNNaNNaNNaN0.00.00.0NaN0.763208
25%NaNNaNNaNNaN0.00.00.0NaN0.963885
50%NaNNaNNaNNaN0.00.00.0NaN1.593439
75%NaNNaNNaNNaN0.00.00.0NaN1.870549
maxNaNNaNNaNNaN0.00.00.0NaN1.957470
\n", "
" ], "text/plain": [ " inputs.question \\\n", "count 10 \n", "unique 10 \n", "top multiply the result of (log of 100 to base 10)... \n", "freq 1 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n", "\n", " outputs.input outputs.output \\\n", "count 10 10 \n", "unique 10 1 \n", "top multiply the result of (log of 100 to base 10)... \n", "freq 1 10 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " outputs.intermediate_steps feedback.Intermediate steps correctness \\\n", "count 10 10.0 \n", "unique 1 NaN \n", "top [] NaN \n", "freq 10 NaN \n", "mean NaN 0.0 \n", "std NaN 0.0 \n", "min NaN 0.0 \n", "25% NaN 0.0 \n", "50% NaN 0.0 \n", "75% NaN 0.0 \n", "max NaN 0.0 \n", "\n", " feedback.# steps / # expected steps feedback.correctness error \\\n", "count 10.0 10.0 0 \n", "unique NaN NaN 0 \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", "mean 0.0 0.0 NaN \n", "std 0.0 0.0 NaN \n", "min 0.0 0.0 NaN \n", "25% 0.0 0.0 NaN \n", "50% 0.0 0.0 NaN \n", "75% 0.0 0.0 NaN \n", "max 0.0 0.0 NaN \n", "\n", " execution_time \n", "count 10.000000 \n", "unique NaN \n", "top NaN \n", "freq NaN \n", "mean 1.453172 \n", "std 0.496547 \n", "min 0.763208 \n", "25% 0.963885 \n", "50% 1.593439 \n", "75% 1.870549 \n", "max 1.957470 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import uuid\n", "\n", "from langchain_benchmarks.tool_usage import agents, get_eval_config\n", "from langchain_benchmarks.utils import run_without_langsmith\n", "\n", "experiment_uuid = uuid.uuid4().hex[:4]\n", "\n", "\n", "models = [\"gpt-3.5-turbo-1106\"]\n", "\n", "for model in models:\n", " print()\n", " eval_config = get_eval_config(output_evaluation=\"qa_math\")\n", " agent_factory = agents.OpenAIAgentFactory(task, model=model)\n", " test_run = run_without_langsmith(\n", " # This will clone the dataset locally if not already there\n", " 
path_or_token_id=task.dataset_id,\n", " llm_or_chain_factory=agent_factory,\n", " evaluation=eval_config,\n", " verbose=True,\n", " )" ] }, { "cell_type": "code", "execution_count": 4, "id": "da3015b0-61b2-4748-ab0f-a0239bb74d58", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
inputs.questionoutputs.inputoutputs.outputoutputs.intermediate_stepsfeedback.Intermediate steps correctnessfeedback.# steps / # expected stepsfeedback.correctnesserrorexecution_time
27c44572-6c67-4129-a95a-fe1509c350bemultiply the result of (log of 100 to base 10)...multiply the result of (log of 100 to base 10)...[]00.00None0.763208
2a20a13d-050e-4a16-84ff-22d9582f1449after calculating the sin of 1.5 radians, divi...after calculating the sin of 1.5 radians, divi...[]00.00None1.413695
67867526-791a-452f-b534-ef2c1f5efd20ecoli divides every 20 minutes. How many cells...ecoli divides every 20 minutes. How many cells...[]00.00None1.773183
4ac33c1a-62f0-4da4-9455-07b582f6ff52calculate 101 to the power of 0.5 to 4 digits ...calculate 101 to the power of 0.5 to 4 digits ...[]00.00None1.819677
2e82a924-8382-425e-8738-daa2d912e9feconvert 15 degrees to radiansconvert 15 degrees to radians[]00.00None1.957470
\n", "
" ], "text/plain": [ " inputs.question \\\n", "27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n", "2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n", "67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n", "4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n", "2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n", "\n", " outputs.input \\\n", "27c44572-6c67-4129-a95a-fe1509c350be multiply the result of (log of 100 to base 10)... \n", "2a20a13d-050e-4a16-84ff-22d9582f1449 after calculating the sin of 1.5 radians, divi... \n", "67867526-791a-452f-b534-ef2c1f5efd20 ecoli divides every 20 minutes. How many cells... \n", "4ac33c1a-62f0-4da4-9455-07b582f6ff52 calculate 101 to the power of 0.5 to 4 digits ... \n", "2e82a924-8382-425e-8738-daa2d912e9fe convert 15 degrees to radians \n", "\n", " outputs.output \\\n", "27c44572-6c67-4129-a95a-fe1509c350be \n", "2a20a13d-050e-4a16-84ff-22d9582f1449 \n", "67867526-791a-452f-b534-ef2c1f5efd20 \n", "4ac33c1a-62f0-4da4-9455-07b582f6ff52 \n", "2e82a924-8382-425e-8738-daa2d912e9fe \n", "\n", " outputs.intermediate_steps \\\n", "27c44572-6c67-4129-a95a-fe1509c350be [] \n", "2a20a13d-050e-4a16-84ff-22d9582f1449 [] \n", "67867526-791a-452f-b534-ef2c1f5efd20 [] \n", "4ac33c1a-62f0-4da4-9455-07b582f6ff52 [] \n", "2e82a924-8382-425e-8738-daa2d912e9fe [] \n", "\n", " feedback.Intermediate steps correctness \\\n", "27c44572-6c67-4129-a95a-fe1509c350be 0 \n", "2a20a13d-050e-4a16-84ff-22d9582f1449 0 \n", "67867526-791a-452f-b534-ef2c1f5efd20 0 \n", "4ac33c1a-62f0-4da4-9455-07b582f6ff52 0 \n", "2e82a924-8382-425e-8738-daa2d912e9fe 0 \n", "\n", " feedback.# steps / # expected steps \\\n", "27c44572-6c67-4129-a95a-fe1509c350be 0.0 \n", "2a20a13d-050e-4a16-84ff-22d9582f1449 0.0 \n", "67867526-791a-452f-b534-ef2c1f5efd20 0.0 \n", "4ac33c1a-62f0-4da4-9455-07b582f6ff52 0.0 
\n", "2e82a924-8382-425e-8738-daa2d912e9fe 0.0 \n", "\n", " feedback.correctness error \\\n", "27c44572-6c67-4129-a95a-fe1509c350be 0 None \n", "2a20a13d-050e-4a16-84ff-22d9582f1449 0 None \n", "67867526-791a-452f-b534-ef2c1f5efd20 0 None \n", "4ac33c1a-62f0-4da4-9455-07b582f6ff52 0 None \n", "2e82a924-8382-425e-8738-daa2d912e9fe 0 None \n", "\n", " execution_time \n", "27c44572-6c67-4129-a95a-fe1509c350be 0.763208 \n", "2a20a13d-050e-4a16-84ff-22d9582f1449 1.413695 \n", "67867526-791a-452f-b534-ef2c1f5efd20 1.773183 \n", "4ac33c1a-62f0-4da4-9455-07b582f6ff52 1.819677 \n", "2e82a924-8382-425e-8738-daa2d912e9fe 1.957470 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# You can interact with the object directly or as a flattened dataframe\n", "df = test_run.to_dataframe()\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "1bf4ea77-147f-4687-a2c6-7528a6eba08d", "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"output.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 5 }