Semi-structured eval: Chunk size tuning#

Semi-structured Reports is a public dataset that contains question-answer pairs from documents with text and tables.

The question-answer pairs are derived from the tables as well as some of the paragraphs in the docs.

We evaluate the performance of various chunk sizes with RAG.

Pre-requisites#

# %pip install -U langchain langsmith langchain_benchmarks
# %pip install --quiet chromadb openai pypdf tiktoken fireworks-ai
import getpass
import os

os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
env_vars = ["LANGCHAIN_API_KEY", "OPENAI_API_KEY", "FIREWORKS_API_KEY"]
for var in env_vars:
    if var not in os.environ:
        os.environ[var] = getpass.getpass(prompt=f"Enter your {var}: ")

Dataset#

Fetch the PDFs associated with the dataset from the remote cache so that we can perform ingestion.

import os

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Task
task = registry["Semi-structured Reports"]

# Files used
paths = list(get_file_names())
files = [str(p) for p in paths]
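
As a quick sanity check, we can list the files that were fetched; the exact paths depend on your local cache location.

# Confirm which PDFs were fetched
print(f"{len(files)} files:")
for f in files:
    print(f)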

Clone the dataset so that it’s available in our LangSmith datasets.

clone_public_dataset(task.dataset_id, dataset_name=task.name)
Dataset Semi-structured Reports already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/1fa8b1f4-fcb9-4072-9aa9-983e35ad61b8/datasets/6549a3a5-1cb9-463f-951d-0166cb9cf45c.

Load and index#

We load each file, split it, embed the chunks with OpenAIEmbeddings, and index them in a Chroma vectorstore.

from langchain.chat_models import ChatFireworks, ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


def load_and_split(file, token_count, split_document=True):
    """
    Load and optionally split PDF files.

    Args:
        file (str): File path.
        token_count (int): Token count for splitting (ignored if split_document is False).
        split_document (bool): Flag for splitting or returning pages.
    """

    loader = PyPDFLoader(file)
    pdf_pages = loader.load()

    if split_document:
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=token_count, chunk_overlap=50
        )

        docs = text_splitter.split_documents(pdf_pages)
        texts = [d.page_content for d in docs]
    else:
        texts = [d.page_content for d in pdf_pages]

    print(f"There are {len(texts)} text elements")
    return texts
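

# Optional sketch: compare how many chunks a single file yields at different
# chunk sizes (uncomment to run; files[0] is just an illustrative choice, and
# the exact counts depend on the specific PDF).
# for tok in [50, 100, 250]:
#     _ = load_and_split(files[0], tok, split_document=True)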


def load_files(files, token_count, split_document):
    """
    Load files.

    Args:
        files (list): List of file names.
        dir (str): Directory path.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting documents.
    """

    texts = []
    for fi in files:
        texts.extend(load_and_split(fi, token_count, split_document))
    return texts


def make_retriever(texts, expt):
    """
    Make vector store.

    Args:
        texts (list): List of texts.
        expt (str): Experiment name.
    """
    vectorstore = Chroma.from_texts(
        texts=texts, collection_name=expt, embedding=OpenAIEmbeddings()
    )
    retriever = vectorstore.as_retriever()
    return retriever
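

# Optional sketch: spot-check retrieval before building the full chain
# (uncomment to run; the query is illustrative and assumes an OpenAI key is set).
# sample_retriever = make_retriever(load_files(files, 250, True), "retriever-smoke-test")
# print(sample_retriever.get_relevant_documents("What was the total revenue?")[0].page_content[:300])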


def rag_chain(retriever, llm):
    """
    RAG chain.

    Args:
        retriever: The retriever to use.
        llm: The llm to use.
    """

    # Prompt template
    template = """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    # LLM
    if llm == "mixtral":
        model = ChatFireworks(
            model="accounts/fireworks/models/mixtral-8x7b-instruct", temperature=0
        )
    else:
        model = ChatOpenAI(temperature=0, model="gpt-4")

    # RAG pipeline
    chain = (
        {
            "context": retriever | (lambda x: "\n\n".join([i.page_content for i in x])),
            "question": RunnablePassthrough(),
        }
        | prompt
        | model
        | StrOutputParser()
    )
    return chain


# Experiment configurations: (chunk size in tokens, split flag, experiment name, llm)
experiments = [
    (None, False, "page_split-oai", "oai"),
    (50, True, "50_tok_split-oai", "oai"),
    (100, True, "100_tok_split-oai", "oai"),
    (250, True, "250_tok_split-oai", "oai"),
    (250, True, "250_tok_split-mixtral", "mixtral"),
]

# Run
stor_chain = {}
for token_count, split_document, expt, llm in experiments:
    texts = load_files(files, token_count, split_document)
    retriever = make_retriever(texts, expt)
    stor_chain[expt] = rag_chain(retriever, llm)
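
Before running the full evaluation, it can help to smoke-test one of the chains on a single question to confirm retrieval and generation work end to end. The question below is illustrative only; substitute one that matches your documents.

# Smoke-test one chain (illustrative question)
print(stor_chain["250_tok_split-oai"].invoke("What was the total revenue?"))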

Eval#

Run eval on our dataset, Semi-structured Reports.

import uuid

from langchain.smith import RunEvalConfig
from langsmith.client import Client

# Config
client = Client()
eval_config = RunEvalConfig(
    evaluators=["cot_qa"],
)
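
# Optional: "cot_qa" grades each answer against the dataset's reference answer with
# chain-of-thought reasoning. To pin the grading model explicitly, RunEvalConfig also
# accepts an eval_llm (assumed available in this version; uncomment to use):
# eval_config = RunEvalConfig(
#     evaluators=["cot_qa"],
#     eval_llm=ChatOpenAI(model="gpt-4", temperature=0),
# )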

# Experiments
chain_map = {
    "page_split": stor_chain["page_split-oai"],
    "baseline-50-tok": stor_chain["50_tok_split-oai"],
    "baseline-100-tok": stor_chain["100_tok_split-oai"],
    "baseline-250-tok": stor_chain["250_tok_split-oai"],
    "baseline-250-tok-mixtral": stor_chain["250_tok_split-mixtral"],
}

# Run evaluation
run_id = uuid.uuid4().hex[:4]
test_runs = {}
for project_name, chain in chain_map.items():
    test_runs[project_name] = client.run_on_dataset(
        dataset_name=task.name,
        llm_or_chain_factory=lambda: (lambda x: x["question"]) | chain,
        evaluation=eval_config,
        verbose=True,
        project_name=f"{run_id}-{project_name}",
        project_metadata={"chain": project_name},
    )
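
Once the runs complete, the feedback scores can be pulled back with the LangSmith client to compare configurations side by side. The sketch below averages whatever feedback keys the evaluator produced for each project; the exact key name emitted by "cot_qa" can vary by version.

# Summarize average feedback per experiment (sketch; feedback keys vary by evaluator version)
from collections import defaultdict

for project_name in chain_map:
    runs = list(client.list_runs(project_name=f"{run_id}-{project_name}"))
    scores = defaultdict(list)
    for fb in client.list_feedback(run_ids=[r.id for r in runs]):
        if fb.score is not None:
            scores[fb.key].append(fb.score)
    for key, vals in scores.items():
        print(f"{project_name} | {key}: {sum(vals) / len(vals):.2f}")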