Semi-structured eval: Chunk size tuning#
Semi-structured Reports
is a public dataset that contains question-answer pairs from documents with text and tables.
The question-answer pairs are derived from the tables as well as some of the paragraphs in the docs.
We evaluate the performance of various chunk sizes with RAG.
# %pip install -U langchain langsmith langchain_benchmarks
# %pip install --quiet chromadb openai pypdf tiktoken fireworks-ai
import getpass
import os
# Point the LangSmith client at the public LangSmith API endpoint.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

# Credentials needed below: LangSmith (dataset + eval runs), OpenAI
# (embeddings and the GPT-4 chain) and Fireworks (the Mixtral chain).
env_vars = ["LANGCHAIN_API_KEY", "OPENAI_API_KEY", "FIREWORKS_API_KEY"]

# Prompt only for keys that are not already set, so values exported in the
# shell are never overwritten.
for var in env_vars:
    if var not in os.environ:
        os.environ[var] = getpass.getpass(prompt=f"Enter your {var}: ")
Fetch the associated PDFs from remote cache for the dataset so that we can perform ingestion.
import os
from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names
# Look up the benchmark task in the registry by its public name.
task = registry["Semi-structured Reports"]

# Materialise the file paths reported for this task, then keep a parallel
# list of plain strings for the loaders that expect string paths.
paths = list(get_file_names())
files = list(map(str, paths))
Clone the dataset so that it’s available in our LangSmith datasets.
Dataset Semi-structured Reports already exists. Skipping.
You can access the dataset at
Load and index#
We load each file, split it, embed it with OpenAIEmbeddings, and create an index with Chroma.
from langchain.chat_models import ChatFireworks, ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
def load_and_split(file, token_count, split_document=True):
    """
    Load and optionally split PDF files.

    Args:
        file (str): File path.
        token_count (int): Token count for splitting. Only used when
            ``split_document`` is true.
        split_document (bool): Flag for splitting or returning pages.

    Returns:
        list[str]: The text of each chunk (when splitting) or of each PDF
        page (when not splitting).
    """
    loader = PyPDFLoader(file)
    pdf_pages = loader.load()

    if split_document:
        # Re-chunk the pages by token count, with a fixed 50-token overlap.
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=token_count, chunk_overlap=50
        )
        docs = text_splitter.split_documents(pdf_pages)
        texts = [d.page_content for d in docs]
    else:
        # No splitting requested: one text element per PDF page.
        texts = [d.page_content for d in pdf_pages]

    print(f"There are {len(texts)} text elements")
    return texts
def load_files(files, token_count, split_document):
    """
    Load files.

    Args:
        files (list): List of file paths.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting documents.

    Returns:
        list[str]: Text elements from every file, concatenated in the order
        the files were given.
    """
    texts = []
    for fi in files:
        texts.extend(load_and_split(fi, token_count, split_document))
    return texts
def make_retriever(texts, expt):
    """
    Make vector store.

    Args:
        texts (list): List of texts.
        expt (str): Experiment name, used as the Chroma collection name.

    Returns:
        The retriever obtained from the newly built vector store.
    """
    vectorstore = Chroma.from_texts(
        texts=texts, collection_name=expt, embedding=OpenAIEmbeddings()
    )
    retriever = vectorstore.as_retriever()
    return retriever
def rag_chain(retriever, llm):
    """
    RAG chain.

    Args:
        retriever: The retriever to use.
        llm: The llm to use. ``"mixtral"`` selects the Fireworks-hosted
            Mixtral model; any other value selects OpenAI GPT-4.

    Returns:
        A runnable that takes a question string and returns the model's
        answer as a string.
    """
    # Prompt template. Both placeholders are required: the pipeline below
    # supplies "context" (retrieved text) and "question" (the raw input).
    template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
    prompt = ChatPromptTemplate.from_template(template)

    if llm == "mixtral":
        model = ChatFireworks(
            model="accounts/fireworks/models/mixtral-8x7b-instruct", temperature=0
        )
    else:
        model = ChatOpenAI(temperature=0, model="gpt-4")

    # RAG pipeline
    chain = (
        {
            # Retrieve documents for the question and join their text into
            # a single context string.
            "context": retriever | (lambda x: "\n\n".join([i.page_content for i in x])),
            "question": RunnablePassthrough(),
        }
        | prompt
        | model
        | StrOutputParser()
    )
    return chain
# Experiment configurations.
# Each entry is (token_count, split_document, experiment_name, llm), in the
# order the run loop unpacks them. token_count is None for the page-level
# run because no splitting is performed there.
experiments = [
    (None, False, "page_split-oai", "oai"),
    (50, True, "50_tok_split-oai", "oai"),
    (100, True, "100_tok_split-oai", "oai"),
    (250, True, "250_tok_split-oai", "oai"),
    (250, True, "250_tok_split-mixtral", "mixtral"),
]
# Run: build one index and one RAG chain per experiment configuration,
# keyed by experiment name.
stor_chain = {}
for token_count, split_document, expt, llm in experiments:
    # Load (and optionally split) every file with this configuration.
    texts = load_files(files, token_count, split_document)
    # The experiment name doubles as the vector store collection name.
    retriever = make_retriever(texts, expt)
    stor_chain[expt] = rag_chain(retriever, llm)
Run eval on our dataset, Semi-structured Reports.
import uuid
from langchain.smith import RunEvalConfig
from langsmith.client import Client
# Config
client = Client()
# NOTE(review): the argument list of this call was missing from the source.
# "cot_qa" (chain-of-thought QA correctness grading) is assumed here as the
# evaluator for question-answer pairs -- confirm against the intended setup.
eval_config = RunEvalConfig(
    evaluators=["cot_qa"],
)
# Experiments: map the project name used for each evaluation run to the
# chain built for the corresponding experiment.
chain_map = {
    "page_split": stor_chain["page_split-oai"],
    "baseline-50-tok": stor_chain["50_tok_split-oai"],
    "baseline-100-tok": stor_chain["100_tok_split-oai"],
    "baseline-250-tok": stor_chain["250_tok_split-oai"],
    "baseline-250-tok-mixtral": stor_chain["250_tok_split-mixtral"],
}
# Run evaluation
# Short random tag: the first four hex characters of a fresh UUID.
# NOTE(review): presumably used to keep project names unique across
# repeated runs -- its use is not visible in this section; confirm.
run_id = uuid.uuid4().hex[:4]
# Results of each evaluation run, keyed by project name.
test_runs = {}
for project_name, chain in chain_map.items():
test_runs[project_name] = client.run_on_dataset(,
llm_or_chain_factory=lambda: (lambda x: x["question"]) | chain,
project_metadata={"chain": project_name},