Semi-structured eval: Multi-vector#
Semi-structured Reports
is a public dataset that contains question-answer pairs from documents with text and tables.
The question-answer pairs are derived from the tables as well as some of the paragraphs in the docs.
We evaluate performance using a multi-vector retriever for RAG.
Pre-requisites#
# %pip install -U langchain langsmith langchain_benchmarks
# %pip install --quiet chromadb openai pypdf tiktoken
import getpass
import os
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
env_vars = ["LANGCHAIN_API_KEY", "OPENAI_API_KEY"]
for var in env_vars:
    if var not in os.environ:
        os.environ[var] = getpass.getpass(prompt=f"Enter your {var}: ")
Dataset#
Fetch the PDFs associated with the dataset from the remote cache so that we can perform ingestion.
import os
from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names
# Task
task = registry["Semi-structured Reports"]
# Files used
paths = list(get_file_names())
files = [str(p) for p in paths]
Clone the dataset so that it’s available in our LangSmith datasets.
clone_public_dataset(task.dataset_id, dataset_name=task.name)
Load and index#
We build a retriever that focuses on tables.
To do this, we use an LLM to scan each page and summarize any tables within it.
We then index those summaries for retrieval and store the raw page text containing the table with a multi-vector retriever.
Finally, we use an ensemble retriever to mix the retrieved table chunks with the raw text chunks. The ensemble retriever:

- Combines the rankings from the different retrievers into a single, unified ranking.
- Takes from each retriever a list of documents (or search results) ranked by relevance to the query.
- Uses the weights to express the relative importance or trust you place in each retriever’s results, scaling that retriever’s contribution to the final combined ranking.
- Applies Reciprocal Rank Fusion (RRF), which scores items by their rank in each retriever’s list: the basic idea is to give higher scores to items that are ranked higher (i.e., have a lower rank number). A minimal sketch of this scoring follows the list.
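To make the fusion concrete, here is a minimal sketch of weighted RRF, assuming the conventional smoothing constant k=60 from the RRF literature; the `fuse` helper and the toy document ids are illustrative only, not part of the benchmark code:

from collections import defaultdict

def fuse(rankings, weights, k=60):
    """Weighted RRF sketch: rankings is a list of ranked doc-id lists."""
    scores = defaultdict(float)
    for ranked_docs, weight in zip(rankings, weights):
        for rank, doc in enumerate(ranked_docs, start=1):
            # Higher-ranked docs (lower rank number) receive larger scores
            scores[doc] += weight / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# Toy example: two retrievers disagree on ordering; the 0.75-weighted one wins
print(fuse([["a", "b", "c"], ["b", "c", "a"]], weights=[0.75, 0.25]))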
import uuid
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
def prepare_documents(docs):
    """
    Prepare documents for the prompt. Concatenates Document objects (after
    extracting their page_content) and strings into a single string,
    separated by two newlines.
    :param docs: A list of str or Document objects.
    :return: A single string containing all documents.
    """
    # Extract page_content from Documents; pass plain strings through unchanged
    processed_docs = [
        doc.page_content if isinstance(doc, Document) else doc for doc in docs
    ]
    # Join all processed documents into a single string
    return "\n\n".join(processed_docs)
def create_multi_vector_retriever(vectorstore, text_summaries, texts):
    """
    Create a retriever that indexes summaries, but returns the raw texts
    """
    # Initialize the storage layer for the raw documents
    store = InMemoryStore()
    id_key = "doc_id"
    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        # Summaries are embedded and indexed; raw contents share the same ids
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add the text summaries and their raw source texts
    add_documents(retriever, text_summaries, texts)
    return retriever
def generate_doc_summary(file):
    """
    Create a doc summary
    """
    # Prompt
    prompt_text = """You are an assistant tasked with extracting two attributes \
    from financial documents. (1) Tell me the company that the document is \
    focused on. (2) Look at any tables in the document and tell me the units \
    of the table. Many tables will have '(In thousands)' or '(in millions)' prior \
    to the table text. Provide these two for the document: \n\n {document} """
    prompt = ChatPromptTemplate.from_template(prompt_text)
    # Doc summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
    summarize_chain = {"document": lambda x: x} | prompt | model | StrOutputParser()
    # Load the doc and concatenate all pages into a single string
    loader = PyPDFLoader(file)
    pdf_pages = loader.load()
    texts = [t.page_content for t in pdf_pages]
    text_string = " ".join(texts)
    # The input map passes the raw string through to the {document} variable
    summary = summarize_chain.invoke(text_string)
    return summary
def generate_table_summaries(texts):
    """
    Summarize any tables within each text element
    texts: List of str
    """
    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables within a provided text chunk. \
    If the text chunk contains tables, then give a brief summary of the table and list the row and column \
    names to identify what is captured in the table. Do not summarize quantitative results in the table. \
    If there is no table present, then just return "No table". \n\n Text: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)
    # Table summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4")
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
    # Summarize chunks in parallel, up to 5 at a time
    text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
    return text_summaries
def load_and_split(file, token_count, split_document=True):
    """
    Load and optionally split PDF files.

    Args:
        file (str): File path.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting or returning pages.
    """
    loader = PyPDFLoader(file)
    pdf_pages = loader.load()
    if split_document:
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=token_count, chunk_overlap=50
        )
        docs = text_splitter.split_documents(pdf_pages)
        texts = [d.page_content for d in docs]
    else:
        texts = [d.page_content for d in pdf_pages]
    print(f"There are {len(texts)} text elements")
    return texts
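For example, you could split a PDF into ~512-token chunks, or keep whole pages (the file name here is hypothetical):

chunks = load_and_split("acme_10k.pdf", token_count=512)
pages = load_and_split("acme_10k.pdf", token_count=None, split_document=False)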
def load_files(files, token_count, split_document):
    """
    Load files, returning the text chunks and a parallel list of per-file
    doc summaries (doc_summaries[i] summarizes the file that produced texts[i]).

    Args:
        files (list): List of file paths.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting documents.
    """
    texts = []
    doc_summaries = []
    for fi in files:
        # Summarize each doc once, then attach that summary to each of its chunks
        doc_summary = generate_doc_summary(fi)
        file_texts = load_and_split(fi, token_count, split_document)
        texts.extend(file_texts)
        doc_summaries.extend([doc_summary] * len(file_texts))
    return texts, doc_summaries
def rag_chain(retriever):
    """
    RAG chain.

    Args:
        retriever: The retriever to use.
    """
    # Prompt template
    template = """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)
    # LLM
    model = ChatOpenAI(temperature=0, model="gpt-4")
    # RAG pipeline: retrieve -> stuff context into the prompt -> generate -> parse
    chain = (
        {
            "context": retriever | RunnableLambda(prepare_documents),
            "question": RunnablePassthrough(),
        }
        | prompt
        | model
        | StrOutputParser()
    )
    return chain
# Experiment configurations
experiments = [
    (None, False, "page_split_multivector"),
]

# Run
stor_chain = {}
for token_count, split_document, expt in experiments:
    # Get texts and per-chunk doc summaries
    doc_texts, doc_summaries = load_files(files, token_count, split_document)
    # Get table summaries
    doc_table_summaries = generate_table_summaries(doc_texts)
    # Prepend each chunk's doc summary to its table summary to preserve context
    doc_text_summaries = [
        "Here is a summary of the doc: \n\n"
        + s
        + "\n\n Here is a summary of a table within this doc: \n\n"
        + t
        for s, t in zip(doc_summaries, doc_table_summaries)
    ]
    # The vectorstore to use to index the summaries
    vectorstore = Chroma(collection_name=expt, embedding_function=OpenAIEmbeddings())
    # Create our table retriever, which indexes summaries but returns raw chunks
    table_retriever = create_multi_vector_retriever(
        vectorstore, doc_text_summaries, doc_texts
    )
    # Create our docs retriever over the raw text chunks
    vectorstore_docs = Chroma.from_texts(
        texts=doc_texts, collection_name=expt + "docs", embedding=OpenAIEmbeddings()
    )
    docs_retriever = vectorstore_docs.as_retriever()
    # Initialize the ensemble retriever, weighting the table retriever more heavily
    ensemble_retriever = EnsembleRetriever(
        retrievers=[table_retriever, docs_retriever], weights=[0.75, 0.25]
    )
    # Chain
    stor_chain[expt] = rag_chain(ensemble_retriever)
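With the chain built, you can sanity-check it on a single question before kicking off the full eval (the question string below is hypothetical; any question from the dataset would do):

# Hypothetical smoke test of the assembled RAG chain
print(stor_chain["page_split_multivector"].invoke("What was total revenue, in thousands?"))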
Eval#
Run eval on our dataset, Semi-structured Reports.
import uuid
from langchain.smith import RunEvalConfig
from langsmith.client import Client
# Config
client = Client()
eval_config = RunEvalConfig(
    evaluators=["cot_qa"],
)
# Experiments
chain_map = {
    "page_split_multivector_ensemble": stor_chain["page_split_multivector"],
}
# Run evaluation
run_id = uuid.uuid4().hex[:4]
test_runs = {}
for project_name, chain in chain_map.items():
    test_runs[project_name] = client.run_on_dataset(
        dataset_name=task.name,
        llm_or_chain_factory=lambda: (lambda x: x["Question"]) | chain,
        evaluation=eval_config,
        verbose=True,
        project_name=f"{run_id}-{project_name}",
        project_metadata={"chain": project_name},
    )
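When the runs finish, aggregate feedback appears in the LangSmith UI under each project. You can also pull the runs programmatically; this is a minimal sketch using the langsmith client, where `feedback_stats` holds the aggregated `cot_qa` scores for each top-level run:

# Sketch: list the top-level eval runs for one project and print feedback stats
project = f"{run_id}-page_split_multivector_ensemble"
for run in client.list_runs(project_name=project, execution_order=1):
    print(run.name, run.feedback_stats)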