Semi-structured eval: Multi vector#

Semi-structured Reports is a public dataset that contains question-answer pairs from documents with text and tables.

The question-answer pairs are derived from the tables as well as some of the paragraphs in the docs.

We evaluate RAG performance using a multi-vector retriever.

Pre-requisites#

# %pip install -U langchain langsmith langchain_benchmarks
# %pip install --quiet chromadb openai pypdf tiktoken
import getpass
import os

os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
env_vars = ["LANGCHAIN_API_KEY", "OPENAI_API_KEY"]
for var in env_vars:
    if var not in os.environ:
        os.environ[var] = getpass.getpass(prompt=f"Enter your {var}: ")

Dataset#

Fetch the associated PDFs from remote cache for the dataset so that we can perform ingestion.

import os

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Task
task = registry["Semi-structured Reports"]

# Files used
paths = list(get_file_names())
files = [str(p) for p in paths]

Clone the dataset so that it’s available in our LangSmith datasets.

clone_public_dataset(task.dataset_id, dataset_name=task.name)

Load and index#

We build a retriever that focuses on tables.

To do this, we use an LLM to scan each page and summarize any tables it contains.

We then index those summaries for retrieval and store the raw page text containing each table in the multi-vector retriever's docstore.

Finally, we use an ensemble retriever to mix the retrieved table chunks with raw text chunks (a sketch of the fusion logic follows this list):

  • Combines the rankings from different retrievers into a single, unified ranking.

  • Each retriever provides a list of documents (or search results) ranked based on their relevance to the query.

  • The weights represent the relative importance or trust you place in each retriever’s results.

  • The weights are used to scale the contribution of each retriever to the final combined ranking.

  • The Reciprocal Rank Fusion (RRF) method uses the rank of each item in the lists provided by the retrievers.

  • The basic idea is to give higher scores to items that are ranked higher (i.e., have a lower rank number) in the lists.
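To make the fusion concrete, here is a minimal sketch of weighted RRF. This is an illustrative re-implementation of the idea, not LangChain's internal code; weighted_rrf, ranked_lists, and k are hypothetical names.

# Illustrative sketch of weighted Reciprocal Rank Fusion (not LangChain internals)
def weighted_rrf(ranked_lists, weights, k=60):
    """Fuse ranked lists of doc ids into a single ranking.

    ranked_lists: one list of doc ids per retriever, best first.
    weights: relative trust placed in each retriever's results.
    k: smoothing constant from the RRF paper (60 is a common default).
    """
    scores = {}
    for docs, weight in zip(ranked_lists, weights):
        for rank, doc_id in enumerate(docs, start=1):
            # Items ranked higher (lower rank number) receive larger scores,
            # scaled by the retriever's weight.
            scores[doc_id] = scores.get(doc_id, 0.0) + weight / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)


# Example: the first retriever is trusted 3x more than the second
weighted_rrf([["t1", "t2"], ["p9", "t2"]], weights=[0.75, 0.25])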

import uuid

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


def prepare_documents(docs):
    """
    Prepare documents for prompt. Concatenates Document objects (after extracting their page_content)
    and strings into a single string, separated by two newlines.

    :param docs: A list of str or Document objects.
    :return: A single string containing all documents.
    """
    # Process each document and append it to the list
    processed_docs = [
        doc.page_content if isinstance(doc, Document) else doc for doc in docs
    ]

    # Join all processed documents into a single string
    return "\n\n".join(processed_docs)


def create_multi_vector_retriever(vectorstore, text_summaries, texts):
    """
    Create a retriever that indexes table summaries but returns the raw texts
    """

    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add the table summaries, mapped to their raw source texts
    add_documents(retriever, text_summaries, texts)
    return retriever


def generate_doc_summary(file):
    """
    Create a doc summary
    """

    # Prompt
    prompt_text = """You are an assistant tasked with extracting two attributes \
    from financial documents. (1) Tell me the company that the document is \
    focused on. (2) Look at any tables in the document and tell me the units \
    of the table. Many tables will have '(In thousands)' or '(in millions)' prior \
    to the table text. Provide these two for the document: \n\n {document} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
    summarize_chain = {"document": lambda x: x} | prompt | model | StrOutputParser()

    # Load doc
    loader = PyPDFLoader(file)
    pdf_pages = loader.load()
    texts = [t.page_content for t in pdf_pages]
    text_string = " ".join(texts)
    summary = summarize_chain.invoke({"document": text_string})
    return summary


def generate_table_summaries(texts):
    """
    Summarize text elements
    texts: List of str
    """

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables within a provided text chunk. \
    If the text chunk contains tables, then give a brief summary of the table and list the row and column \
    names to identify what is captured in the table. Do not summarize quantitative results in the table. \
    If there is no table present, then just return "No table". \n\n Text: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4")
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Summarize each text chunk, up to five concurrently
    text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

    return text_summaries


def load_and_split(file, token_count, split_document=True):
    """
    Load and optionally split PDF files.

    Args:
        file (str): File path.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting or returning pages.
    """

    loader = PyPDFLoader(file)
    pdf_pages = loader.load()

    if split_document:
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=token_count, chunk_overlap=50
        )

        docs = text_splitter.split_documents(pdf_pages)
        texts = [d.page_content for d in docs]
    else:
        texts = [d.page_content for d in pdf_pages]

    print(f"There are {len(texts)} text elements")
    return texts


def load_files(files, token_count, split_document):
    """
    Load and split a list of files.

    Args:
        files (list): List of file paths.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting documents.
    """

    texts = []
    for fi in files:
        # Note: only the summary of the last file processed is returned
        doc_summary = generate_doc_summary(fi)
        texts.extend(load_and_split(fi, token_count, split_document))
    return texts, doc_summary


def rag_chain(retriever):
    """
    RAG chain.

    Args:
        retriever: The retriever to use.
    """

    # Prompt template
    template = """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    # LLM
    model = ChatOpenAI(temperature=0, model="gpt-4")

    # RAG pipeline
    chain = (
        {
            "context": retriever | RunnableLambda(prepare_documents),
            "question": RunnablePassthrough(),
        }
        | prompt
        | model
        | StrOutputParser()
    )
    return chain


# Experiment configurations
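# Each tuple is (token_count, split_document, experiment_name);
# token_count is unused when split_document is False (pages are kept whole)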
experiments = [
    (None, False, "page_split_multivector"),
]

# Run
stor_chain = {}
for token_count, split_document, expt in experiments:
    # Get texts and doc summary
    doc_texts, doc_summary = load_files(files, token_count, split_document)

    # Get table summaries
    doc_table_summaries = generate_table_summaries(doc_texts)

    # Add doc summary to table summary to preserve context
    doc_text_summaries = [
        "Here is a summary of the doc: \n\n"
        + doc_summary
        + "\n\n Here is a summary of a table within this doc: \n\n"
        + t
        for t in doc_table_summaries
    ]

    # The vectorstore to use to index the summaries
    vectorstore = Chroma(collection_name=expt, embedding_function=OpenAIEmbeddings())

    # Create our table retriever
    table_retriever = create_multi_vector_retriever(
        vectorstore, doc_table_summaries, doc_texts
    )

    # Create our docs retriever
    vectorstore_docs = Chroma.from_texts(
        texts=doc_texts, collection_name=expt + "docs", embedding=OpenAIEmbeddings()
    )
    docs_retriever = vectorstore_docs.as_retriever()

    # Initialize ensemble retriever
    ensemble_retriever = EnsembleRetriever(
        retrievers=[table_retriever, docs_retriever], weights=[0.75, 0.25]
    )

    # Chain
    stor_chain[expt] = rag_chain(ensemble_retriever)
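Before running the full evaluation, it can help to sanity-check a chain directly. A minimal sketch, assuming the ingestion cell above has run; the question string is hypothetical:

# Ask a single (hypothetical) question against the ensemble-backed RAG chain
chain = stor_chain["page_split_multivector"]
chain.invoke("What were total operating expenses, in thousands?")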

Eval#

Run eval on our dataset, Semi-structured Reports.

import uuid

from langchain.smith import RunEvalConfig
from langsmith.client import Client

# Config
client = Client()
eval_config = RunEvalConfig(
    evaluators=["cot_qa"],
)

# Experiments
chain_map = {
    "page_split_multivector_emsemble": stor_chain["page_split_multivector"],
}

# Run evaluation
run_id = uuid.uuid4().hex[:4]
test_runs = {}
for project_name, chain in chain_map.items():
    test_runs[project_name] = client.run_on_dataset(
        dataset_name=task.name,
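        # Each dataset example is a dict; extract its "Question" value
        # before piping it into the RAG chain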
        llm_or_chain_factory=lambda: (lambda x: x["Question"]) | chain,
        evaluation=eval_config,
        verbose=True,
        project_name=f"{run_id}-{project_name}",
        project_metadata={"chain": project_name},
    )