```python
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="TAGIV.pdf",                           # mandatory
    strategy="hi_res",                              # mandatory to use the ``hi_res`` strategy
    extract_images_in_pdf=True,                     # mandatory to set as ``True``
    extract_image_block_types=["Image", "Table"],   # optional
    extract_image_block_to_payload=False,           # optional
    extract_image_block_output_dir="saved_images",  # optional - only works when ``extract_image_block_to_payload=False``
)
```
```python
from unstructured.chunking.title import chunk_by_title  # might be better for an article
from typing import Any

chunks = chunk_by_title(elements)
```
```python
# Count the different element categories in the document
category_counts = {}

for element in chunks:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# unique_categories will hold the unique element types
unique_categories = set(category_counts.keys())
category_counts
```
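The categorization step below wraps each chunk in an `Element` record. That pydantic model is only defined later, in the Streamlit version of the code, so it is repeated here (verbatim from that definition) to keep the snippet self-contained:

```python
from pydantic import BaseModel

# Lightweight wrapper: the chunk's category ("text" or "table") plus its raw text
class Element(BaseModel):
    type: str
    text: Any
```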
```python
# Categorize by type
categorized_elements = []
for element in chunks:
    if "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))
    elif "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]
```
```python
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
```
## Retriever
```python
# Prompt
prompt_text = """You are an expert Research Assistant tasked with summarizing tables and texts \
from research articles. Give a concise summary of the text. text chunk: {element} """
```
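The excerpt never shows how `prompt_text` is turned into the `text_summaries` and `table_summaries` used further down. A minimal sketch, assuming a plain LCEL summarization chain over the chunk text (the model choice is an assumption):

```python
# Minimal sketch (assumed): build an LCEL chain from the prompt above and
# batch-summarize the text and table chunks extracted earlier.
prompt = ChatPromptTemplate.from_template(prompt_text)
model = ChatOpenAI(temperature=0, model="gpt-4o-mini")  # model choice is an assumption
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

# These two lists feed the multi-vector retriever below
text_summaries = summarize_chain.batch([e.text for e in text_elements], {"max_concurrency": 5})
table_summaries = summarize_chain.batch([e.text for e in table_elements], {"max_concurrency": 5})
```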
```python
import os

def generate_img_summaries(path):
    """
    Generate summaries and base64 encoded strings for images
    path: Path to list of .jpg files extracted by Unstructured
    """
    # Store base64 encoded images
    img_base64_list = []

    # Store image summaries
    image_summaries = []

    # Prompt
    prompt = """You are an assistant tasked with summarizing images for retrieval. \
These summaries will be embedded and used to retrieve the raw image. \
Give a concise summary of the image that is well optimized for retrieval."""

    # Apply to images
    # Note: encode_image() and image_summarize() are helpers; a sketch follows below.
    for img_file in sorted(os.listdir(path)):
        if img_file.endswith(".jpg"):
            img_path = os.path.join(path, img_file)
            base64_image = encode_image(img_path)
            img_base64_list.append(base64_image)
            image_summaries.append(image_summarize(base64_image, prompt))

    # Return both lists so callers can index images alongside their summaries
    return img_base64_list, image_summaries
```
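`generate_img_summaries` leans on two helpers that never appear in the excerpt (the real versions presumably live in `utils/image_processing.py`, which the app imports later). A sketch of what they plausibly look like; the model choice and exact wiring are assumptions:

```python
import base64

from langchain_core.messages import HumanMessage

def encode_image(image_path):
    """Read an image file and return it as a base64 string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def image_summarize(img_base64, prompt):
    """Ask a multimodal chat model for a retrieval-friendly image summary."""
    chat = ChatOpenAI(model="gpt-4o-mini", max_tokens=1024)  # model choice is an assumption
    msg = chat.invoke([
        HumanMessage(content=[
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}},
        ])
    ])
    return msg.content
```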
```python
import uuid  # needed for the doc IDs below

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
```
```python
def create_multi_vector_retriever(
    vectorstore, text_summaries, texts, table_summaries, tables, image_summaries, images
):
    """
    Create retriever that indexes summaries, but returns raw images, tables, or texts
    """
    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
        search_kwargs={"k": 2},  # limit to the top 2 results
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images, checking that each summary list is
    # non-empty before adding
    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    if image_summaries:
        add_documents(retriever, image_summaries, images)

    return retriever
```
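Note the design: only the short summaries are embedded and searched in the vectorstore, while the `doc_id` metadata links each summary back to the raw chunk in the docstore. Retrieval therefore matches against compact, retrieval-optimized text but returns the full original text, table, or base64 image.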
### Creating the retriever
Now, let's set up a Chroma vector store with OpenAI's embedding model and create our retriever.
```python
# The vectorstore to use to index the summaries
vectorstore = Chroma(
    collection_name="mm_tagiv_paper",
    embedding_function=OpenAIEmbeddings(),
)
```
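The original cell ran fifteen lines; the part that actually builds `retriever_multi_vector_img` (invoked in the next cell) is missing from the excerpt. Reconstructed from the `create_multi_vector_retriever` signature above, with the argument wiring and image directory assumed:

```python
# Summarize the images extracted by partition_pdf (path is an assumption)
img_base64_list, image_summaries = generate_img_summaries("saved_images")

# Assumed wiring: summaries are indexed; raw chunk text and base64 images are stored
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore,
    text_summaries,
    [e.text for e in text_elements],
    table_summaries,
    [e.text for e in table_elements],
    image_summaries,
    img_base64_list,
)
```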
```python
retriever_multi_vector_img.invoke(
    "How is the performance of TAGI-V for the Boston dataset compared to the other methods?"
)
```
```
['TAGI-V are averaged over 3 random seeds. The test log-likelihood values show that TAGI-V performs better than all other methods in 4 out of the 5 datasets. The TAGI-V method is also competitive for RMSE values where it provides the best results in 2 out of the 5 datasets, i.e., Elevators and KeggD, while it is second best for KeggU and Pol. Both PCA+ VI and NL outperform the others in two datasets.']
```
if data_dict["context"]["images"]: for image in data_dict["context"]["images"]: image_message = { "type": "image_url", "image_url": {"url": f"data:image/jpg;base64,{image}"}, } messages.append(image_message) chat_history = data_dict.get("chat_history", []) formatted_chat_history = "\n".join([f"{m.type}: {m.content}"for m in chat_history])
text_message = { "type": "text", "text": ( "You are a Research Assistant tasked with answering questions on research articles.\n" "You will be given a mixed of text, tables, and image(s) usually of tables, charts or graphs.\n" "Use this information to provide accurate information related to the user question. \n" f"User-provided question: {data_dict['question']}\n\n" "Text and / or tables:\n" f"{formatted_texts}" "Chat History:\n" f"{formatted_chat_history}\n\n" ), } messages.append(text_message) return [HumanMessage(content=messages)]
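The function above is only the prompt-building half; the excerpt never shows how the chain itself is assembled (the app later imports `multi_modal_rag_chain` from `utils.rag_chain`, so the real implementation lives there). A sketch of the likely wiring, following the standard LangChain multi-vector pattern; `split_image_text_types`, the model choice, and the callable `chain_mm_rag` wrapper (matching how it is called below) are assumptions:

```python
import base64

from langchain_core.runnables import RunnableLambda, RunnablePassthrough

def looks_like_base64_image(s):
    """Cheap check: can the string round-trip through base64?"""
    try:
        return base64.b64encode(base64.b64decode(s)) == s.encode()
    except Exception:
        return False

def split_image_text_types(docs):
    """Split retrieved docs into base64 image payloads and plain text/tables."""
    images, texts = [], []
    for doc in docs:
        content = doc.page_content if isinstance(doc, Document) else str(doc)
        (images if looks_like_base64_image(content) else texts).append(content)
    return {"images": images, "texts": texts}

def multi_modal_rag_chain(retriever):
    """Retrieve -> split images from text -> build multimodal prompt -> answer."""
    model = ChatOpenAI(model="gpt-4o", max_tokens=1024)  # model choice is an assumption
    return (
        {
            "context": retriever | RunnableLambda(split_image_text_types),
            "question": RunnablePassthrough(),
        }
        | RunnableLambda(img_prompt_func)
        | model
        | StrOutputParser()
    )

chain = multi_modal_rag_chain(retriever_multi_vector_img)

def chain_mm_rag(question):
    """Small wrapper so the chain can be called like a function below."""
    return chain.invoke(question)
```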
```python
# Second question
query = "What is the performance of the same method for the Concrete dataset compared to the other methods?"
print(chain_mm_rag(query))
```
```python
# Check retrieval
query = "How is the performance of He compared to modified He for the various datasets such as Boston, Concrete etc.?"
docs = retriever_multi_vector_img.invoke(query, limit=6)
```
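Because the docstore returns raw contents, each retrieved `doc` is either a plain string (text or table) or a base64 image payload. A quick way to eyeball what came back, reusing the hypothetical `looks_like_base64_image` helper from the chain sketch and the `plt_img_base64` display helper that the app imports from `utils.rag_chain`:

```python
print(f"{len(docs)} documents retrieved")
for doc in docs:
    if looks_like_base64_image(doc):
        plt_img_base64(doc)  # render the raw retrieved image
    else:
        print(doc[:200], "\n---")
```

Finally, the whole pipeline can be wrapped in a Streamlit app: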
```python
import base64
from io import BytesIO
from typing import Any

import streamlit as st
from PIL import Image
from pydantic import BaseModel

from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma

from utils.image_processing import generate_img_summaries
from utils.retriever import create_multi_vector_retriever
from utils.rag_chain import multi_modal_rag_chain, plt_img_base64
from utils.rag_evaluation import LLM_Metric
```
```python
def process_document(uploaded_file):
    # Process the PDF
    with st.spinner('Processing PDF...'):
        st.sidebar.info('Extracting elements from the PDF...')
        pdf_bytes = uploaded_file.read()
        elements = partition_pdf(
            file=BytesIO(pdf_bytes),
            strategy="hi_res",
            extract_images_in_pdf=True,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_to_payload=False,
            extract_image_block_output_dir="docs/saved_images",
        )
        st.sidebar.success('PDF elements extracted successfully!')

    # Create chunks by title
    with st.spinner('Chunking content...'):
        st.sidebar.info('Creating chunks by title...')
        chunks = chunk_by_title(elements)
        st.sidebar.success('Chunking complete!')

    # Categorize elements
    class Element(BaseModel):
        type: str
        text: Any

    categorized_elements = []
    for element in chunks:
        if "unstructured.documents.elements.CompositeElement" in str(type(element)):
            categorized_elements.append(Element(type="text", text=str(element)))
        elif "unstructured.documents.elements.Table" in str(type(element)):
            categorized_elements.append(Element(type="table", text=str(element)))

    text_elements = [e for e in categorized_elements if e.type == "text"]
    table_elements = [e for e in categorized_elements if e.type == "table"]

with st.sidebar:
    # File upload
    st.subheader('Add your PDF')
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if st.button('Submit'):
        if uploaded_file is not None:
            process_document(uploaded_file)
            st.success('Document processed successfully!')
        else:
            st.error('Please upload a PDF file first.')
```
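To try the app locally, save the script and launch it with `streamlit run app.py` (the filename is an assumption); the sidebar then handles the upload and kicks off `process_document`.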