```python
from os import listdir
from os.path import isfile, isdir, join

def get_files(dir):
    file_list = []
    for f in listdir(dir):
        if isfile(join(dir, f)):
            file_list.append(join(dir, f))
        elif isdir(join(dir, f)):
            file_list = file_list + get_files(join(dir, f))
    return file_list
```
Once all files have been collected into a list, we can read the contents of the files that contain text. In this tool, we will initially support MS Word documents (with the ".docx" extension), PDF documents, MS PowerPoint presentations (with the ".pptx" extension), and plain text files (with the ".txt" extension).
To read MS Word documents, we can use the python-docx library. A function that reads a document into a string variable looks like this:
```python
import docx

def getTextFromWord(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)
```
A similar approach works for MS PowerPoint files. For this, we need to download and install the python-pptx library and write a function like the following:
```python
from pptx import Presentation

def getTextFromPPTX(filename):
    prs = Presentation(filename)
    fullText = []
    for slide in prs.slides:
        for shape in slide.shapes:
            # Only shapes with a text frame carry text; skip pictures, charts, etc.
            if shape.has_text_frame:
                fullText.append(shape.text)
    return '\n'.join(fullText)
```
Reading plain text files is straightforward:
```python
f = open(file, 'r')
file_content = f.read()
f.close()
```
For PDF files, we will use the PyPDF2 library in this case:
```python
import PyPDF2

reader = PyPDF2.PdfReader(file)
file_content = ""
for i in range(0, len(reader.pages)):
    file_content = file_content + " " + reader.pages[i].extract_text()
```
```python
from fastapi import FastAPI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import environment_var
import os
from openai import OpenAI
```
As mentioned earlier, we use FastAPI to create the API. We will use the qdrant_client library to access the indexed data we created, with additional support from the langchain_qdrant library. For embeddings and for loading the Llama 3 model locally, we will use PyTorch and the Transformers library. In addition, we will use the OpenAI library to call the NVIDIA NIM API, with the API keys stored in the environment_var file we created (for both Nvidia and HuggingFace).
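The initialization that this paragraph describes is not reproduced in the excerpt before the endpoints are defined, so the following is only a minimal sketch of how it could be wired up. The collection name, Qdrant path, embedding model, local model checkpoint, and the attribute names on environment_var (nvidia_key, hf_token) are illustrative assumptions, not necessarily the exact values used:

```python
app = FastAPI()

# Request body shared by both endpoints: a single query string (assumed shape).
class Item(BaseModel):
    query: str

# Embedding model used to vectorize queries (model name is an assumption).
hf = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Connect to the locally persisted Qdrant index (path and collection name are assumptions).
client = QdrantClient(path="qdrant/")
qdrant = Qdrant(client=client, collection_name="MyCollection", embeddings=hf)

use_nvidia_api = False
if environment_var.nvidia_key != "":
    # Call Llama 3 through the NVIDIA NIM API via the OpenAI-compatible client.
    client_ai = OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key=environment_var.nvidia_key,
    )
    use_nvidia_api = True
else:
    # Otherwise load Llama 3 locally with Transformers (requires a HuggingFace token).
    model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=environment_var.hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=environment_var.hf_token,
    )
```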
@app.post("/search") defsearch(Item:Item): query = Item.query search_result = qdrant.similarity_search( query=query, k=10 ) i = 0 list_res = [] for res in search_result: list_res.append({"id":i,"path":res.metadata.get("path"),"content":res.page_content}) return list_res
@app.post("/ask_localai") asyncdefask_localai(Item:Item): query = Item.query search_result = qdrant.similarity_search( query=query, k=10 ) i = 0 list_res = [] context = "" mappings = {} i = 0 for res in search_result: context = context + str(i)+"\n"+res.page_content+"\n\n" mappings[i] = res.metadata.get("path") list_res.append({"id":i,"path":res.metadata.get("path"),"content":res.page_content}) i = i +1
rolemsg = {"role": "system", "content": "Answer user's question using documents given in the context. In the context are documents that should contain an answer. Please always reference document id (in squere brackets, for example [0],[1]) of the document that was used to make a claim. Use as many citations and documents as it is necessary to answer question."} messages = [ rolemsg, {"role": "user", "content": "Documents:\n"+context+"\n\nQuestion: "+query}, ] if use_nvidia_api: completion = client_ai.chat.completions.create( model="meta/llama3-70b-instruct", messages=messages, temperature=0.5, top_p=1, max_tokens=1024, stream=False ) response = completion.choices[0].message.content else: input_ids = tokenizer.apply_chat_template( messages, add_generation_prompt=True, return_tensors="pt" ).to(model.device)
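The excerpt stops right after building input_ids for the local model, so the rest of the local branch and the endpoint's return value are not shown. A minimal sketch of how it could be completed, assuming the standard Transformers generate/decode pattern and a response with "answer" and "context" fields (which is what the Streamlit front end below reads), might look like this:

```python
        # Assumed completion of the else branch above: generate with the local Llama 3
        # model and decode only the newly generated tokens (everything after the prompt).
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
        outputs = model.generate(
            input_ids,
            max_new_tokens=1024,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.5,
            top_p=1,
        )
        response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

    # Return the generated answer together with the retrieved documents,
    # since the front end expects both an "answer" and a "context" field.
    return {"context": list_res, "answer": response}
```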
```python
import re
import streamlit as st
import requests
import json

st.title('_:blue[Local GenAI Search]_ :sunglasses:')
question = st.text_input("Ask a question based on your local files", "")
if st.button("Ask a question"):
    st.write("The current question is \"", question + "\"")
    url = "http://127.0.0.1:8000/ask_localai"
```
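The snippet above defines the endpoint URL, but the excerpt does not show the request itself before the response is parsed. A minimal sketch of the missing call, assuming the body simply wraps the question in a query field (matching the Item model on the API side), would be:

```python
    # Send the question to the FastAPI backend; the payload shape mirrors the Item model.
    payload = json.dumps({"query": question})
    headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
    response = requests.request("POST", url, headers=headers, data=payload)
```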
```python
    answer = json.loads(response.text)["answer"]
    rege = re.compile(r"\[Document\ [0-9]+\]|\[[0-9]+\]")
    m = rege.findall(answer)
    num = []
    for n in m:
        num = num + [int(s) for s in re.findall(r'\b\d+\b', n)]

    st.markdown(answer)
    documents = json.loads(response.text)['context']
    show_docs = []
    for n in num:
        for doc in documents:
            if int(doc['id']) == n:
                show_docs.append(doc)
    a = 1244
    for doc in show_docs:
        with st.expander(str(doc['id']) + " - " + doc['path']):
            st.write(doc['content'])
            with open(doc['path'], 'rb') as f:
                st.download_button("Download file", f,
                                   file_name=doc['path'].split('/')[-1],
                                   key=a)
                a = a + 1
```