local_doc_qa.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. from langchain.embeddings.huggingface import HuggingFaceEmbeddings
  2. from langchain.vectorstores import FAISS
  3. from langchain.document_loaders import UnstructuredFileLoader
  4. from models.chatglm_llm import ChatGLM
  5. from configs.model_config import *
  6. import datetime
  7. from textsplitter import ChineseTextSplitter
  8. from typing import List, Tuple
  9. from langchain.docstore.document import Document
  10. import numpy as np
  11. from utils import torch_gc
  12. from tqdm import tqdm
  13. DEVICE_ = EMBEDDING_DEVICE
  14. DEVICE_ID = "0" if torch.cuda.is_available() else None
  15. DEVICE = f"{DEVICE_}:{DEVICE_ID}" if DEVICE_ID else DEVICE_
  16. def load_file(filepath):
  17. if filepath.lower().endswith(".md"):
  18. loader = UnstructuredFileLoader(filepath, mode="elements")
  19. docs = loader.load()
  20. elif filepath.lower().endswith(".pdf"):
  21. loader = UnstructuredFileLoader(filepath)
  22. textsplitter = ChineseTextSplitter(pdf=True)
  23. docs = loader.load_and_split(textsplitter)
  24. else:
  25. loader = UnstructuredFileLoader(filepath, mode="elements")
  26. textsplitter = ChineseTextSplitter(pdf=False)
  27. docs = loader.load_and_split(text_splitter=textsplitter)
  28. return docs
  29. def generate_prompt(related_docs: List[str],
  30. query: str,
  31. prompt_template=PROMPT_TEMPLATE) -> str:
  32. context = "\n".join([doc.page_content for doc in related_docs])
  33. prompt = prompt_template.replace("{question}", query).replace("{context}", context)
  34. return prompt
  35. def get_docs_with_score(docs_with_score):
  36. docs = []
  37. for doc, score in docs_with_score:
  38. doc.metadata["score"] = score
  39. docs.append(doc)
  40. return docs
  41. def seperate_list(ls: List[int]) -> List[List[int]]:
  42. lists = []
  43. ls1 = [ls[0]]
  44. for i in range(1, len(ls)):
  45. if ls[i - 1] + 1 == ls[i]:
  46. ls1.append(ls[i])
  47. else:
  48. lists.append(ls1)
  49. ls1 = [ls[i]]
  50. lists.append(ls1)
  51. return lists
  52. def similarity_search_with_score_by_vector(
  53. self, embedding: List[float], k: int = 4,
  54. ) -> List[Tuple[Document, float]]:
  55. scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
  56. docs = []
  57. id_set = set()
  58. store_len = len(self.index_to_docstore_id)
  59. for j, i in enumerate(indices[0]):
  60. if i == -1:
  61. # This happens when not enough docs are returned.
  62. continue
  63. _id = self.index_to_docstore_id[i]
  64. doc = self.docstore.search(_id)
  65. id_set.add(i)
  66. docs_len = len(doc.page_content)
  67. for k in range(1, max(i, store_len-i)):
  68. break_flag = False
  69. for l in [i + k, i - k]:
  70. if 0 <= l < len(self.index_to_docstore_id):
  71. _id0 = self.index_to_docstore_id[l]
  72. doc0 = self.docstore.search(_id0)
  73. if docs_len + len(doc0.page_content) > self.chunk_size:
  74. break_flag=True
  75. break
  76. elif doc0.metadata["source"] == doc.metadata["source"]:
  77. docs_len += len(doc0.page_content)
  78. id_set.add(l)
  79. if break_flag:
  80. break
  81. id_list = sorted(list(id_set))
  82. id_lists = seperate_list(id_list)
  83. for id_seq in id_lists:
  84. for id in id_seq:
  85. if id == id_seq[0]:
  86. _id = self.index_to_docstore_id[id]
  87. doc = self.docstore.search(_id)
  88. else:
  89. _id0 = self.index_to_docstore_id[id]
  90. doc0 = self.docstore.search(_id0)
  91. doc.page_content += doc0.page_content
  92. if not isinstance(doc, Document):
  93. raise ValueError(f"Could not find document for id {_id}, got {doc}")
  94. doc_score = min([scores[0][id] for id in [indices[0].tolist().index(i) for i in id_seq if i in indices[0]]])
  95. docs.append((doc, doc_score))
  96. torch_gc()
  97. return docs
  98. class LocalDocQA:
  99. llm: object = None
  100. embeddings: object = None
  101. top_k: int = VECTOR_SEARCH_TOP_K
  102. chunk_size: int = CHUNK_SIZE
  103. def init_cfg(self,
  104. embedding_model: str = EMBEDDING_MODEL,
  105. embedding_device=EMBEDDING_DEVICE,
  106. llm_history_len: int = LLM_HISTORY_LEN,
  107. llm_model: str = LLM_MODEL,
  108. llm_device=LLM_DEVICE,
  109. top_k=VECTOR_SEARCH_TOP_K,
  110. use_ptuning_v2: bool = USE_PTUNING_V2,
  111. use_lora: bool = USE_LORA,
  112. ):
  113. self.llm = ChatGLM()
  114. self.llm.load_model(model_name_or_path=llm_model_dict[llm_model],
  115. llm_device=llm_device, use_ptuning_v2=use_ptuning_v2, use_lora=use_lora)
  116. self.llm.history_len = llm_history_len
  117. self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[embedding_model],
  118. model_kwargs={'device': embedding_device})
  119. self.top_k = top_k
  120. def init_knowledge_vector_store(self,
  121. filepath: str or List[str],
  122. vs_path: str or os.PathLike = None):
  123. loaded_files = []
  124. failed_files = []
  125. if isinstance(filepath, str):
  126. if not os.path.exists(filepath):
  127. print("路径不存在")
  128. return None
  129. elif os.path.isfile(filepath):
  130. file = os.path.split(filepath)[-1]
  131. try:
  132. docs = load_file(filepath)
  133. print(f"{file} 已成功加载")
  134. loaded_files.append(filepath)
  135. except Exception as e:
  136. print(e)
  137. print(f"{file} 未能成功加载")
  138. return None
  139. elif os.path.isdir(filepath):
  140. docs = []
  141. for file in tqdm(os.listdir(filepath), desc="加载文件"):
  142. fullfilepath = os.path.join(filepath, file)
  143. try:
  144. docs += load_file(fullfilepath)
  145. loaded_files.append(fullfilepath)
  146. except Exception as e:
  147. failed_files.append(file)
  148. if len(failed_files) > 0:
  149. print("以下文件未能成功加载:")
  150. for file in failed_files:
  151. print(file,end="\n")
  152. else:
  153. docs = []
  154. for file in filepath:
  155. try:
  156. docs += load_file(file)
  157. print(f"{file} 已成功加载")
  158. loaded_files.append(file)
  159. except Exception as e:
  160. print(e)
  161. print(f"{file} 未能成功加载")
  162. if len(docs) > 0:
  163. print("文件加载完毕,正在生成向量库")
  164. if vs_path and os.path.isdir(vs_path):
  165. vector_store = FAISS.load_local(vs_path, self.embeddings)
  166. vector_store.add_documents(docs)
  167. torch_gc()
  168. else:
  169. if not vs_path:
  170. vs_path = os.path.join(VS_ROOT_PATH,
  171. f"""{os.path.splitext(file)[0]}_FAISS_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}""")
  172. vector_store = FAISS.from_documents(docs, self.embeddings)
  173. torch_gc()
  174. vector_store.save_local(vs_path)
  175. return vs_path, loaded_files
  176. else:
  177. print("文件均未成功加载,请检查依赖包或替换为其他文件再次上传。")
  178. return None, loaded_files
  179. def get_knowledge_based_answer(self,
  180. query,
  181. vs_path,
  182. chat_history=[],
  183. streaming: bool = STREAMING):
  184. vector_store = FAISS.load_local(vs_path, self.embeddings)
  185. FAISS.similarity_search_with_score_by_vector = similarity_search_with_score_by_vector
  186. vector_store.chunk_size = self.chunk_size
  187. related_docs_with_score = vector_store.similarity_search_with_score(query,
  188. k=self.top_k)
  189. related_docs = get_docs_with_score(related_docs_with_score)
  190. torch_gc()
  191. prompt = generate_prompt(related_docs, query)
  192. # if streaming:
  193. # for result, history in self.llm._stream_call(prompt=prompt,
  194. # history=chat_history):
  195. # history[-1][0] = query
  196. # response = {"query": query,
  197. # "result": result,
  198. # "source_documents": related_docs}
  199. # yield response, history
  200. # else:
  201. for result, history in self.llm._call(prompt=prompt,
  202. history=chat_history,
  203. streaming=streaming):
  204. torch_gc()
  205. history[-1][0] = query
  206. response = {"query": query,
  207. "result": result,
  208. "source_documents": related_docs}
  209. yield response, history
  210. torch_gc()
  211. if __name__ == "__main__":
  212. local_doc_qa = LocalDocQA()
  213. local_doc_qa.init_cfg()
  214. query = "本项目使用的embedding模型是什么,消耗多少显存"
  215. vs_path = "/Users/liuqian/Downloads/glm-dev/vector_store/aaa"
  216. last_print_len = 0
  217. for resp, history in local_doc_qa.get_knowledge_based_answer(query=query,
  218. vs_path=vs_path,
  219. chat_history=[],
  220. streaming=True):
  221. print(resp["result"][last_print_len:], end="", flush=True)
  222. last_print_len = len(resp["result"])
  223. source_text = [f"""出处 [{inum + 1}] {os.path.split(doc.metadata['source'])[-1]}:\n\n{doc.page_content}\n\n"""
  224. # f"""相关度:{doc.metadata['score']}\n\n"""
  225. for inum, doc in
  226. enumerate(resp["source_documents"])]
  227. print("\n\n" + "\n\n".join(source_text))
  228. pass