# text_load.py — load .txt documents, split them into chunks, embed with
# OpenAI, and upsert the vectors into a Pinecone index.
  1. import os
  2. import pinecone
  3. from tqdm import tqdm
  4. from langchain.llms import OpenAI
  5. from langchain.text_splitter import SpacyTextSplitter
  6. from langchain.document_loaders import TextLoader
  7. from langchain.document_loaders import DirectoryLoader
  8. from langchain.indexes import VectorstoreIndexCreator
  9. from langchain.embeddings.openai import OpenAIEmbeddings
  10. from langchain.vectorstores import Pinecone
  11. #一些配置文件
  12. openai_key="你的key" # 注册 openai.com 后获得
  13. pinecone_key="你的key" # 注册 app.pinecone.io 后获得
  14. pinecone_index="你的库" #app.pinecone.io 获得
  15. pinecone_environment="你的Environment" # 登录pinecone后,在indexes页面 查看Environment
  16. pinecone_namespace="你的Namespace" #如果不存在自动创建
  17. #科学上网你懂得
  18. os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
  19. os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'
  20. #初始化pinecone
  21. pinecone.init(
  22. api_key=pinecone_key,
  23. environment=pinecone_environment
  24. )
  25. index = pinecone.Index(pinecone_index)
  26. #初始化OpenAI的embeddings
  27. embeddings = OpenAIEmbeddings(openai_api_key=openai_key)
  28. #初始化text_splitter
  29. text_splitter = SpacyTextSplitter(pipeline='zh_core_web_sm',chunk_size=1000,chunk_overlap=200)
  30. # 读取目录下所有后缀是txt的文件
  31. loader = DirectoryLoader('../docs', glob="**/*.txt", loader_cls=TextLoader)
  32. #读取文本文件
  33. documents = loader.load()
  34. # 使用text_splitter对文档进行分割
  35. split_text = text_splitter.split_documents(documents)
  36. try:
  37. for document in tqdm(split_text):
  38. # 获取向量并储存到pinecone
  39. Pinecone.from_documents([document], embeddings, index_name=pinecone_index)
  40. except Exception as e:
  41. print(f"Error: {e}")
  42. quit()