"""Ingest local text files into a Pinecone vector index.

Standalone script: loads every ``*.txt`` under ``../docs``, splits the text
with spaCy's Chinese pipeline, embeds the chunks with OpenAI, and upserts
the vectors into a Pinecone index. Exits with status 1 on any failure.
"""
import os
import sys

import pinecone
from tqdm import tqdm
from langchain.llms import OpenAI
from langchain.text_splitter import SpacyTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

# --- Configuration ---------------------------------------------------------
openai_key = "你的key"  # obtained after registering at openai.com
pinecone_key = "你的key"  # obtained after registering at app.pinecone.io
pinecone_index = "你的库"  # index name, from app.pinecone.io
pinecone_environment = "你的Environment"  # shown on the Indexes page after logging in to Pinecone
pinecone_namespace = "你的Namespace"  # created automatically if it does not exist
# NOTE(review): pinecone_namespace is never passed to the upsert below —
# confirm whether data should land in that namespace and, if so, forward it
# via Pinecone.from_documents(..., namespace=pinecone_namespace).

# Route traffic through a local proxy (for regions where the OpenAI/Pinecone
# endpoints are unreachable directly).
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'

# Initialise the Pinecone client. No pinecone.Index handle is needed here:
# Pinecone.from_documents() resolves the index by name.
pinecone.init(
    api_key=pinecone_key,
    environment=pinecone_environment,
)

# OpenAI embedding model used to vectorise each chunk.
embeddings = OpenAIEmbeddings(openai_api_key=openai_key)

# Sentence-aware splitter using the small Chinese spaCy pipeline;
# ~1000-char chunks with 200-char overlap to preserve context across cuts.
text_splitter = SpacyTextSplitter(pipeline='zh_core_web_sm', chunk_size=1000, chunk_overlap=200)

# Recursively collect every .txt file under ../docs.
loader = DirectoryLoader('../docs', glob="**/*.txt", loader_cls=TextLoader)

# Load the raw documents, then split them into chunks for embedding.
documents = loader.load()
split_text = text_splitter.split_documents(documents)

# Upsert in batches rather than one chunk per call: each from_documents()
# call is a network round-trip, so batching is far cheaper while tqdm still
# shows progress.
BATCH_SIZE = 32

try:
    for start in tqdm(range(0, len(split_text), BATCH_SIZE)):
        batch = split_text[start:start + BATCH_SIZE]
        # Embed the batch and store the vectors in the named Pinecone index.
        Pinecone.from_documents(batch, embeddings, index_name=pinecone_index)
except Exception as e:  # top-level boundary: report the failure and exit non-zero
    print(f"Error: {e}")
    sys.exit(1)  # quit() exits 0 and is unavailable under `python -S`; signal failure properly