import uuid
from typing import Any, Callable, Dict, List, Optional, Tuple

import numpy as np

from langchain.docstore.base import Docstore
from langchain.docstore.document import Document
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores.utils import maximal_marginal_relevance


def dependable_faiss_import() -> Any:
    """Import faiss if available, otherwise raise an error."""
    try:
        import faiss
    except ImportError:
        raise ValueError(
            "Could not import faiss python package. "
            "Please install it with `pip install faiss-cpu` "
            "(or `pip install faiss-gpu` for the GPU build)."
        )
    return faiss


class FAISSVS(FAISS):
    """FAISS vector store whose MMR search methods also return similarity scores."""

    def __init__(self,
                 embedding_function: Callable[..., Any],
                 index: Any,
                 docstore: Docstore,
                 index_to_docstore_id: Dict[int, str]):
        super().__init__(embedding_function, index, docstore, index_to_docstore_id)

    def max_marginal_relevance_search_by_vector(
        self, embedding: List[float], k: int = 4, fetch_k: int = 20, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.

        Returns:
            List of Documents with scores selected by maximal marginal relevance.
        """
        scores, indices = self.index.search(
            np.array([embedding], dtype=np.float32), fetch_k
        )
        # -1 happens when not enough docs are returned.
        embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
        mmr_selected = maximal_marginal_relevance(
            np.array([embedding], dtype=np.float32), embeddings, k=k
        )
        selected_indices = [indices[0][i] for i in mmr_selected]
        selected_scores = [scores[0][i] for i in mmr_selected]
        docs = []
        for i, score in zip(selected_indices, selected_scores):
            if i == -1:
                # This happens when not enough docs are returned.
                continue
            _id = self.index_to_docstore_id[i]
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            docs.append((doc, score))
        return docs

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.

        Returns:
            List of Documents with scores selected by maximal marginal relevance.
        """
        embedding = self.embedding_function(query)
        docs = self.max_marginal_relevance_search_by_vector(embedding, k, fetch_k)
        return docs

    @classmethod
    def __from(
        cls,
        texts: List[str],
        embeddings: List[List[float]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> FAISS:
        faiss = dependable_faiss_import()
        # Flat inner-product index over the raw embeddings.
        index = faiss.IndexFlatIP(len(embeddings[0]))
        index.add(np.array(embeddings, dtype=np.float32))

        # Optional: an IVF index can speed up search on large collections at
        # some cost in recall; kept commented out here.
        # quantizer = faiss.IndexFlatL2(len(embeddings[0]))
        # index = faiss.IndexIVFFlat(quantizer, len(embeddings[0]), 100)
        # index.train(np.array(embeddings, dtype=np.float32))
        # index.add(np.array(embeddings, dtype=np.float32))

        documents = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            documents.append(Document(page_content=text, metadata=metadata))
        index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))}
        docstore = InMemoryDocstore(
            {index_to_id[i]: doc for i, doc in enumerate(documents)}
        )
        return cls(embedding.embed_query, index, docstore, index_to_id)
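

# --- Usage sketch (illustrative; not part of the class above). The texts and
# the FakeEmbeddings stand-in are assumptions so the example runs without API
# keys; any LangChain `Embeddings` implementation (e.g. OpenAIEmbeddings)
# could be substituted.
if __name__ == "__main__":
    from langchain.embeddings.fake import FakeEmbeddings

    texts = [
        "faiss is a similarity search library",
        "maximal marginal relevance balances relevance and diversity",
        "vector stores map embeddings back to documents",
    ]
    embedding = FakeEmbeddings(size=32)
    vectors = embedding.embed_documents(texts)

    # `__from` is name-mangled, so at module level it is reached via the
    # mangled attribute. Note that the inherited `FAISS.from_texts` (in the
    # LangChain versions this snippet targets) resolves `cls.__from` to the
    # parent's implementation rather than this override.
    store = FAISSVS._FAISSVS__from(texts, vectors, embedding)

    for doc, score in store.max_marginal_relevance_search("similarity search", k=2):
        print(f"{float(score):.3f}  {doc.page_content}")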