# Build a hybrid dense+sparse Milvus vector index over documents using BGE-M3.
import os
from typing import List

import nest_asyncio
from FlagEmbedding import BGEM3FlagModel
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.vector_stores.milvus.utils import BaseSparseEmbeddingFunction
nest_asyncio.apply()
os.environ['OPENAI_API_KEY'] = '' os.environ['OPENAI_API_BASE'] = ''
embed_model = HuggingFaceEmbedding( model_name="/home/wangjiabin/model/embed/bge-m3", max_length=8192 )
rerank = SentenceTransformerRerank( top_n=5, model="/home/wangjiabin/model/rerank/bge-reranker-v2-m3/" )
class ExampleEmbeddingFunction(BaseSparseEmbeddingFunction): def __init__(self): self.model = BGEM3FlagModel("/home/wangjiabin/model/embed/bge-m3", use_fp16=False)
def encode_queries(self, queries: List[str]): outputs = self.model.encode( queries, return_dense=False, return_sparse=True, return_colbert_vecs=False, )["lexical_weights"] return [self._to_standard_dict(output) for output in outputs]
def encode_documents(self, documents: List[str]): outputs = self.model.encode( documents, return_dense=False, return_sparse=True, return_colbert_vecs=False, )["lexical_weights"] return [self._to_standard_dict(output) for output in outputs]
def _to_standard_dict(self, raw_output): result = {} for k in raw_output: result[int(k)] = raw_output[k] return result
vector_store = MilvusVectorStore( dim=1024, uri="http://localhost:19530", collection_name='summary_html_index', overwrite=True, enable_sparse=True, sparse_embedding_function=ExampleEmbeddingFunction(), hybrid_ranker="RRFRanker", hybrid_ranker_params={"k": 60} )
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex( documents=documents_cp, # Ensure this variable is defined storage_context=storage_context, embed_model=embed_model, use_async=True, show_progress=True )