Using Vector Databases in RAG: Notes and Pitfalls

Vector databases tested: Faiss, Milvus, Chroma, Elasticsearch, MeiliSearch.

Loading files and initializing the LLM and embedding model

import os
import time
import html2text
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from llama_index.core import (
    VectorStoreIndex, PromptTemplate, StorageContext, load_index_from_storage,
    Settings, SimpleDirectoryReader, Document
)
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.readers.base import BaseReader

import nest_asyncio
nest_asyncio.apply()

# Disable the global LLM (retrieval only; pass an LLM explicitly where needed)
Settings.llm = None

# LlamaIndex default chunk settings
# Settings.chunk_size = 1024
# Settings.chunk_overlap = 20

# HTML-to-text converter
converter = html2text.HTML2Text()

def clean_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    col_content_div = soup.find(id="col-content")

    # Strip link targets so URLs don't pollute the extracted text
    for a_tag in col_content_div.find_all('a'):
        del a_tag['href']

    text = converter.handle(str(col_content_div))
    return text

# Custom HTML file reader
class HtmlFileReader(BaseReader):
    def load_data(self, file, extra_info=None):
        with open(file, "r", encoding="utf-8") as f:
            html = f.read()

        try:
            clean_data = clean_html(html)
        except Exception as e:
            # Fall back to converting the whole page if the content div is missing
            print(f"Error processing {file}: {e}")
            clean_data = converter.handle(html)

        # Attach extra metadata
        metadata = {
            "file_name": file.name,
            "file_path": str(file.resolve()),
        }
        # Return a list of Document objects
        return [Document(text=clean_data, metadata=metadata)]

# Extend SimpleDirectoryReader to show a progress bar
class ProgressSimpleDirectoryReader(SimpleDirectoryReader):
    def load_data(self):
        documents = []
        file_list = list(self.input_dir.glob("*"))
        for file in tqdm(file_list, desc="Loading files"):
            file_extension = file.suffix.lower()
            reader = self.file_extractor.get(file_extension)
            if reader:
                documents.extend(reader.load_data(file))
        return documents

# Point the reader at the input directory and register the HTML reader
reader = ProgressSimpleDirectoryReader(
    input_dir="/data/html/",
    file_extractor={".html": HtmlFileReader()}
)

# Load the data
documents = reader.load_data()

llm_gpt = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
llm_qwen = Ollama(base_url='http://localhost:11434', model="qwen2:latest", temperature=0.1, request_timeout=300.0)
embed_model = HuggingFaceEmbedding(
    model_name="/model/embed/bge-m3", max_length=8192
)

node_parser = SentenceSplitter(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)
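
The imports above also pull in SentenceWindowNodeParser, and the Chroma section below writes to a collection named clean_html_sentence_window, so sentence-window chunking was presumably tried as well. A minimal sketch of that alternative parser (the parameter values here are illustrative, not the post's actual settings):

# Alternative: sentence-window chunking (illustrative settings)
sentence_window_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,  # keep 3 sentences of context on either side of each sentence
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
# sentence_window_nodes = sentence_window_parser.get_nodes_from_documents(documents)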

Vector databases

Using Faiss
import faiss
from llama_index.vector_stores.faiss import FaissVectorStore

if os.path.exists('./index/faiss_db'):
    vector_store = FaissVectorStore.from_persist_dir("./index/faiss_db")
    storage_context = StorageContext.from_defaults(
        vector_store=vector_store, persist_dir="./index/faiss_db"
    )
    index = load_index_from_storage(storage_context=storage_context, embed_model=embed_model)
else:
    d = 1024  # bge-m3 embedding dimension
    faiss_index = faiss.IndexFlatIP(d)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    index = VectorStoreIndex(
        nodes,
        storage_context=storage_context,
        embed_model=embed_model,
        use_async=True,
        show_progress=True
    )
    index.storage_context.persist('./index/faiss_db')
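
Note that IndexFlatIP scores by raw inner product, which matches cosine similarity only when the embeddings are normalized (HuggingFaceEmbedding normalizes by default, so the bge-m3 vectors should be fine here). A quick retrieval sanity check against the index, as a sketch with an arbitrary query:

# Sanity check: top-3 nearest chunks for an arbitrary query
retriever = index.as_retriever(similarity_top_k=3)
for node_with_score in retriever.retrieve("undergraduate admissions"):
    print(node_with_score.score, node_with_score.node.metadata.get("file_name"))
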
Using Milvus
from llama_index.vector_stores.milvus import MilvusVectorStore
from pymilvus import MilvusClient

client = MilvusClient(
    uri="http://localhost:19530"
)

if client.has_collection('html_index'):
    vector_store = MilvusVectorStore(
        dim=1024,
        uri="http://localhost:19530",
        collection_name='html_index',
        overwrite=False,
        enable_sparse=True,
        hybrid_ranker="RRFRanker",
        hybrid_ranker_params={"k": 60}
    )
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model)
else:
    vector_store = MilvusVectorStore(
        dim=1024,
        uri="http://localhost:19530",
        collection_name='html_index',
        overwrite=True,
        enable_sparse=True,
        hybrid_ranker="RRFRanker",
        hybrid_ranker_params={"k": 60}
    )

    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    index = VectorStoreIndex(
        nodes,
        storage_context=storage_context,
        embed_model=embed_model,
        use_async=True,
        show_progress=True
    )

Some basic Milvus queries

from pymilvus import MilvusClient

client = MilvusClient(
    uri="http://localhost:19530"
)

## List collections
client.list_collections()
## Rename a collection
client.rename_collection('html_index', 'html_index_new')
## Drop a collection
client.drop_collection('html_index_new')
## List indexes
client.list_indexes('html_index')  # e.g. returns ['index-test']
## Describe a collection
client.describe_collection('html_index')
## Describe an index
client.describe_index(collection_name='html_index', index_name='index-test')
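
Two more basic queries in the same style (a sketch using the same MilvusClient API; get_collection_stats returns the row count):

## Row count of a collection
client.get_collection_stats('html_index')  # e.g. {'row_count': 1234}
## Check whether a collection exists
client.has_collection('html_index')  # => True / False
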
Using Chroma
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

chroma_client = chromadb.PersistentClient(path="./index/chroma_db")
collection_name = "clean_html_sentence_window"
# Note: chromadb >= 0.6 returns collection names from list_collections();
# older versions return Collection objects, so compare against c.name there.
if collection_name in chroma_client.list_collections():
    chroma_collection_sentence_window = chroma_client.get_collection(collection_name)
    vector_store_sentence_window = ChromaVectorStore(chroma_collection=chroma_collection_sentence_window)
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store_sentence_window, embed_model=embed_model)
else:
    chroma_collection_sentence_window = chroma_client.get_or_create_collection(collection_name)
    vector_store_sentence_window = ChromaVectorStore(chroma_collection=chroma_collection_sentence_window)

    storage_context_sentence_window = StorageContext.from_defaults(vector_store=vector_store_sentence_window)
    index = VectorStoreIndex(
        nodes,
        storage_context=storage_context_sentence_window,
        embed_model=embed_model,
        use_async=True,
        show_progress=True
    )
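
Chroma's HNSW index defaults to L2 distance; cosine similarity has to be chosen when the collection is created. A sketch using Chroma's hnsw:space metadata key:

# Optional: create the collection with cosine distance instead of the default L2
chroma_collection = chroma_client.get_or_create_collection(
    collection_name,
    metadata={"hnsw:space": "cosine"},
)
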
Using Elasticsearch
from elasticsearch import Elasticsearch

# Connect to Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])
indices = es.cat.indices(format="json")

# Caution: this wipes every existing index before re-indexing
for index in indices:
    es.indices.delete(index=index['index'], ignore_unavailable=True)
# Print the deleted index names
for index in indices:
    print(index['index'])

from llama_index.vector_stores.elasticsearch import ElasticsearchStore

vector_store = ElasticsearchStore(
    es_url="http://localhost:9200",  # see the Elasticsearch vector store docs for more authentication options
    index_name="html",
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, use_async=True, embed_model=embed_model,
    show_progress=True
)
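
A quick check that the documents actually landed in the new index (a sketch reusing the same es client):

# Verify the index exists and see how many documents it holds
print(es.indices.exists(index="html"))
print(es.count(index="html"))  # => {'count': ..., ...}
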
Using MeiliSearch
import meilisearch

client = meilisearch.Client('http://127.0.0.1:7700', 'masterKey')

# An index is where the documents are stored.
index = client.index('movies')

# Note: this shadows the `documents` list from the HTML loader above;
# the MeiliSearch demo is self-contained.
documents = [
    { 'id': 1, 'title': 'Carol', 'genres': ['Romance', 'Drama'] },
    { 'id': 2, 'title': 'Wonder Woman', 'genres': ['Action', 'Adventure'] },
    { 'id': 3, 'title': 'Life of Pi', 'genres': ['Adventure', 'Drama'] },
    { 'id': 4, 'title': 'Mad Max: Fury Road', 'genres': ['Adventure', 'Science Fiction'] },
    { 'id': 5, 'title': 'Moana', 'genres': ['Fantasy', 'Action'] },
    { 'id': 6, 'title': 'Philadelphia', 'genres': ['Drama'] },
]

# If the index 'movies' does not exist, Meilisearch creates it when you first add the documents.
index.add_documents(documents)  # => { "uid": 0 }

# Search; the misspelled query still matches 'Carol' thanks to typo tolerance
index.search('caorl')
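
The search call returns a dict whose 'hits' field holds the matching documents; a sketch:

results = index.search('caorl')
for hit in results['hits']:
    print(hit['title'])  # typo tolerance should still surface 'Carol'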

In terms of index build speed, Faiss was the fastest in my tests, though that likely depends on the data and the scenario. Note also that Milvus, Elasticsearch, and MeiliSearch all require a separately running backend service, which is extra hassle, but it is also what makes them better suited to large-scale search.
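
A rough way to compare build times across stores (a sketch; the time import from the setup section is put to use, and storage_context is whichever store is being measured):

start = time.time()
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embed_model,
    show_progress=True,
)
print(f"Index build took {time.time() - start:.1f}s")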

Hybrid retrieval: keyword plus vector search

Faiss and Chroma don't support hybrid retrieval out of the box, so you have to implement it yourself.

BM25 + Faiss

Here BM25 serves as the keyword retriever and Faiss as the vector retriever.

## Hybrid retrieval
from llama_index.core.tools import RetrieverTool
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import VectorIndexRetriever

faiss_retriever = VectorIndexRetriever(index)
# BM25 works on the chunked nodes, matching what the vector index was built from
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)

retriever_tools = [
    RetrieverTool.from_defaults(
        retriever=faiss_retriever,
        description="Useful in most cases",
    ),
    RetrieverTool.from_defaults(
        retriever=bm25_retriever,
        description="Useful if searching about specific information",
    ),
]

from llama_index.core.retrievers import RouterRetriever

retriever = RouterRetriever.from_defaults(
    retriever_tools=retriever_tools,
    llm=llm_qwen,
    select_multi=True,
)

# Use a fresh name so the global `nodes` list isn't shadowed
retrieved_nodes = retriever.retrieve(
    "the schools related to undergraduate studies"
)

# Inspect the retrieved nodes
for n in retrieved_nodes:
    print(n)
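
The RouterRetriever above asks the LLM to pick a retriever per query. If you would rather always run both and fuse the result lists deterministically, LlamaIndex also ships QueryFusionRetriever with reciprocal-rank fusion; a sketch:

from llama_index.core.retrievers import QueryFusionRetriever

fusion_retriever = QueryFusionRetriever(
    [faiss_retriever, bm25_retriever],
    similarity_top_k=5,
    num_queries=1,  # 1 disables LLM-based query rewriting
    mode="reciprocal_rerank",  # RRF, the same idea as Milvus's RRFRanker
)
fused_nodes = fusion_retriever.retrieve("the schools related to undergraduate studies")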

Milvus

For Milvus, enabling hybrid retrieval just means switching the query mode to hybrid (the collection was already created with enable_sparse=True above).

# Reranker model
from llama_index.core.postprocessor import SentenceTransformerRerank
rerank = SentenceTransformerRerank(
    top_n=3,
    model="/model/rerank/bge-reranker-large/",
)

query_engine_rerank = index.as_query_engine(
    similarity_top_k=5,
    # text_qa_template=PromptTemplate(PROMPT),
    node_postprocessors=[rerank],
    embed_model=embed_model,
    # llm=llm_qwen,
    vector_store_query_mode="hybrid"
)
response = query_engine_rerank.query('the schools related to undergraduate studies')
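
To see which chunks the hybrid search plus reranker actually fed into the answer (a sketch):

print(response)
for source_node in response.source_nodes:
    print(source_node.score, source_node.node.metadata.get("file_name"))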

PS: Elasticsearch queries ran out of GPU memory on my setup.

References:
https://docs.llamaindex.ai/en/stable/module_guides/storing/vector_stores/

