RAG实战——选择最佳embedding和rerank模型测试效果

由于时效问题,该文某些代码、技术可能已经过期,请注意!!!本文最后更新于:1 年前

基础使用 embedding 和 rerank

为了衡量我们的检索系统的有效性,我们选择被广泛接受的两个指标:Hit Rate和 Mean Reciprocal Rank (MRR)。
命中率(Hit Rate):命中率计算在前k个检索到的文档中找到正确答案的查询的百分比。简单地说,这是关于我们的系统在前几次猜测中正确的频率。
平均倒数排名(Mean Reciprocal Rank, MRR):对于每个查询,MRR通过查看排名最高的相关文档的排名来评估系统的准确性。具体来说,它是所有查询中这些排名的倒数的平均值。因此,如果第一个相关文档是最高结果,则倒数为1;如果是第二个,则倒数为1/2,依此类推。

选择最佳组合

代码参考: https://colab.research.google.com/drive/1TxDVA__uimVPOJiMEQgP5fwHiqgKqm4-?usp=sharing

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
from llama_index import GPTKeywordTableIndex, VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.node_parser import SimpleNodeParser
# LLM
from llama_index.llms import AzureOpenAI

# Embeddings
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding, CohereEmbedding
# from langchain.embeddings import VoyageEmbeddings, GooglePalmEmbeddings

# Retrievers
from llama_index.retrievers import (
BaseRetriever,
VectorIndexRetriever,
)

# Rerankers
from llama_index.indices.query.schema import QueryBundle, QueryType
from llama_index.schema import NodeWithScore
# from llama_index.indices.postprocessor.cohere_rerank import CohereRerank
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.finetuning.embeddings.common import EmbeddingQAFinetuneDataset

# Evaluator
from llama_index.evaluation import (
generate_question_context_pairs,
EmbeddingQAFinetuneDataset,
)
from llama_index.evaluation import RetrieverEvaluator


from typing import List
import pandas as pd
import openai


# Load the single source file and split it into nodes with chunk_size=1024.
documents = SimpleDirectoryReader(input_files=["data/tmp_files/11845723.pdf.txt"]).load_data()
# documents = SimpleDirectoryReader('./data').load_data()
node_parser = SimpleNodeParser.from_defaults(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)


# Initialise the AzureOpenAI LLM used to generate the evaluation questions.
# "xxx" values are placeholders for the real endpoint and API key.
llm = AzureOpenAI(
    engine="gpt3",
    model="gpt-35-turbo-16k",
    temperature=0.0,
    azure_endpoint="xxx",
    api_key="xxx",
    api_version="2023-07-01-preview",
)

# Embedding models under comparison, keyed by the label shown in the results table.
# All models are loaded from local directories.
EMBEDDINGS = {
    "bge-large-en": HuggingFaceEmbedding(model_name='/data/database/hg-embed/bge-large-en-v1.5', device='cuda'), # You can use mean pooling by adding the pooling='mean' parameter
    "bge-large-zh": HuggingFaceEmbedding(model_name='/data/database/hg-embed/bge-large-zh-v1.5', device='cuda'),
    # NOTE(review): this is the only model placed on the CPU - confirm that is intentional.
    "bce-base": HuggingFaceEmbedding(model_name='/data/database/hg-embed/bce-embedding-base_v1', device='cpu'),
    "JinaAI-Base": HuggingFaceEmbedding(model_name='/data/database/hg-embed/jina-embeddings-v2-base-en', device='cuda'),

}

# Rerankers under comparison, keyed by the label shown in the results table.
# "WithoutReranker" maps to the *string* "None" (not the None object): that exact
# string is the sentinel CustomRetriever._retrieve compares against to skip reranking.
RERANKERS = {
    "WithoutReranker": "None",
    "bce-reranker-base": SentenceTransformerRerank(model="/data/database/hg-embed/bce-reranker-base_v1", top_n=5),
    "bge-reranker-large": SentenceTransformerRerank(model="/data/database/hg-embed/bge-reranker-large", top_n=5)
}


# Prompt used to generate the evaluation questions for each chunk.
# It exposes the {context_str} and {num_questions_per_chunk} placeholders that
# are filled in when the questions are generated.
qa_generate_prompt_tmpl = """\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge.
generate only questions based on the below query.
You are a Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. The questions should not contain options, not start with Q1/ Q2. \
Restrict the questions to the context information provided.\
"""

# Build (question, source chunk) pairs with the LLM, two questions per node.
# The custom template is passed explicitly; without this argument the library's
# default prompt is used and the template defined above is never applied.
qa_dataset = generate_question_context_pairs(
    nodes,
    llm=llm,
    qa_generate_prompt_tmpl=qa_generate_prompt_tmpl,
    num_questions_per_chunk=2,
)

# function to clean the dataset
def filter_qa_dataset(qa_dataset, unwanted_phrases=("Here are 2", "Here are two")):
    """Remove generated "questions" that contain unwanted phrases.

    Queries whose text contains any of ``unwanted_phrases`` are dropped, and the
    entries with the same keys are dropped from ``relevant_docs``. The corpus is
    passed through unchanged.

    :param qa_dataset: An object that has 'queries', 'corpus', and 'relevant_docs' attributes.
    :param unwanted_phrases: Substrings that mark a query for removal. A single
        string is treated as one phrase. The default matches the previously
        hard-coded phrases, which correspond to generating two questions per chunk.
    :return: An EmbeddingQAFinetuneDataset object with the filtered queries, corpus and relevant_docs.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(unwanted_phrases, str):
        unwanted_phrases = (unwanted_phrases,)
    banned_phrases = tuple(unwanted_phrases)

    # Keys of the queries that must be removed (also used to prune relevant_docs).
    removed_ids = {
        query_id
        for query_id, query_text in qa_dataset.queries.items()
        if any(phrase in query_text for phrase in banned_phrases)
    }

    kept_queries = {
        query_id: query_text
        for query_id, query_text in qa_dataset.queries.items()
        if query_id not in removed_ids
    }
    kept_relevant_docs = {
        query_id: doc_ids
        for query_id, doc_ids in qa_dataset.relevant_docs.items()
        if query_id not in removed_ids
    }

    # Build a fresh dataset object instead of mutating the input.
    return EmbeddingQAFinetuneDataset(
        queries=kept_queries,
        corpus=qa_dataset.corpus,
        relevant_docs=kept_relevant_docs,
    )

# Drop generated entries that contain phrases such as
# `Here are 2 questions based on provided context`, i.e. text matching the
# 'Here are 2' / 'Here are two' filters in filter_qa_dataset.
qa_dataset = filter_qa_dataset(qa_dataset)

def display_results(embedding_name, reranker_name, eval_results):
    """Summarise evaluation results as a one-row DataFrame.

    :param embedding_name: Label of the embedding model, stored in the "Embedding" column.
    :param reranker_name: Label of the reranker, stored in the "Reranker" column.
    :param eval_results: Iterable of result objects exposing ``metric_vals_dict``,
        a mapping that contains the "hit_rate" and "mrr" values of one query.
    :return: A DataFrame with a single row and the columns
        "Embedding", "Reranker", "hit_rate" and "mrr" (the two metrics are means
        over all results; NaN when there are no results).
    """
    full_df = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    def _mean_or_nan(column):
        # With no results the frame has no columns at all; report NaN instead
        # of failing with a KeyError.
        if column not in full_df:
            return float("nan")
        return full_df[column].mean()

    return pd.DataFrame(
        {
            "Embedding": [embedding_name],
            "Reranker": [reranker_name],
            "hit_rate": [_mean_or_nan("hit_rate")],
            "mrr": [_mean_or_nan("mrr")],
        }
    )

# Accumulator for the metric rows: starts empty and is extended with one row per
# (embedding, reranker) combination by the evaluation loop further down.
results_df = pd.DataFrame()

# Define Retriever
class CustomRetriever(BaseRetriever):
    """Vector retriever followed by an optional reranking step.

    Nodes are fetched from the wrapped vector retriever. If a reranker is
    active, it post-processes the nodes; otherwise the first ``top_n`` nodes are
    returned as they came from the vector retriever.
    """

    # Default for ``reranker``: "not given, use the module-level `reranker`".
    _USE_GLOBAL_RERANKER = object()

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        reranker=_USE_GLOBAL_RERANKER,
        top_n: int = 5,
    ) -> None:
        """Init params.

        :param vector_retriever: Retriever that produces the candidate nodes.
        :param reranker: Object with a ``postprocess_nodes(nodes, query_bundle)``
            method. ``None`` or the string ``"None"`` disables reranking. When
            omitted, the module-level variable ``reranker`` is looked up at
            retrieval time, so existing one-argument calls keep working.
        :param top_n: Number of nodes kept when no reranker is active.
        """
        self._vector_retriever = vector_retriever
        self._reranker = reranker
        self._top_n = top_n
        # Let the base class run its own initialisation.
        super().__init__()

    def _active_reranker(self):
        """Return the reranker to apply, or None when reranking is disabled."""
        candidate = self._reranker
        if candidate is self._USE_GLOBAL_RERANKER:
            # Backward compatibility: fall back to the module-level `reranker`.
            candidate = globals().get("reranker")
        if candidate is None or (isinstance(candidate, str) and candidate == "None"):
            return None
        return candidate

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes for the query and rerank them if a reranker is active."""
        retrieved_nodes = self._vector_retriever.retrieve(query_bundle)

        active_reranker = self._active_reranker()
        if active_reranker is None:
            return retrieved_nodes[: self._top_n]
        return active_reranker.postprocess_nodes(retrieved_nodes, query_bundle)

    async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Async entry point; delegates to the synchronous ``_retrieve``.

        The work itself is not awaited, so the call runs to completion inside
        the coroutine.
        """
        return self._retrieve(query_bundle)

    async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
        """Accept a plain string or a QueryBundle and forward it to ``_aretrieve``."""
        if isinstance(str_or_query_bundle, str):
            str_or_query_bundle = QueryBundle(str_or_query_bundle)
        return await self._aretrieve(str_or_query_bundle)

# Loop over embeddings
for embed_name, embed_model in EMBEDDINGS.items():

service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)
vector_index = VectorStoreIndex(nodes, service_context=service_context)
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=10, service_context=service_context)

# Loop over rerankers
for rerank_name, reranker in RERANKERS.items():

print(f"Running Evaluation for Embedding Model: {embed_name} and Reranker: {rerank_name}")
custom_retriever = CustomRetriever(vector_retriever)

retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate"], retriever=custom_retriever
)
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

current_df = display_results(embed_name, rerank_name, eval_results)
results_df = pd.concat([results_df, current_df], ignore_index=True)

根据以上代码执行的结果选择最佳组合,比如bge-large-en 和 bce-reranker-base进行检索

RAG + Rerank检索

基础RAG
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from llama_index import ServiceContext, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding, CohereEmbedding
from llama_index.node_parser import SimpleNodeParser

from typing import List
# Retrievers
from llama_index.retrievers import (
BaseRetriever,
VectorIndexRetriever,
)

# Rerankers
from llama_index.indices.query.schema import QueryBundle, QueryType
from llama_index.schema import NodeWithScore
# from llama_index.indices.postprocessor.cohere_rerank import CohereRerank
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.finetuning.embeddings.common import EmbeddingQAFinetuneDataset

# Embedding model, loaded from a local directory onto device 'cuda:1'.
embed_model = HuggingFaceEmbedding(
    model_name='/data/database/hg-embed/bge-large-en-v1.5',
    device='cuda:1',
)

# Read the PDF and cut it into overlapping chunks.
documents = SimpleDirectoryReader(input_files=["data/ssh_btmp.pdf"]).load_data()
node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=100)
nodes = node_parser.get_nodes_from_documents(documents)

# The query is created before the index; it does not depend on it.
query_bundle = QueryBundle('如何查看攻击者ip')

# Embedding-only service context (no LLM), vector index and top-5 retriever.
service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)
vector_index = VectorStoreIndex(nodes, service_context=service_context)
vector_retriever = VectorIndexRetriever(
    index=vector_index,
    similarity_top_k=5,
    service_context=service_context,
)

# Plain vector retrieval: print each hit's text and similarity score.
retrieved_nodes = vector_retriever.retrieve(query_bundle)
# retrieved_nodes
for scored_node in retrieved_nodes:
    print('文本:', scored_node.get_text())
    print('Score:', scored_node.score)
Rerank
1
2
3
4
5
# Rerank the retrieved nodes and keep only the best one (top_n=1).
reranker = SentenceTransformerRerank(
    model="/data/database/hg-embed/bce-reranker-base_v1",
    top_n=1,
)
retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)

# Print the text and score of every node that survived reranking.
for scored_node in retrieved_nodes:
    print('文本:', scored_node.get_text())
    print('Score:', scored_node.score)

最后可以用Rerank的结果再喂给LLM获取答案。

参考:
https://blog.csdn.net/wshzd/article/details/135092669?spm=1001.2014.3001.5502
https://mp.weixin.qq.com/s/SypSZwApvNBnPQwk7Uw5aA


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!