# Evaluate embedding models x rerankers on a synthetic QA dataset (llama_index).
import asyncio
from typing import List

import openai
import pandas as pd

from llama_index import (
    GPTKeywordTableIndex,
    ServiceContext,
    SimpleDirectoryReader,
    VectorStoreIndex,
)
from llama_index.embeddings import CohereEmbedding, HuggingFaceEmbedding, OpenAIEmbedding
from llama_index.evaluation import (
    EmbeddingQAFinetuneDataset,
    RetrieverEvaluator,
    generate_question_context_pairs,
)
from llama_index.finetuning.embeddings.common import EmbeddingQAFinetuneDataset
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.indices.query.schema import QueryBundle, QueryType
from llama_index.llms import AzureOpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index.retrievers import BaseRetriever, VectorIndexRetriever
from llama_index.schema import NodeWithScore
# --- Corpus ingestion and model setup ---------------------------------------

# Load the evaluation document and split it into 1024-token chunks.
documents = SimpleDirectoryReader(
    input_files=["data/tmp_files/11845723.pdf.txt"]
).load_data()
node_parser = SimpleNodeParser.from_defaults(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)

# Azure-hosted chat model used to synthesize QA pairs.
# NOTE(review): endpoint/api_key are "xxx" placeholders — fill in real credentials.
llm = AzureOpenAI(
    engine="gpt3",
    model="gpt-35-turbo-16k",
    temperature=0.0,
    azure_endpoint="xxx",
    api_key="xxx",
    api_version="2023-07-01-preview",
)

# Candidate embedding models (local HuggingFace checkpoints), keyed by display name.
EMBEDDINGS = {
    "bge-large-en": HuggingFaceEmbedding(
        model_name='/data/database/hg-embed/bge-large-en-v1.5', device='cuda'
    ),
    "bge-large-zh": HuggingFaceEmbedding(
        model_name='/data/database/hg-embed/bge-large-zh-v1.5', device='cuda'
    ),
    "bce-base": HuggingFaceEmbedding(
        model_name='/data/database/hg-embed/bce-embedding-base_v1', device='cpu'
    ),
    "JinaAI-Base": HuggingFaceEmbedding(
        model_name='/data/database/hg-embed/jina-embeddings-v2-base-en', device='cuda'
    ),
}

# Candidate rerankers; the string "None" is a sentinel meaning "no reranking"
# (checked by CustomRetriever._retrieve below).
RERANKERS = {
    "WithoutReranker": "None",
    "bce-reranker-base": SentenceTransformerRerank(
        model="/data/database/hg-embed/bce-reranker-base_v1", top_n=5
    ),
    "bge-reranker-large": SentenceTransformerRerank(
        model="/data/database/hg-embed/bge-reranker-large", top_n=5
    ),
}
# Prompt used to generate synthetic evaluation questions from each chunk.
qa_generate_prompt_tmpl = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. The questions should not contain options, not start with Q1/ Q2. \
Restrict the questions to the context information provided.\
"""

# BUG FIX: the custom template above was defined but never used — pass it
# explicitly, otherwise generate_question_context_pairs silently falls back
# to its built-in default prompt.
qa_dataset = generate_question_context_pairs(
    nodes,
    llm=llm,
    num_questions_per_chunk=2,
    qa_generate_prompt_tmpl=qa_generate_prompt_tmpl,
)
def filter_qa_dataset(qa_dataset):
    """Drop generated queries that leaked conversational preamble text.

    The LLM occasionally prefixes its output with phrases such as
    "Here are 2 ..." / "Here are two ..." instead of emitting questions
    only; such entries are removed from both ``queries`` and
    ``relevant_docs``.

    :param qa_dataset: Object exposing ``queries``, ``corpus`` and
        ``relevant_docs`` attributes.
    :return: A new ``EmbeddingQAFinetuneDataset`` with the offending
        entries removed; the corpus is passed through unchanged.
    """
    bad_ids = set()
    for query_id, query_text in qa_dataset.queries.items():
        if 'Here are 2' in query_text or 'Here are two' in query_text:
            bad_ids.add(query_id)

    kept_queries = {
        query_id: text
        for query_id, text in qa_dataset.queries.items()
        if query_id not in bad_ids
    }
    kept_relevant_docs = {
        query_id: docs
        for query_id, docs in qa_dataset.relevant_docs.items()
        if query_id not in bad_ids
    }

    return EmbeddingQAFinetuneDataset(
        queries=kept_queries,
        corpus=qa_dataset.corpus,
        relevant_docs=kept_relevant_docs,
    )
qa_dataset = filter_qa_dataset(qa_dataset)  # drop queries polluted with "Here are 2/two" preambles
def display_results(embedding_name, reranker_name, eval_results):
    """Aggregate retriever evaluation results into a one-row DataFrame.

    Averages the ``hit_rate`` and ``mrr`` metrics across all evaluation
    results and tags the row with the embedding/reranker combination.

    :param embedding_name: display name of the embedding model under test.
    :param reranker_name: display name of the reranker under test.
    :param eval_results: iterable of results exposing ``metric_vals_dict``.
    :return: single-row ``pd.DataFrame`` with columns
        ``Embedding``, ``Reranker``, ``hit_rate``, ``mrr``.
    """
    metrics = pd.DataFrame([result.metric_vals_dict for result in eval_results])
    return pd.DataFrame(
        {
            "Embedding": [embedding_name],
            "Reranker": [reranker_name],
            "hit_rate": [metrics["hit_rate"].mean()],
            "mrr": [metrics["mrr"].mean()],
        }
    )
results_df = pd.DataFrame()  # accumulates one summary row per (embedding, reranker) combination
class CustomRetriever(BaseRetriever):
    """Vector retriever with an optional reranking post-processing step.

    Wraps a ``VectorIndexRetriever``; when a reranker is configured the
    retrieved nodes are re-scored with it, otherwise the result list is
    simply truncated to the top 5 nodes.

    (The class docstring originally mentioned Knowledge Graph search, but
    no KG retriever is used anywhere in this code.)
    """

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        reranker=None,
    ) -> None:
        """Init params.

        :param vector_retriever: underlying dense retriever.
        :param reranker: a node post-processor (e.g.
            ``SentenceTransformerRerank``) or the string ``"None"`` to
            disable reranking.  Defaults to ``None``, which falls back to
            the module-level ``reranker`` loop variable — the original
            code read that global directly, a fragile hidden dependency;
            prefer passing the reranker explicitly.
        """
        self._vector_retriever = vector_retriever
        self._reranker = reranker

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes for the query, then rerank (or truncate to 5)."""
        retrieved_nodes = self._vector_retriever.retrieve(query_bundle)

        # BUG FIX (hidden global): resolve the reranker from the instance
        # first; fall back to the module-level ``reranker`` (set by the
        # evaluation loop) only for backward compatibility.
        active_reranker = self._reranker if self._reranker is not None else reranker

        if active_reranker != 'None':
            retrieved_nodes = active_reranker.postprocess_nodes(
                retrieved_nodes, query_bundle
            )
        else:
            # "None" sentinel from RERANKERS: no reranking, keep the top 5.
            retrieved_nodes = retrieved_nodes[:5]

        return retrieved_nodes

    async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Asynchronously retrieve nodes (delegates to the sync path)."""
        return self._retrieve(query_bundle)

    async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
        """Coerce a plain string into a QueryBundle and retrieve asynchronously."""
        if isinstance(str_or_query_bundle, str):
            str_or_query_bundle = QueryBundle(str_or_query_bundle)
        return await self._aretrieve(str_or_query_bundle)
# --- Evaluation grid: every embedding model x every reranker ----------------
for embed_name, embed_model in EMBEDDINGS.items():
    # Build a fresh index per embedding model (llm=None: no LLM needed here).
    service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)
    vector_index = VectorStoreIndex(nodes, service_context=service_context)
    vector_retriever = VectorIndexRetriever(
        index=vector_index, similarity_top_k=10, service_context=service_context
    )

    for rerank_name, reranker in RERANKERS.items():
        print(f"Running Evaluation for Embedding Model: {embed_name} and Reranker: {rerank_name}")

        # CustomRetriever._retrieve picks up the ``reranker`` loop variable.
        custom_retriever = CustomRetriever(vector_retriever)
        retriever_evaluator = RetrieverEvaluator.from_metric_names(
            ["mrr", "hit_rate"], retriever=custom_retriever
        )
        # BUG FIX: top-level ``await`` is a SyntaxError in a plain .py script
        # (it only works in notebooks / async REPLs); drive the coroutine
        # with asyncio.run instead.
        eval_results = asyncio.run(retriever_evaluator.aevaluate_dataset(qa_dataset))

        current_df = display_results(embed_name, rerank_name, eval_results)
        results_df = pd.concat([results_df, current_df], ignore_index=True)