In the previous two articles, we covered the basic architecture of RAG systems and strategies for optimizing them. As RAG has matured, a range of advanced retrieval techniques has emerged to address the limitations of plain semantic retrieval. This article takes a deeper look at hybrid retrieval and reranking strategies for RAG systems, techniques that can significantly improve retrieval quality and answer accuracy.
Traditional RAG systems rely primarily on vector similarity search, but this approach falls short for certain kinds of queries, for example those that hinge on exact keywords, product codes, or rare terms. Hybrid retrieval combines the strengths of multiple retrieval methods to capture the relevance between documents and queries more comprehensively.
The core idea of hybrid retrieval is that different retrieval methods shine in different scenarios, so combining them yields better overall performance. The implementation below combines dense vector search with keyword search and merges the two result lists using either Reciprocal Rank Fusion (RRF) or linear weighted fusion:
```python
class HybridRetrievalSystem:
    def __init__(self, vector_store, keyword_store, fusion_method="rrf"):
        self.vector_store = vector_store    # vector database
        self.keyword_store = keyword_store  # keyword search engine (e.g. Elasticsearch)
        self.fusion_method = fusion_method  # result fusion method: "rrf" or "linear"

    def vector_search(self, query, top_k=10):
        """Run vector similarity search."""
        results = self.vector_store.similarity_search_with_score(query, k=top_k)
        return [(doc, score) for doc, score in results]

    def keyword_search(self, query, top_k=10):
        """Run keyword search."""
        # Assumes keyword_store exposes a search method returning (doc, score) pairs
        results = self.keyword_store.search(query, size=top_k)
        return [(doc, score) for doc, score in results]

    def reciprocal_rank_fusion(self, vector_results, keyword_results, top_k=5):
        """Merge results with Reciprocal Rank Fusion (RRF).

        RRF formula: score = sum(1 / (k + rank)), where k is typically 60.
        """
        k = 60  # RRF constant
        fused_scores = {}

        # Process vector search results
        for rank, (doc, _) in enumerate(vector_results, 1):
            doc_id = doc.metadata.get("id", str(id(doc)))
            if doc_id not in fused_scores:
                fused_scores[doc_id] = {"doc": doc, "score": 0}
            fused_scores[doc_id]["score"] += 1 / (k + rank)

        # Process keyword search results
        for rank, (doc, _) in enumerate(keyword_results, 1):
            doc_id = doc.metadata.get("id", str(id(doc)))
            if doc_id not in fused_scores:
                fused_scores[doc_id] = {"doc": doc, "score": 0}
            fused_scores[doc_id]["score"] += 1 / (k + rank)

        # Sort by fused score and return the top results
        sorted_results = sorted(
            fused_scores.values(),
            key=lambda x: x["score"],
            reverse=True,
        )
        return [(item["doc"], item["score"]) for item in sorted_results[:top_k]]

    def linear_fusion(self, vector_results, keyword_results, weights=(0.6, 0.4), top_k=5):
        """Merge results with linear weighted fusion."""
        vector_weight, keyword_weight = weights
        doc_scores = {}

        # Normalize vector scores
        # Note: this assumes higher scores mean higher relevance; some vector stores
        # return distances (lower is better), which would need converting first.
        if vector_results:
            max_vector_score = max(score for _, score in vector_results)
        else:
            max_vector_score = 1

        # Normalize keyword scores
        if keyword_results:
            max_keyword_score = max(score for _, score in keyword_results)
        else:
            max_keyword_score = 1

        # Weighted scores from vector results
        for doc, score in vector_results:
            doc_id = doc.metadata.get("id", str(id(doc)))
            if doc_id not in doc_scores:
                doc_scores[doc_id] = {"doc": doc, "vector_score": 0, "keyword_score": 0}
            doc_scores[doc_id]["vector_score"] = (score / max_vector_score) * vector_weight

        # Weighted scores from keyword results
        for doc, score in keyword_results:
            doc_id = doc.metadata.get("id", str(id(doc)))
            if doc_id not in doc_scores:
                doc_scores[doc_id] = {"doc": doc, "vector_score": 0, "keyword_score": 0}
            doc_scores[doc_id]["keyword_score"] = (score / max_keyword_score) * keyword_weight

        # Compute final scores and sort
        results = []
        for doc_id, scores in doc_scores.items():
            final_score = scores["vector_score"] + scores["keyword_score"]
            results.append((scores["doc"], final_score))
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_k]

    def search(self, query, top_k=5, **kwargs):
        """Run hybrid retrieval."""
        # Run both retrievers (sequentially here; they could be parallelized),
        # fetching extra candidates for fusion
        vector_results = self.vector_search(query, top_k=top_k * 3)
        keyword_results = self.keyword_search(query, top_k=top_k * 3)

        # Merge results with the configured fusion method
        if self.fusion_method == "rrf":
            return self.reciprocal_rank_fusion(vector_results, keyword_results, top_k=top_k)
        else:  # linear fusion
            weights = kwargs.get("weights", (0.6, 0.4))
            return self.linear_fusion(vector_results, keyword_results, weights=weights, top_k=top_k)
```
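To make the flow concrete, here is a minimal, self-contained usage sketch. `Document`, `ToyVectorStore`, and `ToyKeywordStore` are hypothetical stand-ins so the example runs without external services; in practice the vector store would be a real vector database client exposing `similarity_search_with_score()` and the keyword store a wrapper around an engine such as Elasticsearch:

```python
from dataclasses import dataclass, field

@dataclass
class Document:
    page_content: str
    metadata: dict = field(default_factory=dict)

DOCS = [
    Document("Reciprocal rank fusion merges rankings from several retrievers.", {"id": "1"}),
    Document("BM25 scores documents by keyword overlap and term statistics.", {"id": "2"}),
    Document("Dense embeddings capture semantic similarity between sentences.", {"id": "3"}),
]

class ToyKeywordStore:
    """Stand-in for a keyword engine; counts query-term hits per document."""
    def search(self, query, size=10):
        terms = set(query.lower().split())
        scored = [(d, sum(t in d.page_content.lower() for t in terms)) for d in DOCS]
        return sorted(scored, key=lambda x: x[1], reverse=True)[:size]

class ToyVectorStore:
    """Stand-in for a vector database; uses word overlap as a crude 'similarity'."""
    def similarity_search_with_score(self, query, k=10):
        q = set(query.lower().split())
        scored = [(d, len(q & set(d.page_content.lower().split())) / max(len(q), 1)) for d in DOCS]
        return sorted(scored, key=lambda x: x[1], reverse=True)[:k]

retriever = HybridRetrievalSystem(ToyVectorStore(), ToyKeywordStore(), fusion_method="rrf")
for doc, score in retriever.search("rank fusion of retrievers", top_k=2):
    print(f"{score:.4f}  {doc.page_content}")
```

Swapping `fusion_method="linear"` and passing `weights=(0.7, 0.3)` to `search()` exercises the linear fusion path instead of RRF.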
Combining BM25, the classic keyword retrieval algorithm, with semantic search is one of the most widely used hybrid retrieval strategies.
BM25 (Best Matching 25) is an algorithm based on the probabilistic retrieval model. Its score takes into account term frequency, inverse document frequency, and document length normalization.
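For reference, the standard Okapi BM25 scoring function is:

$$
\text{score}(D, Q) = \sum_{i=1}^{n} \mathrm{IDF}(q_i) \cdot \frac{f(q_i, D)\,(k_1 + 1)}{f(q_i, D) + k_1 \left(1 - b + b \cdot \frac{|D|}{\mathrm{avgdl}}\right)}
$$

where $f(q_i, D)$ is the frequency of term $q_i$ in document $D$, $|D|$ is the document length, $\mathrm{avgdl}$ is the average document length in the corpus, and $k_1$, $b$ are tuning parameters (commonly around 1.2–2.0 and 0.75). The class below combines this lexical signal with dense semantic retrieval and blends the two normalized scores through a single alpha parameter: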
```python
import numpy as np
from rank_bm25 import BM25Okapi


class BM25SemanticHybrid:
    def __init__(self, documents, embedding_model):
        # Initialize BM25 over a whitespace-tokenized corpus
        self.tokenized_corpus = [doc.split() for doc in documents]
        self.bm25 = BM25Okapi(self.tokenized_corpus)
        self.documents = documents

        # Initialize the embedding model
        self.embedding_model = embedding_model

        # Precompute document embeddings
        self.document_embeddings = self.embedding_model.encode(documents)

    def bm25_search(self, query, top_k=10):
        """Run BM25 retrieval."""
        tokenized_query = query.split()
        bm25_scores = self.bm25.get_scores(tokenized_query)

        # Document indices sorted by descending BM25 score
        sorted_indices = bm25_scores.argsort()[::-1][:top_k]

        results = []
        for idx in sorted_indices:
            results.append((self.documents[idx], float(bm25_scores[idx]), idx))
        return results

    def semantic_search(self, query, top_k=10):
        """Run semantic (dense) retrieval."""
        # Compute the query embedding
        query_embedding = self.embedding_model.encode([query])[0]

        # Cosine similarity against all document embeddings
        similarities = np.dot(self.document_embeddings, query_embedding) / (
            np.linalg.norm(self.document_embeddings, axis=1) * np.linalg.norm(query_embedding)
        )

        # Document indices sorted by descending similarity
        sorted_indices = similarities.argsort()[::-1][:top_k]

        results = []
        for idx in sorted_indices:
            results.append((self.documents[idx], float(similarities[idx]), idx))
        return results

    def hybrid_search(self, query, top_k=5, alpha=0.5):
        """Run hybrid retrieval.

        alpha controls the weighting between BM25 and semantic search, in [0, 1]:
        alpha = 0 -> pure BM25, alpha = 1 -> pure semantic search.
        """
        # Get both result sets, with extra candidates for fusion
        bm25_results = self.bm25_search(query, top_k=top_k * 3)
        semantic_results = self.semantic_search(query, top_k=top_k * 3)

        # Map from document index to its scores
        doc_scores = {}

        # Process BM25 results
        if bm25_results:
            max_bm25_score = max(score for _, score, _ in bm25_results)
            for doc, score, idx in bm25_results:
                normalized_score = score / max_bm25_score if max_bm25_score > 0 else 0
                doc_scores[idx] = {
                    "doc": doc,
                    "bm25_score": normalized_score,
                    "semantic_score": 0,
                    "final_score": 0,
                }

        # Process semantic results
        if semantic_results:
            max_semantic_score = max(score for _, score, _ in semantic_results)
            for doc, score, idx in semantic_results:
                normalized_score = score / max_semantic_score if max_semantic_score > 0 else 0
                if idx not in doc_scores:
                    doc_scores[idx] = {
                        "doc": doc,
                        "bm25_score": 0,
                        "semantic_score": normalized_score,
                        "final_score": 0,
                    }
                else:
                    doc_scores[idx]["semantic_score"] = normalized_score

        # Compute the blended score
        for idx, scores in doc_scores.items():
            scores["final_score"] = (
                scores["bm25_score"] * (1 - alpha) +
                scores["semantic_score"] * alpha
            )

        # Sort by the blended score
        sorted_results = sorted(
            doc_scores.values(),
            key=lambda x: x["final_score"],
            reverse=True,
        )
        return [(item["doc"], item["final_score"]) for item in sorted_results[:top_k]]
```