Part 4: RAG in Practice and Optimization

Issue 15: Advanced RAG Techniques: Hybrid Retrieval and Reranking Strategies

4.3 Advanced RAG Techniques: Hybrid Retrieval and Reranking Strategies

In the previous two articles we covered the basic architecture of RAG and its optimization strategies. As RAG has matured, a range of advanced retrieval techniques has emerged to address the limitations of plain semantic search. This article takes a close look at hybrid retrieval and reranking strategies, two techniques that can significantly improve retrieval quality and answer accuracy.

1. Principles of Hybrid Retrieval

Traditional RAG systems rely mainly on vector similarity search, but that approach struggles with certain kinds of queries, such as those dominated by exact keywords, identifiers, or rare terms. Hybrid retrieval combines the strengths of several retrieval methods to capture the relevance between documents and queries more comprehensively.

1.1 The Core Idea of Hybrid Retrieval

The core idea of hybrid retrieval is that different retrieval methods shine in different scenarios, so combining them yields better overall performance. The most common combination pairs dense (semantic) vector retrieval, which captures paraphrases and conceptual matches, with sparse keyword retrieval such as BM25 or an Elasticsearch index, which excels at exact terms, names, and identifiers.

1.2 Common Hybrid Retrieval Strategies

The sketch below shows a hybrid retrieval system that queries a vector store and a keyword index in tandem and merges their results with either Reciprocal Rank Fusion (RRF) or linear weighting:

class HybridRetrievalSystem:
    def __init__(self, vector_store, keyword_store, fusion_method="rrf"):
        self.vector_store = vector_store      # vector database
        self.keyword_store = keyword_store    # keyword search engine (e.g. Elasticsearch)
        self.fusion_method = fusion_method    # fusion method: "rrf" or "linear"

    def vector_search(self, query, top_k=10):
        """Run vector similarity search (assumes higher scores mean more relevant)."""
        results = self.vector_store.similarity_search_with_score(query, k=top_k)
        return [(doc, score) for doc, score in results]

    def keyword_search(self, query, top_k=10):
        """Run keyword search (assumes keyword_store exposes a search method)."""
        results = self.keyword_store.search(query, size=top_k)
        return [(doc, score) for doc, score in results]

    def reciprocal_rank_fusion(self, vector_results, keyword_results, top_k=5):
        """Merge results with Reciprocal Rank Fusion (RRF).

        RRF formula: score = sum(1 / (k + rank)), where k is typically 60.
        """
        k = 60  # RRF constant
        fused_scores = {}

        # Accumulate scores from the vector search results
        for rank, (doc, _) in enumerate(vector_results, 1):
            doc_id = doc.metadata.get("id", str(id(doc)))
            if doc_id not in fused_scores:
                fused_scores[doc_id] = {"doc": doc, "score": 0}
            fused_scores[doc_id]["score"] += 1 / (k + rank)

        # Accumulate scores from the keyword search results
        for rank, (doc, _) in enumerate(keyword_results, 1):
            doc_id = doc.metadata.get("id", str(id(doc)))
            if doc_id not in fused_scores:
                fused_scores[doc_id] = {"doc": doc, "score": 0}
            fused_scores[doc_id]["score"] += 1 / (k + rank)

        # Sort by fused score and return the top results
        sorted_results = sorted(
            fused_scores.values(),
            key=lambda x: x["score"],
            reverse=True
        )
        return [(item["doc"], item["score"]) for item in sorted_results[:top_k]]

    def linear_fusion(self, vector_results, keyword_results, weights=(0.6, 0.4), top_k=5):
        """Merge results with a linear weighting of normalized scores."""
        vector_weight, keyword_weight = weights
        doc_scores = {}

        # Normalization factors for the two score lists
        max_vector_score = max(score for _, score in vector_results) if vector_results else 1
        max_keyword_score = max(score for _, score in keyword_results) if keyword_results else 1

        # Weighted, normalized vector scores
        for doc, score in vector_results:
            doc_id = doc.metadata.get("id", str(id(doc)))
            if doc_id not in doc_scores:
                doc_scores[doc_id] = {"doc": doc, "vector_score": 0, "keyword_score": 0}
            doc_scores[doc_id]["vector_score"] = (score / max_vector_score) * vector_weight

        # Weighted, normalized keyword scores
        for doc, score in keyword_results:
            doc_id = doc.metadata.get("id", str(id(doc)))
            if doc_id not in doc_scores:
                doc_scores[doc_id] = {"doc": doc, "vector_score": 0, "keyword_score": 0}
            doc_scores[doc_id]["keyword_score"] = (score / max_keyword_score) * keyword_weight

        # Combine both components and sort by the final score
        results = []
        for doc_id, scores in doc_scores.items():
            final_score = scores["vector_score"] + scores["keyword_score"]
            results.append((scores["doc"], final_score))
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_k]

    def search(self, query, top_k=5, **kwargs):
        """Run hybrid retrieval."""
        # Run both retrievers, fetching extra candidates for fusion
        # (these two calls could also be issued in parallel)
        vector_results = self.vector_search(query, top_k=top_k * 3)
        keyword_results = self.keyword_search(query, top_k=top_k * 3)

        # Merge the two result lists with the configured fusion method
        if self.fusion_method == "rrf":
            return self.reciprocal_rank_fusion(vector_results, keyword_results, top_k=top_k)
        else:  # linear fusion
            weights = kwargs.get("weights", (0.6, 0.4))
            return self.linear_fusion(vector_results, keyword_results, weights=weights, top_k=top_k)
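
To make the RRF formula concrete, here is a tiny worked example (plain arithmetic, no dependencies): a document ranked 2nd by vector search and 5th by keyword search, with the usual k = 60, receives the following fused score.

k = 60
vector_rank, keyword_rank = 2, 5
rrf_score = 1 / (k + vector_rank) + 1 / (k + keyword_rank)
print(round(rrf_score, 4))  # 0.0315 -- documents surfaced by both retrievers accumulate more score

Because RRF uses only ranks, it needs no score normalization, which makes it robust when the two retrievers produce scores on very different scales.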

2. Combining BM25 with Semantic Retrieval

BM25 is the classic keyword retrieval algorithm, and pairing it with semantic retrieval is one of the most widely used hybrid strategies.

2.1 A Quick Review of BM25

BM25 (Best Matching 25) is an algorithm based on a probabilistic retrieval model. It takes into account term frequency (how often a query term appears in a document), inverse document frequency (how rare the term is across the corpus), and document length normalization (so that long documents do not dominate simply by containing more words).
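
These factors combine in the standard BM25 scoring function (shown here for reference; $k_1$ and $b$ are tuning parameters, commonly $k_1 \in [1.2, 2.0]$ and $b = 0.75$):

$$
\mathrm{score}(D, Q) = \sum_{i=1}^{n} \mathrm{IDF}(q_i) \cdot \frac{f(q_i, D)\,(k_1 + 1)}{f(q_i, D) + k_1\left(1 - b + b \cdot \frac{|D|}{\mathrm{avgdl}}\right)}
$$

where $f(q_i, D)$ is the frequency of query term $q_i$ in document $D$, $|D|$ is the length of $D$ in tokens, and $\mathrm{avgdl}$ is the average document length in the corpus.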

2.2 An Integrated BM25 + Semantic Retrieval Implementation

The implementation below pairs the rank_bm25 library with an embedding model (any encoder exposing an encode method, such as one from sentence-transformers) and blends the two normalized score lists with a tunable weight alpha:

class BM25SemanticHybrid:
    def __init__(self, documents, embedding_model):
        from rank_bm25 import BM25Okapi

        # Initialize BM25 over a whitespace-tokenized corpus
        self.tokenized_corpus = [doc.split() for doc in documents]
        self.bm25 = BM25Okapi(self.tokenized_corpus)
        self.documents = documents

        # Initialize the embedding model and pre-compute document embeddings
        self.embedding_model = embedding_model
        self.document_embeddings = self.embedding_model.encode(documents)

    def bm25_search(self, query, top_k=10):
        """Run BM25 retrieval."""
        tokenized_query = query.split()
        bm25_scores = self.bm25.get_scores(tokenized_query)

        # Document indices sorted by descending BM25 score
        sorted_indices = bm25_scores.argsort()[::-1][:top_k]
        results = []
        for idx in sorted_indices:
            results.append((self.documents[idx], float(bm25_scores[idx]), idx))
        return results

    def semantic_search(self, query, top_k=10):
        """Run semantic (embedding-based) retrieval."""
        import numpy as np

        # Compute the query embedding
        query_embedding = self.embedding_model.encode([query])[0]

        # Cosine similarity between the query and every document
        similarities = np.dot(self.document_embeddings, query_embedding) / (
            np.linalg.norm(self.document_embeddings, axis=1) * np.linalg.norm(query_embedding)
        )

        # Document indices sorted by descending similarity
        sorted_indices = similarities.argsort()[::-1][:top_k]
        results = []
        for idx in sorted_indices:
            results.append((self.documents[idx], float(similarities[idx]), idx))
        return results

    def hybrid_search(self, query, top_k=5, alpha=0.5):
        """Run hybrid retrieval.

        alpha controls the weight between BM25 and semantic retrieval, in [0, 1]:
            alpha = 0: pure BM25
            alpha = 1: pure semantic retrieval
        """
        # Fetch extra candidates from both retrievers for fusion
        bm25_results = self.bm25_search(query, top_k=top_k * 3)
        semantic_results = self.semantic_search(query, top_k=top_k * 3)

        # Map document index -> component scores
        doc_scores = {}

        # Normalize and record the BM25 scores
        if bm25_results:
            max_bm25_score = max(score for _, score, _ in bm25_results)
            for doc, score, idx in bm25_results:
                normalized_score = score / max_bm25_score if max_bm25_score > 0 else 0
                doc_scores[idx] = {
                    "doc": doc,
                    "bm25_score": normalized_score,
                    "semantic_score": 0,
                    "final_score": 0,
                }

        # Normalize and record the semantic scores
        if semantic_results:
            max_semantic_score = max(score for _, score, _ in semantic_results)
            for doc, score, idx in semantic_results:
                normalized_score = score / max_semantic_score if max_semantic_score > 0 else 0
                if idx not in doc_scores:
                    doc_scores[idx] = {
                        "doc": doc,
                        "bm25_score": 0,
                        "semantic_score": normalized_score,
                        "final_score": 0,
                    }
                else:
                    doc_scores[idx]["semantic_score"] = normalized_score

        # Weighted combination of the two normalized scores
        for idx, scores in doc_scores.items():
            scores["final_score"] = (
                scores["bm25_score"] * (1 - alpha) +
                scores["semantic_score"] * alpha
            )

        # Sort by the combined score and return the top results
        sorted_results = sorted(
            doc_scores.values(),
            key=lambda x: x["final_score"],
            reverse=True
        )
        return [(item["doc"], item["final_score"]) for item in sorted_results[:top_k]]
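
A minimal usage sketch, assuming the rank_bm25 and sentence-transformers packages are installed; the corpus and model name below are illustrative placeholders, not part of the original implementation:

from sentence_transformers import SentenceTransformer

documents = [
    "Hybrid retrieval combines BM25 keyword matching with dense vector search.",
    "Reciprocal Rank Fusion merges ranked lists without score normalization.",
    "Rerankers such as cross-encoders refine the top retrieved candidates.",
]
model = SentenceTransformer("all-MiniLM-L6-v2")  # any encoder exposing .encode would work

retriever = BM25SemanticHybrid(documents, model)
for doc, score in retriever.hybrid_search("how does hybrid retrieval work", top_k=2, alpha=0.5):
    print(f"{score:.3f}  {doc}")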

3. Reranking Techniques in Detail