Hybrid Search
Combine semantic vector search with keyword-based search for better retrieval
Overview
Hybrid search combines the strengths of two retrieval methods:
- Semantic search (vector/dense): Understands meaning and context
- Keyword search (BM25/sparse): Matches exact terms and rare words
By combining both, you get the best of both worlds: semantic understanding AND precise term matching.
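Concretely, most implementations score every document with both methods, normalize the two score lists, and blend them with a weight alpha; the rest of this page builds on that one-line fusion (names below are illustrative):

def fuse(semantic_score, keyword_score, alpha=0.5):
    """Weighted score fusion: alpha=1.0 is pure semantic, alpha=0.0 is pure keyword."""
    return alpha * semantic_score + (1 - alpha) * keyword_score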
Why Hybrid Search?
Semantic Search Limitations
Struggles with:
- Exact matches (product codes, IDs, names)
- Rare or technical terms
- Acronyms and abbreviations
- Numbers and dates
# Semantic search fails:
query = "iPhone 15 Pro"
# May return: "iPhone 14 Pro", "Samsung Galaxy S23"
# Misses exact model number
Keyword Search Limitations
Struggles with:
- Synonyms and paraphrasing
- Conceptual similarity
- Context and meaning
# Keyword search fails:
query = "how to fix broken screen"
# Misses: "screen repair guide", "display replacement tutorial"
# Only matches exact words
Hybrid Solution
# Hybrid search succeeds:
query = "iPhone 15 Pro screen repair"
# Combines:
# - Semantic: "screen repair", "display replacement"
# - Keyword: "iPhone 15 Pro" (exact match)
# Result: Best of both!
Implementation
Basic Hybrid Search
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import numpy as np
class HybridRetriever:
    def __init__(self, documents):
        self.documents = documents

        # Semantic search setup (normalized embeddings make dot products cosine similarities)
        self.semantic_model = SentenceTransformer('all-mpnet-base-v2')
        self.doc_embeddings = self.semantic_model.encode(documents, normalize_embeddings=True)

        # Keyword search setup
        tokenized_docs = [doc.lower().split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized_docs)
    def hybrid_search(self, query, k=5, alpha=0.5):
        """
        Hybrid search combining semantic and keyword scores.

        Args:
            query: Search query
            k: Number of results
            alpha: Weight for semantic (0 = keyword only, 1 = semantic only)
        """
        # Semantic scores (cosine similarity), min-max normalized to 0-1
        query_emb = self.semantic_model.encode(query, normalize_embeddings=True)
        semantic_scores = np.dot(self.doc_embeddings, query_emb)
        semantic_scores = (semantic_scores - semantic_scores.min()) / \
                          (semantic_scores.max() - semantic_scores.min() + 1e-6)

        # Keyword scores (BM25), min-max normalized to 0-1
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        bm25_scores = (bm25_scores - bm25_scores.min()) / \
                      (bm25_scores.max() - bm25_scores.min() + 1e-6)

        # Weighted score fusion
        hybrid_scores = alpha * semantic_scores + (1 - alpha) * bm25_scores

        # Get top k by fused score
        top_indices = np.argsort(hybrid_scores)[::-1][:k]
        return [(self.documents[i], hybrid_scores[i]) for i in top_indices]
# Usage
documents = [
    "iPhone 15 Pro features A17 Pro chip",
    "iPhone 14 Pro has great camera",
    "Samsung Galaxy S23 Ultra review",
    "How to repair iPhone screen",
    "Display replacement guide for smartphones"
]

retriever = HybridRetriever(documents)
results = retriever.hybrid_search(
    query="iPhone 15 Pro screen repair",
    k=3,
    alpha=0.5  # Equal weight
)

for doc, score in results:
    print(f"{score:.3f}: {doc}")
With Vector Database
import lancedb
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
class VectorDBHybridSearch:
    def __init__(self, db_path="./hybrid-db"):
        self.model = SentenceTransformer('all-mpnet-base-v2')
        self.db = lancedb.connect(db_path)

    def index_documents(self, documents):
        """Index documents for hybrid search"""
        self.documents = documents  # keep texts for id -> text lookup at query time

        # Create embeddings
        embeddings = self.model.encode(documents)

        # Store in vector DB
        data = [
            {
                "id": i,
                "text": doc,
                "vector": emb.tolist()
            }
            for i, (doc, emb) in enumerate(zip(documents, embeddings))
        ]
        self.table = self.db.create_table("docs", data, mode="overwrite")

        # Build BM25 index over the same documents
        tokenized_docs = [doc.lower().split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized_docs)
    def search(self, query, k=10, alpha=0.5):
        """Hybrid search"""
        # Vector search (fetch extra candidates so fusion has a wider pool)
        query_emb = self.model.encode(query)
        vector_results = self.table.search(query_emb).limit(k * 2).to_list()

        # BM25 search
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        # Min-max normalize so BM25 is comparable to the 0-1 similarity scores
        bm25_scores = (bm25_scores - bm25_scores.min()) / \
                      (bm25_scores.max() - bm25_scores.min() + 1e-6)

        # Combine scores per document id
        doc_scores = {}

        # Add weighted vector scores
        for result in vector_results:
            doc_id = result['id']
            sim_score = 1 / (1 + result['_distance'])  # convert distance to similarity
            doc_scores[doc_id] = alpha * sim_score

        # Add weighted BM25 scores
        for doc_id, bm25_score in enumerate(bm25_scores):
            doc_scores[doc_id] = doc_scores.get(doc_id, 0) + (1 - alpha) * bm25_score

        # Sort and return top k, looking texts up locally instead of re-querying the DB
        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:k]
        return [(self.documents[doc_id], score) for doc_id, score in sorted_docs]
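Usage mirrors the basic retriever (reusing the documents list from the earlier example):

searcher = VectorDBHybridSearch(db_path="./hybrid-db")
searcher.index_documents(documents)

results = searcher.search("iPhone 15 Pro screen repair", k=3, alpha=0.5)
for text, score in results:
    print(f"{score:.3f}: {text}")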
Score Normalization
Min-Max Normalization
def normalize_scores(scores):
    """Normalize to 0-1 range (epsilon avoids division by zero for constant scores)"""
    min_score = np.min(scores)
    max_score = np.max(scores)
    return (scores - min_score) / (max_score - min_score + 1e-6)
Z-Score Normalization
def z_score_normalize(scores):
    """Normalize using z-scores"""
    mean = np.mean(scores)
    std = np.std(scores)
    return (scores - mean) / (std + 1e-6)
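Z-scores can be negative, so a weighted sum of z-scored lists can rank documents below zero. If downstream code expects scores in 0-1, squash them first; a logistic squash is one option (this choice is an assumption, not part of the recipe above):

def z_score_to_unit(scores):
    """Z-score normalize, then squash into (0, 1) with a logistic function."""
    z = z_score_normalize(scores)
    return 1 / (1 + np.exp(-z))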
Rank-Based Fusion (RRF)
def reciprocal_rank_fusion(semantic_ranks, keyword_ranks, k=60):
    """
    Combine rankings using RRF:
        RRF(d) = sum over rankings of 1 / (k + rank(d)), with ranks starting at 1
    """
    scores = {}

    # Add semantic ranks (enumerate is 0-based, hence the +1)
    for rank, doc_id in enumerate(semantic_ranks):
        scores[doc_id] = scores.get(doc_id, 0) + 1 / (k + rank + 1)

    # Add keyword ranks
    for rank, doc_id in enumerate(keyword_ranks):
        scores[doc_id] = scores.get(doc_id, 0) + 1 / (k + rank + 1)

    # Sort by combined score
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)
# Usage
semantic_results = [0, 2, 5, 7, 9] # doc IDs
keyword_results = [1, 0, 3, 5, 8] # doc IDs
fused = reciprocal_rank_fusion(semantic_results, keyword_results)
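Because RRF consumes rankings rather than raw scores, it needs no score normalization at all. A minimal sketch wiring it into the HybridRetriever from earlier (the rrf_search function is illustrative, not part of the class above):

def rrf_search(retriever, query, k=5, rrf_k=60):
    """Hybrid retrieval using RRF instead of weighted score fusion."""
    # Rank every document by semantic similarity (best first)
    query_emb = retriever.semantic_model.encode(query, normalize_embeddings=True)
    semantic_ranks = list(np.argsort(np.dot(retriever.doc_embeddings, query_emb))[::-1])

    # Rank every document by BM25 (best first)
    bm25_scores = retriever.bm25.get_scores(query.lower().split())
    keyword_ranks = list(np.argsort(bm25_scores)[::-1])

    fused = reciprocal_rank_fusion(semantic_ranks, keyword_ranks, k=rrf_k)
    return [(retriever.documents[doc_id], score) for doc_id, score in fused[:k]]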
Alpha Parameter Tuning
Query-Adaptive Alpha
def adaptive_alpha(query):
    """Adjust alpha based on query characteristics"""
    # Contains digits (product codes, model numbers, IDs)
    if any(char.isdigit() for char in query):
        return 0.3  # Favor keyword search

    # Contains quotes (exact phrase)
    if '"' in query:
        return 0.2  # Strong keyword preference

    # Long conceptual query
    if len(query.split()) > 8:
        return 0.7  # Favor semantic search

    # Default: balanced
    return 0.5
# Usage
query = "iPhone 15 Pro"
alpha = adaptive_alpha(query) # 0.3 (has numbers)
results = retriever.hybrid_search(query, alpha=alpha)
Domain-Specific Alpha
# E-commerce: Favor keyword (exact product matches)
ECOMMERCE_ALPHA = 0.3
# Documentation: Favor semantic (conceptual search)
DOCS_ALPHA = 0.7
# General search: Balanced
GENERAL_ALPHA = 0.5
Advanced Techniques
Field-Specific Weighting
class FieldWeightedHybrid:
    def search(self, query, k=5):
        """
        Weight different fields differently.

        Assumes a per-field hybrid_search(query, field=..., alpha=...) helper
        whose results are keyed by document id, so that title and content hits
        can be merged; one way to build it is sketched after this class.
        """
        # Search in title (favor exact matches)
        title_results = self.hybrid_search(
            query,
            field='title',
            alpha=0.3
        )

        # Search in content (favor semantic matches)
        content_results = self.hybrid_search(
            query,
            field='content',
            alpha=0.7
        )

        # Combine with field weights
        combined = {}
        for doc, score in title_results:
            combined[doc] = 0.6 * score  # Title weight: 60%
        for doc, score in content_results:
            combined[doc] = combined.get(doc, 0) + 0.4 * score  # Content weight: 40%

        return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:k]
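One way to supply the per-field hybrid_search assumed above: build one HybridRetriever per field and key results by document index so fields can be merged (the documents and field names below are illustrative):

docs = [
    {"title": "iPhone screen repair", "content": "Step-by-step display replacement guide"},
    {"title": "Galaxy S23 review", "content": "Camera, battery, and display tested"},
]

field_retrievers = {
    field: HybridRetriever([doc[field] for doc in docs])
    for field in ("title", "content")
}

def search_field(query, field, alpha, k=5):
    """Hybrid search over one field, returning (doc_index, score) pairs."""
    texts = [doc[field] for doc in docs]
    results = field_retrievers[field].hybrid_search(query, k=k, alpha=alpha)
    # Map returned texts back to document indices (assumes unique field texts)
    return [(texts.index(text), score) for text, score in results]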
Query Expansion + Hybrid
def expanded_hybrid_search(query, k=5):
    """Expand the query, then run hybrid search"""
    # Expand query with synonyms, e.g. "phone repair" -> "phone repair fix maintenance"
    expanded_query = expand_query(query)

    # Hybrid search on the expanded query
    return retriever.hybrid_search(expanded_query, k=k)
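expand_query is not defined above; a minimal stand-in is a static synonym table (the table and mappings below are illustrative):

SYNONYMS = {
    "fix": ["repair"],
    "repair": ["fix", "maintenance"],
    "phone": ["smartphone"],
}

def expand_query(query):
    """Append known synonyms for each query term."""
    terms = query.lower().split()
    expanded = list(terms)
    for term in terms:
        expanded.extend(SYNONYMS.get(term, []))
    return " ".join(expanded)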
Multi-Stage Hybrid
def multi_stage_hybrid(query, k=5):
    """
    Stage 1: Hybrid retrieval (fast, broad)
    Stage 2: Cross-encoder re-ranking (slow, precise)
    """
    # Stage 1: get a wide candidate pool with hybrid search
    candidates = retriever.hybrid_search(query, k=k * 5, alpha=0.5)

    # Stage 2: re-rank with a cross-encoder
    from sentence_transformers import CrossEncoder
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

    pairs = [[query, doc] for doc, _ in candidates]
    scores = reranker.predict(pairs)

    # Sort candidates by reranker score and keep the top k
    reranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
    return [doc for (doc, _), score in reranked[:k]]
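The cross-encoder pass dominates latency (one forward pass per candidate), so keep the stage-1 pool modest. For example:

# With k=3, stage 1 retrieves 15 candidates and stage 2 scores all 15 pairs
top_docs = multi_stage_hybrid("iPhone 15 Pro screen repair", k=3)
for doc in top_docs:
    print(doc)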
Evaluation
Compare Approaches
def compare_search_methods(queries, ground_truth):
    """Compare semantic-only, keyword-only, and hybrid retrieval"""
    results = {
        'semantic': [],
        'keyword': [],
        'hybrid': []
    }

    for query, relevant_docs in zip(queries, ground_truth):
        # Semantic only
        sem_results = retriever.hybrid_search(query, alpha=1.0)
        results['semantic'].append(calculate_recall(sem_results, relevant_docs))

        # Keyword only
        kw_results = retriever.hybrid_search(query, alpha=0.0)
        results['keyword'].append(calculate_recall(kw_results, relevant_docs))

        # Hybrid
        hyb_results = retriever.hybrid_search(query, alpha=0.5)
        results['hybrid'].append(calculate_recall(hyb_results, relevant_docs))

    # Average performance per method
    for method, recalls in results.items():
        print(f"{method}: {np.mean(recalls):.2%}")
Use Cases
E-commerce
# Product search: exact SKUs + semantic understanding
query = "wireless headphones noise cancelling"
results = retriever.hybrid_search(query, alpha=0.4)
# Finds: Exact "wireless headphones" + semantically similar "bluetooth earbuds"
Documentation
# Code/docs search: function names + concepts
query = "authenticate user JWT"
results = retriever.hybrid_search(query, alpha=0.6)
# Finds: Exact "JWT" + semantic "authentication", "login"
Legal/Medical
# Precise terminology + context
query = "myocardial infarction treatment"
results = retriever.hybrid_search(query, alpha=0.5)
# Finds: Exact medical term + related concepts
Best Practices
- Start with α=0.5 and tune based on domain
- Use query-adaptive alpha for different query types
- Normalize scores before combining
- Monitor both methods separately to identify weaknesses
- A/B test to find the optimal alpha for your use case (a simple offline sweep is sketched below)
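One concrete way to run that tuning offline, assuming a labeled query set and the calculate_recall helper from the Evaluation section, is a grid sweep over alpha:

def tune_alpha(queries, ground_truth, alphas=(0.0, 0.25, 0.5, 0.75, 1.0)):
    """Pick the alpha with the best average recall on a labeled query set."""
    best_alpha, best_recall = 0.5, -1.0
    for alpha in alphas:
        recalls = [
            calculate_recall(retriever.hybrid_search(q, alpha=alpha), relevant)
            for q, relevant in zip(queries, ground_truth)
        ]
        avg = np.mean(recalls)
        if avg > best_recall:
            best_alpha, best_recall = alpha, avg
    return best_alpha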
Next Steps
- Retrieval Fundamentals - Core vector search
- MMR - Diversify results
- Query Expansion - Improve recall
- Parent Document Retrieval - Better context