Parent Document Retrieval
Retrieve small chunks for search, return larger parent documents for context
Overview
Parent Document Retrieval solves a common RAG dilemma: small chunks are better for precise search, but larger chunks provide better context for generation. This technique retrieves small, focused chunks but returns their larger parent documents to the LLM.
The Problem: Chunk Size Tradeoff
Small Chunks (Good for Search)
```python
# Small chunks: ~200 tokens
chunk = "Vector databases use ANN algorithms like HNSW for fast similarity search."
```
Pros:
- Precise matching
- Less noise
- Better retrieval accuracy
Cons:
- Missing context
- Incomplete information
- Poor generation quality
Large Chunks (Good for Context)
```python
# Large chunks: ~1000 tokens
chunk = """
# Vector Databases
Vector databases store embeddings and enable semantic search...
[500 tokens of context]

## Indexing Algorithms
Vector databases use ANN algorithms like HNSW for fast similarity search.
[400 tokens more]
"""
```
Pros:
- Complete context
- Better generation
- Self-contained information
Cons:
- Noisy retrieval
- Lower precision
- Irrelevant matches
The Solution: Parent Document Retrieval
Strategy: Index small chunks, retrieve parent documents
```
Document (Parent)
├── Chunk 1 (Child) ← Search this
├── Chunk 2 (Child) ← Search this
└── Chunk 3 (Child) ← Search this
```
When Chunk 2 matches → return the entire Document (Parent)
Basic Implementation
```python
from sentence_transformers import SentenceTransformer
import lancedb
from typing import List, Dict

class ParentDocumentRetriever:
    def __init__(
        self,
        db_path: str = "./vector-db",
        chunk_size: int = 200,          # small, for search
        parent_chunk_size: int = 1000,  # large, for context
    ):
        self.model = SentenceTransformer('all-mpnet-base-v2')
        self.db = lancedb.connect(db_path)
        self.chunk_size = chunk_size
        self.parent_chunk_size = parent_chunk_size

    def chunk_text(self, text: str, chunk_size: int, overlap: int = 50) -> List[str]:
        """Split text into word-based chunks with overlap"""
        words = text.split()
        chunks = []
        for i in range(0, len(words), chunk_size - overlap):
            chunk = ' '.join(words[i:i + chunk_size])
            if chunk:
                chunks.append(chunk)
        return chunks

    def index_document(self, document: str, doc_id: str, metadata: Dict = None):
        """Index document with parent-child relationship"""
        # Create parent chunks (large)
        parent_chunks = self.chunk_text(
            document,
            self.parent_chunk_size,
            overlap=100
        )

        # Create child chunks (small) for each parent
        all_records = []
        for parent_idx, parent_text in enumerate(parent_chunks):
            parent_id = f"{doc_id}_parent_{parent_idx}"

            # Split parent into small child chunks
            child_chunks = self.chunk_text(
                parent_text,
                self.chunk_size,
                overlap=50
            )

            for child_idx, child_text in enumerate(child_chunks):
                child_id = f"{parent_id}_child_{child_idx}"

                # Embed child chunk
                child_embedding = self.model.encode(child_text)

                # Store both child and parent
                record = {
                    'id': child_id,
                    'parent_id': parent_id,
                    'doc_id': doc_id,
                    'child_text': child_text,
                    'parent_text': parent_text,  # Store full parent
                    'vector': child_embedding.tolist(),
                    'metadata': metadata or {}
                }
                all_records.append(record)

        # Create the table on first use, append afterwards
        if "parent_child_docs" in self.db.table_names():
            self.db.open_table("parent_child_docs").add(all_records)
        else:
            self.db.create_table("parent_child_docs", all_records)

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """Search child chunks, return parent documents"""
        # Step 1: Search using child chunks
        query_embedding = self.model.encode(query)
        table = self.db.open_table("parent_child_docs")

        # Get more results than needed (we'll deduplicate parents)
        results = table.search(query_embedding).limit(k * 3).to_list()

        # Step 2: Deduplicate by parent_id and return parent text
        seen_parents = set()
        parent_results = []

        for result in results:
            parent_id = result['parent_id']
            if parent_id not in seen_parents:
                seen_parents.add(parent_id)
                parent_results.append({
                    'id': parent_id,
                    'text': result['parent_text'],          # Return parent, not child!
                    'matched_child': result['child_text'],  # For debugging
                    'score': result['_distance'],
                    'metadata': result['metadata']
                })
                if len(parent_results) >= k:
                    break

        return parent_results

# Usage
retriever = ParentDocumentRetriever()

# Index a document
document = """
# Introduction to Vector Databases
Vector databases are specialized systems designed to store and query high-dimensional vectors...
[Large document with multiple sections]
"""

retriever.index_document(
    document=document,
    doc_id="doc_001",
    metadata={'title': 'Vector Databases Guide', 'author': 'Jane Doe'}
)

# Search: matches small chunks, returns large parents
results = retriever.search("How do vector databases work?", k=3)

for result in results:
    print(f"Parent ID: {result['id']}")
    print(f"Score: {result['score']:.3f}")
    print(f"Matched Child: {result['matched_child'][:100]}...")
    print(f"Parent Text: {result['text'][:200]}...\n")
```
Advanced: Sentence Window Retrieval
Instead of fixed-size chunks, use sentence-level precision with larger context windows:
```python
import nltk
nltk.download('punkt')  # one-time download of the sentence tokenizer
from nltk.tokenize import sent_tokenize

class SentenceWindowRetriever:
    def __init__(self, window_size: int = 5):
        self.model = SentenceTransformer('all-mpnet-base-v2')
        self.db = lancedb.connect("./vector-db")
        self.window_size = window_size  # Sentences before/after

    def index_document(self, document: str, doc_id: str):
        """Index individual sentences with surrounding context"""
        # Split into sentences
        sentences = sent_tokenize(document)

        records = []
        for i, sentence in enumerate(sentences):
            # Create window of surrounding sentences
            start = max(0, i - self.window_size)
            end = min(len(sentences), i + self.window_size + 1)

            # Context window (what we'll return)
            context_window = ' '.join(sentences[start:end])

            # Embed just the sentence (for search)
            sentence_embedding = self.model.encode(sentence)

            record = {
                'id': f"{doc_id}_sent_{i}",
                'doc_id': doc_id,
                'sentence': sentence,       # Search on this
                'context': context_window,  # Return this
                'sentence_index': i,
                'vector': sentence_embedding.tolist()
            }
            records.append(record)

        # Create the table on first use, append afterwards
        if "sentence_windows" in self.db.table_names():
            self.db.open_table("sentence_windows").add(records)
        else:
            self.db.create_table("sentence_windows", records)

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """Search sentences, return context windows"""
        query_embedding = self.model.encode(query)
        table = self.db.open_table("sentence_windows")

        results = table.search(query_embedding).limit(k).to_list()

        return [{
            'id': r['id'],
            'matched_sentence': r['sentence'],
            'context': r['context'],  # Full window
            'score': r['_distance']
        } for r in results]

# Usage
retriever = SentenceWindowRetriever(window_size=3)

document = """
Vector databases store embeddings. They enable semantic search.
This is crucial for RAG systems. Embeddings capture meaning.
HNSW is a popular indexing algorithm. It provides fast approximate search.
"""

retriever.index_document(document, "doc_001")

results = retriever.search("What are vector databases?", k=2)
for r in results:
    print(f"Matched: {r['matched_sentence']}")
    print(f"Context: {r['context']}\n")
```
Example output:
```
Matched: Vector databases store embeddings.
Context: Vector databases store embeddings. They enable semantic search. This is crucial for RAG systems. Embeddings capture meaning.

Matched: This is crucial for RAG systems.
Context: Vector databases store embeddings. They enable semantic search. This is crucial for RAG systems. Embeddings capture meaning. HNSW is a popular indexing algorithm. It provides fast approximate search.
```
Note that with window_size=3 on this six-sentence toy document, the second match's window spans the entire text.
Hierarchical Document Structure
For complex documents, use a multi-level hierarchy:
```python
class HierarchicalRetriever:
    """Document → Section → Paragraph → Sentence"""

    def __init__(self):
        self.model = SentenceTransformer('all-mpnet-base-v2')
        self.db = lancedb.connect("./vector-db")

    def index_hierarchical_document(self, document: Dict):
        """
        document = {
            'id': 'doc_001',
            'title': 'Vector Databases',
            'sections': [
                {
                    'title': 'Introduction',
                    'paragraphs': ['Para 1...', 'Para 2...']
                },
                {
                    'title': 'Indexing',
                    'paragraphs': ['Para 1...', 'Para 2...']
                }
            ]
        }
        """
        records = []
        doc_id = document['id']

        for section_idx, section in enumerate(document['sections']):
            section_id = f"{doc_id}_sec_{section_idx}"

            for para_idx, paragraph in enumerate(section['paragraphs']):
                para_id = f"{section_id}_para_{para_idx}"

                # Split paragraph into sentences
                sentences = sent_tokenize(paragraph)

                for sent_idx, sentence in enumerate(sentences):
                    # Embed sentence
                    embedding = self.model.encode(sentence)

                    # Store with full hierarchy
                    record = {
                        'id': f"{para_id}_sent_{sent_idx}",
                        'doc_id': doc_id,
                        'section_id': section_id,
                        'para_id': para_id,
                        'sentence': sentence,
                        'paragraph': paragraph,  # Parent level 1
                        'section_title': section['title'],
                        'section_content': '\n'.join(section['paragraphs']),  # Parent level 2
                        'document_title': document['title'],
                        'vector': embedding.tolist()
                    }
                    records.append(record)

        # Create the table on first use, append afterwards
        if "hierarchical_docs" in self.db.table_names():
            self.db.open_table("hierarchical_docs").add(records)
        else:
            self.db.create_table("hierarchical_docs", records)

    def search(
        self,
        query: str,
        k: int = 5,
        return_level: str = 'paragraph'
    ) -> List[Dict]:
        """
        Search at sentence level, return at the specified level.
        return_level: 'sentence', 'paragraph', 'section', or 'document'
        """
        query_embedding = self.model.encode(query)
        table = self.db.open_table("hierarchical_docs")

        results = table.search(query_embedding).limit(k * 2).to_list()

        # Deduplicate based on return level
        seen = set()
        final_results = []

        for r in results:
            if return_level == 'sentence':
                key = r['id']
                text = r['sentence']
            elif return_level == 'paragraph':
                key = r['para_id']
                text = r['paragraph']
            elif return_level == 'section':
                key = r['section_id']
                text = f"# {r['section_title']}\n\n{r['section_content']}"
            else:  # document
                key = r['doc_id']
                text = f"# {r['document_title']}\n\n..."  # Would load the full doc here

            if key not in seen:
                seen.add(key)
                final_results.append({
                    'id': key,
                    'text': text,
                    'matched_sentence': r['sentence'],
                    'score': r['_distance']
                })
                if len(final_results) >= k:
                    break

        return final_results
```
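A quick usage sketch, in the same style as the earlier examples; the toy document below is illustrative:

```python
# Usage
retriever = HierarchicalRetriever()

retriever.index_hierarchical_document({
    'id': 'doc_001',
    'title': 'Vector Databases',
    'sections': [
        {
            'title': 'Introduction',
            'paragraphs': [
                'Vector databases store embeddings.',
                'They enable semantic search over unstructured data.'
            ]
        }
    ]
})

# Same query, increasing context granularity
for level in ['sentence', 'paragraph', 'section']:
    results = retriever.search("What do vector databases store?", k=1, return_level=level)
    print(f"{level}: {results[0]['text']}")
```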
Adaptive Context Size
Dynamically adjust how much context is returned based on query complexity. Parent chunks are fixed at index time, so the sketch below trims the returned parent text at query time rather than re-chunking the corpus:
```python
def adaptive_parent_retrieval(query: str, retriever: ParentDocumentRetriever) -> List[Dict]:
    """Return more or less parent context depending on query complexity"""
    n_words = len(query.split())
    if n_words <= 5:     # Simple query: smaller context
        context_size = 300
    elif n_words > 15:   # Complex query: larger context
        context_size = 1500
    else:                # Medium query: medium context
        context_size = 800

    results = retriever.search(query, k=5)

    # Trim returned parents to the chosen budget
    # (word count as a rough token proxy)
    for result in results:
        words = result['text'].split()
        result['text'] = ' '.join(words[:context_size])
    return results
```
Best Practices
- Child chunk size: 100-300 tokens for precise retrieval
- Parent chunk size: 800-1500 tokens for good context
- Overlap: 10-20% to avoid cutting semantic units
- Deduplicate parents to avoid returning same content multiple times
- Store metadata at parent level for filtering
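As a rough sketch, here are those guidelines restated as a starting configuration; the constants are illustrative midpoints of the ranges above, not values from any library:

```python
CHILD_CHUNK_SIZE = 200    # 100-300 tokens: precise retrieval
PARENT_CHUNK_SIZE = 1000  # 800-1500 tokens: enough context
OVERLAP_RATIO = 0.15      # 10-20%: avoid cutting semantic units

retriever = ParentDocumentRetriever(
    chunk_size=CHILD_CHUNK_SIZE,
    parent_chunk_size=PARENT_CHUNK_SIZE
)
child_overlap = int(CHILD_CHUNK_SIZE * OVERLAP_RATIO)  # pass to chunk_text()
```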
Evaluation
```python
def evaluate_parent_document_retrieval(
    test_queries: List[str],
    ground_truth: Dict[str, List[str]]
):
    """Compare parent document retrieval against standard chunking.

    StandardRetriever is an assumed fixed-size-chunk baseline with the
    same search() interface; it is not defined in this guide.
    """
    # Standard chunking baseline
    standard_retriever = StandardRetriever(chunk_size=500)

    # Parent document retrieval
    parent_retriever = ParentDocumentRetriever(
        chunk_size=200,
        parent_chunk_size=1000
    )

    for query in test_queries:
        standard_results = standard_retriever.search(query, k=5)
        parent_results = parent_retriever.search(query, k=5)

        # Evaluate (calculate_precision is defined below)
        print(f"Query: {query}")
        print(f"Standard precision: {calculate_precision(standard_results, ground_truth[query])}")
        print(f"Parent precision: {calculate_precision(parent_results, ground_truth[query])}")
        print()
```
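The comparison above assumes a `calculate_precision` helper; a minimal version, treating the ground truth for each query as a list of relevant result ids, might look like this:

```python
def calculate_precision(results: List[Dict], relevant_ids: List[str]) -> float:
    """Fraction of retrieved results whose id appears in the ground truth"""
    if not results:
        return 0.0
    hits = sum(1 for r in results if r['id'] in relevant_ids)
    return hits / len(results)
```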
Common Issues
1. Parent Chunks Too Large
Problem: Parents exceed the LLM's context window
Solution:
- Limit parent size to 1500 tokens
- Use hierarchical retrieval with sections
- Implement a truncation strategy (sketched below)
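A minimal truncation sketch, using word count as a rough proxy for tokens and keeping a window of the parent centered on the matched child:

```python
def truncate_parent(parent_text: str, matched_child: str, max_words: int = 1500) -> str:
    """Trim an oversized parent to a window around the matched child"""
    words = parent_text.split()
    if len(words) <= max_words:
        return parent_text
    # Locate the matched child; fall back to the head of the parent
    pos = parent_text.find(matched_child)
    if pos == -1:
        return ' '.join(words[:max_words])
    # Center the kept window on the child's position
    child_start = len(parent_text[:pos].split())
    start = max(0, child_start - max_words // 2)
    return ' '.join(words[start:start + max_words])
```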
2. Duplicate Content
Problem: Multiple child chunks match, so the same parent is returned multiple times
Solution:
- Deduplicate by parent_id
- Score parents by best child match (see the sketch below)
- Limit results per parent document
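One way to sketch "score parents by best child match", applied to the raw child-level rows from the vector search (lower `_distance` is better):

```python
def best_child_per_parent(results: List[Dict]) -> List[Dict]:
    """Collapse child hits to one row per parent, keeping the best child"""
    best: Dict[str, Dict] = {}
    for r in results:
        pid = r['parent_id']
        if pid not in best or r['_distance'] < best[pid]['_distance']:
            best[pid] = r
    # Rank parents by their best child's distance
    return sorted(best.values(), key=lambda r: r['_distance'])
```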
3. Storage Overhead
Problem: Duplicating the parent text in every child record inflates storage
Solution:
- Store parent text once and reference it from children (sketched below)
- Use compression for parent text
- Consider parent text retrieval from original source
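A sketch of storing each parent once and resolving it at query time; `parent_store` here is an in-memory stand-in for a separate table or key-value store:

```python
parent_store: Dict[str, str] = {}

def register_parent(parent_id: str, parent_text: str) -> None:
    """Store parent text once; child records keep only the parent_id"""
    parent_store[parent_id] = parent_text

def resolve_parent(result: Dict) -> str:
    """Look up the parent text for a retrieved child record"""
    return parent_store[result['parent_id']]
```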
Use Cases
Technical Documentation
- Child: Individual code examples
- Parent: Complete function documentation with context
Research Papers
- Child: Key sentences or claims
- Parent: Full paragraphs or sections
Books
- Child: Important quotes
- Parent: Full chapters or sections
Legal Documents
- Child: Specific clauses
- Parent: Full sections with preamble
Next Steps
- Retrieval Fundamentals - Core vector search concepts
- MMR - Diversify search results
- Query Expansion - Improve recall
- Hybrid Search - Combine semantic and keyword search