Structured Metadata and SQL Integration
Hybrid retrieval combining vector search with metadata filtering and SQL queries
Overview
Pure semantic search excels at finding conceptually similar content but struggles with queries containing specific constraints like "red sneakers under $100" or "invoices from Q3 2023." By incorporating structured metadata and database integration, you can build RAG systems that handle these complex, real-world queries effectively.
This approach combines multiple retrieval methods:
- Vector search for semantic understanding
- Metadata filtering for specific constraints
- SQL queries for database information
- Structured extraction from documents
Why Structured Data Matters
- Handle Constraints: Process queries with specific filters (price, date, category)
- Improve Precision: Combine semantic and exact matching
- Expand Capabilities: Answer database-level questions
- Better Citations: Maintain document structure for precise references
Implementation Guide
1. Extract Structured Metadata
import instructor
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Literal
# Define schema
class ProductMetadata(BaseModel):
category: Literal["shirts", "pants", "shoes", "accessories"]
color: List[str] = Field(description="Main colors")
material: List[str] = Field(description="Primary materials")
price_range: Literal["under_50", "50_100", "100_200", "over_200"]
style: List[str] = Field(description="Style tags")
season: List[Literal["spring", "summer", "fall", "winter"]]
client = instructor.from_openai(OpenAI())
def extract_metadata(product_description: str, image_url: str = None):
"""Extract structured metadata from product data"""
messages = [
{"role": "system", "content": "Extract product metadata accurately"},
{"role": "user", "content": f"Description: {product_description}"}
]
# Add image if available (multimodal)
if image_url:
messages[1]["content"] = [
{"type": "text", "text": f"Description: {product_description}"},
{"type": "image_url", "image_url": {"url": image_url}}
]
metadata = client.chat.completions.create(
model="gpt-4o", # Supports vision
response_model=ProductMetadata,
messages=messages
)
return metadata
2. Hybrid Retrieval with Metadata Filtering
import lancedb
from typing import Optional
class HybridRetriever:
def __init__(self, db_path: str):
self.db = lancedb.connect(db_path)
self.table = self.db.open_table("products")
def search(
self,
query: str,
category: Optional[str] = None,
price_range: Optional[str] = None,
colors: Optional[List[str]] = None,
k: int = 5
):
"""Combine semantic search with metadata filters"""
# Parse query for implicit constraints
filters = self._extract_filters(query)
# Override with explicit filters
if category:
filters['category'] = category
if price_range:
filters['price_range'] = price_range
if colors:
filters['colors'] = colors
# Build filter string
filter_conditions = []
if filters.get('category'):
filter_conditions.append(f"category = '{filters['category']}'")
if filters.get('price_range'):
filter_conditions.append(f"price_range = '{filters['price_range']}'")
if filters.get('colors'):
color_filters = " OR ".join([f"'{c}' IN colors" for c in filters['colors']])
filter_conditions.append(f"({color_filters})")
where_clause = " AND ".join(filter_conditions) if filter_conditions else None
# Execute hybrid search
results = (
self.table.search(query)
.where(where_clause) if where_clause else self.table.search(query)
).limit(k).to_list()
return results
def _extract_filters(self, query: str) -> dict:
"""Extract filters from natural language query"""
extraction = client.chat.completions.create(
model="gpt-4o-mini",
response_model=ProductMetadata,
messages=[
{"role": "system", "content": "Extract filters from query"},
{"role": "user", "content": query}
]
)
return extraction.dict()
3. Safe Text-to-SQL Integration
import instructor
from pydantic import BaseModel, Field, validator
import sqlite3
class SQLQuery(BaseModel):
query: str = Field(description="SQL SELECT query")
requires_rag: bool = Field(description="Whether results need RAG context")
@validator('query')
def validate_read_only(cls, v):
"""Ensure only SELECT queries"""
normalized = v.strip().upper()
if not normalized.startswith('SELECT'):
raise ValueError("Only SELECT queries allowed")
if any(keyword in normalized for keyword in ['DROP', 'DELETE', 'UPDATE', 'INSERT']):
raise ValueError("Only read operations permitted")
return v
class Text2SQLSystem:
def __init__(self, db_path: str, retriever: HybridRetriever):
self.conn = sqlite3.connect(db_path)
self.retriever = retriever
self.client = instructor.from_openai(OpenAI())
def query(self, user_question: str):
"""Route question to SQL or RAG as appropriate"""
# Generate SQL query
sql_result = self.client.chat.completions.create(
model="gpt-4",
response_model=SQLQuery,
messages=[
{"role": "system", "content": self._get_schema()},
{"role": "user", "content": user_question}
]
)
# Execute SQL safely
cursor = self.conn.cursor()
cursor.execute(sql_result.query)
sql_data = cursor.fetchall()
# If RAG context needed, retrieve relevant docs
if sql_result.requires_rag:
context = self.retriever.search(user_question, k=3)
return self._synthesize_answer(user_question, sql_data, context)
else:
return self._format_sql_results(sql_data)
def _get_schema(self) -> str:
"""Provide database schema to LLM"""
return """
Database Schema:
users (id, name, email, created_at)
orders (id, user_id, total, status, created_at)
order_items (id, order_id, product_id, quantity, price)
products (id, name, category, price, stock)
Only SELECT queries allowed. No modifications.
"""
4. Query Routing
from typing import Literal
class QueryRoute(BaseModel):
route: Literal["vector_search", "metadata_filter", "sql_query", "hybrid"]
reasoning: str
extracted_filters: dict = {}
def route_query(question: str) -> QueryRoute:
"""Determine appropriate retrieval strategy"""
route = client.chat.completions.create(
model="gpt-4o-mini",
response_model=QueryRoute,
messages=[
{"role": "system", "content": """
Route queries based on their characteristics:
- vector_search: Semantic/conceptual questions
- metadata_filter: Queries with specific constraints (color, price, category)
- sql_query: Database-level questions (counts, aggregations, user data)
- hybrid: Combination of semantic + filters
"""},
{"role": "user", "content": question}
]
)
return route
# Usage
question = "Find red cotton shirts under $50"
route = route_query(question)
if route.route == "hybrid":
results = retriever.search(
question,
**route.extracted_filters
)
elif route.route == "sql_query":
results = text2sql.query(question)
Metadata Extraction Best Practices
Define Clear Taxonomies
# taxonomy.yaml
product_categories:
- shirts
- pants
- shoes
- accessories
color_palette:
- red
- blue
- green
- black
- white
price_ranges:
- under_50: [0, 50]
- 50_100: [50, 100]
- 100_200: [100, 200]
- over_200: [200, None]
Validate Extracted Data
from pydantic import validator
class ValidatedMetadata(ProductMetadata):
@validator('color')
def validate_colors(cls, v):
valid_colors = {"red", "blue", "green", "black", "white", "gray", "brown"}
for color in v:
if color.lower() not in valid_colors:
raise ValueError(f"Invalid color: {color}")
return v
@validator('price_range')
def validate_price(cls, v):
valid_ranges = {"under_50", "50_100", "100_200", "over_200"}
if v not in valid_ranges:
raise ValueError(f"Invalid price range: {v}")
return v
Performance Comparison
Pure Semantic vs. Hybrid Search
def compare_approaches(test_queries):
"""Compare pure semantic vs. hybrid retrieval"""
results = {
'semantic': {'correct': 0, 'total': 0},
'hybrid': {'correct': 0, 'total': 0}
}
for query, expected_product in test_queries:
# Semantic only
semantic_results = vector_search(query, k=5)
if expected_product in [r['id'] for r in semantic_results]:
results['semantic']['correct'] += 1
results['semantic']['total'] += 1
# Hybrid with metadata
hybrid_results = hybrid_retriever.search(query, k=5)
if expected_product in [r['id'] for r in hybrid_results]:
results['hybrid']['correct'] += 1
results['hybrid']['total'] += 1
# Calculate improvements
semantic_accuracy = results['semantic']['correct'] / results['semantic']['total']
hybrid_accuracy = results['hybrid']['correct'] / results['hybrid']['total']
improvement = (hybrid_accuracy - semantic_accuracy) / semantic_accuracy * 100
print(f"Semantic Search: {semantic_accuracy:.2%}")
print(f"Hybrid Search: {hybrid_accuracy:.2%}")
print(f"Improvement: {improvement:.1f}%")
Expected Outcomes
By incorporating structured metadata:
- 30-50% improvement on constraint-based queries
- Ability to answer database-level questions
- Better precision when multiple filters apply
- More accurate results for specific attribute searches
Common Issues
Over-Filtering
Problem: Too many filters eliminate all results
Solution: Implement progressive relaxation
def search_with_fallback(query, filters):
results = hybrid_search(query, filters)
if len(results) < 3:
# Relax least important filters
relaxed_filters = remove_least_important(filters)
results = hybrid_search(query, relaxed_filters)
return results
Inconsistent Metadata
Problem: Extraction produces varying attributes
Solution: Use strict schemas with validation and examples
# Provide few-shot examples
few_shot_examples = [
{"description": "Red cotton t-shirt $45", "metadata": {...}},
{"description": "Blue denim jeans $89", "metadata": {...}}
]
Next Steps
- Define your metadata taxonomy
- Extract structured data from existing documents
- Implement hybrid retrieval
- Compare performance on constraint-based queries
- Consider SQL integration for database questions