Structured Metadata and SQL Integration

Hybrid retrieval combining vector search with metadata filtering and SQL queries

Overview

Pure semantic search excels at finding conceptually similar content but struggles with queries containing specific constraints like "red sneakers under $100" or "invoices from Q3 2023." By incorporating structured metadata and database integration, you can build RAG systems that handle these complex, real-world queries effectively.

This approach combines multiple retrieval methods:

  • Vector search for semantic understanding
  • Metadata filtering for specific constraints
  • SQL queries for database information
  • Structured extraction from documents

Why Structured Data Matters

  • Handle Constraints: Process queries with specific filters (price, date, category)
  • Improve Precision: Combine semantic and exact matching
  • Expand Capabilities: Answer database-level questions
  • Better Citations: Maintain document structure for precise references

Implementation Guide

1. Extract Structured Metadata

import instructor
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Literal

# Define schema
class ProductMetadata(BaseModel):
    category: Literal["shirts", "pants", "shoes", "accessories"]
    color: List[str] = Field(description="Main colors")
    material: List[str] = Field(description="Primary materials")
    price_range: Literal["under_50", "50_100", "100_200", "over_200"]
    style: List[str] = Field(description="Style tags")
    season: List[Literal["spring", "summer", "fall", "winter"]]

client = instructor.from_openai(OpenAI())

def extract_metadata(product_description: str, image_url: str = None):
    """Extract structured metadata from product data"""
    
    messages = [
        {"role": "system", "content": "Extract product metadata accurately"},
        {"role": "user", "content": f"Description: {product_description}"}
    ]
    
    # Add image if available (multimodal)
    if image_url:
        messages[1]["content"] = [
            {"type": "text", "text": f"Description: {product_description}"},
            {"type": "image_url", "image_url": {"url": image_url}}
        ]
    
    metadata = client.chat.completions.create(
        model="gpt-4o",  # Supports vision
        response_model=ProductMetadata,
        messages=messages
    )
    
    return metadata

2. Hybrid Retrieval with Metadata Filtering

import lancedb
from typing import Optional

class HybridRetriever:
    def __init__(self, db_path: str):
        self.db = lancedb.connect(db_path)
        self.table = self.db.open_table("products")
    
    def search(
        self,
        query: str,
        category: Optional[str] = None,
        price_range: Optional[str] = None,
        colors: Optional[List[str]] = None,
        k: int = 5
    ):
        """Combine semantic search with metadata filters"""
        
        # Parse query for implicit constraints
        filters = self._extract_filters(query)
        
        # Override with explicit filters
        if category:
            filters['category'] = category
        if price_range:
            filters['price_range'] = price_range
        if colors:
            filters['colors'] = colors
        
        # Build filter string
        filter_conditions = []
        if filters.get('category'):
            filter_conditions.append(f"category = '{filters['category']}'")
        if filters.get('price_range'):
            filter_conditions.append(f"price_range = '{filters['price_range']}'")
        if filters.get('colors'):
            color_filters = " OR ".join([f"'{c}' IN colors" for c in filters['colors']])
            filter_conditions.append(f"({color_filters})")
        
        where_clause = " AND ".join(filter_conditions) if filter_conditions else None
        
        # Execute hybrid search
        results = (
            self.table.search(query)
            .where(where_clause) if where_clause else self.table.search(query)
        ).limit(k).to_list()
        
        return results
    
    def _extract_filters(self, query: str) -> dict:
        """Extract filters from natural language query"""
        
        extraction = client.chat.completions.create(
            model="gpt-4o-mini",
            response_model=ProductMetadata,
            messages=[
                {"role": "system", "content": "Extract filters from query"},
                {"role": "user", "content": query}
            ]
        )
        
        return extraction.dict()

3. Safe Text-to-SQL Integration

import instructor
from pydantic import BaseModel, Field, validator
import sqlite3

class SQLQuery(BaseModel):
    query: str = Field(description="SQL SELECT query")
    requires_rag: bool = Field(description="Whether results need RAG context")
    
    @validator('query')
    def validate_read_only(cls, v):
        """Ensure only SELECT queries"""
        normalized = v.strip().upper()
        if not normalized.startswith('SELECT'):
            raise ValueError("Only SELECT queries allowed")
        if any(keyword in normalized for keyword in ['DROP', 'DELETE', 'UPDATE', 'INSERT']):
            raise ValueError("Only read operations permitted")
        return v

class Text2SQLSystem:
    def __init__(self, db_path: str, retriever: HybridRetriever):
        self.conn = sqlite3.connect(db_path)
        self.retriever = retriever
        self.client = instructor.from_openai(OpenAI())
    
    def query(self, user_question: str):
        """Route question to SQL or RAG as appropriate"""
        
        # Generate SQL query
        sql_result = self.client.chat.completions.create(
            model="gpt-4",
            response_model=SQLQuery,
            messages=[
                {"role": "system", "content": self._get_schema()},
                {"role": "user", "content": user_question}
            ]
        )
        
        # Execute SQL safely
        cursor = self.conn.cursor()
        cursor.execute(sql_result.query)
        sql_data = cursor.fetchall()
        
        # If RAG context needed, retrieve relevant docs
        if sql_result.requires_rag:
            context = self.retriever.search(user_question, k=3)
            return self._synthesize_answer(user_question, sql_data, context)
        else:
            return self._format_sql_results(sql_data)
    
    def _get_schema(self) -> str:
        """Provide database schema to LLM"""
        return """
        Database Schema:
        
        users (id, name, email, created_at)
        orders (id, user_id, total, status, created_at)
        order_items (id, order_id, product_id, quantity, price)
        products (id, name, category, price, stock)
        
        Only SELECT queries allowed. No modifications.
        """

4. Query Routing

from typing import Literal

class QueryRoute(BaseModel):
    route: Literal["vector_search", "metadata_filter", "sql_query", "hybrid"]
    reasoning: str
    extracted_filters: dict = {}

def route_query(question: str) -> QueryRoute:
    """Determine appropriate retrieval strategy"""
    
    route = client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=QueryRoute,
        messages=[
            {"role": "system", "content": """
                Route queries based on their characteristics:
                - vector_search: Semantic/conceptual questions
                - metadata_filter: Queries with specific constraints (color, price, category)
                - sql_query: Database-level questions (counts, aggregations, user data)
                - hybrid: Combination of semantic + filters
            """},
            {"role": "user", "content": question}
        ]
    )
    
    return route

# Usage
question = "Find red cotton shirts under $50"
route = route_query(question)

if route.route == "hybrid":
    results = retriever.search(
        question,
        **route.extracted_filters
    )
elif route.route == "sql_query":
    results = text2sql.query(question)

Metadata Extraction Best Practices

Define Clear Taxonomies

# taxonomy.yaml
product_categories:
  - shirts
  - pants
  - shoes
  - accessories

color_palette:
  - red
  - blue
  - green
  - black
  - white

price_ranges:
  - under_50: [0, 50]
  - 50_100: [50, 100]
  - 100_200: [100, 200]
  - over_200: [200, None]

Validate Extracted Data

from pydantic import validator

class ValidatedMetadata(ProductMetadata):
    @validator('color')
    def validate_colors(cls, v):
        valid_colors = {"red", "blue", "green", "black", "white", "gray", "brown"}
        for color in v:
            if color.lower() not in valid_colors:
                raise ValueError(f"Invalid color: {color}")
        return v
    
    @validator('price_range')
    def validate_price(cls, v):
        valid_ranges = {"under_50", "50_100", "100_200", "over_200"}
        if v not in valid_ranges:
            raise ValueError(f"Invalid price range: {v}")
        return v

Performance Comparison

Pure Semantic vs. Hybrid Search

def compare_approaches(test_queries):
    """Compare pure semantic vs. hybrid retrieval"""
    
    results = {
        'semantic': {'correct': 0, 'total': 0},
        'hybrid': {'correct': 0, 'total': 0}
    }
    
    for query, expected_product in test_queries:
        # Semantic only
        semantic_results = vector_search(query, k=5)
        if expected_product in [r['id'] for r in semantic_results]:
            results['semantic']['correct'] += 1
        results['semantic']['total'] += 1
        
        # Hybrid with metadata
        hybrid_results = hybrid_retriever.search(query, k=5)
        if expected_product in [r['id'] for r in hybrid_results]:
            results['hybrid']['correct'] += 1
        results['hybrid']['total'] += 1
    
    # Calculate improvements
    semantic_accuracy = results['semantic']['correct'] / results['semantic']['total']
    hybrid_accuracy = results['hybrid']['correct'] / results['hybrid']['total']
    
    improvement = (hybrid_accuracy - semantic_accuracy) / semantic_accuracy * 100
    
    print(f"Semantic Search: {semantic_accuracy:.2%}")
    print(f"Hybrid Search: {hybrid_accuracy:.2%}")
    print(f"Improvement: {improvement:.1f}%")

Expected Outcomes

By incorporating structured metadata:

  • 30-50% improvement on constraint-based queries
  • Ability to answer database-level questions
  • Better precision when multiple filters apply
  • More accurate results for specific attribute searches

Common Issues

Over-Filtering

Problem: Too many filters eliminate all results

Solution: Implement progressive relaxation

def search_with_fallback(query, filters):
    results = hybrid_search(query, filters)
    
    if len(results) < 3:
        # Relax least important filters
        relaxed_filters = remove_least_important(filters)
        results = hybrid_search(query, relaxed_filters)
    
    return results

Inconsistent Metadata

Problem: Extraction produces varying attributes

Solution: Use strict schemas with validation and examples

# Provide few-shot examples
few_shot_examples = [
    {"description": "Red cotton t-shirt $45", "metadata": {...}},
    {"description": "Blue denim jeans $89", "metadata": {...}}
]

Next Steps

  • Define your metadata taxonomy
  • Extract structured data from existing documents
  • Implement hybrid retrieval
  • Compare performance on constraint-based queries
  • Consider SQL integration for database questions

Additional Resources