Multi-modal RAG

Extend RAG to handle images, tables, charts, and other non-text content.

Overview

Modern documents contain more than text: images, charts, tables, and diagrams. Multi-modal RAG extends the retrieval pipeline to index and retrieve these content types alongside plain text.

Vision-Language Models

import base64
from openai import OpenAI

client = OpenAI()

def encode_image(image_path):
    """Read an image file and return its base64-encoded contents."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_image_content(image_path):
    """Ask a vision-language model to describe an image for indexing."""
    base64_image = encode_image(image_path)

    response = client.chat.completions.create(
        model="gpt-4o",  # successor to the deprecated gpt-4-vision-preview
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in detail for a search index."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
            ]
        }]
    )

    return response.choices[0].message.content
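The returned description is ordinary text, so it can be chunked and embedded like any other passage. A quick usage sketch, where figure.png is a placeholder path:

description = extract_image_content("figure.png")  # placeholder path
# e.g. "A bar chart comparing quarterly revenue across four regions..."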

Multi-modal Embeddings

import torch
from transformers import CLIPProcessor, CLIPModel

class MultiModalEmbedder:
    """Embed images and text into CLIP's shared vector space."""

    def __init__(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def embed_image(self, image):
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
        # L2-normalize so dot products equal cosine similarity
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features.numpy()

    def embed_text(self, text):
        # CLIP's text encoder is capped at 77 tokens, so truncate long inputs
        inputs = self.processor(text=[text], return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        return text_features.numpy()
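A quick check that the two modalities land in the same space, assuming Pillow is installed and photo.jpg is a placeholder path: embed an image and a caption, then compare them with a dot product (which is cosine similarity here, since the vectors are normalized).

from PIL import Image

embedder = MultiModalEmbedder()
img_vec = embedder.embed_image(Image.open("photo.jpg"))  # placeholder path
txt_vec = embedder.embed_text("a photo of a dog")
similarity = (img_vec @ txt_vec.T).item()  # higher = closer match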

Table Handling

def process_table(table_df):
    """Convert a table into text for embedding, keeping markdown for display.

    Assumes the DataFrame has 'Category', 'Metric', and 'Value' columns;
    adapt the sentence template to your own schema.
    """
    # Convert each row to a natural-language sentence for embedding
    descriptions = []
    for _, row in table_df.iterrows():
        desc = f"The {row['Category']} has {row['Metric']} of {row['Value']}"
        descriptions.append(desc)

    # Also keep markdown so the LLM sees the original structure at answer time
    # (DataFrame.to_markdown requires the 'tabulate' package)
    markdown = table_df.to_markdown()

    return {
        'descriptions': descriptions,
        'markdown': markdown,
        'type': 'table'
    }
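A minimal example with a toy DataFrame matching the assumed column names:

import pandas as pd

df = pd.DataFrame({
    'Category': ['Revenue', 'Costs'],
    'Metric': ['Q1 total', 'Q1 total'],
    'Value': ['$1.2M', '$0.8M'],
})
result = process_table(df)
# result['descriptions'][0] == "The Revenue has Q1 total of $1.2M"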

Chart/Diagram Extraction

def extract_chart_data(image_path):
    """Use a vision-language model to extract data from charts."""
    base64_image = encode_image(image_path)

    prompt = """
    Extract the data from this chart. Return:
    1. Chart type (bar, line, pie, etc.)
    2. X and Y axis labels
    3. Data points
    4. Key insights
    """

    response = client.chat.completions.create(
        model="gpt-4o",  # successor to the deprecated gpt-4-vision-preview
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
            ]
        }]
    )

    return response.choices[0].message.content
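If you intend to store the extracted data rather than just index the prose, it can help to request JSON. A sketch of that variant, reusing the same client; the key names in the prompt are an assumption, pick whatever schema suits your index:

import json

def extract_chart_data_json(image_path):
    """Variant that asks for machine-readable JSON instead of prose."""
    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},  # force valid JSON output
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Extract this chart as a JSON object with keys: "
                                         "chart_type, x_label, y_label, data_points, insights."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
            ]
        }]
    )
    return json.loads(response.choices[0].message.content)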

Unified Storage

class MultiModalVectorDB:
    """Store text chunks and image descriptions in a single vector index.

    extract_text, chunk_text, and extract_images are document-parsing helpers
    you supply (e.g. built on a PDF library); self.db stands in for any vector
    store with an insert() method. embed_text and embed_image should come from
    the same model (e.g. MultiModalEmbedder above) so both modalities share
    one vector space.
    """

    def add_document(self, doc_path):
        # Extract and chunk the text layer
        text = extract_text(doc_path)
        text_chunks = chunk_text(text)

        # Extract embedded images
        images = extract_images(doc_path)

        # Store both modalities in the same index
        for chunk in text_chunks:
            self.db.insert({
                'type': 'text',
                'content': chunk,
                'vector': self.embed_text(chunk)
            })

        for img in images:
            # Index the VLM description so text queries can match images
            img_description = extract_image_content(img)
            self.db.insert({
                'type': 'image',
                'content': img_description,
                'image_path': img,
                'vector': self.embed_image(img)
            })
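The class above only covers ingestion. A matching query method, assuming self.db exposes a search(vector, top_k) method (hypothetical, adapt to your store's API); because text and images share one embedding space, a single text query retrieves both:

    def query(self, question, top_k=5):
        """Retrieve the closest items of any modality for a text query."""
        query_vector = self.embed_text(question)
        results = self.db.search(query_vector, top_k=top_k)  # hypothetical API

        # Text chunks go to the LLM directly; for images, pass the stored
        # description (or the image itself, if using a vision-capable model)
        return [r['content'] for r in results]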

Next Steps