Multi-modal RAG
Extend RAG to handle images, tables, charts, and other non-text content.
Overview
Modern documents contain more than text: images, charts, tables, and diagrams. Multi-modal RAG extends the retrieval pipeline so these content types can be indexed, retrieved, and cited alongside plain text.
Vision-Language Models
Use a vision-language model to turn each image into a detailed text description that can be chunked and embedded like any other passage.

import base64

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def encode_image(image_path):
    """Read an image file and return it as a base64-encoded string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def extract_image_content(image_path):
    """Ask a vision-language model for a search-friendly description of an image."""
    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="gpt-4o",  # gpt-4-vision-preview has been deprecated; gpt-4o accepts image inputs
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in detail for a search index."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
            ],
        }],
    )
    return response.choices[0].message.content
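The returned description is ordinary text, so it can go straight into your chunking and embedding pipeline. A minimal usage sketch, assuming figures have already been extracted into a local figures/ directory (the directory name and record layout are illustrative):

from pathlib import Path

# Describe every extracted figure so the text can be embedded alongside
# the rest of the document (paths here are hypothetical).
image_records = []
for image_path in Path("figures").glob("*.jpg"):
    image_records.append({
        "image_path": str(image_path),
        "description": extract_image_content(str(image_path)),
    })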
Multi-modal Embeddings
CLIP encodes images and text into a shared embedding space, so a text query can retrieve images directly without an intermediate caption.

from transformers import CLIPProcessor, CLIPModel


class MultiModalEmbedder:
    """Embed images and text into CLIP's shared vector space."""

    def __init__(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def embed_image(self, image):
        # Image and text features live in the same space, so a text query
        # can be compared directly against image vectors.
        inputs = self.processor(images=image, return_tensors="pt")
        image_features = self.model.get_image_features(**inputs)
        return image_features.detach().numpy()

    def embed_text(self, text):
        # CLIP's text encoder is capped at 77 tokens, so truncate long inputs.
        inputs = self.processor(text=[text], return_tensors="pt", padding=True, truncation=True)
        text_features = self.model.get_text_features(**inputs)
        return text_features.detach().numpy()
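Because both modalities land in the same space, text-to-image retrieval reduces to a nearest-neighbor search. A minimal sketch using cosine similarity; the image paths and query are illustrative:

import numpy as np
from PIL import Image

embedder = MultiModalEmbedder()

# Embed a handful of images (the paths are hypothetical).
paths = ["figures/architecture.png", "figures/revenue_chart.png"]
image_vecs = np.vstack([embedder.embed_image(Image.open(p)) for p in paths])

# Embed the text query and rank images by cosine similarity.
query_vec = embedder.embed_text("quarterly revenue by region")[0]
scores = image_vecs @ query_vec / (
    np.linalg.norm(image_vecs, axis=1) * np.linalg.norm(query_vec)
)
best_match = paths[int(scores.argmax())]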
Table Handling
Tables embed poorly as raw text, so convert each row into a sentence for retrieval while keeping a markdown copy to show the LLM at generation time.

def process_table(table_df):
    """Convert a pandas DataFrame into retrieval-friendly text, keeping the raw table too."""
    # Turn each row into a natural-language sentence. The column names here
    # (Category, Metric, Value) are only an example; adapt them to your schema.
    descriptions = []
    for _, row in table_df.iterrows():
        desc = f"The {row['Category']} has {row['Metric']} of {row['Value']}"
        descriptions.append(desc)

    # Also keep the markdown rendering for context at generation time
    # (DataFrame.to_markdown requires the tabulate package).
    markdown = table_df.to_markdown()

    return {
        'descriptions': descriptions,
        'markdown': markdown,
        'type': 'table',
    }
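For example, a small metrics table (the data below is made up) yields one sentence per row plus a markdown copy stored as metadata:

import pandas as pd

# Illustrative data only.
df = pd.DataFrame({
    "Category": ["EMEA", "APAC", "Americas"],
    "Metric": ["revenue", "revenue", "revenue"],
    "Value": ["$1.2M", "$0.9M", "$2.4M"],
})

table_doc = process_table(df)
# table_doc["descriptions"][0] == "The EMEA has revenue of $1.2M"
# Each sentence is embedded separately; the markdown is kept for the prompt.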
Chart/Diagram Extraction
Charts often carry data that never appears in the surrounding prose; a vision-language model can pull it out as text.

def extract_chart_data(image_path):
    """Use a vision-language model to extract data from a chart image."""
    base64_image = encode_image(image_path)
    prompt = """
    Extract the data from this chart. Return:
    1. Chart type (bar, line, pie, etc.)
    2. X and Y axis labels
    3. Data points
    4. Key insights
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
            ],
        }],
    )
    return response.choices[0].message.content
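If you want the extracted data in machine-readable form, you can ask the model for JSON and parse the reply. A minimal sketch; the prompt wording, key names, and fallback handling are assumptions, not part of the original:

import json

def extract_chart_data_structured(image_path):
    """Variant of extract_chart_data that requests JSON for structured metadata."""
    base64_image = encode_image(image_path)
    prompt = (
        "Extract the data from this chart. Respond with only valid JSON using the keys: "
        "chart_type, x_label, y_label, data_points, insights."
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
            ],
        }],
    )
    raw = response.choices[0].message.content
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # The model may wrap the JSON in prose or markdown; fall back to raw text.
        return {"raw": raw}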
Unified Storage
Store text chunks and image descriptions in the same vector index so a single query retrieves across modalities.

class MultiModalVectorDB:
    """Stores text chunks and image descriptions in one vector index.
    extract_text, chunk_text, and extract_images are assumed document-parsing
    helpers; self.db is any vector store exposing an insert() method."""

    def add_document(self, doc_path):
        # Extract and chunk the text content
        text = extract_text(doc_path)
        text_chunks = chunk_text(text)

        # Extract embedded images (figures, charts, diagrams)
        images = extract_images(doc_path)

        # Store text chunks and images side by side in the same index
        for chunk in text_chunks:
            self.db.insert({
                'type': 'text',
                'content': chunk,
                'vector': self.embed_text(chunk)
            })

        for img in images:
            # Index the VLM description so text queries can match the image
            img_description = extract_image_content(img)
            self.db.insert({
                'type': 'image',
                'content': img_description,
                'image_path': img,
                'vector': self.embed_image(img)
            })
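The class above only covers ingestion. Below is a query-side sketch of a method you could add to MultiModalVectorDB; it assumes self.db exposes a search(vector, top_k) method returning the stored records, which is a hypothetical API to adapt to your vector store:

    def query(self, question, top_k=5):
        """Retrieve the closest text chunks and image descriptions for a question.
        Assumes self.db.search(vector, top_k) exists; swap in your store's API."""
        query_vector = self.embed_text(question)
        hits = self.db.search(query_vector, top_k=top_k)

        context_blocks = []
        for hit in hits:
            if hit['type'] == 'image':
                # Keep the file path so the application can show the original image.
                context_blocks.append(f"[Image: {hit['image_path']}] {hit['content']}")
            else:
                context_blocks.append(hit['content'])
        return context_blocks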
Next Steps
- Graph RAG - Relationship-aware retrieval
- Document Parsing - Extract multi-modal content