Multilingual Embeddings
Handling multiple languages in RAG systems with multilingual embedding models
Overview
Multilingual embeddings enable RAG systems to work across languages, allowing users to query in one language and retrieve relevant documents in another. This is essential for global applications, multilingual support, and cross-lingual information retrieval.
Why Multilingual Embeddings?
Cross-Language Retrieval
Users can query in their native language and find relevant content regardless of document language:
# Query in English, find Spanish documents
query = "How do I reset my password?"
# Matches: "¿Cómo restablezco mi contraseña?"
# Query in French, find English documents
query = "politique de remboursement"
# Matches: "refund policy"
Single Model for All Languages
Instead of maintaining separate models per language:
- Before: English model + Spanish model + French model...
- After: One multilingual model handles all
Semantic Similarity Across Languages
Multilingual models understand that these mean the same thing:
- English: "Thank you"
- Spanish: "Gracias"
- French: "Merci"
- German: "Danke"
All map to similar vectors in embedding space.
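This can be checked directly by embedding the greetings and comparing their vectors; a minimal sketch using sentence-transformers (the model is one of those covered in the next section):
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

greetings = ["Thank you", "Gracias", "Merci", "Danke"]
embeddings = model.encode(greetings)

# Pairwise cosine similarities; cross-language pairs should score high
print(util.cos_sim(embeddings, embeddings))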
Popular Multilingual Models
paraphrase-multilingual-mpnet-base-v2
Coverage: 50+ languages
Dimensions: 768
Best for: General multilingual RAG
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
# Encode in different languages
en_emb = model.encode("Hello, how are you?")
es_emb = model.encode("Hola, ¿cómo estás?")
fr_emb = model.encode("Bonjour, comment allez-vous?")
# All are semantically similar
from sentence_transformers import util
print(util.cos_sim(en_emb, es_emb)) # High similarity: ~0.85
print(util.cos_sim(en_emb, fr_emb)) # High similarity: ~0.82
distiluse-base-multilingual-cased-v2
Coverage: 50+ languages
Dimensions: 512
Best for: Faster inference, lower resource usage
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')
# Smaller, faster, but slightly lower quality
embedding = model.encode("你好") # Chinese
LaBSE (Language-agnostic BERT Sentence Embedding)
Coverage: 109 languages
Dimensions: 768
Best for: Maximum language coverage
model = SentenceTransformer('sentence-transformers/LaBSE')
# Supports rare languages
embedding = model.encode("สวัสดี") # Thai
Multilingual-E5
Coverage: 100 languages
Dimensions: 1024 (large), 768 (base)
Best for: State-of-the-art multilingual retrieval
model = SentenceTransformer('intfloat/multilingual-e5-large')
# Prefix queries and passages
query = "query: How to reset password?"
passage = "passage: Para restablecer su contraseña..."
query_emb = model.encode(query)
passage_emb = model.encode(passage)
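The prefixes matter for the E5 family: the models are trained with them, and dropping them degrades retrieval quality. Scoring is plain cosine similarity; a short follow-up to the snippet above:
from sentence_transformers import util

# Cosine similarity between the prefixed query and passage embeddings
print(util.cos_sim(query_emb, passage_emb))
# Optionally pass normalize_embeddings=True to encode() so dot product equals cosine similarity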
Implementation Guide
Basic Multilingual RAG
from sentence_transformers import SentenceTransformer
import lancedb
class MultilingualRAG:
    def __init__(self):
        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.db = lancedb.connect("./multilingual-db")

    def add_documents(self, documents, languages):
        """Add documents in multiple languages"""
        data = []
        for i, (doc, lang) in enumerate(zip(documents, languages)):
            data.append({
                "id": i,
                "text": doc,
                "language": lang,
                "vector": self.model.encode(doc).tolist()
            })
        self.table = self.db.create_table("docs", data, mode="overwrite")

    def search(self, query, k=5):
        """Search across all languages"""
        query_vector = self.model.encode(query)
        results = self.table.search(query_vector).limit(k).to_list()
        return [{
            'text': r['text'],
            'language': r['language']
        } for r in results]
# Usage
rag = MultilingualRAG()
# Add documents in different languages
rag.add_documents(
    documents=[
        "How to reset your password",
        "Cómo restablecer tu contraseña",
        "Comment réinitialiser votre mot de passe",
        "So setzen Sie Ihr Passwort zurück"
    ],
    languages=['en', 'es', 'fr', 'de']
)
# Query in any language
results = rag.search("password reset", k=2)
for r in results:
    print(f"[{r['language']}] {r['text']}")
Language Detection
Automatically detect the query language:
from langdetect import detect
def detect_language(text):
    """Detect the language of input text"""
    try:
        return detect(text)
    except Exception:
        return 'unknown'
# Usage
query = "¿Cómo puedo obtener un reembolso?"
lang = detect_language(query)
print(f"Detected language: {lang}") # 'es'
Language-Specific Filtering
Filter results by language if needed:
def search_with_language_filter(query, target_language=None, k=5):
    """Search with optional language filtering"""
    query_vector = model.encode(query)
    if target_language:
        # Filter to a specific language
        results = table.search(query_vector)\
            .where(f"language = '{target_language}'")\
            .limit(k)\
            .to_list()
    else:
        # Search across all languages
        results = table.search(query_vector).limit(k).to_list()
    return results

# Search only Spanish documents
spanish_results = search_with_language_filter(
    "password reset",
    target_language='es'
)
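The detector and the filter compose naturally when you want results restricted to the query's own language (reusing detect_language and search_with_language_filter from above):
query = "¿Cómo puedo obtener un reembolso?"
lang = detect_language(query)  # 'es'
same_language_results = search_with_language_filter(query, target_language=lang)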
Best Practices
1. Consistent Text Preprocessing
Normalize text across languages:
import unicodedata

def normalize_text(text, language):
    """Normalize text for embedding"""
    # Lowercase
    text = text.lower()
    # Collapse extra whitespace
    text = ' '.join(text.split())
    # Language-specific normalization
    if language == 'de':
        # German: ß → ss
        text = text.replace('ß', 'ss')
    elif language == 'fr':
        # French: strip accents (optional; accents usually carry meaning,
        # so only do this if your corpus is inconsistent about them)
        text = ''.join(
            c for c in unicodedata.normalize('NFD', text)
            if unicodedata.category(c) != 'Mn'
        )
    return text
2. Handle Mixed-Language Documents
For documents containing multiple languages:
def split_by_language(document):
    """Split a document into language-specific chunks"""
    from langdetect import detect
    chunks = []
    for paragraph in document.split('\n\n'):
        if paragraph.strip():
            lang = detect(paragraph)
            chunks.append({
                'text': paragraph,
                'language': lang
            })
    return chunks
# Embed each chunk separately
mixed_doc = """
English paragraph here.

Párrafo en español aquí.

Paragraphe français ici.
"""
chunks = split_by_language(mixed_doc)
for chunk in chunks:
    embedding = model.encode(chunk['text'])
    # Store each embedding with its language metadata
3. Quality Varies by Language
Multilingual models perform differently across languages:
High Quality:
- English, Spanish, French, German, Chinese
- Well-represented in training data
Medium Quality:
- Italian, Portuguese, Russian, Japanese
- Moderate training data
Lower Quality:
- Low-resource languages
- Limited training data
Solution: Fine-tune on your specific languages if needed.
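A minimal fine-tuning sketch, assuming you can gather a small set of parallel (source, translation) sentence pairs from your own domain; MultipleNegativesRankingLoss pulls each aligned pair together and pushes apart the other pairs in the batch:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Parallel pairs from your own corpus (illustrative examples)
train_examples = [
    InputExample(texts=["How to reset your password", "Cómo restablecer tu contraseña"]),
    InputExample(texts=["Refund policy", "Politique de remboursement"]),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=10,
)
model.save("./finetuned-multilingual-model")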
Language-Specific Optimizations
Monolingual Models for Critical Languages
For your primary language, consider using a specialized model:
class HybridMultilingualRAG:
    def __init__(self):
        # English-specific model (higher quality for English)
        self.en_model = SentenceTransformer('all-mpnet-base-v2')
        # Multilingual model for other languages
        self.multi_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

    def encode(self, text, language):
        """Use the best model for each language"""
        if language == 'en':
            return self.en_model.encode(text)
        else:
            return self.multi_model.encode(text)
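One caveat with this approach: the two models produce vectors in unrelated embedding spaces, so documents embedded with the English model can only be searched with English-model query vectors. In practice that means keeping a separate index (or table) per model; a rough sketch of the routing:
rag = HybridMultilingualRAG()

# Vectors from the two models are not comparable, so keep one table per model
en_vector = rag.encode("How to reset your password", language='en')      # goes into the English-only table
es_vector = rag.encode("Cómo restablecer tu contraseña", language='es')  # goes into the multilingual table

# Queries are routed the same way: an English query searches only the English table
query_vector = rag.encode("password reset", language='en')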
Translation-Based Approach
Alternative: Translate everything to English:
from googletrans import Translator
translator = Translator()
def translate_to_english(text, source_lang):
    """Translate text to English"""
    if source_lang == 'en':
        return text
    result = translator.translate(text, src=source_lang, dest='en')
    return result.text
# Translate query, use English model
query_es = "¿Cómo restablezco mi contraseña?"
query_en = translate_to_english(query_es, 'es')
# Use high-quality English model
en_model = SentenceTransformer('all-mpnet-base-v2')
embedding = en_model.encode(query_en)
Pros:
- Use best English models
- Simpler infrastructure
Cons:
- Translation latency
- Translation errors
- Additional API costs
Evaluation
Cross-Lingual Retrieval Metrics
Test retrieval across language pairs:
def evaluate_cross_lingual(model, test_pairs):
    """
    test_pairs: [(query_lang, query_text, doc_lang, expected_doc_id)]
    Assumes search() returns the ids of the top-k retrieved documents.
    """
    correct = 0
    for query_lang, query, doc_lang, expected_doc in test_pairs:
        query_emb = model.encode(query)
        results = search(query_emb, k=5)
        if expected_doc in results:
            correct += 1
    accuracy = correct / len(test_pairs)
    return accuracy
# Test dataset
test_pairs = [
    ('en', 'password reset', 'es', 'doc_es_123'),
    ('fr', 'politique de remboursement', 'en', 'doc_en_456'),
    ('de', 'Versandrichtlinie', 'en', 'doc_en_789'),
]
accuracy = evaluate_cross_lingual(model, test_pairs)
print(f"Cross-lingual accuracy: {accuracy:.2%}")
Per-Language Performance
Track performance by language:
def evaluate_by_language(model, test_data):
    """Measure retrieval performance for each language"""
    results = {}
    for lang in ['en', 'es', 'fr', 'de', 'zh']:
        lang_queries = [q for q in test_data if q['language'] == lang]
        results[lang] = evaluate_queries(model, lang_queries)
    return results
# Example output
# {'en': 0.92, 'es': 0.87, 'fr': 0.85, 'de': 0.83, 'zh': 0.78}
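evaluate_queries is left abstract above; one possible implementation, assuming each test query carries the id of its expected document and that search() returns ranked document ids (both are assumptions, not part of the snippet above):
def evaluate_queries(model, queries, k=5):
    """Fraction of queries whose expected document appears in the top-k results."""
    if not queries:
        return 0.0
    hits = 0
    for q in queries:
        query_emb = model.encode(q['text'])
        retrieved_ids = search(query_emb, k=k)  # your retrieval function, as above
        if q['expected_doc'] in retrieved_ids:
            hits += 1
    return hits / len(queries)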
Common Challenges
1. Code-Switching
Users may mix languages in a single query:
# Mixed English-Spanish
query = "How do I cambiar mi contraseña?"
# Solution: Multilingual models handle this naturally
embedding = model.encode(query)
2. Language-Specific Characters
Handle special characters properly:
# Preserve language-specific characters
text = "Müller" # German
text = "São Paulo" # Portuguese
text = "北京" # Chinese
# Don't strip accents/characters - they carry meaning
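If you need any normalization at all for these scripts, Unicode canonical (NFC) normalization is a safe choice: it standardizes the byte representation without discarding accents or CJK characters.
import unicodedata

# NFC composes characters into their canonical form but keeps them intact
print(unicodedata.normalize('NFC', "Müller"))     # 'Müller'
print(unicodedata.normalize('NFC', "São Paulo"))  # 'São Paulo'
print(unicodedata.normalize('NFC', "北京"))        # '北京'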
3. Right-to-Left Languages
Arabic and Hebrew require special handling:
# Ensure proper text direction
arabic_text = "كيف أعيد تعيين كلمة المرور؟"
# Model handles RTL internally, but UI needs consideration
Next Steps
- Embedding Fundamentals - Understand embedding basics
- Choosing Embedding Models - Select the right model
- Fine-Tuning Embeddings - Improve multilingual performance