Security & Privacy
Protect sensitive data and ensure compliance in RAG systems.
Overview
RAG systems often handle sensitive data. This guide covers PII detection and redaction, access control, encryption of stored text, audit logging, and compliance (GDPR, HIPAA).
PII Detection & Redaction
import re
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
class PIIProtector:
    """Detect and redact personally identifiable information using Presidio.

    Detection is delegated to a Presidio ``AnalyzerEngine`` and redaction to
    a Presidio ``AnonymizerEngine``; both are created in ``__init__``.
    """

    # Presidio entity names to scan for. Presidio registers its US Social
    # Security Number recognizer as "US_SSN"; a bare "SSN" matches no
    # built-in recognizer, so SSNs would be left unredacted.
    DEFAULT_ENTITIES = (
        "PHONE_NUMBER",
        "EMAIL_ADDRESS",
        "CREDIT_CARD",
        "US_SSN",
    )

    def __init__(self):
        """Create the analyzer and anonymizer engines."""
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def detect_pii(self, text):
        """Return the analyzer's findings for *text*.

        Args:
            text: The string to scan (analysed as English).

        Returns:
            Whatever ``self.analyzer.analyze`` returns for the entities in
            ``DEFAULT_ENTITIES``.
        """
        return self.analyzer.analyze(
            text=text,
            language='en',
            entities=list(self.DEFAULT_ENTITIES),
        )

    def redact_pii(self, text):
        """Return *text* with every detected PII span anonymized.

        Args:
            text: The string to redact.

        Returns:
            The ``text`` attribute of the anonymizer's result.
        """
        results = self.detect_pii(text)
        anonymized = self.anonymizer.anonymize(text, results)
        return anonymized.text
Access Control
class SecureRAG:
    """RAG entry point that enforces per-user permissions around retrieval.

    ``query`` calls ``self.has_access``, ``self.retrieve`` and
    ``self.generate``; none of them is defined in this snippet, so they must
    be supplied by the surrounding implementation.
    """

    def query(self, user_id, query):
        """Answer *query* using only documents that *user_id* may read.

        Args:
            user_id: Identifier of the requesting user.
            query: The user's query.

        Returns:
            The result of ``self.generate(query, filtered_docs)``.

        Raises:
            PermissionError: If ``self.has_access`` rejects the request.
        """
        # Check user permissions before touching the retriever.
        if not self.has_access(user_id, query):
            raise PermissionError("Access denied")

        # Filter results by user permissions so that documents the user may
        # not read are never handed to generate().
        docs = self.retrieve(query)
        filtered_docs = [d for d in docs if self.can_access(user_id, d)]
        return self.generate(query, filtered_docs)

    def can_access(self, user_id, document):
        """Return whether *user_id* is listed in the document's allow-list.

        The check fails closed: a document with no metadata, no
        ``'allowed_users'`` entry, or an explicit ``None`` allow-list is
        treated as accessible to nobody instead of raising.

        Args:
            user_id: Identifier of the requesting user.
            document: Object whose ``metadata`` mapping may contain an
                ``'allowed_users'`` collection.

        Returns:
            True if *user_id* is in the allow-list, otherwise False.
        """
        metadata = getattr(document, 'metadata', None) or {}
        # `or ()` also covers an 'allowed_users' key stored as None, which
        # would otherwise make the membership test raise TypeError.
        allowed_users = metadata.get('allowed_users') or ()
        return user_id in allowed_users
Data Encryption
from cryptography.fernet import Fernet
class EncryptedVectorDB:
    """Vector store wrapper that encrypts document text with Fernet.

    Only the text is encrypted; vectors are stored as given so the backing
    store can still search over them.
    """

    def __init__(self, key, db=None):
        """Create the cipher and, optionally, attach the backing store.

        Args:
            key: Key passed to ``Fernet(key)``.
            db: Optional backing store. ``store`` calls ``db.insert(record)``
                and ``retrieve`` calls ``db.search(query_vector)``.
        """
        self.cipher = Fernet(key)
        # Assign only when supplied so code that provides ``db`` another way
        # (subclass attribute, assignment after construction) keeps working.
        if db is not None:
            self.db = db

    def store(self, text, vector):
        """Encrypt *text* and insert it together with *vector*.

        Args:
            text: Plaintext string; encoded to bytes before encryption.
            vector: Embedding stored unencrypted alongside the ciphertext.
        """
        # Encrypt text before storage
        encrypted_text = self.cipher.encrypt(text.encode())
        self.db.insert({
            'text': encrypted_text,
            'vector': vector,  # Vectors not encrypted (needed for search)
        })

    def retrieve(self, query_vector):
        """Search by *query_vector* and return the hits with text decrypted.

        Each result's ``text`` attribute is replaced in place with the
        decrypted, decoded string.

        Args:
            query_vector: Vector passed to ``self.db.search``.

        Returns:
            The results object returned by ``self.db.search``.
        """
        results = self.db.search(query_vector)
        # Decrypt results
        for r in results:
            r.text = self.cipher.decrypt(r.text).decode()
        return results
Audit Logging
class AuditLogger:
    """Write one audit record per query to the ``audit_log`` collection.

    Records are written through ``self.db.insert``; ``self.db`` is not set
    in this snippet and must be provided by the surrounding implementation.
    """

    def log_access(self, user_id, query, documents, ip_address=None):
        """Record that *user_id* ran *query* and was served *documents*.

        Args:
            user_id: Identifier of the user who issued the query.
            query: The query, stored as given.
            documents: Iterable of objects with an ``id`` attribute.
            ip_address: Client address to record. When omitted it is read
                from ``request.remote_addr``.
        """
        # Imported here because the file has no module-level datetime import.
        from datetime import datetime, timezone

        if ip_address is None:
            # NOTE(review): `request` is not defined in this snippet;
            # presumably the web framework's current-request object - confirm.
            ip_address = request.remote_addr

        log_entry = {
            # Timezone-aware UTC: naive utcnow() values are ambiguous and the
            # method is deprecated since Python 3.12.
            'timestamp': datetime.now(timezone.utc),
            'user_id': user_id,
            'query': query,
            'documents_accessed': [d.id for d in documents],
            'ip_address': ip_address,
        }
        self.db.insert('audit_log', log_entry)
Compliance (GDPR, HIPAA)
class ComplianceManager:
    """Handle per-user data deletion and export requests.

    Uses ``self.db`` (query records) and ``self.vector_db`` (documents);
    neither is set in this snippet, so both must be provided by the
    surrounding implementation.
    """

    def delete_user_data(self, user_id):
        """GDPR right to be forgotten.

        Deletes the user's rows from the ``queries`` collection and the
        user's documents from the vector store.

        Args:
            user_id: Identifier of the user whose data is removed.
        """
        # Delete user queries
        self.db.delete('queries', {'user_id': user_id})
        # Delete user documents
        self.vector_db.delete({'metadata.user_id': user_id})

    def export_user_data(self, user_id):
        """GDPR right to data portability.

        Args:
            user_id: Identifier of the user whose data is exported.

        Returns:
            A JSON string with two keys, ``queries`` and ``documents``.
        """
        # Imported here because the file has no module-level json import.
        import json

        data = {
            'queries': self.db.find('queries', {'user_id': user_id}),
            'documents': self.vector_db.find({'metadata.user_id': user_id}),
        }
        # NOTE(review): json.dumps raises TypeError for values that are not
        # JSON-serializable - confirm both stores return plain JSON types.
        return json.dumps(data)
Next Steps
- Testing - Test security controls
- Observability - Monitor access patterns