Security & Privacy

Protect sensitive data and ensure compliance in RAG systems.

Overview

RAG systems often ingest, index, and surface sensitive data. This guide covers PII detection and redaction, access control, data encryption, audit logging, and compliance (GDPR, HIPAA).

PII Detection & Redaction

import re
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

class PIIProtector:
    """Detect and redact personally identifiable information (PII) in text.

    Detection is delegated to a Presidio ``AnalyzerEngine`` and redaction to
    a Presidio ``AnonymizerEngine``; both are created once per instance.
    """

    # Presidio entity types to search for. The US Social Security number
    # recognizer is registered as "US_SSN"; a bare "SSN" matches no
    # predefined recognizer, so SSNs would silently go undetected.
    ENTITIES = ("PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD", "US_SSN")

    def __init__(self):
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def detect_pii(self, text):
        """Return the analyzer's findings for ``text``.

        Args:
            text: The English-language string to scan.

        Returns:
            Whatever ``AnalyzerEngine.analyze`` returns for the entity types
            listed in ``ENTITIES``.
        """
        return self.analyzer.analyze(
            text=text,
            language='en',
            entities=list(self.ENTITIES),
        )

    def redact_pii(self, text):
        """Return ``text`` with every detected PII span anonymized.

        Args:
            text: The string to scan and redact.

        Returns:
            The ``text`` attribute of the anonymizer's result.
        """
        results = self.detect_pii(text)
        anonymized = self.anonymizer.anonymize(
            text=text,
            analyzer_results=results,
        )
        return anonymized.text

Access Control

class SecureRAG:
    """RAG entry point that applies permission checks around retrieval.

    Relies on ``has_access``, ``retrieve`` and ``generate`` being provided
    elsewhere on the instance; they are not defined in this class.
    """

    def query(self, user_id, query):
        """Answer ``query`` using only documents ``user_id`` may read.

        Args:
            user_id: Identifier of the requesting user.
            query: The user's query.

        Returns:
            The result of ``generate`` called with the query and the
            permission-filtered documents.

        Raises:
            PermissionError: If ``has_access`` rejects the request.
        """
        if self.has_access(user_id, query):
            candidates = self.retrieve(query)
            # Drop every retrieved document the user is not allowed to see
            # before anything reaches the generator.
            visible = list(
                filter(lambda doc: self.can_access(user_id, doc), candidates)
            )
            return self.generate(query, visible)

        raise PermissionError("Access denied")

    def can_access(self, user_id, document):
        """Return True when ``user_id`` is listed in the document's ACL.

        The ACL is read from ``document.metadata['allowed_users']``; a
        document without that key is treated as accessible to nobody.
        """
        allowed_users = document.metadata.get('allowed_users', [])
        return user_id in allowed_users

Data Encryption

from cryptography.fernet import Fernet

class EncryptedVectorDB:
    """Vector store wrapper that keeps document text encrypted at rest.

    Text is encrypted with Fernet before insertion and decrypted on
    retrieval. Vectors are stored as-is so the backend can search them.
    """

    def __init__(self, key, db=None):
        """Create the wrapper.

        Args:
            key: Fernet key used to build the cipher.
            db: Optional backing store. It must provide ``insert(record)``
                and ``search(query_vector)``; search results must expose a
                writable ``text`` attribute. When omitted, ``self.db`` is
                left untouched so a subclass or the caller can supply it.
        """
        self.cipher = Fernet(key)
        # Only assign when given, so a ``db`` defined on a subclass (or set
        # later by the caller) is not overwritten with None.
        if db is not None:
            self.db = db

    def store(self, text, vector):
        """Encrypt ``text`` and insert it together with ``vector``.

        Args:
            text: Plaintext string to protect.
            vector: Embedding stored alongside the ciphertext.
        """
        # Encrypt text before storage
        encrypted_text = self.cipher.encrypt(text.encode())

        self.db.insert({
            'text': encrypted_text,
            'vector': vector,  # Vectors not encrypted (needed for search)
        })

    def retrieve(self, query_vector):
        """Search by vector and return the results with plaintext restored.

        Each result's ``text`` attribute is replaced in place by its
        decrypted, decoded value.

        Args:
            query_vector: Vector passed to the backend's ``search``.

        Returns:
            The backend's search results, with ``text`` decrypted.
        """
        results = self.db.search(query_vector)

        # Decrypt results
        for r in results:
            r.text = self.cipher.decrypt(r.text).decode()

        return results

Audit Logging

class AuditLogger:
    """Write an audit trail of which user retrieved which documents."""

    def log_access(self, user_id, query, documents):
        """Insert one access record into the ``audit_log`` collection.

        Args:
            user_id: Identifier of the user who issued the query.
            query: The query as submitted.
            documents: Iterable of objects exposing an ``id`` attribute.

        NOTE(review): ``request`` is not defined in this snippet; it is
        presumably the web framework's current-request object exposing
        ``remote_addr`` -- confirm against the surrounding application.
        """
        # Function-scope import keeps the example self-contained.
        from datetime import datetime, timezone

        log_entry = {
            # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated
            # and returns a naive value that is easy to misread as local
            # time.
            'timestamp': datetime.now(timezone.utc),
            'user_id': user_id,
            'query': query,
            'documents_accessed': [d.id for d in documents],
            'ip_address': request.remote_addr,
        }
        self.db.insert('audit_log', log_entry)

Compliance (GDPR, HIPAA)

class ComplianceManager:
    """Handle per-user data requests: erasure and export.

    Expects ``self.db`` and ``self.vector_db`` to be provided on the
    instance; neither is created here.
    """

    def delete_user_data(self, user_id):
        """Erase a user's data (GDPR right to be forgotten).

        Removes the user's rows from the ``queries`` collection, then the
        vector-store entries whose ``metadata.user_id`` matches.
        """
        query_filter = {'user_id': user_id}
        self.db.delete('queries', query_filter)

        document_filter = {'metadata.user_id': user_id}
        self.vector_db.delete(document_filter)

    def export_user_data(self, user_id):
        """Export a user's data as JSON (GDPR right to data portability).

        Returns:
            A JSON string with two keys, ``queries`` and ``documents``.
        """
        user_queries = self.db.find('queries', {'user_id': user_id})
        user_documents = self.vector_db.find({'metadata.user_id': user_id})
        export = {'queries': user_queries, 'documents': user_documents}
        return json.dumps(export)

Next Steps