Context Management Strategies

Overview

Context management is a critical part of building AI applications: it directly affects how well the model understands a request and how good its responses are. A sound context management strategy maximizes the amount of useful information conveyed within the token limit while keeping costs under control. This article takes a deep dive into context management techniques and best practices.

Core Concepts

Context Window Limits

Context limits of mainstream models

| Model             | Context Window | Input Limit        | Output Limit |
|-------------------|----------------|--------------------|--------------|
| GPT-3.5-turbo     | 16K / 4K       | Depends on version | 4K           |
| GPT-4             | 8K / 32K       | Depends on version | 4K           |
| GPT-4-turbo       | 128K           | 128K               | 4K           |
| GPT-4o            | 128K           | 128K               | 16K          |
| Claude 3 Opus     | 200K           | 200K               | 4K           |
| Claude 3.5 Sonnet | 200K           | 200K               | 8K           |

Token Allocation Strategy

python
class TokenAllocator:
    """Splits a model's context window among system prompt, history, input, and output."""

    def __init__(self, model="gpt-4-turbo"):
        # Total context window and maximum output tokens per model.
        self.limits = {
            "gpt-4-turbo": {"total": 128000, "output": 4096},
            "gpt-4o": {"total": 128000, "output": 16384},
            "gpt-3.5-turbo": {"total": 16384, "output": 4096}
        }
        self.model = model

    def allocate(self, system_tokens, history_tokens):
        total = self.limits[self.model]["total"]
        output = self.limits[self.model]["output"]

        # Whatever remains after reserving output, system, and history
        # is available for the user's input (clamped at zero).
        available = max(0, total - output - system_tokens - history_tokens)
        return {
            "system": system_tokens,
            "history": history_tokens,
            "user_input": available,
            "output": output,
            "total_used": total - available
        }
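
A quick usage sketch (the token counts below are illustrative):

python
allocator = TokenAllocator("gpt-4o")
budget = allocator.allocate(system_tokens=500, history_tokens=2000)
print(budget["user_input"])  # tokens left for the user's message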

Conversation History Management

1. Sliding Window Strategy

python
class SlidingWindowHistory:
    """Keeps only the most recent messages, bounded by count and by tokens."""

    def __init__(self, max_messages=10, max_tokens=4000):
        self.messages = []
        self.max_messages = max_messages
        self.max_tokens = max_tokens

    def add_message(self, role, content):
        self.messages.append({"role": role, "content": content})
        self._trim_history()

    def _trim_history(self):
        # Drop the oldest messages first when either limit is exceeded,
        # always keeping at least the two most recent.
        while len(self.messages) > self.max_messages:
            self.messages.pop(0)

        while self._count_tokens() > self.max_tokens and len(self.messages) > 2:
            self.messages.pop(0)

    def _count_tokens(self):
        # Rough estimate: ~3 characters per token; swap in a real
        # tokenizer (e.g. tiktoken) for production use.
        total = sum(len(msg["content"]) for msg in self.messages)
        return total // 3
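
For instance (message contents here are dummies):

python
history = SlidingWindowHistory(max_messages=4)
for i in range(6):
    history.add_message("user", f"message {i}")
print(len(history.messages))  # 4: the two oldest were dropped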

2. Priority-Based Retention

python
class PriorityHistory:
    """Evicts low-priority messages first when the token budget is exceeded."""

    def __init__(self, max_tokens=4000):
        self.messages = []  # each item: {"role", "content", "priority"}
        self.max_tokens = max_tokens

    def add_message(self, role, content, priority=1):
        self.messages.append({"role": role, "content": content, "priority": priority})
        self._trim_by_priority()

    def _trim_by_priority(self):
        # Remove the oldest message at the lowest priority until we fit,
        # always keeping at least the two most recent messages.
        while self._count_tokens() > self.max_tokens and len(self.messages) > 2:
            lowest = min(msg["priority"] for msg in self.messages)
            for i, msg in enumerate(self.messages):
                if msg["priority"] == lowest:
                    self.messages.pop(i)
                    break

    def _count_tokens(self):
        # Rough estimate: ~3 characters per token.
        return sum(len(msg["content"]) for msg in self.messages) // 3
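
A brief illustration (the token budget and messages are toy values):

python
history = PriorityHistory(max_tokens=30)
history.add_message("system", "You are a helpful assistant.", priority=3)
history.add_message("user", "Some low-value smalltalk that can be dropped.", priority=1)
history.add_message("user", "Critical requirement: always reply in JSON.", priority=3)
# When the budget is exceeded, the priority-1 message is evicted first.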

3. Summary Compression

python
class SummaryHistory:
    """Keeps recent turns verbatim and compresses older turns into a summary."""

    def __init__(self, max_turns=5):
        self.recent_messages = []
        self.summary = ""
        self.max_turns = max_turns

    def add_message(self, role, content):
        self.recent_messages.append({"role": role, "content": content})

        # One turn = a user message plus an assistant message.
        if len(self.recent_messages) > self.max_turns * 2:
            self._compress_old_messages()

    def _compress_old_messages(self):
        old_messages = self.recent_messages[:-self.max_turns * 2]
        self.recent_messages = self.recent_messages[-self.max_turns * 2:]

        old_text = "\n".join(
            f"{msg['role']}: {msg['content']}"
            for msg in old_messages
        )
        # Fold the previous summary in so earlier context is not lost.
        if self.summary:
            old_text = f"Previous summary: {self.summary}\n{old_text}"

        self.summary = self._generate_summary(old_text)

    def _generate_summary(self, text):
        # Placeholder: call your summarization model (e.g. an LLM) here.
        raise NotImplementedError

    def get_context(self):
        context = []
        if self.summary:
            context.append({
                "role": "system",
                "content": f"Summary of earlier conversation: {self.summary}"
            })
        context.extend(self.recent_messages)
        return context

Long-Text Processing

1. Chunking

python
def chunk_text(text, chunk_size=2000, overlap=200):
    """Split text into overlapping chunks, preferring sentence boundaries."""
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]

        # Try to end the chunk at a sentence boundary ('。' is the
        # Chinese full stop; add '.' etc. for other languages).
        if end < len(text):
            last_period = chunk.rfind('。')
            if last_period > chunk_size // 2:
                chunk = chunk[:last_period + 1]
                end = start + last_period + 1

        chunks.append(chunk)
        # Step back by `overlap` so adjacent chunks share context
        # (always advance by at least one character to avoid looping).
        start = max(end - overlap, start + 1)

    return chunks
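
For example, on a long Chinese document (the splitter looks for '。'):

python
text = "这是一句话。" * 1000  # ~6000 characters
chunks = chunk_text(text, chunk_size=2000, overlap=200)
print(len(chunks), len(chunks[0]))  # a few ~2000-char overlapping chunks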

2. Hierarchical Summarization

python
class HierarchicalSummary:
    """Builds a pyramid of summaries: chunk summaries, then summaries of summaries."""

    def __init__(self, levels=3):
        self.levels = levels
        self.summaries = {i: [] for i in range(levels)}

    def process_document(self, text):
        chunks = chunk_text(text, chunk_size=4000)

        # Level 0: summarize each raw chunk.
        for chunk in chunks:
            summary = self._summarize(chunk)
            self.summaries[0].append(summary)

        # Higher levels: summarize groups of lower-level summaries.
        for level in range(1, self.levels):
            self._compress_level(level)

    def _compress_level(self, level):
        # Merge every four summaries from the level below into one.
        items = self.summaries[level - 1]
        for i in range(0, len(items), 4):
            combined = " ".join(items[i:i+4])
            summary = self._summarize(combined)
            self.summaries[level].append(summary)

    def _summarize(self, text):
        # Placeholder: call your summarization model here.
        raise NotImplementedError

    def get_context_for_query(self, query, max_tokens=2000):
        # Hooks left to implement: retrieve the most relevant summaries
        # and assemble them into a context within the token budget.
        relevant_chunks = self._find_relevant_chunks(query)
        return self._build_context(relevant_chunks, max_tokens)

3. Retrieval-Augmented Generation (RAG)

python
class RAGContextManager:
    """Retrieves only the document chunks relevant to the current query."""

    def __init__(self, embedding_model, vector_store):
        self.embedding_model = embedding_model
        self.vector_store = vector_store

    def index_documents(self, documents):
        for doc in documents:
            embedding = self.embedding_model.encode(doc)
            self.vector_store.add(embedding, doc)

    def get_relevant_context(self, query, top_k=5, max_tokens=3000):
        query_embedding = self.embedding_model.encode(query)
        results = self.vector_store.search(query_embedding, top_k)

        # Greedily pack the highest-scoring documents into the token budget.
        # count_tokens: see the helper sketch under "Practical Tips" below.
        context = []
        total_tokens = 0

        for doc, score in results:
            doc_tokens = count_tokens(doc)
            if total_tokens + doc_tokens <= max_tokens:
                context.append(doc)
                total_tokens += doc_tokens
            else:
                break

        return "\n\n".join(context)

System Prompt Optimization

1. Dynamic System Prompts

python
class DynamicSystemPrompt:
    """Assembles a system prompt from rules that fire on the user's input."""

    def __init__(self):
        self.base_prompt = "You are an AI assistant."
        self.context_rules = []

    def add_context_rule(self, condition, prompt_addition):
        # condition: a callable taking the user input and returning bool.
        self.context_rules.append({
            "condition": condition,
            "prompt": prompt_addition
        })

    def generate_system_prompt(self, user_input):
        prompt = self.base_prompt

        for rule in self.context_rules:
            if rule["condition"](user_input):
                prompt += "\n" + rule["prompt"]

        return prompt
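
For example, a rule that fires whenever the user asks about code:

python
sp = DynamicSystemPrompt()
sp.add_context_rule(
    lambda text: "code" in text.lower(),
    "When the user asks about code, include runnable examples."
)
print(sp.generate_system_prompt("Help me debug this code"))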

2. Separating Role and Task

python
def create_system_prompt(role, task, constraints=None):
    # Keep role, task, and constraints in clearly labeled sections.
    prompt_parts = [f"Role: {role}"]

    if task:
        prompt_parts.append(f"Task: {task}")

    if constraints:
        prompt_parts.append(f"Constraints: {constraints}")

    return "\n".join(prompt_parts)

Multi-Turn Dialogue Optimization

1. Context Inheritance

python
class ContextInheritance:
    """Shared context common to all sessions, plus per-session overrides."""

    def __init__(self):
        self.shared_context = {}
        self.session_contexts = {}

    def set_shared(self, key, value):
        self.shared_context[key] = value

    def set_session(self, session_id, key, value):
        # Per-session values override shared values of the same key.
        self.session_contexts.setdefault(session_id, {})[key] = value

    def get_full_context(self, session_id):
        session_ctx = self.session_contexts.get(session_id, {})
        return {**self.shared_context, **session_ctx}

2. Context Isolation

python
class IsolatedContext:
    """Keeps each conversation's system prompt and history fully separate."""

    def __init__(self):
        self.contexts = {}

    def create_context(self, context_id, system_prompt):
        self.contexts[context_id] = {
            "system": system_prompt,
            "history": []
        }

    def add_message(self, context_id, role, content):
        # Messages for unknown context ids are ignored silently.
        if context_id in self.contexts:
            self.contexts[context_id]["history"].append({
                "role": role,
                "content": content
            })

Practical Tips

1. Token Counting Optimization

python
def optimize_context(messages, max_tokens):
    """Keep as many of the most recent messages as fit in the budget."""
    total = sum(count_tokens(msg["content"]) for msg in messages)

    if total <= max_tokens:
        return messages

    optimized = []
    remaining_tokens = max_tokens

    # Walk backwards so the newest messages are kept first.
    for msg in reversed(messages):
        msg_tokens = count_tokens(msg["content"])
        if remaining_tokens >= msg_tokens:
            optimized.insert(0, msg)
            remaining_tokens -= msg_tokens
        else:
            # Partially keep the oldest message that still fits.
            truncated = truncate_message(msg, remaining_tokens)
            if truncated:
                optimized.insert(0, truncated)
            break

    return optimized
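
optimize_context relies on two helpers the original leaves undefined: count_tokens and truncate_message. A minimal sketch, assuming the tiktoken package is available (truncate_message here is a simple token-slicing approximation, not a canonical implementation):

python
import tiktoken

def _encoding_for(model):
    # Use the model's tokenizer when known, else a sensible default.
    try:
        return tiktoken.encoding_for_model(model)
    except KeyError:
        return tiktoken.get_encoding("cl100k_base")

def count_tokens(text, model="gpt-4o"):
    return len(_encoding_for(model).encode(text))

def truncate_message(msg, max_tokens, model="gpt-4o"):
    # Cut the message content down to roughly max_tokens tokens.
    if max_tokens <= 0:
        return None
    enc = _encoding_for(model)
    tokens = enc.encode(msg["content"])
    return {"role": msg["role"], "content": enc.decode(tokens[:max_tokens])}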

2. Smart Context Selection

python
def select_relevant_history(query, history, max_tokens=2000):
    """Pick the history messages most semantically relevant to the query."""
    query_embedding = encode(query)
    scored_history = []

    for index, msg in enumerate(history):
        msg_embedding = encode(msg["content"])
        similarity = cosine_similarity(query_embedding, msg_embedding)
        scored_history.append((index, msg, similarity))

    # Most relevant first.
    scored_history.sort(key=lambda x: x[2], reverse=True)

    selected = []
    total_tokens = 0

    # Greedily take relevant messages that still fit in the budget.
    for index, msg, score in scored_history:
        msg_tokens = count_tokens(msg["content"])
        if total_tokens + msg_tokens <= max_tokens:
            selected.append((index, msg))
            total_tokens += msg_tokens

    # Return the selection in its original chronological order.
    return [msg for index, msg in sorted(selected, key=lambda x: x[0])]
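
The encode and cosine_similarity helpers are assumed above. A minimal sketch, assuming the sentence-transformers and numpy packages (the model name is an illustrative choice, not a requirement):

python
import numpy as np
from sentence_transformers import SentenceTransformer

_model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model

def encode(text):
    return _model.encode(text)

def cosine_similarity(a, b):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))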

3. Context Preheating

python
def preheat_context(essential_info):
    # Inject must-know facts as a system message before the conversation starts.
    return {
        "role": "system",
        "content": f"Key information: {essential_info}"
    }

4. Context Version Control

python
import copy

class ContextVersionControl:
    """Snapshots of the context that can be restored later (e.g. to undo a turn)."""

    def __init__(self):
        self.versions = {}
        self.current_version = 0

    def save_version(self, context):
        self.current_version += 1
        # Deep copy so later edits to nested messages don't mutate the snapshot.
        self.versions[self.current_version] = copy.deepcopy(context)
        return self.current_version

    def restore_version(self, version_id):
        return self.versions.get(version_id, None)

Summary

Effective context management strategies include:

  1. Know the limits: be familiar with each model's context window and token limits
  2. History management: use sliding windows, priority-based retention, and summary compression
  3. Long-text processing: apply chunking, hierarchical summarization, and RAG
  4. System prompt optimization: generate prompts dynamically and separate role from task
  5. Multi-turn dialogue optimization: inherit or isolate context as appropriate
  6. Smart selection: choose history messages by relevance to the current query

Applied well, these strategies maximize the value of the context within the token limit and improve the overall performance and user experience of an AI application.