# Context Management Strategies

## Overview

Context management is a critical part of building AI applications: it directly affects how well a model understands requests and the quality of its responses. A sound context management strategy maximizes the amount of useful information conveyed within the token limit while keeping costs under control. This article walks through common context management techniques and best practices.

## Core Content

### Context Window Limits

#### Context Limits of Mainstream Models
| Model | Context Window | Input Limit | Output Limit |
|---|---|---|---|
| GPT-3.5-turbo | 16K / 4K | Depends on version | 4K |
| GPT-4 | 8K / 32K | Depends on version | 4K |
| GPT-4-turbo | 128K | 128K | 4K |
| GPT-4o | 128K | 128K | 16K |
| Claude 3 Opus | 200K | 200K | 4K |
| Claude 3.5 Sonnet | 200K | 200K | 8K |
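
Several of the snippets below rely on a `count_tokens` helper. A minimal sketch using the `tiktoken` library (the library choice and the fallback encoding are assumptions; any tokenizer that matches your target model works):

```python
import tiktoken

def count_tokens(text: str, model: str = "gpt-4o") -> int:
    """Count tokens the way the target model's tokenizer would."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to a general-purpose encoding
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
```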
#### Token Allocation Strategy

```python
class TokenAllocator:
    def __init__(self, model="gpt-4-turbo"):
        # Per-model budgets: total context window and reserved output tokens
        self.limits = {
            "gpt-4-turbo": {"total": 128000, "output": 4096},
            "gpt-4o": {"total": 128000, "output": 16384},
            "gpt-3.5-turbo": {"total": 16384, "output": 4096}
        }
        self.model = model

    def allocate(self, system_tokens, history_tokens):
        total = self.limits[self.model]["total"]
        output = self.limits[self.model]["output"]
        # Whatever remains after reserving output, system prompt, and history
        # is available for the new user input
        available = total - output - system_tokens - history_tokens
        return {
            "system": system_tokens,
            "history": history_tokens,
            "user_input": available,
            "output": output,
            "total_used": total - available
        }
```
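
A quick usage sketch with made-up token counts: `allocate()` reports how many tokens remain for the new user input once the reserved output, system prompt, and history are accounted for.

```python
allocator = TokenAllocator(model="gpt-4o")
budget = allocator.allocate(system_tokens=800, history_tokens=6500)
print(budget["user_input"])  # tokens still available for the new user input
```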
### Conversation History Management

#### 1. Sliding Window Strategy

```python
class SlidingWindowHistory:
    def __init__(self, max_messages=10, max_tokens=4000):
        self.messages = []
        self.max_messages = max_messages
        self.max_tokens = max_tokens

    def add_message(self, role, content):
        self.messages.append({"role": role, "content": content})
        self._trim_history()

    def _trim_history(self):
        # Drop the oldest messages first, but always keep at least two
        while len(self.messages) > self.max_messages:
            self.messages.pop(0)
        while self._count_tokens() > self.max_tokens and len(self.messages) > 2:
            self.messages.pop(0)

    def _count_tokens(self):
        # Rough estimate: about 3 characters per token
        total = sum(len(msg["content"]) for msg in self.messages)
        return total // 3
```
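
A short usage sketch (hypothetical messages): once more than `max_messages` entries have accumulated, the oldest ones are dropped first.

```python
history = SlidingWindowHistory(max_messages=4, max_tokens=4000)
for i in range(6):
    history.add_message("user", f"Question {i}")
    history.add_message("assistant", f"Answer {i}")
print(len(history.messages))  # 4 -- only the most recent messages survive
```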
#### 2. Priority-Based Retention

```python
class PriorityHistory:
    def __init__(self, max_tokens=4000):
        # Each item carries its own priority: {"role", "content", "priority"}
        self.messages = []
        self.max_tokens = max_tokens

    def add_message(self, role, content, priority=1):
        self.messages.append({"role": role, "content": content,
                              "priority": priority})
        self._trim_by_priority()

    def _trim_by_priority(self):
        # Evict the oldest lowest-priority message until within budget,
        # always keeping the two most recent messages
        while self._count_tokens() > self.max_tokens and len(self.messages) > 2:
            candidates = self.messages[:-2]
            lowest = min(msg["priority"] for msg in candidates)
            for i, msg in enumerate(candidates):
                if msg["priority"] == lowest:
                    self.messages.pop(i)
                    break

    def _count_tokens(self):
        # Rough estimate: about 3 characters per token
        return sum(len(msg["content"]) for msg in self.messages) // 3
```
#### 3. Summary Compression

```python
class SummaryHistory:
    def __init__(self, max_turns=5):
        self.recent_messages = []
        self.summary = ""
        self.max_turns = max_turns

    def add_message(self, role, content):
        self.recent_messages.append({"role": role, "content": content})
        if len(self.recent_messages) > self.max_turns * 2:
            self._compress_old_messages()

    def _compress_old_messages(self):
        # Keep the most recent max_turns exchanges verbatim; fold everything
        # older into a running summary
        old_messages = self.recent_messages[:-self.max_turns * 2]
        self.recent_messages = self.recent_messages[-self.max_turns * 2:]
        old_text = "\n".join(
            f"{msg['role']}: {msg['content']}"
            for msg in old_messages
        )
        # _generate_summary is expected to call an LLM; see the sketch below
        self.summary = self._generate_summary(old_text)

    def get_context(self):
        context = []
        if self.summary:
            context.append({
                "role": "system",
                "content": f"Summary of earlier conversation: {self.summary}"
            })
        context.extend(self.recent_messages)
        return context
```
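
`_generate_summary` is referenced above but not defined; it stands in for an LLM call. A minimal sketch assuming the OpenAI Python SDK (the model name and prompt wording are illustrative, not part of the original):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def generate_summary(text: str) -> str:
    """Condense older conversation turns into a short summary via an LLM."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # illustrative choice
        messages=[
            {"role": "system",
             "content": "Summarize the following conversation in a few sentences, "
                        "keeping names, decisions, and open questions."},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content

# Attach it where SummaryHistory expects it
SummaryHistory._generate_summary = staticmethod(generate_summary)
```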
### Long-Text Handling

#### 1. Chunking

```python
def chunk_text(text, chunk_size=2000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        if end < len(text):
            # Prefer to break at a sentence boundary (full-width period here;
            # adapt the delimiter to your language)
            last_period = chunk.rfind('。')
            if last_period > chunk_size // 2:
                chunk = chunk[:last_period + 1]
                end = start + last_period + 1
        chunks.append(chunk)
        if end >= len(text):
            break
        # Step back by `overlap` characters so adjacent chunks share context
        start = end - overlap
    return chunks
```
#### 2. Hierarchical Summarization

```python
class HierarchicalSummary:
    def __init__(self, levels=3):
        self.levels = levels
        self.summaries = {i: [] for i in range(levels)}

    def process_document(self, text):
        # Level 0: summarize each raw chunk, then repeatedly compress upwards.
        # _summarize is a placeholder for an LLM summarization call.
        chunks = chunk_text(text, chunk_size=4000)
        for chunk in chunks:
            summary = self._summarize(chunk)
            self.summaries[0].append(summary)
        for level in range(1, self.levels):
            self._compress_level(level)

    def _compress_level(self, level):
        # Merge every 4 summaries from the level below into one higher-level summary
        items = self.summaries[level - 1]
        for i in range(0, len(items), 4):
            combined = " ".join(items[i:i+4])
            summary = self._summarize(combined)
            self.summaries[level].append(summary)

    def get_context_for_query(self, query, max_tokens=2000):
        # _find_relevant_chunks / _build_context are placeholders for an
        # embedding-based retrieval step and context assembly
        relevant_chunks = self._find_relevant_chunks(query)
        return self._build_context(relevant_chunks, max_tokens)
```
#### 3. Retrieval-Augmented Generation (RAG)

```python
class RAGContextManager:
    def __init__(self, embedding_model, vector_store):
        self.embedding_model = embedding_model
        self.vector_store = vector_store

    def index_documents(self, documents):
        for doc in documents:
            embedding = self.embedding_model.encode(doc)
            self.vector_store.add(embedding, doc)

    def get_relevant_context(self, query, top_k=5, max_tokens=3000):
        query_embedding = self.embedding_model.encode(query)
        results = self.vector_store.search(query_embedding, top_k)
        context = []
        total_tokens = 0
        # Greedily add the highest-scoring documents until the budget is full
        for doc, score in results:
            doc_tokens = count_tokens(doc)
            if total_tokens + doc_tokens <= max_tokens:
                context.append(doc)
                total_tokens += doc_tokens
            else:
                break
        return "\n\n".join(context)
```
### System Prompt Optimization

#### 1. Dynamic System Prompts

```python
class DynamicSystemPrompt:
    def __init__(self):
        self.base_prompt = "You are an AI assistant."
        self.context_rules = []

    def add_context_rule(self, condition, prompt_addition):
        # condition: a callable that takes the user input and returns a bool
        self.context_rules.append({
            "condition": condition,
            "prompt": prompt_addition
        })

    def generate_system_prompt(self, user_input):
        prompt = self.base_prompt
        for rule in self.context_rules:
            if rule["condition"](user_input):
                prompt += "\n" + rule["prompt"]
        return prompt
```
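
A usage sketch (the rules are hypothetical): each rule pairs a predicate over the user input with an instruction that is appended only when the predicate matches.

```python
dsp = DynamicSystemPrompt()
dsp.add_context_rule(
    lambda text: "code" in text.lower(),
    "When answering, include runnable code examples."
)
dsp.add_context_rule(
    lambda text: len(text) > 500,
    "The input is long; summarize the key points before answering."
)
print(dsp.generate_system_prompt("Show me code for a sliding window."))
```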
#### 2. Separating Role and Task

```python
def create_system_prompt(role, task, constraints=None):
    prompt_parts = [f"Role: {role}"]
    if task:
        prompt_parts.append(f"Task: {task}")
    if constraints:
        prompt_parts.append(f"Constraints: {constraints}")
    return "\n".join(prompt_parts)
```
### Multi-Turn Conversation Optimization

#### 1. Context Inheritance

```python
class ContextInheritance:
    def __init__(self):
        self.shared_context = {}       # shared across all sessions
        self.session_contexts = {}     # per-session overrides

    def set_shared(self, key, value):
        self.shared_context[key] = value

    def get_full_context(self, session_id):
        # Session-specific values override shared ones
        session_ctx = self.session_contexts.get(session_id, {})
        return {**self.shared_context, **session_ctx}
```
#### 2. Context Isolation

```python
class IsolatedContext:
    def __init__(self):
        self.contexts = {}

    def create_context(self, context_id, system_prompt):
        self.contexts[context_id] = {
            "system": system_prompt,
            "history": []
        }

    def add_message(self, context_id, role, content):
        if context_id in self.contexts:
            self.contexts[context_id]["history"].append({
                "role": role,
                "content": content
            })
```
### Practical Tips

#### 1. Token Count Optimization

```python
def optimize_context(messages, max_tokens):
    total = sum(count_tokens(msg["content"]) for msg in messages)
    if total <= max_tokens:
        return messages
    optimized = []
    remaining_tokens = max_tokens
    # Walk backwards so the most recent messages are kept first
    for msg in reversed(messages):
        msg_tokens = count_tokens(msg["content"])
        if remaining_tokens >= msg_tokens:
            optimized.insert(0, msg)
            remaining_tokens -= msg_tokens
        else:
            # truncate_message is an assumed helper; see the sketch below
            truncated = truncate_message(msg, remaining_tokens)
            if truncated:
                optimized.insert(0, truncated)
            break
    return optimized
```
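
`truncate_message` is assumed above but never defined. A minimal character-based sketch (the 3-characters-per-token cutoff mirrors the rough estimate used earlier and is an assumption, not part of the original):

```python
def truncate_message(msg, max_tokens):
    """Return a copy of msg cut to roughly max_tokens, or None if nothing fits."""
    if max_tokens <= 0:
        return None
    max_chars = max_tokens * 3  # assumed ~3 characters per token
    content = msg["content"]
    if len(content) <= max_chars:
        return dict(msg)
    return {**msg, "content": content[:max_chars]}
```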
#### 2. Smart Context Selection

```python
def select_relevant_history(query, history, max_tokens=2000):
    # encode and cosine_similarity are assumed helpers: an embedding function
    # and a vector-similarity measure (see the sketch below)
    query_embedding = encode(query)
    scored_history = []
    for msg in history:
        msg_embedding = encode(msg["content"])
        similarity = cosine_similarity(query_embedding, msg_embedding)
        scored_history.append((msg, similarity))
    scored_history.sort(key=lambda x: x[1], reverse=True)
    selected = []
    total_tokens = 0
    for msg, score in scored_history:
        msg_tokens = count_tokens(msg["content"])
        if total_tokens + msg_tokens <= max_tokens:
            selected.append(msg)
            total_tokens += msg_tokens
    # Restore the original chronological order
    return sorted(selected, key=lambda m: history.index(m))
```
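
`encode` and `cosine_similarity` are assumed helpers: an embedding function (the sentence-transformers model from the RAG example would do) and a vector-similarity measure. A minimal NumPy version of the latter:

```python
import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two embedding vectors."""
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))
```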
#### 3. Context Warm-Up

```python
def preheat_context(essential_info):
    # Inject must-know facts as a system message at the start of the conversation
    return {
        "role": "system",
        "content": f"Key information: {essential_info}"
    }
```
#### 4. Context Version Control

```python
class ContextVersionControl:
    def __init__(self):
        self.versions = {}
        self.current_version = 0

    def save_version(self, context):
        self.current_version += 1
        # Note: copy() is shallow; use copy.deepcopy for nested structures
        self.versions[self.current_version] = context.copy()
        return self.current_version

    def restore_version(self, version_id):
        return self.versions.get(version_id, None)
```
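
A brief usage sketch with hypothetical values, showing a snapshot being taken before a risky change and then rolled back:

```python
vc = ContextVersionControl()
context = {"system": "You are an AI assistant.", "history": []}
v1 = vc.save_version(context)                  # snapshot before the change
context["history"] = ["...long digression..."]
context = vc.restore_version(v1)               # roll back to the snapshot
```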
## Summary

Effective context management strategies include:

- Know the limits: be familiar with the context windows and token limits of the models you use
- Manage history: apply sliding windows, priority-based retention, and summary compression
- Handle long text: use chunking, hierarchical summarization, and RAG
- Optimize system prompts: generate them dynamically and separate role from task
- Optimize multi-turn conversations: inherit and isolate context as needed
- Select intelligently: choose history messages by their relevance to the current query

Applied well, these strategies maximize the value packed into the available token budget and improve the overall performance and user experience of AI applications.