LlamaIndex使用
概述
LlamaIndex(原GPT Index)是一个专门用于构建大语言模型应用的框架,特别擅长处理数据连接和检索增强生成(RAG)场景。它提供了丰富的数据加载器、索引结构和查询引擎,让开发者能够轻松构建基于私有数据的AI应用。
LlamaIndex的核心优势:
- 数据连接器 - 支持100+种数据源
- 索引结构 - 多种索引类型适应不同场景
- 查询引擎 - 灵活的查询和检索策略
- 易于定制 - 高度可配置的组件
核心概念
数据加载器
LlamaIndex提供了丰富的数据加载器,支持各种数据源。
python
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.readers.file import PyMuPDFReader
# Load every .pdf/.txt/.md file under ./documents into Document objects.
documents = SimpleDirectoryReader(
input_dir="./documents",
required_exts=[".pdf", ".txt", ".md"]
).load_data()
print(f"加载了 {len(documents)} 个文档")
节点与文档
文档被分割成节点(Node)进行处理。
python
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
# A long synthetic document, just to have something worth chunking.
doc = Document(text="这是一个长文档..." * 100)
# Split into ~512-token chunks; 50 tokens of overlap keeps context across chunk boundaries.
parser = SentenceSplitter(
chunk_size=512,
chunk_overlap=50
)
nodes = parser.get_nodes_from_documents([doc])
print(f"分割成 {len(nodes)} 个节点")
索引类型
LlamaIndex支持多种索引类型。
python
from llama_index.core import VectorStoreIndex, SummaryIndex, KeywordTableIndex
from llama_index.core import SimpleKeywordTableIndex
# Vector index: embedding-based semantic retrieval.
vector_index = VectorStoreIndex.from_documents(documents)
# Summary index: visits nodes sequentially; suited to summarization-style queries.
summary_index = SummaryIndex.from_documents(documents)
keyword_index = KeywordTableIndex.from_documents(documents)
核心功能
向量索引
最常用的索引类型,适合语义搜索。
python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
documents = SimpleDirectoryReader("./data").load_data()
# Use an explicit OpenAI embedding model rather than the library default.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
index = VectorStoreIndex.from_documents(
documents,
embed_model=embed_model
)
# Default query engine: retrieve the most similar chunks, then synthesize an answer.
query_engine = index.as_query_engine()
response = query_engine.query("文档的主要内容是什么?")
print(response)
查询引擎
查询引擎提供多种查询策略。
python
from llama_index.core import VectorStoreIndex
from llama_index.core.response_synthesizers import ResponseMode
index = VectorStoreIndex.from_documents(documents)
# Top-3 similar chunks; COMPACT packs retrieved text into as few LLM calls as possible.
query_engine = index.as_query_engine(
similarity_top_k=3,
response_mode=ResponseMode.COMPACT
)
response = query_engine.query("什么是机器学习?")
print(response.response)
# Inspect which source nodes the answer was grounded on.
for source in response.source_nodes:
print(f"来源:{source.node.metadata}")
检索器
自定义检索策略。
python
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
# Stand-alone retriever: fetch the 5 nearest nodes from the vector index.
retriever = VectorIndexRetriever(
index=index,
similarity_top_k=5
)
# The synthesizer turns retrieved nodes into a final answer.
response_synthesizer = get_response_synthesizer(
response_mode="compact"
)
# Wire retriever + synthesizer into a custom query engine.
query_engine = RetrieverQueryEngine(
retriever=retriever,
response_synthesizer=response_synthesizer
)
使用示例
示例1:PDF文档问答系统
python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
# Global defaults: every index and query engine below uses these models.
Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# Load a single PDF file rather than a whole directory.
documents = SimpleDirectoryReader(
input_files=["./documents/report.pdf"]
).load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine(
similarity_top_k=3,
verbose=True
)
questions = [
"这份报告的主要结论是什么?",
"有哪些关键数据?",
"有什么建议?"
]
# Run each question against the same index.
for question in questions:
response = query_engine.query(question)
print(f"问题:{question}")
print(f"回答:{response}\n")
示例2:多文档索引
python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import load_index_from_storage
import os
PERSIST_DIR = "./storage"
# Reuse a previously persisted index when available; otherwise build and persist one.
if os.path.exists(PERSIST_DIR):
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)
else:
documents = SimpleDirectoryReader("./documents").load_data()
index = VectorStoreIndex.from_documents(documents)
index.storage_context.persist(persist_dir=PERSIST_DIR)
query_engine = index.as_query_engine()
response = query_engine.query("总结所有文档的核心内容")
print(response)
示例3:混合检索
python
from llama_index.core import VectorStoreIndex, KeywordTableIndex, SimpleDirectoryReader
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KeywordTableSimpleRetriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import get_response_synthesizer
documents = SimpleDirectoryReader("./data").load_data()
# Build one index per retrieval strategy:
# a vector index for semantic similarity, a keyword table for exact-term matching.
index = VectorStoreIndex.from_documents(documents)
keyword_index = KeywordTableIndex.from_documents(documents)
vector_retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3
)
# KeywordTableSimpleRetriever must wrap a KeywordTableIndex, not a vector index.
keyword_retriever = KeywordTableSimpleRetriever(
    index=keyword_index
)
# Fuse both retrievers' results so the query actually runs hybrid retrieval
# (the original example built the keyword retriever but never used it).
hybrid_retriever = QueryFusionRetriever(
    [vector_retriever, keyword_retriever],
    similarity_top_k=3,
    num_queries=1  # no extra LLM-generated queries; just merge the two result sets
)
response_synthesizer = get_response_synthesizer(
    response_mode="compact"
)
query_engine = RetrieverQueryEngine(
    retriever=hybrid_retriever,
    response_synthesizer=response_synthesizer
)
response = query_engine.query("查询关键词相关的内容")
print(response)
示例4:流式输出
python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)
# streaming=True makes query() return a streaming response with a token generator.
query_engine = index.as_query_engine(streaming=True)
streaming_response = query_engine.query("详细解释这个概念")
# Print tokens as they arrive instead of waiting for the full answer.
for text in streaming_response.response_gen:
print(text, end="", flush=True)
高级功能
自定义提示词
python
from llama_index.core import PromptTemplate
# {context_str} and {query_str} are filled in by the query engine at run time.
qa_prompt = PromptTemplate(
"上下文信息如下:\n"
"---------------------\n"
"{context_str}\n"
"---------------------\n"
"根据上下文信息回答问题:{query_str}\n"
"如果上下文中没有相关信息,请说明不知道。\n"
)
query_engine = index.as_query_engine(text_qa_template=qa_prompt)
元数据过滤
python
from llama_index.core.vector_stores import MetadataFilters, MetadataFilter
documents = SimpleDirectoryReader("./data").load_data()
# Tag alternating documents so there is metadata to filter on.
for i, doc in enumerate(documents):
doc.metadata["category"] = "技术" if i % 2 == 0 else "业务"
index = VectorStoreIndex.from_documents(documents)
# Only nodes whose category == "技术" are considered at retrieval time.
filters = MetadataFilters(
filters=[
MetadataFilter(key="category", value="技术")
]
)
query_engine = index.as_query_engine(filters=filters)
response = query_engine.query("技术相关的内容")
关键词检索
python
from llama_index.core.indices.keyword_table import KeywordTableIndex
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("./data").load_data()
# Keyword-table index: retrieval is driven by keyword matching rather than vector similarity.
keyword_index = KeywordTableIndex.from_documents(documents)
query_engine = keyword_index.as_query_engine()
response = query_engine.query("查找包含特定关键词的文档")
print(response)
与向量数据库集成
使用Chroma
python
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
# Persistent Chroma client: stored vectors survive process restarts on disk.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection("my_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# Route the index's vector storage into Chroma instead of the in-memory default.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(
documents,
storage_context=storage_context
)
query_engine = index.as_query_engine()
使用Pinecone
python
from pinecone import Pinecone
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
# NOTE: replace with a real key — prefer reading it from an environment variable.
pc = Pinecone(api_key="your-api-key")
pinecone_index = pc.Index("my-index")
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
# Route the index's vector storage into Pinecone instead of the in-memory default.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(
documents,
storage_context=storage_context
)
最佳实践
1. 合理设置分块大小
python
from llama_index.core.node_parser import SentenceSplitter
# Larger chunks keep more context per node; the 200-token overlap preserves
# continuity across chunk boundaries. Tune both for your documents and model.
splitter = SentenceSplitter(
chunk_size=1024,
chunk_overlap=200
)
nodes = splitter.get_nodes_from_documents(documents)
2. 使用缓存提高性能
python
from llama_index.core import set_global_handler
# "simple" installs the basic global observability handler.
set_global_handler("simple")
from llama_index.core import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
# NOTE(review): an in-memory SimpleDocumentStore alone does not persist anything —
# pair it with storage_context.persist() for caching across runs; confirm intent.
storage_context = StorageContext.from_defaults(
docstore=SimpleDocumentStore()
)
3. 监控和调试
python
from llama_index.core import Settings
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
# LlamaDebugHandler records per-step events and prints a trace when each query ends.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])
# Register globally so subsequent indexes and query engines emit debug traces.
# (The original snippet used Settings without importing it — fixed above.)
Settings.callback_manager = callback_manager
response = query_engine.query("测试查询")
小结
LlamaIndex是构建RAG应用的强大工具,通过本章节的学习,你应该掌握了:
- 数据加载 - 使用各种数据加载器导入数据
- 索引创建 - 理解不同索引类型的应用场景
- 查询引擎 - 配置和使用查询引擎
- 高级功能 - 自定义提示词、元数据过滤等
- 向量数据库集成 - 与Chroma、Pinecone等集成
LlamaIndex特别适合需要处理大量私有数据的场景,如企业知识库、文档问答系统等。建议结合实际项目需求,选择合适的索引类型和查询策略。