Skip to content

LlamaIndex使用

概述

LlamaIndex(原GPT Index)是一个专门用于构建大语言模型应用的框架,特别擅长处理数据连接和检索增强生成(RAG)场景。它提供了丰富的数据加载器、索引结构和查询引擎,让开发者能够轻松构建基于私有数据的AI应用。

LlamaIndex的核心优势:

  • 数据连接器 - 支持100+种数据源
  • 索引结构 - 多种索引类型适应不同场景
  • 查询引擎 - 灵活的查询和检索策略
  • 易于定制 - 高度可配置的组件

核心概念

数据加载器

LlamaIndex提供了丰富的数据加载器,支持各种数据源。

python
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.readers.file import PyMuPDFReader

# Read every .pdf/.txt/.md file under ./documents into Document objects.
reader = SimpleDirectoryReader(
    input_dir="./documents",
    required_exts=[".pdf", ".txt", ".md"],
)
documents = reader.load_data()

print(f"加载了 {len(documents)} 个文档")

节点与文档

文档被分割成节点(Node)进行处理。

python
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

# A synthetic long document built by repeating a short text 100 times.
doc = Document(text="这是一个长文档..." * 100)

# Sentence-aware splitting: ~512 chars per chunk, with 50 chars of overlap
# between adjacent chunks so context is not lost at the cut points.
parser = SentenceSplitter(chunk_size=512, chunk_overlap=50)
nodes = parser.get_nodes_from_documents([doc])

print(f"分割成 {len(nodes)} 个节点")

索引类型

LlamaIndex支持多种索引类型。

python
from llama_index.core import VectorStoreIndex, SummaryIndex, KeywordTableIndex
from llama_index.core import SimpleKeywordTableIndex

# Three index flavors over the same documents:
# - vector index: embedding-based semantic search
# - summary index: sequential scan / summarization over all nodes
# - keyword table index: lookup by extracted keywords
vector_index = VectorStoreIndex.from_documents(documents)
summary_index = SummaryIndex.from_documents(documents)
keyword_index = KeywordTableIndex.from_documents(documents)

核心功能

向量索引

最常用的索引类型,适合语义搜索。

python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding model used to vectorize both the documents and the query.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

documents = SimpleDirectoryReader("./data").load_data()

index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Ask a natural-language question against the indexed documents.
query_engine = index.as_query_engine()
response = query_engine.query("文档的主要内容是什么?")
print(response)

查询引擎

查询引擎提供多种查询策略。

python
from llama_index.core import VectorStoreIndex
from llama_index.core.response_synthesizers import ResponseMode

index = VectorStoreIndex.from_documents(documents)

# Retrieve the 3 most similar chunks, then pack them into as few LLM calls
# as possible (COMPACT mode) when synthesizing the final answer.
query_engine = index.as_query_engine(
    response_mode=ResponseMode.COMPACT,
    similarity_top_k=3,
)

response = query_engine.query("什么是机器学习?")
print(response.response)

# Each source node records which chunk the answer was grounded on.
for source in response.source_nodes:
    print(f"来源:{source.node.metadata}")

检索器

自定义检索策略。

python
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

# Assemble a query engine from explicit parts instead of index.as_query_engine():
# a retriever that fetches the top-5 most similar nodes, and a synthesizer that
# compacts them into the final answer.
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
response_synthesizer = get_response_synthesizer(response_mode="compact")

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

使用示例

示例1:PDF文档问答系统

python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Global defaults: every index/query engine built afterwards uses these models.
Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Index a single PDF report.
documents = SimpleDirectoryReader(input_files=["./documents/report.pdf"]).load_data()
index = VectorStoreIndex.from_documents(documents)

query_engine = index.as_query_engine(similarity_top_k=3, verbose=True)

# Run a small batch of questions through the same engine.
questions = [
    "这份报告的主要结论是什么?",
    "有哪些关键数据?",
    "有什么建议?",
]
for question in questions:
    response = query_engine.query(question)
    print(f"问题:{question}")
    print(f"回答:{response}\n")

示例2:多文档索引

python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import load_index_from_storage
import os

PERSIST_DIR = "./storage"

# Build the index only on the first run; afterwards reload it from disk,
# which skips re-embedding every document.
if not os.path.exists(PERSIST_DIR):
    documents = SimpleDirectoryReader("./documents").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

query_engine = index.as_query_engine()

response = query_engine.query("总结所有文档的核心内容")
print(response)

示例3:混合检索

python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import SimpleKeywordTableIndex
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.retrievers import KeywordTableSimpleRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import get_response_synthesizer

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

# FIX: KeywordTableSimpleRetriever operates on a keyword-table index, not a
# VectorStoreIndex, so build a separate keyword index over the same documents.
# (The previous `num_keywords_per_chunk` argument is not a constructor
# parameter of this retriever and has been removed.)
keyword_index = SimpleKeywordTableIndex.from_documents(documents)

vector_retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3
)
keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index)


class HybridRetriever(BaseRetriever):
    """Combine semantic (vector) and keyword retrieval, de-duplicated by node id."""

    def __init__(self, vector_retriever, keyword_retriever):
        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        super().__init__()

    def _retrieve(self, query_bundle):
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)
        seen = {n.node.node_id for n in vector_nodes}
        # Vector hits first, then any keyword hits not already present.
        return vector_nodes + [
            n for n in keyword_nodes if n.node.node_id not in seen
        ]


response_synthesizer = get_response_synthesizer(
    response_mode="compact"
)

# FIX: the original built keyword_retriever but never used it, so the example
# was not actually hybrid. The query engine now combines both strategies.
query_engine = RetrieverQueryEngine(
    retriever=HybridRetriever(vector_retriever, keyword_retriever),
    response_synthesizer=response_synthesizer
)

response = query_engine.query("查询关键词相关的内容")
print(response)

示例4:流式输出

python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

# streaming=True makes query() return a streaming response whose
# response_gen yields answer text incrementally as the LLM produces it.
query_engine = index.as_query_engine(streaming=True)
streaming_response = query_engine.query("详细解释这个概念")

for text in streaming_response.response_gen:
    print(text, end="", flush=True)

高级功能

自定义提示词

python
from llama_index.core import PromptTemplate

# Custom QA prompt; {context_str} and {query_str} are filled in by the engine.
qa_prompt_text = (
    "上下文信息如下:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "根据上下文信息回答问题:{query_str}\n"
    "如果上下文中没有相关信息,请说明不知道。\n"
)
qa_prompt = PromptTemplate(qa_prompt_text)

query_engine = index.as_query_engine(text_qa_template=qa_prompt)

元数据过滤

python
from llama_index.core.vector_stores import MetadataFilters, MetadataFilter

documents = SimpleDirectoryReader("./data").load_data()

# Tag alternating documents so there is something to filter on.
for i, doc in enumerate(documents):
    doc.metadata["category"] = "技术" if i % 2 == 0 else "业务"

index = VectorStoreIndex.from_documents(documents)

# Restrict retrieval to nodes whose category metadata equals "技术".
filters = MetadataFilters(
    filters=[MetadataFilter(key="category", value="技术")]
)

query_engine = index.as_query_engine(filters=filters)
response = query_engine.query("技术相关的内容")

关键词检索

python
from llama_index.core.indices.keyword_table import KeywordTableIndex
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader("./data").load_data()

# Keyword-table index: matches queries on extracted keywords rather than
# embedding similarity.
keyword_index = KeywordTableIndex.from_documents(documents)
query_engine = keyword_index.as_query_engine()

response = query_engine.query("查找包含特定关键词的文档")
print(response)

与向量数据库集成

使用Chroma

python
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# A persistent Chroma collection on local disk backs the vector store.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection("my_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Route the index's embeddings into Chroma instead of the in-memory store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

query_engine = index.as_query_engine()

使用Pinecone

python
from pinecone import Pinecone
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext

# Connect to a pre-created Pinecone index (replace the key with your own).
pc = Pinecone(api_key="your-api-key")
pinecone_index = pc.Index("my-index")
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# Embeddings are written to Pinecone rather than kept in memory.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

最佳实践

1. 合理设置分块大小

python
from llama_index.core.node_parser import SentenceSplitter

# Larger chunks (1024 chars) keep more context per node; the 200-char overlap
# preserves continuity across chunk boundaries.
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
nodes = splitter.get_nodes_from_documents(documents)

2. 配置全局回调与文档存储

python
from llama_index.core import set_global_handler

# NOTE(review): set_global_handler("simple") enables basic global tracing of
# LLM/query events — it is an observability feature, not a cache; the heading
# above overstates what this snippet does.
set_global_handler("simple")

from llama_index.core import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore

# Explicit in-memory document store; pass this storage_context when building
# an index so parsed documents can be reused/persisted rather than re-parsed.
storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore()
)

3. 监控和调试

python
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler

# Debug handler prints a trace of each operation (LLM calls, retrieval steps)
# when the trace ends.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

# NOTE(review): this snippet relies on `Settings` and `query_engine` defined
# in earlier snippets; standalone use also needs
# `from llama_index.core import Settings`.
Settings.callback_manager = callback_manager

response = query_engine.query("测试查询")

小结

LlamaIndex是构建RAG应用的强大工具,通过本章节的学习,你应该掌握了:

  1. 数据加载 - 使用各种数据加载器导入数据
  2. 索引创建 - 理解不同索引类型的应用场景
  3. 查询引擎 - 配置和使用查询引擎
  4. 高级功能 - 自定义提示词、元数据过滤等
  5. 向量数据库集成 - 与Chroma、Pinecone等集成

LlamaIndex特别适合需要处理大量私有数据的场景,如企业知识库、文档问答系统等。建议结合实际项目需求,选择合适的索引类型和查询策略。