Document Processing and Chunking

Overview

Document processing and chunking form the foundation of a RAG system and directly determine retrieval quality and generation quality. A well-chosen chunking strategy preserves semantic integrity and improves retrieval accuracy, while avoiding the cost and performance problems caused by overly long contexts.

Document Loading

Common Document Formats

| Format | Loading Tool | Notes |
|---|---|---|
| PDF | PyPDF2, pdfplumber | Tables and images need special handling |
| Word | python-docx | Preserves formatting information |
| Markdown | Direct file read | Structure is already explicit |
| HTML | BeautifulSoup | Tags must be stripped |
| CSV/Excel | pandas | Structured data |
| JSON | json (standard library) | Hierarchical structure |
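
PDF is the trickiest of these formats. Here is a minimal sketch of pulling both text and tables out of a PDF directly with pdfplumber (assuming the package is installed; the file name is a placeholder):

python
import pdfplumber

# Open the PDF and walk through its pages.
with pdfplumber.open("document.pdf") as pdf:
    for page in pdf.pages:
        # Plain text, in the reading order pdfplumber infers.
        text = page.extract_text() or ""
        # Tables come back as lists of rows (lists of cell strings).
        tables = page.extract_tables()
        print(f"page {page.page_number}: {len(text)} chars, {len(tables)} tables")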

Loading Documents with LangChain

python
from langchain.document_loaders import (
    PyPDFLoader,
    Docx2txtLoader,
    UnstructuredMarkdownLoader,
    CSVLoader,
    DirectoryLoader
)

# PDF: returns one Document per page.
pdf_loader = PyPDFLoader("document.pdf")
pdf_docs = pdf_loader.load()

# Word (.docx), extracted via docx2txt.
docx_loader = Docx2txtLoader("document.docx")
docx_docs = docx_loader.load()

# Markdown, parsed by the unstructured package.
md_loader = UnstructuredMarkdownLoader("document.md")
md_docs = md_loader.load()

# CSV: returns one Document per row by default.
csv_loader = CSVLoader("data.csv")
csv_docs = csv_loader.load()

# Batch-load every PDF under a directory tree.
directory_loader = DirectoryLoader(
    "./documents",
    glob="**/*.pdf",
    loader_cls=PyPDFLoader
)
all_docs = directory_loader.load()

Custom Document Loaders

python
from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

class CustomLoader(BaseLoader):
    """Minimal loader: read a UTF-8 text file into a single Document."""

    def __init__(self, file_path):
        self.file_path = file_path
    
    def load(self):
        with open(self.file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        # Metadata travels with the chunk all the way into the vector store.
        metadata = {
            "source": self.file_path,
            "type": "custom"
        }
        
        return [Document(page_content=text, metadata=metadata)]
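
Usage matches the built-in loaders (the file name is a placeholder):

python
docs = CustomLoader("notes.txt").load()
print(docs[0].metadata)  # {'source': 'notes.txt', 'type': 'custom'}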

Text Chunking Strategies

Why Chunking Is Necessary

  1. Model limits: LLMs have a bounded context window
  2. Retrieval precision: smaller chunks match a query more precisely
  3. Cost control: avoid spending tokens on irrelevant text
  4. Semantic integrity: well-sized chunks stay semantically coherent

Comparison of Chunking Methods

| Method | Principle | Pros | Cons |
|---|---|---|---|
| Fixed-length | Split by character count | Simple and fast | May cut across semantic units |
| Recursive character | Split by a hierarchy of separators | Preserves structure | Needs parameter tuning |
| Semantic | Split on semantic similarity | Semantically complete chunks | Computationally expensive |
| Document structure | Split by paragraphs/sections | Clear structure | Requires parsing |

1. Fixed-Length Chunking

python
from langchain.text_splitter import CharacterTextSplitter

# Splits only on the single separator given; if a paragraph is longer
# than chunk_size, the resulting chunk can exceed chunk_size (LangChain
# logs a "Created a chunk of size ..." warning in that case).
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

chunks = text_splitter.split_text(text)  # `text` is the raw document string

2. Recursive Character Chunking (Recommended)

python
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Tries each separator in order, falling back to the next one whenever a
# piece is still larger than chunk_size. The Chinese sentence-ending
# punctuation is included so chunks break at sentence boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", "。", "!", "?", ";", " ", ""]
)

chunks = text_splitter.split_documents(documents)
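
To sanity-check the settings, it helps to look at one chunk boundary and confirm the overlap actually carries context across it. A quick sketch using the chunks produced above:

python
# Compare the tail of one chunk with the head of the next: with
# chunk_overlap=200 they should share up to ~200 characters.
for a, b in zip(chunks, chunks[1:]):
    print("end of chunk:  ", a.page_content[-80:])
    print("start of next: ", b.page_content[:80])
    break  # one boundary is enough for a spot check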

3. Semantic Chunking

python
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import OpenAIEmbeddings

# Embeds sentences and starts a new chunk wherever the embedding distance
# between neighbors crosses a threshold (here: a percentile of all distances).
semantic_splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile"
)

chunks = semantic_splitter.split_text(text)
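
SemanticChunker supports other breakpoint strategies besides percentiles; to the best of my knowledge langchain_experimental also accepts "standard_deviation" and "interquartile" (verify against your installed version). A sketch of the standard-deviation variant:

python
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import OpenAIEmbeddings

# Break wherever the neighbor distance exceeds the mean by a multiple
# of the standard deviation, instead of a fixed percentile.
semantic_splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="standard_deviation",
    breakpoint_threshold_amount=3.0,  # assumed knob; check your version's docs
)
chunks = semantic_splitter.split_text(text)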

4. Document-Structure-Based Chunking

python
from langchain.text_splitter import MarkdownHeaderTextSplitter

markdown_text = """
# Chapter 1: Overview

This is the content of Chapter 1.

## 1.1 Background

Background introduction.

## 1.2 Goals

Goal description.
"""

# Map each header level to a metadata key on the resulting chunks.
headers_to_split_on = [
    ("#", "header1"),
    ("##", "header2"),
    ("###", "header3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

chunks = markdown_splitter.split_text(markdown_text)
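
Header-based and size-based splitting compose naturally: split on headers first so each chunk inherits its section titles as metadata, then recursively split any section that is still too large. A sketch reusing the splitter defined above:

python
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Stage 1: split on headers (header text ends up in chunk metadata).
sections = markdown_splitter.split_text(markdown_text)

# Stage 2: size-based split within each section; metadata is preserved.
size_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = size_splitter.split_documents(sections)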

Choosing the Chunk Size

Influencing Factors

| Factor | Small chunks | Large chunks |
|---|---|---|
| Retrieval precision | High; matches queries exactly | Lower; may pull in irrelevant text |
| Context completeness | Low; background may be missing | High; full context is preserved |
| Token cost | Low | High |
| Response speed | Fast | Slower |
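
When in doubt, measuring is cheap: split the same corpus at a few candidate sizes and compare chunk counts and average lengths before committing. A sketch, assuming `text` holds a sample document:

python
from langchain.text_splitter import RecursiveCharacterTextSplitter

for size in (200, 500, 1000, 2000):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=size // 5,  # keep overlap at ~20% of the chunk size
    )
    pieces = splitter.split_text(text)
    avg = sum(len(p) for p in pieces) / len(pieces)
    print(f"chunk_size={size}: {len(pieces)} chunks, avg {avg:.0f} chars")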

Rules of Thumb

python
# Empirical starting points by content type (tune per corpus).
chunk_sizes = {
    "short-form QA": {"chunk_size": 200, "overlap": 50},
    "technical docs": {"chunk_size": 1000, "overlap": 200},
    "long reports": {"chunk_size": 2000, "overlap": 400},
    "code files": {"chunk_size": 1500, "overlap": 100},
    "chat transcripts": {"chunk_size": 500, "overlap": 100}
}
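
One way to wire these presets into the splitter; splitter_for is an illustrative helper, not a library API:

python
from langchain.text_splitter import RecursiveCharacterTextSplitter

def splitter_for(content_type):
    # Fall back to the technical-docs preset for unknown content types.
    cfg = chunk_sizes.get(content_type, chunk_sizes["technical docs"])
    return RecursiveCharacterTextSplitter(
        chunk_size=cfg["chunk_size"],
        chunk_overlap=cfg["overlap"],
    )

chunks = splitter_for("chat transcripts").split_text(text)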

Dynamic Chunking Strategy

python
from langchain.text_splitter import RecursiveCharacterTextSplitter

def dynamic_chunk_size(text, base_size=1000):
    """Scale the chunk size with the document's length."""
    text_length = len(text)
    
    if text_length < 500:
        # Short documents become a single chunk (guard against empty input).
        return max(text_length, 1)
    elif text_length < 5000:
        return base_size
    else:
        return int(base_size * 1.5)  # long documents get larger chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=dynamic_chunk_size(text),
    chunk_overlap=200
)

Metadata Extraction

Common Metadata Fields

python
metadata = {
    "source": "document.pdf",    # originating file
    "page": 1,                   # page number within the source
    "chunk_id": "chunk_001",
    "chunk_index": 0,
    "total_chunks": 10,
    "file_type": "pdf",
    "created_at": "2024-01-01",
    "author": "Author Name",
    "title": "Document Title",
    "category": "Technical documentation"
}

Populating Metadata Automatically

python
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("document.pdf")
documents = loader.load()  # PyPDFLoader already sets "source" and "page"

# Stamp each document with its position in the sequence.
for i, doc in enumerate(documents):
    doc.metadata["chunk_id"] = f"chunk_{i:03d}"
    doc.metadata["chunk_index"] = i
    doc.metadata["total_chunks"] = len(documents)

Extracting Metadata with an LLM

python
import json

from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

llm = OpenAI(temperature=0)

extract_prompt = PromptTemplate.from_template("""
Extract the key information from the following text:
- Topic
- Keywords (3-5)
- Summary (one sentence)

Text: {text}

Return the result as JSON.
""")

def extract_metadata(text):
    # Only the first 500 characters are sent, to keep token usage low.
    response = llm(extract_prompt.format(text=text[:500]))
    # Parse the JSON instead of using eval(): eval executes arbitrary code
    # and fails on valid JSON literals such as true/false/null.
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        return {}  # the model did not return valid JSON

Text Cleaning

Cleaning Steps

python
import re

def clean_text(text):
    # Collapse runs of spaces/tabs but keep newlines, so paragraph
    # boundaries survive for the "\n\n" separator used when splitting.
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Strip control characters (excluding tab and newline).
    text = re.sub(r'[\x00-\x08\x0b-\x1f\x7f-\x9f]', '', text)
    
    return text.strip()

def remove_headers_footers(text):
    # Anchor patterns to whole lines, otherwise a pattern like r'\s*\d+\s*'
    # would delete every run of digits anywhere in the text.
    patterns = [
        r'^\s*\d+\s*$',    # bare page numbers on their own line
        r'^Page \d+.*$',   # "Page N" footers
        r'^版权所有.*$',    # Chinese "all rights reserved" notices
    ]
    
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    
    return text

Handling Special Content

python
def process_code_blocks(text):
    # Pull fenced code blocks out and replace them with placeholders so
    # the splitter cannot cut a block in half.
    code_pattern = r'```[\s\S]*?```'
    codes = re.findall(code_pattern, text)
    
    for i, code in enumerate(codes):
        placeholder = f"[CODE_BLOCK_{i}]"
        text = text.replace(code, placeholder, 1)
    
    return text, codes

def process_tables(text):
    # Detect Markdown tables by their header row plus separator row.
    # (Detection only: the text itself is returned unchanged.)
    table_pattern = r'\|[^\n]+\|[\n\r]+\|[-:\s|]+\|'
    tables = re.findall(table_pattern, text)
    
    return text, tables
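
The placeholders have to be swapped back after splitting. A minimal sketch; restore_code_blocks is a helper introduced here, not part of any library:

python
def restore_code_blocks(chunks, codes):
    # Replace each [CODE_BLOCK_i] placeholder with the original block.
    for chunk in chunks:
        for i, code in enumerate(codes):
            chunk.page_content = chunk.page_content.replace(
                f"[CODE_BLOCK_{i}]", code
            )
    return chunks

Note that restoring a long block can push a chunk well past the target size; an alternative design is to index each code block as its own chunk.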

Complete Processing Pipeline

python
import re

from langchain.document_loaders import PyPDFLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

class DocumentProcessor:
    def __init__(self, chunk_size=1000, chunk_overlap=200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", "。", "!", "?", ";", " ", ""]
        )
    
    def load_document(self, file_path):
        # Pick a loader by file extension; extend as needed.
        if file_path.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif file_path.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            raise ValueError(f"Unsupported file type: {file_path}")
        
        return loader.load()
    
    def clean_text(self, text):
        # Keep newlines so the "\n\n" / "\n" separators still match.
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'[\x00-\x08\x0b-\x1f\x7f-\x9f]', '', text)
        return text.strip()
    
    def process(self, file_path):
        documents = self.load_document(file_path)
        
        # Clean in place before splitting.
        for doc in documents:
            doc.page_content = self.clean_text(doc.page_content)
        
        chunks = self.text_splitter.split_documents(documents)
        
        # Stamp each chunk with a stable identifier.
        for i, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = f"chunk_{i:04d}"
            chunk.metadata["chunk_index"] = i
        
        return chunks

processor = DocumentProcessor(chunk_size=1000, chunk_overlap=200)
chunks = processor.process("document.pdf")
print(f"Generated {len(chunks)} chunks")

Summary

Document processing and chunking are the bedrock of a RAG system. Choosing a chunking strategy means balancing retrieval precision, context completeness, and cost. Recursive character chunking is the most widely used method; semantic chunking suits scenarios that demand the highest quality. Sensible metadata extraction and text cleaning noticeably improve downstream retrieval.

The next chapter covers selecting and using a vector database, the key component for storing and retrieving text embeddings.