Hugging Face生态
概述
Hugging Face是AI领域最重要的开源社区和平台,提供了丰富的预训练模型、数据集和工具。它不仅是模型托管平台,更是一个完整的AI开发生态系统,让开发者能够轻松获取、使用和部署各种AI模型。
Hugging Face的核心组件:
- Transformers库 - 统一的模型接口
- Model Hub - 数万个预训练模型
- Datasets库 - 海量开源数据集
- Spaces - 模型演示和部署平台
核心概念
Transformers库
Transformers是Hugging Face的核心库,提供了统一的API来使用各种预训练模型。
python
from transformers import pipeline

# With no model argument, the task's default checkpoint is downloaded on first use.
classifier = pipeline("sentiment-analysis")
result = classifier("I love learning AI!")
print(result)

模型架构
支持多种主流模型架构。
python
from transformers import AutoModel, AutoTokenizer

# Auto* classes resolve the correct architecture from the checkpoint name.
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
text = "你好,世界"
inputs = tokenizer(text, return_tensors="pt")  # "pt" -> PyTorch tensors
outputs = model(**inputs)

Pipeline
Pipeline是快速使用模型的便捷方式。
python
from transformers import pipeline

# Each task string selects a default checkpoint unless `model=` is given.
nlp_pipeline = pipeline("text-classification")
qa_pipeline = pipeline("question-answering")
summarizer = pipeline("summarization")
# NOTE(review): "translation_en_to_zh" has no default checkpoint on the Hub —
# pass an explicit model (e.g. Helsinki-NLP/opus-mt-en-zh) or this raises at load time.
translator = pipeline("translation_en_to_zh")
generator = pipeline("text-generation")

核心功能
文本分类
python
from transformers import pipeline

# Pin an explicit checkpoint so results are reproducible across library versions.
classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)
texts = [
    "This movie is amazing!",
    "I don't like this product.",
    "The service was okay."
]
results = classifier(texts)  # one result dict per input text
for text, result in zip(texts, results):
    print(f"文本:{text}")
    print(f"标签:{result['label']}, 置信度:{result['score']:.2f}\n")

命名实体识别
python
from transformers import pipeline

# grouped_entities=True merges word-piece tokens back into whole entities.
ner = pipeline("ner", grouped_entities=True)
text = "Apple was founded by Steve Jobs in Cupertino, California."
entities = ner(text)
for entity in entities:
    print(f"实体:{entity['word']}")
    print(f"类型:{entity['entity_group']}")
    print(f"置信度:{entity['score']:.2f}\n")

文本生成
python
from transformers import pipeline, set_seed

set_seed(42)  # make sampling reproducible
generator = pipeline(
    "text-generation",
    model="gpt2"
)
prompt = "Once upon a time"
results = generator(
    prompt,
    max_length=100,            # total length cap, prompt tokens included
    num_return_sequences=3,    # sample three continuations
    temperature=0.7            # <1.0 sharpens the sampling distribution
)
for i, result in enumerate(results, 1):
    print(f"生成 {i}:\n{result['generated_text']}\n")

问答系统
python
from transformers import pipeline

# Extractive QA: the answer is a span copied out of `context`.
qa = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2"
)
context = """
Python是一种广泛使用的高级编程语言,由Guido van Rossum于1991年创建。
Python的设计哲学强调代码的可读性和简洁性。
"""
questions = [
    "Python是什么时候创建的?",
    "谁创建了Python?",
    "Python的设计哲学是什么?"
]
for question in questions:
    result = qa(question=question, context=context)
    print(f"问题:{question}")
    print(f"答案:{result['answer']}")
    print(f"置信度:{result['score']:.2f}\n")

使用示例
示例1:中文文本处理
python
from transformers import BertTokenizer, BertForSequenceClassification
import torch

model_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE(review): the 2-label classification head is randomly initialized here
# (no fine-tuning), so the predicted sentiments are not meaningful until trained.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
texts = ["这个产品很好用", "这个产品质量很差"]
inputs = tokenizer(
    texts,
    padding=True,        # pad to the longest sequence in the batch
    truncation=True,
    return_tensors="pt"
)
with torch.no_grad():  # inference only; skip gradient bookkeeping
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
for text, pred in zip(texts, predictions):
    sentiment = "正面" if pred == 1 else "负面"
    print(f"文本:{text} -> 情感:{sentiment}")

示例2:机器翻译
python
from transformers import pipeline

translator = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-en-zh"
)
english_texts = [
    "Hello, how are you?",
    "Machine learning is fascinating.",
    "I love programming."
]
for text in english_texts:
    result = translator(text)  # returns a list with one dict per input
    print(f"英文:{text}")
    print(f"中文:{result[0]['translation_text']}\n")

示例3:文本摘要
python
from transformers import pipeline

summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn"
)
article = """
Artificial intelligence (AI) is intelligence demonstrated by machines,
as opposed to natural intelligence displayed by animals including humans.
AI research has been defined as the field of study of intelligent agents,
which refers to any system that perceives its environment and takes actions
that maximize its chance of achieving its goals. The term artificial intelligence
had previously been used to describe machines that mimic and display human cognitive
skills that are associated with the human mind, such as learning and problem-solving.
"""
summary = summarizer(
    article,
    max_length=100,
    min_length=30,
    do_sample=False  # greedy/beam decoding -> deterministic summary
)
print("原文:")
print(article[:200] + "...\n")
print("摘要:")
print(summary[0]['summary_text'])

示例4:图像分类
python
from transformers import pipeline
from PIL import Image

classifier = pipeline(
    "image-classification",
    model="google/vit-base-patch16-224"
)
image = Image.open("example.jpg")
results = classifier(image)
print("图像分类结果:")
for result in results[:5]:  # show the five highest-scoring labels
    print(f"{result['label']}: {result['score']:.2%}")

本地部署
模型下载
python
from transformers import AutoModel, AutoTokenizer

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Persist both the tokenizer files and the weights so they can be reloaded offline.
save_path = "./local_models/gpt2"
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)
print(f"模型已保存到:{save_path}")

离线使用
python
from transformers import AutoModel, AutoTokenizer

# Passing a local directory makes from_pretrained skip the Hub entirely.
local_model_path = "./local_models/gpt2"
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = AutoModel.from_pretrained(local_model_path)
text = "Hello, world!"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
print("离线模型运行成功!")

模型量化
python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# float16 halves the memory footprint; low_cpu_mem_usage streams weights in
# while loading instead of materializing a full fp32 copy first.
# NOTE(review): this is half precision, not true quantization (int8/int4).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
print(f"模型参数量:{model.num_parameters() / 1e6:.2f}M")

Datasets数据集
数据集加载
python
from datasets import load_dataset

dataset = load_dataset("imdb")  # returns a DatasetDict keyed by split name
print(dataset)
train_data = dataset['train']
print(f"训练集大小:{len(train_data)}")
print(f"样本示例:{train_data[0]}")

数据预处理
python
from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    # Pad/truncate every example to a fixed 512-token length so batch
    # tensors have a uniform shape.
    return tokenizer(
        examples['text'],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# batched=True passes chunks of examples at once -> much faster tokenization.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)

自定义数据集
python
from datasets import Dataset

# Build a Dataset directly from in-memory columns.
data = {
    "text": ["我喜欢这个产品", "这个产品很糟糕", "质量一般"],
    "label": [1, 0, 0]
}
dataset = Dataset.from_dict(data)
print(dataset)
# Hold out 20% of rows as a test split.
dataset = dataset.train_test_split(test_size=0.2)
print(dataset)

模型微调
准备训练数据
python
from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(examples):
    # Fixed-length padding keeps tensor shapes uniform for the Trainer.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

tokenized_datasets = dataset.map(tokenize, batched=True)

训练配置
python
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Fresh 2-label classification head on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # Trainer passes (logits, labels); reduce logits to class ids first.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): this argument was renamed to `eval_strategy` in newer
    # transformers releases; `evaluation_strategy` works on older versions.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=training_args,
    # Small .select() subsets keep this demo fast; drop them for a full run.
    # `tokenized_datasets` and `tokenizer` come from the previous snippet.
    train_dataset=tokenized_datasets["train"].select(range(1000)),
    eval_dataset=tokenized_datasets["test"].select(range(200)),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

最佳实践
1. 选择合适的模型
python
from transformers import AutoModel, AutoTokenizer
model_name = "bert-base-uncased"
print(f"模型:{model_name}")
print(f"参数量:约110M")
print(f"适用任务:文本分类、命名实体识别、问答等")2. 批处理优化
python
from transformers import pipeline
classifier = pipeline("sentiment-analysis", device=0)
texts = ["文本1", "文本2", "文本3"] * 100
results = classifier(texts, batch_size=32)
print(f"处理了 {len(results)} 个文本")3. 内存优化
python
from transformers import AutoModelForCausalLM
import torch

# Half-precision weights plus streamed loading keep peak RAM low.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
if torch.cuda.is_available():
    model = model.to("cuda")  # move to GPU only when one is present

小结
Hugging Face是AI开发的重要工具,通过本章节的学习,你应该掌握了:
- Transformers库 - 使用Pipeline快速调用模型
- 核心功能 - 文本分类、NER、生成、问答等
- 本地部署 - 模型下载和离线使用
- Datasets - 数据集加载和处理
- 模型微调 - 自定义训练流程
Hugging Face的优势在于其丰富的开源生态和统一的API,适合快速原型开发和学习。建议多探索Model Hub,找到适合自己任务的预训练模型。