Skip to content

Spring Boot 可观测性

1. 可观测性概述

1.1 三大支柱

| 支柱 | 说明 | 工具 |
| --- | --- | --- |
| 日志(Logs) | 记录事件和错误 | Logback、ELK |
| 指标(Metrics) | 量化系统状态 | Micrometer、Prometheus |
| 追踪(Traces) | 请求链路追踪 | Zipkin、Jaeger |

1.2 Micrometer 简介

Micrometer 是一个度量门面,为 Java 应用提供与供应商无关的指标收集接口。

2. Actuator 监控

2.1 添加依赖

xml
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-actuator</artifactId>
</dependency>

2.2 配置端点

yaml
management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics,prometheus,loggers,env
      base-path: /actuator
  endpoint:
    health:
      show-details: always
      probes:
        enabled: true
    loggers:
      enabled: true
    prometheus:
      enabled: true
  info:
    env:
      enabled: true
    java:
      enabled: true
    os:
      enabled: true

2.3 常用端点

| 端点 | 说明 |
| --- | --- |
| /actuator/health | 健康检查 |
| /actuator/info | 应用信息 |
| /actuator/metrics | 指标列表 |
| /actuator/metrics/{指标名} | 具体指标 |
| /actuator/prometheus | Prometheus 格式 |
| /actuator/loggers | 日志配置 |
| /actuator/env | 环境变量 |

2.4 自定义健康检查

java
/**
 * Health indicator that reports UP when a connection can be borrowed from the
 * injected {@link DataSource} and passes {@link Connection#isValid(int)}.
 */
@Component
public class DatabaseHealthIndicator implements HealthIndicator {

    /** Seconds the driver may spend on the validity check. */
    private static final int VALIDATION_TIMEOUT_SECONDS = 1;

    private final DataSource dataSource;

    /**
     * @param dataSource the pool/data source whose connectivity is probed
     */
    public DatabaseHealthIndicator(DataSource dataSource) {
        this.dataSource = dataSource;
    }

    /**
     * Borrows one connection, validates it and returns it to the pool.
     *
     * @return UP with details when the connection is valid; DOWN with the
     *         exception or a reason otherwise
     */
    @Override
    public Health health() {
        try (Connection connection = dataSource.getConnection()) {
            if (connection.isValid(VALIDATION_TIMEOUT_SECONDS)) {
                return Health.up()
                    .withDetail("database", "MySQL")
                    // The check is Connection.isValid(), not a SQL statement.
                    .withDetail("validationQuery", "isValid()")
                    .build();
            }
            // Give operators a hint instead of a bare DOWN status.
            return Health.down()
                .withDetail("reason", "connection validation failed")
                .build();
        } catch (SQLException e) {
            return Health.down()
                .withException(e)
                .build();
        }
    }
}

@Component
public class RedisHealthIndicator implements HealthIndicator {
    
    private final RedisTemplate<String, Object> redisTemplate;
    
    @Override
    public Health health() {
        try {
            String pong = redisTemplate.getConnectionFactory()
                .getConnection()
                .ping();
            return Health.up()
                .withDetail("redis", pong)
                .build();
        } catch (Exception e) {
            return Health.down()
                .withException(e)
                .build();
        }
    }
}

3. 指标收集

3.1 添加依赖

xml
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-registry-prometheus</artifactId>
</dependency>

3.2 自定义指标

java
/**
 * Order service instrumented with a counter, a timer and a gauge.
 *
 * NOTE(review): {@code orderRepository} and {@code doCreateOrder} are used below
 * but not declared in this snippet — confirm how they are provided.
 */
@Service
public class OrderService {
    
    private final Counter orderCounter;
    private final Timer orderTimer;
    private final Gauge orderGauge;
    
    /**
     * Registers the three meters against the given registry.
     *
     * @param registry registry the meters are registered with
     */
    public OrderService(MeterRegistry registry) {
        this.orderCounter = Counter.builder("orders.created")
            .description("订单创建数量")
            .tag("type", "online")
            .register(registry);
        
        this.orderTimer = Timer.builder("orders.processing.time")
            .description("订单处理时间")
            .register(registry);
        
        // The gauge value comes from getPendingOrders(), which queries the repository.
        this.orderGauge = Gauge.builder("orders.pending", this::getPendingOrders)
            .description("待处理订单数")
            .register(registry);
    }
    
    /**
     * Creates an order while recording the elapsed time.
     *
     * @param request the order to create
     * @return the created order
     */
    public Order createOrder(OrderRequest request) {
        return orderTimer.record(() -> {
            Order order = doCreateOrder(request);
            // Increment only after creation succeeded, so a failing request
            // does not inflate "orders.created".
            orderCounter.increment();
            return order;
        });
    }
    
    /** Supplies the current number of pending orders for the gauge. */
    private int getPendingOrders() {
        return orderRepository.countByStatus(OrderStatus.PENDING);
    }
}

3.3 指标类型

java
/**
 * Adds the {@code application} and {@code env} tags to every meter of the registry.
 */
@Configuration
public class MetricsConfig {

    /**
     * @return a customizer that applies the common tags to the registry configuration
     */
    @Bean
    public MeterRegistryCustomizer<MeterRegistry> metricsCommonTags() {
        return registry -> {
            // Key/value pairs are passed in a single call: key, value, key, value.
            registry.config().commonTags(
                "application", "myapp",
                "env", "prod");
        };
    }
}

/**
 * Demonstrates the four basic Micrometer meter types: counter, gauge, timer
 * and distribution summary.
 */
@Service
public class MetricsService {
    
    private final MeterRegistry registry;
    
    /**
     * State observed by the "queue.size" gauge. It is a field because a gauge
     * does not keep its state object alive; a method-local object would be
     * garbage collected and the gauge would then report NaN.
     */
    private final AtomicInteger queueSize = new AtomicInteger(0);
    
    /**
     * @param registry registry all meters of this service are registered with
     */
    public MetricsService(MeterRegistry registry) {
        this.registry = registry;
    }
    
    /** Increments the "api.requests" counter for {@code GET /users}. */
    public void recordCounter() {
        Counter counter = Counter.builder("api.requests")
            .tag("endpoint", "/users")
            .tag("method", "GET")
            .description("API 请求计数")
            .register(registry);
        
        counter.increment();
    }
    
    /** Registers the "queue.size" gauge on {@link #queueSize} and sets it to 100. */
    public void recordGauge() {
        Gauge.builder("queue.size", queueSize, AtomicInteger::get)
            .description("队列大小")
            .register(registry);
        
        queueSize.set(100);
    }
    
    /** Times a (placeholder) operation with the "operation.duration" timer. */
    public void recordTimer() {
        Timer timer = Timer.builder("operation.duration")
            .description("操作耗时")
            .register(registry);
        
        timer.record(() -> {
            // perform the operation
        });
    }
    
    /** Records a 1024-byte sample in the "response.size" distribution summary. */
    public void recordSummary() {
        DistributionSummary summary = DistributionSummary.builder("response.size")
            .description("响应大小")
            .baseUnit("bytes")
            .register(registry);
        
        summary.record(1024);
    }
}

3.4 AOP 指标

java
/**
 * Around-advice that times methods annotated with {@code @Timed} and records the
 * duration in a timer named after the annotation value ({@code <value>.error}
 * when the method throws).
 */
@Aspect
@Component
public class MetricsAspect {
    
    private final MeterRegistry registry;
    
    /**
     * @param registry registry the timers are registered with
     */
    public MetricsAspect(MeterRegistry registry) {
        this.registry = registry;
    }
    
    /**
     * Times the intercepted invocation.
     *
     * @param joinPoint the intercepted method invocation
     * @return whatever the intercepted method returns
     * @throws Throwable whatever the intercepted method throws, unchanged
     */
    @Around("@annotation(Timed)")
    public Object timeMethod(ProceedingJoinPoint joinPoint) throws Throwable {
        Timed timed = resolveTimed(joinPoint);
        if (timed == null) {
            // No metadata to name the timer with: run the method untimed
            // instead of failing the business call with an NPE.
            return joinPoint.proceed();
        }
        
        Timer.Sample sample = Timer.start(registry);
        Object result;
        try {
            result = joinPoint.proceed();
        } catch (Throwable t) {
            // Throwable, not Exception: Errors should hit the error timer too.
            sample.stop(Timer.builder(timed.value() + ".error")
                .register(registry));
            throw t;
        }
        // Stopped outside the try so a failure while recording is not
        // mistaken for a failure of the timed method.
        sample.stop(Timer.builder(timed.value())
            .description(timed.description())
            .register(registry));
        return result;
    }
    
    /**
     * Finds the {@code @Timed} annotation for the invocation: first on the
     * signature's method, then on the same method of the target's class (the
     * signature may describe an interface method that lacks the annotation).
     */
    private static Timed resolveTimed(ProceedingJoinPoint joinPoint) {
        java.lang.reflect.Method method =
            ((MethodSignature) joinPoint.getSignature()).getMethod();
        Timed timed = method.getAnnotation(Timed.class);
        Object target = joinPoint.getTarget();
        if (timed != null || target == null) {
            return timed;
        }
        try {
            return target.getClass()
                .getMethod(method.getName(), method.getParameterTypes())
                .getAnnotation(Timed.class);
        } catch (NoSuchMethodException ignored) {
            // Not a public method of the target class; treat as "not annotated".
            return null;
        }
    }
}

/**
 * Marks a method whose execution time should be recorded.
 * Retained at runtime and applicable to methods only.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.METHOD})
public @interface Timed {

    /** Name of the timer; required. */
    String value();

    /** Optional human-readable description; empty by default. */
    String description() default "";
}

/**
 * Example service whose lookup method carries the {@code @Timed} annotation.
 *
 * NOTE(review): {@code userRepository} is used below but not declared in this
 * snippet — confirm how it is provided.
 */
@Service
public class UserService {
    
    /**
     * Looks up a user by id.
     *
     * @param id identifier passed to the repository lookup
     * @return the user, or {@code null} when the lookup result is empty
     */
    @Timed(value = "user.find", description = "查找用户耗时")
    public User findById(Long id) {
        return userRepository.findById(id).orElse(null);
    }
}

4. 分布式追踪

4.1 添加依赖

xml
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-tracing-bridge-brave</artifactId>
</dependency>
<dependency>
    <groupId>io.zipkin.reporter2</groupId>
    <artifactId>zipkin-reporter-brave</artifactId>
</dependency>

4.2 配置追踪

yaml
management:
  tracing:
    enabled: true
    sampling:
      probability: 1.0
  zipkin:
    tracing:
      endpoint: http://localhost:9411/api/v2/spans

spring:
  application:
    name: myapp

4.3 自定义 Span

java
/**
 * Order service showing manual span creation and the annotation-based
 * {@code @NewSpan} / {@code @SpanTag} style.
 *
 * NOTE(review): {@code tracer} has no initialiser here, and {@code paymentService},
 * {@code orderRepository} and {@code doCreateOrder} are not declared in this
 * snippet — confirm how they are wired.
 */
@Service
public class OrderService {
    
    private final Tracer tracer;
    
    /**
     * Creates an order inside a manually managed "create-order" span.
     *
     * @param request the order to create
     * @return the created order
     */
    public Order createOrder(OrderRequest request) {
        Span span = tracer.nextSpan().name("create-order");
        
        try (Tracer.SpanInScope ws = tracer.withSpan(span.start())) {
            span.tag("order.type", request.getType());
            
            Order order = doCreateOrder(request);
            
            // Emit the event only once the order really exists.
            span.event("order.created");
            // String.valueOf tolerates a null id instead of throwing an NPE.
            span.tag("order.id", String.valueOf(order.getId()));
            return order;
        } catch (RuntimeException | Error e) {
            // Record the failure on the span so the trace shows it as errored.
            span.error(e);
            throw e;
        } finally {
            span.end();
        }
    }
    
    /**
     * Processes the payment for an order; the span is created by {@code @NewSpan}.
     *
     * @param orderId id of the order to pay
     * @return the payment returned by the payment service
     */
    @NewSpan("process-payment")
    public Payment processPayment(Long orderId) {
        return paymentService.process(orderId);
    }
    
    /**
     * Loads an order; both arguments are attached to the span as tags.
     *
     * @param source  tagged as {@code order.source}
     * @param orderId tagged as {@code order.id}
     * @return the order, or {@code null} when the lookup result is empty
     */
    @NewSpan
    public Order getOrder(@SpanTag("order.source") String source,
                          @SpanTag("order.id") Long orderId) {
        // @SpanTag belongs on the parameter whose value becomes the tag,
        // not on the method.
        return orderRepository.findById(orderId).orElse(null);
    }
}

4.4 追踪日志

yaml
logging:
  pattern:
    level: "%5p [${spring.application.name:},%X{traceId:-},%X{spanId:-}]"
java
/**
 * Minimal service that emits one INFO log line.
 */
@Slf4j
@Service
public class TracedService {

    /** Message written by {@link #tracedMethod()}. */
    private static final String TRACED_MESSAGE = "这条日志会包含 traceId 和 spanId";

    /** Logs {@link #TRACED_MESSAGE} at INFO level. */
    public void tracedMethod() {
        log.info(TRACED_MESSAGE);
    }
}

5. 日志管理

5.1 日志配置

yaml
logging:
  level:
    root: INFO
    com.example: DEBUG
  file:
    name: logs/application.log
  # Rotation is configured under logging.logback.rollingpolicy.*; the former
  # logging.file.max-size / logging.file.max-history keys were deprecated in
  # Spring Boot 2.4 and later removed.
  logback:
    rollingpolicy:
      max-file-size: 10MB
      max-history: 30
  pattern:
    file: "%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] [%X{traceId:-},%X{spanId:-}] %-5level %logger{36} - %msg%n"
    console: "%d{yyyy-MM-dd HH:mm:ss.SSS} %highlight(%-5level) [%thread] [%X{traceId:-},%X{spanId:-}] %cyan(%logger{36}) - %msg%n"

5.2 结构化日志

xml
<dependency>
    <groupId>net.logstash.logback</groupId>
    <artifactId>logstash-logback-encoder</artifactId>
    <version>7.4</version>
</dependency>
xml
<!-- logback-spring.xml -->
<configuration>
    <appender name="JSON" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>logs/application.json</file>
        <encoder class="net.logstash.logback.encoder.LogstashEncoder">
            <includeMdcKeyName>traceId</includeMdcKeyName>
            <includeMdcKeyName>spanId</includeMdcKeyName>
            <customFields>{"app":"myapp"}</customFields>
        </encoder>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>logs/application.%d{yyyy-MM-dd}.json</fileNamePattern>
            <maxHistory>30</maxHistory>
        </rollingPolicy>
    </appender>
    
    <root level="INFO">
        <appender-ref ref="JSON"/>
    </root>
</configuration>

5.3 MDC 日志

java
/**
 * Servlet filter that guarantees a {@code traceId} entry in the logging MDC for
 * the duration of a request.
 */
@Component
public class TraceFilter implements Filter {
    
    private static final String TRACE_ID_KEY = "traceId";
    
    /**
     * Puts a random 32-character id into the MDC unless one is already present,
     * runs the rest of the chain, then removes only the entry it added.
     *
     * @throws IOException      propagated from the filter chain
     * @throws ServletException propagated from the filter chain
     */
    @Override
    public void doFilter(ServletRequest request, ServletResponse response, 
                        FilterChain chain) throws IOException, ServletException {
        if (MDC.get(TRACE_ID_KEY) != null) {
            // An id is already set (e.g. by a tracing integration); overwriting
            // it would break correlation between logs and traces.
            chain.doFilter(request, response);
            return;
        }
        
        MDC.put(TRACE_ID_KEY, UUID.randomUUID().toString().replace("-", ""));
        try {
            chain.doFilter(request, response);
        } finally {
            // Remove only our own key; MDC.clear() would also wipe entries
            // owned by other components on this thread.
            MDC.remove(TRACE_ID_KEY);
        }
    }
}

/**
 * Shows how to attach per-operation context to a log line through the MDC.
 */
@Slf4j
@Service
public class MdcService {

    private static final String USER_ID_KEY = "userId";
    private static final String OPERATION_KEY = "operation";

    /**
     * Adds the user id and operation to the MDC, logs one INFO line and then
     * removes both entries again, even if logging fails.
     */
    public void logWithContext() {
        MDC.put(USER_ID_KEY, "12345");
        MDC.put(OPERATION_KEY, "create-order");

        try {
            log.info("创建订单");
        } finally {
            removeContext();
        }
    }

    /** Removes the two entries added by {@link #logWithContext()}. */
    private static void removeContext() {
        MDC.remove(USER_ID_KEY);
        MDC.remove(OPERATION_KEY);
    }
}

6. Prometheus 集成

6.1 配置 Prometheus

yaml
# prometheus.yml
scrape_configs:
  - job_name: 'spring-boot'
    metrics_path: '/actuator/prometheus'
    static_configs:
      - targets: ['localhost:8080']

6.2 Grafana 仪表盘

java
/**
 * Adds the {@code application} and {@code instance} tags to every meter so
 * dashboards can filter by application and host.
 */
@Configuration
public class GrafanaConfig {
    
    /** Tag value used when the local host name cannot be resolved. */
    private static final String UNKNOWN_HOST = "unknown";
    
    /**
     * @return a customizer that applies the common tags to the registry
     */
    @Bean
    public MeterRegistryCustomizer<MeterRegistry> commonTags() {
        // Resolved once, outside the lambda: getLocalHost() throws a checked
        // exception that the customizer lambda is not allowed to propagate.
        String instance = resolveHostName();
        return registry -> registry.config()
            .commonTags("application", "myapp")
            .commonTags("instance", instance);
    }
    
    /** Returns the local host name, or {@link #UNKNOWN_HOST} when it cannot be resolved. */
    private static String resolveHostName() {
        try {
            return InetAddress.getLocalHost().getHostName();
        } catch (java.net.UnknownHostException e) {
            // A missing host name must not prevent the application from starting.
            return UNKNOWN_HOST;
        }
    }
}

6.3 告警规则

yaml
# alert.rules.yml
groups:
  - name: spring-boot-alerts
    rules:
      # Ratio of 5xx requests to all requests, so the 0.1 threshold really
      # means "10%" (a bare rate() would be an absolute per-second count).
      - alert: HighErrorRate
        expr: >-
          sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
          /
          sum(rate(http_server_requests_seconds_count[5m]))
          > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "高错误率"
          description: "错误率超过 10%"
      
      # Buckets are aggregated by "le" before computing the quantile.
      # The *_bucket series exist only when histogram publishing is enabled
      # for http.server.requests in the application.
      - alert: HighLatency
        expr: >-
          histogram_quantile(0.95,
          sum by (le) (rate(http_server_requests_seconds_bucket[5m])))
          > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "高延迟"
          description: "P95 延迟超过 1 秒"

7. 健康检查

7.1 Kubernetes 探针

yaml
management:
  endpoint:
    health:
      probes:
        enabled: true
  health:
    livenessstate:
      enabled: true
    readinessstate:
      enabled: true
yaml
# Kubernetes Deployment
livenessProbe:
  httpGet:
    path: /actuator/health/liveness
    port: 8080
  initialDelaySeconds: 30
  periodSeconds: 10

readinessProbe:
  httpGet:
    path: /actuator/health/readiness
    port: 8080
  initialDelaySeconds: 10
  periodSeconds: 5

7.2 自定义健康状态

java
@Component
public class ReadinessHealthIndicator implements HealthIndicator {
    
    private volatile boolean ready = false;
    
    @Override
    public Health health() {
        if (ready) {
            return Health.up().build();
        }
        return Health.down().build();
    }
    
    public void setReady(boolean ready) {
        this.ready = ready;
    }
}

8. 小结

本章学习了 Spring Boot 可观测性的核心内容:

| 内容 | 要点 |
| --- | --- |
| Actuator | 健康检查、端点配置 |
| 指标收集 | Counter、Gauge、Timer |
| 分布式追踪 | Span、Trace、Zipkin |
| 日志管理 | 结构化日志、MDC |
| Prometheus | 指标暴露、Grafana |
| 健康检查 | 自定义健康指示器、K8s 探针 |

下一章将学习 Spring Boot 部署。