日志管理与监控
2026/3/20 · 大约 8 分钟
日志管理与监控
日志配置详解
access_log 配置
http {
    # Syntax reference:
    # access_log path [format [buffer=size] [gzip[=level]] [flush=time] [if=condition]];

    # Define a named log format ("combined"-style).
    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" '
                    '"$http_user_agent" "$http_x_forwarded_for"';

    # Plain access log using the format above.
    access_log /var/log/nginx/access.log main;

    # Buffered logging — fewer write() calls, better throughput.
    access_log /var/log/nginx/access.log main buffer=32k flush=5s;

    # Gzip-compressed log (level 9) with buffering.
    access_log /var/log/nginx/access.log.gz main gzip=9 buffer=32k;

    server {
        # Per-vhost access log.
        access_log /var/log/nginx/example.access.log main;

        # Health-check endpoint: no logging at all.
        location /health {
            access_log off;
            return 200 "OK";
        }

        # Skip logging for static assets.
        location ~* \.(jpg|jpeg|png|gif|ico|css|js)$ {
            access_log off;
        }
    }
}
条件日志记录
http {
    # Decide by response status: skip 2xx/3xx, keep everything else.
    map $status $loggable {
        ~^[23]  0;  # 2xx and 3xx are not logged
        default 1;
    }

    # Skip well-known probe / health-check User-Agents.
    map $http_user_agent $log_ua {
        ~*kube-probe 0;
        ~*health     0;
        default      1;
    }

    # Combined condition: log only when BOTH maps evaluate to 1.
    map $loggable$log_ua $final_loggable {
        "11"    1;
        default 0;
    }

    server {
        # Conditional logging — use the combined condition, otherwise the
        # $final_loggable map above is dead configuration.
        access_log /var/log/nginx/access.log main if=$final_loggable;

        # Error-only log: $loggable is 1 exactly for non-2xx/3xx responses.
        access_log /var/log/nginx/error_requests.log main if=$loggable;
    }
}
error_log 配置
# 全局错误日志
error_log /var/log/nginx/error.log warn;
# 错误级别(从低到高):
# debug - 调试信息
# info - 一般信息
# notice - 需要注意的信息
# warn - 警告
# error - 错误
# crit - 严重错误
# alert - 必须立即处理
# emerg - 系统不可用
http {
error_log /var/log/nginx/http_error.log error;
server {
# 站点专用错误日志
error_log /var/log/nginx/example.error.log warn;
}
}
# 调试特定连接
events {
debug_connection 192.168.1.100;
debug_connection 192.168.1.0/24;
}
自定义日志格式
http {
# 标准格式
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';
# 详细格式(包含性能指标)
log_format detailed '$remote_addr - $remote_user [$time_local] '
'"$request" $status $body_bytes_sent '
'"$http_referer" "$http_user_agent" '
'rt=$request_time '
'uct="$upstream_connect_time" '
'uht="$upstream_header_time" '
'urt="$upstream_response_time" '
'cs=$upstream_cache_status';
# 带请求 ID 的格式(用于链路追踪)
log_format trace '$remote_addr - $remote_user [$time_local] '
'"$request" $status $body_bytes_sent '
'"$http_referer" "$http_user_agent" '
'request_id="$request_id" '
'trace_id="$http_x_trace_id"';
}
JSON 格式日志
http {
    # JSON access log — structured output for ELK/Loki-style pipelines.
    # escape=json makes nginx JSON-escape variable values; numeric fields
    # ($status, $body_bytes_sent, $request_time) are emitted unquoted.
    log_format json_log escape=json '{'
        '"time":"$time_iso8601",'
        '"remote_addr":"$remote_addr",'
        '"remote_user":"$remote_user",'
        '"request_method":"$request_method",'
        '"request_uri":"$request_uri",'
        '"server_protocol":"$server_protocol",'
        '"status":$status,'
        '"body_bytes_sent":$body_bytes_sent,'
        '"request_time":$request_time,'
        '"upstream_response_time":"$upstream_response_time",'
        '"upstream_connect_time":"$upstream_connect_time",'
        '"upstream_header_time":"$upstream_header_time",'
        '"http_referer":"$http_referer",'
        '"http_user_agent":"$http_user_agent",'
        '"http_x_forwarded_for":"$http_x_forwarded_for",'
        '"request_id":"$request_id",'
        '"upstream_cache_status":"$upstream_cache_status"'
    '}';

    server {
        access_log /var/log/nginx/access.json json_log;
    }
}
日志变量详解
| 变量 | 说明 |
|---|---|
$remote_addr | 客户端 IP 地址 |
$remote_user | Basic 认证用户名 |
$time_local | 本地时间(Common Log Format) |
$time_iso8601 | ISO 8601 格式时间 |
$request | 完整请求行 |
$request_method | 请求方法(GET、POST 等) |
$request_uri | 完整 URI(含参数) |
$uri | 当前 URI(可能被重写) |
$args | 查询参数 |
$status | 响应状态码 |
$body_bytes_sent | 发送的响应体大小 |
$bytes_sent | 发送的总字节数 |
$http_referer | Referer 头 |
$http_user_agent | User-Agent 头 |
$http_x_forwarded_for | X-Forwarded-For 头 |
$request_time | 请求处理时间(秒,毫秒精度) |
$upstream_response_time | 后端响应时间 |
$upstream_connect_time | 连接后端的时间 |
$upstream_header_time | 收到后端响应头的时间 |
$upstream_cache_status | 缓存状态(HIT/MISS/BYPASS 等) |
$request_id | 唯一请求 ID(16 个随机字节,32 位十六进制字符) |
$connection | 连接序号 |
$connection_requests | 当前连接的请求数 |
$msec | 毫秒时间戳 |
$pipe | 管道请求标识(. 或 p) |
$server_name | 匹配的 server_name |
$host | 请求的 Host |
$scheme | 协议(http/https) |
$ssl_protocol | SSL 协议版本 |
$ssl_cipher | SSL 加密套件 |
日志切割
logrotate 配置
# /etc/logrotate.d/nginx
/var/log/nginx/*.log {
daily # 每天切割
missingok # 日志不存在不报错
rotate 30 # 保留 30 天
compress # 压缩旧日志
delaycompress # 延迟压缩(保留最近一个不压缩)
notifempty # 空日志不切割
create 0640 nginx adm # 创建新日志的权限
sharedscripts # 所有日志切割后只执行一次脚本
postrotate
# 通知 Nginx 重新打开日志文件
if [ -f /var/run/nginx.pid ]; then
kill -USR1 `cat /var/run/nginx.pid`
fi
endscript
}
手动切割脚本
#!/bin/bash
# /usr/local/bin/nginx-log-rotate.sh
# Rotate nginx logs: move *.log aside with a timestamp suffix, signal the
# master process to reopen its files, then compress/expire old archives.

set -euo pipefail

readonly LOG_DIR="/var/log/nginx"
readonly BACKUP_DIR="/var/log/nginx/backup"
DATE=$(date +%Y%m%d%H%M)
readonly DATE

# Create the backup directory.
mkdir -p -- "$BACKUP_DIR"

# Move current log files aside; the -f test also guards the case where the
# glob matches nothing (it then stays literal).
for log in "$LOG_DIR"/*.log; do
    if [ -f "$log" ]; then
        mv -- "$log" "$BACKUP_DIR/$(basename -- "$log").$DATE"
    fi
done

# USR1 tells the nginx master process to reopen log files.
if [ -f /var/run/nginx.pid ]; then
    kill -USR1 "$(cat /var/run/nginx.pid)"
fi

# Compress archives older than one day; exclude *.gz so a second run does not
# try to gzip files that were already compressed.
find "$BACKUP_DIR" -name "*.log.*" ! -name "*.gz" -mtime +1 -exec gzip {} \;

# Drop compressed archives older than 30 days.
find "$BACKUP_DIR" -name "*.gz" -mtime +30 -delete

echo "$(date): Log rotation completed" >> /var/log/nginx-rotate.log
按时间自动切割
http {
# 使用变量在日志名中包含时间
map $time_iso8601 $log_date {
~^(?<ymd>\d{4}-\d{2}-\d{2}) $ymd;
}
server {
# 按天分割日志(需要设置 cron 任务重载配置)
access_log /var/log/nginx/access.$log_date.log main;
}
}
配合 cron 定时重载:
# 每天零点让 Nginx 重新打开日志文件(-s reopen,对应 USR1 信号)
0 0 * * * /usr/sbin/nginx -s reopen
日志分析
常用命令行分析
# 查看访问量最多的 IP
awk '{print $1}' access.log | sort | uniq -c | sort -rn | head -20
# 查看访问量最多的 URL
awk '{print $7}' access.log | sort | uniq -c | sort -rn | head -20
# 统计每秒请求数(QPS)
awk '{print $4}' access.log | cut -d: -f2-4 | uniq -c | sort -rn | head -20
# 统计 HTTP 状态码
awk '{print $9}' access.log | sort | uniq -c | sort -rn
# 查看 5xx 错误
awk '$9 ~ /^5/ {print $0}' access.log | tail -100
# 统计平均响应时间(需要 rt 字段)
awk -F'rt=' '{if($2)print $2}' access.log | awk -F' ' '{sum+=$1; count++} END {print sum/count}'
# 查看慢请求(响应时间超过 2 秒)
awk -F'rt=' '$2>2 {print $0}' access.log
# 统计带宽使用
awk '{sum+=$10} END {print sum/1024/1024 " MB"}' access.log
# 查看特定时间段的日志
awk '$4 >= "[27/Jan/2024:10:00" && $4 <= "[27/Jan/2024:11:00"' access.log
# 统计 User-Agent
awk -F'"' '{print $6}' access.log | sort | uniq -c | sort -rn | head -20
# 查看爬虫访问
grep -i "bot\|spider\|crawl" access.log | awk '{print $1}' | sort | uniq -c | sort -rn
GoAccess 实时分析
# 安装 GoAccess
# CentOS
sudo yum install -y goaccess
# Ubuntu
sudo apt install -y goaccess
# 终端实时分析
goaccess /var/log/nginx/access.log -c
# 生成 HTML 报告
goaccess /var/log/nginx/access.log -o /var/www/html/report.html --log-format=COMBINED
# 自定义日志格式
goaccess /var/log/nginx/access.log \
--log-format='%h %^[%d:%t %^] "%r" %s %b "%R" "%u" rt=%T' \
--date-format='%d/%b/%Y' \
--time-format='%H:%M:%S' \
-o report.html
# 实时 HTML 报告(WebSocket)
goaccess /var/log/nginx/access.log -o /var/www/html/report.html --real-time-html
ELK Stack 集成
Filebeat 配置:
# /etc/filebeat/filebeat.yml
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /var/log/nginx/access.log
    # Access log is emitted as JSON (see the nginx json_log format):
    # lift keys to the event root and flag parse errors.
    json.keys_under_root: true
    json.add_error_key: true
    fields:
      type: nginx-access
    fields_under_root: true

  - type: log
    enabled: true
    paths:
      - /var/log/nginx/error.log
    fields:
      type: nginx-error
    fields_under_root: true

# Filebeat allows exactly ONE active output — defining both elasticsearch and
# logstash outputs makes the configuration fail to load.
output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  index: "nginx-%{+yyyy.MM.dd}"
  # NOTE(review): a custom index usually also requires setup.template.name /
  # setup.template.pattern — confirm against your Filebeat version.

# Alternative: ship to Logstash instead. Uncomment these lines and remove
# the elasticsearch output above.
# output.logstash:
#   hosts: ["logstash:5044"]
Logstash 配置:
# /etc/logstash/conf.d/nginx.conf
input {
beats {
port => 5044
}
}
filter {
if [type] == "nginx-access" {
json {
source => "message"
}
date {
match => ["time", "ISO8601"]
target => "@timestamp"
}
geoip {
source => "remote_addr"
target => "geoip"
}
useragent {
source => "http_user_agent"
target => "user_agent"
}
}
}
output {
elasticsearch {
hosts => ["elasticsearch:9200"]
index => "nginx-%{+YYYY.MM.dd}"
}
}
监控方案
stub_status 模块
server {
    listen 80;
    server_name localhost;

    # Built-in status endpoint (ngx_http_stub_status_module).
    location /nginx_status {
        stub_status on;
        access_log off;

        # Restrict to loopback and the private LAN only.
        allow 127.0.0.1;
        allow 192.168.0.0/16;
        deny all;
    }
}
状态页面输出:
Active connections: 291
server accepts handled requests
16630948 16630948 31070465
Reading: 6 Writing: 179 Waiting: 106
| 指标 | 说明 |
|---|---|
| Active connections | 当前活动连接数 |
| accepts | 已接受的连接总数 |
| handled | 已处理的连接总数(= accepts 说明无丢弃) |
| requests | 已处理的请求总数 |
| Reading | 正在读取请求头的连接数 |
| Writing | 正在发送响应的连接数 |
| Waiting | 空闲 Keep-Alive 连接数 |
Prometheus + Grafana 监控
安装 nginx-prometheus-exporter:
# 下载
wget https://github.com/nginxinc/nginx-prometheus-exporter/releases/download/v0.11.0/nginx-prometheus-exporter_0.11.0_linux_amd64.tar.gz
tar -xzf nginx-prometheus-exporter_0.11.0_linux_amd64.tar.gz
# 运行
./nginx-prometheus-exporter -nginx.scrape-uri=http://localhost/nginx_status
Prometheus 配置:
# /etc/prometheus/prometheus.yml
scrape_configs:
- job_name: "nginx"
static_configs:
- targets: ["localhost:9113"]
metrics_path: /metrics
常用监控指标:
# 活动连接数
nginx_connections_active
# 接受的连接数
nginx_connections_accepted
# 处理的连接数
nginx_connections_handled
# 请求总数
nginx_http_requests_total
# 正在读取的连接
nginx_connections_reading
# 正在写入的连接
nginx_connections_writing
# 等待中的连接
nginx_connections_waiting
Grafana Dashboard 示例查询:
# QPS
rate(nginx_http_requests_total[5m])
# 活动连接数
nginx_connections_active
# 连接接受率
rate(nginx_connections_accepted[5m])
# 丢弃的连接数
nginx_connections_accepted - nginx_connections_handled
Zabbix 监控
Zabbix Agent 配置:
# /etc/zabbix/zabbix_agentd.d/nginx.conf
UserParameter=nginx.active,curl -s http://localhost/nginx_status | awk '/Active/ {print $3}'
UserParameter=nginx.accepts,curl -s http://localhost/nginx_status | awk 'NR==3 {print $1}'
UserParameter=nginx.handled,curl -s http://localhost/nginx_status | awk 'NR==3 {print $2}'
UserParameter=nginx.requests,curl -s http://localhost/nginx_status | awk 'NR==3 {print $3}'
UserParameter=nginx.reading,curl -s http://localhost/nginx_status | awk '/Reading/ {print $2}'
UserParameter=nginx.writing,curl -s http://localhost/nginx_status | awk '/Writing/ {print $4}'
UserParameter=nginx.waiting,curl -s http://localhost/nginx_status | awk '/Waiting/ {print $6}'
告警配置
常见告警规则
# Prometheus AlertManager 规则
groups:
- name: nginx
rules:
# 5xx 错误率超过 5%
- alert: NginxHighError5xxRate
expr: |
sum(rate(nginx_http_requests_total{status=~"5.."}[5m]))
/ sum(rate(nginx_http_requests_total[5m])) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "Nginx 5xx 错误率过高"
description: "5xx 错误率超过 5%,当前值: {{ $value | humanizePercentage }}"
# 活动连接数过高
- alert: NginxHighConnections
expr: nginx_connections_active > 10000
for: 5m
labels:
severity: warning
annotations:
summary: "Nginx 连接数过高"
description: "活动连接数超过 10000,当前值: {{ $value }}"
# Nginx 服务不可用
- alert: NginxDown
expr: nginx_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Nginx 服务不可用"
description: "Nginx 服务已停止运行"
# 请求延迟过高
- alert: NginxHighLatency
expr: |
histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[5m])) by (le)) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Nginx 请求延迟过高"
description: "P99 延迟超过 2 秒"
基于日志的告警脚本
#!/bin/bash
# /usr/local/bin/nginx-alert.sh
# Email an alert when the number of 5xx responses logged in the last
# TIME_WINDOW minutes exceeds ERROR_THRESHOLD.
# NOTE: the embedded awk program uses systime()/mktime(), which require
# GNU awk (gawk).

set -euo pipefail

LOG_FILE="/var/log/nginx/access.log"
ALERT_EMAIL="admin@example.com"
ERROR_THRESHOLD=100
TIME_WINDOW=5   # minutes

# Count 5xx entries whose timestamp falls within the window.
ERROR_COUNT=$(awk -v minutes="$TIME_WINDOW" '
BEGIN {
    now = systime()
    threshold = now - minutes * 60
}
{
    # $4 looks like "[27/Jan/2024:10:00:00" — strip brackets, then turn
    # both ":" and "/" into spaces so the fields split cleanly.
    gsub(/\[|\]/, "", $4)
    gsub(/:/, " ", $4)
    gsub(/\//, " ", $4)
    # Map the month abbreviation to its number.
    months = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec"
    split($4, parts, " ")   # parts: day month year hour min sec
    for (i = 1; i <= 12; i++) {
        if (parts[2] == substr(months, (i-1)*4+1, 3)) {
            parts[2] = i
            break
        }
    }
    timestamp = mktime(parts[3] " " parts[2] " " parts[1] " " parts[4] " " parts[5] " " parts[6])
    if (timestamp > threshold && $9 ~ /^5/) {
        count++
    }
}
END {
    print count + 0
}' "$LOG_FILE")

if [ "$ERROR_COUNT" -gt "$ERROR_THRESHOLD" ]; then
    echo "警告: 最近 ${TIME_WINDOW} 分钟内发生 ${ERROR_COUNT} 次 5xx 错误" | \
        mail -s "[Nginx Alert] 5xx 错误率过高" "$ALERT_EMAIL"
fi
实战:完整日志分析脚本
#!/bin/bash
# /usr/local/bin/nginx-log-analysis.sh
# Nginx access-log analysis: write a daily plain-text report with request
# totals, status-code breakdown, top IPs/URLs, recent 5xx lines and bandwidth.
# Usage: nginx-log-analysis.sh [access_log_path]

set -euo pipefail

LOG_FILE="${1:-/var/log/nginx/access.log}"
OUTPUT_DIR="/var/log/nginx/reports"
DATE=$(date +%Y%m%d)
REPORT="$OUTPUT_DIR/report_$DATE.txt"

mkdir -p -- "$OUTPUT_DIR"

# One grouped redirect instead of repeating ">> $REPORT" on every line.
# Field positions ($1 ip, $7 url, $9 status, $10 bytes) assume the
# "combined"-style main log format.
{
    echo "=== Nginx 日志分析报告 ==="
    echo "分析时间: $(date)"
    echo "日志文件: $LOG_FILE"
    echo ""

    # Total number of requests.
    TOTAL_REQUESTS=$(wc -l < "$LOG_FILE")
    echo "总请求数: $TOTAL_REQUESTS"

    echo ""
    echo "=== HTTP 状态码统计 ==="
    awk '{print $9}' "$LOG_FILE" | sort | uniq -c | sort -rn

    echo ""
    echo "=== Top 20 访问 IP ==="
    awk '{print $1}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -20

    echo ""
    echo "=== Top 20 访问 URL ==="
    awk '{print $7}' "$LOG_FILE" | sort | uniq -c | sort -rn | head -20

    echo ""
    echo "=== 5xx 错误详情(最近100条)==="
    awk '$9 ~ /^5/ {print $0}' "$LOG_FILE" | tail -100

    # Total bandwidth from the body-bytes-sent column.
    BANDWIDTH=$(awk '{sum+=$10} END {printf "%.2f MB", sum/1024/1024}' "$LOG_FILE")
    echo ""
    echo "总带宽使用: $BANDWIDTH"
} > "$REPORT"

# Tell the operator where the report landed.
echo "报告已生成: $REPORT"
总结
本章介绍了 Nginx 日志管理与监控:
- 日志配置:access_log、error_log、自定义格式、JSON 格式
- 日志变量:常用日志变量及其含义
- 日志切割:logrotate 配置、手动切割脚本
- 日志分析:命令行分析、GoAccess、ELK Stack
- 监控方案:stub_status、Prometheus+Grafana、Zabbix
- 告警配置:基于指标和日志的告警