共计 9825 个字符,预计需要花费 25 分钟才能阅读完成。

前面博主已将商品中心服务接入到skywalking,实现了链路追踪功能。而在运维过程中,我们还需要配置监控来触发告警,让故障信息尽快通知到相关人员进行分析,所以这里就给我们的服务加上监控和告警配置。
告警指标
对于skywalking的告警指标,默认定义在skywalking服务的config/oal/core.oal文件内
root@8e2113b4496c:/skywalking# cat config/oal/core.oal
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".
// Service scope metrics
service_resp_time = from(Service.latency).longAvg();
service_sla = from(Service.*).percent(status == true);
service_cpm = from(Service.*).cpm();
service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_apdex = from(Service.latency).apdex(name, status);
service_mq_consume_count = from(Service.*).filter(type == RequestType.MQ).count();
service_mq_consume_latency = from((str->long)Service.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();
// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();
// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_resp_time = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
endpoint_mq_consume_count = from(Endpoint.*).filter(type == RequestType.MQ).count();
endpoint_mq_consume_latency = from((str->long)Endpoint.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();
// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);
- service_resp_time #服务的响应时间
- service_sla #服务的http请求成功率SLA,比如99%等。
- service_cpm #表示每分钟的吞吐量.
- service_apdex #服务的应用性能指数(Apdex),例如0.8这样0.x形式的分值
- service_percentile #指定时间范围内服务响应时间的百分位数,即包含p99、p95、p90、p75、p50在内的数据统计结果
- endpoint_relation_cpm #端点的每分钟的吞吐量
- endpoint_relation_resp_time #端点的响应时间
- endpoint_relation_sla #端点的http请求成功率SLA,比如99%等。
- endpoint_relation_percentile #端点在指定时间范围内响应时间的百分位数,即包含p99、p95、p90、p75、p50在内的数据统计结果
如果指标不满足自己的业务的需求,可以参考上面去定制
告警规则
skywalking的告警配置文件为config/alarm-settings.yml
普通告警
- 规则名称:在告警信息中显示的唯一名称,必须以`_rule`结尾。
- metrics-name:度量名称,也是OAL脚本中的度量名。默认配置中可以用于告警的度量有:服务,实例,端点,服务关系,实例关系,端点关系。它只支持long,double和int类型。
- include-names:包含在此规则之内的实体名称列表。
- exclude-names:排除在此规则以外的实体名称列表。
- include-names-regex:提供一个正则表达式来包含实体名称。如果同时设置包含名称列表和包含名称的正则表达式,则两个规则都将生效。
- exclude-names-regex:提供一个正则表达式来排除实体名称。如果同时设置排除名称列表和排除名称的正则表达式,则两个规则都将生效。
- include-labels:包含在此规则之内的标签。
- exclude-labels:排除在此规则以外的标签。
- include-labels-regex:提供一个正则表达式来包含标签。如果同时设置包含标签列表和包含标签的正则表达式,则两个规则都将生效。
- exclude-labels-regex:提供一个正则表达式来排除标签。如果同时设置排除标签列表和排除标签的正则表达式,则两个规则都将生效。
- threshold:阈值。对于多个值指标,例如percentile,阈值是一个数组,用`value1, value2, value3, value4, value5`这样的形式描述。每个值可以作为度量中每个值的阈值。如果不想通过此值或某些值触发警报,则将值设置为`-`。例如在percentile中,value1是P50的阈值,value2是P75的阈值,那么`-, -, value3, value4, value5`的意思是,P50和P75不设阈值的percentile告警规则。
- op:操作符,支持`>`,`>=`,`<`,`<=`,`=`。
- period:多久告警规则需要被检查一下。这是一个时间窗口,与后端部署环境时间相匹配。
- count:在一个周期窗口中,如果按op计算超过阈值的次数达到count,则发送告警。
- only-as-condition:`true`或者`false`,指定规则是否可以发送告警,或者仅作为复合规则的条件。
- silence-period:在时间N中触发报警后,在N -> N + silence-period这段时间内不告警。默认情况下,它和period一样,这意味着相同的告警(同一个度量名称拥有相同的Id)在同一个周期内只会触发一次。
- message:该规则触发时,发送的通知消息。
下面是两条告警示例:
- 服务在2分钟内调用次数大于1,一次就触发,触发告警后静默2分钟
- 服务响应时间在2分钟内超过100ms,一次就触发,触发告警后静默2分钟
root@8e2113b4496c:/skywalking# cat config/alarm-settings.yml
rules:
service_cpm_rule:
# 服务调用次数
metrics-name: service_cpm
op: ">"
threshold: 1
period: 2
count: 1
silence-period: 2
message: 服务 {name} 访问次数大于1
# Rule unique name, must be ended with `_rule`.
service_resp_time_rule:
metrics-name: service_resp_time
op: ">"
threshold: 100
period: 2
count: 1
silence-period: 2
message: Response time of service {name} is more than 100ms in last 2 minutes.
# webhook
官方的一些样例规则
rules:
# Rule unique name, must be ended with `_rule`.
endpoint_percent_rule:
# Metrics value need to be long, double or int
metrics-name: endpoint_percent
threshold: 75
op: <
# The length of time to evaluate the metrics
period: 10
# How many times after the metrics match the condition, will trigger alarm
count: 3
# How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
silence-period: 10
# Specify if the rule can send notification or just as an condition of composite rule
only-as-condition: false
tags:
level: WARNING
service_percent_rule:
metrics-name: service_percent
# [Optional] Default, match all services in this metrics
include-names:
- service_a
- service_b
exclude-names:
- service_c
# Single value metrics threshold.
threshold: 85
op: <
period: 10
count: 4
only-as-condition: false
service_resp_time_percentile_rule:
# Metrics value need to be long, double or int
metrics-name: service_percentile
op: ">"
# Multiple value metrics threshold. Thresholds for P50, P75, P90, P95, P99.
threshold: 1000,1000,1000,1000,1000
period: 10
count: 3
silence-period: 5
message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
only-as-condition: false
meter_service_status_code_rule:
metrics-name: meter_status_code
exclude-labels:
- "200"
op: ">"
threshold: 10
period: 10
count: 3
silence-period: 5
message: The request number of entity {name} non-200 status is more than expected.
only-as-condition: false
读者可自行根据上面的用法编写适合自身业务的告警监控规则。注意:endpoint 规则相比 service、instance 规则会耗费更多内存及资源~
复合规则
复合规则就是通过逻辑表达式将多个规则组合起来进行判断:
composite-rules:
comp_rule:
# Must satisfied percent rule and resp time rule
expression: service_percent_rule && service_resp_time_percentile_rule
message: Service {name} successful rate is less than 80% and P50 of response time is over 1000ms
tags:
level: CRITICAL
- 规则名称:在告警信息中显示的唯一名称,必须以_rule结尾
- expression:指定如何组成规则,支持&&, ||, ()操作符
- message:该规则触发时,发送的通知消息
告警通知
钉钉告警
root@8e2113b4496c:/skywalking# cat config/alarm-settings.yml
# 略
dingtalkHooks:
textTemplate: |-
{
"msgtype": "text",
"text": {
"content": "Apache SkyWalking Alarm: \n %s."
}
}
webhooks:
- url: https://oapi.dingtalk.com/robot/send?access_token=ceb8f51dddddddddddaf640cae1db92
secret: SEC80939ddddddddd21bcb6e2652f
配置完规则重启服务,测试下接口访问

在告警或事件中可以看到触发信息

回到钉钉群中可以看到告警信息已推送过来

微信告警
微信告警配置样例
wechatHooks:
textTemplate: |-
{
"msgtype": "text",
"text": {
"content": "Apache SkyWalking Alarm: \n %s."
}
}
webhooks:
- https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=dummy_key
飞书告警
飞书告警样例配置
feishuHooks:
textTemplate: |-
{
"msg_type": "text",
"content": {
"text": "Apache SkyWalking Alarm: \n %s."
},
"ats":"feishu_user_id_1,feishu_user_id_2"
}
webhooks:
- url: https://open.feishu.cn/open-apis/bot/v2/hook/dummy_token
secret: dummysecret
参考文档
正文完
隐私政策
留言板
金色传说
kubernetes
terraform
云原生
helm
代码编程
Java
Python
Shell
DevOps
Ansible
Gitlab
Jenkins
运维
老司机
Linux 杂锦
Nginx
数据库
elasticsearch
监控
上帝视角
DJI FPV
DJI mini 3 pro
关于本站