# -*- coding: utf-8 -*-
#
# This file is part of Glances.
#
# SPDX-FileCopyrightText: 2023 Nicolas Hennion <[email protected]>
#
# SPDX-License-Identifier: LGPL-3.0-only
#

"""Alert plugin."""

from datetime import datetime
from time import tzname
import pytz

from glances.logger import logger
from glances.events import glances_events
from glances.thresholds import glances_thresholds

# from glances.logger import logger
from glances.plugins.plugin.model import GlancesPluginModel

# Static decision tree for the global alert message
# - msg: Message to be displayed (result of the decision tree)
# - thresholds: a list of stats to take into account
# - thresholds_min: minimal value of the thresholds sum
#   - 0: OK
#   - 1: CAREFUL
#   - 2: WARNING
#   - 3: CRITICAL
tree = [
    {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
    {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
    {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
    {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
    {
        'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
        'thresholds': ['cpu_steal'],
        'thresholds_min': 2,
    },
    {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
    {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
    {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
    {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
]
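# Illustrative example: if only the cpu_user threshold is currently at WARNING
# (value 2), the 'High CPU user mode' entry gets weight 2, which reaches its
# thresholds_min of 2, so global_message() below returns 'High CPU user mode'.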

# TODO: change the algo to use the following decision tree
# Source: inspired by https://scoutapm.com/blog/slow_server_flow_chart
# _yes means threshold >= 2
# _no means threshold < 2
# With threshold:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree_new = {
    'cpu_iowait': {
        '_yes': {
            'memswap': {
                '_yes': {
                    'mem': {
                        '_yes': {
                            # Once you've identified the offenders, the resolution will again
                            # depend on whether their memory usage seems business-as-usual or not.
                            # For example, a memory leak can be satisfactorily addressed by a one-time
                            # or periodic restart of the process.
                            # - if memory usage seems anomalous: kill the offending processes.
                            # - if memory usage seems business-as-usual: add RAM to the server,
                            #   or split high-memory services out to other servers.
                            '_msg': "Memory issue"
                        },
                        '_no': {
                            # ???
                            '_msg': "Swap issue"
                        },
                    }
                },
                '_no': {
                    # Low swap means you have a "real" I/O wait problem. The next step is to see
                    # what's hogging your I/O. iotop is an awesome tool for identifying I/O offenders.
                    # Two things to note: unless you've already installed iotop, it's probably not
                    # already on your system. Recommendation: install it before you need it - it's no
                    # fun trying to install a troubleshooting tool on an overloaded machine (iotop
                    # requires a Linux of 2.62 or above).
                    '_msg': "I/O issue"
                },
            }
        },
        '_no': {
            'cpu_total': {
                '_yes': {
                    'cpu_user': {
                        '_yes': {
                            # We expect the user-time percentage to be high.
                            # There's most likely a program or service you've configured on your
                            # server that's hogging CPU. Checking the % user time just confirms this.
                            # When you see that the % user time is high, it's time to see what
                            # executable is monopolizing the CPU.
                            # Once you've confirmed that the % user time is high, check the process
                            # list (also provided by top).
                            # By default, top sorts the process list by % CPU, so you can just look
                            # at the top process or processes.
                            # If there's a single process hogging the CPU in a way that seems
                            # abnormal, it's an anomalous situation that a service restart can fix.
                            # If there are multiple processes taking up CPU resources, or if there's
                            # one process that takes lots of resources while otherwise functioning
                            # normally, then your setup may just be underpowered. You'll need to
                            # upgrade your server (add more cores), or split services out onto other
                            # boxes. In either case, you have a resolution:
                            # - if the situation seems anomalous: kill the offending processes.
                            # - if the situation seems typical given history: upgrade the server or
                            #   add more servers.
                            '_msg': "CPU issue with user process(es)"
                        },
                        '_no': {
                            'cpu_steal': {
                                '_yes': {
                                    '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
                                },
                                '_no': {'_msg': "CPU issue with system process(es)"},
                            }
                        },
                    }
                },
                '_no': {
                    '_yes': {
                        # ???
                        '_msg': "Memory issue"
                    },
                    '_no': {
                        # Your slowness isn't due to CPU or I/O problems, so it's likely an
                        # application-specific issue. It's also possible that the slowness is being
                        # caused by another server in your cluster, or by an external service you
                        # rely on.
                        # Start by checking important applications for uncharacteristic slowness
                        # (the DB is a good place to start), and think through which parts of your
                        # infrastructure could be slowed down externally. For example, do you use an
                        # externally hosted email service that could slow down critical parts of
                        # your application?
                        # If you suspect another server in your cluster, strace and lsof can provide
                        # information on what the process is doing or waiting on. strace will show
                        # you which file descriptors are being read or written to (or attempted to
                        # be read from) and lsof can give you a mapping of those file descriptors to
                        # network connections.
                        '_msg': "External issue"
                    },
                },
            }
        },
    }
}
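

# Illustrative sketch (not wired into the plugin): one possible way to walk
# tree_new once the TODO above is implemented. It assumes every non-leaf node
# maps a single stat name to a dict with '_yes'/'_no' children ('_yes' taken
# when that threshold value is >= 2) and that every leaf holds '_msg'. Note
# that the '_no' branch under 'cpu_total' does not follow this shape yet (see
# the '???' comment above), so this helper is only a starting point. A call
# would look like _tree_new_message(tree_new, glances_thresholds.get()).
def _tree_new_message(node, thresholds):
    """Return the '_msg' of the leaf reached from node for the given thresholds."""
    if '_msg' in node:
        return node['_msg']
    # Non-leaf node: a single {stat_name: {'_yes': ..., '_no': ...}} entry
    stat, branches = next(iter(node.items()))
    value = thresholds[stat].value() if stat in thresholds else 0
    return _tree_new_message(branches['_yes'] if value >= 2 else branches['_no'], thresholds)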


def global_message():
    """Parse the decision tree and return the message.

    Note: the message corresponds to the current threshold values.
    """
    # Compute the weight for each item in the tree
    current_thresholds = glances_thresholds.get()
    for i in tree:
        i['weight'] = sum([current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds])
    themax = max(tree, key=lambda d: d['weight'])
    if themax['weight'] >= themax['thresholds_min']:
        # The weight is >= the minimal threshold value: return the message
        return themax['msg']
    else:
        return tree[0]['msg']


class PluginModel(GlancesPluginModel):
    """Glances alert plugin.

    Only for display.
    """

    def __init__(self, args=None, config=None):
        """Init the plugin."""
        super(PluginModel, self).__init__(args=args,
                                          config=config,
                                          stats_init_value=[])

        # We want to display the stat in the curse interface
        self.display_curse = True

        # Set the message position
        self.align = 'bottom'

        # Set the maximum number of events to display
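        # Illustrative glances.conf snippet for this limit (section and key as
        # read by the code below):
        #   [alert]
        #   max_events=10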
        if config is not None and (config.has_section('alert') or config.has_section('alerts')):
            glances_events.set_max_events(config.get_int_value('alert', 'max_events', default=10))

    def update(self):
        """Nothing to do here. Just grab the global glances_events list."""
        # Set the stats to the glances_events
        self.stats = glances_events.get()
        # Define the global message thanks to the current thresholds
        # and the decision tree
        # !!! global_message() is called directly in the msg_curse function
        # global_message()

    def msg_curse(self, args=None, max_width=None):
        """Return the dict to display in the curse interface."""
        # Init the return message
        ret = []

        # Only process if the plugin is enabled and there is something to display...
        if not self.stats or self.is_disabled():
            return ret

        # Build the string message
        # Header
        ret.append(self.curse_add_line(global_message(), "TITLE"))
        # Loop over alerts
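        # Each alert produced by glances_events is a list whose fields are used
        # below: alert[0] start timestamp, alert[1] end timestamp (not yet
        # positive while the alert is ongoing), alert[2] state (also used as the
        # curse decoration), alert[3] stat name, alert[4]/alert[5]/alert[6]
        # max/mean/min values, and alert[9] the top processes list.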
        for alert in self.stats:
            # New line
            ret.append(self.curse_new_line())
            # Start
            msg = str(datetime.fromtimestamp(alert[0],
                                             tz=pytz.timezone(tzname[0] if tzname[0] else 'UTC')))
            ret.append(self.curse_add_line(msg))
            # Duration
            if alert[1] > 0:
                # If finished, display the duration
                msg = ' ({})'.format(datetime.fromtimestamp(alert[1]) - datetime.fromtimestamp(alert[0]))
            else:
                msg = ' (ongoing)'
            ret.append(self.curse_add_line(msg))
            ret.append(self.curse_add_line(" - "))
            # Infos
            if alert[1] > 0:
                # If finished, display the status as plain text (no decoration)
                msg = '{} on {}'.format(alert[2], alert[3])
                ret.append(self.curse_add_line(msg))
            else:
                msg = str(alert[3])
                ret.append(self.curse_add_line(msg, decoration=alert[2]))
            # Min / Mean / Max
            if self.approx_equal(alert[6], alert[4], tolerance=0.1):
                msg = ' ({:.1f})'.format(alert[5])
            else:
                msg = ' (Min:{:.1f} Mean:{:.1f} Max:{:.1f})'.format(alert[6], alert[5], alert[4])
            ret.append(self.curse_add_line(msg))
            # Top processes
            top_process = ', '.join([p['name'] for p in alert[9]])
            if top_process != '':
                msg = ': {}'.format(top_process)
                ret.append(self.curse_add_line(msg))

        return ret

    def approx_equal(self, a, b, tolerance=0.0):
        """Compare a with b using the tolerance (if numerical)."""
        if str(int(a)).isdigit() and str(int(b)).isdigit():
            return abs(a - b) <= max(abs(a), abs(b)) * tolerance
        else:
            return a == b