1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
# |
3
|
|
|
# This file is part of Glances. |
4
|
|
|
# |
5
|
|
|
# SPDX-FileCopyrightText: 2023 Nicolas Hennion <[email protected]> |
6
|
|
|
# |
7
|
|
|
# SPDX-License-Identifier: LGPL-3.0-only |
8
|
|
|
# |
9
|
|
|
|
10
|
|
|
"""Alert plugin.""" |
11
|
|
|
|
12
|
|
|
from datetime import datetime |
13
|
|
|
from time import tzname |
14
|
|
|
import pytz |
15
|
|
|
|
16
|
|
|
from glances.logger import logger |
17
|
|
|
from glances.events import glances_events |
18
|
|
|
from glances.thresholds import glances_thresholds |
19
|
|
|
|
20
|
|
|
# from glances.logger import logger |
21
|
|
|
from glances.plugins.plugin.model import GlancesPluginModel |
22
|
|
|
|
23
|
|
|
# Static decision tree for the global alert message |
24
|
|
|
# - msg: Message to be displayed (result of the decision tree) |
25
|
|
|
# - thresholds: a list of stats to take into account |
26
|
|
|
# - thresholds_min: minimal value of the thresholds sum |
27
|
|
|
# - 0: OK |
28
|
|
|
# - 1: CAREFUL |
29
|
|
|
# - 2: WARNING |
30
|
|
|
# - 3: CRITICAL |
31
|
|
|
# Each entry pairs a human-readable alert message with the list of stats
# ('thresholds') it watches; global_message() selects the entry whose summed
# threshold values are both the highest of the tree and >= 'thresholds_min'.
# The first entry (weight always 0) is the neutral fallback message.
tree = [
    {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
    {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
    {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
    {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
    {
        'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
        'thresholds': ['cpu_steal'],
        'thresholds_min': 2,
    },
    {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
    {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
    {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
    {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
]
46
|
|
|
|
47
|
|
|
# TODO: change the algo to use the following decision tree |
48
|
|
|
# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
49
|
|
|
# _yes means threshold >= 2 |
50
|
|
|
# _no means threshold < 2 |
51
|
|
|
# With threshold: |
52
|
|
|
# - 0: OK |
53
|
|
|
# - 1: CAREFUL |
54
|
|
|
# - 2: WARNING |
55
|
|
|
# - 3: CRITICAL |
56
|
|
|
# Decision tree (not used yet — see the TODO above): each non-leaf key is a
# stat name whose '_yes' branch is followed when its threshold is >= 2
# (WARNING/CRITICAL) and whose '_no' branch otherwise; leaves carry '_msg'.
tree_new = {
    'cpu_iowait': {
        '_yes': {
            'memswap': {
                '_yes': {
                    'mem': {
                        '_yes': {
                            # Once you've identified the offenders, the resolution will again
                            # depend on whether their memory usage seems business-as-usual or not.
                            # For example, a memory leak can be satisfactorily addressed by a one-time
                            # or periodic restart of the process.
                            # - if memory usage seems anomalous: kill the offending processes.
                            # - if memory usage seems business-as-usual: add RAM to the server,
                            # or split high-memory using services to other servers.
                            '_msg': "Memory issue"
                        },
                        '_no': {
                            # High swap but normal memory usage — cause unclear.
                            '_msg': "Swap issue"
                        },
                    }
                },
                '_no': {
                    # Low swap means you have a "real" IO wait problem. The next step is to see what's hogging your IO.
                    # iotop is an awesome tool for identifying io offenders. Two things to note:
                    # unless you've already installed iotop, it's probably not already on your system.
                    # Recommendation: install it before you need it - - it's no fun trying to install a troubleshooting
                    # tool on an overloaded machine (iotop requires a Linux kernel of 2.6.2 or above)
                    '_msg': "I/O issue"
                },
            }
        },
        '_no': {
            'cpu_total': {
                '_yes': {
                    'cpu_user': {
                        '_yes': {
                            # We expect the user-time percentage to be high.
                            # There's most likely a program or service you've configured on your server that's
                            # hogging CPU.
                            # Checking the % user time just confirms this. When you see that the % user-time is high,
                            # it's time to see what executable is monopolizing the CPU.
                            # Once you've confirmed that the % user-time is high, check the process list (also provided
                            # by top).
                            # By default, top sorts the process list by % CPU, so you can just look at the top process
                            # or processes.
                            # If there's a single process hogging the CPU in a way that seems abnormal, it's an
                            # anomalous situation that a service restart can fix.
                            # If there are multiple processes taking up CPU resources, or if there's one process that
                            # takes lots of resources while otherwise functioning normally, then your setup
                            # may just be underpowered. You'll need to upgrade your server (add more cores),
                            # or split services out onto other boxes. In either case, you have a resolution:
                            # - if situation seems anomalous: kill the offending processes.
                            # - if situation seems typical given history: upgrade server or add more servers.
                            '_msg': "CPU issue with user process(es)"
                        },
                        '_no': {
                            'cpu_steal': {
                                '_yes': {
                                    '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
                                },
                                '_no': {'_msg': "CPU issue with system process(es)"},
                            }
                        },
                    }
                },
                '_no': {
                    # NOTE(review): unlike every other branch, this one holds
                    # '_yes'/'_no' directly with no stat key above them —
                    # presumably a memory/swap check is missing here; confirm
                    # before wiring this tree into global_message().
                    '_yes': {
                        # ???
                        '_msg': "Memory issue"
                    },
                    '_no': {
                        # Your slowness isn't due to CPU or IO problems, so it's likely an application-specific issue.
                        # It's also possible that the slowness is being caused by another server in your cluster, or
                        # by an external service you rely on.
                        # Start by checking important applications for uncharacteristic slowness (the DB is a good place
                        # to start), think through which parts of your infrastructure could be slowed down externally.
                        # For example, do you use an externally hosted email service that could slow down critical
                        # parts of your application?
                        # If you suspect another server in your cluster, strace and lsof can provide information on
                        # what the process is doing or waiting on. Strace will show you which file descriptors are
                        # being read or written to (or being attempted to be read from) and lsof can give you a
                        # mapping of those file descriptors to network connections.
                        '_msg': "External issue"
                    },
                },
            }
        },
    }
}
149
|
|
|
|
150
|
|
|
|
151
|
|
|
def global_message():
    """Parse the decision tree and return the global alert message.

    Returns the 'msg' of the tree entry whose weight — the sum of the
    current threshold values of the stats it watches — is the highest AND
    at least that entry's own 'thresholds_min'. Otherwise returns the
    neutral message of tree[0].
    """
    current_thresholds = glances_thresholds.get()

    def _weight(item):
        # Sum of the current threshold values for the stats this entry
        # watches; stats with no current threshold simply contribute 0.
        return sum(current_thresholds[t].value() for t in item['thresholds'] if t in current_thresholds)

    # Unlike the previous implementation, do not cache the weight inside the
    # (shared, module-level) tree items: calling this function must not
    # mutate global state.
    themax = max(tree, key=_weight)
    if _weight(themax) >= themax['thresholds_min']:
        return themax['msg']
    return tree[0]['msg']
166
|
|
|
|
167
|
|
|
|
168
|
|
|
class PluginModel(GlancesPluginModel):
    """Glances alert plugin.

    Only for display.
    """

    def __init__(self, args=None, config=None):
        """Init the plugin."""
        super().__init__(args=args, config=config, stats_init_value=[])

        # We want to display the stat in the curse interface
        self.display_curse = True

        # Set the message position
        self.align = 'bottom'

        # Set the maximum number of events to display
        if config is not None and (config.has_section('alert') or config.has_section('alerts')):
            glances_events.set_max_events(config.get_int_value('alert', 'max_events', default=10))

    def update(self):
        """Nothing to do here. Just return the global glances_log."""
        # Set the stats to the glances_events
        self.stats = glances_events.get()
        # Define the global message thanks to the current thresholds
        # and the decision tree
        # !!! Call directly in the msg_curse function
        # global_message()

    def msg_curse(self, args=None, max_width=None):
        """Return the dict to display in the curse interface.

        Each alert in self.stats is a sequence:
        [0] start timestamp, [1] end timestamp (<= 0 while ongoing),
        [2] state/decoration, [3] stat name, [4] max, [5] mean, [6] min,
        [9] top processes — as read by the indexing below.
        """
        # Init the return message
        ret = []

        # Only process if display plugin enable...
        if not self.stats or self.is_disabled():
            return ret

        # Header: global message computed from the threshold decision tree
        ret.append(self.curse_add_line(global_message(), "TITLE"))

        # Loop over alerts
        for alert in self.stats:
            # New line
            ret.append(self.curse_new_line())
            # Start time, rendered in the local timezone.
            # tzname[0] is an abbreviation (e.g. 'CEST') that pytz may not
            # know: fall back to UTC instead of crashing the UI.
            try:
                tz = pytz.timezone(tzname[0] if tzname[0] else 'UTC')
            except pytz.UnknownTimeZoneError:
                tz = pytz.utc
            ret.append(self.curse_add_line(str(datetime.fromtimestamp(alert[0], tz=tz))))
            # Duration
            if alert[1] > 0:
                # If finished display duration
                msg = ' ({})'.format(datetime.fromtimestamp(alert[1]) - datetime.fromtimestamp(alert[0]))
            else:
                msg = ' (ongoing)'
            ret.append(self.curse_add_line(msg))
            ret.append(self.curse_add_line(" - "))
            # Infos
            if alert[1] > 0:
                # If finished do not display status
                msg = '{} on {}'.format(alert[2], alert[3])
                ret.append(self.curse_add_line(msg))
            else:
                msg = str(alert[3])
                ret.append(self.curse_add_line(msg, decoration=alert[2]))
            # Min / Mean / Max: collapse to the mean alone when min ~= max
            if self.approx_equal(alert[6], alert[4], tolerance=0.1):
                msg = ' ({:.1f})'.format(alert[5])
            else:
                msg = ' (Min:{:.1f} Mean:{:.1f} Max:{:.1f})'.format(alert[6], alert[5], alert[4])
            ret.append(self.curse_add_line(msg))
            # Top processes
            top_process = ', '.join([p['name'] for p in alert[9]])
            if top_process != '':
                ret.append(self.curse_add_line(': {}'.format(top_process)))

        return ret

    def approx_equal(self, a, b, tolerance=0.0):
        """Compare a with b using the relative tolerance (if numerical).

        Non-numerical operands fall back to strict equality.
        """
        # The previous test, str(int(x)).isdigit(), rejected negative numbers
        # (so the tolerance was silently skipped for them) and raised
        # ValueError on non-numeric input; isinstance is correct and safe.
        if isinstance(a, (int, float)) and isinstance(b, (int, float)):
            return abs(a - b) <= max(abs(a), abs(b)) * tolerance
        return a == b
255
|
|
|
|