# -*- coding: utf-8 -*-
#
# This file is part of Glances.
#
# SPDX-FileCopyrightText: 2022 Nicolas Hennion <[email protected]>
#
# SPDX-License-Identifier: LGPL-3.0-only
#

"""Manage Glances events list (previously Glances logs in Glances < 3.1)."""

import time
from datetime import datetime

from pydantic import RootModel

from glances.processes import glances_processes
from glances.thresholds import glances_thresholds
from glances.event import GlancesEvent

# Static decision tree for the global alert message
# - msg: Message to be displayed (result of the decision tree)
# - thresholds: a list of stats to take into account
# - thresholds_min: minimal value of the thresholds sum
#   - 0: OK
#   - 1: CAREFUL
#   - 2: WARNING
#   - 3: CRITICAL
tree = [
    {'msg': 'EVENTS history', 'thresholds': [], 'thresholds_min': 0},
    {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
    {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
    {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
    {
        'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
        'thresholds': ['cpu_steal'],
        'thresholds_min': 2,
    },
    {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
    {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
    {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
    {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
]

# TODO: change the algo to use the following decision tree
# Source: inspired by https://scoutapm.com/blog/slow_server_flow_chart
# _yes means threshold >= 2
# _no means threshold < 2
# With threshold:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree_new = {
    'cpu_iowait': {
        '_yes': {
            'memswap': {
                '_yes': {
                    'mem': {
                        '_yes': {
                            # Once you've identified the offenders, the resolution will again
                            # depend on whether their memory usage seems business-as-usual or not.
                            # For example, a memory leak can be satisfactorily addressed by a one-time
                            # or periodic restart of the process.
                            # - if memory usage seems anomalous: kill the offending processes.
                            # - if memory usage seems business-as-usual: add RAM to the server,
                            #   or split high-memory services out to other servers.
                            '_msg': "Memory issue"
                        },
                        '_no': {
                            # ???
                            '_msg': "Swap issue"
                        },
                    }
                },
                '_no': {
                    # Low swap means you have a "real" I/O wait problem. The next step is to see
                    # what's hogging your I/O. iotop is an awesome tool for identifying I/O
                    # offenders. Two things to note: unless you've already installed iotop, it's
                    # probably not already on your system. Recommendation: install it before you
                    # need it -- it's no fun trying to install a troubleshooting tool on an
                    # overloaded machine (iotop requires a Linux kernel 2.6.20 or above).
                    '_msg': "I/O issue"
                },
            }
        },
        '_no': {
            'cpu_total': {
                '_yes': {
                    'cpu_user': {
                        '_yes': {
                            # We expect the user-time percentage to be high.
                            # There's most likely a program or service you've configured on your
                            # server that's hogging CPU. Checking the % user-time just confirms
                            # this. When you see that the % user-time is high, it's time to see
                            # what executable is monopolizing the CPU.
                            # Once you've confirmed that the % user-time is high, check the
                            # process list (also provided by top). By default, top sorts the
                            # process list by % CPU, so you can just look at the top process or
                            # processes.
                            # If there's a single process hogging the CPU in a way that seems
                            # abnormal, it's an anomalous situation that a service restart can
                            # fix. If there are multiple processes taking up CPU resources, or if
                            # there's one process that takes lots of resources while otherwise
                            # functioning normally, then your setup may just be underpowered.
                            # You'll need to upgrade your server (add more cores), or split
                            # services out onto other boxes. In either case, you have a
                            # resolution:
                            # - if the situation seems anomalous: kill the offending processes.
                            # - if the situation seems typical given history: upgrade the server
                            #   or add more servers.
                            '_msg': "CPU issue with user process(es)"
                        },
                        '_no': {
                            'cpu_steal': {
                                '_yes': {
                                    '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
                                },
                                '_no': {'_msg': "CPU issue with system process(es)"},
                            }
                        },
                    }
                },
                '_no': {
                    '_yes': {
                        # ???
                        '_msg': "Memory issue"
                    },
                    '_no': {
                        # Your slowness isn't due to CPU or I/O problems, so it's likely an
                        # application-specific issue. It's also possible that the slowness is
                        # being caused by another server in your cluster, or by an external
                        # service you rely on.
                        # Start by checking important applications for uncharacteristic slowness
                        # (the DB is a good place to start), then think through which parts of
                        # your infrastructure could be slowed down externally. For example, do
                        # you use an externally hosted email service that could slow down
                        # critical parts of your application?
                        # If you suspect another server in your cluster, strace and lsof can
                        # provide information on what the process is doing or waiting on. strace
                        # will show you which file descriptors are being read or written to (or
                        # being attempted to be read from) and lsof can give you a mapping of
                        # those file descriptors to network connections.
                        '_msg': "External issue"
                    },
                },
            }
        },
    }
}
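
# Hypothetical traversal sketch for tree_new (not wired in; kept as a comment
# until the TODO above is implemented). It assumes every inner node is a
# {stat: {'_yes': ..., '_no': ...}} dict (note that the '_no' branch of
# 'cpu_total' above does not follow this shape yet) and that thresholds is a
# plain dict of int values, following '_yes' when the stat's threshold is >= 2:
#
#     def walk_tree_new(node, thresholds):
#         if '_msg' in node:
#             return node['_msg']
#         stat = next(k for k in node if not k.startswith('_'))
#         branch = '_yes' if thresholds.get(stat, 0) >= 2 else '_no'
#         return walk_tree_new(node[stat][branch], thresholds)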


def build_global_message():
    """Parse the decision tree and return the message matching the current threshold values."""
    # Compute the weight of each item in the tree
    current_thresholds = glances_thresholds.get()
    for i in tree:
        i['weight'] = sum([current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds])
    themax = max(tree, key=lambda d: d['weight'])
    if themax['weight'] >= themax['thresholds_min']:
        # The weight reaches the minimal threshold value: return the dedicated message
        return themax['msg']
    else:
        # Otherwise, fall back to the default message
        return tree[0]['msg']
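
# Illustrative walk-through (assumed threshold values): if cpu_user is at
# WARNING (value 2) and every other stat is OK (0), the 'High CPU user mode'
# entry gets weight 2, which reaches its thresholds_min of 2, so
# build_global_message() returns 'High CPU user mode'. If no entry reaches its
# minimum, the default tree[0]['msg'] ('EVENTS history') is returned.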


class GlancesEventsList(object):
    """This class manages events inside the Glances software.

    GlancesEventsList is a list of GlancesEvent.
    GlancesEvent is defined in the event.py file.
    """

    def __init__(self, max_events=10, min_duration=6, min_interval=6):
        """Init the events class.

        max_events: maximum size of the events list
        min_duration: events should last >= min_duration to be taken into account (in seconds)
        min_interval: minimal interval between same kind of alert (in seconds)
        """
        # Maximum size of the events list
        self.set_max_events(max_events)

        # Minimal event duration time (in seconds)
        self.set_min_duration(min_duration)

        # Minimal interval between same kind of alert (in seconds)
        self.set_min_interval(min_interval)

        # Init the events list
        self.events_list = []

    def set_max_events(self, max_events):
        """Set the maximum size of the events list."""
        self.max_events = max_events

    def set_min_duration(self, min_duration):
        """Set the minimal event duration time (in seconds)."""
        self.min_duration = min_duration

    def set_min_interval(self, min_interval):
        """Set the minimum interval between same kind of alert (in seconds)."""
        self.min_interval = min_interval

    def get(self):
        """Return the RAW events list."""
        return [RootModel[GlancesEvent](e).model_dump() for e in self.events_list]

    def len(self):
        """Return the number of events in the list."""
        return len(self.events_list)

    def __event_exist(self, event_time, event_type):
        """Return the position in the events list of the first event for which:
        - the type matches event_type
        - and the event is ongoing (end < 0), or ended less than min_interval seconds before event_time.
        Return -1 if no such event is found.
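
        Example (assumed timing): with min_interval=6, a MEM event that ended
        4 seconds ago matches a new MEM alert, so the existing event is reused
        (debounced) instead of a new one being created.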
        """
        for i in range(self.len()):
            if (
                self.events_list[i].is_ongoing() or (event_time - self.events_list[i].end < self.min_interval)
            ) and self.events_list[i].type == event_type:
                return i
        return -1

    def get_event_sort_key(self, event_type):
        """Return the process sort key matching the given event type."""
        # Process sort depending on alert type
        if event_type.startswith("MEM"):
            # Sort TOP process by memory_percent
            ret = 'memory_percent'
        elif event_type.startswith("CPU_IOWAIT"):
            # Sort TOP process by io_counters (only for Linux OS)
            ret = 'io_counters'
        else:
            # Default sort is cpu_percent
            ret = 'cpu_percent'
        return ret

    def set_process_sort(self, event_type):
        """Define the process auto sort key from the alert type."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key(self.get_event_sort_key(event_type))

    def reset_process_sort(self):
        """Reset the process auto sort key."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key('auto')

    def add(self, event_state, event_type, event_value, proc_list=None, proc_desc=""):
        """Add a new item to the events list.

        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU|LOAD|MEM|..."
        event_value = value of the stat associated with the event
        proc_list = list of processes
        proc_desc = processes description
        The global alert message is computed internally (see build_global_message).

        If the event is a new one, add it at the beginning of the list.
        If the event is not a new one, update the matching entry in the list.
        When an event is closed, it is discarded if its duration is < min_duration.
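
        Example (illustrative values):
            glances_events.add('WARNING', 'MEM', 82.3, proc_desc='top memory processes')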
        """
        event_time = time.mktime(datetime.now().timetuple())
        global_message = build_global_message()
        proc_list = proc_list or glances_processes.get_list()

        # Add a new event or update an existing one
        event_index = self.__event_exist(event_time, event_type)
        if event_index < 0:
            # Event did not exist, add it
            self._create_event(event_time, event_state, event_type, event_value, proc_desc, global_message)
        else:
            # Event exists, update it
            self._update_event(
                event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc, global_message
            )

        return self.len()

    def _create_event(self, event_time, event_state, event_type, event_value, proc_desc, global_message):
        """Add a new item in the events list.

        Item is added only if the criticality (event_state) is WARNING or CRITICAL.
        """
        if event_state not in ('WARNING', 'CRITICAL'):
            return

        # Define the automatic process sort key
        self.set_process_sort(event_type)

        # Create the new event item
        # Time is stored in Epoch format
        # Epoch -> datetime: datetime.fromtimestamp(epoch)
        event = GlancesEvent(
            begin=event_time,
            state=event_state,
            type=event_type,
            min=event_value,
            max=event_value,
            sum=event_value,
            count=1,
            avg=event_value,
            top=[],
            desc=proc_desc,
            sort=glances_processes.sort_key,
            global_msg=global_message,
        )

        # Add the event to the list
        self.events_list.insert(0, event)

        # Limit the list to 'max_events' items
        if self.len() > self.max_events:
            self.events_list.pop()

    def _update_event(
        self, event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc, global_message
    ):
        """Update an event in the list."""
        if event_state in ('OK', 'CAREFUL') and self.events_list[event_index].is_ongoing():
            # Close the event
            self._close_event(event_time, event_index)
        elif event_state in ('OK', 'CAREFUL') and self.events_list[event_index].is_finished():
            # Event is already closed, do nothing
            pass
        else:  # event_state == "WARNING" or event_state == "CRITICAL"
            # Set the process sort key
            self.set_process_sort(event_type)

            # Update the ongoing event
            self.events_list[event_index].update(
                state=event_state,
                value=event_value,
                sort_key=self.get_event_sort_key(event_type),
                proc_list=proc_list,
                proc_desc=proc_desc,
                global_msg=global_message,
            )

    def _close_event(self, event_time, event_index):
        """Close an event in the list."""
        # Reset the automatic process sort key
        self.reset_process_sort()

        # Set the end of the event
        if event_time - self.events_list[event_index].begin >= self.min_duration:
            # Event lasted at least min_duration seconds: close it
            self.events_list[event_index].end = event_time
        else:
            # Event lasted less than min_duration seconds: drop it
            self.events_list.remove(self.events_list[event_index])

    def clean(self, critical=False):
        """Clean the events list by deleting finished items.

        By default, only finished WARNING events are deleted.
        If critical is True, finished CRITICAL events are also deleted.
        """
        # Create a new clean list
        clean_events_list = []
        while self.len() > 0:
            event = self.events_list.pop()
            # Keep ongoing events and, unless critical is True, CRITICAL ones
            if event.end < 0 or (not critical and event.state.startswith("CRITICAL")):
                clean_events_list.insert(0, event)
        # The list is now the clean one
        self.events_list = clean_events_list
        return self.len()


glances_events = GlancesEventsList()
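
# Usage sketch (illustrative; the module path and values are assumptions):
#     from glances.events_list import glances_events
#     glances_events.add('WARNING', 'MEM', 90.0, proc_desc='top memory processes')
#     glances_events.get()    # -> list of serialized GlancesEvent dicts
#     glances_events.clean()  # -> drop finished WARNING events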