Test Failed
Push — master ( ee826a...d9056e )
by Nicolas
03:09
created

glances.events_list.build_global_message()   A

Complexity

Conditions 4

Size

Total Lines 15
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 8
nop 0
dl 0
loc 15
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
#
3
# This file is part of Glances.
4
#
5
# SPDX-FileCopyrightText: 2022 Nicolas Hennion <[email protected]>
6
#
7
# SPDX-License-Identifier: LGPL-3.0-only
8
#
9
10
"""Manage Glances events list (previously Glances logs in Glances < 3.1)."""
11
12
import time
13
from datetime import datetime
14
from pydantic import RootModel
15
16
from glances.processes import glances_processes
17
from glances.thresholds import glances_thresholds
18
from glances.event import GlancesEvent
19
20
# Static decision tree for the global alert message
21
# - msg: Message to be displayed (result of the decision tree)
22
# - thresholds: a list of stats to take into account
23
# - thresholds_min: minimal value of the thresholds sum
24
# -                 0: OK
25
# -                 1: CAREFUL
26
# -                 2: WARNING
27
# -                 3: CRITICAL
28
# Rows of the static decision tree, written as (msg, thresholds, thresholds_min).
_TREE_ENTRIES = (
    ('EVENTS history', [], 0),
    ('High CPU user mode', ['cpu_user'], 2),
    ('High CPU kernel usage', ['cpu_system'], 2),
    ('High CPU I/O waiting', ['cpu_iowait'], 2),
    ('Large CPU stolen time. System running the hypervisor is too busy.', ['cpu_steal'], 2),
    ('High CPU niced value', ['cpu_niced'], 2),
    ('System overloaded in the last 5 minutes', ['load'], 2),
    ('High swap (paging) usage', ['memswap'], 2),
    ('High memory consumption', ['mem'], 2),
)

# Expand every row into the dict form read by build_global_message().
# The first entry ('EVENTS history') is the default message.
tree = [
    {'msg': msg, 'thresholds': thresholds, 'thresholds_min': thresholds_min}
    for msg, thresholds, thresholds_min in _TREE_ENTRIES
]
43
44
# TODO: change the algo to use the following decision tree
45
# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
46
# _yes means threshold >= 2
47
# _no  means threshold < 2
48
# With threshold:
49
# - 0: OK
50
# - 1: CAREFUL
51
# - 2: WARNING
52
# - 3: CRITICAL
53
# Sub-tree followed when cpu_iowait is high (threshold >= 2).
_IOWAIT_HIGH = {
    'memswap': {
        '_yes': {
            'mem': {
                # Once the offenders are identified, the resolution depends on whether their
                # memory usage looks business-as-usual or not. A memory leak, for example, can be
                # addressed by a one-time or periodic restart of the process.
                # - if memory usage seems anomalous: kill the offending processes.
                # - if memory usage seems business-as-usual: add RAM to the server,
                #   or split high-memory services out to other servers.
                '_yes': {'_msg': "Memory issue"},
                # ??? (open question left by the original author)
                '_no': {'_msg': "Swap issue"},
            }
        },
        # Low swap means a "real" I/O wait problem. The next step is to find what is hogging the I/O.
        # iotop is a good tool for identifying I/O offenders, but it is probably not installed by
        # default. Recommendation: install it before you need it; installing a troubleshooting tool
        # on an overloaded machine is no fun (iotop requires a Linux of 2.62 or above).
        '_no': {'_msg': "I/O issue"},
    }
}

# Sub-tree followed when cpu_iowait is low (threshold < 2).
_IOWAIT_LOW = {
    'cpu_total': {
        '_yes': {
            'cpu_user': {
                # The user-time percentage is expected to be high: most likely a program or service
                # configured on the server is hogging the CPU, and checking the % user time confirms it.
                # Once confirmed, check the process list (also provided by top). By default, top sorts
                # the process list by % CPU, so just look at the top process or processes.
                # If a single process is hogging the CPU in a way that seems abnormal, it is an
                # anomalous situation that a service restart can fix. If multiple processes take up
                # CPU resources, or if one process takes lots of resources while otherwise functioning
                # normally, then the setup may just be underpowered: upgrade the server (add more
                # cores) or split services out onto other boxes.
                # - if situation seems anomalous: kill the offending processes.
                # - if situation seems typical given history: upgrade server or add more servers.
                '_yes': {'_msg': "CPU issue with user process(es)"},
                '_no': {
                    'cpu_steal': {
                        '_yes': {
                            '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
                        },
                        '_no': {'_msg': "CPU issue with system process(es)"},
                    }
                },
            }
        },
        # NOTE(review): unlike every other decision node, this one has no stat name above its
        # '_yes'/'_no' pair; confirm which stat is meant before the algorithm starts using this tree.
        '_no': {
            # ??? (open question left by the original author)
            '_yes': {'_msg': "Memory issue"},
            # The slowness is not due to CPU or I/O problems, so it is likely an application-specific
            # issue. It may also be caused by another server in the cluster or by an external service.
            # Start by checking important applications for uncharacteristic slowness (the DB is a good
            # place to start), then think through which parts of the infrastructure could be slowed
            # down externally (for example, an externally hosted email service that slows down
            # critical parts of the application).
            # If another server in the cluster is suspected, strace and lsof can show what the process
            # is doing or waiting on: strace shows which file descriptors are being read or written
            # (or attempted), and lsof maps those file descriptors to network connections.
            '_no': {'_msg': "External issue"},
        },
    }
}

tree_new = {'cpu_iowait': {'_yes': _IOWAIT_HIGH, '_no': _IOWAIT_LOW}}
146
147
148
def build_global_message():
    """Walk the decision tree and return the global alert message.

    Every entry of the tree receives a weight (stored under its 'weight' key)
    equal to the sum of the current threshold values of the stats it lists.
    The message of the heaviest entry is returned when its weight reaches the
    entry's 'thresholds_min'; otherwise the message of the first entry is returned.
    """
    thresholds_now = glances_thresholds.get()

    # Weigh each entry, ignoring the stats that have no current threshold
    for entry in tree:
        weight = 0
        for stat in entry['thresholds']:
            if stat in thresholds_now:
                weight += thresholds_now[stat].value()
        entry['weight'] = weight

    heaviest = max(tree, key=lambda entry: entry['weight'])
    if heaviest['weight'] < heaviest['thresholds_min']:
        # The heaviest entry is below its minimal threshold: use the default message
        return tree[0]['msg']
    return heaviest['msg']
163
164
165
class GlancesEventsList:
    """Manage the events raised inside the Glances software.

    GlancesEventsList holds a list of GlancesEvent (defined in the event.py file).
    New events are inserted at the beginning of the list.
    """

    def __init__(self, max_events=10, min_duration=6, min_interval=6):
        """Init the events class.

        max_events: maximum size of the events list
        min_duration: events duration should be > min_duration to be taken into account (in seconds)
        min_interval: minimal interval between same kind of alert (in seconds)
        """
        # Maximum size of the events list
        self.set_max_events(max_events)

        # Minimal event duration time (in seconds)
        self.set_min_duration(min_duration)

        # Minimal interval between same kind of alert (in seconds)
        self.set_min_interval(min_interval)

        # Init the events list
        self.events_list = []

    def set_max_events(self, max_events):
        """Set the maximum size of the events list."""
        self.max_events = max_events

    def set_min_duration(self, min_duration):
        """Set the minimal event duration time (in seconds)."""
        self.min_duration = min_duration

    def set_min_interval(self, min_interval):
        """Set the minimum interval between same kind of alert (in seconds)."""
        self.min_interval = min_interval

    def get(self):
        """Return the RAW events list (one dumped model per event)."""
        return [RootModel[GlancesEvent](e).model_dump() for e in self.events_list]

    def len(self):
        """Return the number of events in the events list."""
        return len(self.events_list)

    def __event_exist(self, event_time, event_type):
        """Return the position of the first matching event in the events list.

        An event matches if:
        - its type is event_type
        - and it is ongoing or it ended less than min_interval seconds before event_time

        Return -1 if no event matches.
        """
        for index, event in enumerate(self.events_list):
            is_recent = event.is_ongoing() or (event_time - event.end < self.min_interval)
            if is_recent and event.type == event_type:
                return index
        return -1

    def get_event_sort_key(self, event_type):
        """Return the process sort key matching the alert type."""
        if event_type.startswith("MEM"):
            # Sort TOP process by memory_percent
            return 'memory_percent'
        if event_type.startswith("CPU_IOWAIT"):
            # Sort TOP process by io_counters (only for Linux OS)
            return 'io_counters'
        # Default sort is...
        return 'cpu_percent'

    def set_process_sort(self, event_type):
        """Define the process auto sort key from the alert type."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key(self.get_event_sort_key(event_type))

    def reset_process_sort(self):
        """Reset the process auto sort key."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key('auto')

    def add(self, event_state, event_type, event_value, proc_list=None, proc_desc=""):
        """Add a new item to the events list or update the matching one.

        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU|LOAD|MEM|..."
        event_value = value
        proc_list = list of processes (default: the current Glances processes list)
        proc_desc = processes description

        If 'event' is a 'new one', add it at the beginning of the list.
        If 'event' is not a 'new one', update the list.
        When an event is closed with a duration < min_duration, it is removed from the list.

        Return the number of events in the list.
        """
        # Time is stored in Epoch format (local time, one second resolution)
        event_time = time.mktime(datetime.now().timetuple())
        global_message = build_global_message()
        proc_list = proc_list or glances_processes.get_list()

        # Add or update the event
        event_index = self.__event_exist(event_time, event_type)
        if event_index < 0:
            # Event did not exist, add it
            self._create_event(event_time, event_state, event_type, event_value, proc_desc, global_message)
        else:
            # Event exist, update it
            self._update_event(
                event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc, global_message
            )

        return self.len()

    def _create_event(self, event_time, event_state, event_type, event_value, proc_desc, global_message):
        """Add a new item at the beginning of the events list.

        Item is added only if the criticality (event_state) is WARNING or CRITICAL.
        """
        if event_state not in ('WARNING', 'CRITICAL'):
            return

        # Define the automatic process sort key
        self.set_process_sort(event_type)

        # Create the new event
        # Time is stored in Epoch format
        # Epoch -> DMYHMS = datetime.fromtimestamp(epoch)
        event = GlancesEvent(
            begin=event_time,
            state=event_state,
            type=event_type,
            min=event_value,
            max=event_value,
            sum=event_value,
            count=1,
            avg=event_value,
            top=[],
            desc=proc_desc,
            sort=glances_processes.sort_key,
            global_msg=global_message,
        )

        # Add the event to the list
        self.events_list.insert(0, event)

        # Limit the list to 'max_events' items (drop the last one)
        if self.len() > self.max_events:
            self.events_list.pop()

    def _update_event(
        self, event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc, global_message
    ):
        """Update the event stored at event_index in the list."""
        event = self.events_list[event_index]

        if event_state in ('OK', 'CAREFUL'):
            if event.is_ongoing():
                # Close the event
                self._close_event(event_time, event_index)
                return
            if event.is_finished():
                # Event is already closed, do nothing
                return

        # event_state == "WARNING" or event_state == "CRITICAL"
        # Set process sort key
        self.set_process_sort(event_type)

        # Update an ongoing event
        event.update(
            state=event_state,
            value=event_value,
            sort_key=self.get_event_sort_key(event_type),
            proc_list=proc_list,
            proc_desc=proc_desc,
            global_msg=global_message,
        )

    def _close_event(self, event_time, event_index):
        """Close the event stored at event_index in the list.

        The event is dropped from the list if it lasted less than min_duration seconds.
        """
        # Reset the automatic process sort key
        self.reset_process_sort()

        # Set the end of the event
        if event_time - self.events_list[event_index].begin >= self.min_duration:
            # If event is >= min_duration seconds
            self.events_list[event_index].end = event_time
        else:
            # If event < min_duration seconds, ignore it.
            # Delete by position: list.remove() would drop the first *equal* event instead.
            del self.events_list[event_index]

    def clean(self, critical=False):
        """Clean the events list by deleting finished items.

        By default, only delete WARNING message.
        If critical = True, also delete CRITICAL message.

        Return the number of events remaining in the list.
        """
        # Keep (in the same order) the ongoing events and, unless asked otherwise, the CRITICAL ones
        self.events_list = [
            event
            for event in self.events_list
            if event.end < 0 or (not critical and event.state.startswith("CRITICAL"))
        ]
        return self.len()
366
367
368
# Module-level events list instance, created with the constructor's default settings
glances_events = GlancesEventsList()
369