Test Failed
Push — master ( ee826a...d9056e )
by Nicolas
03:09
created

GlancesEventsList._update_event()   A

Complexity

Conditions 5

Size

Total Lines 22
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 14
nop 9
dl 0
loc 22
rs 9.2333
c 0
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
# -*- coding: utf-8 -*-
2
#
3
# This file is part of Glances.
4
#
5
# SPDX-FileCopyrightText: 2022 Nicolas Hennion <[email protected]>
6
#
7
# SPDX-License-Identifier: LGPL-3.0-only
8
#
9
10
"""Manage Glances events list (previously Glances logs in Glances < 3.1)."""
11
12
import time
13
from datetime import datetime
14
from pydantic import RootModel
15
16
from glances.processes import glances_processes
17
from glances.thresholds import glances_thresholds
18
from glances.event import GlancesEvent
19
20
# Static decision tree for the global alert message
21
# - msg: Message to be displayed (result of the decision tree)
22
# - thresholds: a list of stats to take into account
23
# - thresholds_min: minimal value of the thresholds sum
24
# -                 0: OK
25
# -                 1: CAREFUL
26
# -                 2: WARNING
27
# -                 3: CRITICAL
28
# Each entry maps a set of monitored stats to the global message displayed
# when the summed threshold levels of those stats reach 'thresholds_min'.
# The first entry is the default message, used when no other entry qualifies.
tree = [
    {'msg': 'EVENTS history', 'thresholds': [], 'thresholds_min': 0},
    {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
    {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
    {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
    {
        'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
        'thresholds': ['cpu_steal'],
        'thresholds_min': 2,
    },
    {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
    {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
    {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
    {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
]
43
44
# TODO: change the algo to use the following decision tree
45
# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
46
# _yes means threshold >= 2
47
# _no  means threshold < 2
48
# With threshold:
49
# - 0: OK
50
# - 1: CAREFUL
51
# - 2: WARNING
52
# - 3: CRITICAL
53
# Candidate replacement for the flat decision list (not used by the code in this file).
# Each level is keyed by a stat name whose '_yes' / '_no' children are followed
# depending on the stat threshold (>= 2 or < 2); a leaf carries the '_msg' to display.
tree_new = {
    'cpu_iowait': {
        '_yes': {
            'memswap': {
                '_yes': {
                    'mem': {
                        '_yes': {
                            # Once you've identified the offenders, the resolution will again
                            # depend on whether their memory usage seems business-as-usual or not.
                            # For example, a memory leak can be satisfactorily addressed by a one-time
                            # or periodic restart of the process.
                            # - if memory usage seems anomalous: kill the offending processes.
                            # - if memory usage seems business-as-usual: add RAM to the server,
                            # or split high-memory using services to other servers.
                            '_msg': "Memory issue"
                        },
                        '_no': {
                            # ???
                            '_msg': "Swap issue"
                        },
                    }
                },
                '_no': {
                    # Low swap means you have a "real" IO wait problem. The next step is to see what's hogging your IO.
                    # iotop is an awesome tool for identifying io offenders. Two things to note:
                    # unless you've already installed iotop, it's probably not already on your system.
                    # Recommendation: install it before you need it - it's no fun trying to install a troubleshooting
                    # tool on an overloaded machine (iotop requires a Linux of 2.62 or above)
                    '_msg': "I/O issue"
                },
            }
        },
        '_no': {
            'cpu_total': {
                '_yes': {
                    'cpu_user': {
                        '_yes': {
                            # We expect the user-time percentage to be high.
                            # There's most likely a program or service you've configured on your server that's
                            # hogging CPU.
                            # Checking the % user time just confirms this. When you see that the % user-time is high,
                            # it's time to see what executable is monopolizing the CPU.
                            # Once you've confirmed that the % user-time is high, check the process list (also
                            # provided by top).
                            # By default, top sorts the process list by % CPU, so you can just look at the top process
                            # or processes.
                            # If there's a single process hogging the CPU in a way that seems abnormal, it's an
                            # anomalous situation that a service restart can fix. If there are multiple processes
                            # taking up CPU resources, or if there's one process that takes lots of resources while
                            # otherwise functioning normally, then your setup may just be underpowered. You'll need
                            # to upgrade your server (add more cores), or split services out onto other boxes.
                            # In either case, you have a resolution:
                            # - if situation seems anomalous: kill the offending processes.
                            # - if situation seems typical given history: upgrade server or add more servers.
                            '_msg': "CPU issue with user process(es)"
                        },
                        '_no': {
                            'cpu_steal': {
                                '_yes': {
                                    '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
                                },
                                '_no': {'_msg': "CPU issue with system process(es)"},
                            }
                        },
                    }
                },
                '_no': {
                    # NOTE(review): unlike every other level, this '_yes' / '_no' pair is not nested
                    # under a stat key, so it is unclear which stat selects the branch - confirm
                    # before using this tree.
                    '_yes': {
                        # ???
                        '_msg': "Memory issue"
                    },
                    '_no': {
                        # Your slowness isn't due to CPU or IO problems, so it's likely an application-specific issue.
                        # It's also possible that the slowness is being caused by another server in your cluster, or
                        # by an external service you rely on.
                        # Start by checking important applications for uncharacteristic slowness (the DB is a good
                        # place to start), think through which parts of your infrastructure could be slowed down
                        # externally.
                        # For example, do you use an externally hosted email service that could slow down critical
                        # parts of your application?
                        # If you suspect another server in your cluster, strace and lsof can provide information on
                        # what the process is doing or waiting on. Strace will show you which file descriptors are
                        # being read or written to (or being attempted to be read from) and lsof can give you a
                        # mapping of those file descriptors to network connections.
                        '_msg': "External issue"
                    },
                },
            }
        },
    }
}
146
147
148
def build_global_message():
    """Parse the decision tree and return the global alert message.

    The message returned is the one matching the current thresholds values.

    For every entry of the module-level ``tree``, a weight is computed as the
    sum of the current threshold values of the stats listed in the entry
    (stats without a current threshold are ignored). The weight is stored back
    in the entry under the 'weight' key.

    The entry with the highest weight is selected (the first one wins on a
    tie). Its message is returned if its weight reaches the entry's
    'thresholds_min'; otherwise the message of the first entry of the tree
    (the default one) is returned.
    """
    # Compute the weight for each item in the tree
    current_thresholds = glances_thresholds.get()
    for item in tree:
        item['weight'] = sum(
            current_thresholds[stat].value() for stat in item['thresholds'] if stat in current_thresholds
        )

    # Keep the item with the highest weight
    themax = max(tree, key=lambda d: d['weight'])

    # Only return its message if the weight reaches the minimal threshold value
    if themax['weight'] >= themax['thresholds_min']:
        return themax['msg']
    return tree[0]['msg']
163
164
165
class GlancesEventsList(object):
    """This class manages events inside the Glances software.

    GlancesEventsList is a list of GlancesEvent.
    GlancesEvent is defined in the event.py file

    The most recent event is stored at the beginning of the list.
    """

    def __init__(self, max_events=10, min_duration=6, min_interval=6):
        """Init the events class.

        max_events: maximum size of the events list
        min_duration: events duration should be > min_duration to be taken into account (in seconds)
        min_interval: minimal interval between same kind of alert (in seconds)
        """
        # Maximum size of the events list
        self.set_max_events(max_events)

        # Minimal event duration time (in seconds)
        self.set_min_duration(min_duration)

        # Minimal interval between same kind of alert (in seconds)
        self.set_min_interval(min_interval)

        # Init the events list
        self.events_list = []

    def set_max_events(self, max_events):
        """Set the maximum size of the events list."""
        self.max_events = max_events

    def set_min_duration(self, min_duration):
        """Set the minimal event duration time (in seconds)."""
        self.min_duration = min_duration

    def set_min_interval(self, min_interval):
        """Set the minimum interval between same kind of alert (in seconds)."""
        self.min_interval = min_interval

    def get(self):
        """Return the RAW events list (one dumped model per event)."""
        return [RootModel[GlancesEvent](e).model_dump() for e in self.events_list]

    def len(self):
        """Return the number of events in the events list."""
        return len(self.events_list)

    def __event_exist(self, event_time, event_type):
        """Return the position of the first matching event in the events list.

        An event matches if:
        - its type is event_type
        - and (it is ongoing or event_time - end < min_interval)

        Return -1 if the item is not found.
        """
        for index, event in enumerate(self.events_list):
            # The ongoing test comes first: 'end' is only compared for finished events
            if (event.is_ongoing() or (event_time - event.end < self.min_interval)) and event.type == event_type:
                return index
        return -1

    def get_event_sort_key(self, event_type):
        """Return the process sort key matching the given event type."""
        # Process sort depending on alert type
        if event_type.startswith("MEM"):
            # Sort TOP process by memory_percent
            return 'memory_percent'
        if event_type.startswith("CPU_IOWAIT"):
            # Sort TOP process by io_counters (only for Linux OS)
            return 'io_counters'
        # Default sort is...
        return 'cpu_percent'

    def set_process_sort(self, event_type):
        """Define the process auto sort key from the alert type."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key(self.get_event_sort_key(event_type))

    def reset_process_sort(self):
        """Reset the process auto sort key."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key('auto')

    def add(self, event_state, event_type, event_value, proc_list=None, proc_desc=""):
        """Add a new item to the events list or update the matching one.

        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU|LOAD|MEM|..."
        event_value = value
        proc_list = list of processes (the current processes list is used if empty or None)
        proc_desc = processes description

        If 'event' is a 'new one', add it at the beginning of the list.
        If 'event' is not a 'new one', update the list.
        When an event is closed with a duration < min_duration, it is removed from the list.

        Return the number of events in the list.
        """
        # Event time is the current local time, as an Epoch timestamp (second resolution)
        event_time = time.mktime(datetime.now().timetuple())
        global_message = build_global_message()
        proc_list = proc_list or glances_processes.get_list()

        # Add or update the event
        event_index = self.__event_exist(event_time, event_type)
        if event_index < 0:
            # Event did not exist, add it
            self._create_event(event_time, event_state, event_type, event_value, proc_desc, global_message)
        else:
            # Event exist, update it
            self._update_event(
                event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc, global_message
            )

        return self.len()

    def _create_event(self, event_time, event_state, event_type, event_value, proc_desc, global_message):
        """Add a new item at the beginning of the events list.

        Item is added only if the criticality (event_state) is WARNING or CRITICAL.
        """
        if event_state not in ('WARNING', 'CRITICAL'):
            return

        # Define the automatic process sort key
        self.set_process_sort(event_type)

        # Create the new event
        # Time is stored in Epoch format
        # Epoch -> DMYHMS = datetime.fromtimestamp(epoch)
        event = GlancesEvent(
            begin=event_time,
            state=event_state,
            type=event_type,
            min=event_value,
            max=event_value,
            sum=event_value,
            count=1,
            avg=event_value,
            top=[],
            desc=proc_desc,
            sort=glances_processes.sort_key,
            global_msg=global_message,
        )

        # Add the event to the list
        self.events_list.insert(0, event)

        # Limit the list to 'max_events' items (drop the oldest one)
        if self.len() > self.max_events:
            self.events_list.pop()

    def _update_event(
        self, event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc, global_message
    ):
        """Update the event stored at position event_index in the list.

        - state OK or CAREFUL on an ongoing event: the event is closed
        - state OK or CAREFUL on a finished event: nothing to do
        - otherwise: the event is updated with the new state / value / processes
        """
        event = self.events_list[event_index]
        if event_state in ('OK', 'CAREFUL') and event.is_ongoing():
            # Close the event
            self._close_event(event_time, event_index)
        elif event_state in ('OK', 'CAREFUL') and event.is_finished():
            # Event is already closed, do nothing
            pass
        else:  # event_state == "WARNING" or event_state == "CRITICAL"
            # Set process sort key
            self.set_process_sort(event_type)

            # Update the event
            event.update(
                state=event_state,
                value=event_value,
                sort_key=self.get_event_sort_key(event_type),
                proc_list=proc_list,
                proc_desc=proc_desc,
                global_msg=global_message,
            )

    def _close_event(self, event_time, event_index):
        """Close the event stored at position event_index in the list.

        The event end time is set if the event lasted at least min_duration seconds,
        else the event is removed from the list.
        """
        # Reset the automatic process sort key
        self.reset_process_sort()

        # Set the end of the event
        if event_time - self.events_list[event_index].begin >= self.min_duration:
            # If event is >= min_duration seconds
            self.events_list[event_index].end = event_time
        else:
            # If event < min_duration seconds, ignore it.
            # Delete by position: list.remove() compares by value and would drop
            # the first *equal* event, which may not be the one at event_index.
            del self.events_list[event_index]

    def clean(self, critical=False):
        """Clean the events list by deleting finished items.

        By default, only delete WARNING message.
        If critical = True, also delete CRITICAL message.

        Ongoing events (end < 0) are always kept.
        Return the number of events remaining in the list.
        """
        # The list is now the clean one (order is preserved)
        self.events_list = [
            event
            for event in self.events_list
            if event.end < 0 or (not critical and event.state.startswith("CRITICAL"))
        ]
        return self.len()
366
367
368
# Module-level events list instance, created with the default settings
glances_events = GlancesEventsList()
369