Test Failed
Push — develop ( 684d1d...272dc2 )
by Nicolas
02:09
created

glances.events.GlancesEvents._update_event()   B

Complexity

Conditions 6

Size

Total Lines 42
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 21
nop 9
dl 0
loc 42
rs 8.4426
c 0
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameters also often become inconsistent when you need more, or different, data.

There are several approaches to avoid long parameter lists:

1
# -*- coding: utf-8 -*-
2
#
3
# This file is part of Glances.
4
#
5
# SPDX-FileCopyrightText: 2022 Nicolas Hennion <[email protected]>
6
#
7
# SPDX-License-Identifier: LGPL-3.0-only
8
#
9
10
"""Manage Glances events (previously Glances logs in Glances < 3.1)."""
11
12
import time
13
from datetime import datetime
14
15
from glances.logger import logger
16
from glances.processes import glances_processes, sort_stats
17
from glances.thresholds import glances_thresholds
18
19
# Static decision tree for the global alert message
20
# - msg: Message to be displayed (result of the decision tree)
21
# - thresholds: a list of stats to take into account
22
# - thresholds_min: minimal value of the thresholds sum
23
# -                 0: OK
24
# -                 1: CAREFUL
25
# -                 2: WARNING
26
# -                 3: CRITICAL
27
# The first entry carries no thresholds and a minimum of 0: it is the
# fallback message. Every other entry watches a single threshold name and
# requires a summed level of at least 2 (WARNING).
tree = [
    dict(msg='No warning or critical alert detected', thresholds=[], thresholds_min=0),
    dict(msg='High CPU user mode', thresholds=['cpu_user'], thresholds_min=2),
    dict(msg='High CPU kernel usage', thresholds=['cpu_system'], thresholds_min=2),
    dict(msg='High CPU I/O waiting', thresholds=['cpu_iowait'], thresholds_min=2),
    dict(
        msg='Large CPU stolen time. System running the hypervisor is too busy.',
        thresholds=['cpu_steal'],
        thresholds_min=2,
    ),
    dict(msg='High CPU niced value', thresholds=['cpu_niced'], thresholds_min=2),
    dict(msg='System overloaded in the last 5 minutes', thresholds=['load'], thresholds_min=2),
    dict(msg='High swap (paging) usage', thresholds=['memswap'], thresholds_min=2),
    dict(msg='High memory consumption', thresholds=['mem'], thresholds_min=2),
]
42
43
# TODO: change the algo to use the following decision tree
44
# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
45
# _yes means threshold >= 2
46
# _no  means threshold < 2
47
# With threshold:
48
# - 0: OK
49
# - 1: CAREFUL
50
# - 2: WARNING
51
# - 3: CRITICAL
52
tree_new = {
    'cpu_iowait': {
        '_yes': {
            'memswap': {
                '_yes': {
                    'mem': {
                        '_yes': {
                            # Once the offending processes are identified, the resolution depends
                            # on whether their memory usage looks business-as-usual or not.
                            # A memory leak, for example, can be handled satisfactorily by a
                            # one-time or periodic restart of the process.
                            # - memory usage looks anomalous: kill the offending processes.
                            # - memory usage looks business-as-usual: add RAM to the server,
                            #   or split the high-memory services out to other servers.
                            '_msg': "Memory issue"
                        },
                        '_no': {
                            # ???
                            '_msg': "Swap issue"
                        },
                    }
                },
                '_no': {
                    # Low swap means there is a "real" I/O wait problem. The next step is to find
                    # out what is hogging the I/O. iotop is a good tool for identifying I/O offenders.
                    # Note that, unless it was installed beforehand, iotop is probably not on the
                    # system yet. Recommendation: install it before it is needed, since installing a
                    # troubleshooting tool on an overloaded machine is no fun
                    # (iotop requires a Linux of 2.62 or above).
                    '_msg': "I/O issue"
                },
            }
        },
        '_no': {
            'cpu_total': {
                '_yes': {
                    'cpu_user': {
                        '_yes': {
                            # The user-time percentage is expected to be high here: most likely a
                            # program or service configured on the server is hogging the CPU, and
                            # checking the % user time just confirms it.
                            # Once the high % user time is confirmed, check the process list (also
                            # provided by top). By default, top sorts the process list by % CPU, so
                            # looking at the top process or processes is enough.
                            # If a single process hogs the CPU in a way that seems abnormal, it is an
                            # anomalous situation that a service restart can fix.
                            # If multiple processes take up CPU resources, or if one process takes
                            # lots of resources while otherwise functioning normally, then the setup
                            # may just be underpowered: upgrade the server (add more cores) or split
                            # services out onto other boxes.
                            # In either case, there is a resolution:
                            # - situation seems anomalous: kill the offending processes.
                            # - situation seems typical given history: upgrade server or add more servers.
                            '_msg': "CPU issue with user process(es)"
                        },
                        '_no': {
                            'cpu_steal': {
                                '_yes': {
                                    '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
                                },
                                '_no': {'_msg': "CPU issue with system process(es)"},
                            }
                        },
                    }
                },
                # NOTE(review): unlike every other level of this tree, this branch has no stat
                # key between '_no' and the nested '_yes'/'_no' - confirm which stat is meant
                # before relying on it.
                '_no': {
                    '_yes': {
                        # ???
                        '_msg': "Memory issue"
                    },
                    '_no': {
                        # The slowness is not due to CPU or I/O problems, so it is likely an
                        # application-specific issue. It may also be caused by another server in the
                        # cluster, or by an external service the application relies on.
                        # Start by checking important applications for uncharacteristic slowness (the
                        # DB is a good place to start), then think through which parts of the
                        # infrastructure could be slowed down externally. For example, is an externally
                        # hosted email service used that could slow down critical parts of the
                        # application?
                        # If another server in the cluster is suspected, strace and lsof can provide
                        # information on what the process is doing or waiting on. strace shows which
                        # file descriptors are being read or written to (or attempted to be read from)
                        # and lsof maps those file descriptors to network connections.
                        '_msg': "External issue"
                    },
                },
            }
        },
    }
}
145
146
147
def build_global_message():
    """Return the global alert message matching the current thresholds.

    Every entry of the module-level ``tree`` receives a 'weight' key: the sum
    of ``value()`` over those of its 'thresholds' that are present in
    ``glances_thresholds.get()``. The message of the heaviest entry is
    returned when its weight reaches its own 'thresholds_min'; otherwise the
    message of the first entry of ``tree`` is returned.
    """
    levels = glances_thresholds.get()

    # Store the weight on each entry (the module-level tree is updated in place)
    for entry in tree:
        known_levels = (levels[name].value() for name in entry['thresholds'] if name in levels)
        entry['weight'] = sum(known_levels)

    heaviest = max(tree, key=lambda entry: entry['weight'])
    if heaviest['weight'] >= heaviest['thresholds_min']:
        # The heaviest entry reaches its minimal threshold value
        return heaviest['msg']
    # Otherwise fall back to the first entry of the tree
    return tree[0]['msg']
162
163
164
class GlancesEvents(object):
    """This class manages events inside the Glances software.

    Events is a list of event (stored in the self.events_list var)
    event_state = "OK|CAREFUL|WARNING|CRITICAL"
    event_type = "CPU*|LOAD|MEM|MON"
    event_value = value

    Item (or event) is defined by:
        {
            "begin": "begin",
            "end": "end",
            "state": "WARNING|CRITICAL",
            "type": "CPU|LOAD|MEM",
            "max": MAX,
            "avg": AVG,
            "min": MIN,
            "sum": SUM,
            "count": COUNT,
            "top": [top 3 process name],
            "desc": "Processes description",
            "sort": "top sort key",
            "global": "global alert message"
        }

    The most recent event is stored first; an ongoing event has "end" set to -1.
    """

    def __init__(self, max_events=10, min_duration=6, min_interval=6):
        """Init the events class.

        max_events: maximum size of the events list
        min_duration: events duration should be >= min_duration to be kept when closed (in seconds)
        min_interval: minimal interval between same kind of alert (in seconds)
        """
        # Maximum size of the events list
        self.set_max_events(max_events)

        # Minimal event duration time (in seconds)
        self.set_min_duration(min_duration)

        # Minimal interval between same kind of alert (in seconds)
        self.set_min_interval(min_interval)

        # Init the events list
        self.events_list = []

    def set_max_events(self, max_events):
        """Set the maximum size of the events list."""
        self.max_events = max_events

    def set_min_duration(self, min_duration):
        """Set the minimal event duration time (in seconds)."""
        self.min_duration = min_duration

    def set_min_interval(self, min_interval):
        """Set the minimum interval between same kind of alert (in seconds)."""
        self.min_interval = min_interval

    def get(self):
        """Return the raw events list."""
        return self.events_list

    def len(self):
        """Return the number of events in the events list."""
        return len(self.events_list)

    def __event_exist(self, event_time, event_type):
        """Return the position of the first matching event in the events list.

        An event matches if:
        type is matching
        and (end is < 0 or event_time - end < min_interval)
        Return -1 if the item is not found.
        """
        for index, event in enumerate(self.events_list):
            ongoing_or_recent = event['end'] < 0 or event_time - event['end'] < self.min_interval
            if ongoing_or_recent and event['type'] == event_type:
                return index
        return -1

    def get_event_sort_key(self, event_type):
        """Return the process sort key for the given alert type."""
        if event_type.startswith("MEM"):
            # Sort TOP process by memory_percent
            return 'memory_percent'
        if event_type.startswith("CPU_IOWAIT"):
            # Sort TOP process by io_counters (only for Linux OS)
            return 'io_counters'
        # Default sort key
        return 'cpu_percent'

    def set_process_sort(self, event_type):
        """Define the process auto sort key from the alert type."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key(self.get_event_sort_key(event_type))

    def reset_process_sort(self):
        """Reset the process auto sort key."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key('auto')

    def add(self, event_state, event_type, event_value, proc_list=None, proc_desc="", min_duration=None):
        """Add a new item to the events list, or update the matching one.

        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU|LOAD|MEM|..."
        event_value = value
        proc_list = list of processes (glances_processes.get_list() is used when empty or None)
        proc_desc = processes description
        min_duration = accepted for compatibility, not used by this method

        If 'event' is a 'new one', add it at the beginning of the list.
        If 'event' is not a 'new one', update the list.
        When an event is closed with a duration < self.min_duration, it is removed from the list.

        Return the number of events in the list.
        """
        # Epoch time, truncated to the second by the timetuple conversion
        event_time = time.mktime(datetime.now().timetuple())
        global_message = build_global_message()
        proc_list = proc_list or glances_processes.get_list()

        # Add or update the event
        event_index = self.__event_exist(event_time, event_type)
        if event_index < 0:
            # Event did not exist, add it
            self._create_event(event_time, event_state, event_type, event_value,
                               proc_desc, global_message)
        else:
            # Event exist, update it
            self._update_event(event_time, event_index, event_state, event_type, event_value,
                               proc_list, proc_desc, global_message)

        return self.len()

    def _create_event(self, event_time, event_state, event_type, event_value,
                      proc_desc, global_message):
        """Add a new item at the beginning of the events list.

        Item is added only if the criticality (event_state) is WARNING or CRITICAL.
        Return True if the item was added, False otherwise.
        """
        if event_state not in ("WARNING", "CRITICAL"):
            return False

        # Define the automatic process sort key
        self.set_process_sort(event_type)

        # Create the new item
        # Time is stored in Epoch format
        # Epoch -> DMYHMS = datetime.fromtimestamp(epoch)
        item = {
            "begin": event_time,
            "end": -1,
            "state": event_state,
            "type": event_type,
            "max": event_value,
            "avg": event_value,
            "min": event_value,
            "sum": event_value,
            "count": 1,
            "top": [],
            "desc": proc_desc,
            "sort": glances_processes.sort_key,
            "global": global_message,
        }

        # Add the item to the list (most recent first)
        self.events_list.insert(0, item)

        # Limit the list to 'max_events' items by dropping the oldest one
        if self.len() > self.max_events:
            self.events_list.pop()
        return True

    def _update_event(self, event_time, event_index, event_state, event_type, event_value,
                      proc_list, proc_desc, global_message):
        """Update the event stored at event_index in the list.

        An OK or CAREFUL state closes an ongoing event and leaves an already
        closed one untouched. Any other state refreshes the statistics of the
        event and marks it as ongoing again.
        Always return True.
        """
        event = self.events_list[event_index]

        if event_state in ('OK', 'CAREFUL'):
            if event['end'] < 0:
                # Close the event
                self._close_event(event_time, event_index)
            # else: event is already closed, do nothing
            return True

        # event_state == "WARNING" or event_state == "CRITICAL"
        # Set process sort key
        self.set_process_sort(event_type)

        # It's an ongoing event, set the end time to -1
        event['end'] = -1

        # Min/Max/Sum/Count/Average value
        event['min'] = min(event['min'], event_value)
        event['max'] = max(event['max'], event_value)
        event['sum'] += event_value
        event['count'] += 1
        event['avg'] = event['sum'] / event['count']

        if event_state == "CRITICAL":
            # Avoid to change from CRITICAL to WARNING
            # If an event has reached the CRITICAL state, it can't go back to WARNING
            event['state'] = event_state

            # TOP PROCESS LIST (only for CRITICAL ALERT)
            events_sort_key = self.get_event_sort_key(event_type)

            # Sort the current process list to retrieve the TOP 3 processes
            event['top'] = [p['name'] for p in sort_stats(proc_list, events_sort_key)[0:3]]
            event['sort'] = events_sort_key

        # MONITORED PROCESSES DESC
        event['desc'] = proc_desc

        # Global message
        event['global'] = global_message

        return True

    def _close_event(self, event_time, event_index):
        """Close the event stored at event_index in the list.

        The event gets event_time as its end time when it lasted at least
        min_duration seconds; otherwise it is removed from the list.
        """
        # Reset the automatic process sort key
        self.reset_process_sort()

        if event_time - self.events_list[event_index]['begin'] >= self.min_duration:
            # If event is >= min_duration seconds, set its end time
            self.events_list[event_index]['end'] = event_time
        else:
            # If event < min_duration seconds, ignore it.
            # Delete by position: list.remove() would drop the first *equal* item,
            # which is not necessarily the one stored at event_index.
            del self.events_list[event_index]

    def clean(self, critical=False):
        """Clean the events list by deleting finished items.

        Ongoing items (end < 0) are always kept.
        By default, only delete WARNING message.
        If critical = True, also delete CRITICAL message.

        Return the number of events left in the list.
        """
        # Build the clean list, keeping the original order
        self.events_list = [
            item
            for item in self.events_list
            if item['end'] < 0 or (not critical and item['state'].startswith("CRITICAL"))
        ]
        return self.len()
408
409
410
# Module-level GlancesEvents instance, created at import time with the
# constructor defaults (max_events=10, min_duration=6, min_interval=6).
glances_events = GlancesEvents()
411