Test Failed
Push — develop ( 684d1d...272dc2 )
by Nicolas
02:09
created

glances.events.GlancesEvents._close_event()   A

Complexity

Conditions 2

Size

Total Lines 12
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 5
nop 3
dl 0
loc 12
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
#
3
# This file is part of Glances.
4
#
5
# SPDX-FileCopyrightText: 2022 Nicolas Hennion <[email protected]>
6
#
7
# SPDX-License-Identifier: LGPL-3.0-only
8
#
9
10
"""Manage Glances events (previously Glances logs in Glances < 3.1)."""
11
12
import time
13
from datetime import datetime
14
15
from glances.logger import logger
16
from glances.processes import glances_processes, sort_stats
17
from glances.thresholds import glances_thresholds
18
19
# Static decision tree for the global alert message
# - msg: Message to be displayed (result of the decision tree)
# - thresholds: a list of stats to take into account
# - thresholds_min: minimal value of the thresholds sum
# -                 0: OK
# -                 1: CAREFUL
# -                 2: WARNING
# -                 3: CRITICAL
#
# The first entry is the fallback message: it has no thresholds, so its
# computed weight is always 0.
# Note: build_global_message() adds a 'weight' key to every entry at runtime.
tree = [
    {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
    {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
    {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
    {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
    {
        'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
        'thresholds': ['cpu_steal'],
        'thresholds_min': 2,
    },
    {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
    {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
    {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
    {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
]
42
43
# TODO: change the algo to use the following decision tree
# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
# _yes means threshold >= 2
# _no  means threshold < 2
# With threshold:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
#
# Structure: a stat name maps to a {'_yes': ..., '_no': ...} dict; each branch
# either nests another stat name or ends with a '_msg' leaf.
tree_new = {
    'cpu_iowait': {
        '_yes': {
            'memswap': {
                '_yes': {
                    'mem': {
                        '_yes': {
                            # Once you've identified the offenders, the resolution will again
                            # depend on whether their memory usage seems business-as-usual or not.
                            # For example, a memory leak can be satisfactorily addressed by a one-time
                            # or periodic restart of the process.
                            # - if memory usage seems anomalous: kill the offending processes.
                            # - if memory usage seems business-as-usual: add RAM to the server,
                            # or split high-memory using services to other servers.
                            '_msg': "Memory issue"
                        },
                        '_no': {
                            # ???
                            '_msg': "Swap issue"
                        },
                    }
                },
                '_no': {
                    # Low swap means you have a "real" IO wait problem. The next step is to see what's hogging
                    # your IO.
                    # iotop is an awesome tool for identifying io offenders. Two things to note:
                    # unless you've already installed iotop, it's probably not already on your system.
                    # Recommendation: install it before you need it - it's no fun trying to install a
                    # troubleshooting tool on an overloaded machine (iotop requires a Linux of 2.62 or above)
                    '_msg': "I/O issue"
                },
            }
        },
        '_no': {
            'cpu_total': {
                '_yes': {
                    'cpu_user': {
                        '_yes': {
                            # We expect the user-time percentage to be high.
                            # There's most likely a program or service you've configured on your server
                            # that's hogging CPU.
                            # Checking the % user time just confirms this. When you see that the % user-time
                            # is high, it's time to see what executable is monopolizing the CPU.
                            # Once you've confirmed that the % user-time is high, check the process list
                            # (also provided by top).
                            # By default, top sorts the process list by % CPU, so you can just look at the
                            # top process or processes.
                            # If there's a single process hogging the CPU in a way that seems abnormal, it's
                            # an anomalous situation that a service restart can fix.
                            # If there are multiple processes taking up CPU resources, or if there's one
                            # process that takes lots of resources while otherwise functioning normally,
                            # then your setup may just be underpowered. You'll need to upgrade your server
                            # (add more cores), or split services out onto other boxes.
                            # In either case, you have a resolution:
                            # - if situation seems anomalous: kill the offending processes.
                            # - if situation seems typical given history: upgrade server or add more servers.
                            '_msg': "CPU issue with user process(es)"
                        },
                        '_no': {
                            'cpu_steal': {
                                '_yes': {
                                    '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
                                },
                                '_no': {'_msg': "CPU issue with system process(es)"},
                            }
                        },
                    }
                },
                # NOTE(review): unlike every other branch, this '_no' holds '_yes'/'_no'
                # directly, without a stat name in between - looks like a stat key
                # (possibly 'mem') is missing here; confirm before using this tree.
                '_no': {
                    '_yes': {
                        # ???
                        '_msg': "Memory issue"
                    },
                    '_no': {
                        # Your slowness isn't due to CPU or IO problems, so it's likely an application-specific
                        # issue. It's also possible that the slowness is being caused by another server in your
                        # cluster, or by an external service you rely on.
                        # Start by checking important applications for uncharacteristic slowness (the DB is a
                        # good place to start), think through which parts of your infrastructure could be
                        # slowed down externally.
                        # For example, do you use an externally hosted email service that could slow down
                        # critical parts of your application?
                        # If you suspect another server in your cluster, strace and lsof can provide information
                        # on what the process is doing or waiting on. Strace will show you which file
                        # descriptors are being read or written to (or being attempted to be read from) and
                        # lsof can give you a mapping of those file descriptors to network connections.
                        '_msg': "External issue"
                    },
                },
            }
        },
    }
}
145
146
147
def build_global_message():
    """Parse the decision tree and return the message.

    Note: message corresponding to the current thresholds values

    Each entry of the module-level ``tree`` gets a 'weight' key (stored on the
    entry itself) equal to the sum of ``.value()`` of the current thresholds
    listed in the entry's 'thresholds' and present in
    ``glances_thresholds.get()``.

    Return the 'msg' of the entry with the highest weight if that weight
    reaches the entry's 'thresholds_min'; otherwise return the 'msg' of the
    first entry of the tree.
    """
    # Compute the weight for each item in the tree
    current_thresholds = glances_thresholds.get()
    for i in tree:
        i['weight'] = sum(current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds)

    # On equal weights, max() keeps the first entry in tree order
    themax = max(tree, key=lambda d: d['weight'])
    if themax['weight'] >= themax['thresholds_min']:
        # The weight reaches the minimal threshold value of the entry
        return themax['msg']
    return tree[0]['msg']
162
163
164
class GlancesEvents(object):
    """This class manages events inside the Glances software.

    Events is a list of event (stored in the self.events_list var)
    event_state = "OK|CAREFUL|WARNING|CRITICAL"
    event_type = "CPU*|LOAD|MEM|MON"
    event_value = value

    Item (or event) is defined by:
        {
            "begin": "begin",
            "end": "end",
            "state": "WARNING|CRITICAL",
            "type": "CPU|LOAD|MEM",
            "max": MAX,
            "avg": AVG,
            "min": MIN,
            "sum": SUM,
            "count": COUNT,
            "top": [top 3 process name],
            "desc": "Processes description",
            "sort": "top sort key",
            "global": "global alert message"
        }

    "begin" and "end" are Epoch timestamps; "end" is -1 while the event is
    still ongoing. The most recent event is stored first in the list.
    """

    def __init__(self, max_events=10, min_duration=6, min_interval=6):
        """Init the events class.

        max_events: maximum size of the events list
        min_duration: events duration should be > min_duration to be taken into account (in seconds)
        min_interval: minimal interval between same kind of alert (in seconds)
        """
        # Maximum size of the events list
        self.set_max_events(max_events)

        # Minimal event duration time (in seconds)
        self.set_min_duration(min_duration)

        # Minimal interval between same kind of alert (in seconds)
        self.set_min_interval(min_interval)

        # Init the events list
        self.events_list = []

    def set_max_events(self, max_events):
        """Set the maximum size of the events list."""
        self.max_events = max_events

    def set_min_duration(self, min_duration):
        """Set the minimal event duration time (in seconds)."""
        self.min_duration = min_duration

    def set_min_interval(self, min_interval):
        """Set the minimum interval between same kind of alert (in seconds)."""
        self.min_interval = min_interval

    def get(self):
        """Return the raw events list."""
        return self.events_list

    def len(self):
        """Return the number of events in the events list."""
        return len(self.events_list)

    def __event_exist(self, event_time, event_type):
        """Return the event position in the events list if:
        type is matching
        and (end is < 0 or event_time - end < min_interval)
        Return -1 if the item is not found.
        """
        for index, event in enumerate(self.events_list):
            # Either the event is still ongoing (end < 0) or it was closed
            # less than min_interval seconds ago
            is_recent = event['end'] < 0 or event_time - event['end'] < self.min_interval
            if is_recent and event['type'] == event_type:
                return index
        return -1

    def get_event_sort_key(self, event_type):
        """Return the process sort key matching the given event type."""
        # Process sort depending on alert type
        if event_type.startswith("MEM"):
            # Sort TOP process by memory_percent
            return 'memory_percent'
        if event_type.startswith("CPU_IOWAIT"):
            # Sort TOP process by io_counters (only for Linux OS)
            return 'io_counters'
        # Default sort is...
        return 'cpu_percent'

    def set_process_sort(self, event_type):
        """Define the process auto sort key from the alert type.

        Nothing is done when glances_processes.auto_sort is false.
        """
        if glances_processes.auto_sort:
            glances_processes.set_sort_key(self.get_event_sort_key(event_type))

    def reset_process_sort(self):
        """Reset the process auto sort key.

        Nothing is done when glances_processes.auto_sort is false.
        """
        if glances_processes.auto_sort:
            glances_processes.set_sort_key('auto')

    def add(self, event_state, event_type, event_value, proc_list=None, proc_desc="", min_duration=None):
        """Add a new item to the events list.

        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU|LOAD|MEM|..."
        event_value = value
        proc_list = list of processes (if empty or None, glances_processes.get_list() is used)
        proc_desc = processes description
        min_duration = accepted for compatibility, not used by this method

        If 'event' is a 'new one', add it at the beginning of the list.
        If 'event' is not a 'new one', update the list.
        When an event is closed with a duration < self.min_duration, it is removed from the list.

        Return the number of events in the list.
        """
        # Current time in Epoch format (truncated to the second by timetuple())
        event_time = time.mktime(datetime.now().timetuple())
        global_message = build_global_message()
        proc_list = proc_list or glances_processes.get_list()

        # Add or update the event
        event_index = self.__event_exist(event_time, event_type)
        if event_index < 0:
            # Event did not exist, add it
            self._create_event(event_time, event_state, event_type, event_value,
                               proc_desc, global_message)
        else:
            # Event exist, update it
            self._update_event(event_time, event_index, event_state, event_type, event_value,
                               proc_list, proc_desc, global_message)

        return self.len()

    def _create_event(self, event_time, event_state, event_type, event_value,
                      proc_desc, global_message):
        """Add a new item in the events list.

        Item is added only if the criticality (event_state) is WARNING or CRITICAL.

        Return True if the item has been added, False otherwise.
        """
        if event_state not in ("WARNING", "CRITICAL"):
            return False

        # Define the automatic process sort key
        self.set_process_sort(event_type)

        # Create the new item
        # Time is stored in Epoch format
        # Epoch -> DMYHMS = datetime.fromtimestamp(epoch)
        item = {
            "begin": event_time,
            "end": -1,
            "state": event_state,
            "type": event_type,
            "max": event_value,
            "avg": event_value,
            "min": event_value,
            "sum": event_value,
            "count": 1,
            "top": [],
            "desc": proc_desc,
            "sort": glances_processes.sort_key,
            "global": global_message,
        }

        # Add the item at the beginning of the list
        self.events_list.insert(0, item)

        # Limit the list to 'max_events' items (drop the last one)
        if self.len() > self.max_events:
            self.events_list.pop()
        return True

    def _update_event(self, event_time, event_index, event_state, event_type, event_value,
                      proc_list, proc_desc, global_message):
        """Update the event stored at event_index in the list.

        - OK or CAREFUL state: close the event if it is still ongoing,
          do nothing if it is already closed.
        - Any other state (WARNING or CRITICAL): (re)open the event and
          update its statistics, description and global message.

        Always return True.
        """
        event = self.events_list[event_index]

        if event_state in ('OK', 'CAREFUL'):
            if event['end'] < 0:
                # Close the event
                self._close_event(event_time, event_index)
            # else: event is already closed, do nothing
            return True

        # event_state == "WARNING" or event_state == "CRITICAL"
        # Set process sort key
        self.set_process_sort(event_type)

        # It's an ongoing event, set the end time to -1
        event['end'] = -1

        # Min/Max/Sum/Count/Average value
        event['min'] = min(event['min'], event_value)
        event['max'] = max(event['max'], event_value)
        event['sum'] += event_value
        event['count'] += 1
        event['avg'] = event['sum'] / event['count']

        if event_state == "CRITICAL":
            # Avoid to change from CRITICAL to WARNING
            # If an event has reached the CRITICAL state, it can't go back to WARNING
            event['state'] = event_state

            # TOP PROCESS LIST (only for CRITICAL ALERT)
            events_sort_key = self.get_event_sort_key(event_type)

            # Sort the current process list to retrieve the TOP 3 processes
            event['top'] = [p['name'] for p in sort_stats(proc_list, events_sort_key)[0:3]]
            event['sort'] = events_sort_key

        # MONITORED PROCESSES DESC
        event['desc'] = proc_desc

        # Global message
        event['global'] = global_message

        return True

    def _close_event(self, event_time, event_index):
        """Close the event stored at event_index in the list.

        The event end time is set to event_time if the event lasted at least
        self.min_duration seconds; otherwise the event is removed from the list.
        """
        # Reset the automatic process sort key
        self.reset_process_sort()

        event = self.events_list[event_index]
        if event_time - event['begin'] >= self.min_duration:
            # If event is >= min_duration seconds, set the end of the event
            event['end'] = event_time
        else:
            # If event < min_duration seconds, ignore it.
            # Delete by position: list.remove() would scan the list and drop
            # the first item comparing equal, not necessarily this one.
            del self.events_list[event_index]

    def clean(self, critical=False):
        """Clean the events list by deleting finished items.

        By default, only delete WARNING message.
        If critical = True, also delete CRITICAL message.

        Ongoing events (end < 0) are always kept.
        Return the number of events remaining in the list.
        """
        # Build the new clean list (order is preserved)
        self.events_list = [
            item
            for item in self.events_list
            if item['end'] < 0 or (not critical and item['state'].startswith("CRITICAL"))
        ]
        return self.len()
408
409
410
# Module-level GlancesEvents instance, created at import time with the
# constructor's default arguments (max_events=10, min_duration=6, min_interval=6).
glances_events = GlancesEvents()
411