Test Failed
Pull Request — develop (#2556)
by
unknown
02:15
created

glances.plugins.alert.PluginModel.msg_curse()   C

Complexity

Conditions 9

Size

Total Lines 49
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 9
eloc 29
nop 3
dl 0
loc 49
rs 6.6666
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
#
3
# This file is part of Glances.
4
#
5
# SPDX-FileCopyrightText: 2023 Nicolas Hennion <[email protected]>
6
#
7
# SPDX-License-Identifier: LGPL-3.0-only
8
#
9
10
"""Alert plugin."""
11
12
from datetime import datetime
13
from time import tzname
14
import pytz
15
16
from glances.logger import logger
17
from glances.events import glances_events
18
from glances.thresholds import glances_thresholds
19
20
# from glances.logger import logger
21
from glances.plugins.plugin.model import GlancesPluginModel
22
23
# Static decision tree for the global alert message.
# Each entry is a dict with the following keys:
# - msg: Message to be displayed (result of the decision tree)
# - thresholds: a list of stats to take into account
# - thresholds_min: minimal value of the thresholds sum
# -                 0: OK
# -                 1: CAREFUL
# -                 2: WARNING
# -                 3: CRITICAL
# The first entry is the default message: it has no stats attached and is
# returned when no other entry reaches its minimal value.
tree = [
    {
        'msg': 'No warning or critical alert detected',
        'thresholds': [],
        'thresholds_min': 0,
    },
    {
        'msg': 'High CPU user mode',
        'thresholds': ['cpu_user'],
        'thresholds_min': 2,
    },
    {
        'msg': 'High CPU kernel usage',
        'thresholds': ['cpu_system'],
        'thresholds_min': 2,
    },
    {
        'msg': 'High CPU I/O waiting',
        'thresholds': ['cpu_iowait'],
        'thresholds_min': 2,
    },
    {
        'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
        'thresholds': ['cpu_steal'],
        'thresholds_min': 2,
    },
    {
        'msg': 'High CPU niced value',
        'thresholds': ['cpu_niced'],
        'thresholds_min': 2,
    },
    {
        'msg': 'System overloaded in the last 5 minutes',
        'thresholds': ['load'],
        'thresholds_min': 2,
    },
    {
        'msg': 'High swap (paging) usage',
        'thresholds': ['memswap'],
        'thresholds_min': 2,
    },
    {
        'msg': 'High memory consumption',
        'thresholds': ['mem'],
        'thresholds_min': 2,
    },
]
46
47
# TODO: change the algo to use the following decision tree
# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
# Each level is keyed by a stat name whose value holds two branches:
# _yes means threshold >= 2
# _no  means threshold < 2
# A leaf is a dict holding the final '_msg' to display.
# With threshold:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree_new = {
    'cpu_iowait': {
        '_yes': {
            'memswap': {
                '_yes': {
                    'mem': {
                        '_yes': {
                            # Once you've identified the offenders, the resolution will again
                            # depend on whether their memory usage seems business-as-usual or not.
                            # For example, a memory leak can be satisfactorily addressed by a one-time
                            # or periodic restart of the process.
                            # - if memory usage seems anomalous: kill the offending processes.
                            # - if memory usage seems business-as-usual: add RAM to the server,
                            # or split high-memory using services to other servers.
                            '_msg': "Memory issue"
                        },
                        '_no': {
                            # ???
                            '_msg': "Swap issue"
                        },
                    }
                },
                '_no': {
                    # Low swap means you have a "real" IO wait problem. The next step is to see what's hogging your IO.
                    # iotop is an awesome tool for identifying IO offenders. Two things to note:
                    # unless you've already installed iotop, it's probably not already on your system.
                    # Recommendation: install it before you need it - it's no fun trying to install a troubleshooting
                    # tool on an overloaded machine (iotop requires a Linux of 2.62 or above)
                    '_msg': "I/O issue"
                },
            }
        },
        '_no': {
            'cpu_total': {
                '_yes': {
                    'cpu_user': {
                        '_yes': {
                            # We expect the user-time percentage to be high.
                            # There's most likely a program or service you've configured on your server that's
                            # hogging CPU.
                            # Checking the % user time just confirms this. When you see that the % user-time is high,
                            # it's time to see what executable is monopolizing the CPU.
                            # Once you've confirmed that the % user-time is high, check the process list (also
                            # provided by top).
                            # By default, top sorts the process list by % CPU, so you can just look at the top process
                            # or processes.
                            # If there's a single process hogging the CPU in a way that seems abnormal, it's an
                            # anomalous situation that a service restart can fix.
                            # If there are multiple processes taking up CPU resources, or if there's one process
                            # that takes lots of resources while otherwise functioning normally, then your setup
                            # may just be underpowered. You'll need to upgrade your server (add more cores),
                            # or split services out onto other boxes.
                            # In either case, you have a resolution:
                            # - if situation seems anomalous: kill the offending processes.
                            # - if situation seems typical given history: upgrade server or add more servers.
                            '_msg': "CPU issue with user process(es)"
                        },
                        '_no': {
                            'cpu_steal': {
                                '_yes': {
                                    '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
                                },
                                '_no': {'_msg': "CPU issue with system process(es)"},
                            }
                        },
                    }
                },
                '_no': {
                    # NOTE(review): unlike the other levels, this branch has no stat key
                    # between '_no' and its '_yes'/'_no' children - confirm which stat is meant.
                    '_yes': {
                        # ???
                        '_msg': "Memory issue"
                    },
                    '_no': {
                        # Your slowness isn't due to CPU or IO problems, so it's likely an application-specific issue.
                        # It's also possible that the slowness is being caused by another server in your cluster, or
                        # by an external service you rely on.
                        # Start by checking important applications for uncharacteristic slowness (the DB is a good
                        # place to start), then think through which parts of your infrastructure could be slowed
                        # down externally.
                        # For example, do you use an externally hosted email service that could slow down critical
                        # parts of your application?
                        # If you suspect another server in your cluster, strace and lsof can provide information on
                        # what the process is doing or waiting on. Strace will show you which file descriptors are
                        # being read or written to (or being attempted to be read from) and lsof can give you a
                        # mapping of those file descriptors to network connections.
                        '_msg': "External issue"
                    },
                },
            }
        },
    }
}
149
150
151
def global_message():
    """Parse the decision tree and return the message.

    Note: message corresponding to the current thresholds values

    Each entry of the module-level ``tree`` gets a ``weight`` key: the sum of
    the current threshold values of the stats listed in its ``thresholds``
    (stats without a current threshold are ignored). The message of the
    heaviest entry is returned when its weight reaches the entry's
    ``thresholds_min``; otherwise the message of the first entry of ``tree``
    is returned.

    Side effect: the ``weight`` key is written into the shared ``tree`` dicts.
    """
    # Compute the weight for each item in the tree
    current_thresholds = glances_thresholds.get()
    for item in tree:
        item['weight'] = sum(
            current_thresholds[stat].value() for stat in item['thresholds'] if stat in current_thresholds
        )

    # On equal weights max() keeps the first entry of the tree
    themax = max(tree, key=lambda d: d['weight'])

    # Only report the heaviest entry if its weight is >= its minimal threshold value
    if themax['weight'] >= themax['thresholds_min']:
        return themax['msg']
    return tree[0]['msg']
166
167
168
class PluginModel(GlancesPluginModel):
    """Glances alert plugin.

    Only for display.
    """

    def __init__(self, args=None, config=None):
        """Init the plugin.

        :param args: forwarded unchanged to the parent plugin model
        :param config: configuration object (or None); when it has an
            'alert' or 'alerts' section, it is used to set the maximum number
            of events kept by glances_events
        """
        super(PluginModel, self).__init__(args=args, config=config, stats_init_value=[])

        # We want to display the stat in the curse interface
        self.display_curse = True

        # Set the message position
        self.align = 'bottom'

        # Set the maximum number of events to display
        # NOTE(review): either section name enables this, but the value is
        # always read from the 'alert' section - confirm this is intended.
        if config is not None and (config.has_section('alert') or config.has_section('alerts')):
            glances_events.set_max_events(config.get_int_value('alert', 'max_events', default=10))

    def update(self):
        """Nothing to do here. Just return the global glances_log."""
        # Set the stats to the glances_events
        self.stats = glances_events.get()
        # Define the global message thanks to the current thresholds
        # and the decision tree
        # !!! Call directly in the msg_curse function
        # global_message()

    def _display_timezone(self):
        """Return the tzinfo used to display the alert start time.

        The local (non-DST) zone name reported by ``time.tzname`` is looked up
        in pytz. That name can be empty or an abbreviation unknown to pytz,
        so fall back to UTC instead of letting the lookup crash the display.
        """
        name = tzname[0] if tzname[0] else 'UTC'
        try:
            return pytz.timezone(name)
        except pytz.UnknownTimeZoneError:
            return pytz.utc

    def msg_curse(self, args=None, max_width=None):
        """Return the dict to display in the curse interface.

        The fields of an alert are read by position:
        0 is the start timestamp, 1 the end timestamp (> 0 once the alert is
        finished), 2 the state (also used as curse decoration while ongoing),
        3 the alert type, 4/5/6 the max/mean/min values and 9 a list of
        processes (dicts with a 'name' key).

        :param args: unused here
        :param max_width: unused here
        :return: list of curse lines; empty when there are no stats or the
            plugin is disabled
        """
        # Init the return message
        ret = []

        # Only process if display plugin enable...
        if not self.stats or self.is_disabled():
            return ret

        # The display timezone does not depend on the alert: resolve it once
        display_tz = self._display_timezone()

        # Build the string message
        # Header
        ret.append(self.curse_add_line(global_message(), "TITLE"))
        # Loop over alerts
        for alert in self.stats:
            finished = alert[1] > 0

            # New line
            ret.append(self.curse_new_line())
            # Start
            msg = str(datetime.fromtimestamp(alert[0], tz=display_tz))
            ret.append(self.curse_add_line(msg))
            # Duration
            if finished:
                # If finished display duration.
                # Both ends are converted in UTC so that a DST change between
                # the start and the end of the alert does not shift the result.
                duration = datetime.fromtimestamp(alert[1], tz=pytz.utc) - datetime.fromtimestamp(
                    alert[0], tz=pytz.utc
                )
                msg = ' ({})'.format(duration)
            else:
                msg = ' (ongoing)'
            ret.append(self.curse_add_line(msg))
            ret.append(self.curse_add_line(" - "))
            # Infos
            if finished:
                # If finished do not display status
                msg = '{} on {}'.format(alert[2], alert[3])
                ret.append(self.curse_add_line(msg))
            else:
                msg = str(alert[3])
                ret.append(self.curse_add_line(msg, decoration=alert[2]))
            # Min / Mean / Max
            if self.approx_equal(alert[6], alert[4], tolerance=0.1):
                # Min and max are close enough: only display the mean
                msg = ' ({:.1f})'.format(alert[5])
            else:
                msg = ' (Min:{:.1f} Mean:{:.1f} Max:{:.1f})'.format(alert[6], alert[5], alert[4])
            ret.append(self.curse_add_line(msg))
            # Top processes
            top_process = ', '.join(p['name'] for p in alert[9])
            if top_process != '':
                msg = ': {}'.format(top_process)
                ret.append(self.curse_add_line(msg))

        return ret

    def approx_equal(self, a, b, tolerance=0.0):
        """Compare a with b using the tolerance (if numerical).

        :param a: first value
        :param b: second value
        :param tolerance: relative tolerance, applied to the largest absolute
            value of a and b
        :return: True if a equals b, or if both support arithmetic and their
            difference is within the tolerance; False otherwise
        """
        if a == b:
            return True
        try:
            # Works for any sign; non numerical values raise TypeError here
            return abs(a - b) <= max(abs(a), abs(b)) * tolerance
        except TypeError:
            # Not numerical and not equal
            return False
255