Completed
Pull Request — master (#323)
by
unknown
03:01
created

NewRelicHookSensor._is_alert_opened()   A

Complexity

Conditions 1

Size

Total Lines 2

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 1
dl 0
loc 2
rs 10
1
# Licensed to the StackStorm, Inc ('StackStorm') under one or more
2
3
# contributor license agreements.  See the NOTICE file distributed with
4
# this work for additional information regarding copyright ownership.
5
# The ASF licenses this file to You under the Apache License, Version 2.0
6
# (the "License"); you may not use this file except in compliance with
7
# the License.  You may obtain a copy of the License at
8
#
9
#     http://www.apache.org/licenses/LICENSE-2.0
10
#
11
# Unless required by applicable law or agreed to in writing, software
12
# distributed under the License is distributed on an "AS IS" BASIS,
13
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
# See the License for the specific language governing permissions and
15
# limitations under the License.
16
17
import six
18
import sys
19
20
import eventlet
21
import requests
22
from flask import request, Flask
23
from six.moves import urllib_parse
24
from st2reactor.sensor.base import Sensor
25
26
# Patch the standard library for eventlet. Thread patching is skipped when the process
# was started with --use-debugger.
eventlet.monkey_patch(
    os=True,
    select=True,
    socket=True,
    thread='--use-debugger' not in sys.argv,
    time=True)
32
33
# Trigger references have the form '<pack>.<trigger name>'.
PACK = 'newrelic'
WEB_APP_ALERT_TRIGGER_REF = '.'.join((PACK, 'WebAppAlertTrigger'))
WEB_APP_NORMAL_TRIGGER_REF = '.'.join((PACK, 'WebAppNormalTrigger'))
SERVER_ALERT_TRIGGER_REF = '.'.join((PACK, 'ServerAlertTrigger'))
SERVER_NORMAL_TRIGGER_REF = '.'.join((PACK, 'ServerNormalTrigger'))

# Keys looked up at the top level of the config.
NR_API_URL_KEY = 'api_url'
NR_API_KEY_KEY = 'api_key'

# Keys looked up in the nested 'sensor_config' section of the config.
APP_HOST_KEY = 'host'
APP_PORT_KEY = 'port'
APP_URL_KEY = 'url'
NORMAL_REPORT_DELAY_KEY = 'normal_report_delay'
46
47
48
class NewRelicHookSensor(Sensor):
49
50
    """
51
    Sensor class that starts up a flask webapp that listens to alert hooks from NewRelic.
52
    It translates hooks into appropriate triggers using the following mapping -
53
       1. Web app incident and apdex problem opened -> WEB_APP_ALERT_TRIGGER_REF
54
       2. Incident escalated to downtime (app)      -> WEB_APP_ALERT_TRIGGER_REF
55
       3. Apdex problem closed (app)                -> WEB_APP_NORMAL_TRIGGER_REF
56
       4. Downtime problem closed (app)             -> WEB_APP_NORMAL_TRIGGER_REF
57
       5. Server incident and CPU problem opened    -> SERVER_ALERT_TRIGGER_REF
58
       6. Incident escalated after 5 minutes        -> SERVER_ALERT_TRIGGER_REF
59
       7. Server downtime ends                      -> SERVER_NORMAL_TRIGGER_REF
60
       8. CPU problem closed                        -> SERVER_NORMAL_TRIGGER_REF
61
62
    Note : Some hooks like cancel or disable of an incident and open or close of alert policy
63
    are ignored.
64
65
    All return to normal events are always fired after a delay period.
66
    """
67
68
    def __init__(self, sensor_service, config=None):
        """
        Store the sensor service and read the NewRelic API and webhook app settings.

        :param sensor_service: Service used to obtain the logger and to dispatch triggers.
        :param config: Configuration dict. ``api_url`` and ``api_key`` are read from the top
                       level; ``host``, ``port``, ``url`` and ``normal_report_delay``
                       (default 300) are read from the nested ``sensor_config`` dict.
                       ``None`` is treated as an empty configuration.
        """
        # The signature allows config=None, but calling .get() on None raised
        # AttributeError; fall back to an empty dict so every lookup yields its default.
        self._config = {} if config is None else config
        self._sensor_service = sensor_service

        self._api_url = self._config.get(NR_API_URL_KEY, None)
        self._api_key = self._config.get(NR_API_KEY_KEY, None)

        self._host = self._get_sensor_config_param(self._config, APP_HOST_KEY)
        self._port = self._get_sensor_config_param(self._config, APP_PORT_KEY)
        self._url = self._get_sensor_config_param(self._config, APP_URL_KEY)
        self._normal_report_delay = self._get_sensor_config_param(
            self._config, NORMAL_REPORT_DELAY_KEY, 300)
        self._app = Flask(__name__)
        self._log = self._sensor_service.get_logger(__name__)
        # Header sent with every request made to the NewRelic API.
        self._headers = {'X-Api-Key': self._api_key}
83
84
    def setup(self):
85
        pass
86
87
    def run(self):
        """
        Validate required params and start the webapp that listens to hooks from NewRelic.

        :raises Exception: If the API url, the API key, or any of the webhook app host,
                           port or url settings is missing.
        """
        if not self._api_url:
            raise Exception('NewRelic API url not found.')
        if not self._api_key:
            raise Exception('NewRelic API key not found.')
        if not self._host or not self._port or not self._url:
            raise Exception('NewRelic webhook app config (host:%s, port:%s, url:%s)' % (
                self._host, self._port, self._url))
        self._log.info('NewRelicHookSensor up. host %s, port %s, url %s', self._host, self._port,
                       self._url)

        @self._app.route(self._url, methods=['POST'])
        def handle_nrhook():
            # hooks are sent for alerts and deployments. Only care about alerts so ignoring
            # deployments.
            # alert body is based on the example documentation
            # https://docs.newrelic.com/docs/alerts/new-relic-alerts-beta/managing-notification-channels/customize-your-webhook-payload

            try:
                alert_body = request.get_json()
            except Exception:
                # alert_body is not bound when parsing fails (referencing it here used to
                # raise UnboundLocalError), so log the raw request data instead.
                self._log.exception('Failed to parse request body: %s', request.data)
                return 'IGNORED'

            self._log.info('Webhook data  %s', alert_body)

            # A missing body or a JSON value that is not an object has no .get(); ignore it
            # instead of failing the request with an AttributeError.
            if not isinstance(alert_body, dict):
                self._log.debug(
                    'Ignoring webhook body %s as it is not a JSON object.', alert_body)
                return 'IGNORED'

            if alert_body.get('severity', None) not in ['CRITICAL', 'WARN']:
                self._log.debug(
                    'Ignoring alert %s as it is not severe enough.', alert_body)
                return 'ACCEPTED'

            hook_headers = self._get_headers_as_dict(request.headers)
            hook_handler = self._get_hook_handler(alert_body, hook_headers)

            # all handling based off 'docs' found in this documentation -
            # https://docs.newrelic.com/docs/alerts/new-relic-alerts-beta/managing-notification-channels/customize-your-webhook-payload#webhook-format-examples
            try:
                if hook_handler:
                    hook_handler(alert_body, hook_headers)
            except Exception:
                # A handler failure is logged but the hook is still acknowledged.
                self._log.exception('Failed to handle nr hook %s.', alert_body)

            return 'ACCEPTED'

        self._app.run(host=self._host, port=self._port)
137
138
    def _get_hook_handler(self, alert_body, hook_headers):
139
        if not alert_body:
140
            return None
141
        try:
142
            if 'Server' in alert_body.get('targets')[0].get('type'):
143
                return self._server_hook_handler
144
            elif 'Application' in alert_body.get('targets')[0].get('type'):
145
                return self._app_hook_handler
146
        except Exception:
147
            return None
148
        self._log.info(
149
            'No application or server found for alert %s. Will Ignore.', alert_body)
150
151
        return
152
153
    def _app_hook_handler(self, alert_body, hook_headers):
        """
        Handle a hook for an application target according to the alert's 'current_state'.

        'open' dispatches WEB_APP_ALERT_TRIGGER_REF right away, 'closed' schedules
        _dispatch_application_normal after the configured delay, 'acknowledged' is only
        logged. Any other state is ignored.

        :param alert_body: Parsed webhook payload; must contain 'current_state'.
        :param hook_headers: Request headers as a dict, forwarded in the trigger payload.
        """
        state = alert_body['current_state']
        payload = {'alert': alert_body, 'header': hook_headers}

        if state == 'open':
            # Alerts are reported immediately.
            self._dispatch_trigger(WEB_APP_ALERT_TRIGGER_REF, payload)
            return

        if state == 'closed':
            # Return to normal is only reported after a delay.
            self._log.info('App alert closed. Delay.')
            eventlet.spawn_after(
                self._normal_report_delay, self._dispatch_application_normal, payload)
            return

        if state == 'acknowledged':
            self._log.info(
                'Ignored alert or alert acknowledged : %s.', alert_body)
180
181
    def _dispatch_application_normal(self, payload, attempt_no=0):
        """
        Dispatch WEB_APP_NORMAL_TRIGGER_REF once the application health_status is 'green'.

        While the status is anything else the check is scheduled again after the configured
        delay with attempt_no increased by one.

        :param payload: Trigger payload; the application id is read from
                        payload['alert']['targets'][0]['id'].
        :param attempt_no: Number of checks already made for this payload.
        """
        # Stop once the attempt counter reaches 10 so the check is not queued forever.
        if attempt_no == 10:
            self._log.warning(
                'Abandoning WEB_APP_NORMAL_TRIGGER_REF dispatch. Payload %s', payload)
            return

        try:
            app_id = payload['alert']['targets'][0]['id']
            app = self._get_application(app_id)
            status = app['health_status']
            if status != 'green':
                self._log.info('Application %s has state %s. Rescheduling normal check.',
                               app['name'], status)
                eventlet.spawn_after(
                    self._normal_report_delay, self._dispatch_application_normal,
                    payload, attempt_no + 1)
                return
            self._dispatch_trigger(WEB_APP_NORMAL_TRIGGER_REF, payload)
        except Exception:
            self._log.exception('Failed delay dispatch. Payload %s.', payload)
202
203
    def _server_hook_handler(self, alert_body, hook_headers):
        """
        Handle a hook for a server target according to the alert's 'current_state'.

        'open' dispatches SERVER_ALERT_TRIGGER_REF right away, 'closed' schedules
        _dispatch_server_normal after the configured delay, 'acknowledged' is only logged.
        Any other state is ignored.

        :param alert_body: Parsed webhook payload; must contain 'current_state'.
        :param hook_headers: Request headers as a dict, forwarded in the trigger payload.
        """
        state = alert_body['current_state']
        payload = {
            'alert': alert_body,
            'header': hook_headers
        }

        if state == 'open':
            self._dispatch_trigger(SERVER_ALERT_TRIGGER_REF, payload)

        elif state == 'closed':
            # The message used to read 'App alert closed', copied from the application
            # handler, which made server and application closes indistinguishable in logs.
            self._log.info('Server alert closed. Delay.')
            eventlet.spawn_after(self._normal_report_delay, self._dispatch_server_normal,
                                 payload)

        elif state == 'acknowledged':
            self._log.info('Alert is acknowledged : %s.', alert_body)
224
225
    def _dispatch_server_normal(self, payload, attempt_no=0):
        """
        Dispatch SERVER_NORMAL_TRIGGER_REF once the health_status of all servers is 'green'.

        While any server is not 'green' the check is scheduled again after the configured
        delay with attempt_no increased by one.

        :param payload: Trigger payload; server names are read from the 'name' of every
                        entry in payload['alert']['targets'].
        :param attempt_no: Number of checks already made for this payload.
        """
        # basic guard to avoid queuing up forever.
        if attempt_no >= 10:
            self._log.warning(
                'Abandoning SERVER_NORMAL_TRIGGER_REF dispatch. Payload %s', payload)
            return
        try:
            servers = self._get_servers(
                [target['name'] for target in payload['alert']['targets']])
            # A server that could not be looked up is stored as None and counts as not ok.
            all_servers_ok = all(
                server is not None and server['health_status'] == 'green'
                for server in six.itervalues(servers))

            if all_servers_ok:
                self._dispatch_trigger(SERVER_NORMAL_TRIGGER_REF, payload)
                return

            # Iterate over (name, server) pairs. Iterating the dict directly yields only
            # the names, and indexing those strings raised TypeError, which aborted the
            # reschedule below.
            for name, server in six.iteritems(servers):
                status = server['health_status'] if server else None
                self._log.info('server %s has state %s. Rescheduling normal check.',
                               name, status)
            eventlet.spawn_after(self._normal_report_delay, self._dispatch_server_normal,
                                 payload, attempt_no + 1)
        except Exception:
            self._log.exception('Failed delay dispatch. Payload %s.', payload)
254
255
    def _dispatch_trigger(self, trigger, payload):
256
        self._sensor_service.dispatch(trigger, payload)
257
        self._log.info('Dispatched %s with payload %s.', trigger, payload)
258
259
    # newrelic API methods
260
    def _get_application(self, app_id):
        """
        Fetch one application from the NewRelic API.

        :param app_id: Id of the application; converted with str() to build the url.
        :return: Value of the 'application' key of the JSON response, or None when the key
                 is missing or its value is empty.
        """
        # Without a trailing slash on the configured API url the plain concatenation
        # produced '<api_url>applications/...'; normalise the base first.
        base_url = self._api_url if self._api_url.endswith('/') else self._api_url + '/'
        url = urllib_parse.urljoin(base_url + 'applications/', str(app_id) + '.json')
        resp = requests.get(url, headers=self._headers).json()
        if 'application' in resp:
            return resp['application'] or None
        return None
268
269
    def _get_servers(self, server_names):
        """
        Fetch the servers with the given names from the NewRelic API.

        :param server_names: Iterable of server names to look up.
        :return: Dict mapping each requested name to the first entry of the 'servers' list
                 in the response, or to None when that list is missing or empty.
        """
        # urljoin drops the last path segment of a base url that has no trailing slash,
        # so normalise the configured API url first. The url is the same for every name.
        base_url = self._api_url if self._api_url.endswith('/') else self._api_url + '/'
        url = urllib_parse.urljoin(base_url, 'servers.json')

        servers = {}
        # No batch query by name support so making API calls in a tight loop. Might be
        # ok to get all servers and filter manually but that gets complex for a large number
        # of servers since the API pages data.
        for server_name in server_names:
            params = {'filter[name]': server_name}
            resp = requests.get(
                url, headers=self._headers, params=params).json()
            # A response without a 'servers' key is treated like an empty result instead
            # of raising KeyError.
            matches = resp.get('servers')
            servers[server_name] = matches[0] if matches else None
        return servers
282
283
    @staticmethod
284
    def _get_sensor_config_param(config, param_name, default=None):
285
        sensor_config = NewRelicHookSensor._get_sensor_config(config)
286
        if sensor_config:
287
            return sensor_config.get(param_name, default)
288
        return default
289
290
    @staticmethod
291
    def _get_sensor_config(config):
292
        return config.get('sensor_config', None)
293
294
    @staticmethod
295
    def _get_headers_as_dict(headers):
296
        headers_dict = {}
297
        for k, v in headers:
298
            headers_dict[k] = v
299
        return headers_dict
300
301
    # ignore
302
    def cleanup(self):
        """
        Intentionally does nothing.
        """
304
305
    def add_trigger(self, trigger):
        """
        Intentionally does nothing; the trigger argument is ignored.
        """
307
308
    def update_trigger(self, trigger):
        """
        Intentionally does nothing; the trigger argument is ignored.
        """
310
311
    def remove_trigger(self, trigger):
        """
        Intentionally does nothing; the trigger argument is ignored.
        """
313