Completed
Pull Request — master (#352)
by James
02:07
created

NewRelicHookSensor._server_hook_handler()   A

Complexity

Conditions 4

Size

Total Lines 21

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 4
dl 0
loc 21
rs 9.0534
1
# Licensed to the StackStorm, Inc ('StackStorm') under one or more
2
3
# contributor license agreements.  See the NOTICE file distributed with
4
# this work for additional information regarding copyright ownership.
5
# The ASF licenses this file to You under the Apache License, Version 2.0
6
# (the "License"); you may not use this file except in compliance with
7
# the License.  You may obtain a copy of the License at
8
#
9
#     http://www.apache.org/licenses/LICENSE-2.0
10
#
11
# Unless required by applicable law or agreed to in writing, software
12
# distributed under the License is distributed on an "AS IS" BASIS,
13
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
# See the License for the specific language governing permissions and
15
# limitations under the License.
16
17
import six
18
import sys
19
20
import eventlet
21
import requests
22
from flask import request, Flask
23
from six.moves import urllib_parse
24
from st2reactor.sensor.base import Sensor
25
26
# Apply eventlet's monkey patching to the os, select, socket and time modules.
# The thread module is patched as well, unless the process was started with
# '--use-debugger' on its command line.
eventlet.monkey_patch(
    os=True,
    select=True,
    socket=True,
    thread='--use-debugger' not in sys.argv,
    time=True)
32
33
# Name of the pack; it prefixes every trigger reference built below.
PACK = 'newrelic'

# Trigger references in '<pack>.<trigger name>' form.
WEB_APP_ALERT_TRIGGER_REF = '.'.join((PACK, 'WebAppAlertTrigger'))
WEB_APP_NORMAL_TRIGGER_REF = '.'.join((PACK, 'WebAppNormalTrigger'))
SERVER_ALERT_TRIGGER_REF = '.'.join((PACK, 'ServerAlertTrigger'))
SERVER_NORMAL_TRIGGER_REF = '.'.join((PACK, 'ServerNormalTrigger'))

# Keys looked up at the top level of the config dict.
NR_API_URL_KEY = 'api_url'
NR_API_KEY_KEY = 'api_key'

# Keys looked up inside the 'sensor_config' section of the config dict.
APP_HOST_KEY = 'host'
APP_PORT_KEY = 'port'
APP_URL_KEY = 'url'
NORMAL_REPORT_DELAY_KEY = 'normal_report_delay'
46
47
48
class NewRelicHookSensor(Sensor):
    """
    Sensor class that starts up a flask webapp that listens to alert hooks from NewRelic.
    It translates hooks into appropriate triggers using the following mapping -
       1. Web app incident and apdex problem opened -> WEB_APP_ALERT_TRIGGER_REF
       2. Incident escalated to downtime (app)      -> WEB_APP_ALERT_TRIGGER_REF
       3. Apdex problem closed (app)                -> WEB_APP_NORMAL_TRIGGER_REF
       4. Downtime problem closed (app)             -> WEB_APP_NORMAL_TRIGGER_REF
       5. Server incident and CPU problem opened    -> SERVER_ALERT_TRIGGER_REF
       6. Incident escalated after 5 minutes        -> SERVER_ALERT_TRIGGER_REF
       7. Server downtime ends                      -> SERVER_NORMAL_TRIGGER_REF
       8. CPU problem closed                        -> SERVER_NORMAL_TRIGGER_REF

    Note : Some hooks like cancel or disable of an incident and open or close of alert policy
    are ignored.

    All return to normal events are always fired after a delay period.
    """

    # Number of delayed "return to normal" checks after which the dispatch is abandoned.
    _MAX_NORMAL_CHECK_ATTEMPTS = 10

    def __init__(self, sensor_service, config=None):
        """
        Read the NewRelic API settings and the webhook app settings from config.

        :param sensor_service: Service object used to obtain a logger and to dispatch triggers.
        :param config: Dict holding 'api_url' and 'api_key' at the top level and the webhook
                       app settings (host, port, url, normal_report_delay) under
                       'sensor_config'. None is treated as an empty config; missing values
                       are reported by run().
        """
        # The signature allows config=None, so never call .get() on it directly.
        self._config = config if config is not None else {}
        self._sensor_service = sensor_service

        self._api_url = self._config.get(NR_API_URL_KEY, None)
        self._api_key = self._config.get(NR_API_KEY_KEY, None)

        self._host = self._get_sensor_config_param(self._config, APP_HOST_KEY)
        self._port = self._get_sensor_config_param(self._config, APP_PORT_KEY)
        self._url = self._get_sensor_config_param(self._config, APP_URL_KEY)
        # Delay, passed to eventlet.spawn_after, before a "return to normal" check runs.
        self._normal_report_delay = self._get_sensor_config_param(self._config,
                                                                  NORMAL_REPORT_DELAY_KEY, 300)

        self._app = Flask(__name__)
        self._log = self._sensor_service.get_logger(__name__)
        self._headers = {'X-Api-Key': self._api_key}

    def setup(self):
        pass

    def run(self):
        """
        Validate required params and starts up the webapp that listen to hooks from NewRelic.

        :raises Exception: If the API url, the API key or any of the webhook app settings
                           (host, port, url) is missing.
        """
        if not self._api_url:
            raise Exception('NewRelic API url not found.')
        if not self._api_key:
            raise Exception('NewRelic API key not found.')
        if not self._host or not self._port or not self._url:
            raise Exception('NewRelic webhook app config (host:%s, port:%s, url:%s)' %
                            (self._host, self._port, self._url))
        self._log.info('NewRelicHookSensor up. host %s, port %s, url %s', self._host, self._port,
                       self._url)

        @self._app.route(self._url, methods=['POST'])
        def handle_nrhook():
            # hooks are sent for alerts and deployments. Only care about alerts so ignoring
            # deployments.
            # alert body is based on the example documentation
            # https://docs.newrelic.com/docs/alerts/new-relic-alerts-beta/managing-notification-channels/customize-your-webhook-payload
            try:
                alert_body = request.get_json()
            except Exception:
                # Nothing was parsed, so there is no body to include in the message.
                self._log.exception('Failed to parse request body.')
                return 'IGNORED'

            self._log.info('Webhook data  %s', alert_body)

            # Everything below relies on dict access; reject a missing body or a JSON
            # value that is not an object instead of failing on .get().
            if not isinstance(alert_body, dict):
                self._log.warning('Ignoring hook as its body is not a JSON object: %s',
                                  alert_body)
                return 'IGNORED'

            return self._process_alert(alert_body, self._get_headers_as_dict(request.headers))

        self._app.run(host=self._host, port=self._port)

    def _process_alert(self, alert_body, hook_headers):
        """
        Filter an alert by severity and hand it to the matching hook handler.

        :param alert_body: Parsed webhook payload (dict).
        :param hook_headers: Request headers as a plain dict.
        :return: Always 'ACCEPTED'; handler failures are logged, not propagated.
        """
        if alert_body.get('severity', None) not in ['CRITICAL', 'WARN']:
            self._log.debug('Ignoring alert %s as it is not severe enough.', alert_body)
            return 'ACCEPTED'

        hook_handler = self._get_hook_handler(alert_body, hook_headers)

        # all handling based off 'docs' found in this documentation -
        # https://docs.newrelic.com/docs/alerts/new-relic-alerts-beta/managing-notification-channels/customize-your-webhook-payload#webhook-format-examples
        try:
            if hook_handler:
                hook_handler(alert_body, hook_headers)
        except Exception:
            self._log.exception('Failed to handle nr hook %s.', alert_body)

        return 'ACCEPTED'

    def _get_hook_handler(self, alert_body, hook_headers):
        """
        Pick the handler method based on the 'type' of the first entry in 'targets'.

        :return: _server_hook_handler when the type contains 'Server', _app_hook_handler
                 when it contains 'Application', otherwise None.
        """
        if not alert_body:
            return None
        try:
            target_type = alert_body.get('targets')[0].get('type')
            if 'Server' in target_type:
                return self._server_hook_handler
            if 'Application' in target_type:
                return self._app_hook_handler
        except (AttributeError, IndexError, KeyError, TypeError):
            # 'targets' missing/empty/not a list of dicts, or no usable 'type'.
            self._log.info('Unable to read target type for alert %s. Will Ignore.', alert_body)
            return None

        self._log.info('No application or server found for alert %s. Will Ignore.', alert_body)
        return None

    def _app_hook_handler(self, alert_body, hook_headers):
        """
        Handle an application alert according to its 'current_state'.

        'open' dispatches WEB_APP_ALERT_TRIGGER_REF immediately, 'closed' schedules a
        delayed normal check, 'acknowledged' is only logged. Other states do nothing.

        :raises KeyError: If alert_body has no 'current_state'.
        """
        state = alert_body['current_state']
        payload = {
            'alert': alert_body,
            'header': hook_headers
        }

        if state == 'open':
            # handled opened and escalation to downtime immediately.
            self._dispatch_trigger(WEB_APP_ALERT_TRIGGER_REF, payload)
        elif state == 'closed':
            # handled closed and recovered after a delay.
            self._log.info('App alert closed. Delay.')
            eventlet.spawn_after(self._normal_report_delay, self._dispatch_application_normal,
                                 payload)
        elif state == 'acknowledged':
            self._log.info('Ignored alert or alert acknowledged : %s.', alert_body)

    def _dispatch_application_normal(self, payload, attempt_no=0):
        """
        Dispatches WEB_APP_NORMAL_TRIGGER_REF if the application health_status is 'green'.

        Otherwise the check is rescheduled after the normal report delay, up to
        _MAX_NORMAL_CHECK_ATTEMPTS times. Failures are logged and end the retry chain.
        """
        # basic guard to avoid queuing up forever.
        if attempt_no >= self._MAX_NORMAL_CHECK_ATTEMPTS:
            self._log.warning('Abandoning WEB_APP_NORMAL_TRIGGER_REF dispatch. Payload %s', payload)
            return
        try:
            application = self._get_application(payload['alert']['targets'][0]['id'])
            if application['health_status'] in ['green']:
                self._dispatch_trigger(WEB_APP_NORMAL_TRIGGER_REF, payload)
            else:
                self._log.info('Application %s has state %s. Rescheduling normal check.',
                               application['name'], application['health_status'])
                eventlet.spawn_after(self._normal_report_delay, self._dispatch_application_normal,
                                     payload, attempt_no + 1)
        except Exception:
            self._log.exception('Failed delay dispatch. Payload %s.', payload)

    def _server_hook_handler(self, alert_body, hook_headers):
        """
        Handle a server alert according to its 'current_state'.

        'open' dispatches SERVER_ALERT_TRIGGER_REF immediately, 'closed' schedules a
        delayed normal check, 'acknowledged' is only logged. Other states do nothing.

        :raises KeyError: If alert_body has no 'current_state'.
        """
        state = alert_body['current_state']
        payload = {
            'alert': alert_body,
            'header': hook_headers
        }

        if state == 'open':
            self._dispatch_trigger(SERVER_ALERT_TRIGGER_REF, payload)
        elif state == 'closed':
            self._log.info('Server alert closed. Delay.')
            eventlet.spawn_after(self._normal_report_delay, self._dispatch_server_normal,
                                 payload)
        elif state == 'acknowledged':
            self._log.info('Alert is acknowledged : %s.', alert_body)

    def _dispatch_server_normal(self, payload, attempt_no=0):
        """
        Dispatches SERVER_NORMAL_TRIGGER_REF if the all servers health_status is 'green'.

        Otherwise the check is rescheduled after the normal report delay, up to
        _MAX_NORMAL_CHECK_ATTEMPTS times. A server the API did not return counts as not
        green. Failures are logged and end the retry chain.
        """
        # basic guard to avoid queuing up forever.
        if attempt_no >= self._MAX_NORMAL_CHECK_ATTEMPTS:
            self._log.warning('Abandoning SERVER_NORMAL_TRIGGER_REF dispatch. Payload %s', payload)
            return
        try:
            servers = self._get_servers([i['name'] for i in payload['alert']['targets']])
            # _get_servers maps name -> server dict, or None when the lookup found nothing.
            states = dict((name, server.get('health_status') if server else None)
                          for name, server in six.iteritems(servers))

            # make sure all servers are ok.
            if all(state in ['green'] for state in six.itervalues(states)):
                self._dispatch_trigger(SERVER_NORMAL_TRIGGER_REF, payload)
            else:
                for name, state in six.iteritems(states):
                    self._log.info('server %s has state %s. Rescheduling normal check.',
                                   name, state)
                eventlet.spawn_after(self._normal_report_delay, self._dispatch_server_normal,
                                     payload, attempt_no + 1)
        except Exception:
            self._log.exception('Failed delay dispatch. Payload %s.', payload)

    def _dispatch_trigger(self, trigger, payload):
        """Dispatch payload for the given trigger reference and log that it was sent."""
        self._sensor_service.dispatch(trigger, payload)
        self._log.info('Dispatched %s with payload %s.', trigger, payload)

    # newrelic API methods
    def _get_application(self, app_id):
        """
        Fetch '<api_url>applications/<app_id>.json' and return its 'application' entry.

        :return: The 'application' value of the response, or None when it is absent or empty.
        """
        url = urllib_parse.urljoin(self._api_url + 'applications/', str(app_id) + '.json')
        resp = requests.get(url, headers=self._headers).json()
        if 'application' in resp and resp['application']:
            return resp['application']
        return None

    def _get_servers(self, server_names):
        """
        Look up each server by name via '<api_url>servers.json'.

        :param server_names: Iterable of server names.
        :return: Dict mapping each name to the first server in the response, or to None
                 when the response lists no servers.
        """
        servers = {}
        url = urllib_parse.urljoin(self._api_url, 'servers.json')
        # No batch query by name support so making API calls in a tight loop. Might be
        # ok to get all servers and filter manually but that gets complex for a large number
        # of server since the API pages data.
        for server_name in server_names:
            params = {'filter[name]': server_name}
            resp = requests.get(url, headers=self._headers, params=params).json()
            found = resp.get('servers')
            servers[server_name] = found[0] if found else None
        return servers

    @staticmethod
    def _get_sensor_config_param(config, param_name, default=None):
        """Return param_name from the 'sensor_config' section of config, else default."""
        sensor_config = NewRelicHookSensor._get_sensor_config(config)
        if sensor_config:
            return sensor_config.get(param_name, default)
        return default

    @staticmethod
    def _get_sensor_config(config):
        """Return the 'sensor_config' section of config, or None when there is none."""
        if not config:
            return None
        return config.get('sensor_config', None)

    @staticmethod
    def _get_headers_as_dict(headers):
        """Build a plain dict from an iterable of (name, value) header pairs."""
        # Kept as pair iteration so a repeated header name resolves to its last value.
        return dict((k, v) for k, v in headers)

    # ignore
    def cleanup(self):
        pass

    def add_trigger(self, trigger):
        pass

    def update_trigger(self, trigger):
        pass

    def remove_trigger(self, trigger):
        pass
305