Completed
Pull Request — master (#323)
by
unknown
02:41
created

NewRelicHookSensor._get_hook_handler()   B

Complexity

Conditions 5

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 5
dl 0
loc 14
rs 8.5455
1
# Licensed to the StackStorm, Inc ('StackStorm') under one or more
2
3
# contributor license agreements.  See the NOTICE file distributed with
4
# this work for additional information regarding copyright ownership.
5
# The ASF licenses this file to You under the Apache License, Version 2.0
6
# (the "License"); you may not use this file except in compliance with
7
# the License.  You may obtain a copy of the License at
8
#
9
#     http://www.apache.org/licenses/LICENSE-2.0
10
#
11
# Unless required by applicable law or agreed to in writing, software
12
# distributed under the License is distributed on an "AS IS" BASIS,
13
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
# See the License for the specific language governing permissions and
15
# limitations under the License.
16
17
import six
18
import sys
19
import json
20
21
import eventlet
22
import requests
23
from flask import request, Flask
24
from six.moves import urllib_parse
25
from st2reactor.sensor.base import Sensor
26
27
# Apply eventlet monkey patching to os, select, socket and time. The thread module is
# patched as well unless the process was started with the --use-debugger flag.
eventlet.monkey_patch(
    os=True,
    select=True,
    socket=True,
    thread='--use-debugger' not in sys.argv,
    time=True)
33
34
# Pack name; every trigger reference below has the form '<pack>.<trigger name>'.
PACK = 'newrelic'
WEB_APP_ALERT_TRIGGER_REF = '%s.%s' % (PACK, 'WebAppAlertTrigger')
WEB_APP_NORMAL_TRIGGER_REF = '%s.%s' % (PACK, 'WebAppNormalTrigger')
SERVER_ALERT_TRIGGER_REF = '%s.%s' % (PACK, 'ServerAlertTrigger')
SERVER_NORMAL_TRIGGER_REF = '%s.%s' % (PACK, 'ServerNormalTrigger')

# Top-level config keys for the NewRelic API.
NR_API_URL_KEY = 'api_url'
NR_API_KEY_KEY = 'api_key'

# Keys looked up inside the 'sensor_config' section of the config.
APP_HOST_KEY = 'host'
APP_PORT_KEY = 'port'
APP_URL_KEY = 'url'
NORMAL_REPORT_DELAY_KEY = 'normal_report_delay'
47
48
49
class NewRelicHookSensor(Sensor):
50
    """
51
    Sensor class that starts up a flask webapp that listens to alert hooks from NewRelic.
52
    It translates hooks into appropriate triggers using the following mapping -
53
       1. Web app incident and apdex problem opened -> WEB_APP_ALERT_TRIGGER
54
       2. Incident escalated to downtime (app)      -> WEB_APP_ALERT_TRIGGER
55
       3. Apdex problem closed (app)                -> WEB_APP_NORMAL_TRIGGER_REF
56
       4. Downtime problem closed (app)             -> WEB_APP_NORMAL_TRIGGER_REF
57
       5. Server incident and CPU problem opened    -> SERVER_ALERT_TRIGGER_REF
58
       6. Incident escalated after 5 minutes        -> SERVER_ALERT_TRIGGER_REF
59
       7. Server downtime ends                      -> SERVER_NORMAL_TRIGGER_REF
60
       8. CPU problem closed                        -> SERVER_NORMAL_TRIGGER_REF
61
62
    Note : Some hooks like cancel or disable of an incident and open or close of alert policy
63
    are ignored.
64
65
    All return to normal events are always fired after a delay period.
66
    """
67
68
    def __init__(self, sensor_service, config=None):
        """
        Store the sensor service and read the NewRelic and webhook app settings.

        :param sensor_service: Service object; it is used here to obtain a logger and later
                               to dispatch triggers.
        :param config: Configuration dict. 'api_url' and 'api_key' are read from its top
                       level; 'host', 'port', 'url' and 'normal_report_delay' (default 300)
                       are read from its 'sensor_config' section. None is treated as an
                       empty configuration.
        """
        # The signature allows config=None, so fall back to an empty dict; otherwise the
        # .get() lookups below fail with AttributeError before run() can report what is
        # actually missing.
        if config is None:
            config = {}
        self._config = config
        self._sensor_service = sensor_service

        self._api_url = config.get(NR_API_URL_KEY, None)
        self._api_key = config.get(NR_API_KEY_KEY, None)

        self._host = self._get_sensor_config_param(self._config, APP_HOST_KEY)
        self._port = self._get_sensor_config_param(self._config, APP_PORT_KEY)
        self._url = self._get_sensor_config_param(self._config, APP_URL_KEY)
        self._normal_report_delay = self._get_sensor_config_param(self._config,
                                                                  NORMAL_REPORT_DELAY_KEY, 300)

        self._app = Flask(__name__)
        self._log = self._sensor_service.get_logger(__name__)
        # Header sent with every request made to the NewRelic API.
        self._headers = {'X-Api-Key': self._api_key}
84
85
    def setup(self):
86
        pass
87
88
    def run(self):
        """
        Validate required params and start the webapp that listens to hooks from NewRelic.

        Registers a POST handler on the configured url and then runs the Flask app on the
        configured host and port.

        :raises Exception: If the API url, the API key or any of the webhook app settings
                           (host, port, url) is missing.
        """
        if not self._api_url:
            raise Exception('NewRelic API url not found.')
        if not self._api_key:
            raise Exception('NewRelic API key not found.')
        if not self._host or not self._port or not self._url:
            raise Exception('NewRelic webhook app config (host:%s, port:%s, url:%s)' %
                            (self._host, self._port, self._url))
        self._log.info('NewRelicHookSensor up. host %s, port %s, url %s', self._host, self._port,
                       self._url)

        @self._app.route(self._url, methods=['POST'])
        def handle_nrhook():
            # hooks are sent for alerts and deployments. Only care about alerts so ignoring
            # deployments.
            # alert body is based on the example documentation
            # https://docs.newrelic.com/docs/alerts/new-relic-alerts-beta/managing-notification-channels/customize-your-webhook-payload

            try:
                alert_body = request.get_json()
            except Exception:
                # The body could not be parsed, so there is nothing to include in the
                # message (referencing the parsed body here would raise a NameError).
                self._log.exception('Failed to parse request body.')
                return 'IGNORED'

            self._log.info('Webhook data  %s', alert_body)

            # Everything below calls .get() on the body, so anything that is not a JSON
            # object (including a missing body) cannot be handled.
            if not isinstance(alert_body, dict):
                self._log.info('Ignoring webhook request whose body is not a JSON object.')
                return 'IGNORED'

            if alert_body.get('severity', None) not in ['CRITICAL', 'WARN']:
                self._log.debug('Ignoring alert %s as it is not severe enough.', alert_body)
                return 'ACCEPTED'

            hook_headers = self._get_headers_as_dict(request.headers)
            hook_handler = self._get_hook_handler(alert_body, hook_headers)

            # all handling based off 'docs' found in this documentation -
            # https://docs.newrelic.com/docs/alerts/new-relic-alerts-beta/managing-notification-channels/customize-your-webhook-payload#webhook-format-examples

            try:
                if hook_handler:
                    hook_handler(alert_body, hook_headers)
            except Exception:
                # A failing handler must not turn into an error response for the hook.
                self._log.exception('Failed to handle nr hook %s.', alert_body)

            return 'ACCEPTED'

        self._app.run(host=self._host, port=self._port)
137
138
    def _get_hook_handler(self, alert_body, hook_headers):
139
        if not alert_body:
140
            return None
141
        try:
142
            if 'Server' in alert_body.get('targets')[0].get('type'):
143
                return self._server_hook_handler
144
            elif 'Application' in alert_body.get('targets')[0].get('type'):
145
                return self._app_hook_handler
146
            
147
        except Exception:
148
            return None
149
        self._log.info('No application or server found for alert %s. Will Ignore.', alert_body)
150
151
        return
152
153
    def _app_hook_handler(self, alert_body, hook_headers):
154
155
        if alert_body['current_state']=='open':
156
157
            # handled opened and escalation to downtime immediately.
158
            payload = {
159
                'alert': alert_body,
160
                'header': hook_headers
161
            }
162
            self._dispatch_trigger(WEB_APP_ALERT_TRIGGER_REF, payload)
163
164
        elif alert_body['current_state']=='closed':
165
166
            # handled closed and recovered after a delay.
167
            payload = {
168
                'alert': alert_body,
169
                'header': hook_headers
170
            }
171
            self._log.info('App alert closed. Delay.')
172
            eventlet.spawn_after(self._normal_report_delay, self._dispatch_application_normal,
173
                                 payload)
174
175
        elif alert_body['current_state']=='acknowledged':
176
177
            # ignore canceled or acknowledged
178
            self._log.info('Ignored alert or alert acknowledged : %s.', alert_body)
179
180
    def _dispatch_application_normal(self, payload, attempt_no=0):
181
        '''
182
        Dispatches WEB_APP_NORMAL_TRIGGER_REF if the application health_status is 'green'.
183
        '''
184
        # basic guard to avoid queuing up forever.
185
        if attempt_no == 10:
186
            self._log.warning('Abandoning WEB_APP_NORMAL_TRIGGER_REF dispatch. Payload %s', payload)
187
            return
188
        try:
189
            application = self._get_application(payload['alert']['targets'][0]['id'])
190
            if application['health_status'] in ['green']:
191
                self._dispatch_trigger(WEB_APP_NORMAL_TRIGGER_REF, payload)
192
            else:
193
                self._log.info('Application %s has state %s. Rescheduling normal check.',
194
                               application['name'], application['health_status'])
195
                eventlet.spawn_after(self._normal_report_delay, self._dispatch_application_normal,
196
                                     payload, attempt_no + 1)
197
        except Exception:
198
            self._log.exception('Failed delay dispatch. Payload %s.', payload)
199
200
    def _server_hook_handler(self, alert_body, hook_headers):
201
        if alert_body['current_state']=='open':
202
203
            payload = {
204
                'alert': alert_body,
205
                'header': hook_headers
206
            }
207
            self._dispatch_trigger(SERVER_ALERT_TRIGGER_REF, payload)
208
209
        elif alert_body['current_state']=='closed':
210
211
            payload = {
212
                'alert': alert_body,
213
                'header': hook_headers
214
            }
215
            self._log.info('App alert closed. Delay.')
216
            eventlet.spawn_after(self._normal_report_delay, self._dispatch_server_normal,
217
                                 payload)
218
219
        elif alert_body['current_state']=='acknowledged':
220
            self._log.info('Alert is acknowledged : %s.', alert_body)
221
222
    def _dispatch_server_normal(self, payload, attempt_no=0):
223
        '''
224
        Dispatches SERVER_NORMAL_TRIGGER_REF if the all servers health_status is 'green'.
225
        '''
226
        # basic guard to avoid queuing up forever.
227
        if attempt_no == 10:
228
            self._log.warning('Abandoning SERVER_NORMAL_TRIGGER_REF dispatch. Payload %s', payload)
229
            return
230
        try:
231
            servers = self._get_servers([i['name'] for i in payload['alert']['targets']])
232
            # make sure all servers are ok.
233
            all_servers_ok = True
234
            for name, server in six.iteritems(servers):
235
                all_servers_ok &= server['health_status'] in ['green']
236
                if not all_servers_ok:
237
                    break
238
239
            if all_servers_ok:
240
                self._dispatch_trigger(SERVER_NORMAL_TRIGGER_REF, payload)
241
            else:
242
                for server in servers:
243
                    self._log.info('server %s has state %s. Rescheduling normal check.',
244
                                   server['name'], server['health_status'])
245
                eventlet.spawn_after(self._normal_report_delay, self._dispatch_server_normal,
246
                                     payload, attempt_no + 1)
247
        except:
248
            self._log.exception('Failed delay dispatch. Payload %s.', payload)
249
250
    def _dispatch_trigger(self, trigger, payload):
251
        self._sensor_service.dispatch(trigger, payload)
252
        self._log.info('Dispatched %s with payload %s.', trigger, payload)
253
254
255
    # newrelic API methods
256
    def _get_application(self, app_id):
        """
        Fetch one application from the NewRelic API.

        :param app_id: Id of the application; it becomes the '<app_id>.json' part of the
                       request path below 'applications/'.
        :return: The value stored under 'application' in the JSON response, or None when
                 that key is missing or its value is empty.
        """
        # Make sure the base ends with exactly one '/', so the request path is built
        # correctly whether or not the configured api_url has a trailing slash.
        base_url = self._api_url.rstrip('/') + '/'
        url = urllib_parse.urljoin(base_url + 'applications/', str(app_id) + '.json')
        resp = requests.get(url, headers=self._headers).json()
        if 'application' in resp:
            return resp['application'] or None
        return None
264
265
    def _get_servers(self, server_names):
        """
        Fetch servers from the NewRelic API by name.

        :param server_names: Iterable of server names to look up.
        :return: Dict mapping every requested name to the first server in the response
                 for that name, or to None when the response contains no servers.
        """
        servers = {}
        # urljoin replaces the last path segment of a base without a trailing '/', so
        # normalise the base first. The url is the same for every name; build it once.
        base_url = self._api_url.rstrip('/') + '/'
        url = urllib_parse.urljoin(base_url, 'servers.json')
        # No batch query by name support so making API calls in a tight loop. Might be
        # ok to get all servers and filter manually but that gets complex for a large number
        # of servers since the API pages data.
        for server_name in server_names:
            params = {'filter[name]': server_name}
            resp = requests.get(url, headers=self._headers, params=params).json()
            # A response without a 'servers' key is treated like an empty result.
            matches = resp.get('servers')
            servers[server_name] = matches[0] if matches else None
        return servers
276
277
    @staticmethod
278
    def _get_sensor_config_param(config, param_name, default=None):
279
        sensor_config = NewRelicHookSensor._get_sensor_config(config)
280
        if sensor_config:
281
            return sensor_config.get(param_name, default)
282
        return default
283
284
    @staticmethod
285
    def _get_sensor_config(config):
286
        return config.get('sensor_config', None)
287
288
    @staticmethod
289
    def _get_headers_as_dict(headers):
290
        headers_dict = {}
291
        for k, v in headers:
292
            headers_dict[k] = v
293
        return headers_dict
294
295
    # ignore
296
    def cleanup(self):
297
        pass
298
299
    def add_trigger(self, trigger):
300
        pass
301
302
    def update_trigger(self, trigger):
303
        pass
304
305
    def remove_trigger(self, trigger):
306
        pass
307