Completed
Push — master ( fa474b...faefc5 )
by Kenny
01:20
created

Proc.proc_net_dev()   C

Complexity

Conditions 7

Size

Total Lines 35

Duplication

Lines 35
Ratio 100 %

Importance

Changes 3
Bugs 2 Features 1
Metric Value
cc 7
c 3
b 2
f 1
dl 35
loc 35
rs 5.5
1
# -*- coding: utf-8 -*-
2
3
__author__ = 'Kenny Freeman'
4
__email__ = '[email protected]'
5
__license__ = "ISCL"
6
__docformat__ = 'reStructuredText'
7
8
import re
9
import os
10
import time
11
import os.path
12
import traceback
13
import multiprocessing
14
from collections import deque
15
16
import plumd
17
import plumd.plugins
18
from plumd.calc import Differential
19
from plumd.util import get_file_map, get_file_map_list, get_file_list, get_file
20
21
22
## todo: switch from list with pop(0) to deque
23
class Proc(plumd.plugins.Reader):
24
    """Plugin to measure various kernel metrics from /proc."""
25
    defaults = {
26
        'poll.interval': 10,
27
        'proc_path': '/proc',
28
        'skip_proc_stat': ['btime'],
29
        'skip_proc_meminfo': ["Active(anon)","Active(file)",
30
            "AnonHugePages","AnonPages","Bounce","DirectMap2M","CommitLimit",
31
            "DirectMap4k","HugePages_Free","Hugepagesize",
32
            "HugePages_Rsvd","HugePages_Surp","HugePages_Total",
33
            "Inactive","Inactive(anon)","Inactive(file)",
34
            "KernelStack","NFS_Unstable","PageTables",
35
            "Shmem","Slab","SReclaimable","SUnreclaim",
36
            "SwapCached","SwapFree","SwapTotal","Writeback","WritebackTmp"],
37
        'skip_proc_net_snmp': ["Icmp.InAddrMaskReps","Icmp.InAddrMasks",
38
            "Icmp.InTimestampReps","Icmp.InTimestamps","Icmp.OutAddrMaskReps",
39
            "Icmp.OutAddrMasks","Icmp.OutTimestampReps","Icmp.OutTimestamps",
40
            "Ip.DefaultTTL","Ip.Forwarding","Tcp.MaxConn","Tcp.RtoAlgorithm",
41
            "Tcp.RtoMax","Tcp.RtoMin"],
42
        'skip_proc_net_sockstat': [],
43
        'skip_proc_net_netstat': ["TcpExt.ArpFilter","TcpExt.BusyPollRxPackets",
44
            "TcpExt.DelayedACKLocked","TcpExt.DelayedACKLost",
45
            "TcpExt.DelayedACKs","TcpExt.EmbryonicRsts",
46
            "TcpExt.IPReversePathFilter","TcpExt.LockDroppedIcmps",
47
            "TcpExt.OfoPruned","TcpExt.OutOfWindowIcmps",
48
            "TcpExt.PAWSActive","TcpExt.PAWSEstab", "TcpExt.PAWSPassive",
49
            "TcpExt.PruneCalled","TcpExt.RcvPruned",
50
            "TcpExt.TCPACKSkippedChallenge","TcpExt.TCPACKSkippedFinWait2",
51
            "TcpExt.TCPACKSkippedPAWS","TcpExt.TCPACKSkippedSeq",
52
            "TcpExt.TCPACKSkippedSynRecv","TcpExt.TCPACKSkippedTimeWait",
53
            "TcpExt.TCPAutoCorking","TcpExt.TCPChallengeACK",
54
            "TcpExt.TCPDirectCopyFromBacklog","TcpExt.TCPDirectCopyFromPrequeue",
55
            "TcpExt.TCPDSACKIgnoredNoUndo","TcpExt.TCPDSACKIgnoredOld",
56
            "TcpExt.TCPDSACKOfoRecv","TcpExt.TCPDSACKOfoSent",
57
            "TcpExt.TCPDSACKOldSent","TcpExt.TCPDSACKRecv","TcpExt.TCPDSACKUndo",
58
            "TcpExt.TCPFACKReorder","TcpExt.TCPFromZeroWindowAdv",
59
            "TcpExt.TCPFullUndo","TcpExt.TCPHPAcks","TcpExt.TCPHPHits",
60
            "TcpExt.TCPHPHitsToUser","TcpExt.TCPHystartDelayCwnd",
61
            "TcpExt.TCPHystartDelayDetect","TcpExt.TCPHystartTrainCwnd",
62
            "TcpExt.TCPHystartTrainDetect","TcpExt.TCPLossFailures",
63
            "TcpExt.TCPLossProbeRecovery","TcpExt.TCPLossProbes",
64
            "TcpExt.TCPLossUndo","TcpExt.TCPLostRetransmit",
65
            "TcpExt.TCPMD5NotFound","TcpExt.TCPMD5Unexpected",
66
            "TcpExt.TCPMTUPFail","TcpExt.TCPMTUPSuccess","TcpExt.TCPOFODrop",
67
            "TcpExt.TCPOFOMerge","TcpExt.TCPOFOQueue","TcpExt.TCPOrigDataSent",
68
            "TcpExt.TCPPartialUndo","TcpExt.TCPPrequeued",
69
            "TcpExt.TCPPrequeueDropped","TcpExt.TCPPureAcks",
70
            "TcpExt.TCPRcvCoalesce","TcpExt.TCPRcvCollapsed",
71
            "TcpExt.TCPRenoFailures","TcpExt.TCPRenoRecovery",
72
            "TcpExt.TCPRenoRecoveryFail","TcpExt.TCPRenoReorder",
73
            "TcpExt.TCPRetransFail","TcpExt.TCPSACKDiscard",
74
            "TcpExt.TCPSackFailures","TcpExt.TCPSackMerged",
75
            "TcpExt.TCPSackRecovery","TcpExt.TCPSackRecoveryFail",
76
            "TcpExt.TCPSACKReneging","TcpExt.TCPSACKReorder",
77
            "TcpExt.TCPSackShifted","TcpExt.TCPSackShiftFallback",
78
            "TcpExt.TCPSchedulerFailed","TcpExt.TCPSpuriousRTOs",
79
            "TcpExt.TCPSpuriousRtxHostQueues","TcpExt.TCPSYNChallenge",
80
            "TcpExt.TCPToZeroWindowAdv","TcpExt.TCPTSReorder",
81
            "TcpExt.TCPWantZeroWindowAdv","TcpExt.TCPWinProbe",
82
            "TcpExt.TCPKeepAlive", "TcpExt.TCPFastOpenCookieReqd",
83
            "IpExt.InNoECTPkts", "IpExt.InCEPkts"],
84
        'cpu_metrics': ["user", "nice", "system", "idle", "iowait", "irq",
85
                        "softirq","steal", "guest", "guest_nice"],
86
        'per_cpu': False,
87
        'diskstats_dev_re': "dm-\d",
88
        'diskstats_cols': ["r", "r_merge", "r_sector", "r_time", "w", "w_merge",
89
                           "w_sector", "w_time", "io_inprog", "io_time",
90
                           "io_weighted_time"],
91
        'net_dev_re': "(virbr\d+)|(vnet\d+)",
92
        'net_dev_cols': ["rx_bytes", "rx_pkt", "rx_errs", "rx_drop",
93
                         "rx_fifo_errs", "rx_frame_errs", "rx_compressed",
94
                         "rx_mcast", "tx_bytes", "tx_pkt", "tx_errs", "tx_drop",
95
                         "tx_fifo_errs", "collissions", "carrier",
96
                         "tx_compressed"],
97
        'net_snmp_items': ["Ip:", "Icmp:", "IcmpMsg:", "Tcp:", "Udp:",
98
                           "UdpLite:"]
99
    }
100
101
    def __init__(self, log, config):
        """Set up the /proc reader: defaults, rate calculator and filters.

        :param log: A logger
        :type log: logging.RootLogger
        :param config: a plumd.config.Conf configuration helper instance.
        :type config: plumd.config.Conf
        """
        super(Proc, self).__init__(log, config)
        # hand the class-level defaults to the config helper before any
        # value is read back from it
        self.config.defaults(Proc.defaults)
        self.calc = Differential()
        self.proc_path = config.get('proc_path')
        self.page_size = os.sysconf("SC_PAGESIZE")
        # device / interface name filters are compiled once, here
        diskstats_pattern = config.get('diskstats_dev_re')
        self.diskstats_dev_re = re.compile(diskstats_pattern)
        net_dev_pattern = config.get('net_dev_re')
        self.net_dev_re = re.compile(net_dev_pattern)
116
117
118
    def poll(self):
        """Collect every /proc metric group into a single result set.

        The readers run in a fixed order and each one contributes one
        result to the returned set.

        :rtype: ResultSet
        """
        results = plumd.ResultSet([])
        readers = (
            self.proc_stat,
            self.proc_meminfo,
            self.proc_loadavg,
            self.proc_swap,
            self.proc_uptime,
            self.proc_diskstats,
            self.proc_net_dev,
            self.proc_net_snmp,
            self.proc_net_sockstat,
            self.proc_net_netstat,
        )
        for reader in readers:
            results.add(reader())
        return results
135
136
137 View Code Duplication
    def proc_stat_cpu_percent(self, key, val, ts):
        """Return cpu utilization metrics in percentage.

        Each value is expressed as its share of the sum of all values
        passed in ``val``. Values are named, in order, after the entries
        of the 'cpu_metrics' setting; surplus values are left unnamed and
        only count towards the total.

        :param key: The metric name (eg. cpu, cpu0, cpu1, etc)
        :type key: str
        :param val: A deque populated with the metric values from stat;
            it is consumed from the left as values are read.
        :type val: deque
        :param ts: Timestamp of the reading. Not used by this method.
        :type ts: float
        :rtype: list
        """
        ret = []
        total = sum(float(i) for i in val)
        for map_val in self.config.get('cpu_metrics'):
            if not val:
                break
            metric_val = float(val.popleft())
            mstr = "{0}.{1}".format(key, map_val)
            if total:
                percent_val = metric_val / total * 100.00
            else:
                # an all-zero line would otherwise raise ZeroDivisionError
                percent_val = 0.0
            ret.append(plumd.Float(mstr, percent_val))
        return ret
157
158
159 View Code Duplication
    def proc_stat_cpu(self, key, val, ts):
        """Return cpu utilization metrics in USER_HZ or Jiffies per second.

        (USER_HZ is most likely 100Hz, ie. units of 10ms intervals.)

        Values are named, in order, after the entries of the 'cpu_metrics'
        setting and each raw counter is passed through the per-second
        calculator.

        :param key: The metric name (eg. cpu, cpu0, cpu1, etc)
        :type key: str
        :param val: A deque populated with the metric values from stat;
            it is consumed from the left as values are read.
        :type val: deque
        :param ts: Timestamp of the reading, handed to the calculator.
        :type ts: float
        :rtype: list
        """
        ret = []
        for map_val in self.config.get('cpu_metrics'):
            if not val:
                break
            metric_val = float(val.popleft())
            mstr = "{0}.{1}".format(key, map_val)
            # The rate is keyed on the full metric name. Keying on `key`
            # made user, nice, system, ... of one cpu share a single name
            # in the calculator (presumably it tracks the previous sample
            # per name - verify against plumd.calc.Differential).
            # The raw counter is rated, as the docstring promises jiffies,
            # rather than a percentage of the line total.
            mval = self.calc.per_second(mstr, metric_val, ts)
            ret.append(plumd.Float(mstr, mval))
        return ret
181
182
183
    def proc_stat(self):
        """Collect cpu utilization and process counters from <proc>/stat.

        The aggregate ``cpu`` entry is always reported as percentages;
        ``cpuN`` entries only when the 'per_cpu' setting is enabled. Keys
        listed in 'skip_proc_stat' are dropped. For every other entry the
        first value is passed through the per-second calculator.

        :rtype: plumd.Result
        """
        ignored = self.config.get('skip_proc_stat')
        want_per_cpu = self.config.get('per_cpu')
        result = plumd.Result("stat")
        path = "{0}/stat".format(self.proc_path)
        entries = get_file_map(path, 0, 0)
        ts = time.time()
        for name, fields in entries.items():
            if fields is None:
                self.log.error("proc_stat: null value for {0}".format(name))
                continue
            if name in ignored:
                continue
            if name.startswith("cpu"):
                # the aggregate "cpu" entry is always kept, cpuN is optional
                if name == "cpu" or want_per_cpu:
                    cpu_metrics = self.proc_stat_cpu_percent(name, fields, ts)
                    result.add_list(cpu_metrics)
                continue
            rate = self.calc.per_second(name, float(fields[0]), ts)
            result.add(plumd.Int(name, rate))
        return result
213
214
215
    def proc_meminfo(self):
        """Return memory utilization metrics from proc file meminfo.

        Colons are removed from each key to form the metric name. Names
        listed in the 'skip_proc_meminfo' setting are not reported. Only
        the first value of each entry is used, as read (no rate is
        computed).

        :rtype: plumd.Result
        """
        skip = self.config.get('skip_proc_meminfo')
        result = plumd.Result("mem")
        fname = "{0}/meminfo".format(self.proc_path)
        # read and process <proc>/meminfo
        dat = get_file_map(fname, 0, 0)
        for key, val in dat.items():
            mstr = key.replace(":", "")
            if not val:
                # None, or an entry without any value: val[0] below would
                # raise IndexError on the latter
                self.log.error("proc_meminfo: null value for {0}".format(mstr))
                continue
            if mstr in skip:
                continue
            result.add(plumd.Int(mstr, val[0]))
        return result
239
240
241
    def proc_loadavg(self):
        """Report the 1, 5 and 15 minute load averages from <proc>/loadavg.

        A failure while reading the file is logged and an empty result is
        returned; a file with fewer than three fields also yields an empty
        result.

        :rtype: plumd.Result
        """
        result = plumd.Result("loadavg")
        path = "{0}/loadavg".format(self.proc_path)
        try:
            fields = get_file(path).split()
        except Exception as e:
            tb = traceback.format_exc()
            self.log.error("proc_loadavg: exception: {0} : {1}".format(e, tb))
            return result
        if len(fields) < 3:
            return result
        # the first three fields are the 1, 5 and 15 minute averages
        for label, value in zip(("1", "5", "15"), fields):
            result.add(plumd.Float(label, value))
        return result
261
262
263
    def proc_swap(self):
        """Return swap usage metrics from proc file swaps.

        For every swap entry three metrics are reported, named after the
        basename of the entry's first column: ``<name>.used``,
        ``<name>.size`` and ``<name>.free`` (size minus used). Entries
        that cannot be parsed are logged and skipped.

        :rtype: plumd.Result
        """
        result = plumd.Result("swap")
        fname = "{0}/swaps".format(self.proc_path)
        dat = get_file_list(fname)
        # header: file, type, size, used, priority
        # Drop it whenever there is a first line. With the previous
        # `len(dat) > 1` test a header-only file (no swap configured) kept
        # the header, which then parsed as a five column entry.
        if dat:
            dat.popleft()
        for entry in dat:
            if not entry:
                continue
            try:
                sfname, stype, ssize, sused, sprio = entry.split()
                # convert inside the try so a non-numeric column is logged
                # and skipped instead of aborting the whole poll
                sfree = float(ssize) - float(sused)
            except ValueError as e:
                tb = traceback.format_exc()
                self.log.error("proc_swap: exception: {0}: {1}".format(e, tb))
                continue
            sname = os.path.basename(sfname)
            result.add(plumd.Float("{0}.used".format(sname), sused))
            result.add(plumd.Float("{0}.size".format(sname), ssize))
            result.add(plumd.Float("{0}.free".format(sname), sfree))
        return result
295
296
297
    def proc_uptime(self):
        """Report uptime and idle time from <proc>/uptime.

        Adds "up" and "idle" as read from the file, plus "idle_percent":
        idle relative to uptime as a percentage, divided by the value of
        multiprocessing.cpu_count(). A file that cannot be read or does
        not split into exactly two fields is logged and yields an empty
        result.

        :rtype: plumd.Result
        """
        result = plumd.Result("uptime")
        path = "{0}/uptime".format(self.proc_path)
        try:
            up, idle = get_file(path).split()
        except Exception as e:
            tb = traceback.format_exc()
            self.log.error("proc_uptime: exception: {0}: {1}".format(e, tb))
            return result
        idle_ratio = float(idle) / float(up)
        pidle = idle_ratio * 100 / multiprocessing.cpu_count()
        for name, value in (("up", up), ("idle", idle), ("idle_percent", pidle)):
            result.add(plumd.Float(name, value))
        return result
317
318
319 View Code Duplication
    def proc_diskstats(self):
        """Report per-device disk io rates from <proc>/diskstats.

        Devices whose name matches the 'diskstats_dev_re' setting are
        skipped. Entries that do not hold exactly 13 values are logged and
        skipped. The leading values of every remaining entry are named
        after 'diskstats_cols' and passed through the per-second
        calculator.

        :rtype: plumd.Result
        """
        # times in ms
        columns = self.config.get('diskstats_cols')
        result = plumd.Result("diskstats")
        path = "{0}/diskstats".format(self.proc_path)
        try:
            devices = get_file_map(path, 2, 0)
        except Exception as e:
            tb = traceback.format_exc()
            self.log.error("proc_diskstats: exception: {0}: {1}".format(e, tb))
            return result
        ts = time.time()
        for dev, fields in devices.items():
            if self.diskstats_dev_re.match(dev):
                continue
            # NOTE(review): the fixed count of 13 and the assumption that
            # the first values line up with 'diskstats_cols' both depend on
            # what get_file_map keeps per entry - confirm, also for kernels
            # that emit additional columns.
            if len(fields) != 13:
                self.log.error("proc_diskstats: invalid entry: {0}".format(fields))
                continue
            for column in columns:
                counter = int(fields.popleft())
                metric = "{0}.{1}".format(dev, column)
                rate = self.calc.per_second(metric, counter, ts)
                result.add(plumd.Int(metric, rate))
        return result
349
350
351 View Code Duplication
    def proc_net_dev(self):
        """Report per-interface network rates from <proc>/net/dev.

        Interfaces whose name matches the regular expression in the
        'net_dev_re' setting are skipped. Entries holding fewer values
        than 'net_dev_cols' has names are skipped without logging. The
        values of every remaining interface are named after
        'net_dev_cols' and passed through the per-second calculator.

        :rtype: plumd.Result
        """
        columns = self.config.get('net_dev_cols')
        result = plumd.Result("net")
        path = "{0}/net/dev".format(self.proc_path)
        try:
            table = get_file_map(path, 0, 0)
        except Exception as e:
            tb = traceback.format_exc()
            self.log.error("proc_net_dev: exception: {0}: {1}".format(e, tb))
            return result
        ts = time.time()
        for iface, fields in table.items():
            iface = iface.replace(":", "")
            # filtered interface, or too few values to cover every column
            if self.net_dev_re.match(iface) or len(fields) < len(columns):
                continue
            for column in columns:
                if not fields:
                    break
                counter = int(fields.popleft())
                metric = "{0}.{1}".format(iface, column)
                rate = self.calc.per_second(metric, counter, ts)
                result.add(plumd.Int(metric, rate))
        return result
386
387
388
    def proc_net_snmp(self):
        """Report protocol counters from <proc>/net/snmp as per-second rates.

        The file is consumed two lines at a time (a names line followed by
        a values line), one pair for each entry of the 'net_snmp_items'
        setting, in order. A pair whose leading label differs from the
        expected item is logged and dropped, so 'net_snmp_items' has to
        match the format/order of the file on the system.

        Metric names listed in 'skip_proc_net_snmp' are not reported.

        :rtype: plumd.Result
        """
        ignored = self.config.get('skip_proc_net_snmp')
        expected = self.config.get('net_snmp_items')
        result = plumd.Result("net_snmp")
        path = "{0}/net/snmp".format(self.proc_path)
        lines = get_file_list(path)
        ts = time.time()

        for item in expected:
            # older kernels may not have all items
            if len(lines) < 2:
                break
            try:
                # names line: "<label>: name name ..."
                names = deque(lines.popleft().split())
                # values line: "<label>: value value ..." - label dropped
                counters = [int(field) for field in lines.popleft().split()[1:]]
            except Exception as e:
                tb = traceback.format_exc()
                self.log.error("proc_net_snmp: exception: {0}: {1}".format(e, tb))
                continue
            if len(names) < 2 or names[0] != item:
                self.log.error("proc_net_snmp: invalid entry: {0}: {1}".format(names, item))
                continue
            # the leading label (eg. Ip, Icmp) prefixes every metric name
            group = names.popleft().replace(":", "")
            for name, counter in zip(names, counters):
                metric = "{0}.{1}".format(group, name)
                if metric in ignored:
                    continue
                rate = self.calc.per_second(metric, counter, ts)
                result.add(plumd.Int(metric, rate))
        return result
437
438
439
    def proc_net_sockstat(self):
        """Return network socket metrics from proc file net/sockstat.

        Each entry of the file is read as ``<section>: name value name
        value ...`` and reported as ``<section>.<name>``. Sections listed
        in the 'skip_proc_net_sockstat' setting are skipped as a whole.
        The limits from sys/net/ipv4/tcp_mem (TCP.mem_min,
        TCP.mem_pressure, TCP.mem_max) and sys/net/ipv4/tcp_max_orphans
        (TCP.orphan_max) are reported alongside.

        Note: sockstat.TCP.mem is measured in pages, you can get the system page
        size from os.sysconf("SC_PAGESIZE")

        Note: FRAG: ip fragmentation related

        :rtype: plumd.Result
        """
        skip = self.config.get('skip_proc_net_sockstat')
        result = plumd.Result("sockstat")
        fname = "{0}/net/sockstat".format(self.proc_path)
        # sys/net/ipv4/tcp_mem format: min, pressure, max
        fname_limits = "{0}/sys/net/ipv4/tcp_mem".format(self.proc_path)
        # orphan limit: /proc/sys/net/ipv4/tcp_max_orphans
        fname_orph = "{0}/sys/net/ipv4/tcp_max_orphans".format(self.proc_path)
        dat = get_file_map_list(fname, 0, 0)
        # each entry is a key: [metric, val, metric, val]
        for key, val in dat.items():
            if len(val) < 2:
                continue
            mstr = key.replace(":", "")
            if mstr in skip:
                continue
            mnames = val[::2]
            mvals = [int(i) for i in val[1::2]]
            if len(mnames) != len(mvals):
                # odd number of fields: a name without a value
                self.log.error("proc_net_sockstat: invalid entry: {0}".format(mnames))
                continue
            for mname, mval in zip(mnames, mvals):
                result.add(plumd.Int("{0}.{1}".format(mstr, mname), mval))
        # also record configured tcp mem limits
        limits = get_file(fname_limits).split()
        if len(limits) == 3:
            # eg. for alerting/dashboard on pages allocated vs max values
            result.add(plumd.Int("TCP.mem_min", limits[0]))
            result.add(plumd.Int("TCP.mem_pressure", limits[1]))
            # third field is the max; this used to repeat the pressure value
            result.add(plumd.Int("TCP.mem_max", limits[2]))
        orphan_max = get_file(fname_orph)
        result.add(plumd.Int("TCP.orphan_max", orphan_max))
        return result
485
486
487
    def proc_net_netstat(self):
        """Return detailed network statistics from proc file net/netstat.

        The file is consumed two lines at a time: a names line
        (``<label>: name name ...``) followed by a values line. Each value
        is reported as ``<label>.<name>`` after passing through the
        per-second calculator. A pair whose name and value counts differ
        is logged and skipped.

        Note: add entries to the configuration value 'skip_proc_net_netstat' to
        skip metric names (eg. 'TcpExt.TCPMTUPSuccess'). Defaults should be
        reasonable however.

        Note: ECT1Pkts and ECT0Pkts relate to ECT congestion notifications.

        :rtype: plumd.Result
        """
        skip = self.config.get('skip_proc_net_netstat')
        result = plumd.Result("netstat")
        fname = "{0}/net/netstat".format(self.proc_path)
        # read and process - dat is a list of lines from fname
        dat = get_file_list(fname)
        ts = time.time()
        while len(dat) > 1:
            headers = deque(dat.popleft().split())
            if not headers:
                break
            mvals = [int(i) for i in dat.popleft().split()[1:]]
            mstr = headers.popleft().replace(":", "")
            if len(headers) != len(mvals):
                self.log.error("proc_net_netstat: invalid entry: {0}".format(headers))
                continue
            # Pair names and values positionally. Previously a skipped name
            # left its value in the queue, so every later metric on the
            # line was reported with the value of an earlier column.
            for mname, mval in zip(headers, mvals):
                metric = "{0}.{1}".format(mstr, mname)
                if metric in skip:
                    continue
                dval = self.calc.per_second(metric, mval, ts)
                result.add(plumd.Int(metric, dval))
        return result
523