#!/usr/bin/env python

"""
Munin plugin which reports selected counters regarding ports on a
Brocade SAN FC-switch. Only enabled ports are considered.

The counters shown:

bits:
Number of bits transmitted(tx)/received(rx) by the port. Inspecting
this graph will help determine if the port is saturated.

frames:
Number of frames transmitted(tx)/received(rx) by the port. Provides
another perspective on port busyness than bits-traffic.

no_tx_credits:
Number of times when the transmit credit has reached zero.

rx_crcs:
CRC errors detected in received frames. Together with enc_out errors,
CRC errors indicate a GBIC/SFP problem.

enc_out:
Encoding or disparity errors outside frame received. If there is a
high number for this counter, it could reflect:
 - If there is also a high value for rx_crcs for the port, then there
   is likely a GBIC/SFP problem.
 - If the value of rx_crcs for the port is low, there is likely a
   cable/connector problem.

bad_os:
Number of invalid Ordered Sets received. It seems that a cable which is
only partially inserted into the SFP may generate large amounts of this.

c3_discards:
Number of Class 3 frames that the port has discarded.

When symlinking to the plugin, indicate hostname like this:
brocade_san_switch_ports_HOSTNAME

# Special requirements:
#  - the pyasn1 module
#  - the pysnmp module
#    Beware: Certain combinations of pyasn1 and pysnmp don't work well
#    together: https://sourceforge.net/tracker/?func=detail&aid=3314419&group_id=14735&atid=114735
"""

# Note: In the SNMP output from brocade switches, the interesting
#       counters are named with numbers starting with 1, while the
#       ports' real names on the box and in the administration interface
#       start with 0. And there doesn't seem to be a way to map between
#       ifDesc and the interesting crc and enc_out counters :-(
#       Therefore, this plugin is Brocade-specific, and thus some
#       manipulation of port numbers is performed for the output
#       of this plugin (see comments marked ARGH below).
# TODOs: # - implement snmpconf? # Munin magic markers #%# family=manual #%# capabilities= # http://community.brocade.com/servlet/JiveServlet/download/5581-1453/portErrShow.pdf # is useful when trying to understand counters on a Brocade switch. # Author: Troels Arvin # See http://troels.arvin.dk/code/munin/ for latest version. # Only tested with Red Hat Enterprise Linux 5, currently. # Released according to the "New BSD License" AKA the 3-clause # BSD License: # ==================================================================== # Copyright (c) 2011, Danish National Board of Health. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of the the Danish National Board of Health nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY the Danish National Board of Health ''AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL the Danish National Board of Health BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ==================================================================== # $Id: brocade_san_switch_ports_ 16285 2011-06-09 21:27:44Z tra $ import os, sys, re from pysnmp.entity.rfc3413.oneliner import cmdgen my_canonical_name = 'brocade_san_switch_ports_' # If called as - e.g. - # brocade_san_switch_ports_sansw1,then # sansw1 will be interpreted as # the host_name # For reference: # FC-MGMT-MIB::connUnitPortSpeed = .1.3.6.1.3.94.1.10.1.15 # The speed of the port in kilobytes per second, e.g. 250000, 500000 or 1000000 # SW-MIB::swFCPortLinkState = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.6 # SW-MIB::swFCPortTxWords = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.11 # SW-MIB::swFCPortRxWords = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.12 # SW-MIB::swFCPortTxFrames = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.13 # SW-MIB::swFCPortRxFrames = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.14 # SW-MIB::swFCPortNoTxCredits = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.20 # SW-MIB::swFCPortRxCrcs = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.22 # SW-MIB::swFCPortRxEncOutFrs = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.26 # SW-MIB::swFCPortRxBadOs = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.27 # SW-MIB::swFCPortC3Discards = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.28 # SW-MIB::swFCPortName = .1.3.6.1.4.1.1588.2.1.1.1.6.2.1.36 # manually set port name # SW-MIB::swNsPortSymb = .1.3.6.1.4.1.1588.2.1.1.1.7.2.1.5 # e.g. 
#   "HITACHI DF600F 0000"

# OID strings must be without leading dot in this script
port_link_state_oidstr = '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.6'
oidstrs = {
    'port_speed'    : '1.3.6.1.3.94.1.10.1.15',
    'tx_words'      : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.11',
    'rx_words'      : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.12',
    'tx_frames'     : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.13',
    'rx_frames'     : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.14',
    'no_tx_credits' : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.20',
    'rx_crcs'       : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.22',
    'enc_out'       : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.26',
    'bad_os'        : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.27',
    'c3_discards'   : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.28',
    'port_name'     : '1.3.6.1.4.1.1588.2.1.1.1.6.2.1.36'
}

# One graph family is emitted per key of this dict; the text ends up in
# the graph_info line of each graph.
descriptions = {
    'bits'          : 'received(rx)/transmitted(tx) bits',
    'frames'        : 'received(rx)/transmitted(tx) frames',
    'no_tx_credits' : 'number of times when the transmit credit has reached zero',
    'rx_crcs'       : 'the number of CRC errors detected for frames received',
    'enc_out'       : 'encoding or disparity errors outside FC frame',
    'bad_os'        : 'invalid Ordered Sets received',
    'c3_discards'   : 'number of Class 3 frames that the port has discarded'
}

# Initial-spike prevention: ceilings emitted as the <field>.max values.
_MAX_PORT_BITS_PER_SEC = 20000000000   # 20Gbit/s is max FC speed
_MAX_PORT_FRAMES_PER_SEC = 89285714    # 20Gbit/s divided by a 224-bit minimum frame
_MAX_PORTS_ASSUMED = 40                # port count assumed for the "total" graphs


# Some helper functions:

def bailout(msg):
    """Write msg (plus a newline) to stderr and exit with status 1."""
    sys.stderr.write(msg + "\n")
    sys.exit(1)


def debug(msg):
    """Print msg to stdout, prefixed with 'Debug: '."""
    print('Debug: %s\n' % msg)


def oidstr2tuple(oidstr):
    """Break a dotted OID string (no leading dot) into a tuple of ints."""
    return tuple(int(element) for element in oidstr.split('.'))


def get_ObjectName_subtree(obj):
    """Return obj without its last element.

    If obj is 1.3.6.1.4.1.1588.2.1.1.1.6.2.1.26.1, the result is
    1.3.6.1.4.1.1588.2.1.1.1.6.2.1.26
    """
    return obj[:-1]


def varBindTable2plainDict(varBindTable):
    """Convert SNMP result rows to a plain dict of port-number => int value.

    Each row is expected to hold an (object name, value) pair as its first
    element. The subtree of the first row decides which rows are kept;
    rows from other subtrees (excess data which bulkCmd may generate) are
    dropped. An empty table yields an empty dict.
    """
    if not varBindTable:
        return {}

    wanted_subtree = get_ObjectName_subtree(varBindTable[0][0][0])
    ret_dict = {}
    for row in varBindTable:
        object_name = row[0][0]
        if get_ObjectName_subtree(object_name) == wanted_subtree:
            # The last element of the object name is the port number
            ret_dict[object_name[-1]] = int(row[0][1])
    return ret_dict


# The more interesting functions:

def _counter_field_lines(counter_type, port_count, with_graph_order):
    """Return the graph_vlabel and data-field config lines for one graph.

    port_count scales the spike-prevention maximum (1 for a per-port graph,
    the assumed number of ports for a "total" graph). with_graph_order
    adds the 'graph_order rx tx' line to rx/tx graphs.
    """
    if counter_type == 'bits':
        tx_label = 'bps'
        per_port_max = _MAX_PORT_BITS_PER_SEC
    elif counter_type == 'frames':
        tx_label = 'frames per sec'
        per_port_max = _MAX_PORT_FRAMES_PER_SEC
    else:
        return ['graph_vlabel count', 'count.label count', 'count.type COUNTER']

    lines = ['graph_vlabel %s rx (-) / tx (+) per ${graph_period}' % counter_type]
    if with_graph_order:
        lines.append('graph_order rx tx')
    field_max = per_port_max * port_count
    lines.extend([
        'rx.label rx',
        'rx.graph no',
        'rx.type COUNTER',
        'rx.max %d' % field_max,
        'tx.label %s' % tx_label,
        'tx.negative rx',
        'tx.type COUNTER',
        'tx.max %d' % field_max,
    ])
    return lines


def _port_graph_lines(counter_type, portnum, speed_gbit, port_name):
    """Return the config lines of the per-port graph for counter_type.

    speed_gbit and port_name may be None when unknown; the matching
    sentence is then left out of graph_info.
    """
    graph_info = 'This graph shows the count of %s.' % descriptions[counter_type]
    if speed_gbit is not None:
        graph_info = '%s Interface speed is %dGbit/s.' % (graph_info, speed_gbit)
    if port_name is not None:
        graph_info = "%s Interface name manually set to '%s'." % (graph_info, port_name)

    shown_portnum = portnum - 1  # ARGH: numbering base stuff
    lines = [
        'multigraph %s.port_%d' % (counter_type, shown_portnum),
        'graph_title Port %d %s' % (shown_portnum, counter_type),
        'graph_args --base 1000 -l 0',
        'graph_category SAN',
        'graph_info %s' % graph_info,
    ]
    return lines + _counter_field_lines(counter_type, 1, True)


def _total_graph_lines(counter_type, host_name):
    """Return the config lines of the across-all-ports graph for counter_type."""
    lines = [
        'multigraph %s' % counter_type,
        'graph_title %s total %s' % (host_name, counter_type),
        'graph_args --base 1000 -l 0',
        'graph_category SAN',
        'graph_info This graph shows the total count of %s across all ports'
        % descriptions[counter_type],
    ]
    return lines + _counter_field_lines(counter_type, _MAX_PORTS_ASSUMED, False)


# Honor the munin-APIs "config" command
def print_config(host_name,community,port_speed_str,port_name_str,enabled_ports):
    """Print the Munin "config" output for all per-port and total graphs.

    port_speed_str and port_name_str are the OID strings handed to
    get_port_speeds and get_port_names. A port without a speed or name
    entry simply gets a shorter graph_info line.
    """
    port_speeds = get_port_speeds(host_name,community,port_speed_str)
    port_names = get_port_names(host_name,community,port_name_str)

    print('host_name %s' % host_name)

    # Per-port
    for counter_type in descriptions:
        for portnum in enabled_ports:
            for line in _port_graph_lines(counter_type, portnum,
                                          port_speeds.get(portnum),
                                          port_names.get(portnum)):
                print(line)

    # Totals
    for counter_type in descriptions:
        for line in _total_graph_lines(counter_type, host_name):
            print(line)


# We don't care for disabled ports
def get_enabled_ports(host_name,community):
    """Return the port numbers whose link state value is 1 (enabled)."""
    link_states = get_port_values(host_name,community,port_link_state_oidstr)
    # status 1 means enabled
    return [portnum for portnum in link_states if link_states[portnum] == 1]


# Talk to the SNMP agent performing a bulk-get (presumably faster than a walk),
# the starting point indicated by the oid_start_tpl tuple.
# Handle potential errors.
def _snmp_failure_message(oid_start_str, errorIndication, errorStatus, varBindTable):
    """Return a message describing why an SNMP result is unusable, else None.

    Checks, in order: a truthy errorStatus, a truthy errorIndication, and
    an empty varBindTable.
    """
    if errorStatus:
        return "Walking %s failed: %s" % (oid_start_str, errorStatus.prettyPrint())
    if errorIndication:
        return "Walking %s failed with errorIndication=%s" % (oid_start_str, errorIndication)
    if len(varBindTable) < 1:
        return "Empty result from walk of %s" % oid_start_str
    return None


def pull_values_bulk(host_name,community,oid_start_str):
    """Query the SNMP agent on host_name (UDP port 161) with bulkCmd.

    oid_start_str is the dotted OID (no leading dot) to start from.
    Returns the varBindTable from pysnmp. Any exception, SNMP error or
    empty result ends the plugin through bailout().
    """
    oid_start_tpl = oidstr2tuple(oid_start_str)
    try:
        # NOTE(review): pysnmp documents these two positional arguments as
        # (nonRepeaters, maxRepetitions); 300, 0 looks swapped. Left as-is -
        # confirm against the installed pysnmp version before changing.
        errorIndication, errorStatus, errorIndex, varBindTable = cmdgen.CommandGenerator().bulkCmd(
            cmdgen.CommunityData('whatever', community),
            cmdgen.UdpTransportTarget((host_name, 161)),
            300,
            0,
            (oid_start_tpl)
        )
    except Exception as e:
        bailout("Walking %s threw exception: %s" % (oid_start_str, str(e)))

    failure = _snmp_failure_message(oid_start_str, errorIndication, errorStatus, varBindTable)
    if failure is not None:
        bailout(failure)
    return varBindTable


def pull_values_walk(host_name,community,oid_start_str):
    """Query the SNMP agent on host_name (UDP port 161) with nextCmd (a walk).

    Presumably slower than pull_values_bulk, but takes the same arguments
    and handles errors the same way: any exception, SNMP error or empty
    result ends the plugin through bailout().
    """
    oid_start_tpl = oidstr2tuple(oid_start_str)
    try:
        errorIndication, errorStatus, errorIndex, varBindTable = cmdgen.CommandGenerator().nextCmd(
            cmdgen.CommunityData('whatever', community),
            cmdgen.UdpTransportTarget((host_name, 161)),
            (oid_start_tpl)
        )
    except Exception as e:
        bailout("Walking %s threw exception: %s" % (oid_start_str, str(e)))

    failure = _snmp_failure_message(oid_start_str, errorIndication, errorStatus, varBindTable)
    if failure is not None:
        bailout(failure)
    return varBindTable


# Port speeds need to be grabbed by walking, not via bulk-get.
# Port speeds are returned per port index, using a unit of Gbit/s

# Each transmitted word consists of four 10-bit units.
_BITS_PER_WORD = 40

# The per-port counters pulled on "fetch", in the order they are queried.
_FETCHED_COUNTERS = (
    'rx_words', 'tx_words',
    'rx_frames', 'tx_frames',
    'no_tx_credits', 'rx_crcs', 'enc_out', 'bad_os', 'c3_discards',
)


def _kbytes_per_sec_to_gbit(kb_speed):
    """Convert a speed in kilobytes per second to whole Gbit/s (rounded down)."""
    # Floor division keeps the result an int on both Python 2 and 3
    return (int(kb_speed) * 8000) // 1000000000


def _port_speeds_from_rows(rows):
    """Map port index => Gbit/s from rows whose first element is (oid, kB/s)."""
    port_speeds = {}
    for row in rows:
        oid_obj, kb_speed = row[0]
        # The last element of the OID is the port index
        port_speeds[oid_obj[-1]] = _kbytes_per_sec_to_gbit(kb_speed)
    return port_speeds


def _port_names_from_rows(rows):
    """Map port index => name from rows whose first element is (oid, name).

    Ports whose name is empty (no name manually set) are left out.
    """
    port_names = {}
    for row in rows:
        oid_obj, raw_name = row[0]
        # Normalise to a plain string so the emptiness test does not depend
        # on how the SNMP value type implements comparison with ''.
        port_name = str(raw_name)
        if port_name:
            port_names[oid_obj[-1]] = port_name
    return port_names


def get_port_speeds(host_name,community,oid_start_str):
    """Walk oid_start_str and return a dict of port index => speed in Gbit/s."""
    return _port_speeds_from_rows(pull_values_walk(host_name,community,oid_start_str))


def get_port_names(host_name,community,oid_start_str):
    """Bulk-get oid_start_str and return a dict of port index => non-empty name."""
    return _port_names_from_rows(pull_values_bulk(host_name,community,oid_start_str))


# Combine oidstr2tupl, pull_values_bulk and varBindTable2plainDict.
# Return dict of port-number => count
def get_port_values(host_name,community,oid_start_str):
    """Bulk-get oid_start_str and return a dict of port-number => count."""
    return varBindTable2plainDict(
        pull_values_bulk(host_name,community,oid_start_str)
    )


def _fetch_lines(counter_types, enabled_ports, counters):
    """Return the Munin "fetch" output lines.

    counter_types is the sequence of graph families to report,
    enabled_ports the port numbers to include, and counters a dict of
    counter name => {port number => value}. Per-port sections come first,
    followed by one totals section per counter type.
    """
    lines = []
    totals = {}

    def emit(field, total_key, value):
        # Report one value and add it to the running total for total_key
        lines.append('%s.value %d' % (field, value))
        totals[total_key] = totals.get(total_key, 0) + value

    # Per-port values
    for portnum in enabled_ports:
        for counter_type in counter_types:
            lines.append('multigraph %s.port_%d' % (counter_type, portnum - 1))  # ARGH: numbering base stuff

            # For some of the graphs, there is an in/out aspect, for others
            # they are combined or not applicable
            if counter_type == 'bits':
                emit('rx', 'rx_bits', counters['rx_words'][portnum] * _BITS_PER_WORD)
                emit('tx', 'tx_bits', counters['tx_words'][portnum] * _BITS_PER_WORD)
            elif counter_type == 'frames':
                emit('rx', 'rx_frames', counters['rx_frames'][portnum])
                emit('tx', 'tx_frames', counters['tx_frames'][portnum])
            else:
                emit('count', counter_type, counters[counter_type][portnum])

    # Totals (0 when there are no enabled ports)
    for counter_type in counter_types:
        lines.append('multigraph %s' % counter_type)
        if counter_type in ('bits', 'frames'):
            lines.append('rx.value %d' % totals.get('rx_' + counter_type, 0))
            lines.append('tx.value %d' % totals.get('tx_' + counter_type, 0))
        else:
            lines.append('count.value %d' % totals.get(counter_type, 0))
    return lines


def main():
    """Entry point: validate the invocation, then print config or values."""
    # Initial sanity check
    n_args = len(sys.argv)
    if n_args > 2:
        # At most one arg expected; sys.argv[0] is the plugin itself
        bailout('%d arguments given - expecting only one' % (n_args - 1))

    # Make sure that multigraphs are supported
    if 'MUNIN_CAP_MULTIGRAPH' not in os.environ:
        bailout('MUNIN_CAP_MULTIGRAPH not found in environment')

    # Parse host_name from arg0: everything after the canonical plugin name
    called_as = os.path.basename(sys.argv[0])
    match = re.match('^' + my_canonical_name + '(.+)', called_as)
    if not match:
        bailout('Missing host_name and/or counter type')
    host_name = match.group(1)

    # Determine SNMP community
    community = os.environ.get('community', 'public')

    # See how we were called; no argument means "fetch"
    if n_args == 2:
        arg = sys.argv[1]
    else:
        arg = 'fetch'
    # Reject unknown arguments before talking to the switch
    if arg not in ('config', 'fetch'):
        bailout("Unknown argument '%s'" % arg)

    enabled_ports = get_enabled_ports(host_name,community)

    if arg == 'config':
        print_config(host_name,community,oidstrs['port_speed'],oidstrs['port_name'],enabled_ports)
        return

    # So we are performing a fetch. Let's go:
    counters = {}
    for counter_name in _FETCHED_COUNTERS:
        counters[counter_name] = get_port_values(host_name,community,oidstrs[counter_name])

    for line in _fetch_lines(list(descriptions), enabled_ports, counters):
        print(line)


if __name__ == '__main__':
    main()