#!/usr/bin/env python
# ex: set encoding=utf8 tabstop=4 expandtab shiftwidth=4 softtabstop=4:
#
# © Copyright IBM Corp. 2008.  All Rights Reserved.
# Author: Keith Mannthey <kmannth@us.ibm.com>
#         Vernon Mauery <vernux@us.ibm.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.


import os
import sys
import re
import syslog
import signal
import fcntl
import termios
from time import sleep

def die(msg, ret=-1):
    """Print msg on stdout and terminate the process.

    msg -- text to print before exiting
    ret -- exit status handed to sys.exit() (default -1)

    Raises SystemExit; never returns.
    """
    # Call form of print: valid on Python 2 and 3 alike, and the same
    # form print_syslog() already uses.
    print(msg)
    sys.exit(ret)

class InvalidMachineError(Exception):
    """Raised by Edac.create() when no registered class probes successfully."""

def _fork_or_die():
    """Fork and return os.fork()'s result; exit through die() on failure."""
    # os.fork() reports failure by raising OSError, it never returns a
    # negative pid.
    try:
        return os.fork()
    except OSError:
        die("fork failed")


def detach_tty():
    """Detach the calling process from its controlling terminal.

    Forks twice (each parent exits with status 0), gives up the
    controlling tty, moves into a new process group and ignores SIGHUP.
    Only the final child returns from this function.
    """
    if _fork_or_die() > 0:
        sys.exit(0)

    # lose controlling tty
    try:
        ttyfd = os.open('/dev/tty', os.O_RDWR)
    except OSError:
        # no controlling tty to give up
        ttyfd = -1
    if ttyfd >= 0:
        try:
            try:
                fcntl.ioctl(ttyfd, termios.TIOCNOTTY, 0)
            except (IOError, OSError):
                # best effort; on Python 2 a failed ioctl raises IOError,
                # which "except OSError" alone does not catch
                pass
        finally:
            os.close(ttyfd)
    os.setpgrp()

    # ignore sighup
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    if _fork_or_die() > 0:
        sys.exit(0)

def daemonize(logfile=None):
    """Turn the current process into a daemon.

    logfile -- optional path; when given, stdout and stderr are appended
               to it (created with mode 0666 before umask).  When None,
               stdout and stderr go to /dev/null.

    Detaches from the terminal unless the parent is already pid 1,
    points stdin at /dev/null, changes directory to "/" and sets the
    umask to 022.  Exits through die() if a descriptor cannot be set up.
    Returns 0.
    """
    if os.getppid() != 1:
        detach_tty()

    # Open the replacement descriptors before touching 0/1/2, so that a
    # failure can still be reported on the original stdout.  os.open()
    # signals failure with OSError, never with a negative descriptor.
    try:
        nullfd = os.open("/dev/null", os.O_RDWR)
    except OSError:
        die("cannot open /dev/null for stdin")
    if logfile is not None:
        try:
            # 0o666 is the Python 2.6+/3 spelling of the old 0666 literal
            logfd = os.open(logfile,
                            os.O_CREAT | os.O_WRONLY | os.O_APPEND, 0o666)
        except OSError:
            die("could not open logfile (%s) for stdout" % logfile)
    else:
        logfd = nullfd

    # dup2 replaces each standard descriptor in one step and does not
    # depend on which descriptor number happens to be the lowest free.
    try:
        os.dup2(nullfd, 0)
    except OSError:
        die("cannot open /dev/null for stdin")
    try:
        os.dup2(logfd, 1)
    except OSError:
        die('failed to reopen stdout')
    try:
        os.dup2(logfd, 2)
    except OSError:
        die('failed to reopen stderr')

    # the temporary descriptors are no longer needed once duplicated
    for fd in set([nullfd, logfd]):
        if fd > 2:
            os.close(fd)

    # move to root directory, so we don't prevent filesystem unmounts
    os.chdir("/")

    # set our umask to something reasonable (we hope)
    os.umask(0o022)

    return 0

def print_syslog(message):
    """Send message to syslog, then echo the same text on stdout."""
    syslog.syslog(message)
    print(message)

def check_rtpm_enabled():
    """Report whether the ibm_rtl sysfs state file holds a non-zero value.

    Returns False when /sys/devices/system/ibm_rtl/state does not exist;
    otherwise returns the truth value of the integer stored in it.
    Raises ValueError if the file content is not an integer.
    """
    state_path = "/sys/devices/system/ibm_rtl/state"
    if not os.path.exists(state_path):
        return False

    # check IBM RTL state is enabled; close the handle deterministically
    # instead of leaving it to the garbage collector
    state_file = open(state_path)
    try:
        return bool(int(state_file.read()))
    finally:
        state_file.close()

class edac_factory(type):
    """Metaclass that records every class it creates.

    Each new class is stored in the shared ``subclasses`` dict under its
    class name, and the same dict is attached to the class itself as its
    ``subclasses`` attribute.
    """
    subclasses = {}

    def __new__(mcs, name, bases, attrs):
        klass = super(edac_factory, mcs).__new__(mcs, name, bases, attrs)
        registry = mcs.subclasses
        registry[name] = klass
        klass.subclasses = registry
        return klass

class Edac(object):
    """Base class tying EDAC sysfs channels to DMI DIMM slots.

    Every class built with the edac_factory metaclass (this one and its
    subclasses) is registered in ``subclasses``; create() instantiates
    the first one whose probe() returns True.  Subclasses provide
    probe() and fixup_tables() for a specific memory controller.

    Instance attributes:
    edac_data -- list of per-channel dicts built by read_edac()
    dmi_data  -- list of per-slot dicts built by read_dmi()
    """
    __metaclass__ = edac_factory
    sysfs_mc_name_path = '/sys/devices/system/edac/mc/mc0/mc_name'
    # edac_mc_name should be defined in each subclass as the name the
    # edac driver exports via the mc_name file in sysfs
    edac_mc_name = 'Unsupported device'
    edac_data = []

    @staticmethod
    def create():
        """Return an instance of the first registered class that probes True.

        Raises InvalidMachineError when no class matches this machine.
        """
        for candidate in Edac.subclasses.values():
            if candidate.probe():
                return candidate()
        raise InvalidMachineError()

    def __init__(self):
        """Read EDAC and DMI data and map channels onto DIMMs.

        Exits with status -1 when the two sources disagree on the total
        amount of memory.
        """
        self.edac_data = self.read_edac()
        self.dmi_data = self.read_dmi()
        if self.get_edac_size() != self.get_dmi_size():
            print_syslog("DMI and EDAC do not agree on the amount of memory.")
            print_syslog("Proper reporting of ecc errors is not possible.")
            print_syslog("Please check your system configuration.")
            sys.exit(-1)

        self.fixup_tables()

    @staticmethod
    def probe():
        """Return True when this class supports the running machine."""
        return False

    def fixup_tables(self, tables=None):
        """Hook for subclasses to assign ch["dimm"] for every channel.

        The argument is optional because __init__ calls this hook
        without arguments.
        """
        pass

    # Read EDAC data and map the real location data from the DMI data
    # onto the channels.  The return data contains a list of channels
    # with real dimm location
    def setup_edac_data(self):
        """Return the per-channel EDAC list held by this instance."""
        return self.edac_data

    @staticmethod
    def _read_int(path):
        """Return the integer stored in the file at path."""
        handle = open(path)
        try:
            return int(handle.read().strip())
        finally:
            handle.close()

    # Read and decode dmidecode
    def read_dmi(self):
        """Parse `dmidecode` output into a list of populated memory slots.

        Each returned dict has the keys "count", "enabled", "dimm" and
        "size" (in MB).  Only Memory Device records whose size line
        matches "Size: <n> MB" are returned.
        """
        size_re = re.compile(r"Size: (?P<size>\d+) MB")
        dimm_re = re.compile(r"Locator: (?P<dimm>DIMM\d+)")
        dmi_handle = r"Handle 0x[A-F\d]{4}\, DMI type \d+\, \d+ bytes[\s,.]*"

        dmi_list = []
        for handle in re.split(dmi_handle, os.popen("dmidecode").read()):
            if "Memory Device\n" not in handle[:15]:
                continue
            record = handle[15:].split("\n")
            # Fields are picked by position (line 4 = size, line 7 =
            # locator); skip records too short to hold both rather than
            # failing with IndexError.
            if len(record) < 8:
                continue
            slot = {"count": len(dmi_list), "enabled": False,
                    "dimm": None, "size": 0}

            r = size_re.match(record[4][1:])
            if r:
                slot["enabled"] = True
                slot["size"] = int(r.group(1))
                dmi_list.append(slot)

            r = dimm_re.match(record[7][1:])
            if r:
                # "DIMM12" -> 12
                slot["dimm"] = int(r.group("dimm")[4:])
        return dmi_list

    # return the n'th (dimm) dimm in dmi_list
    def get_dmi_dimm(self, dimm):
        """Return the DIMM number of entry `dimm` in dmi_data, or -1."""
        # ">=": an index equal to the list length is already out of range
        if dimm < 0 or dimm >= len(self.dmi_data):
            return -1
        return self.dmi_data[dimm]["dimm"]

    # return the total amount of memory reported by dmi_list
    def get_dmi_size(self):
        """Return the sum of the "size" fields of all DMI entries."""
        return sum([slot["size"] for slot in self.dmi_data])

    # return the size of the n'th (dimm) entry in dmi_list
    def get_dmi_dimm_size(self, dimm):
        """Return the size of entry `dimm` in dmi_data, or 0."""
        if dimm < 0 or dimm >= len(self.dmi_data):
            return 0
        return self.dmi_data[dimm]["size"]

    # decode each csrow dir and add all the channels
    def get_rowdata(self, mcdir):
        """Return one dict per chN_ce_count file found below mcdir.

        The csrow's size_mb value is split evenly across its channels.
        "path" is stored exactly as printed by find, including the
        trailing newline.
        """
        csrowlist = []
        for csrow in sorted(os.listdir(mcdir)):
            if not csrow.startswith("csrow"):
                continue
            dpath = mcdir + "/" + csrow
            chdir = os.popen("find " + dpath +
                             " -name 'ch[0-9]_ce_count'").readlines()
            if not chdir:
                # no channel counters in this row: nothing to watch, and
                # nothing to divide the row size by
                continue
            size_num = self._read_int(dpath + "/size_mb") // len(chdir)
            for ch in chdir:
                ce_num = self._read_int(ch.strip())
                csrowlist.append(dict(name=csrow, path=ch, ce=ce_num,
                                      size=size_num, dimm=-1, count=0,
                                      reported=False))

        return csrowlist

    # read info on all EDAC channels
    # data returned will not have proper location ("dimm")
    def read_edac(self):
        """Collect channel dicts from every mc* directory in EDAC sysfs.

        Returns an empty list when the EDAC sysfs directory is missing.
        """
        mcpath = "/sys/devices/system/edac/mc"
        mclist = []
        # Check edac dir exists
        if not os.path.exists(mcpath):
            return mclist
        # Walk the mc dirs in sorted order
        for obj in sorted(os.listdir(mcpath)):
            if obj.startswith("mc"):
                mclist.extend(self.get_rowdata(mcpath + "/" + obj))
        return mclist

    # return the total size of memory reported by EDAC
    def get_edac_size(self):
        """Return the sum of the "size" fields of all EDAC channels."""
        return sum([ch["size"] for ch in self.edac_data])

    def mark_dimm_reported(self, dimm):
        """Flag every channel mapped to `dimm` as already reported."""
        for ch in self.edac_data:
            if ch["dimm"] == dimm:
                ch["reported"] = True

    # This is the 16 byte value of the SEL entry as defined on p.411 of the IPMI v.2 rev1 spec:
    # Byte Field Description
    # 1-2:Record ID
    # 3:Record Type
    # 4-7:Timestamp
    # 8-9:Generator ID
    # 10 EvM Rev
    # 11 Sensor Type
    # 12 Sensor Number
    # 13 Event Dir / Event Type
    # 14 Event Data 1
    # 15 Event Data 2
    # 16 Event Data 3 (encode the dimm number here)
    # This emulates what the SMI handler does for correctable ecc errors.
    def generate_ecc_ipmi_msg(self, dimm):
        """Run the ipmitool commands that report an ECC error on `dimm`."""
        os.popen("ipmitool raw 0xa 0x44 0x0d 0x00 0x02 0xb4 0x14 0x10 0x47 " +
                 "0x21 0x00 0x04 0x0c 0x00 0x6f 0x35 0xff 0x%x" % (dimm))
        # toggle led on
        os.popen("ipmitool raw 0x3A 0x08 0x00 0x%x 0x01" % (0x5f + dimm))

    def check_edac(self):
        """Poll every channel's error counter and report persistent growth.

        A channel whose counter has changed on more than two consecutive
        polls triggers generate_ecc_ipmi_msg() for its DIMM, after which
        all channels of that DIMM are marked as reported.
        """
        # compare the ce count for each channel in EDAC
        for ch in self.edac_data:
            # read the counter file directly instead of spawning a shell
            # running `cat` for every channel on every poll
            val = self._read_int(ch["path"].strip())
            if val != ch["ce"] and not ch["reported"]:
                ch["ce"] = val
                if ch["count"] > 2:
                    self.generate_ecc_ipmi_msg(ch["dimm"])
                    self.mark_dimm_reported(ch["dimm"])
                else:
                    ch["count"] = ch["count"] + 1
            else:
                ch["count"] = 0

    # Do OS level tests.
    def diagnostics(self, do_write):
        """Print a report about ipmitool, ibm_rtl and the channel mapping.

        do_write -- when true (and ipmitool is usable) an ECC message is
                    generated for every channel.
        """
        print("Found a (%s) => %s" % (self.edac_mc_name, type(self)))
        if os.system("which ipmitool 1>/dev/null 2>&1") > 0:
            print("ipmitool is not installed")
            do_write = False
        else:
            try:
                smi_state = int(
                    os.popen("ipmitool raw 0x3A 0x1A 2>/dev/null").read())
            except (ValueError, IOError, OSError):
                # ipmitool produced no parseable answer
                print("ipmitool present but kernel drivers not loaded")
                do_write = False
            else:
                if smi_state == 0:
                    print("BMC reports that SMIs are off")
                else:
                    print("BMC reports that SMIs are on")

        if os.path.exists("/sys/devices/system/ibm_rtl/state"):
            print("ibm_rtl appears to be loaded")

            if check_rtpm_enabled():
                print("BIOS reports SMIs are off")
            else:
                print("BIOS reports SMIs are on")
        else:
            print("ibm_rtl appears to not be loaded")

        print("EDAC information in order of appearance in DMI system " +
              "information\n")
        for ch in self.edac_data:
            # Not every fixup_tables() implementation numbers its
            # channels; default "n" so the format below cannot KeyError.
            info = dict(ch)
            info.setdefault("n", -1)
            print("DIMM %(dimm)d (%(n)d) includes %(path)s" % info +
                  "\tSize is %(size)d MB with %(ce)d ECC errors\n" % info)
            if do_write:
                self.generate_ecc_ipmi_msg(ch["dimm"])
        if do_write:
            print("For each Channel above you should see an error " +
                  "message in your BladeCenter log")

class amd_edac(Edac):
    """Shared channel-to-DIMM mapping logic for the AMD based classes."""

    def _report_size_mismatch(self):
        """Log that DMI and EDAC sizes cannot be reconciled, then exit -1."""
        print_syslog("ERROR: DMI dimm sizes do not agree with EDAC.")
        print_syslog("Proper reporting of ecc errors is not possible.")
        print_syslog("Memory DIMMs with identical size may be required.")
        print_syslog("Please check your system configuration.")
        sys.exit(-1)

    def fixup_tables(self, order=None):
        """Assign a DIMM number ("dimm") and index ("n") to each channel.

        order -- optional sequence of DIMM numbers; when given, dmi_data
                 is first rearranged to follow it (entries whose DIMM
                 number is not listed are dropped).

        Channels are consumed in edac_data order and charged against the
        size of the current DMI entry; when that size is used up the
        next DMI entry is taken.  Exits with status -1 if the sizes do
        not line up.
        """
        # amd generic stuff
        dmi_size = 0
        dmi_dimm = -1
        if order is not None:
            self.dmi_data = [slot
                             for dimm in order
                             for slot in self.dmi_data
                             if slot["dimm"] == dimm]

        # loop through all the edac channels and assign dmi dimm info
        n = 0
        for ch in self.edac_data:
            if dmi_size == 0:
                dmi_dimm = dmi_dimm + 1
                dmi_size = self.get_dmi_dimm_size(dmi_dimm)
                # A zero-sized channel, or a DIMM smaller than one
                # channel, would otherwise end in ZeroDivisionError
                # (division or modulo below); report it properly.
                if ch["size"] <= 0 or dmi_size < ch["size"]:
                    self._report_size_mismatch()
                ch_per_dimm = dmi_size // ch["size"]

            n = n + 1
            if dmi_dimm & 0x01:
                offset = -(n % ch_per_dimm)
            else:
                offset = (n + 1) % ch_per_dimm
            ch["dimm"] = self.get_dmi_dimm(dmi_dimm + offset)
            ch["n"] = n
            dmi_size = dmi_size - ch["size"]

        # one or more edac channels should == DMI dimm size
        if dmi_size != 0:
            self._report_size_mismatch()

class ls21_edac(amd_edac):
    """EDAC support selected when mc_name matches 'Athlon64/Opteron/Rev F'."""
    edac_mc_name = 'Athlon64/Opteron/Rev F'

    @staticmethod
    def probe():
        """Return True if the sysfs mc_name file contains edac_mc_name."""
        command = "grep '%s' %s" % (ls21_edac.edac_mc_name,
                                    Edac.sysfs_mc_name_path)
        matches = os.popen(command).readlines()
        return matches != []

    def fixup_tables(self):
        """Run the generic AMD mapping, then swap each even/odd DIMM pair."""
        super(ls21_edac, self).fixup_tables(None)
        for ch in self.edac_data:
            # odd numbers move up by one, even numbers move down by one
            if ch["dimm"] % 2:
                ch["dimm"] += 1
            else:
                ch["dimm"] -= 1

class ls22_edac(amd_edac):
    """EDAC support selected when mc_name matches 'Family-F10h-Quad-Core'."""
    edac_mc_name = 'Family-F10h-Quad-Core'

    @staticmethod
    def probe():
        """Return True if the sysfs mc_name file contains edac_mc_name."""
        command = "grep '%s' %s" % (ls22_edac.edac_mc_name,
                                    Edac.sysfs_mc_name_path)
        matches = os.popen(command).readlines()
        return matches != []

    def fixup_tables(self):
        """Run the generic AMD mapping with this class's DIMM order."""
        dimm_order = (4, 2, 3, 1, 8, 6, 7, 5)
        super(ls22_edac, self).fixup_tables(dimm_order)

class hs21_edac(Edac):
    """EDAC support selected for an 'I5000' controller on an HS21."""
    edac_mc_name = 'I5000'

    def __init__(self):
        super(hs21_edac, self).__init__()

    @staticmethod
    def probe():
        """Return True if mc_name matches and dmidecode mentions HS21."""
        # check for 5000P memory controller and HS21 in dmidecode
        exec_str = "grep '%s' %s" % (
            hs21_edac.edac_mc_name, Edac.sysfs_mc_name_path
        )
        if os.popen(exec_str).readlines() == []:
            return False
        return os.popen("dmidecode | grep HS21").readlines() != []

    def fixup_tables(self):
        """Sort the channels and assign DIMM numbers from a fixed table.

        Raises IndexError if there are more channels than table entries.
        """
        ch_list = self.edac_data
        # Sort on the channel's identity (csrow name, then counter path).
        # A bare sort() compares whole dicts, which on Python 2 looks at
        # the 'ce' error count first, so the resulting order - and with
        # it the DIMM mapping - changed whenever the counts differed.
        ch_list.sort(key=lambda ch: (ch["name"], ch["path"]))
        # With the channels sorted topology always looks like
        order = (4, 2, 5, 7, 4, 2, 5, 7, 3, 1, 6, 8, 3, 1, 6, 8)

        for channel, ch in enumerate(ch_list):
            ch["dimm"] = order[channel]
            # diagnostics() formats "n"; the i7core classes set it the
            # same way
            ch["n"] = channel

class i7core_hs22(Edac):
    """EDAC support selected for an 'i7 Core' controller on machine type 7870."""
    edac_mc_name = 'i7 Core'

    @staticmethod
    def probe():
        """Return True if mc_name matches and dmidecode mentions 7870."""
        # check for i7 core memory controller and 7870 machine type
        command = "grep '%s' %s" % (i7core_hs22.edac_mc_name,
                                    Edac.sysfs_mc_name_path)
        if os.popen(command).readlines() == []:
            return False
        return os.popen("dmidecode | grep 7870").readlines() != []

    def fixup_tables(self):
        """Pair channels (sorted by csrow number) with DMI entries.

        The DMI entries are taken in the fixed DIMM order below; channel
        i receives the DIMM number of the i-th such entry and "n" = i.
        """
        channels = self.edac_data
        dimm_order = (2, 1, 6, 5, 4, 3, 8, 7, 12, 11, 10, 9)

        # "csrowN" -> N, compared numerically
        channels.sort(key=lambda ch: int(ch["name"][5:]))

        ordered_dmi = [dmi
                       for wanted in dimm_order
                       for dmi in self.dmi_data
                       if dmi["dimm"] == wanted]

        for index, ch in enumerate(channels):
            ch["dimm"] = ordered_dmi[index]["dimm"]
            ch["n"] = index

    def generate_ecc_ipmi_msg(self, dimm):
        """Run the ipmitool command that reports an ECC error on `dimm`."""
        # the sensor number byte carries the dimm: 0xd1 == dimm 1
        sensor = 0xd0 + dimm
        os.popen("ipmitool raw  0xa 0x44 0xe3 0x00 0x02 0x23 0xf2 0x92 0x4a " +
                 " 0x20 0x00 0x04 0x0c 0x%x 0x6f 0x05 0xff 0xff" % (sensor))

class i7core_rack(Edac):
    """EDAC support selected for an 'i7 Core' controller on 7946/7947."""
    edac_mc_name = 'i7 Core'

    def __init__(self):
        super(i7core_rack, self).__init__()

    @staticmethod
    def probe():
        """Return True if mc_name matches and dmidecode mentions 7946/7947."""
        # check for i7 core memory controller and 7946 or 7947 in dmidecode
        exec_str = "grep '%s' %s" % (
            i7core_rack.edac_mc_name, Edac.sysfs_mc_name_path
        )
        if os.popen(exec_str).readlines() == []:
            return False
        # The pattern is quoted so the shell cannot glob-expand the
        # brackets, and the class no longer contains a stray comma
        # (the old "794[7,6]" also matched "794,").
        return os.popen("dmidecode | grep '794[67]'").readlines() != []

    def fixup_tables(self):
        """Pair channels (sorted by csrow number) with DMI entries.

        The DMI entries are taken in the fixed DIMM order below; channel
        i receives the DIMM number of the i-th such entry and "n" = i.
        """
        ch_list = self.edac_data
        dimm_order = (3, 2, 1, 6, 5, 4, 8, 7, 11, 10, 9, 14, 13, 12, 16, 15)

        # "csrowN" -> N, compared numerically
        ch_list.sort(key=lambda ch: int(ch["name"][5:]))

        new_dmi = [dmi
                   for wanted in dimm_order
                   for dmi in self.dmi_data
                   if dmi["dimm"] == wanted]

        for channel, ch in enumerate(ch_list):
            ch["dimm"] = new_dmi[channel]["dimm"]
            ch["n"] = channel

    def generate_ecc_ipmi_msg(self, dimm):
        """Run the ipmitool command that reports an ECC error on `dimm`."""
        # the sensor number byte carries the dimm: 0xd0 == dimm 1
        dim_value = 0xd0 + dimm - 1
        os.popen("ipmitool raw  0xa 0x44 0xe3 0x00 0x02 0x23 0xf2 0x92 0x4a " +
                 " 0x20 0x00 0x04 0x0c 0x%x 0x6f 0x05 0xff 0xff" % (dim_value))

def usage(name, exitval=0):
    """Print the command line help and exit.

    name    -- program name shown in the help text
    exitval -- exit status handed to sys.exit() (default 0)

    Raises SystemExit; never returns.
    """
    # Call form of print: valid on Python 2 and 3 alike, and the same
    # form print_syslog() already uses.
    print("Usage: %s [options]" % name)
    print(" --test          run tests, but don't start daemon")
    print(" --nowrite       don't report test findings to BMC")
    print(" --pidfile f     write pid of daemon to file f (/var/run/%s.pid)" % name)
    print(" --foreground    don't daemonize, run in foreground")
    sys.exit(exitval)

def main(args):
    """Parse the command line and run the test mode or the polling daemon.

    args -- full argument vector, program name first (as in sys.argv)

    With --test the diagnostics are run and the process exits.  Otherwise
    the process daemonizes (unless --foreground), writes its pid to the
    pidfile and polls the EDAC counters every 10 seconds, forever.
    """
    name = args[0].split('/')[-1]
    run_tests = False
    write_test = True
    foreground = False
    pidfile = "/var/run/ibm-prtmd.pid"
    i = 1
    while i < len(args):
        arg = args[i]
        if arg == "--test":
            run_tests = True
        elif arg == "--nowrite":
            write_test = False
        elif arg == "--pidfile":
            i += 1
            # a trailing --pidfile used to die with IndexError
            if i >= len(args):
                print("--pidfile requires a file name")
                usage(name, 1)
            pidfile = args[i]
        elif arg == "--foreground":
            foreground = True
        else:
            ret = 0
            if arg not in ["--help", "-h"]:
                print("unknown argument: %s" % arg)
                ret = 1
            usage(name, ret)
        i += 1

    syslog.openlog("IBM ECC Detection Service")
    if run_tests:
        print("Testing Started:")
        try:
            edac = Edac.create()
        except InvalidMachineError:
            print("No valid EDAC class found for this machine")
            sys.exit(1)
        edac.diagnostics(write_test)
        print("Testing Done: Service not started")
        sys.exit(0)

    if not check_rtpm_enabled():
        syslog.syslog("IBM Premium Real-Time Mode not enabled!")
        print("IBM Premium Real-Time Mode not enabled!")
        sys.exit(-1)

    try:
        edac = Edac.create()
    except InvalidMachineError:
        print("No valid EDAC class found for this machine")
        sys.exit(1)

    if not foreground:
        daemonize()

    # write pid to pidfile (after daemonizing, so it is the daemon's pid)
    pid_handle = open(pidfile, 'w+')
    try:
        pid_handle.write("%d\n" % os.getpid())
    finally:
        pid_handle.close()

    while True:
        edac.check_edac()
        sleep(10)

# Start the service only when this file is executed as a script.
if __name__ == "__main__":
    main(sys.argv)
