Death/condition notification

Fault notification is a crucial part of the availability of a system. Apart from performing recovery per se, we also need to keep track of failures in order to be able to analyze the system at a later point.

For fault notification, you can use standard notification mechanisms such as pulses or signals. Clients specify what pulse/signal with specific values they want for each notification, and a HAM delivers the notifications at the appropriate times.

/* regevent.c */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/neutrino.h>
#include <sys/iomsg.h>
#include <sys/netmgr.h>
#include <signal.h>
#include <ha/ham.h>

#define PCODEINETDDEATH      _PULSE_CODE_MINAVAIL+1
#define PCODEINETDDETACH     _PULSE_CODE_MINAVAIL+2
#define PCODENFSDELAYED      _PULSE_CODE_MINAVAIL+3
#define PCODEINETDRESTART1   _PULSE_CODE_MINAVAIL+4
#define PCODEINETDRESTART2   _PULSE_CODE_MINAVAIL+5

#define MYSIG SIGRTMIN+1

int fsnfs_value;

/* Signal handler to handle the death notify of fs-nfs2 */
void MySigHandler(int signo, siginfo_t *info, void *extra)
{
  printf("Received signal %d, with code = %d, value %d\n",
        signo, info->si_code, info->si_value.sival_int);
  if (info->si_value.sival_int == fsnfs_value)
    printf("FS-nfs2 died, this is the notify signal\n");
  return;
}

int main(int argc, char *argv[])
{
  int chid, coid, rcvid;
  struct _pulse pulse;
  pid_t pid;
  int status;
  int value;
  ham_entity_t *ehdl;
  ham_condition_t *chdl;
  ham_action_t *ahdl;
  struct sigaction sa;
  int scode;
  int svalue;

  /* we need a channel to receive the pulse notification on */
  chid = ChannelCreate( 0 ); 

  /* and we need a connection to that channel for the pulse to be
     delivered on */
  coid = ConnectAttach( 0, 0, chid, _NTO_SIDE_CHANNEL, 0 );

  /* fill in the event structure for a pulse */
  pid = getpid();
  value = 13;
  ham_connect(0);
  /* Assumes there is already an entity by the name "inetd" */
  chdl = ham_condition_handle(ND_LOCAL_NODE, "inetd","death",0);
  ahdl = ham_action_notify_pulse(chdl, "notifypulsedeath",ND_LOCAL_NODE, pid, chid, 
                              PCODEINETDDEATH, value, HREARMAFTERRESTART);
  ham_action_handle_free(ahdl);
  ham_condition_handle_free(chdl);
  ehdl = ham_entity_handle(ND_LOCAL_NODE, "inetd", 0);
  chdl = ham_condition(ehdl, CONDDETACH, "detach", HREARMAFTERRESTART);
  ahdl = ham_action_notify_pulse(chdl, "notifypulsedetach",ND_LOCAL_NODE, pid, chid, 
                              PCODEINETDDETACH, value, HREARMAFTERRESTART);
  ham_action_handle_free(ahdl);
  ham_condition_handle_free(chdl);
  ham_entity_handle_free(ehdl);
  fsnfs_value = 18; /* value we expect when fs-nfs dies */
  scode = 0;
  svalue = fsnfs_value; 
  sa.sa_sigaction = MySigHandler;
  sigemptyset(&sa.sa_mask);
  sa.sa_flags = SA_SIGINFO;
  sigaction(MYSIG, &sa, NULL);
  /*
   Assumes there is an entity by the name "Fs-nfs2".
   We use "Fs-nfs2" to symbolically represent the entity
   fs-nfs2. Any name can be used to represent the
   entity, but it's best to use a readable and meaningful name.
  */
  ehdl = ham_entity_handle(ND_LOCAL_NODE, "Fs-nfs2", 0);

  /* 
   Add a new condition, which will be an "independent" condition
   this means that notifications/actions inside this condition
   are not affected by "waitfor" delays in other action
   sequence threads
  */
  chdl = ham_condition(ehdl,CONDDEATH, "DeathSep",
                    HCONDINDEPENDENT|HREARMAFTERRESTART);
  ahdl = ham_action_notify_signal(chdl, "notifysignaldeath",ND_LOCAL_NODE, pid, MYSIG, 
                    scode, svalue, HREARMAFTERRESTART);
  ham_action_handle_free(ahdl);
  ham_condition_handle_free(chdl);
  ham_entity_handle_free(ehdl);
  chdl = ham_condition_handle(ND_LOCAL_NODE, "Fs-nfs2","Death",0);
  /*
   this actions is added to a condition that does not
   have a hcondnowait. Since we are unaware what the condition
   already contains, we might end up getting a delayed notification
   since the action sequence might have "arbitrary" delays, and
   "waits" in it.
  */
  ahdl = ham_action_notify_pulse(chdl, "delayednfsdeathpulse", ND_LOCAL_NODE, 
             pid, chid, PCODENFSDELAYED, value, HREARMAFTERRESTART);
  ham_action_handle_free(ahdl);
  ham_condition_handle_free(chdl);
  ehdl = ham_entity_handle(ND_LOCAL_NODE, "inetd", 0);
  chdl = ham_condition(ehdl, CONDRESTART, "restart", 
                             HREARMAFTERRESTART|HCONDINDEPENDENT);
  ahdl = ham_action_notify_pulse(chdl, "notifyrestart_imm", ND_LOCAL_NODE, 
                    pid, chid, PCODEINETDRESTART1, value, HREARMAFTERRESTART);
  ham_action_handle_free(ahdl);
  ahdl = ham_action_waitfor(chdl, "delay",NULL,6532, HREARMAFTERRESTART); 
  ham_action_handle_free(ahdl);
  ahdl = ham_action_notify_pulse(chdl, "notifyrestart_delayed", ND_LOCAL_NODE, 
                    pid, chid, PCODEINETDRESTART2, value, HREARMAFTERRESTART);
  ham_action_handle_free(ahdl);
  ham_condition_handle_free(chdl);
  ham_entity_handle_free(ehdl);
  while (1) {
    rcvid = MsgReceivePulse( chid, &pulse, sizeof( pulse ), NULL );
    if (rcvid < 0) {
      if (errno != EINTR) {
        exit(-1);
      }
    }
    else {
            switch (pulse.code) {
                case PCODEINETDDEATH:
                      printf("Inetd Death Pulse\n");
                    break;
                case PCODENFSDELAYED:
                      printf("Fs-nfs2 died: this is the possibly delayed pulse\n");
                      break;
                case PCODEINETDDETACH:
                      printf("Inetd detached, so quitting\n");
                     goto the_end;
                case PCODEINETDRESTART1:
                      printf("Inetd Restart Pulse: Immediate\n");
                    break;
                case PCODEINETDRESTART2:
                      printf("Inetd Restart Pulse: Delayed\n");
                    break;
              }
    }
  }
  /*
   At this point we are no longer waiting for the
   information about inetd, since we know that it
   has exited.
   We will still continue to obtain information about the
   death of fs-nfs2, since we did not remove those actions
   if we exit now, the next time those actions are executed
   they will fail (notifications fail if the receiver does
   exist anymore), and they will automatically get removed and
   cleaned up.
  */
the_end:
  ham_disconnect(0);
  exit(0);
}

In the above example a client registers for various different types of notifications relating to significant events concerning inetd and fs-nfs2. Notifications can be sent immediately or after a certain delay.

The notifications can also be received for each condition independently — for the entity's death (CONDDEATH), restart (CONDRESTART), and detaching (CONDDETACH).

The CONDRESTART is asserted by a HAM when an entity is successfully restarted.