[padb-devel] [padb commit] r89 - Add code to check all remote processes are discovered and are vaguely in the same state (executable name).

codesite-noreply at google.com codesite-noreply at google.com
Thu Jul 2 11:26:50 BST 2009


Author: apittman
Date: Thu Jul  2 02:33:22 2009
New Revision: 89

Modified:
    branches/full-duplex/src/padb

Log:
Add code to check all remote processes are discovered and are
vaguely in the same state (executable name).
It's enabled by default but can be controlled by setting "check-signon" to
one of "none", "missing" or "all".


Modified: branches/full-duplex/src/padb
==============================================================================
--- branches/full-duplex/src/padb	(original)
+++ branches/full-duplex/src/padb	Thu Jul  2 02:33:22 2009
@@ -371,6 +371,10 @@
  $conf{"dump-raw"}     = 0;
  $conf{"showcmd"}      = 0;

+# Valid values are "none" "missing" or "all".  Anything not recognised
+# is treated as "all".
+$conf{"check-signon"} = "all";
+
  $conf{slurm_job_step} = "0";

  # Output options.
@@ -3268,6 +3272,50 @@
      return $req;
  }

+sub report_failed_signon {
+    my ( $key, $data ) = @_;
+    my %values;
+    foreach my $proc ( keys( %{$data} ) ) {
+        push( @{ $values{ $data->{$proc}{$key} } }, $proc );
+    }
+    my %c;
+    $c{i} = length($key);
+    printf("$key : ranks\n");
+    foreach my $value ( sort( keys(%values) ) ) {
+        printf( "%$c{i}s : %s\n", $value, compress( @{ $values{$value} } ) );
+    }
+}
+
+sub check_signon {
+    my ( $comm_data, $data ) = @_;
+    return if ( $conf{"check-signon"} eq "none" );
+    my @missing;
+    my %target_state;
+    my %target_exe;
+    for ( my $proc = 0 ; $proc < $comm_data->{nprocesses} ; $proc++ ) {
+        if ( not defined $data->{target_responce}{$proc} ) {
+            push( @missing, $proc );
+            next;
+        }
+        $target_exe{ $data->{target_responce}{$proc}{name} }++;
+        $target_state{ $data->{target_responce}{$proc}{state} }++;
+    }
+    if ( $#missing != -1 ) {
+        printf( "Warning, failed to locates ranks %s\n", compress(@missing) );
+    }
+    return if ( $conf{"check-signon"} eq "missing" );
+    my $exe_count = keys(%target_exe);
+    if ( $exe_count != 1 ) {
+        printf("Warning, remote process name differs across ranks\n");
+        report_failed_signon( "name", $data->{target_responce} );
+    }
+    my $state_count = keys(%target_state);
+    if ( $state_count != 1 ) {
+        printf("Warning, remote process state differs across ranks\n");
+        report_failed_signon( "state", $data->{target_responce} );
+    }
+}
+
  sub command_from_inner {
      my ( $comm_data, $cdata, $line ) = @_;

@@ -3292,9 +3340,7 @@
          $comm_data->{current_req} = next_command($comm_data);
          issue_command_to_inner( $cdata, $comm_data->{current_req} );
          $comm_data->{state} = "live";
-
-        #XXX: Check all target_processes are here.
-        # print Dumper $d;
+        check_signon( $comm_data, $d );
          return;
      }

@@ -3342,7 +3388,7 @@
      $comm_data->{remote}{$host}{key}  = $key;
      $comm_data->{signons}++;

-    if ( $comm_data->{signons} == $comm_data->{hosts} ) {
+    if ( $comm_data->{signons} == $comm_data->{nhosts} ) {
          connect_to_children($comm_data);
      }
  }
@@ -3359,7 +3405,7 @@

      handle_signon( $comm_data, $words[2], $words[3], $words[4] );

-    if ( $comm_data->{signons} == $comm_data->{hosts} ) {
+    if ( $comm_data->{signons} == $comm_data->{nhosts} ) {

          # Don't listen on this port any more;
          $comm_data->{sel}->remove( $comm_data->{listen} );
@@ -3383,7 +3429,7 @@
      print("inner: $line\n");
  }

-sub inner_sterr_cb {
+sub inner_stderr_cb {
      my ( $comm_data, $cdata, $line ) = @_;
      print("einner: $line\n");
  }
@@ -3428,10 +3474,10 @@
  }

  sub go_parallel {
-    my $jobid = shift;
-    my $cmd   = shift;
-    my $ncpus = shift;
-    my $hosts = shift;
+    my $jobid      = shift;
+    my $cmd        = shift;
+    my $nprocesses = shift;
+    my $nhosts     = shift;

      my $comm_data;

@@ -3470,10 +3516,11 @@

      close $pcmd->{in};

-    $comm_data->{hosts}   = $hosts;
-    $comm_data->{cmd}     = $cmd;
-    $comm_data->{jobid}   = $jobid;
-    $comm_data->{signons} = 0;
+    $comm_data->{nhosts}     = $nhosts;
+    $comm_data->{nprocesses} = $nprocesses;
+    $comm_data->{cmd}        = $cmd;
+    $comm_data->{jobid}      = $jobid;
+    $comm_data->{signons}    = 0;

      # State, one of "connecting" "live" and "shutdown";
      $comm_data->{state} = "connecting";
@@ -3517,8 +3564,8 @@
          if ( $count > 0 ) {

              #printf("Still here, time:$t2 comm_count:$count\n");
-            if ( $comm_data->{signons} != $comm_data->{hosts} ) {
-                my $missing = $comm_data->{hosts} - $comm_data->{signons};
+            if ( $comm_data->{signons} != $comm_data->{nhosts} ) {
+                my $missing = $comm_data->{nhosts} - $comm_data->{signons};
                  print("Waiting for signon from $missing hosts.\n");
              }
          }
@@ -6435,7 +6482,6 @@
          $netdata->{target_responce}{$vp}->{name}  = $name;
          $netdata->{target_responce}{$vp}->{state} = $state;
      }
-
  }

  # Receive a command (perl reference) from our parent.




More information about the padb-devel mailing list