[padb] r458 committed - Fix the deadlock detection code....

padb at googlecode.com padb at googlecode.com
Sat Jan 4 19:24:34 GMT 2014


Revision: 458
Author:   apittman at gmail.com
Date:     Sat Jan  4 19:23:59 2014 UTC
Log:      Fix the deadlock detection code.

The handling of minfo output had changed, but the reporting of collective
state had not been updated to match, so it was not being reported correctly
and the deadlock detection code had nothing to work with.  Alter the minfo
output to use the new format, and correctly match and forward it in the padb
inner processes.

http://code.google.com/p/padb/source/detail?r=458

Modified:
  /trunk/src/minfo.c
  /trunk/src/padb

=======================================
--- /trunk/src/minfo.c	Sat Jan  4 18:58:01 2014 UTC
+++ /trunk/src/minfo.c	Sat Jan  4 19:23:59 2014 UTC
@@ -52,21 +52,6 @@

  struct dll_entry_points dll_ep = {};

-char *collective_names[] = { "Barrier",
-			     "Bcast",
-			     "Allgather",
-			     "Allgatherv",
-			     "Allreduce",
-			     "Alltoall",
-			     "Alltoallv",
-			     "Reduce_Scatter",
-			     "Reduce",
-			     "Gather",
-			     "Gatherv",
-			     "Scan",
-			     "Scatter",
-			     "Scatterv" };
-
  char *op_types[] = { "pending_send",
  		     "pending_receive",
  		     "unexpected_message" };
@@ -476,11 +461,10 @@
  	int r = dll_ep.get_comm_coll_state(target_process,i,&seq,&active);
  	if ( r == mqs_ok ) {
  	    if ( seq != 0 )
-		printf("comm%d: Collective '%s': call count %d, %sactive\n",
-		       comm_id,
-		       collective_names[i],
+		printf("col: id:%d count:%d active:%d\n",
+		       i,
  		       seq,
-		       active ? "" : "not ");
+		       active ? 1 : 0);
  	} else if ( r != mqs_no_information ) {
  	    show_dll_error_code(r);
  	}
=======================================
--- /trunk/src/padb	Sat Jan  4 19:17:54 2014 UTC
+++ /trunk/src/padb	Sat Jan  4 19:23:59 2014 UTC
@@ -7286,6 +7286,33 @@
              } else {
                  target_error( $vp, "UNPARSEABLE MINFO: $r" );
              }
+        } elsif ( $cmd eq 'col:' ) {
+            $stats{out}++;
+            if (
+                $r =~ m{\A
+                        col:
+                        [ ]
+                        id:(\d+)
+                        [ ]?
+                        count:(\d+)
+                        [ ]?
+                        active:(\d+)
+                        \z
+                        }x
+              )
+            {
+                my $call = {
+                    id     => $1,
+                    count  => $2,
+                    active => $3,
+                };
+
+                push @{ $communicator_descriptor{coll} }, $call;
+
+            } else {
+                target_error( $vp, "UNPARSEABLE MINFO: $r" );
+            }
+
          } elsif ( $cmd eq 'Msg:' ) {
              $stats{msg}++;
              if (
@@ -7393,6 +7420,13 @@
  sub minfo_to_array {
      my ($cd) = @_;

+    my @coll_name_lookup = (
+        "Barrier",   "Bcast",    "Allgather", "Allgatherv",
+        "Allreduce", "Alltoall", "Alltoallv", "Reduce_Scatter",
+        "Reduce",    "Gather",   "Gatherv",   "Scan",
+        "Scatter",   "Scatterv"
+    );
+
      my @mq;
      foreach my $comm ( @{$cd} ) {

@@ -7410,6 +7444,13 @@

          my $mid = 0;

+        foreach my $cc ( @{ $comm->{coll} } ) {
+            my $coll_name = $coll_name_lookup[ $cc->{id} ];
+            my $active_desc = $cc->{active} ? "active" : "not active";
+            push @mq,
+"comm$comm->{mid}: Collective '$coll_name': call count $cc->{count}, $active_desc";
+        }
+
          foreach my $m ( @{ $comm->{messages} } ) {
              my @op_desc = qw(pending_send pending_receive unexpected_message);
              my @status_desc = qw(pending matched complete);




More information about the padb-devel mailing list