[padb-devel] [padb] r289 committed - Tighten up integration between padb and minfo, the inner padb processe...

codesite-noreply at google.com codesite-noreply at google.com
Thu Oct 15 14:31:26 BST 2009


Revision: 289
Author: apittman
Date: Thu Oct 15 06:30:47 2009
Log: Tighten up integration between padb and minfo: the inner padb
processes no longer just forward all output to the outer process but
rather parse it locally into a type they can understand.  Forward any
error or user messages from the DLL or minfo back to the user and
present them in a useful way.
Once the inner process has parsed the information from minfo it re-formats
it into the format it was in before for the outer process.  Next on the
list is to try to improve this formatting and to make the collective
detection code use reduced key/value pairs rather than arrays of strings.

http://code.google.com/p/padb/source/detail?r=289

Modified:
  /trunk/src/minfo.c
  /trunk/src/padb

=======================================
--- /trunk/src/minfo.c	Thu Jun  4 11:43:09 2009
+++ /trunk/src/minfo.c	Thu Oct 15 06:30:47 2009
@@ -26,6 +26,43 @@
      char   name[128];
      int    size;
  };
+
+char *(*es)(int errorcode);
+
+void show_string (char *desc, char *str)
+{
+    printf("zzz: str:%d %s\n%s\n",
+	   strlen(str),
+	   desc,
+	   str);
+}
+
+void show_warning (const char *msg)
+{
+    show_string("warning",(char *)msg);
+}
+
+void show_dll_error_code (int res)
+{
+    char *msg;
+    msg = es(res);
+    show_string("dllerror",msg);
+}
+
+void die (char *msg)
+{
+    show_string("dmsg",msg);
+    show_string("exit","die");
+
+    fflush(NULL);
+    exit(0);
+}
+
+void die_with_code (int res, char *msg)
+{
+    show_dll_error_code(res);
+    die(msg);
+}

  #define QUERY_SIZE 1280

@@ -73,7 +110,7 @@

  void show_msg (const char *msg)
  {
-    printf("message from DLL:%s\n",msg);
+    show_string("dlldebugmessage",(char *)msg);
  }

  char *get_msg (int msg )
@@ -196,7 +233,7 @@
  {
      struct process *p = (struct process *)process;
      if ( p->rank == -1 )
-	printf("Warning, DLL called find_rank before setup_process!\n");
+	show_warning("DLL called find_rank before setup_process!");
      return p->rank;
  }

@@ -291,27 +328,31 @@
      strncpy(local,ans,QUERY_SIZE);
      return 0;
  }
-
-void die (char *msg)
-{
-    printf("Error: %s\n",msg);
-    fflush(NULL);
-    exit(1);
-}

  int msgid = 0;

  int show_comm (struct process *p,mqs_communicator *comm)
  {
      static int c = 0;
-    printf("comm%d: name: '%s'\n",
-	   c,comm->name);
-    printf("comm%d: rank: '%d'\n",
-	   c,(int)comm->local_rank);
-    printf("comm%d: size: '%d'\n",
-	   c,(int)comm->size);
-    printf("comm%d: id: '%p'\n",
-	   c,(void *)comm->unique_id);
+    if ( comm->local_rank >= 0 )
+	printf("out: c:%d rank:%d\n",
+	       c,
+	       (int)comm->local_rank);
+
+    printf("out: c:%d size:%d\n",
+	   c,
+	   (int)comm->size);
+
+    printf("out: c:%d str:%d name\n%s\n",
+	   c,
+	   strlen(comm->name),
+	   comm->name);
+
+    printf("out: c:%d id:%ld\n",
+	   c,
+	   comm->unique_id);
+
+
      msgid=0;
      return c++; /* This is not a political statement although if it was  
I'd stand by it */
  }
@@ -399,7 +440,7 @@
      int (*sp)(mqs_process *process,mqs_process_callbacks *pcb);
      int (*phq)(mqs_process *process, char **msg);
      void (*ucl)(mqs_process *process);
-    char *(*es)(int errorcode);
+
      int (*sci)(mqs_process *process);
      int (*gc)(mqs_process *process, mqs_communicator *comm);
      int (*nc)(mqs_process *process);
@@ -417,7 +458,7 @@
  	char dll[PATH_MAX];
  	void *base = find_sym("sym","MPIR_dll_name");
  	if ( ! base ) {
-	    die("Could not find dll_name symbol");
+	    die("Could not find MPIR_dll_name symbol");
  	}
  	fetch_string(NULL,&dll[0],(mqs_taddr_t)base,PATH_MAX);
  	dlhandle = dlopen(dll,RTLD_NOW);
@@ -466,25 +507,17 @@

      res = si((mqs_image *)&i,&icb);
      if ( res != mqs_ok ) {
-	die("Failed mqs_setup_image");
+	die_with_code(res,"setup_image() failed");
      }

      {
  	char *m = NULL;
  	res = ihq((mqs_image *)&i,&m);
  	if ( m ) {
-	    char image[QUERY_SIZE];
-	    if ( fetch_image(image) == 0 ) {
-		printf(m,image);
-		printf("\n");
-	    } else
-		printf("%s\n",m);
+	    show_string("ihqm",m);
  	}
  	if ( res != mqs_ok ) {
-	    char *msg;
-	    msg = es(res);
-	    printf("message from DLL %d '%s'\n",res,msg);
-	    die("Failed image_has_queues");
+	    die_with_code(res,"image_has_queues() failed");
  	}
      }

@@ -498,7 +531,7 @@

      res = sp((mqs_process *)&p,&pcb);
      if ( res != mqs_ok ) {
-	die("Failed mqs_setup_process");
+	die_with_code(res,"mqs_setup_process() failed");
      }

      if ( gr ) {
@@ -512,12 +545,9 @@
  	char *m = NULL;
  	res = phq((mqs_process *)&p,&m);
  	if ( m )
-	    printf("%s\n",m);
+	    show_string("phqm",m);
  	if ( res != mqs_ok ) {
-	    char *msg;
-	    msg = es(res);
-	    printf("%s\n",msg);
-	    die("Failed process has_queue");
+	    die_with_code(res,"process_has_queue() failed");
  	}
      }

@@ -525,7 +555,7 @@

      res = sci((mqs_process *)&p);
      if ( res != mqs_ok ) {
-	die("Failed sci");
+	die_with_code(res,"setup_communicator_iterator() failed");
      }

      do {
@@ -533,11 +563,7 @@

  	res = gc((mqs_process *)&p,&comm);
  	if ( res != mqs_ok ) {
-	    char *msg;
-	    msg = es(res);
-	    printf("gc returned %d\n",res);
-	    printf("%s\n",msg);
-	    die("gc");
+	    die_with_code(res,"get_communicator() failed");
  	}

  	if ( res == mqs_ok ) {
@@ -553,7 +579,7 @@
  		    if ( r == mqs_ok ) {
  			int i;
  			for ( i = 0 ; i < comm.size ; i++ ) {
-			    printf("comm%d: Rank: local %d global %d\n",c,i,ranks[i]);
+			    printf("out: c:%d rt:%d\n",c,ranks[i]);
  			}
  		    }
  		    free(ranks);
@@ -583,12 +609,15 @@
  		load_ops((mqs_process *)&p,mqs_pending_sends);

  	    }
+	    printf("done\n"
+		   );

  	    nres = nc((mqs_process *)&p);

  	}
      } while ( res == mqs_ok && nres == mqs_ok );
-
+
+    show_string("exit","ok");
      return 0;
  }

=======================================
--- /trunk/src/padb	Mon Oct 12 03:50:51 2009
+++ /trunk/src/padb	Thu Oct 15 06:30:47 2009
@@ -2669,7 +2669,7 @@

      foreach my $pid ( get_process_list($user) ) {
          my $name = find_from_status( $pid, "Name" );
-        if ( defined $mpirun{$name} ) {
+        if ( defined $name and defined $mpirun{$name} ) {
              push @jobs, $pid;
              next;
          }
@@ -3880,7 +3880,8 @@
      }

      if ( defined $allfns{$mode}{out_handler} ) {
-        $allfns{$mode}{out_handler}( $conf{mode_options}{$mode}, $d );
+        $allfns{$mode}{out_handler}( $conf{mode_options}{$mode}, $d,
+            $comm_data->{current_req} );
      } else {
          default_output_handler( $comm_data->{current_req}, $d );
      }
@@ -4836,7 +4837,7 @@
          find_any_rmgr($user);

          @jobids = get_all_jobids($user);
-        printf "Active jobs (%d) are @jobids\n", @jobids
+        printf "Active jobs (%d) are @jobids\n", $#jobids + 1
            if $conf{verbose};
          if ( @jobids == 0 ) {
              printf "No active jobs could be found for user '$user'\n";
@@ -5468,7 +5469,6 @@
          tracepid => -1,
          attached => 0,
      };
-    my @mq;

      my $cmd = $inner_conf{minfo};
      $h->{hpid} = open3( $h->{fd}{wtr}, $h->{fd}{rdr}, $h->{fd}{err}, $cmd )
@@ -5480,18 +5480,123 @@

      my %stats;

+    # Communicator data.
+    my %cd;
+
+    my %global;
+
+    $global{exit} = 'unknown';
+
+    my @cd;
+    my $bytes_to_read;
+    my $str_name;
+    my $str_value = $EMPTY_STRING;
+    my $str_global;
      while (<$handle>) {
          my $r = $_;
+
+        if ( defined $bytes_to_read ) {
+            $str_value .= $r;
+            if ( length $str_value eq $bytes_to_read + 1 ) {
+                chomp $str_value;
+                if ($str_global) {
+                    $global{$str_name} = $str_value;
+
+                    if ( $str_name eq 'ihqm' ) {
+                        my $image = readlink "/proc/$gdb->{tracepid}/exe";
+                        $str_value =~ s{%s}{$image};
+                    }
+
+                    if ( $str_name ne 'exit' and $str_name ne 'dmsg' ) {
+
+			# Report the string back to the outer process,
+			# don't bother forwarding exit status as that's
+			# done below.
+                        target_key_pair( $vp, "minfo_msg_$str_name",
+                            $str_value );
+                        target_key_pair( $vp, "minfo_msg", $str_name );
+                    }
+                } else {
+                    $cd{$str_name} = $str_value;
+                }
+                $bytes_to_read = undef;
+                $str_value     = "";
+            }
+            next;
+        }
+
          chomp $r;
-        if ( $r =~ m{\Areq:}x ) {
+        my $cmd = substr $r, 0, 4;
+        if ( $cmd eq 'req:' ) {
              my $res = minfo_handle_query( $gdb, $vp, $r, \%stats );

              # Some things *do* fail here, symbol lookups for example,
              # and we don't need to report it.

              print {$out} "$res\n";
+        } elsif ( $cmd eq 'out:' ) {
+            if (
+                $r =~ m{\A
+                        out:
+                        [ ]
+                        c:(\d+)
+                        [ ]
+                        (\w+):
+                        (\d+)
+                        [ ]?
+                        (\w+)?
+                        \z
+                        }x
+              )
+            {
+                my $cid   = $1;
+                my $key   = $2;
+                my $value = $3;
+                my $name  = $4;
+
+                if ( $key eq 'str' ) {
+                    $bytes_to_read = $value;
+                    $str_name      = $name;
+                    $str_global    = 0;
+                } elsif ( $key eq 'rt' ) {
+                    push @{ $cd{rt} }, $value;
+                } else {
+                    $cd{$key} = $value;
+                    $cd{mid} = $cid;
+                }
+            } else {
+                target_key_pair( $vp, "UNPARSEABLE MINFO", $r );
+            }
+        } elsif ( $cmd eq 'zzz:' ) {
+            if (
+                $r =~ m{\A
+                        zzz:
+                        [ ]
+                        (\w+):
+                        (\d+)
+                        [ ]?
+                        (\w+)?
+                        \z
+                        }x
+              )
+            {
+                my $key    = $1;
+                my $length = $2;
+                my $name   = $3;
+
+                if ( $key eq 'str' ) {
+                    $bytes_to_read = $length;
+                    $str_name      = $name;
+                    $str_global    = 1;
+                }
+            } else {
+                target_key_pair( $vp, "UNPARSEABLE MINFO", $r );
+            }
+        } elsif ( $cmd eq 'done' ) {
+            push @cd, dclone( \%cd );
+            undef %cd;
          } else {
-            push @mq, $r;
+            push @{ $cd{raw} }, $r;
          }
      }

@@ -5509,13 +5614,44 @@
          return;
      }

-    if ( $? != 0 ) {
-
-        # Bad exit code but we did talk to it so run with what we have.
-        target_error( $vp,
-            "Error running $inner_conf{minfo}: Bad exit code $?" );
+    if ( $global{exit} ne 'ok' ) {
+        if ( $global{exit} eq 'die' ) {
+            target_error( $vp,
+                "Error message from $inner_conf{minfo}: $global{dmsg}" );
+
+        } else {
+            target_error( $vp,
+                "Error running $inner_conf{minfo}: Bad exit code $?" );
+        }
      }

+    return minfo_to_array( \@cd );
+
+}
+
+sub minfo_to_array {
+    my ($cd) = @_;
+
+    my @mq;
+    foreach my $comm ( @{$cd} ) {
+
+        #print Dumper $comm;
+        push @mq, "comm$comm->{mid}: name: '$comm->{name}'";
+        if ( defined $comm->{rank} ) {
+            push @mq, "comm$comm->{mid}: rank: '$comm->{rank}'";
+        }
+        push @mq, "comm$comm->{mid}: size: '$comm->{size}'";
+        my $id = sprintf( "%#Lx", $comm->{id} );
+        push @mq, "comm$comm->{mid}: id: '$id'";
+
+        for my $i ( 0 .. $#{ $comm->{rt} } ) {
+            push @mq, "comm$comm->{mid}: Rank: local $i global  
$comm->{rt}[$i]";
+        }
+
+        foreach my $l ( @{ $comm->{raw} } ) {
+            push @mq, $l;
+        }
+    }
      return @mq;
  }

@@ -5686,6 +5822,45 @@
      }
      return $ret;
  }
+
+sub mpi_queue_output_handler {
+    my ( $carg, $lines, $three ) = @_;
+
+    my %headers = (
+        ihqm            => 'Message from DLL',
+        phqm            => 'Message from DLL',
+        dllerror        => 'Error string from DLL',
+        warning         => 'Warning message from minfo',
+        dlldebugmessage => 'Debug message from DLL',
+    );
+
+    if ( exists $lines->{target_data}{minfo_msg} ) {
+        my @keys = sort keys %{ $lines->{target_data}{minfo_msg} };
+
+        foreach my $key (@keys) {
+            my @values = keys %{ $lines->{target_data}{"minfo_msg_$key"} };
+            my $head;
+            if ( defined $headers{$key} ) {
+                $head = $headers{$key};
+            } else {
+                $head = "Message from minfo/dll using unknown key: '$key'";
+            }
+            foreach my $value ( sort @values ) {
+                printf("----------------\n");
+                printf(
+
+		     rng_convert_to_user(
+                        $lines->{target_data}{"minfo_msg_$key"}{$value}
+                      ) .
+		    ": $head\n");
+                printf("----------------\n");
+                printf( "%s\n", $value );
+            }
+        }
+    }
+
+    default_output_handler( $three, $lines );
+}

  sub mpi_go_deadlock_detect_helper {
      my $str        = shift;    # tagged onto the end of the line.
@@ -5727,6 +5902,8 @@
              $tg{$gid}++;
          }
      }
+
+    my $no_data = 0;

      foreach my $process ( keys %{$cd} ) {
          my $rd = $cd->{$process};
@@ -5750,6 +5927,9 @@
              }
              $ad{$gid}{size} = $gd->{size};
              $ad{$gid}{name} = $gd->{name};
+            if ( not exists $gd->{coll} ) {
+                $no_data++;
+            }
              foreach my $coll ( keys %{ $gd->{coll} } ) {
                  my $count = $gd->{coll}{$coll}{count};
                  if ( defined $gd->{coll}{$coll}{active} ) {
@@ -5843,13 +6023,21 @@

      my $count = keys %ad;

+    if ( $count eq $no_data ) {
+        $ret .=
+          "Total: $count communicators, no communication data recorded.\n";
+        return $ret;
+    }
+
      if ( $count == 1 ) {
          my $use_str = ( $i_count == 1 ) ? $EMPTY_STRING : ' not';
-        $ret .= "Total: $count group which is$use_str in use.\n";
+        $ret .= "Total: $count communicators which is$use_str in use.\n";
      } else {
          my $i_str = ( $i_count == 1 ) ? 'is' : 'are';
-        $ret .= "Total: $count groups of which $i_count $i_str in use.\n";
-    }
+        $ret .=
+          "Total: $count communicators of which $i_count $i_str in use.\n";
+    }
+    $ret .= "No data was recorded for $no_data communicators\n";

      return $ret;
  }
@@ -5887,7 +6075,7 @@
              } elsif ( $line =~ /^msg\d+/ ) {
                  ;    # nop
              } else {
-                print "Failed to match minfo output: $line\n";
+		#print "Failed to match minfo output: $line\n";
              }
          }
          $coll_data{$rank} = \%lid;
@@ -6784,8 +6972,12 @@
      my %remote_env = get_remote_env($pid);

      if ( defined $remote_env{LD_LIBRARY_PATH} ) {
-        $ENV{LD_LIBRARY_PATH} =
-          "$remote_env{LD_LIBRARY_PATH}:$inner_conf{myld}";
+        if ( defined $inner_conf{myld} ) {
+            $ENV{LD_LIBRARY_PATH} =
+              "$remote_env{LD_LIBRARY_PATH}:$inner_conf{myld}";
+        } else {
+            $ENV{LD_LIBRARY_PATH} = "$remote_env{LD_LIBRARY_PATH}";
+        }
      }

      my $cmd = "$inner_conf{edb} --queues --pid=$pid";
@@ -7852,12 +8044,13 @@
      # Sort out secondary and options_i so they are handled in the same way.

      $allfns{queue} = {
-        arg_long  => 'message-queue',
-        qsnet     => 1,
-        arg_short => 'q',
-        handler   => \&show_queue,
-        help      => 'Show the message queues',
-        options_i => { mpi_dll => undef, }
+        out_handler => \&mpi_queue_output_handler,
+        arg_long    => 'message-queue',
+        qsnet       => 1,
+        arg_short   => 'q',
+        handler     => \&show_queue,
+        help        => 'Show the message queues',
+        options_i   => { mpi_dll => undef, }

      };





More information about the padb-devel mailing list