[padb-devel] [padb] r125 committed - Merged full-duplex changes r55:r124 into the trunk

codesite-noreply at google.com codesite-noreply at google.com
Wed Aug 19 22:56:28 BST 2009


Revision: 125
Author: apittman
Date: Wed Aug 19 14:56:10 2009
Log: Merged full-duplex changes r55:r124 into the trunk
http://code.google.com/p/padb/source/detail?r=125

Modified:
  /trunk/src
  /trunk/src/padb

=======================================
--- /trunk/src/padb	Thu Jun 18 01:29:09 2009
+++ /trunk/src/padb	Wed Aug 19 14:56:10 2009
@@ -1,6 +1,6 @@
-#!/usr/bin/perl
-
-# padb. a simple parallel debugging aid from Quadrics.
+#!/usr/bin/perl -w
+
+# padb. a simple parallel debugging aid.

  # For help and support visit http://padb.pittman.org.uk
  # or email padb-users at pittman.org.uk
@@ -23,6 +23,19 @@
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA   
02110-1301  USA

  # Revision history
+# Version 3.0
+#  * Full-duplex communication between inner and outer processes, padb
+#    no longer simply sends request on the command line and processes
+#    the response but is truly interactive between the inner and outer
+#    processes.  This avails greater flexibility in what can be achieved
+#    and hopefully helps with scalability as well.
+#  * Enabled warnings (-w) by default.  Fixed lots of warnings, mostly
+#    about comparing with undef
+#  * Much more complete separation into "modes" of operation, most options
+#    are now mode specific rather than simply using global variables or
+#    global conf options.
+#  * Overhaul of the allfns (mode) callbacks and in particular their  
parameters
+#  * Performance improvements.
  #
  # Version 2.5
  #  * First Non-Quadrics version
@@ -205,6 +218,8 @@
  use File::Temp qw(tempfile);
  use MIME::Base64;
  use Config;
+use IO::Socket;
+use IO::Select;

   
###############################################################################
  #
@@ -237,10 +252,12 @@
  # Main.

  my $prog    = basename $0;
-my $version = "2.5";
+my $version = "3.0-beta";

  my %conf;

+my $secret;
+
  # Config options the inner knows about, only forward options if they are  
in this list.
  my @inner_conf = qw(edb edbopt minfo rmgr scripts slurm-job-step verbose);

@@ -278,11 +295,12 @@
  };

  $rmgr{"mpd"} = {
-    'is_installed'    => \&mpd_is_installed,
-    'get_active_jobs' => \&mpd_get_jobs,
-    'setup_pcmd'      => \&mpd_setup_pcmd,
-    'cleanup_pcmd'    => \&mpd_cleanup_pcmd,
-    'find_pids'       => \&mpd_find_pids,
+    'is_installed'           => \&mpd_is_installed,
+    'get_active_jobs'        => \&mpd_get_jobs,
+    'setup_pcmd'             => \&mpd_setup_pcmd,
+    'cleanup_pcmd'           => \&mpd_cleanup_pcmd,
+    'find_pids'              => \&mpd_find_pids,
+    'require_inner_callback' => 1,
  };

  $rmgr{"orte"} = {
@@ -301,33 +319,37 @@
  };

  $rmgr{"slurm"} = {
-    'is_installed'    => \&slurm_is_installed,
-    'get_active_jobs' => \&slurm_get_jobs,
-    'job_is_running'  => \&slurm_job_is_running,
-    'setup_pcmd'      => \&slurm_setup_pcmd,
-    'find_pids'       => \&slurm_find_pids,
+    'is_installed'           => \&slurm_is_installed,
+    'get_active_jobs'        => \&slurm_get_jobs,
+    'job_is_running'         => \&slurm_job_is_running,
+    'setup_pcmd'             => \&slurm_setup_pcmd,
+    'find_pids'              => \&slurm_find_pids,
+    'require_inner_callback' => 1,
  };

  $rmgr{"local"} = {
-    'get_active_jobs' => \&local_get_jobs,
-    'job_is_running'  => \&local_job_is_running,
-    'setup_pcmd'      => \&local_setup_pcmd,
-    'find_pids'       => \&local_find_pids,
+    'get_active_jobs'        => \&local_get_jobs,
+    'job_is_running'         => \&local_job_is_running,
+    'setup_pcmd'             => \&local_setup_pcmd,
+    'find_pids'              => \&local_find_pids,
+    'require_inner_callback' => 1,
  };

  $rmgr{"local-qsnet"} = {
-    'is_installed'    => \&local_q_is_installed,
-    'get_active_jobs' => \&local_q_get_jobs,
-    'job_is_running'  => \&local_job_is_running,
-    'setup_pcmd'      => \&local_setup_pcmd,
-    'inner_rmgr'      => "local",
+    'is_installed'           => \&local_q_is_installed,
+    'get_active_jobs'        => \&local_q_get_jobs,
+    'job_is_running'         => \&local_job_is_running,
+    'setup_pcmd'             => \&local_setup_pcmd,
+    'inner_rmgr'             => "local",
+    'require_inner_callback' => 1,
  };

  $rmgr{"local-fd"} = {
-    'get_active_jobs' => \&local_fd_get_jobs,
-    'job_is_running'  => \&local_job_is_running,
-    'setup_pcmd'      => \&local_setup_pcmd,
-    'inner_rmgr'      => "local",
+    'get_active_jobs'        => \&local_fd_get_jobs,
+    'job_is_running'         => \&local_job_is_running,
+    'setup_pcmd'             => \&local_setup_pcmd,
+    'inner_rmgr'             => "local",
+    'require_inner_callback' => 1,
  };

   
###############################################################################
@@ -344,50 +366,42 @@

  my %allfns;

-my $line_formatted = 0;
-my $countoutput    = 0;
-
  my %cinner;    # Config options to be passed to inner.
  my $rem_jobid;

  # Debug options.
-$conf{"verbose"}      = 0;
-$conf{"tree-verbose"} = 0;
-$conf{"dump-raw"}     = 0;
-$conf{"showcmd"}      = 0;
+$conf{"verbose"}  = 0;
+$conf{"dump-raw"} = 0;
+
+# Valid values are "none" "missing" or "all".  Anything not recognised
+# is treated as "all".
+$conf{"check-signon"} = "all";

  $conf{slurm_job_step} = "0";

  # Output options.
-$conf{"stats-sort-key"}      = "vp";
-$conf{"proc-sort-key"}       = "vp";
-$conf{"proc-show-header"}    = 1;
-$conf{"stats-reverse"}       = 0;
-$conf{"stats-short"}         = 0;
-$conf{"show-group-members"}  = 0;
-$conf{"show-all-stats"}      = 0;
-$conf{"show-all-groups"}     = 0;
  $conf{"interval"}            = 10;
  $conf{"watch-clears-screen"} = 1;
-$conf{"stats-name"}          = undef;
-$conf{"stats-raw"}           = 0;
  $conf{"scripts"}             = "bash,sh,dash,ash,perl,xterm";
-$conf{"stack-strip-below"}   = "main";
  $conf{"lsf-job-offset"}      = 1;
  $conf{"local-fd-name"}       = "/dev/null";
-$conf{"stack-strip-above"} =
-  "elan_waitWord,elan_pollWord,elan_deviceCheck,opal_condition_wait";
-
-# $conf{stack-format}        = undef;
+$conf{"inner-callback"}      = 0;
+
+# These two are used by deadlock and QsNet group
+# code, they need migrating in the group code
+# when I have access to a test system again.
+#$conf{"show-group-members"}  = 0;
+#$conf{"show-all-groups"}     = 0;

  # Tuning options.
  $conf{"prun-timeout"}     = 120;
  $conf{"prun-exittimeout"} = 120;
  $conf{"rmgr"}             = "auto";

+$conf{"slurm-job-step"} = 0;
+
  # These settings are passed onto inner only.
-$conf{"edbopt"}  = "";
-$conf{"mpi-dll"} = "auto";
+$conf{"edbopt"} = "";

  $conf{"edb"}   = find_edb();
  $conf{"minfo"} = find_minfo();
@@ -419,8 +433,7 @@
  sub show_version {
      printf("$prog version $version\n\n");
      printf("Written by Ashley Pittman\n");
-
-    #ashley at quadrics.com
+    printf("http://padb.pittman.org.uk\n");
      exit 0;
  }

@@ -431,15 +444,13 @@

  -a --all               report on all running jobs for user.
  -A --any               report on a running job for user.
--u --user=USER         report on jobs for username=<user>.
-
--r --rank=RANK         report only on processes <RANK>.
-   --group-id=ID       report only on group <ID>.
-
--s --statistics        Show the job-wide statistics.
--g --group             Show the state of collective operations (groups).
+-u --user=<USER>       report on jobs for username=<user>.
+
+-r --rank=<RANK>       report only on processes <RANK>.
+   --group-id=<ID>     report only on group <ID>.
+
  XXXX
-   --full-report=JOBID All of the above.
+   --full-report=<JOBID> Generate a full report of job state.

     --nostrip-below-main Don\'t strip stack traces below main.
     --nostrip-above-wait Don\'t strip stack traces about elan_waitWord.
@@ -468,7 +479,6 @@
    prun-timeout         Timeout to use when launching parallel job.

    Stack trace options:
-  tree-verbose         turn on debugging for the stack trace tree  
generation code.
    gdb-retry-count      Number of times to try getting a 'good' stack trace  
from gdb.
    stack-show-params    Show function parameters in stack traces.
    stack-show-locals    Show locals in stack traces.
@@ -495,8 +505,10 @@
      chomp $usage;

      my $extra = "";
+    $extra .= "Modes of operation\n";
      foreach my $arg ( sort( keys %allfns ) ) {
          next unless ( defined $allfns{$arg}{help} );
+        next if ( defined $allfns{$arg}{qsnet} );
          if ( defined $allfns{$arg}{arg_short} ) {
              $extra .= "-$allfns{$arg}{arg_short}";
          } else {
@@ -506,6 +518,20 @@
              $allfns{$arg}{arg_long},
              $allfns{$arg}{help} );
      }
+
+    $extra .= "\nQsNet specific modes\n";
+    foreach my $arg ( sort( keys %allfns ) ) {
+        next unless ( defined $allfns{$arg}{help} );
+        next unless ( defined $allfns{$arg}{qsnet} );
+        if ( defined $allfns{$arg}{arg_short} ) {
+            $extra .= "-$allfns{$arg}{arg_short}";
+        } else {
+            $extra .= "  ";
+        }
+        $extra .= sprintf( " --%-18s%s.\n",
+            $allfns{$arg}{arg_long},
+            $allfns{$arg}{help} );
+    }

      $usage =~ s!XXXX!$extra!;

@@ -530,9 +556,6 @@
  # Number of functions provided on the command line from the allfns hash.
  my $have_allfns_option = 0;

-my $stats_total;
-my $group;
-
  my $full_report;
  my $core_stack;
  my $list_rmgrs;
@@ -543,17 +566,11 @@
  my $core_name;
  my $exe_name;

-my $proc_format =
-  "vp=vpid,hostname,pid,vmsize,vmrss,stat.state=S,pcpu=%cpu,name=command";
-
  my $input_file;
  my $compress;
  my $compress_C;
  my $tree;

-my $strip_below_main = 1;
-my $strip_above_wait = 1;
-
  my @config_options;
  my %ic_names;

@@ -561,40 +578,67 @@
  # can access secondary comamnd line argunments by name.
  my %secondary_args;

+# Debugging: this function is called periodically with
+# a mode, an abritary ref and a string, it can either print simply
+# the string or call dumper on the ref as well.
+# Enable with --debug=type1,type2=all
+my %debugModes;
+my $start_time = time();
+
+sub debug_log {
+    my ( $type, $handle, $str, @params ) = @_;
+    if ( not exists $debugModes{$type} ) {
+        printf("Unknown debug mode: $type\n");
+        exit(1);
+    }
+    return unless $debugModes{$type};
+    my $time = time() - $start_time;
+    printf( "DEBUG ($type): %3d: $str\n", $time, @params );
+    return if $debugModes{$type} eq "basic";
+    return unless defined $handle;
+    print Dumper $handle;
+}
+
+# Valid debug modes, a full list is maintained here so using unexpected
+# ones can generate warnings.
+$debugModes{"full-duplex"} = undef;
+$debugModes{"show-cmd"}    = undef;
+$debugModes{"all"}         = undef;
+$debugModes{"tree"}        = undef;
+$debugModes{"verbose"}     = undef;
+$debugModes{"signon"}      = undef;
+$debugModes{"rmgr"}        = undef;
+
  sub parse_args_outer {

      Getopt::Long::Configure("bundling");
-    my $mode;
+    my $debugflag;

      my %optionhash = (
-        "verbose|v+"                     => \$conf{verbose},
-        "user|u=s"                       => \$user,
-        "rank|r=i"                       => \@ranks,
-        "group-id=i"                     => \@target_groups,
-        "help|h"                         => \&usage,
-        "all|a"                          => \$all,
-        "any|A"                          => \$any,
-        "statistics-total|stat|sta|st|s" => \$stats_total,
-        "version|V"                      => \&show_version,
-        "compress|c"                     => \$compress,
-        "compress-long|C"                => \$compress_C,
-        "group|g"                        => \$group,
-        "tree|t"                         => \$tree,
-        "input-file|file|i=s"            => \$input_file,
-        "config-option|O=s"              => \@config_options,
-        "full-report=s"                  => \$full_report,
-        "core-stack"                     => \$core_stack,
-        "core=s"                         => \$core_name,
-        "exe=s"                          => \$exe_name,
-        "list-rmgrs"                     => \$list_rmgrs,
-        "strip-below-main!"              => \$strip_below_main,
-        "strip-above-wait!"              => \$strip_above_wait,
-        "watch!"                         => \$watch,
-        "local-stats"                    => \$local_stats,
-        "proc-format=s"                  => \$proc_format,
-        "show-jobs"                      => \$show_jobs,
-        "norc"                           => \$norc,
-        "config-file=s"                  => \$configfile
+        "verbose|v+"          => \$conf{verbose},
+        "user|u=s"            => \$user,
+        "rank|r=i"            => \@ranks,
+        "group-id=s"          => \@target_groups,
+        "help|h"              => \&usage,
+        "all|a"               => \$all,
+        "any|A"               => \$any,
+        "version|V"           => \&show_version,
+        "compress|c"          => \$compress,
+        "compress-long|C"     => \$compress_C,
+        "tree|t"              => \$tree,
+        "input-file|file|i=s" => \$input_file,
+        "config-option|O=s"   => \@config_options,
+        "full-report=s"       => \$full_report,
+        "core-stack"          => \$core_stack,
+        "core=s"              => \$core_name,
+        "exe=s"               => \$exe_name,
+        "list-rmgrs"          => \$list_rmgrs,
+        "watch"               => \$watch,
+        "local-stats"         => \$local_stats,
+        "show-jobs"           => \$show_jobs,
+        "norc"                => \$norc,
+        "config-file=s"       => \$configfile,
+        "debug=s"             => \$debugflag,
      );

      my %config_hash;
@@ -608,14 +652,32 @@
          }
          if ( defined $allfns{$arg}{options_i} ) {
              foreach my $o ( keys( %{ $allfns{$arg}{options_i} } ) ) {
-                $conf{$o} = $allfns{$arg}{options_i}{$o};
-                $ic_names{$o}++;
+                $conf{mode_options}{$arg}{$o} =  
$allfns{$arg}{options_i}{$o};
+                $conf{mode_options_reverse}{$o}{$arg} = 1;
              }
          }
      }

      GetOptions(%optionhash) or exit(1);

+    if ( defined $debugflag ) {
+        foreach my $f ( split( ",", $debugflag ) ) {
+            my ( $name, $v ) = split( "=", $f );
+            if ( exists $debugModes{$name} ) {
+                $debugModes{$name} = defined($v) ? $v : "basic";
+            } else {
+                printf("Attempt to set unknown debug flag \"$name\".\n");
+            }
+        }
+        if ( $debugModes{all} ) {
+            foreach my $mode ( keys(%debugModes) ) {
+                $debugModes{$mode} = $debugModes{all};
+            }
+        }
+    }
+
+    my $mode;
+
      foreach my $arg ( keys %config_hash ) {
          next unless defined $config_hash{$arg};
          $mode = $arg;
@@ -623,7 +685,7 @@
      }

      # Put the args in a hash so that they can be referenced by name.
-    if ( defined $allfns{$mode}{secondary} ) {
+    if ( defined $mode and defined $allfns{$mode}{secondary} ) {
          foreach my $sec ( @{ $allfns{$mode}{secondary} } ) {
              $secondary_args{ $sec->{arg_long} } = $sec->{value};
          }
@@ -1417,6 +1479,9 @@
          print("QsNet Statistics not valid\n");
          return;
      }
+
+    my $stats_total = 0;
+    my $group       = 0;

      if ($stats_total) {

@@ -1500,11 +1565,11 @@

      my %ad;

-    my @tg;
+    my %tg;

      if ( $#target_groups != -1 ) {
          foreach my $gid (@target_groups) {
-            $tg[$gid]++;
+            $tg{$gid}++;
          }
      }

@@ -1515,7 +1580,7 @@
          foreach my $gid ( keys %{ $dataset->{'subsystems'}{'Group'} } ) {

              if ( $#target_groups != -1 ) {
-                next unless defined $tg[$gid];
+                next unless defined $tg{$gid};
              }

              my $str;
@@ -1566,7 +1631,7 @@
      foreach my $gid ( sort { $a <=> $b } keys %ad ) {

          if ( $#target_groups != -1 ) {
-            next unless defined $tg[$gid];
+            next unless defined $tg{$gid};
          }

          my $gstr = "Information for group '$gid'\n";
@@ -1734,7 +1799,7 @@

      my $s = read_stats(@data);

-    $stats_total = 1;
+    # $stats_total = 1;

      show_stats($s);
  }
@@ -1792,8 +1857,7 @@
      my $prev;
      my $tag = $tags[0];

-    printf("called tag:$tag, level:$level tags:@tags\n")
-      if $conf{"tree-verbose"};
+    debug_log( "tree", undef, "called tag:$tag, level:$level tags:@tags" );

      return if ( !defined($tag) );
      return if ( !defined( $lines->{$tag} ) );
@@ -1852,9 +1916,11 @@
          $endlevel = ( $#{ $lines->{$tag} } );
      }

-    printf(
-"level $level, endlevel $endlevel, identical:@identical  
different:@different\n",
-    ) if $conf{"tree-verbose"};
+    debug_log(
+        "tree", undef,
+
+"level $level, endlevel $endlevel, identical:@identical  
different:@different"
+    );

      for ( my $l = $level ; $l <= $endlevel ; $l++ ) {

@@ -1882,9 +1948,11 @@
          $prev->{children} = go_p( $endlevel + 1, $lines, @identical );
      }

-    printf(
-"returning level:$level endlevel:$endlevel identical:@identical  
different:@different\n"
-    ) if $conf{"tree-verbose"};
+    debug_log(
+        "tree", undef,
+
+"returning level:$level endlevel:$endlevel identical:@identical  
different:@different"
+    );

      if (@different) {
          my $new = go_p( $level, $lines, @different );
@@ -1926,6 +1994,7 @@

  sub show_tree {
      my $ref = shift;
+    debug_log( "tree", $ref, "Complete tree" );
      return _show_tree( $ref, undef, "" );
  }

@@ -2007,7 +2076,7 @@
  }

  sub rms_is_installed {
-    return find_exe("prun");
+    return ( find_exe("prun") and find_exe("rmsquery") );
  }

  sub rms_get_jobs {
@@ -2114,7 +2183,7 @@
   
###############################################################################

  sub slurm_is_installed {
-    return find_exe("srun");
+    return ( find_exe("srun") and find_exe("squeue") and  
find_exe("scontrol") );
  }

  sub slurm_get_jobs {
@@ -2124,15 +2193,20 @@
      return @res;
  }

+# Query the process count for the "step" as that's how many
+# processes we are going to be looking for.
  sub slurm_job_to_ncpus {
      my $job   = shift;
-    my @steps = `squeue -s -o "%i %A" 2>/dev/null`;
+    my $s     = "$job." . $conf{"slurm-job-step"};
+    my @steps = `squeue -s $s -o "%i %A" 2>/dev/null`;
      return undef if ( $? != 0 );

-# The %A option is new so ensure we have the TASKS output before we  
believe what we see here...
+    # The %A option is new so ensure we have the TASKS output
+    # before we believe what we see here...
+    # Mind you %A is several years old now so if it's not there
+    # we probably can't do anything anyway.
      my $tasks;
      my $have_tasks = 0;
-    my $s          = "$job." . $conf{"slurm-job-step"};
      foreach my $step (@steps) {
          my ( $step, $cpus ) = split( " ", $step );
          $tasks      = $cpus if ( $step eq $s );
@@ -2141,6 +2215,37 @@
      return $tasks if $have_tasks;
      return undef;
  }
+
+# Query the nodecount for the "job" as that is what we shall be running on.
+sub slurm_job_to_nodecount {
+    my $job  = shift;
+    my @jobs = `squeue -o "%i %D" 2>/dev/null`;
+    return undef if ( $? != 0 );
+
+    foreach my $step (@jobs) {
+        my ( $left, $right ) = split( " ", $step );
+        return $right if ( $left eq $job );
+    }
+    return undef;
+}
+
+# Query the node list for the "step" which isn't the same as the node list
+# for the job, care should be taken if using this function to ensure this
+# is correct.
+# This functions isn't used currently.
+sub slurm_job_to_nodelist {
+    my $job   = shift;
+    my $s     = "$job." . $conf{"slurm-job-step"};
+    my @steps = `squeue -s $s -o "%i %N" 2>/dev/null`;
+    return undef if ( $? != 0 );
+
+    foreach my $step (@steps) {
+        my ( $left, $right ) = split( " ", $step );
+        return $right if ( $left eq $s );
+
+    }
+    return undef;
+}

  sub slurm_job_is_running {
      my $job    = shift;
@@ -2152,7 +2257,8 @@
  sub slurm_setup_pcmd {
      my $job  = shift;
      my $cpus = slurm_job_to_ncpus($job);
-    return ( "srun --jobid=$job", $cpus );
+    my $nc   = slurm_job_to_nodecount($job);
+    return ( "srun --jobid=$job", $cpus, $nc );
  }

   
###############################################################################
@@ -2242,7 +2348,7 @@
  }

  sub local_setup_pcmd {
-    return ( "", undef );
+    return ( "", 1, 1 );
  }

   
###############################################################################
@@ -2252,7 +2358,7 @@
   
###############################################################################

  sub mpd_is_installed {
-    return find_exe("mpdlistjobs");
+    return ( find_exe("mpdlistjobs") and find_exe("mpdrun") );
  }

  sub mpd_get_data {
@@ -2318,9 +2424,11 @@

      $mpd_dfile = $fn;

-    my $cmd = "mpirun -machinefile $fn -np $i";
-
-    return ( $cmd, undef );
+    my $cmd = "mpdrun -machinefile $fn -np $i";
+
+    my $hosts = $#hosts + 1;
+
+    return ( $cmd, undef, $hosts );
  }

  sub mpd_cleanup_pcmd {
@@ -2346,31 +2454,24 @@
  }

  sub open_is_installed {
-    return find_exe("ompi-ps");
+    return ( find_exe("ompi-ps") and find_exe("orterun") );
  }

  my %open_jobs;

  sub open_get_data {
-    my ($filename) = @_;

      # Simply return if called more than once.
      if ( keys(%open_jobs) != 0 ) {
          return;

      }
-    my $hostname = hostname();
+
      my $job;
-    my @out;
-    if ( defined $filename ) {
-        open( OPEN, $filename ) or return;
-        @out = <OPEN>;
-        close OPEN;
-    } else {
-        open( OPEN, "ompi-ps|" ) or return;
-        @out = <OPEN>;
-        close OPEN;
-    }
+
+    open( OPEN, "ompi-ps|" ) or return;
+    my @out = <OPEN>;
+    close OPEN;

      # Handle being called multiple times, zero the hash every
      # time we are called.  Of course we could just return the
@@ -2387,7 +2488,11 @@
          } else {
              my @elems = split( /\|/, $l );

-            if ( $#elems == 6 ) {
+            if ( $#elems == 4 ) {
+                my $nprocs = $elems[3];
+                $nprocs =~ s/ //g;
+                $open_jobs{$job}{nprocs} = $nprocs;
+            } elsif ( $#elems == 6 ) {

                  my $host = $elems[4];
                  $host =~ s/ //g;
@@ -2395,24 +2500,22 @@
                  next if $host eq "Node";
                  $open_jobs{$job}{hosts}{$host}++;

-                if ( $host eq $hostname ) {
-                    my $name = $elems[1];
-                    $name =~ /\[\[(\d+)\,(\d+)\]\,(\d+)\]/;
-                    my $rank = $3;
-
-                    my $pid = $elems[3];
-                    $rank =~ s/ //g;
-                    $pid  =~ s/ //g;
-                    $open_jobs{$job}{ranks}{$host}{$rank} = $pid;
-                }
+                my $name = $elems[1];
+                $name =~ /\[\[(\d+)\,(\d+)\]\,(\d+)\]/;
+                my $rank = $3;
+
+                my $pid = $elems[3];
+                $rank =~ s/ //g;
+                $pid  =~ s/ //g;
+                $open_jobs{$job}{ranks}{$host}{$rank} = $pid;
              }
          }

      }

-    if ( $conf{"verbose"} ) {
-        print Dumper \%open_jobs;
-    }
+    #if ( $conf{"verbose"} ) {
+    #print Dumper \%open_jobs;
+    #}
  }

  sub open_get_jobs {
@@ -2423,24 +2526,12 @@
  }

  my $open_dfile;
-my $open_tfile;

  sub open_setup_pcmd {
      my $job = shift;

      open_get_data();

-    my ( $th, $tn ) = tempfile(".padb.XXXX");
-
-    open( my $oh, "ompi-ps|" );
-    while (<$oh>) {
-        print $th $_;
-    }
-    close $th;
-    $cinner{"open-ps"} = $tn;
-
-    $open_tfile = $tn;
-
      my @hosts = keys %{ $open_jobs{$job}{hosts} };
      my $i     = @hosts;

@@ -2454,14 +2545,14 @@
      $open_dfile = $fn;

      my $prefix = find_ompi_prefix();
-    my $cmd    = "mpirun -machinefile $fn -np $i $prefix";
-
-    return ( $cmd, undef );
+    my $cmd    = "orterun -machinefile $fn -np $i $prefix";
+    my $hosts  = $#hosts + 1;
+
+    return ( $cmd, $open_jobs{$job}{nprocs}, $hosts );
  }

  sub open_cleanup_pcmd {
      unlink($open_dfile) if ( defined($open_dfile) );
-    unlink($open_tfile) if ( defined($open_tfile) );
  }

   
###############################################################################
@@ -2474,9 +2565,7 @@

      # Check for both LSF and RMS, I know LSF works in other ways but I  
don't
      # know how to launch jobs then...
-    my $rms = find_exe("rinfo");
-    return 0 unless $rms;
-    return find_exe("bjobs");
+    return ( find_exe("bjobs") and rms_is_installed() );
  }

  sub lsf_get_jobs {
@@ -2553,9 +2642,16 @@
  sub find_rmgr {

  # If it's been set on the command line and it's valid then just use what  
we are given.
-# Don't do any sanity checks here to cope with non-default installs.
+# Do sanity checks here but only warn on the result to cope with  
non-default installs.

      if ( defined $rmgr{ $conf{"rmgr"} } ) {
+        if ( defined $rmgr{ $conf{"rmgr"} }{is_installed}
+            and not $rmgr{ $conf{"rmgr"} }{is_installed}() )
+        {
+            printf(
+"Warning: Selected resource manager $conf{rmgr} does not appear to be  
installed\n"
+            );
+        }
          setup_rmgr( $conf{"rmgr"} );
          return;
      }
@@ -2593,9 +2689,16 @@
  sub find_any_rmgr {

  # If it's been set on the command line and it's valid then just use what  
we are given.
-# Don't do any sanity checks here to cope with non-default installs.
+# Do sanity checks here but only warn on the result to cope with  
non-default installs.

      if ( defined $rmgr{ $conf{"rmgr"} } ) {
+        if ( defined $rmgr{ $conf{"rmgr"} }{is_installed}
+            and not $rmgr{ $conf{"rmgr"} }{is_installed}() )
+        {
+            printf(
+"Warning: Selected resource manager $conf{rmgr} does not appear to be  
installed\n"
+            );
+        }
          setup_rmgr( $conf{"rmgr"} );
          return;
      }
@@ -2658,6 +2761,7 @@

  sub get_all_jobids {
      my $user = shift;
+    debug_log( "rmgr", undef, "Loading active jobs list", undef );
      return $rmgr{ $conf{"rmgr"} }{get_active_jobs}($user);
  }

@@ -2703,13 +2807,15 @@
   
###############################################################################

  sub strip_stack_traces {
-    my $lines = shift;
+    my ( $cargs, $lines ) = @_;

      my %above;
      my %below;

-    map { $above{$_}++ } split( ",", $conf{"stack-strip-above"} );
-    map { $below{$_}++ } split( ",", $conf{"stack-strip-below"} );
+    map { $above{$_}++ }
+      split( ",", $conf{mode_options}{stack}{"stack-strip-above"} );
+    map { $below{$_}++ }
+      split( ",", $conf{mode_options}{stack}{"stack-strip-below"} );

      foreach my $tag ( keys %$lines ) {

@@ -2734,10 +2840,11 @@
          $main_idx = 0 if not defined $main_idx;
          if ( $main_idx != 0 or $wait_idx != 0 ) {
              my $end =
-              ( $strip_above_wait and $wait_idx )
+              ( $cargs->{"strip-above-wait"} and $wait_idx )
                ? $wait_idx
                : $#{ $lines->{$tag} };
-            my $start = ( $strip_below_main and $main_idx ) ? $main_idx :  
0;
+            my $start =
+              ( $cargs->{"strip-below-main"} and $main_idx ) ? $main_idx :  
0;

              printf( "Stripping 0.."
                    . $#{ $lines->{$tag} }
@@ -2752,15 +2859,18 @@
  }

  sub sort_proc_hashes {
-    my $key = shift;
-    my @all = @_;
-
-    #print Dumper $all;
-    return ( reverse( sort { $a->{$key} <=> $b->{$key} } @all ) );
+    my $carg = shift;
+    my $key  = shift;
+    my @all  = @_;
+
+    if ( $carg->{"reverse-sort-order"} ) {
+        return ( reverse( sort { $a->{$key} <=> $b->{$key} } @all ) );
+    } else {
+        return ( sort { $a->{$key} <=> $b->{$key} } @all );
+    }
  }

  sub pre_mpi_watch {
-    my ($cpus) = @_;
      my $header = <<EOF;
  u: unexpected messages U: unexpected and other messages
  s: sending messages r: receiving messages m: sending and receiving
@@ -2769,9 +2879,6 @@
  .: consuming CPU cycles ,: using CPU but no queue data -: sleeping *: error
  EOF
      printf($header);
-    my %data;
-    $data{cpus} = $cpus;
-    return \%data;
  }

  sub show_mpi_watch {
@@ -2779,8 +2886,9 @@

      #     print Dumper $lines;
      my $s = "";
-    foreach my $l ( sort { $a <=> $b } ( keys %{ $lines->{raw} } ) ) {
-        $s .= $lines->{raw}{$l}{state};
+    foreach my $l ( sort { $a <=> $b } ( keys %{ $lines->{target_responce}  
} ) )
+    {
+        $s .= $lines->{target_responce}{$l}{state};
      }
      print("$s\n");
  }
@@ -2788,20 +2896,21 @@
  # Nicely format process information.
  # XXX: proc-sort-key should probably sort on column headers as
  # well as keys.
+# Idealy we'd know what format we wanted and only ask the nodes
+# to report relevent info, for now they still report everything.
  sub show_proc_format {
-    my ( $nlines, $mode, $handle ) = @_;
-
-    my $lines = $nlines->{lines};
+    my ( $carg, $nlines ) = @_;

      my @proc_format_array;
      my %proc_format_header;
      my $show_fields = 0;

      my %proc_format_lengths;
-
-    my $separator = $conf{"column-seperator"};
-
-    my @columns = split( ",", $proc_format );
+    my %proc_header_reverse;
+
+    my $separator = $carg->{"column-seperator"};
+
+    my @columns = split( ",", $carg->{"proc-format"} );
      foreach my $column (@columns) {

          $show_fields = 1 if ( $column eq "fields" );
@@ -2811,6 +2920,7 @@
              push @proc_format_array, lc($name);
              $proc_format_header{ lc($name) }  = $desc;
              $proc_format_lengths{ lc($name) } = length($desc);
+            $proc_header_reverse{ lc($desc) } = lc($name);
          } else {
              push @proc_format_array, lc($column);
              $proc_format_header{ lc($column) }  = $column;
@@ -2819,21 +2929,21 @@
      }

      my @all;
+    my $lines = $nlines->{target_responce};
      foreach my $tag ( sort ( keys %$lines ) ) {
          my %hash;
          $hash{vp} = $tag;
-        foreach my $data ( @{ $lines->{$tag} } ) {
-            if ( $data =~ /([\w\.]+)\:[ \t]*(.+)/ ) {
-                my $key = lc($1);
-
-                next unless defined $proc_format_lengths{$key} or  
$show_fields;
-
-                if ( length($2) > $proc_format_lengths{$key} ) {
-                    $proc_format_lengths{$key} = length($2);
-                }
-
-                $hash{$key} = $2;
-            }
+        foreach my $key ( keys( %{ $lines->{$tag} } ) ) {
+
+            my $value = $lines->{$tag}{$key};
+            next unless defined $proc_format_lengths{$key} or $show_fields;
+
+            if ( length($value) > $proc_format_lengths{$key} ) {
+                $proc_format_lengths{$key} = length($value);
+            }
+
+            $hash{$key} = $value;
+
          }
          if ($show_fields) {
              my @fields = sort ( keys(%hash) );
@@ -2843,9 +2953,14 @@
          push @all, \%hash;
      }

-    @all = sort_proc_hashes( $conf{"proc-sort-key"}, @all );
***The diff for this file has been truncated for email.***




More information about the padb-devel mailing list