[padb-devel] [padb] r125 committed - Merged full-duplex changes r55:r124 into the trunk
codesite-noreply at google.com
codesite-noreply at google.com
Wed Aug 19 22:56:28 BST 2009
Revision: 125
Author: apittman
Date: Wed Aug 19 14:56:10 2009
Log: Merged full-duplex changes r55:r124 into the trunk
http://code.google.com/p/padb/source/detail?r=125
Modified:
/trunk/src
/trunk/src/padb
=======================================
--- /trunk/src/padb Thu Jun 18 01:29:09 2009
+++ /trunk/src/padb Wed Aug 19 14:56:10 2009
@@ -1,6 +1,6 @@
-#!/usr/bin/perl
-
-# padb. a simple parallel debugging aid from Quadrics.
+#!/usr/bin/perl -w
+
+# padb. a simple parallel debugging aid.
# For help and support visit http://padb.pittman.org.uk
# or email padb-users at pittman.org.uk
@@ -23,6 +23,19 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
# Revision history
+# Version 3.0
+# * Full-duplex communication between inner and outer processes, padb
+# no longer simply sends request on the command line and processes
+# the response but is truly interactive between the inner and outer
+# processes. This avails greater flexibility in what can be achieved
+# and hopefully helps with scalability as well.
+# * Enabled warnings (-w) by default. Fixed lots of warnings, mostly
+# about comparing with undef
+# * Much more complete separation into "modes" of operation, most options
+# are now mode specific rather than simply using global variables or
+# global conf options.
+# * Overhaul of the allfns (mode) callbacks and in particular their
parameters
+# * Performance improvements.
#
# Version 2.5
# * First Non-Quadrics version
@@ -205,6 +218,8 @@
use File::Temp qw(tempfile);
use MIME::Base64;
use Config;
+use IO::Socket;
+use IO::Select;
###############################################################################
#
@@ -237,10 +252,12 @@
# Main.
my $prog = basename $0;
-my $version = "2.5";
+my $version = "3.0-beta";
my %conf;
+my $secret;
+
# Config options the inner knows about, only forward options if they are
in this list.
my @inner_conf = qw(edb edbopt minfo rmgr scripts slurm-job-step verbose);
@@ -278,11 +295,12 @@
};
$rmgr{"mpd"} = {
- 'is_installed' => \&mpd_is_installed,
- 'get_active_jobs' => \&mpd_get_jobs,
- 'setup_pcmd' => \&mpd_setup_pcmd,
- 'cleanup_pcmd' => \&mpd_cleanup_pcmd,
- 'find_pids' => \&mpd_find_pids,
+ 'is_installed' => \&mpd_is_installed,
+ 'get_active_jobs' => \&mpd_get_jobs,
+ 'setup_pcmd' => \&mpd_setup_pcmd,
+ 'cleanup_pcmd' => \&mpd_cleanup_pcmd,
+ 'find_pids' => \&mpd_find_pids,
+ 'require_inner_callback' => 1,
};
$rmgr{"orte"} = {
@@ -301,33 +319,37 @@
};
$rmgr{"slurm"} = {
- 'is_installed' => \&slurm_is_installed,
- 'get_active_jobs' => \&slurm_get_jobs,
- 'job_is_running' => \&slurm_job_is_running,
- 'setup_pcmd' => \&slurm_setup_pcmd,
- 'find_pids' => \&slurm_find_pids,
+ 'is_installed' => \&slurm_is_installed,
+ 'get_active_jobs' => \&slurm_get_jobs,
+ 'job_is_running' => \&slurm_job_is_running,
+ 'setup_pcmd' => \&slurm_setup_pcmd,
+ 'find_pids' => \&slurm_find_pids,
+ 'require_inner_callback' => 1,
};
$rmgr{"local"} = {
- 'get_active_jobs' => \&local_get_jobs,
- 'job_is_running' => \&local_job_is_running,
- 'setup_pcmd' => \&local_setup_pcmd,
- 'find_pids' => \&local_find_pids,
+ 'get_active_jobs' => \&local_get_jobs,
+ 'job_is_running' => \&local_job_is_running,
+ 'setup_pcmd' => \&local_setup_pcmd,
+ 'find_pids' => \&local_find_pids,
+ 'require_inner_callback' => 1,
};
$rmgr{"local-qsnet"} = {
- 'is_installed' => \&local_q_is_installed,
- 'get_active_jobs' => \&local_q_get_jobs,
- 'job_is_running' => \&local_job_is_running,
- 'setup_pcmd' => \&local_setup_pcmd,
- 'inner_rmgr' => "local",
+ 'is_installed' => \&local_q_is_installed,
+ 'get_active_jobs' => \&local_q_get_jobs,
+ 'job_is_running' => \&local_job_is_running,
+ 'setup_pcmd' => \&local_setup_pcmd,
+ 'inner_rmgr' => "local",
+ 'require_inner_callback' => 1,
};
$rmgr{"local-fd"} = {
- 'get_active_jobs' => \&local_fd_get_jobs,
- 'job_is_running' => \&local_job_is_running,
- 'setup_pcmd' => \&local_setup_pcmd,
- 'inner_rmgr' => "local",
+ 'get_active_jobs' => \&local_fd_get_jobs,
+ 'job_is_running' => \&local_job_is_running,
+ 'setup_pcmd' => \&local_setup_pcmd,
+ 'inner_rmgr' => "local",
+ 'require_inner_callback' => 1,
};
###############################################################################
@@ -344,50 +366,42 @@
my %allfns;
-my $line_formatted = 0;
-my $countoutput = 0;
-
my %cinner; # Config options to be passed to inner.
my $rem_jobid;
# Debug options.
-$conf{"verbose"} = 0;
-$conf{"tree-verbose"} = 0;
-$conf{"dump-raw"} = 0;
-$conf{"showcmd"} = 0;
+$conf{"verbose"} = 0;
+$conf{"dump-raw"} = 0;
+
+# Valid values are "none" "missing" or "all". Anything not recognised
+# is treated as "all".
+$conf{"check-signon"} = "all";
$conf{slurm_job_step} = "0";
# Output options.
-$conf{"stats-sort-key"} = "vp";
-$conf{"proc-sort-key"} = "vp";
-$conf{"proc-show-header"} = 1;
-$conf{"stats-reverse"} = 0;
-$conf{"stats-short"} = 0;
-$conf{"show-group-members"} = 0;
-$conf{"show-all-stats"} = 0;
-$conf{"show-all-groups"} = 0;
$conf{"interval"} = 10;
$conf{"watch-clears-screen"} = 1;
-$conf{"stats-name"} = undef;
-$conf{"stats-raw"} = 0;
$conf{"scripts"} = "bash,sh,dash,ash,perl,xterm";
-$conf{"stack-strip-below"} = "main";
$conf{"lsf-job-offset"} = 1;
$conf{"local-fd-name"} = "/dev/null";
-$conf{"stack-strip-above"} =
- "elan_waitWord,elan_pollWord,elan_deviceCheck,opal_condition_wait";
-
-# $conf{stack-format} = undef;
+$conf{"inner-callback"} = 0;
+
+# These two are used by deadlock and QsNet group
+# code, they need migrating in the group code
+# when I have access to a test system again.
+#$conf{"show-group-members"} = 0;
+#$conf{"show-all-groups"} = 0;
# Tuning options.
$conf{"prun-timeout"} = 120;
$conf{"prun-exittimeout"} = 120;
$conf{"rmgr"} = "auto";
+$conf{"slurm-job-step"} = 0;
+
# These settings are passed onto inner only.
-$conf{"edbopt"} = "";
-$conf{"mpi-dll"} = "auto";
+$conf{"edbopt"} = "";
$conf{"edb"} = find_edb();
$conf{"minfo"} = find_minfo();
@@ -419,8 +433,7 @@
sub show_version {
printf("$prog version $version\n\n");
printf("Written by Ashley Pittman\n");
-
- #ashley at quadrics.com
+ printf("http://padb.pittman.org.uk\n");
exit 0;
}
@@ -431,15 +444,13 @@
-a --all report on all running jobs for user.
-A --any report on a running job for user.
--u --user=USER report on jobs for username=<user>.
-
--r --rank=RANK report only on processes <RANK>.
- --group-id=ID report only on group <ID>.
-
--s --statistics Show the job-wide statistics.
--g --group Show the state of collective operations (groups).
+-u --user=<USER> report on jobs for username=<user>.
+
+-r --rank=<RANK> report only on processes <RANK>.
+ --group-id=<ID> report only on group <ID>.
+
XXXX
- --full-report=JOBID All of the above.
+ --full-report=<JOBID> Generate a full report of job state.
--nostrip-below-main Don\'t strip stack traces below main.
--nostrip-above-wait Don\'t strip stack traces about elan_waitWord.
@@ -468,7 +479,6 @@
prun-timeout Timeout to use when launching parallel job.
Stack trace options:
- tree-verbose turn on debugging for the stack trace tree
generation code.
gdb-retry-count Number of times to try getting a 'good' stack trace
from gdb.
stack-show-params Show function parameters in stack traces.
stack-show-locals Show locals in stack traces.
@@ -495,8 +505,10 @@
chomp $usage;
my $extra = "";
+ $extra .= "Modes of operation\n";
foreach my $arg ( sort( keys %allfns ) ) {
next unless ( defined $allfns{$arg}{help} );
+ next if ( defined $allfns{$arg}{qsnet} );
if ( defined $allfns{$arg}{arg_short} ) {
$extra .= "-$allfns{$arg}{arg_short}";
} else {
@@ -506,6 +518,20 @@
$allfns{$arg}{arg_long},
$allfns{$arg}{help} );
}
+
+ $extra .= "\nQsNet specific modes\n";
+ foreach my $arg ( sort( keys %allfns ) ) {
+ next unless ( defined $allfns{$arg}{help} );
+ next unless ( defined $allfns{$arg}{qsnet} );
+ if ( defined $allfns{$arg}{arg_short} ) {
+ $extra .= "-$allfns{$arg}{arg_short}";
+ } else {
+ $extra .= " ";
+ }
+ $extra .= sprintf( " --%-18s%s.\n",
+ $allfns{$arg}{arg_long},
+ $allfns{$arg}{help} );
+ }
$usage =~ s!XXXX!$extra!;
@@ -530,9 +556,6 @@
# Number of functions provided on the command line from the allfns hash.
my $have_allfns_option = 0;
-my $stats_total;
-my $group;
-
my $full_report;
my $core_stack;
my $list_rmgrs;
@@ -543,17 +566,11 @@
my $core_name;
my $exe_name;
-my $proc_format =
- "vp=vpid,hostname,pid,vmsize,vmrss,stat.state=S,pcpu=%cpu,name=command";
-
my $input_file;
my $compress;
my $compress_C;
my $tree;
-my $strip_below_main = 1;
-my $strip_above_wait = 1;
-
my @config_options;
my %ic_names;
@@ -561,40 +578,67 @@
# can access secondary comamnd line argunments by name.
my %secondary_args;
+# Debugging: this function is called periodically with
+# a mode, an abritary ref and a string, it can either print simply
+# the string or call dumper on the ref as well.
+# Enable with --debug=type1,type2=all
+my %debugModes;
+my $start_time = time();
+
+sub debug_log {
+ my ( $type, $handle, $str, @params ) = @_;
+ if ( not exists $debugModes{$type} ) {
+ printf("Unknown debug mode: $type\n");
+ exit(1);
+ }
+ return unless $debugModes{$type};
+ my $time = time() - $start_time;
+ printf( "DEBUG ($type): %3d: $str\n", $time, @params );
+ return if $debugModes{$type} eq "basic";
+ return unless defined $handle;
+ print Dumper $handle;
+}
+
+# Valid debug modes, a full list is maintained here so using unexpected
+# ones can generate warnings.
+$debugModes{"full-duplex"} = undef;
+$debugModes{"show-cmd"} = undef;
+$debugModes{"all"} = undef;
+$debugModes{"tree"} = undef;
+$debugModes{"verbose"} = undef;
+$debugModes{"signon"} = undef;
+$debugModes{"rmgr"} = undef;
+
sub parse_args_outer {
Getopt::Long::Configure("bundling");
- my $mode;
+ my $debugflag;
my %optionhash = (
- "verbose|v+" => \$conf{verbose},
- "user|u=s" => \$user,
- "rank|r=i" => \@ranks,
- "group-id=i" => \@target_groups,
- "help|h" => \&usage,
- "all|a" => \$all,
- "any|A" => \$any,
- "statistics-total|stat|sta|st|s" => \$stats_total,
- "version|V" => \&show_version,
- "compress|c" => \$compress,
- "compress-long|C" => \$compress_C,
- "group|g" => \$group,
- "tree|t" => \$tree,
- "input-file|file|i=s" => \$input_file,
- "config-option|O=s" => \@config_options,
- "full-report=s" => \$full_report,
- "core-stack" => \$core_stack,
- "core=s" => \$core_name,
- "exe=s" => \$exe_name,
- "list-rmgrs" => \$list_rmgrs,
- "strip-below-main!" => \$strip_below_main,
- "strip-above-wait!" => \$strip_above_wait,
- "watch!" => \$watch,
- "local-stats" => \$local_stats,
- "proc-format=s" => \$proc_format,
- "show-jobs" => \$show_jobs,
- "norc" => \$norc,
- "config-file=s" => \$configfile
+ "verbose|v+" => \$conf{verbose},
+ "user|u=s" => \$user,
+ "rank|r=i" => \@ranks,
+ "group-id=s" => \@target_groups,
+ "help|h" => \&usage,
+ "all|a" => \$all,
+ "any|A" => \$any,
+ "version|V" => \&show_version,
+ "compress|c" => \$compress,
+ "compress-long|C" => \$compress_C,
+ "tree|t" => \$tree,
+ "input-file|file|i=s" => \$input_file,
+ "config-option|O=s" => \@config_options,
+ "full-report=s" => \$full_report,
+ "core-stack" => \$core_stack,
+ "core=s" => \$core_name,
+ "exe=s" => \$exe_name,
+ "list-rmgrs" => \$list_rmgrs,
+ "watch" => \$watch,
+ "local-stats" => \$local_stats,
+ "show-jobs" => \$show_jobs,
+ "norc" => \$norc,
+ "config-file=s" => \$configfile,
+ "debug=s" => \$debugflag,
);
my %config_hash;
@@ -608,14 +652,32 @@
}
if ( defined $allfns{$arg}{options_i} ) {
foreach my $o ( keys( %{ $allfns{$arg}{options_i} } ) ) {
- $conf{$o} = $allfns{$arg}{options_i}{$o};
- $ic_names{$o}++;
+ $conf{mode_options}{$arg}{$o} =
$allfns{$arg}{options_i}{$o};
+ $conf{mode_options_reverse}{$o}{$arg} = 1;
}
}
}
GetOptions(%optionhash) or exit(1);
+ if ( defined $debugflag ) {
+ foreach my $f ( split( ",", $debugflag ) ) {
+ my ( $name, $v ) = split( "=", $f );
+ if ( exists $debugModes{$name} ) {
+ $debugModes{$name} = defined($v) ? $v : "basic";
+ } else {
+ printf("Attempt to set unknown debug flag \"$name\".\n");
+ }
+ }
+ if ( $debugModes{all} ) {
+ foreach my $mode ( keys(%debugModes) ) {
+ $debugModes{$mode} = $debugModes{all};
+ }
+ }
+ }
+
+ my $mode;
+
foreach my $arg ( keys %config_hash ) {
next unless defined $config_hash{$arg};
$mode = $arg;
@@ -623,7 +685,7 @@
}
# Put the args in a hash so that they can be referenced by name.
- if ( defined $allfns{$mode}{secondary} ) {
+ if ( defined $mode and defined $allfns{$mode}{secondary} ) {
foreach my $sec ( @{ $allfns{$mode}{secondary} } ) {
$secondary_args{ $sec->{arg_long} } = $sec->{value};
}
@@ -1417,6 +1479,9 @@
print("QsNet Statistics not valid\n");
return;
}
+
+ my $stats_total = 0;
+ my $group = 0;
if ($stats_total) {
@@ -1500,11 +1565,11 @@
my %ad;
- my @tg;
+ my %tg;
if ( $#target_groups != -1 ) {
foreach my $gid (@target_groups) {
- $tg[$gid]++;
+ $tg{$gid}++;
}
}
@@ -1515,7 +1580,7 @@
foreach my $gid ( keys %{ $dataset->{'subsystems'}{'Group'} } ) {
if ( $#target_groups != -1 ) {
- next unless defined $tg[$gid];
+ next unless defined $tg{$gid};
}
my $str;
@@ -1566,7 +1631,7 @@
foreach my $gid ( sort { $a <=> $b } keys %ad ) {
if ( $#target_groups != -1 ) {
- next unless defined $tg[$gid];
+ next unless defined $tg{$gid};
}
my $gstr = "Information for group '$gid'\n";
@@ -1734,7 +1799,7 @@
my $s = read_stats(@data);
- $stats_total = 1;
+ # $stats_total = 1;
show_stats($s);
}
@@ -1792,8 +1857,7 @@
my $prev;
my $tag = $tags[0];
- printf("called tag:$tag, level:$level tags:@tags\n")
- if $conf{"tree-verbose"};
+ debug_log( "tree", undef, "called tag:$tag, level:$level tags:@tags" );
return if ( !defined($tag) );
return if ( !defined( $lines->{$tag} ) );
@@ -1852,9 +1916,11 @@
$endlevel = ( $#{ $lines->{$tag} } );
}
- printf(
-"level $level, endlevel $endlevel, identical:@identical
different:@different\n",
- ) if $conf{"tree-verbose"};
+ debug_log(
+ "tree", undef,
+
+"level $level, endlevel $endlevel, identical:@identical
different:@different"
+ );
for ( my $l = $level ; $l <= $endlevel ; $l++ ) {
@@ -1882,9 +1948,11 @@
$prev->{children} = go_p( $endlevel + 1, $lines, @identical );
}
- printf(
-"returning level:$level endlevel:$endlevel identical:@identical
different:@different\n"
- ) if $conf{"tree-verbose"};
+ debug_log(
+ "tree", undef,
+
+"returning level:$level endlevel:$endlevel identical:@identical
different:@different"
+ );
if (@different) {
my $new = go_p( $level, $lines, @different );
@@ -1926,6 +1994,7 @@
sub show_tree {
my $ref = shift;
+ debug_log( "tree", $ref, "Complete tree" );
return _show_tree( $ref, undef, "" );
}
@@ -2007,7 +2076,7 @@
}
sub rms_is_installed {
- return find_exe("prun");
+ return ( find_exe("prun") and find_exe("rmsquery") );
}
sub rms_get_jobs {
@@ -2114,7 +2183,7 @@
###############################################################################
sub slurm_is_installed {
- return find_exe("srun");
+ return ( find_exe("srun") and find_exe("squeue") and
find_exe("scontrol") );
}
sub slurm_get_jobs {
@@ -2124,15 +2193,20 @@
return @res;
}
+# Query the process count for the "step" as that's how many
+# processes we are going to be looking for.
sub slurm_job_to_ncpus {
my $job = shift;
- my @steps = `squeue -s -o "%i %A" 2>/dev/null`;
+ my $s = "$job." . $conf{"slurm-job-step"};
+ my @steps = `squeue -s $s -o "%i %A" 2>/dev/null`;
return undef if ( $? != 0 );
-# The %A option is new so ensure we have the TASKS output before we
believe what we see here...
+ # The %A option is new so ensure we have the TASKS output
+ # before we believe what we see here...
+ # Mind you %A is several years old now so if it's not there
+ # we probably can't do anything anyway.
my $tasks;
my $have_tasks = 0;
- my $s = "$job." . $conf{"slurm-job-step"};
foreach my $step (@steps) {
my ( $step, $cpus ) = split( " ", $step );
$tasks = $cpus if ( $step eq $s );
@@ -2141,6 +2215,37 @@
return $tasks if $have_tasks;
return undef;
}
+
+# Query the nodecount for the "job" as that is what we shall be running on.
+sub slurm_job_to_nodecount {
+ my $job = shift;
+ my @jobs = `squeue -o "%i %D" 2>/dev/null`;
+ return undef if ( $? != 0 );
+
+ foreach my $step (@jobs) {
+ my ( $left, $right ) = split( " ", $step );
+ return $right if ( $left eq $job );
+ }
+ return undef;
+}
+
+# Query the node list for the "step" which isn't the same as the node list
+# for the job, care should be taken if using this function to ensure this
+# is correct.
+# This functions isn't used currently.
+sub slurm_job_to_nodelist {
+ my $job = shift;
+ my $s = "$job." . $conf{"slurm-job-step"};
+ my @steps = `squeue -s $s -o "%i %N" 2>/dev/null`;
+ return undef if ( $? != 0 );
+
+ foreach my $step (@steps) {
+ my ( $left, $right ) = split( " ", $step );
+ return $right if ( $left eq $s );
+
+ }
+ return undef;
+}
sub slurm_job_is_running {
my $job = shift;
@@ -2152,7 +2257,8 @@
sub slurm_setup_pcmd {
my $job = shift;
my $cpus = slurm_job_to_ncpus($job);
- return ( "srun --jobid=$job", $cpus );
+ my $nc = slurm_job_to_nodecount($job);
+ return ( "srun --jobid=$job", $cpus, $nc );
}
###############################################################################
@@ -2242,7 +2348,7 @@
}
sub local_setup_pcmd {
- return ( "", undef );
+ return ( "", 1, 1 );
}
###############################################################################
@@ -2252,7 +2358,7 @@
###############################################################################
sub mpd_is_installed {
- return find_exe("mpdlistjobs");
+ return ( find_exe("mpdlistjobs") and find_exe("mpdrun") );
}
sub mpd_get_data {
@@ -2318,9 +2424,11 @@
$mpd_dfile = $fn;
- my $cmd = "mpirun -machinefile $fn -np $i";
-
- return ( $cmd, undef );
+ my $cmd = "mpdrun -machinefile $fn -np $i";
+
+ my $hosts = $#hosts + 1;
+
+ return ( $cmd, undef, $hosts );
}
sub mpd_cleanup_pcmd {
@@ -2346,31 +2454,24 @@
}
sub open_is_installed {
- return find_exe("ompi-ps");
+ return ( find_exe("ompi-ps") and find_exe("orterun") );
}
my %open_jobs;
sub open_get_data {
- my ($filename) = @_;
# Simply return if called more than once.
if ( keys(%open_jobs) != 0 ) {
return;
}
- my $hostname = hostname();
+
my $job;
- my @out;
- if ( defined $filename ) {
- open( OPEN, $filename ) or return;
- @out = <OPEN>;
- close OPEN;
- } else {
- open( OPEN, "ompi-ps|" ) or return;
- @out = <OPEN>;
- close OPEN;
- }
+
+ open( OPEN, "ompi-ps|" ) or return;
+ my @out = <OPEN>;
+ close OPEN;
# Handle being called multiple times, zero the hash every
# time we are called. Of course we could just return the
@@ -2387,7 +2488,11 @@
} else {
my @elems = split( /\|/, $l );
- if ( $#elems == 6 ) {
+ if ( $#elems == 4 ) {
+ my $nprocs = $elems[3];
+ $nprocs =~ s/ //g;
+ $open_jobs{$job}{nprocs} = $nprocs;
+ } elsif ( $#elems == 6 ) {
my $host = $elems[4];
$host =~ s/ //g;
@@ -2395,24 +2500,22 @@
next if $host eq "Node";
$open_jobs{$job}{hosts}{$host}++;
- if ( $host eq $hostname ) {
- my $name = $elems[1];
- $name =~ /\[\[(\d+)\,(\d+)\]\,(\d+)\]/;
- my $rank = $3;
-
- my $pid = $elems[3];
- $rank =~ s/ //g;
- $pid =~ s/ //g;
- $open_jobs{$job}{ranks}{$host}{$rank} = $pid;
- }
+ my $name = $elems[1];
+ $name =~ /\[\[(\d+)\,(\d+)\]\,(\d+)\]/;
+ my $rank = $3;
+
+ my $pid = $elems[3];
+ $rank =~ s/ //g;
+ $pid =~ s/ //g;
+ $open_jobs{$job}{ranks}{$host}{$rank} = $pid;
}
}
}
- if ( $conf{"verbose"} ) {
- print Dumper \%open_jobs;
- }
+ #if ( $conf{"verbose"} ) {
+ #print Dumper \%open_jobs;
+ #}
}
sub open_get_jobs {
@@ -2423,24 +2526,12 @@
}
my $open_dfile;
-my $open_tfile;
sub open_setup_pcmd {
my $job = shift;
open_get_data();
- my ( $th, $tn ) = tempfile(".padb.XXXX");
-
- open( my $oh, "ompi-ps|" );
- while (<$oh>) {
- print $th $_;
- }
- close $th;
- $cinner{"open-ps"} = $tn;
-
- $open_tfile = $tn;
-
my @hosts = keys %{ $open_jobs{$job}{hosts} };
my $i = @hosts;
@@ -2454,14 +2545,14 @@
$open_dfile = $fn;
my $prefix = find_ompi_prefix();
- my $cmd = "mpirun -machinefile $fn -np $i $prefix";
-
- return ( $cmd, undef );
+ my $cmd = "orterun -machinefile $fn -np $i $prefix";
+ my $hosts = $#hosts + 1;
+
+ return ( $cmd, $open_jobs{$job}{nprocs}, $hosts );
}
sub open_cleanup_pcmd {
unlink($open_dfile) if ( defined($open_dfile) );
- unlink($open_tfile) if ( defined($open_tfile) );
}
###############################################################################
@@ -2474,9 +2565,7 @@
# Check for both LSF and RMS, I know LSF works in other ways but I
don't
# know how to launch jobs then...
- my $rms = find_exe("rinfo");
- return 0 unless $rms;
- return find_exe("bjobs");
+ return ( find_exe("bjobs") and rms_is_installed() );
}
sub lsf_get_jobs {
@@ -2553,9 +2642,16 @@
sub find_rmgr {
# If it's been set on the command line and it's valid then just use what
we are given.
-# Don't do any sanity checks here to cope with non-default installs.
+# Do sanity checks here but only warn on the result to cope with
non-default installs.
if ( defined $rmgr{ $conf{"rmgr"} } ) {
+ if ( defined $rmgr{ $conf{"rmgr"} }{is_installed}
+ and not $rmgr{ $conf{"rmgr"} }{is_installed}() )
+ {
+ printf(
+"Warning: Selected resource manager $conf{rmgr} does not appear to be
installed\n"
+ );
+ }
setup_rmgr( $conf{"rmgr"} );
return;
}
@@ -2593,9 +2689,16 @@
sub find_any_rmgr {
# If it's been set on the command line and it's valid then just use what
we are given.
-# Don't do any sanity checks here to cope with non-default installs.
+# Do sanity checks here but only warn on the result to cope with
non-default installs.
if ( defined $rmgr{ $conf{"rmgr"} } ) {
+ if ( defined $rmgr{ $conf{"rmgr"} }{is_installed}
+ and not $rmgr{ $conf{"rmgr"} }{is_installed}() )
+ {
+ printf(
+"Warning: Selected resource manager $conf{rmgr} does not appear to be
installed\n"
+ );
+ }
setup_rmgr( $conf{"rmgr"} );
return;
}
@@ -2658,6 +2761,7 @@
sub get_all_jobids {
my $user = shift;
+ debug_log( "rmgr", undef, "Loading active jobs list", undef );
return $rmgr{ $conf{"rmgr"} }{get_active_jobs}($user);
}
@@ -2703,13 +2807,15 @@
###############################################################################
sub strip_stack_traces {
- my $lines = shift;
+ my ( $cargs, $lines ) = @_;
my %above;
my %below;
- map { $above{$_}++ } split( ",", $conf{"stack-strip-above"} );
- map { $below{$_}++ } split( ",", $conf{"stack-strip-below"} );
+ map { $above{$_}++ }
+ split( ",", $conf{mode_options}{stack}{"stack-strip-above"} );
+ map { $below{$_}++ }
+ split( ",", $conf{mode_options}{stack}{"stack-strip-below"} );
foreach my $tag ( keys %$lines ) {
@@ -2734,10 +2840,11 @@
$main_idx = 0 if not defined $main_idx;
if ( $main_idx != 0 or $wait_idx != 0 ) {
my $end =
- ( $strip_above_wait and $wait_idx )
+ ( $cargs->{"strip-above-wait"} and $wait_idx )
? $wait_idx
: $#{ $lines->{$tag} };
- my $start = ( $strip_below_main and $main_idx ) ? $main_idx :
0;
+ my $start =
+ ( $cargs->{"strip-below-main"} and $main_idx ) ? $main_idx :
0;
printf( "Stripping 0.."
. $#{ $lines->{$tag} }
@@ -2752,15 +2859,18 @@
}
sub sort_proc_hashes {
- my $key = shift;
- my @all = @_;
-
- #print Dumper $all;
- return ( reverse( sort { $a->{$key} <=> $b->{$key} } @all ) );
+ my $carg = shift;
+ my $key = shift;
+ my @all = @_;
+
+ if ( $carg->{"reverse-sort-order"} ) {
+ return ( reverse( sort { $a->{$key} <=> $b->{$key} } @all ) );
+ } else {
+ return ( sort { $a->{$key} <=> $b->{$key} } @all );
+ }
}
sub pre_mpi_watch {
- my ($cpus) = @_;
my $header = <<EOF;
u: unexpected messages U: unexpected and other messages
s: sending messages r: receiving messages m: sending and receiving
@@ -2769,9 +2879,6 @@
.: consuming CPU cycles ,: using CPU but no queue data -: sleeping *: error
EOF
printf($header);
- my %data;
- $data{cpus} = $cpus;
- return \%data;
}
sub show_mpi_watch {
@@ -2779,8 +2886,9 @@
# print Dumper $lines;
my $s = "";
- foreach my $l ( sort { $a <=> $b } ( keys %{ $lines->{raw} } ) ) {
- $s .= $lines->{raw}{$l}{state};
+ foreach my $l ( sort { $a <=> $b } ( keys %{ $lines->{target_responce}
} ) )
+ {
+ $s .= $lines->{target_responce}{$l}{state};
}
print("$s\n");
}
@@ -2788,20 +2896,21 @@
# Nicely format process information.
# XXX: proc-sort-key should probably sort on column headers as
# well as keys.
+# Idealy we'd know what format we wanted and only ask the nodes
+# to report relevent info, for now they still report everything.
sub show_proc_format {
- my ( $nlines, $mode, $handle ) = @_;
-
- my $lines = $nlines->{lines};
+ my ( $carg, $nlines ) = @_;
my @proc_format_array;
my %proc_format_header;
my $show_fields = 0;
my %proc_format_lengths;
-
- my $separator = $conf{"column-seperator"};
-
- my @columns = split( ",", $proc_format );
+ my %proc_header_reverse;
+
+ my $separator = $carg->{"column-seperator"};
+
+ my @columns = split( ",", $carg->{"proc-format"} );
foreach my $column (@columns) {
$show_fields = 1 if ( $column eq "fields" );
@@ -2811,6 +2920,7 @@
push @proc_format_array, lc($name);
$proc_format_header{ lc($name) } = $desc;
$proc_format_lengths{ lc($name) } = length($desc);
+ $proc_header_reverse{ lc($desc) } = lc($name);
} else {
push @proc_format_array, lc($column);
$proc_format_header{ lc($column) } = $column;
@@ -2819,21 +2929,21 @@
}
my @all;
+ my $lines = $nlines->{target_responce};
foreach my $tag ( sort ( keys %$lines ) ) {
my %hash;
$hash{vp} = $tag;
- foreach my $data ( @{ $lines->{$tag} } ) {
- if ( $data =~ /([\w\.]+)\:[ \t]*(.+)/ ) {
- my $key = lc($1);
-
- next unless defined $proc_format_lengths{$key} or
$show_fields;
-
- if ( length($2) > $proc_format_lengths{$key} ) {
- $proc_format_lengths{$key} = length($2);
- }
-
- $hash{$key} = $2;
- }
+ foreach my $key ( keys( %{ $lines->{$tag} } ) ) {
+
+ my $value = $lines->{$tag}{$key};
+ next unless defined $proc_format_lengths{$key} or $show_fields;
+
+ if ( length($value) > $proc_format_lengths{$key} ) {
+ $proc_format_lengths{$key} = length($value);
+ }
+
+ $hash{$key} = $value;
+
}
if ($show_fields) {
my @fields = sort ( keys(%hash) );
@@ -2843,9 +2953,14 @@
push @all, \%hash;
}
- @all = sort_proc_hashes( $conf{"proc-sort-key"}, @all );
***The diff for this file has been truncated for email.***
More information about the padb-devel
mailing list