[padb] r362 committed - Update to the orte resource manager to support spawned jobs. Like slu...
padb at googlecode.com
padb at googlecode.com
Fri Dec 18 21:26:25 GMT 2009
Revision: 362
Author: apittman
Date: Fri Dec 18 13:25:23 2009
Log: Update to the orte resource manager to support spawned jobs. Like
slurm, each job, uniquely identified by its number, can have a number of
different steps within it. This commit adds knowledge of these steps
to padb so it can do the right thing. Allow targeting of different
steps via the orte-job-step configuration option, with the default
step being the lowest-numbered one detected.
http://code.google.com/p/padb/source/detail?r=362
Modified:
/trunk/src/padb
=======================================
--- /trunk/src/padb Thu Dec 10 07:21:32 2009
+++ /trunk/src/padb Fri Dec 18 13:25:23 2009
@@ -547,6 +547,7 @@
$conf{rmgr} = undef;
$conf{slurm_job_step} = undef;
+$conf{orte_job_step} = undef;
$conf{pbs_server} = undef;
@@ -568,7 +569,7 @@
my @conf_time = qw(prun_exittimeout prun_timeout interval);
# Config options which take an integer.
-my @conf_int = qw(lsf_job_offset slurm_job_step tree_width);
+my @conf_int = qw(lsf_job_offset slurm_job_step orte_job_step tree_width);
my $norc = 0;
my $configfile = '/etc/padb.conf';
@@ -2865,18 +2866,21 @@
if ( @elems == 4 ) {
my $nprocs = $elems[3];
my $name = $elems[0];
- if ( $name =~ m{\A\[(\d+)\,\d+]\z}x ) {
- $open_jobs{$1}{nprocs} = $nprocs;
+ if ( $name =~ m{\A\[(\d+)\,(\d+)]\z}x ) {
+ my $job = $1;
+ my $step = $2;
+ $open_jobs{$job}{$step}{nprocs} = $nprocs;
}
} elsif ( @elems == 6 ) {
my $name = $elems[1];
- if ( $name =~ m{\A\[\[(\d+)\,\d+\]\,(\d+)\]}x ) {
+ if ( $name =~ m{\A\[\[(\d+)\,(\d+)\]\,(\d+)\]}x ) {
my $job = $1;
- my $rank = $2;
+ my $step = $2;
+ my $rank = $3;
my $pid = $elems[3];
my $host = $elems[4];
- $open_jobs{$job}{hosts}{$host}++;
- $open_jobs{$job}{ranks}{$host}{$rank} = $pid;
+ $open_jobs{$job}{$step}{hosts}{$host}++;
+ $open_jobs{$job}{$step}{ranks}{$host}{$rank} = $pid;
}
}
}
@@ -2895,7 +2899,22 @@
open_get_data();
- my @hosts = keys %{ $open_jobs{$job}{hosts} };
+ my $step = $conf{orte_job_step};
+ if ( not defined $step ) {
+ my @steps = keys %{ $open_jobs{$job} };
+
+ my @ordered = sort { $a <=> $b } @steps;
+
+ $step = $ordered[0];
+
+ }
+
+ if ( not defined $open_jobs{$job}{$step} ) {
+ printf("Job $job (step $step) does not exist\n");
+ return;
+ }
+
+ my @hosts = keys %{ $open_jobs{$job}{$step}{hosts} };
my $i = @hosts;
my ( $fh, $fn ) = tempfile('/tmp/padb.XXXXXXXX');
@@ -2909,9 +2928,9 @@
my $cmd = "orterun -machinefile $fn -np $i $prefix";
my %pcmd;
- $pcmd{nprocesses} = $open_jobs{$job}{nprocs};
+ $pcmd{nprocesses} = $open_jobs{$job}{$step}{nprocs};
$pcmd{nhosts} = @hosts;
- $pcmd{process_data} = $open_jobs{$job}{ranks};
+ $pcmd{process_data} = $open_jobs{$job}{$step}{ranks};
$pcmd{command} = $cmd;
@{ $pcmd{host_list} } = @hosts;
$pcmd{cleanup_cb} = \&unlink_file;
More information about the padb-devel
mailing list