[padb] r362 committed - Update to the orte resource manager to support spawned jobs. Like slu...

padb at googlecode.com padb at googlecode.com
Fri Dec 18 21:26:25 GMT 2009


Revision: 362
Author: apittman
Date: Fri Dec 18 13:25:23 2009
Log: Update to the orte resource manager to support spawned jobs.  Like
slurm, each job, uniquely identified by its number, can have a number of
different steps within it.  This commit adds knowledge of these steps
to padb so it can do the right thing.  Allow targeting of different
steps via the orte-job-step configuration option, with the default
step being the lowest-numbered one detected.

http://code.google.com/p/padb/source/detail?r=362

Modified:
  /trunk/src/padb

=======================================
--- /trunk/src/padb	Thu Dec 10 07:21:32 2009
+++ /trunk/src/padb	Fri Dec 18 13:25:23 2009
@@ -547,6 +547,7 @@
  $conf{rmgr}             = undef;

  $conf{slurm_job_step} = undef;
+$conf{orte_job_step}  = undef;

  $conf{pbs_server} = undef;

@@ -568,7 +569,7 @@
  my @conf_time = qw(prun_exittimeout prun_timeout interval);

  # Config options which take an integer.
-my @conf_int = qw(lsf_job_offset slurm_job_step tree_width);
+my @conf_int = qw(lsf_job_offset slurm_job_step orte_job_step tree_width);

  my $norc       = 0;
  my $configfile = '/etc/padb.conf';
@@ -2865,18 +2866,21 @@
          if ( @elems == 4 ) {
              my $nprocs = $elems[3];
              my $name   = $elems[0];
-            if ( $name =~ m{\A\[(\d+)\,\d+]\z}x ) {
-                $open_jobs{$1}{nprocs} = $nprocs;
+            if ( $name =~ m{\A\[(\d+)\,(\d+)]\z}x ) {
+                my $job  = $1;
+                my $step = $2;
+                $open_jobs{$job}{$step}{nprocs} = $nprocs;
              }
          } elsif ( @elems == 6 ) {
              my $name = $elems[1];
-            if ( $name =~ m{\A\[\[(\d+)\,\d+\]\,(\d+)\]}x ) {
+            if ( $name =~ m{\A\[\[(\d+)\,(\d+)\]\,(\d+)\]}x ) {
                  my $job  = $1;
-                my $rank = $2;
+                my $step = $2;
+                my $rank = $3;
                  my $pid  = $elems[3];
                  my $host = $elems[4];
-                $open_jobs{$job}{hosts}{$host}++;
-                $open_jobs{$job}{ranks}{$host}{$rank} = $pid;
+                $open_jobs{$job}{$step}{hosts}{$host}++;
+                $open_jobs{$job}{$step}{ranks}{$host}{$rank} = $pid;
              }
          }
      }
@@ -2895,7 +2899,22 @@

      open_get_data();

-    my @hosts = keys %{ $open_jobs{$job}{hosts} };
+    my $step = $conf{orte_job_step};
+    if ( not defined $step ) {
+        my @steps = keys %{ $open_jobs{$job} };
+
+        my @ordered = sort { $a <=> $b } @steps;
+
+        $step = $ordered[0];
+
+    }
+
+    if ( not defined $open_jobs{$job}{$step} ) {
+        printf("Job $job (step $step) does not exist\n");
+        return;
+    }
+
+    my @hosts = keys %{ $open_jobs{$job}{$step}{hosts} };
      my $i     = @hosts;

      my ( $fh, $fn ) = tempfile('/tmp/padb.XXXXXXXX');
@@ -2909,9 +2928,9 @@
      my $cmd    = "orterun -machinefile $fn -np $i $prefix";

      my %pcmd;
-    $pcmd{nprocesses}   = $open_jobs{$job}{nprocs};
+    $pcmd{nprocesses}   = $open_jobs{$job}{$step}{nprocs};
      $pcmd{nhosts}       = @hosts;
-    $pcmd{process_data} = $open_jobs{$job}{ranks};
+    $pcmd{process_data} = $open_jobs{$job}{$step}{ranks};
      $pcmd{command}      = $cmd;
      @{ $pcmd{host_list} } = @hosts;
      $pcmd{cleanup_cb}     = \&unlink_file;




More information about the padb-devel mailing list