[padb-devel] [padb] r123 committed - Get slurm support working again on the branch, basically query squeue ...
codesite-noreply at google.com
codesite-noreply at google.com
Wed Aug 19 15:29:27 BST 2009
Revision: 123
Author: apittman
Date: Wed Aug 19 07:28:24 2009
Log: Get slurm support working again on the branch: query squeue for a host list
when setting up the job, to know how many hosts to expect signons from.
http://code.google.com/p/padb/source/detail?r=123
Modified:
/branches/full-duplex/src/padb
=======================================
--- /branches/full-duplex/src/padb Tue Aug 18 14:13:05 2009
+++ /branches/full-duplex/src/padb Wed Aug 19 07:28:24 2009
@@ -319,11 +319,12 @@
};
$rmgr{"slurm"} = {
- 'is_installed' => \&slurm_is_installed,
- 'get_active_jobs' => \&slurm_get_jobs,
- 'job_is_running' => \&slurm_job_is_running,
- 'setup_pcmd' => \&slurm_setup_pcmd,
- 'find_pids' => \&slurm_find_pids,
+ 'is_installed' => \&slurm_is_installed,
+ 'get_active_jobs' => \&slurm_get_jobs,
+ 'job_is_running' => \&slurm_job_is_running,
+ 'setup_pcmd' => \&slurm_setup_pcmd,
+ 'find_pids' => \&slurm_find_pids,
+ 'require_inner_callback' => 1,
};
$rmgr{"local"} = {
@@ -397,6 +398,8 @@
$conf{"prun-exittimeout"} = 120;
$conf{"rmgr"} = "auto";
+$conf{"slurm-job-step"} = 0;
+
# These settings are passed onto inner only.
$conf{"edbopt"} = "";
@@ -604,6 +607,7 @@
$debugModes{"tree"} = undef;
$debugModes{"verbose"} = undef;
$debugModes{"signon"} = undef;
+$debugModes{"rmgr"} = undef;
sub parse_args_outer {
@@ -2206,6 +2210,21 @@
return $tasks if $have_tasks;
return undef;
}
+
+sub slurm_job_to_nodelist {
+ my $job = shift;
+ my @steps = `squeue -s -o "%i %N" 2>/dev/null`;
+ return undef if ( $? != 0 );
+
+ my $hosts;
+ my $s = "$job." . $conf{"slurm-job-step"};
+ foreach my $step (@steps) {
+ my ( $left, $right ) = split( " ", $step );
+ $hosts = $right if ( $left eq $s );
+
+ }
+ return $hosts;
+}
sub slurm_job_is_running {
my $job = shift;
@@ -2215,9 +2234,11 @@
}
sub slurm_setup_pcmd {
- my $job = shift;
- my $cpus = slurm_job_to_ncpus($job);
- return ( "srun --jobid=$job", $cpus );
+ my $job = shift;
+ my $cpus = slurm_job_to_ncpus($job);
+ my @nodes = slurm_job_to_nodelist($job);
+ my $nc = $#nodes + 1;
+ return ( "srun --jobid=$job", $cpus, $nc );
}
###############################################################################
@@ -2720,6 +2741,7 @@
sub get_all_jobids {
my $user = shift;
+ debug_log( "rmgr", undef, "Loading active jobs list", undef );
return $rmgr{ $conf{"rmgr"} }{get_active_jobs}($user);
}
@@ -3683,6 +3705,7 @@
# Check that the file is mode 100600 (Octal)
if ( $mode != 33152 ) {
printf("Wrong permissions on secret file, should be 0600 ($file)\n");
+ exit(1);
}
open( SFD, $file ) or return;
More information about the padb-devel mailing list