[padb-devel] [padb commit] r39 - Setup a find_pids callback for the inner to use when
codesite-noreply at google.com
codesite-noreply at google.com
Tue Jun 9 12:43:21 BST 2009
Author: apittman
Date: Tue Jun 9 04:42:53 2009
New Revision: 39
Modified:
trunk/src/padb
Log:
Set up a find_pids callback for the inner to use when
accessing the resource manager. Simplifies the code
and should make it easier to add more resource managers
in future.
Modified: trunk/src/padb
==============================================================================
--- trunk/src/padb (original)
+++ trunk/src/padb Tue Jun 9 04:42:53 2009
@@ -33,7 +33,9 @@
# maintainer.
# * Added "orte" to the list of resource managers supported
# * Don't enable local-qsnet on non-qsnet systems.
-#
+# * inner_main() now uses callbacks for resource manager support.
+# * --signal now takes names rather than numbers.
+
# Version 2.2
# * Add a --core-stack option along with --core and --exe to extract stack
# traces from core files.
@@ -165,12 +167,10 @@
# * Multi-pass argument handling, --kill also accepts --signal for example,
# this should really be done at the getopt layer. Also proper usage
# info for these secondary args.
-# * inner_main() should possibly use callbacks for resource manager
support,
-# it's starting to look a bit messy.
-# * find_slurm_pids() has some good code in it for finding parallel
processes
+# * slurm_find_pids() has some good code in it for finding parallel
processes
# this should be extrapolated out and so it can be used in the mpd case,
# ideally on non-rms systems (RMS rocks in this regard) the rmgr callback
-# should return a list of spawned pids and the code in find_slurm_pids()
should
+# should return a list of spawned pids and the code in slurm_find_pids()
should
# pass this tree to find the most interesting one.
# * The mode {handler} functions should only be called once per node, it
could then
# correctly handle $confInner{gdb_file} and also attach to every process
per node
@@ -259,6 +259,7 @@
# job_to_key job key no Convert from jobId to shm key.
# setup_pcmd job cmd|ncpus yes Command needed to launch shadow
jobs.
# cleanup_pcmd - - no Cleans up any temporary files.
+# find_pids job - maybe Called on the inner to locate
pids.
# inner_rmgr var n/a no Resource manager to masquerade
as.
@@ -270,6 +271,7 @@
'job_is_running' => \&rms_job_is_running,
'job_to_key' => \&rms_job_to_key,
'setup_pcmd' => \&rms_setup_pcmd,
+ 'find_pids' => \&rms_find_pids,
};
$rmgr{"mpd"} = {
@@ -277,6 +279,7 @@
'get_active_jobs' => \&mpd_get_jobs,
'setup_pcmd' => \&mpd_setup_pcmd,
'cleanup_pcmd' => \&mpd_cleanup_pcmd,
+ 'find_pids' => \&mpd_find_pids,
};
$rmgr{"orte"} = {
@@ -284,6 +287,7 @@
'get_active_jobs' => \&open_get_jobs,
'setup_pcmd' => \&open_setup_pcmd,
'cleanup_pcmd' => \&open_cleanup_pcmd,
+ 'find_pids' => \&open_find_pids,
};
$rmgr{"lsf-rms"} = {
@@ -298,12 +302,14 @@
'get_active_jobs' => \&slurm_get_jobs,
'job_is_running' => \&slurm_job_is_running,
'setup_pcmd' => \&slurm_setup_pcmd,
+ 'find_pids' => \&slurm_find_pids,
};
$rmgr{"local"} = {
'get_active_jobs' => \&local_get_jobs,
'job_is_running' => \&local_job_is_running,
'setup_pcmd' => \&local_setup_pcmd,
+ 'find_pids' => \&local_find_pids,
};
$rmgr{"local-qsnet"} = {
@@ -5414,7 +5420,7 @@
}
# Do the right thing with slurm...
-sub find_slurm_pids {
+sub slurm_find_pids {
my $jobid = shift;
# Slurm has the concept of a "job" and a "job step" which are
@@ -5509,7 +5515,7 @@
}
# Local processes per node, i.e. no resource manager support.
-sub find_local_pids {
+sub local_find_pids {
my $pid = shift;
# Hard-wire this to vp 0, probably not true but without the resource
manager it's difficult
@@ -5522,7 +5528,7 @@
maybe_show_pid( $vp, $pid );
}
-sub find_mpd_pids {
+sub mpd_find_pids {
my $job = shift;
my $d = mpd_get_data();
@@ -5533,7 +5539,7 @@
}
}
-sub find_open_pids {
+sub open_find_pids {
my $job = shift;
open_get_data( $confInner{"open-ps"} );
my $hostname = hostname();
@@ -5543,7 +5549,7 @@
}
}
-sub show_all_pids {
+sub rms_find_pids {
my $jobid = shift;
my %vps;
@@ -5703,6 +5709,10 @@
$confInner{"myld"} = $ENV{"LD_LIBRARY_PATH"};
+ # $rjobid is used for accessing the stats on slurm
+ # systems, on rms it's just the jobId but on combined
+ # slurm/rms systems it's modified to be the rms id
+ # and the jobid is left as the slurm job id.
my $rjobid = $jobid;
if ( exists $ENV{"SLURM_PROCID"} ) {
$rjobid = get_rms_jobid($jobid);
@@ -5719,25 +5729,23 @@
exit(0);
}
- if ( $confInner{"rmgr"} eq "local" ) {
-
- # Takes a pid.
- find_local_pids($jobid);
-
- } elsif ( $confInner{"rmgr"} eq "mpd" ) {
- find_mpd_pids($jobid);
+ # Handle resource managers better, simply call a callback
+ # as the outer does.
+ # As usual there is a special case, on Slurm systems
+ # running QsNet you can have the RMS kernel module loaded
+ # and these need to be handled differently so deal with
+ # them first and then go to the standard callback.
- } elsif ( $confInner{"rmgr"} eq "orte" ) {
- find_open_pids($jobid);
-
- } elsif ( -d "/proc/rms" ) {
+ if ( ( $confInner{rmgr} eq "slurm" ) and ( -d "/proc/rms" ) ) {
# Takes a RMS job id.
- show_all_pids($rjobid);
+ rms_find_pids($rjobid);
} else {
-
- # Takes a native job id.
- find_slurm_pids($jobid);
+ if ( not defined $rmgr{ $confInner{rmgr} }{find_pids} ) {
+ printf("Error, rmgr $confInner{rmgr} has no find_pids callback\n");
+ exit(1);
+ }
+ $rmgr{ $confInner{rmgr} }{find_pids}($jobid);
}
if ( defined $allfns{$mode}{handler_all} ) {
More information about the padb-devel mailing list