Add 'sl-orte' resource manager support (slurm + Open MPI/orte) to padb.

Note: the indentation of the unchanged context lines in this patch was
reconstructed; apply it with "patch -l" so that whitespace differences in
those lines are ignored.

--- padb_r341	2009-11-26 13:18:34.955667000 +0100
+++ padb_r341_New	2009-11-30 14:51:00.070329600 +0100
@@ -473,6 +473,14 @@
     setup_job => \&pbs_setup_job,
     find_pids => \&pbs_find_pids,
 };
+$rmgr{'sl-orte'} = {
+    is_installed           => \&slurm_is_installed,
+    get_active_jobs        => \&slurm_get_jobs,
+    job_is_running         => \&slurm_job_is_running,
+    setup_pcmd             => \&slurm_setup_pcmd,
+    find_pids              => \&sl_orte_find_pids,
+    require_inner_callback => 1,
+};
 
 ###############################################################################
 #
@@ -2744,6 +2752,103 @@
     return %pcmd;
 }
 
+# slurm + orte support.
+#
+###############################################################################
+# Find the Open MPI processes of a slurm job on this node and hand each one
+# to maybe_show_pid() together with its OMPI_COMM_WORLD_RANK.
+#
+# "scontrol listpids" is not used: with proctrack/pgid slurmd prints
+#   slurmd[inti14]: proctrack/pgid does not implement slurm_container_get_pid
+# which disturbs padb, so the process tree is walked by hand instead:
+#   1. find every slurmstepd whose command line names the job id,
+#   2. take the children of those slurmstepd (should be mpirun/orted/padb),
+#      which is equivalent to "scontrol listpids", leaving out this padb,
+#   3. take the children of those processes, leaving out srun.
+#
+# Args:    $jobid - slurm job id to look for.
+# Returns: nothing, pids are reported through maybe_show_pid().
+sub sl_orte_find_pids {
+    my $jobid = shift;
+
+    # Match the job id as a whole number so that job 12 does not also pick
+    # up the slurmstepd of job 123, and quote it so it is taken literally.
+    my $job_re = qr/(?<!\d)\Q$jobid\E(?!\d)/;
+
+    # Step 1: slurmstepd of this job, there may be several so keep them all.
+    opendir( my $proc_dir, "/proc" ) or return;
+    my @pids_dir = grep { /^\d+$/ } readdir($proc_dir);
+    closedir($proc_dir);
+
+    # With /proc/version present the command line is read straight from
+    # /proc/<pid>/cmdline, otherwise it is taken from ps.
+    my $have_proc_version = -f "/proc/version";
+    my @pids_stepd;
+    foreach my $pid (@pids_dir) {
+        next unless ( -e "/proc/$pid" );
+        my @cmdline;
+        if ($have_proc_version) {
+
+            # The process may be gone by now, that is no reason to give up
+            # on the whole job so skip it rather than return.
+            open( my $cmd_fh, "<", "/proc/$pid/cmdline" ) or next;
+            @cmdline = <$cmd_fh>;
+            close($cmd_fh);
+        } else {
+            @cmdline = slurp_cmd("ps -o cmd $pid");
+        }
+        foreach my $text (@cmdline) {
+            if ( $text =~ /slurmstepd/i && $text =~ $job_re ) {
+                push( @pids_stepd, $pid );
+                last;    # one entry per process is enough
+            }
+        }
+    }
+
+    # Index the processes of the target user by parent pid, keeping the
+    # order in which ps listed them.
+    my %children_of;
+    foreach my $line ( slurp_cmd("ps -o pid,ppid,cmd -u $target_user") ) {
+        my ( $pid, $ppid, $cmd ) = split( " ", $line );
+
+        # Skips the title line as well as blank or truncated lines.
+        next unless ( defined($ppid) && $pid =~ /^\d+$/ && $ppid =~ /^\d+$/ );
+        push( @{ $children_of{$ppid} }, [ $pid, defined($cmd) ? $cmd : "" ] );
+    }
+
+    # Step 2: children of the slurmstepd, except this script (padb --inner).
+    my @pids;
+    foreach my $stepd (@pids_stepd) {
+        foreach my $child ( @{ $children_of{$stepd} || [] } ) {
+            push( @pids, $child->[0] ) if ( $child->[0] != $$ );
+        }
+    }
+
+    # Step 3: children of those processes are the openmpi executables,
+    # srun is not one of them.
+    my @pids_orte;
+    foreach my $parent (@pids) {
+        foreach my $child ( @{ $children_of{$parent} || [] } ) {
+            next if ( basename( $child->[1] ) =~ /srun/i );
+            push( @pids_orte, $child->[0] );
+        }
+    }
+
+    foreach my $pid (@pids_orte) {
+        my %env = get_remote_env($pid);
+
+        # Retry with get_remote_env_bygdb() when either variable is missing
+        # from the environment returned by get_remote_env().
+        if (   !defined( $env{OMPI_COMM_WORLD_RANK} )
+            || !defined( $env{SLURM_PROCID} ) )
+        {
+            %env = get_remote_env_bygdb($pid);
+        }
+        maybe_show_pid( $env{OMPI_COMM_WORLD_RANK}, $pid );
+    }
+    return;
+}
+
 # open support.
 #
 ###############################################################################