From codesite-noreply at google.com Tue Aug 4 19:21:10 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 04 Aug 2009 18:21:10 +0000 Subject: [padb-devel] [padb] r117 committed - Add some more debugging logs, implement a inner timeout where... Message-ID: <001485f5af6e728527047054f33e@google.com> Revision: 117 Author: apittman Date: Tue Aug 4 11:20:27 2009 Log: Add some more debugging logs, implement a inner timeout where if it isn't contacted by the outer in thirty seconds it exits to avoid leaving around stale processes (and hence jobs). http://code.google.com/p/padb/source/detail?r=117 Modified: /branches/full-duplex/src/padb ======================================= --- /branches/full-duplex/src/padb Sun Jul 19 13:53:38 2009 +++ /branches/full-duplex/src/padb Tue Aug 4 11:20:27 2009 @@ -584,7 +584,7 @@ sub debug_log { my ( $type, $handle, $str, @params ) = @_; - if ( not defined $debugModes{$type} ) { + if ( not exists $debugModes{$type} ) { printf("Unknown debug mode: $type\n"); exit(1); } @@ -598,10 +598,12 @@ # Valid debug modes, a full list is maintained here so using unexpected # ones can generate warnings. -$debugModes{"full-duplex"} = 0; -$debugModes{"show-cmd"} = 0; -$debugModes{"all"} = 0; -$debugModes{"tree"} = 0; +$debugModes{"full-duplex"} = undef; +$debugModes{"show-cmd"} = undef; +$debugModes{"all"} = undef; +$debugModes{"tree"} = undef; +$debugModes{"verbose"} = undef; +$debugModes{"signon"} = undef; sub parse_args_outer { @@ -657,7 +659,7 @@ if ( defined $debugflag ) { foreach my $f ( split( ",", $debugflag ) ) { my ( $name, $v ) = split( "=", $f ); - if ( defined $debugModes{$name} ) { + if ( exists $debugModes{$name} ) { $debugModes{$name} = defined($v) ? $v : "basic"; } else { printf("Attempt to set unknown debug flag \"$name\".\n"); @@ -3216,6 +3218,9 @@ # Called once when we have the socket details of the last child. sub connect_to_children { my $comm_data = shift; + + debug_log( "signon", undef, "Received last signon, connecting to inner" ); + @{ $comm_data->{host_ids} } = sort( keys( %{ $comm_data->{remote} } ) ); $comm_data->{connection_tree} = generate_comm_tree( $comm_data->{host_ids} ); @@ -3729,6 +3734,9 @@ $conf{"verbose"} && defined $ncpus && print "Job has $ncpus processe(s)\n"; $conf{"verbose"} && defined $hosts && print "Job spans $hosts host(s)\n"; + debug_log( "verbose", undef, + "There are $ncpus processes over $hosts hosts" ); + # Maybe do it this way, edb works best when run with the same LD_LIBRARY_PATH # as the application. It's very important when running the message queue # extraction code but less so here. You may find you get linker errors though @@ -3759,6 +3767,9 @@ return 1; } my $errors = go_parallel( $jobid, $cmd, $ncpus, $hosts ); + + debug_log( "verbose", undef, "Completed command" ); + cleanup_pcmd(); return $errors; } @@ -6824,52 +6835,63 @@ my $sel = $netdata->{sel}; - while ( my @data = $sel->can_read() ) { - foreach my $s (@data) { - if ( $s == $server ) { - my $new = $server->accept() or die("Failed accept"); - $sel->add($new); - my $peer = getpeername($new); - my ( $port, $addr ) = unpack_sockaddr_in($peer); - my $ip = inet_ntoa($addr); - my $hostname = gethostbyaddr( $addr, AF_INET ); - - #printf "New connection from $hostname ($ip) $port\n"; - my %sinfo; - $sinfo{hostname} = $hostname; - $sinfo{trusted} = 0; - $sinfo{port} = $port; - $sinfo{desc} = "$hostname:$port"; - $sinfo{socket} = $new; - $sinfo{line_cb} = \&command_from_outer; - $netdata->{connections}{$new} = \%sinfo; - - # $new->printf("Hello from padb\n"); - #$new->autoflush(); - next; - } - - my $sinfo = $netdata->{connections}{$s}; - my $d; - my $count = sysread( $s, $d, 65536 ); - - # Dead connection. - if ( not defined $d or $count eq 0 ) { - - # printf("null read from $sinfo->{desc}\n"); - if ( eof($s) ) { - $sel->remove($s); - $s->close(); - $sinfo->{trusted} = 0; - $sinfo->{dead} = 1; - my $scount = $sel->count(); - } - next; - } - - $sinfo->{str} .= $d; - extract_line( $netdata, $sinfo ); - + my $stime = time(); + + while ( $sel->count() > 0 ) { + while ( my @data = $sel->can_read(5) ) { + foreach my $s (@data) { + if ( $s == $server ) { + my $new = $server->accept() or die("Failed accept"); + $sel->add($new); + my $peer = getpeername($new); + my ( $port, $addr ) = unpack_sockaddr_in($peer); + my $ip = inet_ntoa($addr); + my $hostname = gethostbyaddr( $addr, AF_INET ); + + #printf "New connection from $hostname ($ip) $port\n"; + my %sinfo; + $sinfo{hostname} = $hostname; + $sinfo{trusted} = 0; + $sinfo{port} = $port; + $sinfo{desc} = "$hostname:$port"; + $sinfo{socket} = $new; + $sinfo{line_cb} = \&command_from_outer; + $netdata->{connections}{$new} = \%sinfo; + + # $new->printf("Hello from padb\n"); + #$new->autoflush(); + next; + } + + my $sinfo = $netdata->{connections}{$s}; + my $d; + my $count = sysread( $s, $d, 65536 ); + + # Dead connection. + if ( not defined $d or $count eq 0 ) { + + # printf("null read from $sinfo->{desc}\n"); + if ( eof($s) ) { + $sel->remove($s); + $s->close(); + $sinfo->{trusted} = 0; + $sinfo->{dead} = 1; + my $scount = $sel->count(); + } + next; + } + + $sinfo->{str} .= $d; + extract_line( $netdata, $sinfo ); + + } + } + my $time = time(); + + # Should probably handle this better, if the outer or tree never signons + # for whatever reason silently die as it's probably the best thing do to. + if ( ( $sel->count() == 1 ) and ( ( $time - $stime ) > 30 ) ) { + exit(0); } } my $count = $sel->count(); From codesite-noreply at google.com Sat Aug 15 14:06:44 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sat, 15 Aug 2009 13:06:44 +0000 Subject: [padb-devel] [padb] r118 committed - Update the stack trace example from tping to IMB. Message-ID: <000e0cd229d82ba48704712dd781@google.com> Revision: 118 Author: apittman Date: Sat Aug 15 06:06:16 2009 Log: Update the stack trace example from tping to IMB. http://code.google.com/p/padb/source/detail?r=118 Modified: /trunk/doc/modes.html ======================================= --- /trunk/doc/modes.html Tue Jun 30 05:43:52 2009 +++ /trunk/doc/modes.html Sat Aug 15 06:06:16 2009 @@ -134,34 +134,34 @@
  -----------------
-[0-5] (6 processes)
+[0-1] (2 processes)
  -----------------
-main() at bench/tping.c:2306
-  -----------------
-  [0-1,4] (3 processes)
-  -----------------
-  timed_ping() at bench/tping.c:345
-    elan_tportRxWait() at elan4/tportRx.c:1194
-      _elan_tportRxWait() at elan4/tportRx.c:1053
-      elan_pollWord() at common/progress.c:144
-  -----------------
-  [2-3,5] (3 processes)
-  -----------------
-    timed_ping() at bench/tping.c:340
-      elan_tportTxWait() at elan4/tportTx.c:940
-        elan_wait() at common/events.c:288
-          _elan_tportTxWait() at elan4/tportTx.c:920
-            elan_waitWord() at common/progress.c:819
+main() at IMB.c:262
+  IMB_init_buffers_iter() at IMB_mem_manager.c:798
+    -----------------
+    0 (1 processes)
+    -----------------
+    IMB_pingpong() at IMB_pingpong.c:170
+      PMPI_Recv() at ?:?
+        mca_pml_ob1_recv() at ?:?
+          opal_progress() at ?:?
+    -----------------
+    1 (1 processes)
+    -----------------
+    IMB_pingpong() at IMB_pingpong.c:194
+      PMPI_Recv() at ?:?
+        mca_pml_ob1_recv() at ?:?
+          opal_progress() at ?:?
  -----------------
-6 (1 processes)
+[2-15] (14 processes)
  -----------------
-main() at bench/tping.c:2362
-  elan_gsync() at elan4/groupGsync.c:149
-    elan_gsyncShm() at elan4/groupGsync.c:464
-      elan_gsyncNet() at elan4/groupGsync.c:99
-        elan_gsyncGeneric() at elan4/groupGsync.c:76
-          _elan_aeventWait() at elan4/devent.c:361
-            elan_deviceCheck() at common/progress.c:847
+main() at IMB.c:276
+  PMPI_Barrier() at ?:?
+    ompi_coll_tuned_barrier_intra_dec_fixed() at ?:?
+      ompi_coll_tuned_barrier_intra_recursivedoubling() at ?:?
+        ompi_coll_tuned_sendrecv_actual() at ?:?
+          ompi_request_default_wait_all() at ?:?
+            opal_progress() at ?:?
  
From codesite-noreply at google.com Sat Aug 15 14:10:47 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sat, 15 Aug 2009 13:10:47 +0000 Subject: [padb-devel] [padb] r119 committed - Don't put that it's done outside of Quadrics, most people should know ... Message-ID: <000e0cd229d8a3f0cf04712de5f9@google.com> Revision: 119 Author: apittman Date: Sat Aug 15 06:07:02 2009 Log: Don't put that it's done outside of Quadrics, most people should know that by now. http://code.google.com/p/padb/source/detail?r=119 Modified: /trunk/doc/index.html ======================================= --- /trunk/doc/index.html Mon Jun 29 13:54:04 2009 +++ /trunk/doc/index.html Sat Aug 15 06:07:02 2009 @@ -10,7 +10,7 @@ for use by programmers and system administrators alike.

-Padb is currently maintained outside of Quadrics by +Padb is developed and maintained by Ashley Pittman.

News

From codesite-noreply at google.com Sat Aug 15 14:14:47 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sat, 15 Aug 2009 13:14:47 +0000 Subject: [padb-devel] [padb] r120 committed - Also strip above opal_progress, opal_condition_wait is optimised... Message-ID: <000e0cd148befbdb3a04712df35b@google.com> Revision: 120 Author: apittman Date: Sat Aug 15 06:09:02 2009 Log: Also strip above opal_progress, opal_condition_wait is optimised out in some builds. http://code.google.com/p/padb/source/detail?r=120 Modified: /branches/full-duplex/src/padb ======================================= --- /branches/full-duplex/src/padb Tue Aug 4 11:20:27 2009 +++ /branches/full-duplex/src/padb Sat Aug 15 06:09:02 2009 @@ -7074,7 +7074,7 @@ "stack-shows-locals" => 0, "gdb-retry-count" => 3, "stack-strip-above" => -"elan_waitWord,elan_pollWord,elan_deviceCheck,opal_condition_wait", +"elan_waitWord,elan_pollWord,elan_deviceCheck,opal_condition_wait,opal_progress", "stack-strip-below" => "main", }, 'secondary' => [ From codesite-noreply at google.com Sat Aug 15 14:37:22 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sat, 15 Aug 2009 13:37:22 +0000 Subject: [padb-devel] [padb] r121 committed - Check for the existance of a variable type before reading it, this avo... Message-ID: <0016e64b9a1cc2d8e004712e4463@google.com> Revision: 121 Author: apittman Date: Sat Aug 15 06:37:06 2009 Log: Check for the existance of a variable type before reading it, this avoids crrashes in IMB when run with -Ostack-shows-params=1 because gdb can't always read argc and argv http://code.google.com/p/padb/source/detail?r=121 Modified: /branches/full-duplex/src/padb ======================================= --- /branches/full-duplex/src/padb Sat Aug 15 06:09:02 2009 +++ /branches/full-duplex/src/padb Sat Aug 15 06:37:06 2009 @@ -5790,6 +5790,7 @@ my %l; $l{t} = 0; $l{n} = 0; + return unless defined ( $frame->{$type} ); return if ( @{ $frame->{$type} } == 0 ); foreach my $arg ( @{ $frame->{$type} } ) { $l{t} = length( $arg->{type} ) if ( length( $arg->{type} ) > $l{t} ); From codesite-noreply at google.com Tue Aug 18 22:14:01 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 18 Aug 2009 21:14:01 +0000 Subject: [padb-devel] [padb] r122 committed - Add a --ping option to allow timing of the comms code within... Message-ID: <001636e1fc686475f0047170ff4b@google.com> Revision: 122 Author: apittman Date: Tue Aug 18 14:13:05 2009 Log: Add a --ping option to allow timing of the comms code within padb, you can now run the following command to measure timings (with second accuracy!) padb --debug=verbose --ping -a -c http://code.google.com/p/padb/source/detail?r=122 Modified: /branches/full-duplex/src/padb ======================================= --- /branches/full-duplex/src/padb Sat Aug 15 06:37:06 2009 +++ /branches/full-duplex/src/padb Tue Aug 18 14:13:05 2009 @@ -5790,7 +5790,7 @@ my %l; $l{t} = 0; $l{n} = 0; - return unless defined ( $frame->{$type} ); + return unless defined( $frame->{$type} ); return if ( @{ $frame->{$type} } == 0 ); foreach my $arg ( @{ $frame->{$type} } ) { $l{t} = length( $arg->{type} ) if ( length( $arg->{type} ) > $l{t} ); @@ -5933,6 +5933,12 @@ kill( $signal, $pid ); return; } + +sub ping_rank { + my ( $cargs, $vp, $pid ) = @_; + output( $vp, "ACK" ); + return; +} sub show_queue { my ( $carg, $vp, $pid ) = @_; @@ -7112,6 +7118,12 @@ } }; + $allfns{ping} = { + 'handler' => \&ping_rank, + 'arg_long' => 'ping', + 'help' => "Internal ping", + }; + $allfns{set_debug} = { 'handler' => \&set_debug, 'qsnet' => 1, From codesite-noreply at google.com Wed Aug 19 15:29:27 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Wed, 19 Aug 2009 14:29:27 +0000 Subject: [padb-devel] [padb] r123 committed - Get slurm support working again on the branch, basically query squeue ... Message-ID: <000e0cd290ce5fe1c704717f76f8@google.com> Revision: 123 Author: apittman Date: Wed Aug 19 07:28:24 2009 Log: Get slurm support working again on the branch, basically query squeue for a host list when setting up the job to know how many hosts to expect signons from. http://code.google.com/p/padb/source/detail?r=123 Modified: /branches/full-duplex/src/padb ======================================= --- /branches/full-duplex/src/padb Tue Aug 18 14:13:05 2009 +++ /branches/full-duplex/src/padb Wed Aug 19 07:28:24 2009 @@ -319,11 +319,12 @@ }; $rmgr{"slurm"} = { - 'is_installed' => \&slurm_is_installed, - 'get_active_jobs' => \&slurm_get_jobs, - 'job_is_running' => \&slurm_job_is_running, - 'setup_pcmd' => \&slurm_setup_pcmd, - 'find_pids' => \&slurm_find_pids, + 'is_installed' => \&slurm_is_installed, + 'get_active_jobs' => \&slurm_get_jobs, + 'job_is_running' => \&slurm_job_is_running, + 'setup_pcmd' => \&slurm_setup_pcmd, + 'find_pids' => \&slurm_find_pids, + 'require_inner_callback' => 1, }; $rmgr{"local"} = { @@ -397,6 +398,8 @@ $conf{"prun-exittimeout"} = 120; $conf{"rmgr"} = "auto"; +$conf{"slurm-job-step"} = 0; + # These settings are passed onto inner only. $conf{"edbopt"} = ""; @@ -604,6 +607,7 @@ $debugModes{"tree"} = undef; $debugModes{"verbose"} = undef; $debugModes{"signon"} = undef; +$debugModes{"rmgr"} = undef; sub parse_args_outer { @@ -2206,6 +2210,21 @@ return $tasks if $have_tasks; return undef; } + +sub slurm_job_to_nodelist { + my $job = shift; + my @steps = `squeue -s -o "%i %N" 2>/dev/null`; + return undef if ( $? != 0 ); + + my $hosts; + my $s = "$job." . $conf{"slurm-job-step"}; + foreach my $step (@steps) { + my ( $left, $right ) = split( " ", $step ); + $hosts = $right if ( $left eq $s ); + + } + return $hosts; +} sub slurm_job_is_running { my $job = shift; @@ -2215,9 +2234,11 @@ } sub slurm_setup_pcmd { - my $job = shift; - my $cpus = slurm_job_to_ncpus($job); - return ( "srun --jobid=$job", $cpus ); + my $job = shift; + my $cpus = slurm_job_to_ncpus($job); + my @nodes = slurm_job_to_nodelist($job); + my $nc = $#nodes + 1; + return ( "srun --jobid=$job", $cpus, $nc ); } ############################################################################### @@ -2720,6 +2741,7 @@ sub get_all_jobids { my $user = shift; + debug_log( "rmgr", undef, "Loading active jobs list", undef ); return $rmgr{ $conf{"rmgr"} }{get_active_jobs}($user); } @@ -3683,6 +3705,7 @@ # Check that the file is mode 100600 (Octal) if ( $mode != 33152 ) { printf("Wrong permissions on secret file, should be 0600 ($file)\n"); + exit(1); } open( SFD, $file ) or return; From codesite-noreply at google.com Wed Aug 19 21:16:07 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Wed, 19 Aug 2009 20:16:07 +0000 Subject: [padb-devel] [padb] r124 committed - Tidy up the slurm interface code, query host count for the job... Message-ID: <000e0cd29c5c2527a20471844ecc@google.com> Revision: 124 Author: apittman Date: Wed Aug 19 13:15:00 2009 Log: Tidy up the slurm interface code, query host count for the job rather than the job step which is probably correct as shadow jobs only take a job. Make the code run cleanly with warnings enabled. http://code.google.com/p/padb/source/detail?r=124 Modified: /branches/full-duplex/src/padb ======================================= --- /branches/full-duplex/src/padb Wed Aug 19 07:28:24 2009 +++ /branches/full-duplex/src/padb Wed Aug 19 13:15:00 2009 @@ -2193,15 +2193,20 @@ return @res; } +# Query the process count for the "step" as that's how many +# processes we are going to be looking for. sub slurm_job_to_ncpus { my $job = shift; - my @steps = `squeue -s -o "%i %A" 2>/dev/null`; + my $s = "$job." . $conf{"slurm-job-step"}; + my @steps = `squeue -s $s -o "%i %A" 2>/dev/null`; return undef if ( $? != 0 ); -# The %A option is new so ensure we have the TASKS output before we believe what we see here... + # The %A option is new so ensure we have the TASKS output + # before we believe what we see here... + # Mind you %A is several years old now so if it's not there + # we probably can't do anything anyway. my $tasks; my $have_tasks = 0; - my $s = "$job." . $conf{"slurm-job-step"}; foreach my $step (@steps) { my ( $step, $cpus ) = split( " ", $step ); $tasks = $cpus if ( $step eq $s ); @@ -2211,19 +2216,35 @@ return undef; } +# Query the nodecount for the "job" as that is what we shall be running on. +sub slurm_job_to_nodecount { + my $job = shift; + my @jobs = `squeue -o "%i %D" 2>/dev/null`; + return undef if ( $? != 0 ); + + foreach my $step (@jobs) { + my ( $left, $right ) = split( " ", $step ); + return $right if ( $left eq $job ); + } + return undef; +} + +# Query the node list for the "step" which isn't the same as the node list +# for the job, care should be taken if using this function to ensure this +# is correct. +# This functions isn't used currently. sub slurm_job_to_nodelist { my $job = shift; - my @steps = `squeue -s -o "%i %N" 2>/dev/null`; + my $s = "$job." . $conf{"slurm-job-step"}; + my @steps = `squeue -s $s -o "%i %N" 2>/dev/null`; return undef if ( $? != 0 ); - my $hosts; - my $s = "$job." . $conf{"slurm-job-step"}; foreach my $step (@steps) { my ( $left, $right ) = split( " ", $step ); - $hosts = $right if ( $left eq $s ); + return $right if ( $left eq $s ); } - return $hosts; + return undef; } sub slurm_job_is_running { @@ -2234,10 +2255,9 @@ } sub slurm_setup_pcmd { - my $job = shift; - my $cpus = slurm_job_to_ncpus($job); - my @nodes = slurm_job_to_nodelist($job); - my $nc = $#nodes + 1; + my $job = shift; + my $cpus = slurm_job_to_ncpus($job); + my $nc = slurm_job_to_nodecount($job); return ( "srun --jobid=$job", $cpus, $nc ); } @@ -6318,7 +6338,7 @@ foreach my $proc (@procs) { my ( $pid, $job, $step, $local, $global ) = split( " ", $proc ); next if ( $global eq "-" ); - next unless ( $job == $jobid ); + next unless ( $job eq $jobid ); next unless ( $step == $confInner{"slurm-job-step"} ); $gids{$pid} = $global; } @@ -6380,16 +6400,19 @@ if (%gids) { $vp = vp_from_pid( \%gids, \%resmgr, $pid ); - debug $vp, "Found $vp from $pid using scontrol listpids"; + + # debug $vp, "Found $vp from $pid using scontrol listpids"; } if ( not defined $vp ) { my %env = get_remote_env($pid); - debug undef, + if (0) { + debug undef, "Checking slurm pid: $pid, job $env{SLURM_JOBID}, step $env{SLURM_STEPID}, proc $env{SLURM_PROCID}, script $script"; - debug undef, + debug undef, "Checking rms pid: $pid, job $env{RMS_JOBID}, proc $env{RMS_PROCID}, script $script"; + } if ( $env{SLURM_JOBID} eq $jobid ) { if ( $env{SLURM_STEPID} eq $confInner{"slurm-job-step"} ) { @@ -6938,6 +6961,7 @@ $confInner{"edb"} = find_edb(); $confInner{"minfo"} = find_minfo(); $confInner{"hostname"} = hostname(); + $confInner{"scripts"} = "bash,sh,dash,ash,perl,xterm"; # Local vars to help with command line parsing my $outerloc; From codesite-noreply at google.com Wed Aug 19 22:56:28 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Wed, 19 Aug 2009 21:56:28 +0000 Subject: [padb-devel] [padb] r125 committed - Merged full-duplex changes r55:r124 into the trunk Message-ID: <000e0cd2bd640360a9047185b5a2@google.com> Revision: 125 Author: apittman Date: Wed Aug 19 14:56:10 2009 Log: Merged full-duplex changes r55:r124 into the trunk http://code.google.com/p/padb/source/detail?r=125 Modified: /trunk/src /trunk/src/padb ======================================= --- /trunk/src/padb Thu Jun 18 01:29:09 2009 +++ /trunk/src/padb Wed Aug 19 14:56:10 2009 @@ -1,6 +1,6 @@ -#!/usr/bin/perl - -# padb. a simple parallel debugging aid from Quadrics. +#!/usr/bin/perl -w + +# padb. a simple parallel debugging aid. # For help and support visit http://padb.pittman.org.uk # or email padb-users at pittman.org.uk @@ -23,6 +23,19 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # Revision history +# Version 3.0 +# * Full-duplex communication between inner and outer processes, padb +# no longer simply sends request on the command line and processes +# the response but is truly interactive between the inner and outer +# processes. This avails greater flexibility in what can be achieved +# and hopefully helps with scalability as well. +# * Enabled warnings (-w) by default. Fixed lots of warnings, mostly +# about comparing with undef +# * Much more complete separation into "modes" of operation, most options +# are now mode specific rather than simply using global variables or +# global conf options. +# * Overhaul of the allfns (mode) callbacks and in particular their parameters +# * Performance improvements. # # Version 2.5 # * First Non-Quadrics version @@ -205,6 +218,8 @@ use File::Temp qw(tempfile); use MIME::Base64; use Config; +use IO::Socket; +use IO::Select; ############################################################################### # @@ -237,10 +252,12 @@ # Main. my $prog = basename $0; -my $version = "2.5"; +my $version = "3.0-beta"; my %conf; +my $secret; + # Config options the inner knows about, only forward options if they are in this list. my @inner_conf = qw(edb edbopt minfo rmgr scripts slurm-job-step verbose); @@ -278,11 +295,12 @@ }; $rmgr{"mpd"} = { - 'is_installed' => \&mpd_is_installed, - 'get_active_jobs' => \&mpd_get_jobs, - 'setup_pcmd' => \&mpd_setup_pcmd, - 'cleanup_pcmd' => \&mpd_cleanup_pcmd, - 'find_pids' => \&mpd_find_pids, + 'is_installed' => \&mpd_is_installed, + 'get_active_jobs' => \&mpd_get_jobs, + 'setup_pcmd' => \&mpd_setup_pcmd, + 'cleanup_pcmd' => \&mpd_cleanup_pcmd, + 'find_pids' => \&mpd_find_pids, + 'require_inner_callback' => 1, }; $rmgr{"orte"} = { @@ -301,33 +319,37 @@ }; $rmgr{"slurm"} = { - 'is_installed' => \&slurm_is_installed, - 'get_active_jobs' => \&slurm_get_jobs, - 'job_is_running' => \&slurm_job_is_running, - 'setup_pcmd' => \&slurm_setup_pcmd, - 'find_pids' => \&slurm_find_pids, + 'is_installed' => \&slurm_is_installed, + 'get_active_jobs' => \&slurm_get_jobs, + 'job_is_running' => \&slurm_job_is_running, + 'setup_pcmd' => \&slurm_setup_pcmd, + 'find_pids' => \&slurm_find_pids, + 'require_inner_callback' => 1, }; $rmgr{"local"} = { - 'get_active_jobs' => \&local_get_jobs, - 'job_is_running' => \&local_job_is_running, - 'setup_pcmd' => \&local_setup_pcmd, - 'find_pids' => \&local_find_pids, + 'get_active_jobs' => \&local_get_jobs, + 'job_is_running' => \&local_job_is_running, + 'setup_pcmd' => \&local_setup_pcmd, + 'find_pids' => \&local_find_pids, + 'require_inner_callback' => 1, }; $rmgr{"local-qsnet"} = { - 'is_installed' => \&local_q_is_installed, - 'get_active_jobs' => \&local_q_get_jobs, - 'job_is_running' => \&local_job_is_running, - 'setup_pcmd' => \&local_setup_pcmd, - 'inner_rmgr' => "local", + 'is_installed' => \&local_q_is_installed, + 'get_active_jobs' => \&local_q_get_jobs, + 'job_is_running' => \&local_job_is_running, + 'setup_pcmd' => \&local_setup_pcmd, + 'inner_rmgr' => "local", + 'require_inner_callback' => 1, }; $rmgr{"local-fd"} = { - 'get_active_jobs' => \&local_fd_get_jobs, - 'job_is_running' => \&local_job_is_running, - 'setup_pcmd' => \&local_setup_pcmd, - 'inner_rmgr' => "local", + 'get_active_jobs' => \&local_fd_get_jobs, + 'job_is_running' => \&local_job_is_running, + 'setup_pcmd' => \&local_setup_pcmd, + 'inner_rmgr' => "local", + 'require_inner_callback' => 1, }; ############################################################################### @@ -344,50 +366,42 @@ my %allfns; -my $line_formatted = 0; -my $countoutput = 0; - my %cinner; # Config options to be passed to inner. my $rem_jobid; # Debug options. -$conf{"verbose"} = 0; -$conf{"tree-verbose"} = 0; -$conf{"dump-raw"} = 0; -$conf{"showcmd"} = 0; +$conf{"verbose"} = 0; +$conf{"dump-raw"} = 0; + +# Valid values are "none" "missing" or "all". Anything not recognised +# is treated as "all". +$conf{"check-signon"} = "all"; $conf{slurm_job_step} = "0"; # Output options. -$conf{"stats-sort-key"} = "vp"; -$conf{"proc-sort-key"} = "vp"; -$conf{"proc-show-header"} = 1; -$conf{"stats-reverse"} = 0; -$conf{"stats-short"} = 0; -$conf{"show-group-members"} = 0; -$conf{"show-all-stats"} = 0; -$conf{"show-all-groups"} = 0; $conf{"interval"} = 10; $conf{"watch-clears-screen"} = 1; -$conf{"stats-name"} = undef; -$conf{"stats-raw"} = 0; $conf{"scripts"} = "bash,sh,dash,ash,perl,xterm"; -$conf{"stack-strip-below"} = "main"; $conf{"lsf-job-offset"} = 1; $conf{"local-fd-name"} = "/dev/null"; -$conf{"stack-strip-above"} = - "elan_waitWord,elan_pollWord,elan_deviceCheck,opal_condition_wait"; - -# $conf{stack-format} = undef; +$conf{"inner-callback"} = 0; + +# These two are used by deadlock and QsNet group +# code, they need migrating in the group code +# when I have access to a test system again. +#$conf{"show-group-members"} = 0; +#$conf{"show-all-groups"} = 0; # Tuning options. $conf{"prun-timeout"} = 120; $conf{"prun-exittimeout"} = 120; $conf{"rmgr"} = "auto"; +$conf{"slurm-job-step"} = 0; + # These settings are passed onto inner only. -$conf{"edbopt"} = ""; -$conf{"mpi-dll"} = "auto"; +$conf{"edbopt"} = ""; $conf{"edb"} = find_edb(); $conf{"minfo"} = find_minfo(); @@ -419,8 +433,7 @@ sub show_version { printf("$prog version $version\n\n"); printf("Written by Ashley Pittman\n"); - - #ashley at quadrics.com + printf("http://padb.pittman.org.uk\n"); exit 0; } @@ -431,15 +444,13 @@ -a --all report on all running jobs for user. -A --any report on a running job for user. --u --user=USER report on jobs for username=. - --r --rank=RANK report only on processes . - --group-id=ID report only on group . - --s --statistics Show the job-wide statistics. --g --group Show the state of collective operations (groups). +-u --user= report on jobs for username=. + +-r --rank= report only on processes . + --group-id= report only on group . + XXXX - --full-report=JOBID All of the above. + --full-report= Generate a full report of job state. --nostrip-below-main Don\'t strip stack traces below main. --nostrip-above-wait Don\'t strip stack traces about elan_waitWord. @@ -468,7 +479,6 @@ prun-timeout Timeout to use when launching parallel job. Stack trace options: - tree-verbose turn on debugging for the stack trace tree generation code. gdb-retry-count Number of times to try getting a 'good' stack trace from gdb. stack-show-params Show function parameters in stack traces. stack-show-locals Show locals in stack traces. @@ -495,8 +505,10 @@ chomp $usage; my $extra = ""; + $extra .= "Modes of operation\n"; foreach my $arg ( sort( keys %allfns ) ) { next unless ( defined $allfns{$arg}{help} ); + next if ( defined $allfns{$arg}{qsnet} ); if ( defined $allfns{$arg}{arg_short} ) { $extra .= "-$allfns{$arg}{arg_short}"; } else { @@ -506,6 +518,20 @@ $allfns{$arg}{arg_long}, $allfns{$arg}{help} ); } + + $extra .= "\nQsNet specific modes\n"; + foreach my $arg ( sort( keys %allfns ) ) { + next unless ( defined $allfns{$arg}{help} ); + next unless ( defined $allfns{$arg}{qsnet} ); + if ( defined $allfns{$arg}{arg_short} ) { + $extra .= "-$allfns{$arg}{arg_short}"; + } else { + $extra .= " "; + } + $extra .= sprintf( " --%-18s%s.\n", + $allfns{$arg}{arg_long}, + $allfns{$arg}{help} ); + } $usage =~ s!XXXX!$extra!; @@ -530,9 +556,6 @@ # Number of functions provided on the command line from the allfns hash. my $have_allfns_option = 0; -my $stats_total; -my $group; - my $full_report; my $core_stack; my $list_rmgrs; @@ -543,17 +566,11 @@ my $core_name; my $exe_name; -my $proc_format = - "vp=vpid,hostname,pid,vmsize,vmrss,stat.state=S,pcpu=%cpu,name=command"; - my $input_file; my $compress; my $compress_C; my $tree; -my $strip_below_main = 1; -my $strip_above_wait = 1; - my @config_options; my %ic_names; @@ -561,40 +578,67 @@ # can access secondary comamnd line argunments by name. my %secondary_args; +# Debugging: this function is called periodically with +# a mode, an abritary ref and a string, it can either print simply +# the string or call dumper on the ref as well. +# Enable with --debug=type1,type2=all +my %debugModes; +my $start_time = time(); + +sub debug_log { + my ( $type, $handle, $str, @params ) = @_; + if ( not exists $debugModes{$type} ) { + printf("Unknown debug mode: $type\n"); + exit(1); + } + return unless $debugModes{$type}; + my $time = time() - $start_time; + printf( "DEBUG ($type): %3d: $str\n", $time, @params ); + return if $debugModes{$type} eq "basic"; + return unless defined $handle; + print Dumper $handle; +} + +# Valid debug modes, a full list is maintained here so using unexpected +# ones can generate warnings. +$debugModes{"full-duplex"} = undef; +$debugModes{"show-cmd"} = undef; +$debugModes{"all"} = undef; +$debugModes{"tree"} = undef; +$debugModes{"verbose"} = undef; +$debugModes{"signon"} = undef; +$debugModes{"rmgr"} = undef; + sub parse_args_outer { Getopt::Long::Configure("bundling"); - my $mode; + my $debugflag; my %optionhash = ( - "verbose|v+" => \$conf{verbose}, - "user|u=s" => \$user, - "rank|r=i" => \@ranks, - "group-id=i" => \@target_groups, - "help|h" => \&usage, - "all|a" => \$all, - "any|A" => \$any, - "statistics-total|stat|sta|st|s" => \$stats_total, - "version|V" => \&show_version, - "compress|c" => \$compress, - "compress-long|C" => \$compress_C, - "group|g" => \$group, - "tree|t" => \$tree, - "input-file|file|i=s" => \$input_file, - "config-option|O=s" => \@config_options, - "full-report=s" => \$full_report, - "core-stack" => \$core_stack, - "core=s" => \$core_name, - "exe=s" => \$exe_name, - "list-rmgrs" => \$list_rmgrs, - "strip-below-main!" => \$strip_below_main, - "strip-above-wait!" => \$strip_above_wait, - "watch!" => \$watch, - "local-stats" => \$local_stats, - "proc-format=s" => \$proc_format, - "show-jobs" => \$show_jobs, - "norc" => \$norc, - "config-file=s" => \$configfile + "verbose|v+" => \$conf{verbose}, + "user|u=s" => \$user, + "rank|r=i" => \@ranks, + "group-id=s" => \@target_groups, + "help|h" => \&usage, + "all|a" => \$all, + "any|A" => \$any, + "version|V" => \&show_version, + "compress|c" => \$compress, + "compress-long|C" => \$compress_C, + "tree|t" => \$tree, + "input-file|file|i=s" => \$input_file, + "config-option|O=s" => \@config_options, + "full-report=s" => \$full_report, + "core-stack" => \$core_stack, + "core=s" => \$core_name, + "exe=s" => \$exe_name, + "list-rmgrs" => \$list_rmgrs, + "watch" => \$watch, + "local-stats" => \$local_stats, + "show-jobs" => \$show_jobs, + "norc" => \$norc, + "config-file=s" => \$configfile, + "debug=s" => \$debugflag, ); my %config_hash; @@ -608,14 +652,32 @@ } if ( defined $allfns{$arg}{options_i} ) { foreach my $o ( keys( %{ $allfns{$arg}{options_i} } ) ) { - $conf{$o} = $allfns{$arg}{options_i}{$o}; - $ic_names{$o}++; + $conf{mode_options}{$arg}{$o} = $allfns{$arg}{options_i}{$o}; + $conf{mode_options_reverse}{$o}{$arg} = 1; } } } GetOptions(%optionhash) or exit(1); + if ( defined $debugflag ) { + foreach my $f ( split( ",", $debugflag ) ) { + my ( $name, $v ) = split( "=", $f ); + if ( exists $debugModes{$name} ) { + $debugModes{$name} = defined($v) ? $v : "basic"; + } else { + printf("Attempt to set unknown debug flag \"$name\".\n"); + } + } + if ( $debugModes{all} ) { + foreach my $mode ( keys(%debugModes) ) { + $debugModes{$mode} = $debugModes{all}; + } + } + } + + my $mode; + foreach my $arg ( keys %config_hash ) { next unless defined $config_hash{$arg}; $mode = $arg; @@ -623,7 +685,7 @@ } # Put the args in a hash so that they can be referenced by name. - if ( defined $allfns{$mode}{secondary} ) { + if ( defined $mode and defined $allfns{$mode}{secondary} ) { foreach my $sec ( @{ $allfns{$mode}{secondary} } ) { $secondary_args{ $sec->{arg_long} } = $sec->{value}; } @@ -1417,6 +1479,9 @@ print("QsNet Statistics not valid\n"); return; } + + my $stats_total = 0; + my $group = 0; if ($stats_total) { @@ -1500,11 +1565,11 @@ my %ad; - my @tg; + my %tg; if ( $#target_groups != -1 ) { foreach my $gid (@target_groups) { - $tg[$gid]++; + $tg{$gid}++; } } @@ -1515,7 +1580,7 @@ foreach my $gid ( keys %{ $dataset->{'subsystems'}{'Group'} } ) { if ( $#target_groups != -1 ) { - next unless defined $tg[$gid]; + next unless defined $tg{$gid}; } my $str; @@ -1566,7 +1631,7 @@ foreach my $gid ( sort { $a <=> $b } keys %ad ) { if ( $#target_groups != -1 ) { - next unless defined $tg[$gid]; + next unless defined $tg{$gid}; } my $gstr = "Information for group '$gid'\n"; @@ -1734,7 +1799,7 @@ my $s = read_stats(@data); - $stats_total = 1; + # $stats_total = 1; show_stats($s); } @@ -1792,8 +1857,7 @@ my $prev; my $tag = $tags[0]; - printf("called tag:$tag, level:$level tags:@tags\n") - if $conf{"tree-verbose"}; + debug_log( "tree", undef, "called tag:$tag, level:$level tags:@tags" ); return if ( !defined($tag) ); return if ( !defined( $lines->{$tag} ) ); @@ -1852,9 +1916,11 @@ $endlevel = ( $#{ $lines->{$tag} } ); } - printf( -"level $level, endlevel $endlevel, identical:@identical different:@different\n", - ) if $conf{"tree-verbose"}; + debug_log( + "tree", undef, + +"level $level, endlevel $endlevel, identical:@identical different:@different" + ); for ( my $l = $level ; $l <= $endlevel ; $l++ ) { @@ -1882,9 +1948,11 @@ $prev->{children} = go_p( $endlevel + 1, $lines, @identical ); } - printf( -"returning level:$level endlevel:$endlevel identical:@identical different:@different\n" - ) if $conf{"tree-verbose"}; + debug_log( + "tree", undef, + +"returning level:$level endlevel:$endlevel identical:@identical different:@different" + ); if (@different) { my $new = go_p( $level, $lines, @different ); @@ -1926,6 +1994,7 @@ sub show_tree { my $ref = shift; + debug_log( "tree", $ref, "Complete tree" ); return _show_tree( $ref, undef, "" ); } @@ -2007,7 +2076,7 @@ } sub rms_is_installed { - return find_exe("prun"); + return ( find_exe("prun") and find_exe("rmsquery") ); } sub rms_get_jobs { @@ -2114,7 +2183,7 @@ ############################################################################### sub slurm_is_installed { - return find_exe("srun"); + return ( find_exe("srun") and find_exe("squeue") and find_exe("scontrol") ); } sub slurm_get_jobs { @@ -2124,15 +2193,20 @@ return @res; } +# Query the process count for the "step" as that's how many +# processes we are going to be looking for. sub slurm_job_to_ncpus { my $job = shift; - my @steps = `squeue -s -o "%i %A" 2>/dev/null`; + my $s = "$job." . $conf{"slurm-job-step"}; + my @steps = `squeue -s $s -o "%i %A" 2>/dev/null`; return undef if ( $? != 0 ); -# The %A option is new so ensure we have the TASKS output before we believe what we see here... + # The %A option is new so ensure we have the TASKS output + # before we believe what we see here... + # Mind you %A is several years old now so if it's not there + # we probably can't do anything anyway. my $tasks; my $have_tasks = 0; - my $s = "$job." . $conf{"slurm-job-step"}; foreach my $step (@steps) { my ( $step, $cpus ) = split( " ", $step ); $tasks = $cpus if ( $step eq $s ); @@ -2141,6 +2215,37 @@ return $tasks if $have_tasks; return undef; } + +# Query the nodecount for the "job" as that is what we shall be running on. +sub slurm_job_to_nodecount { + my $job = shift; + my @jobs = `squeue -o "%i %D" 2>/dev/null`; + return undef if ( $? != 0 ); + + foreach my $step (@jobs) { + my ( $left, $right ) = split( " ", $step ); + return $right if ( $left eq $job ); + } + return undef; +} + +# Query the node list for the "step" which isn't the same as the node list +# for the job, care should be taken if using this function to ensure this +# is correct. +# This functions isn't used currently. +sub slurm_job_to_nodelist { + my $job = shift; + my $s = "$job." . $conf{"slurm-job-step"}; + my @steps = `squeue -s $s -o "%i %N" 2>/dev/null`; + return undef if ( $? != 0 ); + + foreach my $step (@steps) { + my ( $left, $right ) = split( " ", $step ); + return $right if ( $left eq $s ); + + } + return undef; +} sub slurm_job_is_running { my $job = shift; @@ -2152,7 +2257,8 @@ sub slurm_setup_pcmd { my $job = shift; my $cpus = slurm_job_to_ncpus($job); - return ( "srun --jobid=$job", $cpus ); + my $nc = slurm_job_to_nodecount($job); + return ( "srun --jobid=$job", $cpus, $nc ); } ############################################################################### @@ -2242,7 +2348,7 @@ } sub local_setup_pcmd { - return ( "", undef ); + return ( "", 1, 1 ); } ############################################################################### @@ -2252,7 +2358,7 @@ ############################################################################### sub mpd_is_installed { - return find_exe("mpdlistjobs"); + return ( find_exe("mpdlistjobs") and find_exe("mpdrun") ); } sub mpd_get_data { @@ -2318,9 +2424,11 @@ $mpd_dfile = $fn; - my $cmd = "mpirun -machinefile $fn -np $i"; - - return ( $cmd, undef ); + my $cmd = "mpdrun -machinefile $fn -np $i"; + + my $hosts = $#hosts + 1; + + return ( $cmd, undef, $hosts ); } sub mpd_cleanup_pcmd { @@ -2346,31 +2454,24 @@ } sub open_is_installed { - return find_exe("ompi-ps"); + return ( find_exe("ompi-ps") and find_exe("orterun") ); } my %open_jobs; sub open_get_data { - my ($filename) = @_; # Simply return if called more than once. if ( keys(%open_jobs) != 0 ) { return; } - my $hostname = hostname(); + my $job; - my @out; - if ( defined $filename ) { - open( OPEN, $filename ) or return; - @out = ; - close OPEN; - } else { - open( OPEN, "ompi-ps|" ) or return; - @out = ; - close OPEN; - } + + open( OPEN, "ompi-ps|" ) or return; + my @out = ; + close OPEN; # Handle being called multiple times, zero the hash every # time we are called. Of course we could just return the @@ -2387,7 +2488,11 @@ } else { my @elems = split( /\|/, $l ); - if ( $#elems == 6 ) { + if ( $#elems == 4 ) { + my $nprocs = $elems[3]; + $nprocs =~ s/ //g; + $open_jobs{$job}{nprocs} = $nprocs; + } elsif ( $#elems == 6 ) { my $host = $elems[4]; $host =~ s/ //g; @@ -2395,24 +2500,22 @@ next if $host eq "Node"; $open_jobs{$job}{hosts}{$host}++; - if ( $host eq $hostname ) { - my $name = $elems[1]; - $name =~ /\[\[(\d+)\,(\d+)\]\,(\d+)\]/; - my $rank = $3; - - my $pid = $elems[3]; - $rank =~ s/ //g; - $pid =~ s/ //g; - $open_jobs{$job}{ranks}{$host}{$rank} = $pid; - } + my $name = $elems[1]; + $name =~ /\[\[(\d+)\,(\d+)\]\,(\d+)\]/; + my $rank = $3; + + my $pid = $elems[3]; + $rank =~ s/ //g; + $pid =~ s/ //g; + $open_jobs{$job}{ranks}{$host}{$rank} = $pid; } } } - if ( $conf{"verbose"} ) { - print Dumper \%open_jobs; - } + #if ( $conf{"verbose"} ) { + #print Dumper \%open_jobs; + #} } sub open_get_jobs { @@ -2423,24 +2526,12 @@ } my $open_dfile; -my $open_tfile; sub open_setup_pcmd { my $job = shift; open_get_data(); - my ( $th, $tn ) = tempfile(".padb.XXXX"); - - open( my $oh, "ompi-ps|" ); - while (<$oh>) { - print $th $_; - } - close $th; - $cinner{"open-ps"} = $tn; - - $open_tfile = $tn; - my @hosts = keys %{ $open_jobs{$job}{hosts} }; my $i = @hosts; @@ -2454,14 +2545,14 @@ $open_dfile = $fn; my $prefix = find_ompi_prefix(); - my $cmd = "mpirun -machinefile $fn -np $i $prefix"; - - return ( $cmd, undef ); + my $cmd = "orterun -machinefile $fn -np $i $prefix"; + my $hosts = $#hosts + 1; + + return ( $cmd, $open_jobs{$job}{nprocs}, $hosts ); } sub open_cleanup_pcmd { unlink($open_dfile) if ( defined($open_dfile) ); - unlink($open_tfile) if ( defined($open_tfile) ); } ############################################################################### @@ -2474,9 +2565,7 @@ # Check for both LSF and RMS, I know LSF works in other ways but I don't # know how to launch jobs then... - my $rms = find_exe("rinfo"); - return 0 unless $rms; - return find_exe("bjobs"); + return ( find_exe("bjobs") and rms_is_installed() ); } sub lsf_get_jobs { @@ -2553,9 +2642,16 @@ sub find_rmgr { # If it's been set on the command line and it's valid then just use what we are given. -# Don't do any sanity checks here to cope with non-default installs. +# Do sanity checks here but only warn on the result to cope with non-default installs. if ( defined $rmgr{ $conf{"rmgr"} } ) { + if ( defined $rmgr{ $conf{"rmgr"} }{is_installed} + and not $rmgr{ $conf{"rmgr"} }{is_installed}() ) + { + printf( +"Warning: Selected resource manager $conf{rmgr} does not appear to be installed\n" + ); + } setup_rmgr( $conf{"rmgr"} ); return; } @@ -2593,9 +2689,16 @@ sub find_any_rmgr { # If it's been set on the command line and it's valid then just use what we are given. -# Don't do any sanity checks here to cope with non-default installs. +# Do sanity checks here but only warn on the result to cope with non-default installs. if ( defined $rmgr{ $conf{"rmgr"} } ) { + if ( defined $rmgr{ $conf{"rmgr"} }{is_installed} + and not $rmgr{ $conf{"rmgr"} }{is_installed}() ) + { + printf( +"Warning: Selected resource manager $conf{rmgr} does not appear to be installed\n" + ); + } setup_rmgr( $conf{"rmgr"} ); return; } @@ -2658,6 +2761,7 @@ sub get_all_jobids { my $user = shift; + debug_log( "rmgr", undef, "Loading active jobs list", undef ); return $rmgr{ $conf{"rmgr"} }{get_active_jobs}($user); } @@ -2703,13 +2807,15 @@ ############################################################################### sub strip_stack_traces { - my $lines = shift; + my ( $cargs, $lines ) = @_; my %above; my %below; - map { $above{$_}++ } split( ",", $conf{"stack-strip-above"} ); - map { $below{$_}++ } split( ",", $conf{"stack-strip-below"} ); + map { $above{$_}++ } + split( ",", $conf{mode_options}{stack}{"stack-strip-above"} ); + map { $below{$_}++ } + split( ",", $conf{mode_options}{stack}{"stack-strip-below"} ); foreach my $tag ( keys %$lines ) { @@ -2734,10 +2840,11 @@ $main_idx = 0 if not defined $main_idx; if ( $main_idx != 0 or $wait_idx != 0 ) { my $end = - ( $strip_above_wait and $wait_idx ) + ( $cargs->{"strip-above-wait"} and $wait_idx ) ? $wait_idx : $#{ $lines->{$tag} }; - my $start = ( $strip_below_main and $main_idx ) ? $main_idx : 0; + my $start = + ( $cargs->{"strip-below-main"} and $main_idx ) ? $main_idx : 0; printf( "Stripping 0.." . $#{ $lines->{$tag} } @@ -2752,15 +2859,18 @@ } sub sort_proc_hashes { - my $key = shift; - my @all = @_; - - #print Dumper $all; - return ( reverse( sort { $a->{$key} <=> $b->{$key} } @all ) ); + my $carg = shift; + my $key = shift; + my @all = @_; + + if ( $carg->{"reverse-sort-order"} ) { + return ( reverse( sort { $a->{$key} <=> $b->{$key} } @all ) ); + } else { + return ( sort { $a->{$key} <=> $b->{$key} } @all ); + } } sub pre_mpi_watch { - my ($cpus) = @_; my $header = < $b } ( keys %{ $lines->{raw} } ) ) { - $s .= $lines->{raw}{$l}{state}; + foreach my $l ( sort { $a <=> $b } ( keys %{ $lines->{target_responce} } ) ) + { + $s .= $lines->{target_responce}{$l}{state}; } print("$s\n"); } @@ -2788,20 +2896,21 @@ # Nicely format process information. # XXX: proc-sort-key should probably sort on column headers as # well as keys. +# Idealy we'd know what format we wanted and only ask the nodes +# to report relevent info, for now they still report everything. sub show_proc_format { - my ( $nlines, $mode, $handle ) = @_; - - my $lines = $nlines->{lines}; + my ( $carg, $nlines ) = @_; my @proc_format_array; my %proc_format_header; my $show_fields = 0; my %proc_format_lengths; - - my $separator = $conf{"column-seperator"}; - - my @columns = split( ",", $proc_format ); + my %proc_header_reverse; + + my $separator = $carg->{"column-seperator"}; + + my @columns = split( ",", $carg->{"proc-format"} ); foreach my $column (@columns) { $show_fields = 1 if ( $column eq "fields" ); @@ -2811,6 +2920,7 @@ push @proc_format_array, lc($name); $proc_format_header{ lc($name) } = $desc; $proc_format_lengths{ lc($name) } = length($desc); + $proc_header_reverse{ lc($desc) } = lc($name); } else { push @proc_format_array, lc($column); $proc_format_header{ lc($column) } = $column; @@ -2819,21 +2929,21 @@ } my @all; + my $lines = $nlines->{target_responce}; foreach my $tag ( sort ( keys %$lines ) ) { my %hash; $hash{vp} = $tag; - foreach my $data ( @{ $lines->{$tag} } ) { - if ( $data =~ /([\w\.]+)\:[ \t]*(.+)/ ) { - my $key = lc($1); - - next unless defined $proc_format_lengths{$key} or $show_fields; - - if ( length($2) > $proc_format_lengths{$key} ) { - $proc_format_lengths{$key} = length($2); - } - - $hash{$key} = $2; - } + foreach my $key ( keys( %{ $lines->{$tag} } ) ) { + + my $value = $lines->{$tag}{$key}; + next unless defined $proc_format_lengths{$key} or $show_fields; + + if ( length($value) > $proc_format_lengths{$key} ) { + $proc_format_lengths{$key} = length($value); + } + + $hash{$key} = $value; + } if ($show_fields) { my @fields = sort ( keys(%hash) ); @@ -2843,9 +2953,14 @@ push @all, \%hash; } - @all = sort_proc_hashes( $conf{"proc-sort-key"}, @all ); ***The diff for this file has been truncated for email.*** From codesite-noreply at google.com Thu Aug 20 10:18:28 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Thu, 20 Aug 2009 09:18:28 +0000 Subject: [padb-devel] [padb] r126 committed - Tidy up some debug_log calls to not do printf formatting... Message-ID: <001636b2add4055c4d04718f3cf2@google.com> Revision: 126 Author: apittman at gmail.com Date: Thu Aug 20 02:18:10 2009 Log: Tidy up some debug_log calls to not do printf formatting unless it's needed. http://code.google.com/p/padb/source/detail?r=126 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Wed Aug 19 14:56:10 2009 +++ /trunk/src/padb Thu Aug 20 02:18:10 2009 @@ -1857,7 +1857,8 @@ my $prev; my $tag = $tags[0]; - debug_log( "tree", undef, "called tag:$tag, level:$level tags:@tags" ); + debug_log( "tree", \@tags, "called tag:%s, level:%d", + $tag, $level ); return if ( !defined($tag) ); return if ( !defined( $lines->{$tag} ) ); @@ -2761,7 +2762,7 @@ sub get_all_jobids { my $user = shift; - debug_log( "rmgr", undef, "Loading active jobs list", undef ); + debug_log( "rmgr", undef, "Loading active jobs list" ); return $rmgr{ $conf{"rmgr"} }{get_active_jobs}($user); } @@ -3778,7 +3779,7 @@ $conf{"verbose"} && defined $hosts && print "Job spans $hosts host(s)\n"; debug_log( "verbose", undef, - "There are $ncpus processes over $hosts hosts" ); + "There are %d processes over %d hosts", $ncpus, $hosts ); # Maybe do it this way, edb works best when run with the same LD_LIBRARY_PATH # as the application. It's very important when running the message queue From codesite-noreply at google.com Thu Aug 20 10:22:29 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Thu, 20 Aug 2009 09:22:29 +0000 Subject: [padb-devel] [padb] r127 committed - Remove the unused get_rms_jobid() function. It was used... Message-ID: <00504502cb1e6526f404718f4a02@google.com> Revision: 127 Author: apittman at gmail.com Date: Thu Aug 20 02:20:54 2009 Log: Remove the unused get_rms_jobid() function. It was used to convert from slurm job id to a rms job id however is no longer called and I don't have access to such a system to test http://code.google.com/p/padb/source/detail?r=127 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Thu Aug 20 02:18:10 2009 +++ /trunk/src/padb Thu Aug 20 02:20:54 2009 @@ -5422,64 +5422,6 @@ close(PIDFILE); return @procs; } - -sub get_rms_jobid { - my $slurmid = shift; - my $rmsid = 0; - my $rmsdir = "/proc/rms/programs"; - - return undef unless ( -d $rmsdir ); - - # This is a bit odd and isn't well tested (I don't have access to slurm) - # We have been given the slurm ID on the command line and need to convert - # this to a RMS id (as the kernel module sees it). - # For each active RMS job on the node check if this pid translates to - # a slurm pid for the job we have; - - my %gids = slurm_get_ids($slurmid); - - opendir( DIR, $rmsdir ) or die "Unable to open $rmsdir: $!\n"; - my @ids = readdir(DIR); - closedir(DIR); - - for my $id (@ids) { - next unless ( $id =~ /^\d+$/ ); - - my @pids = load_rms_procs($id); - next unless @pids; - - my $self = $$; - my $is_self = 0; - foreach my $pid (@pids) { - if ( $self == $pid->{pid} ) { - $is_self = 1; - } - } - next if ($is_self); - my $pid = $pids[0]->{pid}; - - # Best way, if this pid is in the target slurm job. - return $id if defined $gids{$pid}; - - my %env = get_remote_env($pid); - - # Discard this RMS job if it's the wrong job-step. - next - if ( defined $env{SLURM_STEPID} - and $env{SLURM_STEPID} ne $confInner{"slurm-job-step"} ); - - # The prefered although not perfect way... - if ( $env{SLURM_JOBID} eq $slurmid ) { - return $id; - } - - # Seems to be a legacy option no longer used. - # `scontrol pid2jobid $pid 2>&1` =~ m/id (\d+) /; - # return $id if ( $1 == $slurmid ); - } - - return undef; -} sub show_task_file { my ( $vp, $file, $prefix ) = @_; From codesite-noreply at google.com Thu Aug 20 12:29:00 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Thu, 20 Aug 2009 11:29:00 +0000 Subject: [padb-devel] [padb] r128 committed - Clean up the slurm process finding code, it's now a very simple... Message-ID: <000e0cd1061ce524a80471910ec0@google.com> Revision: 128 Author: apittman Date: Thu Aug 20 04:27:55 2009 Log: Clean up the slurm process finding code, it's now a very simple wrapper around scontrol getpids. Keep all the complex code for walking the process tree but move it to a generic function which is now called for all resource managers. As such padb can now automatically detect "wrapper scripts" for parallel applications and skip over them onto any real application which is a child of them. http://code.google.com/p/padb/source/detail?r=128 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Thu Aug 20 02:20:54 2009 +++ /trunk/src/padb Thu Aug 20 04:27:55 2009 @@ -36,6 +36,12 @@ # global conf options. # * Overhaul of the allfns (mode) callbacks and in particular their parameters # * Performance improvements. +# * Simplify the slurm_find_pids() function to just return the output +# of scontrol listpids +# * Take the old process-tree walking code from slurm_find_pids() and make +# it independant and call it for all resource managers. This allows scripts +# which call parallel applications to be bypassed and the applications +# themselves targetted. # # Version 2.5 # * First Non-Quadrics version @@ -183,11 +189,6 @@ # * Multi-pass argument handling, --kill also accepts --signal for example, # this should really be done at the getopt layer. Also proper usage # info for these secondary args. -# * slurm_find_pids() has some good code in it for finding parallel processes -# this should be extrapolated out and so it can be used in the mpd case, -# ideally on non-rms systems (RMS rocks in this regard) the rmgr callback -# should return a list of spawned pids and the code in slurm_find_pids() should -# pass this tree to find the most interesting one. # * The mode {handler} functions should only be called once per node, it could then # correctly handle $confInner{gdb_file} and also attach to every process per node # simultaneously, this would help stack trace and message queue support as doing @@ -1857,8 +1858,7 @@ my $prev; my $tag = $tags[0]; - debug_log( "tree", \@tags, "called tag:%s, level:%d", - $tag, $level ); + debug_log( "tree", \@tags, "called tag:%s, level:%d", $tag, $level ); return if ( !defined($tag) ); return if ( !defined( $lines->{$tag} ) ); @@ -3778,8 +3778,8 @@ $conf{"verbose"} && defined $ncpus && print "Job has $ncpus processe(s)\n"; $conf{"verbose"} && defined $hosts && print "Job spans $hosts host(s)\n"; - debug_log( "verbose", undef, - "There are %d processes over %d hosts", $ncpus, $hosts ); + debug_log( "verbose", undef, "There are %d processes over %d hosts", + $ncpus, $hosts ); # Maybe do it this way, edb works best when run with the same LD_LIBRARY_PATH # as the application. It's very important when running the message queue @@ -6194,11 +6194,7 @@ debug( $vp, "maybe_show_pid vp $vp, pid: $pid" ); - my %d; - $d{vp} = $vp; - $d{pid} = $pid; - push( @{ $confInner{"all-pids"} }, \%d ); - + $confInner{rmpids}{$pid}{rank} = $vp; } sub find_from_status { @@ -6224,57 +6220,11 @@ return 1 if ( defined $mgrs->{$name} ); } -sub is_pid_script { - my $pid = shift; - my $exe = readlink("/proc/$pid/exe"); - my $cmd; - if ( defined $exe ) { - $cmd = basename($exe); - } else { - $cmd = find_from_status( $pid, "Name" ); - } - my %scripts; - map { $scripts{$_}++ } split( ",", $confInner{"scripts"} ); - return 1 if ( defined $scripts{$cmd} ); - return 0; -} - -sub is_desc_of_resmgr { - my $resmgrs = shift; - my $pid = shift; - - my $ppid = find_from_status( $pid, "PPid" ); - - while ( defined $ppid and $ppid != 1 ) { - return 1 if ( defined $resmgrs->{$ppid} ); - $ppid = find_from_status( $ppid, "PPid" ); - } - - return 0; -} - -sub vp_from_pid { - my $gids = shift; - my $resmgrs = shift; - my $pid = shift; - - return $gids->{$pid} if ( defined $gids->{$pid} ); - - my $ppid = find_from_status( $pid, "PPid" ); - - while ( defined $ppid and $ppid != 1 ) { - return $gids->{$ppid} if ( defined $gids->{$ppid} ); - return undef if ( defined $resmgrs->{$ppid} ); - $ppid = find_from_status( $ppid, "PPid" ); - } - return undef; -} - -sub slurm_get_ids { +# Report the pids as reported by slurm, don't worry about tracing children or +# anything at this stage. +sub slurm_find_pids { my $jobid = shift; - my %gids; - my @procs = `scontrol listpids $jobid.$confInner{"slurm-job-step"} 2>/dev/null`; return undef if ( $? != 0 ); @@ -6283,106 +6233,7 @@ next if ( $global eq "-" ); next unless ( $job eq $jobid ); next unless ( $step == $confInner{"slurm-job-step"} ); - $gids{$pid} = $global; - } - return %gids; -} - -# Do the right thing with slurm... -sub slurm_find_pids { - my $jobid = shift; - - # Slurm has the concept of a "job" and a "job step" which are - # roughly analogous to "resource" and "job" in RMS terms, - # the difference being that steps within a job are counted - # from 0 in slurm whereas there is a global job namespace in - # RMS. - # Therefore padb *has* to target slurm jobs as they have the only - # globally unique identifier. You can use - # -Oslurm-job-step= to target individual job steps within - # a job however. - - # Modern slurm systems have a scontol listpids option which we use however - # older systems require a little more legwork and aren't precise. - - # These are the key variables... - # SLURM_JOBID=1234 - # SLURM_STEPID=0 - # RMS_RESOURCE=1234 (Not needed) - # RMS_JOBID=5678 - - # SLURM_JOBID - # RMS_JOBID - - my %gids = slurm_get_ids($jobid); - - opendir( DIR, "/proc/" ); - my @pids = readdir(DIR); - closedir(DIR); - - my %resmgr; # All processes which are resource managers. - - foreach my $pid (@pids) { - next unless ( $pid =~ /^\d+$/ ); - if ( is_resmgr_process($pid) ) { - $resmgr{$pid} = find_from_status( $pid, "Name" ); - } - } - - my %pjobs; # All parallel jobs (children of resource managers).; - - foreach my $pid (@pids) { - next unless ( $pid =~ /^\d+$/ ); - - # Skip over this process unless it's spawned from a resource manager. - next unless is_desc_of_resmgr( \%resmgr, $pid ); - - my $script = is_pid_script($pid); - - my $vp; - - if (%gids) { - $vp = vp_from_pid( \%gids, \%resmgr, $pid ); - - # debug $vp, "Found $vp from $pid using scontrol listpids"; - } - - if ( not defined $vp ) { - my %env = get_remote_env($pid); - - if (0) { - debug undef, -"Checking slurm pid: $pid, job $env{SLURM_JOBID}, step $env{SLURM_STEPID}, proc $env{SLURM_PROCID}, script $script"; - debug undef, -"Checking rms pid: $pid, job $env{RMS_JOBID}, proc $env{RMS_PROCID}, script $script"; - } - - if ( $env{SLURM_JOBID} eq $jobid ) { - if ( $env{SLURM_STEPID} eq $confInner{"slurm-job-step"} ) { - $vp = $env{SLURM_PROCID}; - } - } elsif ( $env{RMS_JOBID} eq $jobid ) { - $vp = $env{RMS_PROCID}; - } - } - - next unless ( defined $vp ); - - # Ignore bash/sh/perl wrappers. - next if $script; - - push( @{ $pjobs{$vp} }, $pid ); - - } - - foreach my $vp ( keys(%pjobs) ) { - - # If there are multiple possible processes then target each of them, - # this is possibly wrong and suggestions for handling this better are - # welcome. - foreach my $pid ( @{ $pjobs{$vp} } ) { - maybe_show_pid( $vp, $pid ); - } + maybe_show_pid( $global, $pid ); } } @@ -6588,6 +6439,98 @@ $handle->{child_replys} = 0; $handle->{target_responce} = undef; } + +# Convert from a pid to a command name and do it in a safe manner to avoid +# warnings. suid programs tend to have the exe link which is un-readable +# so if that yeilds nothing then load the name from the status file. +sub pid_to_name { + my $pid = shift; + my $exe = readlink("/proc/$pid/exe"); + my $cmd; + if ( defined $exe ) { + return basename($exe); + } else { + return find_from_status( $pid, "Name" ); + } +} + +# Take the resource manager list of pids and possibly convert these into +# more interesting pids, in particular look for pids which appear to be +# scripts and, if they have any children, look at the children instead. +sub convert_pids_to_child_pids { + + opendir( DIR, "/proc/" ); + my @pids = readdir(DIR); + closedir(DIR); + + my $uid = $<; + + my %scripts; + map { $scripts{$_}++ } split( ",", $confInner{"scripts"} ); + + my $ipids = $confInner{rmpids}; + + foreach my $pid (@pids) { + + # Ignore entries that aren't numeric. + next unless ( $pid =~ /^\d+$/ ); + + # Ignore processes with the wrong ownership. + my ( undef, undef, undef, undef, $owner ) = stat("/proc/$pid"); + next unless $owner == $uid; + + # The resource manager pid this pid is associated with. + my $rmpid; + + if ( defined $ipids->{$pid} ) { + $rmpid = $pid; + } else { + my $ppid = find_from_status( $pid, "PPid" ); + + while ( defined $ppid and $ppid != 1 ) { + if ( defined $ipids->{$ppid} ) { + $rmpid = $ppid; + $ppid = undef; + } else { + $ppid = find_from_status( $ppid, "PPid" ); + } + } + } + + next unless defined $rmpid; + + if ( defined( $scripts{ pid_to_name($pid) } ) ) { + push( @{ $ipids->{$rmpid}{scripts} }, $pid ); + } else { + push( @{ $ipids->{$rmpid}{notscripts} }, $pid ); + } + } + + # Now chose what pid to target. + foreach my $key ( keys( %{$ipids} ) ) { + my $ip = $ipids->{$key}; + + my $newpid; + + if ( defined( $ip->{scripts} ) ) { + my @ppids = sort( @{ $ip->{scripts} } ); + $newpid = $ppids[0]; + } + + # If there are any pids which aren't scripts then target the + # first one. + if ( defined( $ip->{notscripts} ) ) { + my @ppids = sort( @{ $ip->{notscripts} } ); + $newpid = $ppids[0]; + } + my %pd; + $pd{pid} = $newpid; + $pd{vp} = $ip->{rank}; + push( @{ $confInner{"all-pids"} }, \%pd ); + + } + +} # Find and report pids as part of the signon protocol, we should # also report name @@ -6601,9 +6544,12 @@ $confInner{"orte-data"} = $cmd->{jobconfig}{"orte-data"}; } -# Query the resource manager to find the pids, they'll be added to the "all-pids" field. + # Query the resource manager to find the pids, + # they'll be added to the "all-pids" array. $rmgr{ $cmd->{jobconfig}{rmgr} }{find_pids}( $cmd->{jobconfig}{jobid} ); + convert_pids_to_child_pids(); + foreach my $proc ( @{ $confInner{"all-pids"} } ) { my $pid = $proc->{pid}; my $vp = $proc->{vp}; From codesite-noreply at google.com Thu Aug 20 15:58:48 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Thu, 20 Aug 2009 14:58:48 +0000 Subject: [padb-devel] [padb] r129 committed - If we fail to attach to a process then check if gdb is actuall install... Message-ID: <001636e1fdb02fffda047193fda9@google.com> Revision: 129 Author: apittman Date: Thu Aug 20 07:57:42 2009 Log: If we fail to attach to a process then check if gdb is actuall installed and if it isn't warn about this as the probable cause. http://code.google.com/p/padb/source/detail?r=129 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Thu Aug 20 04:27:55 2009 +++ /trunk/src/padb Thu Aug 20 07:57:42 2009 @@ -4302,7 +4302,14 @@ send_cont_signal($pid); my $result = gdb_send( $gdb, "attach $pid" ); - return unless defined $result; + if ( not defined $result ) { + $gdb->{error} = "Failed to attach to process"; + if ( not find_exe("gdb") ) { + $gdb->{error} = "Failed to attach to process (gdb not installed?)"; + } + return; + } + return if ( $result eq "error" ); $gdb->{attached} = 1; @@ -5812,7 +5819,11 @@ $proc->{gdb} = $gdb; push( @all, $proc ); } else { - output $vp, "Failed to attach to process"; + if ( defined $gdb->{error} ) { + output $vp, $gdb->{error}; + } else { + output $vp, "Failed to attach to process"; + } } } @@ -5838,7 +5849,11 @@ if ( gdb_attach( $g, $pid ) ) { $gdb = $g; } else { - output $vp, "Failed to attach to process"; + if ( defined $gdb->{error} ) { + output $vp, $gdb->{error}; + } else { + output $vp, "Failed to attach to process"; + } } } else { $gdb = $proc->{gdb}; From codesite-noreply at google.com Sat Aug 22 17:27:57 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sat, 22 Aug 2009 16:27:57 +0000 Subject: [padb-devel] [padb] r130 committed - Add a "port-range" config option to limit the ports that padb uses... Message-ID: <00504502c6d3b5be8c0471bd77fa@google.com> Revision: 130 Author: apittman Date: Sat Aug 22 09:27:26 2009 Log: Add a "port-range" config option to limit the ports that padb uses and use/honour it for the outer process. Still needs to be forwared to the inner process (before it creates a port) for this feature to work however. http://code.google.com/p/padb/source/detail?r=130 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Thu Aug 20 07:57:42 2009 +++ /trunk/src/padb Sat Aug 22 09:27:26 2009 @@ -378,8 +378,6 @@ # is treated as "all". $conf{"check-signon"} = "all"; -$conf{slurm_job_step} = "0"; - # Output options. $conf{"interval"} = 10; $conf{"watch-clears-screen"} = 1; @@ -407,6 +405,9 @@ $conf{"edb"} = find_edb(); $conf{"minfo"} = find_minfo(); +# Option to define a list of ports used by padb. +$conf{"port-range"} = undef; + my $norc = 0; my $configfile = "/etc/padb.conf"; @@ -3590,6 +3591,80 @@ $cdata{event_cb} = \&handle_event_from_socket; $comm_data->{sockets}{$new} = \%cdata; } + +# "shift" a rank or port number from the standard spec format, returns the id +# and the range with the first entry removed. Returns both the first entry and +# the new range with the first removed. +sub shift_from_range { + my ($range) = @_; + + my $newrange; + + return undef unless defined $range; + return undef if $range eq ""; + return undef if $range eq "[]"; + + if ( $range =~ m/^\[([\d\-\,]+)\]$/ ) { + $newrange = $1; + } else { + die("Failed to recognise $range as range\n"); + } + + my @parts = split( ",", $newrange ); + + my $part = shift(@parts); + + my $id; + + if ( $part =~ m/^(\d+)$/ ) { + $id = $1; + } elsif ( $part =~ m/^(\d+)\-(\d+)$/ ) { + my $lower = $1; + my $upper = $2; + $id = $lower; + if ( $lower > $upper ) { + die("Invalid range $lower-$upper\n"); + } + if ( $lower++ != $upper ) { + unshift( @parts, "$lower-$upper" ); + } + } else { + die("Failed to recognise $part as range\n"); + } + + my $r = join( ",", @parts ); + + return ( $id, "[$r]" ); +} + +sub create_local_port { + my ($range) = @_; + + my %options = ( + Reuse => 1, + Proto => 'tcp', + Listen => 2, + ); + + if ( not defined $range ) { + my $sl = IO::Socket::INET->new(%options) + or die("Failed to create local port: $!"); + return $sl; + } + + my $irange = $range; + my $port; + + while ( ( $port, $range ) = shift_from_range($range) and defined $port ) { + $options{LocalPort} = $port; + my $sl = IO::Socket::INET->new(%options); + return $sl if defined $sl; + + #$range = $nrange; + } + + die("Failed to create local port, no free range (\"$irange\")\n"); +} sub go_parallel { my $jobid = shift; @@ -3601,11 +3676,7 @@ my $sel = IO::Select->new(); if ( $conf{"inner-callback"} ) { - my $sl = IO::Socket::INET->new( - Reuse => 1, - Proto => 'tcp', - Listen => 2, - ) or die("Failed to create local port"); + my $sl = create_local_port( $conf{"port-range"} ); $comm_data->{listen} = $sl; my $port = $sl->sockport(); @@ -3617,6 +3688,8 @@ $cdata{event_cb} = \&handle_event_from_port; $comm_data->{sockets}{$sl} = \%cdata; } + + debug_log( "show-cmd", undef, $cmd ); my $pcmd = { pid => -1, @@ -3802,8 +3875,6 @@ $cmd .= " $0 --inner"; #} - - debug_log( "show-cmd", undef, $cmd ); if ( not defined $hosts ) { printf("Full duplex mode needs to know the host count\n"); @@ -6755,11 +6826,7 @@ sub inner_loop_for_comms { my ($outerloc) = @_; - my $server = IO::Socket::INET->new( - Reuse => 1, - Proto => 'tcp', - Listen => 2, - ) or die("Failed to create local port"); + my $server = create_local_port(); my $lport = $server->sockport(); my $hostname = hostname(); From codesite-noreply at google.com Sat Aug 22 17:41:03 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sat, 22 Aug 2009 16:41:03 +0000 Subject: [padb-devel] [padb] r131 committed - Use carp and confess() in many places where it's more appropiate... Message-ID: <0016e64dc8ea8391610471bda636@google.com> Revision: 131 Author: apittman Date: Sat Aug 22 09:39:55 2009 Log: Use carp and confess() in many places where it's more appropiate than calling die(). http://code.google.com/p/padb/source/detail?r=131 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sat Aug 22 09:27:26 2009 +++ /trunk/src/padb Sat Aug 22 09:39:55 2009 @@ -221,6 +221,7 @@ use Config; use IO::Socket; use IO::Select; +use Carp; ############################################################################### # @@ -1790,7 +1791,7 @@ my @data; open( PCMD, "edb -k $key --stats-raw 2>/dev/null|" ) - or die "$prog: cant open file: $!\n"; + or confess "$prog: cant open file: $!\n"; local $/ = "\n\n"; while () { s/\n//g; @@ -3629,7 +3630,7 @@ unshift( @parts, "$lower-$upper" ); } } else { - die("Failed to recognise $part as range\n"); + confess("Failed to recognise $part as range\n"); } my $r = join( ",", @parts ); @@ -3648,7 +3649,7 @@ if ( not defined $range ) { my $sl = IO::Socket::INET->new(%options) - or die("Failed to create local port: $!"); + or confess("Failed to create local port: $!"); return $sl; } @@ -3699,7 +3700,7 @@ }; $pcmd->{pid} = open3( $pcmd->{in}, *OUT, *ERR, $cmd ) - or die "Unable to open3() pcmd: $!\n"; + or confess "Unable to open3() pcmd: $!\n"; close $pcmd->{in}; @@ -4600,7 +4601,7 @@ my ( $this, $l ) = strip_first_quotes($value); return ( $key, $this, $l ); } else { - die("unknown type '$type' str '$str'"); + confess("unknown type '$type' str '$str'"); } return ( $key, \%res, $leftover ); @@ -4815,7 +4816,7 @@ my $cmd = $confInner{"minfo"}; $h->{hpid} = open3( $h->{wtr}, $h->{rdr}, $h->{err}, $cmd ) - or die "Unable to popen() h: $!\n"; + or confess "Unable to popen() h: $!\n"; my $handle = $h->{rdr}; @@ -6838,7 +6839,7 @@ PeerAddr => $ohost, PeerPort => $oport, Proto => 'tcp', - ) or die("Failed to connect to outer"); + ) or confess("Failed to connect to outer"); my $secret = find_padb_secret(); die("No secret") if not defined $secret; $os->print("Hello $secret $hostname $lport $key\n"); @@ -6865,7 +6866,7 @@ while ( my @data = $sel->can_read(5) ) { foreach my $s (@data) { if ( $s == $server ) { - my $new = $server->accept() or die("Failed accept"); + my $new = $server->accept() or confess("Failed accept"); $sel->add($new); my $peer = getpeername($new); my ( $port, $addr ) = unpack_sockaddr_in($peer); @@ -6941,7 +6942,7 @@ Getopt::Long::Configure("bundling"); - GetOptions(%optionhash) or die("could not parse options\n"); + GetOptions(%optionhash) or confess("could not parse options\n"); $confInner{"myld"} = $ENV{"LD_LIBRARY_PATH"}; From codesite-noreply at google.com Sat Aug 22 19:02:26 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sat, 22 Aug 2009 18:02:26 +0000 Subject: [padb-devel] [padb] r132 committed - Make port-range apply to the inner as well. Do this by passing... Message-ID: <00504502c6d39aea100471bec9bd@google.com> Revision: 132 Author: apittman Date: Sat Aug 22 11:02:02 2009 Log: Make port-range apply to the inner as well. Do this by passing the option along on the command line so the inner process has it before binding to a socket. http://code.google.com/p/padb/source/detail?r=132 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sat Aug 22 09:39:55 2009 +++ /trunk/src/padb Sat Aug 22 11:02:02 2009 @@ -42,6 +42,8 @@ # it independant and call it for all resource managers. This allows scripts # which call parallel applications to be bypassed and the applications # themselves targetted. +# * Added "port-range" option to limit port usage in case people try and use +# padb with firewalls enabled. # # Version 2.5 # * First Non-Quadrics version @@ -263,6 +265,10 @@ # Config options the inner knows about, only forward options if they are in this list. my @inner_conf = qw(edb edbopt minfo rmgr scripts slurm-job-step verbose); +# More config options the inner knows about, these are forwarded on the command line +# rather than over the sockets. +my @inner_conf_cmd = qw(port-range); + ############################################################################### # # Resource manager setup @@ -368,7 +374,9 @@ my %allfns; -my %cinner; # Config options to be passed to inner. +my %cinner; # Config options to be passed to inner. +my %cinner_cmd; # Config options to be passed to inner. + my $rem_jobid; # Debug options. @@ -575,7 +583,9 @@ my $tree; my @config_options; + my %ic_names; +my %ic_names_cmd; # Populated in the outer args section so that outer code # can access secondary comamnd line argunments by name. @@ -3664,7 +3674,7 @@ #$range = $nrange; } - die("Failed to create local port, no free range (\"$irange\")\n"); + die("Failed to create local port, no free ports in range \"$irange\"\n"); } sub go_parallel { @@ -3876,6 +3886,8 @@ $cmd .= " $0 --inner"; #} + + map { $cmd .= " --$_=\"$cinner_cmd{$_}\"" } keys(%cinner_cmd); if ( not defined $hosts ) { printf("Full duplex mode needs to know the host count\n"); @@ -3904,6 +3916,7 @@ sub config_init { map { $ic_names{$_}++ } @inner_conf; + map { $ic_names_cmd{$_}++ } @inner_conf_cmd; } sub config_set { @@ -3927,6 +3940,11 @@ if ( defined $ic_names{$key} ) { $cinner{$key} = $value; } + + if ( defined $ic_names_cmd{$key} ) { + $cinner_cmd{$key} = $value; + } + } sub config_from_file { @@ -6827,7 +6845,7 @@ sub inner_loop_for_comms { my ($outerloc) = @_; - my $server = create_local_port(); + my $server = create_local_port( $confInner{"port-range"} ); my $lport = $server->sockport(); my $hostname = hostname(); @@ -6934,12 +6952,17 @@ $confInner{"minfo"} = find_minfo(); $confInner{"hostname"} = hostname(); $confInner{"scripts"} = "bash,sh,dash,ash,perl,xterm"; + $confInner{"port-range"} = undef; # Local vars to help with command line parsing my $outerloc; my %optionhash = ( "outer=s" => \$outerloc, ); + foreach my $key (@inner_conf_cmd) { + $optionhash{"$key=s"} = \$confInner{$key}; + } + Getopt::Long::Configure("bundling"); GetOptions(%optionhash) or confess("could not parse options\n"); From codesite-noreply at google.com Sat Aug 22 19:06:37 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sat, 22 Aug 2009 18:06:37 +0000 Subject: [padb-devel] [padb] r133 committed - Allow a port range to be just a number as well as a bracket... Message-ID: <0016e64af8a88b21e00471bed815@google.com> Revision: 133 Author: apittman Date: Sat Aug 22 11:05:12 2009 Log: Allow a port range to be just a number as well as a bracket specified list. http://code.google.com/p/padb/source/detail?r=133 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sat Aug 22 11:02:02 2009 +++ /trunk/src/padb Sat Aug 22 11:05:12 2009 @@ -3617,6 +3617,8 @@ if ( $range =~ m/^\[([\d\-\,]+)\]$/ ) { $newrange = $1; + } elsif ( $range =~ m/^(\d+)$/ ) { + $newrange = $1; } else { die("Failed to recognise $range as range\n"); } From codesite-noreply at google.com Sun Aug 23 11:01:00 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sun, 23 Aug 2009 10:01:00 +0000 Subject: [padb-devel] [padb] r134 committed - The {nailing_my_colours_to_the_mast} commit. I've decided that hash... Message-ID: <0016e64dda8eb526120471cc2ddd@google.com> Revision: 134 Author: apittman Date: Sun Aug 23 03:00:38 2009 Log: The {nailing_my_colours_to_the_mast} commit. I've decided that hash keys are better not quoted so don't quote any of them. This means replacing all instances of dash with underscore and a lot of fiddling when loading and reporting these options to the user. A wide ranging and potentially error prone commit however it does make the code more readable as a result. http://code.google.com/p/padb/source/detail?r=134 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sat Aug 22 11:05:12 2009 +++ /trunk/src/padb Sun Aug 23 03:00:38 2009 @@ -263,11 +263,11 @@ my $secret; # Config options the inner knows about, only forward options if they are in this list. -my @inner_conf = qw(edb edbopt minfo rmgr scripts slurm-job-step verbose); +my @inner_conf = qw(edb edbopt minfo rmgr scripts slurm_job_step verbose); # More config options the inner knows about, these are forwarded on the command line # rather than over the sockets. -my @inner_conf_cmd = qw(port-range); +my @inner_conf_cmd = qw(port_range outer); ############################################################################### # @@ -380,20 +380,20 @@ my $rem_jobid; # Debug options. -$conf{"verbose"} = 0; -$conf{"dump-raw"} = 0; +$conf{verbose} = 0; +$conf{dump_raw} = 0; # Valid values are "none" "missing" or "all". Anything not recognised # is treated as "all". -$conf{"check-signon"} = "all"; +$conf{check_signon} = "all"; # Output options. -$conf{"interval"} = 10; -$conf{"watch-clears-screen"} = 1; -$conf{"scripts"} = "bash,sh,dash,ash,perl,xterm"; -$conf{"lsf-job-offset"} = 1; -$conf{"local-fd-name"} = "/dev/null"; -$conf{"inner-callback"} = 0; +$conf{interval} = 10; +$conf{watch_clears_screen} = 1; +$conf{scripts} = "bash,sh,dash,ash,perl,xterm"; +$conf{lsf_job_offset} = 1; +$conf{local_fd_name} = "/dev/null"; +$conf{inner_callback} = 0; # These two are used by deadlock and QsNet group # code, they need migrating in the group code @@ -402,20 +402,20 @@ #$conf{"show-all-groups"} = 0; # Tuning options. -$conf{"prun-timeout"} = 120; -$conf{"prun-exittimeout"} = 120; -$conf{"rmgr"} = "auto"; - -$conf{"slurm-job-step"} = 0; +$conf{prun_timeout} = 120; +$conf{prun_exittimeout} = 120; +$conf{rmgr} = "auto"; + +$conf{slurm_job_step} = 0; # These settings are passed onto inner only. -$conf{"edbopt"} = ""; - -$conf{"edb"} = find_edb(); -$conf{"minfo"} = find_minfo(); +$conf{edbopt} = ""; + +$conf{edb} = find_edb(); +$conf{minfo} = find_minfo(); # Option to define a list of ports used by padb. -$conf{"port-range"} = undef; +$conf{port_range} = undef; my $norc = 0; my $configfile = "/etc/padb.conf"; @@ -821,7 +821,7 @@ my $toshow; foreach my $counter ( sort keys %{$d} ) { - if ( $d->{$counter}{raw}[0] != 0 or $conf{"show-all-stats"} ) { + if ( $d->{$counter}{raw}[0] != 0 or $conf{show_all_stats} ) { if ( defined $toshow ) { $ret .= " Counter: '$d->{$toshow}{name}' = '$d->{$toshow}{raw}[0]','$d->{$counter}{name}' = '$d->{$counter}{raw}[0]'\n"; @@ -870,7 +870,7 @@ my $ret = ""; foreach my $tally ( sort keys %{$d} ) { - if ( $d->{$tally}{raw}[0] or $conf{"show-all-stats"} ) { + if ( $d->{$tally}{raw}[0] or $conf{show_all_stats} ) { $ret .= sprintf( "%16s: Total: %d Active: %d HWM: %d\n", $d->{$tally}{name}, $d->{$tally}{raw}[0], @@ -888,7 +888,7 @@ foreach my $bin ( sort keys %{$d} ) { if ( $d->{$bin}{raw}[0] || $d->{$bin}{raw}[34] - or $conf{"show-all-stats"} ) + or $conf{show_all_stats} ) { my $total = $d->{$bin}{raw}[34]; my $scale = 0; @@ -904,7 +904,7 @@ my @vals; for ( my $j = 0 ; $j < 32 ; $j++ ) { - if ( $d->{$bin}{raw}[$j] or $conf{"show-all-stats"} > 1 ) { + if ( $d->{$bin}{raw}[$j] or $conf{show_all_stats} > 1 ) { push( @vals, sprintf( "%9s: %10d", @@ -1459,7 +1459,7 @@ sub read_stats { my @data = @_; - printf Dumper \@data if $conf{"dump-raw"}; + printf Dumper \@data if $conf{dump_raw}; my $header = parse_header( shift @data ); @@ -1473,7 +1473,7 @@ } } - printf Dumper \@out if $conf{"dump-raw"}; + printf Dumper \@out if $conf{dump_raw}; return \@out; } @@ -1498,7 +1498,7 @@ if ($stats_total) { - if ( $conf{"stats-short"} ) { + if ( $conf{stats_short} ) { my $new; if ( $#ranks != -1 ) { my @ret; @@ -1515,22 +1515,21 @@ $new = summarise_many($d); } - display_hashes( $new, $conf{"stats-sort-key"}, - $conf{"stats-reverse"} ); + display_hashes( $new, $conf{stats_sort_key}, $conf{stats_reverse} ); return; } if ( $#ranks != -1 ) { foreach my $rank (@ranks) { if ( defined $d->[$rank] ) { - print show_name $conf{"stats-name"}, $d->[$rank]; + print show_name $conf{stats_name}, $d->[$rank]; } else { my $vps = $#{$d} + 1; print "Invalid rank $rank (0 to $vps)\n"; } } } else { - print show_name $conf{"stats-name"}, total($d); + print show_name $conf{stats_name}, total($d); } } @@ -1612,7 +1611,7 @@ $ad{$gid}{size} = $this_group->{statistics}{Attribute}{Size}{raw}[0]; $ad{$gid}{map}[$ident] = $dataset->{vp} - if ( $conf{"show-group-members"} ); + if ( $conf{show_group_members} ); } $ad{$gid}{idents}{$ident}{'statistics'}++; @@ -1651,7 +1650,7 @@ # Maybe show the group members, hope that the user doesn't turn # this on unless also setting target_groups! - if ( $conf{"show-group-members"} ) { + if ( $conf{show_group_members} ) { $gstr .= "group has $ad{$gid}{size} members\n"; if ( defined $ad{$gid}{size} and $gid != 1 ) { for ( my $ident = 0 ; $ident < $ad{$gid}{size} ; $ident++ ) { @@ -1670,7 +1669,7 @@ } } if ( $#invalid != -1 ) { - if ( $conf{"show-all-groups"} ) { + if ( $conf{show_all_groups} ) { $ret .= $gstr . group_status_helper( "showing the group as removed", 0, $ad{$gid}{size}, @invalid ); @@ -1693,7 +1692,7 @@ } if ( $#identical != -1 ) { $missing_self++; - if ( $conf{"show-all-groups"} ) { + if ( $conf{show_all_groups} ) { $ret .= $gstr . group_status_helper( "no statistics for this group *(1)", @@ -1740,7 +1739,7 @@ } } } else { - next unless ( $conf{"show-all-groups"} ); + next unless ( $conf{show_all_groups} ); } { @@ -2127,12 +2126,12 @@ # Try to prevent zombie jobs, fairly rare but I have seen # nodes run different versions of edb which can cause problems # XXX: Fixme. This isn't high enough. - if ( $conf{"prun-exittimeout"} != 0 ) { - $ENV{"RMS_EXITTIMEOUT"} = $conf{"prun-exittimeout"}; + if ( $conf{prun_exittimeout} != 0 ) { + $ENV{"RMS_EXITTIMEOUT"} = $conf{prun_exittimeout}; } - if ( $conf{"prun-timeout"} != 0 ) { - $ENV{"RMS_TIMELIMIT"} = $conf{"prun-timeout"}; + if ( $conf{prun_timeout} != 0 ) { + $ENV{"RMS_TIMELIMIT"} = $conf{prun_timeout}; } { @@ -2177,7 +2176,7 @@ $ncpus += $n[$idx] * $c[$idx]; } - printf("extracted $ncpus from $cpus and $nodes\n") if $conf{"verbose"} > 1; + printf("extracted $ncpus from $cpus and $nodes\n") if $conf{verbose} > 1; return $ncpus; } @@ -2210,7 +2209,7 @@ # processes we are going to be looking for. sub slurm_job_to_ncpus { my $job = shift; - my $s = "$job." . $conf{"slurm-job-step"}; + my $s = "$job." . $conf{slurm_job_step}; my @steps = `squeue -s $s -o "%i %A" 2>/dev/null`; return undef if ( $? != 0 ); @@ -2248,7 +2247,7 @@ # This functions isn't used currently. sub slurm_job_to_nodelist { my $job = shift; - my $s = "$job." . $conf{"slurm-job-step"}; + my $s = "$job." . $conf{slurm_job_step}; my @steps = `squeue -s $s -o "%i %N" 2>/dev/null`; return undef if ( $? != 0 ); @@ -2343,7 +2342,7 @@ sub local_fd_get_jobs { my $user = shift; - return local_fd_get_jobs_real( $user, $conf{"local-fd-name"} ); + return local_fd_get_jobs_real( $user, $conf{local_fd_name} ); } sub local_q_is_installed { @@ -2525,10 +2524,6 @@ } } - - #if ( $conf{"verbose"} ) { - #print Dumper \%open_jobs; - #} } sub open_get_jobs { @@ -2624,7 +2619,7 @@ my $rjob; - my $idx = $conf{"lsf-job-offset"}; + my $idx = $conf{lsf_job_offset}; $idx = 1 if ( $idx > $#out ); $rjob = $out[$idx]; chomp $rjob; @@ -2642,13 +2637,13 @@ ############################################################################### sub setup_rmgr { - $conf{"rmgr"} = shift; + $conf{rmgr} = shift; # Now setup the variable for the rest of the program. - if ( defined $rmgr{ $conf{"rmgr"} }{inner_rmgr} ) { - $cinner{rmgr} = $rmgr{ $conf{"rmgr"} }{inner_rmgr}; + if ( defined $rmgr{ $conf{rmgr} }{inner_rmgr} ) { + $cinner{rmgr} = $rmgr{ $conf{rmgr} }{inner_rmgr}; } else { - $cinner{rmgr} = $conf{"rmgr"}; + $cinner{rmgr} = $conf{rmgr}; } } @@ -2657,19 +2652,19 @@ # If it's been set on the command line and it's valid then just use what we are given. # Do sanity checks here but only warn on the result to cope with non-default installs. - if ( defined $rmgr{ $conf{"rmgr"} } ) { - if ( defined $rmgr{ $conf{"rmgr"} }{is_installed} - and not $rmgr{ $conf{"rmgr"} }{is_installed}() ) + if ( defined $rmgr{ $conf{rmgr} } ) { + if ( defined $rmgr{ $conf{rmgr} }{is_installed} + and not $rmgr{ $conf{rmgr} }{is_installed}() ) { printf( "Warning: Selected resource manager $conf{rmgr} does not appear to be installed\n" ); } - setup_rmgr( $conf{"rmgr"} ); + setup_rmgr( $conf{rmgr} ); return; } - if ( $conf{"rmgr"} ne "auto" ) { + if ( $conf{rmgr} ne "auto" ) { printf("Error, resource manager \"$conf{rmgr}\" not supported\n"); exit(1); } @@ -2704,19 +2699,19 @@ # If it's been set on the command line and it's valid then just use what we are given. # Do sanity checks here but only warn on the result to cope with non-default installs. - if ( defined $rmgr{ $conf{"rmgr"} } ) { - if ( defined $rmgr{ $conf{"rmgr"} }{is_installed} - and not $rmgr{ $conf{"rmgr"} }{is_installed}() ) + if ( defined $rmgr{ $conf{rmgr} } ) { + if ( defined $rmgr{ $conf{rmgr} }{is_installed} + and not $rmgr{ $conf{rmgr} }{is_installed}() ) { printf( "Warning: Selected resource manager $conf{rmgr} does not appear to be installed\n" ); } - setup_rmgr( $conf{"rmgr"} ); + setup_rmgr( $conf{rmgr} ); return; } - if ( $conf{"rmgr"} ne "auto" ) { + if ( $conf{rmgr} ne "auto" ) { printf("Error, resource manager \"$conf{rmgr}\" not supported\n"); exit(1); } @@ -2775,17 +2770,17 @@ sub get_all_jobids { my $user = shift; debug_log( "rmgr", undef, "Loading active jobs list" ); - return $rmgr{ $conf{"rmgr"} }{get_active_jobs}($user); + return $rmgr{ $conf{rmgr} }{get_active_jobs}($user); } sub job_is_running { my $job = shift; - if ( defined $rmgr{ $conf{"rmgr"} }{job_is_running} ) { - return $rmgr{ $conf{"rmgr"} }{job_is_running}($job); + if ( defined $rmgr{ $conf{rmgr} }{job_is_running} ) { + return $rmgr{ $conf{rmgr} }{job_is_running}($job); } - my @jobs = $rmgr{ $conf{"rmgr"} }{get_active_jobs}($user); + my @jobs = $rmgr{ $conf{rmgr} }{get_active_jobs}($user); my %j; map { $j{$_}++ } @jobs; return defined $j{$job}; @@ -2794,8 +2789,8 @@ sub job_to_key { my $job = shift; - if ( defined $rmgr{ $conf{"rmgr"} }{job_to_key} ) { - return $rmgr{ $conf{"rmgr"} }{job_to_key}($job); + if ( defined $rmgr{ $conf{rmgr} }{job_to_key} ) { + return $rmgr{ $conf{rmgr} }{job_to_key}($job); } return undef; @@ -2803,13 +2798,13 @@ sub setup_pcmd { my $job = shift; - return $rmgr{ $conf{"rmgr"} }{setup_pcmd}($job); + return $rmgr{ $conf{rmgr} }{setup_pcmd}($job); } sub cleanup_pcmd { my $job = shift; - if ( defined( $rmgr{ $conf{"rmgr"} }{cleanup_pcmd} ) ) { - $rmgr{ $conf{"rmgr"} }{cleanup_pcmd}(); + if ( defined( $rmgr{ $conf{rmgr} }{cleanup_pcmd} ) ) { + $rmgr{ $conf{rmgr} }{cleanup_pcmd}(); } } @@ -2826,9 +2821,9 @@ my %below; map { $above{$_}++ } - split( ",", $conf{mode_options}{stack}{"stack-strip-above"} ); + split( ",", $conf{mode_options}{stack}{stack_strip_above} ); map { $below{$_}++ } - split( ",", $conf{mode_options}{stack}{"stack-strip-below"} ); + split( ",", $conf{mode_options}{stack}{stack_strip_below} ); foreach my $tag ( keys %$lines ) { @@ -2862,7 +2857,7 @@ printf( "Stripping 0.." . $#{ $lines->{$tag} } . " to $start..$end for $tag\n" ) - if $conf{"verbose"} > 1; + if $conf{verbose} > 1; my @new = @{ $lines->{$tag} }; @new = @new[ $start .. $end ]; @@ -2876,7 +2871,7 @@ my $key = shift; my @all = @_; - if ( $carg->{"reverse-sort-order"} ) { + if ( $carg->{reverse_sort_order} ) { return ( reverse( sort { $a->{$key} <=> $b->{$key} } @all ) ); } else { return ( sort { $a->{$key} <=> $b->{$key} } @all ); @@ -2921,7 +2916,7 @@ my %proc_format_lengths; my %proc_header_reverse; - my $separator = $carg->{"column-seperator"}; + my $separator = $carg->{column_seperator}; my @columns = split( ",", $carg->{"proc-format"} ); foreach my $column (@columns) { @@ -2967,13 +2962,13 @@ } # Allow sort keys to be based on column names as well as real keys. - my $key = lc( $carg->{"proc-sort-key"} ); + my $key = lc( $carg->{proc_sort_key} ); if ( defined $proc_header_reverse{$key} ) { $key = $proc_header_reverse{$key}; } @all = sort_proc_hashes( $carg, $key, @all ); - if ( $carg->{"proc-show-header"} ) { + if ( $carg->{proc_show_header} ) { my @res; foreach my $key (@proc_format_array) { my $l .= sprintf( "%-$proc_format_lengths{$key}s", @@ -2983,7 +2978,7 @@ my $line = join( $separator, @res ); print "$line\n"; } - my $count = $carg->{"nprocs-output"}; + my $count = $carg->{nprocs_output}; foreach my $hash (@all) { my @res; foreach my $key (@proc_format_array) { @@ -3005,7 +3000,7 @@ sub show_results { my ( $nlines, $mode, $handle ) = @_; - printf Dumper $nlines if $conf{"dump-raw"}; + printf Dumper $nlines if $conf{dump_raw}; my $lines = $nlines->{lines}; @@ -3196,7 +3191,7 @@ sub maybe_clear_screen { return unless $watch; - if ( $conf{"watch-clears-screen"} ) { + if ( $conf{watch_clears_screen} ) { printf( "%s", " \033[1;1H" ); printf( "%s", "\033[2J" ); } @@ -3336,8 +3331,6 @@ sub push_command { my ( $mode, $out_format, $args ) = @_; - # $args = $conf{mode_options}{$mode}; - my %cmd; $cmd{mode} = $mode; $cmd{out_format} = $out_format if defined($out_format); @@ -3419,7 +3412,7 @@ sub check_signon { my ( $comm_data, $data ) = @_; - return if ( $conf{"check-signon"} eq "none" ); + return if ( $conf{check_signon} eq "none" ); my @missing; my %target_state; my %target_exe; @@ -3434,7 +3427,7 @@ if ( $#missing != -1 ) { printf( "Warning, failed to locate ranks %s\n", compress(@missing) ); } - return if ( $conf{"check-signon"} eq "missing" ); + return if ( $conf{check_signon} eq "missing" ); my $exe_count = keys(%target_exe); if ( $exe_count != 1 ) { printf("Warning, remote process name differs across ranks\n"); @@ -3688,19 +3681,21 @@ my $comm_data; my $sel = IO::Select->new(); - if ( $conf{"inner-callback"} ) { - my $sl = create_local_port( $conf{"port-range"} ); + if ( $conf{inner_callback} ) { + my $sl = create_local_port( $conf{port_range} ); $comm_data->{listen} = $sl; my $port = $sl->sockport(); my $hostname = hostname(); - $cmd .= " --outer=$hostname:$port"; + config_set_internal( "outer", "$hostname:$port" ); $sel->add($sl); my %cdata; $cdata{event_cb} = \&handle_event_from_port; $comm_data->{sockets}{$sl} = \%cdata; } + + map { $cmd .= " --$_=\"$cinner_cmd{$_}\"" } keys(%cinner_cmd); debug_log( "show-cmd", undef, $cmd ); @@ -3780,7 +3775,7 @@ ); } printf("result from parallel command is $res (state=$comm_data->{state})\n") - if ( $conf{"verbose"} ); + if ( $conf{verbose} ); if ( $res != 0 ) { my %status = rc_status($res); @@ -3832,10 +3827,10 @@ if ( defined $rmgr{ $conf{rmgr} }{require_inner_callback} and $rmgr{ $conf{rmgr} }{require_inner_callback} ) { - $conf{"inner-callback"} = 1; + $conf{inner_callback} = 1; } - if ( $conf{"inner-callback"} ) { + if ( $conf{inner_callback} ) { $secret = find_padb_secret(); if ( not defined $secret ) { @@ -3845,7 +3840,7 @@ } - $conf{"verbose"} && print "Attaching to job $jobid\n"; + $conf{verbose} && print "Attaching to job $jobid\n"; $rem_jobid = $jobid; @@ -3861,8 +3856,8 @@ my $ncpus = $res[1]; my $hosts = $res[2]; - $conf{"verbose"} && defined $ncpus && print "Job has $ncpus processe(s)\n"; - $conf{"verbose"} && defined $hosts && print "Job spans $hosts host(s)\n"; + $conf{verbose} && defined $ncpus && print "Job has $ncpus process(es)\n"; + $conf{verbose} && defined $hosts && print "Job spans $hosts host(s)\n"; debug_log( "verbose", undef, "There are %d processes over %d hosts", $ncpus, $hosts ); @@ -3888,8 +3883,6 @@ $cmd .= " $0 --inner"; #} - - map { $cmd .= " --$_=\"$cinner_cmd{$_}\"" } keys(%cinner_cmd); if ( not defined $hosts ) { printf("Full duplex mode needs to know the host count\n"); @@ -3921,15 +3914,8 @@ map { $ic_names_cmd{$_}++ } @inner_conf_cmd; } -sub config_set { +sub config_set_internal { my ( $key, $value ) = @_; - printf("Setting '$key' to '$value'\n") if ( $conf{"verbose"} ); - - if ( !exists $conf{$key} and !exists $conf{mode_options_reverse}{$key} ) { - printf( STDERR - "Warning, unknown config option '$key' value '$value'.\n" ); - } - if ( exists $conf{$key} ) { $conf{$key} = $value; } else { @@ -3946,19 +3932,31 @@ if ( defined $ic_names_cmd{$key} ) { $cinner_cmd{$key} = $value; } - +} + +sub config_set { + my ( $key, $value ) = @_; + printf("Setting '$key' to '$value'\n") if ( $conf{verbose} ); + + if ( !exists $conf{$key} and !exists $conf{mode_options_reverse}{$key} ) { + printf( STDERR + "Warning, unknown config option '$key' value '$value'.\n" ); + } + + config_set_internal( $key, $value ); } sub config_from_file { my $file = shift; - printf("Loading config from \"$file\"\n") if ( $conf{"verbose"} ); + printf("Loading config from \"$file\"\n") if ( $conf{verbose} ); open( CFILE, $file ) or return; while () { if (/^([\w-]+)\s*\=\s*(.*)/) { my $key = $1; my $value = $2; + $key =~ s/\-/\_/g; config_set( $key, $value ); } } @@ -3967,21 +3965,21 @@ } sub config_from_env { - printf("Loading config from environment\n") if ( $conf{"verbose"} ); + printf("Loading config from environment\n") if ( $conf{verbose} ); foreach my $key ( keys(%conf) ) { - my $name = uc($key); - $name =~ s/\-/\_/g; - if ( defined $ENV{"PADB_$name"} ) { - config_set( $key, $ENV{"PADB_$name"} ); + $key =~ s/\-/\_/g; + my $name = "PADB_" . uc($key); + if ( defined $ENV{$name} ) { + config_set( $key, $ENV{$name} ); } } foreach my $key ( keys( %{ $conf{mode_options_reverse} } ) ) { - my $name = uc($key); - $name =~ s/\-/\_/g; - if ( defined $ENV{"PADB_$name"} ) { - config_set( $key, $ENV{"PADB_$name"} ); + $key =~ s/\-/\_/g; + my $name = "PADB_" . uc($key); + if ( defined $ENV{$name} ) { + config_set( $key, $ENV{$name} ); } } } @@ -4012,13 +4010,15 @@ foreach my $mode ( sort( keys( %{ $conf{mode_options} } ) ) ) { printf("\nOptions for mode '$allfns{$mode}{arg_long}'\n"); foreach my $key ( sort( keys( %{ $conf{mode_options}{$mode} } ) ) ) { + my $name = $key; + $name =~ s/\_/\-/g; if ( defined $conf{mode_options}{$mode}{$key} ) { printf( " %$max_len" . "s = '$conf{mode_options}{$mode}{$key}'\n", - $key + $name ); } else { - printf( " %$max_len" . "s = undef\n", $key ); + printf( " %$max_len" . "s = undef\n", $name ); } } @@ -4044,7 +4044,7 @@ config_from_env(); - printf("Loading config from command line\n") if ( $conf{"verbose"} ); + printf("Loading config from command line\n") if ( $conf{verbose} ); # # Once again there is a 'bugette' here, you cant pass the @@ -4055,19 +4055,21 @@ # -Oedbopt="--pagesize 8192 --pagesize-header 4096" foreach my $config_option (@config_options) { - my ( $name, $val ) = split( "=", $config_option ); - - # $name =~ s/\-/\_/g; - - if ( $name eq "scriptDir" ) { + my ( $key, $val ) = split( "=", $config_option ); + + my $name = $key; + + $key =~ s/\-/\_/g; + + if ( $key eq "scriptDir" ) { printf( "$prog: -OscriptDir deprecated, use -Oedb=/path/to/edb instead\n" ); exit(1); } - if ( !exists $conf{$name} - and !exists $conf{mode_options_reverse}{$name} ) + if ( !exists $conf{$key} + and !exists $conf{mode_options_reverse}{$key} ) { printf("Error, unknown config option '$name'\n"); config_help(); @@ -4078,7 +4080,7 @@ config_help(); exit(1); } - config_set( $name, $val ); + config_set( $key, $val ); } if ($list_rmgrs) { @@ -4180,7 +4182,7 @@ while (1) { maybe_clear_screen(); local_stats(); - sleep( $conf{"interval"} ); + sleep( $conf{interval} ); } } else { local_stats(); @@ -4253,7 +4255,7 @@ @jobids = get_all_jobids($user); printf( "Active jobs (%d) are @jobids\n", $#jobids + 1 ) - if $conf{"verbose"}; + if $conf{verbose}; if ( $#jobids == -1 ) { printf("No active jobs could be found for user '$user'\n"); exit 1; @@ -4282,7 +4284,7 @@ foreach my $jobid (@jobids) { printf "\nCollecting information for job '$jobid'\n\n" - if ( $conf{"verbose"} or ( $#jobids > 0 ) ); + if ( $conf{verbose} or ( $#jobids > 0 ) ); if ( defined $allfns{$mode}{pre_out_handler} ) { $allfns{$mode}{pre_out_handler}(); @@ -4312,7 +4314,7 @@ sub debug { my ( $vp, $str ) = @_; - $confInner{"verbose"} or return; + $confInner{verbose} or return; $vp = -1 unless defined $vp; print "$confInner{hostname}.$vp:$str\n"; @@ -4343,7 +4345,7 @@ sub p_die { my ( $vp, $str ) = @_; - $confInner{"verbose"}++; + $confInner{verbose}++; debug( $vp, "$str, '$@'" ); exit(1); } @@ -4834,7 +4836,7 @@ }; my @mq; - my $cmd = $confInner{"minfo"}; + my $cmd = $confInner{minfo}; $h->{hpid} = open3( $h->{wtr}, $h->{rdr}, $h->{err}, $cmd ) or confess "Unable to popen() h: $!\n"; @@ -4893,8 +4895,8 @@ return; } - if ( defined $carg->{"mpi-dll"} ) { - $ENV{MPINFO_DLL} = $carg->{"mpi-dll"}; + if ( defined $carg->{mpi_dll} ) { + $ENV{MPINFO_DLL} = $carg->{mpi_dll}; } else { my $base = gdb_var_addr( $g, "MPIR_dll_name" ); if ( !defined $base ) { @@ -4914,8 +4916,8 @@ sub fetch_mpi_queue_gdb { my ( $carg, $vp, $pid, $g ) = @_; - if ( defined $carg->{"mpi-dll"} ) { - $ENV{MPINFO_DLL} = $carg->{"mpi-dll"}; + if ( defined $carg->{mpi_dll} ) { + $ENV{MPINFO_DLL} = $carg->{mpi_dll}; } else { my $base = gdb_var_addr( $g, "MPIR_dll_name" ); if ( !defined $base ) { @@ -5083,7 +5085,7 @@ # Maybe show the group members, hope that the user doesn't turn # this on unless also setting target_groups! - if ( $carg->{"show-group-members"} ) { + if ( $carg->{show_group_members} ) { $gstr .= "group has $ad{$gid}{size} members\n"; if ( defined $ad{$gid}{size} ) { for ( my $ident = 0 ; $ident < $ad{$gid}{size} ; $ident++ ) { @@ -5126,7 +5128,7 @@ } } } else { - next unless ( $carg->{"show-all-groups"} ); + next unless ( $carg->{show_all_groups} ); $ret .= $gstr; $gstr = ""; } @@ -5377,7 +5379,7 @@ print "\n"; my @threads; - if ( $conf{"stack-shows-params"} ) { + if ( $conf{stack_shows_params} ) { @threads = gdb_dump_frames_per_thread( $gdb, 1 ); } else { @threads = gdb_dump_frames_per_thread($gdb); @@ -5399,7 +5401,7 @@ next unless exists $$frame{level}; next unless exists $$frame{func}; - if ( $conf{"stack-shows-params"} ) { + if ( $conf{stack_shows_params} ) { my @a; foreach my $arg ( @{ $frame->{params} } ) { if ( defined $frame->{vals}{$arg} ) { @@ -5413,7 +5415,7 @@ my $line = $frame->{line} || "?"; printf("$frame->{func}($a) at $file:$line\n"); - if ( $conf{"stack-shows-locals"} ) { + if ( $conf{stack_shows_locals} ) { foreach my $arg ( @{ $frame->{locals} } ) { if ( defined $frame->{vals}{$arg} ) { printf(" $arg = $frame->{vals}{$arg}\n"); @@ -5567,7 +5569,7 @@ sub show_task_dir { my ( $carg, $vp, $pid, $dir ) = @_; - if ( $carg->{"proc-shows-proc"} ) { + if ( $carg->{proc_shows_proc} ) { my $exe = readlink "$dir/exe"; if ( defined $exe ) { proc_output( $vp, "exe", $exe ); @@ -5576,7 +5578,7 @@ show_task_file( $vp, "$dir/status" ); show_task_file( $vp, "$dir/wchan", "wchan" ); show_task_file( $vp, "$dir/stat", "stat" ); - if ( $carg->{"proc-shows-stat"} ) { + if ( $carg->{proc_shows_stat} ) { show_task_stat_file( $vp, "$dir/stat" ); } @@ -5607,7 +5609,8 @@ } } } - if ( $carg->{"proc-shows-fds"} ) { + + if ( $carg->{proc_shows_fds} ) { opendir( FDS, "$dir/fd" ); my @fds = readdir(FDS); closedir(FDS); @@ -5623,7 +5626,7 @@ # New fdinfo data, it's verbose so only enable it # if requested by -O proc-shows-fds=full - if ( $carg->{"proc-shows-fds"} eq "full" ) { + if ( $carg->{proc_shows_fds} eq "full" ) { if ( -f "$dir/fdinfo/$fd" ) { open( FDI, "$dir/fdinfo/$fd" ); my @fdi = (); @@ -5647,7 +5650,7 @@ } } } - if ( $carg->{"proc-shows-maps"} ) { + if ( $carg->{proc_shows_maps} ) { show_task_file( $vp, "$dir/maps", "maps" ); } } @@ -5738,7 +5741,7 @@ my $jiffies_start; my $load_avg; - if ( $carg->{"proc-shows-proc"} ) { + if ( $carg->{proc_shows_proc} ) { foreach my $proc ( @{$list} ) { my $pid = $proc->{pid}; open( $proc->{handle}, "/proc/$pid/stat" ); @@ -5773,7 +5776,7 @@ show_proc( $carg, $vp, $pid ); } - if ( $carg->{"proc-shows-proc"} ) { + if ( $carg->{proc_shows_proc} ) { sleep(1); seek( SFD, 0, 0 ); @@ -5846,11 +5849,11 @@ sub show_proc { my ( $carg, $vp, $pid ) = @_; - if ( $carg->{"proc-shows-proc"} ) { + if ( $carg->{proc_shows_proc} ) { proc_output( $vp, "hostname", $confInner{hostname} ); } - if ( -d "/proc/$pid/task" and $carg->{"proc-shows-proc"} ) { + if ( -d "/proc/$pid/task" and $carg->{proc_shows_proc} ) { my $threads = 0; @@ -5953,8 +5956,8 @@ $ok = 0; if ( defined $gdb ) { - if ( $carg->{"stack-shows-params"} - or $carg->{"stack-shows-locals"} ) + if ( $carg->{stack_shows_params} + or $carg->{stack_shows_locals} ) { @threads = gdb_dump_frames_per_thread( $gdb, 1 ); } else { @@ -5975,7 +5978,7 @@ } $tries++; } while ( ( $ok != 1 ) - and ( $tries < $carg->{"gdb-retry-count"} ) ); + and ( $tries < $carg->{gdb_retry_count} ) ); if ( not defined $threads[0]{id} ) { output( $vp, "Could not extract stack trace from application" ); @@ -6007,10 +6010,10 @@ . "() at " . ( $$frame{file} || "?" ) . ":" . ( $$frame{line} || "?" ) ); - if ( $carg->{"stack-shows-params"} ) { + if ( $carg->{stack_shows_params} ) { show_vars( $vp, $frame, "params" ); } - if ( $carg->{"stack-shows-locals"} ) { + if ( $carg->{stack_shows_locals} ) { show_vars( $vp, $frame, "locals" ); } @@ -6151,9 +6154,9 @@ # File is a csv file, # Name,c,function1,function2 - if ( defined $carg->{"mpi-watch-file"} ) { + if ( defined $carg->{mpi_watch_file} ) { my %fns; - my $f = $carg->{"mpi-watch-file"}; + my $f = $carg->{mpi_watch_file}; open( MW, $f ) or return; my @d = (); close(MW); @@ -6356,7 +6359,7 @@ my $job = shift; my $d = mpd_get_data(); - my $j = $d->{$job}{pids}{ $confInner{"hostname"} }; + my $j = $d->{$job}{pids}{ $confInner{hostname} }; foreach my $pid ( keys %{$j} ) { maybe_show_pid( $j->{$pid}, $pid ); @@ -6573,7 +6576,7 @@ my $uid = $<; my %scripts; - map { $scripts{$_}++ } split( ",", $confInner{"scripts"} ); + map { $scripts{$_}++ } split( ",", $confInner{scripts} ); my $ipids = $confInner{rmpids}; @@ -6633,7 +6636,7 @@ my %pd; $pd{pid} = $newpid; $pd{vp} = $ip->{rank}; - push( @{ $confInner{"all-pids"} }, \%pd ); ***The diff for this file has been truncated for email.*** From codesite-noreply at google.com Sun Aug 23 11:12:03 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sun, 23 Aug 2009 10:12:03 +0000 Subject: [padb-devel] [padb] r135 committed - Don't use "auto" for resource manager, use undef as we do for other... Message-ID: <00504502f5783262360471cc55f0@google.com> Revision: 135 Author: apittman Date: Sun Aug 23 03:11:41 2009 Log: Don't use "auto" for resource manager, use undef as we do for other config options. http://code.google.com/p/padb/source/detail?r=135 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sun Aug 23 03:00:38 2009 +++ /trunk/src/padb Sun Aug 23 03:11:41 2009 @@ -404,7 +404,7 @@ # Tuning options. $conf{prun_timeout} = 120; $conf{prun_exittimeout} = 120; -$conf{rmgr} = "auto"; +$conf{rmgr} = undef; $conf{slurm_job_step} = 0; @@ -2652,7 +2652,12 @@ # If it's been set on the command line and it's valid then just use what we are given. # Do sanity checks here but only warn on the result to cope with non-default installs. - if ( defined $rmgr{ $conf{rmgr} } ) { + if ( defined $conf{rmgr} ) { + if ( not defined $rmgr{ $conf{rmgr} } ) { + printf("Error, resource manager \"$conf{rmgr}\" not supported\n"); + exit(1); + } + if ( defined $rmgr{ $conf{rmgr} }{is_installed} and not $rmgr{ $conf{rmgr} }{is_installed}() ) { @@ -2663,11 +2668,6 @@ setup_rmgr( $conf{rmgr} ); return; } - - if ( $conf{rmgr} ne "auto" ) { - printf("Error, resource manager \"$conf{rmgr}\" not supported\n"); - exit(1); - } my @ok; foreach my $res ( sort( keys %rmgr ) ) { @@ -2699,7 +2699,12 @@ # If it's been set on the command line and it's valid then just use what we are given. # Do sanity checks here but only warn on the result to cope with non-default installs. - if ( defined $rmgr{ $conf{rmgr} } ) { + if ( defined $conf{rmgr} ) { + if ( not defined $rmgr{ $conf{rmgr} } ) { + printf("Error, resource manager \"$conf{rmgr}\" not supported\n"); + exit(1); + } + if ( defined $rmgr{ $conf{rmgr} }{is_installed} and not $rmgr{ $conf{rmgr} }{is_installed}() ) { @@ -2710,11 +2715,6 @@ setup_rmgr( $conf{rmgr} ); return; } - - if ( $conf{rmgr} ne "auto" ) { - printf("Error, resource manager \"$conf{rmgr}\" not supported\n"); - exit(1); - } my @installed; foreach my $res ( sort( keys %rmgr ) ) { From codesite-noreply at google.com Sun Aug 23 11:48:10 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sun, 23 Aug 2009 10:48:10 +0000 Subject: [padb-devel] [padb] r136 committed - Remove the (unused) output_dtype() function. Message-ID: <001636b2add46239930471ccd631@google.com> Revision: 136 Author: apittman Date: Sun Aug 23 03:47:38 2009 Log: Remove the (unused) output_dtype() function. http://code.google.com/p/padb/source/detail?r=136 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sun Aug 23 03:11:41 2009 +++ /trunk/src/padb Sun Aug 23 03:47:38 2009 @@ -4319,20 +4319,6 @@ print "$confInner{hostname}.$vp:$str\n"; } - -sub output_dtype { - my ( $vp, $ref ) = @_; - if ( defined $vp ) { - my $p = nfreeze $ref; - my $q = encode_base64($p); - foreach my $l ( split( "\n", $q ) ) { - print "$vp:raw:$l\n"; - } - } else { - my $str = Dumper $ref; - print "$confInner{hostname}.-1:ERROR: $str\n"; - } -} my %inner_output; @@ -5011,8 +4997,6 @@ my @mq = fetch_mpi_queue_gdb( $carg, $vp, $pid, $gdb ); $ret->{$vp} = \@mq; - - #output_dtype( $vp, \@mq ); } foreach my $proc (@all) { From codesite-noreply at google.com Sun Aug 23 12:05:14 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sun, 23 Aug 2009 11:05:14 +0000 Subject: [padb-devel] [padb] r137 committed - Removed a lot more quotes from hash keys where possible, some... Message-ID: <0016367f92fa625b390471cd1389@google.com> Revision: 137 Author: apittman Date: Sun Aug 23 04:04:28 2009 Log: Removed a lot more quotes from hash keys where possible, some still exist, for example in the interaction with gdb where they are gdb provided keys. Resource manager names still use dashes in places and these need further work to remove them completely. http://code.google.com/p/padb/source/detail?r=137 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sun Aug 23 03:47:38 2009 +++ /trunk/src/padb Sun Aug 23 04:04:28 2009 @@ -293,7 +293,7 @@ my %rmgr; -$rmgr{"rms"} = { +$rmgr{rms} = { 'is_installed' => \&rms_is_installed, 'get_active_jobs' => \&rms_get_jobs, 'job_is_running' => \&rms_job_is_running, @@ -302,7 +302,7 @@ 'find_pids' => \&rms_find_pids, }; -$rmgr{"mpd"} = { +$rmgr{mpd} = { 'is_installed' => \&mpd_is_installed, 'get_active_jobs' => \&mpd_get_jobs, 'setup_pcmd' => \&mpd_setup_pcmd, @@ -311,7 +311,7 @@ 'require_inner_callback' => 1, }; -$rmgr{"orte"} = { +$rmgr{orte} = { 'is_installed' => \&open_is_installed, 'get_active_jobs' => \&open_get_jobs, 'setup_pcmd' => \&open_setup_pcmd, @@ -326,7 +326,7 @@ 'inner_rmgr' => "rms", }; -$rmgr{"slurm"} = { +$rmgr{slurm} = { 'is_installed' => \&slurm_is_installed, 'get_active_jobs' => \&slurm_get_jobs, 'job_is_running' => \&slurm_job_is_running, @@ -335,7 +335,7 @@ 'require_inner_callback' => 1, }; -$rmgr{"local"} = { +$rmgr{local} = { 'get_active_jobs' => \&local_get_jobs, 'job_is_running' => \&local_job_is_running, 'setup_pcmd' => \&local_setup_pcmd, @@ -614,13 +614,13 @@ # Valid debug modes, a full list is maintained here so using unexpected # ones can generate warnings. -$debugModes{"full-duplex"} = undef; -$debugModes{"show-cmd"} = undef; -$debugModes{"all"} = undef; -$debugModes{"tree"} = undef; -$debugModes{"verbose"} = undef; -$debugModes{"signon"} = undef; -$debugModes{"rmgr"} = undef; +$debugModes{full_duplex} = undef; +$debugModes{show_cmd } = undef; +$debugModes{all} = undef; +$debugModes{tree} = undef; +$debugModes{verbose} = undef; +$debugModes{signon} = undef; +$debugModes{rmgr} = undef; sub parse_args_outer { @@ -2127,11 +2127,11 @@ # nodes run different versions of edb which can cause problems # XXX: Fixme. This isn't high enough. if ( $conf{prun_exittimeout} != 0 ) { - $ENV{"RMS_EXITTIMEOUT"} = $conf{prun_exittimeout}; + $ENV{RMS_EXITTIMEOUT} = $conf{prun_exittimeout}; } if ( $conf{prun_timeout} != 0 ) { - $ENV{"RMS_TIMELIMIT"} = $conf{prun_timeout}; + $ENV{RMS_TIMELIMIT} = $conf{prun_timeout}; } { @@ -2141,8 +2141,8 @@ # for a while, the second one is limited # to 'qsrmslibs-2.82-15' my $partition = rms_res_to_partition($res); - $ENV{"RMS_PARTITION"} = "$partition"; - $ENV{"RMS_RESOURCEID"} = "$partition.$res"; + $ENV{RMS_PARTITION} = $partition; + $ENV{RMS_RESOURCEID} = "$partition.$res"; } my $cmd = "prun -i /dev/null -T $res"; @@ -3297,7 +3297,7 @@ sub issue_command_to_inner { my ( $cdata, $cmd ) = @_; my $str = my_encode($cmd); - debug_log( "full-duplex", $cmd, "Sending command to inner, %d bytes", + debug_log( "full_duplex", $cmd, "Sending command to inner, %d bytes", length($str) ); $cdata->{socket}->print("$str\n"); } @@ -3317,7 +3317,7 @@ $req->{jobconfig}{rmgr} = $conf{rmgr}; if ( $conf{rmgr} eq "orte" ) { - $req->{jobconfig}{"orte-data"} = $open_jobs{ $comm_data->{jobid} }; + $req->{jobconfig}{orte_data} = $open_jobs{ $comm_data->{jobid} }; } $req->{cinner} = \%cinner; @@ -3454,7 +3454,7 @@ # A reply from inner. my $d = my_decode($line); - debug_log( "full-duplex", $d, "Reply from inner, %d bytes", length($line) ); + debug_log( "full_duplex", $d, "Reply from inner, %d bytes", length($line) ); # The inner process has signed on. if ( $comm_data->{current_req}->{mode} eq "signon" ) { @@ -3697,7 +3697,7 @@ map { $cmd .= " --$_=\"$cinner_cmd{$_}\"" } keys(%cinner_cmd); - debug_log( "show-cmd", undef, $cmd ); + debug_log( "show_cmd", undef, $cmd ); my $pcmd = { pid => -1, @@ -4227,7 +4227,7 @@ if ( defined($mode) and ( $mode eq "kill" ) ) { my $signal = uc( $secondary_args{signal} ); my %sig_names; - map { $sig_names{$_} = 1 } split( " ", $Config{"sig_name"} ); + map { $sig_names{$_} = 1 } split( " ", $Config{sig_name} ); if ( not defined $sig_names{$signal} ) { cmdline_error("$prog: Error: signal \"$signal\" is invalid\n"); @@ -5697,7 +5697,7 @@ sub proc_output { my ( $vp, $key, $value ) = @_; - if ( $confInner{mode} eq "proc-summary" ) { + if ( $confInner{mode} eq "proc_summary" ) { if ( defined $proc_keys{ lc($key) } ) { $proc_info->{$vp}{ lc($key) } = $value; } @@ -5824,7 +5824,7 @@ } } - if ( $confInner{mode} eq "proc-summary" ) { + if ( $confInner{mode} eq "proc_summary" ) { return $proc_info; } return; @@ -6027,8 +6027,7 @@ my %remote_env = get_remote_env($pid); if ( defined $remote_env{LD_LIBRARY_PATH} ) { - $ENV{"LD_LIBRARY_PATH"} = - "$remote_env{LD_LIBRARY_PATH}:$confInner{myld}"; + $ENV{LD_LIBRARY_PATH} = "$remote_env{LD_LIBRARY_PATH}:$confInner{myld}"; } my $lines = run_ptrack_cmd( $vp, $pid, @@ -6320,13 +6319,13 @@ my $jobid = shift; my @procs = - `scontrol listpids $jobid.$confInner{"slurm-job-step"} 2>/dev/null`; + `scontrol listpids $jobid.$confInner{slurm_job_step} 2>/dev/null`; return undef if ( $? != 0 ); foreach my $proc (@procs) { my ( $pid, $job, $step, $local, $global ) = split( " ", $proc ); next if ( $global eq "-" ); next unless ( $job eq $jobid ); - next unless ( $step == $confInner{"slurm-job-step"} ); + next unless ( $step == $confInner{slurm_job_step} ); maybe_show_pid( $global, $pid ); } } @@ -6355,9 +6354,9 @@ # Be careful here, we are the inner process then load data from # the outer. - if ( defined $confInner{"orte-data"} ) { + if ( defined $confInner{orte_data} ) { %open_jobs = (); - $open_jobs{$job} = $confInner{"orte-data"}; + $open_jobs{$job} = $confInner{orte_data}; } else { open_get_data(); } @@ -6635,7 +6634,7 @@ # data in $netdata->{target_responce} and $netdata->?? if ( $cmd->{jobconfig}{rmgr} eq "orte" ) { - $confInner{"orte-data"} = $cmd->{jobconfig}{"orte-data"}; + $confInner{orte_data} = $cmd->{jobconfig}{orte_data}; } # Query the resource manager to find the pids, @@ -6834,7 +6833,7 @@ sub inner_loop_for_comms { my ($outerloc) = @_; - my $server = create_local_port( $confInner{"port-range"} ); + my $server = create_local_port( $confInner{port_range} ); my $lport = $server->sockport(); my $hostname = hostname(); @@ -6961,7 +6960,7 @@ GetOptions(%optionhash) or confess("could not parse options\n"); - $confInner{myld} = $ENV{"LD_LIBRARY_PATH"}; + $confInner{myld} = $ENV{LD_LIBRARY_PATH}; inner_loop_for_comms( $confInner{outer} ); exit(0); @@ -7078,7 +7077,7 @@ }; - $allfns{"proc-summary"} = { + $allfns{proc_summary} = { 'handler_all' => \&show_proc_all, 'out_handler' => \&show_proc_format, 'arg_long' => 'proc-summary', From codesite-noreply at google.com Sun Aug 23 12:17:15 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sun, 23 Aug 2009 11:17:15 +0000 Subject: [padb-devel] [padb] r138 committed - Remove the dump-raw option as it's not widely used and superceeded... Message-ID: <00504502f57860b27a0471cd3ea9@google.com> Revision: 138 Author: apittman Date: Sun Aug 23 04:16:23 2009 Log: Remove the dump-raw option as it's not widely used and superceeded by the --debug code. http://code.google.com/p/padb/source/detail?r=138 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sun Aug 23 04:04:28 2009 +++ /trunk/src/padb Sun Aug 23 04:16:23 2009 @@ -381,7 +381,6 @@ # Debug options. $conf{verbose} = 0; -$conf{dump_raw} = 0; # Valid values are "none" "missing" or "all". Anything not recognised # is treated as "all". @@ -1459,8 +1458,6 @@ sub read_stats { my @data = @_; - printf Dumper \@data if $conf{dump_raw}; - my $header = parse_header( shift @data ); return undef unless $header; @@ -1472,8 +1469,6 @@ push( @out, $parsed ); } } - - printf Dumper \@out if $conf{dump_raw}; return \@out; } @@ -3000,8 +2995,6 @@ sub show_results { my ( $nlines, $mode, $handle ) = @_; - printf Dumper $nlines if $conf{dump_raw}; - my $lines = $nlines->{lines}; if ( defined $allfns{$mode}{out_handler} ) { From codesite-noreply at google.com Sun Aug 23 12:21:15 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sun, 23 Aug 2009 11:21:15 +0000 Subject: [padb-devel] [padb] r139 committed - Remove lots of commented out Dumper calls Message-ID: <00504502f52bb4ab8f0471cd4cc6@google.com> Revision: 139 Author: apittman Date: Sun Aug 23 04:19:30 2009 Log: Remove lots of commented out Dumper calls http://code.google.com/p/padb/source/detail?r=139 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sun Aug 23 04:16:23 2009 +++ /trunk/src/padb Sun Aug 23 04:19:30 2009 @@ -1801,8 +1801,6 @@ s/\n//g; push @data, $_; } - - # print Dumper \@data; my $s = read_stats(@data); @@ -2887,7 +2885,6 @@ sub show_mpi_watch { my ( $handle, $lines ) = @_; - # print Dumper $lines; my $s = ""; foreach my $l ( sort { $a <=> $b } ( keys %{ $lines->{target_responce} } ) ) { @@ -4849,9 +4846,6 @@ close( $h->{wtr} ); close( $h->{err} ); - # Useful for tuning the dll itself... - # print Dumper \%stats; - return @mq; } @@ -5137,8 +5131,6 @@ } return "$ret"; - - # print Dumper \%ad; } sub deadlock_detect { @@ -5155,7 +5147,6 @@ $data = $lines->{lines}; } - # print Dumper $data; my %coll_data; foreach my $rank ( keys( %{$data} ) ) { my $r = $data->{$rank}; @@ -5181,8 +5172,6 @@ } $coll_data{$rank} = \%lid; } - - # print Dumper \%coll_data; my $r = go_deadlock_detect( $carg, \%coll_data ); print $r; @@ -6778,7 +6767,6 @@ if ( not $cdata->{trusted} ) { if ( $line eq "hello $netdata->{key}" ) { - #printf("Trusting connection from $cdata->{desc}\n"); $cdata->{trusted} = 1; $cdata->{str} = ""; $s->printf("Welcome\n"); From codesite-noreply at google.com Sun Aug 23 12:26:16 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Sun, 23 Aug 2009 11:26:16 +0000 Subject: [padb-devel] [padb] r140 committed - Remove calls to hostname with the caches version if possible. Message-ID: <001636b149b5a1d2c50471cd5ed2@google.com> Revision: 140 Author: apittman Date: Sun Aug 23 04:25:47 2009 Log: Remove calls to hostname with the caches version if possible. http://code.google.com/p/padb/source/detail?r=140 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sun Aug 23 04:19:30 2009 +++ /trunk/src/padb Sun Aug 23 04:25:47 2009 @@ -380,7 +380,7 @@ my $rem_jobid; # Debug options. -$conf{verbose} = 0; +$conf{verbose} = 0; # Valid values are "none" "missing" or "all". Anything not recognised # is treated as "all". @@ -614,7 +614,7 @@ # Valid debug modes, a full list is maintained here so using unexpected # ones can generate warnings. $debugModes{full_duplex} = undef; -$debugModes{show_cmd } = undef; +$debugModes{show_cmd} = undef; $debugModes{all} = undef; $debugModes{tree} = undef; $debugModes{verbose} = undef; @@ -6334,18 +6334,10 @@ sub open_find_pids { my $job = shift; - # Be careful here, we are the inner process then load data from - # the outer. - if ( defined $confInner{orte_data} ) { - %open_jobs = (); - $open_jobs{$job} = $confInner{orte_data}; - } else { - open_get_data(); - } - my $hostname = hostname(); - - foreach my $rank ( keys( %{ $open_jobs{$job}{ranks}{$hostname} } ) ) { - maybe_show_pid( $rank, $open_jobs{$job}{ranks}{$hostname}{$rank} ); + my $hostname = $confInner{hostname}; + + foreach my $rank ( keys( %{ $confInner{orte_data}{ranks}{$hostname} } ) ) { + maybe_show_pid( $rank, $confInner{orte_data}{ranks}{$hostname}{$rank} ); } } @@ -6751,7 +6743,7 @@ sub reply_to_parent { my ( $netdata, $cmd ) = @_; - $cmd->{host_responce}{ok}{ hostname() } = 1; + $cmd->{host_responce}{ok}{ $confInner{hostname} } = 1; my $reply = my_encode($cmd); $netdata->{parent}->{socket}->printf("$reply\n"); @@ -6817,7 +6809,7 @@ my $server = create_local_port( $confInner{port_range} ); my $lport = $server->sockport(); - my $hostname = hostname(); + my $hostname = $confInner{hostname}; my $key = rand(); if ( defined $outerloc ) { @@ -6860,7 +6852,6 @@ my $ip = inet_ntoa($addr); my $hostname = gethostbyaddr( $addr, AF_INET ); - #printf "New connection from $hostname ($ip) $port\n"; my %sinfo; $sinfo{hostname} = $hostname; $sinfo{trusted} = 0; @@ -6869,9 +6860,6 @@ $sinfo{socket} = $new; $sinfo{line_cb} = \&command_from_outer; $netdata->{connections}{$new} = \%sinfo; - - # $new->printf("Hello from padb\n"); - #$new->autoflush(); next; } From codesite-noreply at google.com Mon Aug 24 09:39:20 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Mon, 24 Aug 2009 08:39:20 +0000 Subject: [padb-devel] [padb] r141 committed - Convert the secondary are to underscores as well, only changing them... Message-ID: <00504502f5616f4f380471df2737@google.com> Revision: 141 Author: apittman Date: Mon Aug 24 01:38:47 2009 Log: Convert the secondary are to underscores as well, only changing them to dashes when setting the command line options for them. Now all code that can be uses underscores and therefore quotes aren't needed in hash keys. http://code.google.com/p/padb/source/detail?r=141 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Sun Aug 23 04:25:47 2009 +++ /trunk/src/padb Mon Aug 24 01:38:47 2009 @@ -2841,11 +2841,11 @@ $main_idx = 0 if not defined $main_idx; if ( $main_idx != 0 or $wait_idx != 0 ) { my $end = - ( $cargs->{"strip-above-wait"} and $wait_idx ) + ( $cargs->{strip_above_wait} and $wait_idx ) ? $wait_idx : $#{ $lines->{$tag} }; my $start = - ( $cargs->{"strip-below-main"} and $main_idx ) ? $main_idx : 0; + ( $cargs->{strip_below_main} and $main_idx ) ? $main_idx : 0; printf( "Stripping 0.." . $#{ $lines->{$tag} } @@ -2910,7 +2910,7 @@ my $separator = $carg->{column_seperator}; - my @columns = split( ",", $carg->{"proc-format"} ); + my @columns = split( ",", $carg->{proc_format} ); foreach my $column (@columns) { $show_fields = 1 if ( $column eq "fields" ); @@ -3087,7 +3087,7 @@ $output = $req->{out_format} if defined $req->{out_format}; if ( $mode eq "stack" or $input_file ) { - if ( $cargs->{"strip-below-main"} or $cargs->{"strip-above-wait"} ) { + if ( $cargs->{strip_below_main} or $cargs->{strip_above_wait} ) { strip_stack_traces( $cargs, $lines ); } } @@ -4152,7 +4152,7 @@ push_command("deadlock"); my %c; - $c{"strip-above-wait"} = 0; + $c{strip_above_wait} = 0; push_command( "stack", "tree", \%c ); go_job($full_report); @@ -5693,8 +5693,8 @@ %proc_keys = (); - if ( defined $carg->{"proc-format"} ) { - my @columns = split( ",", $carg->{"proc-format"} ); + if ( defined $carg->{proc_format} ) { + my @columns = split( ",", $carg->{proc_format} ); foreach my $column (@columns) { my ( $name, $desc ) = split( "=", $column ); $proc_keys{ lc($name) } = 1; @@ -6957,6 +6957,7 @@ sub to_arg { my $arg = shift; my $res = "$arg->{arg_long}"; + $res =~ s/\_/-/g; if ( defined $arg->{arg_short} ) { $res .= "|$arg->{arg_short}"; } @@ -7064,7 +7065,7 @@ }, 'secondary' => [ { - 'arg_long' => 'proc-format', + 'arg_long' => 'proc_format', 'type' => '=s', 'default' => 'vp=vpid,hostname,pid,vmsize,vmrss,stat.state=S,load1=uptime,pcpu=%cpu,stat.processor=lcore,name=command' @@ -7088,12 +7089,12 @@ }, 'secondary' => [ { - 'arg_long' => 'strip-below-main', + 'arg_long' => 'strip_below_main', 'type' => '!', 'default' => 1, }, { - 'arg_long' => 'strip-above-wait', + 'arg_long' => 'strip_above_wait', 'type' => '!', 'default' => 1, }, From codesite-noreply at google.com Mon Aug 24 09:58:23 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Mon, 24 Aug 2009 08:58:23 +0000 Subject: [padb-devel] [padb] r142 committed - Update the TODO and remove some old long-expired comments. Message-ID: <0016e64dda8e968c670471df6b42@google.com> Revision: 142 Author: apittman Date: Mon Aug 24 01:57:58 2009 Log: Update the TODO and remove some old long-expired comments. http://code.google.com/p/padb/source/detail?r=142 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Mon Aug 24 01:38:47 2009 +++ /trunk/src/padb Mon Aug 24 01:57:58 2009 @@ -191,22 +191,16 @@ # * Multi-pass argument handling, --kill also accepts --signal for example, # this should really be done at the getopt layer. Also proper usage # info for these secondary args. -# * The mode {handler} functions should only be called once per node, it could then -# correctly handle $confInner{gdb_file} and also attach to every process per node -# simultaneously, this would help stack trace and message queue support as doing -# then one at a time results in weird artifacts. (Done for stack traces but not -# message queues). -# * Output parsing, {out_handler} is a good start but in stack traces the tree -# format is optional, maybe have the secondary arg have a {out_hander} attached? +# * Paramater checking of secondary args, signal has a hacky implementation and +# port-range doesn't have any checking currently. # * libunwind support? lighter weight than gdb and possibly more reliable. -# * --watch should launch one persistent parallel job rather than a new one every go, -# this would require full-duplex comms between inner and outer however so could -# present scaling problems. Maybe PMI would help here? +# * Maybe PMI would help? # * POD? generated man page? # * mode specific defaults, for example --mpi-watch should enable --watch # -Owatch-clears-screen=0 # * Make -q fallback to -Q if tports are not available # * ??? +# * Allow ranges of ranks to be specified. ############################################################################### @@ -3836,15 +3830,9 @@ # Setup whatever is needed for running parallel commands, note this might # involve setting environment variables. - my @res = setup_pcmd($jobid); - - return 1 unless (@res); - - my $cmd = $res[0]; - - # These two are only defined by some resource managers. - my $ncpus = $res[1]; - my $hosts = $res[2]; + my ( $cmd, $ncpus, $hosts ) = setup_pcmd($jobid); + + return 1 unless ($cmd); $conf{verbose} && defined $ncpus && print "Job has $ncpus process(es)\n"; $conf{verbose} && defined $hosts && print "Job spans $hosts host(s)\n"; @@ -3852,27 +3840,7 @@ debug_log( "verbose", undef, "There are %d processes over %d hosts", $ncpus, $hosts ); - # Maybe do it this way, edb works best when run with the same LD_LIBRARY_PATH - # as the application. It's very important when running the message queue - # extraction code but less so here. You may find you get linker errors though - # although they shouldn't be to hard to work around. - - # Another problem, if using slurm then the key isn't valid, you need to - # convert from jobId to key locally on the node, hence you need to use - # a padb-helper process - #if ( $stats_total or $group ) { - # $stats = 1; - # if ( defined $key ) { - # $cmd .= - # " $conf{edb} --stats-raw --parallel --key=$key $conf{edbopt}"; - # } else { - # $cmd .= - # " $0 --inner --jobid=$rem_jobid $rops --stats-full $conf{edbopt}"; - # } - #} else { $cmd .= " $0 --inner"; - - #} if ( not defined $hosts ) { printf("Full duplex mode needs to know the host count\n"); @@ -4131,28 +4099,12 @@ printf("padb version $version\n"); printf("full job report for job $full_report\n\n"); - - # Bit of a cheat here, do two things at once... - # This should probably me modified to work better on - # non Quadrics systems. - #my $res; - #$stats_total = 1; - #$group = 1; - #$res = go_job( $full_report, "stats" ); - #undef $stats_total; - #undef $group; - - # Don't exit on failure here. - #if ( $res != 0 ) { - # exit 1; - #} push_command( "mqueue", "compress" ); push_command("deadlock"); - my %c; - $c{strip_above_wait} = 0; + my %c = ( 'strip_above_wait' => 0 ); push_command( "stack", "tree", \%c ); go_job($full_report); From codesite-noreply at google.com Mon Aug 24 16:24:11 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Mon, 24 Aug 2009 15:24:11 +0000 Subject: [padb-devel] [padb] r143 committed - Add a mechanism for getting error strings back from the rank targettin... Message-ID: <000e0cd117da4d5ea10471e4cfe4@google.com> Revision: 143 Author: apittman Date: Mon Aug 24 08:23:48 2009 Log: Add a mechanism for getting error strings back from the rank targetting code to the outer process and onto the user. Each rank can report error via a call to target_error($vp,$errstr), which then causes the errstr to be passed back up the tree and eventually onto the user. The data for this is reduced in the network properly using some new merge_ranges() functions for merging node ranges. Each rank can report more than one error, errors are reported in alphabetical order with each error message also stating which ranks received that error. http://code.google.com/p/padb/source/detail?r=143 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Mon Aug 24 01:57:58 2009 +++ /trunk/src/padb Mon Aug 24 08:23:48 2009 @@ -3470,6 +3470,14 @@ # may not be the same thing as the request we are currently # sending. my $mode = $comm_data->{current_req}->{mode}; + + if ( defined $d->{target_error} ) { + printf("Warning: errors reported by some ranks\n========\n"); + foreach my $error ( sort( keys( %{ $d->{target_error} } ) ) ) { + printf("$d->{target_error}{$error}: $error\n"); + } + printf("========\n"); + } if ( defined( $allfns{$mode}{out_handler} ) ) { $allfns{$mode}{out_handler}( $conf{mode_options}{$mode}, $d ); } else { @@ -3579,6 +3587,130 @@ $cdata{event_cb} = \&handle_event_from_socket; $comm_data->{sockets}{$new} = \%cdata; } + +sub sub_range_assemble { + my ( $lower, $upper ) = @_; + + if ( $lower == $upper ) { + return $lower; + } else { + return "$lower-$upper"; + } +} + +sub add_value_to_range { + my ( $range, $value ) = @_; + + if ( $range =~ m/^\[([\d\-\,]+)\]$/ ) { + $range = $1; + } else { + carp("Bad range $range"); + printf("Bad range $range"); + return undef; + + } + + my @parts; + + my $added = 0; + + # Walk through the individal contigous sub-ranges adding the + # value to any subrange it's adjacent to. If it falls between two + # then insert it as a new subrange, if it's not found then add it + # after the last subrange. + # If a sub-range is extended upwards then it's possible it not + # meets the subsequent one so handle that case as well. + + my $prev_part_upper; + my $prev_part_lower; + + foreach my $part ( split( ",", $range ) ) { + my $lower; + my $upper; + + if ( $part =~ m/^(\d+)$/ ) { + $lower = $1; + $upper = $1; + } elsif ( $part =~ m/^(\d+)\-(\d+)$/ ) { + $lower = $1; + $upper = $2; + } else { + carp("Bad range $range"); + return undef; + } + + if ( not $added ) { + if ( $lower == $value + 1 ) { + $lower--; + $added = 1; + } elsif ( $upper == $value - 1 ) { + + # Extend the current part upwards to include the new value. + $upper++; + $added = 1; + $prev_part_upper = $upper; + $prev_part_lower = $lower; + next; + } elsif ( $lower > $value ) { + + # If we have skipped over the value then add it now. + # note that we are adding it here and hence cannot have extended + # the last sub-range. + push( @parts, $value ); + $added = 1; + } + } + + if ( defined $prev_part_upper ) { + + # We have extended a segment. + if ( $prev_part_upper == $lower - 1 ) { + $part = sub_range_assemble( $prev_part_lower, $upper ); + push( @parts, $part ); + } else { + $part = + sub_range_assemble( $prev_part_lower, $prev_part_upper ); + push( @parts, $part ); + $part = sub_range_assemble( $lower, $upper ); + push( @parts, $part ); + } + undef($prev_part_upper); + undef($prev_part_lower); + } else { + $part = sub_range_assemble( $lower, $upper ); + push( @parts, $part ); + } + } + + if ( defined $prev_part_upper ) { + my $part = sub_range_assemble( $prev_part_lower, $prev_part_upper ); + push( @parts, $part ); + } + + # If we haven't add it stick it on the end now. + if ( not $added ) { + push( @parts, $value ); + } + + my $newrange = join( ",", @parts ); + + return "[$newrange]"; +} + +# Merge two port ranges. For now just do the simple thing, +# this really needs to be re-visited from a scalability aspect however. +sub merge_ranges { + my ( $left, $right ) = @_; + + my ( $val, $range ) = shift_from_range($right); + + while ( defined $val ) { + $left = add_value_to_range( $left, $val ); + ( $val, $range ) = shift_from_range($range); + } + + return "[$left]"; +} # "shift" a rank or port number from the standard spec format, returns the id # and the range with the first entry removed. Returns both the first entry and @@ -4263,6 +4395,7 @@ } my %inner_output; +my %inner_error; sub output { my ( $vp, $str ) = @_; @@ -4270,6 +4403,18 @@ push( @{ $inner_output{$vp} }, $str ); } + +# Report a single string error for a given target rank. +sub target_error { + my ( $rank, $error ) = @_; + + if ( defined $inner_error{$error} ) { + $inner_error{$error} = + add_value_to_range( $inner_error{$error}, $rank ); + } else { + $inner_error{$error} = "[$rank]"; + } +} sub p_die { my ( $vp, $str ) = @_; @@ -5833,9 +5978,9 @@ push( @all, $proc ); } else { if ( defined $gdb->{error} ) { - output $vp, $gdb->{error}; + target_error( $vp, $gdb->{error} ); } else { - output $vp, "Failed to attach to process"; + target_error( $vp, "Failed to attach to process" ); } } @@ -6400,8 +6545,6 @@ # Merge this reply into the local one. $handle->{child_replys}++; - # $handle->{all_replys}{raw}{ $sd->{hostname} } = $r; - # Combine the host responces. foreach my $status ( keys( %{ $r->{host_responce} } ) ) { foreach my $host ( keys( %{ $r->{host_responce}{$status} } ) ) { @@ -6418,7 +6561,7 @@ } } - # Combine the target process responces. + # Combine the target process responces from child. if ( exists $r->{target_output} ) { foreach my $tp ( keys( %{ $r->{target_output} } ) ) { $handle->{all_replys}->{target_output}{$tp} = @@ -6426,12 +6569,29 @@ } } + # Copy the target local responces. if ( exists $handle->{target_responce} ) { foreach my $tp ( keys( %{ $handle->{target_responce} } ) ) { $handle->{all_replys}->{target_responce}{$tp} = $handle->{target_responce}{$tp}; } } + + # Copy the network target errors into responce. + if ( exists $r->{target_error} ) { + $handle->{all_replys}->{target_error} = $r->{target_error}; + } + + # Merge in local target responces. + foreach my $key ( keys(%inner_error) ) { + if ( defined $handle->{all_replys}->{target_error}{$key} ) { + $handle->{all_replys}->{target_error}{$key} = + merge_ranges( $handle->{all_replys}->{target_error}{$key}, + $inner_error{$key} ); + } else { + $handle->{all_replys}->{target_error}{$key} = $inner_error{$key}; + } + } # Save any output we've got from this node. foreach my $key ( keys(%inner_output) ) { @@ -6439,6 +6599,7 @@ } %inner_output = (); + %inner_error = (); # If this isn't the last child to signon don't reply up-stream yet. if ( $handle->{child_replys} != $handle->{children} ) { @@ -6457,6 +6618,7 @@ $handle->{all_replys} = undef; $handle->{child_replys} = 0; $handle->{target_responce} = undef; + $handle->{target_error} = undef; } # Convert from a pid to a command name and do it in a safe manner to avoid @@ -6673,7 +6835,6 @@ } if ($res) { $netdata->{target_responce} = $res; - $netdata->{all_replys}{target_responce} = $res; } return; @@ -6698,7 +6859,7 @@ $cmd->{host_responce}{ok}{ $confInner{hostname} } = 1; my $reply = my_encode($cmd); - $netdata->{parent}->{socket}->printf("$reply\n"); + $netdata->{parent}->{socket}->print("$reply\n"); } # Process a single line of input onto a socket we are @@ -6743,9 +6904,15 @@ $res->{target_output}{$key} = $inner_output{$key}; } - %inner_output = (); + if (%inner_error) { + $res->{target_error} = \%inner_error; + } reply_to_parent( $netdata, $res ); + + # Clear down the local inputs. + %inner_error = (); + %inner_output = (); $netdata->{target_responce} = undef; if ( $netdata->{shutdown} ) { From codesite-noreply at google.com Mon Aug 24 16:29:17 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Mon, 24 Aug 2009 15:29:17 +0000 Subject: [padb-devel] [padb] r144 committed - Sort ranks numerically if using the default (prefix-based) output... Message-ID: <00504502cbef87f8880471e4e1b9@google.com> Revision: 144 Author: apittman Date: Mon Aug 24 08:28:18 2009 Log: Sort ranks numerically if using the default (prefix-based) output formatting. http://code.google.com/p/padb/source/detail?r=144 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Mon Aug 24 08:23:48 2009 +++ /trunk/src/padb Mon Aug 24 08:28:18 2009 @@ -3120,7 +3120,7 @@ } } else { my $nprocesses = keys( %{ $d->{target_output} } ); - foreach my $process ( sort( keys( %{ $d->{target_output} } ) ) ) { + foreach my $process ( sortn( keys( %{ $d->{target_output} } ) ) ) { foreach my $line ( @{ $d->{target_output}{$process} } ) { if ( $nprocesses == 1 ) { print "$line\n"; From codesite-noreply at google.com Mon Aug 24 16:50:28 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Mon, 24 Aug 2009 15:50:28 +0000 Subject: [padb-devel] [padb] r145 committed - Pass any attach errors from gdb back to the user. Message-ID: <000e0cd5f6ac4ca9170471e52d6c@google.com> Revision: 145 Author: apittman Date: Mon Aug 24 08:50:05 2009 Log: Pass any attach errors from gdb back to the user. http://code.google.com/p/padb/source/detail?r=145 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Mon Aug 24 08:28:18 2009 +++ /trunk/src/padb Mon Aug 24 08:50:05 2009 @@ -4467,9 +4467,9 @@ my ( $gdb, $pid ) = @_; send_cont_signal($pid); - my $result = gdb_send( $gdb, "attach $pid" ); - - if ( not defined $result ) { + my %p = gdb_n_send( $gdb, "attach $pid" ); + + if ( not defined $p{status} ) { $gdb->{error} = "Failed to attach to process"; if ( not find_exe("gdb") ) { $gdb->{error} = "Failed to attach to process (gdb not installed?)"; @@ -4477,7 +4477,15 @@ return; } - return if ( $result eq "error" ); + if ( $p{status} eq "error" ) { + my $r = gdb_parse_reason( $p{reason} ); + if ( defined $r->{msg} ) { + $gdb->{error} = "Failed to attach to process: $r->{msg}"; + } else { + $gdb->{error} = "Failed to attach to process"; + } + return; + } $gdb->{attached} = 1; $gdb->{tracepid} = $pid; From codesite-noreply at google.com Mon Aug 24 18:33:22 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Mon, 24 Aug 2009 17:33:22 +0000 Subject: [padb-devel] [padb] r146 committed - Catch the case where message queues are inspected but the minfo binary... Message-ID: <001636e1fdb150da190471e69d4e@google.com> Revision: 146 Author: apittman Date: Mon Aug 24 10:33:09 2009 Log: Catch the case where message queues are inspected but the minfo binary isn't found. Detect this case and report a sensible error message to the user. http://code.google.com/p/padb/source/detail?r=146 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Mon Aug 24 08:50:05 2009 +++ /trunk/src/padb Mon Aug 24 10:33:09 2009 @@ -4400,6 +4400,10 @@ sub output { my ( $vp, $str ) = @_; + if ( not defined $str ) { + carp("no output"); + } + push( @{ $inner_output{$vp} }, $str ); } @@ -4942,15 +4946,31 @@ debug( $vp, "Failed dll request $r\n" ); } } else { - push @mq, $r; + push( @mq, $r ); } } + + my $sc = keys(%stats); waitpid( $h->{hpid}, 0 ); close( $h->{rdr} ); close( $h->{wtr} ); close( $h->{err} ); + if ( $sc == 0 ) { + + # No interaction was had with minfo, abort with nothing. + target_error( $vp, "Error running $confInner{minfo}: No contact" ); + return undef; + } + + if ( $? ne 0 ) { + + # Bad exit code but we did talk to it so run with what we have. + target_error( $vp, + "Error running $confInner{minfo}: Bad exit code $?" ); + } + return @mq; } @@ -5011,6 +5031,7 @@ my ( $carg, $vp, $pid ) = @_; my @mq = fetch_mpi_queue( $carg, $vp, $pid ); + return unless $mq[0]; foreach my $o (@mq) { output( $vp, $o ); } @@ -5043,8 +5064,10 @@ my $gdb = $proc->{gdb}; my @mq = fetch_mpi_queue_gdb( $carg, $vp, $pid, $gdb ); - foreach my $o (@mq) { - output( $vp, $o ); + if ( $mq[0] ) { + foreach my $o (@mq) { + output( $vp, $o ); + } } } From codesite-noreply at google.com Tue Aug 25 10:51:13 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 09:51:13 +0000 Subject: [padb-devel] [padb] r147 committed - Add the hostname and port information to the error message when... Message-ID: <001636ed642d6697800471f4469e@google.com> Revision: 147 Author: apittman Date: Tue Aug 25 02:51:02 2009 Log: Add the hostname and port information to the error message when failing to connect to host. http://code.google.com/p/padb/source/detail?r=147 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Mon Aug 24 10:33:09 2009 +++ /trunk/src/padb Tue Aug 25 02:51:02 2009 @@ -3188,7 +3188,7 @@ PeerAddr => $host, PeerPort => $port, Proto => 'tcp', - ) or die("Failed to connect to child"); + ) or die("Failed to connect to child ($host:port)"); print $socket "hello $word\n"; From codesite-noreply at google.com Tue Aug 25 11:03:15 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 10:03:15 +0000 Subject: [padb-devel] [padb] r148 committed - Strip out all the range code I wrote yesterday and replace it ... Message-ID: <001636e1fd886d8dc30471f47124@google.com> Revision: 148 Author: apittman Date: Tue Aug 25 03:02:09 2009 Log: Strip out all the range code I wrote yesterday and replace it with something simpler and faster. Add a new set of rng_ functions that work with ranges in a array rather than converting back to a string each time. http://code.google.com/p/padb/source/detail?r=148 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 02:51:02 2009 +++ /trunk/src/padb Tue Aug 25 03:02:09 2009 @@ -3188,7 +3188,7 @@ PeerAddr => $host, PeerPort => $port, Proto => 'tcp', - ) or die("Failed to connect to child ($host:port)"); + ) or die("Failed to connect to child ($host:$port)"); print $socket "hello $word\n"; @@ -3474,7 +3474,8 @@ if ( defined $d->{target_error} ) { printf("Warning: errors reported by some ranks\n========\n"); foreach my $error ( sort( keys( %{ $d->{target_error} } ) ) ) { - printf("$d->{target_error}{$error}: $error\n"); + printf( "%s: $error\n", + rng_convert_to_user( $d->{target_error}{$error} ) ); } printf("========\n"); } @@ -3588,175 +3589,154 @@ $comm_data->{sockets}{$new} = \%cdata; } -sub sub_range_assemble { - my ( $lower, $upper ) = @_; - - if ( $lower == $upper ) { - return $lower; - } else { - return "$lower-$upper"; - } -} - -sub add_value_to_range { - my ( $range, $value ) = @_; - - if ( $range =~ m/^\[([\d\-\,]+)\]$/ ) { - $range = $1; - } else { - carp("Bad range $range"); - printf("Bad range $range"); - return undef; - - } - - my @parts; - - my $added = 0; - - # Walk through the individal contigous sub-ranges adding the - # value to any subrange it's adjacent to. If it falls between two - # then insert it as a new subrange, if it's not found then add it - # after the last subrange. - # If a sub-range is extended upwards then it's possible it not - # meets the subsequent one so handle that case as well. - - my $prev_part_upper; - my $prev_part_lower; - - foreach my $part ( split( ",", $range ) ) { - my $lower; - my $upper; - - if ( $part =~ m/^(\d+)$/ ) { - $lower = $1; - $upper = $1; - } elsif ( $part =~ m/^(\d+)\-(\d+)$/ ) { - $lower = $1; - $upper = $2; - } else { - carp("Bad range $range"); - return undef; - } - - if ( not $added ) { - if ( $lower == $value + 1 ) { - $lower--; - $added = 1; - } elsif ( $upper == $value - 1 ) { - - # Extend the current part upwards to include the new value. - $upper++; - $added = 1; - $prev_part_upper = $upper; - $prev_part_lower = $lower; - next; - } elsif ( $lower > $value ) { - - # If we have skipped over the value then add it now. - # note that we are adding it here and hence cannot have extended - # the last sub-range. - push( @parts, $value ); - $added = 1; - } - } - - if ( defined $prev_part_upper ) { - - # We have extended a segment. - if ( $prev_part_upper == $lower - 1 ) { - $part = sub_range_assemble( $prev_part_lower, $upper ); - push( @parts, $part ); - } else { - $part = - sub_range_assemble( $prev_part_lower, $prev_part_upper ); - push( @parts, $part ); - $part = sub_range_assemble( $lower, $upper ); - push( @parts, $part ); - } - undef($prev_part_upper); - undef($prev_part_lower); +############################################################################### +# +# Range mapping functions. +# +############################################################################### + +# A common set of functions for dealing with (integer based) ranges. +# +# Internally a array format is used for speed, functions exist to convert from +# the normal list format "[0-12,15,16]" to the internal one and back again. + +# rng_convert_from_user($userrange) +# rc_convert_to_user($range) +# Convert to and from the normal type to the internal type. + +# rng_shift($range) +# Pop the lowest value off the range. + +# rng_add_value($range,$value) +# Add a value to the range. + +# rng_merge($range,$new) +# Merge two ranges. + +# Potentially needed but not implemented yet + +# rng_user_verify() +# is_value_in_range() +# nvalues_in_range() + +# Convert from a user range to a internal one. +sub rng_convert_from_user { + my ($range) = @_; + + return undef unless defined $range; + return undef if $range eq ""; + return undef if $range eq "[]"; + + my $newrange; + + if ( $range =~ m/^\[([\d\-\,]+)\]$/ ) { + $newrange = $1; + } elsif ( $range =~ m/^(\d+)$/ ) { + $newrange = $1; + } else { + confess("Failed to recognise $range as range\n"); + } + + my @user_parts = split( ",", $newrange ); + + my @parts; + + foreach my $part (@user_parts) { + my %part; + if ( $part =~ m/^(\d+)$/ ) { + $part{l} = $1; + $part{u} = $1; + } elsif ( $part =~ m/^(\d+)\-(\d+)$/ ) { + $part{l} = $1; + $part{u} = $2; } else { - $part = sub_range_assemble( $lower, $upper ); - push( @parts, $part ); - } - } - - if ( defined $prev_part_upper ) { - my $part = sub_range_assemble( $prev_part_lower, $prev_part_upper ); - push( @parts, $part ); - } - - # If we haven't add it stick it on the end now. - if ( not $added ) { - push( @parts, $value ); - } - - my $newrange = join( ",", @parts ); - - return "[$newrange]"; + confess("Failed to recognise $part as range\n"); + } + push( @parts, \%part ); + } + return \@parts; +} + +sub rng_convert_to_user { + my ($rg) = @_; + + my @entries; + foreach my $part ( @{$rg} ) { + if ( $part->{l} == $part->{u} ) { + push( @entries, $part->{l} ); + } else { + push( @entries, "$part->{l}-$part->{u}" ); + } + } + my $range = join( ",", @entries ); + return "[$range]"; } -# Merge two port ranges. For now just do the simple thing, -# this really needs to be re-visited from a scalability aspect however. -sub merge_ranges { - my ( $left, $right ) = @_; - - my ( $val, $range ) = shift_from_range($right); - - while ( defined $val ) { - $left = add_value_to_range( $left, $val ); - ( $val, $range ) = shift_from_range($range); - } - - return "[$left]"; +sub rng_shift { + my ($rg) = @_; + + # Return undef if this range is empty. + return undef if ( $#{$rg} == -1 ); + + my $value = $rg->[0]->{l}; + if ( $rg->[0]->{l} == $rg->[0]->{u} ) { + shift( @{$rg} ); + } else { + $rg->[0]->{l}++; + } + return $value; } -# "shift" a rank or port number from the standard spec format, returns the id -# and the range with the first entry removed. Returns both the first entry and -# the new range with the first removed. -sub shift_from_range { - my ($range) = @_; - - my $newrange; - - return undef unless defined $range; - return undef if $range eq ""; - return undef if $range eq "[]"; - - if ( $range =~ m/^\[([\d\-\,]+)\]$/ ) { - $newrange = $1; - } elsif ( $range =~ m/^(\d+)$/ ) { - $newrange = $1; - } else { - die("Failed to recognise $range as range\n"); - } - - my @parts = split( ",", $newrange ); - - my $part = shift(@parts); - - my $id; - - if ( $part =~ m/^(\d+)$/ ) { - $id = $1; - } elsif ( $part =~ m/^(\d+)\-(\d+)$/ ) { - my $lower = $1; - my $upper = $2; - $id = $lower; - if ( $lower > $upper ) { - die("Invalid range $lower-$upper\n"); - } - if ( $lower++ != $upper ) { - unshift( @parts, "$lower-$upper" ); - } - } else { - confess("Failed to recognise $part as range\n"); - } - - my $r = join( ",", @parts ); - - return ( $id, "[$r]" ); +sub rng_add_value { + my ( $rg, $value ) = @_; + + # If it's after the last value then just add it. + if ( $value > $rg->[-1]->{u} + 1 ) { + push( @{$rg}, { 'l' => $value, 'u' => $value } ); + return; + } + + my $idx = 0; + foreach my $part ( @{$rg} ) { + + if ( $value == $part->{l} - 1 ) { + + # Extend the current entry downwards. + $part->{l}--; + return; + } elsif ( $value < $part->{l} ) { + + # If it's before the current entry then insert it. + splice( @{$rg}, $idx, 0, { 'l' => $value, 'u' => $value } ); + return; + } elsif ( $value == $part->{u} + 1 ) { + + # Extend the current entry upwards. + $part->{u}++; + + # If we meet the subsequent entry then merge the two. + if ( exists $rg->[ $idx + 1 ] + and $part->{u} + 1 == $rg->[ $idx + 1 ]->{l} ) + { + $part->{u} = $rg->[ $idx + 1 ]->{u}; + splice( @{$rg}, $idx + 1, 1 ); + } + return; + } + $idx++; + } + confess("Failed to add value to range"); +} + +sub rng_merge { + my ( $rg, $new ) = @_; + + # Need to use defined here as zero is a valid value to store + # in a range. + while ( defined( my $val = rng_shift($new) ) ) { + rng_add_value( $rg, $val ); + } + return; } sub create_local_port { @@ -3774,18 +3754,15 @@ return $sl; } - my $irange = $range; - my $port; - - while ( ( $port, $range ) = shift_from_range($range) and defined $port ) { + my $rg = rng_convert_from_user($range); + + while ( my $port = rng_shift($rg) ) { $options{LocalPort} = $port; my $sl = IO::Socket::INET->new(%options); return $sl if defined $sl; - - #$range = $nrange; } - die("Failed to create local port, no free ports in range \"$irange\"\n"); + die("Failed to create local port, no free ports in range \"$range\"\n"); } sub go_parallel { @@ -4413,10 +4390,9 @@ my ( $rank, $error ) = @_; if ( defined $inner_error{$error} ) { - $inner_error{$error} = - add_value_to_range( $inner_error{$error}, $rank ); + rng_add_value( $inner_error{$error}, $rank ); } else { - $inner_error{$error} = "[$rank]"; + $inner_error{$error} = rng_convert_from_user($rank); } } @@ -6616,8 +6592,7 @@ # Merge in local target responces. foreach my $key ( keys(%inner_error) ) { if ( defined $handle->{all_replys}->{target_error}{$key} ) { - $handle->{all_replys}->{target_error}{$key} = - merge_ranges( $handle->{all_replys}->{target_error}{$key}, + rng_merge( $handle->{all_replys}->{target_error}{$key}, $inner_error{$key} ); } else { $handle->{all_replys}->{target_error}{$key} = $inner_error{$key}; From codesite-noreply at google.com Tue Aug 25 11:10:16 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 10:10:16 +0000 Subject: [padb-devel] [padb] r149 committed - Don't attempt to display output in the default output handler... Message-ID: <00504502cb1e8754650471f48a88@google.com> Revision: 149 Author: apittman Date: Tue Aug 25 03:10:00 2009 Log: Don't attempt to display output in the default output handler if there isn't any. http://code.google.com/p/padb/source/detail?r=149 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 03:02:09 2009 +++ /trunk/src/padb Tue Aug 25 03:10:00 2009 @@ -3072,7 +3072,8 @@ my $cargs = $req->{cargs}; - # Could warn on missing output here... + # Warn on missing output here... + return unless exists $d->{target_output}; my $lines = $d->{target_output}; my $mode = $req->{mode}; From codesite-noreply at google.com Tue Aug 25 11:59:42 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 10:59:42 +0000 Subject: [padb-devel] [padb] r150 committed - Convert the rank handling code (-r) to use ranges everywhere. Now... Message-ID: <001636d34a3e4ce1b90471f53b19@google.com> Revision: 150 Author: apittman Date: Tue Aug 25 03:59:14 2009 Log: Convert the rank handling code (-r) to use ranges everywhere. Now padb can take rank ranges on the command line. http://code.google.com/p/padb/source/detail?r=150 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 03:10:00 2009 +++ /trunk/src/padb Tue Aug 25 03:59:14 2009 @@ -552,7 +552,8 @@ ############################################################################### my $user = getpwuid($<); -my @ranks; +my $rank_rng; + my @target_groups; my $all; my $any; @@ -620,10 +621,12 @@ Getopt::Long::Configure("bundling"); my $debugflag; + my @ranks; + my %optionhash = ( "verbose|v+" => \$conf{verbose}, "user|u=s" => \$user, - "rank|r=i" => \@ranks, + "rank|r=s" => \@ranks, "group-id=s" => \@target_groups, "help|h" => \&usage, "all|a" => \$all, @@ -689,6 +692,14 @@ $mode = $arg; $have_allfns_option++; } + + if (@ranks) { + $rank_rng = rng_convert_from_user( shift(@ranks) ); + + foreach my $rank (@ranks) { + rng_merge( $rank_rng, rng_convert_from_user($rank) ); + } + } # Put the args in a hash so that they can be referenced by name. if ( defined $mode and defined $allfns{$mode}{secondary} ) { @@ -1489,9 +1500,10 @@ if ( $conf{stats_short} ) { my $new; - if ( $#ranks != -1 ) { + if ( defined $rank_rng ) { my @ret; - foreach my $rank (@ranks) { + my $rng = rng_dup($rank_rng); + while ( defined( my $rank = rng_shift($rng) ) ) { if ( defined $d->[$rank] ) { push( @ret, summarise( $d->[$rank] ) ); } else { @@ -1508,8 +1520,9 @@ return; } - if ( $#ranks != -1 ) { - foreach my $rank (@ranks) { + if ( defined $rank_rng ) { + my $rng = rng_dup($rank_rng); + while ( defined( my $rank = rng_shift($rng) ) ) { if ( defined $d->[$rank] ) { print show_name $conf{stats_name}, $d->[$rank]; } else { @@ -3349,8 +3362,8 @@ # XXX: Should only send this list over if it makes sense, for example # the deadlock code only works when targetting all ranks. - if ( $#ranks != -1 ) { - @{ $req->{ranks} } = @ranks; + if ( defined $rank_rng ) { + $req->{ranks} = $rank_rng; } if ( defined $cmd->{out_format} ) { @@ -3723,6 +3736,8 @@ splice( @{$rg}, $idx + 1, 1 ); } return; + } elsif ( $value >= $part->{l} and $value <= $part->{u} ) { + confess("Failed to add value to range (Value already in range)"); } $idx++; } @@ -3739,6 +3754,11 @@ } return; } + +sub rng_dup { + my ($rg) = @_; + return dclone($rg); +} sub create_local_port { my ($range) = @_; @@ -6819,11 +6839,13 @@ # If supplied with a rank list then use it now to generate a list of # processes to inspect. if ( exists( $cmd->{ranks} ) ) { - my @ranks = @{ $cmd->{ranks} }; - foreach my $proc ( @{ $confInner{all_pids} } ) { - my $vp = $proc->{vp}; - my $pid = $proc->{pid}; - foreach my $rank (@ranks) { + my $rng = rng_dup( $cmd->{ranks} ); + + # Loop over ranks first as there are potentially more of them. + while ( defined( my $rank = rng_shift($rng) ) ) { + foreach my $proc ( @{ $confInner{all_pids} } ) { + my $vp = $proc->{vp}; + my $pid = $proc->{pid}; if ( $vp == $rank ) { push @{$pid_list}, $proc; } From codesite-noreply at google.com Tue Aug 25 13:24:33 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 12:24:33 +0000 Subject: [padb-devel] [padb] r151 committed - Replace target_error with a more generic target_data that... Message-ID: <001636d34bfebe46430471f66a1b@google.com> Revision: 151 Author: apittman Date: Tue Aug 25 05:23:46 2009 Log: Replace target_error with a more generic target_data that takes not just values and ranges but also namespaces. Replicate the behaviour before by using the "error" namespace. http://code.google.com/p/padb/source/detail?r=151 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 03:59:14 2009 +++ /trunk/src/padb Tue Aug 25 05:23:46 2009 @@ -3485,14 +3485,15 @@ # sending. my $mode = $comm_data->{current_req}->{mode}; - if ( defined $d->{target_error} ) { + if ( defined $d->{target_data}{error} ) { printf("Warning: errors reported by some ranks\n========\n"); - foreach my $error ( sort( keys( %{ $d->{target_error} } ) ) ) { + foreach my $error ( sort( keys( %{ $d->{target_data}{error} } ) ) ) { printf( "%s: $error\n", - rng_convert_to_user( $d->{target_error}{$error} ) ); + rng_convert_to_user( $d->{target_data}{error}{$error} ) ); } printf("========\n"); } + if ( defined( $allfns{$mode}{out_handler} ) ) { $allfns{$mode}{out_handler}( $conf{mode_options}{$mode}, $d ); } else { @@ -4393,7 +4394,7 @@ } my %inner_output; -my %inner_error; +my %local_target_data; sub output { my ( $vp, $str ) = @_; @@ -4409,11 +4410,18 @@ # Report a single string error for a given target rank. sub target_error { my ( $rank, $error ) = @_; - - if ( defined $inner_error{$error} ) { - rng_add_value( $inner_error{$error}, $rank ); + target_key_pair( $rank, "error", $error ); + return; +} + +# Report a single string error for a given target rank. +sub target_key_pair ($$$) { + my ( $rank, $key, $value ) = @_; + + if ( defined $local_target_data{$key}{$value} ) { + rng_add_value( $local_target_data{$key}{$value}, $rank ); } else { - $inner_error{$error} = rng_convert_from_user($rank); + $local_target_data{$key}{$value} = rng_convert_from_user($rank); } } @@ -6604,29 +6612,33 @@ $handle->{target_responce}{$tp}; } } + + # Save any output we've got from this node. + foreach my $key ( keys(%inner_output) ) { + $handle->{all_replys}->{target_output}{$key} = $inner_output{$key}; + } + + %inner_output = (); # Copy the network target errors into responce. - if ( exists $r->{target_error} ) { - $handle->{all_replys}->{target_error} = $r->{target_error}; + if ( exists $r->{target_data} ) { + $handle->{all_replys}->{target_data} = $r->{target_data}; } # Merge in local target responces. - foreach my $key ( keys(%inner_error) ) { - if ( defined $handle->{all_replys}->{target_error}{$key} ) { - rng_merge( $handle->{all_replys}->{target_error}{$key}, - $inner_error{$key} ); - } else { - $handle->{all_replys}->{target_error}{$key} = $inner_error{$key}; + foreach my $key ( keys(%local_target_data) ) { + foreach my $value ( keys( %{ $local_target_data{$key} } ) ) { + if ( defined $handle->{all_replys}->{target_data}{$key}{$value} ) { + rng_merge( $handle->{all_replys}->{target_data}{$key}{$value}, + $local_target_data{$key}{$value} ); + } else { + $handle->{all_replys}->{target_data}{$key}{$value} = + $local_target_data{$key}{$value}; + } } } - # Save any output we've got from this node. - foreach my $key ( keys(%inner_output) ) { - $handle->{all_replys}->{target_output}{$key} = $inner_output{$key}; - } - - %inner_output = (); - %inner_error = (); + %local_target_data = (); # If this isn't the last child to signon don't reply up-stream yet. if ( $handle->{child_replys} != $handle->{children} ) { @@ -6645,7 +6657,6 @@ $handle->{all_replys} = undef; $handle->{child_replys} = 0; $handle->{target_responce} = undef; - $handle->{target_error} = undef; } # Convert from a pid to a command name and do it in a safe manner to avoid @@ -6933,15 +6944,15 @@ $res->{target_output}{$key} = $inner_output{$key}; } - if (%inner_error) { - $res->{target_error} = \%inner_error; + if (%local_target_data) { + $res->{target_data} = \%local_target_data; } reply_to_parent( $netdata, $res ); # Clear down the local inputs. - %inner_error = (); %inner_output = (); + %local_target_data = (); $netdata->{target_responce} = undef; if ( $netdata->{shutdown} ) { From codesite-noreply at google.com Tue Aug 25 13:44:41 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 12:44:41 +0000 Subject: [padb-devel] [padb] r152 committed - Handle the initial signon data through the handle_key_pair() fucntion... Message-ID: <0016e64af376b890800471f6b259@google.com> Revision: 152 Author: apittman Date: Tue Aug 25 05:44:35 2009 Log: Handle the initial signon data through the handle_key_pair() fucntion to make use of the in-network reduction. http://code.google.com/p/padb/source/detail?r=152 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 05:23:46 2009 +++ /trunk/src/padb Tue Aug 25 05:44:35 2009 @@ -3396,46 +3396,45 @@ sub report_failed_signon { my ( $key, $data ) = @_; - my %values; - foreach my $proc ( keys( %{$data} ) ) { - push( @{ $values{ $data->{$proc}{$key} } }, $proc ); - } my %c; $c{i} = length($key); printf("$key : ranks\n"); - foreach my $value ( sort( keys(%values) ) ) { - printf( "%$c{i}s : %s\n", $value, compress( @{ $values{$value} } ) ); + foreach my $value ( sort( keys( %{$data} ) ) ) { + printf( "%$c{i}s : %s\n", + $value, rng_convert_to_user( $data->{$value} ) ); } } sub check_signon { my ( $comm_data, $data ) = @_; return if ( $conf{check_signon} eq "none" ); + my %here; + while ( + defined( my $proc = rng_shift( $data->{target_data}{found}{yes} ) ) ) + { + $here{$proc} = 1; + } my @missing; - my %target_state; - my %target_exe; for ( my $proc = 0 ; $proc < $comm_data->{nprocesses} ; $proc++ ) { - if ( not defined $data->{target_responce}{$proc} ) { + if ( not defined $here{$proc} ) { push( @missing, $proc ); - next; - } - $target_exe{ $data->{target_responce}{$proc}{name} }++; - $target_state{ $data->{target_responce}{$proc}{state} }++; + } } if ( $#missing != -1 ) { printf( "Warning, failed to locate ranks %s\n", compress(@missing) ); } return if ( $conf{check_signon} eq "missing" ); - my $exe_count = keys(%target_exe); - if ( $exe_count != 1 ) { + + if ( keys( %{ $data->{target_data}{name} } ) != 1 ) { printf("Warning, remote process name differs across ranks\n"); - report_failed_signon( "name", $data->{target_responce} ); - } - my $state_count = keys(%target_state); - if ( $state_count != 1 ) { + report_failed_signon( "name", \%{ $data->{target_data}{name} } ); + } + + if ( keys( %{ $data->{target_data}{state} } ) != 1 ) { printf("Warning, remote process state differs across ranks\n"); - report_failed_signon( "state", $data->{target_responce} ); - } + report_failed_signon( "state", \%{ $data->{target_data}{state} } ); + } + } sub command_from_inner { @@ -6756,9 +6755,6 @@ sub inner_find_pids { my ( $netdata, $cmd ) = @_; - # Cache config data and search for pids, storing - # data in $netdata->{target_responce} and $netdata->?? - if ( $cmd->{jobconfig}{rmgr} eq "orte" ) { $confInner{orte_data} = $cmd->{jobconfig}{orte_data}; } @@ -6774,8 +6770,9 @@ my $vp = $proc->{vp}; my $name = readlink("/proc/$pid/exe"); my $state = find_from_status( $pid, "State" ); - $netdata->{target_responce}{$vp}->{name} = $name; - $netdata->{target_responce}{$vp}->{state} = $state; + target_key_pair( $vp, "found", "yes" ); + target_key_pair( $vp, "name", $name ); + target_key_pair( $vp, "state", $state ); } } @@ -6937,7 +6934,9 @@ if ( $netdata->{children} == 0 ) { my $res; - $res->{target_responce} = $netdata->{target_responce}; + if ( defined $netdata->{target_responce} ) { + $res->{target_responce} = $netdata->{target_responce}; + } # Save any output we've got from this node. foreach my $key ( keys(%inner_output) ) { From codesite-noreply at google.com Tue Aug 25 13:49:44 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 12:49:44 +0000 Subject: [padb-devel] [padb] r153 committed - Only send the rank information over then network when using orte, the ... Message-ID: <001636e1f77fd37bc50471f6c455@google.com> Revision: 153 Author: apittman Date: Tue Aug 25 05:48:55 2009 Log: Only send the rank information over then network when using orte, the rest of the information isn't used so avoid sending it. http://code.google.com/p/padb/source/detail?r=153 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 05:44:35 2009 +++ /trunk/src/padb Tue Aug 25 05:48:55 2009 @@ -3315,7 +3315,7 @@ $req->{jobconfig}{rmgr} = $conf{rmgr}; if ( $conf{rmgr} eq "orte" ) { - $req->{jobconfig}{orte_data} = $open_jobs{ $comm_data->{jobid} }; + $req->{jobconfig}{orte_data} = $open_jobs{ $comm_data->{jobid} }{ranks}; } $req->{cinner} = \%cinner; @@ -6468,8 +6468,8 @@ my $hostname = $confInner{hostname}; - foreach my $rank ( keys( %{ $confInner{orte_data}{ranks}{$hostname} } ) ) { - maybe_show_pid( $rank, $confInner{orte_data}{ranks}{$hostname}{$rank} ); + foreach my $rank ( keys( %{ $confInner{orte_data}{$hostname} } ) ) { + maybe_show_pid( $rank, $confInner{orte_data}{$hostname}{$rank} ); } } From codesite-noreply at google.com Tue Aug 25 13:59:48 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 12:59:48 +0000 Subject: [padb-devel] [padb] r154 committed - Get rid of jobconfig in the signon request, it only contained... Message-ID: <00504502f561c853f70471f6e8f3@google.com> Revision: 154 Author: apittman Date: Tue Aug 25 05:58:39 2009 Log: Get rid of jobconfig in the signon request, it only contained the jobid and optionally the orte info so just move these into the config and global namespace respectively. http://code.google.com/p/padb/source/detail?r=154 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 05:48:55 2009 +++ /trunk/src/padb Tue Aug 25 05:58:39 2009 @@ -3311,14 +3311,13 @@ # Also send over some of the per-run (as opposed to per-mode) # configuration options. # XXX: Need to send over scripts and other stuff here as well. - $req->{jobconfig}{jobid} = $comm_data->{jobid}; - $req->{jobconfig}{rmgr} = $conf{rmgr}; if ( $conf{rmgr} eq "orte" ) { - $req->{jobconfig}{orte_data} = $open_jobs{ $comm_data->{jobid} }{ranks}; + $req->{orte_data} = $open_jobs{ $comm_data->{jobid} }{ranks}; } $req->{cinner} = \%cinner; + $req->{cinner}{jobid} = $comm_data->{jobid}; return $req; } @@ -6755,13 +6754,13 @@ sub inner_find_pids { my ( $netdata, $cmd ) = @_; - if ( $cmd->{jobconfig}{rmgr} eq "orte" ) { - $confInner{orte_data} = $cmd->{jobconfig}{orte_data}; + if ( $confInner{rmgr} eq "orte" ) { + $confInner{orte_data} = $cmd->{orte_data}; } # Query the resource manager to find the pids, # they'll be added to the "all_pids" array. - $rmgr{ $cmd->{jobconfig}{rmgr} }{find_pids}( $cmd->{jobconfig}{jobid} ); + $rmgr{ $confInner{rmgr} }{find_pids}( $confInner{jobid} ); convert_pids_to_child_pids(); From codesite-noreply at google.com Tue Aug 25 14:32:07 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 13:32:07 +0000 Subject: [padb-devel] [padb] r155 committed - Document the new rng_ functions and use them to find missing processes... Message-ID: <0016367f928e6421c40471f75c5c@google.com> Revision: 155 Author: apittman Date: Tue Aug 25 06:31:14 2009 Log: Document the new rng_ functions and use them to find missing processes during signon. http://code.google.com/p/padb/source/detail?r=155 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 05:58:39 2009 +++ /trunk/src/padb Tue Aug 25 06:31:14 2009 @@ -3413,15 +3413,19 @@ { $here{$proc} = 1; } - my @missing; + my $rng = rng_create_empty(); + for ( my $proc = 0 ; $proc < $comm_data->{nprocesses} ; $proc++ ) { if ( not defined $here{$proc} ) { - push( @missing, $proc ); + rng_add_value( $rng, $proc ); } } - if ( $#missing != -1 ) { - printf( "Warning, failed to locate ranks %s\n", compress(@missing) ); - } + + if ( not rng_empty($rng) ) { + printf( "Warning, failed to locate ranks %s\n", + rng_convert_to_user($rng) ); + } + return if ( $conf{check_signon} eq "missing" ); if ( keys( %{ $data->{target_data}{name} } ) != 1 ) { @@ -3615,22 +3619,37 @@ # rng_convert_from_user($userrange) # rc_convert_to_user($range) -# Convert to and from the normal type to the internal type. +# Convert to and from the normal type to the internal type. # rng_shift($range) -# Pop the lowest value off the range. +# Pop the lowest value off the range. # rng_add_value($range,$value) -# Add a value to the range. +# Add a value to the range. # rng_merge($range,$new) -# Merge two ranges. +# Merge two ranges. + +# rng_dup($range) +# Duplicate a range + +# rng_create_from_array(@array) +# Create a range from an array + +# rng_create_empty +# Create a empty range + +# rng_empty +# Test for emptyness. # Potentially needed but not implemented yet # rng_user_verify() # is_value_in_range() # nvalues_in_range() +# rng_find_missing() +# Take two ranges and return all that are in the first but not in the +# second. (see check_signon). # Convert from a user range to a internal one. sub rng_convert_from_user { @@ -3703,6 +3722,11 @@ sub rng_add_value { my ( $rg, $value ) = @_; + if ( ref( $rg->[0] ) eq "" ) { + push( @{$rg}, { 'l' => $value, 'u' => $value } ); + return; + } + # If it's after the last value then just add it. if ( $value > $rg->[-1]->{u} + 1 ) { push( @{$rg}, { 'l' => $value, 'u' => $value } ); @@ -3758,6 +3782,27 @@ my ($rg) = @_; return dclone($rg); } + +sub rng_create_from_array { + my (@r) = @_; + + my $rng = rng_convert_from_user( shift(@r) ); + while ( defined( my $v = shift(@r) ) ) { + rng_add_value( $rng, $v ); + } + return $rng; +} + +sub rng_create_empty { + my @r; + return \@r; +} + +sub rng_empty { + my ($rg) = @_; + + return ( ref( $rg->[0] ) eq "" ); +} sub create_local_port { my ($range) = @_; From codesite-noreply at google.com Tue Aug 25 16:42:30 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 15:42:30 +0000 Subject: [padb-devel] [padb] r156 committed - Actually generate a tree rather than a "string" of nodes when working... Message-ID: <00504502f561a9ff750471f92ea8@google.com> Revision: 156 Author: apittman Date: Tue Aug 25 08:41:34 2009 Log: Actually generate a tree rather than a "string" of nodes when working out the connection tree. Previously each node connected to just one other leading to very deep messages, now it's tree_width wide (default=4) which should keep the communication overhead down and help with the scaling no end. http://code.google.com/p/padb/source/detail?r=156 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 06:31:14 2009 +++ /trunk/src/padb Tue Aug 25 08:41:34 2009 @@ -410,6 +410,8 @@ # Option to define a list of ports used by padb. $conf{port_range} = undef; +$conf{tree_width} = 4; + my $norc = 0; my $configfile = "/etc/padb.conf"; @@ -615,6 +617,7 @@ $debugModes{verbose} = undef; $debugModes{signon} = undef; $debugModes{rmgr} = undef; +$debugModes{ctree} = undef; sub parse_args_outer { @@ -680,7 +683,9 @@ } if ( $debugModes{all} ) { foreach my $mode ( keys(%debugModes) ) { - $debugModes{$mode} = $debugModes{all}; + if ( not defined $debugModes{$mode} ) { + $debugModes{$mode} = $debugModes{all}; + } } } } @@ -3245,11 +3250,8 @@ } -# For each remote process generate a tree, giving each -# process a parent and a number of children. -# Currently just make this a simple "ladder" but should -# probably be a f-nomial tree. -sub generate_comm_tree { +# A simple "ladder" or 1-wide tree +sub generate_comm_tree_ladder { my ($a) = @_; my @b = @{$a}; my $last = "root"; @@ -3262,6 +3264,50 @@ return \%comm_tree; } + +# Fairly simple this, walk through the hosts keeping a list +# of joints (Those able to accept children this iteration) and +# leaves (those able to accept children next iteration) and +# loop until there are no more hosts left to add. +sub generate_binary_tree { + my ( $a, $width ) = @_; + my @b = @{$a}; + my $last = "root"; + my %comm_tree; + + my @leaves; + + my $root = shift( @{$a} ); + + my @joints; + push( @joints, $root ); + + $comm_tree{root}{children}[0] = $root; + + while ( @{$a} ) { + foreach my $joint (@joints) { + my @children = splice( @{$a}, 0, $width ); + if ( $#children > -1 ) { + push( @leaves, @children ); + @{ $comm_tree{$joint}{children} } = @children; + } + } + @joints = @leaves; + @leaves = (); + } + + return \%comm_tree; +} + +# For each remote process generate a tree, giving each +# process a parent and a number of children. +# Currently just make this a simple "ladder" but should +# probably be a f-nomial tree. +sub generate_comm_tree { + my ($a) = @_; + + return generate_binary_tree( $a, $conf{tree_width} ); +} # Called once when we have the socket details of the last child. sub connect_to_children { @@ -3269,12 +3315,14 @@ debug_log( "signon", undef, "Received last signon, connecting to inner" ); - @{ $comm_data->{host_ids} } = sort( keys( %{ $comm_data->{remote} } ) ); + @{ $comm_data->{host_ids} } = sortn( keys( %{ $comm_data->{remote} } ) ); $comm_data->{connection_tree} = generate_comm_tree( $comm_data->{host_ids} ); + my $td = $comm_data->{connection_tree}->{root}{children}[0]; - #printf("I'm connecting to $td\n"); + debug_log( "ctree", $comm_data->{connection_tree}, "connection tree" ); + my $cdata; $cdata->{socket} = connect_to_child( $td, @@ -6174,6 +6222,7 @@ sub ping_rank { my ( $cargs, $vp, $pid ) = @_; + target_key_pair( $vp, "PING", "ACK" ); output( $vp, "ACK" ); return; } From codesite-noreply at google.com Tue Aug 25 17:26:51 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 16:26:51 +0000 Subject: [padb-devel] [padb] r157 committed - When reducing data in the tree be sure to reduce the data from... Message-ID: <0016e649c93048e3510471f9cd8b@google.com> Revision: 157 Author: apittman Date: Tue Aug 25 09:25:50 2009 Log: When reducing data in the tree be sure to reduce the data from all children, rather than reducing the data from the first child with the local data and then over-writing it with the next childs data. http://code.google.com/p/padb/source/detail?r=157 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 08:41:34 2009 +++ /trunk/src/padb Tue Aug 25 09:25:50 2009 @@ -6714,7 +6714,26 @@ # Copy the network target errors into responce. if ( exists $r->{target_data} ) { - $handle->{all_replys}->{target_data} = $r->{target_data}; + if ( exists $handle->{all_replys}->{target_data} ) { + foreach my $key ( keys( %{ $r->{target_data} } ) ) { + foreach my $value ( keys( %{ $r->{target_data}{$key} } ) ) { + if ( + defined $handle->{all_replys} + ->{target_data}{$key}{$value} ) + { + rng_merge( + $handle->{all_replys}->{target_data}{$key}{$value}, + $r->{target_data}{$key}{$value} + ); + } else { + $handle->{all_replys}->{target_data}{$key}{$value} = + $r->{target_data}{$key}{$value}; + } + } + } + } else { + $handle->{all_replys}->{target_data} = $r->{target_data}; + } } # Merge in local target responces. From codesite-noreply at google.com Tue Aug 25 18:45:46 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Tue, 25 Aug 2009 17:45:46 +0000 Subject: [padb-devel] [padb] r158 committed - Report errors properly when looking for message queues and gdb... Message-ID: <000e0cd20f808473ff0471fae71d@google.com> Revision: 158 Author: apittman Date: Tue Aug 25 10:45:00 2009 Log: Report errors properly when looking for message queues and gdb isn't installed or other erros pop up. http://code.google.com/p/padb/source/detail?r=158 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 09:25:50 2009 +++ /trunk/src/padb Tue Aug 25 10:45:00 2009 @@ -5085,14 +5085,24 @@ my $g = gdb_start(); my $p = gdb_attach( $g, $pid ); if ( !$p ) { - debug( $vp, "Failed to attach to process\n" ); + if ( defined $g->{error} ) { + target_error( $vp, $g->{error} ); + } else { + target_error( $vp, "Failed to attach to process" ); + } return; } + + my $base = gdb_var_addr( $g, "MPIR_dll_name" ); + if ( !defined $base ) { + target_error( $vp, + "Process does not appear to be using MPI (No MPIR_dll_name symbol)" + ); + } if ( defined $carg->{mpi_dll} ) { $ENV{MPINFO_DLL} = $carg->{mpi_dll}; } else { - my $base = gdb_var_addr( $g, "MPIR_dll_name" ); if ( !defined $base ) { gdb_detach($g); gdb_quit($g); @@ -5109,11 +5119,17 @@ # As above but take a gdb handle sub fetch_mpi_queue_gdb { my ( $carg, $vp, $pid, $g ) = @_; + + my $base = gdb_var_addr( $g, "MPIR_dll_name" ); + if ( !defined $base ) { + target_error( $vp, + "Process does not appear to be using MPI (No MPIR_dll_name symbol)" + ); + } if ( defined $carg->{mpi_dll} ) { $ENV{MPINFO_DLL} = $carg->{mpi_dll}; } else { - my $base = gdb_var_addr( $g, "MPIR_dll_name" ); if ( !defined $base ) { return; } @@ -5148,7 +5164,11 @@ $proc->{gdb} = $gdb; push( @all, $proc ); } else { - output $vp, "Failed to attach to to process"; + if ( defined $gdb->{error} ) { + target_error( $vp, $gdb->{error} ); + } else { + target_error( $vp, "Failed to attach to process" ); + } } } @@ -6135,9 +6155,9 @@ $gdb = $g; } else { if ( defined $gdb->{error} ) { - output $vp, $gdb->{error}; + target_error( $vp, $gdb->{error} ); } else { - output $vp, "Failed to attach to process"; + target_error( $vp, "Failed to attach to process" ); } } } else { From codesite-noreply at google.com Wed Aug 26 09:44:49 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Wed, 26 Aug 2009 08:44:49 +0000 Subject: [padb-devel] [padb] r159 committed - In mpi-watch mode sample process status before attempting to attach wi... Message-ID: <001636c5be01c5ac9b0472077635@google.com> Revision: 159 Author: apittman Date: Wed Aug 26 01:43:48 2009 Log: In mpi-watch mode sample process status before attempting to attach with gdb, otherwise the process state is always T which is treated as an error. Also handle the case where gdb isn't installed, previously an error was given and the program hung, now it just doesn't report stack data. http://code.google.com/p/padb/source/detail?r=159 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Tue Aug 25 10:45:00 2009 +++ /trunk/src/padb Wed Aug 26 01:43:48 2009 @@ -3808,7 +3808,8 @@ } return; } elsif ( $value >= $part->{l} and $value <= $part->{u} ) { - confess("Failed to add value to range (Value already in range)"); + carp("Failed to add value to range (Value already in range)"); + return; } $idx++; } @@ -6415,17 +6416,40 @@ foreach my $proc ( @{$list} ) { my $vp = $proc->{vp}; my $pid = $proc->{pid}; + + # Load the status now before we attach with GDB, + # otherwise we'll just see it as "T" (Stopped). + my $m = find_from_status( $pid, "State" ); + if ( $m eq "R" ) { + $m = ","; + } elsif ( $m eq "S" ) { + $m = "-"; + } else { + $m = "*"; + } + $proc->{state} = $m; + my $gdb = gdb_start(); if ( gdb_attach( $gdb, $pid ) ) { $proc->{gdb} = $gdb; } else { - output $vp, "Failed to attach to to process"; + if ( defined $gdb->{error} ) { + target_error( $vp, $gdb->{error} ); + } else { + target_error( $vp, "Failed to attach to process" ); + } } } foreach my $proc ( @{$list} ) { my $vp = $proc->{vp}; my $pid = $proc->{pid}; + + if ( not defined $proc->{gdb} ) { + $res{$vp}{state} = $proc->{state}; + next; + } + my $gdb = $proc->{gdb}; my @mq; @@ -6488,22 +6512,15 @@ } - my $m = find_from_status( $pid, "State" ); - if ( $m eq "R" ) { - $m = $good; - } elsif ( $m eq "S" ) { - $m = "-"; - } else { - $m = "*"; - } - - $res{$vp}{state} = $m; + # Fall through case. + $res{$vp}{state} = $proc->{state}; } foreach my $proc ( @{$list} ) { - my $gdb = $proc->{gdb}; - gdb_detach($gdb); - gdb_quit($gdb); + if ( $proc->{gdb} ) { + gdb_detach( $proc->{gdb} ); + gdb_quit( $proc->{gdb} ); + } } return \%res; From codesite-noreply at google.com Wed Aug 26 10:19:01 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Wed, 26 Aug 2009 09:19:01 +0000 Subject: [padb-devel] [padb] r160 committed - Move the pre_out_handler around so that it can be called with... Message-ID: <001636283c480bd58b047207f143@google.com> Revision: 160 Author: apittman Date: Wed Aug 26 02:18:13 2009 Log: Move the pre_out_handler around so that it can be called with $nprocs as a paramater. Modify mpi_watch to give a list of ranks as part of the header. http://code.google.com/p/padb/source/detail?r=160 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Wed Aug 26 01:43:48 2009 +++ /trunk/src/padb Wed Aug 26 02:18:13 2009 @@ -2884,6 +2884,7 @@ } sub pre_mpi_watch { + my ($nprocs) = @_; my $header = <{current_req}{mode}; + + if ( defined $allfns{$mode}{pre_out_handler} ) { + $allfns{$mode}{pre_out_handler}( $comm_data->{nprocesses} ); + } + $header_shown = 1; +} sub command_from_inner { my ( $comm_data, $cdata, $line ) = @_; @@ -3529,6 +3554,7 @@ } maybe_clear_screen(); + maybe_show_header($comm_data); # Mode here is the mode for the reply we just got, this # may not be the same thing as the request we are currently @@ -4451,10 +4477,6 @@ printf "\nCollecting information for job '$jobid'\n\n" if ( $conf{verbose} or ( $#jobids > 0 ) ); - if ( defined $allfns{$mode}{pre_out_handler} ) { - $allfns{$mode}{pre_out_handler}(); - } - my $of; $of = "tree" if $tree; $of = "compress" if $compress; From codesite-noreply at google.com Wed Aug 26 11:16:36 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Wed, 26 Aug 2009 10:16:36 +0000 Subject: [padb-devel] [padb] r161 committed - Added a "tdata" debug option to print the contents of target_data.... Message-ID: <00163630ec45fa4a5c047208be5d@google.com> Revision: 161 Author: apittman Date: Wed Aug 26 03:16:06 2009 Log: Added a "tdata" debug option to print the contents of target_data. As this is a namespace/value hash with range lists reduced in the network it prints really nicely and is very useful to have. http://code.google.com/p/padb/source/detail?r=161 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Wed Aug 26 02:18:13 2009 +++ /trunk/src/padb Wed Aug 26 03:16:06 2009 @@ -618,6 +618,7 @@ $debugModes{signon} = undef; $debugModes{rmgr} = undef; $debugModes{ctree} = undef; +$debugModes{tdata} = undef; sub parse_args_outer { @@ -3512,6 +3513,20 @@ } $header_shown = 1; } + +sub format_target_data { + my ($td) = @_; + + my $ret = "\n"; + foreach my $name ( sort( keys( %{$td} ) ) ) { + $ret .= "Namespace: \"$name\"\n"; + foreach my $value ( sortn( keys( %{ $td->{$name} } ) ) ) { + $ret .= " $value\t"; + $ret .= rng_convert_to_user( $td->{$name}{$value} ) . "\n"; + } + } + return $ret; +} sub command_from_inner { my ( $comm_data, $cdata, $line ) = @_; @@ -3552,6 +3567,14 @@ if ( not $watch ) { issue_command_to_inner( $cdata, $req ); } + + if ( defined $d->{target_data} ) { + debug_log( + "tdata", $d->{target_data}, + "Target data %s", + format_target_data( $d->{target_data} ) + ); + } maybe_clear_screen(); maybe_show_header($comm_data); @@ -5948,6 +5971,7 @@ my ( $vp, $key, $value ) = @_; if ( $confInner{mode} eq "proc_summary" ) { if ( defined $proc_keys{ lc($key) } ) { + target_key_pair( $vp, lc($key), $value ); $proc_info->{$vp}{ lc($key) } = $value; } } else { @@ -6442,6 +6466,7 @@ # Load the status now before we attach with GDB, # otherwise we'll just see it as "T" (Stopped). my $m = find_from_status( $pid, "State" ); + target_key_pair( $vp, "state", $m ); if ( $m eq "R" ) { $m = ","; } elsif ( $m eq "S" ) { From codesite-noreply at google.com Wed Aug 26 11:43:56 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Wed, 26 Aug 2009 10:43:56 +0000 Subject: [padb-devel] [padb] r162 committed - Convert mpi-watch to use target_key_pair() to return data over the... Message-ID: <001636283c48c3560c047209200e@google.com> Revision: 162 Author: apittman Date: Wed Aug 26 03:43:13 2009 Log: Convert mpi-watch to use target_key_pair() to return data over the network. Add a array_from_target_namespace() function to convert from a target namespace to a array of values and call this from show_mpi_watch. This should both reduce the amount of data mpi-watch needs to pass around and also lower the CPU overhead required as well. http://code.google.com/p/padb/source/detail?r=162 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Wed Aug 26 03:16:06 2009 +++ /trunk/src/padb Wed Aug 26 03:43:13 2009 @@ -2906,16 +2906,32 @@ } printf("$l\n"); } + +# Convert back from a set of values (with ranges) in a namespace to a array of +# ranks containing the values. Assume that each rank only appears in the +# namespace with one value. +sub array_from_target_namespace { + my ($r) = @_; + + my @all; + foreach my $value ( sort( keys( %{$r} ) ) ) { + while ( defined( my $rank = rng_shift( $r->{$value} ) ) ) { + $all[$rank] = $value; + } + } + return @all; +} sub show_mpi_watch { my ( $handle, $lines ) = @_; - my $s = ""; - foreach my $l ( sort { $a <=> $b } ( keys %{ $lines->{target_responce} } ) ) - { - $s .= $lines->{target_responce}{$l}{state}; - } - print("$s\n"); + my @all = array_from_target_namespace( $lines->{target_data}{state} ); + + my $o = ""; + while ( defined( my $v = shift(@all) ) ) { + $o .= $v; + } + print("$o\n"); } # Nicely format process information. @@ -6456,7 +6472,6 @@ sub mpi_watch_all { my ( $carg, $list ) = @_; - my %res; my $fns = mpi_watch_load($carg); foreach my $proc ( @{$list} ) { @@ -6466,7 +6481,7 @@ # Load the status now before we attach with GDB, # otherwise we'll just see it as "T" (Stopped). my $m = find_from_status( $pid, "State" ); - target_key_pair( $vp, "state", $m ); + target_key_pair( $vp, "proc_state", $m ); if ( $m eq "R" ) { $m = ","; } elsif ( $m eq "S" ) { @@ -6493,7 +6508,7 @@ my $pid = $proc->{pid}; if ( not defined $proc->{gdb} ) { - $res{$vp}{state} = $proc->{state}; + target_key_pair( $vp, "state", $proc->{state} ); next; } @@ -6536,7 +6551,7 @@ $mode = "m"; } } - $res{$vp}{state} = $mode; + target_key_pair( $vp, "state", $mode ); next; } @@ -6554,13 +6569,13 @@ } if ( defined $fnmode ) { - $res{$vp}{state} = $fns->{names}{$fnmode}; + target_key_pair( $vp, "state", $fns->{names}{$fnmode} ); next; } # Fall through case. - $res{$vp}{state} = $proc->{state}; + target_key_pair( $vp, "state", $proc->{state} ); } foreach my $proc ( @{$list} ) { @@ -6570,7 +6585,6 @@ } } - return \%res; } sub maybe_show_pid { From codesite-noreply at google.com Wed Aug 26 11:57:02 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Wed, 26 Aug 2009 10:57:02 +0000 Subject: [padb-devel] [padb] r163 committed - Update proc-format to use the target_data code. This allows it to ... Message-ID: <00163630ec459cdd370472094f28@google.com> Revision: 163 Author: apittman Date: Wed Aug 26 03:56:39 2009 Log: Update proc-format to use the target_data code. This allows it to reduce the data in the network and decrease the amount of traffic sent considerably. http://code.google.com/p/padb/source/detail?r=163 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Wed Aug 26 03:43:13 2009 +++ /trunk/src/padb Wed Aug 26 03:56:39 2009 @@ -2921,6 +2921,24 @@ } return @all; } + +sub tree_from_namespace { + my ($r) = @_; + + my %res; + + foreach my $namespace ( keys( %{$r} ) ) { + foreach my $value ( keys( %{ $r->{$namespace} } ) ) { + while ( + defined( my $rank = rng_shift( $r->{$namespace}{$value} ) ) ) + { + $res{$rank}{$namespace} = $value; + } + } + } + + return \%res; +} sub show_mpi_watch { my ( $handle, $lines ) = @_; @@ -2970,7 +2988,8 @@ } my @all; - my $lines = $nlines->{target_responce}; + + my $lines = tree_from_namespace( $nlines->{target_data} ); foreach my $tag ( sort ( keys %$lines ) ) { my %hash; $hash{vp} = $tag; @@ -5980,7 +5999,6 @@ return sprintf( "%d", $used ); } -my $proc_info; my %proc_keys; sub proc_output { @@ -5988,7 +6006,6 @@ if ( $confInner{mode} eq "proc_summary" ) { if ( defined $proc_keys{ lc($key) } ) { target_key_pair( $vp, lc($key), $value ); - $proc_info->{$vp}{ lc($key) } = $value; } } else { output( $vp, "$key: $value" ); @@ -6007,8 +6024,6 @@ $proc_keys{ lc($name) } = 1; } } - - $proc_info = undef; my @all; @@ -6113,9 +6128,6 @@ } } - if ( $confInner{mode} eq "proc_summary" ) { - return $proc_info; - } return; } From codesite-noreply at google.com Wed Aug 26 13:04:40 2009 From: codesite-noreply at google.com (codesite-noreply at google.com) Date: Wed, 26 Aug 2009 12:04:40 +0000 Subject: [padb-devel] [padb] r164 committed - Work out the number of proceeses when using mpd and return ... Message-ID: <001636284afc7d5bd804720a41c1@google.com> Revision: 164 Author: apittman Date: Wed Aug 26 05:04:08 2009 Log: Work out the number of proceeses when using mpd and return the correct value from mpd_setup_pcmd()/ http://code.google.com/p/padb/source/detail?r=164 Modified: /trunk/src/padb ======================================= --- /trunk/src/padb Wed Aug 26 03:56:39 2009 +++ /trunk/src/padb Wed Aug 26 05:04:08 2009 @@ -2409,6 +2409,11 @@ } if ( $key eq "rank" ) { $jobs{$job}{pids}{$host}{$pid} = $value; + if ( ( not defined $jobs{$job}{lastproc} ) + or ( $value > $jobs{$job}{lastproc} ) ) + { + $jobs{$job}{lastproc} = $value; + } } } return \%jobs; @@ -2446,7 +2451,7 @@ my $hosts = $#hosts + 1; - return ( $cmd, undef, $hosts ); + return ( $cmd, $d->{$job}{lastproc} + 1, $hosts ); } sub mpd_cleanup_pcmd {