[padb] r458 committed - Fix the deadlock detection code....
padb at googlecode.com
padb at googlecode.com
Sat Jan 4 19:24:34 GMT 2014
Revision: 458
Author: apittman at gmail.com
Date: Sat Jan 4 19:23:59 2014 UTC
Log: Fix the deadlock detection code.
An earlier change altered the handling of minfo output, but the collective
state reporting was not updated to match. As a result the collective state
was not reported correctly, and the deadlock detection code had nothing to
work with. Alter the minfo output to use the new format, and correctly
match and forward it in the padb inner processes.
http://code.google.com/p/padb/source/detail?r=458
Modified:
/trunk/src/minfo.c
/trunk/src/padb
=======================================
--- /trunk/src/minfo.c Sat Jan 4 18:58:01 2014 UTC
+++ /trunk/src/minfo.c Sat Jan 4 19:23:59 2014 UTC
@@ -52,21 +52,6 @@
struct dll_entry_points dll_ep = {};
-char *collective_names[] = { "Barrier",
- "Bcast",
- "Allgather",
- "Allgatherv",
- "Allreduce",
- "Alltoall",
- "Alltoallv",
- "Reduce_Scatter",
- "Reduce",
- "Gather",
- "Gatherv",
- "Scan",
- "Scatter",
- "Scatterv" };
-
char *op_types[] = { "pending_send",
"pending_receive",
"unexpected_message" };
@@ -476,11 +461,10 @@
int r = dll_ep.get_comm_coll_state(target_process,i,&seq,&active);
if ( r == mqs_ok ) {
if ( seq != 0 )
- printf("comm%d: Collective '%s': call count %d, %sactive\n",
- comm_id,
- collective_names[i],
+ printf("col: id:%d count:%d active:%d\n",
+ i,
seq,
- active ? "" : "not ");
+ active ? 1 : 0);
} else if ( r != mqs_no_information ) {
show_dll_error_code(r);
}
=======================================
--- /trunk/src/padb Sat Jan 4 19:17:54 2014 UTC
+++ /trunk/src/padb Sat Jan 4 19:23:59 2014 UTC
@@ -7286,6 +7286,33 @@
} else {
target_error( $vp, "UNPARSEABLE MINFO: $r" );
}
+ } elsif ( $cmd eq 'col:' ) {
+ $stats{out}++;
+ if (
+ $r =~ m{\A
+ col:
+ [ ]
+ id:(\d+)
+ [ ]?
+ count:(\d+)
+ [ ]?
+ active:(\d+)
+ \z
+ }x
+ )
+ {
+ my $call = {
+ id => $1,
+ count => $2,
+ active => $3,
+ };
+
+ push @{ $communicator_descriptor{coll} }, $call;
+
+ } else {
+ target_error( $vp, "UNPARSEABLE MINFO: $r" );
+ }
+
} elsif ( $cmd eq 'Msg:' ) {
$stats{msg}++;
if (
@@ -7393,6 +7420,13 @@
sub minfo_to_array {
my ($cd) = @_;
+ my @coll_name_lookup = (
+ "Barrier", "Bcast", "Allgather", "Allgatherv",
+ "Allreduce", "Alltoall", "Alltoallv", "Reduce_Scatter",
+ "Reduce", "Gather", "Gatherv", "Scan",
+ "Scatter", "Scatterv"
+ );
+
my @mq;
foreach my $comm ( @{$cd} ) {
@@ -7410,6 +7444,13 @@
my $mid = 0;
+ foreach my $cc ( @{ $comm->{coll} } ) {
+ my $coll_name = $coll_name_lookup[ $cc->{id} ];
+ my $active_desc = $cc->{active} ? "active" : "not active";
+ push @mq,
+"comm$comm->{mid}: Collective '$coll_name': call count $cc->{count},
$active_desc";
+ }
+
foreach my $m ( @{ $comm->{messages} } ) {
my @op_desc = qw(pending_send pending_receive
unexpected_message);
my @status_desc = qw(pending matched complete);
More information about the padb-devel mailing list