[padb-users] padb not finding remote ranks

Steve Wise swise at opengridcomputing.com
Mon Aug 30 20:59:40 BST 2010


Hey,

I have an 8-node Open MPI 1.4.1 cluster running 64 processes (NP=64), with
jobs launched via orte/mpirun.  When I start a job that hangs and then run
padb to get the stack traces, padb prints a "failed to locate ranks" warning
and shows stacks only for the processes on the local node (see output below).

Any ideas?

Thanks in advance.

Steve

------


[root@n0 ~]# mpirun --output-filename /share/log/out -np 64 --host n0,n1,n2,n3,n4,n5,n6,n7 --mca btl_openib_verbose 0 --mca btl_openib_receive_queues P,65536,64 --mca btl openib,sm,self /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 -npmin 64 gather &
[1] 4621
[root@n0 ~]# ompi-ps


Information from mpirun [20398,0]
-----------------------------------

     JobID |   State |  Slots | Num Procs |
------------------------------------------
[20398,1] | Running |      8 |        64 |
                                          Process Name |      ORTE Name | Local Rank |    PID |                 Node |   State |
     ---------------------------------------------------------------------------------------------------------------------------
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],0] |          0 |   4629 | n0.asicdesigners.com | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],1] |          0 |   4673 |                   n1 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],2] |          0 |   4794 |                   n2 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],3] |          0 |   4694 |                   n3 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],4] |          0 |   4666 |                   n4 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],5] |          0 |   4674 |                   n5 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],6] |          0 |   4671 |                   n6 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],7] |          0 |   4876 |                   n7 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],8] |          1 |   4630 | n0.asicdesigners.com | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 |  [[20398,1],9] |          1 |   4674 |                   n1 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],10] |          1 |   4795 |                   n2 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],11] |          1 |   4695 |                   n3 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],12] |          1 |   4667 |                   n4 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],13] |          1 |   4675 |                   n5 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],14] |          1 |   4672 |                   n6 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],15] |          1 |   4877 |                   n7 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],16] |          2 |   4631 | n0.asicdesigners.com | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],17] |          2 |   4675 |                   n1 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],18] |          2 |   4796 |                   n2 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],19] |          2 |   4696 |                   n3 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],20] |          2 |   4668 |                   n4 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],21] |          2 |   4676 |                   n5 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],22] |          2 |   4673 |                   n6 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],23] |          2 |   4878 |                   n7 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],24] |          3 |   4632 | n0.asicdesigners.com | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],25] |          3 |   4676 |                   n1 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],26] |          3 |   4797 |                   n2 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],27] |          3 |   4697 |                   n3 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],28] |          3 |   4669 |                   n4 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],29] |          3 |   4677 |                   n5 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],30] |          3 |   4674 |                   n6 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],31] |          3 |   4879 |                   n7 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],32] |          4 |   4633 | n0.asicdesigners.com | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],33] |          4 |   4677 |                   n1 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],34] |          4 |   4798 |                   n2 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],35] |          4 |   4698 |                   n3 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],36] |          4 |   4670 |                   n4 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],37] |          4 |   4678 |                   n5 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],38] |          4 |   4675 |                   n6 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],39] |          4 |   4880 |                   n7 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],40] |          5 |   4634 | n0.asicdesigners.com | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],41] |          5 |   4678 |                   n1 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],42] |          5 |   4799 |                   n2 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],43] |          5 |   4699 |                   n3 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],44] |          5 |   4671 |                   n4 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],45] |          5 |   4679 |                   n5 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],46] |          5 |   4676 |                   n6 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],47] |          5 |   4881 |                   n7 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],48] |          6 |   4635 | n0.asicdesigners.com | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],49] |          6 |   4679 |                   n1 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],50] |          6 |   4800 |                   n2 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],51] |          6 |   4700 |                   n3 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],52] |          6 |   4672 |                   n4 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],53] |          6 |   4680 |                   n5 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],54] |          6 |   4677 |                   n6 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],55] |          6 |   4882 |                   n7 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],56] |          7 |   4636 | n0.asicdesigners.com | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],57] |          7 |   4680 |                   n1 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],58] |          7 |   4801 |                   n2 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],59] |          7 |   4701 |                   n3 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],60] |          7 |   4673 |                   n4 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],61] |          7 |   4681 |                   n5 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],62] |          7 |   4678 |                   n6 | Running |
     /usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],63] |          7 |   4883 |                   n7 | Running |


[root@n0 ~]# /share/bin/padb --all --stack-trace --tree --config-option rmgr=orte
Warning, failed to locate ranks [1-7,9-15,17-23,25-31,33-39,41-47,49-55,57-63]
-----------------
[0,8,16,24,32,40,48,56] (8 processes)
-----------------
main() at ?:?
   IMB_init_buffers_iter() at ?:?
     IMB_gather() at ?:?
       PMPI_Gather() at pgather.c:175
         mca_coll_sync_gather() at coll_sync_gather.c:46
           ompi_coll_tuned_gather_intra_dec_fixed() at coll_tuned_decision_fixed.c:714
             ompi_coll_tuned_gather_intra_linear_sync() at coll_tuned_gather.c:248
               mca_pml_ob1_recv() at ../../../../opal/threads/condition.h:99
                 -----------------
                 [0,8,16,24,32,40,56] (7 processes)
                 -----------------
                 opal_progress() at runtime/opal_progress.c:207
                 -----------------
                 48 (1 processes)
                 -----------------
                 opal_progress() at ../opal/include/opal/sys/amd64/timer.h:46
[root@n0 ~]#





More information about the padb-users mailing list