[padb-users] padb not finding remote ranks
Steve Wise
swise at opengridcomputing.com
Mon Aug 30 20:59:40 BST 2010
Hey,
I have an 8-node cluster running openmpi-1.4.1, where I launch 64-process
(NP=64) jobs via orte/mpirun. I start a job that hangs and then run padb to
get the stack traces, but padb prints a "failed to locate ranks" warning and
only shows the stacks of the processes local to n0 (see output below).
Any ideas?
Thanks in advance.
Steve
------
[root@n0 ~]# mpirun --output-filename /share/log/out -np 64 --host
n0,n1,n2,n3,n4,n5,n6,n7 --mca btl_openib_verbose 0 --mca
btl_openib_receive_queues P,65536,64 --mca btl openib,sm,self
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 -npmin 64 gather &
[1] 4621
[root@n0 ~]# ompi-ps
Information from mpirun [20398,0]
-----------------------------------
JobID | State | Slots | Num Procs |
------------------------------------------
[20398,1] | Running | 8 | 64 |
Process Name | ORTE Name
| Local Rank | PID | Node | State |
---------------------------------------------------------------------------------------------------------------------------
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],0]
| 0 | 4629 | n0.asicdesigners.com | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],1]
| 0 | 4673 | n1 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],2]
| 0 | 4794 | n2 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],3]
| 0 | 4694 | n3 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],4]
| 0 | 4666 | n4 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],5]
| 0 | 4674 | n5 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],6]
| 0 | 4671 | n6 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],7]
| 0 | 4876 | n7 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],8]
| 1 | 4630 | n0.asicdesigners.com | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],9]
| 1 | 4674 | n1 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],10]
| 1 | 4795 | n2 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],11]
| 1 | 4695 | n3 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],12]
| 1 | 4667 | n4 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],13]
| 1 | 4675 | n5 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],14]
| 1 | 4672 | n6 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],15]
| 1 | 4877 | n7 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],16]
| 2 | 4631 | n0.asicdesigners.com | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],17]
| 2 | 4675 | n1 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],18]
| 2 | 4796 | n2 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],19]
| 2 | 4696 | n3 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],20]
| 2 | 4668 | n4 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],21]
| 2 | 4676 | n5 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],22]
| 2 | 4673 | n6 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],23]
| 2 | 4878 | n7 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],24]
| 3 | 4632 | n0.asicdesigners.com | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],25]
| 3 | 4676 | n1 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],26]
| 3 | 4797 | n2 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],27]
| 3 | 4697 | n3 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],28]
| 3 | 4669 | n4 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],29]
| 3 | 4677 | n5 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],30]
| 3 | 4674 | n6 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],31]
| 3 | 4879 | n7 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],32]
| 4 | 4633 | n0.asicdesigners.com | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],33]
| 4 | 4677 | n1 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],34]
| 4 | 4798 | n2 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],35]
| 4 | 4698 | n3 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],36]
| 4 | 4670 | n4 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],37]
| 4 | 4678 | n5 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],38]
| 4 | 4675 | n6 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],39]
| 4 | 4880 | n7 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],40]
| 5 | 4634 | n0.asicdesigners.com | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],41]
| 5 | 4678 | n1 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],42]
| 5 | 4799 | n2 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],43]
| 5 | 4699 | n3 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],44]
| 5 | 4671 | n4 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],45]
| 5 | 4679 | n5 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],46]
| 5 | 4676 | n6 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],47]
| 5 | 4881 | n7 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],48]
| 6 | 4635 | n0.asicdesigners.com | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],49]
| 6 | 4679 | n1 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],50]
| 6 | 4800 | n2 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],51]
| 6 | 4700 | n3 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],52]
| 6 | 4672 | n4 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],53]
| 6 | 4680 | n5 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],54]
| 6 | 4677 | n6 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],55]
| 6 | 4882 | n7 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],56]
| 7 | 4636 | n0.asicdesigners.com | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],57]
| 7 | 4680 | n1 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],58]
| 7 | 4801 | n2 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],59]
| 7 | 4701 | n3 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],60]
| 7 | 4673 | n4 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],61]
| 7 | 4681 | n5 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],62]
| 7 | 4678 | n6 | Running |
/usr/mpi/gcc/openmpi-1.4.1/tests/IMB-3.2/IMB-MPI1 | [[20398,1],63]
| 7 | 4883 | n7 | Running |
[root@n0 ~]# /share/bin/padb --all --stack-trace --tree --config-option
rmgr=orte
Warning, failed to locate ranks
[1-7,9-15,17-23,25-31,33-39,41-47,49-55,57-63]
-----------------
[0,8,16,24,32,40,48,56] (8 processes)
-----------------
main() at ?:?
IMB_init_buffers_iter() at ?:?
IMB_gather() at ?:?
PMPI_Gather() at pgather.c:175
mca_coll_sync_gather() at coll_sync_gather.c:46
ompi_coll_tuned_gather_intra_dec_fixed() at
coll_tuned_decision_fixed.c:714
ompi_coll_tuned_gather_intra_linear_sync() at
coll_tuned_gather.c:248
mca_pml_ob1_recv() at ../../../../opal/threads/condition.h:99
-----------------
[0,8,16,24,32,40,56] (7 processes)
-----------------
opal_progress() at runtime/opal_progress.c:207
-----------------
48 (1 processes)
-----------------
opal_progress() at
../opal/include/opal/sys/amd64/timer.h:46
[root@n0 ~]#
More information about the padb-users
mailing list