| 26 | |
---|
/* Implements the "mirror permutation" of "bits" bits of an integer "x".

   positions 76543210, bits==3 yields 76543012.

   This function could/should be moved to a common utility location for use in
   other collectives as well. */
#ifndef ATTRIBUTE
#define ATTRIBUTE(a_) /* fallback when the project-wide attribute macro is unavailable */
#endif
ATTRIBUTE((const)) /* tells the compiler that this func only depends on its args
                      and may be optimized much more aggressively, similar to "pure" */
static inline int mirror_permutation(unsigned int x, int bits)
{
    /* Mask for the high-order bits that are copied through unchanged.
       All shifting is done in unsigned arithmetic: the original signed
       (0x1 << bits) invokes undefined behavior when bits == 31 (shift
       into the sign bit of a signed int). */
    unsigned int high_mask = ~((1u << bits) - 1u);
    unsigned int retval = x & high_mask;
    int i;

    /* reflect bit i of the low "bits" bits into position (bits-1-i) */
    for (i = 0; i < bits; ++i) {
        unsigned int bitval = (x & (1u << i)) >> i; /* 0x1 or 0x0 */
        retval |= bitval << ((bits - i) - 1);
    }

    return (int)retval;
}
---|
| 49 | |
---|
/* FIXME should we be checking the op_errno here? */
#ifdef HAVE_CXX_BINDING
/* NOTE: assumes 'uop' is the operator function pointer and
   that 'is_cxx_uop' is a boolean indicating the obvious */
#define call_uop(in_, inout_, count_, datatype_)                                     \
    do {                                                                             \
        if (is_cxx_uop) {                                                            \
            /* C++ user ops must go through the C++ thunk, which takes count and     \
               datatype by value plus the user function pointer as a last arg */     \
            (*MPIR_Process.cxx_call_op_fn)((in_), (inout_), (count_), (datatype_), uop); \
        }                                                                            \
        else {                                                                       \
            /* C/Fortran MPI_User_function signature: count and datatype by ref */   \
            (*uop)((in_), (inout_), &(count_), &(datatype_));                        \
        }                                                                            \
    } while (0)

#else
/* no C++ binding compiled in: always the C/Fortran-style call */
#define call_uop(in_, inout_, count_, datatype_) \
    (*uop)((in_), (inout_), &(count_), &(datatype_))
#endif
---|
| 68 | |
---|
| 69 | /* Implements the reduce-scatter butterfly algorithm described in J. L. Traff's |
---|
| 70 | * "An Improved Algorithm for (Non-commutative) Reduce-Scatter with an Application" |
---|
| 71 | * from EuroPVM/MPI 2005. This function currently only implements support for |
---|
| 72 | * the power-of-2, block-regular case (all receive counts are equal). */ |
---|
| 73 | #undef FUNCNAME |
---|
| 74 | #define FUNCNAME MPIR_Reduce_scatter_noncomm |
---|
| 75 | #undef FCNAME |
---|
| 76 | #define FCNAME MPIU_QUOTE(FUNCNAME) |
---|
| 77 | static int MPIR_Reduce_scatter_noncomm ( |
---|
| 78 | void *sendbuf, |
---|
| 79 | void *recvbuf, |
---|
| 80 | int *recvcnts, |
---|
| 81 | MPI_Datatype datatype, |
---|
| 82 | MPI_Op op, |
---|
| 83 | MPID_Comm *comm_ptr ) |
---|
| 84 | { |
---|
| 85 | int mpi_errno = MPI_SUCCESS; |
---|
| 86 | int comm_size = comm_ptr->local_size; |
---|
| 87 | int rank = comm_ptr->rank; |
---|
| 88 | int pof2; |
---|
| 89 | int log2_comm_size; |
---|
| 90 | int i, k; |
---|
| 91 | int recv_offset, send_offset; |
---|
| 92 | int block_size, total_count, size; |
---|
| 93 | MPI_Aint extent, true_extent, true_lb; |
---|
| 94 | int is_commutative; |
---|
| 95 | int buf0_was_inout; |
---|
| 96 | void *tmp_buf0; |
---|
| 97 | void *tmp_buf1; |
---|
| 98 | void *result_ptr; |
---|
| 99 | MPI_Comm comm = comm_ptr->handle; |
---|
| 100 | MPI_User_function *uop; |
---|
| 101 | MPID_Op *op_ptr; |
---|
| 102 | #ifdef HAVE_CXX_BINDING |
---|
| 103 | int is_cxx_uop = 0; |
---|
| 104 | #endif |
---|
| 105 | MPIU_CHKLMEM_DECL(3); |
---|
| 106 | |
---|
| 107 | MPID_Datatype_get_extent_macro(datatype, extent); |
---|
| 108 | /* assumes nesting is handled by the caller right now, may not be true in the future */ |
---|
| 109 | mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb, &true_extent); |
---|
| 110 | |
---|
| 111 | if (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) { |
---|
| 112 | is_commutative = 1; |
---|
| 113 | /* get the function by indexing into the op table */ |
---|
| 114 | uop = MPIR_Op_table[op%16 - 1]; |
---|
| 115 | } |
---|
| 116 | else { |
---|
| 117 | MPID_Op_get_ptr(op, op_ptr); |
---|
| 118 | if (op_ptr->kind == MPID_OP_USER_NONCOMMUTE) |
---|
| 119 | is_commutative = 0; |
---|
| 120 | else |
---|
| 121 | is_commutative = 1; |
---|
| 122 | |
---|
| 123 | #ifdef HAVE_CXX_BINDING |
---|
| 124 | if (op_ptr->language == MPID_LANG_CXX) { |
---|
| 125 | uop = (MPI_User_function *) op_ptr->function.c_function; |
---|
| 126 | is_cxx_uop = 1; |
---|
| 127 | } |
---|
| 128 | else |
---|
| 129 | #endif |
---|
| 130 | if ((op_ptr->language == MPID_LANG_C)) |
---|
| 131 | uop = (MPI_User_function *) op_ptr->function.c_function; |
---|
| 132 | else |
---|
| 133 | uop = (MPI_User_function *) op_ptr->function.f77_function; |
---|
| 134 | } |
---|
| 135 | |
---|
| 136 | pof2 = 1; |
---|
| 137 | log2_comm_size = 0; |
---|
| 138 | while (pof2 < comm_size) { |
---|
| 139 | pof2 <<= 1; |
---|
| 140 | ++log2_comm_size; |
---|
| 141 | } |
---|
| 142 | |
---|
| 143 | /* begin error checking */ |
---|
| 144 | MPIU_Assert(pof2 == comm_size); /* FIXME this version only works for power of 2 procs */ |
---|
| 145 | |
---|
| 146 | for (i = 0; i < (comm_size - 1); ++i) { |
---|
| 147 | MPIU_Assert(recvcnts[i] == recvcnts[i+1]); |
---|
| 148 | } |
---|
| 149 | /* end error checking */ |
---|
| 150 | |
---|
| 151 | /* size of a block (count of datatype per block, NOT bytes per block) */ |
---|
| 152 | block_size = recvcnts[0]; |
---|
| 153 | total_count = block_size * comm_size; |
---|
| 154 | |
---|
| 155 | MPIU_CHKLMEM_MALLOC(tmp_buf0, void *, true_extent * total_count, mpi_errno, "tmp_buf0"); |
---|
| 156 | MPIU_CHKLMEM_MALLOC(tmp_buf1, void *, true_extent * total_count, mpi_errno, "tmp_buf1"); |
---|
| 157 | /* adjust for potential negative lower bound in datatype */ |
---|
| 158 | tmp_buf0 = (void *)((char*)tmp_buf0 - true_lb); |
---|
| 159 | tmp_buf1 = (void *)((char*)tmp_buf1 - true_lb); |
---|
| 160 | |
---|
| 161 | /* Copy our send data to tmp_buf0. We do this one block at a time and |
---|
| 162 | permute the blocks as we go according to the mirror permutation. */ |
---|
| 163 | for (i = 0; i < comm_size; ++i) { |
---|
| 164 | mpi_errno = MPIR_Localcopy((char *)(sendbuf == MPI_IN_PLACE ? recvbuf : sendbuf) + (i * true_extent * block_size), block_size, datatype, |
---|
| 165 | (char *)tmp_buf0 + (mirror_permutation(i, log2_comm_size) * true_extent * block_size), block_size, datatype); |
---|
| 166 | if (mpi_errno) MPIU_ERR_POP(mpi_errno); |
---|
| 167 | } |
---|
| 168 | buf0_was_inout = 1; |
---|
| 169 | |
---|
| 170 | send_offset = 0; |
---|
| 171 | recv_offset = 0; |
---|
| 172 | size = total_count; |
---|
| 173 | for (k = 0; k < log2_comm_size; ++k) { |
---|
| 174 | /* use a double-buffering scheme to avoid local copies */ |
---|
| 175 | char *incoming_data = (buf0_was_inout ? tmp_buf1 : tmp_buf0); |
---|
| 176 | char *outgoing_data = (buf0_was_inout ? tmp_buf0 : tmp_buf1); |
---|
| 177 | int peer = rank ^ (0x1 << k); |
---|
| 178 | size /= 2; |
---|
| 179 | |
---|
| 180 | if (rank > peer) { |
---|
| 181 | /* we have the higher rank: send top half, recv bottom half */ |
---|
| 182 | recv_offset += size; |
---|
| 183 | } |
---|
| 184 | else { |
---|
| 185 | /* we have the lower rank: recv top half, send bottom half */ |
---|
| 186 | send_offset += size; |
---|
| 187 | } |
---|
| 188 | |
---|
| 189 | mpi_errno = MPIC_Sendrecv(outgoing_data + send_offset*true_extent, |
---|
| 190 | size, datatype, peer, MPIR_REDUCE_SCATTER_TAG, |
---|
| 191 | incoming_data + recv_offset*true_extent, |
---|
| 192 | size, datatype, peer, MPIR_REDUCE_SCATTER_TAG, |
---|
| 193 | comm, MPI_STATUS_IGNORE); |
---|
| 194 | /* always perform the reduction at recv_offset, the data at send_offset |
---|
| 195 | is now our peer's responsibility */ |
---|
| 196 | if (rank > peer) { |
---|
| 197 | /* higher ranked value so need to call op(received_data, my_data) */ |
---|
| 198 | call_uop(incoming_data + recv_offset*true_extent, |
---|
| 199 | outgoing_data + recv_offset*true_extent, |
---|
| 200 | size, datatype); |
---|
| 201 | buf0_was_inout = buf0_was_inout; |
---|
| 202 | } |
---|
| 203 | else { |
---|
| 204 | /* lower ranked value so need to call op(my_data, received_data) */ |
---|
| 205 | call_uop(outgoing_data + recv_offset*true_extent, |
---|
| 206 | incoming_data + recv_offset*true_extent, |
---|
| 207 | size, datatype); |
---|
| 208 | buf0_was_inout = !buf0_was_inout; |
---|
| 209 | } |
---|
| 210 | |
---|
| 211 | /* the next round of send/recv needs to happen within the block (of size |
---|
| 212 | "size") that we just received and reduced */ |
---|
| 213 | send_offset = recv_offset; |
---|
| 214 | } |
---|
| 215 | |
---|
| 216 | MPIU_Assert(size == recvcnts[rank]); |
---|
| 217 | |
---|
| 218 | /* copy the reduced data to the recvbuf */ |
---|
| 219 | result_ptr = (char *)(buf0_was_inout ? tmp_buf0 : tmp_buf1) + recv_offset * true_extent; |
---|
| 220 | mpi_errno = MPIR_Localcopy(result_ptr, size, datatype, |
---|
| 221 | recvbuf, size, datatype); |
---|
| 222 | fn_exit: |
---|
| 223 | MPIU_CHKLMEM_FREEALL(); |
---|
| 224 | return mpi_errno; |
---|
| 225 | fn_fail: |
---|
| 226 | goto fn_exit; |
---|
| 227 | } |
---|
| 228 | |
---|

    /* NOTE(review): this excerpt begins mid-function; the enclosing function
       header and the declarations it uses (tmp_recvbuf, tmp_results, mask,
       dst, i, j, k, blklens, dis, sendtype, recvtype, received, etc.) are
       outside the visible region. */

    /* noncommutative, use recursive doubling. */

    /* need to allocate temporary buffer to receive incoming data*/
    tmp_recvbuf = MPIU_Malloc(total_count*(MPIR_MAX(true_extent,extent)));
    /* --BEGIN ERROR HANDLING-- */
    if (!tmp_recvbuf) {
        mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
        return mpi_errno;
    }
    /* --END ERROR HANDLING-- */
    /* adjust for potential negative lower bound in datatype */
    tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);

    /* need to allocate another temporary buffer to accumulate
       results */
    tmp_results = MPIU_Malloc(total_count*(MPIR_MAX(true_extent,extent)));
    /* --BEGIN ERROR HANDLING-- */
    if (!tmp_results) {
        /* NOTE(review): tmp_recvbuf allocated above is leaked on this early
           return — flagged only, not changed here. */
        mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0 );
        return mpi_errno;
    }
    /* --END ERROR HANDLING-- */
    /* adjust for potential negative lower bound in datatype */
    tmp_results = (void *)((char*)tmp_results - true_lb);

    /* copy sendbuf into tmp_results */
    if (sendbuf != MPI_IN_PLACE)
        mpi_errno = MPIR_Localcopy(sendbuf, total_count, datatype,
                                   tmp_results, total_count, datatype);
    else
        mpi_errno = MPIR_Localcopy(recvbuf, total_count, datatype,
                                   tmp_results, total_count, datatype);

    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno)
    {
        mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", 0);
        return mpi_errno;
    }
    /* --END ERROR HANDLING-- */

    mask = 0x1;
    i = 0;
    while (mask < comm_size) {
        dst = rank ^ mask;

        /* clear the low i bits of the ranks to find the roots of the two
           subtrees being paired at this step */
        dst_tree_root = dst >> i;
        dst_tree_root <<= i;

        my_tree_root = rank >> i;
        my_tree_root <<= i;

        /* At step 1, processes exchange (n-n/p) amount of
           data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
           amount of data, and so forth. We use derived datatypes for this.

           At each step, a process does not need to send data
           indexed from my_tree_root to
           my_tree_root+mask-1. Similarly, a process won't receive
           data indexed from dst_tree_root to dst_tree_root+mask-1. */

        /* calculate sendtype */
        blklens[0] = blklens[1] = 0;
        for (j=0; j<my_tree_root; j++)
            blklens[0] += recvcnts[j];
        for (j=my_tree_root+mask; j<comm_size; j++)
            blklens[1] += recvcnts[j];

        dis[0] = 0;
        dis[1] = blklens[0];
        for (j=my_tree_root; (j<my_tree_root+mask) && (j<comm_size); j++)
            dis[1] += recvcnts[j];

        /* NOTE(review): return codes of the NMPI type-construction calls
           below are ignored throughout this excerpt. */
        NMPI_Type_indexed(2, blklens, dis, datatype, &sendtype);
        NMPI_Type_commit(&sendtype);

        /* calculate recvtype */
        blklens[0] = blklens[1] = 0;
        for (j=0; j<dst_tree_root && j<comm_size; j++)
            blklens[0] += recvcnts[j];
        for (j=dst_tree_root+mask; j<comm_size; j++)
            blklens[1] += recvcnts[j];

        dis[0] = 0;
        dis[1] = blklens[0];
        for (j=dst_tree_root; (j<dst_tree_root+mask) && (j<comm_size); j++)
            dis[1] += recvcnts[j];

        NMPI_Type_indexed(2, blklens, dis, datatype, &recvtype);
        NMPI_Type_commit(&recvtype);

        received = 0;
        if (dst < comm_size) {
            /* tmp_results contains data to be sent in each step. Data is
               received in tmp_recvbuf and then accumulated into
               tmp_results. accumulation is done later below. */

            mpi_errno = MPIC_Sendrecv(tmp_results, 1, sendtype, dst,
                                      MPIR_REDUCE_SCATTER_TAG,
                                      tmp_recvbuf, 1, recvtype, dst,
                                      MPIR_REDUCE_SCATTER_TAG, comm,
                                      MPI_STATUS_IGNORE);
            /* NOTE(review): received is set before the error check; harmless
               because the error path returns immediately. */
            received = 1;
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno)
            {
                mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", 0);
                return mpi_errno;
            }
            /* --END ERROR HANDLING-- */
        }

        /* if some processes in this process's subtree in this step
           did not have any destination process to communicate with
           because of non-power-of-two, we need to send them the
           result. We use a logarithmic recursive-halfing algorithm
           for this. */

        if (dst_tree_root + mask > comm_size) {
            nprocs_completed = comm_size - my_tree_root - mask;
            /* nprocs_completed is the number of processes in this
               subtree that have all the data. Send data to others
               in a tree fashion. First find root of current tree
               that is being divided into two. k is the number of
               least-significant bits in this process's rank that
               must be zeroed out to find the rank of the root */
            j = mask;
            k = 0;
            while (j) {
                j >>= 1;
                k++;
---|
    /* NOTE(review): this excerpt begins mid-function; mpi_errno, i, mask,
       dst, tree-root variables, blklens, dis, sendtype/recvtype, pof2 and
       the temporary-buffer pointers are declared outside the visible
       region. */

    /* a reduce-scatter is "block regular" when every rank receives the same
       count; the noncomm butterfly below requires that plus pof2 size */
    int is_block_regular = 1;
    for (i = 0; i < (comm_size - 1); ++i) {
        if (recvcnts[i] != recvcnts[i+1]) {
            is_block_regular = 0;
            break;
        }
    }

    /* slightly retask pof2 to mean pof2 equal or greater, not always greater as it is above */
    pof2 = 1;
    while (pof2 < comm_size) pof2 <<= 1;

    if (pof2 == comm_size && is_block_regular) {
        /* noncommutative, pof2 size, and block regular */
        mpi_errno = MPIR_Reduce_scatter_noncomm(sendbuf, recvbuf, recvcnts, datatype, op, comm_ptr);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }
    else {
        /* noncommutative and (non-pof2 or block irregular), use recursive doubling. */

        /* need to allocate temporary buffer to receive incoming data*/
        MPIU_CHKLMEM_MALLOC(tmp_recvbuf, void *, total_count*(MPIR_MAX(true_extent,extent)), mpi_errno, "tmp_recvbuf");
        /* adjust for potential negative lower bound in datatype */
        tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);

        /* need to allocate another temporary buffer to accumulate
           results */
        MPIU_CHKLMEM_MALLOC(tmp_results, void *, total_count*(MPIR_MAX(true_extent,extent)), mpi_errno, "tmp_results");
        /* adjust for potential negative lower bound in datatype */
        tmp_results = (void *)((char*)tmp_results - true_lb);

        /* copy sendbuf into tmp_results */
        if (sendbuf != MPI_IN_PLACE)
            mpi_errno = MPIR_Localcopy(sendbuf, total_count, datatype,
                                       tmp_results, total_count, datatype);
        else
            mpi_errno = MPIR_Localcopy(recvbuf, total_count, datatype,
                                       tmp_results, total_count, datatype);

        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        mask = 0x1;
        i = 0;
        while (mask < comm_size) {
            dst = rank ^ mask;

            /* clear the low i bits to find each process's subtree root */
            dst_tree_root = dst >> i;
            dst_tree_root <<= i;

            my_tree_root = rank >> i;
            my_tree_root <<= i;

            /* At step 1, processes exchange (n-n/p) amount of
               data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
               amount of data, and so forth. We use derived datatypes for this.

               At each step, a process does not need to send data
               indexed from my_tree_root to
               my_tree_root+mask-1. Similarly, a process won't receive
               data indexed from dst_tree_root to dst_tree_root+mask-1. */

            /* calculate sendtype */
            blklens[0] = blklens[1] = 0;
            for (j=0; j<my_tree_root; j++)
                blklens[0] += recvcnts[j];
            for (j=my_tree_root+mask; j<comm_size; j++)
                blklens[1] += recvcnts[j];

            dis[0] = 0;
            dis[1] = blklens[0];
            for (j=my_tree_root; (j<my_tree_root+mask) && (j<comm_size); j++)
                dis[1] += recvcnts[j];

            /* NOTE(review): NMPI type-construction return codes are ignored */
            NMPI_Type_indexed(2, blklens, dis, datatype, &sendtype);
            NMPI_Type_commit(&sendtype);

            /* calculate recvtype */
            blklens[0] = blklens[1] = 0;
            for (j=0; j<dst_tree_root && j<comm_size; j++)
                blklens[0] += recvcnts[j];
            for (j=dst_tree_root+mask; j<comm_size; j++)
                blklens[1] += recvcnts[j];

            dis[0] = 0;
            dis[1] = blklens[0];
            for (j=dst_tree_root; (j<dst_tree_root+mask) && (j<comm_size); j++)
                dis[1] += recvcnts[j];

            NMPI_Type_indexed(2, blklens, dis, datatype, &recvtype);
            NMPI_Type_commit(&recvtype);

            received = 0;
            if (dst < comm_size) {
                /* tmp_results contains data to be sent in each step. Data is
                   received in tmp_recvbuf and then accumulated into
                   tmp_results. accumulation is done later below. */

                mpi_errno = MPIC_Sendrecv(tmp_results, 1, sendtype, dst,
                                          MPIR_REDUCE_SCATTER_TAG,
                                          tmp_recvbuf, 1, recvtype, dst,
                                          MPIR_REDUCE_SCATTER_TAG, comm,
                                          MPI_STATUS_IGNORE);
                /* NOTE(review): received is set before the error check;
                   harmless since MPIU_ERR_POP leaves the loop entirely. */
                received = 1;
                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
---|
            }
        }

        /* The following reduction is done here instead of after
           the MPIC_Sendrecv or MPIC_Recv above. This is
           because to do it above, in the noncommutative
           case, we would need an extra temp buffer so as not to
           overwrite temp_recvbuf, because temp_recvbuf may have
           to be communicated to other processes in the
           non-power-of-two case. To avoid that extra allocation,
           we do the reduce here. */
        if (received) {
            /* For commutative ops order doesn't matter; for noncommutative
               ops op(remote, local) is only valid when the remote subtree
               holds lower-indexed data (dst_tree_root < my_tree_root). */
            if (is_commutative || (dst_tree_root < my_tree_root)) {
#ifdef HAVE_CXX_BINDING
                if (is_cxx_uop) {
                    /* two segments (before/after the skipped middle range):
                       reduce each separately via the C++ op thunk */
                    (*MPIR_Process.cxx_call_op_fn)( tmp_recvbuf,
                                                    tmp_results, blklens[0],
                                                    datatype, uop);
                    (*MPIR_Process.cxx_call_op_fn)(
                        ((char *)tmp_recvbuf + dis[1]*extent),
                        ((char *)tmp_results + dis[1]*extent),
                        blklens[1], datatype, uop );
                }
                else
#endif
                {
                    /* C/Fortran signature: count and datatype by reference */
                    (*uop)(tmp_recvbuf, tmp_results, &blklens[0],
                           &datatype);
                    (*uop)(((char *)tmp_recvbuf + dis[1]*extent),
                           ((char *)tmp_results + dis[1]*extent),
                           &blklens[1], &datatype);
---|

    /* NOTE(review): excerpt from the non-power-of-two result-distribution
       phase; tmp_mask, mask, dst, tree_root, k, nprocs_completed, recvtype,
       tmp_recvbuf, received and comm are declared outside the visible
       region.  k here is the number of low rank bits zeroed to find the
       subtree root (set in an earlier, unseen loop). */
    tmp_mask = mask >> 1;
    while (tmp_mask) {
        dst = rank ^ tmp_mask;

        tree_root = rank >> k;
        tree_root <<= k;

        /* send only if this proc has data and destination
           doesn't have data. at any step, multiple processes
           can send if they have the data */
        if ((dst > rank) &&
            (rank < tree_root + nprocs_completed)
            && (dst >= tree_root + nprocs_completed)) {
            /* send the current result */
            mpi_errno = MPIC_Send(tmp_recvbuf, 1, recvtype,
                                  dst, MPIR_REDUCE_SCATTER_TAG,
                                  comm);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
        /* recv only if this proc. doesn't have data and sender
           has data */
        else if ((dst < rank) &&
                 (dst < tree_root + nprocs_completed) &&
                 (rank >= tree_root + nprocs_completed)) {
            mpi_errno = MPIC_Recv(tmp_recvbuf, 1, recvtype, dst,
                                  MPIR_REDUCE_SCATTER_TAG,
                                  comm, MPI_STATUS_IGNORE);
            received = 1;
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
        /* halve the mask: descend one level in the distribution tree */
        tmp_mask >>= 1;
        k--;