#include <mpi.h>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <vector>
#include <iostream>
#include <chrono>
#include <utility>

#undef max
#undef min

// The program computes the product of two matrices, A and B, by using Cannon's algorithm.
// The matrices must be square, NxN, matrices.
// The computing nodes (processes) are also arranged in a square, compSize x compSize,
// and N must be a multiple of compSize.
// Node 0 holds the initial matrices and distributes sub-matrices of size chunkSize x chunkSize,
// where chunkSize = N / compSize, to each node (already shifted according to Cannon's initial alignment).
// Next, the algorithm performs compSize rounds; in each round every node multiplies
// two matrices of size chunkSize x chunkSize, then passes its chunk of A to the next node on its row
// and its chunk of B to the next node on its column.
// After the last round, node 0 collects the result.

struct Sizes {
    size_t fullSize;            // nr rows = nr columns of the matrices
    size_t blockSize;           // nr rows = nr columns in each block
    unsigned nrBlocksSide;      // number of blocks in a row or column of blocks (the total number of blocks is nrBlocksSide^2)
    unsigned thisBlockRowIdx;   // for a worker, the row index in the result of the block it processes
    unsigned thisBlockColIdx;   // for a worker, the column index in the result of the block it processes
    MPI_Comm comm;              // the communicator to be used
};

enum MpiTags {
    tag_InitialBlockA = 1,
    tag_InitialBlockB = 2,
    tag_ResultBlock = 3,
    tag_ShiftBlockA = 4,
    tag_ShiftBlockB = 5
};

// Multiplies the rowsA x colsA block 'a' by the colsA x colsB block 'b',
// accumulating the products into the rowsA x colsB block 'r'.
void multiplyBlock(std::vector<int>& a, std::vector<int>& b, std::vector<int>& r,
        size_t rowsA, size_t colsA, size_t colsB) {
    for (size_t i=0 ; i<rowsA ; ++i) {
        for (size_t j=0 ; j<colsB ; ++j) {
            int sum = 0;
            for (size_t k=0 ; k<colsA ; ++k) {
                sum += a[i*colsA + k] * b[k*colsB + j];
            }
            r[i*colsB + j] += sum;   // accumulate: doWork calls this once per round
        }
    }
}

// Performs the nrBlocksSide computation rounds for one block of the result:
// multiply the local blocks, then shift the block of A along the row and the block of B along the column.
void doWork(Sizes const& sizes, std::vector<int>& a, std::vector<int>& b, std::vector<int>& r) {
    std::vector<int> tmpA(a.size());
    std::vector<int> tmpB(b.size());
    int const me = (sizes.thisBlockRowIdx*sizes.nrBlocksSide) + sizes.thisBlockColIdx;
    int const nextBlockOnRow = (sizes.thisBlockRowIdx*sizes.nrBlocksSide) + (sizes.thisBlockColIdx + 1) % sizes.nrBlocksSide;
    int const prevBlockOnRow = (sizes.thisBlockRowIdx*sizes.nrBlocksSide) + (sizes.thisBlockColIdx + sizes.nrBlocksSide - 1) % sizes.nrBlocksSide;
    int const nextBlockOnCol = (((sizes.thisBlockRowIdx + 1) % sizes.nrBlocksSide) * sizes.nrBlocksSide) + sizes.thisBlockColIdx;
    int const prevBlockOnCol = (((sizes.thisBlockRowIdx + sizes.nrBlocksSide - 1) % sizes.nrBlocksSide) * sizes.nrBlocksSide) + sizes.thisBlockColIdx;
    size_t iter = 0;
    while (true) {
        std::cout << me << ":Computing\n";
        multiplyBlock(a, b, r, sizes.blockSize, sizes.blockSize, sizes.blockSize);
        ++iter;
        if (iter == sizes.nrBlocksSide)
            return;

        // Send the block of A to the next process on the same row.
        // The process in column 0 sends first and the others receive first, so the ring of
        // synchronous sends cannot deadlock.
        MPI_Status status;
        std::cout << me << ":Exchanging blocks\n";
        if (sizes.thisBlockColIdx == 0) {
            MPI_Ssend(a.data(), a.size(), MPI_INT, nextBlockOnRow, tag_ShiftBlockA, sizes.comm);
            MPI_Recv(tmpA.data(), tmpA.size(), MPI_INT, prevBlockOnRow, tag_ShiftBlockA, sizes.comm, &status);
        } else {
            MPI_Recv(tmpA.data(), tmpA.size(), MPI_INT, prevBlockOnRow, tag_ShiftBlockA, sizes.comm, &status);
            MPI_Ssend(a.data(), a.size(), MPI_INT, nextBlockOnRow, tag_ShiftBlockA, sizes.comm);
        }
        // Send the block of B to the next process on the same column.
        if (sizes.thisBlockRowIdx == 0) {
            MPI_Ssend(b.data(), b.size(), MPI_INT, nextBlockOnCol, tag_ShiftBlockB, sizes.comm);
            MPI_Recv(tmpB.data(), tmpB.size(), MPI_INT, prevBlockOnCol, tag_ShiftBlockB, sizes.comm, &status);
        } else {
            MPI_Recv(tmpB.data(), tmpB.size(), MPI_INT, prevBlockOnCol, tag_ShiftBlockB, sizes.comm, &status);
            MPI_Ssend(b.data(), b.size(), MPI_INT, nextBlockOnCol, tag_ShiftBlockB, sizes.comm);
        }
        std::cout << me << ":Exchanging blocks done\n";
        std::swap(a, tmpA);
        std::swap(b, tmpB);
    }
}
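
// A minimal alternative sketch, not used by the program and not part of the original code:
// the same shift of A along the row and B along the column can be expressed with
// MPI_Sendrecv_replace, which pairs the send and the receive internally and therefore does not
// need the explicit send/receive ordering used in doWork above. The function name
// shiftBlocksSendrecv is illustrative only.
void shiftBlocksSendrecv(Sizes const& sizes, std::vector<int>& a, std::vector<int>& b) {
    int const nextBlockOnRow = (sizes.thisBlockRowIdx*sizes.nrBlocksSide) + (sizes.thisBlockColIdx + 1) % sizes.nrBlocksSide;
    int const prevBlockOnRow = (sizes.thisBlockRowIdx*sizes.nrBlocksSide) + (sizes.thisBlockColIdx + sizes.nrBlocksSide - 1) % sizes.nrBlocksSide;
    int const nextBlockOnCol = (((sizes.thisBlockRowIdx + 1) % sizes.nrBlocksSide) * sizes.nrBlocksSide) + sizes.thisBlockColIdx;
    int const prevBlockOnCol = (((sizes.thisBlockRowIdx + sizes.nrBlocksSide - 1) % sizes.nrBlocksSide) * sizes.nrBlocksSide) + sizes.thisBlockColIdx;
    MPI_Status status;
    // The block of A goes to the next process on the row; its replacement arrives from the previous one.
    MPI_Sendrecv_replace(a.data(), (int)a.size(), MPI_INT, nextBlockOnRow, tag_ShiftBlockA,
                         prevBlockOnRow, tag_ShiftBlockA, sizes.comm, &status);
    // The block of B goes to the next process on the column; its replacement arrives from the previous one.
    MPI_Sendrecv_replace(b.data(), (int)b.size(), MPI_INT, nextBlockOnCol, tag_ShiftBlockB,
                         prevBlockOnCol, tag_ShiftBlockB, sizes.comm, &status);
}
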
bool computeSizes(size_t fullSize, MPI_Comm comm, Sizes& sizes) {
    sizes.fullSize = fullSize;
    int nrProcs;
    MPI_Comm_size(comm, &nrProcs);
    int r = (int)sqrt(nrProcs);
    if ((r+1)*(r+1) <= nrProcs) {   // guard against sqrt rounding down
        sizes.nrBlocksSide = r + 1;
    } else {
        sizes.nrBlocksSide = r;
    }
    sizes.blockSize = fullSize / sizes.nrBlocksSide;
    if (nrProcs != sizes.nrBlocksSide*sizes.nrBlocksSide) {
        std::cerr << "The number of processes must be a perfect square\n";
        return false;
    }
    if (sizes.fullSize != sizes.blockSize*sizes.nrBlocksSide) {
        std::cerr << "The matrix size must be divisible by the square root of the number of processes\n";
        return false;
    }
    int me;
    MPI_Comm_rank(comm, &me);
    sizes.thisBlockRowIdx = me / sizes.nrBlocksSide;
    sizes.thisBlockColIdx = me % sizes.nrBlocksSide;
    sizes.comm = comm;
    return true;
}

/// @brief Top function for the root process.
//
// It distributes chunks to all other processes, then it performs the computation rounds corresponding to the
// upper-left sub-matrix of the result, and finally it reads the result chunks from the other nodes.
void initialWorker(MPI_Comm comm, int n, std::vector<int> const& a, std::vector<int> const& b, std::vector<int>& r) {
    Sizes sizes;
    if (!computeSizes(n, comm, sizes)) {
        n = 0;  // tell the workers there is nothing to do
    }
    MPI_Bcast(&n, 1, MPI_INT, 0, comm);
    if (n == 0)
        return;

    // Make copies of the blocks and distribute them
    std::vector<std::vector<int> > aBlocks(sizes.nrBlocksSide*sizes.nrBlocksSide);
    std::vector<std::vector<int> > bBlocks(sizes.nrBlocksSide*sizes.nrBlocksSide);
    std::vector<std::vector<int> > rBlocks(sizes.nrBlocksSide*sizes.nrBlocksSide);
    for (unsigned blockIdx=0 ; blockIdx<sizes.nrBlocksSide*sizes.nrBlocksSide ; ++blockIdx) {
        unsigned const blockRow = blockIdx / sizes.nrBlocksSide;
        unsigned const blockCol = blockIdx % sizes.nrBlocksSide;
        // Initial alignment of Cannon's algorithm: the process computing block (i,j) of the result
        // starts with block A(i, (i+j) mod s) and block B((i+j) mod s, j), where s = nrBlocksSide.
        unsigned const kIdx = (blockRow + blockCol) % sizes.nrBlocksSide;
        aBlocks[blockIdx].resize(sizes.blockSize*sizes.blockSize);
        bBlocks[blockIdx].resize(sizes.blockSize*sizes.blockSize);
        rBlocks[blockIdx].resize(sizes.blockSize*sizes.blockSize, 0);
        for (size_t i=0 ; i<sizes.blockSize ; ++i) {
            for (size_t j=0 ; j<sizes.blockSize ; ++j) {
                aBlocks[blockIdx][i*sizes.blockSize + j] =
                    a[(blockRow*sizes.blockSize + i)*sizes.fullSize + kIdx*sizes.blockSize + j];
                bBlocks[blockIdx][i*sizes.blockSize + j] =
                    b[(kIdx*sizes.blockSize + i)*sizes.fullSize + blockCol*sizes.blockSize + j];
            }
        }
        if (blockIdx != 0) {
            MPI_Ssend(aBlocks[blockIdx].data(), aBlocks[blockIdx].size(), MPI_INT, blockIdx, tag_InitialBlockA, comm);
            MPI_Ssend(bBlocks[blockIdx].data(), bBlocks[blockIdx].size(), MPI_INT, blockIdx, tag_InitialBlockB, comm);
        }
    }

    // The root keeps block 0 and performs the rounds for the upper-left block of the result
    doWork(sizes, aBlocks[0], bBlocks[0], rBlocks[0]);

    // Collect the result blocks from the other nodes
    for (unsigned blockIdx=1 ; blockIdx<sizes.nrBlocksSide*sizes.nrBlocksSide ; ++blockIdx) {
        MPI_Status status;
        MPI_Recv(rBlocks[blockIdx].data(), rBlocks[blockIdx].size(), MPI_INT, blockIdx, tag_ResultBlock, comm, &status);
    }

    // Reassemble the full result matrix
    r.resize(sizes.fullSize*sizes.fullSize);
    for (unsigned blockIdx=0 ; blockIdx<sizes.nrBlocksSide*sizes.nrBlocksSide ; ++blockIdx) {
        unsigned const blockRow = blockIdx / sizes.nrBlocksSide;
        unsigned const blockCol = blockIdx % sizes.nrBlocksSide;
        for (size_t i=0 ; i<sizes.blockSize ; ++i) {
            for (size_t j=0 ; j<sizes.blockSize ; ++j) {
                r[(blockRow*sizes.blockSize + i)*sizes.fullSize + blockCol*sizes.blockSize + j] =
                    rBlocks[blockIdx][i*sizes.blockSize + j];
            }
        }
    }
}

/// @brief Top function for the worker processes (every rank except 0).
//
// It receives its initial blocks from the root, performs the computation rounds,
// and sends its block of the result back to the root.
void worker(MPI_Comm comm) {
    Sizes sizes;
    int n = 0;
    MPI_Bcast(&n, 1, MPI_INT, 0, comm);
    if (n == 0)
        return;  // the root reported an error
    if (!computeSizes(n, comm, sizes))
        return;

    std::vector<int> aBlock(sizes.blockSize*sizes.blockSize);
    std::vector<int> bBlock(sizes.blockSize*sizes.blockSize);
    int const me = (sizes.thisBlockRowIdx*sizes.nrBlocksSide) + sizes.thisBlockColIdx;

    // receive the initial data from the parent
    MPI_Status status;
    std::cout << me << ":Receiving initial data\n";
    MPI_Recv(aBlock.data(), aBlock.size(), MPI_INT, 0, tag_InitialBlockA, sizes.comm, &status);
    MPI_Recv(bBlock.data(), bBlock.size(), MPI_INT, 0, tag_InitialBlockB, sizes.comm, &status);

    // do the local work
    std::vector<int> rBlock(sizes.blockSize*sizes.blockSize, 0);
    doWork(sizes, aBlock, bBlock, rBlock);

    // send back the result to the parent
    std::cout << me << ":Sending final data\n";
    MPI_Ssend(rBlock.data(), sizes.blockSize*sizes.blockSize, MPI_INT, 0, tag_ResultBlock, sizes.comm);
}

// Fills the matrix with small random values (small, so that the products do not overflow int).
void generateMatrix(std::vector<int>& matr, size_t nrRows, size_t nrCols) {
    size_t const size = nrRows*nrCols;
    matr.reserve(size);
    for (size_t i=0 ; i<size ; ++i) {
        matr.push_back(rand() % 10);
    }
}

// Recomputes the product sequentially and compares it with the result of the parallel algorithm.
bool checkResult(std::vector<int> const& a, std::vector<int> const& b, std::vector<int> const& result,
        size_t rowsA, size_t colsA, size_t colsB) {
    for (size_t i=0 ; i<rowsA ; ++i) {
        for (size_t j=0 ; j<colsB ; ++j) {
            int sum = 0;
            for (size_t k=0 ; k<colsA ; ++k) {
                sum += a[i*colsA + k] * b[k*colsB + j];
            }
            if (result[i*colsB + j] != sum)
                return false;
        }
    }
    return true;
}

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int me;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    if (me == 0) {
        unsigned n = 0;
        std::vector<int> a;
        std::vector<int> b;
        std::vector<int> c;
        if (argc != 2 || 1 != sscanf(argv[1], "%u", &n)) {
            std::cerr << "usage: matrix-mpi <N>\n";
            MPI_Abort(MPI_COMM_WORLD, 1);  // the workers are blocked in MPI_Bcast; abort everything
            return 1;
        }
        generateMatrix(a, n, n);
        generateMatrix(b, n, n);
        c.reserve(n*n);
        std::cout << "generated\n";
        std::chrono::system_clock::time_point beginTime = std::chrono::system_clock::now();
        initialWorker(MPI_COMM_WORLD, n, a, b, c);
        std::chrono::system_clock::time_point endTime = std::chrono::system_clock::now();
        std::cout << me << ":Elapsed time="
            << std::chrono::duration_cast<std::chrono::milliseconds>(endTime - beginTime).count() << "ms\n";
        std::cout << (checkResult(a, b, c, n, n, n) ? "Ok\n" : "WRONG\n");
    } else {
        std::chrono::system_clock::time_point beginTime = std::chrono::system_clock::now();
        worker(MPI_COMM_WORLD);
        std::chrono::system_clock::time_point endTime = std::chrono::system_clock::now();
        std::cout << me << ":Elapsed time="
            << std::chrono::duration_cast<std::chrono::milliseconds>(endTime - beginTime).count() << "ms\n";
    }
    MPI_Finalize();
}
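
// Build and run (the file name matrix-mpi.cpp is assumed; the exact compiler wrapper depends on
// the MPI installation, e.g. mpicxx or mpic++):
//   mpicxx -O2 -o matrix-mpi matrix-mpi.cpp
//   mpirun -np 4 ./matrix-mpi 1024
// The number of processes must be a perfect square (here 4 = 2x2) and the matrix size (here 1024)
// must be a multiple of the square root of the number of processes.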