next up previous contents
Next: Alltoallv Up: Collectives Previous: Allreduce

Alltoall

/*
 * Decide whether an all-to-all exchange is small enough for the
 * "short" algorithm.
 *
 * count - number of elements each process sends to each other process
 * dtype - datatype of those elements; its size comes from MPI_Type_size
 * comm  - communicator of the exchange (not consulted by this test)
 *
 * Returns 1 when count elements of dtype occupy fewer than 2048 bytes,
 * and 0 otherwise.
 */
int alltoall_is_short(int count, MPI_Datatype dtype, MPI_Comm comm)
{
  enum { ALLTOALL_SHORT_LIMIT = 2048 };  /* threshold in bytes */
  int size;

  MPI_Type_size(dtype, &size);

  /* Widen before multiplying: the product of two ints can exceed
   * INT_MAX for large counts, which would be undefined behaviour. */
  if ((long long) count * size < ALLTOALL_SHORT_LIMIT) {
    return(1);
  }
  return(0);
}

/* The two implementations are defined after their first use, so they
 * are declared here to avoid implicit function declarations. */
int alltoall_short(void *sbuf, int scount, MPI_Datatype sdtype,
                   void *rbuf, int rcount, MPI_Datatype rdtype,
                   MPI_Comm comm);
int alltoall_long(void *sbuf, int scount, MPI_Datatype sdtype,
                  void *rbuf, int rcount, MPI_Datatype rdtype,
                  MPI_Comm comm);

/*
 * All-to-all exchange entry point.
 *
 * Selects the short or the long algorithm from the size of one send
 * block (scount elements of sdtype, as judged by alltoall_is_short)
 * and returns whatever the selected implementation returns.
 *
 * sbuf, scount, sdtype - send buffer, per-destination count and type
 * rbuf, rcount, rdtype - receive buffer, per-source count and type
 * comm                 - communicator over which to exchange
 */
int MPI_Alltoall(void *sbuf, int scount, MPI_Datatype sdtype, 
                void *rbuf, int rcount, MPI_Datatype rdtype, 
                MPI_Comm comm)
{
  /* All-to-all has no root process: the helpers take exactly these
   * seven arguments. */
  if (alltoall_is_short(scount, sdtype, comm)) {
    return alltoall_short(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
  }
  return alltoall_long(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm);
}

/*
 * All-to-all exchange for long messages.
 *
 * For every rank i in comm, posts one nonblocking receive into block i
 * of rbuf and one nonblocking send from block i of sbuf, then waits for
 * all 2 * nprocs requests to complete.
 *
 * sbuf, scount, sdtype - send buffer; block i starts at byte offset
 *                        i * scount * extent(sdtype)
 * rbuf, rcount, rdtype - receive buffer; block i starts at byte offset
 *                        i * rcount * extent(rdtype)
 * comm                 - communicator over which to exchange
 *
 * Returns the result of MPI_Waitall, or MPI_ERR_OTHER when the request
 * or status array cannot be allocated.
 */
int alltoall_long(void *sbuf, int scount, MPI_Datatype sdtype, 
                  void *rbuf, int rcount, MPI_Datatype rdtype, 
                  MPI_Comm comm)
{
  MPI_Request *reqs;
  MPI_Status  *stats;
  MPI_Aint    sextent, rextent;
  int         i, nprocs, rc;

  MPI_Comm_size(comm, &nprocs);
  MPI_Type_extent(sdtype, &sextent);
  MPI_Type_extent(rdtype, &rextent);

  /* Two requests (one receive, one send) per peer. */
  reqs = malloc((size_t) 2 * nprocs * sizeof *reqs);
  stats = malloc((size_t) 2 * nprocs * sizeof *stats);
  if (reqs == NULL || stats == NULL) {
    free(reqs);   /* free(NULL) is a no-op */
    free(stats);
    return(MPI_ERR_OTHER);
  }

  for (i = 0; i < nprocs; i++) { 
    /* Widen to MPI_Aint before multiplying so the byte offset is not
     * computed (and possibly overflowed) in int. */
    MPI_Irecv((char *) rbuf + (MPI_Aint) i * rcount * rextent, rcount, 
              rdtype, i, IMPI_ALLTOALL_TAG, comm, &reqs[2*i]);
    MPI_Isend((char *) sbuf + (MPI_Aint) i * scount * sextent, scount,
              sdtype, i, IMPI_ALLTOALL_TAG, comm, &reqs[2*i + 1]);
  }

  rc = MPI_Waitall(2 * nprocs, reqs, stats);

  free(reqs);
  free(stats);
  return(rc);
}

/*
 * All-to-all exchange for short messages (outline).
 *
 * Several steps below are written as prose pseudocode rather than C,
 * so this block describes the algorithm and is not compilable as is.
 *
 * The exchange has a local phase followed by a global phase. In the
 * global phase each master index i is taken in turn as the root; the
 * root's master sends one packed buffer to every other master, and
 * each receiving master's buffer is scattered to its local processes.
 *
 * sbuf, scount, sdtype - send buffer, per-destination count and type
 * rbuf, rcount, rdtype - receive buffer, per-source count and type
 * comm                 - communicator over which to exchange
 *
 * Returns MPI_SUCCESS.
 */
int alltoall_short(void *sbuf, int scount, MPI_Datatype sdtype, 
                   void *rbuf, int rcount, MPI_Datatype rdtype, 
                   MPI_Comm comm)
{
  MPI_Status  status;
  int         i, j, myrank, nmasters, packsize, size;
  int         rootmaster, nroot;
  /* NOTE(review): tmpbuf is used below but not declared in this block. */

  MPI_Comm_rank(comm, &myrank);
  /* packsize: packed size of one block of scount sdtype elements, as
   * reported by MPI_Pack_size. */
  MPI_Pack_size(scount, sdtype, comm, &packsize);

  /* local phase */
  do local all to all exchange;

  /* global phase */
  /* This phase rotates around the nodes treating each one in turn as 
   * the root. The root node master collects in turn the buffers 
   * destined for each other node and sends them in a single transfer
   * to the node where a local operation scatters them to the local 
   * destination processes.
   */
  nmasters = num_masters(comm);
  for (i = 0; i < nmasters; i++) { 
    rootmaster = master_rank(i, comm);
    nroot = num_local_to_master(i, comm);
  
    if (are_local(myrank, rootmaster, comm)) {
      /* This process is on the root's node: take part in one gather
       * per destination node j. */
      for (j = 0; j < nmasters; j++) {
        /* Skip the root's own node. */
        if (i == j) continue;
 
        if (myrank == rootmaster) {
          /* One packed block per (process on node i, process on node j)
           * pair. */
          size = nroot * num_local_to_master(j, comm) * packsize;
          allocate a temporary buffer tmpbuf of size bytes;
        }

        gather into the tmpbuf at rootmaster all send buffers
          destined from processes on node i to processes on node j, they
          are concatenated in order of sender rank by receiver rank;

        if (myrank == rootmaster) {
          /* NOTE(review): this uses MPI_ALLTOALL_TAG while alltoall_long
           * uses IMPI_ALLTOALL_TAG - confirm which name is intended. */
          MPI_Send(tmpbuf, size, MPI_BYTE, master_rank(j, comm),
                   MPI_ALLTOALL_TAG, comm);
          free tmpbuf;
        }
      }
    } else {
      /* not local to the rootmaster */
      if (is_master(myrank, comm)) {
          /* Same size formula as the sending side, with this master's
           * own local process count. */
          size = nroot * num_local_to_rank(myrank, comm) * packsize;
          allocate a temporary buffer tmpbuf of size bytes;

          MPI_Recv(tmpbuf, size, MPI_BYTE, rootmaster,
                   MPI_ALLTOALL_TAG, comm, &status);
      }

      scatter the packed send buffers receieved from rootmaster from 
        the tmpbuf of the local master to the local processes;

      if (is_master(myrank, comm)) {
        free tmpbuf;
      }
    }
  }

  return(MPI_SUCCESS);
}



IMPI Protocol ver 0.0
DRAFT March 22, 1999