% Conference paper on BCS-MPI; the Annote field holds a reader's summary.
% NOTE(review): Booktitle uses the bare macro SC, presumably declared by a
% string definition elsewhere in this database -- confirm it is in scope.
@InProceedings{fernandez03:BCS,
  Author =       {Juan Fernandez and Eitan Frachtenberg and Fabrizio
                  Petrini},
  Title =        {{BCS-MPI}: A New Approach in the System Software
                  Design for Large-Scale Parallel Computers},
  Booktitle =    SC,
  Address =      {Phoenix, {AZ}},
  Month =        nov,
  Year =         {2003},
  Annote =       {BCS-MPI is an implementation of several core MPI
                  functions on top of the BCS model (and QsNet
                  hardware). The BCS model buffers all communication,
                  and divides communication activity into phases and
                  micro-phases. A communicating process first posts a
                  descriptor to the NIC; then, thread code running on
                  the NIC exchanges information with other NICs
                  (sources and destinations); schedules the actual
                  communication to be sent during the next phase; and
                  then executes it. The paper shows that typical
                  applications with asynchronous communication
                  (i.e., MPI\_Isend, MPI\_Irecv) suffer very little
                  from buffering of the communication, and sometimes
                  even gain a little, due to reduced overhead.
                  Synchronous (blocking) applications can suffer
                  overheads of up to 30\%, but at least one of them
                  can be easily converted into a non-blocking
                  application.},
  Note =         {Available from \url{http://www.cs.huji.ac.il/etcs/pubs/}},
}