% Journal article on NIC-based reduction algorithms (Petrini et al.).
% The citation key carries "05" while the Year field is 2006; the key is
% kept unchanged so that existing \cite commands continue to resolve.
@Article{petrini05:reduce,
  Author =	 {Fabrizio Petrini and Juan Fernandez and Adam Moody
                  and Eitan Frachtenberg and Dhabaleswar K. Panda},
  Title =	 {{NIC}-based Reduction Algorithms for Large-scale
                  Clusters},
  Journal =	 {International Journal of High Performance Computing
                  and Networking},
  Year =	 {2006},
  Volume =	 {4},
  Number =	 {3/4},
  Pages =	 {122--136},
  Month =	 aug,
  Note =	 {Available from \url{http://www.cs.huji.ac.il/etcs/pubs/}},
  Annote =	 {Presents both analytical and experimental
                  evaluations of reduce algorithms that use the
                  processing capability of the network interface. A
                  model for a family of reduction algorithms using
                  $f$-nomial trees is introduced. The model allows the
                  design and evaluation of reduction algorithms that
                  are optimized for cluster and network specific
                  parameters. Extensive performance and scalability
                  evaluations were carried out on the 960-node,
                  1920-processor ASCI Linux Cluster (ALC) at Lawrence
                  Livermore National Laboratory, using up to 1812
                  processors. The experiments show that modern
                  interconnects do indeed allow for more efficient,
                  scalable, and consistently-performing NIC-based
                  reductions than host-based reductions. In
                  particular, on the largest configuration tested,
                  NIC-based operations provided speedups of 121\% and
                  39\% over the host-based, production-level MPI
                  operations for small integer and floating point
                  reduction arrays, respectively.},
  Keywords =	 {Cluster Computing, Reduce, Allreduce, Quadrics
                  QsNet, NIC-based operations, collective
                  communication},
}
