Sierra Toolkit  Version of the Day
ParallelReduce.hpp
1 /*------------------------------------------------------------------------*/
2 /* Copyright 2010 Sandia Corporation. */
3 /* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */
4 /* license for use of this work by or on behalf of the U.S. Government. */
5 /* Export of this program may require a license from the */
6 /* United States Government. */
7 /*------------------------------------------------------------------------*/
8 
9 #ifndef stk_util_parallel_ParallelReduce_hpp
10 #define stk_util_parallel_ParallelReduce_hpp
11 
12 #include <cstddef>
13 #include <iosfwd>
14 #include <string>
15 #include <stk_util/parallel/Parallel.hpp>
16 #include <stk_util/util/SimpleArrayOps.hpp>
17 
18 //------------------------------------------------------------------------
19 
20 namespace stk_classic {
21 
26 // REFACTOR: Replace ReduceSum with Sum?, etc... Should be possible
27 
32  std::ostream & ,
33  const std::string & );
34 
37  const double * local , double * global , unsigned count );
38 
41  const float * local , float * global , unsigned count );
42 
45  const int * local , int * global , unsigned count );
46 
49  const size_t * local , size_t * global , unsigned count );
50 
53  const unsigned * local ,
54  unsigned * global , unsigned count );
55 
74 template < class ReduceOp >
75 void all_reduce( ParallelMachine , const ReduceOp & );
76 
79 }
80 
81 //----------------------------------------------------------------------
82 //----------------------------------------------------------------------
83 
84 #ifndef DOXYGEN_COMPILE
85 
86 namespace stk_classic {
87 namespace {
88 // Unnamed namespace so that these classes produce local symbols,
89 // avoiding linker complaints about multiply-defined symbols.
90 
// Terminator of a chain of reduction operations.  It holds no values and
// every operation on it is a no-op, so a chain's recursion through its
// 'Next' member stops here.
struct ReduceEnd {

  // Buffer segment for the end of the chain: carries no data.
  struct WorkType {};

  // Combine two (empty) buffer segments: nothing to do.
  static void op( WorkType & , WorkType & ) {}

  // Nothing to copy into the work buffer.
  void copyin( WorkType & ) const {}

  // Nothing to copy out of the work buffer.
  void copyout( WorkType & ) const {}
};
97 
98 // Workhorse class for aggregating reduction operations.
99 
100 template <class Op, typename T, class Next>
101 struct Reduce {
102 
103  typedef T Type ;
104  enum { N = Op::N };
105 
106  struct WorkType {
107  typename Next::WorkType m_next ;
108  Type m_value[N];
109  };
110 
111  Next m_next ;
112  Type * m_value ;
113 
114  // Copy values into buffer:
115  void copyin( WorkType & w ) const
116  { Copy<N>( w.m_value , m_value ); m_next.copyin( w.m_next ); }
117 
118  // Copy value out from buffer:
119  void copyout( WorkType & w ) const
120  { Copy<N>( m_value , w.m_value ); m_next.copyout( w.m_next ); }
121 
122  // Reduction function
123  static void op( WorkType & out , WorkType & in )
124  { Op( out.m_value , in.m_value ); Next::op( out.m_next , in.m_next ); }
125 
126  // Aggregate reduction operations, use '&' for left-to-right evaluation
127  template<class OpB, typename TB>
128  Reduce<OpB, TB, Reduce<Op,T,Next> >
129  operator & ( const Reduce<OpB,TB,ReduceEnd> & rhs )
130  { return Reduce<OpB, TB, Reduce<Op,T,Next> >( rhs , *this ); }
131 
132  // Constructor for aggregation:
133  Reduce( const Reduce<Op,T, ReduceEnd> & arg_val , const Next & arg_next )
134  : m_next( arg_next ), m_value( arg_val.m_value ) {}
135 
136  // Constructor for aggregate member:
137  explicit Reduce( Type * arg_value )
138  : m_next(), m_value( arg_value ) {}
139 
140  static void void_op( void*inv, void*inoutv, int*, ParallelDatatype*);
141 };
142 
143 template <class Op, typename T, class Next>
144 void Reduce<Op,T,Next>::void_op( void*inv, void*inoutv,int*,ParallelDatatype*)
145 {
146  op( * reinterpret_cast<WorkType*>( inoutv ) ,
147  * reinterpret_cast<WorkType*>( inv ) );
148 }
149 
150 }
151 }
152 
153 //----------------------------------------------------------------------
154 //----------------------------------------------------------------------
155 
156 namespace stk_classic {
157 
158 template<unsigned N, typename T>
159 inline
160 Reduce< Sum<N> , T, ReduceEnd> ReduceSum( T * value )
161 { return Reduce< Sum<N>, T, ReduceEnd >( value ); }
162 
163 template<unsigned N, typename T>
164 inline
165 Reduce< Prod<N>, T, ReduceEnd > ReduceProd( T * value )
166 { return Reduce< Prod<N>, T, ReduceEnd >( value ); }
167 
168 template<unsigned N, typename T>
169 inline
170 Reduce< Max<N>, T, ReduceEnd> ReduceMax( T * value )
171 { return Reduce< Max<N>, T, ReduceEnd>( value ); }
172 
173 template<unsigned N, typename T>
174 inline
175 Reduce< Min<N>, T, ReduceEnd> ReduceMin( T * value )
176 { return Reduce<Min<N>, T, ReduceEnd>( value ); }
177 
178 template<unsigned N, typename T>
179 inline
180 Reduce< BitOr<N>, T, ReduceEnd> ReduceBitOr( T * value )
181 { return Reduce< BitOr<N>, T, ReduceEnd>( value ); }
182 
183 template<unsigned N, typename T>
184 inline
185 Reduce< BitAnd<N>, T, ReduceEnd> ReduceBitAnd( T * value )
186 { return Reduce< BitAnd<N>, T, ReduceEnd>( value ); }
187 
188 //----------------------------------------------------------------------
189 // all_reduce( comm , ReduceSum<5>( A ) & ReduceMax<3>( B ) );
190 
191 extern "C" {
192 typedef void (*ParallelReduceOp)
193  ( void * inv , void * outv , int * , ParallelDatatype * );
194 }
195 
196 void all_reduce( ParallelMachine arg_comm ,
197  ParallelReduceOp arg_op ,
198  void * arg_in ,
199  void * arg_out ,
200  unsigned arg_len );
201 
202 namespace {
203 
204 template < class ReduceOp >
205 void all_reduce_driver( ParallelMachine comm , const ReduceOp & op )
206 {
207  typedef typename ReduceOp::WorkType WorkType ;
208 
209  WorkType inbuf , outbuf ;
210 
211  ParallelReduceOp f =
212  reinterpret_cast<ParallelReduceOp>( & ReduceOp::void_op );
213  op.copyin( inbuf );
214  all_reduce( comm , f , & inbuf, & outbuf, sizeof(WorkType) );
215  op.copyout( outbuf );
216 }
217 
218 }
219 
220 template < class ReduceOp >
221 inline
222 void all_reduce( ParallelMachine comm , const ReduceOp & op )
223 { all_reduce_driver<ReduceOp>( comm , op ); }
224 
225 }
226 
227 #endif /* DOXYGEN_COMPILE */
228 
229 //----------------------------------------------------------------------
230 
231 #endif
232 
void all_reduce_bor(ParallelMachine comm, const unsigned *local, unsigned *global, unsigned count)
Parallel bitwise-or to all processors.
void all_reduce_sum(ParallelMachine comm, const double *local, double *global, unsigned count)
Parallel summation to all processors.
std::ostream & out()
Normal output stream.
Definition: OutputLog.cpp:658
Reduce< Sum, T * > * ReduceSum(T *t, T *u, size_t length)
Member function ReduceSum ...
Definition: MPI.hpp:789
Reduce< Prod, T * > * ReduceProd(T *t, T *u, size_t length)
Member function ReduceProd ...
Definition: MPI.hpp:805
Sierra Toolkit.
void all_write_string(ParallelMachine arg_comm, std::ostream &arg_root_os, const std::string &arg_msg)
Write string from any or all processors to the ostream on the root processor.
MPI_Comm ParallelMachine
Definition: Parallel.hpp:32
Reduce< Max, T * > * ReduceMax(T *t, T *u, size_t length)
Member function ReduceMax ...
Definition: MPI.hpp:821
MPI_Datatype ParallelDatatype
Definition: Parallel.hpp:36
Reduce< Min, T * > * ReduceMin(T *t, T *u, size_t length)
Member function ReduceMin ...
Definition: MPI.hpp:837