MueLu  Version of the Day
MueLu_AggregationPhase1Algorithm_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_AGGREGATIONPHASE1ALGORITHM_KOKKOS_DEF_HPP
47 #define MUELU_AGGREGATIONPHASE1ALGORITHM_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 
51 #include <queue>
52 #include <vector>
53 
54 #include <Teuchos_Comm.hpp>
55 #include <Teuchos_CommHelpers.hpp>
56 
57 #include <Xpetra_Vector.hpp>
58 
60 
61 #include "MueLu_Aggregates_kokkos.hpp"
62 #include "MueLu_Exceptions.hpp"
63 #include "MueLu_LWGraph_kokkos.hpp"
64 #include "MueLu_Monitor.hpp"
65 
66 #include "KokkosGraph_graph_color.hpp"
67 
68 namespace MueLu {
69 
70  template <class LocalOrdinal, class GlobalOrdinal, class Node>
71  void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::
72  BuildAggregates(const ParameterList& params, const LWGraph_kokkos& graph, Aggregates_kokkos& aggregates, std::vector<unsigned>& aggStat,
73  LO& numNonAggregatedNodes) const {
74  Monitor m(*this, "BuildAggregates");
75 
76  std::string orderingStr = params.get<std::string>("aggregation: ordering");
77  int maxNeighAlreadySelected = params.get<int> ("aggregation: max selected neighbors");
78  int minNodesPerAggregate = params.get<int> ("aggregation: min agg size");
79  int maxNodesPerAggregate = params.get<int> ("aggregation: max agg size");
80 
81  Algorithm algorithm = Algorithm::Serial;
82  std::string algoParamName = "aggregation: phase 1 algorithm";
83  if(params.isParameter(algoParamName))
84  {
85  algorithm = algorithmFromName(params.get<std::string>("aggregation: phase 1 algorithm"));
86  }
87 
88  TEUCHOS_TEST_FOR_EXCEPTION(maxNodesPerAggregate < minNodesPerAggregate, Exceptions::RuntimeError,
89  "MueLu::UncoupledAggregationAlgorithm::BuildAggregates: minNodesPerAggregate must be smaller or equal to MaxNodePerAggregate!");
90 
91  //Distance-2 gives less control than serial uncoupled phase 1
92  //no custom row reordering because would require making deep copy of local matrix entries and permuting it
93  //can only enforce max aggregate size
94  if(algorithm == Algorithm::Distance2)
95  {
96  BuildAggregatesDistance2(graph, aggregates, aggStat, numNonAggregatedNodes, maxNodesPerAggregate);
97  }
98  else
99  {
100  BuildAggregatesSerial(graph, aggregates, aggStat, numNonAggregatedNodes,
101  minNodesPerAggregate, maxNodesPerAggregate, maxNeighAlreadySelected, orderingStr);
102  }
103  }
104 
105 
106  template <class LocalOrdinal, class GlobalOrdinal, class Node>
107  void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::
108  BuildAggregatesSerial(const LWGraph_kokkos& graph, Aggregates_kokkos& aggregates,
109  std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes,
110  LO minNodesPerAggregate, LO maxNodesPerAggregate,
111  LO maxNeighAlreadySelected, std::string& orderingStr) const
112  {
113  enum {
114  O_NATURAL,
115  O_RANDOM,
116  O_GRAPH
117  } ordering;
118 
119  ordering = O_NATURAL; // initialize variable (fix CID 143665)
120  if (orderingStr == "natural") ordering = O_NATURAL;
121  if (orderingStr == "random" ) ordering = O_RANDOM;
122  if (orderingStr == "graph" ) ordering = O_GRAPH;
123 
124  const LO numRows = graph.GetNodeNumVertices();
125  const int myRank = graph.GetComm()->getRank();
126 
127  ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
128  ArrayRCP<LO> procWinner = aggregates.GetProcWinner() ->getDataNonConst(0);
129 
130  LO numLocalAggregates = aggregates.GetNumAggregates();
131 
132  ArrayRCP<LO> randomVector;
133  if (ordering == O_RANDOM) {
134  randomVector = arcp<LO>(numRows);
135  for (LO i = 0; i < numRows; i++)
136  randomVector[i] = i;
137  RandomReorder(randomVector);
138  }
139 
140  int aggIndex = -1;
141  size_t aggSize = 0;
142  std::vector<int> aggList(graph.getNodeMaxNumRowEntries());
143 
144  std::queue<LO> graphOrderQueue;
145 
146  // Main loop over all local rows of graph(A)
147  for (LO i = 0; i < numRows; i++) {
148  // Step 1: pick the next node to aggregate
149  LO rootCandidate = 0;
150  if (ordering == O_NATURAL) rootCandidate = i;
151  else if (ordering == O_RANDOM) rootCandidate = randomVector[i];
152  else if (ordering == O_GRAPH) {
153 
154  if (graphOrderQueue.size() == 0) {
155  // Current queue is empty for "graph" ordering, populate with one READY node
156  for (LO jnode = 0; jnode < numRows; jnode++)
157  if (aggStat[jnode] == READY) {
158  graphOrderQueue.push(jnode);
159  break;
160  }
161  }
162  if (graphOrderQueue.size() == 0) {
163  // There are no more ready nodes, end the phase
164  break;
165  }
166  rootCandidate = graphOrderQueue.front(); // take next node from graph ordering queue
167  graphOrderQueue.pop(); // delete this node in list
168  }
169 
170  if (aggStat[rootCandidate] != READY)
171  continue;
172 
173  // Step 2: build tentative aggregate
174  aggSize = 0;
175  aggList[aggSize++] = rootCandidate;
176 
177  auto neighOfINode = graph.getNeighborVertices(rootCandidate);
178 
179  // If the number of neighbors is less than the minimum number of nodes
180  // per aggregate, we know this is not going to be a valid root, and we
181  // may skip it, but only for "natural" and "random" (for "graph" we still
182  // need to fetch the list of local neighbors to continue)
183  if ((ordering == O_NATURAL || ordering == O_RANDOM) &&
184  as<int>(neighOfINode.length) < minNodesPerAggregate) {
185  continue;
186  }
187 
188  LO numAggregatedNeighbours = 0;
189 
190  for (int j = 0; j < as<int>(neighOfINode.length); j++) {
191  LO neigh = neighOfINode(j);
192 
193  if (neigh != rootCandidate && graph.isLocalNeighborVertex(neigh)) {
194 
195  if (aggStat[neigh] == READY || aggStat[neigh] == NOTSEL) {
196  // If aggregate size does not exceed max size, add node to the
197  // tentative aggregate
198  // NOTE: We do not exit the loop over all neighbours since we have
199  // still to count all aggregated neighbour nodes for the
200  // aggregation criteria
201  // NOTE: We check here for the maximum aggregation size. If we
202  // would do it below with all the other check too big aggregates
203  // would not be accepted at all.
204  if (aggSize < as<size_t>(maxNodesPerAggregate))
205  aggList[aggSize++] = neigh;
206 
207  } else {
208  numAggregatedNeighbours++;
209  }
210  }
211  }
212 
213  // Step 3: check if tentative aggregate is acceptable
214  if ((numAggregatedNeighbours <= maxNeighAlreadySelected) && // too many connections to other aggregates
215  (aggSize >= as<size_t>(minNodesPerAggregate))) { // too few nodes in the tentative aggregate
216  // Accept new aggregate
217  // rootCandidate becomes the root of the newly formed aggregate
218  aggregates.SetIsRoot(rootCandidate);
219  aggIndex = numLocalAggregates++;
220 
221  for (size_t k = 0; k < aggSize; k++) {
222  aggStat [aggList[k]] = AGGREGATED;
223  vertex2AggId[aggList[k]] = aggIndex;
224  procWinner [aggList[k]] = myRank;
225  }
226 
227  numNonAggregatedNodes -= aggSize;
228 
229  } else {
230  // Aggregate is not accepted
231  aggStat[rootCandidate] = NOTSEL;
232 
233  // Need this for the "graph" ordering below
234  // The original candidate is always aggList[0]
235  aggSize = 1;
236  }
237 
238  if (ordering == O_GRAPH) {
239  // Add candidates to the list of nodes
240  // NOTE: the code have slightly different meanings depending on context:
241  // - if aggregate was accepted, we add neighbors of neighbors of the original candidate
242  // - if aggregate was not accepted, we add neighbors of the original candidate
243  for (size_t k = 0; k < aggSize; k++) {
244  auto neighOfJNode = graph.getNeighborVertices(aggList[k]);
245 
246  for (int j = 0; j < as<int>(neighOfJNode.length); j++) {
247  LO neigh = neighOfJNode(j);
248 
249  if (graph.isLocalNeighborVertex(neigh) && aggStat[neigh] == READY)
250  graphOrderQueue.push(neigh);
251  }
252  }
253  }
254  }
255 
256  // Reset all NOTSEL vertices to READY
257  // This simplifies other algorithms
258  for (LO i = 0; i < numRows; i++)
259  if (aggStat[i] == NOTSEL)
260  aggStat[i] = READY;
261 
262  // update aggregate object
263  aggregates.SetNumAggregates(numLocalAggregates);
264  }
265 
266  template <class LocalOrdinal, class GlobalOrdinal, class Node>
267  void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::
268  BuildAggregatesDistance2(const LWGraph_kokkos& graph, Aggregates_kokkos& aggregates,
269  std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes, LO maxAggSize) const
270  {
271  const LO numRows = graph.GetNodeNumVertices();
272  const int myRank = graph.GetComm()->getRank();
273 
274  ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
275  ArrayRCP<LO> procWinner = aggregates.GetProcWinner() ->getDataNonConst(0);
276 
277  LO numLocalAggregates = aggregates.GetNumAggregates();
278 
279  //get the sparse local graph in CRS
280  std::vector<LocalOrdinal> rowptrs;
281  rowptrs.reserve(numRows + 1);
282  std::vector<LocalOrdinal> colinds;
283  colinds.reserve(graph.GetNodeNumEdges());
284 
285  rowptrs.push_back(0);
286  for(LocalOrdinal row = 0; row < numRows; row++)
287  {
288  auto entries = graph.getNeighborVertices(row);
289  for(LocalOrdinal i = 0; i < entries.length; i++)
290  {
291  colinds.push_back(entries.colidx(i));
292  }
293  rowptrs.push_back(colinds.size());
294  }
295 
296  //the local CRS graph to Kokkos device views, then compute graph squared
297  typedef typename Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_type graph_t;
298  typedef typename graph_t::device_type device_t;
299  typedef typename device_t::memory_space memory_space;
300  typedef typename device_t::execution_space execution_space;
301  typedef typename graph_t::row_map_type::non_const_type rowptrs_view;
302  typedef typename rowptrs_view::HostMirror host_rowptrs_view;
303  typedef typename graph_t::entries_type::non_const_type colinds_view;
304  typedef typename colinds_view::HostMirror host_colinds_view;
305  //note: just using colinds_view in place of scalar_view_t type (it won't be used at all by symbolic SPGEMM)
306  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
307  typename rowptrs_view::const_value_type, typename colinds_view::const_value_type, typename colinds_view::const_value_type,
308  execution_space, memory_space, memory_space> KernelHandle;
309 
310  KernelHandle kh;
311  //leave gc algorithm choice as the default
312  kh.create_graph_coloring_handle();
313 
314  //Create device views for graph rowptrs/colinds
315  rowptrs_view aRowptrs("A device rowptrs", rowptrs.size());
316  colinds_view aColinds("A device colinds", colinds.size());
317  // Populate A in temporary host views, then copy to device
318  {
319  host_rowptrs_view aHostRowptrs = Kokkos::create_mirror_view(aRowptrs);
320  for(size_t i = 0; i < rowptrs.size(); i++)
321  {
322  aHostRowptrs(i) = rowptrs[i];
323  }
324  Kokkos::deep_copy(aRowptrs, aHostRowptrs);
325  host_colinds_view aHostColinds = Kokkos::create_mirror_view(aColinds);
326  for(size_t i = 0; i < colinds.size(); i++)
327  {
328  aHostColinds(i) = colinds[i];
329  }
330  Kokkos::deep_copy(aColinds, aHostColinds);
331  }
332  //run d2 graph coloring
333  //graph is symmetric so row map/entries and col map/entries are the same
334  KokkosGraph::Experimental::d2_graph_color(&kh, numRows, numRows, aRowptrs, aColinds, aRowptrs, aColinds);
335 
336  // extract the colors
337  auto coloringHandle = kh.get_graph_coloring_handle();
338  auto colorsDevice = coloringHandle->get_vertex_colors();
339 
340  auto colors = Kokkos::create_mirror_view(colorsDevice);
341  Kokkos::deep_copy(colors, colorsDevice);
342 
343  //clean up coloring handle
344  kh.destroy_graph_coloring_handle();
345 
346  //have color 1 (first color) be the aggregate roots (add those to mapping first)
347  LocalOrdinal aggCount = 0;
348  for(LocalOrdinal i = 0; i < numRows; i++)
349  {
350  if(colors(i) == 1 && aggStat[i] == READY)
351  {
352  vertex2AggId[i] = aggCount++;
353  aggStat[i] = AGGREGATED;
354  numLocalAggregates++;
355  procWinner[i] = myRank;
356  }
357  }
358  numNonAggregatedNodes = 0;
359  std::vector<LocalOrdinal> aggSizes(numLocalAggregates, 0);
360  for(int i = 0; i < numRows; i++)
361  {
362  if(vertex2AggId[i] >= 0)
363  aggSizes[vertex2AggId[i]]++;
364  }
365  //now assign every READY vertex to a directly connected root
366  for(LocalOrdinal i = 0; i < numRows; i++)
367  {
368  if(colors(i) != 1 && (aggStat[i] == READY || aggStat[i] == NOTSEL))
369  {
370  //get neighbors of vertex i and
371  //look for local, aggregated, color 1 neighbor (valid root)
372  auto neighbors = graph.getNeighborVertices(i);
373  for(LocalOrdinal j = 0; j < neighbors.length; j++)
374  {
375  auto nei = neighbors.colidx(j);
376  LocalOrdinal agg = vertex2AggId[nei];
377  if(graph.isLocalNeighborVertex(nei) && colors(nei) == 1 && aggStat[nei] == AGGREGATED && aggSizes[agg] < maxAggSize)
378  {
379  //assign vertex i to aggregate with root j
380  vertex2AggId[i] = agg;
381  aggSizes[agg]++;
382  aggStat[i] = AGGREGATED;
383  procWinner[i] = myRank;
384  break;
385  }
386  }
387  }
388  if(aggStat[i] != AGGREGATED)
389  {
390  numNonAggregatedNodes++;
391  if(aggStat[i] == NOTSEL)
392  aggStat[i] = READY;
393  }
394  }
395  // update aggregate object
396  aggregates.SetNumAggregates(numLocalAggregates);
397  }
398 
399  template <class LocalOrdinal, class GlobalOrdinal, class Node>
400  void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::RandomReorder(ArrayRCP<LO> list) const {
401  //TODO: replace int
402  int n = list.size();
403  for(int i = 0; i < n-1; i++)
404  std::swap(list[i], list[RandomOrdinal(i,n-1)]);
405  }
406 
407  template <class LocalOrdinal, class GlobalOrdinal, class Node>
408  int AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::RandomOrdinal(int min, int max) const {
409  return min + as<int>((max-min+1) * (static_cast<double>(std::rand()) / (RAND_MAX + 1.0)));
410  }
411 
412 } // end namespace
413 
414 #endif // HAVE_MUELU_KOKKOS_REFACTOR
415 #endif // MUELU_AGGREGATIONPHASE1ALGORITHM_KOKKOS_DEF_HPP
416 
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
LocalOrdinal LO
Namespace for MueLu classes and methods.
void deep_copy(const View< DT, DP... > &dst, typename ViewTraits< DT, DP... >::const_value_type &value, typename std::enable_if< std::is_same< typename ViewTraits< DT, DP... >::specialize, void >::value >::type *=0)