46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR 51 #include "Kokkos_UnorderedMap.hpp" 55 #include "MueLu_Aggregates_kokkos.hpp" 56 #include "MueLu_AmalgamationFactory.hpp" 57 #include "MueLu_AmalgamationInfo.hpp" 58 #include "MueLu_CoarseMapFactory_kokkos.hpp" 60 #include "MueLu_NullspaceFactory_kokkos.hpp" 61 #include "MueLu_PerfUtils.hpp" 63 #include "MueLu_Utilities_kokkos.hpp" 69 template<
class LocalOrdinal,
class View>
70 class ReduceMaxFunctor{
72 ReduceMaxFunctor(View view) : view_(view) { }
74 KOKKOS_INLINE_FUNCTION
75 void operator()(
const LocalOrdinal &i, LocalOrdinal& vmax)
const {
80 KOKKOS_INLINE_FUNCTION
81 void join (
volatile LocalOrdinal& dst,
const volatile LocalOrdinal& src)
const {
87 KOKKOS_INLINE_FUNCTION
88 void init (LocalOrdinal& dst)
const {
96 template<
class LOType,
class GOType,
class SCType,
class DeviceType,
class NspType,
class aggRowsType,
class maxAggDofSizeType,
class agg2RowMapLOType,
class statusType,
class rowsType,
class rowsAuxType,
class colsAuxType,
class valsAuxType>
97 class LocalQRDecompFunctor {
103 typedef typename DeviceType::execution_space execution_space;
104 typedef Kokkos::ArithTraits<SC> ATS;
105 typedef typename ATS::magnitudeType Magnitude;
107 typedef Kokkos::View<SC**,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
108 typedef Kokkos::View<SC* ,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
115 maxAggDofSizeType maxAggDofSize;
116 agg2RowMapLOType agg2RowMapLO;
117 statusType statusAtomic;
124 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_,
bool doQRStep_) :
128 maxAggDofSize(maxAggDofSize_),
129 agg2RowMapLO(agg2RowMapLO_),
130 statusAtomic(statusAtomic_),
138 KOKKOS_INLINE_FUNCTION
139 void operator() (
const typename Kokkos::TeamPolicy<execution_space>::member_type & thread,
size_t& nnz)
const {
140 auto agg = thread.league_rank();
143 auto aggSize = aggRows(agg+1) - aggRows(agg);
145 const SC one = ATS::one();
146 const SC two = one + one;
147 const SC zero = ATS::zero();
148 const auto zeroM = ATS::magnitude(zero);
151 int n = fineNS.extent(1);
154 Xpetra::global_size_t offset = agg * n;
159 shared_matrix r(thread.team_shmem(), m, n);
160 for (
int j = 0; j < n; j++)
161 for (
int k = 0; k < m; k++)
162 r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
165 for (
int i = 0; i < m; i++) {
166 for (
int j = 0; j < n; j++)
167 printf(
" %5.3lf ", r(i,j));
173 shared_matrix q(thread.team_shmem(), m, m);
175 bool isSingular =
false;
179 for (
int i = 0; i < m; i++) {
180 for (
int j = 0; j < m; j++)
185 for (
int k = 0; k < n; k++) {
187 Magnitude s = zeroM, norm, norm_x;
188 for (
int i = k+1; i < m; i++)
189 s += pow(ATS::magnitude(r(i,k)), 2);
190 norm = sqrt(pow(ATS::magnitude(r(k,k)), 2) + s);
199 norm_x = sqrt(pow(ATS::magnitude(r(k,k)), 2) + s);
200 if (norm_x == zeroM) {
208 for (
int i = k; i < m; i++)
212 for (
int j = k+1; j < n; j++) {
215 for (
int i = k; i < m; i++)
216 si += r(i,k) * r(i,j);
217 for (
int i = k; i < m; i++)
218 r(i,j) -= two*si * r(i,k);
222 for (
int j = k; j < m; j++) {
225 for (
int i = k; i < m; i++)
226 si += r(i,k) * qt(i,j);
227 for (
int i = k; i < m; i++)
228 qt(i,j) -= two*si * r(i,k);
233 for (
int i = k+1; i < m; i++)
239 for (
int i = 0; i < m; i++)
240 for (
int j = 0; j < i; j++) {
248 for (
int j = 0; j < n; j++)
249 for (
int k = 0; k <= j; k++)
250 coarseNS(offset+k,j) = r(k,j);
253 statusAtomic(1) =
true;
288 for (
int j = 0; j < n; j++)
289 for (
int k = 0; k < n; k++)
291 coarseNS(offset+k,j) = r(k,j);
293 coarseNS(offset+k,j) = (k == j ? one : zero);
296 for (
int i = 0; i < m; i++)
297 for (
int j = 0; j < n; j++)
298 q(i,j) = (j == i ? one : zero);
302 for (
int j = 0; j < m; j++) {
303 LO localRow = agg2RowMapLO(aggRows(agg)+j);
304 size_t rowStart = rowsAux(localRow);
306 for (
int k = 0; k < n; k++) {
308 if (q(j,k) != zero) {
309 colsAux(rowStart+lnnz) = offset + k;
310 valsAux(rowStart+lnnz) = q(j,k);
314 rows(localRow+1) = lnnz;
320 for (
int i = 0; i < m; i++) {
321 for (
int j = 0; j < n; j++)
322 printf(
" %5.3lf ", coarseNS(i,j));
327 for (
int i = 0; i < aggSize; i++) {
328 for (
int j = 0; j < aggSize; j++)
329 printf(
" %5.3lf ", q(i,j));
343 for (
int j = 0; j < m; j++) {
344 LO localRow = agg2RowMapLO(aggRows(agg)+j);
345 size_t rowStart = rowsAux(localRow);
347 for (
int k = 0; k < n; k++) {
348 const SC qr_jk = fineNS(localRow,k);
351 colsAux(rowStart+lnnz) = offset + k;
352 valsAux(rowStart+lnnz) = qr_jk;
356 rows(localRow+1) = lnnz;
360 for (
int j = 0; j < n; j++)
361 coarseNS(offset+j,j) = one;
368 size_t team_shmem_size(
int team_size )
const {
370 int m = maxAggDofSize;
371 int n = fineNS.extent(1);
372 return shared_matrix::shmem_size(m, n) +
373 shared_matrix::shmem_size(m, m);
381 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
382 RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList()
const {
383 RCP<ParameterList> validParamList = rcp(
new ParameterList());
385 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) 388 #undef SET_VALID_ENTRY 390 validParamList->set< RCP<const FactoryBase> >(
"A", Teuchos::null,
"Generating factory of the matrix A");
391 validParamList->set< RCP<const FactoryBase> >(
"Aggregates", Teuchos::null,
"Generating factory of the aggregates");
392 validParamList->set< RCP<const FactoryBase> >(
"Nullspace", Teuchos::null,
"Generating factory of the nullspace");
393 validParamList->set< RCP<const FactoryBase> >(
"UnAmalgamationInfo", Teuchos::null,
"Generating factory of UnAmalgamationInfo");
394 validParamList->set< RCP<const FactoryBase> >(
"CoarseMap", Teuchos::null,
"Generating factory of the coarse map");
395 validParamList->set< RCP<const FactoryBase> >(
"Coordinates", Teuchos::null,
"Generating factory of the coordinates");
398 ParameterList norecurse;
399 norecurse.disableRecursiveValidation();
400 validParamList->set<ParameterList> (
"matrixmatrix: kernel params", norecurse,
"MatrixMatrix kernel parameters");
402 return validParamList;
405 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
406 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& coarseLevel)
const {
408 const ParameterList& pL = GetParameterList();
410 Input(fineLevel,
"A");
411 Input(fineLevel,
"Aggregates");
412 Input(fineLevel,
"Nullspace");
413 Input(fineLevel,
"UnAmalgamationInfo");
414 Input(fineLevel,
"CoarseMap");
415 if( fineLevel.GetLevelID() == 0 &&
417 pL.get<
bool>(
"tentative: build coarse coordinates") ) {
418 bTransferCoordinates_ =
true;
419 Input(fineLevel,
"Coordinates");
420 }
else if (bTransferCoordinates_) {
421 Input(fineLevel,
"Coordinates");
425 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
426 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel)
const {
427 return BuildP(fineLevel, coarseLevel);
430 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
431 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel)
const {
432 FactoryMonitor m(*
this,
"Build", coarseLevel);
434 auto A = Get< RCP<Matrix> > (fineLevel,
"A");
435 auto aggregates = Get< RCP<Aggregates_kokkos> >(fineLevel,
"Aggregates");
436 auto amalgInfo = Get< RCP<AmalgamationInfo> > (fineLevel,
"UnAmalgamationInfo");
437 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel,
"Nullspace");
438 auto coarseMap = Get< RCP<const Map> > (fineLevel,
"CoarseMap");
439 RCP<RealValuedMultiVector> fineCoords;
440 if(bTransferCoordinates_) {
441 fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel,
"Coordinates");
444 RCP<Matrix> Ptentative;
445 RCP<MultiVector> coarseNullspace;
446 RCP<RealValuedMultiVector> coarseCoords;
448 if(bTransferCoordinates_) {
449 ArrayView<const GO> elementAList = coarseMap->getNodeElementList();
450 GO indexBase = coarseMap->getIndexBase();
453 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
454 blkSize = rcp_dynamic_cast<
const StridedMap>(coarseMap)->getFixedBlockSize();
456 Array<GO> elementList;
457 ArrayView<const GO> elementListView;
461 elementListView = elementAList;
464 auto numElements = elementAList.size() / blkSize;
466 elementList.resize(numElements);
469 for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
470 elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
472 elementListView = elementList;
475 auto uniqueMap = fineCoords->getMap();
476 auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
477 elementListView, indexBase, coarseMap->getComm());
478 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
481 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
482 if (aggregates->AggregatesCrossProcessors()) {
483 auto nonUniqueMap = aggregates->GetMap();
484 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
486 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
487 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
492 auto aggGraph = aggregates->GetGraph();
493 auto numAggs = aggGraph.numRows();
495 auto fineCoordsView = fineCoords ->template getLocalView<DeviceType>();
496 auto coarseCoordsView = coarseCoords->template getLocalView<DeviceType>();
500 SubFactoryMonitor m2(*
this,
"AverageCoords", coarseLevel);
502 const auto dim = fineCoords->getNumVectors();
504 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
505 for (
size_t j = 0; j < dim; j++) {
506 Kokkos::parallel_for(
"MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
507 KOKKOS_LAMBDA(
const LO i) {
511 auto aggregate = aggGraph.rowConst(i);
514 for (
size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
515 sum += fineCoordsRandomView(aggregate(colID),j);
517 coarseCoordsView(i,j) = sum / aggregate.length;
523 if (!aggregates->AggregatesCrossProcessors())
524 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
526 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
536 if (A->IsView(
"stridedMaps") ==
true)
537 Ptentative->CreateView(
"stridedMaps", A->getRowMap(
"stridedMaps"), coarseMap);
539 Ptentative->CreateView(
"stridedMaps", Ptentative->getRangeMap(), coarseMap);
541 if(bTransferCoordinates_) {
542 Set(coarseLevel,
"Coordinates", coarseCoords);
544 Set(coarseLevel,
"Nullspace", coarseNullspace);
545 Set(coarseLevel,
"P", Ptentative);
548 RCP<ParameterList> params = rcp(
new ParameterList());
549 params->set(
"printLoadBalancingInfo",
true);
554 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
555 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
556 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
557 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace,
const int levelID)
const {
558 auto rowMap = A->getRowMap();
559 auto colMap = A->getColMap();
561 const size_t numRows = rowMap->getNodeNumElements();
562 const size_t NSDim = fineNullspace->getNumVectors();
564 typedef Kokkos::ArithTraits<SC> ATS;
565 typedef typename ATS::magnitudeType Magnitude;
566 const SC zero = ATS::zero(), one = ATS::one();
568 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
570 typename Aggregates_kokkos::local_graph_type aggGraph;
572 SubFactoryMonitor m2(*
this,
"Get Aggregates graph", coarseLevel);
573 aggGraph = aggregates->GetGraph();
575 auto aggRows = aggGraph.row_map;
576 auto aggCols = aggGraph.entries;
583 SubFactoryMonitor m2(*
this,
"Check good map", coarseLevel);
584 goodMap = isGoodMap(*rowMap, *colMap);
587 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
588 "MueLu: TentativePFactory_kokkos: for now works only with good maps " 589 "(i.e. \"matching\" row and column maps)");
598 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
600 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
601 GO globalOffset = amalgInfo->GlobalOffset();
604 auto procWinner = aggregates->GetProcWinner() ->template getLocalView<DeviceType>();
605 auto vertex2AggId = aggregates->GetVertex2AggId()->template getLocalView<DeviceType>();
606 const size_t numAggregates = aggregates->GetNumAggregates();
608 int myPID = aggregates->GetMap()->getComm()->getRank();
613 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
614 AggSizeType aggDofSizes;
616 if (stridedBlockSize == 1) {
617 SubFactoryMonitor m2(*
this,
"Calc AggSizes", coarseLevel);
620 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates+1);
622 auto sizesConst = aggregates->ComputeAggregateSizes();
623 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
626 SubFactoryMonitor m2(*
this,
"Calc AggSizes", coarseLevel);
629 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
631 auto nodeMap = aggregates->GetMap()->getLocalMap();
632 auto dofMap = colMap->getLocalMap();
634 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compute_agg_sizes", range_type(0,numAggregates),
635 KOKKOS_LAMBDA(
const LO agg) {
636 auto aggRowView = aggGraph.rowConst(agg);
639 for (LO colID = 0; colID < aggRowView.length; colID++) {
640 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
642 for (LO k = 0; k < stridedBlockSize; k++) {
643 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
645 if (dofMap.getLocalElement(dofGID) != INVALID)
649 aggDofSizes(agg+1) = size;
656 ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
657 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
661 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
662 KOKKOS_LAMBDA(
const LO i, LO& update,
const bool& final_pass) {
663 update += aggDofSizes(i);
665 aggDofSizes(i) = update;
670 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing(
"agg2row_map_LO"), numRows);
672 SubFactoryMonitor m2(*
this,
"Create Agg2RowMap", coarseLevel);
674 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
675 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
677 Kokkos::parallel_for(
"MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
678 KOKKOS_LAMBDA(
const LO lnode) {
679 if (procWinner(lnode, 0) == myPID) {
681 auto aggID = vertex2AggId(lnode,0);
683 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
687 for (LO k = 0; k < stridedBlockSize; k++)
688 agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
695 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
698 auto fineNS = fineNullspace ->template getLocalView<DeviceType>();
699 auto coarseNS = coarseNullspace->template getLocalView<DeviceType>();
703 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
704 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
705 typedef typename local_matrix_type::index_type::non_const_type cols_type;
706 typedef typename local_matrix_type::values_type::non_const_type vals_type;
710 typedef Kokkos::View<int[10], DeviceType> status_type;
711 status_type status(
"status");
713 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
714 typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
720 const ParameterList& pL = GetParameterList();
721 const bool& doQRStep = pL.get<
bool>(
"tentative: calculate qr");
723 GetOStream(
Runtime1) <<
"TentativePFactory : bypassing local QR phase" << std::endl;
725 GetOStream(
Warnings0) <<
"TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
734 SubFactoryMonitor m2(*
this,
"Stage 1 (LocalQR)", coarseLevel);
739 rows = rows_type(
"Ptent_rows", numRows+1);
740 cols = cols_type(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_cols"), numRows);
741 vals = vals_type(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_vals"), numRows);
747 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
750 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:main_loop", policy,
751 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
752 auto agg = thread.league_rank();
755 LO aggSize = aggRows(agg+1) - aggRows(agg);
760 auto norm = ATS::magnitude(zero);
765 for (decltype(aggSize) k = 0; k < aggSize; k++) {
766 auto dnorm = ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
773 statusAtomic(1) =
true;
778 coarseNS(agg, 0) = norm;
781 for (decltype(aggSize) k = 0; k < aggSize; k++) {
782 LO localRow = agg2RowMapLO(aggRows(agg)+k);
783 SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
785 rows(localRow+1) = localRow+1;
786 cols(localRow) = agg;
787 vals(localRow) = localVal;
792 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
793 Kokkos::deep_copy(statusHost, status);
794 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
796 std::ostringstream oss;
797 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
799 case 0: oss <<
"!goodMap is not implemented";
break;
800 case 1: oss <<
"fine level NS part has a zero column";
break;
802 throw Exceptions::RuntimeError(oss.str());
806 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
807 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
808 auto agg = thread.league_rank();
811 LO aggSize = aggRows(agg+1) - aggRows(agg);
814 coarseNS(agg, 0) = one;
817 for (decltype(aggSize) k = 0; k < aggSize; k++) {
818 LO localRow = agg2RowMapLO(aggRows(agg)+k);
819 SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
821 rows(localRow+1) = localRow+1;
822 cols(localRow) = agg;
823 vals(localRow) = localVal;
839 size_t nnzEstimate = numRows * NSDim;
840 rows_type rowsAux(
"Ptent_aux_rows", numRows+1);
841 cols_type colsAux(
"Ptent_aux_cols", nnzEstimate);
842 vals_type valsAux(
"Ptent_aux_vals", nnzEstimate);
843 rows = rows_type(
"Ptent_rows", numRows+1);
846 SubFactoryMonitor m2(*
this,
"Stage 0 (InitViews)", coarseLevel);
850 Kokkos::parallel_for(
"MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows+1),
851 KOKKOS_LAMBDA(
const LO row) {
852 rowsAux(row) = row*NSDim;
854 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
855 KOKKOS_LAMBDA(
const LO j) {
856 colsAux(j) = INVALID;
862 SubFactoryMonitor m2 = SubFactoryMonitor(*
this, doQRStep ?
"Stage 1 (LocalQR)" :
"Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
866 const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1);
867 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
868 decltype(aggDofSizes ), decltype(maxAggSize), decltype(agg2RowMapLO),
869 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
871 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
872 rows, rowsAux, colsAux, valsAux, doQRStep);
873 Kokkos::parallel_reduce(
"MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
876 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
877 Kokkos::deep_copy(statusHost, status);
878 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
880 std::ostringstream oss;
881 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
883 case 0: oss <<
"!goodMap is not implemented";
break;
884 case 1: oss <<
"fine level NS part has a zero column";
break;
886 throw Exceptions::RuntimeError(oss.str());
893 cols = decltype(cols)(
"Ptent_cols", nnz);
894 vals = decltype(vals)(
"Ptent_vals", nnz);
897 SubFactoryMonitor m2(*
this,
"Stage 2 (CompressRows)", coarseLevel);
899 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
900 KOKKOS_LAMBDA(
const LO i, LO& upd,
const bool&
final) {
908 SubFactoryMonitor m2(*
this,
"Stage 2 (CompressCols)", coarseLevel);
913 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows),
914 KOKKOS_LAMBDA(
const LO i) {
915 LO rowStart = rows(i);
918 for (
auto j = rowsAux(i); j < rowsAux(i+1); j++)
919 if (colsAux(j) != INVALID) {
920 cols(rowStart+lnnz) = colsAux(j);
921 vals(rowStart+lnnz) = valsAux(j);
928 GetOStream(
Runtime1) <<
"TentativePFactory : aggregates do not cross process boundaries" << std::endl;
932 SubFactoryMonitor m2(*
this,
"Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
934 local_matrix_type lclMatrix = local_matrix_type(
"A", numRows, coarseMap->getNodeNumElements(), nnz, vals, rows, cols);
937 RCP<ParameterList> FCparams;
938 if (pL.isSublist(
"matrixmatrix: kernel params"))
939 FCparams = rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
941 FCparams = rcp(
new ParameterList);
944 FCparams->set(
"compute global constants", FCparams->get(
"compute global constants",
false));
945 FCparams->set(
"Timer Label", std::string(
"MueLu::TentativeP-") +
toString(levelID));
947 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
948 Ptentative = rcp(
new CrsMatrixWrap(PtentCrs));
952 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
953 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
954 BuildPcoupled(RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
955 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace)
const {
956 throw Exceptions::RuntimeError(
"MueLu: Construction of coupled tentative P is not implemented");
959 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
960 bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
961 isGoodMap(
const Map& rowMap,
const Map& colMap)
const {
962 auto rowLocalMap = rowMap.getLocalMap();
963 auto colLocalMap = colMap.getLocalMap();
965 const size_t numRows = rowLocalMap.getNodeNumElements();
966 const size_t numCols = colLocalMap.getNodeNumElements();
968 if (numCols < numRows)
972 Kokkos::parallel_reduce(
"MueLu:TentativePF:isGoodMap", range_type(0, numRows),
973 KOKKOS_LAMBDA(
const LO i,
size_t &diff) {
974 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
977 return (numDiff == 0);
982 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT 983 #endif // HAVE_MUELU_KOKKOS_REFACTOR 984 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP Important warning messages (one line)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
Namespace for MueLu classes and methods.
static const NoFactory * get()
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Description of what is happening (more verbose)
#define SET_VALID_ENTRY(name)