// -*- C++ -*-
// ACL:license
// ----------------------------------------------------------------------
// This software and ancillary information (herein called "SOFTWARE")
// called POOMA (Parallel Object-Oriented Methods and Applications) is
// made available under the terms described here.  The SOFTWARE has been
// approved for release with associated LA-CC Number LA-CC-98-65.
// 
// Unless otherwise indicated, this SOFTWARE has been authored by an
// employee or employees of the University of California, operator of the
// Los Alamos National Laboratory under Contract No. W-7405-ENG-36 with
// the U.S. Department of Energy.  The U.S. Government has rights to use,
// reproduce, and distribute this SOFTWARE. The public may copy, distribute,
// prepare derivative works and publicly display this SOFTWARE without 
// charge, provided that this Notice and any statement of authorship are 
// reproduced on all copies.  Neither the Government nor the University 
// makes any warranty, express or implied, or assumes any liability or 
// responsibility for the use of this SOFTWARE.
// 
// If SOFTWARE is modified to produce derivative works, such modified
// SOFTWARE should be clearly marked, so as not to confuse it with the
// version available from LANL.
// 
// For more information about POOMA, send e-mail to pooma@acl.lanl.gov,
// or visit the POOMA web page at http://www.acl.lanl.gov/pooma/.
// ----------------------------------------------------------------------
// ACL:license

//-----------------------------------------------------------------------------
// Classes: 
//   Reduction base template
//   Reduction<MainEvaluatorTag>
//   Reduction<SinglePatchEvaluatorTag>
//   Reduction<MultiPatchEvaluatorTag>
//----------------------------------------------------------------------------

#ifndef POOMA_EVALUATOR_REDUCTION_H
#define POOMA_EVALUATOR_REDUCTION_H

//-----------------------------------------------------------------------------
// Overview:
//
// Reduction performs global reductions on expressions by examining the 
// engines that are participating in the expression and dispatching to custom 
// code.
//-----------------------------------------------------------------------------

//-----------------------------------------------------------------------------
// Includes:
//-----------------------------------------------------------------------------

#include "Domain/NullDomain.h"
#include "Engine/Intersector.h"
#include "Engine/IntersectEngine.h"
#include "Evaluator/ReductionKernel.h"
#include "Evaluator/EvaluatorTags.h"
#include "Threads/PoomaCSem.h"

#include <vector>
#include <iterator>

//-----------------------------------------------------------------------------
//
// Full Description:
//
// The point of this class is to input an expression with the
// 'evaluate' member function and reduce it by breaking it up into
// appropriate sub-blocks, looping over the whole domain, and
// evaluating the expression at each point.
//
//-----------------------------------------------------------------------------

// Primary template. It is deliberately empty: the evaluate() member
// functions are supplied by the specializations on particular evaluator
// tags, so instantiating Reduction with a tag that has no specialization
// yields a class with no evaluate() to call.
template <class EvalTag>
struct Reduction
{ };


//-----------------------------------------------------------------------------
// Main Reduction:
//
// This reduction is the one that gets called for a data-parallel expression.
// It just determines the appropriate reduction from the type of the
// expression being reduced.  We don't need to do a blockAndEvaluate() because
// all reductions naturally involve some sort of blocking using counting
// semaphores.  This approach is superior to blockAndEvaluate() because
// iterates not related to the reduction can continue to execute out-of-order.
//-----------------------------------------------------------------------------

template <>
struct Reduction<MainEvaluatorTag>
{
  //---------------------------------------------------------------------------
  // Default ctor.

  Reduction() { }

  //---------------------------------------------------------------------------
  // Destructor

  ~Reduction() { }

  //---------------------------------------------------------------------------
  // Input an expression and cause it to be reduced.
  // We just pass the buck to a special reduction.

  template<class T, class Op, class Expr>
  void evaluate(T &ret, const Op &op, const Expr &e) const
  {
    typedef typename EvaluatorTag1<Expr>::Evaluator_t Evaluator_t;
    Reduction<Evaluator_t>().evaluate(ret, op, e());
    
    POOMA_INCREMENT_STATISTIC(NumReductions)
  }
};


//-----------------------------------------------------------------------------
// Single-patch Reduction:
//
// The single patch version just passes the tag on to generate
// a reduction kernel.
//-----------------------------------------------------------------------------

template <>
struct Reduction<SinglePatchEvaluatorTag>
{
  //---------------------------------------------------------------------------
  // Default ctor.

  Reduction() { }

  //---------------------------------------------------------------------------
  // Destructor

  ~Reduction() { }

  //---------------------------------------------------------------------------
  // Input an expression and cause it to be reduced.
  // We just pass the buck to a special reduction.

  // Include versions expecting and not expecting counting semaphores.
  
  template<class T, class Op, class Expr>
  void evaluate(T &ret, const Op &op, const Expr &e,
		Pooma::CountingSemaphore &csem) const
  {
    typedef typename KernelTag1<Expr>::Kernel_t Kernel_t;

#if POOMA_REORDER_ITERATES
    Pooma::Iterate_t *iterate = 
      new ReductionKernel<T, Op, Expr, Kernel_t>(ret, op, e, csem);
    Pooma::scheduler().handOff(iterate);
#else
    ReductionEvaluator<Kernel_t>::evaluate(ret, op, e);
    csem.incr();
#endif
  }
  
  template<class T, class Op, class Expr>
  void evaluate(T &ret, const Op &op, const Expr &e) const
  {
    Pooma::CountingSemaphore csem;
    csem.height(1);

    Pooma::scheduler().beginGeneration();

    evaluate(ret, op, e, csem);

    Pooma::scheduler().endGeneration();

    csem.wait();
  }
};


//-----------------------------------------------------------------------------
// Multiple-patch Reduction:
//
// The multiple patch version makes patches and sends them out to
// the single patch reduction.
//-----------------------------------------------------------------------------

template <>
struct Reduction<MultiPatchEvaluatorTag>
{
  //---------------------------------------------------------------------------
  // Default ctor.

  Reduction() { }

  //---------------------------------------------------------------------------
  // Destructor

  ~Reduction() { }

  //---------------------------------------------------------------------------
  // Input an expression and cause it to be reduced according to the 
  // computational scheme:
  //   1. Perform the intersection calculation to deduce the patches that 
  //      computation will proceed on.
  //   2. Construct a counting sempahore with a height equal to the number 
  //      of patches.
  //   3. Construct a vector vals to hold the results from each patch 
  //      reduction.
  //   4. For each patch, take a view over the patch and do a reduction of 
  //      the resulting array. Increment the semaphore and store the result in 
  //      the appropriate slot of the vals vector.
  //   5. Wait for all reductions to finish.
  //   6. Finish by doing an immediate reduction of the vals array.

  template<class T, class Op, class Expr>
  void evaluate(T &ret, const Op &op, const Expr &e) const
  {
    typedef Intersector<Expr::dimensions> Inter_t;
    Inter_t inter;
    EngineApply<IntersectorTag<Inter_t> > tag(inter);  
    forEach(e, tag, NullDomain());
  
    const int n = std::distance(inter.begin(), inter.end());
    Pooma::CountingSemaphore csem;
    csem.height(n);
    T *vals = new T[n];

    Pooma::scheduler().beginGeneration();
    
    Inter_t::const_iterator i = inter.begin();
    int j = 0;
    while (j < n)
      {
        Reduction<SinglePatchEvaluatorTag>().
          evaluate(vals[j], op, e(*i), csem);
        ++i; ++j;
      }

    Pooma::scheduler().endGeneration();

    csem.wait();

    ret = vals[0];
    for (j = 1; j < n; j++)
      op(ret, vals[j]);
    delete [] vals;
  }
};


#endif // POOMA_EVALUATOR_REDUCTION_H

// ACL:rcsinfo
// ----------------------------------------------------------------------
// $RCSfile: Reduction.h,v $   $Author: mitchell $
// $Revision: 1.8 $   $Date: 2000/07/18 05:32:35 $
// ----------------------------------------------------------------------
// ACL:rcsinfo
