This repository has been archived by the owner on Jun 24, 2020. It is now read-only.

Two loops eating nearly all elapsed time #9

Open
Laurae2 opened this issue Nov 7, 2018 · 4 comments

Comments

@Laurae2
Collaborator

Laurae2 commented Nov 7, 2018

Two loops are currently eating most of the CPU time. For a run of approximately 2.2 seconds on a single thread, the profile shows the loops we should focus on.

https://github.com/hcho3/xgboost-fast-hist-perf-lab/blob/master/src/build_hist.cc#L37-L42

[profiler screenshot]

https://github.com/hcho3/xgboost-fast-hist-perf-lab/blob/master/src/build_hist.cc#L57-L62

[profiler screenshot]

Note that in the last screenshot I intentionally inverted the nesting.


First loop details:

    for (int k = 0; k < kUnroll; ++k) {
      for (size_t j = ibegin[k]; j < iend[k]; ++j) {
        const uint32_t bin = gmat.index[j];
        data_[off + bin].Add(stat[k]);
      }
    }

Assembly:

[screenshot]

Instruction details:

[screenshot]


Second loop details:

  #pragma omp parallel for num_threads(nthread) schedule(static)
  for (dmlc::omp_uint bin_id = 0; bin_id < dmlc::omp_uint(nbin); ++bin_id) {
    for (dmlc::omp_uint tid = 0; tid < nthread; ++tid) {
      (*hist)[bin_id].Add(data_[tid * nbin_ + bin_id]);
    }
  }

Assembly:

[screenshot]

Instruction details:

[screenshot]


I still have to check with the Intel compilers to get more details.

@Laurae2
Collaborator Author

Laurae2 commented Nov 7, 2018

I managed to cut down the negative efficiency by about 40% (gcc) or 50% (icc), tested on one run with 36 threads (across NUMA nodes):

  • Untuned version, gcc or icc: 41 seconds
  • Tuned version, gcc: 24 seconds
  • Tuned version, icc: 21 seconds

It's also about 10% to 20% faster for a small number of threads. I'll run on all thread counts tonight to see how performance evolves with the number of threads.

@Laurae2
Collaborator Author

Laurae2 commented Nov 8, 2018

Results from yesterday's runs with the tuning.

Average speed:

[chart]

Smaller lookup:

[chart]

Inverted efficiency:

[chart]

Efficiency:

[chart]

@Laurae2
Collaborator Author

Laurae2 commented Nov 8, 2018

Code with just pragmas added, to mitigate the performance degradation when using many threads:

#include <dmlc/omp.h>
#include <perflab/build_hist.h>
#include <omp.h>
#include <vector>

namespace perflab {

void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
                             const std::vector<size_t>& instance_set,
                             const GHistIndexMatrix& gmat,
                             std::vector<GHistEntry>* hist) {
  std::fill(data_.begin(), data_.end(), GHistEntry());

  constexpr int kUnroll = 8;  // loop unrolling factor
  const auto nthread = static_cast<dmlc::omp_uint>(this->nthread_);
  const size_t nrows = instance_set.size();
  const size_t rest = nrows % kUnroll;

  #pragma omp parallel for num_threads(nthread) schedule(guided)
  for (dmlc::omp_uint i = 0; i < nrows - rest; i += kUnroll) {
    const dmlc::omp_uint tid = omp_get_thread_num();
    const size_t off = tid * nbin_;
    size_t rid[kUnroll];
    size_t ibegin[kUnroll];
    size_t iend[kUnroll];
    GradientPair stat[kUnroll];
    
    // Force ignore outer loop carried dependency as proven by Intel Advisor
    #if defined(__INTEL_COMPILER)
    #  pragma ivdep
    #elif defined(__GNUG__)
    #  pragma GCC ivdep
    #endif
    for (int k = 0; k < kUnroll; ++k) {
      rid[k] = instance_set[i + k];
    }
    
    // Force ignore outer loop carried dependency as proven by Intel Advisor
    #if defined(__INTEL_COMPILER)
    #  pragma ivdep
    #elif defined(__GNUG__)
    #  pragma GCC ivdep
    #endif
    for (int k = 0; k < kUnroll; ++k) {
      ibegin[k] = gmat.row_ptr[rid[k]];
      iend[k] = gmat.row_ptr[rid[k] + 1];
    }
    
    // Force ignore outer loop carried dependency as proven by Intel Advisor
    #if defined(__INTEL_COMPILER)
    #  pragma ivdep
    #elif defined(__GNUG__)
    #  pragma GCC ivdep
    #endif
    for (int k = 0; k < kUnroll; ++k) {
      stat[k] = gpair[rid[k]];
    }
    
    // Force ignore outer loop carried dependency as proven by Intel Advisor
    #if defined(__INTEL_COMPILER)
    #  pragma ivdep
    #  pragma unroll
    #elif defined(__GNUG__)
    #  pragma GCC ivdep
    #  pragma unroll
    #endif
    for (int k = 0; k < kUnroll; ++k) {
      // Very bad inner loop-carried dependency; .Add(stat[k]) should be rewritten from scratch
      for (size_t j = ibegin[k]; j < iend[k]; ++j) {
        const uint32_t bin = gmat.index[j];
        data_[off + bin].Add(stat[k]);
      }
    }
  }

  for (size_t i = nrows - rest; i < nrows; ++i) {
    const size_t rid = instance_set[i];
    const size_t ibegin = gmat.row_ptr[rid];
    const size_t iend = gmat.row_ptr[rid + 1];
    const GradientPair stat = gpair[rid];
    for (size_t j = ibegin; j < iend; ++j) {
      const uint32_t bin = gmat.index[j];
      data_[bin].Add(stat);
    }
  }

  /* reduction */
  const uint32_t nbin = nbin_;
  // Loop carried dependency over threads, as proven by Intel Advisor
  // In its current state, there's no way to vectorize this =(
  #pragma omp parallel for num_threads(nthread) schedule(static)
  for (dmlc::omp_uint bin_id = 0; bin_id < dmlc::omp_uint(nbin); ++bin_id) {
    for (dmlc::omp_uint tid = 0; tid < nthread; ++tid) {
      (*hist)[bin_id].Add(data_[tid * nbin_ + bin_id]);
    }
  }
}

}  // namespace perflab

@Laurae2
Collaborator Author

Laurae2 commented Nov 10, 2018

To optimize later. I recently got down to 0.8 s with 36 threads.
