Two loops eating nearly all elapsed time #9
I managed to cut the efficiency loss by about 40% (gcc) or 50% (icc), tested with 36 threads (across NUMA nodes) on one run:
It's even 10% to 20% faster for a small number of threads. I'll run tonight across all thread counts to see how the timing evolves with the number of threads.
Code with only pragmas added, to mitigate the performance penalty of using many threads:

#include <dmlc/omp.h>
#include <perflab/build_hist.h>
#include <omp.h>
#include <vector>
namespace perflab {
void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const std::vector<size_t>& instance_set,
const GHistIndexMatrix& gmat,
std::vector<GHistEntry>* hist) {
std::fill(data_.begin(), data_.end(), GHistEntry());
constexpr int kUnroll = 8; // loop unrolling factor
const auto nthread = static_cast<dmlc::omp_uint>(this->nthread_);
const size_t nrows = instance_set.size();
const size_t rest = nrows % kUnroll;
#pragma omp parallel for num_threads(nthread) schedule(guided)
for (dmlc::omp_uint i = 0; i < nrows - rest; i += kUnroll) {
const dmlc::omp_uint tid = omp_get_thread_num();
const size_t off = tid * nbin_;
size_t rid[kUnroll];
size_t ibegin[kUnroll];
size_t iend[kUnroll];
GradientPair stat[kUnroll];
// Intel Advisor shows no real loop-carried dependency across k; tell the compiler to ignore assumed dependencies
#if defined(__INTEL_COMPILER)
# pragma ivdep
#elif defined(__GNUG__)
# pragma GCC ivdep
#endif
for (int k = 0; k < kUnroll; ++k) {
rid[k] = instance_set[i + k];
}
// Intel Advisor shows no real loop-carried dependency across k; tell the compiler to ignore assumed dependencies
#if defined(__INTEL_COMPILER)
# pragma ivdep
#elif defined(__GNUG__)
# pragma GCC ivdep
#endif
for (int k = 0; k < kUnroll; ++k) {
ibegin[k] = gmat.row_ptr[rid[k]];
iend[k] = gmat.row_ptr[rid[k] + 1];
}
// Intel Advisor shows no real loop-carried dependency across k; tell the compiler to ignore assumed dependencies
#if defined(__INTEL_COMPILER)
# pragma ivdep
#elif defined(__GNUG__)
# pragma GCC ivdep
#endif
for (int k = 0; k < kUnroll; ++k) {
stat[k] = gpair[rid[k]];
}
// Intel Advisor shows no real loop-carried dependency across k; tell the compiler to ignore assumed dependencies
#if defined(__INTEL_COMPILER)
# pragma ivdep
# pragma unroll
#elif defined(__GNUG__)
# pragma GCC ivdep
# pragma GCC unroll 8  // GCC spells it "GCC unroll <n>"; a bare "#pragma unroll" is not recognized
#endif
for (int k = 0; k < kUnroll; ++k) {
// Very bad loop-carried dependency: successive j can hit the same bin, so the read-modify-write in .Add(stat[k]) serializes the loop; it should be rewritten from scratch
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
const uint32_t bin = gmat.index[j];
data_[off + bin].Add(stat[k]);
}
}
}
// Handle the remainder rows serially (they accumulate into thread 0's segment of data_)
for (size_t i = nrows - rest; i < nrows; ++i) {
const size_t rid = instance_set[i];
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
const GradientPair stat = gpair[rid];
for (size_t j = ibegin; j < iend; ++j) {
const uint32_t bin = gmat.index[j];
data_[bin].Add(stat);
}
}
/* reduction: merge the per-thread partial histograms */
const uint32_t nbin = nbin_;
// Loop-carried dependency, as shown by Intel Advisor
// In its current state, there's no way to vectorize this =(
#pragma omp parallel for num_threads(nthread) schedule(static)
for (dmlc::omp_uint bin_id = 0; bin_id < dmlc::omp_uint(nbin); ++bin_id) {
  // Each bin must sum the partial results of every thread, not just
  // whichever thread happens to process this bin_id in the parallel loop
  for (dmlc::omp_uint tid = 0; tid < nthread; ++tid) {
    (*hist)[bin_id].Add(data_[tid * nbin_ + bin_id]);
  }
}
}
} // namespace perflab
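Regarding the "rewrite .Add(stat[k]) from scratch" note in the code: one direction worth trying is to store the per-thread histograms as a flat array of doubles instead of GHistEntry objects, so the hot loops become plain indexed adds that the vectorizer can reason about. A minimal sketch of the two affected fragments of BuildHist, assuming GHistEntry carries a (grad, hess) pair, that GradientPair exposes GetGrad()/GetHess(), and that an Add(grad, hess) overload exists or is added (all of these are assumptions about the lab's API, not verbatim repo code):

// Hypothetical flat storage: 2 doubles per bin per thread,
// laid out as [grad(bin0), hess(bin0), grad(bin1), hess(bin1), ...].
std::vector<double> data_flat(2 * static_cast<size_t>(nbin_) * nthread, 0.0);

// Accumulation fragment: replaces data_[off + bin].Add(stat[k]).
const size_t off2 = static_cast<size_t>(tid) * 2 * nbin_;
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
  const uint32_t bin = gmat.index[j];
  data_flat[off2 + 2 * bin]     += stat[k].GetGrad();
  data_flat[off2 + 2 * bin + 1] += stat[k].GetHess();
}

// Reduction fragment: a straight sum of doubles over threads, which
// vectorizes cleanly, unlike the object-wise Add above.
#pragma omp parallel for num_threads(nthread) schedule(static)
for (dmlc::omp_uint bin_id = 0; bin_id < dmlc::omp_uint(nbin_); ++bin_id) {
  double g = 0.0, h = 0.0;
  for (dmlc::omp_uint t = 0; t < nthread; ++t) {
    g += data_flat[static_cast<size_t>(t) * 2 * nbin_ + 2 * bin_id];
    h += data_flat[static_cast<size_t>(t) * 2 * nbin_ + 2 * bin_id + 1];
  }
  (*hist)[bin_id].Add(g, h);  // hypothetical Add(grad, hess) overload
}

Note this does not remove the dependency itself: within a row, successive j can still hit the same bin, so the accumulation loop stays scalar without something like AVX-512 conflict detection; the expected wins are in the reduction and in replacing the method-call read-modify-write with raw double adds.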
Still to be optimized further. I recently got down to 0.8 s with 36 threads.
Two loops are currently eating most of the CPU time. For a run time of approximately 2.2 s on a single thread, these are the loops we should focus on:
https://github.com/hcho3/xgboost-fast-hist-perf-lab/blob/master/src/build_hist.cc#L37-L42
https://github.com/hcho3/xgboost-fast-hist-perf-lab/blob/master/src/build_hist.cc#L57-L62
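For context, the shapes of the two linked regions (paraphrased from the code posted in this thread, not copied verbatim from the repo) are the indirect histogram accumulation and the per-thread reduction:

// First hot loop: indirect, read-modify-write histogram accumulation.
for (size_t j = ibegin; j < iend; ++j) {
  const uint32_t bin = gmat.index[j];  // gather through the index array
  data_[off + bin].Add(stat);          // bins can repeat -> loop-carried dependency
}

// Second hot loop: merging the per-thread partial histograms.
for (uint32_t bin_id = 0; bin_id < nbin_; ++bin_id) {
  for (size_t tid = 0; tid < nthread_; ++tid) {
    (*hist)[bin_id].Add(data_[tid * nbin_ + bin_id]);
  }
}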
Note that in the last screenshot I intentionally inverted the loop nesting.
First loop details (profiler screenshot, with assembly and instruction details).
Second loop details (profiler screenshot, with assembly and instruction details).
I still have to check with the Intel compiler (e.g. its vectorization report) to get more details.