This repository has been archived by the owner on Jun 24, 2020. It is now read-only.

Negative scalability nearly removed #14

Open

Laurae2 opened this issue Nov 10, 2018 · 10 comments

Comments

Laurae2 (Collaborator) commented Nov 10, 2018

Using the following tricks:

  • Trick 1: dependency check (1.1) and bad-loop removal (1.2)
  • Trick 2: outer-loop parallelism
  • Trick 3: pre-storing the thread number
  • Trick 4: loop fill
  • Trick 5: private, pre-allocated data_

the performance becomes the following.

Speed table (seconds); the tricks accumulate from left to right, and Imp. is the speed-up of Trick 5 over Raw (e.g. at 72 threads: 81.466 / 1.446 ≈ 56.4x):

Threads  0. Raw  Trick 1  Trick 2 (+1)  Trick 3 (+1+2)  Trick 4 (+1+2+3)  Trick 5 (+1+2+3)  Imp.
1 02.482 02.441 02.484 02.436 02.455 02.229 1.1x
2 01.927 01.850 01.645 01.598 01.599 01.200 1.6x
4 02.237 02.059 01.195 01.117 01.143 00.748 3.0x
6 02.838 02.515 01.035 01.000 01.004 00.624 4.5x
9 04.182 03.656 01.172 01.218 01.107 00.616 6.8x
17 09.712 09.126 01.826 01.923 02.021 00.843 11.5x
18 09.321 09.900 02.009 02.008 02.161 00.834 11.2x
19 11.102 10.824 02.412 02.476 02.512 00.929 11.9x
27 24.526 17.281 03.259 03.458 03.555 01.099 22.3x
35 38.559 23.264 03.433 03.803 03.803 01.146 33.6x
36 40.351 23.972 03.597 03.820 03.857 01.155 34.9x
37 39.076 23.305 03.570 03.958 04.076 01.172 33.4x
54 52.389 40.782 09.248 09.476 09.520 01.236 42.4x
71 79.262 58.390 17.217 11.197 14.736 01.389 57.0x
72 81.466 62.740 20.849 14.417 15.121 01.446 56.4x

[Chart: average speed]

[Chart: average speed, low thread count]

[Chart: parallel efficiency]

[Chart: inverted parallel efficiency]

build_hist.cc, lots of changes:

#include <dmlc/omp.h>
#include <perflab/build_hist.h>
#include <omp.h>
#include <vector>

namespace perflab {

void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
                             const std::vector<size_t>& instance_set,
                             const GHistIndexMatrix& gmat,
                             std::vector<GHistEntry>* hist) {
  
  std::vector<GHistEntry> data__; // private alloc
  data__.resize(data_.size()); // resize, not reserve: reserve leaves size() == 0, so later indexing would be out of bounds
  #pragma omp simd // makes code slower and should be removed, but kept here for comparison
  for (size_t i = 0; i < data__.size(); i++) {
    data__[i] = GHistEntry();
  }

  constexpr int kUnroll = 8;  // loop unrolling factor
  const auto nthread = static_cast<dmlc::omp_uint>(this->nthread_);
  const size_t nrows = instance_set.size();
  const size_t rest = nrows % kUnroll;
  const size_t unrolled = nrows - rest;
  const dmlc::omp_uint tid = omp_get_thread_num();
  const size_t off = tid * nbin_;
  size_t rid[kUnroll];
  size_t ibegin[kUnroll];
  size_t iend[kUnroll];
  GradientPair stat[kUnroll];
  uint32_t bin;

  for (dmlc::omp_uint i = 0; i < unrolled; i += kUnroll) {
    
    // Force ignore outer loop carried dependency as proven by Intel Advisor
    #if defined(__INTEL_COMPILER)
    #  pragma ivdep
    #elif defined(__GNUG__)
    #  pragma GCC ivdep
    #endif
    for (int k = 0; k < kUnroll; ++k) {
      rid[k] = instance_set[i + k];
    }
    
    // Force ignore outer loop carried dependency as proven by Intel Advisor
    #if defined(__INTEL_COMPILER)
    #  pragma ivdep
    #elif defined(__GNUG__)
    #  pragma GCC ivdep
    #endif
    for (int k = 0; k < kUnroll; ++k) {
      ibegin[k] = gmat.row_ptr[rid[k]];
      iend[k] = gmat.row_ptr[rid[k] + 1];
    }
    
    // Force ignore outer loop carried dependency as proven by Intel Advisor
    #if defined(__INTEL_COMPILER)
    #  pragma ivdep
    #elif defined(__GNUG__)
    #  pragma GCC ivdep
    #endif
    for (int k = 0; k < kUnroll; ++k) {
      stat[k] = gpair[rid[k]];
    }
    
    // Force ignore outer loop carried dependency as proven by Intel Advisor
    #if defined(__INTEL_COMPILER)
    #  pragma ivdep
    #  pragma unroll
    #elif defined(__GNUG__)
    #  pragma GCC ivdep
    #  pragma unroll
    #endif
    for (int k = 0; k < kUnroll; ++k) {
      // Very bad inner loop carried dependency causing inter-thread mass locks, should rewrite .Add(stat[k]) from scratch
      for (size_t j = ibegin[k]; j < iend[k]; ++j) {
        bin = gmat.index[j];
        data__[off + bin].Add(stat[k]);
      }
    }
  }

  for (size_t i = nrows - rest; i < nrows; ++i) {
    const size_t rid = instance_set[i];
    const size_t ibegin = gmat.row_ptr[rid];
    const size_t iend = gmat.row_ptr[rid + 1];
    const GradientPair stat = gpair[rid];
    for (size_t j = ibegin; j < iend; ++j) {
      const uint32_t bin = gmat.index[j];
      data__[off + bin].Add(stat);  // keep the same per-thread offset as the unrolled loop above
    }
  }

  /* reduction */
  const uint32_t nbin = nbin_;
  #pragma omp simd
  for (dmlc::omp_uint bin_id = 0; bin_id < dmlc::omp_uint(nbin); ++bin_id) {
    (*hist)[bin_id].Add(data__[tid * nbin_ + bin_id]);
  }
}

}  // namespace perflab
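
Stripped of the unrolling and the pragmas, the idea behind Tricks 2 and 5 is the usual pattern of a per-thread private histogram followed by a reduction. A minimal sketch with simplified stand-in types (illustrative only; the actual code above instead keeps the parallel for over instance sets in main.cc):

#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-in for GHistEntry; illustrative only.
struct Entry { double grad = 0.0; double hess = 0.0; };

void BuildHistSketch(const std::vector<double>& grad,
                     const std::vector<double>& hess,
                     const std::vector<std::size_t>& row_ptr,    // CSR row offsets
                     const std::vector<std::uint32_t>& bin_idx,  // CSR bin indices
                     std::vector<Entry>* hist) {                 // size nbin, zero-initialized
  const std::int64_t nrow = static_cast<std::int64_t>(row_ptr.size()) - 1;
  const std::size_t nbin = hist->size();
  #pragma omp parallel
  {
    // Each thread accumulates into its own private histogram:
    // no atomics and no false sharing on the shared bins.
    std::vector<Entry> local(nbin);
    #pragma omp for schedule(static) nowait
    for (std::int64_t r = 0; r < nrow; ++r) {
      for (std::size_t j = row_ptr[r]; j < row_ptr[r + 1]; ++j) {
        local[bin_idx[j]].grad += grad[r];
        local[bin_idx[j]].hess += hess[r];
      }
    }
    // Reduction: merge the private histograms into the shared one.
    #pragma omp critical
    for (std::size_t b = 0; b < nbin; ++b) {
      (*hist)[b].grad += local[b].grad;
      (*hist)[b].hess += local[b].hess;
    }
  }
}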

main.cc, only 2 changes (2 pragmas added):

#include <dmlc/timer.h>
#include <dmlc/logging.h>
#include <dmlc/omp.h>
#include <perflab/data_structure.h>
#include <perflab/build_hist.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdint>

template <typename T>
std::vector<T> ParseNewlineSeparatedText(const std::string& path) {
  std::ifstream fin(path);
  std::vector<T> vec;
  T val;
  CHECK(!fin.fail()) << "File " << path << " not found!";
  while ( (fin >> val) ) {
    vec.push_back(val);
  }
  return vec;
}

int main(int argc, char** argv) {
  if (argc != 3) {
    std::cerr << "Usage: " << argv[0] << " [location of extracted record.tar.bz2] [number of threads]" << std::endl;
    return 1;
  }
  const std::string record_path(argv[1]);
  const int nthread = std::stoi(argv[2]); 
  CHECK_GT(nthread, 0) << "There should be positive number of threads";
  CHECK_LE(nthread, omp_get_max_threads()) << "Too many threads";

  LOG(INFO) << "Record location = " << record_path << ", Using " << nthread << " threads";

  double tstart = dmlc::GetTime();

  perflab::GHistIndexMatrix gmat;
  gmat.row_ptr = ParseNewlineSeparatedText<size_t>(record_path + "/gmat_row_ptr.txt");
  gmat.index = ParseNewlineSeparatedText<uint32_t>(record_path + "/gmat_index.txt");
  gmat.cut.row_ptr = ParseNewlineSeparatedText<uint32_t>(record_path + "/gmat_cut_row_ptr.txt");
  gmat.cut.min_val = ParseNewlineSeparatedText<float>(record_path + "/gmat_cut_min_val.txt");
  gmat.cut.cut = ParseNewlineSeparatedText<float>(record_path + "/gmat_cut_cut.txt");

  // size of each gradient histogram
  const uint32_t nbin = gmat.cut.row_ptr.back();

  std::vector<perflab::GradientPair> gpair
    = ParseNewlineSeparatedText<perflab::GradientPair>(record_path + "/gpair-198-0.txt");

  // One histogram is created for each instance set
  const int num_instance_set = 2920;
  std::vector<std::vector<size_t>> instance_set;
  std::vector<std::vector<perflab::GHistEntry>> histogram(
    num_instance_set, std::vector<perflab::GHistEntry>(nbin));

  for (int i = 0; i < num_instance_set; ++i) {
    const std::string path = std::string(record_path + "/rowind-198-") + std::to_string(i) + ".txt";
    instance_set.push_back(ParseNewlineSeparatedText<size_t>(path));
  }

  LOG(INFO) << "Data loaded in " << (dmlc::GetTime() - tstart) << " seconds";

  // Initialize histogram builder
  perflab::GHistBuilder hist_builder(nthread, nbin);

  // Compute histograms
  tstart = dmlc::GetTime();
#if defined(__INTEL_COMPILER)
#  pragma ivdep
#elif defined(__GNUG__)
#  pragma GCC ivdep
#endif
#pragma omp parallel for num_threads(nthread) schedule(dynamic)
  for (int i = 0; i < num_instance_set; ++i) {
    hist_builder.BuildHist(gpair, instance_set[i], gmat, &histogram[i]);
  }
  LOG(INFO) << "Gradient histograms computed in " << (dmlc::GetTime() - tstart) << " seconds";

  return 0;
}

Full table of performance:

(Same columns as above; tricks accumulate from left to right.)

Threads  0. Raw  Trick 1  Trick 2 (+1)  Trick 3 (+1+2)  Trick 4 (+1+2+3)  Trick 5 (+1+2+3)  Imp.
1 02.482 02.441 02.484 02.436 02.455 02.229 1.1x
2 01.927 01.850 01.645 01.598 01.599 01.200 1.6x
3 01.977 01.893 01.325 01.266 01.295 00.890 2.2x
4 02.237 02.059 01.195 01.117 01.143 00.748 3.0x
5 02.542 02.263 01.097 01.049 01.043 00.667 3.8x
6 02.838 02.515 01.035 01.000 01.004 00.624 4.5x
7 03.168 02.854 01.063 00.980 01.027 00.593 5.3x
8 03.841 03.306 00.981 01.066 01.017 00.583 6.6x
9 04.182 03.656 01.172 01.218 01.107 00.616 6.8x
10 04.533 04.164 01.286 01.266 01.188 00.633 7.2x
11 05.010 04.802 01.397 01.433 01.295 00.635 7.9x
12 06.067 05.532 01.464 01.525 01.424 00.662 9.2x
13 06.388 06.362 01.536 01.600 01.467 00.688 9.3x
14 06.483 07.018 01.551 01.751 01.554 00.689 9.4x
15 07.394 07.557 01.605 01.695 01.682 00.713 10.4x
16 08.531 08.253 01.754 01.795 01.894 00.856 10.0x
17 09.712 09.126 01.826 01.923 02.021 00.843 11.5x
18 09.321 09.900 02.009 02.008 02.161 00.834 11.2x
19 11.102 10.824 02.412 02.476 02.512 00.929 11.9x
20 13.069 11.733 02.831 02.820 02.889 00.976 13.4x
21 14.429 12.524 02.881 03.036 03.054 01.013 14.2x
22 16.070 13.357 02.979 03.111 03.154 01.030 15.6x
23 18.375 14.179 03.029 03.133 03.208 01.058 17.4x
24 19.613 14.851 03.128 03.288 03.327 01.071 18.3x
25 21.289 15.756 03.116 03.392 03.405 01.087 19.6x
26 22.978 16.538 03.217 03.476 03.519 01.097 21.0x
27 24.526 17.281 03.259 03.458 03.555 01.099 22.3x
28 25.963 18.012 03.326 03.554 03.636 01.103 23.5x
29 28.605 18.821 03.259 03.553 03.621 01.118 25.6x
30 31.097 19.497 03.240 03.530 03.618 01.124 27.7x
31 32.949 20.344 03.304 03.614 03.638 01.139 28.9x
32 34.603 20.981 03.349 03.572 03.649 01.138 30.4x
33 35.858 21.816 03.284 03.757 03.725 01.139 31.5x
34 37.469 22.591 03.385 03.712 03.768 01.139 32.9x
35 38.559 23.264 03.433 03.803 03.803 01.146 33.6x
36 40.351 23.972 03.597 03.820 03.857 01.155 34.9x
37 39.076 23.305 03.570 03.958 04.076 01.172 33.4x
38 39.487 24.014 03.696 04.225 04.089 01.183 33.4x
39 40.607 24.657 03.883 04.150 04.398 01.176 34.5x
40 41.691 25.261 04.060 04.639 04.474 01.185 35.2x
41 42.101 25.895 04.460 05.000 04.658 01.182 35.6x
42 42.200 26.691 05.462 05.826 05.306 01.200 35.2x
43 42.957 29.532 06.047 06.193 06.382 01.207 35.6x
44 43.713 31.401 06.882 06.914 07.299 01.205 36.3x
45 45.977 33.044 07.097 07.436 07.379 01.207 38.1x
46 45.250 34.337 07.384 07.731 07.760 01.210 37.4x
47 46.079 35.279 07.733 08.058 07.953 01.216 37.9x
48 47.958 36.092 07.801 08.158 08.094 01.217 39.4x
49 47.339 36.882 08.090 08.483 08.398 01.216 38.9x
50 50.446 37.709 08.315 08.653 08.662 01.213 41.6x
51 50.691 38.589 08.544 08.881 08.886 01.216 41.7x
52 51.344 39.377 08.810 09.029 09.051 01.226 41.9x
53 51.757 39.966 08.998 09.198 09.280 01.221 42.4x
54 52.389 40.782 09.248 09.476 09.520 01.236 42.4x
55 55.402 41.526 09.282 09.521 09.494 01.234 44.9x
56 58.415 42.294 09.339 09.637 09.594 01.237 47.2x
57 60.048 43.469 09.346 09.634 09.747 01.256 47.8x
58 61.238 45.269 09.352 09.706 09.886 01.259 48.6x
59 62.692 46.186 09.312 09.669 09.859 01.258 49.8x
60 65.080 47.080 09.399 09.935 09.796 01.268 51.3x
61 64.910 47.394 09.579 09.672 09.869 01.270 51.1x
62 65.799 48.129 09.628 09.946 09.915 01.278 51.5x
63 67.791 48.953 09.582 09.842 09.919 01.281 52.9x
64 69.838 52.467 09.762 09.935 10.025 01.281 54.5x
65 71.856 54.472 09.636 10.018 10.149 01.277 56.3x
66 71.945 55.588 09.840 10.110 10.284 01.294 55.6x
67 74.701 56.694 09.818 10.012 10.133 01.290 57.9x
68 76.361 57.011 10.036 10.389 11.636 01.321 57.8x
69 75.769 58.643 11.228 10.500 10.811 01.308 57.9x
70 76.516 59.118 12.152 12.463 13.002 01.317 58.1x
71 79.262 58.390 17.217 11.197 14.736 01.389 57.0x
72 81.466 62.740 20.849 14.417 15.121 01.446 56.4x
Laurae2 (Collaborator, Author) commented Nov 10, 2018

Pinging @hcho3 for a code check: there is no test at the end, so I cannot really verify whether it is computing correctly or not.
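
In the meantime, a quick sanity check could be appended to main.cc after the parallel loop: rebuild one histogram with a plain serial loop and compare it bin by bin. A rough sketch; the sum_grad / sum_hess member names on GHistEntry are assumptions and would need to match the real struct:

// Hypothetical check after the parallel loop in main.cc (requires <cmath> for std::abs).
// Assumes GHistEntry default-constructs to zero and exposes sum_grad / sum_hess.
std::vector<perflab::GHistEntry> ref(nbin);
for (size_t r : instance_set[0]) {
  const perflab::GradientPair g = gpair[r];
  for (size_t j = gmat.row_ptr[r]; j < gmat.row_ptr[r + 1]; ++j) {
    ref[gmat.index[j]].Add(g);  // same Add(GradientPair) used in BuildHist
  }
}
for (uint32_t b = 0; b < nbin; ++b) {
  CHECK_LT(std::abs(ref[b].sum_grad - histogram[0][b].sum_grad), 1e-6);
  CHECK_LT(std::abs(ref[b].sum_hess - histogram[0][b].sum_hess), 1e-6);
}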

Laurae2 (Collaborator, Author) commented Nov 10, 2018

Mostly L1- and L3-cache bound now with 18 threads.


thvasilo commented

This is great work @Laurae2! Do you have the individual changes in a repo so we can check the diff? It will be easier, at least for me, to figure out how your changes work.

Laurae2 added a commit to Laurae2/xgboost-fast-hist-perf-lab that referenced this issue Nov 17, 2018
Laurae2 (Collaborator, Author) commented Nov 17, 2018

@thvasilo Check this: https://github.com/Laurae2/xgboost-fast-hist-perf-lab

With the float transformation from @SmirnovEgorRu in #15, I am getting 0.75s for 2 threads, 0.3s for 9 threads, and 0.8s for 36 threads.

Laurae2 (Collaborator, Author) commented Nov 17, 2018

With the double-to-float trick for the gradient/hessian, speed is now up to 100x the original. Negative scaling is still present past 10 threads, but much weaker than before.

The charts show strange behavior past 30 threads.

[Charts]

Truncated table:

Columns: threads; 0. Raw (gcc); Trick 5 (icc); Trick 5 (gcc); Trick 6 = double-to-float on top of Trick 5 (icc); Trick 6 (gcc); Imp. (speed-up of the gcc Trick 6 column over Raw).

Threads  Raw (gcc)  Trick 5 (icc)  Trick 5 (gcc)  Trick 6 (icc)  Trick 6 (gcc)  Imp.
1 02.482 02.086 02.229 01.214 01.454 1.7x
2 01.927 01.144 01.200 00.665 00.771 2.5x
4 02.237 00.732 00.748 00.408 00.449 5.0x
6 02.838 00.610 00.624 00.336 00.359 7.9x
9 04.182 00.622 00.616 00.299 00.310 13.5x
17 09.712 00.949 00.843 00.334 00.341 28.5x
18 09.321 00.939 00.834 00.339 00.347 26.8x
19 11.102 01.039 00.929 00.358 00.370 30.0x
27 24.526 01.207 01.099 00.434 00.433 56.6x
35 38.559 01.245 01.146 00.729 00.643 60.0x
36 40.351 01.257 01.155 00.731 00.640 63.1x
37 39.076 01.273 01.172 00.735 00.644 60.6x
54 52.389 01.309 01.236 00.788 00.682 76.8x
71 79.262 01.486 01.389 00.922 00.799 99.1x
72 81.466 01.531 01.446 00.967 00.832 97.9x

Full table:

(Same columns as above.)

Threads  Raw (gcc)  Trick 5 (icc)  Trick 5 (gcc)  Trick 6 (icc)  Trick 6 (gcc)  Imp.
1 02.482 02.086 02.229 01.214 01.454 1.7x
2 01.927 01.144 01.200 00.665 00.771 2.5x
3 01.977 00.855 00.890 00.488 00.549 3.6x
4 02.237 00.732 00.748 00.408 00.449 5.0x
5 02.542 00.655 00.667 00.363 00.394 6.5x
6 02.838 00.610 00.624 00.336 00.359 7.9x
7 03.168 00.590 00.593 00.319 00.336 9.4x
8 03.841 00.594 00.583 00.309 00.321 12.0x
9 04.182 00.622 00.616 00.299 00.310 13.5x
10 04.533 00.641 00.633 00.294 00.302 15.0x
11 05.010 00.648 00.635 00.299 00.303 16.5x
12 06.067 00.668 00.662 00.306 00.310 19.6x
13 06.388 00.708 00.688 00.312 00.317 20.2x
14 06.483 00.713 00.689 00.317 00.323 20.1x
15 07.394 00.726 00.713 00.322 00.328 22.5x
16 08.531 00.961 00.856 00.329 00.335 25.5x
17 09.712 00.949 00.843 00.334 00.341 28.5x
18 09.321 00.939 00.834 00.339 00.347 26.8x
19 11.102 01.039 00.929 00.358 00.370 30.0x
20 13.069 01.084 00.976 00.378 00.382 34.2x
21 14.429 01.125 01.013 00.392 00.394 36.6x
22 16.070 01.138 01.030 00.405 00.406 39.5x
23 18.375 01.156 01.058 00.413 00.418 44.0x
24 19.613 01.157 01.071 00.422 00.427 45.9x
25 21.289 01.183 01.087 00.432 00.432 49.3x
26 22.978 01.192 01.097 00.434 00.439 52.4x
27 24.526 01.207 01.099 00.434 00.433 56.6x
28 25.963 01.198 01.103 00.425 00.442 58.7x
29 28.605 01.209 01.118 00.428 00.446 64.1x
30 31.097 01.217 01.124 00.431 00.449 69.3x
31 32.949 01.237 01.139 00.735 00.636 51.8x
32 34.603 01.219 01.138 00.733 00.633 54.7x
33 35.858 01.246 01.139 00.729 00.637 56.3x
34 37.469 01.244 01.139 00.733 00.641 58.5x
35 38.559 01.245 01.146 00.729 00.643 60.0x
36 40.351 01.257 01.155 00.731 00.640 63.1x
37 39.076 01.273 01.172 00.735 00.644 60.6x
38 39.487 01.266 01.183 00.737 00.649 60.9x
39 40.607 01.285 01.176 00.742 00.650 62.5x
40 41.691 01.288 01.185 00.742 00.653 63.9x
41 42.101 01.275 01.182 00.743 00.651 64.6x
42 42.200 01.295 01.200 00.749 00.653 64.6x
43 42.957 01.287 01.207 00.750 00.655 65.5x
44 43.713 01.290 01.205 00.751 00.659 66.4x
45 45.977 01.301 01.207 00.753 00.661 69.5x
46 45.250 01.306 01.210 00.757 00.667 67.9x
47 46.079 01.297 01.216 00.759 00.663 69.5x
48 47.958 01.308 01.217 00.764 00.667 71.9x
49 47.339 01.299 01.216 00.765 00.671 70.5x
50 50.446 01.306 01.213 00.765 00.672 75.1x
51 50.691 01.306 01.216 00.768 00.676 75.0x
52 51.344 01.317 01.226 00.770 00.683 75.1x
53 51.757 01.308 01.221 00.780 00.683 75.8x
54 52.389 01.309 01.236 00.788 00.682 76.8x
55 55.402 01.324 01.234 00.794 00.689 80.4x
56 58.415 01.347 01.237 00.797 00.693 84.3x
57 60.048 01.340 01.256 00.803 00.697 86.2x
58 61.238 01.344 01.259 00.807 00.700 87.5x
59 62.692 01.354 01.258 00.812 00.705 88.9x
60 65.080 01.374 01.268 00.813 00.709 91.8x
61 64.910 01.353 01.270 00.823 00.717 90.5x
62 65.799 01.363 01.278 00.827 00.717 91.7x
63 67.791 01.393 01.281 00.832 00.722 93.9x
64 69.838 01.393 01.281 00.845 00.726 96.1x
65 71.856 01.373 01.277 00.852 00.734 97.9x
66 71.945 01.384 01.294 00.854 00.742 97.0x
67 74.701 01.395 01.290 00.859 00.744 100.4x
68 76.361 01.407 01.321 00.864 00.754 101.2x
69 75.769 01.427 01.308 00.874 00.761 99.6x
70 76.516 01.448 01.317 00.887 00.766 99.9x
71 79.262 01.486 01.389 00.922 00.799 99.1x
72 81.466 01.531 01.446 00.967 00.832 97.9x

SmirnovEgorRu commented

@Laurae2, as I understand it, this code can't be executed in a parallel for, because each iteration here builds one node, and the nodes have dependencies. For example, you can't start building the level-2 nodes when the root is not built yet.

#pragma omp parallel for num_threads(nthread) schedule(dynamic)
  for (int i = 0; i < num_instance_set; ++i) {
    hist_builder.BuildHist(gpair, instance_set[i], gmat, &histogram[i]);
  }

Tree building can be parallelized across nodes (and it should be). However, an efficient implementation of this in the real code will be much harder.
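
Put differently, the parallelism has to live inside a tree level rather than across levels, along these lines (NodesAtDepth and BuildHistForNode are hypothetical helpers, not the actual xgboost API):

#include <vector>

// Hypothetical helpers, declared only to make the sketch self-contained.
std::vector<int> NodesAtDepth(int depth);   // nodes sitting at a given tree depth
void BuildHistForNode(int node_id);         // build the histogram of one node

void GrowLevelWise(int max_depth) {
  for (int depth = 0; depth < max_depth; ++depth) {
    // Nodes within one level are independent of each other...
    const std::vector<int> nodes = NodesAtDepth(depth);
    #pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < static_cast<int>(nodes.size()); ++i) {
      BuildHistForNode(nodes[i]);
    }
    // ...but the next level needs these histograms, so levels stay sequential.
  }
}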

hcho3 (Owner) commented Nov 19, 2018

@Laurae2 @SmirnovEgorRu I really appreciate the work you have done so far. I've been extremely busy lately with work at my organization and will get back to this in a week. We should discuss how the improvements can be merged into the main XGBoost codebase.

Laurae2 (Collaborator, Author) commented Nov 19, 2018

@SmirnovEgorRu If I understand what you said correctly:

  • The outer loop (in main.cc) builds all the nodes?
  • The inner loop (in build_hist.cc) builds one specific node?
  • Each node depends on the previous nodes?

I suspected this, but Intel Advisor contradicted me by reporting no dependency in the loop.

thvasilo commented

@Laurae2 perhaps it would help to look at the original codebase:

This update is driven from updater_quantile_hist.cc::Update, which calls BuildHist, which in turn calls BuildBlockHist or BuildHist, which is what we are focusing on.

As you will see (as far as I can tell), the only parallel code is in BuildHist, where the histogram building is parallelized over the data points, followed by a parallel reduction.

I'm more familiar with other builders (updater_histmaker in particular), but this one seems similar. At the beginning of the iteration, we assign each data point to a single tree leaf. When we create the histograms, we aggregate them per leaf, i.e. each leaf will have one collection of histograms, one per feature.

When we parallelize over data points, we first retrieve the leaf id of the data point (I think that's what row_indices does), then use that index to update the corresponding histogram set.

I'm not sure how to interpret num_instance_set in this repo; maybe it's a way of calculating the histograms in a blocked fashion? Perhaps @hcho3 can help when he has more time.

RAMitchell commented

Really interesting work @Laurae2

As per my comment in #15, I think the summation must be done in doubles; otherwise the accuracy falls off significantly on larger data sets.
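
A common compromise, sketched below with made-up type names (this is not what #15 implements), is to keep the per-row gradient pairs in float to halve the memory traffic while still accumulating the histogram bins in double:

#include <cstddef>
#include <cstdint>
#include <vector>

struct GradientPairF { float grad; float hess; };                // float storage (hypothetical name)
struct GHistEntryD   { double grad = 0.0; double hess = 0.0; };  // double accumulator (hypothetical name)

void Accumulate(const std::vector<GradientPairF>& gpair,
                const std::vector<std::size_t>& row_ptr,
                const std::vector<std::uint32_t>& bin_idx,
                std::vector<GHistEntryD>* hist) {
  for (std::size_t r = 0; r + 1 < row_ptr.size(); ++r) {
    for (std::size_t j = row_ptr[r]; j < row_ptr[r + 1]; ++j) {
      // Reads stay in float (half the bandwidth); sums stay in double (no precision loss on long sums).
      (*hist)[bin_idx[j]].grad += static_cast<double>(gpair[r].grad);
      (*hist)[bin_idx[j]].hess += static_cast<double>(gpair[r].hess);
    }
  }
}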
