Merge "Add multithreads throughput benchmark" into main am: 9ac6169db4 am: 2e6a1923f3 am: 33bc3b7809 am: fd7118ed66 am: 86c29d11cf am: 59de084b0b

Original change: https://android-review.googlesource.com/c/platform/bionic/+/2645428
Change-Id: Iff90cb7c04393787444f0301f0868d771b96c7bf
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
2023-07-11 10:57:02 +00:00 · 2023-07-11 10:57:02 +00:00 · 6e1d673765
commit 6e1d673765
parent 7581ecdbd6 59de084b0b
1 changed files with 106 additions and 0 deletions
--- a/benchmarks/malloc_benchmark.cpp
+++ b/benchmarks/malloc_benchmark.cpp
@ -29,6 +29,10 @@
 #include <malloc.h>
 #include <unistd.h>

+#include <algorithm>
+#include <condition_variable>
+#include <mutex>
+#include <random>
+#include <thread>
 #include <vector>

 #include <benchmark/benchmark.h>
@ -68,6 +72,89 @@ static void RunMalloptPurge(benchmark::State& state, int purge_value) {
  mallopt(M_DECAY_TIME, 0);
 }

+// Measures malloc/free throughput for one allocation `size` while `num_threads` threads use the
+// allocator concurrently.
+//
+// Parameters:
+//   state:       benchmark state; thread creation is excluded from the timed region.
+//   size:        size in bytes of every allocation; must be non-zero.
+//   num_threads: number of worker threads, in the range [1, kMaxThreads].
+//
+// Invalid parameters are reported through state.SkipWithError() instead of being run.
+static void RunThreadsThroughput(benchmark::State& state, size_t size, size_t num_threads) {
+  constexpr size_t kMaxBytes = 1 << 24;
+  constexpr size_t kMaxThreads = 8;
+  constexpr size_t kMinRounds = 4;
+
+  // `size == 0` would divide by zero below, and a thread id that shifts the per-round allocation
+  // count down to zero would leave that thread with no work to do.
+  if (size == 0 || num_threads == 0 || num_threads > kMaxThreads ||
+      ((kMaxBytes / size) >> (num_threads - 1)) == 0) {
+    state.SkipWithError("RunThreadsThroughput: unsupported size or thread count");
+    return;
+  }
+
+  const size_t max_alloc_counts = kMaxBytes / size;
+  std::mutex m;
+  bool ready = false;
+  std::condition_variable cv;
+  std::vector<std::thread> threads;
+  threads.reserve(num_threads);
+
+  // The goal is to create malloc/free interleaving patterns across threads.
+  // When `size` is a power of two, the bytes processed by each thread are the
+  // same. The difference is the patterns. Here's an example:
+  //
+  // A: Allocation
+  // D: Deallocation
+  //
+  //   T1    T2    T3
+  //   A     A     A
+  //   A     A     D
+  //   A     D     A
+  //   A     D     D
+  //   D     A     A
+  //   D     A     D
+  //   D     D     A
+  //   D     D     D
+  //
+  // To do this, `alloc_counts` is halved and `alloc_rounds` is doubled for each
+  // increment of the thread id.
+  auto thread_task = [&](size_t id) {
+    {
+      std::unique_lock lock(m);
+      // Wait until all threads are created.
+      cv.wait(lock, [&] { return ready; });
+    }
+
+    const size_t alloc_counts = max_alloc_counts >> id;
+    const size_t alloc_rounds = kMinRounds << id;
+    std::vector<void*> mem_pool(alloc_counts);
+
+    for (size_t i = 0; i < alloc_rounds; ++i) {
+      for (void*& slot : mem_pool) slot = malloc(size);
+
+      // Use a fixed seed so that every round frees in the same shuffled order, which reduces
+      // noise between benchmark runs.
+      constexpr unsigned kSeed = 33529;
+      std::shuffle(mem_pool.begin(), mem_pool.end(), std::default_random_engine(kSeed));
+
+      for (void* ptr : mem_pool) free(ptr);
+    }
+  };
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    // No lock is needed here because no worker thread exists yet.
+    ready = false;
+
+    for (size_t i = 0; i < num_threads; ++i) threads.emplace_back(thread_task, i);
+
+    state.ResumeTiming();
+
+    {
+      std::unique_lock lock(m);
+      ready = true;
+    }
+
+    cv.notify_all();
+
+    for (std::thread& worker : threads) worker.join();
+    threads.clear();
+  }
+
+  // Sum what each thread actually allocates so the report stays exact even when `size` does not
+  // divide kMaxBytes evenly. Accumulate in 64 bits so 32-bit targets cannot overflow.
+  int64_t bytes_per_iteration = 0;
+  for (size_t id = 0; id < num_threads; ++id) {
+    bytes_per_iteration +=
+        static_cast<int64_t>(max_alloc_counts >> id) * static_cast<int64_t>(kMinRounds << id) *
+        static_cast<int64_t>(size);
+  }
+  state.SetBytesProcessed(bytes_per_iteration * static_cast<int64_t>(state.iterations()));
+}
+
+
 static void BM_mallopt_purge(benchmark::State& state) {
  RunMalloptPurge(state, M_PURGE);
 }
@ -78,4 +165,23 @@ static void BM_mallopt_purge_all(benchmark::State& state) {
 }
 BIONIC_BENCHMARK(BM_mallopt_purge_all);

+// Defines and registers one throughput benchmark for a (BYTES, THREADS) pair. Each benchmark
+// uses a single allocation size, i.e. one size class, so that contention is observed more often.
+#define BM_MALLOC_THREADS_THROUGHPUT(BYTES, THREADS)                                      \
+  static void BM_malloc_threads_throughput_##BYTES##_##THREADS(benchmark::State& state) { \
+    RunThreadsThroughput(state, BYTES, THREADS);                                          \
+  }                                                                                       \
+  BIONIC_BENCHMARK(BM_malloc_threads_throughput_##BYTES##_##THREADS);
+
+// Scudo has three block categories; benchmark one size from each with 2, 4, and 8 threads.
+BM_MALLOC_THREADS_THROUGHPUT(64, 2);
+BM_MALLOC_THREADS_THROUGHPUT(64, 4);
+BM_MALLOC_THREADS_THROUGHPUT(64, 8);
+BM_MALLOC_THREADS_THROUGHPUT(512, 2);
+BM_MALLOC_THREADS_THROUGHPUT(512, 4);
+BM_MALLOC_THREADS_THROUGHPUT(512, 8);
+BM_MALLOC_THREADS_THROUGHPUT(8192, 2);
+BM_MALLOC_THREADS_THROUGHPUT(8192, 4);
+BM_MALLOC_THREADS_THROUGHPUT(8192, 8);
+
 #endif