Merge "Add multithreads throughput benchmark" into main am: 9ac6169db4 am: 2e6a1923f3 am: 33bc3b7809 am: fd7118ed66

Original change: https://android-review.googlesource.com/c/platform/bionic/+/2645428

Change-Id: If1ab9bff8305dace68bf6bd62bd338e3df5a439c
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
This commit is contained in:
Treehugger Robot 2023-07-11 09:35:27 +00:00 committed by Automerger Merge Worker
commit 86c29d11cf

View file

@ -29,6 +29,10 @@
#include <malloc.h>
#include <stdint.h>
#include <unistd.h>

#include <algorithm>
#include <condition_variable>
#include <mutex>
#include <random>
#include <thread>
#include <vector>

#include <benchmark/benchmark.h>
@ -68,6 +72,89 @@ static void RunMalloptPurge(benchmark::State& state, int purge_value) {
mallopt(M_DECAY_TIME, 0);
}
// Measures malloc()/free() throughput for one allocation size while several
// threads use the allocator concurrently.
//
// state:       benchmark state that drives the timed iterations.
// size:        number of bytes requested by every malloc() call; must be non-zero.
// num_threads: number of worker threads started per iteration; must not exceed
//              kMaxThreads.
//
// If an argument is out of range the benchmark is reported as an error and no
// iteration is run.
static void RunThreadsThroughput(benchmark::State& state, size_t size, size_t num_threads) {
  constexpr size_t kMaxBytes = 1 << 24;
  constexpr size_t kMaxThreads = 8;
  constexpr size_t kMinRounds = 4;

  // A zero size would divide by zero below, and the per-thread shifts are only
  // meant to be used with a small number of threads.
  if (size == 0 || num_threads > kMaxThreads) {
    state.SkipWithError("RunThreadsThroughput: size must be non-zero and num_threads <= 8");
    return;
  }

  const size_t max_alloc_count = kMaxBytes / size;
  std::mutex m;
  bool ready = false;
  std::condition_variable cv;

  // The goal is to create malloc/free interleaving patterns across threads.
  // The bytes processed by each thread will be the same. The difference is the
  // patterns. Here's an example:
  //
  // A: Allocation
  // D: Deallocation
  //
  //   T1    T2    T3
  //   A     A     A
  //   A     A     D
  //   A     D     A
  //   A     D     D
  //   D     A     A
  //   D     A     D
  //   D     D     A
  //   D     D     D
  //
  // To do this, the allocation count and the number of rounds are adjusted
  // according to the thread id: each increment of the id halves the former and
  // doubles the latter.
  auto thread_task = [&](size_t id) {
    {
      std::unique_lock lock(m);
      // Wait until all threads are created.
      cv.wait(lock, [&] { return ready; });
    }

    const size_t alloc_count = max_alloc_count >> id;
    const size_t alloc_rounds = kMinRounds << id;

    // A plain array is kept here (rather than a std::vector) so that the slots
    // are not value-initialized inside the timed region; nothing between the
    // new[] and the delete[] can leave this scope early.
    void** mem_pool = new void*[alloc_count];

    for (size_t round = 0; round < alloc_rounds; ++round) {
      for (size_t j = 0; j < alloc_count; ++j) {
        mem_pool[j] = malloc(size);
      }

      // Use a fixed seed, re-created every round, to reduce the noise between
      // different runs of the benchmark.
      const unsigned seed = 33529;
      std::shuffle(mem_pool, mem_pool + alloc_count, std::default_random_engine(seed));

      for (size_t j = 0; j < alloc_count; ++j) free(mem_pool[j]);
    }

    delete[] mem_pool;
  };

  // Owning the threads by value removes the manual new/delete pairs and the
  // fixed-size array that was indexed by num_threads.
  std::vector<std::thread> threads;
  threads.reserve(num_threads);

  for (auto _ : state) {
    state.PauseTiming();
    // Every thread from the previous iteration has already been joined.
    threads.clear();
    // Don't need to acquire the lock because no thread is created.
    ready = false;
    for (size_t i = 0; i < num_threads; ++i) threads.emplace_back(thread_task, i);
    state.ResumeTiming();

    {
      std::lock_guard lock(m);
      ready = true;
    }
    cv.notify_all();

    for (std::thread& worker : threads) worker.join();
  }

  // Each thread is accounted as kMaxBytes * kMinRounds bytes (this assumes
  // `size` divides kMaxBytes evenly and the shifts above do not truncate).
  // The product is computed in 64 bits so that it cannot wrap around where
  // size_t is only 32 bits wide.
  const int64_t bytes_per_iteration = static_cast<int64_t>(kMaxBytes) *
                                      static_cast<int64_t>(kMinRounds) *
                                      static_cast<int64_t>(num_threads);
  state.SetBytesProcessed(bytes_per_iteration * static_cast<int64_t>(state.iterations()));
}
// Benchmark entry point: forwards to RunMalloptPurge() with the M_PURGE
// purge value.
static void BM_mallopt_purge(benchmark::State& state) {
  const int purge_value = M_PURGE;
  RunMalloptPurge(state, purge_value);
}
@ -78,4 +165,23 @@ static void BM_mallopt_purge_all(benchmark::State& state) {
}
BIONIC_BENCHMARK(BM_mallopt_purge_all);
// Defines and registers one benchmark named
// BM_malloc_threads_throughput_<ALLOC_SIZE>_<THREAD_COUNT> that calls
// RunThreadsThroughput() with those two values. Each generated benchmark only
// tests a single size class at a time, so that the impact of contention can be
// observed more often.
#define BM_MALLOC_THREADS_THROUGHPUT(ALLOC_SIZE, THREAD_COUNT)            \
  static void BM_malloc_threads_throughput_##ALLOC_SIZE##_##THREAD_COUNT( \
      benchmark::State& state) {                                          \
    RunThreadsThroughput(state, ALLOC_SIZE, THREAD_COUNT);                \
  }                                                                       \
  BIONIC_BENCHMARK(BM_malloc_threads_throughput_##ALLOC_SIZE##_##THREAD_COUNT);
// There are three block categories in Scudo; one allocation size is chosen
// from each category, and each size is run with 2, 4 and 8 threads.
BM_MALLOC_THREADS_THROUGHPUT(64, 2);
BM_MALLOC_THREADS_THROUGHPUT(64, 4);
BM_MALLOC_THREADS_THROUGHPUT(64, 8);
BM_MALLOC_THREADS_THROUGHPUT(512, 2);
BM_MALLOC_THREADS_THROUGHPUT(512, 4);
BM_MALLOC_THREADS_THROUGHPUT(512, 8);
BM_MALLOC_THREADS_THROUGHPUT(8192, 2);
BM_MALLOC_THREADS_THROUGHPUT(8192, 4);
BM_MALLOC_THREADS_THROUGHPUT(8192, 8);
#endif