platform_bionic/benchmarks/malloc_benchmark.cpp


/*
 * Copyright (C) 2019 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <malloc.h>
#include <unistd.h>

#include <algorithm>
#include <condition_variable>
#include <mutex>
#include <random>
#include <thread>
#include <vector>

#include <benchmark/benchmark.h>

#include "ScopedDecayTimeRestorer.h"
#include "util.h"

#if defined(__BIONIC__)
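// Measures the cost of a single mallopt() purge call. Each iteration allocates at
// least two pages' worth of blocks for every size in `sizes`, makes them resident,
// frees everything, and then times only the mallopt(purge_value, 0) call.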
static void RunMalloptPurge(benchmark::State& state, int purge_value) {
  ScopedDecayTimeRestorer restorer;

  static size_t sizes[] = {8, 16, 32, 64, 128, 1024, 4096, 16384, 65536, 131072, 1048576};
  static int pagesize = getpagesize();
  mallopt(M_DECAY_TIME, 1);
  mallopt(M_PURGE_ALL, 0);
  for (auto _ : state) {
    state.PauseTiming();
    std::vector<void*> ptrs;
    for (auto size : sizes) {
      // Allocate at least two pages' worth of allocations of this size.
      for (size_t allocated = 0; allocated < 2 * static_cast<size_t>(pagesize); allocated += size) {
        void* ptr = malloc(size);
        if (ptr == nullptr) {
          state.SkipWithError("Failed to allocate memory");
          return;
        }
        MakeAllocationResident(ptr, size, pagesize);
        ptrs.push_back(ptr);
      }
    }

    // Free the memory, which should leave many of the pages resident until
    // the purge call.
    for (auto ptr : ptrs) {
      free(ptr);
    }
    ptrs.clear();

    state.ResumeTiming();

    mallopt(purge_value, 0);
  }
}
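
// Measures malloc/free throughput under contention. Every thread allocates and
// frees the same total number of bytes, but with a different split between
// allocations per round and number of rounds (see the comment inside), so the
// threads' malloc and free phases interleave differently.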
static void RunThreadsThroughput(benchmark::State& state, size_t size, size_t num_threads) {
  constexpr size_t kMaxBytes = 1 << 24;
  constexpr size_t kMaxThreads = 8;
  constexpr size_t kMinRounds = 4;
  const size_t MaxAllocCounts = kMaxBytes / size;
  std::mutex m;
  bool ready = false;
  std::condition_variable cv;
  std::thread* threads[kMaxThreads];

  // The goal is to create malloc/free interleaving patterns across threads.
  // The bytes processed by each thread will be the same. The difference is the
  // patterns. Here's an example:
  //
  // A: Allocation
  // D: Deallocation
  //
  // T1 T2 T3
  // A  A  A
  // A  A  D
  // A  D  A
  // A  D  D
  // D  A  A
  // D  A  D
  // D  D  A
  // D  D  D
  //
  // To do this, `AllocCounts` and `AllocRounds` will be adjusted according to the
  // thread id.
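  //
  // For example, with size == 64, MaxAllocCounts is (1 << 24) / 64 = 262144, so
  // thread 0 runs 4 rounds of 262144 allocations while thread 1 runs 8 rounds of
  // 131072, and so on; every thread still allocates and frees
  // kMaxBytes * kMinRounds bytes in total.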
  auto thread_task = [&](size_t id) {
    {
      std::unique_lock lock(m);
      // Wait until all threads are created.
      cv.wait(lock, [&] { return ready; });
    }

    void** MemPool;
    const size_t AllocCounts = (MaxAllocCounts >> id);
    const size_t AllocRounds = (kMinRounds << id);
    MemPool = new void*[AllocCounts];

    for (size_t i = 0; i < AllocRounds; ++i) {
      for (size_t j = 0; j < AllocCounts; ++j) {
        void* ptr = malloc(size);
        MemPool[j] = ptr;
      }
      // Use a fixed seed to reduce the noise across different rounds of the benchmark.
      const unsigned seed = 33529;
      std::shuffle(MemPool, &MemPool[AllocCounts], std::default_random_engine(seed));
      for (size_t j = 0; j < AllocCounts; ++j) free(MemPool[j]);
    }

    delete[] MemPool;
  };
  for (auto _ : state) {
    state.PauseTiming();
    // No need to acquire the lock here; the worker threads have not been created yet.
    ready = false;
    for (size_t i = 0; i < num_threads; ++i) threads[i] = new std::thread(thread_task, i);
    state.ResumeTiming();

    {
      std::unique_lock lock(m);
      ready = true;
    }
    cv.notify_all();

    for (size_t i = 0; i < num_threads; ++i) {
      threads[i]->join();
      delete threads[i];
    }
  }
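
  // Every thread allocates and frees kMaxBytes * kMinRounds bytes in total (the
  // per-round allocation count and the round count scale inversely with the thread
  // id), so each iteration pushes kMaxBytes * kMinRounds * num_threads bytes
  // through the allocator.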
  const size_t ThreadsBytesProcessed = kMaxBytes * kMinRounds * num_threads;
  state.SetBytesProcessed(ThreadsBytesProcessed * static_cast<size_t>(state.iterations()));
}
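
// The two benchmarks below each time a single purge call: BM_mallopt_purge uses
// M_PURGE and BM_mallopt_purge_all uses M_PURGE_ALL.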
static void BM_mallopt_purge(benchmark::State& state) {
  RunMalloptPurge(state, M_PURGE);
}
BIONIC_BENCHMARK(BM_mallopt_purge);

static void BM_mallopt_purge_all(benchmark::State& state) {
  RunMalloptPurge(state, M_PURGE_ALL);
}
BIONIC_BENCHMARK(BM_mallopt_purge_all);
// Note that this will only test a single size class at a time so that we can
// observe the impact of contention more often.
#define BM_MALLOC_THREADS_THROUGHPUT(SIZE, NUM_THREADS)                                       \
  static void BM_malloc_threads_throughput_##SIZE##_##NUM_THREADS(benchmark::State& state) { \
    RunThreadsThroughput(state, SIZE, NUM_THREADS);                                           \
  }                                                                                           \
  BIONIC_BENCHMARK(BM_malloc_threads_throughput_##SIZE##_##NUM_THREADS);
// There are three block categories in Scudo; we choose one size from each category.
BM_MALLOC_THREADS_THROUGHPUT(64, 2);
BM_MALLOC_THREADS_THROUGHPUT(64, 4);
BM_MALLOC_THREADS_THROUGHPUT(64, 8);
BM_MALLOC_THREADS_THROUGHPUT(512, 2);
BM_MALLOC_THREADS_THROUGHPUT(512, 4);
BM_MALLOC_THREADS_THROUGHPUT(512, 8);
BM_MALLOC_THREADS_THROUGHPUT(8192, 2);
BM_MALLOC_THREADS_THROUGHPUT(8192, 4);
BM_MALLOC_THREADS_THROUGHPUT(8192, 8);
#endif