platform_system_core/metrics/metrics_daemon.cc
Luigi Semenzato 9636019892 metrics_daemon: add zram stats collection
Memory compression stats are being collected by Chrome, but it
is more natural to do it here since they are system-wide rather than
Chrome-specific.

In addition, this provides better granularity for the compression ratio
(percents, from 100% to 600%) since we're especially interested in the
distribution of values between 1 and 2, and currently these all fall
in the same bucket.

Finally, we collect more interesting stats on zero pages.

BUG=chromium:315113
TEST=unit testing, checked about:histograms

Change-Id: I09c974989661d42f45d44afd428e8114e4ee1dbd
Reviewed-on: https://chromium-review.googlesource.com/202587
Reviewed-by: Luigi Semenzato <semenzato@chromium.org>
Commit-Queue: Luigi Semenzato <semenzato@chromium.org>
Tested-by: Luigi Semenzato <semenzato@chromium.org>
2014-06-07 01:33:36 +00:00

1116 lines
39 KiB
C++

// Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "metrics_daemon.h"
#include <fcntl.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <base/at_exit.h>
#include <base/file_util.h>
#include <base/files/file_path.h>
#include <base/hash.h>
#include <base/logging.h>
#include <base/strings/string_number_conversions.h>
#include <base/strings/string_split.h>
#include <base/strings/string_util.h>
#include <base/strings/stringprintf.h>
#include <base/sys_info.h>
#include <chromeos/dbus/service_constants.h>
#include <dbus/dbus-glib-lowlevel.h>
using base::FilePath;
using base::StringPrintf;
using base::Time;
using base::TimeDelta;
using base::TimeTicks;
using chromeos_metrics::PersistentInteger;
using std::map;
using std::string;
using std::vector;
// File-local constants and helpers.
namespace {
// Yields the message text of D-Bus error |e|, or a placeholder string when
// the error carries no message.
#define SAFE_MESSAGE(e) (e.message ? e.message : "unknown error")
// D-Bus interface and signal member used to build the match rule for user
// crash notifications.
const char kCrashReporterInterface[] = "org.chromium.CrashReporter";
const char kCrashReporterUserCrashSignal[] = "UserCrash";
// Time unit conversion factors.
const int kSecondsPerMinute = 60;
const int kMinutesPerHour = 60;
const int kHoursPerDay = 24;
const int kMinutesPerDay = kHoursPerDay * kMinutesPerHour;
const int kSecondsPerDay = kSecondsPerMinute * kMinutesPerDay;
const int kDaysPerWeek = 7;
const int kSecondsPerWeek = kSecondsPerDay * kDaysPerWeek;
// Interval between calls to UpdateStats().
const guint kUpdateStatsIntervalMs = 300000;
// Marker files whose presence is tested (and which are then deleted) via
// CheckSystemCrash() when the daemon starts running.
const char kKernelCrashDetectedFile[] = "/var/run/kernel-crash-detected";
const char kUncleanShutdownDetectedFile[] =
"/var/run/unclean-shutdown-detected";
} // namespace
// Disk stats metrics.
// The {Read,Write}Sectors numbers are in sectors/second.
// A sector is usually 512 bytes.
const char MetricsDaemon::kMetricReadSectorsLongName[] =
"Platform.ReadSectorsLong";
const char MetricsDaemon::kMetricWriteSectorsLongName[] =
"Platform.WriteSectorsLong";
const char MetricsDaemon::kMetricReadSectorsShortName[] =
"Platform.ReadSectorsShort";
const char MetricsDaemon::kMetricWriteSectorsShortName[] =
"Platform.WriteSectorsShort";
// Lengths of the short and long stats sampling intervals, and the period of
// the meminfo callback.
const int MetricsDaemon::kMetricStatsShortInterval = 1; // seconds
const int MetricsDaemon::kMetricStatsLongInterval = 30; // seconds
const int MetricsDaemon::kMetricMeminfoInterval = 30; // seconds
// Assume a max rate of about 250 MB/s for reads (worse for writes) and
// 512-byte sectors.
const int MetricsDaemon::kMetricSectorsIOMax = 500000; // sectors/second
const int MetricsDaemon::kMetricSectorsBuckets = 50; // buckets
// Page size is 4k, sector size is 0.5k. We're not interested in page fault
// rates that the disk cannot sustain.
const int MetricsDaemon::kMetricPageFaultsMax = kMetricSectorsIOMax / 8;
const int MetricsDaemon::kMetricPageFaultsBuckets = 50;
// Major page faults, i.e. the ones that require data to be read from disk.
const char MetricsDaemon::kMetricPageFaultsLongName[] =
"Platform.PageFaultsLong";
const char MetricsDaemon::kMetricPageFaultsShortName[] =
"Platform.PageFaultsShort";
// Swap in and swap out rates.
const char MetricsDaemon::kMetricSwapInLongName[] =
"Platform.SwapInLong";
const char MetricsDaemon::kMetricSwapInShortName[] =
"Platform.SwapInShort";
const char MetricsDaemon::kMetricSwapOutLongName[] =
"Platform.SwapOutLong";
const char MetricsDaemon::kMetricSwapOutShortName[] =
"Platform.SwapOutShort";
// File read by GetIncrementalCpuUse(), and the number of whitespace-separated
// items its first line must contain to be accepted.
const char MetricsDaemon::kMetricsProcStatFileName[] = "/proc/stat";
const int MetricsDaemon::kMetricsProcStatFirstLineItemsCount = 11;
// Thermal CPU throttling.
const char MetricsDaemon::kMetricScaledCpuFrequencyName[] =
"Platform.CpuFrequencyThermalScaling";
// Zram sysfs entries. These names are appended to the zram directory path
// in ReportZram().
const char MetricsDaemon::kComprDataSizeName[] = "compr_data_size";
const char MetricsDaemon::kOrigDataSizeName[] = "orig_data_size";
const char MetricsDaemon::kZeroPagesName[] = "zero_pages";
// Memory use stats collection intervals. We collect memory use stats at
// these intervals after boot, and we stop collecting after the last one,
// with the assumption that in most cases the memory use won't change much
// after that. Each entry is a delay in seconds; the trailing comments give
// the cumulative time reached when the entries are applied back to back.
static const int kMemuseIntervals[] = {
1 * kSecondsPerMinute, // 1 minute mark
4 * kSecondsPerMinute, // 5 minute mark
25 * kSecondsPerMinute, // 0.5 hour mark
120 * kSecondsPerMinute, // 2.5 hour mark
600 * kSecondsPerMinute, // 12.5 hour mark
};
// Constructs a daemon with no periodic update timer registered (-1 id), all
// counters and timestamps zeroed, and the stats state machine in the short
// state. Members not listed here (for example those assigned in Init()) are
// not initialized by this constructor.
MetricsDaemon::MetricsDaemon()
: update_stats_timeout_id_(-1),
memuse_final_time_(0),
memuse_interval_index_(0),
read_sectors_(0),
write_sectors_(0),
vmstats_(),
stats_state_(kStatsShort),
stats_initial_time_(0),
ticks_per_second_(0),
latest_cpu_use_ticks_(0) {}
MetricsDaemon::~MetricsDaemon() {
  // The id stays at -1 unless Init() registered the periodic UpdateStats
  // timer; only a registered timer needs to be removed.
  const bool timer_registered = update_stats_timeout_id_ > -1;
  if (timer_registered) {
    g_source_remove(update_stats_timeout_id_);
  }
}
// Returns the current CLOCK_MONOTONIC reading as fractional seconds, or 0
// (after logging a warning) when the clock cannot be read.
double MetricsDaemon::GetActiveTime() {
  struct timespec now;
  if (clock_gettime(CLOCK_MONOTONIC, &now) < 0) {
    PLOG(WARNING) << "clock_gettime(CLOCK_MONOTONIC) failed";
    return 0;
  }
  const double nanoseconds_per_second = 1000 * 1000 * 1000;
  const double fraction =
      static_cast<double>(now.tv_nsec) / nanoseconds_per_second;
  return now.tv_sec + fraction;
}
void MetricsDaemon::Run(bool run_as_daemon) {
base::AtExitManager at_exit_manager;
if (run_as_daemon && daemon(0, 0) != 0)
return;
if (CheckSystemCrash(kKernelCrashDetectedFile)) {
ProcessKernelCrash();
}
if (CheckSystemCrash(kUncleanShutdownDetectedFile)) {
ProcessUncleanShutdown();
}
// On OS version change, clear version stats (which are reported daily).
int32 version = GetOsVersionHash();
if (version_cycle_->Get() != version) {
version_cycle_->Set(version);
kernel_crashes_version_count_->Set(0);
version_cumulative_active_use_->Set(0);
version_cumulative_cpu_use_->Set(0);
}
Loop();
}
// Returns a hash of the CHROMEOS_RELEASE_VERSION lsb-release value. The
// result is computed on the first call and cached in function-local statics.
// When the value is missing, testing mode yields a fixed placeholder and
// non-testing mode logs a fatal error.
uint32 MetricsDaemon::GetOsVersionHash() {
  static bool hash_is_ready = false;
  static uint32 hash_value = 0;

  if (!hash_is_ready) {
    hash_is_ready = true;
    std::string release_version;
    const bool have_version = base::SysInfo::GetLsbReleaseValue(
        "CHROMEOS_RELEASE_VERSION", &release_version);
    if (have_version) {
      hash_value = base::Hash(release_version);
    } else if (testing_) {
      hash_value = 42;  // return any plausible value for the hash
    } else {
      LOG(FATAL) << "could not find CHROMEOS_RELEASE_VERSION";
    }
  }
  return hash_value;
}
// Initializes the daemon: stores the metrics library and the stat file
// paths, creates all persistent counters, and starts the disk/vm stats,
// meminfo and memuse collection schedules. Outside of testing mode it also
// connects to the system D-Bus, registers a match and filter for user crash
// signals, and installs the periodic UpdateStats() timer.
//
// |testing|: when true, D-Bus and GLib setup is skipped (and the Schedule*
//     helpers do not register timers).
// |metrics_lib|: library used to send samples; must not be NULL.
// |diskstats_path|, |vmstats_path|: files read for disk and vm statistics.
// |scaling_max_freq_path|, |cpuinfo_max_freq_path|: sysfs files read by
//     SendCpuThrottleMetrics().
void MetricsDaemon::Init(bool testing, MetricsLibraryInterface* metrics_lib,
const string& diskstats_path,
const string& vmstats_path,
const string& scaling_max_freq_path,
const string& cpuinfo_max_freq_path) {
testing_ = testing;
DCHECK(metrics_lib != NULL);
metrics_lib_ = metrics_lib;
// Get ticks per second (HZ) on this system.
// Sysconf cannot fail, so no sanity checks are needed.
ticks_per_second_ = sysconf(_SC_CLK_TCK);
// Persistent counters. The string passed to each PersistentInteger is its
// name; Name() is later used as the histogram name when a counter is sent.
daily_active_use_.reset(
new PersistentInteger("Logging.DailyUseTime"));
version_cumulative_active_use_.reset(
new PersistentInteger("Logging.CumulativeDailyUseTime"));
version_cumulative_cpu_use_.reset(
new PersistentInteger("Logging.CumulativeCpuTime"));
kernel_crash_interval_.reset(
new PersistentInteger("Logging.KernelCrashInterval"));
unclean_shutdown_interval_.reset(
new PersistentInteger("Logging.UncleanShutdownInterval"));
user_crash_interval_.reset(
new PersistentInteger("Logging.UserCrashInterval"));
any_crashes_daily_count_.reset(
new PersistentInteger("Logging.AnyCrashesDaily"));
any_crashes_weekly_count_.reset(
new PersistentInteger("Logging.AnyCrashesWeekly"));
user_crashes_daily_count_.reset(
new PersistentInteger("Logging.UserCrashesDaily"));
user_crashes_weekly_count_.reset(
new PersistentInteger("Logging.UserCrashesWeekly"));
kernel_crashes_daily_count_.reset(
new PersistentInteger("Logging.KernelCrashesDaily"));
kernel_crashes_weekly_count_.reset(
new PersistentInteger("Logging.KernelCrashesWeekly"));
kernel_crashes_version_count_.reset(
new PersistentInteger("Logging.KernelCrashesSinceUpdate"));
unclean_shutdowns_daily_count_.reset(
new PersistentInteger("Logging.UncleanShutdownsDaily"));
unclean_shutdowns_weekly_count_.reset(
new PersistentInteger("Logging.UncleanShutdownsWeekly"));
// Cycle markers: the last day, week and OS version hash that were seen.
daily_cycle_.reset(new PersistentInteger("daily.cycle"));
weekly_cycle_.reset(new PersistentInteger("weekly.cycle"));
version_cycle_.reset(new PersistentInteger("version.cycle"));
diskstats_path_ = diskstats_path;
vmstats_path_ = vmstats_path;
scaling_max_freq_path_ = scaling_max_freq_path;
cpuinfo_max_freq_path_ = cpuinfo_max_freq_path;
// Takes the baseline disk/vm readings and schedules the first stats callback.
StatsReporterInit();
// Start collecting meminfo stats.
ScheduleMeminfoCallback(kMetricMeminfoInterval);
// Start the memuse schedule with the first interval.
memuse_final_time_ = GetActiveTime() + kMemuseIntervals[0];
ScheduleMemuseCallback(kMemuseIntervals[0]);
// Don't setup D-Bus and GLib in test mode.
if (testing)
return;
g_type_init();
dbus_threads_init_default();
DBusError error;
dbus_error_init(&error);
DBusConnection* connection = dbus_bus_get(DBUS_BUS_SYSTEM, &error);
LOG_IF(FATAL, dbus_error_is_set(&error)) <<
"No D-Bus connection: " << SAFE_MESSAGE(error);
dbus_connection_setup_with_g_main(connection, NULL);
vector<string> matches;
matches.push_back(
base::StringPrintf("type='signal',interface='%s',path='/',member='%s'",
kCrashReporterInterface,
kCrashReporterUserCrashSignal));
// Registers D-Bus matches for the signals we would like to catch.
for (vector<string>::const_iterator it = matches.begin();
it != matches.end(); ++it) {
const char* match = it->c_str();
DLOG(INFO) << "adding dbus match: " << match;
dbus_bus_add_match(connection, match, &error);
LOG_IF(FATAL, dbus_error_is_set(&error)) <<
"unable to add a match: " << SAFE_MESSAGE(error);
}
// Adds the D-Bus filter routine to be called back whenever one of
// the registered D-Bus matches is successful. The daemon is not
// activated for D-Bus messages that don't match.
CHECK(dbus_connection_add_filter(connection, MessageFilter, this, NULL));
// The id is kept so that the destructor can remove the timer.
update_stats_timeout_id_ =
g_timeout_add(kUpdateStatsIntervalMs, &HandleUpdateStatsTimeout, this);
}
void MetricsDaemon::Loop() {
GMainLoop* loop = g_main_loop_new(NULL, false);
g_main_loop_run(loop);
}
// static
// D-Bus filter callback. |user_data| is the MetricsDaemon that registered
// the filter. Signals on the crash reporter interface are counted as user
// crashes and reported as handled; everything else is left for other
// handlers.
DBusHandlerResult MetricsDaemon::MessageFilter(DBusConnection* connection,
                                               DBusMessage* message,
                                               void* user_data) {
  const int type = dbus_message_get_type(message);
  if (type != DBUS_MESSAGE_TYPE_SIGNAL) {
    DLOG(WARNING) << "unexpected message type " << type;
    return DBUS_HANDLER_RESULT_NOT_YET_HANDLED;
  }

  // Signal messages always have interfaces.
  const std::string signal_interface(dbus_message_get_interface(message));
  const std::string signal_member(dbus_message_get_member(message));
  DLOG(INFO) << "Got " << signal_interface << "." << signal_member
             << " D-Bus signal";

  MetricsDaemon* self = static_cast<MetricsDaemon*>(user_data);

  DBusMessageIter args;
  dbus_message_iter_init(message, &args);

  if (signal_interface != kCrashReporterInterface) {
    // Ignore messages from the bus itself.
    return DBUS_HANDLER_RESULT_NOT_YET_HANDLED;
  }

  CHECK_EQ(signal_member, kCrashReporterUserCrashSignal);
  self->ProcessUserCrash();
  return DBUS_HANDLER_RESULT_HANDLED;
}
// One might argue that parts of this should go into
// chromium/src/base/sys_info_chromeos.c instead, but put it here for now.
//
// Returns the CPU time (user + nice + system ticks from the first line of
// the proc stat file) consumed since the previous call, and remembers the
// new total. Returns a zero delta when the file cannot be read or parsed, or
// when the total went backwards.
TimeDelta MetricsDaemon::GetIncrementalCpuUse() {
  const FilePath stat_path(kMetricsProcStatFileName);
  std::string stat_contents;
  if (!base::ReadFileToString(stat_path, &stat_contents)) {
    LOG(WARNING) << "cannot open " << kMetricsProcStatFileName;
    return TimeDelta();
  }

  std::vector<std::string> lines;
  base::SplitString(stat_contents, '\n', &lines);
  if (lines.empty()) {
    LOG(WARNING) << "cannot parse " << kMetricsProcStatFileName
                 << ": " << stat_contents;
    return TimeDelta();
  }

  const std::string& first_line = lines[0];
  std::vector<std::string> fields;
  base::SplitStringAlongWhitespace(first_line, &fields);

  uint64 user_ticks, user_nice_ticks, system_ticks;
  // Short-circuit evaluation keeps the indexing below within bounds.
  const bool parsed =
      fields.size() == kMetricsProcStatFirstLineItemsCount &&
      fields[0] == "cpu" &&
      base::StringToUint64(fields[1], &user_ticks) &&
      base::StringToUint64(fields[2], &user_nice_ticks) &&
      base::StringToUint64(fields[3], &system_ticks);
  if (!parsed) {
    LOG(WARNING) << "cannot parse first line: " << first_line;
    return TimeDelta(base::TimeDelta::FromSeconds(0));
  }

  const uint64 ticks_now = user_ticks + user_nice_ticks + system_ticks;

  // Sanity check.
  if (ticks_now < latest_cpu_use_ticks_) {
    LOG(WARNING) << "CPU time decreasing from " << latest_cpu_use_ticks_
                 << " to " << ticks_now;
    return TimeDelta();
  }

  const uint64 elapsed_ticks = ticks_now - latest_cpu_use_ticks_;
  latest_cpu_use_ticks_ = ticks_now;
  // Use microseconds to avoid significant truncations.
  return base::TimeDelta::FromMicroseconds(
      elapsed_ticks * 1000 * 1000 / ticks_per_second_);
}
void MetricsDaemon::ProcessUserCrash() {
// Counts the active time up to now.
UpdateStats(TimeTicks::Now(), Time::Now());
// Reports the active use time since the last crash and resets it.
SendCrashIntervalSample(user_crash_interval_);
any_crashes_daily_count_->Add(1);
any_crashes_weekly_count_->Add(1);
user_crashes_daily_count_->Add(1);
user_crashes_weekly_count_->Add(1);
}
void MetricsDaemon::ProcessKernelCrash() {
// Counts the active time up to now.
UpdateStats(TimeTicks::Now(), Time::Now());
// Reports the active use time since the last crash and resets it.
SendCrashIntervalSample(kernel_crash_interval_);
any_crashes_daily_count_->Add(1);
any_crashes_weekly_count_->Add(1);
kernel_crashes_daily_count_->Add(1);
kernel_crashes_weekly_count_->Add(1);
kernel_crashes_version_count_->Add(1);
}
void MetricsDaemon::ProcessUncleanShutdown() {
// Counts the active time up to now.
UpdateStats(TimeTicks::Now(), Time::Now());
// Reports the active use time since the last crash and resets it.
SendCrashIntervalSample(unclean_shutdown_interval_);
unclean_shutdowns_daily_count_->Add(1);
unclean_shutdowns_weekly_count_->Add(1);
any_crashes_daily_count_->Add(1);
any_crashes_weekly_count_->Add(1);
}
bool MetricsDaemon::CheckSystemCrash(const string& crash_file) {
FilePath crash_detected(crash_file);
if (!base::PathExists(crash_detected))
return false;
// Deletes the crash-detected file so that the daemon doesn't report
// another kernel crash in case it's restarted.
base::DeleteFile(crash_detected, false); // not recursive
return true;
}
// Takes the baseline disk and vm readings, records the start time of the
// first measurement cycle, and schedules the first stats callback.
void MetricsDaemon::StatsReporterInit() {
  DiskStatsReadStats(&read_sectors_, &write_sectors_);
  VmStatsReadStats(&vmstats_);

  // The first time around just run the long stat, so we don't delay boot.
  stats_state_ = kStatsLong;
  stats_initial_time_ = GetActiveTime();

  const bool time_is_invalid = stats_initial_time_ < 0;
  if (!time_is_invalid) {
    ScheduleStatsCallback(kMetricStatsLongInterval);
    return;
  }
  LOG(WARNING) << "not collecting disk stats";
}
// Arranges for StatsCallbackStatic() to run after |wait| seconds. Does
// nothing in testing mode.
void MetricsDaemon::ScheduleStatsCallback(int wait) {
  if (!testing_) {
    g_timeout_add_seconds(wait, StatsCallbackStatic, this);
  }
}
// Reads the cumulative sector counts from the file at |diskstats_path_|.
//
// The third and seventh whitespace-separated integers of the file are stored
// in |read_sectors| and |write_sectors| respectively. Returns true only when
// both values were parsed. Returns false, after logging a warning where
// applicable, when the path is empty or the file cannot be opened, read or
// parsed.
bool MetricsDaemon::DiskStatsReadStats(long int* read_sectors,
                                       long int* write_sectors) {
  if (diskstats_path_.empty()) {
    return false;
  }

  int file = HANDLE_EINTR(open(diskstats_path_.c_str(), O_RDONLY));
  if (file < 0) {
    PLOG(WARNING) << "cannot open " << diskstats_path_;
    return false;
  }

  char line[200];
  // Read at most sizeof(line) - 1 bytes so that the terminating NUL written
  // below always lands inside the buffer.
  const int max_chars = sizeof(line) - 1;
  const int nchars = HANDLE_EINTR(read(file, line, max_chars));

  bool success = false;
  if (nchars < 0) {
    // Log before close() so that errno still describes the read failure.
    PLOG(WARNING) << "cannot read from " << diskstats_path_;
  } else {
    LOG_IF(WARNING, nchars == max_chars)
        << "line too long in " << diskstats_path_;
    line[nchars] = '\0';
    const int nitems = sscanf(line, "%*d %*d %ld %*d %*d %*d %ld",
                              read_sectors, write_sectors);
    if (nitems == 2) {
      success = true;
    } else {
      LOG(WARNING) << "found " << nitems << " items in "
                   << diskstats_path_ << ", expected 2";
    }
  }

  // Every path that opened the descriptor closes it, including read errors.
  IGNORE_EINTR(close(file));
  return success;
}
// Extracts the pgmajfault, pswpin and pswpout counters from |stats| into
// |record|.
//
// Each line of |stats| is expected to have the form
//   <ID> <VALUE>
// for instance:
//   nr_free_pages 213427
// Lines with a different shape are logged and skipped. Returns false when a
// wanted value is not a valid unsigned integer or when any of the three
// counters is absent.
bool MetricsDaemon::VmStatsParseStats(const char* stats,
                                      struct VmstatRecord* record) {
  // Wanted counter name, destination field in |record|, and whether the
  // counter has been seen.
  struct mapping {
    const string name;
    uint64_t* value_p;
    bool found;
  } wanted[] = {
    { "pgmajfault", &record->page_faults_, false },
    { "pswpin", &record->swap_in_, false },
    { "pswpout", &record->swap_out_, false },
  };
  const size_t wanted_count = sizeof(wanted) / sizeof(wanted[0]);

  vector<string> lines;
  Tokenize(stats, "\n", &lines);
  for (size_t line_index = 0; line_index < lines.size(); ++line_index) {
    vector<string> tokens;
    base::SplitString(lines[line_index], ' ', &tokens);
    if (tokens.size() != 2) {
      LOG(WARNING) << "unexpected vmstat format";
      continue;
    }
    for (size_t i = 0; i < wanted_count; ++i) {
      if (tokens[0] != wanted[i].name)
        continue;
      if (!base::StringToUint64(tokens[1], wanted[i].value_p))
        return false;
      wanted[i].found = true;
    }
  }

  // Make sure we got all the stats.
  for (size_t i = 0; i < wanted_count; ++i) {
    if (!wanted[i].found) {
      LOG(WARNING) << "vmstat missing " << wanted[i].name;
      return false;
    }
  }
  return true;
}
bool MetricsDaemon::VmStatsReadStats(struct VmstatRecord* stats) {
string value_string;
FilePath* path = new FilePath(vmstats_path_);
if (!base::ReadFileToString(*path, &value_string)) {
delete path;
LOG(WARNING) << "cannot read " << vmstats_path_;
return false;
}
delete path;
return VmStatsParseStats(value_string.c_str(), stats);
}
bool MetricsDaemon::ReadFreqToInt(const string& sysfs_file_name, int* value) {
const FilePath sysfs_path(sysfs_file_name);
string value_string;
if (!base::ReadFileToString(sysfs_path, &value_string)) {
LOG(WARNING) << "cannot read " << sysfs_path.value().c_str();
return false;
}
if (!base::RemoveChars(value_string, "\n", &value_string)) {
LOG(WARNING) << "no newline in " << value_string;
// Continue even though the lack of newline is suspicious.
}
if (!base::StringToInt(value_string, value)) {
LOG(WARNING) << "cannot convert " << value_string << " to int";
return false;
}
return true;
}
// Sends the current scaling max CPU frequency as a percentage of the
// maximum (non-turbo) frequency. 101 is sent when the scaling frequency
// exceeds the maximum, i.e. turbo is on.
//
// The maximum frequency is read once and cached in a function-local static
// (it is re-read on every call in testing mode). Once it has been found
// unusable the function gives up permanently.
void MetricsDaemon::SendCpuThrottleMetrics() {
  // |max_freq| is 0 only the first time through.
  static int max_freq = 0;
  if (max_freq == -1)
    // Give up, as sysfs did not report max_freq correctly.
    return;
  if (max_freq == 0 || testing_) {
    // One-time initialization of max_freq.  (Every time when testing.)
    if (!ReadFreqToInt(cpuinfo_max_freq_path_, &max_freq)) {
      max_freq = -1;
      return;
    }
    if (max_freq == 0) {
      LOG(WARNING) << "sysfs reports 0 max CPU frequency\n";
      max_freq = -1;
      return;
    }
    if (max_freq % 10000 == 1000) {
      // Special case: system has turbo mode, and max non-turbo frequency is
      // max_freq - 1000.  This relies on "normal" (non-turbo) frequencies
      // being multiples of (at least) 10 MHz.  Although there is no guarantee
      // of this, it seems a fairly reasonable assumption.  Otherwise we should
      // read scaling_available_frequencies, sort the frequencies, compare the
      // two highest ones, and check if they differ by 1000 (kHz) (and that's a
      // hack too, no telling when it will change).
      max_freq -= 1000;
    }
    if (max_freq < 100) {
      // The percentage below divides by max_freq / 100, which is zero (or
      // negative) for such values; treat them as a bad sysfs report rather
      // than dividing by zero.
      LOG(WARNING) << "implausible max CPU frequency " << max_freq;
      max_freq = -1;
      return;
    }
  }
  int scaled_freq = 0;
  if (!ReadFreqToInt(scaling_max_freq_path_, &scaled_freq))
    return;
  // Frequencies are in kHz.  If scaled_freq > max_freq, turbo is on, but
  // scaled_freq is not the actual turbo frequency.  We indicate this situation
  // with a 101% value.
  int percent = scaled_freq > max_freq ? 101 : scaled_freq / (max_freq / 100);
  SendLinearSample(kMetricScaledCpuFrequencyName, percent, 101, 102);
}
// static
// GLib timeout trampoline: |handle| is the MetricsDaemon whose
// StatsCallback() should run.
gboolean MetricsDaemon::StatsCallbackStatic(void* handle) {
  MetricsDaemon* self = static_cast<MetricsDaemon*>(handle);
  self->StatsCallback();
  return false;  // one-time callback
}
// Collects disk and vm stats alternating over a short and a long interval.
//
// Each call reads the current disk and vm counters, converts the change
// since the saved baseline into per-second rates, and sends them under the
// short or long histogram names depending on |stats_state_|. The long state
// additionally refreshes the baselines, sends the CPU throttle metric and
// starts a new cycle. Each state schedules the callback for the other one.
void MetricsDaemon::StatsCallback() {
long int read_sectors_now, write_sectors_now;
struct VmstatRecord vmstats_now;
double time_now = GetActiveTime();
// Time elapsed since the start of the current measurement cycle.
double delta_time = time_now - stats_initial_time_;
if (testing_) {
// Fake the time when testing.
delta_time = stats_state_ == kStatsShort ?
kMetricStatsShortInterval : kMetricStatsLongInterval;
}
// The rates below are computed even when a read fails, but they are only
// sent when the corresponding *_success flag is true.
bool diskstats_success = DiskStatsReadStats(&read_sectors_now,
&write_sectors_now);
int delta_read = read_sectors_now - read_sectors_;
int delta_write = write_sectors_now - write_sectors_;
int read_sectors_per_second = delta_read / delta_time;
int write_sectors_per_second = delta_write / delta_time;
bool vmstats_success = VmStatsReadStats(&vmstats_now);
uint64_t delta_faults = vmstats_now.page_faults_ - vmstats_.page_faults_;
uint64_t delta_swap_in = vmstats_now.swap_in_ - vmstats_.swap_in_;
uint64_t delta_swap_out = vmstats_now.swap_out_ - vmstats_.swap_out_;
uint64_t page_faults_per_second = delta_faults / delta_time;
uint64_t swap_in_per_second = delta_swap_in / delta_time;
uint64_t swap_out_per_second = delta_swap_out / delta_time;
switch (stats_state_) {
case kStatsShort:
// Short interval: report the rates but keep the baselines, so that the
// following long sample covers the whole cycle.
if (diskstats_success) {
SendSample(kMetricReadSectorsShortName,
read_sectors_per_second,
1,
kMetricSectorsIOMax,
kMetricSectorsBuckets);
SendSample(kMetricWriteSectorsShortName,
write_sectors_per_second,
1,
kMetricSectorsIOMax,
kMetricSectorsBuckets);
}
if (vmstats_success) {
SendSample(kMetricPageFaultsShortName,
page_faults_per_second,
1,
kMetricPageFaultsMax,
kMetricPageFaultsBuckets);
SendSample(kMetricSwapInShortName,
swap_in_per_second,
1,
kMetricPageFaultsMax,
kMetricPageFaultsBuckets);
SendSample(kMetricSwapOutShortName,
swap_out_per_second,
1,
kMetricPageFaultsMax,
kMetricPageFaultsBuckets);
}
// Schedule long callback.
stats_state_ = kStatsLong;
ScheduleStatsCallback(kMetricStatsLongInterval -
kMetricStatsShortInterval);
break;
case kStatsLong:
// Long interval: report the rates and make the current readings the new
// baselines.
if (diskstats_success) {
SendSample(kMetricReadSectorsLongName,
read_sectors_per_second,
1,
kMetricSectorsIOMax,
kMetricSectorsBuckets);
SendSample(kMetricWriteSectorsLongName,
write_sectors_per_second,
1,
kMetricSectorsIOMax,
kMetricSectorsBuckets);
// Reset sector counters.
read_sectors_ = read_sectors_now;
write_sectors_ = write_sectors_now;
}
if (vmstats_success) {
SendSample(kMetricPageFaultsLongName,
page_faults_per_second,
1,
kMetricPageFaultsMax,
kMetricPageFaultsBuckets);
SendSample(kMetricSwapInLongName,
swap_in_per_second,
1,
kMetricPageFaultsMax,
kMetricPageFaultsBuckets);
SendSample(kMetricSwapOutLongName,
swap_out_per_second,
1,
kMetricPageFaultsMax,
kMetricPageFaultsBuckets);
vmstats_ = vmstats_now;
}
SendCpuThrottleMetrics();
// Set start time for new cycle.
stats_initial_time_ = time_now;
// Schedule short callback.
stats_state_ = kStatsShort;
ScheduleStatsCallback(kMetricStatsShortInterval);
break;
default:
LOG(FATAL) << "Invalid stats state";
}
}
// Arranges for MeminfoCallbackStatic() to run every |wait| seconds for as
// long as it returns true. Does nothing in testing mode.
void MetricsDaemon::ScheduleMeminfoCallback(int wait) {
  if (!testing_) {
    g_timeout_add_seconds(wait, MeminfoCallbackStatic, this);
  }
}
// static
// GLib timeout trampoline: |handle| is the MetricsDaemon whose
// MeminfoCallback() should run; its result is returned to GLib.
gboolean MetricsDaemon::MeminfoCallbackStatic(void* handle) {
  MetricsDaemon* self = static_cast<MetricsDaemon*>(handle);
  return self->MeminfoCallback();
}
bool MetricsDaemon::MeminfoCallback() {
string meminfo_raw;
const FilePath meminfo_path("/proc/meminfo");
if (!base::ReadFileToString(meminfo_path, &meminfo_raw)) {
LOG(WARNING) << "cannot read " << meminfo_path.value().c_str();
return false;
}
// Make both calls even if the first one fails.
bool success = ProcessMeminfo(meminfo_raw);
return ReportZram(base::FilePath(FILE_PATH_LITERAL("/sys/block/zram0"))) &&
success;
}
// static
// Reads the file at |path| and parses its content as an unsigned 64-bit
// integer stored in |value|. Newline characters are removed before parsing.
// Returns false, after logging a warning, when the file cannot be read or
// does not contain a valid integer.
bool MetricsDaemon::ReadFileToUint64(const base::FilePath& path,
                                     uint64* value) {
  std::string content;
  if (!base::ReadFileToString(path, &content)) {
    PLOG(WARNING) << "cannot read " << path.MaybeAsASCII();
    return false;
  }
  // Strip the newline that terminates the value, as ReadFreqToInt() does.
  // NOTE(review): base::StringToUint64() is understood to report failure
  // when characters follow the number; confirm against the libchrome
  // version in use.
  base::RemoveChars(content, "\n", &content);
  if (!base::StringToUint64(content, value)) {
    LOG(WARNING) << "invalid integer: " << content;
    return false;
  }
  return true;
}
// Reads the zram counters found under |zram_dir| and sends the compressed
// size, the savings, the compression ratio, the zero-page count and the
// zero-page ratio. Returns false when any of the three counter files cannot
// be read or parsed, true otherwise.
bool MetricsDaemon::ReportZram(const base::FilePath& zram_dir) {
  // Data sizes are in bytes.  |zero_pages| is in number of pages.
  uint64 compr_data_size, orig_data_size, zero_pages;
  const size_t page_size = 4096;

  if (!ReadFileToUint64(zram_dir.Append(kComprDataSizeName),
                        &compr_data_size) ||
      !ReadFileToUint64(zram_dir.Append(kOrigDataSizeName), &orig_data_size) ||
      !ReadFileToUint64(zram_dir.Append(kZeroPagesName), &zero_pages)) {
    return false;
  }

  // |orig_data_size| does not include zero-filled pages.
  orig_data_size += zero_pages * page_size;

  const int compr_data_size_mb = compr_data_size >> 20;
  const int savings_mb = (orig_data_size - compr_data_size) >> 20;

  // Report compressed size in megabytes.  100 MB or less has little impact.
  SendSample("Platform.ZramCompressedSize", compr_data_size_mb, 100, 4000, 50);
  SendSample("Platform.ZramSavings", savings_mb, 100, 4000, 50);
  // The compression ratio is multiplied by 100 for better resolution.  The
  // ratios of interest are between 1 and 6 (100% and 600% as reported).  We
  // don't want samples when very little memory is being compressed.
  if (compr_data_size_mb >= 1) {
    SendSample("Platform.ZramCompressionRatioPercent",
               orig_data_size * 100 / compr_data_size, 100, 600, 50);
  }
  // The values of interest for zero_pages are between 1MB and 1GB.  The units
  // are number of pages.
  SendSample("Platform.ZramZeroPages", zero_pages, 256, 256 * 1024, 50);
  // When nothing is stored (no data and no zero pages) the ratio is
  // undefined; skip the sample instead of dividing by zero.
  if (orig_data_size > 0) {
    const int zero_ratio_percent =
        zero_pages * page_size * 100 / orig_data_size;
    SendSample("Platform.ZramZeroRatioPercent", zero_ratio_percent, 1, 50, 50);
  }
  return true;
}
bool MetricsDaemon::ProcessMeminfo(const string& meminfo_raw) {
static const MeminfoRecord fields_array[] = {
{ "MemTotal", "MemTotal" }, // SPECIAL CASE: total system memory
{ "MemFree", "MemFree" },
{ "Buffers", "Buffers" },
{ "Cached", "Cached" },
// { "SwapCached", "SwapCached" },
{ "Active", "Active" },
{ "Inactive", "Inactive" },
{ "ActiveAnon", "Active(anon)" },
{ "InactiveAnon", "Inactive(anon)" },
{ "ActiveFile" , "Active(file)" },
{ "InactiveFile", "Inactive(file)" },
{ "Unevictable", "Unevictable", kMeminfoOp_HistLog },
// { "Mlocked", "Mlocked" },
{ "SwapTotal", "SwapTotal", kMeminfoOp_SwapTotal },
{ "SwapFree", "SwapFree", kMeminfoOp_SwapFree },
// { "Dirty", "Dirty" },
// { "Writeback", "Writeback" },
{ "AnonPages", "AnonPages" },
{ "Mapped", "Mapped" },
{ "Shmem", "Shmem", kMeminfoOp_HistLog },
{ "Slab", "Slab", kMeminfoOp_HistLog },
// { "SReclaimable", "SReclaimable" },
// { "SUnreclaim", "SUnreclaim" },
};
vector<MeminfoRecord> fields(fields_array,
fields_array + arraysize(fields_array));
if (!FillMeminfo(meminfo_raw, &fields)) {
return false;
}
int total_memory = fields[0].value;
if (total_memory == 0) {
// this "cannot happen"
LOG(WARNING) << "borked meminfo parser";
return false;
}
int swap_total = 0;
int swap_free = 0;
// Send all fields retrieved, except total memory.
for (unsigned int i = 1; i < fields.size(); i++) {
string metrics_name = base::StringPrintf("Platform.Meminfo%s",
fields[i].name);
int percent;
switch (fields[i].op) {
case kMeminfoOp_HistPercent:
// report value as percent of total memory
percent = fields[i].value * 100 / total_memory;
SendLinearSample(metrics_name, percent, 100, 101);
break;
case kMeminfoOp_HistLog:
// report value in kbytes, log scale, 4Gb max
SendSample(metrics_name, fields[i].value, 1, 4 * 1000 * 1000, 100);
break;
case kMeminfoOp_SwapTotal:
swap_total = fields[i].value;
case kMeminfoOp_SwapFree:
swap_free = fields[i].value;
break;
}
}
if (swap_total > 0) {
int swap_used = swap_total - swap_free;
int swap_used_percent = swap_used * 100 / swap_total;
SendSample("Platform.MeminfoSwapUsed", swap_used, 1, 8 * 1000 * 1000, 100);
SendLinearSample("Platform.MeminfoSwapUsedPercent", swap_used_percent,
100, 101);
}
return true;
}
// Fills in the |value| of every record in |fields| from |meminfo_raw|.
//
// The input is scanned once, line by line, so |fields| must list its entries
// in the order in which they occur in the input. A line matches a record
// when its first token (tokens are separated by ':' and ' ') is exactly
// equal to the record's |match| string. Returns false, after logging a
// warning, when a matching line has no value or a non-numeric value, or when
// the input ends before every field has been found.
bool MetricsDaemon::FillMeminfo(const string& meminfo_raw,
                                vector<MeminfoRecord>* fields) {
  vector<string> lines;
  unsigned int nlines = Tokenize(meminfo_raw, "\n", &lines);

  // Scan meminfo output and collect field values.
  unsigned int ifield = 0;
  for (unsigned int iline = 0;
       iline < nlines && ifield < fields->size();
       iline++) {
    vector<string> tokens;
    Tokenize(lines[iline], ": ", &tokens);
    if (tokens.empty()) {
      // The line holds only separator characters; nothing to match.
      continue;
    }
    if (strcmp((*fields)[ifield].match, tokens[0].c_str()) != 0) {
      continue;
    }
    // Name matches.  Parse value and save.
    if (tokens.size() < 2) {
      LOG(WARNING) << "missing meminfo value";
      return false;
    }
    char* rest;
    (*fields)[ifield].value =
        static_cast<int>(strtol(tokens[1].c_str(), &rest, 10));
    if (*rest != '\0') {
      LOG(WARNING) << "missing meminfo value";
      return false;
    }
    ifield++;
  }
  if (ifield < fields->size()) {
    // End of input reached while scanning.
    LOG(WARNING) << "cannot find field " << (*fields)[ifield].match
                 << " and following";
    return false;
  }
  return true;
}
// Arranges for MemuseCallbackStatic() to run after |interval| seconds. Does
// nothing in testing mode.
void MetricsDaemon::ScheduleMemuseCallback(double interval) {
  if (!testing_) {
    g_timeout_add_seconds(interval, MemuseCallbackStatic, this);
  }
}
// static
// GLib timeout trampoline: |handle| is the MetricsDaemon whose
// MemuseCallback() should run. Returns false so the timeout fires only once.
gboolean MetricsDaemon::MemuseCallbackStatic(void* handle) {
  MetricsDaemon* self = static_cast<MetricsDaemon*>(handle);
  self->MemuseCallback();
  return false;
}
// Timer handler for the memuse schedule. If the target time has not been
// reached yet, re-arms the timer for the remainder; otherwise reports the
// memuse stats and, on success and while intervals remain, arms the timer
// for the next interval.
void MetricsDaemon::MemuseCallback() {
  // Since we only care about active time (i.e. uptime minus sleep time) but
  // the callbacks are driven by real time (uptime), we check if we should
  // reschedule this callback due to intervening sleep periods.
  const double now = GetActiveTime();
  // Avoid intervals of less than one second.
  const double seconds_left = ceil(memuse_final_time_ - now);
  if (seconds_left > 0) {
    ScheduleMemuseCallback(seconds_left);
    return;
  }

  // Report stats and advance the measurement interval unless there are
  // errors or we've completed the last interval.
  if (!MemuseCallbackWork())
    return;
  if (memuse_interval_index_ >= arraysize(kMemuseIntervals))
    return;

  const double next_interval = kMemuseIntervals[memuse_interval_index_++];
  memuse_final_time_ = now + next_interval;
  ScheduleMemuseCallback(next_interval);
}
// Reads /proc/meminfo and hands it to ProcessMemuse(). Returns false, after
// logging a warning, when the file cannot be read; otherwise returns the
// result of ProcessMemuse().
bool MetricsDaemon::MemuseCallbackWork() {
  const FilePath meminfo_path("/proc/meminfo");
  string meminfo_raw;
  const bool read_ok = base::ReadFileToString(meminfo_path, &meminfo_raw);
  if (read_ok)
    return ProcessMemuse(meminfo_raw);

  LOG(WARNING) << "cannot read " << meminfo_path.value().c_str();
  return false;
}
// Parses |meminfo_raw| and sends anonymous memory (active + inactive) as a
// percentage of total memory. The histogram name carries the current memuse
// interval index. Returns false when parsing fails or the total is zero.
bool MetricsDaemon::ProcessMemuse(const string& meminfo_raw) {
  static const MeminfoRecord fields_array[] = {
    { "MemTotal", "MemTotal" },  // SPECIAL CASE: total system memory
    { "ActiveAnon", "Active(anon)" },
    { "InactiveAnon", "Inactive(anon)" },
  };
  vector<MeminfoRecord> fields(fields_array,
                               fields_array + arraysize(fields_array));
  if (!FillMeminfo(meminfo_raw, &fields))
    return false;

  const int total = fields[0].value;
  const int active_anon = fields[1].value;
  const int inactive_anon = fields[2].value;
  if (total == 0) {
    // this "cannot happen"
    LOG(WARNING) << "borked meminfo parser";
    return false;
  }

  const string metrics_name =
      base::StringPrintf("Platform.MemuseAnon%d", memuse_interval_index_);
  const int anon_percent = (active_anon + inactive_anon) * 100 / total;
  SendLinearSample(metrics_name, anon_percent, 100, 101);
  return true;
}
// Sends |use_seconds|, rounded to the nearest minute, as the daily use time.
// Non-positive values are ignored.
void MetricsDaemon::ReportDailyUse(int use_seconds) {
  if (use_seconds > 0) {
    const int rounded_minutes =
        (use_seconds + kSecondsPerMinute / 2) / kSecondsPerMinute;
    const int max_minutes =
        kMinutesPerDay * 30 * 2;  // cumulative---two months worth
    SendSample("Logging.DailyUseTime", rounded_minutes, 1, max_minutes, 50);
  }
}
// Forwards a histogram sample to the metrics library's SendToUMA().
// |name| is the histogram name, |sample| the value, |min| and |max| the
// histogram range, and |nbuckets| the number of buckets.
void MetricsDaemon::SendSample(const string& name, int sample,
int min, int max, int nbuckets) {
metrics_lib_->SendToUMA(name, sample, min, max, nbuckets);
}
// Sends the per-OS-version cumulative stats: the kernel crash count, the
// CPU time, the active use time, and the kernel crash frequency per CPU year
// and per year of active use. None of the counters is cleared here.
void MetricsDaemon::SendKernelCrashesCumulativeCountStats() {
  // Report the number of crashes for this OS version, but don't clear the
  // counter.  It is cleared elsewhere on version change.
  int64 crashes_count = kernel_crashes_version_count_->Get();
  SendSample(kernel_crashes_version_count_->Name(),
             crashes_count,
             1,                         // value of first bucket
             500,                       // value of last bucket
             100);                      // number of buckets

  // The CPU use counter is accumulated in milliseconds.
  int64 cpu_use_ms = version_cumulative_cpu_use_->Get();
  SendSample(version_cumulative_cpu_use_->Name(),
             cpu_use_ms / 1000,         // stat is in seconds
             1,                         // device may be used very little...
             8 * 1000 * 1000,           // ... or a lot (a little over 90 days)
             100);

  // On the first run after an autoupdate, cpu_use_ms and active_use_seconds
  // can be zero.  Avoid division by zero.
  if (cpu_use_ms > 0) {
    // Send the crash frequency since update in number of crashes per CPU year.
    SendSample("Logging.KernelCrashesPerCpuYear",
               crashes_count * kSecondsPerDay * 365 * 1000 / cpu_use_ms,
               1,
               1000 * 1000,             // about one crash every 30s of CPU time
               100);
  }

  // The active use counter is accumulated in seconds (see UpdateStats()), so
  // unlike the CPU time above it must not be divided by 1000.
  int64 active_use_seconds = version_cumulative_active_use_->Get();
  if (active_use_seconds > 0) {
    SendSample(version_cumulative_active_use_->Name(),
               active_use_seconds,      // stat is in seconds
               1,                       // device may be used very little...
               8 * 1000 * 1000,         // ... or a lot (about 90 days)
               100);
    // Same as above, but per year of active time.
    SendSample("Logging.KernelCrashesPerActiveYear",
               crashes_count * kSecondsPerDay * 365 / active_use_seconds,
               1,
               1000 * 1000,             // about one crash every 30s of active time
               100);
  }
}
// Sends the current value of the persistent counter |use| under the
// counter's own name and clears the counter. The histogram range is one
// second to one day.
void MetricsDaemon::SendDailyUseSample(
const scoped_ptr<PersistentInteger>& use) {
SendSample(use->Name(),
use->GetAndClear(),
1, // value of first bucket
kSecondsPerDay, // value of last bucket
50); // number of buckets
}
// Sends the current value of the persistent counter |interval| under the
// counter's own name and clears the counter. The histogram range is one
// second to four weeks.
void MetricsDaemon::SendCrashIntervalSample(
const scoped_ptr<PersistentInteger>& interval) {
SendSample(interval->Name(),
interval->GetAndClear(),
1, // value of first bucket
4 * kSecondsPerWeek, // value of last bucket
50); // number of buckets
}
// Sends the current value of the persistent counter |frequency| under the
// counter's own name and clears the counter. The histogram range is 1 to
// 100.
void MetricsDaemon::SendCrashFrequencySample(
const scoped_ptr<PersistentInteger>& frequency) {
SendSample(frequency->Name(),
frequency->GetAndClear(),
1, // value of first bucket
100, // value of last bucket
50); // number of buckets
}
// Sends |sample| to the linear histogram |name| through the metrics
// library's enum API. Only histograms with |nbuckets| == |max| + 1 are
// supported; any other scale is a fatal error.
void MetricsDaemon::SendLinearSample(const string& name, int sample,
                                     int max, int nbuckets) {
  // TODO(semenzato): add a proper linear histogram to the Chrome external
  // metrics API.
  const bool scale_is_supported = (nbuckets == max + 1);
  LOG_IF(FATAL, !scale_is_supported) << "unsupported histogram scale";
  metrics_lib_->SendEnumToUMA(name, sample, max);
}
// Adds the time elapsed since the previous call to the use-time and crash
// interval counters, adds the incremental CPU use, and sends the daily and
// weekly samples when the day or the week has changed.
//
// |now_ticks|: current tick time; the elapsed time is measured against
//     |last_update_stats_time_|.
// |now_wall_time|: current wall-clock time, used to derive the day and week
//     numbers.
void MetricsDaemon::UpdateStats(TimeTicks now_ticks,
Time now_wall_time) {
const int elapsed_seconds = (now_ticks - last_update_stats_time_).InSeconds();
daily_active_use_->Add(elapsed_seconds);
version_cumulative_active_use_->Add(elapsed_seconds);
user_crash_interval_->Add(elapsed_seconds);
kernel_crash_interval_->Add(elapsed_seconds);
// Unlike the counters above, CPU use is accumulated in milliseconds.
version_cumulative_cpu_use_->Add(GetIncrementalCpuUse().InMilliseconds());
last_update_stats_time_ = now_ticks;
// Day and week numbers are counted from the Unix epoch.
const TimeDelta since_epoch = now_wall_time - Time::UnixEpoch();
const int day = since_epoch.InDays();
const int week = day / 7;
// A new day: record it, then send the daily samples (each Send*Sample call
// below also clears the counter it is given).
if (daily_cycle_->Get() != day) {
daily_cycle_->Set(day);
SendDailyUseSample(daily_active_use_);
SendDailyUseSample(version_cumulative_active_use_);
SendCrashFrequencySample(any_crashes_daily_count_);
SendCrashFrequencySample(user_crashes_daily_count_);
SendCrashFrequencySample(kernel_crashes_daily_count_);
SendCrashFrequencySample(unclean_shutdowns_daily_count_);
SendKernelCrashesCumulativeCountStats();
}
// A new week: record it, then send and clear the weekly crash counters.
if (weekly_cycle_->Get() != week) {
weekly_cycle_->Set(week);
SendCrashFrequencySample(any_crashes_weekly_count_);
SendCrashFrequencySample(user_crashes_weekly_count_);
SendCrashFrequencySample(kernel_crashes_weekly_count_);
SendCrashFrequencySample(unclean_shutdowns_weekly_count_);
}
}
// static
// GLib timeout handler: |data| is the MetricsDaemon whose stats should be
// updated with the current times. Returns TRUE so the timeout keeps firing.
gboolean MetricsDaemon::HandleUpdateStatsTimeout(gpointer data) {
  MetricsDaemon* self = static_cast<MetricsDaemon*>(data);
  self->UpdateStats(TimeTicks::Now(), Time::Now());
  return TRUE;
}