Merge "libsnapshot:snapuserd: Batch merge copy operation" am: 7cd1c03178

Original change: https://android-review.googlesource.com/c/platform/system/core/+/1580192

MUST ONLY BE SUBMITTED BY AUTOMERGER

Change-Id: I5c63b126e96a3ff941b88cea88a93a7800312235
Akilesh Kailash 2021-02-10 05:31:00 +00:00 committed by Automerger Merge Worker
commit f0f96830a8
9 changed files with 567 additions and 93 deletions

View file

@ -549,6 +549,7 @@ cc_test {
],
srcs: [
"cow_snapuserd_test.cpp",
"snapuserd.cpp",
],
cflags: [
"-Wall",

View file

@ -181,6 +181,7 @@ bool CowReader::ParseOps(std::optional<uint64_t> label) {
ops_buffer->resize(current_op_num);
}
LOG(DEBUG) << "COW file read complete. Total ops: " << ops_buffer->size();
// To successfully parse a COW file, we need either:
// (1) a label to read up to, and for that label to be found, or
// (2) a valid footer.
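As a quick aside on what this contract means for callers: parsing with a label succeeds only if that label exists in the COW file; parsing without one requires a valid footer. A minimal sketch, assuming CowReader's public Parse() API with an optional label (illustrative, not part of this change):

#include <optional>

#include <libsnapshot/cow_reader.h>

// Sketch: open and parse a COW file. With a label, success implies the
// label was found; without one, success implies a valid footer.
static bool OpenCow(android::base::borrowed_fd fd, std::optional<uint64_t> label) {
    android::snapshot::CowReader reader;
    return reader.Parse(fd, label);
}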
@ -298,10 +299,9 @@ void CowReader::InitializeMerge() {
// are contiguous. These are monotonically increasing numbers.
//
// When both (1) and (2) are true, kernel will batch merge the operations.
// However, we do not want copy operations to be batch merged as
// a crash or system reboot during an overlapping copy can drive the device
// to a corrupted state. Hence, merging of copy operations should always be
// done as an individual 4k block. In the above case, since the
// In the above case, we have to ensure that the copy operations
// are merged first, before the replace operations are done. Hence,
// we will not change the order of copy operations. Since
// cow_op->new_block numbers are contiguous, we will ensure that the
// cow block numbers assigned in ReadMetadata() for these respective copy
// operations are not contiguous, forcing the kernel to issue a merge for each
@ -328,10 +328,8 @@ void CowReader::InitializeMerge() {
//
// Merge sequence will look like:
//
// Merge-1 - Copy-op-1
// Merge-2 - Copy-op-2
// Merge-3 - Copy-op-3
// Merge-4 - Batch-merge {Replace-op-7, Replace-op-6, Zero-op-8,
// Merge-1 - Batch-merge { Copy-op-1, Copy-op-2, Copy-op-3 }
// Merge-2 - Batch-merge {Replace-op-7, Replace-op-6, Zero-op-8,
// Replace-op-4, Zero-op-9, Replace-op-5 }
//==============================================================
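To make the numbering trick concrete: the kernel batch-merges only while both the assigned chunk IDs and the block numbers remain contiguous, so skipping a chunk ID deliberately ends a batch. A minimal sketch of that idea, using a hypothetical helper that is not part of this change:

#include <cstdint>
#include <vector>

// Assign kernel-visible chunk IDs to two groups of copy operations,
// leaving a one-ID gap so the kernel cannot batch-merge across groups.
static std::vector<uint64_t> AssignChunkIds(size_t group1_ops, size_t group2_ops,
                                            uint64_t next_chunk) {
    std::vector<uint64_t> ids;
    for (size_t i = 0; i < group1_ops; i++) ids.push_back(next_chunk++);
    next_chunk++;  // Discontinuity: forces a batch boundary in the kernel.
    for (size_t i = 0; i < group2_ops; i++) ids.push_back(next_chunk++);
    return ids;
}

This mirrors what the metadata test below asserts: new_chunk values are consecutive within a batch (for example 528 through 531) and jump by two between batches (531 to 533).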

View file

@ -36,6 +36,8 @@
#include <libsnapshot/snapuserd_client.h>
#include <storage_literals/storage_literals.h>
#include "snapuserd.h"
namespace android {
namespace snapshot {
@ -119,7 +121,6 @@ class CowSnapuserdTest final {
void CreateDmUserDevice();
void StartSnapuserdDaemon();
void CreateSnapshotDevice();
unique_fd CreateTempFile(const std::string& name, size_t size);
unique_ptr<LoopDevice> base_loop_;
unique_ptr<TempDevice> dmuser_dev_;
@ -140,7 +141,24 @@ class CowSnapuserdTest final {
int total_base_size_;
};
unique_fd CowSnapuserdTest::CreateTempFile(const std::string& name, size_t size) {
class CowSnapuserdMetadataTest final {
public:
void Setup();
void SetupPartialArea();
void ValidateMetadata();
void ValidatePartialFilledArea();
private:
void InitMetadata();
void CreateCowDevice();
void CreateCowPartialFilledArea();
std::unique_ptr<Snapuserd> snapuserd_;
std::unique_ptr<TemporaryFile> cow_system_;
size_t size_ = 1_MiB;
};
static unique_fd CreateTempFile(const std::string& name, size_t size) {
unique_fd fd(syscall(__NR_memfd_create, name.c_str(), MFD_ALLOW_SEALING));
if (fd < 0) {
return {};
@ -430,25 +448,299 @@ void CowSnapuserdTest::SimulateDaemonRestart() {
}
void CowSnapuserdTest::MergeInterrupt() {
// Interrupt merge at various intervals
StartMerge();
std::this_thread::sleep_for(4s);
std::this_thread::sleep_for(250ms);
SimulateDaemonRestart();
StartMerge();
std::this_thread::sleep_for(3s);
std::this_thread::sleep_for(250ms);
SimulateDaemonRestart();
StartMerge();
std::this_thread::sleep_for(3s);
std::this_thread::sleep_for(150ms);
SimulateDaemonRestart();
StartMerge();
std::this_thread::sleep_for(1s);
std::this_thread::sleep_for(100ms);
SimulateDaemonRestart();
StartMerge();
std::this_thread::sleep_for(800ms);
SimulateDaemonRestart();
StartMerge();
std::this_thread::sleep_for(600ms);
SimulateDaemonRestart();
ASSERT_TRUE(Merge());
}
void CowSnapuserdMetadataTest::CreateCowPartialFilledArea() {
std::string path = android::base::GetExecutableDirectory();
cow_system_ = std::make_unique<TemporaryFile>(path);
CowOptions options;
options.compression = "gz";
CowWriter writer(options);
ASSERT_TRUE(writer.Initialize(cow_system_->fd));
// Area 0 is completely filled with 256 exceptions
for (int i = 0; i < 256; i++) {
ASSERT_TRUE(writer.AddCopy(i, 256 + i));
}
// Area 1 is partially filled with 2 copy ops and 10 zero ops
ASSERT_TRUE(writer.AddCopy(500, 1000));
ASSERT_TRUE(writer.AddCopy(501, 1001));
ASSERT_TRUE(writer.AddZeroBlocks(300, 10));
// Flush operations
ASSERT_TRUE(writer.Finalize());
}
void CowSnapuserdMetadataTest::ValidatePartialFilledArea() {
int area_sz = snapuserd_->GetMetadataAreaSize();
ASSERT_EQ(area_sz, 2);
size_t new_chunk = 263;
// Verify the partially filled area
void* buffer = snapuserd_->GetExceptionBuffer(1);
loff_t offset = 0;
struct disk_exception* de;
for (int i = 0; i < 12; i++) {
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, i);
ASSERT_EQ(de->new_chunk, new_chunk);
offset += sizeof(struct disk_exception);
new_chunk += 1;
}
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 0);
ASSERT_EQ(de->new_chunk, 0);
}
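The buffer walk above assumes dm-snapshot's on-disk exception layout, roughly the following (reproduced here as an assumption for readability; the real definition lives in snapuserd's headers):

#include <cstdint>

// Assumed dm-snapshot exception entry: maps a chunk on the base device
// (old_chunk) to the chunk holding its data in the COW device (new_chunk).
// An all-zero entry terminates the metadata within an area.
struct disk_exception {
    uint64_t old_chunk;
    uint64_t new_chunk;
};

Each area holds exceptions_per_area_ such entries; the zeroed entry checked at the end of ValidatePartialFilledArea() is that terminator.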
void CowSnapuserdMetadataTest::SetupPartialArea() {
CreateCowPartialFilledArea();
InitMetadata();
}
void CowSnapuserdMetadataTest::CreateCowDevice() {
unique_fd rnd_fd;
loff_t offset = 0;
std::string path = android::base::GetExecutableDirectory();
cow_system_ = std::make_unique<TemporaryFile>(path);
rnd_fd.reset(open("/dev/random", O_RDONLY));
ASSERT_TRUE(rnd_fd > 0);
std::unique_ptr<uint8_t[]> random_buffer_1_ = std::make_unique<uint8_t[]>(size_);
// Fill random data
for (size_t j = 0; j < (size_ / 1_MiB); j++) {
ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_1_.get() + offset, 1_MiB, 0),
true);
offset += 1_MiB;
}
CowOptions options;
options.compression = "gz";
CowWriter writer(options);
ASSERT_TRUE(writer.Initialize(cow_system_->fd));
size_t num_blocks = size_ / options.block_size;
// Overlapping region. This has to be split
// into two batch operations
ASSERT_TRUE(writer.AddCopy(23, 20));
ASSERT_TRUE(writer.AddCopy(22, 19));
ASSERT_TRUE(writer.AddCopy(21, 18));
ASSERT_TRUE(writer.AddCopy(20, 17));
ASSERT_TRUE(writer.AddCopy(19, 16));
ASSERT_TRUE(writer.AddCopy(18, 15));
// Contiguous region but blocks in ascending order
// Daemon has to ensure that these blocks are merged
// in a batch
ASSERT_TRUE(writer.AddCopy(50, 75));
ASSERT_TRUE(writer.AddCopy(51, 76));
ASSERT_TRUE(writer.AddCopy(52, 77));
ASSERT_TRUE(writer.AddCopy(53, 78));
// Discontiguous region
ASSERT_TRUE(writer.AddCopy(110, 130));
ASSERT_TRUE(writer.AddCopy(105, 125));
ASSERT_TRUE(writer.AddCopy(100, 120));
// Overlap
ASSERT_TRUE(writer.AddCopy(25, 30));
ASSERT_TRUE(writer.AddCopy(30, 31));
size_t source_blk = num_blocks;
ASSERT_TRUE(writer.AddRawBlocks(source_blk, random_buffer_1_.get(), size_));
size_t blk_zero_copy_start = source_blk + num_blocks;
ASSERT_TRUE(writer.AddZeroBlocks(blk_zero_copy_start, num_blocks));
// Flush operations
ASSERT_TRUE(writer.Finalize());
}
void CowSnapuserdMetadataTest::InitMetadata() {
snapuserd_ = std::make_unique<Snapuserd>("", cow_system_->path, "");
ASSERT_TRUE(snapuserd_->InitCowDevice());
}
void CowSnapuserdMetadataTest::Setup() {
CreateCowDevice();
InitMetadata();
}
void CowSnapuserdMetadataTest::ValidateMetadata() {
int area_sz = snapuserd_->GetMetadataAreaSize();
ASSERT_EQ(area_sz, 3);
size_t old_chunk;
size_t new_chunk;
for (int i = 0; i < area_sz; i++) {
void* buffer = snapuserd_->GetExceptionBuffer(i);
loff_t offset = 0;
if (i == 0) {
old_chunk = 256;
new_chunk = 2;
} else if (i == 1) {
old_chunk = 512;
new_chunk = 259;
}
for (int j = 0; j < 256; j++) {
struct disk_exception* de =
reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
if (i != 2) {
ASSERT_EQ(de->old_chunk, old_chunk);
ASSERT_EQ(de->new_chunk, new_chunk);
old_chunk += 1;
new_chunk += 1;
} else {
break;
}
offset += sizeof(struct disk_exception);
}
if (i == 2) {
// The first 5 copy operations are not batch merged
// as the sequence is discontiguous
struct disk_exception* de =
reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 30);
ASSERT_EQ(de->new_chunk, 518);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 25);
ASSERT_EQ(de->new_chunk, 520);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 100);
ASSERT_EQ(de->new_chunk, 522);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 105);
ASSERT_EQ(de->new_chunk, 524);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 110);
ASSERT_EQ(de->new_chunk, 526);
offset += sizeof(struct disk_exception);
// The next 4 operations are batch merged as
// both old and new chunk are contiguous
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 50);
ASSERT_EQ(de->new_chunk, 528);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 51);
ASSERT_EQ(de->new_chunk, 529);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 52);
ASSERT_EQ(de->new_chunk, 530);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 53);
ASSERT_EQ(de->new_chunk, 531);
offset += sizeof(struct disk_exception);
// Overlapping copy operations are handled with
// two batch-merge operations.
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 18);
ASSERT_EQ(de->new_chunk, 533);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 19);
ASSERT_EQ(de->new_chunk, 534);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 20);
ASSERT_EQ(de->new_chunk, 535);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 21);
ASSERT_EQ(de->new_chunk, 537);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 22);
ASSERT_EQ(de->new_chunk, 538);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 23);
ASSERT_EQ(de->new_chunk, 539);
offset += sizeof(struct disk_exception);
// End of metadata
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 0);
ASSERT_EQ(de->new_chunk, 0);
offset += sizeof(struct disk_exception);
}
}
}
TEST(Snapuserd_Test, Snapshot_Metadata) {
CowSnapuserdMetadataTest harness;
harness.Setup();
harness.ValidateMetadata();
}
TEST(Snapuserd_Test, Snapshot_Metadata_Overlap) {
CowSnapuserdMetadataTest harness;
harness.SetupPartialArea();
harness.ValidatePartialFilledArea();
}
TEST(Snapuserd_Test, Snapshot_Merge_Resume) {
CowSnapuserdTest harness;
ASSERT_TRUE(harness.Setup());
@ -457,7 +749,7 @@ TEST(Snapuserd_Test, Snapshot_Merge_Resume) {
harness.Shutdown();
}
TEST(Snapuserd_Test, Snapshot) {
TEST(Snapuserd_Test, Snapshot_IO_TEST) {
CowSnapuserdTest harness;
ASSERT_TRUE(harness.Setup());
harness.ReadSnapshotDeviceAndValidate();
@ -465,7 +757,6 @@ TEST(Snapuserd_Test, Snapshot) {
harness.ValidateMerge();
harness.Shutdown();
}
} // namespace snapshot
} // namespace android

View file

@ -491,7 +491,7 @@ bool CowWriter::Sync() {
return true;
}
bool CowWriter::CommitMerge(int merged_ops, bool sync) {
bool CowWriter::CommitMerge(int merged_ops) {
CHECK(merge_in_progress_);
header_.num_merge_ops += merged_ops;
@ -506,11 +506,7 @@ bool CowWriter::CommitMerge(int merged_ops, bool sync) {
return false;
}
// Sync only for merging of copy operations.
if (sync) {
return Sync();
}
return true;
return Sync();
}
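With the sync parameter gone, every commit is durable before the next merge cycle starts; that is what makes batch-merging copy operations crash-safe. A hedged caller-side sketch of the new contract (hypothetical helper name):

// Sketch: commit the operations merged in one cycle. CommitMerge()
// updates num_merge_ops in the COW header and then syncs, so a crash
// after this point can never replay an already-merged copy batch.
static bool CommitCycle(android::snapshot::CowWriter* writer, int merged_ops) {
    if (merged_ops <= 0) return false;  // at least one op must have merged
    return writer->CommitMerge(merged_ops);
}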
bool CowWriter::Truncate(off_t length) {

View file

@ -101,7 +101,7 @@ class CowWriter : public ICowWriter {
bool InitializeAppend(android::base::borrowed_fd fd, uint64_t label);
void InitializeMerge(android::base::borrowed_fd fd, CowHeader* header);
bool CommitMerge(int merged_ops, bool sync);
bool CommitMerge(int merged_ops);
bool Finalize() override;

View file

@ -47,8 +47,8 @@ typedef sector_t chunk_t;
static constexpr uint32_t CHUNK_SIZE = 8;
static constexpr uint32_t CHUNK_SHIFT = (__builtin_ffs(CHUNK_SIZE) - 1);
static constexpr uint32_t BLOCK_SIZE = 4096;
static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SIZE) - 1);
static constexpr uint32_t BLOCK_SZ = 4096;
static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SZ) - 1);
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
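The shift constants rely on the sizes being powers of two: __builtin_ffs() returns the 1-based index of the lowest set bit, so ffs(x) - 1 recovers log2(x). A small compile-time sanity sketch (values copied from above; the assertions are illustrative additions):

#include <cstdint>

static constexpr uint32_t kChunkSize = 8;    // sectors per chunk
static constexpr uint32_t kBlockSz = 4096;   // bytes per block

static_assert(__builtin_ffs(kChunkSize) - 1 == 3);   // CHUNK_SHIFT
static_assert(__builtin_ffs(kBlockSz) - 1 == 12);    // BLOCK_SHIFT

// DIV_ROUND_UP rounds up integer division: a 10000-byte request spans
// DIV_ROUND_UP(10000, 4096) == 3 blocks.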

View file

@ -1439,6 +1439,7 @@ bool SnapshotManager::PerformInitTransition(InitTransition transition,
// Wait for ueventd to acknowledge and create the control device node.
std::string control_device = "/dev/dm-user/" + misc_name;
if (!WaitForDevice(control_device, 10s)) {
LOG(ERROR) << "dm-user control device no found: " << misc_name;
continue;
}

View file

@ -17,6 +17,8 @@
#include "snapuserd.h"
#include <csignal>
#include <optional>
#include <set>
#include <libsnapshot/snapuserd_client.h>
@ -32,7 +34,7 @@ using android::base::unique_fd;
static constexpr size_t PAYLOAD_SIZE = (1UL << 20);
static_assert(PAYLOAD_SIZE >= BLOCK_SIZE);
static_assert(PAYLOAD_SIZE >= BLOCK_SZ);
void BufferSink::Initialize(size_t size) {
buffer_size_ = size;
@ -78,10 +80,10 @@ Snapuserd::Snapuserd(const std::string& misc_name, const std::string& cow_device
// request will always be 4k. After constructing
// the header, zero out the remaining block.
void Snapuserd::ConstructKernelCowHeader() {
void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SIZE);
void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SZ);
CHECK(buffer != nullptr);
memset(buffer, 0, BLOCK_SIZE);
memset(buffer, 0, BLOCK_SZ);
struct disk_header* dh = reinterpret_cast<struct disk_header*>(buffer);
@ -106,13 +108,13 @@ bool Snapuserd::ProcessReplaceOp(const CowOperation* cow_op) {
// Start the copy operation. This will read the backing
// block device which is represented by cow_op->source.
bool Snapuserd::ProcessCopyOp(const CowOperation* cow_op) {
void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SIZE);
void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SZ);
CHECK(buffer != nullptr);
// Issue a single 4K IO. However, this can be optimized
// if the successive blocks are contiguous.
if (!android::base::ReadFullyAtOffset(backing_store_fd_, buffer, BLOCK_SIZE,
cow_op->source * BLOCK_SIZE)) {
if (!android::base::ReadFullyAtOffset(backing_store_fd_, buffer, BLOCK_SZ,
cow_op->source * BLOCK_SZ)) {
SNAP_PLOG(ERROR) << "Copy-op failed. Read from backing store: " << backing_store_device_
<< "at block :" << cow_op->source;
return false;
@ -123,10 +125,10 @@ bool Snapuserd::ProcessCopyOp(const CowOperation* cow_op) {
bool Snapuserd::ProcessZeroOp() {
// Zero out the entire block
void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SIZE);
void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SZ);
CHECK(buffer != nullptr);
memset(buffer, 0, BLOCK_SIZE);
memset(buffer, 0, BLOCK_SZ);
return true;
}
@ -173,11 +175,11 @@ int Snapuserd::ReadUnalignedSector(sector_t sector, size_t size,
struct dm_user_message* msg = (struct dm_user_message*)(&(buffer[0]));
memmove(msg->payload.buf, (char*)msg->payload.buf + skip_sector_size,
(BLOCK_SIZE - skip_sector_size));
(BLOCK_SZ - skip_sector_size));
}
bufsink_.ResetBufferOffset();
return std::min(size, (BLOCK_SIZE - skip_sector_size));
return std::min(size, (BLOCK_SZ - skip_sector_size));
}
/*
@ -234,7 +236,7 @@ int Snapuserd::ReadData(sector_t sector, size_t size) {
return ReadUnalignedSector(sector, size, it);
}
int num_ops = DIV_ROUND_UP(size, BLOCK_SIZE);
int num_ops = DIV_ROUND_UP(size, BLOCK_SZ);
while (num_ops) {
if (!ProcessCowOp(it->second)) {
return -1;
@ -242,7 +244,7 @@ int Snapuserd::ReadData(sector_t sector, size_t size) {
num_ops -= 1;
it++;
// Update the buffer offset
bufsink_.UpdateBufferOffset(BLOCK_SIZE);
bufsink_.UpdateBufferOffset(BLOCK_SZ);
SNAP_LOG(DEBUG) << "ReadData at sector: " << sector << " size: " << size;
}
@ -344,7 +346,7 @@ loff_t Snapuserd::GetMergeStartOffset(void* merged_buffer, void* unmerged_buffer
}
int Snapuserd::GetNumberOfMergedOps(void* merged_buffer, void* unmerged_buffer, loff_t offset,
int unmerged_exceptions, bool* copy_op) {
int unmerged_exceptions) {
int merged_ops_cur_iter = 0;
// Find the operations which are merged in this cycle.
@ -362,10 +364,8 @@ int Snapuserd::GetNumberOfMergedOps(void* merged_buffer, void* unmerged_buffer,
offset += sizeof(struct disk_exception);
const CowOperation* cow_op = chunk_map_[ChunkToSector(cow_de->new_chunk)];
CHECK(cow_op != nullptr);
CHECK(cow_op->new_block == cow_de->old_chunk);
if (cow_op->type == kCowCopyOp) {
*copy_op = true;
}
// Zero out to indicate that the operation is merged.
cow_de->old_chunk = 0;
cow_de->new_chunk = 0;
@ -389,10 +389,6 @@ int Snapuserd::GetNumberOfMergedOps(void* merged_buffer, void* unmerged_buffer,
return -1;
}
}
if (*copy_op) {
CHECK(merged_ops_cur_iter == 1);
}
return merged_ops_cur_iter;
}
@ -414,42 +410,15 @@ bool Snapuserd::ProcessMergeComplete(chunk_t chunk, void* buffer) {
int unmerged_exceptions = 0;
loff_t offset = GetMergeStartOffset(buffer, vec_[divresult.quot].get(), &unmerged_exceptions);
bool copy_op = false;
// Check if the merged operation is a copy operation. If so, then we need
// to explicitly sync the metadata before initiating the next merge.
// For ex: Consider a following sequence of copy operations in the COW file:
//
// Op-1: Copy 2 -> 3
// Op-2: Copy 1 -> 2
// Op-3: Copy 5 -> 10
//
// Op-1 and Op-2 are overlapping copy operations. The merge sequence will
// look like:
//
// Merge op-1: Copy 2 -> 3
// Merge op-2: Copy 1 -> 2
// Merge op-3: Copy 5 -> 10
//
// Now, let's say we have a crash _after_ Merge op-2; Block 2's contents would
// have been overwritten by Block 1 after merge op-2. During the next reboot,
// the kernel will request the metadata for all the un-merged blocks. If we had
// not synced the metadata after Merge op-1 and Merge op-2, the snapuserd daemon
// would think that these merge operations are still pending and hence would
// inform the kernel that Op-1 and Op-2 are un-merged blocks. When the kernel
// resumes the merging process, it will attempt to redo Merge op-1
// once again. However, block 2's contents are wrong, as it has the contents
// of block 1 from the previous merge cycle. Although the merge will silently
// succeed, this will lead to silent data corruption.
//
int merged_ops_cur_iter = GetNumberOfMergedOps(buffer, vec_[divresult.quot].get(), offset,
unmerged_exceptions, &copy_op);
int merged_ops_cur_iter =
GetNumberOfMergedOps(buffer, vec_[divresult.quot].get(), offset, unmerged_exceptions);
// There should be at least one operation merged in this cycle
CHECK(merged_ops_cur_iter > 0);
header.num_merge_ops += merged_ops_cur_iter;
reader_->UpdateMergeProgress(merged_ops_cur_iter);
if (!writer_->CommitMerge(merged_ops_cur_iter, copy_op)) {
if (!writer_->CommitMerge(merged_ops_cur_iter)) {
SNAP_LOG(ERROR) << "CommitMerge failed... merged_ops_cur_iter: " << merged_ops_cur_iter;
return false;
}
@ -506,12 +475,13 @@ chunk_t Snapuserd::GetNextAllocatableChunkId(chunk_t chunk) {
* during merge; specifically when the merge operation has dependencies.
* These dependencies can only happen during copy operations.
*
* To avoid this problem, we make sure that no two copy operations
* have contiguous chunk IDs. Additionally, we make sure
* that each copy operation is merged individually.
* To avoid this problem, we make sure overlapping copy operations
* are not batch merged.
* 6: Use a monotonically increasing chunk number to assign the
* new_chunk
* 7: Each chunk-id represents either a: Metadata page or b: Data page
* 7: Each chunk-id represents either
* a: Metadata page or
* b: Data page
* 8: Chunk-id representing a data page is stored in a map.
* 9: Chunk-id representing a metadata page is converted into a vector
* index. We store this in vector as kernel requests metadata during
@ -531,8 +501,8 @@ bool Snapuserd::ReadMetadata() {
reader_ = std::make_unique<CowReader>();
CowHeader header;
CowOptions options;
bool prev_copy_op = false;
bool metadata_found = false;
int replace_ops = 0, zero_ops = 0, copy_ops = 0;
SNAP_LOG(DEBUG) << "ReadMetadata: Parsing cow file";
@ -546,10 +516,10 @@ bool Snapuserd::ReadMetadata() {
return false;
}
CHECK(header.block_size == BLOCK_SIZE);
CHECK(header.block_size == BLOCK_SZ);
SNAP_LOG(DEBUG) << "Merge-ops: " << header.num_merge_ops;
reader_->InitializeMerge();
SNAP_LOG(DEBUG) << "Merge-ops: " << header.num_merge_ops;
writer_ = std::make_unique<CowWriter>(options);
writer_->InitializeMerge(cow_fd_.get(), &header);
@ -584,17 +554,26 @@ bool Snapuserd::ReadMetadata() {
}
metadata_found = true;
if ((cow_op->type == kCowCopyOp || prev_copy_op)) {
// This loop will handle all the replace and zero ops.
// We will handle the copy ops later, as they require special
// handling when assigning chunk IDs. Furthermore, we make
// sure that replace/zero and copy ops are not batch merged; hence
// the bump in the chunk_id before the break out of this loop.
if (cow_op->type == kCowCopyOp) {
data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
break;
}
prev_copy_op = (cow_op->type == kCowCopyOp);
if (cow_op->type == kCowReplaceOp) {
replace_ops++;
} else if (cow_op->type == kCowZeroOp) {
zero_ops++;
}
// Construct the disk-exception
de->old_chunk = cow_op->new_block;
de->new_chunk = data_chunk_id;
SNAP_LOG(DEBUG) << "Old-chunk: " << de->old_chunk << "New-chunk: " << de->new_chunk;
// Store operation pointer.
chunk_map_[ChunkToSector(data_chunk_id)] = cow_op;
@ -602,6 +581,9 @@ bool Snapuserd::ReadMetadata() {
offset += sizeof(struct disk_exception);
cowop_riter_->Next();
SNAP_LOG(DEBUG) << num_ops << ":"
<< " Old-chunk: " << de->old_chunk << " New-chunk: " << de->new_chunk;
if (num_ops == exceptions_per_area_) {
// Store it in vector at the right index. This maps the chunk-id to
// vector index.
@ -616,13 +598,213 @@ bool Snapuserd::ReadMetadata() {
if (cowop_riter_->Done()) {
vec_.push_back(std::move(de_ptr));
SNAP_LOG(DEBUG) << "ReadMetadata() completed; Number of Areas: " << vec_.size();
}
}
data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
}
std::optional<chunk_t> prev_id = {};
std::map<uint64_t, const CowOperation*> map;
std::set<uint64_t> dest_blocks;
size_t pending_copy_ops = exceptions_per_area_ - num_ops;
SNAP_LOG(INFO) << " Processing copy-ops at Area: " << vec_.size()
<< " Number of replace/zero ops completed in this area: " << num_ops
<< " Pending copy ops for this area: " << pending_copy_ops;
while (!cowop_riter_->Done()) {
do {
const CowOperation* cow_op = &cowop_riter_->Get();
if (IsMetadataOp(*cow_op)) {
cowop_riter_->Next();
continue;
}
// We have two specific cases:
//
// =====================================================
// Case 1: Overlapping copy regions
//
// Ex:
//
// Source -> Destination
//
// 1: 15 -> 18
// 2: 16 -> 19
// 3: 17 -> 20
// 4: 18 -> 21
// 5: 19 -> 22
// 6: 20 -> 23
//
// We have 6 copy operations to be executed in the OTA, and there is an overlap. Update-engine
// will write to the COW file as follows:
//
// Op-1: 20 -> 23
// Op-2: 19 -> 22
// Op-3: 18 -> 21
// Op-4: 17 -> 20
// Op-5: 16 -> 19
// Op-6: 15 -> 18
//
// Note that the block numbers are contiguous. Hence, all 6 copy
// operations can potentially be batch merged. However, that will be
// problematic if we have a crash, as blocks 20, 19 and 18 would have
// been overwritten, and hence subsequent recovery may end up with
// silent data corruption when op-1, op-2 and op-3 are
// re-executed.
//
// We will split these 6 operations into two batches viz:
//
// Batch-1:
// ===================
// Op-1: 20 -> 23
// Op-2: 19 -> 22
// Op-3: 18 -> 21
// ===================
//
// Batch-2:
// ==================
// Op-4: 17 -> 20
// Op-5: 16 -> 19
// Op-6: 15 -> 18
// ==================
//
// Now, merge sequence will look like:
//
// 1: Merge Batch-1 { op-1, op-2, op-3 }
// 2: Update Metadata in COW File that op-1, op-2, op-3 merge is
// done.
// 3: Merge Batch-2
// 4: Update Metadata in COW File that op-4, op-5, op-6 merge is
// done.
//
// Note that the order of block operations is still the same.
// However, we have two batch merge operations. Any crash between
// either of these sequences should be safe, as each of these
// batches is self-contained.
//
//===========================================================
//
// Case 2:
//
// Let's say we have three copy operations written to COW file
// in the following order:
//
// op-1: 15 -> 18
// op-2: 16 -> 19
// op-3: 17 -> 20
//
// As aforementioned, the kernel will initiate the merge in reverse order.
// Hence, we will read these ops in reverse order so that all these
// ops are executed in the same order as requested. Thus, we will
// read the metadata in reverse order, and for the kernel it will
// look like:
//
// op-3: 17 -> 20
// op-2: 16 -> 19
// op-1: 15 -> 18 <-- Merge starts here in the kernel
//
// Now, this is problematic as the kernel cannot batch merge them.
//
// Merge sequence will look like:
//
// Merge-1: op-1: 15 -> 18
// Merge-2: op-2: 16 -> 19
// Merge-3: op-3: 17 -> 20
//
// We have three merge operations.
//
// Even though the blocks are contiguous, the kernel can batch merge
// them if the blocks are in descending order. Update engine
// addresses this issue partially for overlapping operations, as
// we see that op-1 to op-3 and op-4 to op-6 are in
// descending order. However, if the copy operations are not
// overlapping, update engine cannot write these blocks
// in descending order. Hence, we will try to address it.
// Thus, we will send these blocks to the kernel and it will
// look like:
//
// op-3: 15 -> 18
// op-2: 16 -> 19
// op-1: 17 -> 20 <-- Merge starts here in the kernel
//
// Now with this change, we can batch merge all these three
// operations. Merge sequence will look like:
//
// Merge-1: {op-1: 17 -> 20, op-2: 16 -> 19, op-3: 15 -> 18}
//
// Note that we have changed the ordering of the merge; however, this
// is OK as each of these copy operations is independent and there
// is no overlap.
//
//===================================================================
if (prev_id.has_value()) {
chunk_t diff = (cow_op->new_block > prev_id.value())
? (cow_op->new_block - prev_id.value())
: (prev_id.value() - cow_op->new_block);
if (diff != 1) {
break;
}
if (dest_blocks.count(cow_op->new_block) || map.count(cow_op->source) > 0) {
break;
}
}
metadata_found = true;
pending_copy_ops -= 1;
map[cow_op->new_block] = cow_op;
dest_blocks.insert(cow_op->source);
prev_id = cow_op->new_block;
cowop_riter_->Next();
} while (!cowop_riter_->Done() && pending_copy_ops);
data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
SNAP_LOG(DEBUG) << "Batch Merge copy-ops of size: " << map.size()
<< " Area: " << vec_.size() << " Area offset: " << offset
<< " Pending-copy-ops in this area: " << pending_copy_ops;
for (auto it = map.begin(); it != map.end(); it++) {
struct disk_exception* de =
reinterpret_cast<struct disk_exception*>((char*)de_ptr.get() + offset);
de->old_chunk = it->first;
de->new_chunk = data_chunk_id;
// Store operation pointer.
chunk_map_[ChunkToSector(data_chunk_id)] = it->second;
offset += sizeof(struct disk_exception);
num_ops += 1;
copy_ops++;
SNAP_LOG(DEBUG) << num_ops << ":"
<< " Copy-op: "
<< " Old-chunk: " << de->old_chunk << " New-chunk: " << de->new_chunk;
if (num_ops == exceptions_per_area_) {
// Store it in vector at the right index. This maps the chunk-id to
// vector index.
vec_.push_back(std::move(de_ptr));
num_ops = 0;
offset = 0;
// Create buffer for next area
de_ptr = std::make_unique<uint8_t[]>(exceptions_per_area_ *
sizeof(struct disk_exception));
memset(de_ptr.get(), 0, (exceptions_per_area_ * sizeof(struct disk_exception)));
if (cowop_riter_->Done()) {
vec_.push_back(std::move(de_ptr));
SNAP_LOG(DEBUG) << "ReadMetadata() completed; Number of Areas: " << vec_.size();
}
CHECK(pending_copy_ops == 0);
pending_copy_ops = exceptions_per_area_;
}
data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
}
map.clear();
dest_blocks.clear();
prev_id.reset();
}
// Partially filled area, or there is no metadata at all.
// If there is no metadata, fill with zeroes so that the kernel
// is aware that the merge is completed.
@ -632,8 +814,11 @@ bool Snapuserd::ReadMetadata() {
<< "Areas : " << vec_.size();
}
SNAP_LOG(DEBUG) << "ReadMetadata() completed. Final_chunk_id: " << data_chunk_id
<< "Num Sector: " << ChunkToSector(data_chunk_id);
SNAP_LOG(INFO) << "ReadMetadata completed. Final-chunk-id: " << data_chunk_id
<< " Num Sector: " << ChunkToSector(data_chunk_id)
<< " Replace-ops: " << replace_ops << " Zero-ops: " << zero_ops
<< " Copy-ops: " << copy_ops << " Areas: " << vec_.size()
<< " Num-ops-merged: " << header.num_merge_ops;
// Total number of sectors required for creating dm-user device
num_sectors_ = ChunkToSector(data_chunk_id);
@ -742,7 +927,7 @@ bool Snapuserd::DmuserWriteRequest() {
size_t remaining_size = header->len;
size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
CHECK(read_size == BLOCK_SIZE);
CHECK(read_size == BLOCK_SZ);
CHECK(header->sector > 0);
chunk_t chunk = SectorToChunk(header->sector);
@ -793,11 +978,11 @@ bool Snapuserd::DmuserReadRequest() {
// will always be a single 4k.
if (header->sector == 0) {
CHECK(metadata_read_done_ == true);
CHECK(read_size == BLOCK_SIZE);
CHECK(read_size == BLOCK_SZ);
ConstructKernelCowHeader();
SNAP_LOG(DEBUG) << "Kernel header constructed";
} else {
if (!offset && (read_size == BLOCK_SIZE) &&
if (!offset && (read_size == BLOCK_SZ) &&
chunk_map_.find(header->sector) == chunk_map_.end()) {
if (!ReadDiskExceptions(chunk, read_size)) {
SNAP_LOG(ERROR) << "ReadDiskExceptions failed for chunk id: " << chunk

View file

@ -75,6 +75,8 @@ class Snapuserd final {
cow_fd_ = {};
backing_store_fd_ = {};
}
size_t GetMetadataAreaSize() { return vec_.size(); }
void* GetExceptionBuffer(size_t i) { return vec_[i].get(); }
private:
bool DmuserReadRequest();
@ -101,11 +103,11 @@ class Snapuserd final {
loff_t GetMergeStartOffset(void* merged_buffer, void* unmerged_buffer,
int* unmerged_exceptions);
int GetNumberOfMergedOps(void* merged_buffer, void* unmerged_buffer, loff_t offset,
int unmerged_exceptions, bool* copy_op);
int unmerged_exceptions);
bool ProcessMergeComplete(chunk_t chunk, void* buffer);
sector_t ChunkToSector(chunk_t chunk) { return chunk << CHUNK_SHIFT; }
chunk_t SectorToChunk(sector_t sector) { return sector >> CHUNK_SHIFT; }
bool IsBlockAligned(int read_size) { return ((read_size & (BLOCK_SIZE - 1)) == 0); }
bool IsBlockAligned(int read_size) { return ((read_size & (BLOCK_SZ - 1)) == 0); }
std::string cow_device_;
std::string backing_store_device_;