Merge "libsnapshot:snapuserd: Handle un-aligned IO request"

This commit is contained in:
Akilesh Kailash 2021-01-12 06:51:47 +00:00 committed by Gerrit Code Review
commit 06ee6f1753
4 changed files with 251 additions and 190 deletions

View file

@ -239,16 +239,12 @@ void CowSnapuserdTest::CreateCowDevice() {
ASSERT_TRUE(rnd_fd > 0);
std::unique_ptr<uint8_t[]> random_buffer_1_ = std::make_unique<uint8_t[]>(size_);
std::unique_ptr<uint8_t[]> random_buffer_2_ = std::make_unique<uint8_t[]>(size_);
// Fill random data
for (size_t j = 0; j < (size_ / 1_MiB); j++) {
ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_1_.get() + offset, 1_MiB, 0),
true);
ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_2_.get() + offset, 1_MiB, 0),
true);
offset += 1_MiB;
}
@ -280,7 +276,7 @@ void CowSnapuserdTest::CreateCowDevice() {
size_t blk_random2_replace_start = blk_zero_copy_end;
ASSERT_TRUE(writer.AddRawBlocks(blk_random2_replace_start, random_buffer_2_.get(), size_));
ASSERT_TRUE(writer.AddRawBlocks(blk_random2_replace_start, random_buffer_1_.get(), size_));
// Flush operations
ASSERT_TRUE(writer.Finalize());
@ -292,7 +288,7 @@ void CowSnapuserdTest::CreateCowDevice() {
ASSERT_EQ(android::base::ReadFullyAtOffset(base_fd_, orig_buffer_.get(), size_, size_), true);
memcpy((char*)orig_buffer_.get() + size_, random_buffer_1_.get(), size_);
memcpy((char*)orig_buffer_.get() + (size_ * 2), (void*)zero_buffer.c_str(), size_);
memcpy((char*)orig_buffer_.get() + (size_ * 3), random_buffer_2_.get(), size_);
memcpy((char*)orig_buffer_.get() + (size_ * 3), random_buffer_1_.get(), size_);
}
void CowSnapuserdTest::InitCowDevice() {

View file

@ -50,6 +50,8 @@ static constexpr uint32_t CHUNK_SHIFT = (__builtin_ffs(CHUNK_SIZE) - 1);
static constexpr uint32_t BLOCK_SIZE = 4096;
static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SIZE) - 1);
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
// This structure represents the kernel COW header.
// All the below fields should be in Little Endian format.
struct disk_header {

View file

@ -129,88 +129,126 @@ bool Snapuserd::ProcessZeroOp() {
return true;
}
/*
* Read the data of size bytes from a given chunk.
*
* Kernel can potentially merge the blocks if the
* successive chunks are contiguous. For chunk size of 8,
* there can be 256 disk exceptions; and if
* all 256 disk exceptions are contiguous, kernel can merge
* them into a single IO.
*
* Since each chunk in the disk exception
* mapping represents a 4k block, kernel can potentially
* issue 256*4k = 1M IO in one shot.
*
* Even though kernel assumes that the blocks are
* contiguous, we need to split the 1M IO into 4k chunks
* as each operation represents 4k and it can either be:
*
* 1: Replace operation
* 2: Copy operation
* 3: Zero operation
*
*/
bool Snapuserd::ReadData(chunk_t chunk, size_t size) {
size_t read_size = size;
bool ret = true;
chunk_t chunk_key = chunk;
bool Snapuserd::ProcessCowOp(const CowOperation* cow_op) {
CHECK(cow_op != nullptr);
if (!((read_size & (BLOCK_SIZE - 1)) == 0)) {
SNAP_LOG(ERROR) << "ReadData - unaligned read_size: " << read_size;
return false;
switch (cow_op->type) {
case kCowReplaceOp: {
return ProcessReplaceOp(cow_op);
}
case kCowZeroOp: {
return ProcessZeroOp();
}
case kCowCopyOp: {
return ProcessCopyOp(cow_op);
}
default: {
SNAP_LOG(ERROR) << "Unknown operation-type found: " << cow_op->type;
}
}
return false;
}
int Snapuserd::ReadUnalignedSector(sector_t sector, size_t size,
std::map<sector_t, const CowOperation*>::iterator& it) {
size_t skip_sector_size = 0;
SNAP_LOG(DEBUG) << "ReadUnalignedSector: sector " << sector << " size: " << size
<< " Aligned sector: " << it->second;
if (!ProcessCowOp(it->second)) {
SNAP_LOG(ERROR) << "ReadUnalignedSector: " << sector << " failed";
return -1;
}
while (read_size > 0) {
const CowOperation* cow_op = chunk_map_[chunk_key];
CHECK(cow_op != nullptr);
int num_sectors_skip = sector - it->first;
switch (cow_op->type) {
case kCowReplaceOp: {
ret = ProcessReplaceOp(cow_op);
break;
}
if (num_sectors_skip > 0) {
skip_sector_size = num_sectors_skip << SECTOR_SHIFT;
char* buffer = reinterpret_cast<char*>(bufsink_.GetBufPtr());
struct dm_user_message* msg = (struct dm_user_message*)(&(buffer[0]));
case kCowZeroOp: {
ret = ProcessZeroOp();
break;
}
memmove(msg->payload.buf, (char*)msg->payload.buf + skip_sector_size,
(BLOCK_SIZE - skip_sector_size));
}
case kCowCopyOp: {
ret = ProcessCopyOp(cow_op);
break;
}
bufsink_.ResetBufferOffset();
return std::min(size, (BLOCK_SIZE - skip_sector_size));
}
default: {
SNAP_LOG(ERROR) << "Unknown operation-type found: " << cow_op->type;
ret = false;
break;
}
/*
* Read the data for a given COW Operation.
*
* Kernel can issue IO at a sector granularity.
* Hence, an IO may end up with reading partial
* data from a COW operation or we may also
* end up with interspersed request between
* two COW operations.
*
*/
int Snapuserd::ReadData(sector_t sector, size_t size) {
/*
* chunk_map stores COW operation at 4k granularity.
* If the requested IO with the sector falls on the 4k
* boundary, then we can read the COW op directly without
* any issue.
*
* However, if the requested sector is not 4K aligned,
* then we will have the find the nearest COW operation
* and chop the 4K block to fetch the requested sector.
*/
std::map<sector_t, const CowOperation*>::iterator it = chunk_map_.find(sector);
if (it == chunk_map_.end()) {
it = chunk_map_.lower_bound(sector);
if (it != chunk_map_.begin()) {
--it;
}
if (!ret) {
SNAP_LOG(ERROR) << "ReadData failed for operation: " << cow_op->type;
return false;
}
/*
* If the IO is spanned between two COW operations,
* split the IO into two parts:
*
* 1: Read the first part from the single COW op
* 2: Read the second part from the next COW op.
*
* Ex: Let's say we have a 1024 Bytes IO request.
*
* 0 COW OP-1 4096 COW OP-2 8192
* |******************|*******************|
* |*****|*****|
* 3584 4608
* <- 1024B - >
*
* We have two COW operations which are 4k blocks.
* The IO is requested for 1024 Bytes which are spanned
* between two COW operations. We will split this IO
* into two parts:
*
* 1: IO of size 512B from offset 3584 bytes (COW OP-1)
* 2: IO of size 512B from offset 4096 bytes (COW OP-2)
*/
return ReadUnalignedSector(sector, size, it);
}
int num_ops = DIV_ROUND_UP(size, BLOCK_SIZE);
while (num_ops) {
if (!ProcessCowOp(it->second)) {
return -1;
}
num_ops -= 1;
it++;
// Update the buffer offset
bufsink_.UpdateBufferOffset(BLOCK_SIZE);
read_size -= BLOCK_SIZE;
// Start iterating the chunk incrementally; Since while
// constructing the metadata, we know that the chunk IDs
// are contiguous
chunk_key += 1;
if (cow_op->type == kCowCopyOp) {
CHECK(read_size == 0);
}
SNAP_LOG(DEBUG) << "ReadData at sector: " << sector << " size: " << size;
}
// Reset the buffer offset
bufsink_.ResetBufferOffset();
return ret;
return size;
}
/*
@ -261,9 +299,7 @@ bool Snapuserd::ReadDiskExceptions(chunk_t chunk, size_t read_size) {
if (divresult.quot < vec_.size()) {
size = exceptions_per_area_ * sizeof(struct disk_exception);
if (read_size > size) {
return false;
}
CHECK(read_size == size);
void* buffer = bufsink_.GetPayloadBuffer(size);
CHECK(buffer != nullptr);
@ -329,7 +365,7 @@ int Snapuserd::GetNumberOfMergedOps(void* merged_buffer, void* unmerged_buffer,
if (cow_de->new_chunk != 0) {
merged_ops_cur_iter += 1;
offset += sizeof(struct disk_exception);
const CowOperation* cow_op = chunk_map_[cow_de->new_chunk];
const CowOperation* cow_op = chunk_map_[ChunkToSector(cow_de->new_chunk)];
CHECK(cow_op != nullptr);
CHECK(cow_op->new_block == cow_de->old_chunk);
if (cow_op->type == kCowCopyOp) {
@ -563,7 +599,7 @@ bool Snapuserd::ReadMetadata() {
SNAP_LOG(DEBUG) << "Old-chunk: " << de->old_chunk << "New-chunk: " << de->new_chunk;
// Store operation pointer.
chunk_map_[data_chunk_id] = cow_op;
chunk_map_[ChunkToSector(data_chunk_id)] = cow_op;
num_ops += 1;
offset += sizeof(struct disk_exception);
cowop_riter_->Next();
@ -680,6 +716,126 @@ bool Snapuserd::InitBackingAndControlDevice() {
return true;
}
bool Snapuserd::DmuserWriteRequest() {
struct dm_user_header* header = bufsink_.GetHeaderPtr();
// device mapper has the capability to allow
// targets to flush the cache when writes are completed. This
// is controlled by each target by a flag "flush_supported".
// This flag is set by dm-user. When flush is supported,
// a number of zero-length bio's will be submitted to
// the target for the purpose of flushing cache. It is the
// responsibility of the target driver - which is dm-user in this
// case, to remap these bio's to the underlying device. Since,
// there is no underlying device for dm-user, this zero length
// bio's gets routed to daemon.
//
// Flush operations are generated post merge by dm-snap by having
// REQ_PREFLUSH flag set. Snapuser daemon doesn't have anything
// to flush per se; hence, just respond back with a success message.
if (header->sector == 0) {
CHECK(header->len == 0);
header->type = DM_USER_RESP_SUCCESS;
if (!WriteDmUserPayload(0)) {
return false;
}
return true;
}
size_t remaining_size = header->len;
size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
CHECK(read_size == BLOCK_SIZE);
CHECK(header->sector > 0);
chunk_t chunk = SectorToChunk(header->sector);
CHECK(chunk_map_.find(header->sector) == chunk_map_.end());
void* buffer = bufsink_.GetPayloadBuffer(read_size);
CHECK(buffer != nullptr);
header->type = DM_USER_RESP_SUCCESS;
if (!ReadDmUserPayload(buffer, read_size)) {
SNAP_LOG(ERROR) << "ReadDmUserPayload failed for chunk id: " << chunk
<< "Sector: " << header->sector;
header->type = DM_USER_RESP_ERROR;
}
if (header->type == DM_USER_RESP_SUCCESS && !ProcessMergeComplete(chunk, buffer)) {
SNAP_LOG(ERROR) << "ProcessMergeComplete failed for chunk id: " << chunk
<< "Sector: " << header->sector;
header->type = DM_USER_RESP_ERROR;
} else {
SNAP_LOG(DEBUG) << "ProcessMergeComplete success for chunk id: " << chunk
<< "Sector: " << header->sector;
}
if (!WriteDmUserPayload(0)) {
return false;
}
return true;
}
bool Snapuserd::DmuserReadRequest() {
struct dm_user_header* header = bufsink_.GetHeaderPtr();
size_t remaining_size = header->len;
loff_t offset = 0;
sector_t sector = header->sector;
do {
size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
int ret = read_size;
header->type = DM_USER_RESP_SUCCESS;
chunk_t chunk = SectorToChunk(header->sector);
// Request to sector 0 is always for kernel
// representation of COW header. This IO should be only
// once during dm-snapshot device creation. We should
// never see multiple IO requests. Additionally this IO
// will always be a single 4k.
if (header->sector == 0) {
CHECK(metadata_read_done_ == true);
CHECK(read_size == BLOCK_SIZE);
ConstructKernelCowHeader();
SNAP_LOG(DEBUG) << "Kernel header constructed";
} else {
if (!offset && (read_size == BLOCK_SIZE) &&
chunk_map_.find(header->sector) == chunk_map_.end()) {
if (!ReadDiskExceptions(chunk, read_size)) {
SNAP_LOG(ERROR) << "ReadDiskExceptions failed for chunk id: " << chunk
<< "Sector: " << header->sector;
header->type = DM_USER_RESP_ERROR;
} else {
SNAP_LOG(DEBUG) << "ReadDiskExceptions success for chunk id: " << chunk
<< "Sector: " << header->sector;
}
} else {
chunk_t num_sectors_read = (offset >> SECTOR_SHIFT);
ret = ReadData(sector + num_sectors_read, read_size);
if (ret < 0) {
SNAP_LOG(ERROR) << "ReadData failed for chunk id: " << chunk
<< "Sector: " << header->sector;
header->type = DM_USER_RESP_ERROR;
} else {
SNAP_LOG(DEBUG) << "ReadData success for chunk id: " << chunk
<< "Sector: " << header->sector;
}
}
}
// Daemon will not be terminated if there is any error. We will
// just send the error back to dm-user.
if (!WriteDmUserPayload(ret)) {
return false;
}
remaining_size -= ret;
offset += ret;
} while (remaining_size > 0);
return true;
}
bool Snapuserd::Run() {
struct dm_user_header* header = bufsink_.GetHeaderPtr();
@ -698,122 +854,16 @@ bool Snapuserd::Run() {
switch (header->type) {
case DM_USER_REQ_MAP_READ: {
size_t remaining_size = header->len;
loff_t offset = 0;
do {
size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
header->type = DM_USER_RESP_SUCCESS;
// Request to sector 0 is always for kernel
// representation of COW header. This IO should be only
// once during dm-snapshot device creation. We should
// never see multiple IO requests. Additionally this IO
// will always be a single 4k.
if (header->sector == 0) {
CHECK(metadata_read_done_ == true);
CHECK(read_size == BLOCK_SIZE);
ConstructKernelCowHeader();
SNAP_LOG(DEBUG) << "Kernel header constructed";
} else {
// Convert the sector number to a chunk ID.
//
// Check if the chunk ID represents a metadata
// page. If the chunk ID is not found in the
// vector, then it points to a metadata page.
chunk_t chunk = SectorToChunk(header->sector);
if (chunk_map_.find(chunk) == chunk_map_.end()) {
if (!ReadDiskExceptions(chunk, read_size)) {
SNAP_LOG(ERROR) << "ReadDiskExceptions failed for chunk id: " << chunk
<< "Sector: " << header->sector;
header->type = DM_USER_RESP_ERROR;
} else {
SNAP_LOG(DEBUG) << "ReadDiskExceptions success for chunk id: " << chunk
<< "Sector: " << header->sector;
}
} else {
SNAP_LOG(DEBUG) << "ReadData: chunk: " << chunk << " len: " << header->len
<< " read_size: " << read_size << " offset: " << offset;
chunk_t num_chunks_read = (offset >> BLOCK_SHIFT);
if (!ReadData(chunk + num_chunks_read, read_size)) {
SNAP_LOG(ERROR) << "ReadData failed for chunk id: " << chunk
<< "Sector: " << header->sector;
header->type = DM_USER_RESP_ERROR;
} else {
SNAP_LOG(DEBUG) << "ReadData success for chunk id: " << chunk
<< "Sector: " << header->sector;
}
}
}
// Daemon will not be terminated if there is any error. We will
// just send the error back to dm-user.
if (!WriteDmUserPayload(read_size)) {
return false;
}
remaining_size -= read_size;
offset += read_size;
} while (remaining_size);
if (!DmuserReadRequest()) {
return false;
}
break;
}
case DM_USER_REQ_MAP_WRITE: {
// device mapper has the capability to allow
// targets to flush the cache when writes are completed. This
// is controlled by each target by a flag "flush_supported".
// This flag is set by dm-user. When flush is supported,
// a number of zero-length bio's will be submitted to
// the target for the purpose of flushing cache. It is the
// responsibility of the target driver - which is dm-user in this
// case, to remap these bio's to the underlying device. Since,
// there is no underlying device for dm-user, this zero length
// bio's gets routed to daemon.
//
// Flush operations are generated post merge by dm-snap by having
// REQ_PREFLUSH flag set. Snapuser daemon doesn't have anything
// to flush per se; hence, just respond back with a success message.
if (header->sector == 0) {
CHECK(header->len == 0);
header->type = DM_USER_RESP_SUCCESS;
if (!WriteDmUserPayload(0)) {
return false;
}
break;
}
size_t remaining_size = header->len;
size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
CHECK(read_size == BLOCK_SIZE);
CHECK(header->sector > 0);
chunk_t chunk = SectorToChunk(header->sector);
CHECK(chunk_map_.find(chunk) == chunk_map_.end());
void* buffer = bufsink_.GetPayloadBuffer(read_size);
CHECK(buffer != nullptr);
header->type = DM_USER_RESP_SUCCESS;
if (!ReadDmUserPayload(buffer, read_size)) {
SNAP_LOG(ERROR) << "ReadDmUserPayload failed for chunk id: " << chunk
<< "Sector: " << header->sector;
header->type = DM_USER_RESP_ERROR;
}
if (header->type == DM_USER_RESP_SUCCESS && !ProcessMergeComplete(chunk, buffer)) {
SNAP_LOG(ERROR) << "ProcessMergeComplete failed for chunk id: " << chunk
<< "Sector: " << header->sector;
header->type = DM_USER_RESP_ERROR;
} else {
SNAP_LOG(DEBUG) << "ProcessMergeComplete success for chunk id: " << chunk
<< "Sector: " << header->sector;
}
if (!WriteDmUserPayload(0)) {
if (!DmuserWriteRequest()) {
return false;
}
break;
}
}

View file

@ -22,9 +22,9 @@
#include <cstring>
#include <iostream>
#include <limits>
#include <map>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
#include <android-base/file.h>
@ -72,6 +72,9 @@ class Snapuserd final {
bool IsAttached() const { return ctrl_fd_ >= 0; }
private:
bool DmuserReadRequest();
bool DmuserWriteRequest();
bool ReadDmUserHeader();
bool ReadDmUserPayload(void* buffer, size_t size);
bool WriteDmUserPayload(size_t size);
@ -79,10 +82,13 @@ class Snapuserd final {
bool ReadMetadata();
bool ZerofillDiskExceptions(size_t read_size);
bool ReadDiskExceptions(chunk_t chunk, size_t size);
bool ReadData(chunk_t chunk, size_t size);
int ReadUnalignedSector(sector_t sector, size_t size,
std::map<sector_t, const CowOperation*>::iterator& it);
int ReadData(sector_t sector, size_t size);
bool IsChunkIdMetadata(chunk_t chunk);
chunk_t GetNextAllocatableChunkId(chunk_t chunk_id);
bool ProcessCowOp(const CowOperation* cow_op);
bool ProcessReplaceOp(const CowOperation* cow_op);
bool ProcessCopyOp(const CowOperation* cow_op);
bool ProcessZeroOp();
@ -94,6 +100,7 @@ class Snapuserd final {
bool ProcessMergeComplete(chunk_t chunk, void* buffer);
sector_t ChunkToSector(chunk_t chunk) { return chunk << CHUNK_SHIFT; }
chunk_t SectorToChunk(sector_t sector) { return sector >> CHUNK_SHIFT; }
bool IsBlockAligned(int read_size) { return ((read_size & (BLOCK_SIZE - 1)) == 0); }
std::string cow_device_;
std::string backing_store_device_;
@ -116,9 +123,15 @@ class Snapuserd final {
// mapping of old-chunk to new-chunk
std::vector<std::unique_ptr<uint8_t[]>> vec_;
// Key - Chunk ID
// Key - Sector
// Value - cow operation
std::unordered_map<chunk_t, const CowOperation*> chunk_map_;
//
// chunk_map stores the pseudo mapping of sector
// to COW operations. Each COW op is 4k; however,
// we can get a read request which are as small
// as 512 bytes. Hence, we need to binary search
// in the chunk_map to find the nearest COW op.
std::map<sector_t, const CowOperation*> chunk_map_;
bool metadata_read_done_ = false;
BufferSink bufsink_;