freebsd-skq/sbin/hastd/activemap.c
pjd 1c1e2e8b71 Please welcome HAST - Highly Avalable Storage.
HAST allows to transparently store data on two physically separated machines
connected over the TCP/IP network. HAST works in Primary-Secondary
(Master-Backup, Master-Slave) configuration, which means that only one of the
cluster nodes can be active at any given time. Only Primary node is able to
handle I/O requests to HAST-managed devices. Currently HAST is limited to two
cluster nodes in total.

HAST operates on block level - it provides disk-like devices in /dev/hast/
directory for use by file systems and/or applications. Working on block level
makes it transparent for file systems and applications. There in no difference
between using HAST-provided device and raw disk, partition, etc. All of them
are just regular GEOM providers in FreeBSD.

For more information please consult hastd(8), hastctl(8) and hast.conf(5)
manual pages, as well as http://wiki.FreeBSD.org/HAST.

Sponsored by:	FreeBSD Foundation
Sponsored by:	OMCnet Internet Service GmbH
Sponsored by:	TransIP BV
2010-02-18 23:16:19 +00:00

692 lines
17 KiB
C

/*-
* Copyright (c) 2009-2010 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Pawel Jakub Dawidek under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h> /* powerof2() */
#include <sys/queue.h>
#include <assert.h>
#include <bitstring.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <activemap.h>
#define ACTIVEMAP_MAGIC 0xac71e4
struct activemap {
int am_magic; /* Magic value. */
off_t am_mediasize; /* Media size in bytes. */
uint32_t am_extentsize; /* Extent size in bytes,
must be power of 2. */
uint8_t am_extentshift;/* 2 ^ extentbits == extentsize */
int am_nextents; /* Number of extents. */
size_t am_mapsize; /* Bitmap size in bytes. */
uint16_t *am_memtab; /* An array that holds number of pending
writes per extent. */
bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */
bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */
size_t am_diskmapsize; /* Map size rounded up to sector size. */
uint64_t am_ndirty; /* Number of dirty regions. */
bitstr_t *am_syncmap; /* Bitmap of extents to sync. */
off_t am_syncoff; /* Next synchronization offset. */
TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that
we keep dirty to reduce bitmap
updates. */
int am_nkeepdirty; /* Number of am_keepdirty elements. */
int am_nkeepdirty_limit; /* Maximum number of am_keepdirty
elements. */
};
struct keepdirty {
int kd_extent;
TAILQ_ENTRY(keepdirty) kd_next;
};
/*
* Helper function taken from sys/systm.h to calculate extentshift.
*/
static uint32_t
bitcount32(uint32_t x)
{
x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1);
x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2);
x = (x + (x >> 4)) & 0x0f0f0f0f;
x = (x + (x >> 8));
x = (x + (x >> 16)) & 0x000000ff;
return (x);
}
static __inline int
off2ext(const struct activemap *amp, off_t offset)
{
int extent;
assert(offset >= 0 && offset < amp->am_mediasize);
extent = (offset >> amp->am_extentshift);
assert(extent >= 0 && extent < amp->am_nextents);
return (extent);
}
static __inline off_t
ext2off(const struct activemap *amp, int extent)
{
off_t offset;
assert(extent >= 0 && extent < amp->am_nextents);
offset = ((off_t)extent << amp->am_extentshift);
assert(offset >= 0 && offset < amp->am_mediasize);
return (offset);
}
/*
* Function calculates number of requests needed to synchronize the given
* extent.
*/
static __inline int
ext2reqs(const struct activemap *amp, int ext)
{
off_t left;
if (ext < amp->am_nextents - 1)
return (((amp->am_extentsize - 1) / MAXPHYS) + 1);
assert(ext == amp->am_nextents - 1);
left = amp->am_mediasize % amp->am_extentsize;
if (left == 0)
left = amp->am_extentsize;
return (((left - 1) / MAXPHYS) + 1);
}
/*
* Initialize activemap structure and allocate memory for internal needs.
* Function returns 0 on success and -1 if any of the allocations failed.
*/
int
activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize,
uint32_t sectorsize, uint32_t keepdirty)
{
struct activemap *amp;
assert(ampp != NULL);
assert(mediasize > 0);
assert(extentsize > 0);
assert(powerof2(extentsize));
assert(sectorsize > 0);
assert(powerof2(sectorsize));
assert(keepdirty > 0);
amp = malloc(sizeof(*amp));
if (amp == NULL)
return (-1);
amp->am_mediasize = mediasize;
amp->am_nkeepdirty_limit = keepdirty;
amp->am_extentsize = extentsize;
amp->am_extentshift = bitcount32(extentsize - 1);
amp->am_nextents = ((mediasize - 1) / extentsize) + 1;
amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents);
amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize);
amp->am_ndirty = 0;
amp->am_syncoff = -2;
TAILQ_INIT(&amp->am_keepdirty);
amp->am_nkeepdirty = 0;
amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0]));
amp->am_diskmap = calloc(1, amp->am_diskmapsize);
amp->am_memmap = bit_alloc(amp->am_nextents);
amp->am_syncmap = bit_alloc(amp->am_nextents);
/*
* Check to see if any of the allocations above failed.
*/
if (amp->am_memtab == NULL || amp->am_diskmap == NULL ||
amp->am_memmap == NULL || amp->am_syncmap == NULL) {
if (amp->am_memtab != NULL)
free(amp->am_memtab);
if (amp->am_diskmap != NULL)
free(amp->am_diskmap);
if (amp->am_memmap != NULL)
free(amp->am_memmap);
if (amp->am_syncmap != NULL)
free(amp->am_syncmap);
amp->am_magic = 0;
free(amp);
errno = ENOMEM;
return (-1);
}
amp->am_magic = ACTIVEMAP_MAGIC;
*ampp = amp;
return (0);
}
static struct keepdirty *
keepdirty_find(struct activemap *amp, int extent)
{
struct keepdirty *kd;
TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next) {
if (kd->kd_extent == extent)
break;
}
return (kd);
}
static void
keepdirty_add(struct activemap *amp, int extent)
{
struct keepdirty *kd;
kd = keepdirty_find(amp, extent);
if (kd != NULL) {
/*
* Only move element at the begining.
*/
TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
return;
}
/*
* Add new element, but first remove the most unused one if
* we have too many.
*/
if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) {
kd = TAILQ_LAST(&amp->am_keepdirty, skeepdirty);
assert(kd != NULL);
TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
amp->am_nkeepdirty--;
assert(amp->am_nkeepdirty > 0);
}
if (kd == NULL)
kd = malloc(sizeof(*kd));
/* We can ignore allocation failure. */
if (kd != NULL) {
kd->kd_extent = extent;
amp->am_nkeepdirty++;
TAILQ_INSERT_HEAD(&amp->am_keepdirty, kd, kd_next);
}
}
static void
keepdirty_fill(struct activemap *amp)
{
struct keepdirty *kd;
TAILQ_FOREACH(kd, &amp->am_keepdirty, kd_next)
bit_set(amp->am_diskmap, kd->kd_extent);
}
static void
keepdirty_free(struct activemap *amp)
{
struct keepdirty *kd;
while ((kd = TAILQ_FIRST(&amp->am_keepdirty)) != NULL) {
TAILQ_REMOVE(&amp->am_keepdirty, kd, kd_next);
amp->am_nkeepdirty--;
free(kd);
}
assert(amp->am_nkeepdirty == 0);
}
/*
* Function frees resources allocated by activemap_init() function.
*/
void
activemap_free(struct activemap *amp)
{
assert(amp->am_magic == ACTIVEMAP_MAGIC);
amp->am_magic = 0;
keepdirty_free(amp);
free(amp->am_memtab);
free(amp->am_diskmap);
free(amp->am_memmap);
free(amp->am_syncmap);
}
/*
* Function should be called before we handle write requests. It updates
* internal structures and returns true if on-disk metadata should be updated.
*/
bool
activemap_write_start(struct activemap *amp, off_t offset, off_t length)
{
bool modified;
off_t end;
int ext;
assert(amp->am_magic == ACTIVEMAP_MAGIC);
assert(length > 0);
modified = false;
end = offset + length - 1;
for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
/*
* If the number of pending writes is increased from 0,
* we have to mark the extent as dirty also in on-disk bitmap.
* By returning true we inform the caller that on-disk bitmap
* was modified and has to be flushed to disk.
*/
if (amp->am_memtab[ext]++ == 0) {
assert(!bit_test(amp->am_memmap, ext));
bit_set(amp->am_memmap, ext);
amp->am_ndirty++;
modified = true;
}
keepdirty_add(amp, ext);
}
return (modified);
}
/*
* Function should be called after receiving write confirmation. It updates
* internal structures and returns true if on-disk metadata should be updated.
*/
bool
activemap_write_complete(struct activemap *amp, off_t offset, off_t length)
{
bool modified;
off_t end;
int ext;
assert(amp->am_magic == ACTIVEMAP_MAGIC);
assert(length > 0);
modified = false;
end = offset + length - 1;
for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
/*
* If the number of pending writes goes down to 0, we have to
* mark the extent as clean also in on-disk bitmap.
* By returning true we inform the caller that on-disk bitmap
* was modified and has to be flushed to disk.
*/
assert(amp->am_memtab[ext] > 0);
assert(bit_test(amp->am_memmap, ext));
if (--amp->am_memtab[ext] == 0) {
bit_clear(amp->am_memmap, ext);
amp->am_ndirty--;
modified = true;
}
}
return (modified);
}
/*
* Function should be called after finishing synchronization of one extent.
* It returns true if on-disk metadata should be updated.
*/
bool
activemap_extent_complete(struct activemap *amp, int extent)
{
bool modified;
int reqs;
assert(amp->am_magic == ACTIVEMAP_MAGIC);
assert(extent >= 0 && extent < amp->am_nextents);
modified = false;
reqs = ext2reqs(amp, extent);
assert(amp->am_memtab[extent] >= reqs);
amp->am_memtab[extent] -= reqs;
assert(bit_test(amp->am_memmap, extent));
if (amp->am_memtab[extent] == 0) {
bit_clear(amp->am_memmap, extent);
amp->am_ndirty--;
modified = true;
}
return (modified);
}
/*
* Function returns number of dirty regions.
*/
uint64_t
activemap_ndirty(const struct activemap *amp)
{
assert(amp->am_magic == ACTIVEMAP_MAGIC);
return (amp->am_ndirty);
}
/*
* Function compare on-disk bitmap and in-memory bitmap and returns true if
* they differ and should be flushed to the disk.
*/
bool
activemap_differ(const struct activemap *amp)
{
assert(amp->am_magic == ACTIVEMAP_MAGIC);
return (memcmp(amp->am_diskmap, amp->am_memmap,
amp->am_mapsize) != 0);
}
/*
* Function returns number of bytes used by bitmap.
*/
size_t
activemap_size(const struct activemap *amp)
{
assert(amp->am_magic == ACTIVEMAP_MAGIC);
return (amp->am_mapsize);
}
/*
* Function returns number of bytes needed for storing on-disk bitmap.
* This is the same as activemap_size(), but rounded up to sector size.
*/
size_t
activemap_ondisk_size(const struct activemap *amp)
{
assert(amp->am_magic == ACTIVEMAP_MAGIC);
return (amp->am_diskmapsize);
}
/*
* Function copies the given buffer read from disk to the internal bitmap.
*/
void
activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
{
int ext;
assert(amp->am_magic == ACTIVEMAP_MAGIC);
assert(size >= amp->am_mapsize);
memcpy(amp->am_diskmap, buf, amp->am_mapsize);
memcpy(amp->am_memmap, buf, amp->am_mapsize);
memcpy(amp->am_syncmap, buf, amp->am_mapsize);
bit_ffs(amp->am_memmap, amp->am_nextents, &ext);
if (ext == -1) {
/* There are no dirty extents, so we can leave now. */
return;
}
/*
* Set synchronization offset to the first dirty extent.
*/
activemap_sync_rewind(amp);
/*
* We have dirty extents and we want them to stay that way until
* we synchronize, so we set number of pending writes to number
* of requests needed to synchronize one extent.
*/
amp->am_ndirty = 0;
for (; ext < amp->am_nextents; ext++) {
if (bit_test(amp->am_memmap, ext)) {
amp->am_memtab[ext] = ext2reqs(amp, ext);
amp->am_ndirty++;
}
}
}
/*
* Function merges the given bitmap with existng one.
*/
void
activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
{
bitstr_t *remmap = __DECONST(bitstr_t *, buf);
int ext;
assert(amp->am_magic == ACTIVEMAP_MAGIC);
assert(size >= amp->am_mapsize);
bit_ffs(remmap, amp->am_nextents, &ext);
if (ext == -1) {
/* There are no dirty extents, so we can leave now. */
return;
}
/*
* We have dirty extents and we want them to stay that way until
* we synchronize, so we set number of pending writes to number
* of requests needed to synchronize one extent.
*/
for (; ext < amp->am_nextents; ext++) {
/* Local extent already dirty. */
if (bit_test(amp->am_syncmap, ext))
continue;
/* Remote extent isn't dirty. */
if (!bit_test(remmap, ext))
continue;
bit_set(amp->am_syncmap, ext);
bit_set(amp->am_memmap, ext);
bit_set(amp->am_diskmap, ext);
if (amp->am_memtab[ext] == 0)
amp->am_ndirty++;
amp->am_memtab[ext] = ext2reqs(amp, ext);
}
/*
* Set synchronization offset to the first dirty extent.
*/
activemap_sync_rewind(amp);
}
/*
* Function returns pointer to internal bitmap that should be written to disk.
*/
const unsigned char *
activemap_bitmap(struct activemap *amp, size_t *sizep)
{
assert(amp->am_magic == ACTIVEMAP_MAGIC);
if (sizep != NULL)
*sizep = amp->am_diskmapsize;
memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
keepdirty_fill(amp);
return ((const unsigned char *)amp->am_diskmap);
}
/*
* Function calculates size needed to store bitmap on disk.
*/
size_t
activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
uint32_t sectorsize)
{
uint64_t nextents, mapsize;
assert(mediasize > 0);
assert(extentsize > 0);
assert(powerof2(extentsize));
assert(sectorsize > 0);
assert(powerof2(sectorsize));
nextents = ((mediasize - 1) / extentsize) + 1;
mapsize = sizeof(bitstr_t) * bitstr_size(nextents);
return (roundup2(mapsize, sectorsize));
}
/*
* Set synchronization offset to the first dirty extent.
*/
void
activemap_sync_rewind(struct activemap *amp)
{
int ext;
assert(amp->am_magic == ACTIVEMAP_MAGIC);
bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
if (ext == -1) {
/* There are no extents to synchronize. */
amp->am_syncoff = -2;
return;
}
/*
* Mark that we want to start synchronization from the begining.
*/
amp->am_syncoff = -1;
}
/*
* Return next offset of where we should synchronize.
*/
off_t
activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
{
off_t syncoff, left;
int ext;
assert(amp->am_magic == ACTIVEMAP_MAGIC);
assert(lengthp != NULL);
assert(syncextp != NULL);
*syncextp = -1;
if (amp->am_syncoff == -2)
return (-1);
if (amp->am_syncoff >= 0 &&
(amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
off2ext(amp, amp->am_syncoff) !=
off2ext(amp, amp->am_syncoff + MAXPHYS))) {
/*
* We are about to change extent, so mark previous one as clean.
*/
ext = off2ext(amp, amp->am_syncoff);
bit_clear(amp->am_syncmap, ext);
*syncextp = ext;
amp->am_syncoff = -1;
}
if (amp->am_syncoff == -1) {
/*
* Let's find first extent to synchronize.
*/
bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
if (ext == -1) {
amp->am_syncoff = -2;
return (-1);
}
amp->am_syncoff = ext2off(amp, ext);
} else {
/*
* We don't change extent, so just increase offset.
*/
amp->am_syncoff += MAXPHYS;
if (amp->am_syncoff >= amp->am_mediasize) {
amp->am_syncoff = -2;
return (-1);
}
}
syncoff = amp->am_syncoff;
left = ext2off(amp, off2ext(amp, syncoff)) +
amp->am_extentsize - syncoff;
if (syncoff + left > amp->am_mediasize)
left = amp->am_mediasize - syncoff;
if (left > MAXPHYS)
left = MAXPHYS;
assert(left >= 0 && left <= MAXPHYS);
assert(syncoff >= 0 && syncoff < amp->am_mediasize);
assert(syncoff + left >= 0 && syncoff + left <= amp->am_mediasize);
*lengthp = left;
return (syncoff);
}
/*
* Mark extent(s) containing the given region for synchronization.
* Most likely one of the components is unavailable.
*/
bool
activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
{
bool modified;
off_t end;
int ext;
assert(amp->am_magic == ACTIVEMAP_MAGIC);
modified = false;
end = offset + length - 1;
for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
if (bit_test(amp->am_syncmap, ext)) {
/* Already marked for synchronization. */
assert(bit_test(amp->am_memmap, ext));
continue;
}
bit_set(amp->am_syncmap, ext);
if (!bit_test(amp->am_memmap, ext)) {
bit_set(amp->am_memmap, ext);
amp->am_ndirty++;
}
amp->am_memtab[ext] += ext2reqs(amp, ext);
modified = true;
}
return (modified);
}
void
activemap_dump(const struct activemap *amp)
{
int bit;
printf("M: ");
for (bit = 0; bit < amp->am_nextents; bit++)
printf("%d", bit_test(amp->am_memmap, bit) ? 1 : 0);
printf("\n");
printf("D: ");
for (bit = 0; bit < amp->am_nextents; bit++)
printf("%d", bit_test(amp->am_diskmap, bit) ? 1 : 0);
printf("\n");
printf("S: ");
for (bit = 0; bit < amp->am_nextents; bit++)
printf("%d", bit_test(amp->am_syncmap, bit) ? 1 : 0);
printf("\n");
}