Improve support for blacklisting bad memory locations. The user can supply

a text file with a list of physical memory addresses to exclude, and have it
loaded at boot time via the provided example in loader.conf.  The tunable
'vm.blacklist' remains, but using an external file means that there's no
practical limit to the size of the list.  This change also improves the
scanning algorithm for processing the list, scanning the list only once
instead of scanning it for every page in the system.  Both the sysctl and
the file can be unsorted and contain duplicates so long as each entry is
numeric (decimal or hex) and is separated by a space, comma, or newline
character.  The sysctl 'vm.page_blacklist' is now provided to report what
memory locations were successfully excluded.

Reviewed by:	imp, emax
Obtained from:	Netflix, Inc.
MFC after:	3 days
This commit is contained in:
Scott Long 2015-04-29 15:57:14 +00:00
parent bfce3bb269
commit affc4a4bff
2 changed files with 165 additions and 27 deletions

View File

@ -48,6 +48,16 @@ entropy_cache_type="/boot/entropy"
#kern.random.sys.seeded="0" # Set this to 1 to start /dev/random
# without waiting for a (re)seed.
##############################################################
### RAM Blacklist configuration #############################
##############################################################
ram_blacklist_load="NO" # Set this to YES to load a file
# containing a list of addresses to
# exclude from the running system.
ram_blacklist_name="/boot/blacklist.txt" # Set this to the name of the file
ram_blacklist_type="ram_blacklist" # Required for the kernel to find
# the blacklist module
##############################################################
### Loader settings ########################################

View File

@ -91,12 +91,14 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
@ -142,6 +144,12 @@ static int pa_tryrelock_restart;
SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
&pa_tryrelock_restart, 0, "Number of tryrelock restarts");
static TAILQ_HEAD(, vm_page) blacklist_head;
static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
static uma_zone_t fakepg_zone;
static struct vnode *vm_page_alloc_init(vm_page_t m);
@ -216,35 +224,153 @@ vm_set_page_size(void)
}
/*
* vm_page_blacklist_lookup:
* vm_page_blacklist_next:
*
* See if a physical address in this page has been listed
* in the blacklist tunable. Entries in the tunable are
* separated by spaces or commas. If an invalid integer is
* encountered then the rest of the string is skipped.
* Find the next entry in the provided string of blacklist
* addresses. Entries are separated by space, comma, or newline.
* If an invalid integer is encountered then the rest of the
* string is skipped. Updates the list pointer to the next
* character, or NULL if the string is exhausted or invalid.
*/
static int
vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
static vm_paddr_t
vm_page_blacklist_next(char **list, char *end)
{
vm_paddr_t bad;
char *cp, *pos;
for (pos = list; *pos != '\0'; pos = cp) {
bad = strtoq(pos, &cp, 0);
if (*cp != '\0') {
if (*cp == ' ' || *cp == ',') {
cp++;
if (cp == pos)
continue;
} else
break;
}
if (pa == trunc_page(bad))
return (1);
if (list == NULL || *list == NULL)
return (0);
if (**list =='\0') {
*list = NULL;
return (0);
}
/*
* If there's no end pointer then the buffer is coming from
* the kenv and we know it's null-terminated.
*/
if (end == NULL)
end = *list + strlen(*list);
/* Ensure that strtoq() won't walk off the end */
if (*end != '\0') {
if (*end == '\n' || *end == ' ' || *end == ',')
*end = '\0';
else {
printf("Blacklist not terminated, skipping\n");
*list = NULL;
return (0);
}
}
for (pos = *list; *pos != '\0'; pos = cp) {
bad = strtoq(pos, &cp, 0);
if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
if (bad == 0) {
if (++cp < end)
continue;
else
break;
}
} else
break;
if (*cp == '\0' || ++cp >= end)
*list = NULL;
else
*list = cp;
return (trunc_page(bad));
}
printf("Garbage in RAM blacklist, skipping\n");
*list = NULL;
return (0);
}
/*
* vm_page_blacklist_check:
*
* Iterate through the provided string of blacklist addresses, pulling
* each entry out of the physical allocator free list and putting it
* onto a list for reporting via the vm.page_blacklist sysctl.
*/
static void
vm_page_blacklist_check(char *list, char *end)
{
vm_paddr_t pa;
vm_page_t m;
char *next;
int ret;
next = list;
while (next != NULL) {
if ((pa = vm_page_blacklist_next(&next, end)) == 0)
continue;
m = vm_phys_paddr_to_vm_page(pa);
if (m == NULL)
continue;
mtx_lock(&vm_page_queue_free_mtx);
ret = vm_phys_unfree_page(m);
mtx_unlock(&vm_page_queue_free_mtx);
if (ret == TRUE) {
TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
if (bootverbose)
printf("Skipping page with pa 0x%jx\n",
(uintmax_t)pa);
}
}
}
/*
* vm_page_blacklist_load:
*
* Search for a special module named "ram_blacklist". It'll be a
* plain text file provided by the user via the loader directive
* of the same name.
*/
static void
vm_page_blacklist_load(char **list, char **end)
{
void *mod;
u_char *ptr;
u_int len;
mod = NULL;
ptr = NULL;
mod = preload_search_by_type("ram_blacklist");
if (mod != NULL) {
ptr = preload_fetch_addr(mod);
len = preload_fetch_size(mod);
}
*list = ptr;
if (ptr != NULL)
*end = ptr + len;
else
*end = NULL;
return;
}
static int
sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
{
vm_page_t m;
struct sbuf sbuf;
int error, first;
first = 1;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
TAILQ_FOREACH(m, &blacklist_head, listq) {
sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
(uintmax_t)m->phys_addr);
first = 0;
}
error = sbuf_finish(&sbuf);
sbuf_delete(&sbuf);
return (error);
}
static void
vm_page_domain_init(struct vm_domain *vmd)
{
@ -290,7 +416,7 @@ vm_page_startup(vm_offset_t vaddr)
int i;
vm_paddr_t pa;
vm_paddr_t last_pa;
char *list;
char *list, *listend;
vm_paddr_t end;
vm_paddr_t biggestsize;
vm_paddr_t low_water, high_water;
@ -477,20 +603,22 @@ vm_page_startup(vm_offset_t vaddr)
*/
vm_cnt.v_page_count = 0;
vm_cnt.v_free_count = 0;
list = kern_getenv("vm.blacklist");
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
pa = phys_avail[i];
last_pa = phys_avail[i + 1];
while (pa < last_pa) {
if (list != NULL &&
vm_page_blacklist_lookup(list, pa))
printf("Skipping page with pa 0x%jx\n",
(uintmax_t)pa);
else
vm_phys_add_page(pa);
vm_phys_add_page(pa);
pa += PAGE_SIZE;
}
}
TAILQ_INIT(&blacklist_head);
vm_page_blacklist_load(&list, &listend);
vm_page_blacklist_check(list, listend);
list = kern_getenv("vm.blacklist");
vm_page_blacklist_check(list, NULL);
freeenv(list);
#if VM_NRESERVLEVEL > 0
/*