freebsd-nq/lib/libpmcstat/libpmcstat_image.c
Andrew Gallatin 517a7adb11 Make hwpmc work for userspace binaries again
hwpmc has been utterly broken for userspace binaries, and has been
labeling all samples from userspace binaries as dubious frames. The
issues are that:

-The check for ph.p_offset & (-ph.p_align) == 0 was mostly bogus. The
 intent was to ignore all executable segments other than the first,
 which when using BFD appeared in the first page, but with current LLD
 a read-only data segment appears before the executable segment,
 pushing the latter into the second page or later. This meant no
 executable segment was ever found, and thus pi_vaddr remained
 0. Instead of relying on BFD's layout, track whether we've seen an
 executable segment explicitly with a local bool.

-Shared libraries were not parsing the segments to calculate pi_vaddr,
 resulting in it always being 0. Again, when using BFD, the executable
 segment started at the first page, and so pi_vaddr was genuinely
 meant to be 0, but not with LLD's current layout. This meant that
 pmcstat_image_link's offset calculation gave the base address of the
 segment in memory, rather than the base address of the whole library
 in memory, and so when adding that to pi_start/pi_end to get the
 range of the executable sections in memory it double-counted the
 offset of the first executable segment within the library. Thus we
 need to do the exact same parsing for ET_DYN as we do for ET_EXEC,
 which is simpler to write as special-casing ET_REL to not look for
 segments. Note that, whilst PT_INTERP isn't needed for shared
 libraries, it will be for PIEs, which pmcstat still fails to handle
 due to not knowing the base address of the PIE; we get the base
 address for libraries by MAP_IN events, and for rtld by virtue of the
 process's entry address being rtld's, but have no equivalent for the
 executable.

Fixes courtesy of jrtc27@.

Reviewed by: jrtc27, jhb (earlier version)
Differential Revision: https://reviews.freebsd.org/D33055
Sponsored by: Netflix
2021-12-15 08:38:36 -05:00

558 lines
14 KiB
C

/*-
* Copyright (c) 2003-2008 Joseph Koshy
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/cpuset.h>
#include <sys/param.h>
#include <sys/endian.h>
#include <sys/pmc.h>
#include <sys/sysctl.h>
#include <sys/imgact_aout.h>
#include <sys/imgact_elf.h>
#include <netinet/in.h>
#include <assert.h>
#include <err.h>
#include <fcntl.h>
#include <pmc.h>
#include <pmclog.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
#include "libpmcstat.h"
#define min(A,B) ((A) < (B) ? (A) : (B))
#define max(A,B) ((A) > (B) ? (A) : (B))
/*
* Add the list of symbols in the given section to the list associated
* with the object.
*/
void
pmcstat_image_add_symbols(struct pmcstat_image *image, Elf *e,
Elf_Scn *scn, GElf_Shdr *sh)
{
int firsttime;
size_t n, newsyms, nshsyms, nfuncsyms;
struct pmcstat_symbol *symptr;
char *fnname;
GElf_Sym sym;
Elf_Data *data;
if ((data = elf_getdata(scn, NULL)) == NULL)
return;
/*
* Determine the number of functions named in this
* section.
*/
nshsyms = sh->sh_size / sh->sh_entsize;
for (n = nfuncsyms = 0; n < nshsyms; n++) {
if (gelf_getsym(data, (int) n, &sym) != &sym)
return;
if (GELF_ST_TYPE(sym.st_info) == STT_FUNC)
nfuncsyms++;
}
if (nfuncsyms == 0)
return;
/*
* Allocate space for the new entries.
*/
firsttime = image->pi_symbols == NULL;
symptr = reallocarray(image->pi_symbols,
image->pi_symcount + nfuncsyms, sizeof(*symptr));
if (symptr == image->pi_symbols) /* realloc() failed. */
return;
image->pi_symbols = symptr;
/*
* Append new symbols to the end of the current table.
*/
symptr += image->pi_symcount;
for (n = newsyms = 0; n < nshsyms; n++) {
if (gelf_getsym(data, (int) n, &sym) != &sym)
return;
if (GELF_ST_TYPE(sym.st_info) != STT_FUNC)
continue;
if (sym.st_shndx == STN_UNDEF)
continue;
if (!firsttime && pmcstat_symbol_search(image, sym.st_value))
continue; /* We've seen this symbol already. */
if ((fnname = elf_strptr(e, sh->sh_link, sym.st_name))
== NULL)
continue;
#ifdef __arm__
/* Remove spurious ARM function name. */
if (fnname[0] == '$' &&
(fnname[1] == 'a' || fnname[1] == 't' ||
fnname[1] == 'd') &&
fnname[2] == '\0')
continue;
#endif
symptr->ps_name = pmcstat_string_intern(fnname);
symptr->ps_start = sym.st_value - image->pi_vaddr;
symptr->ps_end = symptr->ps_start + sym.st_size;
symptr++;
newsyms++;
}
image->pi_symcount += newsyms;
if (image->pi_symcount == 0)
return;
assert(newsyms <= nfuncsyms);
/*
* Return space to the system if there were duplicates.
*/
if (newsyms < nfuncsyms)
image->pi_symbols = reallocarray(image->pi_symbols,
image->pi_symcount, sizeof(*symptr));
/*
* Keep the list of symbols sorted.
*/
qsort(image->pi_symbols, image->pi_symcount, sizeof(*symptr),
pmcstat_symbol_compare);
/*
* Deal with function symbols that have a size of 'zero' by
* making them extend to the next higher address. These
* symbols are usually defined in assembly code.
*/
for (symptr = image->pi_symbols;
symptr < image->pi_symbols + (image->pi_symcount - 1);
symptr++)
if (symptr->ps_start == symptr->ps_end)
symptr->ps_end = (symptr+1)->ps_start;
}
/*
* Record the fact that PC values from 'start' to 'end' come from
* image 'image'.
*/
void
pmcstat_image_link(struct pmcstat_process *pp, struct pmcstat_image *image,
uintfptr_t start)
{
struct pmcstat_pcmap *pcm, *pcmnew;
uintfptr_t offset;
#ifdef __powerpc__
unsigned long kernbase;
size_t kernbase_len;
#endif
assert(image->pi_type != PMCSTAT_IMAGE_UNKNOWN &&
image->pi_type != PMCSTAT_IMAGE_INDETERMINABLE);
if ((pcmnew = malloc(sizeof(*pcmnew))) == NULL)
err(EX_OSERR, "ERROR: Cannot create a map entry");
/*
* PowerPC kernel is of DYN type and it has a base address
* where it is initially loaded, before being relocated.
* As the address in 'start' is where the kernel was relocated to,
* but the symbols always use the original base address, we need to
* subtract it to get the correct offset.
*/
#ifdef __powerpc__
if (pp->pp_pid == -1) {
kernbase = 0;
kernbase_len = sizeof(kernbase);
if (sysctlbyname("kern.base_address", &kernbase, &kernbase_len,
NULL, 0) == -1)
warnx(
"WARNING: Could not retrieve kernel base address");
else
start -= kernbase;
}
#endif
/*
* Adjust the map entry to only cover the text portion
* of the object.
*/
offset = start - image->pi_vaddr;
pcmnew->ppm_lowpc = image->pi_start + offset;
pcmnew->ppm_highpc = image->pi_end + offset;
pcmnew->ppm_image = image;
assert(pcmnew->ppm_lowpc < pcmnew->ppm_highpc);
/* Overlapped mmap()'s are assumed to never occur. */
TAILQ_FOREACH(pcm, &pp->pp_map, ppm_next)
if (pcm->ppm_lowpc >= pcmnew->ppm_highpc)
break;
if (pcm == NULL)
TAILQ_INSERT_TAIL(&pp->pp_map, pcmnew, ppm_next);
else
TAILQ_INSERT_BEFORE(pcm, pcmnew, ppm_next);
}
/*
* Determine whether a given executable image is an A.OUT object, and
* if so, fill in its parameters from the text file.
* Sets image->pi_type.
*/
void
pmcstat_image_get_aout_params(struct pmcstat_image *image,
struct pmcstat_args *args)
{
int fd;
ssize_t nbytes;
struct exec ex;
const char *path;
char buffer[PATH_MAX];
path = pmcstat_string_unintern(image->pi_execpath);
assert(path != NULL);
if (image->pi_iskernelmodule)
errx(EX_SOFTWARE,
"ERROR: a.out kernel modules are unsupported \"%s\"", path);
(void) snprintf(buffer, sizeof(buffer), "%s%s",
args->pa_fsroot, path);
if ((fd = open(buffer, O_RDONLY, 0)) < 0 ||
(nbytes = read(fd, &ex, sizeof(ex))) < 0) {
if (args->pa_verbosity >= 2)
warn("WARNING: Cannot determine type of \"%s\"",
path);
image->pi_type = PMCSTAT_IMAGE_INDETERMINABLE;
if (fd != -1)
(void) close(fd);
return;
}
(void) close(fd);
if ((unsigned) nbytes != sizeof(ex) ||
N_BADMAG(ex))
return;
image->pi_type = PMCSTAT_IMAGE_AOUT;
/* TODO: the rest of a.out processing */
return;
}
/*
* Examine an ELF file to determine the size of its text segment.
* Sets image->pi_type if anything conclusive can be determined about
* this image.
*/
void
pmcstat_image_get_elf_params(struct pmcstat_image *image,
struct pmcstat_args *args)
{
int fd;
size_t i, nph, nsh;
const char *path, *elfbase;
char *p, *endp;
bool first_exec_segment;
uintfptr_t minva, maxva;
Elf *e;
Elf_Scn *scn;
GElf_Ehdr eh;
GElf_Phdr ph;
GElf_Shdr sh;
enum pmcstat_image_type image_type;
char buffer[PATH_MAX];
char buffer_modules[PATH_MAX];
assert(image->pi_type == PMCSTAT_IMAGE_UNKNOWN);
image->pi_start = minva = ~(uintfptr_t) 0;
image->pi_end = maxva = (uintfptr_t) 0;
image->pi_type = image_type = PMCSTAT_IMAGE_INDETERMINABLE;
image->pi_isdynamic = 0;
image->pi_dynlinkerpath = NULL;
image->pi_vaddr = 0;
path = pmcstat_string_unintern(image->pi_execpath);
assert(path != NULL);
/*
* Look for kernel modules under FSROOT/KERNELPATH/NAME and
* FSROOT/boot/modules/NAME, and user mode executable objects
* under FSROOT/PATHNAME.
*/
if (image->pi_iskernelmodule) {
(void) snprintf(buffer, sizeof(buffer), "%s%s/%s",
args->pa_fsroot, args->pa_kernel, path);
(void) snprintf(buffer_modules, sizeof(buffer_modules),
"%s/boot/modules/%s", args->pa_fsroot, path);
} else {
(void) snprintf(buffer, sizeof(buffer), "%s%s",
args->pa_fsroot, path);
}
e = NULL;
fd = open(buffer, O_RDONLY, 0);
if (fd < 0 && !image->pi_iskernelmodule) {
warnx("WARNING: Cannot open \"%s\".",
buffer);
goto done;
}
if (fd < 0 && (fd = open(buffer_modules, O_RDONLY, 0)) < 0) {
warnx("WARNING: Cannot open \"%s\" or \"%s\".",
buffer, buffer_modules);
goto done;
}
if (elf_version(EV_CURRENT) == EV_NONE) {
warnx("WARNING: failed to init elf\n");
goto done;
}
if ((e = elf_begin(fd, ELF_C_READ, NULL)) == NULL) {
warnx("WARNING: Cannot read \"%s\".",
buffer);
goto done;
}
if (elf_kind(e) != ELF_K_ELF) {
if (args->pa_verbosity >= 2)
warnx("WARNING: Cannot determine the type of \"%s\".",
buffer);
goto done;
}
if (gelf_getehdr(e, &eh) != &eh) {
warnx(
"WARNING: Cannot retrieve the ELF Header for \"%s\": %s.",
buffer, elf_errmsg(-1));
goto done;
}
if (eh.e_type != ET_EXEC && eh.e_type != ET_DYN &&
!(image->pi_iskernelmodule && eh.e_type == ET_REL)) {
warnx("WARNING: \"%s\" is of an unsupported ELF type.",
buffer);
goto done;
}
image_type = eh.e_ident[EI_CLASS] == ELFCLASS32 ?
PMCSTAT_IMAGE_ELF32 : PMCSTAT_IMAGE_ELF64;
/*
* Determine the virtual address where an executable would be
* loaded. Additionally, for dynamically linked executables,
* save the pathname to the runtime linker.
*/
if (eh.e_type != ET_REL) {
if (elf_getphnum(e, &nph) == 0) {
warnx(
"WARNING: Could not determine the number of program headers in \"%s\": %s.",
buffer,
elf_errmsg(-1));
goto done;
}
first_exec_segment = true;
for (i = 0; i < eh.e_phnum; i++) {
if (gelf_getphdr(e, i, &ph) != &ph) {
warnx(
"WARNING: Retrieval of PHDR entry #%ju in \"%s\" failed: %s.",
(uintmax_t) i, buffer, elf_errmsg(-1));
goto done;
}
switch (ph.p_type) {
case PT_DYNAMIC:
image->pi_isdynamic = 1;
break;
case PT_INTERP:
if ((elfbase = elf_rawfile(e, NULL)) == NULL) {
warnx(
"WARNING: Cannot retrieve the interpreter for \"%s\": %s.",
buffer, elf_errmsg(-1));
goto done;
}
image->pi_dynlinkerpath =
pmcstat_string_intern(elfbase +
ph.p_offset);
break;
case PT_LOAD:
if ((ph.p_flags & PF_X) != 0 &&
first_exec_segment) {
image->pi_vaddr = ph.p_vaddr & (-ph.p_align);
first_exec_segment = false;
}
break;
}
}
}
/*
* Get the min and max VA associated with this ELF object.
*/
if (elf_getshnum(e, &nsh) == 0) {
warnx(
"WARNING: Could not determine the number of sections for \"%s\": %s.",
buffer, elf_errmsg(-1));
goto done;
}
for (i = 0; i < nsh; i++) {
if ((scn = elf_getscn(e, i)) == NULL ||
gelf_getshdr(scn, &sh) != &sh) {
warnx(
"WARNING: Could not retrieve section header #%ju in \"%s\": %s.",
(uintmax_t) i, buffer, elf_errmsg(-1));
goto done;
}
if (sh.sh_flags & SHF_EXECINSTR) {
minva = min(minva, sh.sh_addr);
maxva = max(maxva, sh.sh_addr + sh.sh_size);
}
if (sh.sh_type == SHT_SYMTAB || sh.sh_type == SHT_DYNSYM)
pmcstat_image_add_symbols(image, e, scn, &sh);
}
image->pi_start = minva;
image->pi_end = maxva;
image->pi_type = image_type;
image->pi_fullpath = pmcstat_string_intern(buffer);
/* Build display name
*/
endp = buffer;
for (p = buffer; *p; p++)
if (*p == '/')
endp = p+1;
image->pi_name = pmcstat_string_intern(endp);
done:
(void) elf_end(e);
if (fd >= 0)
(void) close(fd);
return;
}
/*
* Given an image descriptor, determine whether it is an ELF, or AOUT.
* If no handler claims the image, set its type to 'INDETERMINABLE'.
*/
void
pmcstat_image_determine_type(struct pmcstat_image *image,
struct pmcstat_args *args)
{
assert(image->pi_type == PMCSTAT_IMAGE_UNKNOWN);
/* Try each kind of handler in turn */
if (image->pi_type == PMCSTAT_IMAGE_UNKNOWN)
pmcstat_image_get_elf_params(image, args);
if (image->pi_type == PMCSTAT_IMAGE_UNKNOWN)
pmcstat_image_get_aout_params(image, args);
/*
* Otherwise, remember that we tried to determine
* the object's type and had failed.
*/
if (image->pi_type == PMCSTAT_IMAGE_UNKNOWN)
image->pi_type = PMCSTAT_IMAGE_INDETERMINABLE;
}
/*
* Locate an image descriptor given an interned path, adding a fresh
* descriptor to the cache if necessary. This function also finds a
* suitable name for this image's sample file.
*
* We defer filling in the file format specific parts of the image
* structure till the time we actually see a sample that would fall
* into this image.
*/
struct pmcstat_image *
pmcstat_image_from_path(pmcstat_interned_string internedpath,
int iskernelmodule, struct pmcstat_args *args,
struct pmc_plugins *plugins)
{
int hash;
struct pmcstat_image *pi;
hash = pmcstat_string_lookup_hash(internedpath);
/* First, look for an existing entry. */
LIST_FOREACH(pi, &pmcstat_image_hash[hash], pi_next)
if (pi->pi_execpath == internedpath &&
pi->pi_iskernelmodule == iskernelmodule)
return (pi);
/*
* Allocate a new entry and place it at the head of the hash
* and LRU lists.
*/
pi = malloc(sizeof(*pi));
if (pi == NULL)
return (NULL);
pi->pi_type = PMCSTAT_IMAGE_UNKNOWN;
pi->pi_execpath = internedpath;
pi->pi_start = ~0;
pi->pi_end = 0;
pi->pi_entry = 0;
pi->pi_vaddr = 0;
pi->pi_isdynamic = 0;
pi->pi_iskernelmodule = iskernelmodule;
pi->pi_dynlinkerpath = NULL;
pi->pi_symbols = NULL;
pi->pi_symcount = 0;
pi->pi_addr2line = NULL;
if (plugins[args->pa_pplugin].pl_initimage != NULL)
plugins[args->pa_pplugin].pl_initimage(pi);
if (plugins[args->pa_plugin].pl_initimage != NULL)
plugins[args->pa_plugin].pl_initimage(pi);
LIST_INSERT_HEAD(&pmcstat_image_hash[hash], pi, pi_next);
return (pi);
}