fabient 5edfb77dd3 Add software PMC support.
New kernel events can be added at various location for sampling or counting.
This will for example allow easy system profiling whatever the processor is
with known tools like pmcstat(8).

Simultaneous usage of software PMC and hardware PMC is possible, for example
looking at the lock acquire failure, page fault while sampling on
instructions.

Sponsored by: NETASQ
MFC after:	1 month
2012-03-28 20:58:30 +00:00

584 lines
15 KiB
C

/*-
* Copyright (c) 2005-2007 Joseph Koshy
* Copyright (c) 2007 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/pmc.h>
#include <sys/pmclog.h>
#include <assert.h>
#include <errno.h>
#include <pmc.h>
#include <pmclog.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <machine/pmc_mdep.h>
#include "libpmcinternal.h"
#define PMCLOG_BUFFER_SIZE 4096
/*
* API NOTES
*
* The pmclog(3) API is oriented towards parsing an event stream in
* "realtime", i.e., from an data source that may or may not preserve
* record boundaries -- for example when the data source is elsewhere
* on a network. The API allows data to be fed into the parser zero
* or more bytes at a time.
*
* The state for a log file parser is maintained in a 'struct
* pmclog_parse_state'. Parser invocations are done by calling
* 'pmclog_read()'; this function will inform the caller when a
* complete event is parsed.
*
* The parser first assembles a complete log file event in an internal
* work area (see "ps_saved" below). Once a complete log file event
* is read, the parser then parses it and converts it to an event
* descriptor usable by the client. We could possibly avoid this two
* step process by directly parsing the input log to set fields in the
* event record. However the parser's state machine would get
* insanely complicated, and this code is unlikely to be used in
* performance critical paths.
*/
enum pmclog_parser_state {
PL_STATE_NEW_RECORD, /* in-between records */
PL_STATE_EXPECTING_HEADER, /* header being read */
PL_STATE_PARTIAL_RECORD, /* header present but not the record */
PL_STATE_ERROR /* parsing error encountered */
};
struct pmclog_parse_state {
enum pmclog_parser_state ps_state;
enum pmc_cputype ps_arch; /* log file architecture */
uint32_t ps_version; /* hwpmc version */
int ps_initialized; /* whether initialized */
int ps_count; /* count of records processed */
off_t ps_offset; /* stream byte offset */
union pmclog_entry ps_saved; /* saved partial log entry */
int ps_svcount; /* #bytes saved */
int ps_fd; /* active fd or -1 */
char *ps_buffer; /* scratch buffer if fd != -1 */
char *ps_data; /* current parse pointer */
size_t ps_len; /* length of buffered data */
};
#define PMCLOG_HEADER_FROM_SAVED_STATE(PS) \
(* ((uint32_t *) &(PS)->ps_saved))
#define PMCLOG_INITIALIZE_READER(LE,A) LE = (uint32_t *) &(A)
#define PMCLOG_READ32(LE,V) do { \
(V) = *(LE)++; \
} while (0)
#define PMCLOG_READ64(LE,V) do { \
uint64_t _v; \
_v = (uint64_t) *(LE)++; \
_v |= ((uint64_t) *(LE)++) << 32; \
(V) = _v; \
} while (0)
#define PMCLOG_READSTRING(LE,DST,LEN) strlcpy((DST), (char *) (LE), (LEN))
/*
* Assemble a log record from '*len' octets starting from address '*data'.
* Update 'data' and 'len' to reflect the number of bytes consumed.
*
* '*data' is potentially an unaligned address and '*len' octets may
* not be enough to complete a event record.
*/
static enum pmclog_parser_state
pmclog_get_record(struct pmclog_parse_state *ps, char **data, ssize_t *len)
{
int avail, copylen, recordsize, used;
uint32_t h;
const int HEADERSIZE = sizeof(uint32_t);
char *src, *dst;
if ((avail = *len) <= 0)
return (ps->ps_state = PL_STATE_ERROR);
src = *data;
h = used = 0;
if (ps->ps_state == PL_STATE_NEW_RECORD)
ps->ps_svcount = 0;
dst = (char *) &ps->ps_saved + ps->ps_svcount;
switch (ps->ps_state) {
case PL_STATE_NEW_RECORD:
/*
* Transitions:
*
* Case A: avail < headersize
* -> 'expecting header'
*
* Case B: avail >= headersize
* B.1: avail < recordsize
* -> 'partial record'
* B.2: avail >= recordsize
* -> 'new record'
*/
copylen = avail < HEADERSIZE ? avail : HEADERSIZE;
bcopy(src, dst, copylen);
ps->ps_svcount = used = copylen;
if (copylen < HEADERSIZE) {
ps->ps_state = PL_STATE_EXPECTING_HEADER;
goto done;
}
src += copylen;
dst += copylen;
h = PMCLOG_HEADER_FROM_SAVED_STATE(ps);
recordsize = PMCLOG_HEADER_TO_LENGTH(h);
if (recordsize <= 0)
goto error;
if (recordsize <= avail) { /* full record available */
bcopy(src, dst, recordsize - copylen);
ps->ps_svcount = used = recordsize;
goto done;
}
/* header + a partial record is available */
bcopy(src, dst, avail - copylen);
ps->ps_svcount = used = avail;
ps->ps_state = PL_STATE_PARTIAL_RECORD;
break;
case PL_STATE_EXPECTING_HEADER:
/*
* Transitions:
*
* Case C: avail+saved < headersize
* -> 'expecting header'
*
* Case D: avail+saved >= headersize
* D.1: avail+saved < recordsize
* -> 'partial record'
* D.2: avail+saved >= recordsize
* -> 'new record'
* (see PARTIAL_RECORD handling below)
*/
if (avail + ps->ps_svcount < HEADERSIZE) {
bcopy(src, dst, avail);
ps->ps_svcount += avail;
used = avail;
break;
}
used = copylen = HEADERSIZE - ps->ps_svcount;
bcopy(src, dst, copylen);
src += copylen;
dst += copylen;
avail -= copylen;
ps->ps_svcount += copylen;
/*FALLTHROUGH*/
case PL_STATE_PARTIAL_RECORD:
/*
* Transitions:
*
* Case E: avail+saved < recordsize
* -> 'partial record'
*
* Case F: avail+saved >= recordsize
* -> 'new record'
*/
h = PMCLOG_HEADER_FROM_SAVED_STATE(ps);
recordsize = PMCLOG_HEADER_TO_LENGTH(h);
if (recordsize <= 0)
goto error;
if (avail + ps->ps_svcount < recordsize) {
copylen = avail;
ps->ps_state = PL_STATE_PARTIAL_RECORD;
} else {
copylen = recordsize - ps->ps_svcount;
ps->ps_state = PL_STATE_NEW_RECORD;
}
bcopy(src, dst, copylen);
ps->ps_svcount += copylen;
used += copylen;
break;
default:
goto error;
}
done:
*data += used;
*len -= used;
return ps->ps_state;
error:
ps->ps_state = PL_STATE_ERROR;
return ps->ps_state;
}
/*
* Get an event from the stream pointed to by '*data'. '*len'
* indicates the number of bytes available to parse. Arguments
* '*data' and '*len' are updated to indicate the number of bytes
* consumed.
*/
static int
pmclog_get_event(void *cookie, char **data, ssize_t *len,
struct pmclog_ev *ev)
{
int evlen, pathlen;
uint32_t h, *le, npc;
enum pmclog_parser_state e;
struct pmclog_parse_state *ps;
ps = (struct pmclog_parse_state *) cookie;
assert(ps->ps_state != PL_STATE_ERROR);
if ((e = pmclog_get_record(ps,data,len)) == PL_STATE_ERROR) {
ev->pl_state = PMCLOG_ERROR;
return -1;
}
if (e != PL_STATE_NEW_RECORD) {
ev->pl_state = PMCLOG_REQUIRE_DATA;
return -1;
}
PMCLOG_INITIALIZE_READER(le, ps->ps_saved);
PMCLOG_READ32(le,h);
if (!PMCLOG_HEADER_CHECK_MAGIC(h)) {
ps->ps_state = PL_STATE_ERROR;
ev->pl_state = PMCLOG_ERROR;
return -1;
}
/* copy out the time stamp */
PMCLOG_READ32(le,ev->pl_ts.tv_sec);
PMCLOG_READ32(le,ev->pl_ts.tv_nsec);
evlen = PMCLOG_HEADER_TO_LENGTH(h);
#define PMCLOG_GET_PATHLEN(P,E,TYPE) do { \
(P) = (E) - offsetof(struct TYPE, pl_pathname); \
if ((P) > PATH_MAX || (P) < 0) \
goto error; \
} while (0)
#define PMCLOG_GET_CALLCHAIN_SIZE(SZ,E) do { \
(SZ) = ((E) - offsetof(struct pmclog_callchain, pl_pc)) \
/ sizeof(uintfptr_t); \
} while (0);
switch (ev->pl_type = PMCLOG_HEADER_TO_TYPE(h)) {
case PMCLOG_TYPE_CALLCHAIN:
PMCLOG_READ32(le,ev->pl_u.pl_cc.pl_pid);
PMCLOG_READ32(le,ev->pl_u.pl_cc.pl_pmcid);
PMCLOG_READ32(le,ev->pl_u.pl_cc.pl_cpuflags);
PMCLOG_GET_CALLCHAIN_SIZE(ev->pl_u.pl_cc.pl_npc,evlen);
for (npc = 0; npc < ev->pl_u.pl_cc.pl_npc; npc++)
PMCLOG_READADDR(le,ev->pl_u.pl_cc.pl_pc[npc]);
for (;npc < PMC_CALLCHAIN_DEPTH_MAX; npc++)
ev->pl_u.pl_cc.pl_pc[npc] = (uintfptr_t) 0;
break;
case PMCLOG_TYPE_CLOSELOG:
case PMCLOG_TYPE_DROPNOTIFY:
/* nothing to do */
break;
case PMCLOG_TYPE_INITIALIZE:
PMCLOG_READ32(le,ev->pl_u.pl_i.pl_version);
PMCLOG_READ32(le,ev->pl_u.pl_i.pl_arch);
ps->ps_version = ev->pl_u.pl_i.pl_version;
ps->ps_arch = ev->pl_u.pl_i.pl_arch;
ps->ps_initialized = 1;
break;
case PMCLOG_TYPE_MAP_IN:
PMCLOG_GET_PATHLEN(pathlen,evlen,pmclog_map_in);
PMCLOG_READ32(le,ev->pl_u.pl_mi.pl_pid);
PMCLOG_READADDR(le,ev->pl_u.pl_mi.pl_start);
PMCLOG_READSTRING(le, ev->pl_u.pl_mi.pl_pathname, pathlen);
break;
case PMCLOG_TYPE_MAP_OUT:
PMCLOG_READ32(le,ev->pl_u.pl_mo.pl_pid);
PMCLOG_READADDR(le,ev->pl_u.pl_mo.pl_start);
PMCLOG_READADDR(le,ev->pl_u.pl_mo.pl_end);
break;
case PMCLOG_TYPE_PCSAMPLE:
PMCLOG_READ32(le,ev->pl_u.pl_s.pl_pid);
PMCLOG_READADDR(le,ev->pl_u.pl_s.pl_pc);
PMCLOG_READ32(le,ev->pl_u.pl_s.pl_pmcid);
PMCLOG_READ32(le,ev->pl_u.pl_s.pl_usermode);
break;
case PMCLOG_TYPE_PMCALLOCATE:
PMCLOG_READ32(le,ev->pl_u.pl_a.pl_pmcid);
PMCLOG_READ32(le,ev->pl_u.pl_a.pl_event);
PMCLOG_READ32(le,ev->pl_u.pl_a.pl_flags);
if ((ev->pl_u.pl_a.pl_evname =
_pmc_name_of_event(ev->pl_u.pl_a.pl_event, ps->ps_arch))
== NULL)
goto error;
break;
case PMCLOG_TYPE_PMCALLOCATEDYN:
PMCLOG_READ32(le,ev->pl_u.pl_ad.pl_pmcid);
PMCLOG_READ32(le,ev->pl_u.pl_ad.pl_event);
PMCLOG_READ32(le,ev->pl_u.pl_ad.pl_flags);
PMCLOG_READSTRING(le,ev->pl_u.pl_ad.pl_evname,PMC_NAME_MAX);
break;
case PMCLOG_TYPE_PMCATTACH:
PMCLOG_GET_PATHLEN(pathlen,evlen,pmclog_pmcattach);
PMCLOG_READ32(le,ev->pl_u.pl_t.pl_pmcid);
PMCLOG_READ32(le,ev->pl_u.pl_t.pl_pid);
PMCLOG_READSTRING(le,ev->pl_u.pl_t.pl_pathname,pathlen);
break;
case PMCLOG_TYPE_PMCDETACH:
PMCLOG_READ32(le,ev->pl_u.pl_d.pl_pmcid);
PMCLOG_READ32(le,ev->pl_u.pl_d.pl_pid);
break;
case PMCLOG_TYPE_PROCCSW:
PMCLOG_READ32(le,ev->pl_u.pl_c.pl_pmcid);
PMCLOG_READ64(le,ev->pl_u.pl_c.pl_value);
PMCLOG_READ32(le,ev->pl_u.pl_c.pl_pid);
break;
case PMCLOG_TYPE_PROCEXEC:
PMCLOG_GET_PATHLEN(pathlen,evlen,pmclog_procexec);
PMCLOG_READ32(le,ev->pl_u.pl_x.pl_pid);
PMCLOG_READADDR(le,ev->pl_u.pl_x.pl_entryaddr);
PMCLOG_READ32(le,ev->pl_u.pl_x.pl_pmcid);
PMCLOG_READSTRING(le,ev->pl_u.pl_x.pl_pathname,pathlen);
break;
case PMCLOG_TYPE_PROCEXIT:
PMCLOG_READ32(le,ev->pl_u.pl_e.pl_pmcid);
PMCLOG_READ64(le,ev->pl_u.pl_e.pl_value);
PMCLOG_READ32(le,ev->pl_u.pl_e.pl_pid);
break;
case PMCLOG_TYPE_PROCFORK:
PMCLOG_READ32(le,ev->pl_u.pl_f.pl_oldpid);
PMCLOG_READ32(le,ev->pl_u.pl_f.pl_newpid);
break;
case PMCLOG_TYPE_SYSEXIT:
PMCLOG_READ32(le,ev->pl_u.pl_se.pl_pid);
break;
case PMCLOG_TYPE_USERDATA:
PMCLOG_READ32(le,ev->pl_u.pl_u.pl_userdata);
break;
default: /* unknown record type */
ps->ps_state = PL_STATE_ERROR;
ev->pl_state = PMCLOG_ERROR;
return (-1);
}
ev->pl_offset = (ps->ps_offset += evlen);
ev->pl_count = (ps->ps_count += 1);
ev->pl_state = PMCLOG_OK;
return 0;
error:
ev->pl_state = PMCLOG_ERROR;
ps->ps_state = PL_STATE_ERROR;
return -1;
}
/*
* Extract and return the next event from the byte stream.
*
* Returns 0 and sets the event's state to PMCLOG_OK in case an event
* was successfully parsed. Otherwise this function returns -1 and
* sets the event's state to one of PMCLOG_REQUIRE_DATA (if more data
* is needed) or PMCLOG_EOF (if an EOF was seen) or PMCLOG_ERROR if
* a parse error was encountered.
*/
int
pmclog_read(void *cookie, struct pmclog_ev *ev)
{
int retval;
ssize_t nread;
struct pmclog_parse_state *ps;
ps = (struct pmclog_parse_state *) cookie;
if (ps->ps_state == PL_STATE_ERROR) {
ev->pl_state = PMCLOG_ERROR;
return -1;
}
/*
* If there isn't enough data left for a new event try and get
* more data.
*/
if (ps->ps_len == 0) {
ev->pl_state = PMCLOG_REQUIRE_DATA;
/*
* If we have a valid file descriptor to read from, attempt
* to read from that. This read may return with an error,
* (which may be EAGAIN or other recoverable error), or
* can return EOF.
*/
if (ps->ps_fd != PMCLOG_FD_NONE) {
refill:
nread = read(ps->ps_fd, ps->ps_buffer,
PMCLOG_BUFFER_SIZE);
if (nread <= 0) {
if (nread == 0)
ev->pl_state = PMCLOG_EOF;
else if (errno != EAGAIN) /* not restartable */
ev->pl_state = PMCLOG_ERROR;
return -1;
}
ps->ps_len = nread;
ps->ps_data = ps->ps_buffer;
} else
return -1;
}
assert(ps->ps_len > 0);
/* Retrieve one event from the byte stream. */
retval = pmclog_get_event(ps, &ps->ps_data, &ps->ps_len, ev);
/*
* If we need more data and we have a configured fd, try read
* from it.
*/
if (retval < 0 && ev->pl_state == PMCLOG_REQUIRE_DATA &&
ps->ps_fd != -1) {
assert(ps->ps_len == 0);
goto refill;
}
return retval;
}
/*
* Feed data to a memory based parser.
*
* The memory area pointed to by 'data' needs to be valid till the
* next error return from pmclog_next_event().
*/
int
pmclog_feed(void *cookie, char *data, int len)
{
struct pmclog_parse_state *ps;
ps = (struct pmclog_parse_state *) cookie;
if (len < 0 || /* invalid length */
ps->ps_buffer || /* called for a file parser */
ps->ps_len != 0) /* unnecessary call */
return -1;
ps->ps_data = data;
ps->ps_len = len;
return 0;
}
/*
* Allocate and initialize parser state.
*/
void *
pmclog_open(int fd)
{
struct pmclog_parse_state *ps;
if ((ps = (struct pmclog_parse_state *) malloc(sizeof(*ps))) == NULL)
return NULL;
ps->ps_state = PL_STATE_NEW_RECORD;
ps->ps_arch = -1;
ps->ps_initialized = 0;
ps->ps_count = 0;
ps->ps_offset = (off_t) 0;
bzero(&ps->ps_saved, sizeof(ps->ps_saved));
ps->ps_svcount = 0;
ps->ps_fd = fd;
ps->ps_data = NULL;
ps->ps_buffer = NULL;
ps->ps_len = 0;
/* allocate space for a work area */
if (ps->ps_fd != PMCLOG_FD_NONE) {
if ((ps->ps_buffer = malloc(PMCLOG_BUFFER_SIZE)) == NULL) {
free(ps);
return NULL;
}
}
return ps;
}
/*
* Free up parser state.
*/
void
pmclog_close(void *cookie)
{
struct pmclog_parse_state *ps;
ps = (struct pmclog_parse_state *) cookie;
if (ps->ps_buffer)
free(ps->ps_buffer);
free(ps);
}