Update vendor-sys/illumos to illumos-gate 13758:23432da34147 (dtrace)

References:
  https://www.illumos.org/issues/3021
  https://www.illumos.org/issues/3022
  https://www.illumos.org/issues/3023
  https://www.illumos.org/issues/3024
  https://www.illumos.org/issues/3025
  https://www.illumos.org/issues/3026

Obtained from:	ssh://anonhg@hg.illumos.org/illumos-gate
This commit is contained in:
Martin Matuska 2012-08-19 09:21:20 +00:00
parent 41adcc32e9
commit 9de3bb3344
3 changed files with 189 additions and 130 deletions

View File

@ -22,6 +22,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, Joyent, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
/*
@ -2401,9 +2402,10 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
{
dtrace_speculation_t *spec;
dtrace_buffer_t *src, *dest;
uintptr_t daddr, saddr, dlimit;
uintptr_t daddr, saddr, dlimit, slimit;
dtrace_speculation_state_t current, new;
intptr_t offs;
uint64_t timestamp;
if (which == 0)
return;
@ -2479,7 +2481,37 @@ dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
}
/*
* We have the space; copy the buffer across. (Note that this is a
* We have sufficient space to copy the speculative buffer into the
* primary buffer. First, modify the speculative buffer, filling
* in the timestamp of all entries with the current time. The data
* must have the commit() time rather than the time it was traced,
* so that all entries in the primary buffer are in timestamp order.
*/
timestamp = dtrace_gethrtime();
saddr = (uintptr_t)src->dtb_tomax;
slimit = saddr + src->dtb_offset;
while (saddr < slimit) {
size_t size;
dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
saddr += sizeof (dtrace_epid_t);
continue;
}
ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
ASSERT3U(saddr + size, <=, slimit);
ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
saddr += size;
}
/*
* Copy the buffer across. (Note that this is a
* highly suboptimal bcopy(); in the unlikely event that this becomes
* a serious performance issue, a high-performance DTrace-specific
* bcopy() should obviously be invented.)
@ -5951,7 +5983,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
if (now - state->dts_alive > dtrace_deadman_timeout) {
/*
* We seem to be dead. Unless we (a) have kernel
* destructive permissions (b) have expicitly enabled
* destructive permissions (b) have explicitly enabled
* destructive actions and (c) destructive actions have
* not been disabled, we're going to transition into
* the KILLED state, from which no further processing
@ -5979,8 +6011,18 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
tomax = buf->dtb_tomax;
ASSERT(tomax != NULL);
if (ecb->dte_size != 0)
DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
if (ecb->dte_size != 0) {
dtrace_rechdr_t dtrh;
if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
mstate.dtms_timestamp = dtrace_gethrtime();
mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
}
ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
dtrh.dtrh_epid = ecb->dte_epid;
DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
mstate.dtms_timestamp);
*((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
}
mstate.dtms_epid = ecb->dte_epid;
mstate.dtms_present |= DTRACE_MSTATE_EPID;
@ -6144,7 +6186,9 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
continue;
switch (act->dta_kind) {
case DTRACEACT_SPECULATE:
case DTRACEACT_SPECULATE: {
dtrace_rechdr_t *dtrh;
ASSERT(buf == &state->dts_buffer[cpuid]);
buf = dtrace_speculation_buffer(state,
cpuid, val);
@ -6166,10 +6210,23 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
tomax = buf->dtb_tomax;
ASSERT(tomax != NULL);
if (ecb->dte_size != 0)
DTRACE_STORE(uint32_t, tomax, offs,
ecb->dte_epid);
if (ecb->dte_size == 0)
continue;
ASSERT3U(ecb->dte_size, >=,
sizeof (dtrace_rechdr_t));
dtrh = ((void *)(tomax + offs));
dtrh->dtrh_epid = ecb->dte_epid;
/*
* When the speculation is committed, all of
* the records in the speculative buffer will
* have their timestamps set to the commit
* time. Until then, it is set to a sentinel
* value, for debuggability.
*/
DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
continue;
}
case DTRACEACT_CHILL:
if (dtrace_priv_kernel_destructive(state))
@ -9369,9 +9426,9 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
/*
* The default size is the size of the default action: recording
* the epid.
* the header.
*/
ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
ecb->dte_alignment = sizeof (dtrace_epid_t);
epid = state->dts_epid++;
@ -9470,122 +9527,89 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb)
static void
dtrace_ecb_resize(dtrace_ecb_t *ecb)
{
uint32_t maxalign = sizeof (dtrace_epid_t);
uint32_t align = sizeof (uint8_t), offs, diff;
dtrace_action_t *act;
int wastuple = 0;
uint32_t curneeded = UINT32_MAX;
uint32_t aggbase = UINT32_MAX;
dtrace_state_t *state = ecb->dte_state;
/*
* If we record anything, we always record the epid. (And we always
* record it first.)
* If we record anything, we always record the dtrace_rechdr_t. (And
* we always record it first.)
*/
offs = sizeof (dtrace_epid_t);
ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
ecb->dte_size = sizeof (dtrace_rechdr_t);
ecb->dte_alignment = sizeof (dtrace_epid_t);
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
dtrace_recdesc_t *rec = &act->dta_rec;
ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
if ((align = rec->dtrd_alignment) > maxalign)
maxalign = align;
if (!wastuple && act->dta_intuple) {
/*
* This is the first record in a tuple. Align the
* offset to be at offset 4 in an 8-byte aligned
* block.
*/
diff = offs + sizeof (dtrace_aggid_t);
if (diff = (diff & (sizeof (uint64_t) - 1)))
offs += sizeof (uint64_t) - diff;
aggbase = offs - sizeof (dtrace_aggid_t);
ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
}
/*LINTED*/
if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
/*
* The current offset is not properly aligned; align it.
*/
offs += align - diff;
}
rec->dtrd_offset = offs;
if (offs + rec->dtrd_size > ecb->dte_needed) {
ecb->dte_needed = offs + rec->dtrd_size;
if (ecb->dte_needed > state->dts_needed)
state->dts_needed = ecb->dte_needed;
}
ecb->dte_alignment = MAX(ecb->dte_alignment,
rec->dtrd_alignment);
if (DTRACEACT_ISAGG(act->dta_kind)) {
dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
dtrace_action_t *first = agg->dtag_first, *prev;
ASSERT(rec->dtrd_size != 0 && first != NULL);
ASSERT(wastuple);
ASSERT(rec->dtrd_size != 0);
ASSERT(agg->dtag_first != NULL);
ASSERT(act->dta_prev->dta_intuple);
ASSERT(aggbase != UINT32_MAX);
ASSERT(curneeded != UINT32_MAX);
agg->dtag_base = aggbase;
while ((prev = first->dta_prev) != NULL &&
DTRACEACT_ISAGG(prev->dta_kind)) {
agg = (dtrace_aggregation_t *)prev;
first = agg->dtag_first;
}
curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
rec->dtrd_offset = curneeded;
curneeded += rec->dtrd_size;
ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
if (prev != NULL) {
offs = prev->dta_rec.dtrd_offset +
prev->dta_rec.dtrd_size;
} else {
offs = sizeof (dtrace_epid_t);
aggbase = UINT32_MAX;
curneeded = UINT32_MAX;
} else if (act->dta_intuple) {
if (curneeded == UINT32_MAX) {
/*
* This is the first record in a tuple. Align
* curneeded to be at offset 4 in an 8-byte
* aligned block.
*/
ASSERT(act->dta_prev == NULL ||
!act->dta_prev->dta_intuple);
ASSERT3U(aggbase, ==, UINT32_MAX);
curneeded = P2PHASEUP(ecb->dte_size,
sizeof (uint64_t), sizeof (dtrace_aggid_t));
aggbase = curneeded - sizeof (dtrace_aggid_t);
ASSERT(IS_P2ALIGNED(aggbase,
sizeof (uint64_t)));
}
wastuple = 0;
curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
rec->dtrd_offset = curneeded;
curneeded += rec->dtrd_size;
} else {
if (!act->dta_intuple)
ecb->dte_size = offs + rec->dtrd_size;
/* tuples must be followed by an aggregation */
ASSERT(act->dta_prev == NULL ||
!act->dta_prev->dta_intuple);
offs += rec->dtrd_size;
ecb->dte_size = P2ROUNDUP(ecb->dte_size,
rec->dtrd_alignment);
rec->dtrd_offset = ecb->dte_size;
ecb->dte_size += rec->dtrd_size;
ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
}
wastuple = act->dta_intuple;
}
if ((act = ecb->dte_action) != NULL &&
!(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
ecb->dte_size == sizeof (dtrace_epid_t)) {
ecb->dte_size == sizeof (dtrace_rechdr_t)) {
/*
* If the size is still sizeof (dtrace_epid_t), then all
* If the size is still sizeof (dtrace_rechdr_t), then all
* actions store no data; set the size to 0.
*/
ecb->dte_alignment = maxalign;
ecb->dte_size = 0;
/*
* If the needed space is still sizeof (dtrace_epid_t), then
* all actions need no additional space; set the needed
* size to 0.
*/
if (ecb->dte_needed == sizeof (dtrace_epid_t))
ecb->dte_needed = 0;
return;
}
/*
* Set our alignment, and make sure that the dte_size and dte_needed
* are aligned to the size of an EPID.
*/
ecb->dte_alignment = maxalign;
ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
~(sizeof (dtrace_epid_t) - 1);
ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
~(sizeof (dtrace_epid_t) - 1);
ASSERT(ecb->dte_size <= ecb->dte_needed);
ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
ecb->dte_needed);
}
static dtrace_action_t *
@ -9955,7 +9979,7 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
break;
case DTRACEACT_SPECULATE:
if (ecb->dte_size > sizeof (dtrace_epid_t))
if (ecb->dte_size > sizeof (dtrace_rechdr_t))
return (EINVAL);
if (dp == NULL)
@ -10068,7 +10092,7 @@ dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
ecb->dte_action = NULL;
ecb->dte_action_last = NULL;
ecb->dte_size = sizeof (dtrace_epid_t);
ecb->dte_size = 0;
}
static void
@ -10339,12 +10363,13 @@ dtrace_buffer_switch(dtrace_buffer_t *buf)
caddr_t tomax = buf->dtb_tomax;
caddr_t xamot = buf->dtb_xamot;
dtrace_icookie_t cookie;
hrtime_t now = dtrace_gethrtime();
hrtime_t now;
ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
cookie = dtrace_interrupt_disable();
now = dtrace_gethrtime();
buf->dtb_tomax = xamot;
buf->dtb_xamot = tomax;
buf->dtb_xamot_drops = buf->dtb_drops;
@ -10639,7 +10664,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
if (epid == DTRACE_EPIDNONE) {
size = sizeof (uint32_t);
} else {
ASSERT(epid <= state->dts_necbs);
ASSERT3U(epid, <=, state->dts_necbs);
ASSERT(state->dts_ecbs[epid - 1] != NULL);
size = state->dts_ecbs[epid - 1]->dte_size;
@ -15623,6 +15648,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
desc.dtbd_drops = buf->dtb_drops;
desc.dtbd_errors = buf->dtb_errors;
desc.dtbd_oldest = buf->dtb_xamot_offset;
desc.dtbd_timestamp = dtrace_gethrtime();
mutex_exit(&dtrace_lock);
@ -15675,6 +15701,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
desc.dtbd_drops = buf->dtb_xamot_drops;
desc.dtbd_errors = buf->dtb_xamot_errors;
desc.dtbd_oldest = 0;
desc.dtbd_timestamp = buf->dtb_switched;
mutex_exit(&dtrace_lock);

View File

@ -26,6 +26,7 @@
/*
* Copyright (c) 2011, Joyent, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
#ifndef _SYS_DTRACE_H
@ -919,10 +920,10 @@ typedef struct dtrace_ecbdesc {
* DTrace Metadata Description Structures
*
* DTrace separates the trace data stream from the metadata stream. The only
* metadata tokens placed in the data stream are enabled probe identifiers
* (EPIDs) or (in the case of aggregations) aggregation identifiers. In order
* to determine the structure of the data, DTrace consumers pass the token to
* the kernel, and receive in return a corresponding description of the enabled
* metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID +
* timestamp) or (in the case of aggregations) aggregation identifiers. To
* determine the structure of the data, DTrace consumers pass the token to the
* kernel, and receive in return a corresponding description of the enabled
* probe (via the dtrace_eprobedesc structure) or the aggregation (via the
* dtrace_aggdesc structure). Both of these structures are expressed in terms
* of record descriptions (via the dtrace_recdesc structure) that describe the
@ -1017,7 +1018,8 @@ typedef struct dtrace_fmtdesc {
#define DTRACEOPT_AGGSORTREV 24 /* reverse-sort aggregations */
#define DTRACEOPT_AGGSORTPOS 25 /* agg. position to sort on */
#define DTRACEOPT_AGGSORTKEYPOS 26 /* agg. key position to sort on */
#define DTRACEOPT_MAX 27 /* number of options */
#define DTRACEOPT_TEMPORAL 27 /* temporally ordered output */
#define DTRACEOPT_MAX 28 /* number of options */
#define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */
@ -1037,7 +1039,9 @@ typedef struct dtrace_fmtdesc {
* where user-level wishes the kernel to snapshot the buffer to (the
* dtbd_data field). The kernel uses the same structure to pass back some
* information regarding the buffer: the size of data actually copied out, the
* number of drops, the number of errors, and the offset of the oldest record.
* number of drops, the number of errors, the offset of the oldest record,
* and the time of the snapshot.
*
* If the buffer policy is a "switch" policy, taking a snapshot of the
* principal buffer has the additional effect of switching the active and
* inactive buffers. Taking a snapshot of the aggregation buffer _always_ has
@ -1050,8 +1054,29 @@ typedef struct dtrace_bufdesc {
uint64_t dtbd_drops; /* number of drops */
DTRACE_PTR(char, dtbd_data); /* data */
uint64_t dtbd_oldest; /* offset of oldest record */
uint64_t dtbd_timestamp; /* hrtime of snapshot */
} dtrace_bufdesc_t;
/*
* Each record in the buffer (dtbd_data) begins with a header that includes
* the epid and a timestamp. The timestamp is split into two 4-byte parts
* so that we do not require 8-byte alignment. Use
* DTRACE_RECORD_LOAD_TIMESTAMP() and DTRACE_RECORD_STORE_TIMESTAMP() to
* read and write the two halves as a single 64-bit value.
*
* Records traced into a speculative buffer carry UINT64_MAX as a
* placeholder timestamp; it is overwritten with the commit time when the
* speculation is committed.
*/
typedef struct dtrace_rechdr {
dtrace_epid_t dtrh_epid; /* enabled probe id */
uint32_t dtrh_timestamp_hi; /* high bits of hrtime_t */
uint32_t dtrh_timestamp_lo; /* low bits of hrtime_t */
} dtrace_rechdr_t;
/*
 * Reassemble the 64-bit timestamp stored in the record header that dtrh
 * points to: the high word supplies bits 63..32 and the low word supplies
 * bits 31..0.  The result has type uint64_t.  Note that dtrh is expanded
 * twice, so it should not have side effects.
 */
#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) \
	(((uint64_t)(dtrh)->dtrh_timestamp_hi << 32) | \
	(uint64_t)(dtrh)->dtrh_timestamp_lo)
/*
 * Split a 64-bit timestamp across the hi/lo fields of a record header.
 *
 *	dtrh	pointer to the dtrace_rechdr_t to update
 *	hrtime	timestamp to record; converted to uint64_t before splitting
 *
 * Each argument is evaluated exactly once, so compound expressions such as
 * "base + delta" and arguments with side effects are stored correctly.  The
 * do/while (0) wrapper makes the expansion a single statement, so the macro
 * can be used as the unbraced body of an if/else.
 */
#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) do { \
	dtrace_rechdr_t *dtrh_store_hdr_ = (dtrh); \
	uint64_t dtrh_store_ts_ = (uint64_t)(hrtime); \
	dtrh_store_hdr_->dtrh_timestamp_lo = (uint32_t)dtrh_store_ts_; \
	dtrh_store_hdr_->dtrh_timestamp_hi = \
	    (uint32_t)(dtrh_store_ts_ >> 32); \
} while (0)
/*
* DTrace Status
*

View File

@ -26,6 +26,7 @@
/*
* Copyright (c) 2011, Joyent, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
#ifndef _SYS_DTRACE_IMPL_H
@ -199,15 +200,18 @@ typedef struct dtrace_hash {
* predicate is non-NULL, the DIF object is executed. If the result is
* non-zero, the action list is processed, with each action being executed
* accordingly. When the action list has been completely executed, processing
* advances to the next ECB. processing advances to the next ECB. If the
* result is non-zero; For each ECB, it first determines the The ECB
* abstraction allows disjoint consumers to multiplex on single probes.
* advances to the next ECB. The ECB abstraction allows disjoint consumers
* to multiplex on single probes.
*
* Execution of the ECB results in consuming dte_size bytes in the buffer
* to record data. During execution, dte_needed bytes must be available in
* the buffer. This space is used for both recorded data and tuple data.
*/
struct dtrace_ecb {
dtrace_epid_t dte_epid; /* enabled probe ID */
uint32_t dte_alignment; /* required alignment */
size_t dte_needed; /* bytes needed */
size_t dte_size; /* total size of payload */
size_t dte_needed; /* space needed for execution */
size_t dte_size; /* size of recorded payload */
dtrace_predicate_t *dte_predicate; /* predicate, if any */
dtrace_action_t *dte_action; /* actions, if any */
dtrace_ecb_t *dte_next; /* next ECB on probe */
@ -265,27 +269,30 @@ typedef struct dtrace_aggregation {
* the EPID, the consumer can determine the data layout. (The data buffer
* layout is shown schematically below.) By assuring that one can determine
* data layout from the EPID, the metadata stream can be separated from the
* data stream -- simplifying the data stream enormously.
* data stream -- simplifying the data stream enormously. The ECB always
* precedes the recorded data as part of the dtrace_rechdr_t structure that
* includes the EPID and a high-resolution timestamp used for output ordering
* consistency.
*
* base of data buffer ---> +------+--------------------+------+
* | EPID | data | EPID |
* +------+--------+------+----+------+
* | data | EPID | data |
* +---------------+------+-----------+
* | data, cont. |
* +------+--------------------+------+
* | EPID | data | |
* +------+--------------------+ |
* | || |
* | || |
* | \/ |
* : :
* . .
* . .
* . .
* : :
* | |
* limit of data buffer ---> +----------------------------------+
* base of data buffer ---> +--------+--------------------+--------+
* | rechdr | data | rechdr |
* +--------+------+--------+----+--------+
* | data | rechdr | data |
* +---------------+--------+-------------+
* | data, cont. |
* +--------+--------------------+--------+
* | rechdr | data | |
* +--------+--------------------+ |
* | || |
* | || |
* | \/ |
* : :
* . .
* . .
* . .
* : :
* | |
* limit of data buffer ---> +--------------------------------------+
*
* When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the
* principal buffer (both scratch and payload) exceed the available space. If