From 728dc71ed5c827388e77176ea035202e9ecfee5f Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Thu, 9 Oct 2014 23:11:36 +0000 Subject: [PATCH 1/2] 5202 want ctf(4) Reviewed by: Keith M Wesolowski Reviewed by: Jerry Jelinek Reviewed by: Garrett D'Amore Approved by: Dan McDonald Author: Robert Mustacchi illumos/illumos-gate@fe2e029eea29fd49d0d9058dbd5b79a252667e6b --- man/man4/ctf.4 | 1140 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1140 insertions(+) create mode 100644 man/man4/ctf.4 diff --git a/man/man4/ctf.4 b/man/man4/ctf.4 new file mode 100644 index 000000000000..9e807abc4c09 --- /dev/null +++ b/man/man4/ctf.4 @@ -0,0 +1,1140 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright (c) 2014 Joyent, Inc. +.\" +.Dd Sep 26, 2014 +.Dt CTF 4 +.Os +.Sh NAME +.Nm ctf +.Nd Compact C Type Format +.Sh SYNOPSIS +.In sys/ctf.h +.Sh DESCRIPTION +.Nm +is designed to be a compact representation of the C programming +language's type information focused on serving the needs of dynamic +tracing, debuggers, and other in-situ and post-mortem introspection +tools. +.Nm +data is generally included in +.Sy ELF +objects and is tagged as +.Sy SHT_PROGBITS +to ensure that the data is accessible in a running process and in subsequent +core dumps, if generated. +.Lp +The +.Nm +data contained in each file has information about the layout and +sizes of C types, including intrinsic types, enumerations, structures, +typedefs, and unions, that are used by the corresponding +.Sy ELF +object. The +.Nm +data may also include information about the types of global objects and +the return type and arguments of functions in the symbol table. +.Lp +Because a +.Nm +file is often embedded inside a file, rather than being a standalone +file itself, it may also be referred to as a +.Nm +.Sy container . +.Lp +On illumos systems, +.Nm +data is consumed by multiple programs. It can be used by the modular +debugger, +.Xr mdb 1 , +as well as by +.Xr dtrace 1M . +Programmatic access to +.Nm +data can be obtained through +.Xr libctf 3LIB . +.Lp +The +.Nm +file format is broken down into seven different sections. The first +section is the +.Sy preamble +and +.Sy header , +which describes the version of the +.Nm +file, links it has to other +.Nm +files, and the sizes of the other sections. The next section is the +.Sy label +section, +which provides a way of identifying similar groups of +.Nm +data across multiple files. This is followed by the +.Sy object +information section, which describes the type of global +symbols. The subsequent section is the +.Sy function +information section, which describes the return +types and arguments of functions. The next section is the +.Sy type +information section, which describes +the format and layout of the C types themselves, and finally the last +section is the +.Sy string +section, which contains the names of types, enumerations, members, and +labels. +.Lp +While strictly speaking, only the +.Sy preamble +and +.Sy header +are required, to be actually useful, both the type and string +sections are necessary. +.Lp +A +.Nm +file may contain all of the type information that it requires, or it +may optionally refer to another +.Nm +file which holds the remaining types. When a +.Nm +file refers to another file, it is called the +.Sy child +and the file it refers to is called the +.Sy parent . +A given file may only refer to one parent. This process is called +.Em uniquification +because it ensures each child only has type information that is +unique to it. A common example of this is that most kernel modules in +illumos are uniquified against the kernel module +.Sy genunix +and the type information that comes from the +.Sy IP +module. This means that a module only has types that are unique to +itself and the most common types in the kernel are not duplicated. +.Sh FILE FORMAT +This documents version +.Em two +of the +.Nm +file format. All applications and tools currently produce and operate on +this version. +.Lp +The file format can be summarized with the following image, the +following sections will cover this in more detail. +.Bd -literal + + +-------------+ 0t0 ++--------| Preamble | +| +-------------+ 0t4 +|+-------| Header | +|| +-------------+ 0t36 + cth_lbloff +||+------| Labels | +||| +-------------+ 0t36 + cth_objtoff +|||+-----| Objects | +|||| +-------------+ 0t36 + cth_funcoff +||||+----| Functions | +||||| +-------------+ 0t36 + cth_typeoff +|||||+---| Types | +|||||| +-------------+ 0t36 + cth_stroff +||||||+--| Strings | +||||||| +-------------+ 0t36 + cth_stroff + cth_strlen +||||||| +||||||| +||||||| +||||||| +-- magic - vers flags +||||||| | | | | +||||||| +------+------+------+------+ ++---------| 0xcf | 0xf1 | 0x02 | 0x00 | + |||||| +------+------+------+------+ + |||||| 0 1 2 3 4 + |||||| + |||||| + parent label + objects + |||||| | + parent name | + functions + strings + |||||| | | + label | | + types | + strlen + |||||| | | | | | | | | + |||||| +------+------+------+------+------+-------+-------+-------+ + +--------| 0x00 | 0x00 | 0x00 | 0x08 | 0x36 | 0x110 | 0x5f4 | 0x611 | + ||||| +------+------+------+------+------+-------+-------+-------+ + ||||| 0x04 0x08 0x0c 0x10 0x14 0x18 0x1c 0x20 0x24 + ||||| + ||||| + Label name + ||||| | + Label type + ||||| | | + Next label + ||||| | | | + ||||| +-------+------+-----+ + +-----------| 0x01 | 0x42 | ... | + |||| +-------+------+-----+ + |||| cth_lbloff +0x4 +0x8 cth_objtoff + |||| + |||| + |||| Symidx 0t15 0t43 0t44 + |||| +------+------+------+-----+ + +----------| 0x00 | 0x42 | 0x36 | ... | + ||| +------+------+------+-----+ + ||| cth_objtoff +0x2 +0x4 +0x6 cth_funcoff + ||| + ||| + CTF_TYPE_INFO + CTF_TYPE_INFO + ||| | + Return type | + ||| | | + arg0 | + ||| +--------+------+------+-----+ + +---------| 0x2c10 | 0x08 | 0x0c | ... | + || +--------+------+------+-----+ + || cth_funcff +0x2 +0x4 +0x6 cth_typeoff + || + || + ctf_stype_t for type 1 + || | integer + integer encoding + || | | + ctf_stype_t for type 2 + || | | | + || +--------------------+-----------+-----+ + +--------| 0x19 * 0xc01 * 0x0 | 0x1000000 | ... | + | +--------------------+-----------+-----+ + | cth_typeoff +0x08 +0x0c cth_stroff + | + | +--- str 0 + | | +--- str 1 + str 2 + | | | | + | v v v + | +----+---+---+---+----+---+---+---+---+---+----+ + +---| \\0 | i | n | t | \\0 | f | o | o | _ | t | \\0 | + +----+---+---+---+----+---+---+---+---+---+----+ + 0 1 2 3 4 5 6 7 8 9 10 11 +.Ed +.Lp +Every +.Nm +file begins with a +.Sy preamble , +followed by a +.Sy header . +The +.Sy preamble +is defined as follows: +.Bd -literal +typedef struct ctf_preamble { + ushort_t ctp_magic; /* magic number (CTF_MAGIC) */ + uchar_t ctp_version; /* data format version number (CTF_VERSION) */ + uchar_t ctp_flags; /* flags (see below) */ +} ctf_preamble_t; +.Ed +.Pp +The +.Sy preamble +is four bytes long and must be four byte aligned. +This +.Sy preamble +defines the version of the +.Nm +file which defines the format of the rest of the header. While the +header may change in subsequent versions, the preamble will not change +across versions, though the interpretation of its flags may change from +version to version. The +.Em ctp_magic +member defines the magic number for the +.Nm +file format. This must always be +.Li 0xcff1 . +If another value is encountered, then the file should not be treated as +a +.Nm +file. The +.Em ctp_version +member defines the version of the +.Nm +file. The current version is +.Li 2 . +It is possible to encounter an unsupported version. In that case, +software should not try to parse the format, as it may have changed. +Finally, the +.Em ctp_flags +member describes aspects of the file which modify its interpretation. +The following flags are currently defined: +.Bd -literal +#define CTF_F_COMPRESS 0x01 +.Ed +.Pp +The flag +.Sy CTF_F_COMPRESS +indicates that the body of the +.Nm +file, all the data following the +.Sy header , +has been compressed through the +.Sy zlib +library and its +.Sy deflate +algorithm. If this flag is not present, then the body has not been +compressed and no special action is needed to interpret it. All offsets +into the data as described by +.Sy header , +always refer to the +.Sy uncompressed +data. +.Lp +In version two of the +.Nm +file format, the +.Sy header +denotes whether whether or not this +.Nm +file is the child of another +.Nm +file and also indicates the size of the remaining sections. The +structure for the +.Sy header , +logically contains a copy of the +.Sy preamble +and the two have a combined size of 36 bytes. +.Bd -literal +typedef struct ctf_header { + ctf_preamble_t cth_preamble; + uint_t cth_parlabel; /* ref to name of parent lbl uniq'd against */ + uint_t cth_parname; /* ref to basename of parent */ + uint_t cth_lbloff; /* offset of label section */ + uint_t cth_objtoff; /* offset of object section */ + uint_t cth_funcoff; /* offset of function section */ + uint_t cth_typeoff; /* offset of type section */ + uint_t cth_stroff; /* offset of string section */ + uint_t cth_strlen; /* length of string section in bytes */ +} ctf_header_t; +.Ed +.Pp +After the +.Sy preamble , +the next two members +.Em cth_parlablel +and +.Em cth_parname , +are used to identify the parent. The value of both members are offsets +into the +.Sy string +section which point to the start of a null-terminated string. For more +information on the encoding of strings, see the subsection on +.Sx String Identifiers . +If the value of either is zero, then there is no entry for that +member. If the member +.Em cth_parlabel +is set, then the +.Em ctf_parname +member must be set, otherwise it will not be possible to find the +parent. If +.Em ctf_parname +is set, it is not necessary to define +.Em cth_parlabel , +as the parent may not have a label. For more information on labels +and their interpretation, see +.Sx The Label Section . +.Lp +The remaining members (excepting +.Em cth_strlen ) +describe the beginning of the corresponding sections. These offsets are +relative to the end of the +.Sy header . +Therefore, something with an offset of 0 is at an offset of thirty-six +bytes relative to the start of the +.Nm +file. The difference between members +indicates the size of the section itself. Different offsets have +different alignment requirements. The start of the +.Em cth_objotoff +and +.Em cth_funcoff +must be two byte aligned, while the sections +.Em cth_lbloff +and +.Em cth_typeoff +must be four-byte aligned. The section +.Em cth_stroff +has no alignment requirements. To calculate the size of a given section, +excepting the +.Sy string +section, one should subtract the offset of the section from the following one. For +example, the size of the +.Sy types +section can be calculated by subtracting +.Em cth_stroff +from +.Em cth_typeoff . +.Lp +Finally, the member +.Em cth_strlen +describes the length of the string section itself. From it, you can also +calculate the size of the entire +.Nm +file by adding together the size of the +.Sy ctf_header_t , +the offset of the string section in +.Em cth_stroff , +and the size of the string section in +.Em cth_srlen . +.Ss Type Identifiers +Through the +.Nm ctf +data, types are referred to by identifiers. A given +.Nm +file supports up to 32767 (0x7fff) types. The first valid type identifier is 0x1. +When a given +.Nm +file is a child, indicated by a non-zero entry for the +.Sy header Ns 's +.Em cth_parname , +then the first valid type identifier is 0x8000 and the last is 0xffff. +In this case, type identifiers 0x1 through 0x7fff are references to the +parent. +.Lp +The type identifier zero is a sentinel value used to indicate that there +is no type information available or it is an unknown type. +.Lp +Throughout the file format, the identifier is stored in different sized +values; however, the minimum size to represent a given identifier is a +.Sy uint16_t . +Other consumers of +.Nm +information may use larger or opaque identifiers. +.Ss String Identifiers +String identifiers are always encoded as four byte unsigned integers +which are an offset into a string table. The +.Nm +format supports two different string tables which have an identifier of +zero or one. This identifier is stored in the high-order bit of the +unsigned four byte offset. Therefore, the maximum supported offset into +one of these tables is 0x7ffffffff. +.Lp +Table identifier zero, always refers to the +.Sy string +section in the CTF file itself. String table identifier one refers to an +external string table which is the ELF string table for the ELF symbol +table associated with the +.Nm +container. +.Ss Type Encoding +Every +.Nm +type begins with metadata encoded into a +.Sy uint16_t . +This encoded information tells us three different pieces of information: +.Bl -bullet -offset indent -compact +.It +The kind of the type +.It +Whether this type is a root type or not +.It +The length of the variable data +.El +.Lp +The 16 bits that make up the encoding are broken down such that you have +five bits for the kind, one bit for indicating whether or not it is a +root type, and 10 bits for the variable length. This is laid out as +follows: +.Bd -literal -offset indent ++--------------------+ +| kind | root | vlen | ++--------------------+ +15 11 10 9 0 +.Ed +.Lp +The current version of the file format defines 14 different kinds. The +interpretation of these different kinds will be discussed in the section +.Sx The Type Section . +If a kind is encountered that is not listed below, then it is not a valid +.Nm +file. The kinds are defined as follows: +.Bd -literal -offset indent +#define CTF_K_UNKNOWN 0 +#define CTF_K_INTEGER 1 +#define CTF_K_FLOAT 2 +#define CTF_K_POINTER 3 +#define CTF_K_ARRAY 4 +#define CTF_K_FUNCTION 5 +#define CTF_K_STRUCT 6 +#define CTF_K_UNION 7 +#define CTF_K_ENUM 8 +#define CTF_K_FORWARD 9 +#define CTF_K_TYPEDEF 10 +#define CTF_K_VOLATILE 11 +#define CTF_K_CONST 12 +#define CTF_K_RESTRICT 13 +.Ed +.Lp +Programs directly reference many types; however, other types are referenced +indirectly because they are part of some other structure. These types that are +referenced directly and used are called +.Sy root +types. Other types may be used indirectly, for example, a program may reference +a structure directly, but not one of its members which has a type. That type is +not considered a +.Sy root +type. If a type is a +.Sy root +type, then it will have bit 10 set. +.Lp +The variable length section is specific to each kind and is discussed in the +section +.Sx The Type Section . +.Lp +The following macros are useful for constructing and deconstructing the encoded +type information: +.Bd -literal -offset indent + +#define CTF_MAX_VLEN 0x3ff +#define CTF_INFO_KIND(info) (((info) & 0xf800) >> 11) +#define CTF_INFO_ISROOT(info) (((info) & 0x0400) >> 10) +#define CTF_INFO_VLEN(info) (((info) & CTF_MAX_VLEN)) + +#define CTF_TYPE_INFO(kind, isroot, vlen) \\ + (((kind) << 11) | (((isroot) ? 1 : 0) << 10) | ((vlen) & CTF_MAX_VLEN)) +.Ed +.Ss The Label Section +When consuming +.Nm +data, it is often useful to know whether two different +.Nm +containers come from the same source base and version. For example, when +building illumos, there are many kernel modules that are built against a +single collection of source code. A label is encoded into the +.Nm +files that corresponds with the particular build. This ensures that if +files on the system were to become mixed up from multiple releases, that +they are not used together by tools, particularly when a child needs to +refer to a type in the parent. Because they are linked used the type +identifiers, if the wrong parent is used then the wrong type will be +encountered. +.Lp +Each label is encoded in the file format using the following eight byte +structure: +.Bd -literal +typedef struct ctf_lblent { + uint_t ctl_label; /* ref to name of label */ + uint_t ctl_typeidx; /* last type associated with this label */ +} ctf_lblent_t; +.Ed +.Lp +Each label has two different components, a name and a type identifier. +The name is encoded in the +.Em ctl_label +member which is in the format defined in the section +.Sx String Identifiers . +Generally, the names of all labels are found in the internal string +section. +.Lp +The type identifier encoded in the member +.Em ctl_typeidx +refers to the last type identifier that a label refers to in the current +file. Labels only refer to types in the current file, if the +.Nm +file is a child, then it will have the same label as its parent; +however, its label will only refer to its types, not its parents. +.Lp +It is also possible, though rather uncommon, for a +.Nm +file to have multiple labels. Labels are placed one after another, every +eight bytes. When multiple labels are present, types may only belong to +a single label. +.Ss The Object Section +The object section provides a mapping from ELF symbols of type +.Sy STT_OBJECT +in the symbol table to a type identifier. Every entry in this section is +a +.Sy uint16_t +which contains a type identifier as described in the section +.Sx Type Identifiers . +If there is no information for an object, then the type identifier 0x0 +is stored for that entry. +.Lp +To walk the object section, you need to have a corresponding +.Sy symbol table +in the ELF object that contains the +.Nm +data. Not every object is included in this section. Specifically, when +walking the symbol table. An entry is skipped if it matches any of the +following conditions: +.Lp +.Bl -bullet -offset indent -compact +.It +The symbol type is not +.Sy STT_OBJECT +.It +The symbol's section index is +.Sy SHN_UNDEF +.It +The symbol's name offset is zero +.It +The symbol's section index is +.Sy SHN_ABS +and the value of the symbol is zero. +.It +The symbol's name is +.Li _START_ +or +.Li _END_ . +These are skipped because they are used for scoping local symbols in +ELF. +.El +.Lp +The following sample code shows an example of iterating the object +section and skipping the correct symbols: +.Bd -literal +#include +#include + +/* + * Given the start of the object section in the CTF file, the number of symbols, + * and the ELF Data sections for the symbol table and the string table, this + * prints the type identifiers that correspond to objects. Note, a more robust + * implementation should ensure that they don't walk beyond the end of the CTF + * object section. + */ +static int +walk_symbols(uint16_t *objtoff, Elf_Data *symdata, Elf_Data *strdata, + long nsyms) +{ + long i; + uintptr_t strbase = strdata->d_buf; + + for (i = 1; i < nsyms; i++, objftoff++) { + const char *name; + GElf_Sym sym; + + if (gelf_getsym(symdata, i, &sym) == NULL) + return (1); + + if (GELF_ST_TYPE(sym.st_info) != STT_OBJECT) + continue; + if (sym.st_shndx == SHN_UNDEF || sym.st_name == 0) + continue; + if (sym.st_shndx == SHN_ABS && sym.st_value == 0) + continue; + name = (const char *)(strbase + sym.st_name); + if (strcmp(name, "_START_") == 0 || strcmp(name, "_END_") == 0) + continue; + + (void) printf("Symbol %d has type %d\n", i, *objtoff); + } + + return (0); +} +.Ed +.Ss The Function Section +The function section of the +.Nm +file encodes the types of both the function's arguments and the function's +return type. Similar to +.Sx The Object Section , +the function section encodes information for all symbols of type +.Sy STT_FUNCTION , +excepting those that fit specific criteria. Unlike with objects, because +functions have a variable number of arguments, they start with a type encoding +as defined in +.Sx Type Encoding , +which is the size of a +.Sy uint16_t . +For functions which have no type information available, they are encoded as +.Li CTF_TYPE_INFO(CTF_K_UNKNOWN, 0, 0) . +Functions with arguments are encoded differently. Here, the variable length is +turned into the number of arguments in the function. If a function is a +.Sy varargs +type function, then the number of arguments is increased by one. Functions with +type information are encoded as: +.Li CTF_TYPE_INFO(CTF_K_FUNCTION, 0, nargs) . +.Lp +For functions that have no type information, nothing else is encoded, and the +next function is encoded. For functions with type information, the next +.Sy uint16_t +is encoded with the type identifier of the return type of the function. It is +followed by each of the type identifiers of the arguments, if any exist, in the +order that they appear in the function. Therefore, argument 0 is the first type +identifier and so on. When a function has a final varargs argument, that is +encoded with the type identifier of zero. +.Lp +Like +.Sx The Object Section , +the function section is encoded in the order of the symbol table. It has +similar, but slightly different considerations from objects. While iterating the +symbol table, if any of the following conditions are true, then the entry is +skipped and no corresponding entry is written: +.Lp +.Bl -bullet -offset indent -compact +.It +The symbol type is not +.Sy STT_FUNCTION +.It +The symbol's section index is +.Sy SHN_UNDEF +.It +The symbol's name offset is zero +.It +The symbol's name is +.Li _START_ +or +.Li _END_ . +These are skipped because they are used for scoping local symbols in +ELF. +.El +.Ss The Type Section +The type section is the heart of the +.Nm +data. It encodes all of the information about the types themselves. The base of +the type information comes in two forms, a short form and a long form, each of +which may be followed by a variable number of arguments. The following +definitions describe the short and long forms: +.Bd -literal +#define CTF_MAX_SIZE 0xfffe /* max size of a type in bytes */ +#define CTF_LSIZE_SENT 0xffff /* sentinel for ctt_size */ +#define CTF_MAX_LSIZE UINT64_MAX + +typedef struct ctf_stype { + uint_t ctt_name; /* reference to name in string table */ + ushort_t ctt_info; /* encoded kind, variant length */ + union { + ushort_t _size; /* size of entire type in bytes */ + ushort_t _type; /* reference to another type */ + } _u; +} ctf_stype_t; + +typedef struct ctf_type { + uint_t ctt_name; /* reference to name in string table */ + ushort_t ctt_info; /* encoded kind, variant length */ + union { + ushort_t _size; /* always CTF_LSIZE_SENT */ + ushort_t _type; /* do not use */ + } _u; + uint_t ctt_lsizehi; /* high 32 bits of type size in bytes */ + uint_t ctt_lsizelo; /* low 32 bits of type size in bytes */ +} ctf_type_t; + +#define ctt_size _u._size /* for fundamental types that have a size */ +#define ctt_type _u._type /* for types that reference another type */ +.Ed +.Pp +Type sizes are stored in +.Sy bytes . +The basic small form uses a +.Sy ushort_t +to store the number of bytes. If the number of bytes in a structure would exceed +0xfffe, then the alternate form, the +.Sy ctf_type_t , +is used instead. To indicate that the larger form is being used, the member +.Em ctt_size +is set to value of +.Sy CTF_LSIZE_SENT +(0xffff). In general, when going through the type section, consumers use the +.Sy ctf_type_t +structure, but pay attention to the value of the member +.Em ctt_size +to determine whether they should increment their scan by the size of the +.Sy ctf_stype_t +or +.Sy ctf_type_t . +Not all kinds of types use +.Sy ctt_size . +Those which do not, will always use the +.Sy ctf_stype_t +structure. The individual sections for each kind have more information. +.Lp +Types are written out in order. Therefore the first entry encountered has a type +id of 0x1, or 0x8000 if a child. The member +.Em ctt_name +is encoded as described in the section +.Sx String Identifiers . +The string that it points to is the name of the type. If the identifier points +to an empty string (one that consists solely of a null terminator) then the type +does not have a name, this is common with anonymous structures and unions that +only have a typedef to name them, as well as, pointers and qualifiers. +.Lp +The next member, the +.Em ctt_info , +is encoded as described in the section +.Sx Type Encoding . +The types kind tells us how to interpret the remaining data in the +.Sy ctf_type_t +and any variable length data that may exist. The rest of this section will be +broken down into the interpretation of the various kinds. +.Ss Encoding of Integers +Integers, which are of type +.Sy CTF_K_INTEGER , +have no variable length arguments. Instead, they are followed by a four byte +.Sy uint_t +which describes their encoding. All integers must be encoded with a variable +length of zero. The +.Em ctt_size +member describes the length of the integer in bytes. In general, integer sizes +will be rounded up to the closest power of two. +.Lp +The integer encoding contains three different pieces of information: +.Bl -bullet -offset indent -compact +.It +The encoding of the integer +.It +The offset in +.Sy bits +of the type +.It +The size in +.Sy bits +of the type +.El +.Pp +This encoding can be expressed through the following macros: +.Bd -literal -offset indent +#define CTF_INT_ENCODING(data) (((data) & 0xff000000) >> 24) +#define CTF_INT_OFFSET(data) (((data) & 0x00ff0000) >> 16) +#define CTF_INT_BITS(data) (((data) & 0x0000ffff)) + +#define CTF_INT_DATA(encoding, offset, bits) \\ + (((encoding) << 24) | ((offset) << 16) | (bits)) +.Ed +.Pp +The following flags are defined for the encoding at this time: +.Bd -literal -offset indent +#define CTF_INT_SIGNED 0x01 +#define CTF_INT_CHAR 0x02 +#define CTF_INT_BOOL 0x04 +#define CTF_INT_VARARGS 0x08 +.Ed +.Lp +By default, an integer is considered to be unsigned, unless it has the +.Sy CTF_INT_SIGNED +flag set. If the flag +.Sy CTF_INT_CHAR +is set, that indicates that the integer is of a type that stores character +data, for example the intrinsic C type +.Sy char +would have the +.Sy CTF_INT_CHAR +flag set. If the flag +.Sy CTF_INT_BOOL +is set, that indicates that the integer represents a boolean type. For example, +the intrinsic C type +.Sy _Bool +would have the +.Sy CTF_INT_BOOL +flag set. Finally, the flag +.Sy CTF_INT_VARARGS +indicates that the integer is used as part of a variable number of arguments. +This encoding is rather uncommon. +.Ss Encoding of Floats +Floats, which are of type +.Sy CTF_K_FLOAT , +are similar to their integer counterparts. They have no variable length +arguments and are followed by a four byte encoding which describes the kind of +float that exists. The +.Em ctt_size +member is the size, in bytes, of the float. The float encoding has three +different pieces of information inside of it: +.Lp +.Bl -bullet -offset indent -compact +.It +The specific kind of float that exists +.It +The offset in +.Sy bits +of the float +.It +The size in +.Sy bits +of the float +.El +.Lp +This encoding can be expressed through the following macros: +.Bd -literal -offset indent +#define CTF_FP_ENCODING(data) (((data) & 0xff000000) >> 24) +#define CTF_FP_OFFSET(data) (((data) & 0x00ff0000) >> 16) +#define CTF_FP_BITS(data) (((data) & 0x0000ffff)) + +#define CTF_FP_DATA(encoding, offset, bits) \\ + (((encoding) << 24) | ((offset) << 16) | (bits)) +.Ed +.Lp +Where as the encoding for integers was a series of flags, the encoding for +floats maps to a specific kind of float. It is not a flag-based value. The kinds of floats +correspond to both their size, and the encoding. This covers all of the basic C +intrinsic floating point types. The following are the different kinds of floats +represented in the encoding: +.Bd -literal -offset indent +#define CTF_FP_SINGLE 1 /* IEEE 32-bit float encoding */ +#define CTF_FP_DOUBLE 2 /* IEEE 64-bit float encoding */ +#define CTF_FP_CPLX 3 /* Complex encoding */ +#define CTF_FP_DCPLX 4 /* Double complex encoding */ +#define CTF_FP_LDCPLX 5 /* Long double complex encoding */ +#define CTF_FP_LDOUBLE 6 /* Long double encoding */ +#define CTF_FP_INTRVL 7 /* Interval (2x32-bit) encoding */ +#define CTF_FP_DINTRVL 8 /* Double interval (2x64-bit) encoding */ +#define CTF_FP_LDINTRVL 9 /* Long double interval (2x128-bit) encoding */ +#define CTF_FP_IMAGRY 10 /* Imaginary (32-bit) encoding */ +#define CTF_FP_DIMAGRY 11 /* Long imaginary (64-bit) encoding */ +#define CTF_FP_LDIMAGRY 12 /* Long double imaginary (128-bit) encoding */ +.Ed +.Ss Encoding of Arrays +Arrays, which are of type +.Sy CTF_K_ARRAY , +have no variable length arguments. They are followed by a structure which +describes the number of elements in the array, the type identifier of the +elements in the array, and the type identifier of the index of the array. With +arrays, the +.Em ctt_size +member is set to zero. The structure that follows an array is defined as: +.Bd -literal +typedef struct ctf_array { + ushort_t cta_contents; /* reference to type of array contents */ + ushort_t cta_index; /* reference to type of array index */ + uint_t cta_nelems; /* number of elements */ +} ctf_array_t; +.Ed +.Lp +The +.Em cta_contents +and +.Em cta_index +members of the +.Sy ctf_array_t +are type identifiers which are encoded as per the section +.Sx Type Identifiers . +The member +.Em cta_nelems +is a simple four byte unsigned count of the number of elements. This count may +be zero when encountering C99's flexible array members. +.Ss Encoding of Functions +Function types, which are of type +.Sy CTF_K_FUNCTION , +use the variable length list to be the number of arguments in the function. When +the function has a final member which is a varargs, then the argument count is +incremented by one to account for the variable argument. Here, the +.Em ctt_type +member is encoded with the type identifier of the return type of the function. +Note that the +.Em ctt_size +member is not used here. +.Lp +The variable argument list contains the type identifiers for the arguments of +the function, if any. Each one is represented by a +.Sy uint16_t +and encoded according to the +.Sx Type Identifiers +section. If the function's last argument is of type varargs, then it is also +written out, but the type identifier is zero. This is included in the count of +the function's arguments. +.Ss Encoding of Structures and Unions +Structures and Unions, which are encoded with +.Sy CTF_K_STRUCT +and +.Sy CTF_K_UNION +respectively, are very similar constructs in C. The main difference +between them is the fact that every member of a structure follows one another, +where as in a union, all members share the same memory. They are also very +similar in terms of their encoding in +.Nm . +The variable length argument for structures and unions represents the number of +members that they have. The value of the member +.Em ctt_size +is the size of the structure and union. There are two different structures which +are used to encode members in the variable list. When the size of a structure or +union is greater than or equal to the large member threshold, 8192, then a +different structure is used to encode the member, all members are encoded using +the same structure. The structure for members is as follows: +.Bd -literal +typedef struct ctf_member { + uint_t ctm_name; /* reference to name in string table */ + ushort_t ctm_type; /* reference to type of member */ + ushort_t ctm_offset; /* offset of this member in bits */ +} ctf_member_t; + +typedef struct ctf_lmember { + uint_t ctlm_name; /* reference to name in string table */ + ushort_t ctlm_type; /* reference to type of member */ + ushort_t ctlm_pad; /* padding */ + uint_t ctlm_offsethi; /* high 32 bits of member offset in bits */ + uint_t ctlm_offsetlo; /* low 32 bits of member offset in bits */ +} ctf_lmember_t; +.Ed +.Lp +Both the +.Em ctm_name +and +.Em ctlm_name +refer to the name of the member. The name is encoded as an offset into the +string table as described by the section +.Sx String Identifiers . +The members +.Sy ctm_type +and +.Sy ctlm_type +both refer to the type of the member. They are encoded as per the section +.Sx Type Identifiers . +.Lp +The last piece of information that is present is the offset which describes the +offset in memory that the member begins at. For unions, this value will always +be zero because the start of unions in memory is always zero. For structures, +this is the offset in +.Sy bits +that the member begins at. Note that a compiler may lay out a type with padding. +This means that the difference in offset between two consecutive members may be +larger than the size of the member. When the size of the overall structure is +strictly less than 8192 bytes, the normal structure, +.Sy ctf_member_t , +is used and the offset in bits is stored in the member +.Em ctm_offset . +However, when the size of the structure is greater than or equal to 8192 bytes, +then the number of bits is split into two 32-bit quantities. One member, +.Em ctlm_offsethi , +represents the upper 32 bits of the offset, while the other member, +.Em ctlm_offsetlo , +represents the lower 32 bits of the offset. These can be joined together to get +a 64-bit sized offset in bits by shifting the member +.Em ctlm_offsethi +to the left by thirty two and then doing a binary or of +.Em ctlm_offsetlo . +.Ss Encoding of Enumerations +Enumerations, noted by the type +.Sy CTF_K_ENUM , +are similar to structures. Enumerations use the variable list to note the number +of values that the enumeration contains, which we'll term enumerators. In C, an +enumeration is always equivalent to the intrinsic type +.Sy int , +thus the value of the member +.Em ctt_size +is always the size of an integer which is determined based on the current model. +For illumos systems, this will always be 4, as an integer is always defined to +be 4 bytes large in both +.Sy ILP32 +and +.Sy LP64 , +regardless of the architecture. +.Lp +The enumerators encoded in an enumeration have the following structure in the +variable list: +.Bd -literal +typedef struct ctf_enum { + uint_t cte_name; /* reference to name in string table */ + int cte_value; /* value associated with this name */ +} ctf_enum_t; +.Ed +.Pp +The member +.Em cte_name +refers to the name of the enumerator's value, it is encoded according to the +rules in the section +.Sx String Identifiers . +The member +.Em cte_value +contains the integer value of this enumerator. +.Ss Encoding of Forward References +Forward references, types of kind +.Sy CTF_K_FORWARD , +in a +.Nm +file refer to types which may not have a definition at all, only a name. If +the +.Nm +file is a child, then it may be that the forward is resolved to an +actual type in the parent, otherwise the definition may be in another +.Nm +container or may not be known at all. The only member of the +.Sy ctf_type_t +that matters for a forward declaration is the +.Em ctt_name +which points to the name of the forward reference in the string table as +described earlier. There is no other information recorded for forward +references. +.Ss Encoding of Pointers, Typedefs, Volatile, Const, and Restrict +Pointers, typedefs, volatile, const, and restrict are all similar in +.Nm . +They all refer to another type. In the case of typedefs, they provide an +alternate name, while volatile, const, and restrict change how the type is +interpreted in the C programming language. This covers the +.Nm +kinds +.Sy CTF_K_POINTER , +.Sy CTF_K_TYPEDEF , +.Sy CTF_K_VOLATILE , +.Sy CTF_K_RESTRICT , +and +.Sy CTF_K_CONST . +.Lp +These types have no variable list entries and use the member +.Em ctt_type +to refer to the base type that they modify. +.Ss Encoding of Unknown Types +Types with the kind +.Sy CTF_K_UNKNOWN +are used to indicate gaps in the type identifier space. These entries consume an +identifier, but do not define anything. Nothing should refer to these gap +identifiers. +.Ss Dependencies Between Types +C types can be imagined as a directed, cyclic, graph. Structures and unions may +refer to each other in a way that creates a cyclic dependency. In cases such as +these, the entire type section must be read in and processed. Consumers must +not assume that every type can be laid out in dependency order; they +cannot. +.Ss The String Section +The last section of the +.Nm +file is the +.Sy string +section. This section encodes all of the strings that appear throughout +the other sections. It is laid out as a series of characters followed by +a null terminator. Generally, all names are written out in ASCII, as +most C compilers do not allow and characters to appear in identifiers +outside of a subset of ASCII. However, any extended characters sets +should be written out as a series of UTF-8 bytes. +.Lp +The first entry in the section, at offset zero, is a single null +terminator to reference the empty string. Following that, each C string +should be written out, including the null terminator. Offsets that refer +to something in this section should refer to the first byte which begins +a string. Beyond the first byte in the section being the null +terminator, the order of strings is unimportant. +.Sh Data Encoding and ELF Considerations +.Nm +data is generally included in ELF objects which specify information to +identify the architecture and endianness of the file. A +.Nm +container inside such an object must match the endianness of the ELF +object. Aside from the question of the endian encoding of data, there +should be no other differences between architectures. While many of the +types in this document refer to non-fixed size C integral types, they +are equivalent in the models +.Sy ILP32 +and +.Sy LP64 . +If any other model is being used with +.Nm +data that has different sizes, then it must not use the model's sizes for +those integral types and instead use the fixed size equivalents based on an +.Sy ILP32 +environment. +.Lp +When placing a +.Nm +container inside of an ELF object, there are certain conventions that are +expected for the purposes of tooling being able to find the +.Nm +data. In particular, a given ELF object should only contain a single +.Nm +section. Multiple containers should be merged together into a single +one. +.Lp +The +.Nm +file should be included in its own ELF section. The section's name +must be +.Ql .SUNW_ctf . +The type of the section must be +.Sy SHT_PROGBITS . +The section should have a link set to the symbol table and its address +alignment must be 4. +.Sh SEE ALSO +.Xr mdb 1 , +.Xr dtrace 1M , +.Xr libelf 3LIB , +.Xr gelf 3ELF , +.Xr a.out 4 From f41c6565a56cc169c6a869bdd05690a3afddd7da Mon Sep 17 00:00:00 2001 From: Xin LI Date: Sat, 8 Nov 2014 06:31:51 +0000 Subject: [PATCH 2/2] 5243 zdb -b could be much faster Reviewed by: Christopher Siden Reviewed by: George Wilson Reviewed by: Richard Elling Approved by: Dan McDonald Author: Matthew Ahrens illumos/illumos-gate@f7950bf1145637c6dc57742a8bb95631fd5c846f --- cmd/zdb/zdb.c | 27 +++++++++++++++++++-- uts/common/fs/zfs/dmu_traverse.c | 41 +++++++++++++++++--------------- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 63752a51ded0..3747e6106798 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -77,9 +77,11 @@ #ifndef lint extern boolean_t zfs_recover; extern uint64_t zfs_arc_max, zfs_arc_meta_limit; +extern int zfs_vdev_async_read_max_active; #else boolean_t zfs_recover; uint64_t zfs_arc_max, zfs_arc_meta_limit; +int zfs_vdev_async_read_max_active; #endif const char cmdname[] = "zdb"; @@ -2355,8 +2357,14 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_readfails = 0; - if (dump_opt['b'] < 5 && - gethrtime() > zcb->zcb_lastprint + NANOSEC) { + /* only call gethrtime() every 100 blocks */ + static int iters; + if (++iters > 100) + iters = 0; + else + return (0); + + if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; @@ -2465,6 +2473,14 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) (longlong_t)vd->vdev_ms_count); msp->ms_ops = &zdb_metaslab_ops; + + /* + * We don't want to spend the CPU + * manipulating the size-ordered + * tree, so clear the range_tree + * ops. + */ + msp->ms_tree->rt_ops = NULL; VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC)); msp->ms_loaded = B_TRUE; @@ -3478,6 +3494,13 @@ main(int argc, char **argv) */ zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; + /* + * "zdb -c" uses checksum-verifying scrub i/os which are async reads. + * "zdb -b" uses traversal prefetch which uses async reads. + * For good performance, let several of them be active at once. + */ + zfs_vdev_async_read_max_active = 10; + kernel_init(FREAD); g_zfs = libzfs_init(); ASSERT(g_zfs != NULL); diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c index fb1ec0755cf1..5836549d200a 100644 --- a/uts/common/fs/zfs/dmu_traverse.c +++ b/uts/common/fs/zfs/dmu_traverse.c @@ -59,6 +59,7 @@ typedef struct traverse_data { int td_flags; prefetch_data_t *td_pfd; boolean_t td_paused; + uint64_t td_hole_birth_enabled_txg; blkptr_cb_t *td_func; void *td_arg; } traverse_data_t; @@ -229,25 +230,20 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } if (bp->blk_birth == 0) { - if (spa_feature_is_active(td->td_spa, SPA_FEATURE_HOLE_BIRTH)) { - /* - * Since this block has a birth time of 0 it must be a - * hole created before the SPA_FEATURE_HOLE_BIRTH - * feature was enabled. If SPA_FEATURE_HOLE_BIRTH - * was enabled before the min_txg for this traveral we - * know the hole must have been created before the - * min_txg for this traveral, so we can skip it. If - * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg - * for this traveral we cannot tell if the hole was - * created before or after the min_txg for this - * traversal, so we cannot skip it. - */ - uint64_t hole_birth_enabled_txg; - VERIFY(spa_feature_enabled_txg(td->td_spa, - SPA_FEATURE_HOLE_BIRTH, &hole_birth_enabled_txg)); - if (hole_birth_enabled_txg < td->td_min_txg) - return (0); - } + /* + * Since this block has a birth time of 0 it must be a + * hole created before the SPA_FEATURE_HOLE_BIRTH + * feature was enabled. If SPA_FEATURE_HOLE_BIRTH + * was enabled before the min_txg for this traveral we + * know the hole must have been created before the + * min_txg for this traveral, so we can skip it. If + * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg + * for this traveral we cannot tell if the hole was + * created before or after the min_txg for this + * traversal, so we cannot skip it. + */ + if (td->td_hole_birth_enabled_txg < td->td_min_txg) + return (0); } else if (bp->blk_birth <= td->td_min_txg) { return (0); } @@ -523,6 +519,13 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, td.td_flags = flags; td.td_paused = B_FALSE; + if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { + VERIFY(spa_feature_enabled_txg(spa, + SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg)); + } else { + td.td_hole_birth_enabled_txg = 0; + } + pd.pd_blks_max = zfs_pd_blks_max; pd.pd_flags = flags; mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);