2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
2010-05-28 13:45:14 -07:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2014-11-03 11:12:40 -08:00
|
|
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _SYS_ZAP_H
|
|
|
|
#define _SYS_ZAP_H
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ZAP - ZFS Attribute Processor
|
|
|
|
*
|
|
|
|
* The ZAP is a module which sits on top of the DMU (Data Management
|
|
|
|
* Unit) and implements a higher-level storage primitive using DMU
|
|
|
|
* objects. Its primary consumer is the ZPL (ZFS Posix Layer).
|
|
|
|
*
|
|
|
|
* A "zapobj" is a DMU object which the ZAP uses to store attributes.
|
|
|
|
* Users should use only zap routines to access a zapobj - they should
|
|
|
|
* not access the DMU object directly using DMU routines.
|
|
|
|
*
|
|
|
|
* The attributes stored in a zapobj are name-value pairs. The name is
|
|
|
|
* a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
|
|
|
|
* terminating NULL). The value is an array of integers, which may be
|
|
|
|
* 1, 2, 4, or 8 bytes long. The total space used by the array (number
|
|
|
|
* of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
|
|
|
|
* Note that an 8-byte integer value can be used to store the location
|
|
|
|
* (object number) of another dmu object (which may be itself a zapobj).
|
|
|
|
* Note that you can use a zero-length attribute to store a single bit
|
|
|
|
* of information - the attribute is present or not.
|
|
|
|
*
|
|
|
|
* The ZAP routines are thread-safe. However, you must observe the
|
|
|
|
* DMU's restriction that a transaction may not be operated on
|
|
|
|
* concurrently.
|
|
|
|
*
|
|
|
|
* Any of the routines that return an int may return an I/O error (EIO
|
|
|
|
* or ECHECKSUM).
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Implementation / Performance Notes:
|
|
|
|
*
|
|
|
|
* The ZAP is intended to operate most efficiently on attributes with
|
|
|
|
* short (49 bytes or less) names and single 8-byte values, for which
|
|
|
|
* the microzap will be used. The ZAP should be efficient enough so
|
|
|
|
* that the user does not need to cache these attributes.
|
|
|
|
*
|
|
|
|
* The ZAP's locking scheme makes its routines thread-safe. Operations
|
|
|
|
* on different zapobjs will be processed concurrently. Operations on
|
|
|
|
* the same zapobj which only read data will be processed concurrently.
|
|
|
|
* Operations on the same zapobj which modify data will be processed
|
|
|
|
* concurrently when there are many attributes in the zapobj (because
|
|
|
|
* the ZAP uses per-block locking - more than 128 * (number of cpus)
|
|
|
|
* small attributes will suffice).
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
|
|
|
|
* strings) for the names of attributes, rather than a byte string
|
|
|
|
* bounded by an explicit length. If some day we want to support names
|
|
|
|
* in character sets which have embedded zeros (eg. UTF-16, UTF-32),
|
|
|
|
* we'll have to add routines for using length-bounded strings.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/dmu.h>
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
2013-06-11 09:12:34 -08:00
|
|
|
* Specifies matching criteria for ZAP lookups.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
/*
 * matchtype_t: how a lookup name is compared against stored entry names.
 * No enumerator has an explicit initializer, so they take the values
 * 0 (MT_EXACT), 1 (MT_BEST) and 2 (MT_FIRST) in declaration order.
 * NOTE(review): whether these numeric values are relied on elsewhere
 * cannot be determined from this header - confirm before reordering.
 */
typedef enum matchtype
|
|
|
|
{
|
2013-06-11 09:12:34 -08:00
|
|
|
/* Only find an exact match (non-normalized) */
|
2008-11-20 12:01:55 -08:00
|
|
|
MT_EXACT,
|
2013-06-11 09:12:34 -08:00
|
|
|
/*
|
|
|
|
* If there is an exact match, find that, otherwise find the
|
|
|
|
* first normalized match.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
MT_BEST,
|
2013-06-11 09:12:34 -08:00
|
|
|
/*
|
|
|
|
* Find the "first" normalized (case and Unicode form) match;
|
|
|
|
* the designated "first" match will not change as long as the
|
|
|
|
* set of entries with this normalization doesn't change.
|
|
|
|
*/
|
2008-11-20 12:01:55 -08:00
|
|
|
MT_FIRST
|
|
|
|
|
} matchtype_t;
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
/*
 * zap_flags_t: option flags for a zapobj.  Each enumerator is a distinct
 * single bit (1 << 0, 1 << 1, 1 << 2).
 * NOTE(review): presumably these are OR-ed together and passed to the
 * zap_create_flags*() routines - confirm against those prototypes.
 */
typedef enum zap_flags {
|
|
|
|
/* Use 64-bit hash value (serialized cursors will always use 64-bits) */
|
|
|
|
ZAP_FLAG_HASH64 = 1 << 0,
|
|
|
|
/* Key is binary, not string (zap_add_uint64() can be used) */
|
|
|
|
ZAP_FLAG_UINT64_KEY = 1 << 1,
|
|
|
|
/*
|
|
|
|
* First word of key (which must be an array of uint64) is
|
|
|
|
* already randomly distributed.
|
|
|
|
*/
|
|
|
|
ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
|
|
|
|
} zap_flags_t;
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Create a new zapobj with no attributes and return its object number.
|
|
|
|
* MT_EXACT will cause the zap object to only support MT_EXACT lookups,
|
|
|
|
* otherwise any matchtype can be used for lookups.
|
|
|
|
*
|
|
|
|
* normflags specifies what normalization will be done. values are:
|
|
|
|
* 0: no normalization (legacy on-disk format, supports MT_EXACT matching
|
|
|
|
* only)
|
|
|
|
* U8_TEXTPREP_TOLOWER: case normalization will be performed.
|
|
|
|
* MT_FIRST/MT_BEST matching will find entries that match without
|
|
|
|
* regard to case (eg. looking for "foo" can find an entry "Foo").
|
|
|
|
* Eventually, other flags will permit unicode normalization as well.
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
*
|
|
|
|
* dnodesize specifies the on-disk size of the dnode for the new zapobj.
|
|
|
|
* Valid values are multiples of 512 up to DNODE_MAX_SIZE.
|
2008-11-20 12:01:55 -08:00
|
|
|
*/
|
|
|
|
/*
 * zap_create: create a new zapobj with no attributes and return its
 * object number.  This variant takes neither normflags nor dnodesize.
 * NOTE(review): ds/ot/bonustype/bonuslen/tx are presumably the target
 * objset, DMU object type, bonus buffer type and length, and the open
 * transaction - confirm against the DMU object allocation interfaces.
 */
uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
|
|
|
|
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
/*
 * zap_create_dnsize: like zap_create(), with an added dnodesize argument
 * giving the on-disk size of the dnode for the new zapobj.  Valid values
 * are multiples of 512 up to DNODE_MAX_SIZE.
 * NOTE(review): a dnodesize of 0 presumably selects the legacy dnode
 * size - confirm in the implementation.
 */
uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot,
|
|
|
|
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
 * zap_create_norm: like zap_create(), with an added normflags argument
 * specifying what name normalization will be done.  0 means no
 * normalization (MT_EXACT matching only); U8_TEXTPREP_TOLOWER enables
 * case normalization so MT_FIRST/MT_BEST can match without regard to case.
 */
uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
|
|
|
|
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
/*
 * zapobj creation variant that takes an explicit dnodesize argument.
 * Per the large_dnode change description, a dnodesize of 0 means use
 * the legacy dnode size.
 * NOTE(review): presumably returns the object number of the new zapobj
 * and otherwise matches the variant without the _dnsize suffix --
 * confirm against the implementation.
 */
uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags,
|
|
|
|
dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
|
|
|
|
int dnodesize, dmu_tx_t *tx);
|
2010-05-28 13:45:14 -07:00
|
|
|
/*
 * zapobj creation variant that takes zap_flags_t flags plus explicit
 * leaf and indirect block shifts.  It has no dnodesize parameter; per
 * the large_dnode change description, such variants call the matching
 * _dnsize() version with a dnodesize of 0 (legacy dnode size).
 * NOTE(review): presumably returns the object number of the new zapobj
 * -- confirm against the implementation.
 */
uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
|
|
|
|
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
|
|
|
|
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
/*
 * zapobj creation variant that takes zap_flags_t flags, explicit leaf
 * and indirect block shifts, and an explicit dnodesize.  Per the
 * large_dnode change description, a dnodesize of 0 means use the
 * legacy dnode size.
 * NOTE(review): presumably returns the object number of the new zapobj
 * -- confirm against the implementation.
 */
uint64_t zap_create_flags_dnsize(objset_t *os, int normflags,
|
|
|
|
zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift,
|
|
|
|
int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
|
|
|
|
int dnodesize, dmu_tx_t *tx);
|
2012-12-13 15:24:15 -08:00
|
|
|
/*
 * zapobj creation variant that also takes a parent object number and a
 * name.  It has no dnodesize parameter; per the large_dnode change
 * description, such variants call the matching _dnsize() version with
 * a dnodesize of 0 (legacy dnode size).
 * NOTE(review): presumably the new zapobj is entered under `name` in
 * the zapobj `parent_obj` and its object number is returned -- confirm
 * against the implementation.
 */
uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
|
|
|
|
uint64_t parent_obj, const char *name, dmu_tx_t *tx);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
/*
 * zapobj creation variant that takes a parent object number, a name,
 * and an explicit dnodesize.  Per the large_dnode change description,
 * a dnodesize of 0 means use the legacy dnode size.
 * NOTE(review): presumably the new zapobj is entered under `name` in
 * the zapobj `parent_obj` and its object number is returned -- confirm
 * against the implementation.
 */
uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
|
|
|
|
uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
2013-10-08 09:13:05 -08:00
|
|
|
/*
|
|
|
|
* Initialize an already-allocated object.
* NOTE(review): `obj` is presumably the object number of that
* already-allocated object within `os`; what initialization is done
* with `normflags` and `flags` is not visible in this header --
* confirm against the implementation.
|
|
|
|
*/
|
|
|
|
void mzap_create_impl(objset_t *os, uint64_t obj, int normflags,
|
|
|
|
zap_flags_t flags, dmu_tx_t *tx);
|
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Create a new zapobj with no attributes from the given (unallocated)
|
|
|
|
* object number.
* This variant has no dnodesize parameter; the large_dnode change
* description lists zap_create_claim_norm_dnsize() among the new
* interfaces that accept one.
* NOTE(review): the int return is presumably 0 on success or an errno
* value on failure -- confirm against the implementation.
|
|
|
|
*/
|
|
|
|
int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
|
|
|
|
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
|
|
|
|
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
int zap_create_claim_norm(objset_t *ds, uint64_t obj,
|
|
|
|
int normflags, dmu_object_type_t ot,
|
|
|
|
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
|
Implement large_dnode pool feature
Justification
-------------
This feature adds support for variable length dnodes. Our motivation is
to eliminate the overhead associated with using spill blocks. Spill
blocks are used to store system attribute data (i.e. file metadata) that
does not fit in the dnode's bonus buffer. By allowing a larger bonus
buffer area the use of a spill block can be avoided. Spill blocks
potentially incur an additional read I/O for every dnode in a dnode
block. As a worst case example, reading 32 dnodes from a 16k dnode block
and all of the spill blocks could issue 33 separate reads. Now suppose
those dnodes have size 1024 and therefore don't need spill blocks. Then
the worst case number of blocks read is reduced from 33 to two--one
per dnode block. In practice spill blocks may tend to be co-located on
disk with the dnode blocks so the reduction in I/O would not be this
drastic. In a badly fragmented pool, however, the improvement could be
significant.
ZFS-on-Linux systems that make heavy use of extended attributes would
benefit from this feature. In particular, ZFS-on-Linux supports the
xattr=sa dataset property which allows file extended attribute data
to be stored in the dnode bonus buffer as an alternative to the
traditional directory-based format. Workloads such as SELinux and the
Lustre distributed filesystem often store enough xattr data to force
spill blocks when xattr=sa is in effect. Large dnodes may therefore
provide a performance benefit to such systems.
Other use cases that may benefit from this feature include files with
large ACLs and symbolic links with long target names. Furthermore,
this feature may be desirable on other platforms in case future
applications or features are developed that could make use of a
larger bonus buffer area.
Implementation
--------------
The size of a dnode may be a multiple of 512 bytes up to the size of
a dnode block (currently 16384 bytes). A dn_extra_slots field was
added to the current on-disk dnode_phys_t structure to describe the
size of the physical dnode on disk. The 8 bits for this field were
taken from the zero filled dn_pad2 field. The field represents how
many "extra" dnode_phys_t slots a dnode consumes in its dnode block.
This convention results in a value of 0 for 512 byte dnodes which
preserves on-disk format compatibility with older software.
Similarly, the in-memory dnode_t structure has a new dn_num_slots field
to represent the total number of dnode_phys_t slots consumed on disk.
Thus dn->dn_num_slots is 1 greater than the corresponding
dnp->dn_extra_slots. This difference in convention was adopted
because, unlike on-disk structures, backward compatibility is not a
concern for in-memory objects, so we used a more natural way to
represent size for a dnode_t.
The default size for newly created dnodes is determined by the value of
a new "dnodesize" dataset property. By default the property is set to
"legacy" which is compatible with older software. Setting the property
to "auto" will allow the filesystem to choose the most suitable dnode
size. Currently this just sets the default dnode size to 1k, but future
code improvements could dynamically choose a size based on observed
workload patterns. Dnodes of varying sizes can coexist within the same
dataset and even within the same dnode block. For example, to enable
automatically-sized dnodes, run
# zfs set dnodesize=auto tank/fish
The user can also specify literal values for the dnodesize property.
These are currently limited to powers of two from 1k to 16k. The
power-of-2 limitation is only for simplicity of the user interface.
Internally the implementation can handle any multiple of 512 up to 16k,
and consumers of the DMU API can specify any legal dnode value.
The size of a new dnode is determined at object allocation time and
stored as a new field in the znode in-memory structure. New DMU
interfaces are added to allow the consumer to specify the dnode size
that a newly allocated object should use. Existing interfaces are
unchanged to avoid having to update every call site and to preserve
compatibility with external consumers such as Lustre. The new
interface names are given below. The versions of these functions that
don't take a dnodesize parameter now just call the _dnsize() versions
with a dnodesize of 0, which means use the legacy dnode size.
New DMU interfaces:
dmu_object_alloc_dnsize()
dmu_object_claim_dnsize()
dmu_object_reclaim_dnsize()
New ZAP interfaces:
zap_create_dnsize()
zap_create_norm_dnsize()
zap_create_flags_dnsize()
zap_create_claim_norm_dnsize()
zap_create_link_dnsize()
The constant DN_MAX_BONUSLEN is renamed to DN_OLD_MAX_BONUSLEN. The
spa_maxdnodesize() function should be used to determine the maximum
bonus length for a pool.
These are a few noteworthy changes to key functions:
* The prototype for dnode_hold_impl() now takes a "slots" parameter.
When the DNODE_MUST_BE_FREE flag is set, this parameter is used to
ensure the hole at the specified object offset is large enough to
hold the dnode being created. The slots parameter is also used
to ensure a dnode does not span multiple dnode blocks. In both of
these cases, if a failure occurs, ENOSPC is returned. Keep in mind,
these failure cases are only possible when using DNODE_MUST_BE_FREE.
If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
dnode_hold_impl() will check if the requested dnode is already
consumed as an extra dnode slot by a large dnode, in which case
it returns ENOENT.
* The function dmu_object_alloc() advances to the next dnode block
if dnode_hold_impl() returns an error for a requested object.
This is because the beginning of the next dnode block is the only
location it can safely assume to either be a hole or a valid
starting point for a dnode.
* dnode_next_offset_level() and other functions that iterate
through dnode blocks may no longer use a simple array indexing
scheme. These now use the current dnode's dn_num_slots field to
advance to the next dnode in the block. This is to ensure we
properly skip the current dnode's bonus area and don't interpret it
as a valid dnode.
zdb
---
The zdb command was updated to display a dnode's size under the
"dnsize" column when the object is dumped.
For ZIL create log records, zdb will now display the slot count for
the object.
ztest
-----
Ztest chooses a random dnodesize for every newly created object. The
random distribution is more heavily weighted toward small dnodes to
better simulate real-world datasets.
Unused bonus buffer space is filled with non-zero values computed from
the object number, dataset id, offset, and generation number. This
helps ensure that the dnode traversal code properly skips the interior
regions of large dnodes, and that these interior regions are not
overwritten by data belonging to other dnodes. A new test visits each
object in a dataset. It verifies that the actual dnode size matches what
was stored in the ztest block tag when it was created. It also verifies
that the unused bonus buffer space is filled with the expected data
patterns.
ZFS Test Suite
--------------
Added six new large dnode-specific tests, and integrated the dnodesize
property into existing tests for zfs allow and send/recv.
Send/Receive
------------
ZFS send streams for datasets containing large dnodes cannot be received
on pools that don't support the large_dnode feature. A send stream with
large dnodes sets a DMU_BACKUP_FEATURE_LARGE_DNODE flag which will be
unrecognized by an incompatible receiving pool so that the zfs receive
will fail gracefully.
While not implemented here, it may be possible to generate a
backward-compatible send stream from a dataset containing large
dnodes. The implementation may be tricky, however, because the send
object record for a large dnode would need to be resized to a 512
byte dnode, possibly kicking in a spill block in the process. This
means we would need to construct a new SA layout and possibly
register it in the SA layout object. The SA layout is normally just
sent as an ordinary object record. But if we are constructing new
layouts while generating the send stream we'd have to build the SA
layout object dynamically and send it at the end of the stream.
For sending and receiving between pools that do support large dnodes,
the drr_object send record type is extended with a new field to store
the dnode slot count. This field was repurposed from unused padding
in the structure.
ZIL Replay
----------
The dnode slot count is stored in the uppermost 8 bits of the lr_foid
field. The bits were unused as the object id is currently capped at
48 bits.
Resizing Dnodes
---------------
It should be possible to resize a dnode when it is dirtied if the
current dnodesize dataset property differs from the dnode's size, but
this functionality is not currently implemented. Clearly a dnode can
only grow if there are sufficient contiguous unused slots in the
dnode block, but it should always be possible to shrink a dnode.
Growing dnodes may be useful to reduce fragmentation in a pool with
many spill blocks in use. Shrinking dnodes may be useful to allow
sending a dataset to a pool that doesn't support the large_dnode
feature.
Feature Reference Counting
--------------------------
The reference count for the large_dnode pool feature tracks the
number of datasets that have ever contained a dnode of size larger
than 512 bytes. The first time a large dnode is created in a dataset
the dataset is converted to an extensible dataset. This is a one-way
operation and the only way to decrement the feature count is to
destroy the dataset, even if the dataset no longer contains any large
dnodes. The complexity of reference counting on a per-dnode basis was
too high, so we chose to track it on a per-dataset basis similarly to
the large_block feature.
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3542
2016-03-16 18:25:34 -07:00
|
|
|
int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
|
|
|
|
int normflags, dmu_object_type_t ot,
|
|
|
|
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The zapobj passed in must be a valid ZAP object for all of the
|
|
|
|
* following routines.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Destroy this zapobj and all its attributes.
|
|
|
|
*
|
|
|
|
* Frees the object number using dmu_object_free.
|
|
|
|
*/
|
|
|
|
int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Manipulate attributes.
|
|
|
|
*
|
|
|
|
* 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Retrieve the contents of the attribute with the given name.
|
|
|
|
*
|
|
|
|
* If the requested attribute does not exist, the call will fail and
|
|
|
|
* return ENOENT.
|
|
|
|
*
|
|
|
|
* If 'integer_size' is smaller than the attribute's integer size, the
|
|
|
|
* call will fail and return EINVAL.
|
|
|
|
*
|
|
|
|
* If 'integer_size' is equal to or larger than the attribute's integer
|
2013-06-11 09:12:34 -08:00
|
|
|
* size, the call will succeed and return 0.
|
|
|
|
*
|
|
|
|
* When converting to a larger integer size, the integers will be treated as
|
|
|
|
* unsigned (ie. no sign-extension will be performed).
|
2008-11-20 12:01:55 -08:00
|
|
|
*
|
|
|
|
* 'num_integers' is the length (in integers) of 'buf'.
|
|
|
|
*
|
|
|
|
* If the attribute is longer than the buffer, as many integers as will
|
|
|
|
* fit will be transferred to 'buf'. If the entire attribute was not
|
|
|
|
* transferred, the call will return EOVERFLOW.
|
2013-06-11 09:12:34 -08:00
|
|
|
*/
|
|
|
|
int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
|
|
|
|
uint64_t integer_size, uint64_t num_integers, void *buf);
|
|
|
|
|
|
|
|
/*
|
2008-11-20 12:01:55 -08:00
|
|
|
* If rn_len is nonzero, realname will be set to the name of the found
|
|
|
|
* entry (which may be different from the requested name if matchtype is
|
|
|
|
* not MT_EXACT).
|
|
|
|
*
|
|
|
|
* If normalization_conflictp is not NULL, it will be set if there is
|
|
|
|
* another name with the same case/unicode normalized form.
|
|
|
|
*/
|
|
|
|
int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
|
|
|
|
uint64_t integer_size, uint64_t num_integers, void *buf,
|
|
|
|
matchtype_t mt, char *realname, int rn_len,
|
|
|
|
boolean_t *normalization_conflictp);
|
2010-05-28 13:45:14 -07:00
|
|
|
int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
|
|
|
|
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
|
|
|
|
int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
|
2014-05-09 14:51:20 -07:00
|
|
|
int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name);
|
2010-05-28 13:45:14 -07:00
|
|
|
int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
|
|
|
|
int key_numints);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
OpenZFS 7004 - dmu_tx_hold_zap() does dnode_hold() 7x on same object
Using a benchmark which has 32 threads creating 2 million files in the
same directory, on a machine with 16 CPU cores, I observed poor
performance. I noticed that dmu_tx_hold_zap() was using about 30% of
all CPU, and doing dnode_hold() 7 times on the same object (the ZAP
object that is being held).
dmu_tx_hold_zap() keeps a hold on the dnode_t the entire time it is
running, in dmu_tx_hold_t:txh_dnode, so it would be nice to use the
dnode_t that we already have in hand, rather than repeatedly calling
dnode_hold(). To do this, we need to pass the dnode_t down through
all the intermediate calls that dmu_tx_hold_zap() makes, making these
routines take the dnode_t* rather than an objset_t* and a uint64_t
object number. In particular, the following routines will need to have
analogous *_by_dnode() variants created:
dmu_buf_hold_noread()
dmu_buf_hold()
zap_lookup()
zap_lookup_norm()
zap_count_write()
zap_lockdir()
zap_count_write()
This can improve performance on the benchmark described above by 100%,
from 30,000 file creations per second to 60,000. (This improvement is on
top of that provided by working around the object allocation issue. Peak
performance of ~90,000 creations per second was observed with 8 CPUs;
adding CPUs past that decreased performance due to lock contention.) The
CPU used by dmu_tx_hold_zap() was reduced by 88%, from 340 CPU-seconds
to 40 CPU-seconds.
Sponsored by: Intel Corp.
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/7004
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/109
Closes #4641
Closes #4972
2016-07-20 15:42:13 -07:00
|
|
|
int zap_lookup_by_dnode(dnode_t *dn, const char *name,
|
|
|
|
uint64_t integer_size, uint64_t num_integers, void *buf);
|
|
|
|
int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
|
|
|
|
uint64_t integer_size, uint64_t num_integers, void *buf,
|
|
|
|
matchtype_t mt, char *realname, int rn_len,
|
|
|
|
boolean_t *ncp);
|
|
|
|
|
|
|
|
int zap_count_write_by_dnode(dnode_t *dn, const char *name,
|
2017-01-23 20:36:24 +03:00
|
|
|
int add, refcount_t *towrite, refcount_t *tooverwrite);
|
2009-07-02 15:44:48 -07:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
/*
|
|
|
|
* Create an attribute with the given name and value.
|
|
|
|
*
|
|
|
|
* If an attribute with the given name already exists, the call will
|
|
|
|
* fail and return EEXIST.
|
|
|
|
*/
|
2010-05-28 13:45:14 -07:00
|
|
|
int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
|
2008-11-20 12:01:55 -08:00
|
|
|
int integer_size, uint64_t num_integers,
|
|
|
|
const void *val, dmu_tx_t *tx);
|
2017-01-14 01:58:41 +03:00
|
|
|
int zap_add_by_dnode(dnode_t *dn, const char *key,
|
|
|
|
int integer_size, uint64_t num_integers,
|
|
|
|
const void *val, dmu_tx_t *tx);
|
2010-05-28 13:45:14 -07:00
|
|
|
int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
|
|
|
|
int key_numints, int integer_size, uint64_t num_integers,
|
|
|
|
const void *val, dmu_tx_t *tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Set the attribute with the given name to the given value. If an
|
|
|
|
* attribute with the given name does not exist, it will be created. If
|
|
|
|
* an attribute with the given name already exists, the previous value
|
|
|
|
* will be overwritten. The integer_size may be different from the
|
|
|
|
* existing attribute's integer size, in which case the attribute's
|
|
|
|
* integer size will be updated to the new value.
|
|
|
|
*/
|
|
|
|
int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
|
|
|
|
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
|
2010-05-28 13:45:14 -07:00
|
|
|
int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
|
|
|
|
int key_numints,
|
|
|
|
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the length (in integers) and the integer size of the specified
|
|
|
|
* attribute.
|
|
|
|
*
|
|
|
|
* If the requested attribute does not exist, the call will fail and
|
|
|
|
* return ENOENT.
|
|
|
|
*/
|
|
|
|
int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
|
|
|
|
uint64_t *integer_size, uint64_t *num_integers);
|
2010-05-28 13:45:14 -07:00
|
|
|
int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
|
|
|
|
int key_numints, uint64_t *integer_size, uint64_t *num_integers);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove the specified attribute.
|
|
|
|
*
|
|
|
|
* If the specified attribute does not exist, the call will fail and
|
|
|
|
* return ENOENT.
|
|
|
|
*/
|
|
|
|
int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
|
|
|
|
int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
|
|
|
|
matchtype_t mt, dmu_tx_t *tx);
|
2017-01-14 01:58:41 +03:00
|
|
|
int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
|
2010-05-28 13:45:14 -07:00
|
|
|
int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
|
|
|
|
int key_numints, dmu_tx_t *tx);
|
2008-11-20 12:01:55 -08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns (in *count) the number of attributes in the specified zap
|
|
|
|
* object.
|
|
|
|
*/
|
|
|
|
int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns (in name) the name of the entry whose value (za_first_integer)
|
|
|
|
* masked with 'mask' equals value, or ENOENT if not found. The string
|
|
|
|
* pointed to by name must be at least 256 bytes long. If mask==0, the
|
|
|
|
* match must be exact (ie, same as mask=-1ULL).
|
|
|
|
*/
|
|
|
|
int zap_value_search(objset_t *os, uint64_t zapobj,
|
|
|
|
uint64_t value, uint64_t mask, char *name);
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* Transfer all the entries from fromobj into intoobj. Only works on
|
|
|
|
* int_size=8 num_integers=1 values. Fails if there are any duplicated
|
|
|
|
* entries.
|
|
|
|
*/
|
|
|
|
int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
|
|
|
|
|
2010-05-28 13:45:14 -07:00
|
|
|
/* Same as zap_join, but set the values to 'value'. */
|
|
|
|
int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
|
|
|
|
uint64_t value, dmu_tx_t *tx);
|
|
|
|
|
|
|
|
/* Same as zap_join, but add together any duplicated entries. */
|
|
|
|
int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
|
|
|
|
dmu_tx_t *tx);
|
|
|
|
|
2008-12-03 12:09:06 -08:00
|
|
|
/*
|
|
|
|
* Manipulate entries where the name + value are the "same" (the name is
|
|
|
|
* a stringified version of the value).
|
|
|
|
*/
|
|
|
|
int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
|
|
|
|
int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
|
|
|
|
int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
|
2010-05-28 13:45:14 -07:00
|
|
|
int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
|
|
|
|
dmu_tx_t *tx);
|
|
|
|
|
|
|
|
/* Here the key is an int and the value is a different int. */
|
|
|
|
int zap_add_int_key(objset_t *os, uint64_t obj,
|
|
|
|
uint64_t key, uint64_t value, dmu_tx_t *tx);
|
2012-12-23 15:57:14 -08:00
|
|
|
int zap_update_int_key(objset_t *os, uint64_t obj,
|
|
|
|
uint64_t key, uint64_t value, dmu_tx_t *tx);
|
2010-05-28 13:45:14 -07:00
|
|
|
int zap_lookup_int_key(objset_t *os, uint64_t obj,
|
|
|
|
uint64_t key, uint64_t *valuep);
|
|
|
|
|
|
|
|
int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
|
|
|
|
dmu_tx_t *tx);
|
2008-12-03 12:09:06 -08:00
|
|
|
|
2008-11-20 12:01:55 -08:00
|
|
|
struct zap;
|
|
|
|
struct zap_leaf;
|
|
|
|
/*
 * Cursor state used to walk the attributes of a zapobj one at a time.
 * A cursor is set up with zap_cursor_init() or
 * zap_cursor_init_serialized() and must be released with
 * zap_cursor_fini().  Callers should not read or modify the members
 * directly; use the zap_cursor_*() routines instead.
 */
typedef struct zap_cursor {
|
|
|
|
/* This structure is opaque! */
|
|
|
|
objset_t *zc_objset;
|
|
|
|
struct zap *zc_zap;
|
|
|
|
struct zap_leaf *zc_leaf;
|
|
|
|
uint64_t zc_zapobj;
|
2010-05-28 13:45:14 -07:00
|
|
|
/* NOTE(review): presumably the cookie given to zap_cursor_init_serialized() -- confirm */
uint64_t zc_serialized;
|
2008-11-20 12:01:55 -08:00
|
|
|
uint64_t zc_hash;
|
|
|
|
uint32_t zc_cd;
|
|
|
|
} zap_cursor_t;
|
|
|
|
|
|
|
|
/*
 * Describes a single attribute of a zapobj.  This is the type that
 * zap_cursor_retrieve() takes for the attribute the cursor currently
 * points to.
 */
typedef struct {
|
|
|
|
/* NOTE(review): presumably the size in bytes of each integer of the value -- confirm */
int za_integer_length;
|
|
|
|
/*
|
|
|
|
* za_normalization_conflict will be set if there are additional
|
|
|
|
* entries with this normalized form (eg, "foo" and "Foo").
|
|
|
|
*/
|
|
|
|
boolean_t za_normalization_conflict;
|
|
|
|
/* NOTE(review): presumably the number of integers in the value -- confirm */
uint64_t za_num_integers;
|
|
|
|
uint64_t za_first_integer; /* no sign extension for <8byte ints */
|
2016-06-15 14:28:36 -07:00
|
|
|
/* attribute name; the buffer is sized to the ZAP_MAXNAMELEN limit */
char za_name[ZAP_MAXNAMELEN];
|
2008-11-20 12:01:55 -08:00
|
|
|
} zap_attribute_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The interface for listing all the attributes of a zapobj can be
|
|
|
|
* thought of as a cursor moving down a list of the attributes one by
|
|
|
|
* one. The cookie returned by the zap_cursor_serialize routine is
|
|
|
|
* persistent across system calls (and across reboot, even).
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize a zap cursor, pointing to the "first" attribute of the
|
|
|
|
* zapobj. You must _fini the cursor when you are done with it.
|
|
|
|
*/
|
|
|
|
void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
|
|
|
|
void zap_cursor_fini(zap_cursor_t *zc);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the attribute currently pointed to by the cursor. Returns
|
|
|
|
* ENOENT if at the end of the attributes.
|
|
|
|
*/
|
|
|
|
int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Advance the cursor to the next attribute.
|
|
|
|
*/
|
|
|
|
void zap_cursor_advance(zap_cursor_t *zc);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get a persistent cookie pointing to the current position of the zap
|
|
|
|
* cursor. The low 4 bits in the cookie are always zero, and thus can
|
|
|
|
* be used to differentiate a serialized cookie from a different type
|
|
|
|
* of value. The cookie will be less than 2^32 as long as there are
|
|
|
|
* fewer than 2^22 (4.2 million) entries in the zap object.
|
|
|
|
*/
|
|
|
|
uint64_t zap_cursor_serialize(zap_cursor_t *zc);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize a zap cursor pointing to the position recorded by
|
|
|
|
* zap_cursor_serialize (in the "serialized" argument). You can also
|
|
|
|
* use a "serialized" argument of 0 to start at the beginning of the
|
|
|
|
* zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
|
|
|
|
* zap_cursor_init(...).)
|
|
|
|
*/
|
|
|
|
void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
|
|
|
|
uint64_t zapobj, uint64_t serialized);
|
|
|
|
|
|
|
|
|
|
|
|
#define ZAP_HISTOGRAM_SIZE 10
|
|
|
|
|
|
|
|
typedef struct zap_stats {
|
|
|
|
/*
|
|
|
|
* Size of the pointer table (in number of entries).
|
|
|
|
* This is always a power of 2, or zero if it's a microzap.
|
|
|
|
* In general, it should be considerably greater than zs_num_leafs.
|
|
|
|
*/
|
|
|
|
uint64_t zs_ptrtbl_len;
|
|
|
|
|
|
|
|
uint64_t zs_blocksize; /* size of zap blocks */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The number of blocks used. Note that some blocks may be
|
|
|
|
* wasted because old ptrtbl's and large name/value blocks are
|
|
|
|
* not reused. (Although their space is reclaimed, we don't
|
|
|
|
* reuse those offsets in the object.)
|
|
|
|
*/
|
|
|
|
uint64_t zs_num_blocks;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pointer table values from zap_ptrtbl in the zap_phys_t
|
|
|
|
*/
|
|
|
|
uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */
|
|
|
|
uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */
|
|
|
|
uint64_t zs_ptrtbl_zt_blk; /* starting block number */
|
|
|
|
uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */
|
|
|
|
uint64_t zs_ptrtbl_zt_shift; /* bits to index it */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Values of the other members of the zap_phys_t
|
|
|
|
*/
|
|
|
|
uint64_t zs_block_type; /* ZBT_HEADER */
|
|
|
|
uint64_t zs_magic; /* ZAP_MAGIC */
|
|
|
|
uint64_t zs_num_leafs; /* The number of leaf blocks */
|
|
|
|
uint64_t zs_num_entries; /* The number of zap entries */
|
|
|
|
uint64_t zs_salt; /* salt to stir into hash function */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Histograms. For all histograms, the last index
|
|
|
|
* (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
|
|
|
|
* than what can be represented. For example
|
|
|
|
* zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
|
|
|
|
* of leafs with more than 45 entries.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* zs_leafs_with_2n_pointers[n] is the number of leafs with
|
|
|
|
* 2^n pointers to it.
|
|
|
|
*/
|
|
|
|
uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* zs_blocks_with_n5_entries[n] is the number of leafs with
|
|
|
|
* [n*5, (n+1)*5) entries. In the current implementation, there
|
|
|
|
* can be at most 55 entries in any block, but there may be
|
|
|
|
* fewer if the name or value is large, or the block is not
|
|
|
|
* completely full.
|
|
|
|
*/
|
|
|
|
uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* zs_blocks_n_tenths_full[n] is the number of leafs whose
|
|
|
|
* fullness is in the range [n/10, (n+1)/10).
|
|
|
|
*/
|
|
|
|
uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* zs_entries_using_n_chunks[n] is the number of entries which
|
|
|
|
* consume n 24-byte chunks. (Note, large names/values only use
|
|
|
|
* one chunk, but contribute to zs_num_blocks_large.)
|
|
|
|
*/
|
|
|
|
uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* zs_buckets_with_n_entries[n] is the number of buckets (each
|
|
|
|
* leaf has 64 buckets) with n entries.
|
|
|
|
* zs_buckets_with_n_entries[1] should be very close to
|
|
|
|
* zs_num_entries.
|
|
|
|
*/
|
|
|
|
uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
|
|
|
|
} zap_stats_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get statistics about a ZAP object. Note: you need to be aware of the
|
|
|
|
* internal implementation of the ZAP to correctly interpret some of the
|
|
|
|
* statistics. This interface shouldn't be relied on unless you really
|
|
|
|
* know what you're doing.
|
|
|
|
*/
|
|
|
|
int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _SYS_ZAP_H */
|