Add d_off support for multiple filesystems.

The d_off field has been added to the dirent structure recently.
Currently filesystems don't support this feature.  Support has been
added and tested for zfs, ufs, ext2fs, fdescfs, msdosfs and unionfs.
A stub implementation is available for cd9660, nandfs, udf and
pseudofs but hasn't been tested.

Motivation for this feature: our use case is a userspace NFS server
(nfs-ganesha) on ZFS.  At the moment we cache directory entry offsets by
calling lseek once per entry; with this patch we can get the offset
directly from getdirentries(2) calls, which provides a significant
speedup.

Submitted by:	Jack Halford <jack@gandi.net>
Reviewed by:	mckusick, pfg, rmacklem (previous versions)
Sponsored by:	Gandi.net
MFC after:	1 week
Differential revision:	https://reviews.freebsd.org/D17917
This commit is contained in:
Konstantin Belousov 2018-11-14 14:18:35 +00:00
parent d5aef6d6ca
commit 1c4ca77890
13 changed files with 48 additions and 6 deletions

View File

@ -28,7 +28,7 @@
.\" @(#)getdirentries.2 8.2 (Berkeley) 5/3/95
.\" $FreeBSD$
.\"
.Dd May 28, 2017
.Dd November 14, 2018
.Dt GETDIRENTRIES 2
.Os
.Sh NAME
@ -88,6 +88,11 @@ Files that are linked by hard links (see
have the same
.Fa d_fileno .
The
.Fa d_off
field returns a cookie which can be used with
.Xr lseek 2
to position the directory descriptor to the next entry.
The
.Fa d_reclen
entry is the length, in bytes, of the directory record.
The
@ -140,8 +145,17 @@ a value returned in the location pointed to by
.Fa basep
.Po Fn getdirentries
only
.Pc
.Pc ,
a value returned in the
.Fa d_off
field,
or zero.
.Sh IMPLEMENTATION NOTES
The
.Fa d_off
field is used by NFS servers as a readdir cookie.
These cookies can be cached, allowing directory entries to be read at a
specific offset on demand.
.Sh RETURN VALUES
If successful, the number of bytes actually transferred is returned.
Otherwise, -1 is returned and the global variable

View File

@ -28,7 +28,7 @@
.\" @(#)dir.5 8.3 (Berkeley) 4/19/94
.\" $FreeBSD$
.\"
.Dd June 20, 2018
.Dd November 14, 2018
.Dt DIR 5
.Os
.Sh NAME
@ -101,7 +101,7 @@ The directory entry format is defined in the file
struct dirent {
ino_t d_fileno; /* file number of entry */
off_t d_off; /* directory offset of entry */
off_t d_off; /* directory offset of the next entry */
__uint16_t d_reclen; /* length of this record */
__uint8_t d_type; /* file type, see below */
__uint8_t d_namlen; /* length of string in d_name */

View File

@ -1097,6 +1097,8 @@ zfsctl_snapdir_readdir(ap)
strcpy(entry.d_name, snapname);
entry.d_namlen = strlen(entry.d_name);
entry.d_reclen = sizeof(entry);
/* NOTE: d_off is the offset for the *next* entry. */
entry.d_off = cookie + dots_offset;
error = vfs_read_dirent(ap, &entry, uio->uio_offset);
if (error != 0) {
if (error == ENAMETOOLONG)

View File

@ -2529,8 +2529,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
*/
eodp->ed_ino = objnum;
eodp->ed_reclen = reclen;
/* NOTE: ed_off is the offset for the *next* entry */
next = &(eodp->ed_off);
/* NOTE: ed_off is the offset for the *next* entry. */
next = &eodp->ed_off;
eodp->ed_eflags = zap.za_normalization_conflict ?
ED_CASE_CONFLICT : 0;
(void) strncpy(eodp->ed_name, zap.za_name,
@ -2543,6 +2543,8 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
odp->d_ino = objnum;
odp->d_reclen = reclen;
odp->d_namlen = strlen(zap.za_name);
/* NOTE: d_off is the offset for the *next* entry. */
next = &odp->d_off;
(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
odp->d_type = type;
odp = (dirent64_t *)((intptr_t)odp + reclen);
@ -2567,6 +2569,9 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_lon
offset += 1;
}
/* Fill the offset right after advancing the cursor. */
if (next != NULL)
*next = offset;
if (cooks != NULL) {
*cooks++ = offset;
ncooks--;

View File

@ -576,6 +576,8 @@ cd9660_readdir(ap)
entryoffsetinblock;
idp->curroff += reclen;
/* NOTE: d_off is the offset of the *next* entry. */
idp->current.d_off = idp->curroff;
switch (imp->iso_ftype) {
case ISO_FTYPE_RRIP:

View File

@ -1381,6 +1381,8 @@ devfs_readdir(struct vop_readdir_args *ap)
if (dp->d_reclen > uio->uio_resid)
break;
dp->d_fileno = de->de_inode;
/* NOTE: d_off is the offset for the *next* entry. */
dp->d_off = off + dp->d_reclen;
if (off >= uio->uio_offset) {
error = vfs_read_dirent(ap, dp, off);
if (error)

View File

@ -224,6 +224,8 @@ ext2_readdir(struct vop_readdir_args *ap)
dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
bcopy(dp->e2d_name, dstdp.d_name, dstdp.d_namlen);
dstdp.d_name[dstdp.d_namlen] = '\0';
/* NOTE: d_off is the offset of the *next* entry. */
dstdp.d_off = offset + dp->e2d_reclen;
if (dstdp.d_reclen > uio->uio_resid) {
if (uio->uio_resid == startresid)
error = EINVAL;

View File

@ -574,6 +574,8 @@ fdesc_readdir(struct vop_readdir_args *ap)
dp->d_fileno = i + FD_DESC;
break;
}
/* NOTE: d_off is the offset of the *next* entry. */
dp->d_off = UIO_MX * (i + 1);
if (dp->d_namlen != 0) {
/*
* And ship to userland

View File

@ -1558,6 +1558,8 @@ msdosfs_readdir(struct vop_readdir_args *ap)
break;
}
dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf);
/* NOTE: d_off is the offset of the *next* entry. */
dirbuf.d_off = offset + sizeof(struct direntry);
if (uio->uio_resid < dirbuf.d_reclen)
goto out;
error = uiomove(&dirbuf, dirbuf.d_reclen, uio);
@ -1681,6 +1683,8 @@ msdosfs_readdir(struct vop_readdir_args *ap)
mbnambuf_flush(&nb, &dirbuf);
chksum = -1;
dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf);
/* NOTE: d_off is the offset of the *next* entry. */
dirbuf.d_off = offset + sizeof(struct direntry);
if (uio->uio_resid < dirbuf.d_reclen) {
brelse(bp);
goto out;

View File

@ -1233,6 +1233,8 @@ nandfs_readdir(struct vop_readdir_args *ap)
dirent.d_namlen = name_len;
strncpy(dirent.d_name, ndirent->name, name_len);
dirent.d_reclen = GENERIC_DIRSIZ(&dirent);
/* NOTE: d_off is the offset of the *next* entry. */
dirent.d_off = diroffset + ndirent->rec_len;
DPRINTF(READDIR, ("copying `%*.*s`\n", name_len,
name_len, dirent.d_name));
}

View File

@ -830,6 +830,8 @@ pfs_readdir(struct vop_readdir_args *va)
pfsent->entry.d_name[i] = pn->pn_name[i];
pfsent->entry.d_name[i] = 0;
pfsent->entry.d_namlen = i;
/* NOTE: d_off is the offset of the *next* entry. */
pfsent->entry.d_off = offset + PFS_DELEN;
switch (pn->pn_type) {
case pfstype_procdir:
KASSERT(p != NULL,

View File

@ -846,6 +846,7 @@ udf_readdir(struct vop_readdir_args *a)
dir.d_name[1] = '\0';
dir.d_namlen = 1;
dir.d_reclen = GENERIC_DIRSIZ(&dir);
dir.d_off = 1;
uiodir.dirent = &dir;
error = udf_uiodir(&uiodir, dir.d_reclen, uio, 1);
if (error)
@ -858,6 +859,7 @@ udf_readdir(struct vop_readdir_args *a)
dir.d_name[2] = '\0';
dir.d_namlen = 2;
dir.d_reclen = GENERIC_DIRSIZ(&dir);
dir.d_off = 2;
uiodir.dirent = &dir;
error = udf_uiodir(&uiodir, dir.d_reclen, uio, 2);
} else {
@ -867,6 +869,7 @@ udf_readdir(struct vop_readdir_args *a)
dir.d_type = (fid->file_char & UDF_FILE_CHAR_DIR) ?
DT_DIR : DT_UNKNOWN;
dir.d_reclen = GENERIC_DIRSIZ(&dir);
dir.d_off = ds->this_off;
uiodir.dirent = &dir;
error = udf_uiodir(&uiodir, dir.d_reclen, uio,
ds->this_off);

View File

@ -2218,6 +2218,8 @@ ufs_readdir(ap)
dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
dstdp.d_name[dstdp.d_namlen] = '\0';
/* NOTE: d_off is the offset of the *next* entry. */
dstdp.d_off = offset + dp->d_reclen;
if (dstdp.d_reclen > uio->uio_resid) {
if (uio->uio_resid == startresid)
error = EINVAL;