vfs: add VOP_STAT

The current scheme of calling VOP_GETATTR adds avoidable overhead.

An example with tmpfs doing fstat (ops/s):
before: 7488958
after:  7913833

Reviewed by:	kib (previous version)
Differential Revision:	https://reviews.freebsd.org/D25910
This commit is contained in:
Mateusz Guzik 2020-08-07 23:06:40 +00:00
parent 1e5d733503
commit 51ea7bea91
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=364044
9 changed files with 189 additions and 124 deletions

View File

@ -2308,7 +2308,8 @@ MLINKS+=vm_page_insert.9 vm_page_remove.9
MLINKS+=vm_page_wire.9 vm_page_unwire.9
MLINKS+=VOP_ACCESS.9 VOP_ACCESSX.9
MLINKS+=VOP_ATTRIB.9 VOP_GETATTR.9 \
VOP_ATTRIB.9 VOP_SETATTR.9
VOP_ATTRIB.9 VOP_SETATTR.9 \
VOP_ATTRIB.9 VOP_STAT.9
MLINKS+=VOP_CREATE.9 VOP_MKDIR.9 \
VOP_CREATE.9 VOP_MKNOD.9 \
VOP_CREATE.9 VOP_SYMLINK.9

View File

@ -28,7 +28,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd August 29, 2008
.Dd August 8, 2020
.Dt VOP_ATTRIB 9
.Os
.Sh NAME
@ -42,19 +42,49 @@
.Fn VOP_GETATTR "struct vnode *vp" "struct vattr *vap" "struct ucred *cred"
.Ft int
.Fn VOP_SETATTR "struct vnode *vp" "struct vattr *vap" "struct ucred *cred"
.Ft int
.Fn VOP_STAT "struct vnode *vp" "struct stat *sb" "struct ucred *active_cred" \
"struct ucred *file_cred" "struct thread *td"
.Sh DESCRIPTION
These entry points manipulate various attributes of a file or directory,
including file permissions, owner, group, size,
access time and modification time.
.Pp
The arguments are:
.Fn VOP_STAT
returns data in a format suitable for the
.Xr stat 2
system call and by default is implemented as a wrapper around
.Fn VOP_GETATTR .
Filesystems may want to implement their own variant for performance reasons.
.Pp
For
.Fn VOP_GETATTR
and
.Fn VOP_SETATTR
the arguments are:
.Bl -tag -width cred
.It Fa vp
The vnode of the file.
.It Fa vap
The attributes of the file.
.It Fa cred
The user credentials of the calling process.
The user credentials of the calling thread.
.El
.Pp
For
.Fn VOP_STAT
the arguments are:
.Bl -tag -width active_cred
.It Fa vp
The vnode of the file.
.It Fa sb
The stat structure to be filled in with the attributes of the file.
.It Fa active_cred
The user credentials of the calling thread.
.It Fa file_cred
The credentials installed on the file description pointing to the vnode or
.Dv NOCRED .
.It Fa td
The calling thread.
.El
.Pp
Attributes which are not being modified by
@ -67,8 +97,11 @@ the contents of
.Fa *vap
prior to setting specific values.
.Sh LOCKS
Both
.Fn VOP_GETATTR
expects the vnode to be locked on entry and will leave the vnode locked on
and
.Fn VOP_STAT
expect the vnode to be locked on entry and will leave the vnode locked on
return.
The lock type can be either shared or exclusive.
.Pp
@ -84,6 +117,10 @@ otherwise an appropriate error is returned.
.Fn VOP_SETATTR
returns zero if the attributes were changed successfully, otherwise an
appropriate error is returned.
.Fn VOP_STAT
returns 0 if it was able to retrieve the attribute data via
.Fa *sb ,
otherwise an appropriate error is returned.
.Sh ERRORS
.Bl -tag -width Er
.It Bq Er EPERM

View File

@ -1691,7 +1691,7 @@ linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
vp = filp->f_vnode;
vn_lock(vp, LK_SHARED | LK_RETRY);
error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td);
VOP_UNLOCK(vp);
return (error);

View File

@ -57,6 +57,9 @@ __FBSDID("$FreeBSD$");
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <security/audit/audit.h>
#include <sys/priv.h>
#include <security/mac/mac_framework.h>
@ -87,6 +90,7 @@ static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap);
static int vop_stdfdatasync(struct vop_fdatasync_args *ap);
static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
static int vop_stdstat(struct vop_stat_args *ap);
/*
* This vnode table stores what we want to do if the filesystem doesn't
@ -114,6 +118,7 @@ struct vop_vector default_vnodeops = {
.vop_bmap = vop_stdbmap,
.vop_close = VOP_NULL,
.vop_fsync = VOP_NULL,
.vop_stat = vop_stdstat,
.vop_fdatasync = vop_stdfdatasync,
.vop_getpages = vop_stdgetpages,
.vop_getpages_async = vop_stdgetpages_async,
@ -1461,3 +1466,111 @@ vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a)
sigallowstop(prev_stops);
return (rc);
}
/*
 * Default VOP_STAT implementation: obtain the attributes with VOP_GETATTR()
 * and convert them into the struct stat pointed to by a->a_sb.
 *
 * vop_stat_helper_pre() runs the audit and MAC checks and, when they pass,
 * zeroes *a->a_sb.  vop_stat_helper_post() clears st_gen when
 * priv_check_cred_vfs_generation() returns non-zero and passes the error
 * value through.
 *
 * Returns 0 on success, the error from the MAC check or VOP_GETATTR(),
 * EBADF for an unrecognized vnode type, or EOVERFLOW if the file size
 * exceeds OFF_MAX.
 */
static int
vop_stdstat(struct vop_stat_args *a)
{
	struct vattr vattr;
	struct vattr *vap;
	struct vnode *vp;
	struct stat *sb;
	int error;
	u_short mode;

	vp = a->a_vp;
	sb = a->a_sb;

	/* On success this leaves *sb fully zeroed. */
	error = vop_stat_helper_pre(a);
	if (error != 0)
		return (error);

	vap = &vattr;

	/*
	 * Initialize defaults for new and unusual fields, so that file
	 * systems which don't support these fields don't need to know
	 * about them.
	 */
	vap->va_birthtime.tv_sec = -1;
	vap->va_birthtime.tv_nsec = 0;
	vap->va_fsid = VNOVAL;
	vap->va_rdev = NODEV;

	error = VOP_GETATTR(vp, vap, a->a_active_cred);
	if (error != 0)
		goto out;

	/*
	 * The spare stat fields need no zeroing here: vop_stat_helper_pre()
	 * already cleared all of *sb and nothing has written to it since.
	 */

	/*
	 * Copy from vattr table
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		error = EBADF;
		goto out;
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX) {
		error = EOVERFLOW;
		goto out;
	}
	sb->st_size = vap->va_size;
	sb->st_atim.tv_sec = vap->va_atime.tv_sec;
	sb->st_atim.tv_nsec = vap->va_atime.tv_nsec;
	sb->st_mtim.tv_sec = vap->va_mtime.tv_sec;
	sb->st_mtim.tv_nsec = vap->va_mtime.tv_nsec;
	sb->st_ctim.tv_sec = vap->va_ctime.tv_sec;
	sb->st_ctim.tv_nsec = vap->va_ctime.tv_nsec;
	sb->st_birthtim.tv_sec = vap->va_birthtime.tv_sec;
	sb->st_birthtim.tv_nsec = vap->va_birthtime.tv_nsec;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 *   "a filesystem-specific preferred I/O block size for this
	 *    object.  In some filesystem types, this may vary from file
	 *    to file"
	 * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
	 */
	sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
	sb->st_flags = vap->va_flags;
	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	sb->st_gen = vap->va_gen;
out:
	/* The post helper decides whether st_gen may be exposed. */
	return (vop_stat_helper_post(a, error));
}

View File

@ -1867,7 +1867,7 @@ kern_funlinkat(struct thread *td, int dfd, const char *path, int fd,
if (vp->v_type == VDIR && oldinum == 0) {
error = EPERM; /* POSIX */
} else if (oldinum != 0 &&
((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
sb.st_ino != oldinum) {
error = EIDRM; /* Identifier removed */
} else if (fp != NULL && fp->f_vnode != vp) {
@ -2381,7 +2381,7 @@ kern_statat(struct thread *td, int flag, int fd, const char *path,
if ((error = namei(&nd)) != 0)
return (error);
error = vn_stat(nd.ni_vp, sbp, td->td_ucred, NOCRED, td);
error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED, td);
if (error == 0) {
SDT_PROBE2(vfs, , stat, mode, path, sbp->st_mode);
if (S_ISREG(sbp->st_mode))
@ -4566,7 +4566,7 @@ kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
vfs_unbusy(mp);
if (error != 0)
return (error);
error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td);
vput(vp);
return (error);
}

View File

@ -1455,123 +1455,12 @@ vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
int error;
vn_lock(vp, LK_SHARED | LK_RETRY);
error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td);
VOP_UNLOCK(vp);
return (error);
}
/*
* Stat a vnode; implementation for the stat syscall
*/
int
vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
struct ucred *file_cred, struct thread *td)
{
struct vattr vattr;
struct vattr *vap;
int error;
u_short mode;
AUDIT_ARG_VNODE1(vp);
#ifdef MAC
error = mac_vnode_check_stat(active_cred, file_cred, vp);
if (error)
return (error);
#endif
vap = &vattr;
/*
* Initialize defaults for new and unusual fields, so that file
* systems which don't support these fields don't need to know
* about them.
*/
vap->va_birthtime.tv_sec = -1;
vap->va_birthtime.tv_nsec = 0;
vap->va_fsid = VNOVAL;
vap->va_rdev = NODEV;
error = VOP_GETATTR(vp, vap, active_cred);
if (error)
return (error);
/*
* Zero the spare stat fields
*/
bzero(sb, sizeof *sb);
/*
* Copy from vattr table
*/
if (vap->va_fsid != VNOVAL)
sb->st_dev = vap->va_fsid;
else
sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
sb->st_ino = vap->va_fileid;
mode = vap->va_mode;
switch (vap->va_type) {
case VREG:
mode |= S_IFREG;
break;
case VDIR:
mode |= S_IFDIR;
break;
case VBLK:
mode |= S_IFBLK;
break;
case VCHR:
mode |= S_IFCHR;
break;
case VLNK:
mode |= S_IFLNK;
break;
case VSOCK:
mode |= S_IFSOCK;
break;
case VFIFO:
mode |= S_IFIFO;
break;
default:
return (EBADF);
}
sb->st_mode = mode;
sb->st_nlink = vap->va_nlink;
sb->st_uid = vap->va_uid;
sb->st_gid = vap->va_gid;
sb->st_rdev = vap->va_rdev;
if (vap->va_size > OFF_MAX)
return (EOVERFLOW);
sb->st_size = vap->va_size;
sb->st_atim.tv_sec = vap->va_atime.tv_sec;
sb->st_atim.tv_nsec = vap->va_atime.tv_nsec;
sb->st_mtim.tv_sec = vap->va_mtime.tv_sec;
sb->st_mtim.tv_nsec = vap->va_mtime.tv_nsec;
sb->st_ctim.tv_sec = vap->va_ctime.tv_sec;
sb->st_ctim.tv_nsec = vap->va_ctime.tv_nsec;
sb->st_birthtim.tv_sec = vap->va_birthtime.tv_sec;
sb->st_birthtim.tv_nsec = vap->va_birthtime.tv_nsec;
/*
* According to www.opengroup.org, the meaning of st_blksize is
* "a filesystem-specific preferred I/O block size for this
* object. In some filesystem types, this may vary from file
* to file"
* Use minimum/default of PAGE_SIZE (e.g. for VCHR).
*/
sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
sb->st_flags = vap->va_flags;
if (priv_check_cred_vfs_generation(td->td_ucred))
sb->st_gen = 0;
else
sb->st_gen = vap->va_gen;
sb->st_blocks = vap->va_bytes / S_BLKSIZE;
return (0);
}
/*
* File table vnode ioctl routine.
*/

View File

@ -177,6 +177,17 @@ vop_accessx {
};
# vop_stat - return the attributes of vnode vp in the stat structure sb.
# Takes the active credentials, the file credentials and the calling
# thread as inputs.
# NOTE(review): "L L L" presumably declares vp locked on entry and still
# locked on both successful and failed return -- confirm against the
# locking-protocol description at the top of this file.
%% stat vp L L L
vop_stat {
IN struct vnode *vp;
OUT struct stat *sb;
IN struct ucred *active_cred;
IN struct ucred *file_cred;
IN struct thread *td;
};
%% getattr vp L L L
vop_getattr {

View File

@ -854,7 +854,7 @@ audit_arg_upath2_canon(char *upath)
* It is assumed that the caller will hold any vnode locks necessary to
* perform a VOP_GETATTR() on the passed vnode.
*
* XXX: The attr code is very similar to vfs_vnops.c:vn_stat(), but always
* XXX: The attr code is very similar to vfs_default.c:vop_stdstat(), but always
* provides access to the generation number as we need that to construct the
* BSM file ID.
*

View File

@ -737,8 +737,6 @@ int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base,
struct thread *td);
int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio,
struct thread *td);
int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
struct ucred *file_cred, struct thread *td);
int vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
int vn_start_secondary_write(struct vnode *vp, struct mount **mpp,
int flags);
@ -893,6 +891,22 @@ void vop_need_inactive_debugpost(void *a, int rc);
void vop_rename_fail(struct vop_rename_args *ap);
/*
 * Common prologue for VOP_STAT implementations.
 *
 * Records the vnode for audit, runs the MAC stat check and, if the check
 * passes, zeroes the caller's stat buffer.  Evaluates to the MAC check's
 * return value (0 on success).
 *
 * "ap" is parenthesized at every use so that expression arguments such as
 * "&args" expand correctly.  It is still evaluated more than once, so it
 * must not have side effects.
 */
#define	vop_stat_helper_pre(ap)	({						\
	int _error;								\
	AUDIT_ARG_VNODE1((ap)->a_vp);						\
	_error = mac_vnode_check_stat((ap)->a_active_cred,			\
	    (ap)->a_file_cred, (ap)->a_vp);					\
	if (__predict_true(_error == 0))					\
		bzero((ap)->a_sb, sizeof(*(ap)->a_sb));				\
	_error;									\
})
/*
 * Common epilogue for VOP_STAT implementations.
 *
 * Clears st_gen in the caller's stat buffer when
 * priv_check_cred_vfs_generation() returns non-zero for the calling
 * thread's credentials, then evaluates to "error" unchanged.
 *
 * "ap" is parenthesized at every use so that expression arguments such as
 * "&args" expand correctly.  It is still evaluated more than once, so it
 * must not have side effects.
 */
#define	vop_stat_helper_post(ap, error)	({					\
	int _error = (error);							\
	if (priv_check_cred_vfs_generation((ap)->a_td->td_ucred))		\
		(ap)->a_sb->st_gen = 0;						\
	_error;									\
})
#define VOP_WRITE_PRE(ap) \
struct vattr va; \
int error; \