vfs: lockless lookup
Provides full scalability as long as all visited filesystems support the lookup and the terminal vnodes are different. Inner workings are explained in the comment above cache_fplookup. Capabilities and fd-relative lookups are not supported and result in an immediate fallback to the regular code. Symlinks, ".." components in the path, mount points without support for lockless lookup, and mismatched counters result in an attempt to get a reference to the directory vnode and continue in the regular lookup. If that fails, the entire operation is aborted and the regular lookup starts from scratch. However, care is taken that data is not copied from userspace again. Sample benchmark: incremental -j 104 bzImage on tmpfs: before: 142.96s user 1025.63s system 4924% cpu 23.731 total; after: 147.36s user 313.40s system 3216% cpu 14.326 total. Sample microbenchmark: access calls to separate files on tmpfs, 104 workers, ops/s: before: 2165816; after: 151216530. Reviewed by: kib Tested by: pho (in a patchset) Differential Revision: https://reviews.freebsd.org/D25578
This commit is contained in:
parent
07d2145a17
commit
c42b77e694
@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/namei.h>
|
||||
#include <sys/proc.h>
|
||||
#include <sys/rwlock.h>
|
||||
#include <sys/seqc.h>
|
||||
#include <sys/sdt.h>
|
||||
#include <sys/smr.h>
|
||||
#include <sys/smp.h>
|
||||
@ -67,6 +68,11 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/ktrace.h>
|
||||
#endif
|
||||
|
||||
#include <sys/capsicum.h>
|
||||
|
||||
#include <security/audit/audit.h>
|
||||
#include <security/mac/mac_framework.h>
|
||||
|
||||
#ifdef DDB
|
||||
#include <ddb/ddb.h>
|
||||
#endif
|
||||
@ -100,6 +106,10 @@ SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
|
||||
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
|
||||
"char *");
|
||||
|
||||
SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
|
||||
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
|
||||
SDT_PROBE_DECLARE(vfs, namei, lookup, return);
|
||||
|
||||
/*
|
||||
* This structure describes the elements in the cache of recent
|
||||
* names looked up by namei.
|
||||
@ -2835,3 +2845,859 @@ DB_SHOW_COMMAND(vpath, db_show_vpath)
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* Pathname buffer zone, defined in vfs_lookup.c. */
extern uma_zone_t namei_zone;

/* Runtime knob allowing the lockless (fast path) lookup to be disabled. */
static bool __read_frequently cache_fast_lookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
    &cache_fast_lookup, 0, "");

/*
 * Internal marker returned by the fast path helpers to denote "the fast
 * path cannot complete this lookup"; distinct from any real errno value.
 */
#define CACHE_FPL_FAILED	-2020
|
||||
|
||||
/*
 * Free the pathname buffer once the fast path has definitively finished
 * with it (lookup fully handled and the caller did not ask for the name).
 */
static void
cache_fpl_cleanup_cnp(struct componentname *cnp)
{

	uma_zfree(namei_zone, cnp->cn_pnbuf);
#ifdef DIAGNOSTIC
	/* Poison the pointers to catch use-after-free of the buffer. */
	cnp->cn_pnbuf = NULL;
	cnp->cn_nameptr = NULL;
#endif
}
|
||||
|
||||
static void
|
||||
cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
|
||||
{
|
||||
struct componentname *cnp;
|
||||
|
||||
cnp = &ndp->ni_cnd;
|
||||
while (*(cnp->cn_nameptr) == '/') {
|
||||
cnp->cn_nameptr++;
|
||||
ndp->ni_pathlen--;
|
||||
}
|
||||
|
||||
*dpp = ndp->ni_rootdir;
|
||||
}
|
||||
|
||||
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 */
struct nameidata_saved {
	int cn_flags;		/* saved ni_cnd.cn_flags */
	long cn_namelen;	/* saved ni_cnd.cn_namelen */
	char *cn_nameptr;	/* saved ni_cnd.cn_nameptr */
	size_t ni_pathlen;	/* saved ni_pathlen */
};
|
||||
|
||||
/*
 * Fast path lookup state, carried across all helper routines.
 */
struct cache_fpl {
	int line;			/* __LINE__ where status was set (debug) */
	enum cache_fpl_status status;	/* UNSET/PARTIAL/ABORTED/HANDLED */
	bool in_smr;			/* inside vfs_smr section? */
	struct nameidata *ndp;		/* lookup request being serviced */
	struct nameidata_saved snd;	/* checkpoint for partial fallback */
	struct componentname *cnp;	/* shorthand for &ndp->ni_cnd */
	struct vnode *dvp;		/* current directory vnode */
	seqc_t dvp_seqc;		/* dvp sequence counter at read time */
	struct vnode *tvp;		/* candidate target vnode */
	seqc_t tvp_seqc;		/* tvp sequence counter at read time */
	struct pwd *pwd;		/* working directories used for the walk */
};
|
||||
|
||||
static void
|
||||
cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
|
||||
{
|
||||
|
||||
snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
|
||||
snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
|
||||
snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
|
||||
snd->ni_pathlen = fpl->ndp->ni_pathlen;
|
||||
}
|
||||
|
||||
static void
|
||||
cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
|
||||
{
|
||||
|
||||
fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
|
||||
fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
|
||||
fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
|
||||
fpl->ndp->ni_pathlen = snd->ni_pathlen;
|
||||
}
|
||||
|
||||
/*
 * SMR section bookkeeping. fpl->in_smr mirrors whether we are inside vfs_smr
 * protection so that shared error paths know whether an exit is still due.
 */
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	VFS_SMR_ASSERT_ENTERED();				\
})
#define cache_fpl_smr_assert_not_entered(fpl) ({		\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	VFS_SMR_ASSERT_NOT_ENTERED();				\
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif

#define cache_fpl_smr_enter(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

#define cache_fpl_smr_exit(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	vfs_smr_exit();						\
	_fpl->in_smr = false;					\
})
|
||||
|
||||
/*
 * Mark the lookup as aborted: regular lookup must restart from scratch.
 *
 * A status may already be set when a PARTIAL result is being converted to
 * an abort (e.g., referencing the directory vnode failed); any other
 * pre-set status is a bug.
 */
static int
cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
{

	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
		    ("%s: converting to abort from %d at %d, set at %d\n",
		    __func__, fpl->status, line, fpl->line));
	}
	fpl->status = CACHE_FPL_STATUS_ABORTED;
	fpl->line = line;
	return (CACHE_FPL_FAILED);
}

#define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
|
||||
|
||||
/*
 * Mark the lookup as partial: the fast path got stuck (symlink, "..",
 * unsupported mount, etc.) but regular lookup can continue from the current
 * directory vnode; see cache_fplookup_partial_setup. Must be called while
 * still inside the SMR section.
 */
static int
cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
{

	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
	    ("%s: setting to partial at %d, but already set to %d at %d\n",
	    __func__, line, fpl->status, fpl->line));
	cache_fpl_smr_assert_entered(fpl);
	fpl->status = CACHE_FPL_STATUS_PARTIAL;
	fpl->line = line;
	return (CACHE_FPL_FAILED);
}

#define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
|
||||
|
||||
/*
 * Mark the lookup as fully handled by the fast path with the given errno
 * value (0 on success). The SMR section must have been exited already and
 * the value must be a real errno, not the internal failure marker.
 */
static int
cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
{

	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
	    ("%s: setting to handled at %d, but already set to %d at %d\n",
	    __func__, line, fpl->status, fpl->line));
	cache_fpl_smr_assert_not_entered(fpl);
	MPASS(error != CACHE_FPL_FAILED);
	fpl->status = CACHE_FPL_STATUS_HANDLED;
	fpl->line = line;
	return (error);
}

#define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)
|
||||
|
||||
/* componentname flags the fast path knows how to honor; anything else aborts. */
#define CACHE_FPL_SUPPORTED_CN_FLAGS \
	(LOCKLEAF | FOLLOW | LOCKSHARED | SAVENAME | ISOPEN | AUDITVNODE1)
|
||||
|
||||
/*
 * Decide whether this nameidata is eligible for the lockless lookup at all.
 * Anything the fast path does not implement (unsupported flags, non-LOOKUP
 * operations, fd-relative or capability-mode lookups, auditing, explicit
 * start directories) results in an immediate abort so the regular code
 * runs instead.
 */
static bool
cache_can_fplookup(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct thread *td;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	td = cnp->cn_thread;

	if (!cache_fast_lookup) {
		cache_fpl_aborted(fpl);
		return (false);
	}
#ifdef MAC
	/* MAC lookup hooks may sleep/lock; incompatible with SMR walking. */
	if (mac_vnode_check_lookup_enabled()) {
		cache_fpl_aborted(fpl);
		return (false);
	}
#endif
	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if ((cnp->cn_flags & LOCKLEAF) == 0) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (cnp->cn_nameiop != LOOKUP) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (ndp->ni_dirfd != AT_FDCWD) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (IN_CAPABILITY_MODE(td)) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (AUDITING_TD(td)) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (ndp->ni_startdir != NULL) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	return (true);
}
|
||||
|
||||
static bool
|
||||
cache_fplookup_vnode_supported(struct vnode *vp)
|
||||
{
|
||||
|
||||
return (vp->v_type != VLNK);
|
||||
}
|
||||
|
||||
/*
|
||||
* The target vnode is not supported, prepare for the slow path to take over.
|
||||
*/
|
||||
static int
|
||||
cache_fplookup_partial_setup(struct cache_fpl *fpl)
|
||||
{
|
||||
struct componentname *cnp;
|
||||
enum vgetstate dvs;
|
||||
struct vnode *dvp;
|
||||
struct pwd *pwd;
|
||||
seqc_t dvp_seqc;
|
||||
|
||||
cnp = fpl->cnp;
|
||||
dvp = fpl->dvp;
|
||||
dvp_seqc = fpl->dvp_seqc;
|
||||
|
||||
dvs = vget_prep_smr(dvp);
|
||||
if (dvs == VGET_NONE) {
|
||||
cache_fpl_smr_exit(fpl);
|
||||
return (cache_fpl_aborted(fpl));
|
||||
}
|
||||
|
||||
cache_fpl_smr_exit(fpl);
|
||||
|
||||
vget_finish_ref(dvp, dvs);
|
||||
if (!vn_seqc_consistent(dvp, dvp_seqc)) {
|
||||
vrele(dvp);
|
||||
return (cache_fpl_aborted(fpl));
|
||||
}
|
||||
|
||||
pwd = pwd_hold(curthread);
|
||||
if (fpl->pwd != pwd) {
|
||||
vrele(dvp);
|
||||
pwd_drop(pwd);
|
||||
return (cache_fpl_aborted(fpl));
|
||||
}
|
||||
|
||||
fpl->ndp->ni_startdir = dvp;
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
 * Terminal step of a fully cached walk: reference and lock the target vnode
 * while proving via sequence counters that neither the parent nor the target
 * changed in the meantime.
 *
 * Ordering matters: dvp consistency is checked before leaving the SMR
 * section, tvp consistency only after the lock is held (locking can sleep).
 */
static int
cache_fplookup_final(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	enum vgetstate tvs;
	struct vnode *dvp, *tvp;
	seqc_t dvp_seqc, tvp_seqc;
	int error;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;
	tvp = fpl->tvp;
	tvp_seqc = fpl->tvp_seqc;

	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
	MPASS((cnp->cn_flags & LOCKLEAF) != 0);

	tvs = vget_prep_smr(tvp);
	if (tvs == VGET_NONE) {
		/* tvp is being freed; let the slow path redo the last step. */
		return (cache_fpl_partial(fpl));
	}

	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		cache_fpl_smr_exit(fpl);
		vget_abort(tvp, tvs);
		return (cache_fpl_aborted(fpl));
	}

	cache_fpl_smr_exit(fpl);

	error = vget_finish(tvp, cnp->cn_lkflags, tvs);
	if (error != 0) {
		return (cache_fpl_aborted(fpl));
	}

	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
		/* tvp changed while we were acquiring the lock. */
		vput(tvp);
		return (cache_fpl_aborted(fpl));
	}

	return (cache_fpl_handled(fpl, 0));
}
|
||||
|
||||
/*
 * Resolve the current path component against the namecache, storing the
 * result in fpl->tvp/tvp_seqc.
 *
 * Returns 0 on a usable hit, ENOENT (handled) on a hot negative hit, or
 * falls back via partial/aborted for everything the fast path cannot do.
 * Entry fields are loaded before the validity check on purpose: if the
 * entry is concurrently invalidated we bail to the slow path rather than
 * act on torn state.
 */
static int
cache_fplookup_next(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	struct namecache *ncp;
	struct negstate *negstate;
	struct vnode *dvp, *tvp;
	u_char nc_flag;
	uint32_t hash;
	bool neg_hot;

	cnp = fpl->cnp;
	dvp = fpl->dvp;

	/* "." resolves to the directory itself; no cache walk needed. */
	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
		fpl->tvp = dvp;
		fpl->tvp_seqc = vn_seqc_read_any(dvp);
		if (seqc_in_modify(fpl->tvp_seqc)) {
			return (cache_fpl_aborted(fpl));
		}
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);

	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/*
	 * If there is no entry we have to punt to the slow path to perform
	 * actual lookup. Should there be nothing with this name a negative
	 * entry will be created.
	 */
	if (__predict_false(ncp == NULL)) {
		return (cache_fpl_partial(fpl));
	}

	tvp = atomic_load_ptr(&ncp->nc_vp);
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if ((nc_flag & NCF_NEGATIVE) != 0) {
		negstate = NCP2NEGSTATE(ncp);
		neg_hot = ((negstate->neg_flag & NEG_HOT) != 0);
		/* Re-check validity only after all fields have been read. */
		if (__predict_false(cache_ncp_invalid(ncp))) {
			return (cache_fpl_partial(fpl));
		}
		/* Whiteouts need slow path handling. */
		if (__predict_false((nc_flag & NCF_WHITE) != 0)) {
			return (cache_fpl_partial(fpl));
		}
		if (!neg_hot) {
			/*
			 * TODO
			 * Promoting to hot negative requires locks, thus is
			 * left not yet supported for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
		    ncp->nc_name);
		counter_u64_add(numneghits, 1);
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENOENT));
	}

	if (__predict_false(cache_ncp_invalid(ncp))) {
		return (cache_fpl_partial(fpl));
	}

	fpl->tvp = tvp;
	fpl->tvp_seqc = vn_seqc_read_any(tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	if (!cache_fplookup_vnode_supported(tvp)) {
		/* Symlink: let the regular lookup interpret it. */
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(numposhits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
	return (0);
}
|
||||
|
||||
static bool
|
||||
cache_fplookup_mp_supported(struct mount *mp)
|
||||
{
|
||||
|
||||
if (mp == NULL)
|
||||
return (false);
|
||||
if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
|
||||
return (false);
|
||||
if ((mp->mnt_flag & MNT_UNION) != 0)
|
||||
return (false);
|
||||
return (true);
|
||||
}
|
||||
|
||||
/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of successful walk we are guaranteed the reached state was
 * indeed present at least at some point which matches the regular lookup.
 */
static int
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp, *prev_mp;
	struct vnode *vp;
	seqc_t vp_seqc;

	vp = fpl->tvp;
	vp_seqc = fpl->tvp_seqc;
	/* Only directories can be mount points. */
	if (vp->v_type != VDIR)
		return (0);

	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (0);

	prev_mp = NULL;
	for (;;) {
		if (!vfs_op_thread_enter(mp)) {
			/* Could not busy the mount cheaply; punt. */
			if (prev_mp != NULL)
				vfs_op_thread_exit(prev_mp);
			return (cache_fpl_partial(fpl));
		}
		/* The new mount is busied; the previous one can be released. */
		if (prev_mp != NULL)
			vfs_op_thread_exit(prev_mp);
		if (!vn_seqc_consistent(vp, vp_seqc)) {
			vfs_op_thread_exit(mp);
			return (cache_fpl_partial(fpl));
		}
		if (!cache_fplookup_mp_supported(mp)) {
			vfs_op_thread_exit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp = atomic_load_ptr(&mp->mnt_rootvnode);
		if (vp == NULL || VN_IS_DOOMED(vp)) {
			vfs_op_thread_exit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp_seqc = vn_seqc_read_any(vp);
		if (seqc_in_modify(vp_seqc)) {
			vfs_op_thread_exit(mp);
			return (cache_fpl_partial(fpl));
		}
		prev_mp = mp;
		/* The root vnode may itself have something mounted on it. */
		mp = atomic_load_ptr(&vp->v_mountedhere);
		if (mp == NULL)
			break;
	}

	vfs_op_thread_exit(prev_mp);
	fpl->tvp = vp;
	fpl->tvp_seqc = vp_seqc;
	return (0);
}
|
||||
|
||||
/*
 * Parse the path.
 *
 * The code is mostly copy-pasted from regular lookup, see lookup().
 * The structure is maintained along with comments for easier maintenance.
 * Deduplicating the code will become feasible after fast path lookup
 * becomes more feature-complete.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	char *cp;
	char *prev_ni_next;		/* saved ndp->ni_next */
	size_t prev_ni_pathlen;		/* saved ndp->ni_pathlen */

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (cnp->cn_namelen > NAME_MAX) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENAMETOOLONG));
	}
	prev_ni_pathlen = ndp->ni_pathlen;
	ndp->ni_pathlen -= cnp->cn_namelen;
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	prev_ni_next = ndp->ni_next;
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null. This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes. Remember if there were
	 * trailing slashes to handle symlinks, existing non-directories
	 * and non-existing files that won't be directories specially later.
	 */
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			/*
			 * TODO
			 * Regular lookup performs the following:
			 * *ndp->ni_next = '\0';
			 * cnp->cn_flags |= TRAILINGSLASH;
			 *
			 * Which is problematic since it modifies data read
			 * from userspace. Then if fast path lookup was to
			 * abort we would have to either restore it or convey
			 * the flag. Since this is a corner case just ignore
			 * it for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
	}
	ndp->ni_next = cp;

	cnp->cn_flags |= MAKEENTRY;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
		cnp->cn_flags |= ISDOTDOT;
	else
		cnp->cn_flags &= ~ISDOTDOT;
	if (*ndp->ni_next == 0)
		cnp->cn_flags |= ISLASTCN;
	else
		cnp->cn_flags &= ~ISLASTCN;

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 *
	 * TODO
	 * Another corner case handled by the regular lookup
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		return (cache_fpl_partial(fpl));
	}
	return (0);
}
|
||||
|
||||
static void
|
||||
cache_fplookup_parse_advance(struct cache_fpl *fpl)
|
||||
{
|
||||
struct nameidata *ndp;
|
||||
struct componentname *cnp;
|
||||
|
||||
ndp = fpl->ndp;
|
||||
cnp = fpl->cnp;
|
||||
|
||||
cnp->cn_nameptr = ndp->ni_next;
|
||||
while (*cnp->cn_nameptr == '/') {
|
||||
cnp->cn_nameptr++;
|
||||
ndp->ni_pathlen--;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Core loop of the lockless lookup: parse a component, check execute
 * permission on the directory, resolve the component from the cache, cross
 * mount points and either finalize (last component) or advance. Any snag
 * sets PARTIAL (slow path resumes here) or ABORTED (slow path restarts).
 *
 * Returns 0 or an errno when the lookup was fully handled, otherwise
 * CACHE_FPL_FAILED.
 */
static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	ndp->ni_lcf = 0;
	cnp = fpl->cnp;
	cnp->cn_lkflags = LK_SHARED;
	if ((cnp->cn_flags & LOCKSHARED) == 0)
		cnp->cn_lkflags = LK_EXCLUSIVE;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		/* ".." traversal is not supported by the fast path. */
		if (cnp->cn_flags & ISDOTDOT) {
			error = cache_fpl_partial(fpl);
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
		if (__predict_false(error != 0)) {
			switch (error) {
			case EAGAIN:
			case EOPNOTSUPP: /* can happen when racing against vgone */
				cache_fpl_partial(fpl);
				break;
			default:
				/*
				 * See the API contract for VOP_FPLOOKUP_VEXEC.
				 */
				if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
					/* Denial may be stale; restart. */
					error = cache_fpl_aborted(fpl);
				} else {
					cache_fpl_smr_exit(fpl);
					cache_fpl_handled(fpl, error);
				}
				break;
			}
			break;
		}

		error = cache_fplookup_next(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		error = cache_fplookup_climb_mount(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cnp->cn_flags & ISLASTCN) {
			error = cache_fplookup_final(fpl);
			break;
		}

		/* Validate the parent before committing to the jump. */
		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}
|
||||
|
||||
/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
 * outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 * vn_lock(current);
 * for (;;) {
 *	next = find();
 *	vn_lock(next);
 *	vn_unlock(current);
 *	current = next;
 *	if (last)
 *	    break;
 * }
 * return (current);
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * Counter modification is provided by enclosing relevant places in
 * vn_seqc_write_begin()/end() calls.
 *
 * Thus this translates to:
 *
 * vfs_smr_enter();
 * dvp_seqc = seqc_read_any(dvp);
 * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
 *     abort();
 * for (;;) {
 * 	tvp = find();
 * 	tvp_seqc = seqc_read_any(tvp);
 * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
 * 	    abort();
 * 	if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode
 * 	    abort();
 * 	dvp = tvp; // we know nothing of importance has changed
 * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
 * 	if (last)
 * 	    break;
 * }
 * vget(); // secure the vnode
 * if (!seqc_consistent(tvp, tvp_seqc) // final check
 * 	abort();
 * // at this point we know nothing has changed for any parent<->child pair
 * // as they were crossed during the lookup, meaning we matched the guarantee
 * // of the locked variant
 * return (tvp);
 *
 * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed, it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	*status = CACHE_FPL_STATUS_UNSET;
	bzero(&fpl, sizeof(fpl));
	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	/* Snapshot the pristine state for a full restart on abort. */
	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter(&fpl);
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		MPASS(ndp->ni_dirfd == AT_FDCWD);
		dvp = pwd->pwd_cdir;
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		/* Hand the held pwd and resume point to the regular lookup. */
		*pwdp = fpl.pwd;
		cache_fpl_restore(&fpl, &fpl.snd);
		break;
	case CACHE_FPL_STATUS_ABORTED:
		/* Roll everything back so namei can restart from scratch. */
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
|
||||
|
@ -71,9 +71,9 @@ __FBSDID("$FreeBSD$");
|
||||
#undef NAMEI_DIAGNOSTIC
|
||||
|
||||
SDT_PROVIDER_DECLARE(vfs);
|
||||
SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *",
|
||||
"unsigned long");
|
||||
SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
|
||||
SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
|
||||
"unsigned long", "bool");
|
||||
SDT_PROBE_DEFINE3(vfs, namei, lookup, return, "int", "struct vnode *", "bool");
|
||||
|
||||
/* Allocation zone for namei. */
|
||||
uma_zone_t namei_zone;
|
||||
@ -280,6 +280,166 @@ namei_handle_root(struct nameidata *ndp, struct vnode **dpp)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
|
||||
{
|
||||
struct componentname *cnp;
|
||||
struct file *dfp;
|
||||
struct thread *td;
|
||||
struct pwd *pwd;
|
||||
cap_rights_t rights;
|
||||
struct filecaps dirfd_caps;
|
||||
int error, startdir_used;
|
||||
|
||||
cnp = &ndp->ni_cnd;
|
||||
td = cnp->cn_thread;
|
||||
|
||||
*pwdp = NULL;
|
||||
|
||||
#ifdef CAPABILITY_MODE
|
||||
/*
|
||||
* In capability mode, lookups must be restricted to happen in
|
||||
* the subtree with the root specified by the file descriptor:
|
||||
* - The root must be real file descriptor, not the pseudo-descriptor
|
||||
* AT_FDCWD.
|
||||
* - The passed path must be relative and not absolute.
|
||||
* - If lookup_cap_dotdot is disabled, path must not contain the
|
||||
* '..' components.
|
||||
* - If lookup_cap_dotdot is enabled, we verify that all '..'
|
||||
* components lookups result in the directories which were
|
||||
* previously walked by us, which prevents an escape from
|
||||
* the relative root.
|
||||
*/
|
||||
if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) {
|
||||
ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
|
||||
if (ndp->ni_dirfd == AT_FDCWD) {
|
||||
#ifdef KTRACE
|
||||
if (KTRPOINT(td, KTR_CAPFAIL))
|
||||
ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
|
||||
#endif
|
||||
return (ECAPMODE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
error = 0;
|
||||
|
||||
/*
|
||||
* Get starting point for the translation.
|
||||
*/
|
||||
pwd = pwd_hold(td);
|
||||
/*
|
||||
* The reference on ni_rootdir is acquired in the block below to avoid
|
||||
* back-to-back atomics for absolute lookups.
|
||||
*/
|
||||
ndp->ni_rootdir = pwd->pwd_rdir;
|
||||
ndp->ni_topdir = pwd->pwd_jdir;
|
||||
|
||||
if (cnp->cn_pnbuf[0] == '/') {
|
||||
ndp->ni_resflags |= NIRES_ABS;
|
||||
error = namei_handle_root(ndp, dpp);
|
||||
} else {
|
||||
if (ndp->ni_startdir != NULL) {
|
||||
*dpp = ndp->ni_startdir;
|
||||
startdir_used = 1;
|
||||
} else if (ndp->ni_dirfd == AT_FDCWD) {
|
||||
*dpp = pwd->pwd_cdir;
|
||||
vrefact(*dpp);
|
||||
} else {
|
||||
rights = ndp->ni_rightsneeded;
|
||||
cap_rights_set_one(&rights, CAP_LOOKUP);
|
||||
|
||||
if (cnp->cn_flags & AUDITVNODE1)
|
||||
AUDIT_ARG_ATFD1(ndp->ni_dirfd);
|
||||
if (cnp->cn_flags & AUDITVNODE2)
|
||||
AUDIT_ARG_ATFD2(ndp->ni_dirfd);
|
||||
/*
|
||||
* Effectively inlined fgetvp_rights, because we need to
|
||||
* inspect the file as well as grabbing the vnode.
|
||||
*/
|
||||
error = fget_cap(td, ndp->ni_dirfd, &rights,
|
||||
&dfp, &ndp->ni_filecaps);
|
||||
if (error != 0) {
|
||||
/*
|
||||
* Preserve the error; it should either be EBADF
|
||||
* or capability-related, both of which can be
|
||||
* safely returned to the caller.
|
||||
*/
|
||||
} else {
|
||||
if (dfp->f_ops == &badfileops) {
|
||||
error = EBADF;
|
||||
} else if (dfp->f_vnode == NULL) {
|
||||
error = ENOTDIR;
|
||||
} else {
|
||||
*dpp = dfp->f_vnode;
|
||||
vrefact(*dpp);
|
||||
|
||||
if ((dfp->f_flag & FSEARCH) != 0)
|
||||
cnp->cn_flags |= NOEXECCHECK;
|
||||
}
|
||||
fdrop(dfp, td);
|
||||
}
|
||||
#ifdef CAPABILITIES
|
||||
/*
|
||||
* If file descriptor doesn't have all rights,
|
||||
* all lookups relative to it must also be
|
||||
* strictly relative.
|
||||
*/
|
||||
CAP_ALL(&rights);
|
||||
if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
|
||||
&rights) ||
|
||||
ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
|
||||
ndp->ni_filecaps.fc_nioctls != -1) {
|
||||
ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if (error == 0 && (*dpp)->v_type != VDIR)
|
||||
error = ENOTDIR;
|
||||
}
|
||||
if (error == 0 && (cnp->cn_flags & BENEATH) != 0) {
|
||||
if (ndp->ni_dirfd == AT_FDCWD) {
|
||||
ndp->ni_beneath_latch = pwd->pwd_cdir;
|
||||
vrefact(ndp->ni_beneath_latch);
|
||||
} else {
|
||||
rights = ndp->ni_rightsneeded;
|
||||
cap_rights_set_one(&rights, CAP_LOOKUP);
|
||||
error = fgetvp_rights(td, ndp->ni_dirfd, &rights,
|
||||
&dirfd_caps, &ndp->ni_beneath_latch);
|
||||
if (error == 0 && (*dpp)->v_type != VDIR) {
|
||||
vrele(ndp->ni_beneath_latch);
|
||||
error = ENOTDIR;
|
||||
}
|
||||
}
|
||||
if (error == 0)
|
||||
ndp->ni_lcf |= NI_LCF_LATCH;
|
||||
}
|
||||
/*
|
||||
* If we are auditing the kernel pathname, save the user pathname.
|
||||
*/
|
||||
if (cnp->cn_flags & AUDITVNODE1)
|
||||
AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
|
||||
if (cnp->cn_flags & AUDITVNODE2)
|
||||
AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
|
||||
if (ndp->ni_startdir != NULL && !startdir_used)
|
||||
vrele(ndp->ni_startdir);
|
||||
if (error != 0) {
|
||||
if (*dpp != NULL)
|
||||
vrele(*dpp);
|
||||
return (error);
|
||||
}
|
||||
MPASS((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_LATCH)) !=
|
||||
NI_LCF_BENEATH_ABS);
|
||||
if (((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 &&
|
||||
lookup_cap_dotdot != 0) ||
|
||||
((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
|
||||
(cnp->cn_flags & BENEATH) != 0))
|
||||
ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
|
||||
SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf,
|
||||
cnp->cn_flags, false);
|
||||
*pwdp = pwd;
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a pathname into a pointer to a locked vnode.
|
||||
*
|
||||
@ -307,14 +467,12 @@ namei(struct nameidata *ndp)
|
||||
struct vnode *dp; /* the directory we are searching */
|
||||
struct iovec aiov; /* uio for reading symbolic links */
|
||||
struct componentname *cnp;
|
||||
struct file *dfp;
|
||||
struct thread *td;
|
||||
struct proc *p;
|
||||
struct pwd *pwd;
|
||||
cap_rights_t rights;
|
||||
struct filecaps dirfd_caps;
|
||||
struct uio auio;
|
||||
int error, linklen, startdir_used;
|
||||
int error, linklen;
|
||||
enum cache_fpl_status status;
|
||||
|
||||
cnp = &ndp->ni_cnd;
|
||||
td = cnp->cn_thread;
|
||||
@ -329,10 +487,14 @@ namei(struct nameidata *ndp)
|
||||
ndp->ni_startdir->v_type == VBAD);
|
||||
TAILQ_INIT(&ndp->ni_cap_tracker);
|
||||
ndp->ni_lcf = 0;
|
||||
ndp->ni_loopcnt = 0;
|
||||
dp = NULL;
|
||||
|
||||
/* We will set this ourselves if we need it. */
|
||||
cnp->cn_flags &= ~TRAILINGSLASH;
|
||||
|
||||
ndp->ni_vp = NULL;
|
||||
|
||||
/*
|
||||
* Get a buffer for the name to be translated, and copy the
|
||||
* name into the buffer.
|
||||
@ -346,44 +508,21 @@ namei(struct nameidata *ndp)
|
||||
error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
|
||||
&ndp->ni_pathlen);
|
||||
|
||||
if (error != 0) {
|
||||
namei_cleanup_cnp(cnp);
|
||||
return (error);
|
||||
}
|
||||
|
||||
cnp->cn_nameptr = cnp->cn_pnbuf;
|
||||
|
||||
/*
|
||||
* Don't allow empty pathnames.
|
||||
*/
|
||||
if (error == 0 && *cnp->cn_pnbuf == '\0')
|
||||
error = ENOENT;
|
||||
|
||||
#ifdef CAPABILITY_MODE
|
||||
/*
|
||||
* In capability mode, lookups must be restricted to happen in
|
||||
* the subtree with the root specified by the file descriptor:
|
||||
* - The root must be real file descriptor, not the pseudo-descriptor
|
||||
* AT_FDCWD.
|
||||
* - The passed path must be relative and not absolute.
|
||||
* - If lookup_cap_dotdot is disabled, path must not contain the
|
||||
* '..' components.
|
||||
* - If lookup_cap_dotdot is enabled, we verify that all '..'
|
||||
* components lookups result in the directories which were
|
||||
* previously walked by us, which prevents an escape from
|
||||
* the relative root.
|
||||
*/
|
||||
if (error == 0 && IN_CAPABILITY_MODE(td) &&
|
||||
(cnp->cn_flags & NOCAPCHECK) == 0) {
|
||||
ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
|
||||
if (ndp->ni_dirfd == AT_FDCWD) {
|
||||
#ifdef KTRACE
|
||||
if (KTRPOINT(td, KTR_CAPFAIL))
|
||||
ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
|
||||
#endif
|
||||
error = ECAPMODE;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (error != 0) {
|
||||
if (*cnp->cn_pnbuf == '\0') {
|
||||
namei_cleanup_cnp(cnp);
|
||||
ndp->ni_vp = NULL;
|
||||
return (error);
|
||||
return (ENOENT);
|
||||
}
|
||||
ndp->ni_loopcnt = 0;
|
||||
|
||||
#ifdef KTRACE
|
||||
if (KTRPOINT(td, KTR_NAMEI)) {
|
||||
KASSERT(cnp->cn_thread == curthread,
|
||||
@ -391,122 +530,34 @@ namei(struct nameidata *ndp)
|
||||
ktrnamei(cnp->cn_pnbuf);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Get starting point for the translation.
|
||||
* First try looking up the target without locking any vnodes.
|
||||
*
|
||||
* We may need to start from scratch or pick up where it left off.
|
||||
*/
|
||||
pwd = pwd_hold(td);
|
||||
/*
|
||||
* The reference on ni_rootdir is acquired in the block below to avoid
|
||||
* back-to-back atomics for absolute lookups.
|
||||
*/
|
||||
ndp->ni_rootdir = pwd->pwd_rdir;
|
||||
ndp->ni_topdir = pwd->pwd_jdir;
|
||||
|
||||
startdir_used = 0;
|
||||
dp = NULL;
|
||||
cnp->cn_nameptr = cnp->cn_pnbuf;
|
||||
if (cnp->cn_pnbuf[0] == '/') {
|
||||
ndp->ni_resflags |= NIRES_ABS;
|
||||
error = namei_handle_root(ndp, &dp);
|
||||
} else {
|
||||
if (ndp->ni_startdir != NULL) {
|
||||
dp = ndp->ni_startdir;
|
||||
startdir_used = 1;
|
||||
} else if (ndp->ni_dirfd == AT_FDCWD) {
|
||||
dp = pwd->pwd_cdir;
|
||||
vrefact(dp);
|
||||
} else {
|
||||
rights = ndp->ni_rightsneeded;
|
||||
cap_rights_set_one(&rights, CAP_LOOKUP);
|
||||
|
||||
if (cnp->cn_flags & AUDITVNODE1)
|
||||
AUDIT_ARG_ATFD1(ndp->ni_dirfd);
|
||||
if (cnp->cn_flags & AUDITVNODE2)
|
||||
AUDIT_ARG_ATFD2(ndp->ni_dirfd);
|
||||
/*
|
||||
* Effectively inlined fgetvp_rights, because we need to
|
||||
* inspect the file as well as grabbing the vnode.
|
||||
*/
|
||||
error = fget_cap(td, ndp->ni_dirfd, &rights,
|
||||
&dfp, &ndp->ni_filecaps);
|
||||
if (error != 0) {
|
||||
/*
|
||||
* Preserve the error; it should either be EBADF
|
||||
* or capability-related, both of which can be
|
||||
* safely returned to the caller.
|
||||
*/
|
||||
} else {
|
||||
if (dfp->f_ops == &badfileops) {
|
||||
error = EBADF;
|
||||
} else if (dfp->f_vnode == NULL) {
|
||||
error = ENOTDIR;
|
||||
} else {
|
||||
dp = dfp->f_vnode;
|
||||
vrefact(dp);
|
||||
|
||||
if ((dfp->f_flag & FSEARCH) != 0)
|
||||
cnp->cn_flags |= NOEXECCHECK;
|
||||
}
|
||||
fdrop(dfp, td);
|
||||
}
|
||||
#ifdef CAPABILITIES
|
||||
/*
|
||||
* If file descriptor doesn't have all rights,
|
||||
* all lookups relative to it must also be
|
||||
* strictly relative.
|
||||
*/
|
||||
CAP_ALL(&rights);
|
||||
if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
|
||||
&rights) ||
|
||||
ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
|
||||
ndp->ni_filecaps.fc_nioctls != -1) {
|
||||
ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
|
||||
}
|
||||
#endif
|
||||
error = cache_fplookup(ndp, &status, &pwd);
|
||||
switch (status) {
|
||||
case CACHE_FPL_STATUS_UNSET:
|
||||
__assert_unreachable();
|
||||
break;
|
||||
case CACHE_FPL_STATUS_HANDLED:
|
||||
return (error);
|
||||
case CACHE_FPL_STATUS_PARTIAL:
|
||||
dp = ndp->ni_startdir;
|
||||
break;
|
||||
case CACHE_FPL_STATUS_ABORTED:
|
||||
error = namei_setup(ndp, &dp, &pwd);
|
||||
if (error != 0) {
|
||||
namei_cleanup_cnp(cnp);
|
||||
return (error);
|
||||
}
|
||||
if (error == 0 && dp->v_type != VDIR)
|
||||
error = ENOTDIR;
|
||||
}
|
||||
if (error == 0 && (cnp->cn_flags & BENEATH) != 0) {
|
||||
if (ndp->ni_dirfd == AT_FDCWD) {
|
||||
ndp->ni_beneath_latch = pwd->pwd_cdir;
|
||||
vrefact(ndp->ni_beneath_latch);
|
||||
} else {
|
||||
rights = ndp->ni_rightsneeded;
|
||||
cap_rights_set_one(&rights, CAP_LOOKUP);
|
||||
error = fgetvp_rights(td, ndp->ni_dirfd, &rights,
|
||||
&dirfd_caps, &ndp->ni_beneath_latch);
|
||||
if (error == 0 && dp->v_type != VDIR) {
|
||||
vrele(ndp->ni_beneath_latch);
|
||||
error = ENOTDIR;
|
||||
}
|
||||
}
|
||||
if (error == 0)
|
||||
ndp->ni_lcf |= NI_LCF_LATCH;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we are auditing the kernel pathname, save the user pathname.
|
||||
* Locked lookup.
|
||||
*/
|
||||
if (cnp->cn_flags & AUDITVNODE1)
|
||||
AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
|
||||
if (cnp->cn_flags & AUDITVNODE2)
|
||||
AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, dp, cnp->cn_pnbuf);
|
||||
if (ndp->ni_startdir != NULL && !startdir_used)
|
||||
vrele(ndp->ni_startdir);
|
||||
if (error != 0) {
|
||||
if (dp != NULL)
|
||||
vrele(dp);
|
||||
goto out;
|
||||
}
|
||||
MPASS((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_LATCH)) !=
|
||||
NI_LCF_BENEATH_ABS);
|
||||
if (((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 &&
|
||||
lookup_cap_dotdot != 0) ||
|
||||
((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
|
||||
(cnp->cn_flags & BENEATH) != 0))
|
||||
ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
|
||||
SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
|
||||
cnp->cn_flags);
|
||||
for (;;) {
|
||||
ndp->ni_startdir = dp;
|
||||
error = lookup(ndp);
|
||||
@ -526,8 +577,8 @@ namei(struct nameidata *ndp)
|
||||
error = ENOTCAPABLE;
|
||||
}
|
||||
nameicap_cleanup(ndp, true);
|
||||
SDT_PROBE2(vfs, namei, lookup, return, error,
|
||||
(error == 0 ? ndp->ni_vp : NULL));
|
||||
SDT_PROBE3(vfs, namei, lookup, return, error,
|
||||
(error == 0 ? ndp->ni_vp : NULL), false);
|
||||
pwd_drop(pwd);
|
||||
return (error);
|
||||
}
|
||||
@ -602,7 +653,7 @@ namei(struct nameidata *ndp)
|
||||
MPASS(error != 0);
|
||||
namei_cleanup_cnp(cnp);
|
||||
nameicap_cleanup(ndp, true);
|
||||
SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
|
||||
SDT_PROBE3(vfs, namei, lookup, return, error, NULL, false);
|
||||
pwd_drop(pwd);
|
||||
return (error);
|
||||
}
|
||||
|
@ -108,6 +108,12 @@ struct nameidata {
|
||||
};
|
||||
|
||||
#ifdef _KERNEL
|
||||
|
||||
enum cache_fpl_status { CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL,
|
||||
CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET };
|
||||
int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
|
||||
struct pwd **pwdp);
|
||||
|
||||
/*
|
||||
* namei operations
|
||||
*/
|
||||
|
Loading…
Reference in New Issue
Block a user