Add an optional feature to the pNFS server.

Without this patch, the pNFS server distributes the data storage files across
all of the specified DSs.
A tester noted that it would be nice if a system administrator could control
which DSs are used to store the file data for a given exported MDS file system.
This patch adds the kernel support to do this. It also makes a slight semantic
change to nfsv4_findmirror(), since some uses of it no longer require that
the DS being searched for have a current mirror.
A patch that will be committed in a few minutes will modify the nfsd daemon
to support this feature.
The patch should only affect sites using the pNFS server (specified via the
"-p" command line option for nfsd).

Suggested by:	james.rose@framestore.com
This commit is contained in:
Rick Macklem 2018-07-02 19:21:33 +00:00
parent f127d86390
commit 2f32675c83
5 changed files with 184 additions and 48 deletions

View File

@ -185,6 +185,8 @@ struct nfsd_nfsd_args {
int dnshostlen; /* Length of DNS names */
char *dspath; /* DS Mount path on MDS */
int dspathlen; /* Length of DS Mount path on MDS */
char *mdspath; /* MDS mount for DS path on MDS */
int mdspathlen; /* Length of MDS mount for DS path on MDS */
int mirrorcnt; /* Number of mirrors to create on DSs */
};

View File

@ -4692,37 +4692,26 @@ nfsv4_freeslot(struct nfsclsession *sep, int slot)
}
/*
* Search for a matching pnfsd mirror device structure, base on the nmp arg.
* Search for a matching pnfsd DS, based on the nmp arg.
* Return one if found, NULL otherwise.
*/
struct nfsdevice *
nfsv4_findmirror(struct nfsmount *nmp)
{
struct nfsdevice *ds, *fndds;
int fndmirror;
struct nfsdevice *ds;
mtx_assert(NFSDDSMUTEXPTR, MA_OWNED);
/*
* Search the DS server list for a match with nmp.
* Remove the DS entry if found and there is a mirror.
*/
fndds = NULL;
fndmirror = 0;
if (nfsrv_devidcnt == 0)
return (fndds);
return (NULL);
TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
if (ds->nfsdev_nmp == nmp) {
NFSCL_DEBUG(4, "fnd main ds\n");
fndds = ds;
} else if (ds->nfsdev_nmp != NULL)
fndmirror = 1;
if (fndds != NULL && fndmirror != 0)
NFSCL_DEBUG(4, "nfsv4_findmirror: fnd main ds\n");
break;
}
}
if (fndmirror == 0) {
NFSCL_DEBUG(4, "no mirror for DS\n");
return (NULL);
}
return (fndds);
return (ds);
}

View File

@ -345,9 +345,11 @@ struct nfsdevice {
uint16_t nfsdev_hostnamelen;
uint16_t nfsdev_fileaddrlen;
uint16_t nfsdev_flexaddrlen;
uint16_t nfsdev_mdsisset;
char *nfsdev_fileaddr;
char *nfsdev_flexaddr;
char *nfsdev_host;
fsid_t nfsdev_mdsfsid;
uint32_t nfsdev_nextdir;
vnode_t nfsdev_dsdir[0];
};

View File

@ -3355,6 +3355,10 @@ nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
nfsdarg.addrlen = 0;
nfsdarg.dnshost = NULL;
nfsdarg.dnshostlen = 0;
nfsdarg.dspath = NULL;
nfsdarg.dspathlen = 0;
nfsdarg.mdspath = NULL;
nfsdarg.mdspathlen = 0;
nfsdarg.mirrorcnt = 1;
}
} else
@ -3364,14 +3368,15 @@ nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
if (nfsdarg.addrlen > 0 && nfsdarg.addrlen < 10000 &&
nfsdarg.dnshostlen > 0 && nfsdarg.dnshostlen < 10000 &&
nfsdarg.dspathlen > 0 && nfsdarg.dspathlen < 10000 &&
nfsdarg.mdspathlen > 0 && nfsdarg.mdspathlen < 10000 &&
nfsdarg.mirrorcnt >= 1 &&
nfsdarg.mirrorcnt <= NFSDEV_MAXMIRRORS &&
nfsdarg.addr != NULL && nfsdarg.dnshost != NULL &&
nfsdarg.dspath != NULL) {
nfsdarg.dspath != NULL && nfsdarg.mdspath != NULL) {
NFSD_DEBUG(1, "addrlen=%d dspathlen=%d dnslen=%d"
" mirrorcnt=%d\n", nfsdarg.addrlen,
" mdspathlen=%d mirrorcnt=%d\n", nfsdarg.addrlen,
nfsdarg.dspathlen, nfsdarg.dnshostlen,
nfsdarg.mirrorcnt);
nfsdarg.mdspathlen, nfsdarg.mirrorcnt);
cp = malloc(nfsdarg.addrlen + 1, M_TEMP, M_WAITOK);
error = copyin(nfsdarg.addr, cp, nfsdarg.addrlen);
if (error != 0) {
@ -3399,6 +3404,17 @@ nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
}
cp[nfsdarg.dspathlen] = '\0'; /* Ensure nul term. */
nfsdarg.dspath = cp;
cp = malloc(nfsdarg.mdspathlen + 1, M_TEMP, M_WAITOK);
error = copyin(nfsdarg.mdspath, cp, nfsdarg.mdspathlen);
if (error != 0) {
free(nfsdarg.addr, M_TEMP);
free(nfsdarg.dnshost, M_TEMP);
free(nfsdarg.dspath, M_TEMP);
free(cp, M_TEMP);
goto out;
}
cp[nfsdarg.mdspathlen] = '\0'; /* Ensure nul term. */
nfsdarg.mdspath = cp;
} else {
nfsdarg.addr = NULL;
nfsdarg.addrlen = 0;
@ -3406,12 +3422,15 @@ nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
nfsdarg.dnshostlen = 0;
nfsdarg.dspath = NULL;
nfsdarg.dspathlen = 0;
nfsdarg.mdspath = NULL;
nfsdarg.mdspathlen = 0;
nfsdarg.mirrorcnt = 1;
}
error = nfsrvd_nfsd(td, &nfsdarg);
free(nfsdarg.addr, M_TEMP);
free(nfsdarg.dnshost, M_TEMP);
free(nfsdarg.dspath, M_TEMP);
free(nfsdarg.mdspath, M_TEMP);
} else if (uap->flag & NFSSVC_PNFSDS) {
error = copyin(uap->argp, &pnfsdarg, sizeof(pnfsdarg));
if (error == 0 && pnfsdarg.op == PNFSDOP_DELDSSERVER) {
@ -3846,9 +3865,12 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred,
/* Get a DS server directory in a round-robin order. */
mirrorcnt = 1;
mp = vp->v_mount;
NFSDDSLOCK();
TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
if (ds->nfsdev_nmp != NULL)
if (ds->nfsdev_nmp != NULL && (ds->nfsdev_mdsisset == 0 ||
(mp->mnt_stat.f_fsid.val[0] == ds->nfsdev_mdsfsid.val[0] &&
mp->mnt_stat.f_fsid.val[1] == ds->nfsdev_mdsfsid.val[1])))
break;
}
if (ds == NULL) {
@ -3862,7 +3884,12 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred,
mds = TAILQ_NEXT(ds, nfsdev_list);
if (nfsrv_maxpnfsmirror > 1 && mds != NULL) {
TAILQ_FOREACH_FROM(mds, &nfsrv_devidhead, nfsdev_list) {
if (mds->nfsdev_nmp != NULL) {
if (mds->nfsdev_nmp != NULL &&
(mds->nfsdev_mdsisset == 0 ||
(mp->mnt_stat.f_fsid.val[0] ==
mds->nfsdev_mdsfsid.val[0] &&
mp->mnt_stat.f_fsid.val[1] ==
mds->nfsdev_mdsfsid.val[1]))) {
dsdir[mirrorcnt] = i;
dvp[mirrorcnt] = mds->nfsdev_dsdir[i];
mirrorcnt++;
@ -4464,6 +4491,7 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp,
struct nfsmount *curnmp, int *ippos, int *dsdirp)
{
struct vnode *dvp, *nvp, **tdvpp;
struct mount *mp;
struct nfsmount *nmp, *newnmp;
struct sockaddr *sad;
struct sockaddr_in *sin;
@ -4485,6 +4513,7 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp,
newnmp = *newnmpp;
else
newnmp = NULL;
mp = vp->v_mount;
error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
"pnfsd.dsfile", buflenp, buf, p);
mirrorcnt = *buflenp / sizeof(*pf);
@ -4545,7 +4574,13 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp,
fndds = ds;
else if (newnmpp != NULL &&
newnmp == NULL &&
(*newnmpp == NULL || fndds == NULL))
(*newnmpp == NULL ||
fndds == NULL) &&
(ds->nfsdev_mdsisset == 0 ||
(ds->nfsdev_mdsfsid.val[0] ==
mp->mnt_stat.f_fsid.val[0] &&
ds->nfsdev_mdsfsid.val[1] ==
mp->mnt_stat.f_fsid.val[1])))
/*
* Return a destination for the
* copy in newnmpp. Choose the

View File

@ -210,7 +210,7 @@ static void nfsrv_freelayouts(nfsquad_t *clid, fsid_t *fs, int laytype,
int iomode);
static void nfsrv_freealllayouts(void);
static void nfsrv_freedevid(struct nfsdevice *ds);
static int nfsrv_setdsserver(char *dspathp, NFSPROC_T *p,
static int nfsrv_setdsserver(char *dspathp, char *mdspathp, NFSPROC_T *p,
struct nfsdevice **dsp);
static int nfsrv_delds(char *devid, NFSPROC_T *p);
static void nfsrv_deleteds(struct nfsdevice *fndds);
@ -232,6 +232,7 @@ static int nfsrv_dontlayout(fhandle_t *fhp);
static int nfsrv_createdsfile(vnode_t vp, fhandle_t *fhp, struct pnfsdsfile *pf,
vnode_t dvp, struct nfsdevice *ds, struct ucred *cred, NFSPROC_T *p,
vnode_t *tvpp);
static struct nfsdevice *nfsrv_findmirroredds(struct nfsmount *nmp);
/*
* Scan the client list for a match and either return the current one,
@ -7369,10 +7370,12 @@ nfsrv_freealllayouts(void)
* Look up the mount path for the DS server.
*/
static int
nfsrv_setdsserver(char *dspathp, NFSPROC_T *p, struct nfsdevice **dsp)
nfsrv_setdsserver(char *dspathp, char *mdspathp, NFSPROC_T *p,
struct nfsdevice **dsp)
{
struct nameidata nd;
struct nfsdevice *ds;
struct mount *mp;
int error, i;
char *dsdirpath;
size_t dsdirsize;
@ -7400,6 +7403,9 @@ nfsrv_setdsserver(char *dspathp, NFSPROC_T *p, struct nfsdevice **dsp)
* Allocate a DS server structure with the NFS mounted directory
* vnode reference counted, so that a non-forced dismount will
* fail with EBUSY.
* This structure is always linked into the list, even if an error
* is being returned. The caller will free the entire list upon
* an error return.
*/
*dsp = ds = malloc(sizeof(*ds) + nfsrv_dsdirsize * sizeof(vnode_t),
M_NFSDSTATE, M_WAITOK | M_ZERO);
@ -7435,6 +7441,36 @@ nfsrv_setdsserver(char *dspathp, NFSPROC_T *p, struct nfsdevice **dsp)
}
free(dsdirpath, M_TEMP);
if (strlen(mdspathp) > 0) {
/*
* This DS stores file for a specific MDS exported file
* system.
*/
NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
UIO_SYSSPACE, mdspathp, p);
error = namei(&nd);
NFSD_DEBUG(4, "mds lookup=%d\n", error);
if (error != 0)
goto out;
if (nd.ni_vp->v_type != VDIR) {
vput(nd.ni_vp);
error = ENOTDIR;
NFSD_DEBUG(4, "mdspath not dir\n");
goto out;
}
mp = nd.ni_vp->v_mount;
if ((mp->mnt_flag & MNT_EXPORTED) == 0) {
vput(nd.ni_vp);
error = ENXIO;
NFSD_DEBUG(4, "mdspath not an exported fs\n");
goto out;
}
ds->nfsdev_mdsfsid = mp->mnt_stat.f_fsid;
ds->nfsdev_mdsisset = 1;
vput(nd.ni_vp);
}
out:
TAILQ_INSERT_TAIL(&nfsrv_devidhead, ds, nfsdev_list);
atomic_add_int(&nfsrv_devidcnt, 1);
return (error);
@ -7514,11 +7550,7 @@ nfsrv_deldsnmp(struct nfsmount *nmp, NFSPROC_T *p)
NFSD_DEBUG(4, "deldsdvp\n");
NFSDDSLOCK();
if (nfsrv_faildscnt <= 0) {
NFSDDSUNLOCK();
return (NULL);
}
fndds = nfsv4_findmirror(nmp);
fndds = nfsrv_findmirroredds(nmp);
if (fndds != NULL)
nfsrv_deleteds(fndds);
NFSDDSUNLOCK();
@ -7551,21 +7583,35 @@ nfsrv_delds(char *devid, NFSPROC_T *p)
nmp = NULL;
fndmirror = 0;
NFSDDSLOCK();
if (nfsrv_faildscnt <= 0) {
NFSDDSUNLOCK();
return (ENXIO);
}
TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
if (NFSBCMP(ds->nfsdev_deviceid, devid, NFSX_V4DEVICEID) == 0 &&
ds->nfsdev_nmp != NULL) {
NFSD_DEBUG(4, "fnd main ds\n");
fndds = ds;
} else if (ds->nfsdev_nmp != NULL)
fndmirror = 1;
if (fndds != NULL && fndmirror != 0)
break;
}
}
if (fndds != NULL && fndmirror != 0) {
if (fndds == NULL) {
NFSDDSUNLOCK();
return (ENXIO);
}
if (fndds->nfsdev_mdsisset == 0 && nfsrv_faildscnt > 0)
fndmirror = 1;
else {
/* For the fsid is set case, search for a mirror. */
TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
if (ds != fndds && ds->nfsdev_nmp != NULL &&
ds->nfsdev_mdsisset != 0 &&
ds->nfsdev_mdsfsid.val[0] ==
fndds->nfsdev_mdsfsid.val[0] &&
ds->nfsdev_mdsfsid.val[1] ==
fndds->nfsdev_mdsfsid.val[1]) {
fndmirror = 1;
break;
}
}
}
if (fndmirror != 0) {
nmp = fndds->nfsdev_nmp;
NFSLOCKMNT(nmp);
if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM |
@ -7579,7 +7625,7 @@ nfsrv_delds(char *devid, NFSPROC_T *p)
}
}
NFSDDSUNLOCK();
if (fndds != NULL && nmp != NULL) {
if (nmp != NULL) {
nfsrv_flexmirrordel(fndds->nfsdev_deviceid, p);
printf("pNFS server: mirror %s failed\n", fndds->nfsdev_host);
nfsrv_killrpcs(nmp);
@ -7601,7 +7647,8 @@ nfsrv_deleteds(struct nfsdevice *fndds)
NFSD_DEBUG(4, "deleteds: deleting a mirror\n");
fndds->nfsdev_nmp = NULL;
nfsrv_faildscnt--;
if (fndds->nfsdev_mdsisset == 0)
nfsrv_faildscnt--;
}
/*
@ -7687,24 +7734,27 @@ int
nfsrv_createdevids(struct nfsd_nfsd_args *args, NFSPROC_T *p)
{
struct nfsdevice *ds;
char *addrp, *dnshostp, *dspathp;
char *addrp, *dnshostp, *dspathp, *mdspathp;
int error, i;
addrp = args->addr;
dnshostp = args->dnshost;
dspathp = args->dspath;
mdspathp = args->mdspath;
nfsrv_maxpnfsmirror = args->mirrorcnt;
if (addrp == NULL || dnshostp == NULL || dspathp == NULL)
if (addrp == NULL || dnshostp == NULL || dspathp == NULL ||
mdspathp == NULL)
return (0);
/*
* Loop around for each nul-terminated string in args->addr,
* args->dnshost and args->dnspath.
* args->dnshost, args->dnspath and args->mdspath.
*/
while (addrp < (args->addr + args->addrlen) &&
dnshostp < (args->dnshost + args->dnshostlen) &&
dspathp < (args->dspath + args->dspathlen)) {
error = nfsrv_setdsserver(dspathp, p, &ds);
dspathp < (args->dspath + args->dspathlen) &&
mdspathp < (args->mdspath + args->mdspathlen)) {
error = nfsrv_setdsserver(dspathp, mdspathp, p, &ds);
if (error != 0) {
/* Free all DS servers. */
nfsrv_freealldevids();
@ -7715,6 +7765,7 @@ nfsrv_createdevids(struct nfsd_nfsd_args *args, NFSPROC_T *p)
addrp += (strlen(addrp) + 1);
dnshostp += (strlen(dnshostp) + 1);
dspathp += (strlen(dspathp) + 1);
mdspathp += (strlen(mdspathp) + 1);
}
if (nfsrv_devidcnt < nfsrv_maxpnfsmirror) {
/* Free all DS servers. */
@ -8299,9 +8350,15 @@ nfsrv_mdscopymr(char *mdspathp, char *dspathp, char *curdspathp, char *buf,
}
nmp = VFSTONFS(nd.ni_vp->v_mount);
/* Search the nfsdev list for a match. */
/*
* Search the nfsdevice list for a match. If curnmp == NULL,
* this is a recovery and there must be a mirror.
*/
NFSDDSLOCK();
*dsp = nfsv4_findmirror(nmp);
if (curnmp == NULL)
*dsp = nfsrv_findmirroredds(nmp);
else
*dsp = nfsv4_findmirror(nmp);
NFSDDSUNLOCK();
if (*dsp == NULL) {
vput(nd.ni_vp);
@ -8331,7 +8388,7 @@ nfsrv_mdscopymr(char *mdspathp, char *dspathp, char *curdspathp, char *buf,
if (error == 0 && nmp != NULL) {
/* Search the nfsdev list for a match. */
NFSDDSLOCK();
*dsp = nfsv4_findmirror(nmp);
*dsp = nfsrv_findmirroredds(nmp);
NFSDDSUNLOCK();
}
if (error == 0 && (nmp == NULL || *dsp == NULL)) {
@ -8376,3 +8433,54 @@ nfsrv_mdscopymr(char *mdspathp, char *dspathp, char *curdspathp, char *buf,
return (error);
}
/*
 * Look up the pnfsd DS device structure whose NFS mount is nmp and report
 * it only when a mirror for it exists.
 * A DS that is not tied to a specific MDS file system counts as mirrored
 * when nfsrv_faildscnt is greater than zero. Otherwise another DS with a
 * non-NULL mount that is assigned to the same MDS fsid must be on the list.
 * The DS mutex must be held by the caller (asserted below).
 * Returns the matching DS, or NULL if there is no match or no mirror.
 */
static struct nfsdevice *
nfsrv_findmirroredds(struct nfsmount *nmp)
{
	struct nfsdevice *cand, *match;

	mtx_assert(NFSDDSMUTEXPTR, MA_OWNED);

	/* Nothing to search when no DS has been configured. */
	if (nfsrv_devidcnt == 0)
		return (NULL);

	/* First pass: locate the DS entry that uses nmp. */
	match = NULL;
	TAILQ_FOREACH(cand, &nfsrv_devidhead, nfsdev_list) {
		if (cand->nfsdev_nmp == nmp) {
			NFSD_DEBUG(4, "nfsrv_findmirroredds: fnd main ds\n");
			match = cand;
			break;
		}
	}
	if (match == NULL)
		return (NULL);

	/* Not bound to an MDS fsid: the global counter decides. */
	if (match->nfsdev_mdsisset == 0 && nfsrv_faildscnt > 0)
		return (match);

	/*
	 * Second pass: look for a different, still mounted DS that is
	 * assigned to the same MDS file system as the match.
	 */
	TAILQ_FOREACH(cand, &nfsrv_devidhead, nfsdev_list) {
		if (cand == match || cand->nfsdev_nmp == NULL ||
		    cand->nfsdev_mdsisset == 0)
			continue;
		if (cand->nfsdev_mdsfsid.val[0] ==
		    match->nfsdev_mdsfsid.val[0] &&
		    cand->nfsdev_mdsfsid.val[1] ==
		    match->nfsdev_mdsfsid.val[1])
			return (match);
	}

	NFSD_DEBUG(4, "nfsrv_findmirroredds: no mirror for DS\n");
	return (NULL);
}