NFS MP scaling changes.

- Eliminate the hideous nfs_sndlock that serialized NFS/TCP request senders
  through the sndlock.
- Institute a new nfs_connectlock that serializes NFS/TCP reconnects. Add
  logic to wait for pending request senders to finish sending before
  reconnecting. Dial down the sb_timeo for NFS/TCP sockets to 1 sec.
- Break out the nfs xid manipulation under a new nfs xid lock, rather than
  overloading the nfs request lock for this purpose.
- Fix some of the locking in nfs_request.
Many thanks to Kris Kennaway for his help with this and for initiating the
MP scaling analysis and work. Kris also tested this patch thoroughly.
Approved by: re@ (Ken Smith)
This commit is contained in:
Mohan Srinivasan 2007-10-12 19:12:21 +00:00
parent 6a97238d11
commit faf529dce5
4 changed files with 131 additions and 75 deletions

View File

@ -293,8 +293,8 @@ int nfs_mountroot(struct mount *mp, struct thread *td);
#ifndef NFS4_USE_RPCCLNT
int nfs_send(struct socket *, struct sockaddr *, struct mbuf *,
struct nfsreq *);
int nfs_sndlock(struct nfsreq *);
void nfs_sndunlock(struct nfsreq *);
int nfs_connect_lock(struct nfsreq *);
void nfs_connect_unlock(struct nfsreq *);
#endif /* ! NFS4_USE_RPCCLNT */
int nfs_vinvalbuf(struct vnode *, int, struct thread *, int);

View File

@ -78,6 +78,7 @@ __FBSDID("$FreeBSD$");
#define FALSE 0
extern u_int32_t nfs_xid;
extern struct mtx nfs_xid_mtx;
static int nfs_realign_test;
static int nfs_realign_count;
@ -374,7 +375,10 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
SOCK_UNLOCK(so);
}
so->so_rcv.sb_timeo = 12 * hz;
so->so_snd.sb_timeo = 5 * hz;
if (nmp->nm_sotype == SOCK_STREAM)
so->so_snd.sb_timeo = 1 * hz; /* 1s snd timeout for NFS/TCP */
else
so->so_snd.sb_timeo = 5 * hz;
/*
* Get buffer reservation size from sysctl, but impose reasonable
@ -463,6 +467,17 @@ bad:
return (error);
}
/*
 * Account for the completion of one socket send on this mount.
 *
 * The count of sends in progress is decremented unconditionally.  When it
 * reaches zero and NFS_TCP_WAIT_WRITE_DRAIN is set (nfs_reconnect() sets it
 * before sleeping on the counter), the flag is cleared and any thread
 * sleeping on the counter's address is woken.
 *
 * The caller must hold nmp->nm_mtx; this is asserted.
 */
static void
nfs_wakup_reconnectors(struct nfsmount *nmp)
{
	KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));

	/* One fewer sender is active on the socket. */
	nmp->nm_nfstcpstate.sock_send_inprog--;

	/* Other senders are still active: nothing to signal yet. */
	if (nmp->nm_nfstcpstate.sock_send_inprog != 0)
		return;

	/* Nobody has asked to be told when the senders drain. */
	if ((nmp->nm_nfstcpstate.flags & NFS_TCP_WAIT_WRITE_DRAIN) == 0)
		return;

	/* Last sender is done: clear the request and wake the sleeper(s). */
	nmp->nm_nfstcpstate.flags &= ~NFS_TCP_WAIT_WRITE_DRAIN;
	wakeup((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog);
}
/*
* Reconnect routine:
* Called when a connection is broken on a reliable protocol.
@ -478,14 +493,41 @@ nfs_reconnect(struct nfsreq *rep)
struct nfsreq *rp;
struct nfsmount *nmp = rep->r_nmp;
int error;
int slpflag = 0;
KASSERT(mtx_owned(&nmp->nm_mtx), ("NFS mnt lock not owned !"));
if (nmp->nm_flag & NFSMNT_INT)
slpflag = PCATCH;
/*
* Wait for any pending writes to this socket to drain (or timeout).
*/
while (nmp->nm_nfstcpstate.sock_send_inprog > 0) {
nmp->nm_nfstcpstate.flags |= NFS_TCP_WAIT_WRITE_DRAIN;
error = msleep((caddr_t)&nmp->nm_nfstcpstate.sock_send_inprog,
&nmp->nm_mtx, slpflag | (PZERO - 1), "nfscon", 0);
}
/*
* Grab the nfs_connect_lock to serialize connects.
* After grabbing the nfs_connect_lock, check if a reconnect is necessary or
* if someone else beat us to the connect !
*/
error = nfs_connect_lock(rep);
if (error)
goto unlock_exit;
if ((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) == 0)
goto unlock_exit;
else
mtx_unlock(&nmp->nm_mtx);
nfs_reconnects++;
nfs_disconnect(nmp);
while ((error = nfs_connect(nmp, rep)) != 0) {
if (error == ERESTART)
error = EINTR;
if (error == EIO || error == EINTR)
return (error);
if (error == EIO || error == EINTR) {
mtx_lock(&nmp->nm_mtx);
goto unlock_exit;
}
(void) tsleep(&lbolt, PSOCK, "nfscon", 0);
}
@ -516,7 +558,11 @@ nfs_reconnect(struct nfsreq *rep)
}
}
mtx_unlock(&nfs_reqq_mtx);
return (0);
mtx_lock(&nmp->nm_mtx);
unlock_exit:
nfs_connect_unlock(rep);
mtx_unlock(&nmp->nm_mtx);
return (error);
}
/*
@ -581,7 +627,7 @@ nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
mtx_unlock(&rep->r_mtx);
mtx_unlock(&rep->r_nmp->nm_mtx);
m_freem(top);
return (0);
return (EPIPE);
}
rep->r_flags &= ~R_MUSTRESEND;
soflags = rep->r_nmp->nm_soflags;
@ -644,69 +690,69 @@ nfs_reply(struct nfsreq *rep)
register struct socket *so;
register struct mbuf *m;
int error = 0, sotype, slpflag;
sotype = rep->r_nmp->nm_sotype;
struct nfsmount *nmp = rep->r_nmp;
sotype = nmp->nm_sotype;
/*
* For reliable protocols, lock against other senders/receivers
* in case a reconnect is necessary.
*/
if (sotype != SOCK_DGRAM) {
error = nfs_sndlock(rep);
if (error)
return (error);
tryagain:
mtx_lock(&rep->r_nmp->nm_mtx);
mtx_lock(&nmp->nm_mtx);
mtx_lock(&rep->r_mtx);
if (rep->r_mrep) {
mtx_unlock(&rep->r_mtx);
mtx_unlock(&rep->r_nmp->nm_mtx);
nfs_sndunlock(rep);
mtx_unlock(&nmp->nm_mtx);
return (0);
}
if (rep->r_flags & R_SOFTTERM) {
mtx_unlock(&rep->r_mtx);
mtx_unlock(&rep->r_nmp->nm_mtx);
nfs_sndunlock(rep);
mtx_unlock(&nmp->nm_mtx);
return (EINTR);
}
so = rep->r_nmp->nm_so;
so = nmp->nm_so;
if (!so ||
(rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
(nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) {
mtx_unlock(&rep->r_mtx);
mtx_unlock(&rep->r_nmp->nm_mtx);
nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
error = nfs_reconnect(rep);
if (error) {
nfs_sndunlock(rep);
if (error)
return (error);
}
goto tryagain;
}
while (rep->r_flags & R_MUSTRESEND) {
mtx_unlock(&rep->r_mtx);
mtx_unlock(&rep->r_nmp->nm_mtx);
nmp->nm_nfstcpstate.sock_send_inprog++;
mtx_unlock(&nmp->nm_mtx);
m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
nfsstats.rpcretries++;
error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
error = nfs_send(so, nmp->nm_nam, m, rep);
if (error) {
if (error == EINTR || error == ERESTART ||
(error = nfs_reconnect(rep)) != 0) {
nfs_sndunlock(rep);
mtx_lock(&nmp->nm_mtx);
nfs_wakup_reconnectors(nmp);
if (!(error == EINTR || error == ERESTART)) {
nmp->nm_nfstcpstate.flags |= NFS_TCP_FORCE_RECONNECT;
error = nfs_reconnect(rep);
} else
mtx_unlock(&nmp->nm_mtx);
if (error)
return (error);
}
goto tryagain;
}
mtx_lock(&rep->r_nmp->nm_mtx);
mtx_lock(&rep->r_mtx);
} else {
mtx_lock(&nmp->nm_mtx);
nfs_wakup_reconnectors(nmp);
mtx_lock(&rep->r_mtx);
}
}
mtx_unlock(&rep->r_nmp->nm_mtx);
mtx_unlock(&rep->r_mtx);
nfs_sndunlock(rep);
mtx_unlock(&nmp->nm_mtx);
}
slpflag = 0;
mtx_lock(&rep->r_nmp->nm_mtx);
if (rep->r_nmp->nm_flag & NFSMNT_INT)
mtx_lock(&nmp->nm_mtx);
if (nmp->nm_flag & NFSMNT_INT)
slpflag = PCATCH;
mtx_unlock(&rep->r_nmp->nm_mtx);
mtx_unlock(&nmp->nm_mtx);
mtx_lock(&rep->r_mtx);
while ((rep->r_mrep == NULL) && (error == 0) &&
((rep->r_flags & R_SOFTTERM) == 0) &&
@ -725,19 +771,16 @@ tryagain:
}
mtx_unlock(&rep->r_mtx);
if (sotype == SOCK_STREAM) {
mtx_lock(&rep->r_nmp->nm_mtx);
mtx_lock(&nmp->nm_mtx);
mtx_lock(&rep->r_mtx);
if (((rep->r_nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) ||
if (((nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT) ||
(rep->r_flags & R_MUSTRESEND))) {
mtx_unlock(&rep->r_mtx);
mtx_unlock(&rep->r_nmp->nm_mtx);
error = nfs_sndlock(rep);
if (error)
return (error);
mtx_unlock(&nmp->nm_mtx);
goto tryagain;
} else {
mtx_unlock(&rep->r_mtx);
mtx_unlock(&rep->r_nmp->nm_mtx);
mtx_unlock(&nmp->nm_mtx);
}
}
return (error);
@ -1146,28 +1189,28 @@ tryagain:
* do it now.
*/
mtx_lock(&nmp->nm_mtx);
if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
(nmp->nm_flag & NFSMNT_DUMBTIMR) ||
nmp->nm_sent < nmp->nm_cwnd)) {
if (nmp->nm_so &&
(((nmp->nm_sotype == SOCK_STREAM) && !(nmp->nm_nfstcpstate.flags & NFS_TCP_FORCE_RECONNECT)) ||
(nmp->nm_flag & NFSMNT_DUMBTIMR) || nmp->nm_sent < nmp->nm_cwnd)) {
if (nmp->nm_sotype == SOCK_STREAM)
nmp->nm_nfstcpstate.sock_send_inprog++;
mtx_unlock(&nmp->nm_mtx);
error = nfs_sndlock(rep);
if (!error) {
m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
nfs_sndunlock(rep);
}
mtx_lock(&nfs_reqq_mtx);
m2 = m_copym(m, 0, M_COPYALL, M_TRYWAIT);
error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
mtx_lock(&nmp->nm_mtx);
mtx_lock(&rep->r_mtx);
/*
* nfs_timer() could've re-transmitted the request if we ended up
* blocking on nfs_send() too long, so check for R_SENT here.
*/
if (!error && (rep->r_flags & (R_SENT | R_MUSTRESEND)) == 0) {
mtx_lock(&nmp->nm_mtx);
nmp->nm_sent += NFS_CWNDSCALE;
mtx_unlock(&nmp->nm_mtx);
rep->r_flags |= R_SENT;
}
mtx_unlock(&nfs_reqq_mtx);
mtx_unlock(&rep->r_mtx);
if (nmp->nm_sotype == SOCK_STREAM)
nfs_wakup_reconnectors(rep->r_nmp);
mtx_unlock(&nmp->nm_mtx);
} else {
mtx_unlock(&nmp->nm_mtx);
rep->r_rtt = -1;
@ -1179,10 +1222,6 @@ tryagain:
if (!error || error == EPIPE)
error = nfs_reply(rep);
/*
* RPC done, unlink the request.
*/
mtx_lock(&nfs_reqq_mtx);
/*
* nfs_timer() may be in the process of re-transmitting this request.
* nfs_timer() drops the nfs_reqq_mtx before the pru_send() (to avoid LORs).
@ -1190,23 +1229,41 @@ tryagain:
* comes back, it will be discarded (since the req struct for it no longer
* exists).
*/
wait_for_pinned_req:
mtx_lock(&rep->r_mtx);
while (rep->r_flags & R_PIN_REQ) {
msleep((caddr_t)&rep->r_flags, &nfs_reqq_mtx,
msleep((caddr_t)&rep->r_flags, &rep->r_mtx,
(PZERO - 1), "nfsrxmt", 0);
}
mtx_unlock(&rep->r_mtx);
mtx_lock(&nfs_reqq_mtx);
/* Have to check for R_PIN_REQ after grabbing wlock again */
mtx_lock(&rep->r_mtx);
if (rep->r_flags & R_PIN_REQ) {
mtx_unlock(&rep->r_mtx);
mtx_unlock(&nfs_reqq_mtx);
goto wait_for_pinned_req;
} else
mtx_unlock(&rep->r_mtx);
/* RPC done (timer not active, request not pinned), unlink the request */
TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
if (TAILQ_EMPTY(&nfs_reqq))
callout_stop(&nfs_callout);
mtx_unlock(&nfs_reqq_mtx);
/*
* Decrement the outstanding request count.
*/
mtx_lock(&rep->r_mtx);
if (rep->r_flags & R_SENT) {
rep->r_flags &= ~R_SENT; /* paranoia */
mtx_unlock(&rep->r_mtx);
mtx_lock(&nmp->nm_mtx);
nmp->nm_sent -= NFS_CWNDSCALE;
mtx_unlock(&nmp->nm_mtx);
}
mtx_unlock(&nfs_reqq_mtx);
} else
mtx_unlock(&rep->r_mtx);
/*
* If there was a successful reply and a tprintf msg.
@ -1273,11 +1330,11 @@ tryagain:
while (time_second < waituntil) {
(void) tsleep(&lbolt, PSOCK, "nqnfstry", 0);
}
mtx_lock(&nfs_reqq_mtx);
mtx_lock(&nfs_xid_mtx);
if (++nfs_xid == 0)
nfs_xid++;
rep->r_xid = *xidp = txdr_unsigned(nfs_xid);
mtx_unlock(&nfs_reqq_mtx);
mtx_unlock(&nfs_xid_mtx);
goto tryagain;
}
@ -1710,20 +1767,18 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
* in progress when a reconnect is necessary.
*/
int
nfs_sndlock(struct nfsreq *rep)
nfs_connect_lock(struct nfsreq *rep)
{
int *statep = &rep->r_nmp->nm_state;
struct thread *td;
int error, slpflag = 0, slptimeo = 0;
td = rep->r_td;
mtx_lock(&rep->r_nmp->nm_mtx);
if (rep->r_nmp->nm_flag & NFSMNT_INT)
slpflag = PCATCH;
while (*statep & NFSSTA_SNDLOCK) {
error = nfs_sigintr(rep->r_nmp, rep, td);
if (error) {
mtx_unlock(&rep->r_nmp->nm_mtx);
return (error);
}
*statep |= NFSSTA_WANTSND;
@ -1735,7 +1790,6 @@ nfs_sndlock(struct nfsreq *rep)
}
}
*statep |= NFSSTA_SNDLOCK;
mtx_unlock(&rep->r_nmp->nm_mtx);
return (0);
}
@ -1743,11 +1797,10 @@ nfs_sndlock(struct nfsreq *rep)
* Unlock the stream socket for others.
*/
void
nfs_sndunlock(struct nfsreq *rep)
nfs_connect_unlock(struct nfsreq *rep)
{
int *statep = &rep->r_nmp->nm_state;
mtx_lock(&rep->r_nmp->nm_mtx);
if ((*statep & NFSSTA_SNDLOCK) == 0)
panic("nfs sndunlock");
*statep &= ~NFSSTA_SNDLOCK;
@ -1755,7 +1808,6 @@ nfs_sndunlock(struct nfsreq *rep)
*statep &= ~NFSSTA_WANTSND;
wakeup(statep);
}
mtx_unlock(&rep->r_nmp->nm_mtx);
}
/*

View File

@ -102,6 +102,7 @@ int nfs_pbuf_freecnt = -1; /* start out unlimited */
struct nfs_reqq nfs_reqq;
struct mtx nfs_reqq_mtx;
struct nfs_bufq nfs_bufq;
struct mtx nfs_xid_mtx;
/*
* and the reverse mapping from generic to Version 2 procedure numbers
@ -187,7 +188,7 @@ nfsm_rpchead(struct ucred *cr, int nmflag, int procid, int auth_type,
*/
tl = nfsm_build(u_int32_t *, 8 * NFSX_UNSIGNED);
mtx_lock(&nfs_reqq_mtx);
mtx_lock(&nfs_xid_mtx);
/* Get a pretty random xid to start with */
if (!nfs_xid)
nfs_xid = random();
@ -199,7 +200,7 @@ nfsm_rpchead(struct ucred *cr, int nmflag, int procid, int auth_type,
*xidpp = tl;
*tl++ = txdr_unsigned(nfs_xid);
mtx_unlock(&nfs_reqq_mtx);
mtx_unlock(&nfs_xid_mtx);
*tl++ = rpc_call;
*tl++ = rpc_vers;
*tl++ = txdr_unsigned(NFS_PROG);
@ -424,6 +425,7 @@ nfs_init(struct vfsconf *vfsp)
callout_init(&nfs_callout, CALLOUT_MPSAFE);
mtx_init(&nfs_reqq_mtx, "NFS reqq lock", NULL, MTX_DEF);
mtx_init(&nfs_iod_mtx, "NFS iod lock", NULL, MTX_DEF);
mtx_init(&nfs_xid_mtx, "NFS xid lock", NULL, MTX_DEF);
nfs_pbuf_freecnt = nswbuf / 2 + 1;

View File

@ -40,7 +40,9 @@ struct nfs_tcp_mountstate {
int rpcresid;
#define NFS_TCP_EXPECT_RPCMARKER 0x0001 /* Expect to see a RPC/TCP marker next */
#define NFS_TCP_FORCE_RECONNECT 0x0002 /* Force a TCP reconnect */
#define NFS_TCP_WAIT_WRITE_DRAIN 0x0004 /* Waiting for socket writers to finish */
int flags;
int sock_send_inprog;
};
/*