ocfs2: special case recovery lock in dlmlock_remote()
authorKurt Hackel <kurt.hackel@oracle.com>
Mon, 1 May 2006 20:47:50 +0000 (13:47 -0700)
committerMark Fasheh <mark.fasheh@oracle.com>
Mon, 26 Jun 2006 21:43:08 +0000 (14:43 -0700)
If the previous master of the recovery lock dies, let calc_usage take it
down completely and let the caller completely redo the dlmlock() call.
Otherwise, there will never be an opportunity to re-master the lockres and
recovery won't be able to progress.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
fs/ocfs2/dlm/dlmlock.c
fs/ocfs2/dlm/dlmrecovery.c

index 0ff934874942a423e19478331423408017db0dc3..20b38dc18736f49a46cf7e511c5d6e9a9dacd0d0 100644 (file)
@@ -227,7 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
        res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
        lock->lock_pending = 0;
        if (status != DLM_NORMAL) {
-               if (status != DLM_NOTQUEUED) {
+               if (status == DLM_RECOVERING &&
+                   dlm_is_recovery_lock(res->lockname.name,
+                                        res->lockname.len)) {
+                       /* recovery lock was mastered by dead node.
+                        * we need to have calc_usage shoot down this
+                        * lockres and completely remaster it. */
+                       mlog(0, "%s: recovery lock was owned by "
+                            "dead node %u, remaster it now.\n",
+                            dlm->name, res->owner);
+               } else if (status != DLM_NOTQUEUED) {
                        /*
                         * DO NOT call calc_usage, as this would unhash
                         * the remote lockres before we ever get to use
@@ -691,18 +700,22 @@ retry_lock:
                        msleep(100);
                        /* no waiting for dlm_reco_thread */
                        if (recovery) {
-                               if (status == DLM_RECOVERING) {
-                                       mlog(0, "%s: got RECOVERING "
-                                            "for $REOCVERY lock, master "
-                                            "was %u\n", dlm->name, 
-                                            res->owner);
-                                       dlm_wait_for_node_death(dlm, res->owner, 
-                                                       DLM_NODE_DEATH_WAIT_MAX);
-                               }
+                               if (status != DLM_RECOVERING)
+                                       goto retry_lock;
+
+                               mlog(0, "%s: got RECOVERING "
+                                    "for $RECOVERY lock, master "
+                                    "was %u\n", dlm->name,
+                                    res->owner);
+                               /* wait to see the node go down, then
+                                * drop down and allow the lockres to
+                                * get cleaned up.  need to remaster. */
+                               dlm_wait_for_node_death(dlm, res->owner,
+                                               DLM_NODE_DEATH_WAIT_MAX);
                        } else {
                                dlm_wait_for_recovery(dlm);
+                               goto retry_lock;
                        }
-                       goto retry_lock;
                }
 
                if (status != DLM_NORMAL) {
index 86199f66eb5651ab3502755493e2b9fa15a89541..00209f4a29169b834a3db1f1a55ead88ad885936 100644 (file)
@@ -2314,6 +2314,10 @@ again:
                mlog(0, "%s: reco master %u is ready to recover %u\n",
                     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
                status = -EEXIST;
+       } else if (ret == DLM_RECOVERING) {
+               mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+                    dlm->name, dlm->node_num);
+               goto again;
        } else {
                struct dlm_lock_resource *res;