nvme-fc: change controllers first connect to use reconnect path
authorJames Smart <jsmart2021@gmail.com>
Wed, 13 Jun 2018 21:07:37 +0000 (14:07 -0700)
committerChristoph Hellwig <hch@lst.de>
Thu, 14 Jun 2018 12:25:09 +0000 (14:25 +0200)
Current code follows the framework that has been in the transports
from the beginning where initial link-side controller connect occurs
as part of "creating the controller". Thus that first connect fully
talks to the controller and obtains values that can then be used
for blk-mq setup, etc. It also means that everything about the
controller is fully known before the "create controller" call returns.

This has several weaknesses:
- The initial create_ctrl call made by the cli will block for a long
  time as wire transactions are performed synchronously. This delay
  becomes longer if errors occur or connectivity is lost and retries
  need to be performed.
- Code wise, it means there is a separate connect path for initial
  controller connect vs the (same) steps used in the reconnect path.
- And as there are separate paths, it means there's separate error
  handling and retry logic. It also plays havoc with the NEW state
  (should transition out of it after successful initial connect) vs
  the RESETTING and CONNECTING (reconnect) states that want to be
  transitioned to on error.
- As there are separate paths, to recover from errors and disruptions,
  it requires separate recovery/retry paths as well and can severely
  convolute the controller state.

This patch reworks the fc transport to use the same connect paths
for the initial connection as it uses for reconnect. This makes a
single path for error recovery and handling.

This patch:
- Removes the driving of the initial connect and replaces it with
  a state transition to CONNECTING and initiating the reconnect
  thread. A dummy state transition of RESETTING had to be traversed
  as a direct transition of NEW->CONNECTING is not allowed. Given
  that the controller is "new", the RESETTING transition is a simple
  no-op. Once in the reconnecting thread, the normal behaviors of
  ctrl_loss_tmo (max_retries * connect_delay) and dev_loss_tmo will
  apply before the controller is torn down.
- Only if the state transitions couldn't be traversed or the
  reconnect thread couldn't be scheduled will the controller be torn
  down while in create_ctrl.
- The prior code used the controller state of NEW to indicate
  whether request queues had been initialized or not. For the admin
  queue, the request queue is always created, so there's no need to
  check a state. For IO queues, change to tracking whether a successful
  io request queue create has occurred (e.g. 1st successful connect).
- The initial controller id is initialized to the dynamic controller
  id used in the initial connect message. It will be overwritten by
  the real controller id once the controller is connected on the wire.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
drivers/nvme/host/fc.c

index 0bad65803271ff68bc883e0dd16c78b8386fabf8..9d826b726425d11231db947bcb2d9387fd07caa5 100644 (file)
@@ -142,6 +142,7 @@ struct nvme_fc_ctrl {
        struct nvme_fc_rport    *rport;
        u32                     cnum;
 
+       bool                    ioq_live;
        bool                    assoc_active;
        u64                     association_id;
 
@@ -2463,6 +2464,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
        if (ret)
                goto out_delete_hw_queues;
 
+       ctrl->ioq_live = true;
+
        return 0;
 
 out_delete_hw_queues:
@@ -2615,8 +2618,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
        if (ret)
                goto out_delete_hw_queue;
 
-       if (ctrl->ctrl.state != NVME_CTRL_NEW)
-               blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 
        ret = nvmf_connect_admin_queue(&ctrl->ctrl);
        if (ret)
@@ -2689,7 +2691,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
         */
 
        if (ctrl->ctrl.queue_count > 1) {
-               if (ctrl->ctrl.state == NVME_CTRL_NEW)
+               if (!ctrl->ioq_live)
                        ret = nvme_fc_create_io_queues(ctrl);
                else
                        ret = nvme_fc_reinit_io_queues(ctrl);
@@ -2776,8 +2778,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
         * use blk_mq_tagset_busy_itr() and the transport routine to
         * terminate the exchanges.
         */
-       if (ctrl->ctrl.state != NVME_CTRL_NEW)
-               blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_fc_terminate_exchange, &ctrl->ctrl);
 
@@ -2934,7 +2935,7 @@ nvme_fc_connect_ctrl_work(struct work_struct *work)
                nvme_fc_reconnect_or_delete(ctrl, ret);
        else
                dev_info(ctrl->ctrl.device,
-                       "NVME-FC{%d}: controller reconnect complete\n",
+                       "NVME-FC{%d}: controller connect complete\n",
                        ctrl->cnum);
 }
 
@@ -2982,7 +2983,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 {
        struct nvme_fc_ctrl *ctrl;
        unsigned long flags;
-       int ret, idx, retry;
+       int ret, idx;
 
        if (!(rport->remoteport.port_role &
            (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) {
@@ -3009,11 +3010,13 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
        }
 
        ctrl->ctrl.opts = opts;
+       ctrl->ctrl.nr_reconnects = 0;
        INIT_LIST_HEAD(&ctrl->ctrl_list);
        ctrl->lport = lport;
        ctrl->rport = rport;
        ctrl->dev = lport->dev;
        ctrl->cnum = idx;
+       ctrl->ioq_live = false;
        ctrl->assoc_active = false;
        init_waitqueue_head(&ctrl->ioabort_wait);
 
@@ -3032,6 +3035,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
        ctrl->ctrl.sqsize = opts->queue_size - 1;
        ctrl->ctrl.kato = opts->kato;
+       ctrl->ctrl.cntlid = 0xffff;
 
        ret = -ENOMEM;
        ctrl->queues = kcalloc(ctrl->ctrl.queue_count,
@@ -3081,62 +3085,24 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
        list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
        spin_unlock_irqrestore(&rport->lock, flags);
 
-       /*
-        * It's possible that transactions used to create the association
-        * may fail. Examples: CreateAssociation LS or CreateIOConnection
-        * LS gets dropped/corrupted/fails; or a frame gets dropped or a
-        * command times out for one of the actions to init the controller
-        * (Connect, Get/Set_Property, Set_Features, etc). Many of these
-        * transport errors (frame drop, LS failure) inherently must kill
-        * the association. The transport is coded so that any command used
-        * to create the association (prior to a LIVE state transition
-        * while NEW or CONNECTING) will fail if it completes in error or
-        * times out.
-        *
-        * As such: as the connect request was mostly likely due to a
-        * udev event that discovered the remote port, meaning there is
-        * not an admin or script there to restart if the connect
-        * request fails, retry the initial connection creation up to
-        * three times before giving up and declaring failure.
-        */
-       for (retry = 0; retry < 3; retry++) {
-               ret = nvme_fc_create_association(ctrl);
-               if (!ret)
-                       break;
-       }
-
-       if (ret) {
-               nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
-               cancel_work_sync(&ctrl->ctrl.reset_work);
-               cancel_delayed_work_sync(&ctrl->connect_work);
-
-               /* couldn't schedule retry - fail out */
+       if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) ||
+           !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
                dev_err(ctrl->ctrl.device,
-                       "NVME-FC{%d}: Connect retry failed\n", ctrl->cnum);
-
-               ctrl->ctrl.opts = NULL;
+                       "NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum);
+               goto fail_ctrl;
+       }
 
-               /* initiate nvme ctrl ref counting teardown */
-               nvme_uninit_ctrl(&ctrl->ctrl);
+       nvme_get_ctrl(&ctrl->ctrl);
 
-               /* Remove core ctrl ref. */
+       if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) {
                nvme_put_ctrl(&ctrl->ctrl);
-
-               /* as we're past the point where we transition to the ref
-                * counting teardown path, if we return a bad pointer here,
-                * the calling routine, thinking it's prior to the
-                * transition, will do an rport put. Since the teardown
-                * path also does a rport put, we do an extra get here to
-                * so proper order/teardown happens.
-                */
-               nvme_fc_rport_get(rport);
-
-               if (ret > 0)
-                       ret = -EIO;
-               return ERR_PTR(ret);
+               dev_err(ctrl->ctrl.device,
+                       "NVME-FC{%d}: failed to schedule initial connect\n",
+                       ctrl->cnum);
+               goto fail_ctrl;
        }
 
-       nvme_get_ctrl(&ctrl->ctrl);
+       flush_delayed_work(&ctrl->connect_work);
 
        dev_info(ctrl->ctrl.device,
                "NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
@@ -3144,6 +3110,30 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
        return &ctrl->ctrl;
 
+fail_ctrl:
+       nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
+       cancel_work_sync(&ctrl->ctrl.reset_work);
+       cancel_delayed_work_sync(&ctrl->connect_work);
+
+       ctrl->ctrl.opts = NULL;
+
+       /* initiate nvme ctrl ref counting teardown */
+       nvme_uninit_ctrl(&ctrl->ctrl);
+
+       /* Remove core ctrl ref. */
+       nvme_put_ctrl(&ctrl->ctrl);
+
+       /* as we're past the point where we transition to the ref
+        * counting teardown path, if we return a bad pointer here,
+        * the calling routine, thinking it's prior to the
+        * transition, will do an rport put. Since the teardown
+        * path also does a rport put, we do an extra get here to
+        * so proper order/teardown happens.
+        */
+       nvme_fc_rport_get(rport);
+
+       return ERR_PTR(-EIO);
+
 out_cleanup_admin_q:
        blk_cleanup_queue(ctrl->ctrl.admin_q);
 out_free_admin_tag_set: