From 493893bc5498f2f69e335af97663c53087b4ed89 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 17 Oct 2019 14:20:48 -0500 Subject: [PATCH] runner: fix state_lock and cfgfs use The kernel can end up taking a configfs lock then call up to userspace, so we must not have a lock that is taken in this upcall and is taken when interacting with configfs. As reported by sherlockxiao: https://github.com/open-iscsi/tcmu-runner/issues/595 this happens with the state_lock where during deletion the kernel will hold the state_lock, but some code paths will hold the state_lock while calling into configfs. This moves our configfs access out of the state_lock. --- target.c | 18 ++++-------------- target.h | 4 +++- tcmur_device.c | 15 +++++++++++++-- tcmur_device.h | 13 ++++++++++++- 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/target.c b/target.c index ecb93a87..8275338d 100644 --- a/target.c +++ b/target.c @@ -28,7 +28,7 @@ static struct list_head tpg_recovery_list = LIST_HEAD_INIT(tpg_recovery_list); /* * Locking ordering: - * rdev->state_lock + * rdev->state_lock (see tcmur_device.h for more state_lock restrictions) * tpg_recovery_lock */ static pthread_mutex_t tpg_recovery_lock = PTHREAD_MUTEX_INITIALIZER; @@ -276,32 +276,23 @@ static void *tgt_port_grp_recovery_thread_fn(void *arg) return NULL; } -int tcmu_add_dev_to_recovery_list(struct tcmu_device *dev) +int tcmu_add_dev_to_recovery_list(struct tcmu_device *dev, + struct list_head *alua_list) { struct tcmur_device *rdev = tcmu_dev_get_private(dev); - struct list_head alua_list; struct alua_grp *group; struct tgt_port_grp *tpg; struct tgt_port *port, *enabled_port = NULL; int ret; pthread_mutex_lock(&tpg_recovery_lock); - - list_head_init(&alua_list); - ret = tcmu_get_alua_grps(dev, &alua_list); - if (ret) { - /* User is deleting device so fast fail */ - tcmu_dev_warn(dev, "Could not find any tpgs.\n"); - goto done; - } - /* * This assumes a tcmu_dev is only exported though one local * enabled tpg. The kernel members file only returns * the one and runner is not passed info about which * tpg/port IO was received on. */ - list_for_each(&alua_list, group, entry) { + list_for_each(alua_list, group, entry) { list_for_each(&group->tgt_ports, port, entry) { if (port->enabled) enabled_port = port; @@ -340,7 +331,6 @@ int tcmu_add_dev_to_recovery_list(struct tcmu_device *dev) add_to_list: list_add(&tpg->devs, &rdev->recovery_entry); done: - tcmu_release_alua_grps(&alua_list); pthread_mutex_unlock(&tpg_recovery_lock); return ret; } diff --git a/target.h b/target.h index 98580e77..e01d3647 100644 --- a/target.h +++ b/target.h @@ -12,6 +12,7 @@ #include "ccan/list/list.h" struct tgt_port_grp; +struct list_head; struct tgt_port { uint16_t rel_port_id; @@ -31,6 +32,7 @@ struct tgt_port { void tcmu_free_tgt_port(struct tgt_port *port); struct tgt_port *tcmu_get_tgt_port(char *member_str); -int tcmu_add_dev_to_recovery_list(struct tcmu_device *dev); +int tcmu_add_dev_to_recovery_list(struct tcmu_device *dev, + struct list_head *alua_list); #endif diff --git a/tcmur_device.c b/tcmur_device.c index ed72654a..211c3cbb 100644 --- a/tcmur_device.c +++ b/tcmur_device.c @@ -179,6 +179,16 @@ void tcmu_cancel_recovery(struct tcmu_device *dev) void tcmu_notify_conn_lost(struct tcmu_device *dev) { struct tcmur_device *rdev = tcmu_dev_get_private(dev); + struct list_head alua_list; + int ret; + + list_head_init(&alua_list); + ret = tcmu_get_alua_grps(dev, &alua_list); + if (ret) { + /* User is deleting device so fast fail */ + tcmu_dev_warn(dev, "Could not find any tpgs.\n"); + return; + } pthread_mutex_lock(&rdev->state_lock); @@ -200,10 +210,11 @@ void tcmu_notify_conn_lost(struct tcmu_device *dev) tcmu_dev_err(dev, "Handler connection lost (lock state %d)\n", rdev->lock_state); - if (!tcmu_add_dev_to_recovery_list(dev)) + if (!tcmu_add_dev_to_recovery_list(dev, &alua_list)) rdev->flags |= TCMUR_DEV_FLAG_IN_RECOVERY; unlock: pthread_mutex_unlock(&rdev->state_lock); + tcmu_release_alua_grps(&alua_list); } /** @@ -439,11 +450,11 @@ int tcmu_acquire_dev_lock(struct tcmu_device *dev, uint16_t tag) tcmu_dev_info(dev, "Lock acquisition %s\n", rdev->lock == TCMUR_DEV_LOCK_LOCKED ? "successful" : "unsuccessful"); - tcmu_cfgfs_dev_exec_action(dev, "block_dev", 0); pthread_cond_signal(&rdev->lock_cond); pthread_mutex_unlock(&rdev->state_lock); + tcmu_cfgfs_dev_exec_action(dev, "block_dev", 0); return ret; } diff --git a/tcmur_device.h b/tcmur_device.h index 13ccfd3a..b6b039fa 100644 --- a/tcmur_device.h +++ b/tcmur_device.h @@ -56,7 +56,18 @@ struct tcmur_device { pthread_t lock_thread; pthread_cond_t lock_cond; - /* General lock for lock state, thread, dev state, etc */ + /* + * General lock for lock state, thread, dev state, etc. + * + * Locking order: + * 1. Kernel configfs lock + * 2. state_lock + * 3. tpg_recovery_lock + * + * On deletion the kernel will grab the configfs lock then call into + * userspace, so we must not hold the state_lock then perform a configfs + * operation. + * */ pthread_mutex_t state_lock; int pending_uas;