From 384c6c330da9494bc91dd41d56461dd0e7286d92 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 20 Feb 2025 12:01:59 -0800 Subject: [PATCH 1/2] zpool: Allow lockless zpool status Add the following flags to `zpool status`: --lockless: Try for a short amount of time to get the spa_namespace lock. If that doesn't work, then do the zpool status locklessly. This is dangerous and can crash your system if the pool's configs are being modified while zpool status is running. This flag requires the zfs_allow_lockless_zpool_status module param be set to 1 in order to run. --trylock: Try for a short amount of time to get the spa_namespace lock. If that doesn't work then simply abort 'zpool status'. These commands allow users to view the zpool status when the pool gets stuck while holding the spa_namespace lock. Signed-off-by: Tony Hutter --- cmd/zdb/zdb.c | 3 +- cmd/zpool/zpool_iter.c | 7 +- cmd/zpool/zpool_main.c | 37 ++++- include/libzfs.h | 2 + include/os/freebsd/spl/sys/mutex.h | 15 ++ include/os/linux/spl/sys/mutex.h | 16 ++ include/sys/fs/zfs.h | 27 ++++ include/sys/spa.h | 12 +- include/sys/spa_impl.h | 2 + include/sys/zfs_context.h | 1 + lib/libzfs/libzfs_config.c | 33 +++- lib/libzfs/libzfs_impl.h | 2 + lib/libzfs/libzfs_pool.c | 8 +- lib/libzfs/libzfs_util.c | 12 ++ lib/libzpool/kernel.c | 16 ++ lib/libzpool/util.c | 3 +- man/man4/zfs.4 | 41 ++++- man/man8/zpool-status.8 | 36 ++++- module/zfs/spa.c | 145 +++++++++++++++++- module/zfs/spa_config.c | 105 +++++++++++-- module/zfs/spa_misc.c | 53 ++++--- module/zfs/zfs_ioctl.c | 120 ++++++++++++++- tests/runfiles/common.run | 2 +- tests/zfs-tests/include/libtest.shlib | 10 ++ tests/zfs-tests/include/tunables.cfg | 3 + tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_status/zpool_status_lockless.ksh | 92 +++++++++++ 27 files changed, 736 insertions(+), 68 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 
45eb9c783659..9a3e45d29ac0 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -7723,7 +7723,8 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) if (cfg == NULL) { zdb_set_skip_mmp(poolname); - error = spa_get_stats(poolname, &cfg, NULL, 0); + error = spa_get_stats(poolname, &cfg, NULL, 0, + ZPOOL_LOCK_BEHAVIOR_DEFAULT); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 2ec189b98653..fa4f5769b730 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -118,6 +118,7 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, boolean_t literal, int *err) { zpool_list_t *zlp; + int rc; zlp = safe_malloc(sizeof (zpool_list_t)); @@ -137,7 +138,11 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, zlp->zl_literal = literal; if (argc == 0) { - (void) zpool_iter(g_zfs, add_pool, zlp); + rc = zpool_iter(g_zfs, add_pool, zlp); + if (rc != 0) { + free(zlp); + return (NULL); + } zlp->zl_findall = B_TRUE; } else { int i; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index f2c2c3f41a7c..6d9a52799a3f 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -146,7 +146,9 @@ enum zpool_options { ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH, ZPOOL_OPTION_POOL_KEY_GUID, ZPOOL_OPTION_JSON_NUMS_AS_INT, - ZPOOL_OPTION_JSON_FLAT_VDEVS + ZPOOL_OPTION_JSON_FLAT_VDEVS, + ZPOOL_OPTION_LOCKLESS, + ZPOOL_OPTION_TRYLOCK, }; /* @@ -490,8 +492,8 @@ get_usage(zpool_help_t idx) case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: - return (gettext("\tlist [-gHLpPv] [-o property[,...]] [-j " - "[--json-int, --json-pool-key-guid]] ...\n" + return (gettext("\tlist [-gHLpPv] [-o property[,...]] " + "[-j [--json-int, --json-pool-key-guid]] ...\n" "\t [-T d|u] [pool] [interval [count]]\n")); case HELP_PREFETCH: return (gettext("\tprefetch -t [] \n" @@ 
-521,8 +523,8 @@ get_usage(zpool_help_t idx) return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: - return (gettext("\tstatus [--power] [-j [--json-int, " - "--json-flat-vdevs, ...\n" + return (gettext("\tstatus [--power] [--lockless|--trylock] " + "[-j [--json-int, --json-flat-vdevs, ...\n" "\t --json-pool-key-guid]] [-c [script1,script2,...]] " "[-dDegiLpPstvx] ...\n" "\t [-T d|u] [pool] [interval [count]]\n")); @@ -2614,6 +2616,9 @@ typedef struct status_cbdata { nvlist_t *cb_jsobj; boolean_t cb_json_as_int; boolean_t cb_json_pool_key_guid; + boolean_t cb_lockless; + boolean_t cb_trylock; + } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ @@ -11002,6 +11007,8 @@ status_callback(zpool_handle_t *zhp, void *data) * --json-int Display numbers in inteeger format instead of string * --json-flat-vdevs Display vdevs in flat hierarchy * --json-pool-key-guid Use pool GUID as key for pool objects + * --lockless No locks + * --trylock Try to get namespace lock, but abort if not * * Describes the health status of all pools or some subset. 
*/ @@ -11024,6 +11031,10 @@ zpool_do_status(int argc, char **argv) ZPOOL_OPTION_JSON_FLAT_VDEVS}, {"json-pool-key-guid", no_argument, NULL, ZPOOL_OPTION_POOL_KEY_GUID}, + {"lockless", no_argument, NULL, + ZPOOL_OPTION_LOCKLESS}, + {"trylock", no_argument, NULL, + ZPOOL_OPTION_TRYLOCK}, {0, 0, 0, 0} }; @@ -11111,6 +11122,12 @@ zpool_do_status(int argc, char **argv) case ZPOOL_OPTION_POOL_KEY_GUID: cb.cb_json_pool_key_guid = B_TRUE; break; + case ZPOOL_OPTION_LOCKLESS: + cb.cb_lockless = B_TRUE; + break; + case ZPOOL_OPTION_TRYLOCK: + cb.cb_trylock = B_TRUE; + break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); @@ -11152,6 +11169,16 @@ zpool_do_status(int argc, char **argv) usage(B_FALSE); } + if (cb.cb_lockless && cb.cb_trylock) { + (void) fprintf(stderr, gettext("cannot pass both --lockless and" + " --trylock\n")); + usage(B_FALSE); + } else if (cb.cb_lockless) { + libzfs_set_lock_behavior(g_zfs, ZPOOL_LOCK_BEHAVIOR_LOCKLESS); + } else if (cb.cb_trylock) { + libzfs_set_lock_behavior(g_zfs, ZPOOL_LOCK_BEHAVIOR_TRYLOCK); + } + for (;;) { if (cb.cb_json) { cb.cb_jsobj = zpool_json_schema(0, 1); diff --git a/include/libzfs.h b/include/libzfs.h index 8774d490f74b..6feaef75ad28 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -264,6 +264,8 @@ _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); _LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift); +_LIBZFS_H void libzfs_set_lock_behavior(libzfs_handle_t *, + zpool_lock_behavior_t); typedef struct splitflags { /* do not split, but return the config that would be split off */ diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h index 546a4681a291..f91831fcefde 100644 --- a/include/os/freebsd/spl/sys/mutex.h +++ b/include/os/freebsd/spl/sys/mutex.h @@ -72,4 +72,19 @@ typedef enum { #define mutex_owned(lock) sx_xlocked(lock) #define 
mutex_owner(lock) sx_xholder(lock) +/* + * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex + * for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ETIME + * if timeout occurred. + */ +static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns) +{ + hrtime_t end = gethrtime() + ns; + while (gethrtime() < end) { + if (mutex_tryenter(mutex)) + return (0); /* success */ + } + return (ETIME); +} + #endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/include/os/linux/spl/sys/mutex.h b/include/os/linux/spl/sys/mutex.h index f000f53ab9b6..5f1faa6a95ce 100644 --- a/include/os/linux/spl/sys/mutex.h +++ b/include/os/linux/spl/sys/mutex.h @@ -26,6 +26,7 @@ #define _SPL_MUTEX_H #include +#include #include #include #include @@ -187,4 +188,19 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ /* NOTE: do not dereference mp after this point */ \ } +/* + * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex + * for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ETIME + * if timeout occurred. + */ +static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns) +{ + hrtime_t end = gethrtime() + ns; + while (gethrtime() < end) { + if (mutex_tryenter(mutex)) + return (0); /* success */ + } + return (ETIME); +} + #endif /* _SPL_MUTEX_H */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 2d27aee217e0..744575b9913b 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -866,6 +866,9 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" #define ZPOOL_CONFIG_COMPATIBILITY "compatibility" +/* ZFS_IOC_POOL_STATS argument to for spa_namespace locking behavior */ +#define ZPOOL_CONFIG_LOCK_BEHAVIOR "lock_behavior" /* not stored on disk */ + /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. 
This is because a device can be in multiple states, such @@ -1978,6 +1981,30 @@ enum zio_encrypt { ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name)) +/* + * Set locking behavior for zpool commands. + */ +typedef enum { + /* Wait to acquire the lock on the zpool config */ + ZPOOL_LOCK_BEHAVIOR_WAIT = 0, + ZPOOL_LOCK_BEHAVIOR_DEFAULT = ZPOOL_LOCK_BEHAVIOR_WAIT, + /* + * Return an error if it's taking an unnecessarily long time to + * acquire the lock on the pool config (default 100ms) + */ + ZPOOL_LOCK_BEHAVIOR_TRYLOCK = 1, + + /* + * DANGER: THIS CAN CRASH YOUR SYSTEM + * + * If you can't acquire the pool config lock after 100ms then do a + * a lockless lookup. This should only be done in emergencies, as it + * can crash the kernel module! + */ + ZPOOL_LOCK_BEHAVIOR_LOCKLESS = 2, + ZPOOL_LOCK_BEHAVIOR_END = 3 /* last entry marker */ +} zpool_lock_behavior_t; + #ifdef __cplusplus } #endif diff --git a/include/sys/spa.h b/include/sys/spa.h index 439cd461b710..784c90510137 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -742,10 +742,13 @@ typedef enum trim_type { /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, const void *tag); +extern int spa_open_common_lock_behavior(const char *pool, spa_t **spapp, + const void *tag, nvlist_t *nvpolicy, nvlist_t **config, + zpool_lock_behavior_t zpool_lock_behavior); extern int spa_open_rewind(const char *pool, spa_t **, const void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, - size_t buflen); + size_t buflen, zpool_lock_behavior_t zpool_lock_behavior); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, @@ -850,10 +853,13 @@ extern kcondvar_t spa_namespace_cv; extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, 
boolean_t); extern void spa_config_load(void); -extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); +extern int spa_all_configs(uint64_t *generation, nvlist_t **pools, + zpool_lock_behavior_t zpool_lock_behavior); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); +extern nvlist_t *spa_config_generate_lock_behavior(spa_t *spa, vdev_t *vd, + uint64_t txg, int getstats, zpool_lock_behavior_t zpool_lock_behavior); extern void spa_config_update(spa_t *spa, int what); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); @@ -865,9 +871,11 @@ extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_lookup_lockless(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); +extern spa_t *spa_next_lockless(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, const void *tag); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 8c52f751a819..5152ddc6852c 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -491,6 +491,8 @@ extern void spa_set_deadman_synctime(hrtime_t ns); extern void spa_set_deadman_ziotime(hrtime_t ns); extern const char *spa_history_zone(void); extern const char *zfs_active_allocator; +extern int zfs_allow_lockless_zpool_status; +extern unsigned int spa_namespace_trylock_ms; extern int param_set_active_allocator_common(const char *val); #ifdef __cplusplus diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 549f54c09383..5efe26d94dcf 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -277,6 +277,7 @@ extern void mutex_enter(kmutex_t *mp); extern int mutex_enter_check_return(kmutex_t 
*mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); +extern int mutex_enter_timeout(kmutex_t *mp, uint64_t ns); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) diff --git a/lib/libzfs/libzfs_config.c b/lib/libzfs/libzfs_config.c index 0d2102191389..8a927e6e5b05 100644 --- a/lib/libzfs/libzfs_config.c +++ b/lib/libzfs/libzfs_config.c @@ -109,6 +109,7 @@ namespace_reload(libzfs_handle_t *hdl) nvpair_t *elem; zfs_cmd_t zc = {"\0"}; void *cookie; + nvlist_t *nv_args = NULL; if (hdl->libzfs_ns_gen == 0) { /* @@ -129,21 +130,35 @@ namespace_reload(libzfs_handle_t *hdl) zcmd_alloc_dst_nvlist(hdl, &zc, 0); + + VERIFY0(nvlist_alloc(&nv_args, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_uint64(nv_args, ZPOOL_CONFIG_LOCK_BEHAVIOR, + hdl->zpool_lock_behavior)); + zcmd_write_src_nvlist(hdl, &zc, nv_args); + for (;;) { zc.zc_cookie = hdl->libzfs_ns_gen; if (zfs_ioctl(hdl, ZFS_IOC_POOL_CONFIGS, &zc) != 0) { switch (errno) { + case EAGAIN: + return (zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, "Unable to get the " + "pool config lock in a reasonable amount of" + " time"))); case EEXIST: /* * The namespace hasn't changed. 
*/ zcmd_free_nvlists(&zc); return (0); - case ENOMEM: zcmd_expand_dst_nvlist(hdl, &zc); break; - + case ENOTSUP: + return (zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, "--lockless requires" + " module parameter " + "zfs_allow_lockless_zpool_status=1"))); default: zcmd_free_nvlists(&zc); return (zfs_standard_error(hdl, errno, @@ -253,6 +268,7 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) int error; nvlist_t *config; libzfs_handle_t *hdl = zhp->zpool_hdl; + nvlist_t *nv_args = NULL; *missing = B_FALSE; (void) strcpy(zc.zc_name, zhp->zpool_name); @@ -262,9 +278,15 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size); + VERIFY0(nvlist_alloc(&nv_args, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_uint64(nv_args, ZPOOL_CONFIG_LOCK_BEHAVIOR, + hdl->zpool_lock_behavior)); + zcmd_write_src_nvlist(hdl, &zc, nv_args); + for (;;) { - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS, - &zc) == 0) { + int rc; + rc = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS, &zc); + if (rc == 0) { /* * The real error is returned in the zc_cookie field. 
*/ @@ -276,7 +298,8 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) zcmd_expand_dst_nvlist(hdl, &zc); else { zcmd_free_nvlists(&zc); - if (errno == ENOENT || errno == EINVAL) + + if (errno == ENOENT || errno == EINVAL || rc) *missing = B_TRUE; zhp->zpool_state = POOL_STATE_UNAVAIL; return (0); diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h index 9053740ec2f0..cbc36eb871ec 100644 --- a/lib/libzfs/libzfs_impl.h +++ b/lib/libzfs/libzfs_impl.h @@ -73,6 +73,8 @@ struct libzfs_handle { uint64_t libzfs_max_nvlist; void *libfetch; char *libfetch_load_error; + + zpool_lock_behavior_t zpool_lock_behavior; }; struct zfs_handle { diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 06d2c3543e3d..4c9bff31095b 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -80,14 +80,14 @@ zpool_get_all_props(zpool_handle_t *zhp) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + nvlist_t *innvl = fnvlist_alloc(); if (zhp->zpool_n_propnames > 0) { - nvlist_t *innvl = fnvlist_alloc(); fnvlist_add_string_array(innvl, ZPOOL_GET_PROPS_NAMES, zhp->zpool_propnames, zhp->zpool_n_propnames); - zcmd_write_src_nvlist(hdl, &zc, innvl); } - - zcmd_alloc_dst_nvlist(hdl, &zc, 0); + VERIFY0(nvlist_add_uint64(innvl, ZPOOL_CONFIG_LOCK_BEHAVIOR, + hdl->zpool_lock_behavior)); + zcmd_write_src_nvlist(hdl, &zc, innvl); while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { if (errno == ENOMEM) diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index e450d94add87..f9972f6267e1 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -2449,3 +2449,15 @@ zpool_prepare_and_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, rc = zpool_label_disk(hdl, zhp, name); return (rc); } + +/* + * Set spa_namespace locking behavior for zpools. For example you may want to + * error out if you can't acquire the lock, or skip the lock entirely (which + * can be dangerous). 
+ */ +void +libzfs_set_lock_behavior(libzfs_handle_t *hdl, + zpool_lock_behavior_t zpool_lock_behavior) +{ + hdl->zpool_lock_behavior = zpool_lock_behavior; +} diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 653380149a9e..0461bccbf8a7 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -238,6 +238,22 @@ mutex_exit(kmutex_t *mp) VERIFY0(pthread_mutex_unlock(&mp->m_lock)); } +/* + * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex + * for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ENOLCK + * if timeout occurred. + */ +int +mutex_enter_timeout(kmutex_t *mutex, uint64_t ns) +{ + hrtime_t end = gethrtime() + ns; + while (gethrtime() < end) { + if (mutex_tryenter(mutex)) + return (0); /* success */ + } + return (ENOLCK); +} + /* * ========================================================================= * rwlocks diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c index a297daedbd4d..b2b11f32af67 100644 --- a/lib/libzpool/util.c +++ b/lib/libzpool/util.c @@ -137,7 +137,8 @@ show_pool_stats(spa_t *spa) nvlist_t *config, *nvroot; const char *name; - VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0); + VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0, + ZPOOL_LOCK_BEHAVIOR_DEFAULT) == 0); VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index f5881d7faf59..330d4a9b6532 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2673,8 +2673,47 @@ Defines zvol block devices behavior when .It Sy zvol_enforce_quotas Ns = Ns Sy 0 Ns | Ns 1 Pq uint Enable strict ZVOL quota enforcement. The strict quota enforcement may have a performance impact. +.It Sy zfs_debug_namespace_lock_delay +WARNING, DEBUG USE ONLY! +If set, add an artificial +.Sy zfs_debug_namespace_lock_delay +milliseconds delay after taking the spa_namespace lock. +This is used to simulate a hung pool. 
+If +.Sy zfs_debug_namespace_lock_delay +is set to -1 then crash the kernel on the next spa_namespace lock hold. +If set to 0 (default) then take no action. +This option is used by the test suite. +.It Sy zfs_allow_lockless_zpool_status +WARNING, DEBUG USE ONLY! +Allow the use of +.Sy zpool status --lockless +by the user. +.Sy zpool status --lockless +is useful in emergency situations when you really want to see the status +of a pool that's locked up. +WARNING: This can crash your system if the pool configuration is being updated +while +.Sy zpool status --lockless +is being run. +Set to 1 to enable, or 0 to disable (default). +.It Sy spa_namespace_trylock_ms +If the user does a +.Sy zpool status --trylock|--lockless +then try for this many milliseconds to get the spa_namespace lock before +giving up. +This is the internal lock that synchronizes changes to the pool configuration. +Note that +.Sy zpool status +will try to get the lock in multiple places (like getting the pool config, +stats, props) before returning. +That means the total wall clock time for +.Sy zpool status --trylock|--lockless +could be a multiple of +.Sy spa_namespace_trylock_ms +if the lock is already held. +Default is 100 (milliseconds). .El -. .Sh ZFS I/O SCHEDULER ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations. The scheduler determines when and in what order those operations are issued. diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 10d7ff973788..8ddf5e8074c6 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -38,11 +38,13 @@ .Nm zpool .Cm status .Op Fl dDegiLpPstvx +.Op Fl --lockless|--trylock +.Op Fl --power +.Op Fl j Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … .Op Ar interval Op Ar count -.Op Fl j Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid . 
.Sh DESCRIPTION Displays the detailed health status for the given pools. @@ -61,6 +63,38 @@ the other workloads on the system can change. .Bl -tag -width Ds .It Fl -power Display vdev enclosure slot power status (on or off). +.It Fl -lockless +If +.Sy zpool status +is unable to acquire the pool config lock (a.k.a. the +spa_namespace lock) in a reasonable amount of time, then proceed without the +lock. +This is useful in emergency situations when you really want to see the status +of a pool that's locked up. +WARNING: This can crash your system if the pool configuration is being updated +while +.Sy zpool status --lockless +is being run. +As an additional safety measure, +.Sy --lockless +requires the +.Sy zfs_allow_lockless_zpool_status +module param be set to 1 in order to work. +This prevents unprivileged users who have delegated access to +.Sy zpool status +from potentially crashing the system. +.It Fl -trylock +When +.Sy zpool status +runs, it will try to acquire the pool config lock within the +kernel module (the spa_namespace lock). +However, there can be times when a pool gets locked up while holding this lock, +leading to subsequent zpool status invocations getting hung. +The +.Sy --trylock +option gets around this by attempting to take the pool config lock +for a reasonable amount of time, and then aborting if it's unable to take the +lock. .It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … Run a script (or scripts) on each vdev and include the output as a new column in the diff --git a/module/zfs/spa.c b/module/zfs/spa.c index cbe90303072f..8fecb5330054 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -310,6 +310,38 @@ static int zfs_livelist_condense_zthr_cancel = 0; */ static int zfs_livelist_condense_new_alloc = 0; + +/* + * WARNING, FOR TEST USE ONLY! + * + * Wait zfs_debug_namespace_lock_delay milliseconds after taking the + * spa_namespace_lock to simulate lock contention. + * If set to -1, then *panic* to simulate a pool failure. 
+ * If set to 0 then take no action (default) + */ +static int zfs_debug_namespace_lock_delay = 0; + +/* + * Set to 1 to allow 'zpool status --lockless'. This is an extra safety to + * make sure you can only run with --lockless when you know what you're doing. + * Running locklessly while the pool configuration is changing can cause kernel + * panics. + * + * This safety also means that un-privileged users who have been delegated + * 'zpool status' rights, cannot run --lockless without the root user setting + * zfs_allow_lockless_zpool_status=1 first. + */ +int zfs_allow_lockless_zpool_status = 0; + +/* + * When zpool_lock_behavior is ZPOOL_LOCK_BEHAVIOR_TRY, then try to acquire + * the spa_namespace_ lock for this many milliseconds before giving up. + * + * Note that 'zpool status' will try to get the lock in multiple places (like + * getting the pool config, stats, props) before returning. + */ +unsigned int spa_namespace_trylock_ms = 100; + /* * ========================================================================== * SPA properties routines @@ -5794,8 +5826,9 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, * ambiguous state. 
*/ static int -spa_open_common(const char *pool, spa_t **spapp, const void *tag, - nvlist_t *nvpolicy, nvlist_t **config) +spa_open_common_impl(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config, + zpool_lock_behavior_t zpool_lock_behavior) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; @@ -5805,6 +5838,13 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, *spapp = NULL; + /* + * Sanity check: ZPOOL_LOCK_BEHAVIOR_LOCKLESS requires + * zfs_allow_lockless_zpool_status = 1 + */ + ASSERT(!((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) && + (zfs_allow_lockless_zpool_status != 1))); + /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends @@ -5812,8 +5852,45 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { - mutex_enter(&spa_namespace_lock); - locked = B_TRUE; + + /* + * Special debug case: we've selected a pool for lockless + * operation + */ + if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) || + (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK)) { + /* Make a cursory effort to acquire the lock in 100ms */ + if (mutex_enter_timeout(&spa_namespace_lock, + MSEC2NSEC(spa_namespace_trylock_ms)) != 0) { + if (zpool_lock_behavior == + ZPOOL_LOCK_BEHAVIOR_LOCKLESS) { + + /* + * Danger: we're doing spa_lookup() + * without holding the lock + */ + spa = spa_lookup_lockless(pool); + if (spa == NULL) { + return (SET_ERROR(ENOENT)); + } + + goto skip_the_locks; + } else { + /* + * We had ZPOOL_LOCK_BEHAVIOR_TRYLOCK + * set and cant get the lock in 100ms. 
+ */ + return (SET_ERROR(EAGAIN)); + } + } else { + /* We got the lock from mutex_enter_timeout() */ + locked = B_TRUE; + } + } else { + /* Normal case of getting the lock */ + mutex_enter(&spa_namespace_lock); + locked = B_TRUE; + } } if ((spa = spa_lookup(pool)) == NULL) { @@ -5822,6 +5899,19 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, return (SET_ERROR(ENOENT)); } + /* + * Simulate holding the spa_namespace lock for a long time, or forcing + * a crash with the lock held. This is for testing purposes only. + */ + if (zfs_debug_namespace_lock_delay == -1) { +#if _KERNEL && defined(__linux__) + BUG(); +#endif + } else if (zfs_debug_namespace_lock_delay > 0) { + zfs_sleep_until(gethrtime() + + MSEC2NSEC(zfs_debug_namespace_lock_delay)); + } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; @@ -5881,10 +5971,12 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, } } +skip_the_locks: spa_open_ref(spa, tag); if (config != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + *config = spa_config_generate_lock_behavior(spa, NULL, -1ULL, + B_TRUE, zpool_lock_behavior); /* * If we've recovered the pool, pass back any information we @@ -5910,6 +6002,23 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, return (0); } +static int +spa_open_common(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config) +{ + return (spa_open_common_impl(pool, spapp, tag, nvpolicy, config, + ZPOOL_LOCK_BEHAVIOR_DEFAULT)); +} + +int +spa_open_common_lock_behavior(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config, zpool_lock_behavior_t + zpool_lock_behavior) +{ + return (spa_open_common_impl(pool, spapp, tag, nvpolicy, config, + zpool_lock_behavior)); +} + int spa_open_rewind(const char *name, spa_t **spapp, const void *tag, nvlist_t *policy, nvlist_t **config) @@ -6143,13 +6252,25 @@ spa_add_feature_stats(spa_t *spa, 
nvlist_t *config) int spa_get_stats(const char *name, nvlist_t **config, - char *altroot, size_t buflen) + char *altroot, size_t buflen, zpool_lock_behavior_t zpool_lock_behavior) { int error; spa_t *spa; *config = NULL; - error = spa_open_common(name, &spa, FTAG, NULL, config); + + /* + * Sanity check: ZPOOL_LOCK_BEHAVIOR_LOCKLESS requires + * zfs_allow_lockless_zpool_status = 1 + */ + ASSERT(!((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) && + (zfs_allow_lockless_zpool_status != 1))); + + error = spa_open_common_lock_behavior(name, &spa, FTAG, NULL, config, + zpool_lock_behavior); + if (error != 0) { + return (error); + } if (spa != NULL) { /* @@ -10993,6 +11114,16 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); +ZFS_MODULE_PARAM(zfs, zfs_, debug_namespace_lock_delay, INT, ZMOD_RW, + "Do not use - for ZFS test suite only."); + +ZFS_MODULE_PARAM(zfs, zfs_, allow_lockless_zpool_status, INT, ZMOD_RW, + "Set to 1 to allow 'zpool status --lockless' (dangerous)"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, namespace_trylock_ms, UINT, ZMOD_RW, + "Try for this many milliseconds to get the spa_namespace lock when " + "running 'zpool status --trylock|--lockless'."); + ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index de53de114979..c60221d6f590 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -69,6 +69,16 @@ static uint64_t spa_config_generation = 1; * userland pools when doing testing. */ char *spa_config_path = (char *)ZPOOL_CACHE; + +/* + * WARNING, THIS CAN CRASH THE KERNEL! + * If enabled, allow 'zpool status --lockless'. 
This is an extra safety + * measure the user must engage, since 'zpool status --lockless' can potentially + * crash the pool if run while the pool configuration is changing. Set to 0 to + * disable, 1 to enable. + */ +int zfs_allow_lockless_zpool_status = 0; + #ifdef _KERNEL static int zfs_autoimport_disable = B_TRUE; #endif @@ -369,19 +379,49 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, * information for all pool visible within the zone. */ int -spa_all_configs(uint64_t *generation, nvlist_t **pools) +spa_all_configs(uint64_t *generation, nvlist_t **pools, zpool_lock_behavior_t + zpool_lock_behavior) { + int error = 0; spa_t *spa = NULL; + boolean_t use_lock = B_TRUE; + + /* + * Sanity check: ZPOOL_LOCK_BEHAVIOR_LOCKLESS requires + * zfs_allow_lockless_zpool_status = 1 + */ + ASSERT(!((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) && + (zfs_allow_lockless_zpool_status != 1))); if (*generation == spa_config_generation) return (SET_ERROR(EEXIST)); - int error = mutex_enter_interruptible(&spa_namespace_lock); - if (error) - return (SET_ERROR(EINTR)); + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_WAIT) { + error = mutex_enter_interruptible(&spa_namespace_lock); + } else { + error = mutex_enter_timeout(&spa_namespace_lock, + MSEC2NSEC(spa_namespace_trylock_ms)); + } + if (error) { + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK) + return (SET_ERROR(EAGAIN)); + else if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) + use_lock = B_FALSE; + else + return (SET_ERROR(EINTR)); + } *pools = fnvlist_alloc(); - while ((spa = spa_next(spa)) != NULL) { + + do { + if (use_lock) + spa = spa_next(spa); + else + spa = spa_next_lockless(spa); + + if (spa == NULL) + break; + if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); @@ -389,9 +429,11 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools) spa->spa_config); mutex_exit(&spa->spa_props_lock); } - } + } while (1);
 *generation = spa_config_generation; - mutex_exit(&spa_namespace_lock); + + if (use_lock) + mutex_exit(&spa_namespace_lock); return (0); } @@ -412,8 +454,9 @@ spa_config_set(spa_t *spa, nvlist_t *config) * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ -nvlist_t * -spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +static nvlist_t * +spa_config_generate_impl(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats, + zpool_lock_behavior_t zpool_lock_behavior) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; @@ -421,15 +464,33 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) boolean_t locked = B_FALSE; uint64_t split_guid; const char *pool_name; + boolean_t skip_locks = B_FALSE; + + /* + * Sanity check: ZPOOL_LOCK_BEHAVIOR_LOCKLESS requires + * zfs_allow_lockless_zpool_status = 1 + */ + ASSERT(!((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) && + (zfs_allow_lockless_zpool_status != 1))); + + /* Special debug case: we've selected a pool for lockless operation */ + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) { + skip_locks = B_TRUE; + } if (vd == NULL) { vd = rvd; locked = B_TRUE; + + if (!skip_locks) + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, + RW_READER); } - ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == - (SCL_CONFIG | SCL_STATE)); + if (!skip_locks) + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, + RW_READER) == (SCL_CONFIG | SCL_STATE)); + /* * If txg is -1, report the current value of spa->spa_config_txg.
@@ -552,12 +613,27 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) kmem_free(dds, sizeof (ddt_stat_t)); } - if (locked) + if (locked && !skip_locks) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } +nvlist_t * +spa_config_generate_lock_behavior(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats, zpool_lock_behavior_t zpool_lock_behavior) +{ + return (spa_config_generate_impl(spa, vd, txg, getstats, + zpool_lock_behavior)); +} + +nvlist_t * +spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +{ + return (spa_config_generate_impl(spa, vd, txg, getstats, + ZPOOL_LOCK_BEHAVIOR_DEFAULT)); +} + /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config @@ -637,3 +713,6 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, "Disable pool import at module load"); + +ZFS_MODULE_PARAM(zfs, zfs_, allow_lockless_zpool_status, INT, ZMOD_RW, + "DEBUG ONLY! Allow 'zpool status --lockless'"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index f2b0297787f4..9847ce86f67c 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -608,21 +608,14 @@ spa_config_held(spa_t *spa, int locks, krw_t rw) * ========================================================================== */ -/* - * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. - * Returns NULL if no matching spa_t is found.
- */ spa_t * -spa_lookup(const char *name) +spa_lookup_lockless(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ - spa_t *spa; avl_index_t where; + spa_t *spa; char *cp; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - -retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* @@ -634,7 +627,24 @@ spa_lookup(const char *name) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); - if (spa == NULL) + + return (spa); +} + +/* + * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. + * Returns NULL if no matching spa_t is found. + */ +spa_t * +spa_lookup(const char *name) +{ + spa_t *spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + +retry: + spa = spa_lookup_lockless(name); + if (!spa) return (NULL); /* @@ -842,7 +852,6 @@ spa_remove(spa_t *spa) ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); - avl_remove(&spa_namespace_avl, spa); if (spa->spa_root) @@ -907,6 +916,15 @@ spa_remove(spa_t *spa) kmem_free(spa, sizeof (spa_t)); } +spa_t * +spa_next_lockless(spa_t *prev) +{ + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. 
@@ -915,11 +933,7 @@ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - if (prev) - return (AVL_NEXT(&spa_namespace_avl, prev)); - else - return (avl_first(&spa_namespace_avl)); + return (spa_next_lockless(prev)); } /* @@ -1583,8 +1597,9 @@ spa_load_guid_exists(uint64_t guid) ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { - if (spa_load_guid(spa) == guid) + if (spa_load_guid(spa) == guid) { return (B_TRUE); + } } return (arc_async_flush_guid_inuse(guid)); @@ -3033,6 +3048,7 @@ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); +EXPORT_SYMBOL(spa_next_lockless); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); @@ -3160,6 +3176,3 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, "Number of allocators per spa"); - -ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, - "Minimum number of CPUs per allocators"); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 9266c3c28b50..78ec67c151c2 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1585,13 +1585,63 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) return (error); } +/* + * Given a zc with only ZPOOL_CONFIG_LOCK_BEHAVIOR set in its ioctl + * source nvlist, extract and return the zpool_lock_behavior_t value. + * + * Return zpool_lock_behavior_t on success. On failure, return + * ZPOOL_LOCK_BEHAVIOR_DEFAULT. + */ +static zpool_lock_behavior_t +zfs_get_zpool_lock_behavior_helper(zfs_cmd_t *zc) +{ + int error = 0; + uint64_t zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT; + nvlist_t *nvl = NULL; /* args being passed in */ + + /* + * Our zfs_cmd_t can include an nvlist with optional input arguments. + * Right now the only optional arg is zpool_lock_behavior. Check if + * that exists and if it does, set values accordingly. 
 */ + error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl); + + if (error == 0) { + if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_LOCK_BEHAVIOR, + &zpool_lock_behavior) == 0) { + if (zpool_lock_behavior >= ZPOOL_LOCK_BEHAVIOR_END) { + nvlist_free(nvl); + /* Weird zpool_lock_behavior value */ + return (ZPOOL_LOCK_BEHAVIOR_DEFAULT); + } + } + nvlist_free(nvl); + } + return ((zpool_lock_behavior_t)zpool_lock_behavior); +} + static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; + zpool_lock_behavior_t zpool_lock_behavior; + + zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(zc); + + if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) && + (zfs_allow_lockless_zpool_status != 1)) { + /* + * Error - you must have zfs_allow_lockless_zpool_status=1 to + * enable --lockless. + */ + zc->zc_cookie = ENOTSUP; + return (ENOTSUP); + } + + error = spa_all_configs(&zc->zc_cookie, &configs, zpool_lock_behavior); - error = spa_all_configs(&zc->zc_cookie, &configs); if (error) return (error); @@ -1617,9 +1667,21 @@ zfs_ioc_pool_stats(zfs_cmd_t *zc) nvlist_t *config; int error; int ret = 0; + zpool_lock_behavior_t zpool_lock_behavior; + + zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(zc); + if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) && + (zfs_allow_lockless_zpool_status != 1)) { + /* + * Error - you must have zfs_allow_lockless_zpool_status=1 to + * enable --lockless.
+ */ + zc->zc_cookie = ENOTSUP; + return (ENOTSUP); + } error = spa_get_stats(zc->zc_name, &config, zc->zc_value, - sizeof (zc->zc_value)); + sizeof (zc->zc_value), zpool_lock_behavior); if (config != NULL) { ret = put_nvlist(zc, config); @@ -3092,6 +3154,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) static const zfs_ioc_key_t zfs_keys_get_props[] = { { ZPOOL_GET_PROPS_NAMES, DATA_TYPE_STRING_ARRAY, ZK_OPTIONAL }, + { ZPOOL_CONFIG_LOCK_BEHAVIOR, DATA_TYPE_UINT64, ZK_OPTIONAL }, }; static int @@ -3101,26 +3164,71 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) char **props = NULL; unsigned int n_props = 0; int error; + int rc; + uint64_t zpool_lock_behavior; + boolean_t is_locked; if (nvlist_lookup_string_array(innvl, ZPOOL_GET_PROPS_NAMES, &props, &n_props) != 0) { props = NULL; } - if ((error = spa_open(pool, &spa, FTAG)) != 0) { + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT; + if (nvlist_lookup_uint64(innvl, ZPOOL_CONFIG_LOCK_BEHAVIOR, + &zpool_lock_behavior) == 0) { + if (zpool_lock_behavior >= ZPOOL_LOCK_BEHAVIOR_END) { + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT; + } + } + + if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) && + (zfs_allow_lockless_zpool_status != 1)) { + /* + * Error - you must have zfs_allow_lockless_zpool_status=1 to + * enable --lockless. + */ + return (ENOTSUP); + } + + if ((error = spa_open_common_lock_behavior(pool, &spa, FTAG, NULL, NULL, + zpool_lock_behavior)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway.
*/ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) != NULL) { + + is_locked = B_FALSE; + if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK) || + (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS)) { + rc = mutex_enter_timeout(&spa_namespace_lock, + MSEC2NSEC(spa_namespace_trylock_ms)); + if (rc == 0) + is_locked = B_TRUE; + } else { + mutex_enter(&spa_namespace_lock); + is_locked = B_TRUE; + } + + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK && + !is_locked) { + return (EAGAIN); + } + + if (is_locked) + spa = spa_lookup(pool); + else + spa = spa_lookup_lockless(pool); + + if (spa != NULL) { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, outnvl); } - mutex_exit(&spa_namespace_lock); + + if (is_locked) + mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 1f8aca0d9e1b..a00f776ea91d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -559,7 +559,7 @@ tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', 'zpool_status_007_pos', 'zpool_status_008_pos', - 'zpool_status_features_001_pos'] + 'zpool_status_features_001_pos', 'zpool_status_lockless'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 8bffe9d8240c..3898ad80494a 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3885,4 +3885,14 @@ function pop_coredump_pattern esac } +# Get the Unix timestamp as expressed in milliseconds. This can be useful for +# measuring how long something takes. This works on both Linux and +# FreeBSD. 
+function get_unix_timestamp_in_ms +{ + # We can't just use 'date +%s%N' here since FreeBSD 'date' only supports + # one-second resolution. + python3 -c 'import time; print(round(time.time() * 1000))' +} + . ${STF_SUITE}/include/kstat.shlib diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 79dc64ad9350..f002f43c3bc3 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -17,6 +17,7 @@ UNAME=$(uname) # NAME FreeBSD tunable Linux tunable cat <<%%%% | ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot +ALLOW_LOCKLESS_ZPOOL_STATUS allow_lockless_zpool_status zfs_allow_lockless_zpool_status ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount ARC_MAX arc.max zfs_arc_max ARC_MIN arc.min zfs_arc_min @@ -39,6 +40,7 @@ DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_secon DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms +DEBUG_NAMESPACE_LOCK_DELAY debug_namespace_lock_delay zfs_debug_namespace_lock_delay DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms @@ -87,6 +89,7 @@ SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata +SPA_NAMESPACE_TRYLOCK_MS spa.namespace_trylock_ms spa_namespace_trylock_ms TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch diff --git a/tests/zfs-tests/tests/Makefile.am 
b/tests/zfs-tests/tests/Makefile.am index bce546d066f6..25d3e11d9c1a 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1272,6 +1272,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_lockless.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh new file mode 100755 index 000000000000..5908d547670b --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Lawrence Livermore National Security, LLC. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool status --lockless|--trylock' work correctly + +DISK=$TEST_BASE_DIR/zpool_status_lockless_vdev + +function cleanup +{ + log_must restore_tunable ALLOW_LOCKLESS_ZPOOL_STATUS + log_must restore_tunable DEBUG_NAMESPACE_LOCK_DELAY + log_must restore_tunable SPA_NAMESPACE_TRYLOCK_MS + log_must zpool destroy testpool2 + log_must rm -f "$DISK" +} + +log_assert "Verify 'zpool status --lockless|--trylock'" + +log_onexit cleanup + +log_must save_tunable ALLOW_LOCKLESS_ZPOOL_STATUS +log_must save_tunable DEBUG_NAMESPACE_LOCK_DELAY +log_must save_tunable SPA_NAMESPACE_TRYLOCK_MS + +log_mustnot test -e $DISK +log_must truncate -s 100M $DISK +log_must zpool create testpool2 $DISK +log_must zpool status +log_must zpool status --trylock + +# This should fail since zfs_allow_lockless_zpool_status!=1 +log_mustnot zpool status --lockless + +# Add an artificial 500ms delay after taking the spa_namespace lock +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 500 +zpool status & +sleep 0.1 + +# This should fail since the previously spawned off 'zpool status' is holding +# the lock for at least 500ms, and --trylock only tries for 100ms. +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 100 +log_mustnot zpool status --trylock +wait + +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 100 +zpool status & +sleep 0.05 + +# This should succeed since the previously spawned off 'zpool status' is holding +# the lock for only 100ms, and --trylock tries for 300ms. +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 300 +log_must zpool status --trylock +wait + +# Now we artificially hold the lock for 500ms, and check that --lockless +# returns in a short amount of time, and without error.
+set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 500 +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 10 +set_tunable32 ALLOW_LOCKLESS_ZPOOL_STATUS 1 +zpool status & +sleep 0.05 +before=$(get_unix_timestamp_in_ms) +log_must zpool status --lockless +after=$(get_unix_timestamp_in_ms) +d=$(($after - $before)) +wait +log_note "'zpool status --lockless' took $d milliseconds" +log_must test $d -le 300 + +log_pass "zpool status --lockless|--trylock work correctly" From f12ea3fb89fcf09929d1898bc8dd840ed941e276 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 26 Mar 2025 11:36:37 -0700 Subject: [PATCH 2/2] Update ABI files after lockless zpool status change Signed-off-by: Tony Hutter --- lib/libnvpair/libnvpair.abi | 5 + lib/libuutil/libuutil.abi | 45 +------ lib/libzfs/libzfs.abi | 174 ++++++++++++++-------------- lib/libzfs_core/libzfs_core.abi | 77 ++++-------- lib/libzfsbootenv/libzfsbootenv.abi | 2 +- 5 files changed, 123 insertions(+), 180 deletions(-) diff --git a/lib/libnvpair/libnvpair.abi b/lib/libnvpair/libnvpair.abi index e3eacb195463..fa8efbcd9d7e 100644 --- a/lib/libnvpair/libnvpair.abi +++ b/lib/libnvpair/libnvpair.abi @@ -2194,6 +2194,7 @@ + @@ -2306,6 +2307,10 @@ + + + + diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi index 7cb92ac9f3f8..0052f0d47a7f 100644 --- a/lib/libuutil/libuutil.abi +++ b/lib/libuutil/libuutil.abi @@ -652,6 +652,7 @@ + @@ -763,6 +764,10 @@ + + + + @@ -1011,16 +1016,9 @@ - - - - - - - @@ -1130,25 +1128,6 @@ - - - - - - - - - - - - - - - - - - - @@ -1157,23 +1136,12 @@ - - - - - - - - - - - @@ -1185,9 +1153,8 @@ - + - diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 1f9fde6677d8..6c16f2f9b10d 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -218,6 +218,7 @@ + @@ -631,7 +632,7 @@ - + @@ -1170,6 +1171,7 @@ + @@ -1275,6 +1277,10 @@ + + + + @@ -2062,6 +2068,15 @@ + + + + + + + + + @@ -2201,7 +2216,7 @@ - + @@ -2259,6 +2274,9 @@ + + + @@ -2476,6 +2494,13 @@ + + + + + + + @@ -2692,7 +2717,7 @@ - + @@ 
-2744,6 +2769,12 @@ + + + + + + @@ -2761,7 +2792,7 @@ - + @@ -2834,37 +2865,37 @@ - + - + - + - + - + - + - + - + - + - + - + @@ -2925,6 +2956,12 @@ + + + + + + @@ -2935,6 +2972,12 @@ + + + + + + @@ -3030,6 +3073,12 @@ + + + + + + @@ -3063,9 +3112,6 @@ - - - @@ -3078,10 +3124,6 @@ - - - - @@ -3206,24 +3248,6 @@ - - - - - - - - - - - - - - - - - - @@ -3239,7 +3263,6 @@ - @@ -3475,13 +3498,9 @@ - - - - @@ -3495,6 +3514,7 @@ + @@ -3514,7 +3534,6 @@ - @@ -3540,13 +3559,6 @@ - - - - - - - @@ -3576,12 +3588,6 @@ - - - - - - @@ -3619,10 +3625,6 @@ - - - - @@ -3633,6 +3635,14 @@ + + + + + + + + @@ -3706,9 +3716,8 @@ - + - @@ -4078,7 +4087,6 @@ - @@ -4335,12 +4343,6 @@ - - - - - - @@ -4531,14 +4533,6 @@ - - - - - - - - @@ -4967,12 +4961,6 @@ - - - - - - @@ -8368,6 +8356,11 @@ + + + + + @@ -8563,6 +8556,11 @@ + + + + + @@ -9532,8 +9530,8 @@ - - + + diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 6a9c20a2bb88..f6aa9a742bff 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -651,6 +651,7 @@ + @@ -762,6 +763,10 @@ + + + + @@ -982,13 +987,6 @@ - - - - - - - @@ -1092,42 +1090,12 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1144,9 +1112,8 @@ - + - @@ -2111,7 +2078,7 @@ - + @@ -2163,6 +2130,12 @@ + + + + + + @@ -2180,7 +2153,7 @@ - + @@ -2253,37 +2226,37 @@ - + - + - + - + - + - + - + - + - + - + - + diff --git a/lib/libzfsbootenv/libzfsbootenv.abi b/lib/libzfsbootenv/libzfsbootenv.abi index 5903d5dcbe21..bf866b0fa61b 100644 --- a/lib/libzfsbootenv/libzfsbootenv.abi +++ b/lib/libzfsbootenv/libzfsbootenv.abi @@ -1,6 +1,6 @@ - +