diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 45eb9c783659..9a3e45d29ac0 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -7723,7 +7723,8 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
 
 	if (cfg == NULL) {
 		zdb_set_skip_mmp(poolname);
-		error = spa_get_stats(poolname, &cfg, NULL, 0);
+		error = spa_get_stats(poolname, &cfg, NULL, 0,
+		    ZPOOL_LOCK_BEHAVIOR_DEFAULT);
 		if (error != 0) {
 			fatal("Tried to read config of pool \"%s\" but "
 			    "spa_get_stats() failed with error %d\n",
diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c
index 2ec189b98653..fa4f5769b730 100644
--- a/cmd/zpool/zpool_iter.c
+++ b/cmd/zpool/zpool_iter.c
@@ -118,6 +118,7 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,
     boolean_t literal, int *err)
 {
 	zpool_list_t *zlp;
+	int rc;
 
 	zlp = safe_malloc(sizeof (zpool_list_t));
 
@@ -137,7 +138,11 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,
 	zlp->zl_literal = literal;
 
 	if (argc == 0) {
-		(void) zpool_iter(g_zfs, add_pool, zlp);
+		rc = zpool_iter(g_zfs, add_pool, zlp);
+		if (rc != 0) {
+			free(zlp);
+			return (NULL);
+		}
 		zlp->zl_findall = B_TRUE;
 	} else {
 		int i;
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index f2c2c3f41a7c..6d9a52799a3f 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -146,7 +146,9 @@ enum zpool_options {
 	ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH,
 	ZPOOL_OPTION_POOL_KEY_GUID,
 	ZPOOL_OPTION_JSON_NUMS_AS_INT,
-	ZPOOL_OPTION_JSON_FLAT_VDEVS
+	ZPOOL_OPTION_JSON_FLAT_VDEVS,
+	ZPOOL_OPTION_LOCKLESS,
+	ZPOOL_OPTION_TRYLOCK,
 };
 
 /*
@@ -490,8 +492,8 @@ get_usage(zpool_help_t idx)
 	case HELP_LABELCLEAR:
 		return (gettext("\tlabelclear [-f] <vdev>\n"));
 	case HELP_LIST:
-		return (gettext("\tlist [-gHLpPv] [-o property[,...]] [-j "
-		    "[--json-int, --json-pool-key-guid]] ...\n"
+		return (gettext("\tlist [-gHLpPv] [-o property[,...]] "
+		    "[-j [--json-int, --json-pool-key-guid]] ...\n"
 		    "\t    [-T d|u] [pool] [interval [count]]\n"));
 	case HELP_PREFETCH:
 		return (gettext("\tprefetch -t <type> [<type opts>] <pool>\n"
@@ -521,8 +523,8 @@ get_usage(zpool_help_t idx)
 		return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] <pool> "
 		    "[<device> ...]\n"));
 	case HELP_STATUS:
-		return (gettext("\tstatus [--power] [-j [--json-int, "
-		    "--json-flat-vdevs, ...\n"
+		return (gettext("\tstatus [--power] [--lockless|--trylock] "
+		    "[-j [--json-int, --json-flat-vdevs, ...\n"
 		    "\t    --json-pool-key-guid]] [-c [script1,script2,...]] "
 		    "[-dDegiLpPstvx] ...\n"
 		    "\t    [-T d|u] [pool] [interval [count]]\n"));
@@ -2614,6 +2616,8 @@ typedef struct status_cbdata {
 	nvlist_t *cb_jsobj;
 	boolean_t cb_json_as_int;
 	boolean_t cb_json_pool_key_guid;
+	boolean_t cb_lockless;
+	boolean_t cb_trylock;
 } status_cbdata_t;
 
 /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */
@@ -11002,6 +11007,8 @@ status_callback(zpool_handle_t *zhp, void *data)
 *	--json-int		Display numbers in integer format instead of string
 *	--json-flat-vdevs	Display vdevs in flat hierarchy
 *	--json-pool-key-guid	Use pool GUID as key for pool objects
+*	--lockless		Do not take the namespace lock (dangerous)
+*	--trylock		Try to get the namespace lock; abort if unavailable
 *
 * Describes the health status of all pools or some subset.
 */
@@ -11024,6 +11031,10 @@ zpool_do_status(int argc, char **argv)
 		    ZPOOL_OPTION_JSON_FLAT_VDEVS},
 		{"json-pool-key-guid", no_argument, NULL,
 		    ZPOOL_OPTION_POOL_KEY_GUID},
+		{"lockless", no_argument, NULL,
+		    ZPOOL_OPTION_LOCKLESS},
+		{"trylock", no_argument, NULL,
+		    ZPOOL_OPTION_TRYLOCK},
 		{0, 0, 0, 0}
 	};
 
@@ -11111,6 +11122,12 @@ zpool_do_status(int argc, char **argv)
 		case ZPOOL_OPTION_POOL_KEY_GUID:
 			cb.cb_json_pool_key_guid = B_TRUE;
 			break;
+		case ZPOOL_OPTION_LOCKLESS:
+			cb.cb_lockless = B_TRUE;
+			break;
+		case ZPOOL_OPTION_TRYLOCK:
+			cb.cb_trylock = B_TRUE;
+			break;
 		case '?':
 			if (optopt == 'c') {
 				print_zpool_script_list("status");
@@ -11152,6 +11169,16 @@ zpool_do_status(int argc, char **argv)
 		usage(B_FALSE);
 	}
 
+	if (cb.cb_lockless && cb.cb_trylock) {
+		(void) fprintf(stderr, gettext("cannot pass both --lockless and"
+		    " --trylock\n"));
+		usage(B_FALSE);
+	} else if (cb.cb_lockless) {
+		libzfs_set_lock_behavior(g_zfs, ZPOOL_LOCK_BEHAVIOR_LOCKLESS);
+	} else if (cb.cb_trylock) {
+		libzfs_set_lock_behavior(g_zfs, ZPOOL_LOCK_BEHAVIOR_TRYLOCK);
+	}
+
 	for (;;) {
 		if (cb.cb_json) {
 			cb.cb_jsobj = zpool_json_schema(0, 1);
diff --git a/include/libzfs.h b/include/libzfs.h
index 8774d490f74b..6feaef75ad28 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -264,6 +264,8 @@ _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
     nvlist_t *, nvlist_t *);
 _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *);
 _LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift);
+_LIBZFS_H void libzfs_set_lock_behavior(libzfs_handle_t *,
+    zpool_lock_behavior_t);
 
 typedef struct splitflags {
 	/* do not split, but return the config that would be split off */
diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h
index 546a4681a291..f91831fcefde 100644
--- a/include/os/freebsd/spl/sys/mutex.h
+++ b/include/os/freebsd/spl/sys/mutex.h
@@ -72,4 +72,19 @@ typedef enum {
 #define	mutex_owned(lock)	sx_xlocked(lock)
 #define	mutex_owner(lock)	sx_xholder(lock)
 
+/*
+ * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex
+ * for 'ns' number of nanoseconds. Returns 0 if the mutex was acquired or
+ * ETIME if the timeout occurred.
+ */
+static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns)
+{
+	hrtime_t end = gethrtime() + ns;
+	while (gethrtime() < end) {
+		if (mutex_tryenter(mutex))
+			return (0); /* success */
+	}
+	return (ETIME);
+}
+
 #endif	/* _OPENSOLARIS_SYS_MUTEX_H_ */
diff --git a/include/os/linux/spl/sys/mutex.h b/include/os/linux/spl/sys/mutex.h
index f000f53ab9b6..5f1faa6a95ce 100644
--- a/include/os/linux/spl/sys/mutex.h
+++ b/include/os/linux/spl/sys/mutex.h
@@ -26,6 +26,7 @@
 #define	_SPL_MUTEX_H
 
 #include <sys/types.h>
+#include <sys/time.h>
 #include <linux/mutex.h>
 #include <linux/lockdep.h>
 #include <linux/compiler_compat.h>
@@ -187,4 +188,19 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp)			\
 	/* NOTE: do not dereference mp after this point */	\
 }
 
+/*
+ * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex
+ * for 'ns' number of nanoseconds. Returns 0 if the mutex was acquired or
+ * ETIME if the timeout occurred.
+ */
+static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns)
+{
+	hrtime_t end = gethrtime() + ns;
+	while (gethrtime() < end) {
+		if (mutex_tryenter(mutex))
+			return (0); /* success */
+	}
+	return (ETIME);
+}
+
 #endif /* _SPL_MUTEX_H */
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 2d27aee217e0..744575b9913b 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -866,6 +866,9 @@ typedef struct zpool_load_policy {
 #define	ZPOOL_CONFIG_REBUILD_STATS	"org.openzfs:rebuild_stats"
 #define	ZPOOL_CONFIG_COMPATIBILITY	"compatibility"
 
+/* ZFS_IOC_POOL_STATS argument for spa_namespace locking behavior */
+#define	ZPOOL_CONFIG_LOCK_BEHAVIOR	"lock_behavior"	/* not stored on disk */
+
 /*
  * The persistent vdev state is stored as separate values rather than a single
  * 'vdev_state' entry.  This is because a device can be in multiple states, such
@@ -1978,6 +1981,30 @@ enum zio_encrypt {
 	ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \
 	ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name))
 
+/*
+ * Set locking behavior for zpool commands.
+ */
+typedef enum {
+	/* Wait to acquire the lock on the zpool config */
+	ZPOOL_LOCK_BEHAVIOR_WAIT = 0,
+	ZPOOL_LOCK_BEHAVIOR_DEFAULT = ZPOOL_LOCK_BEHAVIOR_WAIT,
+	/*
+	 * Return an error if it's taking an unreasonably long time to
+	 * acquire the lock on the pool config (default 100ms)
+	 */
+	ZPOOL_LOCK_BEHAVIOR_TRYLOCK = 1,
+
+	/*
+	 * DANGER: THIS CAN CRASH YOUR SYSTEM
+	 *
+	 * If you can't acquire the pool config lock after 100ms then do a
+	 * lockless lookup. This should only be done in emergencies, as it
+	 * can crash the kernel module!
+	 */
+	ZPOOL_LOCK_BEHAVIOR_LOCKLESS = 2,
+	ZPOOL_LOCK_BEHAVIOR_END = 3	/* last entry marker */
+} zpool_lock_behavior_t;
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 439cd461b710..784c90510137 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -742,10 +742,13 @@ typedef enum trim_type {
 
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, const void *tag);
+extern int spa_open_common_lock_behavior(const char *pool, spa_t **spapp,
+    const void *tag, nvlist_t *nvpolicy, nvlist_t **config,
+    zpool_lock_behavior_t zpool_lock_behavior);
 extern int spa_open_rewind(const char *pool, spa_t **, const void *tag,
     nvlist_t *policy, nvlist_t **config);
 extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
-    size_t buflen);
+    size_t buflen, zpool_lock_behavior_t zpool_lock_behavior);
 extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops, struct dsl_crypto_params *dcp);
 extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
@@ -850,10 +853,13 @@ extern kcondvar_t spa_namespace_cv;
 
 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t);
 extern void spa_config_load(void);
-extern int spa_all_configs(uint64_t *generation, nvlist_t **pools);
+extern int spa_all_configs(uint64_t *generation, nvlist_t **pools,
+    zpool_lock_behavior_t zpool_lock_behavior);
 extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
     int getstats);
+extern nvlist_t *spa_config_generate_lock_behavior(spa_t *spa, vdev_t *vd,
+    uint64_t txg, int getstats, zpool_lock_behavior_t zpool_lock_behavior);
 extern void spa_config_update(spa_t *spa, int what);
 extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,
     vdev_t *parent, uint_t id, int atype);
@@ -865,9 +871,11 @@ extern int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,
 
 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
+extern spa_t *spa_lookup_lockless(const char *name);
 extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);
+extern spa_t *spa_next_lockless(spa_t *prev);
 
 /* Refcount functions */
 extern void spa_open_ref(spa_t *spa, const void *tag);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 8c52f751a819..5152ddc6852c 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -491,6 +491,8 @@ extern void spa_set_deadman_synctime(hrtime_t ns);
 extern void spa_set_deadman_ziotime(hrtime_t ns);
 extern const char *spa_history_zone(void);
 extern const char *zfs_active_allocator;
+extern int zfs_allow_lockless_zpool_status;
+extern unsigned int spa_namespace_trylock_ms;
 extern int param_set_active_allocator_common(const char *val);
 
 #ifdef __cplusplus
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 549f54c09383..5efe26d94dcf 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -277,6 +277,7 @@ extern void mutex_enter(kmutex_t *mp);
 extern int mutex_enter_check_return(kmutex_t *mp);
 extern void mutex_exit(kmutex_t *mp);
 extern int mutex_tryenter(kmutex_t *mp);
+extern int mutex_enter_timeout(kmutex_t *mp, uint64_t ns);
 
 #define	NESTED_SINGLE 1
 #define	mutex_enter_nested(mp, class)	mutex_enter(mp)
diff --git a/lib/libnvpair/libnvpair.abi b/lib/libnvpair/libnvpair.abi
index e3eacb195463..fa8efbcd9d7e 100644
[libabigail ABI dump churn; the XML element content was lost in extraction]
diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi
index 7cb92ac9f3f8..0052f0d47a7f 100644
[libabigail ABI dump churn; the XML element content was lost in extraction]
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 1f9fde6677d8..6c16f2f9b10d 100644
[libabigail ABI dump churn for the updated libzfs interfaces; the XML element
content was lost in extraction]
diff --git a/lib/libzfs/libzfs_config.c b/lib/libzfs/libzfs_config.c
index 0d2102191389..8a927e6e5b05 100644
--- a/lib/libzfs/libzfs_config.c
+++
b/lib/libzfs/libzfs_config.c
@@ -109,6 +109,7 @@ namespace_reload(libzfs_handle_t *hdl)
 	nvpair_t *elem;
 	zfs_cmd_t zc = {"\0"};
 	void *cookie;
+	nvlist_t *nv_args = NULL;
 
 	if (hdl->libzfs_ns_gen == 0) {
 		/*
@@ -129,21 +130,38 @@ namespace_reload(libzfs_handle_t *hdl)
 
 	zcmd_alloc_dst_nvlist(hdl, &zc, 0);
 
+	VERIFY0(nvlist_alloc(&nv_args, NV_UNIQUE_NAME, 0));
+	VERIFY0(nvlist_add_uint64(nv_args, ZPOOL_CONFIG_LOCK_BEHAVIOR,
+	    hdl->zpool_lock_behavior));
+	zcmd_write_src_nvlist(hdl, &zc, nv_args);
+	nvlist_free(nv_args);
+
 	for (;;) {
 		zc.zc_cookie = hdl->libzfs_ns_gen;
 		if (zfs_ioctl(hdl, ZFS_IOC_POOL_CONFIGS, &zc) != 0) {
 			switch (errno) {
+			case EAGAIN:
+				zcmd_free_nvlists(&zc);
+				return (zfs_standard_error(hdl, errno,
+				    dgettext(TEXT_DOMAIN, "Unable to get the "
+				    "pool config lock in a reasonable amount of"
+				    " time")));
 			case EEXIST:
 				/*
 				 * The namespace hasn't changed.
 				 */
 				zcmd_free_nvlists(&zc);
 				return (0);
-
 			case ENOMEM:
 				zcmd_expand_dst_nvlist(hdl, &zc);
 				break;
-
+			case ENOTSUP:
+				zcmd_free_nvlists(&zc);
+				return (zfs_standard_error(hdl, errno,
+				    dgettext(TEXT_DOMAIN, "--lockless requires"
+				    " module parameter "
+				    "zfs_allow_lockless_zpool_status=1")));
 			default:
 				zcmd_free_nvlists(&zc);
 				return (zfs_standard_error(hdl, errno,
@@ -253,6 +268,7 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing)
 	int error;
 	nvlist_t *config;
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	nvlist_t *nv_args = NULL;
 
 	*missing = B_FALSE;
 	(void) strcpy(zc.zc_name, zhp->zpool_name);
@@ -262,9 +278,16 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing)
 
 	zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size);
 
+	VERIFY0(nvlist_alloc(&nv_args, NV_UNIQUE_NAME, 0));
+	VERIFY0(nvlist_add_uint64(nv_args, ZPOOL_CONFIG_LOCK_BEHAVIOR,
+	    hdl->zpool_lock_behavior));
+	zcmd_write_src_nvlist(hdl, &zc, nv_args);
+	nvlist_free(nv_args);
+
 	for (;;) {
-		if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS,
-		    &zc) == 0) {
+		int rc;
+		rc = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS, &zc);
+		if (rc == 0) {
 			/*
 			 * The real error is returned in the zc_cookie field.
 */
@@ -276,7 +298,8 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing)
 			zcmd_expand_dst_nvlist(hdl, &zc);
 		else {
 			zcmd_free_nvlists(&zc);
-			if (errno == ENOENT || errno == EINVAL)
+
+			if (errno == ENOENT || errno == EINVAL || rc)
 				*missing = B_TRUE;
 			zhp->zpool_state = POOL_STATE_UNAVAIL;
 			return (0);
diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h
index 9053740ec2f0..cbc36eb871ec 100644
--- a/lib/libzfs/libzfs_impl.h
+++ b/lib/libzfs/libzfs_impl.h
@@ -73,6 +73,8 @@ struct libzfs_handle {
 	uint64_t libzfs_max_nvlist;
 	void *libfetch;
 	char *libfetch_load_error;
+
+	zpool_lock_behavior_t zpool_lock_behavior;
 };
 
 struct zfs_handle {
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 06d2c3543e3d..4c9bff31095b 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -80,14 +80,15 @@ zpool_get_all_props(zpool_handle_t *zhp)
 
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 
+	nvlist_t *innvl = fnvlist_alloc();
 	if (zhp->zpool_n_propnames > 0) {
-		nvlist_t *innvl = fnvlist_alloc();
 		fnvlist_add_string_array(innvl, ZPOOL_GET_PROPS_NAMES,
 		    zhp->zpool_propnames, zhp->zpool_n_propnames);
-		zcmd_write_src_nvlist(hdl, &zc, innvl);
 	}
-
-	zcmd_alloc_dst_nvlist(hdl, &zc, 0);
+	VERIFY0(nvlist_add_uint64(innvl, ZPOOL_CONFIG_LOCK_BEHAVIOR,
+	    hdl->zpool_lock_behavior));
+	zcmd_write_src_nvlist(hdl, &zc, innvl);
+	zcmd_alloc_dst_nvlist(hdl, &zc, 0);
 
 	while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) {
 		if (errno == ENOMEM)
diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
index e450d94add87..f9972f6267e1 100644
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@@ -2449,3 +2449,15 @@ zpool_prepare_and_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp,
 	rc = zpool_label_disk(hdl, zhp, name);
 	return (rc);
 }
+
+/*
+ * Set spa_namespace locking behavior for zpools. For example, you may want to
+ * error out if you can't acquire the lock, or skip the lock entirely (which
+ * can be dangerous).
+ */
+void
+libzfs_set_lock_behavior(libzfs_handle_t *hdl,
+    zpool_lock_behavior_t zpool_lock_behavior)
+{
+	hdl->zpool_lock_behavior = zpool_lock_behavior;
+}
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 6a9c20a2bb88..f6aa9a742bff 100644
[libabigail ABI dump churn; the XML element content was lost in extraction]
diff --git a/lib/libzfsbootenv/libzfsbootenv.abi b/lib/libzfsbootenv/libzfsbootenv.abi
index 5903d5dcbe21..bf866b0fa61b 100644
[libabigail ABI dump churn; the XML element content was lost in extraction]
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 653380149a9e..0461bccbf8a7 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -238,6 +238,22 @@ mutex_exit(kmutex_t *mp)
 	VERIFY0(pthread_mutex_unlock(&mp->m_lock));
 }
 
+/*
+ * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex
+ * for 'ns' number of nanoseconds. Returns 0 if the mutex was acquired or
+ * ENOLCK if the timeout occurred.
+ */
+int
+mutex_enter_timeout(kmutex_t *mutex, uint64_t ns)
+{
+	hrtime_t end = gethrtime() + ns;
+	while (gethrtime() < end) {
+		if (mutex_tryenter(mutex))
+			return (0); /* success */
+	}
+	return (ENOLCK);
+}
+
 /*
  * =========================================================================
  * rwlocks
diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c
index a297daedbd4d..b2b11f32af67 100644
--- a/lib/libzpool/util.c
+++ b/lib/libzpool/util.c
@@ -137,7 +137,8 @@ show_pool_stats(spa_t *spa)
 	nvlist_t *config, *nvroot;
 	const char *name;
 
-	VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0);
+	VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0,
+	    ZPOOL_LOCK_BEHAVIOR_DEFAULT) == 0);
 
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index f5881d7faf59..330d4a9b6532 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2673,8 +2673,48 @@ Defines zvol block devices behavior when
 .It Sy zvol_enforce_quotas Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Enable strict ZVOL quota enforcement.
 The strict quota enforcement may have a performance impact.
+.It Sy zfs_debug_namespace_lock_delay Ns = Ns Sy 0 Pq int
+WARNING, DEBUG USE ONLY!
+If set, add an artificial
+.Sy zfs_debug_namespace_lock_delay
+milliseconds delay after taking the spa_namespace lock.
+This is used to simulate a hung pool.
+If
+.Sy zfs_debug_namespace_lock_delay
+is set to -1 then crash the kernel on the next spa_namespace lock hold.
+If set to 0 (default) then take no action.
+This option is used by the test suite.
+.It Sy zfs_allow_lockless_zpool_status Ns = Ns Sy 0 Ns | Ns 1 Pq int
+WARNING, DEBUG USE ONLY!
+Allow the use of
+.Sy zpool status --lockless
+by the user.
+.Sy zpool status --lockless
+is useful in emergency situations when you really want to see the status
+of a pool that's locked up.
+WARNING: This can crash your system if the pool configuration is being updated
+while
+.Sy zpool status --lockless
+is being run.
+Set to 1 to enable, or 0 to disable (default).
+.It Sy spa_namespace_trylock_ms Ns = Ns Sy 100 Pq uint
+If the user does a
+.Sy zpool status --trylock|--lockless
+then try for this many milliseconds to get the spa_namespace lock before
+giving up.
+This is the internal lock that synchronizes changes to the pool configuration.
+Note that
+.Sy zpool status
+will try to get the lock in multiple places (like getting the pool config,
+stats, props) before returning.
+That means the total wall clock time for
+.Sy zpool status --trylock|--lockless
+could be a multiple of
+.Sy spa_namespace_trylock_ms
+if the lock is already held.
+Default is 100 (milliseconds).
 .El
 .
 .Sh ZFS I/O SCHEDULER
 ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations.
 The scheduler determines when and in what order those operations are issued.
diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8
index 10d7ff973788..8ddf5e8074c6 100644
--- a/man/man8/zpool-status.8
+++ b/man/man8/zpool-status.8
@@ -38,11 +38,13 @@
 .Nm zpool
 .Cm status
 .Op Fl dDegiLpPstvx
+.Op Fl -lockless Ns | Ns Fl -trylock
+.Op Fl -power
+.Op Fl j Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid
 .Op Fl T Sy u Ns | Ns Sy d
 .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
 .Oo Ar pool Oc Ns …
 .Op Ar interval Op Ar count
-.Op Fl j Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid
 .
 .Sh DESCRIPTION
 Displays the detailed health status for the given pools.
@@ -61,6 +63,38 @@ the other workloads on the system can change.
 .Bl -tag -width Ds
 .It Fl -power
 Display vdev enclosure slot power status (on or off).
+.It Fl -lockless
+If
+.Sy zpool status
+is unable to acquire the pool config lock (a.k.a. the
+spa_namespace lock) in a reasonable amount of time, then proceed without the
+lock.
+This is useful in emergency situations when you really want to see the status
+of a pool that's locked up.
+WARNING: This can crash your system if the pool configuration is being updated
+while
+.Sy zpool status --lockless
+is being run.
+As an additional safety measure,
+.Sy --lockless
+requires the
+.Sy zfs_allow_lockless_zpool_status
+module parameter be set to 1 in order to work.
+This prevents unprivileged users who have delegated access to
+.Sy zpool status
+from potentially crashing the system.
+.It Fl -trylock
+When
+.Sy zpool status
+runs, it will try to acquire the pool config lock within the
+kernel module (the spa_namespace lock).
+However, there can be times when a pool gets locked up while holding this lock,
+leading to subsequent zpool status invocations getting hung.
+The
+.Sy --trylock
+option gets around this by attempting to take the pool config lock
+for a reasonable amount of time, and then aborting if it's unable to take the
+lock.
 .It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
 Run a script (or scripts) on each vdev and include the output as a new column
 in the
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index cbe90303072f..8fecb5330054 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -310,6 +310,38 @@ static int zfs_livelist_condense_zthr_cancel = 0;
  */
 static int zfs_livelist_condense_new_alloc = 0;
 
+
+/*
+ * WARNING, FOR TEST USE ONLY!
+ *
+ * Wait zfs_debug_namespace_lock_delay milliseconds after taking the
+ * spa_namespace_lock to simulate lock contention.
+ * If set to -1, then *panic* to simulate a pool failure.
+ * If set to 0 then take no action (default)
+ */
+static int zfs_debug_namespace_lock_delay = 0;
+
+/*
+ * Set to 1 to allow 'zpool status --lockless'. This is an extra safety to
+ * make sure you can only run with --lockless when you know what you're doing.
+ * Running locklessly while the pool configuration is changing can cause kernel
+ * panics.
+ *
+ * This safety also means that unprivileged users who have been delegated
+ * 'zpool status' rights cannot run --lockless without the root user setting
+ * zfs_allow_lockless_zpool_status=1 first.
+ */
+int zfs_allow_lockless_zpool_status = 0;
+
+/*
+ * When zpool_lock_behavior is ZPOOL_LOCK_BEHAVIOR_TRYLOCK, then try to acquire
+ * the spa_namespace_lock for this many milliseconds before giving up.
+ *
+ * Note that 'zpool status' will try to get the lock in multiple places (like
+ * getting the pool config, stats, props) before returning.
+ */
+unsigned int spa_namespace_trylock_ms = 100;
+
 /*
  * ==========================================================================
  * SPA properties routines
@@ -5794,8 +5826,9 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
  * ambiguous state.
 */
 static int
-spa_open_common(const char *pool, spa_t **spapp, const void *tag,
-    nvlist_t *nvpolicy, nvlist_t **config)
+spa_open_common_impl(const char *pool, spa_t **spapp, const void *tag,
+    nvlist_t *nvpolicy, nvlist_t **config,
+    zpool_lock_behavior_t zpool_lock_behavior)
 {
 	spa_t *spa;
 	spa_load_state_t state = SPA_LOAD_OPEN;
@@ -5805,6 +5838,13 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
 
 	*spapp = NULL;
 
+	/*
+	 * Sanity check: ZPOOL_LOCK_BEHAVIOR_LOCKLESS requires
+	 * zfs_allow_lockless_zpool_status = 1
+	 */
+	ASSERT(!((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) &&
+	    (zfs_allow_lockless_zpool_status != 1)));
+
 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
@@ -5812,8 +5852,45 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
 	 * avoid dsl_dir_open() calling this in the first place.
 	 */
 	if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
-		mutex_enter(&spa_namespace_lock);
-		locked = B_TRUE;
+
+		/*
+		 * Special debug case: we've selected a pool for lockless
+		 * operation
+		 */
+		if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) ||
+		    (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK)) {
+			/* Make a cursory effort to acquire the lock */
+			if (mutex_enter_timeout(&spa_namespace_lock,
+			    MSEC2NSEC(spa_namespace_trylock_ms)) != 0) {
+				if (zpool_lock_behavior ==
+				    ZPOOL_LOCK_BEHAVIOR_LOCKLESS) {
+
+					/*
+					 * Danger: we're doing spa_lookup()
+					 * without holding the lock
+					 */
+					spa = spa_lookup_lockless(pool);
+					if (spa == NULL) {
+						return (SET_ERROR(ENOENT));
+					}
+
+					goto skip_the_locks;
+				} else {
+					/*
+					 * We had ZPOOL_LOCK_BEHAVIOR_TRYLOCK
+					 * set and couldn't get the lock within
+					 * spa_namespace_trylock_ms.
+					 */
+					return (SET_ERROR(EAGAIN));
+				}
+			} else {
+				/* We got the lock from mutex_enter_timeout() */
+				locked = B_TRUE;
+			}
+		} else {
+			/* Normal case of getting the lock */
+			mutex_enter(&spa_namespace_lock);
+			locked = B_TRUE;
+		}
 	}
 
 	if ((spa = spa_lookup(pool)) == NULL) {
@@ -5822,6 +5899,19 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
 		return (SET_ERROR(ENOENT));
 	}
 
+	/*
+	 * Simulate holding the spa_namespace lock for a long time, or forcing
+	 * a crash with the lock held. This is for testing purposes only.
+	 */
+	if (zfs_debug_namespace_lock_delay == -1) {
+#if _KERNEL && defined(__linux__)
+		BUG();
+#endif
+	} else if (zfs_debug_namespace_lock_delay > 0) {
+		zfs_sleep_until(gethrtime() +
+		    MSEC2NSEC(zfs_debug_namespace_lock_delay));
+	}
+
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		zpool_load_policy_t policy;
 
@@ -5881,10 +5971,12 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
 		}
 	}
 
+skip_the_locks:
 	spa_open_ref(spa, tag);
 
 	if (config != NULL)
-		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+		*config = spa_config_generate_lock_behavior(spa, NULL, -1ULL,
+		    B_TRUE, zpool_lock_behavior);
 
 	/*
 	 * If we've recovered the pool, pass back any information we
@@ -5910,6 +6002,23 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
 	return (0);
 }
 
+static int
+spa_open_common(const char *pool, spa_t **spapp, const void *tag,
+    nvlist_t *nvpolicy, nvlist_t **config)
+{
+	return (spa_open_common_impl(pool, spapp, tag, nvpolicy, config,
+	    ZPOOL_LOCK_BEHAVIOR_DEFAULT));
+}
+
+int
+spa_open_common_lock_behavior(const char *pool, spa_t **spapp, const void *tag,
+    nvlist_t *nvpolicy, nvlist_t **config, zpool_lock_behavior_t
+    zpool_lock_behavior)
+{
+	return (spa_open_common_impl(pool, spapp, tag, nvpolicy, config,
+	    zpool_lock_behavior));
+}
+
 int
 spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
     nvlist_t *policy, nvlist_t **config)
@@ -6143,13 +6252,25 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config)
 
 int
 spa_get_stats(const char *name, nvlist_t **config,
-    char *altroot, size_t buflen)
+    char *altroot, size_t buflen, zpool_lock_behavior_t zpool_lock_behavior)
 {
 	int error;
 	spa_t *spa;
 
 	*config = NULL;
-	error = spa_open_common(name, &spa, FTAG, NULL, config);
+
+	/*
+	 * Sanity check: ZPOOL_LOCK_BEHAVIOR_LOCKLESS requires
+	 * zfs_allow_lockless_zpool_status = 1
+	 */
+	ASSERT(!((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) &&
+	    (zfs_allow_lockless_zpool_status != 1)));
+
+	error = spa_open_common_lock_behavior(name, &spa, FTAG, NULL, config,
+	    zpool_lock_behavior);
+	if (error != 0) {
+		return (error);
+	}
 
 	if (spa != NULL) {
 		/*
@@ -10993,6 +11114,16 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
 	"Allow importing pool with up to this number of missing top-level "
 	"vdevs (in read-only mode)");
 
+ZFS_MODULE_PARAM(zfs, zfs_, debug_namespace_lock_delay, INT, ZMOD_RW,
+	"Do not use - for ZFS test suite only.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, allow_lockless_zpool_status, INT, ZMOD_RW,
+	"Set to 1 to allow 'zpool status --lockless' (dangerous)");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, namespace_trylock_ms, UINT, ZMOD_RW,
+	"Try for this many milliseconds to get the spa_namespace lock when "
+	"running 'zpool status --trylock|--lockless'.");
+
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause,
 	INT, ZMOD_RW,
 	"Set the livelist condense zthr to pause");
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index de53de114979..c60221d6f590 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -369,19 +369,49 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent,
  * information for all pools visible within the zone.
  */
 int
-spa_all_configs(uint64_t *generation, nvlist_t **pools)
+spa_all_configs(uint64_t *generation, nvlist_t **pools, zpool_lock_behavior_t
+    zpool_lock_behavior)
 {
+	int error = 0;
 	spa_t *spa = NULL;
+	boolean_t use_lock = B_TRUE;
+
+	/*
+	 * Sanity check: ZPOOL_LOCK_BEHAVIOR_LOCKLESS requires
+	 * zfs_allow_lockless_zpool_status = 1
+	 */
+	ASSERT(!((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) &&
+	    (zfs_allow_lockless_zpool_status != 1)));
 
 	if (*generation == spa_config_generation)
 		return (SET_ERROR(EEXIST));
 
-	int error = mutex_enter_interruptible(&spa_namespace_lock);
-	if (error)
-		return (SET_ERROR(EINTR));
+	if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_WAIT) {
+		error = mutex_enter_interruptible(&spa_namespace_lock);
+	} else {
+		error = mutex_enter_timeout(&spa_namespace_lock,
+		    MSEC2NSEC(spa_namespace_trylock_ms));
+	}
+	if (error) {
+		if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK)
+			return (SET_ERROR(EAGAIN));
+		else if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS)
+			use_lock = B_FALSE;
+		else
+			return (SET_ERROR(EINTR));
+	}
 
 	*pools = fnvlist_alloc();
-	while ((spa = spa_next(spa)) != NULL) {
+
+	do {
+		if (use_lock)
+			spa = spa_next(spa);
+		else
+			spa = spa_next_lockless(spa);
+
+		if (spa == NULL)
+			break;
+
 		if (INGLOBALZONE(curproc) ||
 		    zone_dataset_visible(spa_name(spa), NULL)) {
 			mutex_enter(&spa->spa_props_lock);
@@ -389,9 +419,11 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools)
 			    spa->spa_config);
 			mutex_exit(&spa->spa_props_lock);
 		}
-	}
+	} while (1);
 	*generation = spa_config_generation;
-	mutex_exit(&spa_namespace_lock);
+
+	if (use_lock)
+		mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
@@ -412,8 +444,9 @@ spa_config_set(spa_t *spa, nvlist_t *config)
  * We infer whether to generate a complete config or just one top-level config
  * based on whether vd is the root vdev.
  */
-nvlist_t *
-spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+static nvlist_t *
+spa_config_generate_impl(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats,
+    zpool_lock_behavior_t zpool_lock_behavior)
 {
 	nvlist_t *config, *nvroot;
 	vdev_t *rvd = spa->spa_root_vdev;
@@ -421,15 +454,33 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 	boolean_t locked = B_FALSE;
 	uint64_t split_guid;
 	const char *pool_name;
+	boolean_t skip_locks = B_FALSE;
+
+	/*
+	 * Sanity check: ZPOOL_LOCK_BEHAVIOR_LOCKLESS requires
+	 * zfs_allow_lockless_zpool_status = 1
+	 */
+	ASSERT(!((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) &&
+	    (zfs_allow_lockless_zpool_status != 1)));
+
+	/* Special debug case: we've selected a pool for lockless operation */
+	if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) {
+		skip_locks = B_TRUE;
+	}
 
 	if (vd == NULL) {
 		vd = rvd;
 		locked = B_TRUE;
-		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+		if (!skip_locks)
+			spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG,
+			    RW_READER);
 	}
 
-	ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
-	    (SCL_CONFIG | SCL_STATE));
+	if (!skip_locks)
+		ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE,
+		    RW_READER) == (SCL_CONFIG | SCL_STATE));
+
 	/*
 	 * If txg is -1, report the current value of spa->spa_config_txg.
@@ -552,12 +603,27 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 		kmem_free(dds, sizeof (ddt_stat_t));
 	}
 
-	if (locked)
+	if (locked && !skip_locks)
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
 	return (config);
 }
 
+nvlist_t *
+spa_config_generate_lock_behavior(spa_t *spa, vdev_t *vd, uint64_t txg,
+    int getstats, zpool_lock_behavior_t zpool_lock_behavior)
+{
+	return (spa_config_generate_impl(spa, vd, txg, getstats,
+	    zpool_lock_behavior));
+}
+
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+	return (spa_config_generate_impl(spa, vd, txg, getstats,
+	    ZPOOL_LOCK_BEHAVIOR_DEFAULT));
+}
+
 /*
  * Update all disk labels, generate a fresh config based on the current
  * in-core state, and sync the global config cache (do not sync the config
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index f2b0297787f4..9847ce86f67c 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -608,21 +608,14 @@ spa_config_held(spa_t *spa, int locks, krw_t rw)
  * ==========================================================================
  */
 
-/*
- * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
- * Returns NULL if no matching spa_t is found.
- */
 spa_t *
-spa_lookup(const char *name)
+spa_lookup_lockless(const char *name)
 {
 	static spa_t search;	/* spa_t is large; don't allocate on stack */
-	spa_t *spa;
 	avl_index_t where;
+	spa_t *spa;
 	char *cp;
 
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
-retry:
 	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
 
 	/*
@@ -634,7 +627,24 @@ spa_lookup(const char *name)
 		*cp = '\0';
 
 	spa = avl_find(&spa_namespace_avl, &search, &where);
-	if (spa == NULL)
+
+	return (spa);
+}
+
+/*
+ * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+	spa_t *spa;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+retry:
+	spa = spa_lookup_lockless(name);
+	if (spa == NULL)
 		return (NULL);
 
 	/*
@@ -842,7 +852,6 @@ spa_remove(spa_t *spa)
 	ASSERT0(spa->spa_waiters);
 
 	nvlist_free(spa->spa_config_splitting);
-
 	avl_remove(&spa_namespace_avl, spa);
 
 	if (spa->spa_root)
@@ -907,6 +916,15 @@ spa_remove(spa_t *spa)
 	kmem_free(spa, sizeof (spa_t));
 }
 
+spa_t *
+spa_next_lockless(spa_t *prev)
+{
+	if (prev)
+		return (AVL_NEXT(&spa_namespace_avl, prev));
+	else
+		return (avl_first(&spa_namespace_avl));
+}
+
 /*
  * Given a pool, return the next pool in the namespace, or NULL if there is
 * none.  If 'prev' is NULL, return the first pool.
@@ -915,11 +933,7 @@ spa_t *
 spa_next(spa_t *prev)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
-	if (prev)
-		return (AVL_NEXT(&spa_namespace_avl, prev));
-	else
-		return (avl_first(&spa_namespace_avl));
+	return (spa_next_lockless(prev));
 }
 
 /*
@@ -3033,6 +3048,7 @@ EXPORT_SYMBOL(spa_lookup);
 EXPORT_SYMBOL(spa_add);
 EXPORT_SYMBOL(spa_remove);
 EXPORT_SYMBOL(spa_next);
+EXPORT_SYMBOL(spa_next_lockless);
 
 /* Refcount functions */
 EXPORT_SYMBOL(spa_open_ref);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 9266c3c28b50..78ec67c151c2 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -1585,13 +1585,63 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * Given a zc whose ioctl source nvlist may contain ZPOOL_CONFIG_LOCK_BEHAVIOR,
+ * extract and return the zpool_lock_behavior_t value.
+ *
+ * Returns the zpool_lock_behavior_t on success. On failure, or if the value
+ * is missing or out of range, returns ZPOOL_LOCK_BEHAVIOR_DEFAULT.
+ */
+static zpool_lock_behavior_t
+zfs_get_zpool_lock_behavior_helper(zfs_cmd_t *zc)
+{
+	int error = 0;
+	uint64_t zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT;
+	nvlist_t *nvl = NULL;	/* args being passed in */
+
+	/*
+	 * Our zfs_cmd_t can include an nvlist with optional input arguments.
+	 * Right now the only optional arg is zpool_lock_behavior. Check if
+	 * that exists and if it does, set values accordingly.
+	 */
+	error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+	    zc->zc_iflags, &nvl);
+
+	if (error == 0) {
+		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_LOCK_BEHAVIOR,
+		    &zpool_lock_behavior) == 0) {
+			if (zpool_lock_behavior >= ZPOOL_LOCK_BEHAVIOR_END) {
+				nvlist_free(nvl);
+				/* Weird zpool_lock_behavior value */
+				return (ZPOOL_LOCK_BEHAVIOR_DEFAULT);
+			}
+		}
+		nvlist_free(nvl);
+	}
+	return ((zpool_lock_behavior_t)zpool_lock_behavior);
+}
+
 static int
 zfs_ioc_pool_configs(zfs_cmd_t *zc)
 {
 	nvlist_t *configs;
 	int error;
+	zpool_lock_behavior_t zpool_lock_behavior;
+
+	zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(zc);
+
+	if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) &&
+	    (zfs_allow_lockless_zpool_status != 1)) {
+		/*
+		 * Error - you must have zfs_allow_lockless_zpool_status=1 to
+		 * enable --lockless.
+		 */
+		zc->zc_cookie = ENOTSUP;
+		return (ENOTSUP);
+	}
+
+	error = spa_all_configs(&zc->zc_cookie, &configs, zpool_lock_behavior);
 
-	error = spa_all_configs(&zc->zc_cookie, &configs);
 	if (error)
 		return (error);
 
@@ -1617,9 +1667,21 @@ zfs_ioc_pool_stats(zfs_cmd_t *zc)
 	nvlist_t *config;
 	int error;
 	int ret = 0;
+	zpool_lock_behavior_t zpool_lock_behavior;
+
+	zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(zc);
+	if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) &&
+	    (zfs_allow_lockless_zpool_status != 1)) {
+		/*
+		 * Error - you must have zfs_allow_lockless_zpool_status=1 to
+		 * enable --lockless.
+		 */
+		zc->zc_cookie = ENOTSUP;
+		return (ENOTSUP);
+	}
 
 	error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
-	    sizeof (zc->zc_value));
+	    sizeof (zc->zc_value), zpool_lock_behavior);
 
 	if (config != NULL) {
 		ret = put_nvlist(zc, config);
@@ -3092,6 +3154,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
 
 static const zfs_ioc_key_t zfs_keys_get_props[] = {
 	{ ZPOOL_GET_PROPS_NAMES,	DATA_TYPE_STRING_ARRAY,	ZK_OPTIONAL },
+	{ ZPOOL_CONFIG_LOCK_BEHAVIOR,	DATA_TYPE_UINT64,	ZK_OPTIONAL },
 };
 
 static int
@@ -3101,26 +3164,71 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
 	char **props = NULL;
 	unsigned int n_props = 0;
 	int error;
+	int rc;
+	uint64_t zpool_lock_behavior;
+	boolean_t is_locked;
 
 	if (nvlist_lookup_string_array(innvl, ZPOOL_GET_PROPS_NAMES, &props,
 	    &n_props) != 0) {
 		props = NULL;
 	}
 
-	if ((error = spa_open(pool, &spa, FTAG)) != 0) {
+	zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT;
+	if (nvlist_lookup_uint64(innvl, ZPOOL_CONFIG_LOCK_BEHAVIOR,
+	    &zpool_lock_behavior) == 0) {
+		if (zpool_lock_behavior >= ZPOOL_LOCK_BEHAVIOR_END) {
+			zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT;
+		}
+	}
+
+	if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) &&
+	    (zfs_allow_lockless_zpool_status != 1)) {
+		/*
+		 * Error - you must have zfs_allow_lockless_zpool_status=1 to
+		 * enable --lockless.
+		 */
+		return (ENOTSUP);
+	}
+
+	if ((error = spa_open_common_lock_behavior(pool, &spa, FTAG, NULL, NULL,
+	    zpool_lock_behavior)) != 0) {
 		/*
 		 * If the pool is faulted, there may be properties we can still
 		 * get (such as altroot and cachefile), so attempt to get them
 		 * anyway.
 		 */
-		mutex_enter(&spa_namespace_lock);
-		if ((spa = spa_lookup(pool)) != NULL) {
+
+		is_locked = B_FALSE;
+		if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK) ||
+		    (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS)) {
+			rc = mutex_enter_timeout(&spa_namespace_lock,
+			    MSEC2NSEC(spa_namespace_trylock_ms));
+			if (rc == 0)
+				is_locked = B_TRUE;
+		} else {
+			mutex_enter(&spa_namespace_lock);
+			is_locked = B_TRUE;
+		}
+
+		if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK &&
+		    !is_locked) {
+			return (EAGAIN);
+		}
+
+		if (is_locked)
+			spa = spa_lookup(pool);
+		else
+			spa = spa_lookup_lockless(pool);
+
+		if (spa != NULL) {
 			error = spa_prop_get(spa, outnvl);
 			if (error == 0 && props != NULL)
 				error = spa_prop_get_nvlist(spa, props,
 				    n_props, outnvl);
 		}
-		mutex_exit(&spa_namespace_lock);
+
+		if (is_locked)
+			mutex_exit(&spa_namespace_lock);
 	} else {
 		error = spa_prop_get(spa, outnvl);
 		if (error == 0 && props != NULL)
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 1f8aca0d9e1b..a00f776ea91d 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -559,7 +559,7 @@ tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
     'zpool_status_003_pos', 'zpool_status_004_pos',
     'zpool_status_005_pos', 'zpool_status_006_pos',
     'zpool_status_007_pos', 'zpool_status_008_pos',
-    'zpool_status_features_001_pos']
+    'zpool_status_features_001_pos', 'zpool_status_lockless']
 tags = ['functional', 'cli_root', 'zpool_status']
 
 [tests/functional/cli_root/zpool_sync]
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 8bffe9d8240c..3898ad80494a 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -3885,4 +3885,14 @@ function pop_coredump_pattern
 	esac
 }
 
+# Get the Unix timestamp as expressed in milliseconds. This can be useful for
+# measuring how long something takes. This works on both Linux and
+function get_unix_timestamp_in_ms +{ + # We can't just use 'date +%s%N' here since FreeBSD 'date' only supports + # one-second resolution. + python3 -c 'import time; print(round(time.time() * 1000))' +} + . ${STF_SUITE}/include/kstat.shlib diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 79dc64ad9350..f002f43c3bc3 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -17,6 +17,7 @@ UNAME=$(uname) # NAME FreeBSD tunable Linux tunable cat <<%%%% | ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot +ALLOW_LOCKLESS_ZPOOL_STATUS allow_lockless_zpool_status zfs_allow_lockless_zpool_status ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount ARC_MAX arc.max zfs_arc_max ARC_MIN arc.min zfs_arc_min @@ -39,6 +40,7 @@ DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_secon DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms +DEBUG_NAMESPACE_LOCK_DELAY debug_namespace_lock_delay zfs_debug_namespace_lock_delay DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms @@ -87,6 +89,7 @@ SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata +SPA_NAMESPACE_TRYLOCK_MS spa.namespace_trylock_ms spa_namespace_trylock_ms TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bce546d066f6..25d3e11d9c1a 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1272,6 +1272,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_lockless.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh new file mode 100755 index 000000000000..5908d547670b --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Lawrence Livermore National Security, LLC. 
+# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool status --lockless|--trylock' work correctly + +DISK=$TEST_BASE_DIR/zpool_status_lockless_vdev + +function cleanup +{ + log_must restore_tunable ALLOW_LOCKLESS_ZPOOL_STATUS + log_must restore_tunable DEBUG_NAMESPACE_LOCK_DELAY + log_must restore_tunable SPA_NAMESPACE_TRYLOCK_MS + log_must zpool destroy testpool2 + log_must rm -f "$DISK" +} + +log_assert "Verify 'zpool status ---lockless|--trylock'" + +log_onexit cleanup + +log_must save_tunable ALLOW_LOCKLESS_ZPOOL_STATUS +log_must save_tunable DEBUG_NAMESPACE_LOCK_DELAY +log_must save_tunable SPA_NAMESPACE_TRYLOCK_MS + +log_mustnot test -e $DISK +log_must truncate -s 100M $DISK +log_must zpool create testpool2 $DISK +log_must zpool status +log_must zpool status --trylock + +# This should fail since zfs_allow_lockless_zpool_status!=1 +log_mustnot zpool status --lockless + +# Add an artificial 1s delay after taking the spa_namespace lock +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 500 +zpool status & +sleep 0.1 + +# This should fail since the previously spawned off 'zpool status' is holding +# the lock for at least 500ms, and --trylock only tries for 100ms. +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 100 +log_mustnot zpool status --trylock +wait + +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 100 +zpool status & +sleep 0.05 + +# This should succeed since the previously spawned off 'zpool status' is holding +# the lock for only 100ms, and --trylock tries for 300ms. +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 300 +log_must zpool status --trylock +wait + +# Now we artificially hold the lock for 500ms, and check that --lockless +# returns in a short amount of time, and without error. +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 500 +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 10 +set_tunable32 ALLOW_LOCKLESS_ZPOOL_STATUS 1 +zpool status & +sleep 0.05 +before=$(get_unix_timestamp_in_ms) +log_must zpool status --lockless +after=$(get_unix_timestamp_in_ms) +d=$(($after - $before)) +wait +log_note "'zpool status --lockless' took $d milliseconds" +log_must test $d -le 300 + +log_pass "zpool status --lockless|--trylock work correctly"