Skip to content

Commit 1164057

Browse files
committed
Merge branch 'mlx5-misc-fixes'
Tariq Toukan says:

====================
mlx5 misc fixes

This patchset provides bug fixes to mlx5 driver.

Patch 1 by Shay fixes the error flow in mlx5e_suspend().
Patch 2 by Shay aligns the peer devlink set logic with the register devlink flow.
Patch 3 by Maher solves a deadlock in lag enable/disable.
Patches 4 and 5 by Akiva address issues in command interface corner cases.

Series generated against:
commit 393ceeb ("Merge branch 'there-are-some-bugfix-for-the-hns3-ethernet-driver'")
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents df7025b + db9b31a commit 1164057

File tree

9 files changed

+79
-51
lines changed

9 files changed

+79
-51
lines changed

drivers/net/ethernet/mellanox/mlx5/core/cmd.c

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -969,19 +969,32 @@ static void cmd_work_handler(struct work_struct *work)
969969
bool poll_cmd = ent->polling;
970970
struct mlx5_cmd_layout *lay;
971971
struct mlx5_core_dev *dev;
972-
unsigned long cb_timeout;
973-
struct semaphore *sem;
972+
unsigned long timeout;
974973
unsigned long flags;
975974
int alloc_ret;
976975
int cmd_mode;
977976

977+
complete(&ent->handling);
978+
978979
dev = container_of(cmd, struct mlx5_core_dev, cmd);
979-
cb_timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD));
980+
timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD));
980981

981-
complete(&ent->handling);
982-
sem = ent->page_queue ? &cmd->vars.pages_sem : &cmd->vars.sem;
983-
down(sem);
984982
if (!ent->page_queue) {
983+
if (down_timeout(&cmd->vars.sem, timeout)) {
984+
mlx5_core_warn(dev, "%s(0x%x) timed out while waiting for a slot.\n",
985+
mlx5_command_str(ent->op), ent->op);
986+
if (ent->callback) {
987+
ent->callback(-EBUSY, ent->context);
988+
mlx5_free_cmd_msg(dev, ent->out);
989+
free_msg(dev, ent->in);
990+
cmd_ent_put(ent);
991+
} else {
992+
ent->ret = -EBUSY;
993+
complete(&ent->done);
994+
}
995+
complete(&ent->slotted);
996+
return;
997+
}
985998
alloc_ret = cmd_alloc_index(cmd, ent);
986999
if (alloc_ret < 0) {
9871000
mlx5_core_err_rl(dev, "failed to allocate command entry\n");
@@ -994,17 +1007,20 @@ static void cmd_work_handler(struct work_struct *work)
9941007
ent->ret = -EAGAIN;
9951008
complete(&ent->done);
9961009
}
997-
up(sem);
1010+
up(&cmd->vars.sem);
9981011
return;
9991012
}
10001013
} else {
1014+
down(&cmd->vars.pages_sem);
10011015
ent->idx = cmd->vars.max_reg_cmds;
10021016
spin_lock_irqsave(&cmd->alloc_lock, flags);
10031017
clear_bit(ent->idx, &cmd->vars.bitmask);
10041018
cmd->ent_arr[ent->idx] = ent;
10051019
spin_unlock_irqrestore(&cmd->alloc_lock, flags);
10061020
}
10071021

1022+
complete(&ent->slotted);
1023+
10081024
lay = get_inst(cmd, ent->idx);
10091025
ent->lay = lay;
10101026
memset(lay, 0, sizeof(*lay));
@@ -1023,7 +1039,7 @@ static void cmd_work_handler(struct work_struct *work)
10231039
ent->ts1 = ktime_get_ns();
10241040
cmd_mode = cmd->mode;
10251041

1026-
if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, cb_timeout))
1042+
if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, timeout))
10271043
cmd_ent_get(ent);
10281044
set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state);
10291045

@@ -1143,6 +1159,9 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
11431159
ent->ret = -ECANCELED;
11441160
goto out_err;
11451161
}
1162+
1163+
wait_for_completion(&ent->slotted);
1164+
11461165
if (cmd->mode == CMD_MODE_POLLING || ent->polling)
11471166
wait_for_completion(&ent->done);
11481167
else if (!wait_for_completion_timeout(&ent->done, timeout))
@@ -1157,6 +1176,9 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
11571176
} else if (err == -ECANCELED) {
11581177
mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n",
11591178
mlx5_command_str(ent->op), ent->op);
1179+
} else if (err == -EBUSY) {
1180+
mlx5_core_warn(dev, "%s(0x%x) timeout while waiting for command semaphore.\n",
1181+
mlx5_command_str(ent->op), ent->op);
11601182
}
11611183
mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n",
11621184
err, deliv_status_to_str(ent->status), ent->status);
@@ -1208,6 +1230,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
12081230
ent->polling = force_polling;
12091231

12101232
init_completion(&ent->handling);
1233+
init_completion(&ent->slotted);
12111234
if (!callback)
12121235
init_completion(&ent->done);
12131236

@@ -1225,7 +1248,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
12251248
return 0; /* mlx5_cmd_comp_handler() will put(ent) */
12261249

12271250
err = wait_func(dev, ent);
1228-
if (err == -ETIMEDOUT || err == -ECANCELED)
1251+
if (err == -ETIMEDOUT || err == -ECANCELED || err == -EBUSY)
12291252
goto out_free;
12301253

12311254
ds = ent->ts2 - ent->ts1;
@@ -1611,6 +1634,9 @@ static int cmd_comp_notifier(struct notifier_block *nb,
16111634
dev = container_of(cmd, struct mlx5_core_dev, cmd);
16121635
eqe = data;
16131636

1637+
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1638+
return NOTIFY_DONE;
1639+
16141640
mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), false);
16151641

16161642
return NOTIFY_OK;

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6058,7 +6058,7 @@ static int mlx5e_resume(struct auxiliary_device *adev)
60586058
return 0;
60596059
}
60606060

6061-
static int _mlx5e_suspend(struct auxiliary_device *adev)
6061+
static int _mlx5e_suspend(struct auxiliary_device *adev, bool pre_netdev_reg)
60626062
{
60636063
struct mlx5e_dev *mlx5e_dev = auxiliary_get_drvdata(adev);
60646064
struct mlx5e_priv *priv = mlx5e_dev->priv;
@@ -6067,7 +6067,7 @@ static int _mlx5e_suspend(struct auxiliary_device *adev)
60676067
struct mlx5_core_dev *pos;
60686068
int i;
60696069

6070-
if (!netif_device_present(netdev)) {
6070+
if (!pre_netdev_reg && !netif_device_present(netdev)) {
60716071
if (test_bit(MLX5E_STATE_DESTROYING, &priv->state))
60726072
mlx5_sd_for_each_dev(i, mdev, pos)
60736073
mlx5e_destroy_mdev_resources(pos);
@@ -6090,7 +6090,7 @@ static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t state)
60906090

60916091
actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx);
60926092
if (actual_adev)
6093-
err = _mlx5e_suspend(actual_adev);
6093+
err = _mlx5e_suspend(actual_adev, false);
60946094

60956095
mlx5_sd_cleanup(mdev);
60966096
return err;
@@ -6157,7 +6157,7 @@ static int _mlx5e_probe(struct auxiliary_device *adev)
61576157
return 0;
61586158

61596159
err_resume:
6160-
_mlx5e_suspend(adev);
6160+
_mlx5e_suspend(adev, true);
61616161
err_profile_cleanup:
61626162
profile->cleanup(priv);
61636163
err_destroy_netdev:
@@ -6197,7 +6197,7 @@ static void _mlx5e_remove(struct auxiliary_device *adev)
61976197
mlx5_core_uplink_netdev_set(mdev, NULL);
61986198
mlx5e_dcbnl_delete_app(priv);
61996199
unregister_netdev(priv->netdev);
6200-
_mlx5e_suspend(adev);
6200+
_mlx5e_suspend(adev, false);
62016201
priv->profile->cleanup(priv);
62026202
mlx5e_destroy_netdev(priv);
62036203
mlx5e_devlink_port_unregister(mlx5e_dev);

drivers/net/ethernet/mellanox/mlx5/core/eswitch.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -833,7 +833,7 @@ int mlx5_eswitch_offloads_single_fdb_add_one(struct mlx5_eswitch *master_esw,
833833
struct mlx5_eswitch *slave_esw, int max_slaves);
834834
void mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw,
835835
struct mlx5_eswitch *slave_esw);
836-
int mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw);
836+
int mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw);
837837

838838
bool mlx5_eswitch_block_encap(struct mlx5_core_dev *dev);
839839
void mlx5_eswitch_unblock_encap(struct mlx5_core_dev *dev);
@@ -925,7 +925,7 @@ mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw,
925925
static inline int mlx5_eswitch_get_npeers(struct mlx5_eswitch *esw) { return 0; }
926926

927927
static inline int
928-
mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw)
928+
mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw)
929929
{
930930
return 0;
931931
}

drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2502,6 +2502,16 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
25022502
esw_offloads_cleanup_reps(esw);
25032503
}
25042504

2505+
static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
2506+
struct mlx5_eswitch_rep *rep, u8 rep_type)
2507+
{
2508+
if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
2509+
REP_REGISTERED, REP_LOADED) == REP_REGISTERED)
2510+
return esw->offloads.rep_ops[rep_type]->load(esw->dev, rep);
2511+
2512+
return 0;
2513+
}
2514+
25052515
static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw,
25062516
struct mlx5_eswitch_rep *rep, u8 rep_type)
25072517
{
@@ -2526,13 +2536,11 @@ static int mlx5_esw_offloads_rep_load(struct mlx5_eswitch *esw, u16 vport_num)
25262536
int err;
25272537

25282538
rep = mlx5_eswitch_get_rep(esw, vport_num);
2529-
for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++)
2530-
if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
2531-
REP_REGISTERED, REP_LOADED) == REP_REGISTERED) {
2532-
err = esw->offloads.rep_ops[rep_type]->load(esw->dev, rep);
2533-
if (err)
2534-
goto err_reps;
2535-
}
2539+
for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) {
2540+
err = __esw_offloads_load_rep(esw, rep, rep_type);
2541+
if (err)
2542+
goto err_reps;
2543+
}
25362544

25372545
return 0;
25382546

@@ -3277,7 +3285,7 @@ static void esw_destroy_offloads_acl_tables(struct mlx5_eswitch *esw)
32773285
esw_vport_destroy_offloads_acl_tables(esw, vport);
32783286
}
32793287

3280-
int mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw)
3288+
int mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw)
32813289
{
32823290
struct mlx5_eswitch_rep *rep;
32833291
unsigned long i;
@@ -3290,13 +3298,13 @@ int mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw)
32903298
if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED)
32913299
return 0;
32923300

3293-
ret = mlx5_esw_offloads_rep_load(esw, MLX5_VPORT_UPLINK);
3301+
ret = __esw_offloads_load_rep(esw, rep, REP_IB);
32943302
if (ret)
32953303
return ret;
32963304

32973305
mlx5_esw_for_each_rep(esw, i, rep) {
32983306
if (atomic_read(&rep->rep_data[REP_ETH].state) == REP_LOADED)
3299-
mlx5_esw_offloads_rep_load(esw, rep->vport);
3307+
__esw_offloads_load_rep(esw, rep, REP_IB);
33003308
}
33013309

33023310
return 0;

drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -814,7 +814,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev)
814814
if (shared_fdb)
815815
for (i = 0; i < ldev->ports; i++)
816816
if (!(ldev->pf[i].dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
817-
mlx5_eswitch_reload_reps(ldev->pf[i].dev->priv.eswitch);
817+
mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch);
818818
}
819819

820820
static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
@@ -922,7 +922,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
922922
mlx5_rescan_drivers_locked(dev0);
923923

924924
for (i = 0; i < ldev->ports; i++) {
925-
err = mlx5_eswitch_reload_reps(ldev->pf[i].dev->priv.eswitch);
925+
err = mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch);
926926
if (err)
927927
break;
928928
}
@@ -933,7 +933,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
933933
mlx5_deactivate_lag(ldev);
934934
mlx5_lag_add_devices(ldev);
935935
for (i = 0; i < ldev->ports; i++)
936-
mlx5_eswitch_reload_reps(ldev->pf[i].dev->priv.eswitch);
936+
mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch);
937937
mlx5_core_err(dev0, "Failed to enable lag\n");
938938
return;
939939
}

drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ static int enable_mpesw(struct mlx5_lag *ldev)
9999
dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
100100
mlx5_rescan_drivers_locked(dev0);
101101
for (i = 0; i < ldev->ports; i++) {
102-
err = mlx5_eswitch_reload_reps(ldev->pf[i].dev->priv.eswitch);
102+
err = mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch);
103103
if (err)
104104
goto err_rescan_drivers;
105105
}
@@ -113,7 +113,7 @@ static int enable_mpesw(struct mlx5_lag *ldev)
113113
err_add_devices:
114114
mlx5_lag_add_devices(ldev);
115115
for (i = 0; i < ldev->ports; i++)
116-
mlx5_eswitch_reload_reps(ldev->pf[i].dev->priv.eswitch);
116+
mlx5_eswitch_reload_ib_reps(ldev->pf[i].dev->priv.eswitch);
117117
mlx5_mpesw_metadata_cleanup(ldev);
118118
return err;
119119
}

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1680,6 +1680,8 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev)
16801680
struct devlink *devlink = priv_to_devlink(dev);
16811681
int err;
16821682

1683+
devl_lock(devlink);
1684+
devl_register(devlink);
16831685
dev->state = MLX5_DEVICE_STATE_UP;
16841686
err = mlx5_function_enable(dev, true, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
16851687
if (err) {
@@ -1693,27 +1695,21 @@ int mlx5_init_one_light(struct mlx5_core_dev *dev)
16931695
goto query_hca_caps_err;
16941696
}
16951697

1696-
devl_lock(devlink);
1697-
devl_register(devlink);
1698-
16991698
err = mlx5_devlink_params_register(priv_to_devlink(dev));
17001699
if (err) {
17011700
mlx5_core_warn(dev, "mlx5_devlink_param_reg err = %d\n", err);
1702-
goto params_reg_err;
1701+
goto query_hca_caps_err;
17031702
}
17041703

17051704
devl_unlock(devlink);
17061705
return 0;
17071706

1708-
params_reg_err:
1709-
devl_unregister(devlink);
1710-
devl_unlock(devlink);
17111707
query_hca_caps_err:
1712-
devl_unregister(devlink);
1713-
devl_unlock(devlink);
17141708
mlx5_function_disable(dev, true);
17151709
out:
17161710
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
1711+
devl_unregister(devlink);
1712+
devl_unlock(devlink);
17171713
return err;
17181714
}
17191715

drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,13 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia
6060
goto remap_err;
6161
}
6262

63+
/* Peer devlink logic expects to work on unregistered devlink instance. */
64+
err = mlx5_core_peer_devlink_set(sf_dev, devlink);
65+
if (err) {
66+
mlx5_core_warn(mdev, "mlx5_core_peer_devlink_set err=%d\n", err);
67+
goto peer_devlink_set_err;
68+
}
69+
6370
if (MLX5_ESWITCH_MANAGER(sf_dev->parent_mdev))
6471
err = mlx5_init_one_light(mdev);
6572
else
@@ -69,20 +76,10 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia
6976
goto init_one_err;
7077
}
7178

72-
err = mlx5_core_peer_devlink_set(sf_dev, devlink);
73-
if (err) {
74-
mlx5_core_warn(mdev, "mlx5_core_peer_devlink_set err=%d\n", err);
75-
goto peer_devlink_set_err;
76-
}
77-
7879
return 0;
7980

80-
peer_devlink_set_err:
81-
if (mlx5_dev_is_lightweight(sf_dev->mdev))
82-
mlx5_uninit_one_light(sf_dev->mdev);
83-
else
84-
mlx5_uninit_one(sf_dev->mdev);
8581
init_one_err:
82+
peer_devlink_set_err:
8683
iounmap(mdev->iseg);
8784
remap_err:
8885
mlx5_mdev_uninit(mdev);

include/linux/mlx5/driver.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -862,6 +862,7 @@ struct mlx5_cmd_work_ent {
862862
void *context;
863863
int idx;
864864
struct completion handling;
865+
struct completion slotted;
865866
struct completion done;
866867
struct mlx5_cmd *cmd;
867868
struct work_struct work;

0 commit comments

Comments
 (0)