Skip to content

Commit e056bbc

Browse files
author
Jaideep Karande
committed
WL#14940: GR: Eject member on resource exhaustion
This worklog enhances the high-availability (HA) Group Replication (GR) system to automatically detect and mitigate issues related to lagging secondary servers and resource exhaustion, thereby ensuring continuous operation and resilience of the group. In GR, when a secondary's applier lags or the secondary experiences swapping, it exacerbates high-availability issues. Partially responsive or unreliable secondaries can destabilize the entire group. This worklog aims to enhance group resilience by implementing automatic ejection of servers encountering these critical conditions. Safeguarding the group from problematic members involves: (1) preventing resource depletion within the group by automatically removing members facing resource shortages; (2) enhancing the management of groups by seamlessly handling transient failures across various instances; (3) implementing self-ejection followed by automatic rejoin, enabling members to autonomously provision themselves and ensure continuous operation. Change-Id: I88a5d1871682d7b0efdb0d53f8624bc14a10ff2c
1 parent e1e57a1 commit e056bbc

File tree

12 files changed

+317
-0
lines changed

12 files changed

+317
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/* Copyright (c) 2024 Oracle and/or its affiliates.
2+
3+
This program is free software; you can redistribute it and/or modify
4+
it under the terms of the GNU General Public License, version 2.0,
5+
as published by the Free Software Foundation.
6+
7+
This program is designed to work with certain software (including
8+
but not limited to OpenSSL) that is licensed under separate terms,
9+
as designated in a particular file or component or in included license
10+
documentation. The authors of MySQL hereby grant you an additional
11+
permission to link the program and your derivative works with the
12+
separately licensed software that they have either included with
13+
the program or referenced in the documentation.
14+
15+
This program is distributed in the hope that it will be useful,
16+
but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
GNU General Public License, version 2.0, for more details.
19+
20+
You should have received a copy of the GNU General Public License
21+
along with this program; if not, write to the Free Software
22+
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23+
24+
#ifndef GROUP_REPLICATION_MANAGEMENT_SERVICE_H
25+
#define GROUP_REPLICATION_MANAGEMENT_SERVICE_H
26+
27+
#include <mysql/components/service.h>
28+
#include <stddef.h>
29+
30+
/**
  Result of an eject request made through
  group_replication_management_service_v1.
*/
enum eject_status {
31+
/** All preconditions held and the member was asked to leave the group. */
GR_RM_SUCCESS_LEFT_GROUP,
32+
/** The local member is not running in single-primary mode. */
GR_RM_NOT_IN_SINGLE_PRIMARY_MODE,
33+
/** The local member's role is not SECONDARY. */
GR_RM_NOT_A_SECONDARY_MEMBER,
34+
/** The group currently has fewer than three members. */
GR_RM_NUMBER_OF_MEMBERS_LESS_THAN_THREE,
35+
/** The requested quarantine time has not yet elapsed since the member
    start time was last reset. */
GR_RM_QUARANTINE_PERIOD_NOT_OVER,
36+
/** Local member or group membership information is not available. */
GR_RM_NOT_A_MEMBER
37+
};
38+
39+
/**
  Management service exposed by Group Replication: lets a caller request
  that the local member leaves the group, and query the local member state.
*/
BEGIN_SERVICE_DEFINITION(group_replication_management_service_v1)
40+
41+
/**
  Requests that the local member leaves the group.

  The request is only honoured when the member has membership information
  available, runs in single-primary mode, is a SECONDARY, the group has at
  least three members and more than quarantine_time_in_seconds seconds have
  passed since the member start time was last reset.

  @param[in]  quarantine_time_in_seconds  minimum number of seconds that
                                          must have elapsed before the
                                          member may be ejected
  @param[out] seconds_since_member_join   seconds elapsed since the start
                                          time was last reset; only written
                                          when the quarantine check is
                                          reached

  @return status
    @retval GR_RM_SUCCESS_LEFT_GROUP  the member was asked to leave
    @retval other                     the precondition that was not met
*/
DECLARE_METHOD(enum eject_status, eject,
42+
(int quarantine_time_in_seconds,
43+
unsigned int *seconds_since_member_join));
44+
45+
/**
46+
Checks if this member is ONLINE or RECOVERING.
47+
48+
@return status
49+
@retval true this member is ONLINE or RECOVERING
50+
@retval false otherwise
51+
*/
52+
DECLARE_BOOL_METHOD(is_member_online_or_recovering, ());
53+
54+
END_SERVICE_DEFINITION(group_replication_management_service_v1)
55+
56+
#endif /* GROUP_REPLICATION_MANAGEMENT_SERVICE_H */

mysql-test/include/excludenoskip.list

+3
Original file line numberDiff line numberDiff line change
@@ -349,3 +349,6 @@ have_component_replication_applier_metrics.inc
349349

350350
# 39. Skip tests that need the diagnostic log.
351351
have_log_diagnostic.inc
352+
353+
# 40. Skip tests which expect the Group Replication resource manager component
354+
have_group_replication_resource_manager.inc

packaging/deb-in/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ usr/lib/mysql/plugin/component_scheduler.so
129129
usr/lib/mysql/plugin/component_telemetry.so
130130
usr/lib/mysql/plugin/component_option_tracker.so
131131
usr/lib/mysql/plugin/component_group_replication_flow_control_stats.so
132+
usr/lib/mysql/plugin/component_group_replication_resource_manager.so
132133
usr/lib/mysql/plugin/component_replication_applier_metrics.so
133134
usr/lib/mysql/private/libpolyglot.so
134135
")

packaging/deb-in/deb_debug.cmake

+1
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ usr/lib/mysql/plugin/debug/component_scheduler.so
235235
usr/lib/mysql/plugin/debug/component_telemetry.so
236236
usr/lib/mysql/plugin/debug/component_option_tracker.so
237237
usr/lib/mysql/plugin/debug/component_group_replication_flow_control_stats.so
238+
usr/lib/mysql/plugin/debug/component_group_replication_resource_manager.so
238239
usr/lib/mysql/plugin/debug/component_replication_applier_metrics.so
239240
usr/lib/mysql/plugin/debug/authentication_webauthn.so
240241
")

packaging/rpm-docker/mysql.spec.in

+4
Original file line numberDiff line numberDiff line change
@@ -700,6 +700,7 @@ rm -r $(readlink var) var
700700
%attr(755, root, root) %{_libdir}/mysql/plugin/component_telemetry.so
701701
%attr(755, root, root) %{_libdir}/mysql/plugin/component_option_tracker.so
702702
%attr(755, root, root) %{_libdir}/mysql/plugin/component_group_replication_flow_control_stats.so
703+
%attr(755, root, root) %{_libdir}/mysql/plugin/component_group_replication_resource_manager.so
703704
%attr(755, root, root) %{_libdir}/mysql/plugin/component_replication_applier_metrics.so
704705
%if 0%{?aws_sdk}
705706
%attr(755, root, root) %{_libdir}/mysql/plugin/keyring_aws.so
@@ -719,6 +720,9 @@ rm -r $(readlink var) var
719720
%dir %attr(750, mysql, mysql) /var/lib/mysql-keyring
720721

721722
%changelog
723+
* Fri Nov 29 2024 Jaideep Karande <[email protected]> - 9.2.0-1
724+
- Added component_group_replication_resource_manager
725+
722726
* Wed Nov 13 2024 Samar Pratap Singh <[email protected]> - 9.2.0-1
723727
- Added component_connection_control
724728

packaging/rpm-oel/mysql.spec.in

+5
Original file line numberDiff line numberDiff line change
@@ -1311,6 +1311,7 @@ fi
13111311
%attr(755, root, root) %{_libdir}/mysql/plugin/component_scheduler.so
13121312
%attr(755, root, root) %{_libdir}/mysql/plugin/component_option_tracker.so
13131313
%attr(755, root, root) %{_libdir}/mysql/plugin/component_group_replication_flow_control_stats.so
1314+
%attr(755, root, root) %{_libdir}/mysql/plugin/component_group_replication_resource_manager.so
13141315
%attr(755, root, root) %{_libdir}/mysql/plugin/component_replication_applier_metrics.so
13151316
%attr(644, root, root) %{_datadir}/mysql-*/linux_install_firewall.sql
13161317
%attr(644, root, root) %{_datadir}/mysql-*/uninstall_firewall.sql
@@ -1409,6 +1410,7 @@ fi
14091410
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_scheduler.so
14101411
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_option_tracker.so
14111412
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_group_replication_flow_control_stats.so
1413+
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_group_replication_resource_manager.so
14121414
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_replication_applier_metrics.so
14131415
%if 0%{?aws_sdk}
14141416
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/keyring_aws.so
@@ -1979,6 +1981,9 @@ fi
19791981
%endif # with_router
19801982

19811983
%changelog
1984+
* Fri Nov 29 2024 Jaideep Karande <[email protected]> - 9.2.0-1
1985+
- Added component_group_replication_resource_manager
1986+
19821987
* Wed Nov 13 2024 Samar Pratap Singh <[email protected]> - 9.2.0-1
19831988
- Added component_connection_control
19841989

packaging/rpm-sles/mysql.spec.in

+5
Original file line numberDiff line numberDiff line change
@@ -956,6 +956,7 @@ fi
956956
%attr(755, root, root) %{_libdir}/mysql/plugin/component_telemetry.so
957957
%attr(755, root, root) %{_libdir}/mysql/plugin/component_option_tracker.so
958958
%attr(755, root, root) %{_libdir}/mysql/plugin/component_group_replication_flow_control_stats.so
959+
%attr(755, root, root) %{_libdir}/mysql/plugin/component_group_replication_resource_manager.so
959960
%attr(755, root, root) %{_libdir}/mysql/plugin/component_replication_applier_metrics.so
960961
%if 0%{?aws_sdk}
961962
%attr(755, root, root) %{_libdir}/mysql/plugin/keyring_aws.so
@@ -1042,6 +1043,7 @@ fi
10421043
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_telemetry.so
10431044
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_option_tracker.so
10441045
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_group_replication_flow_control_stats.so
1046+
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_group_replication_resource_manager.so
10451047
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/component_replication_applier_metrics.so
10461048
%if 0%{?aws_sdk}
10471049
%attr(755, root, root) %{_libdir}/mysql/plugin/debug/keyring_aws.so
@@ -1550,6 +1552,9 @@ fi
15501552
%endif # with_router
15511553

15521554
%changelog
1555+
* Fri Nov 29 2024 Jaideep Karande <[email protected]> - 9.2.0-1
1556+
- Added component_group_replication_resource_manager
1557+
15531558
* Wed Nov 13 2024 Samar Pratap Singh <[email protected]> - 9.2.0-1
15541559
- Added component_connection_control
15551560

plugin/group_replication/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ SET(GROUP_REPLICATION_SOURCES
111111
src/recovery_channel_state_observer.cc
112112
src/recovery_state_transfer.cc
113113
src/replication_threads_api.cc
114+
src/services/management/management.cc
114115
src/services/message_service/message_service.cc
115116
src/services/notification/notification.cc
116117
src/services/status_service/status_service.cc
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/* Copyright (c) 2024, Oracle and/or its affiliates.
2+
3+
This program is free software; you can redistribute it and/or modify
4+
it under the terms of the GNU General Public License, version 2.0,
5+
as published by the Free Software Foundation.
6+
7+
This program is designed to work with certain software (including
8+
but not limited to OpenSSL) that is licensed under separate terms,
9+
as designated in a particular file or component or in included license
10+
documentation. The authors of MySQL hereby grant you an additional
11+
permission to link the program and your derivative works with the
12+
separately licensed software that they have either included with
13+
the program or referenced in the documentation.
14+
15+
This program is distributed in the hope that it will be useful,
16+
but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
GNU General Public License, version 2.0, for more details.
19+
20+
You should have received a copy of the GNU General Public License
21+
along with this program; if not, write to the Free Software
22+
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23+
24+
#ifndef GR_MANAGEMENT_SERVICES_H
25+
#define GR_MANAGEMENT_SERVICES_H
26+
27+
#include <mysql/components/services/registry.h>
28+
#include <mysql/service_plugin_registry.h>
29+
#include <chrono>
30+
#include <ctime>
31+
#include <string>
32+
33+
#define GROUP_REPLICATION_MANAGEMENT_SERVICE_NAME \
34+
"group_replication.group_replication_management"
35+
36+
/**
  Keeps track of the moment the local member last started (or re-attempted)
  joining the group, so that the management service can enforce a quarantine
  period before the member may be ejected.

  NOTE(review): gr_start_time is a plain static with no synchronization
  visible here; confirm that callers serialize resets and checks.
*/
class GR_start_time_maintain {
 private:
  /** Time of the last reset_start_time() call. */
  static std::chrono::steady_clock::time_point gr_start_time;

 public:
  /** Records the current steady-clock time as the start time. */
  static void reset_start_time();

  /**
    Checks whether more than the given quarantine time has elapsed since
    the start time was last reset.

    @param[in]  quarantine_time            quarantine period, in seconds
    @param[out] seconds_since_member_join  seconds elapsed since the last
                                           reset_start_time() call

    @retval true   the start time was set and the quarantine period is over
    @retval false  otherwise
  */
  static bool check_if_quarantine_time_passed(
      int quarantine_time, unsigned int *seconds_since_member_join);
};
46+
47+
bool register_group_replication_management_services();
48+
49+
bool unregister_group_replication_management_services();
50+
#endif // GR_MANAGEMENT_SERVICES_H

plugin/group_replication/src/plugin.cc

+19
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
#include "plugin/group_replication/include/plugin_variables.h"
5151
#include "plugin/group_replication/include/plugin_variables/recovery_endpoints.h"
5252
#include "plugin/group_replication/include/services/flow_control/get_metrics.h"
53+
#include "plugin/group_replication/include/services/management/management.h"
5354
#include "plugin/group_replication/include/services/message_service/message_service.h"
5455
#include "plugin/group_replication/include/services/status_service/status_service.h"
5556
#include "plugin/group_replication/include/sql_service/sql_service_interface.h"
@@ -683,6 +684,12 @@ int plugin_group_replication_start(char **error_message) {
683684
// Reset the coordinator in case there was a previous stop.
684685
group_action_coordinator->reset_coordinator_process();
685686

687+
/*
688+
Reset the start time before joining the group so that the
689+
eviction service does not see an old start time.
690+
*/
691+
GR_start_time_maintain::reset_start_time();
692+
686693
// GR delayed initialization.
687694
if (!server_engine_initialized()) {
688695
lv.wait_on_engine_initialization = true;
@@ -1865,6 +1872,12 @@ bool attempt_rejoin() {
18651872
*/
18661873
if (initialize_plugin_modules(modules_mask)) goto end;
18671874

1875+
/*
1876+
Reset the start time before joining the group so that the
1877+
eviction service does not see an old start time.
1878+
*/
1879+
GR_start_time_maintain::reset_start_time();
1880+
18681881
/*
18691882
Finally we attempt the join itself.
18701883
*/
@@ -2159,6 +2172,11 @@ int plugin_group_replication_init(MYSQL_PLUGIN plugin_info) {
21592172
"mode) service.");
21602173
return 1;
21612174
}
2175+
if (register_group_replication_management_services()) {
2176+
LogPluginErr(ERROR_LEVEL, ER_GRP_RPL_ERROR_MSG,
2177+
"Failed to initialize Group Replication Management service");
2178+
return 1;
2179+
}
21622180

21632181
if (gr::flow_control_metrics_service::
21642182
register_gr_flow_control_metrics_service()) {
@@ -2228,6 +2246,7 @@ int plugin_group_replication_deinit(void *p) {
22282246
finalize_perfschema_module();
22292247

22302248
gr::status_service::unregister_gr_status_service();
2249+
unregister_group_replication_management_services();
22312250

22322251
gr::flow_control_metrics_service::
22332252
unregister_gr_flow_control_metrics_service();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/* Copyright (c) 2024, Oracle and/or its affiliates.
2+
3+
This program is free software; you can redistribute it and/or modify
4+
it under the terms of the GNU General Public License, version 2.0,
5+
as published by the Free Software Foundation.
6+
7+
This program is designed to work with certain software (including
8+
but not limited to OpenSSL) that is licensed under separate terms,
9+
as designated in a particular file or component or in included license
10+
documentation. The authors of MySQL hereby grant you an additional
11+
permission to link the program and your derivative works with the
12+
separately licensed software that they have either included with
13+
the program or referenced in the documentation.
14+
15+
This program is distributed in the hope that it will be useful,
16+
but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
GNU General Public License, version 2.0, for more details.
19+
20+
You should have received a copy of the GNU General Public License
21+
along with this program; if not, write to the Free Software
22+
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
23+
24+
#include <mysql/components/service_implementation.h>
25+
#include <mysql/components/services/group_replication_management_service.h>
26+
27+
#include "plugin/group_replication/include/leave_group_on_failure.h"
28+
#include "plugin/group_replication/include/plugin.h"
29+
#include "plugin/group_replication/include/services/management/management.h"
30+
31+
/*
  Start time of the local member. It is initialized to time_point::min(),
  which check_if_quarantine_time_passed() treats as "never reset".
*/
std::chrono::steady_clock::time_point GR_start_time_maintain::gr_start_time{
    std::chrono::steady_clock::time_point::min()};
33+
34+
/**
  Records the current steady-clock time as the member start time.
*/
void GR_start_time_maintain::reset_start_time() {
  using steady_clock_t = std::chrono::steady_clock;
  gr_start_time = steady_clock_t::now();
}
37+
38+
bool GR_start_time_maintain::check_if_quarantine_time_passed(
39+
int quarantime_time, unsigned int *seconds_since_member_join) {
40+
auto time_now = std::chrono::steady_clock::now();
41+
auto time_diff =
42+
std::chrono::duration_cast<std::chrono::seconds>(time_now - gr_start_time)
43+
.count();
44+
*seconds_since_member_join = time_diff;
45+
return gr_start_time != std::chrono::steady_clock::time_point::min() &&
46+
time_diff > quarantime_time;
47+
}
48+
49+
////////////////////////////////////////////////////////////////////////////////
50+
namespace gr {
51+
namespace gr_management {
52+
/**
  Service entry point: makes the local member leave the group when every
  precondition holds. The checks run in a fixed order and the first one that
  fails determines the returned status.
*/
DEFINE_METHOD(eject_status, eject,
              (int quarantine_time_in_seconds,
               unsigned int *seconds_since_member_join)) {
  DBUG_TRACE;

  // No membership information available: nothing to eject.
  if (nullptr == local_member_info || nullptr == group_member_mgr)
    return GR_RM_NOT_A_MEMBER;

  if (!local_member_info->in_primary_mode())
    return GR_RM_NOT_IN_SINGLE_PRIMARY_MODE;

  const bool is_secondary = local_member_info->get_role() ==
                            Group_member_info::MEMBER_ROLE_SECONDARY;
  if (!is_secondary) return GR_RM_NOT_A_SECONDARY_MEMBER;

  if (group_member_mgr->get_number_of_members() < 3)
    return GR_RM_NUMBER_OF_MEMBERS_LESS_THAN_THREE;

  // This call also fills in seconds_since_member_join for the caller.
  const bool quarantine_over =
      GR_start_time_maintain::check_if_quarantine_time_passed(
          quarantine_time_in_seconds, seconds_since_member_join);
  if (!quarantine_over) return GR_RM_QUARANTINE_PERIOD_NOT_OVER;

  // Leave the group: stop the applier, apply the exit state action and
  // let auto-rejoin handling run.
  leave_group_on_failure::mask actions;
  actions.set(leave_group_on_failure::STOP_APPLIER, true);
  actions.set(leave_group_on_failure::HANDLE_EXIT_STATE_ACTION, true);
  actions.set(leave_group_on_failure::HANDLE_AUTO_REJOIN, true);

  const std::string reason("Service call to leave the group.");
  leave_group_on_failure::leave(actions, 0, nullptr, reason.c_str());

  return GR_RM_SUCCESS_LEFT_GROUP;
}
83+
84+
/**
  Service entry point: reports whether the local member's recovery status is
  ONLINE or IN_RECOVERY. Returns false when the plugin is not running or no
  local member information exists.
*/
DEFINE_BOOL_METHOD(is_member_online_or_recovering, ()) {
  DBUG_TRACE;

  if (!plugin_is_group_replication_running() || local_member_info == nullptr)
    return false;

  switch (local_member_info->get_recovery_status()) {
    case Group_member_info::MEMBER_ONLINE:
    case Group_member_info::MEMBER_IN_RECOVERY:
      return true;
    default:
      return false;
  }
}
100+
} // namespace gr_management
101+
} // namespace gr
102+
/*
  Implementation table of group_replication_management_service_v1 provided
  by group_replication. The methods are listed in the same order in which
  the service definition declares them (eject, then
  is_member_online_or_recovering).
  NOTE(review): presumably the order must match the declaration order -
  confirm against the service macros.
*/
BEGIN_SERVICE_IMPLEMENTATION(group_replication,
103+
group_replication_management_service_v1)
104+
gr::gr_management::eject, gr::gr_management::is_member_online_or_recovering,
105+
END_SERVICE_IMPLEMENTATION();
106+
107+
bool register_group_replication_management_services() {
108+
DBUG_TRACE;
109+
110+
DBUG_EXECUTE_IF("group_replication_management_service", return false;);
111+
112+
my_service<SERVICE_TYPE(registry_registration)> reg("registry_registration",
113+
get_plugin_registry());
114+
using group_replication_management_service_t =
115+
SERVICE_TYPE_NO_CONST(group_replication_management_service_v1);
116+
return reg->register_service(
117+
GROUP_REPLICATION_MANAGEMENT_SERVICE_NAME,
118+
reinterpret_cast<my_h_service>(
119+
const_cast<group_replication_management_service_t *>(
120+
&SERVICE_IMPLEMENTATION(
121+
group_replication,
122+
group_replication_management_service_v1))));
123+
}
124+
125+
bool unregister_group_replication_management_services() {
126+
DBUG_TRACE;
127+
my_service<SERVICE_TYPE(registry_registration)> reg("registry_registration",
128+
get_plugin_registry());
129+
return reg->unregister(GROUP_REPLICATION_MANAGEMENT_SERVICE_NAME);
130+
}

0 commit comments

Comments
 (0)