Commit cf25c68

fix restored cluster failed pod (#461)
1 parent d169caa commit cf25c68

11 files changed: +30 -19 lines changed

charts/nebula-operator/crds/nebulaclusters.yaml

Lines changed: 1 addition & 0 deletions
@@ -494,6 +494,7 @@ spec:
         type: object
       type: array
       heartbeatInterval:
+        default: 60
         format: int32
         type: integer
       image:

charts/nebula-operator/templates/scheduler-deployment.yaml

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ spec:
         - --leader-elect
         - --leader-elect-resource-name={{ .Values.scheduler.schedulerName }}
         - --leader-elect-resource-namespace={{ template "nebula-operator.namespace" . }}
+        - --pod-max-in-unschedulable-pods-duration={{ .Values.scheduler.podMaxInUnschedulablePodsDuration }}
         - --v={{ .Values.scheduler.verbosity }}
         {{- if or .Values.kubernetesClusterDomain .Values.scheduler.env }}
         env:

charts/nebula-operator/values.yaml

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@ scheduler:
       cpu: 100m
       memory: 100Mi
   verbosity: 0
+  podMaxInUnschedulablePodsDuration: 10s
   plugins:
     enabled: ["NodeZone"]
     disabled: [] # only in-tree plugins need to be defined here

config/crd/bases/apps.nebula-graph.io_nebulaclusters.yaml

Lines changed: 1 addition & 0 deletions
@@ -494,6 +494,7 @@ spec:
         type: object
       type: array
       heartbeatInterval:
+        default: 60
         format: int32
         type: integer
       image:

config/samples/nebularestore-gs.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 apiVersion: v1
 kind: Secret
 metadata:
-  name: gs-secret
+  name: gcp-secret
 type: Opaque
 data:
   credentials: <GOOGLE_APPLICATION_CREDENTIALS_JSON>

config/samples/nebularestore-s3.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 apiVersion: v1
 kind: Secret
 metadata:
-  name: aws-s3-secret
+  name: aws-secret
 type: Opaque
 data:
   access_key: <ACCESS_KEY>
@@ -20,4 +20,4 @@ spec:
     region: "us-west-2"
     bucket: "nebula-br-test"
     endpoint: "https://s3.us-west-2.amazonaws.com"
-    secretName: "aws-s3-secret"
+    secretName: "aws-secret"

config/samples/restore-pod.yaml

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 apiVersion: v1
 kind: Secret
 metadata:
-  name: aws-s3-secret
+  name: aws-secret
 type: Opaque
 data:
   access_key: <ACCESS_KEY>
@@ -15,7 +15,7 @@ spec:
   imagePullSecrets:
   - name: nebula-image
   containers:
-  - image: reg.vesoft-inc.com/cloud-dev/br-ent:v3.5.1
+  - image: reg.vesoft-inc.com/cloud-dev/br-ent:v3.7.0
     imagePullPolicy: Always
     name: restore
     command:
@@ -24,7 +24,7 @@ spec:
    - 'exec /usr/local/bin/br-ent restore full
      --mode=k8s --cluster=nebula
      --namespace default
-     --secret=aws-s3-secret
+     --secret=aws-secret
      --name BACKUP_2023_02_10_09_57_17
      --storage s3://BUCKET
      --s3.region=REGION

doc/user/br_guide.md

Lines changed: 4 additions & 1 deletion
@@ -25,7 +25,7 @@ The fields in the table is optional.
 | Parameter          | Description                                                   | Default  |
 |:-------------------|:--------------------------------------------------------------|:---------|
 | `image`            | backup container image without tag, and use `version` as tag  | ``       |
-| `nebula.version`   | backup image tag                                              | ``       |
+| `version`          | backup image tag                                              | ``       |
 | `imagePullPolicy`  | backup image pull policy                                      | `Always` |
 | `imagePullSecrets` | The secret to use for pulling the images                      | `[]`     |
 | `env`              | backup container environment variables                        | `[]`     |
@@ -76,6 +76,8 @@ spec:
   config:
     # The name of the backup/restore nebula cluster
    clusterName: nebula
+   # Concurrency is used to control the number of concurrent file uploads during data backup.
+   concurrency: 15
    gs:
      # Location in which the gs bucket is located.
      location: "us-central1"
@@ -125,6 +127,7 @@ spec:
   cleanBackupData: true
   config:
    clusterName: nebula
+   concurrency: 15
    gs:
      location: "us-central1"
      bucket: "nebula-test"

pkg/controller/nebulabackup/nebula_backup_control.go

Lines changed: 4 additions & 0 deletions
@@ -132,6 +132,10 @@ func (c *defaultBackupControl) UpdateNebulaBackup(backup *v1alpha1.NebulaBackup)
 }
 
 func (c *defaultBackupControl) addFinalizer(backup *v1alpha1.NebulaBackup) error {
+    if !backup.CleanBackupData() && kube.HasFinalizer(backup, finalizer) {
+        return kube.UpdateFinalizer(context.TODO(), c.client, backup, kube.RemoveFinalizerOpType, finalizer)
+    }
+
     if needToAddFinalizer(backup) {
         if err := kube.UpdateFinalizer(context.TODO(), c.client, backup, kube.AddFinalizerOpType, finalizer); err != nil {
             return fmt.Errorf("add backup [%s/%s] finalizer failed, err: %v", backup.Namespace, backup.Name, err)
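
The new guard at the top of addFinalizer removes a previously added finalizer once CleanBackupData is false, so a NebulaBackup that no longer needs its backup data cleaned up is not blocked from deletion by a stale finalizer. Below is a minimal, self-contained Go sketch of that decision; the struct, the finalizer name, and the assumption that a finalizer is only wanted when CleanBackupData is true are illustrative stand-ins rather than the operator's actual types and helpers.

package main

import "fmt"

// backupProtectionFinalizer is a made-up name used only for this sketch.
const backupProtectionFinalizer = "example.io/backup-protection"

// nebulaBackup stands in for v1alpha1.NebulaBackup with only the fields the
// decision needs.
type nebulaBackup struct {
    Finalizers      []string
    CleanBackupData bool
}

func hasFinalizer(b nebulaBackup, name string) bool {
    for _, f := range b.Finalizers {
        if f == name {
            return true
        }
    }
    return false
}

// finalizerOp mirrors the control flow of addFinalizer in the diff above:
// a stale finalizer is removed first, otherwise one is added only when the
// backup data is supposed to be cleaned up on deletion.
func finalizerOp(b nebulaBackup) string {
    if !b.CleanBackupData && hasFinalizer(b, backupProtectionFinalizer) {
        return "remove" // the new guard: keep a stale finalizer from blocking deletion
    }
    if b.CleanBackupData && !hasFinalizer(b, backupProtectionFinalizer) {
        return "add" // assumed meaning of needToAddFinalizer, not taken from the source
    }
    return "none"
}

func main() {
    fmt.Println(finalizerOp(nebulaBackup{Finalizers: []string{backupProtectionFinalizer}})) // remove
    fmt.Println(finalizerOp(nebulaBackup{CleanBackupData: true}))                           // add
    fmt.Println(finalizerOp(nebulaBackup{}))                                                // none
}

In the operator itself the add and remove operations go through kube.UpdateFinalizer with AddFinalizerOpType or RemoveFinalizerOpType, as the diff shows.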

pkg/controller/nebularestore/nebula_restore_control.go

Lines changed: 8 additions & 8 deletions
@@ -17,6 +17,8 @@ limitations under the License.
 package nebularestore
 
 import (
+    "fmt"
+
     corev1 "k8s.io/api/core/v1"
     "k8s.io/klog/v2"
 
@@ -77,23 +79,21 @@ func (c *defaultRestoreControl) UpdateNebulaRestore(nr *v1alpha1.NebulaRestore)
     }
     for _, pod := range pods {
         if pod.Status.Phase == corev1.PodFailed {
-            klog.Infof("NebulaCluster [%s/%s] has failed pod %s.", ns, name, pod.Name)
+            terminatedReason := getPodTerminateReason(pod)
             if err := c.clientSet.NebulaRestore().UpdateNebulaRestoreStatus(nr, &v1alpha1.RestoreCondition{
                 Type:    v1alpha1.RestoreFailed,
-                Status:  corev1.ConditionTrue,
+                Status:  corev1.ConditionUnknown,
                 Reason:  "PodFailed",
-                Message: getPodTerminateReason(pod),
+                Message: terminatedReason,
             }, &kube.RestoreUpdateStatus{
                 ConditionType: v1alpha1.RestoreFailed,
             }); err != nil {
                 klog.Errorf("Fail to update the condition of NebulaRestore [%s/%s], %v", ns, name, err)
             }
-            if nr.Spec.AutoRemoveFailed {
-                if err := c.deleteRestoredCluster(ns, nr.Status.ClusterName); err != nil {
-                    klog.Errorf("Fail to delete NebulaCluster [%s/%s], %v", ns, nr.Status.ClusterName, err)
-                }
+            if terminatedReason != "" {
+                klog.Errorf("restored cluster [%s/%s] has failed pod %s, terminated reason: %s", ns, name, pod.Name, terminatedReason)
+                return fmt.Errorf("restored cluster has failed pod: %s", pod.Name)
             }
-            return nil
         }
     }
 }

pkg/controller/nebularestore/nebula_restore_manager.go

Lines changed: 4 additions & 4 deletions
@@ -812,13 +812,13 @@ func (rm *restoreManager) getRestoredName(nr *v1alpha1.NebulaRestore) (string, e
 
 func getPodTerminateReason(pod corev1.Pod) string {
     for _, cs := range pod.Status.InitContainerStatuses {
-        if cs.State.Terminated != nil {
-            return cs.State.Terminated.String()
+        if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
+            return fmt.Sprintf("container %s terminated: %s", cs.Name, cs.State.Terminated.String())
         }
     }
     for _, cs := range pod.Status.ContainerStatuses {
-        if cs.State.Terminated != nil {
-            return cs.State.Terminated.String()
+        if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
+            return fmt.Sprintf("container %s terminated: %s", cs.Name, cs.State.Terminated.String())
         }
     }
     return ""
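
With the stricter check above, getPodTerminateReason only reports containers that terminated with a non-zero exit code, so containers that simply completed no longer produce a misleading failure message. The standalone sketch below exercises the function exactly as it appears in the diff against fabricated pod statuses; the pod fixtures and the main wrapper are illustrative and not part of the operator.

package main

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
)

// getPodTerminateReason is the function as updated in the diff above.
func getPodTerminateReason(pod corev1.Pod) string {
    for _, cs := range pod.Status.InitContainerStatuses {
        if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
            return fmt.Sprintf("container %s terminated: %s", cs.Name, cs.State.Terminated.String())
        }
    }
    for _, cs := range pod.Status.ContainerStatuses {
        if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
            return fmt.Sprintf("container %s terminated: %s", cs.Name, cs.State.Terminated.String())
        }
    }
    return ""
}

func main() {
    // A pod whose main container crashed: a non-empty reason is returned.
    failed := corev1.Pod{Status: corev1.PodStatus{
        Phase: corev1.PodFailed,
        ContainerStatuses: []corev1.ContainerStatus{{
            Name:  "restore",
            State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 1, Reason: "Error"}},
        }},
    }}
    // A pod whose init container simply completed: no reason is reported.
    clean := corev1.Pod{Status: corev1.PodStatus{
        InitContainerStatuses: []corev1.ContainerStatus{{
            Name:  "init",
            State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 0, Reason: "Completed"}},
        }},
    }}

    fmt.Printf("failed pod reason: %q\n", getPodTerminateReason(failed))
    fmt.Printf("clean pod reason:  %q\n", getPodTerminateReason(clean))
}

In the updated controller, an empty reason means the failed pod is only recorded in the NebulaRestore condition, while a non-empty reason aborts reconciliation with an error instead of auto-removing the restored cluster.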
