Skip to content

Commit 363e06c

Browse files
Lars Maier authored and neunhoef committed
Resign Leadership (#414)
* Added resign leadership job to operator. (Version check missing)
* Added version check.
* Added missing deep copy of new field.
* New go-driver.
* Take current version from member.
1 parent ac26eca commit 363e06c

File tree

6 files changed

+89
-14
lines changed

6 files changed

+89
-14
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ require (
2323
github.com/aktau/github-release v0.7.2
2424
github.com/arangodb-helper/go-certificates v0.0.0-20180821055445-9fca24fc2680
2525
github.com/arangodb/arangosync-client v0.6.3
26-
github.com/arangodb/go-driver v0.0.0-20190514134119-4cd2eb482d16
26+
github.com/arangodb/go-driver v0.0.0-20190802095550-7a2c11a3ff12
2727
github.com/arangodb/go-upgrade-rules v0.0.0-20180809110947-031b4774ff21
2828
github.com/arangodb/go-velocypack v0.0.0-20190129082528-7896a965b4ad
2929
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ github.com/arangodb/go-driver v0.0.0-20190430103524-b14f41496c3d h1:9w1hB1yqZ8B1
6363
github.com/arangodb/go-driver v0.0.0-20190430103524-b14f41496c3d/go.mod h1:5nMxlcbN6PHA05uGUw2EH6PIQHvdRVTrg/S/GM8jG4w=
6464
github.com/arangodb/go-driver v0.0.0-20190514134119-4cd2eb482d16 h1:D0+YgwcHnePwiKuV1lOzJLPZb6Tw8hHK2qKSQGqN28s=
6565
github.com/arangodb/go-driver v0.0.0-20190514134119-4cd2eb482d16/go.mod h1:5nMxlcbN6PHA05uGUw2EH6PIQHvdRVTrg/S/GM8jG4w=
66+
github.com/arangodb/go-driver v0.0.0-20190704144722-b504aec1c4d1 h1:tozWjsO7b891qPZJTZoC4HnMmlV3wrBFo/y9DnKMOVA=
67+
github.com/arangodb/go-driver v0.0.0-20190704144722-b504aec1c4d1/go.mod h1:5nMxlcbN6PHA05uGUw2EH6PIQHvdRVTrg/S/GM8jG4w=
68+
github.com/arangodb/go-driver v0.0.0-20190802095550-7a2c11a3ff12 h1:SZZFNsy/6c3CAv9HmhPukMtkbpdt5u1kZ1AaZElDGqE=
69+
github.com/arangodb/go-driver v0.0.0-20190802095550-7a2c11a3ff12/go.mod h1:5nMxlcbN6PHA05uGUw2EH6PIQHvdRVTrg/S/GM8jG4w=
6670
github.com/arangodb/go-upgrade-rules v0.0.0-20180809110947-031b4774ff21 h1:+W7D5ttxi/Ygh/39vialtypE23p9KI7P0J2qtoqUV4w=
6771
github.com/arangodb/go-upgrade-rules v0.0.0-20180809110947-031b4774ff21/go.mod h1:RkPIG6JJ2pcJUoymc18NxAJGraZd+iAEVnOTDjZey/w=
6872
github.com/arangodb/go-velocypack v0.0.0-20190129082528-7896a965b4ad h1:Ah+VRYUWLuqgbfnDyuC8IrIe8XFzpt9tBVVnPGFNTJ8=

pkg/apis/deployment/v1alpha/deployment_spec.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ type DeploymentSpec struct {
5454
ImagePullPolicy *v1.PullPolicy `json:"imagePullPolicy,omitempty"`
5555
DowntimeAllowed *bool `json:"downtimeAllowed,omitempty"`
5656
DisableIPv6 *bool `json:"disableIPv6,omitempty"`
57+
LocallyAttachedVolumes *bool `json:"locallyAttachedVolumes,omitempty"`
5758

5859
ExternalAccess ExternalAccessSpec `json:"externalAccess"`
5960
RocksDB RocksDBSpec `json:"rocksdb"`
@@ -123,6 +124,11 @@ func (s DeploymentSpec) IsDisableIPv6() bool {
123124
return util.BoolOrDefault(s.DisableIPv6)
124125
}
125126

127+
// IsLocallyAttachedVolumes returns the value of the LocallyAttachedVolumes field of the spec.
// true is passed to util.BoolOrDefault as the default, so an unset (nil) field presumably yields true.
128+
func (s DeploymentSpec) IsLocallyAttachedVolumes() bool {
129+
return util.BoolOrDefault(s.LocallyAttachedVolumes, true)
130+
}
131+
126132
// GetListenAddr returns "[::]" or "0.0.0.0" depending on IsDisableIPv6
127133
func (s DeploymentSpec) GetListenAddr() string {
128134
if s.IsDisableIPv6() {

pkg/apis/deployment/v1alpha/member_phase.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ const (
3636
MemberPhaseCleanOut MemberPhase = "CleanOut"
3737
// MemberPhaseDrain indicates that a dbserver is in the process of being cleaned out as result of draining a node
3838
MemberPhaseDrain MemberPhase = "Drain"
39+
// MemberPhaseResign indicates that a dbserver is in the process of resigning for a shutdown
40+
MemberPhaseResign MemberPhase = "Resign"
3941
// MemberPhaseShuttingDown indicates that a member is shutting down
4042
MemberPhaseShuttingDown MemberPhase = "ShuttingDown"
4143
// MemberPhaseRotating indicates that a member is being rotated

pkg/apis/deployment/v1alpha/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/deployment/resources/pod_termination.go

Lines changed: 71 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ import (
2828
"time"
2929

3030
"github.com/rs/zerolog"
31-
"k8s.io/api/core/v1"
3231
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3332

3433
driver "github.com/arangodb/go-driver"
3534
"github.com/arangodb/go-driver/agency"
3635
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
3736
"github.com/arangodb/kube-arangodb/pkg/util/arangod"
3837
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
38+
v1 "k8s.io/api/core/v1"
3939
)
4040

4141
// prepareAgencyPodTermination checks if the given agency pod is allowed to terminate
@@ -137,6 +137,16 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
137137
return nil
138138
}
139139

140+
resignJobAvailable := false
141+
currentVersion := memberStatus.ArangoVersion
142+
if currentVersion != "" {
143+
if currentVersion.CompareTo("3.4.7") > 0 && currentVersion.CompareTo("3.5") < 0 {
144+
resignJobAvailable = true
145+
} else if currentVersion.CompareTo("3.5.0") > 0 {
146+
resignJobAvailable = true
147+
}
148+
}
149+
140150
// Check node the pod is scheduled on
141151
dbserverDataWillBeGone := false
142152
if p.Spec.NodeName != "" {
@@ -147,7 +157,9 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
147157
log.Warn().Err(err).Msg("Failed to get node for member")
148158
return maskAny(err)
149159
} else if node.Spec.Unschedulable {
150-
dbserverDataWillBeGone = true
160+
if r.context.GetSpec().IsLocallyAttachedVolumes() || !resignJobAvailable {
161+
dbserverDataWillBeGone = true
162+
}
151163
}
152164
}
153165

@@ -168,13 +180,12 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
168180
}
169181

170182
// Is this a simple pod restart?
171-
if !dbserverDataWillBeGone {
183+
if !dbserverDataWillBeGone && !resignJobAvailable {
172184
log.Debug().Msg("Pod is just being restarted, safe to remove dbserver pod")
173185
return nil
174186
}
175187

176188
// Inspect cleaned out state
177-
log.Debug().Msg("DBServer data is being deleted, so we will cleanout the dbserver first")
178189
c, err := r.context.GetDatabaseClient(ctx)
179190
if err != nil {
180191
log.Debug().Err(err).Msg("Failed to create member client")
@@ -201,7 +212,7 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
201212
}
202213
// Not cleaned out yet, check member status
203214
if memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) {
204-
log.Warn().Msg("Member is already terminated before it could be cleaned out. Not good, but removing dbserver pod because we cannot do anything further")
215+
log.Warn().Msg("Member is already terminated before it could resign or be cleaned out. Not good, but removing dbserver pod because we cannot do anything further")
205216
// At this point we have to set CleanedOut to true,
206217
// because we can no longer reason about the state in the agency and
207218
// bringing back the dbserver again may result in an cleaned out server without us knowing
@@ -220,13 +231,24 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
220231
var jobID string
221232
ctx = driver.WithJobIDResponse(ctx, &jobID)
222233
// Ensure the cleanout is triggered
223-
log.Debug().Msg("Server is not yet clean out. Triggering a clean out now")
224-
if err := cluster.CleanOutServer(ctx, memberStatus.ID); err != nil {
225-
log.Debug().Err(err).Msg("Failed to clean out server")
226-
return maskAny(err)
234+
if dbserverDataWillBeGone {
235+
log.Debug().Msg("Server is not yet cleaned out. Triggering a clean out now")
236+
if err := cluster.CleanOutServer(ctx, memberStatus.ID); err != nil {
237+
log.Debug().Err(err).Msg("Failed to clean out server")
238+
return maskAny(err)
239+
}
240+
memberStatus.Phase = api.MemberPhaseDrain
241+
} else {
242+
log.Debug().Msg("Temporary shutdown, resign leadership")
243+
if err := cluster.ResignServer(ctx, memberStatus.ID); err != nil {
244+
log.Debug().Err(err).Msg("Failed to resign server")
245+
return maskAny(err)
246+
}
247+
memberStatus.Phase = api.MemberPhaseResign
227248
}
249+
228250
memberStatus.CleanoutJobID = jobID
229-
memberStatus.Phase = api.MemberPhaseDrain
251+
230252
if err := updateMember(memberStatus); err != nil {
231253
return maskAny(err)
232254
}
@@ -239,18 +261,54 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol
239261
}
240262
jobStatus, err := arangod.CleanoutServerJobStatus(ctx, memberStatus.CleanoutJobID, c, agency)
241263
if err != nil {
242-
log.Debug().Err(err).Msg("Failed to fetch cleanout job status")
264+
log.Debug().Err(err).Msg("Failed to fetch job status")
265+
return maskAny(err)
266+
}
267+
if jobStatus.IsFailed() {
268+
log.Warn().Str("reason", jobStatus.Reason()).Msg("Job failed")
269+
// Revert cleanout state
270+
memberStatus.Phase = api.MemberPhaseCreated
271+
memberStatus.CleanoutJobID = ""
272+
if err := updateMember(memberStatus); err != nil {
273+
return maskAny(err)
274+
}
275+
log.Error().Msg("Cleanout/Resign server job failed, continue anyway")
276+
return nil
277+
}
278+
if jobStatus.IsFinished() {
279+
memberStatus.CleanoutJobID = ""
280+
memberStatus.Phase = api.MemberPhaseCreated
281+
}
282+
} else if memberStatus.Phase == api.MemberPhaseResign {
283+
// Check the job progress
284+
agency, err := r.context.GetAgency(ctx)
285+
if err != nil {
286+
log.Debug().Err(err).Msg("Failed to create agency client")
287+
return maskAny(err)
288+
}
289+
jobStatus, err := arangod.CleanoutServerJobStatus(ctx, memberStatus.CleanoutJobID, c, agency)
290+
if err != nil {
291+
log.Debug().Err(err).Msg("Failed to fetch job status")
243292
return maskAny(err)
244293
}
245294
if jobStatus.IsFailed() {
246-
log.Warn().Str("reason", jobStatus.Reason()).Msg("Cleanout Job failed")
295+
log.Warn().Str("reason", jobStatus.Reason()).Msg("Resign Job failed")
247296
// Revert cleanout state
248297
memberStatus.Phase = api.MemberPhaseCreated
249298
memberStatus.CleanoutJobID = ""
250299
if err := updateMember(memberStatus); err != nil {
251300
return maskAny(err)
252301
}
253-
log.Error().Msg("Cleanout server job failed, continue anyway")
302+
log.Error().Msg("Cleanout/Resign server job failed, continue anyway")
303+
return nil
304+
}
305+
if jobStatus.IsFinished() {
306+
log.Debug().Str("reason", jobStatus.Reason()).Msg("Resign Job finished")
307+
memberStatus.CleanoutJobID = ""
308+
memberStatus.Phase = api.MemberPhaseCreated
309+
if err := updateMember(memberStatus); err != nil {
310+
return maskAny(err)
311+
}
254312
return nil
255313
}
256314
}

0 commit comments

Comments
 (0)