diff --git a/Makefile b/Makefile index e2ea94dfc..30174a452 100644 --- a/Makefile +++ b/Makefile @@ -63,3 +63,7 @@ visualize-fsm: cd cmd/metal-api/internal/tools/visualize_fsm go run main.go dot -Tsvg fsm.dot > fsm.svg + +.PHONY: mocks +mocks: + docker run --user $$(id -u):$$(id -g) --rm -w /work -v ${PWD}:/work vektra/mockery:v2.21.1 --name MachineManager --dir /work/cmd/metal-api/internal/scaler --output /work/cmd/metal-api/internal/scaler --filename pool_scaler_mock_test.go --testonly --inpackage diff --git a/cmd/metal-api/internal/datastore/event.go b/cmd/metal-api/internal/datastore/event.go index 88b45d619..392cd538d 100644 --- a/cmd/metal-api/internal/datastore/event.go +++ b/cmd/metal-api/internal/datastore/event.go @@ -5,7 +5,10 @@ import ( "log/slog" "github.com/metal-stack/metal-api/cmd/metal-api/internal/fsm" + "github.com/metal-stack/metal-api/cmd/metal-api/internal/fsm/states" "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + s "github.com/metal-stack/metal-api/cmd/metal-api/internal/scaler" + "github.com/metal-stack/metal-lib/bus" ) // ListProvisioningEventContainers returns all machine provisioning event containers. @@ -40,8 +43,8 @@ func (rs *RethinkStore) UpsertProvisioningEventContainer(ec *metal.ProvisioningE return rs.upsertEntity(rs.eventTable(), ec) } -func (rs *RethinkStore) ProvisioningEventForMachine(ctx context.Context, log *slog.Logger, event *metal.ProvisioningEvent, machineID string) (*metal.ProvisioningEventContainer, error) { - ec, err := rs.FindProvisioningEventContainer(machineID) +func (rs *RethinkStore) ProvisioningEventForMachine(ctx context.Context, log *slog.Logger, publisher bus.Publisher, event *metal.ProvisioningEvent, machine *metal.Machine) (*metal.ProvisioningEventContainer, error) { + ec, err := rs.FindProvisioningEventContainer(machine.ID) if err != nil && !metal.IsNotFound(err) { return nil, err } @@ -49,13 +52,45 @@ func (rs *RethinkStore) ProvisioningEventForMachine(ctx context.Context, log *sl if ec == nil { ec = &metal.ProvisioningEventContainer{ Base: metal.Base{ - ID: machineID, + ID: machine.ID, }, Liveliness: metal.MachineLivelinessAlive, } } - newEC, err := fsm.HandleProvisioningEvent(ctx, log, ec, event) + var scaler *s.PoolScaler + if machine.PartitionID != "" && machine.SizeID != "" { + // in the early lifecycle, when the pxe booting event is submitted + // a machine does not have a partition or size , so the pool scaler + // cannot work at this stage + + p, err := rs.FindPartition(machine.PartitionID) + if err != nil { + return nil, err + } + + scaler = s.NewPoolScaler(&s.PoolScalerConfig{ + Log: log.With("pool-scaler", machine.PartitionID), + Manager: &manager{ + rs: rs, + publisher: publisher, + partitionid: p.ID, + sizeid: machine.SizeID, + }, + Partition: *p, + }) + } + + config := states.StateConfig{ + Log: log.With("fsm", machine.ID), + Context: ctx, + Container: ec, + Event: event, + Scaler: scaler, + Machine: machine, + } + + newEC, err := fsm.HandleProvisioningEvent(&config) if err != nil { return nil, err } @@ -65,7 +100,10 @@ func (rs *RethinkStore) ProvisioningEventForMachine(ctx context.Context, log *sl } newEC.TrimEvents(100) - err = rs.UpsertProvisioningEventContainer(newEC) + if err != nil { + return nil, err + } + return newEC, err } diff --git a/cmd/metal-api/internal/datastore/machine.go b/cmd/metal-api/internal/datastore/machine.go index eb208e942..a58e5e030 100644 --- a/cmd/metal-api/internal/datastore/machine.go +++ b/cmd/metal-api/internal/datastore/machine.go @@ -23,6 +23,7 @@ type 
MachineSearchQuery struct { Tags []string `json:"tags" optional:"true"` // allocation + NotAllocated *bool `json:"not_allocated" optional:"true"` AllocationName *string `json:"allocation_name" optional:"true"` AllocationProject *string `json:"allocation_project" optional:"true"` AllocationImageID *string `json:"allocation_image_id" optional:"true"` @@ -54,7 +55,10 @@ type MachineSearchQuery struct { DiskSizes []int64 `json:"disk_sizes" optional:"true"` // state - StateValue *string `json:"state_value" optional:"true" enum:"|RESERVED|LOCKED"` + HibernationEnabled *bool `json:"hibernation_enabled" optional:"true"` + StateValue *string `json:"state_value" optional:"true" enum:"|RESERVED|LOCKED"` + PreAllocated *bool `json:"preallocated" optional:"true"` + Waiting *bool `json:"waiting" optional:"true"` // ipmi IpmiAddress *string `json:"ipmi_address" optional:"true"` @@ -114,6 +118,16 @@ func (p *MachineSearchQuery) generateTerm(rs *RethinkStore) *r.Term { }) } + if p.NotAllocated != nil { + q = q.Filter(func(row r.Term) r.Term { + if *p.NotAllocated { + return row.Field("allocation").Eq(nil) + } else { + return row.Field("allocation").Ne(nil) + } + }) + } + if p.AllocationName != nil { q = q.Filter(func(row r.Term) r.Term { return row.Field("allocation").Field("name").Eq(*p.AllocationName) @@ -288,12 +302,30 @@ func (p *MachineSearchQuery) generateTerm(rs *RethinkStore) *r.Term { }) } + if p.HibernationEnabled != nil { + q = q.Filter(func(row r.Term) r.Term { + return row.Field("state").Field("hibernation").Field("enabled").Eq(*p.HibernationEnabled) + }) + } + if p.StateValue != nil { q = q.Filter(func(row r.Term) r.Term { return row.Field("state").Field("value").Eq(*p.StateValue) }) } + if p.PreAllocated != nil { + q = q.Filter(func(row r.Term) r.Term { + return row.Field("preallocated").Eq(*p.PreAllocated) + }) + } + + if p.Waiting != nil { + q = q.Filter(func(row r.Term) r.Term { + return row.Field("waiting").Eq(*p.Waiting) + }) + } + if p.IpmiAddress != nil { q = q.Filter(func(row r.Term) r.Term { return row.Field("ipmi").Field("address").Eq(*p.IpmiAddress) diff --git a/cmd/metal-api/internal/datastore/machine_integration_test.go b/cmd/metal-api/internal/datastore/machine_integration_test.go index e45a76810..5d6e92059 100644 --- a/cmd/metal-api/internal/datastore/machine_integration_test.go +++ b/cmd/metal-api/internal/datastore/machine_integration_test.go @@ -367,6 +367,54 @@ func TestRethinkStore_SearchMachines(t *testing.T) { }, wantErr: nil, }, + { + name: "search by not_allocated true", + q: &MachineSearchQuery{ + NotAllocated: pointer.Pointer(true), + }, + mock: []*metal.Machine{ + {Base: metal.Base{ID: "1"}, Allocation: &metal.MachineAllocation{}}, + {Base: metal.Base{ID: "2"}, Allocation: &metal.MachineAllocation{}}, + {Base: metal.Base{ID: "3"}, Allocation: nil}, + }, + want: []*metal.Machine{ + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "3"}, Allocation: nil}), + }, + wantErr: nil, + }, + { + name: "search by not_allocated false", + q: &MachineSearchQuery{ + NotAllocated: pointer.Pointer(false), + }, + mock: []*metal.Machine{ + {Base: metal.Base{ID: "1"}, Allocation: &metal.MachineAllocation{}}, + {Base: metal.Base{ID: "2"}, Allocation: &metal.MachineAllocation{}}, + {Base: metal.Base{ID: "3"}, Allocation: nil}, + }, + want: []*metal.Machine{ + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "1"}, Allocation: &metal.MachineAllocation{}}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "2"}, Allocation: &metal.MachineAllocation{}}), + }, + wantErr: nil, + }, + { 
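+			// a nil NotAllocated filter is expected to leave the result unfiltered, so all three machines are returned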
+ name: "search by not_allocated nil", + q: &MachineSearchQuery{ + NotAllocated: nil, + }, + mock: []*metal.Machine{ + {Base: metal.Base{ID: "1"}, Allocation: &metal.MachineAllocation{}}, + {Base: metal.Base{ID: "2"}, Allocation: &metal.MachineAllocation{}}, + {Base: metal.Base{ID: "3"}, Allocation: nil}, + }, + want: []*metal.Machine{ + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "1"}, Allocation: &metal.MachineAllocation{}}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "2"}, Allocation: &metal.MachineAllocation{}}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "3"}, Allocation: nil}), + }, + wantErr: nil, + }, { name: "search by network ids", q: &MachineSearchQuery{ @@ -644,6 +692,93 @@ func TestRethinkStore_SearchMachines(t *testing.T) { }, wantErr: nil, }, + { + name: "search by waiting true", + q: &MachineSearchQuery{ + Waiting: pointer.Pointer(true), + }, + mock: []*metal.Machine{ + {Base: metal.Base{ID: "1"}, Waiting: true}, + {Base: metal.Base{ID: "2"}, Waiting: true}, + {Base: metal.Base{ID: "3"}, Waiting: false}, + {Base: metal.Base{ID: "4"}, Waiting: false}, + }, + want: []*metal.Machine{ + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "1"}, Waiting: true}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "2"}, Waiting: true}), + }, + wantErr: nil, + }, + { + name: "search by preallocated true", + q: &MachineSearchQuery{ + PreAllocated: pointer.Pointer(true), + }, + mock: []*metal.Machine{ + {Base: metal.Base{ID: "1"}, PreAllocated: true}, + {Base: metal.Base{ID: "2"}, PreAllocated: true}, + {Base: metal.Base{ID: "3"}, PreAllocated: false}, + {Base: metal.Base{ID: "4"}, PreAllocated: false}, + }, + want: []*metal.Machine{ + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "1"}, PreAllocated: true}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "2"}, PreAllocated: true}), + }, + wantErr: nil, + }, + { + name: "search by hibernation_enabled true", + q: &MachineSearchQuery{ + HibernationEnabled: pointer.Pointer(true), + }, + mock: []*metal.Machine{ + {Base: metal.Base{ID: "1"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}, + {Base: metal.Base{ID: "2"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}, + {Base: metal.Base{ID: "3"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}, + {Base: metal.Base{ID: "4"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}, + }, + want: []*metal.Machine{ + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "1"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "2"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}), + }, + wantErr: nil, + }, + { + name: "search by hibernation_enabled false", + q: &MachineSearchQuery{ + HibernationEnabled: pointer.Pointer(false), + }, + mock: []*metal.Machine{ + {Base: metal.Base{ID: "1"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}, + {Base: metal.Base{ID: "2"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}, + {Base: metal.Base{ID: "3"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}, + {Base: metal.Base{ID: "4"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}, + }, + want: []*metal.Machine{ + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "3"}, State: 
metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "4"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}), + }, + wantErr: nil, + }, + { + name: "search by hibernation_enabled nil", + q: &MachineSearchQuery{ + HibernationEnabled: nil, + }, + mock: []*metal.Machine{ + {Base: metal.Base{ID: "1"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}, + {Base: metal.Base{ID: "2"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}, + {Base: metal.Base{ID: "3"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}, + {Base: metal.Base{ID: "4"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}, + }, + want: []*metal.Machine{ + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "1"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "2"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: true}}}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "3"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}), + tt.defaultBody(&metal.Machine{Base: metal.Base{ID: "4"}, State: metal.MachineState{Hibernation: metal.MachineHibernation{Enabled: false}}}), + }, + wantErr: nil, + }, { name: "search by ipmi address", q: &MachineSearchQuery{ diff --git a/cmd/metal-api/internal/datastore/poolsize.go b/cmd/metal-api/internal/datastore/poolsize.go new file mode 100644 index 000000000..5fd0f69a2 --- /dev/null +++ b/cmd/metal-api/internal/datastore/poolsize.go @@ -0,0 +1,117 @@ +package datastore + +import ( + "log/slog" + "time" + + e "github.com/metal-stack/metal-api/cmd/metal-api/internal/eventbus" + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + "github.com/metal-stack/metal-lib/bus" + "github.com/metal-stack/metal-lib/pkg/pointer" +) + +type manager struct { + rs *RethinkStore + publisher bus.Publisher + partitionid string + sizeid string +} + +func (m *manager) AllMachines() (metal.Machines, error) { + q := MachineSearchQuery{ + PartitionID: &m.partitionid, + SizeID: &m.sizeid, + } + + allMachines := metal.Machines{} + err := m.rs.SearchMachines(&q, &allMachines) + if err != nil { + return nil, err + } + + return allMachines, nil +} + +func (m *manager) WaitingMachines() (metal.Machines, error) { + q := MachineSearchQuery{ + PartitionID: &m.partitionid, + SizeID: &m.sizeid, + StateValue: pointer.Pointer(string(metal.AvailableState)), + NotAllocated: pointer.Pointer(true), + Waiting: pointer.Pointer(true), + PreAllocated: pointer.Pointer(false), + HibernationEnabled: pointer.Pointer(false), + } + + waitingMachines := metal.Machines{} + err := m.rs.SearchMachines(&q, &waitingMachines) + if err != nil { + return nil, err + } + + return waitingMachines, nil +} + +func (m *manager) ShutdownMachines() (metal.Machines, error) { + q := MachineSearchQuery{ + PartitionID: &m.partitionid, + SizeID: &m.sizeid, + HibernationEnabled: pointer.Pointer(true), + } + + shutdownMachines := metal.Machines{} + err := m.rs.SearchMachines(&q, &shutdownMachines) + if err != nil { + return nil, err + } + + return shutdownMachines, nil +} + +func (m *manager) Shutdown(machine *metal.Machine) error { + state := metal.MachineState{ + Hibernation: metal.MachineHibernation{ + Enabled: true, + Description: "shutdown by pool scaler due to exceeding pool 
size", + Changed: pointer.Pointer(time.Now()), + }, + } + + err := m.rs.publishCommandAndUpdate(m.rs.log, machine, m.publisher, metal.MachineOffCmd, state) + if err != nil { + return err + } + return nil +} + +func (m *manager) PowerOn(machine *metal.Machine) error { + state := metal.MachineState{ + Hibernation: metal.MachineHibernation{ + Enabled: false, + Description: "powered on by pool scaler to increase pool size", + Changed: pointer.Pointer(time.Now()), + }, + } + + err := m.rs.publishCommandAndUpdate(m.rs.log, machine, m.publisher, metal.MachineOnCmd, state) + if err != nil { + return err + } + return nil +} + +func (rs *RethinkStore) publishCommandAndUpdate(logger *slog.Logger, m *metal.Machine, publisher bus.Publisher, cmd metal.MachineCommand, state metal.MachineState) error { + newMachine := *m + newMachine.State = state + err := rs.UpdateMachine(m, &newMachine) + if err != nil { + return err + } + + err = e.PublishMachineCmd(logger, m, publisher, cmd) + if err != nil { + return err + } + + return nil +} diff --git a/cmd/metal-api/internal/datastore/rethinkdb.go b/cmd/metal-api/internal/datastore/rethinkdb.go index 06238a84e..51ba8c042 100644 --- a/cmd/metal-api/internal/datastore/rethinkdb.go +++ b/cmd/metal-api/internal/datastore/rethinkdb.go @@ -135,18 +135,18 @@ func (rs *RethinkStore) initializeTables(opts r.TableCreateOpts) error { // demoted runtime user creation / update rs.log.Info("creating / updating demoted runtime user") - _, err = rs.userTable().Insert(map[string]interface{}{"id": DemotedUser, "password": rs.dbpass}, r.InsertOpts{ + _, err = rs.userTable().Insert(map[string]any{"id": DemotedUser, "password": rs.dbpass}, r.InsertOpts{ Conflict: "update", }).RunWrite(rs.session) if err != nil { return err } rs.log.Info("ensuring demoted user can read and write") - _, err = rs.db().Grant(DemotedUser, map[string]interface{}{"read": true, "write": true}).RunWrite(rs.session) + _, err = rs.db().Grant(DemotedUser, map[string]any{"read": true, "write": true}).RunWrite(rs.session) if err != nil { return err } - _, err = r.DB("rethinkdb").Grant(DemotedUser, map[string]interface{}{"read": true}).RunWrite(rs.session) + _, err = r.DB("rethinkdb").Grant(DemotedUser, map[string]any{"read": true}).RunWrite(rs.session) if err != nil { return err } @@ -361,7 +361,7 @@ tryAgain: return s } -func (rs *RethinkStore) findEntityByID(table *r.Term, entity interface{}, id string) error { +func (rs *RethinkStore) findEntityByID(table *r.Term, entity any, id string) error { res, err := table.Get(id).Run(rs.session) if err != nil { return fmt.Errorf("cannot find %v with id %q in database: %w", getEntityName(entity), id, err) @@ -377,7 +377,7 @@ func (rs *RethinkStore) findEntityByID(table *r.Term, entity interface{}, id str return nil } -func (rs *RethinkStore) findEntity(query *r.Term, entity interface{}) error { +func (rs *RethinkStore) findEntity(query *r.Term, entity any) error { res, err := query.Run(rs.session) if err != nil { return fmt.Errorf("cannot find %v in database: %w", getEntityName(entity), err) @@ -392,7 +392,7 @@ func (rs *RethinkStore) findEntity(query *r.Term, entity interface{}) error { return fmt.Errorf("cannot find %v in database: %w", getEntityName(entity), err) } - next := map[string]interface{}{} + next := map[string]any{} hasResult = res.Next(&next) if hasResult { return fmt.Errorf("more than one %v exists", getEntityName(entity)) @@ -401,7 +401,7 @@ func (rs *RethinkStore) findEntity(query *r.Term, entity interface{}) error { return nil } -func (rs *RethinkStore) 
searchEntities(query *r.Term, entity interface{}) error { +func (rs *RethinkStore) searchEntities(query *r.Term, entity any) error { res, err := query.Run(rs.session) if err != nil { return fmt.Errorf("cannot search %v in database: %w", getEntityName(entity), err) @@ -415,7 +415,7 @@ func (rs *RethinkStore) searchEntities(query *r.Term, entity interface{}) error return nil } -func (rs *RethinkStore) listEntities(table *r.Term, entity interface{}) error { +func (rs *RethinkStore) listEntities(table *r.Term, entity any) error { res, err := table.Run(rs.session) if err != nil { return fmt.Errorf("cannot list %v from database: %w", getEntityName(entity), err) @@ -493,7 +493,7 @@ func (rs *RethinkStore) updateEntity(table *r.Term, newEntity metal.Entity, oldE return nil } -func getEntityName(entity interface{}) string { +func getEntityName(entity any) string { t := reflect.TypeOf(entity) for t.Kind() == reflect.Ptr { t = t.Elem() diff --git a/cmd/metal-api/internal/eventbus/nsq.go b/cmd/metal-api/internal/eventbus/nsq.go index 5eb3502c9..1ef90ae7e 100644 --- a/cmd/metal-api/internal/eventbus/nsq.go +++ b/cmd/metal-api/internal/eventbus/nsq.go @@ -109,3 +109,22 @@ func (n *NSQClient) createTopics(partitions metal.Partitions, topics []metal.NSQ func (n *NSQClient) delay() { time.Sleep(nsqdRetryDelay) } + +func PublishMachineCmd(logger *slog.Logger, m *metal.Machine, publisher bus.Publisher, cmd metal.MachineCommand) error { + evt := metal.MachineEvent{ + Type: metal.COMMAND, + Cmd: &metal.MachineExecCommand{ + Command: cmd, + TargetMachineID: m.ID, + IPMI: &m.IPMI, + }, + } + + logger.Info("publish event", "event", evt, "command", *evt.Cmd) + err := publisher.Publish(metal.TopicMachine.GetFQN(m.PartitionID), evt) + if err != nil { + return err + } + + return nil +} diff --git a/cmd/metal-api/internal/fsm/events.go b/cmd/metal-api/internal/fsm/events.go index 4466ce24d..58929a7ca 100644 --- a/cmd/metal-api/internal/fsm/events.go +++ b/cmd/metal-api/internal/fsm/events.go @@ -152,13 +152,14 @@ func eventCallbacks(config *states.StateConfig) fsm.Callbacks { // therefore we have an artificial state self_transition from which we can trigger the state-specific callback "enter_" + SelfTransitionState: func(ctx context.Context, e *fsm.Event) { if state, ok := allStates[e.Src]; ok { - state.OnTransition(ctx, e) + state.OnEnter(ctx, e) } }, } for name, state := range allStates { - callbacks["enter_"+name] = state.OnTransition + callbacks["enter_"+name] = state.OnEnter + callbacks["leave_"+name] = state.OnLeave } return callbacks diff --git a/cmd/metal-api/internal/fsm/fsm.go b/cmd/metal-api/internal/fsm/fsm.go index 20b4d8b70..0b36fdbdc 100644 --- a/cmd/metal-api/internal/fsm/fsm.go +++ b/cmd/metal-api/internal/fsm/fsm.go @@ -1,10 +1,8 @@ package fsm import ( - "context" "errors" "fmt" - "log/slog" "strings" "github.com/looplab/fsm" @@ -17,57 +15,51 @@ import ( // The function returns a new provisioning event container that can then be safely persisted in the database. If an error is returned, the incoming event is not supposed to be persisted in the database. // // Among other things, this function can detect crash loops or other irregularities within a machine lifecycle and enriches the returned provisioning event container with this information. 
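+// An illustrative call with the refactored signature (mirroring the usage in datastore/event.go and fsm_test.go in this changeset) looks roughly like:
+//
+//	cfg := states.StateConfig{Log: log, Context: ctx, Container: ec, Event: event}
+//	newEC, err := fsm.HandleProvisioningEvent(&cfg)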
-func HandleProvisioningEvent(ctx context.Context, log *slog.Logger, ec *metal.ProvisioningEventContainer, event *metal.ProvisioningEvent) (*metal.ProvisioningEventContainer, error) { - if ec == nil { - return nil, fmt.Errorf("provisioning event container must not be nil") - } - - if event == nil { - return nil, fmt.Errorf("provisioning event must not be nil") +func HandleProvisioningEvent(c *states.StateConfig) (*metal.ProvisioningEventContainer, error) { + if err := c.Validate(); err != nil { + return nil, err } var ( - clone = *ec - container = &clone - f = fsm.NewFSM( - initialStateFromEventContainer(container), + f = fsm.NewFSM( + initialStateFromEventContainer(c.Container), Events(), - eventCallbacks(&states.StateConfig{Log: log, Event: event, Container: container}), + eventCallbacks(c), ) ) - err := f.Event(ctx, event.Event.String()) + err := f.Event(c.Context, c.Event.Event.String()) if err == nil { - return container, nil + return c.Container, nil } if errors.As(err, &fsm.InvalidEventError{}) { - if event.Message == "" { - event.Message = fmt.Sprintf("[unexpectedly received in %s]", strings.ToLower(f.Current())) + if c.Event.Message == "" { + c.Event.Message = fmt.Sprintf("[unexpectedly received in %s]", strings.ToLower(f.Current())) } else { - event.Message = fmt.Sprintf("[unexpectedly received in %s]: %s", strings.ToLower(f.Current()), event.Message) + c.Event.Message = fmt.Sprintf("[unexpectedly received in %s]: %s", strings.ToLower(f.Current()), c.Event.Message) } - container.LastEventTime = &event.Time - container.Liveliness = metal.MachineLivelinessAlive - container.LastErrorEvent = event + c.Container.LastEventTime = &c.Event.Time + c.Container.Liveliness = metal.MachineLivelinessAlive + c.Container.LastErrorEvent = c.Event - switch e := event.Event; e { //nolint:exhaustive + switch e := c.Event.Event; e { //nolint:exhaustive case metal.ProvisioningEventPXEBooting, metal.ProvisioningEventPreparing: - container.CrashLoop = true - container.Events = append([]metal.ProvisioningEvent{*event}, container.Events...) + c.Container.CrashLoop = true + c.Container.Events = append([]metal.ProvisioningEvent{*c.Event}, c.Container.Events...) case metal.ProvisioningEventAlive: // under no circumstances we want to persists alive in the events container. // when this happens the FSM gets stuck in invalid transitions // (e.g. all following transitions are invalid and all subsequent alive events will be stored, cramping history). default: - container.Events = append([]metal.ProvisioningEvent{*event}, container.Events...) + c.Container.Events = append([]metal.ProvisioningEvent{*c.Event}, c.Container.Events...) 
} - return container, nil + return c.Container, nil } - return nil, fmt.Errorf("internal error while calculating provisioning event container for machine %s: %w", container.ID, err) + return nil, fmt.Errorf("internal error while calculating provisioning event container for machine %s: %w", c.Container.ID, err) } func initialStateFromEventContainer(container *metal.ProvisioningEventContainer) string { diff --git a/cmd/metal-api/internal/fsm/fsm_test.go b/cmd/metal-api/internal/fsm/fsm_test.go index b6605c048..fb85c4c69 100644 --- a/cmd/metal-api/internal/fsm/fsm_test.go +++ b/cmd/metal-api/internal/fsm/fsm_test.go @@ -3,10 +3,12 @@ package fsm import ( "context" "log/slog" + "os" "testing" "time" "github.com/google/go-cmp/cmp" + "github.com/metal-stack/metal-api/cmd/metal-api/internal/fsm/states" "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" ) @@ -127,7 +129,7 @@ func TestHandleProvisioningEvent(t *testing.T) { }, }, { - name: "valid transition from registering to preparing (metal-hammer wait skip)", + name: "valid transition from registering to installing (metal-hammer wait skip)", container: &metal.ProvisioningEventContainer{ Events: metal.ProvisioningEvents{ { @@ -631,7 +633,14 @@ func TestHandleProvisioningEvent(t *testing.T) { ctx := context.Background() tt := tests[i] t.Run(tt.name, func(t *testing.T) { - got, err := HandleProvisioningEvent(ctx, slog.Default(), tt.container, tt.event) + params := states.StateConfig{ + Log: slog.New(slog.NewJSONHandler(os.Stdout, nil)), + Context: ctx, + Container: tt.container, + Event: tt.event, + } + + got, err := HandleProvisioningEvent(¶ms) if diff := cmp.Diff(tt.wantErr, err); diff != "" { t.Errorf("HandleProvisioningEvent() diff = %s", diff) } @@ -652,15 +661,22 @@ func TestReactionToAllIncomingEvents(t *testing.T) { // this test ensures that for every incoming event we have a proper transition for e1 := range metal.AllProvisioningEventTypes { for e2 := range metal.AllProvisioningEventTypes { - _, err := HandleProvisioningEvent(ctx, slog.Default(), &metal.ProvisioningEventContainer{ - Events: metal.ProvisioningEvents{ - { - Event: e2, + params := states.StateConfig{ + Log: slog.New(slog.NewJSONHandler(os.Stdout, nil)), + Context: ctx, + Container: &metal.ProvisioningEventContainer{ + Events: metal.ProvisioningEvents{ + { + Event: e2, + }, }, }, - }, &metal.ProvisioningEvent{ - Event: e1, - }) + Event: &metal.ProvisioningEvent{ + Event: e1, + }, + } + + _, err := HandleProvisioningEvent(¶ms) if err != nil { t.Errorf("transitioning from state %s with event %s: %s", e2, e1, err) } diff --git a/cmd/metal-api/internal/fsm/states/alive.go b/cmd/metal-api/internal/fsm/states/alive.go index d7d78ee4c..baf595559 100644 --- a/cmd/metal-api/internal/fsm/states/alive.go +++ b/cmd/metal-api/internal/fsm/states/alive.go @@ -9,9 +9,11 @@ import ( ) type AliveState struct { + noopState log *slog.Logger container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent + machine *metal.Machine } func newAlive(c *StateConfig) *AliveState { @@ -19,10 +21,17 @@ func newAlive(c *StateConfig) *AliveState { log: c.Log, container: c.Container, event: c.Event, + machine: c.Machine, } } -func (p *AliveState) OnTransition(ctx context.Context, e *fsm.Event) { - updateTimeAndLiveliness(p.event, p.container) +func (p *AliveState) OnEnter(ctx context.Context, e *fsm.Event) { p.log.Debug("received provisioning alive event", "id", p.container.ID) + + if p.machine != nil && p.machine.State.Hibernation.Enabled { + p.container.LastEventTime = 
&p.event.Time // machine is about to shutdown and is still sending alive events + return + } + + updateTimeAndLiveliness(p.event, p.container) } diff --git a/cmd/metal-api/internal/fsm/states/booting-new-kernel.go b/cmd/metal-api/internal/fsm/states/booting-new-kernel.go index ed61faa98..4b601f3c4 100644 --- a/cmd/metal-api/internal/fsm/states/booting-new-kernel.go +++ b/cmd/metal-api/internal/fsm/states/booting-new-kernel.go @@ -8,6 +8,7 @@ import ( ) type BootingNewKernelState struct { + noopState container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent } @@ -19,6 +20,6 @@ func newBootingNewKernel(c *StateConfig) *BootingNewKernelState { } } -func (p *BootingNewKernelState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *BootingNewKernelState) OnEnter(ctx context.Context, e *fsm.Event) { appendEventToContainer(p.event, p.container) } diff --git a/cmd/metal-api/internal/fsm/states/crashed.go b/cmd/metal-api/internal/fsm/states/crashed.go index fa4856239..849fb041d 100644 --- a/cmd/metal-api/internal/fsm/states/crashed.go +++ b/cmd/metal-api/internal/fsm/states/crashed.go @@ -8,6 +8,7 @@ import ( ) type CrashState struct { + noopState container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent } @@ -19,7 +20,7 @@ func newCrash(c *StateConfig) *CrashState { } } -func (p *CrashState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *CrashState) OnEnter(ctx context.Context, e *fsm.Event) { p.container.CrashLoop = true p.container.LastErrorEvent = p.event appendEventToContainer(p.event, p.container) diff --git a/cmd/metal-api/internal/fsm/states/initial.go b/cmd/metal-api/internal/fsm/states/initial.go index 724b347c2..9e19c997e 100644 --- a/cmd/metal-api/internal/fsm/states/initial.go +++ b/cmd/metal-api/internal/fsm/states/initial.go @@ -9,6 +9,7 @@ import ( ) type InitialState struct { + noopState container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent } @@ -20,6 +21,6 @@ func newInitial(c *StateConfig) *InitialState { } } -func (p *InitialState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *InitialState) OnEnter(ctx context.Context, e *fsm.Event) { e.Err = fmt.Errorf("unexpected transition back to initial state") } diff --git a/cmd/metal-api/internal/fsm/states/installing.go b/cmd/metal-api/internal/fsm/states/installing.go index 84912626d..6a05ac252 100644 --- a/cmd/metal-api/internal/fsm/states/installing.go +++ b/cmd/metal-api/internal/fsm/states/installing.go @@ -8,6 +8,7 @@ import ( ) type InstallingState struct { + noopState container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent } @@ -19,6 +20,6 @@ func newInstalling(c *StateConfig) *InstallingState { } } -func (p *InstallingState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *InstallingState) OnEnter(ctx context.Context, e *fsm.Event) { appendEventToContainer(p.event, p.container) } diff --git a/cmd/metal-api/internal/fsm/states/machine-reclaim.go b/cmd/metal-api/internal/fsm/states/machine-reclaim.go index 3981b723b..1711e3f57 100644 --- a/cmd/metal-api/internal/fsm/states/machine-reclaim.go +++ b/cmd/metal-api/internal/fsm/states/machine-reclaim.go @@ -8,6 +8,7 @@ import ( ) type MachineReclaimState struct { + noopState container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent } @@ -19,7 +20,7 @@ func newMachineReclaim(c *StateConfig) *MachineReclaimState { } } -func (p *MachineReclaimState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *MachineReclaimState) OnEnter(ctx 
context.Context, e *fsm.Event) { p.container.CrashLoop = false appendEventToContainer(p.event, p.container) } diff --git a/cmd/metal-api/internal/fsm/states/noop-state.go b/cmd/metal-api/internal/fsm/states/noop-state.go new file mode 100644 index 000000000..127ced397 --- /dev/null +++ b/cmd/metal-api/internal/fsm/states/noop-state.go @@ -0,0 +1,12 @@ +package states + +import ( + "context" + + "github.com/looplab/fsm" +) + +type noopState struct{} + +func (_ noopState) OnEnter(ctx context.Context, e *fsm.Event) {} +func (_ noopState) OnLeave(ctx context.Context, e *fsm.Event) {} diff --git a/cmd/metal-api/internal/fsm/states/phoned-home.go b/cmd/metal-api/internal/fsm/states/phoned-home.go index 113f0a76d..de633b245 100644 --- a/cmd/metal-api/internal/fsm/states/phoned-home.go +++ b/cmd/metal-api/internal/fsm/states/phoned-home.go @@ -13,6 +13,7 @@ import ( const failedMachineReclaimThreshold = 5 * time.Minute type PhonedHomeState struct { + noopState log *slog.Logger container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent @@ -26,7 +27,7 @@ func newPhonedHome(c *StateConfig) *PhonedHomeState { } } -func (p *PhonedHomeState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *PhonedHomeState) OnEnter(ctx context.Context, e *fsm.Event) { switch e.Src { case PhonedHome.String(): updateTimeAndLiveliness(p.event, p.container) diff --git a/cmd/metal-api/internal/fsm/states/planned-reboot.go b/cmd/metal-api/internal/fsm/states/planned-reboot.go index a81525a93..cdf624547 100644 --- a/cmd/metal-api/internal/fsm/states/planned-reboot.go +++ b/cmd/metal-api/internal/fsm/states/planned-reboot.go @@ -8,6 +8,7 @@ import ( ) type PlannedRebootState struct { + noopState container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent } @@ -19,7 +20,7 @@ func newPlannedReboot(c *StateConfig) *PlannedRebootState { } } -func (p *PlannedRebootState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *PlannedRebootState) OnEnter(ctx context.Context, e *fsm.Event) { p.container.CrashLoop = false appendEventToContainer(p.event, p.container) } diff --git a/cmd/metal-api/internal/fsm/states/preparing.go b/cmd/metal-api/internal/fsm/states/preparing.go index 637b921d2..cbc4943da 100644 --- a/cmd/metal-api/internal/fsm/states/preparing.go +++ b/cmd/metal-api/internal/fsm/states/preparing.go @@ -8,6 +8,7 @@ import ( ) type PreparingState struct { + noopState container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent } @@ -19,7 +20,7 @@ func newPreparing(c *StateConfig) *PreparingState { } } -func (p *PreparingState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *PreparingState) OnEnter(ctx context.Context, e *fsm.Event) { p.container.FailedMachineReclaim = false appendEventToContainer(p.event, p.container) diff --git a/cmd/metal-api/internal/fsm/states/pxe-booting.go b/cmd/metal-api/internal/fsm/states/pxe-booting.go index 25b8133d8..1d02596f0 100644 --- a/cmd/metal-api/internal/fsm/states/pxe-booting.go +++ b/cmd/metal-api/internal/fsm/states/pxe-booting.go @@ -8,6 +8,7 @@ import ( ) type PXEBootingState struct { + noopState container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent } @@ -19,7 +20,7 @@ func newPXEBooting(c *StateConfig) *PXEBootingState { } } -func (p *PXEBootingState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *PXEBootingState) OnEnter(ctx context.Context, e *fsm.Event) { p.container.FailedMachineReclaim = false if e.Src == PXEBooting.String() { diff --git 
a/cmd/metal-api/internal/fsm/states/registering.go b/cmd/metal-api/internal/fsm/states/registering.go index 2dea3656b..8162edb71 100644 --- a/cmd/metal-api/internal/fsm/states/registering.go +++ b/cmd/metal-api/internal/fsm/states/registering.go @@ -8,6 +8,7 @@ import ( ) type RegisteringState struct { + noopState container *metal.ProvisioningEventContainer event *metal.ProvisioningEvent } @@ -19,6 +20,6 @@ func newRegistering(c *StateConfig) *RegisteringState { } } -func (p *RegisteringState) OnTransition(ctx context.Context, e *fsm.Event) { +func (p *RegisteringState) OnEnter(ctx context.Context, e *fsm.Event) { appendEventToContainer(p.event, p.container) } diff --git a/cmd/metal-api/internal/fsm/states/states.go b/cmd/metal-api/internal/fsm/states/states.go index 4ed7804e9..e649f4d19 100644 --- a/cmd/metal-api/internal/fsm/states/states.go +++ b/cmd/metal-api/internal/fsm/states/states.go @@ -1,11 +1,14 @@ package states import ( + "fmt" + "context" "log/slog" "github.com/looplab/fsm" "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + "github.com/metal-stack/metal-api/cmd/metal-api/internal/scaler" ) const ( @@ -24,7 +27,8 @@ const ( ) type FSMState interface { - OnTransition(ctx context.Context, e *fsm.Event) + OnEnter(ctx context.Context, e *fsm.Event) + OnLeave(ctx context.Context, e *fsm.Event) } type stateType string @@ -35,8 +39,23 @@ func (t stateType) String() string { type StateConfig struct { Log *slog.Logger + Context context.Context Container *metal.ProvisioningEventContainer Event *metal.ProvisioningEvent + Scaler *scaler.PoolScaler + Machine *metal.Machine +} + +func (c *StateConfig) Validate() error { + if c.Container == nil { + return fmt.Errorf("provisioning event container must not be nil") + } + + if c.Event == nil { + return fmt.Errorf("provisioning event must not be nil") + } + + return nil } func AllStates(c *StateConfig) map[string]FSMState { diff --git a/cmd/metal-api/internal/fsm/states/waiting.go b/cmd/metal-api/internal/fsm/states/waiting.go index 9d4c1ee72..6d920e0f5 100644 --- a/cmd/metal-api/internal/fsm/states/waiting.go +++ b/cmd/metal-api/internal/fsm/states/waiting.go @@ -4,21 +4,34 @@ import ( "context" "github.com/looplab/fsm" - "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" ) type WaitingState struct { - container *metal.ProvisioningEventContainer - event *metal.ProvisioningEvent + config *StateConfig } func newWaiting(c *StateConfig) *WaitingState { return &WaitingState{ - container: c.Container, - event: c.Event, + config: c, } } -func (p *WaitingState) OnTransition(ctx context.Context, e *fsm.Event) { - appendEventToContainer(p.event, p.container) +func (p *WaitingState) OnEnter(ctx context.Context, e *fsm.Event) { + appendEventToContainer(p.config.Event, p.config.Container) + + if p.config.Scaler != nil { + err := p.config.Scaler.AdjustNumberOfWaitingMachines() + if err != nil { + p.config.Log.Error("received error from pool scaler", "error", err) + } + } +} + +func (p *WaitingState) OnLeave(ctx context.Context, e *fsm.Event) { + if p.config.Scaler != nil && e.Dst == Installing.String() { + err := p.config.Scaler.AdjustNumberOfWaitingMachines() + if err != nil { + p.config.Log.Error("received error from pool scaler", "error", err) + } + } } diff --git a/cmd/metal-api/internal/grpc/event-service.go b/cmd/metal-api/internal/grpc/event-service.go index fdd77b395..67d6df3d5 100644 --- a/cmd/metal-api/internal/grpc/event-service.go +++ b/cmd/metal-api/internal/grpc/event-service.go @@ -10,17 +10,20 @@ import ( 
"github.com/metal-stack/metal-api/cmd/metal-api/internal/datastore" "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" v1 "github.com/metal-stack/metal-api/pkg/api/v1" + "github.com/metal-stack/metal-lib/bus" ) type EventService struct { - log *slog.Logger - ds *datastore.RethinkStore + log *slog.Logger + ds *datastore.RethinkStore + publisher bus.Publisher } func NewEventService(cfg *ServerConfig) *EventService { return &EventService{ - ds: cfg.Store, - log: cfg.Logger.WithGroup("event-service"), + ds: cfg.Store, + log: cfg.Logger.WithGroup("event-service"), + publisher: cfg.Publisher, } } func (e *EventService) Send(ctx context.Context, req *v1.EventServiceSendRequest) (*v1.EventServiceSendResponse, error) { @@ -70,7 +73,7 @@ func (e *EventService) Send(ctx context.Context, req *v1.EventServiceSendRequest Message: event.Message, } - _, err = e.ds.ProvisioningEventForMachine(ctx, e.log, &ev, machineID) + _, err = e.ds.ProvisioningEventForMachine(ctx, e.log, e.publisher, &ev, m) if err != nil { processErrs = append(processErrs, err) failed = append(failed, machineID) diff --git a/cmd/metal-api/internal/metal/machine.go b/cmd/metal-api/internal/metal/machine.go index 8fb0b5ae3..beb637987 100644 --- a/cmd/metal-api/internal/metal/machine.go +++ b/cmd/metal-api/internal/metal/machine.go @@ -55,10 +55,18 @@ var ( // the machine will be available for allocation. In all other cases the allocation // must explicitly point to this machine. type MachineState struct { - Value MState `rethinkdb:"value" json:"value"` - Description string `rethinkdb:"description" json:"description"` - Issuer string `rethinkdb:"issuer" json:"issuer,omitempty"` - MetalHammerVersion string `rethinkdb:"metal_hammer_version" json:"metal_hammer_version"` + Value MState `rethinkdb:"value" json:"value"` + Description string `rethinkdb:"description" json:"description"` + Issuer string `rethinkdb:"issuer" json:"issuer,omitempty"` + MetalHammerVersion string `rethinkdb:"metal_hammer_version" json:"metal_hammer_version"` + Hibernation MachineHibernation `rethinkdb:"hibernation" json:"hibernation"` +} + +// A MachineHibernation indicates that a machine was sent to sleep or woken up by the pool scaler +type MachineHibernation struct { + Enabled bool `rethinkdb:"enabled" json:"enabled"` + Description string `rethinkdb:"description" json:"description"` + Changed *time.Time `rethinkdb:"changed" json:"changed"` } // MachineStateFrom converts a machineState string to the type @@ -501,11 +509,12 @@ type MachineLiveliness string // The enums for the machine liveliness states. 
const ( - MachineLivelinessAlive MachineLiveliness = "Alive" - MachineLivelinessDead MachineLiveliness = "Dead" - MachineLivelinessUnknown MachineLiveliness = "Unknown" - MachineDeadAfter time.Duration = 5 * time.Minute - MachineResurrectAfter time.Duration = time.Hour + MachineLivelinessAlive MachineLiveliness = "Alive" + MachineLivelinessDead MachineLiveliness = "Dead" + MachineLivelinessUnknown MachineLiveliness = "Unknown" + MachineLivelinessHibernated MachineLiveliness = "Hibernated" + MachineDeadAfter time.Duration = 5 * time.Minute + MachineResurrectAfter time.Duration = time.Hour ) func capacityOf[V any](identifier string, vs []V, countFn func(v V) (model string, count uint64)) (uint64, []V) { diff --git a/cmd/metal-api/internal/metal/partition.go b/cmd/metal-api/internal/metal/partition.go index 6621057e2..e4542b4ba 100644 --- a/cmd/metal-api/internal/metal/partition.go +++ b/cmd/metal-api/internal/metal/partition.go @@ -1,13 +1,23 @@ package metal +import ( + "errors" + "math" + "strconv" + "strings" +) + // A Partition represents a location. type Partition struct { Base - BootConfiguration BootConfiguration `rethinkdb:"bootconfig" json:"bootconfig"` - MgmtServiceAddress string `rethinkdb:"mgmtserviceaddr" json:"mgmtserviceaddr"` - Labels map[string]string `rethinkdb:"labels" json:"labels"` - DNSServers DNSServers `rethinkdb:"dns_servers" json:"dns_servers"` - NTPServers NTPServers `rethinkdb:"ntp_servers" json:"ntp_servers"` + BootConfiguration BootConfiguration `rethinkdb:"bootconfig" json:"bootconfig"` + MgmtServiceAddress string `rethinkdb:"mgmtserviceaddr" json:"mgmtserviceaddr"` + PrivateNetworkPrefixLength uint8 `rethinkdb:"privatenetworkprefixlength" json:"privatenetworkprefixlength"` + WaitingPoolMinSize string `rethinkdb:"waitingpoolminsize" json:"waitingpoolminsize"` + WaitingPoolMaxSize string `rethinkdb:"waitingpoolmaxsize" json:"waitingpoolmaxsize"` + Labels map[string]string `rethinkdb:"labels" json:"labels"` + DNSServers DNSServers `rethinkdb:"dns_servers" json:"dns_servers"` + NTPServers NTPServers `rethinkdb:"ntp_servers" json:"ntp_servers"` } // BootConfiguration defines the metal-hammer initrd, kernel and commandline @@ -31,3 +41,112 @@ func (sz Partitions) ByID() PartitionMap { } return res } + +type ScalerRange struct { + isDisabled bool + inPercent bool + minSize int + maxSize int +} + +func NewScalerRange(min, max string) (*ScalerRange, error) { + r := &ScalerRange{} + if r.setDisabled(min, max); r.IsDisabled() { + return r, nil + } + + if err := r.setRange(min, max); err != nil { + return nil, err + } + + if err := r.Validate(); err != nil { + return nil, err + } + + return r, nil +} + +func (r *ScalerRange) setDisabled(min, max string) { + if min == "" && max == "" { + r.isDisabled = true + return + } + r.isDisabled = false +} + +func (r *ScalerRange) IsDisabled() bool { + return r.isDisabled +} + +func (r *ScalerRange) setRange(min, max string) error { + minSize, err, minInPercent := stringToSize(min) + if err != nil { + return errors.New("could not parse minimum waiting pool size") + } + + maxSize, err, maxInPercent := stringToSize(max) + if err != nil { + return errors.New("could not parse maximum waiting pool size") + } + + if minInPercent != maxInPercent { + return errors.New("minimum and maximum pool sizes must either be both in percent or both an absolute value") + } + + if minInPercent { + r.inPercent = true + } else { + r.inPercent = false + } + + r.minSize = int(minSize) + r.maxSize = int(maxSize) + return nil +} + +func stringToSize(input 
string) (size int64, err error, inPercent bool) { + s := input + inPercent = false + + if strings.HasSuffix(input, "%") { + s = strings.Split(input, "%")[0] + inPercent = true + } + + size, err = strconv.ParseInt(s, 10, 64) + return size, err, inPercent +} + +func (r *ScalerRange) Min(of int) int { + if r.inPercent { + return percentOf(r.minSize, of) + } + return r.minSize +} + +func (r *ScalerRange) Max(of int) int { + if r.inPercent { + return percentOf(r.maxSize, of) + } + return r.maxSize +} + +func percentOf(percent, of int) int { + return int(math.Round(float64(of) * float64(percent) / 100)) +} + +func (r *ScalerRange) Validate() error { + if r.IsDisabled() { + return nil + } + + if r.minSize <= 0 || r.maxSize <= 0 { + return errors.New("minimum and maximum waiting pool sizes must be greater than 0") + } + + if r.maxSize < r.minSize { + return errors.New("minimum waiting pool size must be less or equal to maximum pool size") + } + + return nil +} diff --git a/cmd/metal-api/internal/metal/partition_test.go b/cmd/metal-api/internal/metal/partition_test.go index e0a6f8b64..d239f6f65 100644 --- a/cmd/metal-api/internal/metal/partition_test.go +++ b/cmd/metal-api/internal/metal/partition_test.go @@ -1,6 +1,7 @@ package metal import ( + "errors" "reflect" "testing" ) @@ -50,3 +51,81 @@ func TestPartitions_ByID(t *testing.T) { }) } } + +func TestNewScalerRange(t *testing.T) { + tests := []struct { + name string + min string + max string + want *ScalerRange + wantErr error + }{ + { + name: "min max format mismatch", + min: "15%", + max: "30", + want: nil, + wantErr: errors.New("minimum and maximum pool sizes must either be both in percent or both an absolute value"), + }, + { + name: "parse error for min", + min: "15#%", + max: "30", + want: nil, + wantErr: errors.New("could not parse minimum waiting pool size"), + }, + { + name: "parse error for max", + min: "15", + max: "#30", + want: nil, + wantErr: errors.New("could not parse maximum waiting pool size"), + }, + { + name: "max less than min", + min: "15", + max: "1", + want: nil, + wantErr: errors.New("minimum waiting pool size must be less or equal to maximum pool size"), + }, + { + name: "0 is not allowed", + min: "0", + max: "0", + want: nil, + wantErr: errors.New("minimum and maximum waiting pool sizes must be greater than 0"), + }, + { + name: "everything okay", + min: "15", + max: "30", + want: &ScalerRange{minSize: 15, maxSize: 30}, + wantErr: nil, + }, + { + name: "everything okay in percent", + min: "15%", + max: "30%", + want: &ScalerRange{minSize: 15, maxSize: 30, inPercent: true}, + wantErr: nil, + }, + { + name: "pool scaling disabled", + want: &ScalerRange{isDisabled: true}, + wantErr: nil, + }, + } + for i := range tests { + tt := tests[i] + t.Run(tt.name, func(t *testing.T) { + got, err := NewScalerRange(tt.min, tt.max) + if (err != nil || tt.wantErr != nil) && (err == nil || tt.wantErr == nil || err.Error() != tt.wantErr.Error()) { + t.Errorf("NewScalerRange() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("NewScalerRange() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/cmd/metal-api/internal/scaler/pool_scaler_mock_test.go b/cmd/metal-api/internal/scaler/pool_scaler_mock_test.go new file mode 100644 index 000000000..b95d6e899 --- /dev/null +++ b/cmd/metal-api/internal/scaler/pool_scaler_mock_test.go @@ -0,0 +1,134 @@ +// Code generated by mockery v2.21.1. DO NOT EDIT. 
+ +package scaler + +import ( + metal "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" + mock "github.com/stretchr/testify/mock" +) + +// MockMachineManager is an autogenerated mock type for the MachineManager type +type MockMachineManager struct { + mock.Mock +} + +// AllMachines provides a mock function with given fields: +func (_m *MockMachineManager) AllMachines() (metal.Machines, error) { + ret := _m.Called() + + var r0 metal.Machines + var r1 error + if rf, ok := ret.Get(0).(func() (metal.Machines, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() metal.Machines); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(metal.Machines) + } + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// PowerOn provides a mock function with given fields: m +func (_m *MockMachineManager) PowerOn(m *metal.Machine) error { + ret := _m.Called(m) + + var r0 error + if rf, ok := ret.Get(0).(func(*metal.Machine) error); ok { + r0 = rf(m) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Shutdown provides a mock function with given fields: m +func (_m *MockMachineManager) Shutdown(m *metal.Machine) error { + ret := _m.Called(m) + + var r0 error + if rf, ok := ret.Get(0).(func(*metal.Machine) error); ok { + r0 = rf(m) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ShutdownMachines provides a mock function with given fields: +func (_m *MockMachineManager) ShutdownMachines() (metal.Machines, error) { + ret := _m.Called() + + var r0 metal.Machines + var r1 error + if rf, ok := ret.Get(0).(func() (metal.Machines, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() metal.Machines); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(metal.Machines) + } + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// WaitingMachines provides a mock function with given fields: +func (_m *MockMachineManager) WaitingMachines() (metal.Machines, error) { + ret := _m.Called() + + var r0 metal.Machines + var r1 error + if rf, ok := ret.Get(0).(func() (metal.Machines, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() metal.Machines); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(metal.Machines) + } + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +type mockConstructorTestingTNewMockMachineManager interface { + mock.TestingT + Cleanup(func()) +} + +// NewMockMachineManager creates a new instance of MockMachineManager. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. 
+func NewMockMachineManager(t mockConstructorTestingTNewMockMachineManager) *MockMachineManager { + mock := &MockMachineManager{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/cmd/metal-api/internal/scaler/poolscaler.go b/cmd/metal-api/internal/scaler/poolscaler.go new file mode 100644 index 000000000..7da1d0ff0 --- /dev/null +++ b/cmd/metal-api/internal/scaler/poolscaler.go @@ -0,0 +1,166 @@ +package scaler + +import ( + "log/slog" + "math" + "math/rand" + + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" +) + +type ( + PoolScaler struct { + log *slog.Logger + manager MachineManager + partition metal.Partition + } + PoolScalerConfig struct { + Log *slog.Logger + Manager MachineManager + Partition metal.Partition + } + MachineManager interface { + AllMachines() (metal.Machines, error) + WaitingMachines() (metal.Machines, error) + ShutdownMachines() (metal.Machines, error) + + Shutdown(m *metal.Machine) error + PowerOn(m *metal.Machine) error + } +) + +func NewPoolScaler(c *PoolScalerConfig) *PoolScaler { + return &PoolScaler{ + log: c.Log.With("partition", c.Partition.ID), + manager: c.Manager, + partition: c.Partition, + } +} + +// AdjustNumberOfWaitingMachines compares the number of waiting machines to the required pool size of the partition and shuts down or powers on machines accordingly +func (p *PoolScaler) AdjustNumberOfWaitingMachines() error { + waitingPoolRange, err := metal.NewScalerRange(p.partition.WaitingPoolMinSize, p.partition.WaitingPoolMaxSize) + if err != nil { + return err + } + + if waitingPoolRange.IsDisabled() { + shutdownMachines, err := p.manager.ShutdownMachines() + if err != nil { + return err + } + + if len(shutdownMachines) < 1 { + return nil + } + + p.log.Info("power on all shutdown machines as automatic pool size scaling is disabled", "number of machines to power on", len(shutdownMachines)) + err = p.powerOnMachines(shutdownMachines, len(shutdownMachines)) + if err != nil { + return err + } + + return nil + } + + waitingMachines, err := p.manager.WaitingMachines() + if err != nil { + return err + } + + var poolSizeExcess int + poolSizeExcess = p.calculatePoolSizeExcess(len(waitingMachines), *waitingPoolRange) + + if poolSizeExcess == 0 { + p.log.Info("pool size condition met; doing nothing") + return nil + } + + if poolSizeExcess > 0 { + p.log.Info("shut down spare machines", "number", poolSizeExcess) + err := p.shutdownMachines(waitingMachines, poolSizeExcess) + if err != nil { + return err + } + } else { + shutdownMachines, err := p.manager.ShutdownMachines() + if err != nil { + return err + } + + poolSizeExcess = int(math.Abs(float64(poolSizeExcess))) + if len(shutdownMachines) < poolSizeExcess { + p.log.Info("not enough machines to meet waiting pool size; power on all remaining", "number", len(shutdownMachines)) + err = p.powerOnMachines(shutdownMachines, len(shutdownMachines)) + if err != nil { + return err + } + return nil + } + + p.log.Info("power on missing machines", "number", poolSizeExcess) + err = p.powerOnMachines(shutdownMachines, poolSizeExcess) + if err != nil { + return err + } + } + + return nil +} + +// calculatePoolSizeExcess checks if there are less waiting machines than minRequired or more than maxRequired +// if yes, it returns the difference between the actual amount of waiting machines and the average of minRequired and maxRequired +// if no, it returns 0 +func (p *PoolScaler) calculatePoolSizeExcess(current int, scalerRange metal.ScalerRange) int { + 
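+	// worked example (cf. poolscaler_test.go below): with an absolute range of 15..20 and 10 waiting
+	// machines, the target is the rounded average round((15+20)/2) = 18, so the result is 10 - 18 = -8
+	// and the caller powers on 8 machines; a current value inside the range yields 0 and nothing happens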
allMachines, err := p.manager.AllMachines() + if err != nil { + return 0 + } + + min := scalerRange.Min(len(allMachines)) + max := scalerRange.Max(len(allMachines)) + average := (float64(max) + float64(min)) / 2 + + p.log.Info("checking pool size condition", "minimum pool size", min, "maximum pool size", max, "current pool size", current) + + if current >= min && current <= max { + return 0 + } + + return current - int(math.Round(average)) +} + +func randomIndices(n, k int) []int { + indices := make([]int, n) + for i := range indices { + indices[i] = i + } + + rand.Shuffle(len(indices), func(i, j int) { indices[i], indices[j] = indices[j], indices[i] }) + + return indices[:k] +} + +func (p *PoolScaler) shutdownMachines(machines metal.Machines, number int) error { + indices := randomIndices(len(machines), number) + for i := range indices { + idx := indices[i] + err := p.manager.Shutdown(&machines[idx]) + if err != nil { + return err + } + } + return nil +} + +func (p *PoolScaler) powerOnMachines(machines metal.Machines, number int) error { + indices := randomIndices(len(machines), number) + for i := range indices { + idx := indices[i] + err := p.manager.PowerOn(&machines[idx]) + if err != nil { + return err + } + } + return nil +} diff --git a/cmd/metal-api/internal/scaler/poolscaler_test.go b/cmd/metal-api/internal/scaler/poolscaler_test.go new file mode 100644 index 000000000..2ca272b1d --- /dev/null +++ b/cmd/metal-api/internal/scaler/poolscaler_test.go @@ -0,0 +1,175 @@ +package scaler + +import ( + "log/slog" + "os" + "testing" + + "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" +) + +func TestPoolScaler_AdjustNumberOfWaitingMachines(t *testing.T) { + tests := []struct { + name string + partition *metal.Partition + mockFn func(mock *MockMachineManager) + wantErr bool + }{ + { + name: "waiting machines match pool size; do nothing", + partition: &metal.Partition{ + WaitingPoolMinSize: "10", + WaitingPoolMaxSize: "20", + }, + mockFn: func(mock *MockMachineManager) { + mock.On("WaitingMachines").Once().Return(metal.Machines(make([]metal.Machine, 10)), nil) + mock.On("AllMachines").Once().Return(metal.Machines(make([]metal.Machine, 150)), nil) + }, + wantErr: false, + }, + { + name: "waiting machines match pool size in percent; do nothing", + partition: &metal.Partition{ + WaitingPoolMinSize: "25%", + WaitingPoolMaxSize: "30%", + }, + mockFn: func(mock *MockMachineManager) { + mock.On("AllMachines").Once().Return(metal.Machines(make([]metal.Machine, 150)), nil) + mock.On("WaitingMachines").Once().Return(metal.Machines(make([]metal.Machine, 45)), nil) + }, + wantErr: false, + }, + { + name: "more waiting machines needed; power on 8 machines", + partition: &metal.Partition{ + WaitingPoolMinSize: "15", + WaitingPoolMaxSize: "20", + }, + mockFn: func(mock *MockMachineManager) { + mock.On("AllMachines").Once().Return(metal.Machines(make([]metal.Machine, 150)), nil) + mock.On("WaitingMachines").Once().Return(metal.Machines(make([]metal.Machine, 10)), nil) + mock.On("ShutdownMachines").Once().Return(metal.Machines(make([]metal.Machine, 10)), nil) + mock.On("PowerOn", &metal.Machine{}).Return(nil).Times(8) + }, + wantErr: false, + }, + { + name: "more waiting machines needed in percent; power on 18 machines", + partition: &metal.Partition{ + WaitingPoolMinSize: "30%", + WaitingPoolMaxSize: "40%", + }, + mockFn: func(mock *MockMachineManager) { + mock.On("AllMachines").Once().Return(metal.Machines(make([]metal.Machine, 150)), nil) + 
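+				// with 150 machines in total, the 30%-40% range resolves to 45..60; 35 waiting machines are too few,
+				// so the scaler powers on round((45+60)/2) - 35 = 18 machines, matching Times(18) below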
mock.On("WaitingMachines").Once().Return(metal.Machines(make([]metal.Machine, 35)), nil) + mock.On("ShutdownMachines").Once().Return(metal.Machines(make([]metal.Machine, 20)), nil) + mock.On("PowerOn", &metal.Machine{}).Return(nil).Times(18) + }, + wantErr: false, + }, + { + name: "pool size exceeded; power off 7 machines", + partition: &metal.Partition{ + WaitingPoolMinSize: "10", + WaitingPoolMaxSize: "15", + }, + mockFn: func(mock *MockMachineManager) { + mock.On("WaitingMachines").Once().Return(metal.Machines(make([]metal.Machine, 20)), nil) + mock.On("AllMachines").Once().Return(metal.Machines(make([]metal.Machine, 150)), nil) + mock.On("Shutdown", &metal.Machine{}).Return(nil).Times(7) + }, + wantErr: false, + }, + { + name: "pool size exceeded in percent; power off 4 machines", + partition: &metal.Partition{ + WaitingPoolMinSize: "15%", + WaitingPoolMaxSize: "20%", + }, + mockFn: func(mock *MockMachineManager) { + mock.On("AllMachines").Once().Return(metal.Machines(make([]metal.Machine, 150)), nil) + mock.On("WaitingMachines").Once().Return(metal.Machines(make([]metal.Machine, 31)), nil) + mock.On("Shutdown", &metal.Machine{}).Return(nil).Times(4) + }, + wantErr: false, + }, + { + name: "more machines needed than available; power on all remaining", + partition: &metal.Partition{ + WaitingPoolMinSize: "30", + WaitingPoolMaxSize: "40", + }, + mockFn: func(mock *MockMachineManager) { + mock.On("ShutdownMachines").Once().Return(metal.Machines(make([]metal.Machine, 10)), nil) + mock.On("WaitingMachines").Once().Return(metal.Machines(make([]metal.Machine, 15)), nil) + mock.On("AllMachines").Once().Return(metal.Machines(make([]metal.Machine, 150)), nil) + mock.On("PowerOn", &metal.Machine{}).Return(nil).Times(10) + }, + wantErr: false, + }, + { + name: "no more machines left; do nothing", + partition: &metal.Partition{ + WaitingPoolMinSize: "15%", + WaitingPoolMaxSize: "20%", + }, + mockFn: func(mock *MockMachineManager) { + mock.On("AllMachines").Once().Return(metal.Machines(make([]metal.Machine, 150)), nil) + mock.On("ShutdownMachines").Once().Return(metal.Machines(make([]metal.Machine, 0)), nil) + mock.On("WaitingMachines").Once().Return(metal.Machines(make([]metal.Machine, 22)), nil) + }, + wantErr: false, + }, + { + name: "edge case 2 waiting machines; min = max = 1", + partition: &metal.Partition{ + WaitingPoolMinSize: "1", + WaitingPoolMaxSize: "1", + }, + mockFn: func(mock *MockMachineManager) { + mock.On("AllMachines").Once().Return(metal.Machines(make([]metal.Machine, 2)), nil) + mock.On("WaitingMachines").Once().Return(metal.Machines(make([]metal.Machine, 2)), nil) + mock.On("Shutdown", &metal.Machine{}).Return(nil).Times(1) + }, + wantErr: false, + }, + { + name: "pool scaling disabled and no shutdown machines; do nothing", + partition: &metal.Partition{}, + mockFn: func(mock *MockMachineManager) { + mock.On("ShutdownMachines").Once().Return(nil, nil) + }, + wantErr: false, + }, + { + name: "pool scaling disabled and some machines are shutdown; power on all", + partition: &metal.Partition{}, + mockFn: func(mock *MockMachineManager) { + mock.On("ShutdownMachines").Once().Return(metal.Machines(make([]metal.Machine, 10)), nil) + mock.On("PowerOn", &metal.Machine{}).Return(nil).Times(10) + }, + wantErr: false, + }, + } + for i := range tests { + tt := tests[i] + t.Run(tt.name, func(t *testing.T) { + manager := NewMockMachineManager(t) + + if tt.mockFn != nil { + tt.mockFn(manager) + } + + p := &PoolScaler{ + log: slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: 
slog.LevelError})), + manager: manager, + partition: *tt.partition, + } + if err := p.AdjustNumberOfWaitingMachines(); (err != nil) != tt.wantErr { + t.Errorf("PoolScaler.AdjustNumberOfWaitingMachines() error = %v, wantErr %v", err, tt.wantErr) + } + + manager.AssertExpectations(t) + }) + } +} diff --git a/cmd/metal-api/internal/service/integration_test.go b/cmd/metal-api/internal/service/integration_test.go index b79af30ee..33e69747a 100644 --- a/cmd/metal-api/internal/service/integration_test.go +++ b/cmd/metal-api/internal/service/integration_test.go @@ -72,12 +72,13 @@ func createTestEnvironment(t *testing.T, log *slog.Logger, ds *datastore.Rethink }}, nil) mdc := mdm.NewMock(psc, nil, nil, nil, nil) + ctx := context.Background() listener, err := net.Listen("tcp", "127.0.0.1:0") require.NoError(t, err) go func() { - err := metalgrpc.Run(&metalgrpc.ServerConfig{ - Context: context.Background(), + err := metalgrpc.Run(&metalgrpc.ServerConfig{ //nolint + Context: ctx, Store: ds, Publisher: publisher, Consumer: consumer, @@ -115,7 +116,7 @@ func createTestEnvironment(t *testing.T, log *slog.Logger, ds *datastore.Rethink machineService: machineService, ipService: ipService, ds: ds, - ctx: context.TODO(), + ctx: ctx, listener: listener, } diff --git a/cmd/metal-api/internal/service/machine-service.go b/cmd/metal-api/internal/service/machine-service.go index a49f0a4a7..b09c32302 100644 --- a/cmd/metal-api/internal/service/machine-service.go +++ b/cmd/metal-api/internal/service/machine-service.go @@ -32,6 +32,7 @@ import ( mdm "github.com/metal-stack/masterdata-api/pkg/client" "github.com/metal-stack/metal-api/cmd/metal-api/internal/datastore" + e "github.com/metal-stack/metal-api/cmd/metal-api/internal/eventbus" "github.com/metal-stack/metal-api/cmd/metal-api/internal/ipam" "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" v1 "github.com/metal-stack/metal-api/cmd/metal-api/internal/service/v1" @@ -920,6 +921,17 @@ func (r *machineResource) ipmiReport(request *restful.Request, response *restful if report.PowerState != "" { newMachine.IPMI.PowerState = report.PowerState } + + h := newMachine.State.Hibernation + // TODO: an enum for PowerState would be nice + if newMachine.IPMI.PowerState == "ON" && h.Enabled && h.Changed != nil && time.Since(*h.Changed) > 10*time.Minute { + newMachine.State.Hibernation = metal.MachineHibernation{ + Enabled: false, + Description: "machine was hibernated by poolscaler but is still powered on or was powered on by user", + Changed: pointer.Pointer(time.Now()), + } + } + if report.PowerMetric != nil { newMachine.IPMI.PowerMetric = &metal.PowerMetric{ AverageConsumedWatts: report.PowerMetric.AverageConsumedWatts, @@ -1802,7 +1814,7 @@ func (r *machineResource) freeMachine(request *restful.Request, response *restfu logger := r.logger(request) - err = publishMachineCmd(logger, m, r.Publisher, metal.ChassisIdentifyLEDOffCmd) + err = e.PublishMachineCmd(logger, m, r.Publisher, metal.ChassisIdentifyLEDOffCmd) if err != nil { logger.Error("unable to publish machine command", "command", string(metal.ChassisIdentifyLEDOffCmd), "machineID", m.ID, "error", err) } @@ -1826,9 +1838,9 @@ func (r *machineResource) freeMachine(request *restful.Request, response *restfu Event: metal.ProvisioningEventMachineReclaim, Message: "free machine called", } - ctx := request.Request.Context() - _, err = r.ds.ProvisioningEventForMachine(ctx, logger, &ev, id) + _, err = r.ds.ProvisioningEventForMachine(ctx, logger, r, &ev, m) + if err != nil { r.log.Error("error sending 
provisioning event after machine free", "error", err) } @@ -1977,7 +1989,7 @@ func (r *machineResource) reinstallMachine(request *restful.Request, response *r return } - err = publishMachineCmd(logger, m, r.Publisher, metal.MachineReinstallCmd) + err = e.PublishMachineCmd(logger, m, r.Publisher, metal.MachineReinstallCmd) if err != nil { logger.Error("unable to publish machine command", "command", string(metal.MachineReinstallCmd), "machineID", m.ID, "error", err) } @@ -2032,61 +2044,65 @@ func MachineLiveliness(ds *datastore.RethinkStore, logger *slog.Logger) error { return err } - unknown := 0 - alive := 0 - dead := 0 - errs := 0 + var ( + unknown = 0 + alive = 0 + dead = 0 + hibernated = 0 + errs = 0 + ) for _, m := range machines { lvlness, err := evaluateMachineLiveliness(ds, m) if err != nil { logger.Error("cannot update liveliness", "error", err, "machine", m) errs++ - // fall through, so the rest of the machines is getting evaluated + // fall through, so the machine is counted anyway and the rest of the machines still get evaluated } switch lvlness { case metal.MachineLivelinessAlive: alive++ case metal.MachineLivelinessDead: dead++ + case metal.MachineLivelinessHibernated: + hibernated++ case metal.MachineLivelinessUnknown: unknown++ } } - logger.Info("machine liveliness evaluated", "alive", alive, "dead", dead, "unknown", unknown, "errors", errs) + logger.Info("machine liveliness evaluated", "alive", alive, "dead", dead, "hibernated", hibernated, "unknown", unknown, "errors", errs) return nil } func evaluateMachineLiveliness(ds *datastore.RethinkStore, m metal.Machine) (metal.MachineLiveliness, error) { - provisioningEvents, err := ds.FindProvisioningEventContainer(m.ID) + ec, err := ds.FindProvisioningEventContainer(m.ID) if err != nil { // we have no provisioning events...
we cannot tell return metal.MachineLivelinessUnknown, fmt.Errorf("no provisioning event container found for machine: %s", m.ID) } - old := *provisioningEvents + updateLiveliness := func(liveliness metal.MachineLiveliness) (metal.MachineLiveliness, error) { + old := *ec + ec.Liveliness = liveliness - if provisioningEvents.LastEventTime != nil { - if time.Since(*provisioningEvents.LastEventTime) > metal.MachineDeadAfter { - if m.Allocation != nil { - // the machine is either dead or the customer did turn off the phone home service - provisioningEvents.Liveliness = metal.MachineLivelinessUnknown - } else { - // the machine is just dead - provisioningEvents.Liveliness = metal.MachineLivelinessDead - } - } else { - provisioningEvents.Liveliness = metal.MachineLivelinessAlive - } + return ec.Liveliness, ds.UpdateProvisioningEventContainer(&old, ec) + } - err = ds.UpdateProvisioningEventContainer(&old, provisioningEvents) - if err != nil { - return provisioningEvents.Liveliness, err - } + if m.State.Hibernation.Enabled { + return updateLiveliness(metal.MachineLivelinessHibernated) + } + + if time.Since(pointer.SafeDeref(ec.LastEventTime)) < metal.MachineDeadAfter { + return updateLiveliness(metal.MachineLivelinessAlive) + } + + if m.Allocation != nil { + // the machine is either dead or the customer did turn off the phone home service + return updateLiveliness(metal.MachineLivelinessUnknown) } - return provisioningEvents.Liveliness, nil + return updateLiveliness(metal.MachineLivelinessDead) } // ResurrectMachines attempts to resurrect machines that are obviously dead @@ -2120,6 +2136,10 @@ func ResurrectMachines(ctx context.Context, ds *datastore.RethinkStore, publishe continue } + if provisioningEvents.Liveliness == metal.MachineLivelinessHibernated { + continue + } + if provisioningEvents.Liveliness == metal.MachineLivelinessDead && time.Since(*provisioningEvents.LastEventTime) > metal.MachineResurrectAfter { logger.Info("resurrecting dead machine", "machineID", m.ID, "liveliness", provisioningEvents.Liveliness, "since", time.Since(*provisioningEvents.LastEventTime).String()) err = act.freeMachine(ctx, publisher, &m, headscaleClient, logger) @@ -2291,7 +2311,7 @@ func (r *machineResource) machineCmd(ctx context.Context, cmd metal.MachineComma Event: metal.ProvisioningEventPlannedReboot, Message: string(cmd), } - _, err = r.ds.ProvisioningEventForMachine(ctx, logger, &ev, id) + _, err = r.ds.ProvisioningEventForMachine(ctx, logger, r, &ev, newMachine) if err != nil { r.sendError(request, response, defaultError(err)) return @@ -2327,7 +2347,7 @@ func (r *machineResource) machineCmd(ctx context.Context, cmd metal.MachineComma newMachine.IPMI.Password = r.ipmiSuperUser.Password() } - err = publishMachineCmd(logger, newMachine, r.Publisher, cmd) + err = e.PublishMachineCmd(logger, newMachine, r.Publisher, cmd) if err != nil { r.sendError(request, response, defaultError(err)) return @@ -2342,25 +2362,6 @@ func (r *machineResource) machineCmd(ctx context.Context, cmd metal.MachineComma r.send(request, response, http.StatusOK, resp) } -func publishMachineCmd(logger *slog.Logger, m *metal.Machine, publisher bus.Publisher, cmd metal.MachineCommand) error { - evt := metal.MachineEvent{ - Type: metal.COMMAND, - Cmd: &metal.MachineExecCommand{ - Command: cmd, - TargetMachineID: m.ID, - IPMI: &m.IPMI, - }, - } - - logger.Info("publish event", "event", evt, "command", *evt.Cmd) - err := publisher.Publish(metal.TopicMachine.GetFQN(m.PartitionID), evt) - if err != nil { - return err - } - - return nil -} - func 
makeMachineResponse(m *metal.Machine, ds *datastore.RethinkStore) (*v1.MachineResponse, error) { s, p, i, ec, err := findMachineReferencedEntities(m, ds) if err != nil { diff --git a/cmd/metal-api/internal/service/machine-service_integration_test.go b/cmd/metal-api/internal/service/machine-service_integration_test.go index 2babe4e5e..032bb02aa 100644 --- a/cmd/metal-api/internal/service/machine-service_integration_test.go +++ b/cmd/metal-api/internal/service/machine-service_integration_test.go @@ -20,6 +20,7 @@ import ( grpcv1 "github.com/metal-stack/metal-api/pkg/api/v1" "github.com/metal-stack/metal-api/test" "github.com/metal-stack/metal-lib/bus" + "github.com/metal-stack/metal-lib/pkg/pointer" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -319,3 +320,134 @@ func BenchmarkMachineList(b *testing.B) { b.StopTimer() } + +func TestMachineLivelinessEvaluation(t *testing.T) { + log := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelError})) + + rethinkContainer, cd, err := test.StartRethink(t) + require.NoError(t, err) + nsqContainer, publisher, consumer := test.StartNsqd(t, log) + + defer func() { + _ = rethinkContainer.Terminate(context.Background()) + _ = nsqContainer.Terminate(context.Background()) + }() + + ds := datastore.New(log, cd.IP+":"+cd.Port, cd.DB, cd.User, cd.Password) + ds.VRFPoolRangeMax = 1000 + ds.ASNPoolRangeMax = 1000 + + te := createTestEnvironment(t, log, ds, publisher, consumer) + now := time.Now() + + machinesWithECs := []struct { + m metal.Machine + ec *metal.ProvisioningEventContainer + want metal.MachineLiveliness + }{ + { + m: metal.Machine{ + Base: metal.Base{ + ID: "1", + Description: "waiting machine", + }, + Waiting: true, + }, + ec: &metal.ProvisioningEventContainer{ + Base: metal.Base{ID: "1"}, + LastEventTime: pointer.Pointer(now.Add(-1 * time.Minute)), + }, + want: metal.MachineLivelinessAlive, + }, + { + m: metal.Machine{ + Base: metal.Base{ + ID: "2", + Description: "dead machine", + }, + }, + ec: &metal.ProvisioningEventContainer{ + Base: metal.Base{ID: "2"}, + LastEventTime: pointer.Pointer(now.Add(-1*time.Minute - metal.MachineDeadAfter)), + }, + want: metal.MachineLivelinessDead, + }, + { + m: metal.Machine{ + Base: metal.Base{ + ID: "3", + Description: "machine where user disabled lldp daemon", + }, + Allocation: &metal.MachineAllocation{ + Creator: "someone", + }, + }, + ec: &metal.ProvisioningEventContainer{ + Base: metal.Base{ID: "3"}, + LastEventTime: pointer.Pointer(now.Add(-1*time.Minute - metal.MachineDeadAfter)), + }, + want: metal.MachineLivelinessUnknown, + }, + { + m: metal.Machine{ + Base: metal.Base{ + ID: "4", + Description: "hibernated machine", + }, + State: metal.MachineState{ + Hibernation: metal.MachineHibernation{ + Enabled: true, + }, + }, + }, + ec: &metal.ProvisioningEventContainer{ + Base: metal.Base{ID: "4"}, + LastEventTime: pointer.Pointer(now.Add(-1*time.Minute - metal.MachineDeadAfter)), + }, + want: metal.MachineLivelinessHibernated, + }, + { + m: metal.Machine{ + Base: metal.Base{ + ID: "5", + Description: "machine without event container", + }, + }, + ec: nil, + want: metal.MachineLivelinessUnknown, + }, + } + + for i := range machinesWithECs { + skeleton := &metal.Machine{Base: machinesWithECs[i].m.Base} + err := te.ds.CreateMachine(skeleton) + require.NoError(t, err) + + updatedMachine := *skeleton + updatedMachine.Waiting = machinesWithECs[i].m.Waiting + updatedMachine.Allocation = machinesWithECs[i].m.Allocation + updatedMachine.State = 
machinesWithECs[i].m.State + + err = te.ds.UpdateMachine(skeleton, &updatedMachine) + require.NoError(t, err) + + if machinesWithECs[i].ec != nil { + err = te.ds.CreateProvisioningEventContainer(machinesWithECs[i].ec) + require.NoError(t, err) + } + } + + err = MachineLiveliness(te.ds, slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelError}))) + require.NoError(t, err) + + for i := range machinesWithECs { + if machinesWithECs[i].ec == nil { + continue + } + + ec, err := te.ds.FindProvisioningEventContainer(machinesWithECs[i].m.ID) + require.NoError(t, err) + + assert.Equal(t, machinesWithECs[i].want, ec.Liveliness, "machine %q", machinesWithECs[i].m.ID) + } +} diff --git a/cmd/metal-api/internal/service/partition-service.go b/cmd/metal-api/internal/service/partition-service.go index 61f774863..537af7cb8 100644 --- a/cmd/metal-api/internal/service/partition-service.go +++ b/cmd/metal-api/internal/service/partition-service.go @@ -197,6 +197,12 @@ func (r *partitionResource) createPartition(request *restful.Request, response * commandLine = *requestPayload.PartitionBootConfiguration.CommandLine } + minSize, maxSize, err := getPoolsizeRange("", "", requestPayload.PartitionWaitingPoolMinSize, requestPayload.PartitionWaitingPoolMaxSize) + if err != nil { + r.sendError(request, response, httperrors.BadRequest(err)) + return + } + var dnsServers metal.DNSServers if len(requestPayload.DNSServers) != 0 { for _, s := range requestPayload.DNSServers { @@ -237,8 +243,10 @@ func (r *partitionResource) createPartition(request *restful.Request, response * KernelURL: kernelURL, CommandLine: commandLine, }, - DNSServers: dnsServers, - NTPServers: ntpServers, + WaitingPoolMinSize: minSize, + WaitingPoolMaxSize: maxSize, + DNSServers: dnsServers, + NTPServers: ntpServers, } fqn := metal.TopicMachine.GetFQN(p.GetID()) @@ -324,6 +332,14 @@ func (r *partitionResource) updatePartition(request *restful.Request, response * newPartition.BootConfiguration.CommandLine = *requestPayload.PartitionBootConfiguration.CommandLine } + minSize, maxSize, err := getPoolsizeRange(oldPartition.WaitingPoolMinSize, oldPartition.WaitingPoolMaxSize, requestPayload.PartitionWaitingPoolMinSize, requestPayload.PartitionWaitingPoolMaxSize) + if err != nil { + r.sendError(request, response, httperrors.BadRequest(err)) + return + } + newPartition.WaitingPoolMinSize = minSize + newPartition.WaitingPoolMaxSize = maxSize + if requestPayload.DNSServers != nil { newPartition.DNSServers = metal.DNSServers{} for _, s := range requestPayload.DNSServers { @@ -555,3 +571,27 @@ func (r *partitionResource) calcPartitionCapacity(pcr *v1.PartitionCapacityReque return res, nil } + +func getPoolsizeRange(currentMinSize, currentMaxSize string, requestedMinSize, requestedMaxSize *string) (min, max string, err error) { + if requestedMinSize != nil { + min = *requestedMinSize + } else { + min = currentMinSize + } + + if requestedMaxSize != nil { + max = *requestedMaxSize + } else { + max = currentMaxSize + } + + if (min == "") != (max == "") { + return "", "", fmt.Errorf("to activate waiting machine pool scaling both minimum and maximum pool sizes must be given") + } + + if _, err = metal.NewScalerRange(min, max); err != nil { + return "", "", err + } + + return min, max, nil +} diff --git a/cmd/metal-api/internal/service/partition-service_test.go b/cmd/metal-api/internal/service/partition-service_test.go index bac50f5b2..292e32932 100644 --- a/cmd/metal-api/internal/service/partition-service_test.go +++ 
b/cmd/metal-api/internal/service/partition-service_test.go @@ -11,18 +11,17 @@ import ( "testing" "time" - "github.com/google/go-cmp/cmp" - "github.com/stretchr/testify/assert" - r "gopkg.in/rethinkdb/rethinkdb-go.v6" - restful "github.com/emicklei/go-restful/v3" + "github.com/google/go-cmp/cmp" "github.com/metal-stack/metal-api/cmd/metal-api/internal/datastore" "github.com/metal-stack/metal-api/cmd/metal-api/internal/metal" v1 "github.com/metal-stack/metal-api/cmd/metal-api/internal/service/v1" "github.com/metal-stack/metal-api/cmd/metal-api/internal/testdata" "github.com/metal-stack/metal-lib/httperrors" "github.com/metal-stack/metal-lib/pkg/pointer" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + r "gopkg.in/rethinkdb/rethinkdb-go.v6" ) type nopTopicCreator struct { @@ -814,3 +813,101 @@ func TestPartitionCapacity(t *testing.T) { }) } } + +func Test_partitionResource_getPoolsizeRange(t *testing.T) { + tests := []struct { + name string + currentMin, currentMax string + newMin, newMax *string + wantMin, wantMax string + wantErr bool + }{ + { + name: "both args empty leaves partition unchanged", + wantErr: false, + }, + { + name: "if one arg is given the other must be given, too", + newMin: pointer.Pointer("10"), + wantErr: true, + }, + { + name: "leaving max size empty is okay if the field is already set in the partition", + currentMin: "10", + currentMax: "20", + newMin: pointer.Pointer("5"), + wantMin: "5", + wantMax: "20", + wantErr: false, + }, + { + name: "leaving min size empty is okay if the field is already set in the partition", + currentMin: "10", + currentMax: "20", + newMax: pointer.Pointer("50"), + wantMin: "10", + wantMax: "50", + wantErr: false, + }, + { + name: "setting both fields overrides existing entries", + currentMin: "10", + currentMax: "20", + newMin: pointer.Pointer("20"), + newMax: pointer.Pointer("50"), + wantMin: "20", + wantMax: "50", + wantErr: false, + }, + { + name: "setting both fields overrides non-existing entries", + newMin: pointer.Pointer("20"), + newMax: pointer.Pointer("50"), + wantMin: "20", + wantMax: "50", + wantErr: false, + }, + { + name: "min must not be greater than max", + newMin: pointer.Pointer("60"), + newMax: pointer.Pointer("50"), + wantErr: true, + }, + { + name: "min must not be greater than existing max", + currentMin: "10", + currentMax: "50", + newMin: pointer.Pointer("60"), + wantErr: true, + }, + { + name: "existing min must not be greater than new max", + currentMin: "10", + currentMax: "50", + newMax: pointer.Pointer("5"), + wantErr: true, + }, + { + name: "passing empty strings is allowed", + currentMin: "10", + currentMax: "50", + newMin: pointer.Pointer(""), + newMax: pointer.Pointer(""), + wantMin: "", + wantMax: "", + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + minSize, maxSize, err := getPoolsizeRange(tt.currentMin, tt.currentMax, tt.newMin, tt.newMax) + if (err != nil) != tt.wantErr { + t.Errorf("partitionResource.getPoolsizeRange() error = %v, wantErr %v", err, tt.wantErr) + } + + if minSize != tt.wantMin || maxSize != tt.wantMax { + t.Errorf("partitionResource.getPoolsizeRange() = %s, %s, wantMin %s, wantMax %s", minSize, maxSize, tt.wantMin, tt.wantMax) + } + }) + } +} diff --git a/cmd/metal-api/internal/service/v1/machine.go b/cmd/metal-api/internal/service/v1/machine.go index ee137fd58..76b946a63 100644 --- a/cmd/metal-api/internal/service/v1/machine.go +++ b/cmd/metal-api/internal/service/v1/machine.go @@ -21,7 +21,7 @@ type
MachineBase struct { Allocation *MachineAllocation `json:"allocation" description:"the allocation data of an allocated machine" optional:"true"` State MachineState `json:"state" rethinkdb:"state" description:"the state of this machine"` LEDState ChassisIdentifyLEDState `json:"ledstate" rethinkdb:"ledstate" description:"the state of this chassis identify LED"` - Liveliness string `json:"liveliness" description:"the liveliness of this machine"` + Liveliness string `json:"liveliness" description:"the liveliness of this machine" enum:"Alive|Dead|Unknown|Hibernated"` RecentProvisioningEvents MachineRecentProvisioningEvents `json:"events" description:"recent events of this machine during provisioning"` Tags []string `json:"tags" description:"tags for this machine"` } @@ -109,10 +109,17 @@ type MetalGPU struct { } type MachineState struct { - Value string `json:"value" enum:"RESERVED|LOCKED|" description:"the state of this machine. empty means available for all"` - Description string `json:"description" description:"a description why this machine is in the given state"` - Issuer string `json:"issuer,omitempty" optional:"true" description:"the user that changed the state"` - MetalHammerVersion string `json:"metal_hammer_version" description:"the version of metal hammer which put the machine in waiting state"` + Value string `json:"value" enum:"RESERVED|LOCKED|" description:"the state of this machine. empty means available for all"` + Description string `json:"description" description:"a description why this machine is in the given state"` + Issuer string `json:"issuer,omitempty" optional:"true" description:"the user that changed the state"` + MetalHammerVersion string `json:"metal_hammer_version" description:"the version of metal hammer which put the machine in waiting state"` + Hibernation MachineHibernation `json:"hibernation" description:"indicates that a machine was sent to sleep or woken up by the pool scaler"` +} + +type MachineHibernation struct { + Enabled bool `json:"enabled" description:"true if hibernation is enabled"` + Description string `json:"description" description:"describes last state change of hibernation"` + Changed *time.Time `json:"changed" optional:"true" description:"last changed timestamp"` } type ChassisIdentifyLEDState struct { @@ -678,6 +685,11 @@ func NewMachineResponse(m *metal.Machine, s *metal.Size, p *metal.Partition, i * Description: m.State.Description, Issuer: m.State.Issuer, MetalHammerVersion: m.State.MetalHammerVersion, + Hibernation: MachineHibernation{ + Enabled: m.State.Hibernation.Enabled, + Description: m.State.Hibernation.Description, + Changed: m.State.Hibernation.Changed, + }, }, LEDState: ChassisIdentifyLEDState{ Value: string(m.LEDState.Value), diff --git a/cmd/metal-api/internal/service/v1/partition.go b/cmd/metal-api/internal/service/v1/partition.go index 1561d768b..af4cb6ac0 100644 --- a/cmd/metal-api/internal/service/v1/partition.go +++ b/cmd/metal-api/internal/service/v1/partition.go @@ -5,10 +5,13 @@ import ( ) type PartitionBase struct { - MgmtServiceAddress *string `json:"mgmtserviceaddress" description:"the address to the management service of this partition" optional:"true"` - Labels map[string]string `json:"labels" description:"free labels that you associate with this partition" optional:"true"` - DNSServers []DNSServer `json:"dns_servers,omitempty" description:"the dns servers for this partition" optional:"true"` - NTPServers []NTPServer `json:"ntp_servers,omitempty" description:"the ntp servers for this partition" optional:"true"` + 
MgmtServiceAddress *string `json:"mgmtserviceaddress" description:"the address to the management service of this partition" optional:"true"` + PrivateNetworkPrefixLength *int `json:"privatenetworkprefixlength" description:"the length of private networks for the machine's child networks in this partition, default 22" optional:"true" minimum:"16" maximum:"30"` + PartitionWaitingPoolMinSize *string `json:"waitingpoolminsize" description:"the minimum waiting pool size of this partition"` + PartitionWaitingPoolMaxSize *string `json:"waitingpoolmaxsize" description:"the maximum waiting pool size of this partition"` + Labels map[string]string `json:"labels" description:"free labels that you associate with this partition" optional:"true"` + DNSServers []DNSServer `json:"dns_servers,omitempty" description:"the dns servers for this partition" optional:"true"` + NTPServers []NTPServer `json:"ntp_servers,omitempty" description:"the ntp servers for this partition" optional:"true"` } type PartitionBootConfiguration struct { @@ -25,11 +28,13 @@ type PartitionCreateRequest struct { type PartitionUpdateRequest struct { Common - MgmtServiceAddress *string `json:"mgmtserviceaddress" description:"the address to the management service of this partition" optional:"true"` - PartitionBootConfiguration *PartitionBootConfiguration `json:"bootconfig" description:"the boot configuration of this partition" optional:"true"` - Labels map[string]string `json:"labels" description:"free labels that you associate with this partition" optional:"true"` - DNSServers []DNSServer `json:"dns_servers" description:"the dns servers for this partition"` - NTPServers []NTPServer `json:"ntp_servers" description:"the ntp servers for this partition"` + MgmtServiceAddress *string `json:"mgmtserviceaddress" description:"the address to the management service of this partition" optional:"true"` + PartitionWaitingPoolMinSize *string `json:"waitingpoolminsize" description:"the minimum waiting pool size of this partition"` + PartitionWaitingPoolMaxSize *string `json:"waitingpoolmaxsize" description:"the maximum waiting pool size of this partition"` + PartitionBootConfiguration *PartitionBootConfiguration `json:"bootconfig" description:"the boot configuration of this partition" optional:"true"` + Labels map[string]string `json:"labels" description:"free labels that you associate with this partition" optional:"true"` + DNSServers []DNSServer `json:"dns_servers" description:"the dns servers for this partition"` + NTPServers []NTPServer `json:"ntp_servers" description:"the ntp servers for this partition"` } type PartitionResponse struct { @@ -133,9 +138,11 @@ func NewPartitionResponse(p *metal.Partition) *PartitionResponse { }, }, PartitionBase: PartitionBase{ - MgmtServiceAddress: &p.MgmtServiceAddress, - DNSServers: dnsServers, - NTPServers: ntpServers, + MgmtServiceAddress: &p.MgmtServiceAddress, + PartitionWaitingPoolMinSize: &p.WaitingPoolMinSize, + PartitionWaitingPoolMaxSize: &p.WaitingPoolMaxSize, + DNSServers: dnsServers, + NTPServers: ntpServers, }, PartitionBootConfiguration: PartitionBootConfiguration{ ImageURL: &p.BootConfiguration.ImageURL, diff --git a/spec/metal-api.json b/spec/metal-api.json index 480ac1baa..330a39552 100644 --- a/spec/metal-api.json +++ b/spec/metal-api.json @@ -143,6 +143,9 @@ "format": "int64", "type": "integer" }, + "hibernation_enabled": { + "type": "boolean" + }, "id": { "type": "string" }, @@ -235,9 +238,15 @@ }, "type": "array" }, + "not_allocated": { + "type": "boolean" + }, "partition_id": { "type": 
"string" }, + "preallocated": { + "type": "boolean" + }, "rackid": { "type": "string" }, @@ -257,6 +266,9 @@ "type": "string" }, "type": "array" + }, + "waiting": { + "type": "boolean" } } }, @@ -1268,6 +1280,9 @@ "format": "int64", "type": "integer" }, + "hibernation_enabled": { + "type": "boolean" + }, "id": { "type": "string" }, @@ -1360,9 +1375,15 @@ }, "type": "array" }, + "not_allocated": { + "type": "boolean" + }, "partition_id": { "type": "string" }, + "preallocated": { + "type": "boolean" + }, "rackid": { "type": "string" }, @@ -1382,6 +1403,9 @@ "type": "string" }, "type": "array" + }, + "waiting": { + "type": "boolean" } } }, @@ -1471,6 +1495,12 @@ }, "liveliness": { "description": "the liveliness of this machine", + "enum": [ + "Alive", + "Dead", + "Hibernated", + "Unknown" + ], "type": "string" }, "name": { @@ -2350,6 +2380,12 @@ }, "liveliness": { "description": "the liveliness of this machine", + "enum": [ + "Alive", + "Dead", + "Hibernated", + "Unknown" + ], "type": "string" }, "partition": { @@ -2507,6 +2543,9 @@ "format": "int64", "type": "integer" }, + "hibernation_enabled": { + "type": "boolean" + }, "id": { "type": "string" }, @@ -2599,9 +2638,15 @@ }, "type": "array" }, + "not_allocated": { + "type": "boolean" + }, "partition_id": { "type": "string" }, + "preallocated": { + "type": "boolean" + }, "rackid": { "type": "string" }, @@ -2621,6 +2666,9 @@ "type": "string" }, "type": "array" + }, + "waiting": { + "type": "boolean" } } }, @@ -2749,6 +2797,27 @@ "memory" ] }, + "v1.MachineHibernation": { + "properties": { + "changed": { + "description": "last changed timestamp", + "format": "date-time", + "type": "string" + }, + "description": { + "description": "describes last state change of hibernation", + "type": "string" + }, + "enabled": { + "description": "true if hibernation is enabled", + "type": "boolean" + } + }, + "required": [ + "description", + "enabled" + ] + }, "v1.MachineIPMI": { "description": "The IPMI connection data", "properties": { @@ -2852,6 +2921,12 @@ }, "liveliness": { "description": "the liveliness of this machine", + "enum": [ + "Alive", + "Dead", + "Hibernated", + "Unknown" + ], "type": "string" }, "name": { @@ -3086,6 +3161,9 @@ "format": "int64", "type": "integer" }, + "hibernation_enabled": { + "type": "boolean" + }, "id": { "type": "string" }, @@ -3183,6 +3261,9 @@ }, "type": "array" }, + "not_allocated": { + "type": "boolean" + }, "omit": { "description": "a list of machine issues to omit", "items": { @@ -3200,6 +3281,9 @@ "partition_id": { "type": "string" }, + "preallocated": { + "type": "boolean" + }, "rackid": { "type": "string" }, @@ -3223,6 +3307,9 @@ "type": "string" }, "type": "array" + }, + "waiting": { + "type": "boolean" } }, "required": [ @@ -3451,6 +3538,12 @@ }, "liveliness": { "description": "the liveliness of this machine", + "enum": [ + "Alive", + "Dead", + "Hibernated", + "Unknown" + ], "type": "string" }, "name": { @@ -3501,6 +3594,10 @@ "description": "a description why this machine is in the given state", "type": "string" }, + "hibernation": { + "$ref": "#/definitions/v1.MachineHibernation", + "description": "indicates that a machine was sent to sleep or woken up by the pool scaler" + }, "issuer": { "description": "the user that changed the state", "type": "string" @@ -3521,6 +3618,7 @@ }, "required": [ "description", + "hibernation", "metal_hammer_version", "value" ] @@ -4247,8 +4345,27 @@ "$ref": "#/definitions/v1.NTPServer" }, "type": "array" + }, + "privatenetworkprefixlength": { + "description": "the length of 
private networks for the machine's child networks in this partition, default 22", + "format": "int32", + "maximum": 30, + "minimum": 16, + "type": "integer" + }, + "waitingpoolmaxsize": { + "description": "the maximum waiting pool size of this partition", + "type": "string" + }, + "waitingpoolminsize": { + "description": "the minimum waiting pool size of this partition", + "type": "string" } - } + }, + "required": [ + "waitingpoolmaxsize", + "waitingpoolminsize" + ] }, "v1.PartitionBootConfiguration": { "description": "a partition has a distinct location in a data center, individual entities belong to a partition", @@ -4355,11 +4472,28 @@ "$ref": "#/definitions/v1.NTPServer" }, "type": "array" + }, + "privatenetworkprefixlength": { + "description": "the length of private networks for the machine's child networks in this partition, default 22", + "format": "int32", + "maximum": 30, + "minimum": 16, + "type": "integer" + }, + "waitingpoolmaxsize": { + "description": "the maximum waiting pool size of this partition", + "type": "string" + }, + "waitingpoolminsize": { + "description": "the minimum waiting pool size of this partition", + "type": "string" } }, "required": [ "bootconfig", - "id" + "id", + "waitingpoolmaxsize", + "waitingpoolminsize" ] }, "v1.PartitionResponse": { @@ -4416,11 +4550,28 @@ "$ref": "#/definitions/v1.NTPServer" }, "type": "array" + }, + "privatenetworkprefixlength": { + "description": "the length of private networks for the machine's child networks in this partition, default 22", + "format": "int32", + "maximum": 30, + "minimum": 16, + "type": "integer" + }, + "waitingpoolmaxsize": { + "description": "the maximum waiting pool size of this partition", + "type": "string" + }, + "waitingpoolminsize": { + "description": "the minimum waiting pool size of this partition", + "type": "string" } }, "required": [ "bootconfig", - "id" + "id", + "waitingpoolmaxsize", + "waitingpoolminsize" ] }, "v1.PartitionUpdateRequest": { @@ -4465,12 +4616,22 @@ "$ref": "#/definitions/v1.NTPServer" }, "type": "array" + }, + "waitingpoolmaxsize": { + "description": "the maximum waiting pool size of this partition", + "type": "string" + }, + "waitingpoolminsize": { + "description": "the minimum waiting pool size of this partition", + "type": "string" } }, "required": [ "dns_servers", "id", - "ntp_servers" + "ntp_servers", + "waitingpoolmaxsize", + "waitingpoolminsize" ] }, "v1.PowerMetric": {