Datacenter no longer handles bookings directly; it is now fully in charge of Kubernetes & Minio allocation, driven per NATS.
This commit is contained in:
244
infrastructure/booking_watchdog.go
Normal file
244
infrastructure/booking_watchdog.go
Normal file
@@ -0,0 +1,244 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"oc-datacenter/infrastructure/minio"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
bookingmodel "cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"go.mongodb.org/mongo-driver/bson/primitive"
|
||||
)
|
||||
|
||||
// processedBookings tracks booking IDs whose start-expiry has already been handled.
// Resets on restart; teardown methods are idempotent so duplicate runs are safe.
var processedBookings sync.Map

// processedEndBookings tracks executions IDs whose end-expiry (Admiralty target
// cleanup, see teardownAdmiraltyTarget) has already been triggered in this
// process lifetime. Keyed by ExecutionsID, not booking ID.
var processedEndBookings sync.Map

// closingStates is the set of terminal booking states after which infra must be torn down.
// Bookings already in one of these states skip the state transition and go
// straight to infrastructure teardown.
var closingStates = map[enum.BookingStatus]bool{
	enum.FAILURE:   true,
	enum.SUCCESS:   true,
	enum.FORGOTTEN: true,
	enum.CANCELLED: true,
}
|
||||
|
||||
// WatchBookings starts a passive loop that ticks every minute, scans bookings whose
|
||||
// ExpectedStartDate + 1 min has passed, transitions them to terminal states when needed,
|
||||
// and tears down the associated Kubernetes / Minio infrastructure.
|
||||
// Must be launched in a goroutine from main.
|
||||
func WatchBookings() {
|
||||
logger := oclib.GetLogger()
|
||||
logger.Info().Msg("BookingWatchdog: started")
|
||||
ticker := time.NewTicker(time.Minute)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
if err := scanExpiredBookings(); err != nil {
|
||||
logger.Error().Msg("BookingWatchdog: " + err.Error())
|
||||
}
|
||||
if err := scanEndedExec(); err != nil {
|
||||
logger.Error().Msg("BookingWatchdog: " + err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// scanExpiredBookings queries all bookings whose start deadline has passed and
|
||||
// dispatches each one to processExpiredBooking.
|
||||
func scanExpiredBookings() error {
|
||||
myself, err := oclib.GetMySelf()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not resolve local peer: %w", err)
|
||||
}
|
||||
peerID := myself.GetID()
|
||||
|
||||
deadline := time.Now().Add(-time.Minute)
|
||||
res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", peerID, []string{}, nil).
|
||||
Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"expected_start_date": {{
|
||||
Operator: dbs.LTE.String(),
|
||||
Value: primitive.NewDateTimeFromTime(deadline),
|
||||
}},
|
||||
},
|
||||
}, "", false)
|
||||
|
||||
if res.Err != "" {
|
||||
return fmt.Errorf("booking search failed: %s", res.Err)
|
||||
}
|
||||
|
||||
for _, dbo := range res.Data {
|
||||
b, ok := dbo.(*bookingmodel.Booking)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
go processExpiredBooking(b, peerID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// processExpiredBooking transitions the booking to a terminal state when applicable,
// then tears down infrastructure based on the resource type:
//   - LIVE_DATACENTER / COMPUTE_RESOURCE → Admiralty TeardownAsSource + Minio (as target)
//   - LIVE_STORAGE / STORAGE_RESOURCE   → Minio (as source)
//
// Runs in its own goroutine (spawned by scanExpiredBookings); processedBookings
// guarantees teardown fires at most once per booking per process lifetime.
func processExpiredBooking(b *bookingmodel.Booking, peerID string) {
	logger := oclib.GetLogger()
	ctx := context.Background()

	// Skip bookings already handled during this process lifetime.
	if _, done := processedBookings.Load(b.GetID()); done {
		return
	}

	// Transition non-terminal bookings; bookings already in a closing state
	// fall straight through to the teardown switch below.
	if !closingStates[b.State] {
		var newState enum.BookingStatus
		switch b.State {
		case enum.DRAFT, enum.DELAYED:
			// DRAFT: never launched; DELAYED: was SCHEDULED but start never arrived.
			newState = enum.FORGOTTEN
		case enum.SCHEDULED:
			// Passed its start date without ever being launched.
			newState = enum.FAILURE
		case enum.STARTED:
			// A running booking is never auto-closed by the watchdog.
			return
		default:
			// Any other non-terminal state is left untouched; it will be
			// re-examined on the next tick.
			return
		}

		upd := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", peerID, []string{}, nil).
			UpdateOne(map[string]any{"state": newState.EnumIndex()}, b.GetID())
		if upd.Err != "" {
			// Not marked as processed, so the transition is retried next tick.
			logger.Error().Msgf("BookingWatchdog: failed to update booking %s: %s", b.GetID(), upd.Err)
			return
		}
		b.State = newState
		logger.Info().Msgf("BookingWatchdog: booking %s (exec=%s, type=%s) → %s",
			b.GetID(), b.ExecutionsID, b.ResourceType, b.State)
	}

	// Mark as handled before triggering async teardown (avoids double-trigger on next tick).
	processedBookings.Store(b.GetID(), struct{}{})

	// Tear down infrastructure according to resource type.
	switch b.ResourceType {
	case tools.LIVE_DATACENTER, tools.COMPUTE_RESOURCE:
		logger.Info().Msgf("BookingWatchdog: tearing down compute infra exec=%s", b.ExecutionsID)
		go NewAdmiraltySetter(b.ExecutionsID).TeardownAsSource(ctx) // i'm the compute units.
		go teardownMinioForComputeBooking(ctx, b, peerID)

	case tools.LIVE_STORAGE, tools.STORAGE_RESOURCE:
		logger.Info().Msgf("BookingWatchdog: tearing down storage infra exec=%s", b.ExecutionsID)
		go teardownMinioSourceBooking(ctx, b, peerID)
	}
}
|
||||
|
||||
// scanEndedExec queries workflow executions whose numeric state index is
// greater than 2 (presumably an ended/terminal state — confirm against the
// workflow_execution state enum) and triggers Admiralty target cleanup for
// each one via teardownAdmiraltyTarget.
func scanEndedExec() error {
	myself, err := oclib.GetMySelf()
	if err != nil {
		return fmt.Errorf("could not resolve local peer: %w", err)
	}
	peerID := myself.GetID()
	res := oclib.NewRequest(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), "", peerID, []string{}, nil).
		Search(&dbs.Filters{
			And: map[string][]dbs.Filter{
				// NOTE(review): magic number — state > 2 looks like "past
				// running"; verify against the execution state enum.
				"state": {{
					Operator: dbs.GT.String(),
					Value:    2,
				}},
			},
		}, "", false)

	if res.Err != "" {
		return fmt.Errorf("ended-booking search failed: %s", res.Err)
	}

	for _, dbo := range res.Data {
		b, ok := dbo.(*workflow_execution.WorkflowExecution)
		if !ok {
			continue
		}
		go teardownAdmiraltyTarget(b)
	}
	return nil
}
|
||||
|
||||
// teardownAdmiraltySource triggers TeardownAsSource for the compute-side namespace
|
||||
// of an execution whose expected end date has passed.
|
||||
func teardownAdmiraltyTarget(b *workflow_execution.WorkflowExecution) {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
// Each executionsID is processed at most once per process lifetime.
|
||||
if _, done := processedEndBookings.Load(b.ExecutionsID); done {
|
||||
return
|
||||
}
|
||||
processedEndBookings.Store(b.ExecutionsID, struct{}{})
|
||||
|
||||
logger.Info().Msgf("BookingWatchdog: tearing down Admiralty source exec=%s (booking=%s)",
|
||||
b.ExecutionsID, b.GetID())
|
||||
if p, err := oclib.GetMySelf(); err == nil {
|
||||
NewAdmiraltySetter(b.ExecutionsID).TeardownAsTarget(context.Background(), p.GetID())
|
||||
}
|
||||
}
|
||||
|
||||
// teardownMinioForComputeBooking finds the LIVE_STORAGE bookings belonging to the same
|
||||
// execution and triggers Minio-as-target teardown for each (K8s secret + configmap).
|
||||
// The Minio-as-source side is handled separately by the storage booking's own watchdog pass.
|
||||
func teardownMinioForComputeBooking(ctx context.Context, computeBooking *bookingmodel.Booking, localPeerID string) {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", localPeerID, []string{}, nil).
|
||||
Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"executions_id": {{Operator: dbs.EQUAL.String(), Value: computeBooking.ExecutionsID}},
|
||||
"resource_type": {{Operator: dbs.EQUAL.String(), Value: tools.LIVE_STORAGE.EnumIndex()}},
|
||||
},
|
||||
}, "", false)
|
||||
|
||||
if res.Err != "" || len(res.Data) == 0 {
|
||||
logger.Warn().Msgf("BookingWatchdog: no storage booking found for exec=%s", computeBooking.ExecutionsID)
|
||||
return
|
||||
}
|
||||
|
||||
for _, dbo := range res.Data {
|
||||
sb, ok := dbo.(*bookingmodel.Booking)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
event := minio.MinioDeleteEvent{
|
||||
ExecutionsID: computeBooking.ExecutionsID,
|
||||
MinioID: sb.ResourceID,
|
||||
SourcePeerID: sb.DestPeerID, // peer hosting Minio
|
||||
DestPeerID: localPeerID, // this peer (compute/target)
|
||||
OriginID: "",
|
||||
}
|
||||
minio.NewMinioSetter(computeBooking.ExecutionsID, sb.ResourceID).TeardownAsTarget(ctx, event)
|
||||
}
|
||||
}
|
||||
|
||||
// teardownMinioSourceBooking triggers Minio-as-source teardown for a storage booking:
|
||||
// revokes the scoped service account and removes the execution bucket on this Minio host.
|
||||
func teardownMinioSourceBooking(ctx context.Context, b *bookingmodel.Booking, localPeerID string) {
|
||||
event := minio.MinioDeleteEvent{
|
||||
ExecutionsID: b.ExecutionsID,
|
||||
MinioID: b.ResourceID,
|
||||
SourcePeerID: localPeerID, // this peer IS the Minio host
|
||||
DestPeerID: b.DestPeerID,
|
||||
OriginID: "",
|
||||
}
|
||||
minio.NewMinioSetter(b.ExecutionsID, b.ResourceID).TeardownAsSource(ctx, event)
|
||||
}
|
||||
Reference in New Issue
Block a user