WatchDog Kube
This commit is contained in:
331
infrastructure/infra_watchdog.go
Normal file
331
infrastructure/infra_watchdog.go
Normal file
@@ -0,0 +1,331 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"oc-datacenter/conf"
|
||||
"oc-datacenter/infrastructure/minio"
|
||||
"oc-datacenter/infrastructure/storage"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
bookingmodel "cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
)
|
||||
|
||||
// uuidNsPattern matches Kubernetes namespace names that are execution UUIDs
// (canonical lowercase hex form: 8-4-4-4-12 groups, 36 characters total).
var uuidNsPattern = regexp.MustCompile(`^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
|
||||
|
||||
// WatchInfra is a safety-net watchdog that periodically scans Kubernetes for
|
||||
// execution namespaces whose WorkflowExecution has reached a terminal state
|
||||
// but whose infra was never torn down (e.g. because WORKFLOW_DONE_EVENT was
|
||||
// missed due to oc-monitord or oc-datacenter crash/restart).
|
||||
//
|
||||
// Must be launched in a goroutine from main.
|
||||
func WatchInfra() {
|
||||
logger := oclib.GetLogger()
|
||||
logger.Info().Msg("InfraWatchdog: started")
|
||||
ticker := time.NewTicker(5 * time.Minute)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
if err := scanOrphanedInfra(); err != nil {
|
||||
logger.Error().Msg("InfraWatchdog: " + err.Error())
|
||||
}
|
||||
if err := scanOrphanedMinio(); err != nil {
|
||||
logger.Error().Msg("InfraWatchdog(minio): " + err.Error())
|
||||
}
|
||||
if err := scanOrphanedAdmiraltyNodes(); err != nil {
|
||||
logger.Error().Msg("InfraWatchdog(admiralty-nodes): " + err.Error())
|
||||
}
|
||||
if err := scanOrphanedPVC(); err != nil {
|
||||
logger.Error().Msg("InfraWatchdog(pvc): " + err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// scanOrphanedInfra lists all UUID-named Kubernetes namespaces, looks up their
|
||||
// WorkflowExecution in the DB, and triggers teardown for any that are in a
|
||||
// terminal state. Namespaces already in Terminating phase are skipped.
|
||||
func scanOrphanedInfra() error {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
serv, err := tools.NewKubernetesService(
|
||||
conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort,
|
||||
conf.GetConfig().KubeCA,
|
||||
conf.GetConfig().KubeCert,
|
||||
conf.GetConfig().KubeData,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to init k8s service: %w", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
nsList, err := serv.Set.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to list namespaces: %w", err)
|
||||
}
|
||||
|
||||
myself, err := oclib.GetMySelf()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not resolve local peer: %w", err)
|
||||
}
|
||||
peerID := myself.GetID()
|
||||
|
||||
for _, ns := range nsList.Items {
|
||||
executionsID := ns.Name
|
||||
if !uuidNsPattern.MatchString(executionsID) {
|
||||
continue
|
||||
}
|
||||
// Skip namespaces already being deleted by a previous teardown.
|
||||
if ns.Status.Phase == v1.NamespaceTerminating {
|
||||
continue
|
||||
}
|
||||
|
||||
exec := findTerminalExecution(executionsID, peerID)
|
||||
if exec == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
logger.Info().Msgf("InfraWatchdog: orphaned infra detected for execution %s (state=%v) → teardown",
|
||||
executionsID, exec.State)
|
||||
go teardownInfraForExecution(exec.GetID(), executionsID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// scanOrphanedMinio scans LIVE_STORAGE bookings for executions that are in a
|
||||
// terminal state and triggers Minio teardown for each unique executionsID found.
|
||||
// This covers the case where the Kubernetes namespace is already gone (manual
|
||||
// deletion, prior partial teardown) but Minio SA and bucket were never revoked.
|
||||
func scanOrphanedMinio() error {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
myself, err := oclib.GetMySelf()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not resolve local peer: %w", err)
|
||||
}
|
||||
peerID := myself.GetID()
|
||||
|
||||
res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", peerID, []string{}, nil).
|
||||
Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"resource_type": {{Operator: dbs.EQUAL.String(), Value: tools.LIVE_STORAGE.EnumIndex()}},
|
||||
},
|
||||
}, "", false)
|
||||
|
||||
if res.Err != "" {
|
||||
return fmt.Errorf("failed to search LIVE_STORAGE bookings: %s", res.Err)
|
||||
}
|
||||
|
||||
// Collect unique executionsIDs to avoid redundant teardowns.
|
||||
seen := map[string]bool{}
|
||||
ctx := context.Background()
|
||||
|
||||
for _, dbo := range res.Data {
|
||||
b, ok := dbo.(*bookingmodel.Booking)
|
||||
if !ok || seen[b.ExecutionsID] {
|
||||
continue
|
||||
}
|
||||
|
||||
exec := findTerminalExecution(b.ExecutionsID, peerID)
|
||||
if exec == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
seen[b.ExecutionsID] = true
|
||||
|
||||
// Determine this peer's role and call the appropriate teardown.
|
||||
if b.DestPeerID == peerID {
|
||||
logger.Info().Msgf("InfraWatchdog(minio): orphaned target resources for exec %s → TeardownAsTarget", b.ExecutionsID)
|
||||
event := minio.MinioDeleteEvent{
|
||||
ExecutionsID: b.ExecutionsID,
|
||||
MinioID: b.ResourceID,
|
||||
SourcePeerID: b.DestPeerID,
|
||||
DestPeerID: peerID,
|
||||
}
|
||||
go minio.NewMinioSetter(b.ExecutionsID, b.ResourceID).TeardownAsTarget(ctx, event)
|
||||
} else {
|
||||
logger.Info().Msgf("InfraWatchdog(minio): orphaned source resources for exec %s → TeardownAsSource", b.ExecutionsID)
|
||||
event := minio.MinioDeleteEvent{
|
||||
ExecutionsID: b.ExecutionsID,
|
||||
MinioID: b.ResourceID,
|
||||
SourcePeerID: peerID,
|
||||
DestPeerID: b.DestPeerID,
|
||||
}
|
||||
go minio.NewMinioSetter(b.ExecutionsID, b.ResourceID).TeardownAsSource(ctx, event)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// scanOrphanedAdmiraltyNodes lists all Kubernetes nodes, identifies Admiralty
|
||||
// virtual nodes (name prefix "admiralty-{UUID}-") that are NotReady, and
|
||||
// explicitly deletes them when their WorkflowExecution is in a terminal state.
|
||||
//
|
||||
// This covers the gap where the namespace is already gone (or Terminating) but
|
||||
// the virtual node was never cleaned up by the Admiralty controller — which can
|
||||
// happen when the node goes NotReady before the AdmiraltyTarget CRD is deleted.
|
||||
func scanOrphanedAdmiraltyNodes() error {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
serv, err := tools.NewKubernetesService(
|
||||
conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort,
|
||||
conf.GetConfig().KubeCA,
|
||||
conf.GetConfig().KubeCert,
|
||||
conf.GetConfig().KubeData,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to init k8s service: %w", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
nodeList, err := serv.Set.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to list nodes: %w", err)
|
||||
}
|
||||
|
||||
myself, err := oclib.GetMySelf()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not resolve local peer: %w", err)
|
||||
}
|
||||
peerID := myself.GetID()
|
||||
|
||||
for _, node := range nodeList.Items {
|
||||
// Admiralty virtual nodes are named: admiralty-{executionID}-target-{...}
|
||||
rest := strings.TrimPrefix(node.Name, "admiralty-")
|
||||
if rest == node.Name {
|
||||
continue // not an admiralty node
|
||||
}
|
||||
// UUID is exactly 36 chars: 8-4-4-4-12
|
||||
if len(rest) < 36 {
|
||||
continue
|
||||
}
|
||||
executionsID := rest[:36]
|
||||
if !uuidNsPattern.MatchString(executionsID) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Only act on NotReady nodes.
|
||||
ready := false
|
||||
for _, cond := range node.Status.Conditions {
|
||||
if cond.Type == v1.NodeReady {
|
||||
ready = cond.Status == v1.ConditionTrue
|
||||
break
|
||||
}
|
||||
}
|
||||
if ready {
|
||||
continue
|
||||
}
|
||||
|
||||
exec := findTerminalExecution(executionsID, peerID)
|
||||
if exec == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
logger.Info().Msgf("InfraWatchdog(admiralty-nodes): NotReady orphaned node %s for terminal execution %s → deleting",
|
||||
node.Name, executionsID)
|
||||
if delErr := serv.Set.CoreV1().Nodes().Delete(ctx, node.Name, metav1.DeleteOptions{}); delErr != nil {
|
||||
logger.Error().Msgf("InfraWatchdog(admiralty-nodes): failed to delete node %s: %v", node.Name, delErr)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// scanOrphanedPVC scans LIVE_STORAGE bookings for executions that are in a
|
||||
// terminal state and triggers PVC teardown for each one where this peer holds
|
||||
// the local storage. This covers the case where the Kubernetes namespace was
|
||||
// already deleted (or its teardown was partial) but the PersistentVolume
|
||||
// (cluster-scoped) was never reclaimed.
|
||||
//
|
||||
// A LIVE_STORAGE booking is treated as a local PVC only when ResolveStorageName
|
||||
// returns a non-empty name — the same guard used by teardownPVCForExecution.
|
||||
func scanOrphanedPVC() error {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
myself, err := oclib.GetMySelf()
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not resolve local peer: %w", err)
|
||||
}
|
||||
peerID := myself.GetID()
|
||||
|
||||
res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", peerID, []string{}, nil).
|
||||
Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"resource_type": {{Operator: dbs.EQUAL.String(), Value: tools.LIVE_STORAGE.EnumIndex()}},
|
||||
},
|
||||
}, "", false)
|
||||
|
||||
if res.Err != "" {
|
||||
return fmt.Errorf("failed to search LIVE_STORAGE bookings: %s", res.Err)
|
||||
}
|
||||
|
||||
seen := map[string]bool{}
|
||||
ctx := context.Background()
|
||||
|
||||
for _, dbo := range res.Data {
|
||||
b, ok := dbo.(*bookingmodel.Booking)
|
||||
if !ok || seen[b.ExecutionsID+b.ResourceID] {
|
||||
continue
|
||||
}
|
||||
|
||||
storageName := storage.ResolveStorageName(b.ResourceID, peerID)
|
||||
if storageName == "" {
|
||||
continue // not a local PVC booking
|
||||
}
|
||||
|
||||
exec := findTerminalExecution(b.ExecutionsID, peerID)
|
||||
if exec == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
seen[b.ExecutionsID+b.ResourceID] = true
|
||||
|
||||
logger.Info().Msgf("InfraWatchdog(pvc): orphaned PVC for exec %s storage %s → TeardownAsSource",
|
||||
b.ExecutionsID, b.ResourceID)
|
||||
event := storage.PVCDeleteEvent{
|
||||
ExecutionsID: b.ExecutionsID,
|
||||
StorageID: b.ResourceID,
|
||||
StorageName: storageName,
|
||||
SourcePeerID: peerID,
|
||||
DestPeerID: b.DestPeerID,
|
||||
}
|
||||
go storage.NewPVCSetter(b.ExecutionsID, b.ResourceID).TeardownAsSource(ctx, event)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// findTerminalExecution returns the WorkflowExecution for the given executionsID
|
||||
// if it exists in the DB and is in a terminal state, otherwise nil.
|
||||
func findTerminalExecution(executionsID string, peerID string) *workflow_execution.WorkflowExecution {
|
||||
res := oclib.NewRequest(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), "", peerID, []string{}, nil).
|
||||
Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"executions_id": {{Operator: dbs.EQUAL.String(), Value: executionsID}},
|
||||
},
|
||||
}, "", false)
|
||||
|
||||
if res.Err != "" || len(res.Data) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
exec, ok := res.Data[0].(*workflow_execution.WorkflowExecution)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
if !closingStates[exec.State] {
|
||||
return nil
|
||||
}
|
||||
return exec
|
||||
}
|
||||
Reference in New Issue
Block a user