Prepull for pod & Asym Jobs

2026-03-25 11:13:12 +01:00
parent 56bc342d24
commit a9284314ef
17 changed files with 754 additions and 512 deletions
--- a/logger/argo_logs.go
+++ b/logger/argo_logs.go
@@ -11,8 +11,8 @@ import (
 	"sync"
 	"time"

-	oclib "cloud.o-forge.io/core/oc-lib"
 	"cloud.o-forge.io/core/oc-lib/models/common/enum"
+	octools "cloud.o-forge.io/core/oc-lib/tools"
 	"github.com/rs/zerolog"
 	"k8s.io/apimachinery/pkg/watch"

@@ -48,7 +48,7 @@ func NewArgoLogs(name string, namespace string, stepMax int) *ArgoLogs {
 	return &ArgoLogs{
 		Name:        "oc-monitor-" + name,
 		Namespace:   namespace,
-		CreatedDate: time.Now().Format("2006-01-02 15:04:05"),
+		CreatedDate: time.Now().UTC().Format("2006-01-02 15:04:05"),
 		StepCount:   0,
 		StepMax:     stepMax,
 		stop:        false,
@@ -109,7 +109,17 @@ func NewArgoPodLog(name string, step string, msg string) ArgoPodLog {
 	}
 }

-func LogKubernetesArgo(wfName string, execID string, namespace string, watcher watch.Interface) {
+// LogKubernetesArgo watches an Argo workflow and emits NATS lifecycle events.
+// It no longer writes directly to the database — all state transitions are
+// delegated to oc-scheduler (WorkflowExecution) and oc-datacenter (Bookings)
+// via the dedicated NATS channels.
+//
+//   - wfName      : Argo workflow name (also the name of the root DAG node)
+//   - execID      : WorkflowExecution UUID  (for oc-scheduler to update state)
+//   - executionsID: run-group ID shared by all bookings of this run
+//   - namespace   : Kubernetes namespace
+//   - watcher     : Argo watch stream
+func LogKubernetesArgo(wfName string, execID string, executionsID string, namespace string, watcher watch.Interface) {
 	var argoWatcher *ArgoWatch
 	var pods []string
 	var node wfv1.NodeStatus
@@ -119,6 +129,17 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w

 	var wg sync.WaitGroup

+	// nodePhases tracks the last known phase of each step node so we can detect
+	// phase transitions and emit WORKFLOW_STEP_DONE_EVENT exactly once per step.
+	nodePhases := map[string]wfv1.NodePhase{}
+
+	// stepResults captures the final NodeStatus of every completed step so the
+	// WORKFLOW_DONE_EVENT can include a full recap (Steps slice) for oc-scheduler
+	// and oc-catalog to catch up if they missed individual STEP_DONE events.
+	stepResults := map[string]wfv1.NodeStatus{}
+
+	workflowStartedEmitted := false
+
 	for event := range watcher.ResultChan() {
 		wf, ok := event.Object.(*wfv1.Workflow)
 		if !ok {
@@ -126,10 +147,22 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w
 			continue
 		}
 		if len(wf.Status.Nodes) == 0 {
-			wfl.Info().Msg("No node status yet") // The first output of the channel doesn't contain Nodes so we skip it
+			wfl.Info().Msg("No node status yet")
 			continue
 		}

+		// ── Emit WORKFLOW_STARTED_EVENT once ────────────────────────────────
+		if !workflowStartedEmitted {
+			realStart := wf.Status.StartedAt.Time
+			emitLifecycleEvent(octools.WORKFLOW_STARTED_EVENT, octools.WorkflowLifecycleEvent{
+				ExecutionID:  execID,
+				ExecutionsID: executionsID,
+				State:        enum.STARTED.EnumIndex(),
+				RealStart:    &realStart,
+			})
+			workflowStartedEmitted = true
+		}
+
 		conditions := retrieveCondition(wf)

 		// Retrieving the Status for the main node, which is named after the workflow
@@ -138,9 +171,9 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w
 			wfl.Fatal().Msg("Could not find the " + wfName + " node in \n" + string(bytified))
 		}

-		now := time.Now()
+		now := time.Now().UTC()
 		start, _ := time.Parse(time.RFC3339, node.StartedAt.String())
-		duration := now.Sub(start)
+		duration := now.Sub(start.UTC())

 		newWatcher := ArgoWatch{
 			Name:       node.Name,
@@ -163,15 +196,57 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w
 			argoWatcher = &newWatcher
 		}

-		// I don't think we need to use WaitGroup here, because the loop itself
-		// acts as blocking process for the main thread, because Argo watch never closes the channel
+		// ── Per-step completion detection ────────────────────────────────────
+		for _, stepNode := range wf.Status.Nodes {
+			if stepNode.Name == wfName {
+				continue // skip the main DAG node
+			}
+			prev := nodePhases[stepNode.Name]
+			nodePhases[stepNode.Name] = stepNode.Phase
+
+			if prev == stepNode.Phase {
+				continue // no change
+			}
+			if !stepNode.Phase.Completed() && !stepNode.Phase.FailedOrError() {
+				continue // not terminal yet
+			}
+			if prev.Completed() || prev.FailedOrError() {
+				continue // already processed
+			}
+
+			bookingID := extractBookingID(stepNode.Name)
+			if bookingID == "" {
+				continue
+			}
+
+			stepState := enum.SUCCESS
+			if stepNode.Phase.FailedOrError() {
+				stepState = enum.FAILURE
+			}
+			realStart := stepNode.StartedAt.Time
+			realEnd := stepNode.FinishedAt.Time
+			if realEnd.IsZero() {
+				realEnd = time.Now().UTC()
+			}
+			emitLifecycleEvent(octools.WORKFLOW_STEP_DONE_EVENT, octools.WorkflowLifecycleEvent{
+				ExecutionID:  execID,
+				ExecutionsID: executionsID,
+				BookingID:    bookingID,
+				State:        stepState.EnumIndex(),
+				RealStart:    &realStart,
+				RealEnd:      &realEnd,
+			})
+			// Store for the final recap emitted with WORKFLOW_DONE_EVENT.
+			stepResults[bookingID] = stepNode
+		}
+
+		// ── Pod log streaming ────────────────────────────────────────────────
 		for _, pod := range wf.Status.Nodes {
+			if pod.Type != wfv1.NodeTypePod {
+				continue
+			}
 			if !slices.Contains(pods, pod.Name) {
 				pl := wfl.With().Str("pod", pod.Name).Logger()
-				if wfName == pod.Name {
-					pods = append(pods, pod.Name)
-					continue
-				} // One of the node is the Workflow, the others are the pods so don't try to log on the wf name
 				pl.Info().Msg("Found a new pod to log : " + pod.Name)
 				wg.Add(1)
 				go logKubernetesPods(namespace, wfName, pod.Name, pl, &wg)
@@ -179,27 +254,93 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w
 			}
 		}

-		// Stop listening to the chan when the Workflow is completed or something bad happened
-		if node.Phase.Completed() {
-			wfl.Info().Msg(wfName + " worflow completed")
+		// ── Workflow terminal phase ──────────────────────────────────────────
+		if node.Phase.Completed() || node.Phase.FailedOrError() {
+			if node.Phase.Completed() {
+				wfl.Info().Msg(wfName + " workflow completed")
+			} else {
+				wfl.Error().Msg(wfName + " has failed, please refer to the logs")
+				wfl.Error().Msg(node.Message)
+			}
 			wg.Wait()
 			wfl.Info().Msg(wfName + " exiting")
-			oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
-				"state": enum.SUCCESS.EnumIndex(),
-			}, execID)
-			break
-		}
-		if node.Phase.FailedOrError() {
-			wfl.Error().Msg(wfName + "has failed, please refer to the logs")
-			oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
-				"state": enum.FAILURE.EnumIndex(),
-			}, execID)
-			wfl.Error().Msg(node.Message)
+
+			finalState := enum.SUCCESS
+			if node.Phase.FailedOrError() {
+				finalState = enum.FAILURE
+			}
+			realStart := node.StartedAt.Time
+			realEnd := node.FinishedAt.Time
+			if realEnd.IsZero() {
+				realEnd = time.Now().UTC()
+			}
+
+			// Build recap from all observed step results.
+			steps := make([]octools.StepMetric, 0, len(stepResults))
+			for bookingID, s := range stepResults {
+				stepState := enum.SUCCESS
+				if s.Phase.FailedOrError() {
+					stepState = enum.FAILURE
+				}
+				start := s.StartedAt.Time
+				end := s.FinishedAt.Time
+				if end.IsZero() {
+					end = realEnd
+				}
+				steps = append(steps, octools.StepMetric{
+					BookingID: bookingID,
+					State:     stepState.EnumIndex(),
+					RealStart: &start,
+					RealEnd:   &end,
+				})
+			}
+
+			emitLifecycleEvent(octools.WORKFLOW_DONE_EVENT, octools.WorkflowLifecycleEvent{
+				ExecutionID:  execID,
+				ExecutionsID: executionsID,
+				State:        finalState.EnumIndex(),
+				RealStart:    &realStart,
+				RealEnd:      &realEnd,
+				Steps:        steps,
+			})
 			break
 		}
 	}
 }

+// emitLifecycleEvent publishes a WorkflowLifecycleEvent on the given NATS channel.
+func emitLifecycleEvent(method octools.NATSMethod, evt octools.WorkflowLifecycleEvent) {
+	payload, err := json.Marshal(evt)
+	if err != nil {
+		return
+	}
+	octools.NewNATSCaller().SetNATSPub(method, octools.NATSResponse{
+		FromApp: "oc-monitord",
+		Method:  int(method),
+		Payload: payload,
+	})
+}
+
+// extractBookingID extracts the bookingID (UUID, 36 chars) from an Argo node
+// display name. Argo step nodes are named "{wfName}.{taskName}" where taskName
+// is "{resource-name}-{bookingID}" as generated by getArgoName in argo_builder.
+func extractBookingID(nodeName string) string {
+	parts := strings.SplitN(nodeName, ".", 2)
+	if len(parts) < 2 {
+		return ""
+	}
+	taskName := parts[1]
+	if len(taskName) < 36 {
+		return ""
+	}
+	candidate := taskName[len(taskName)-36:]
+	// Validate UUID shape: 8-4-4-4-12 with dashes at positions 8,13,18,23.
+	if candidate[8] == '-' && candidate[13] == '-' && candidate[18] == '-' && candidate[23] == '-' {
+		return candidate
+	}
+	return ""
+}
+
 func retrieveCondition(wf *wfv1.Workflow) (c Conditions) {
 	for _, cond := range wf.Status.Conditions {
 		if cond.Type == "PodRunning" {
@@ -209,12 +350,10 @@ func retrieveCondition(wf *wfv1.Workflow) (c Conditions) {
 			c.Completed = cond.Status == "True"
 		}
 	}
-
 	return
-
 }

-// Function needed to be executed as a go thread
+// logKubernetesPods streams pod logs to the structured logger.
 func logKubernetesPods(executionId string, wfName string, podName string, logger zerolog.Logger, wg *sync.WaitGroup) {
 	defer wg.Done()

@@ -241,5 +380,4 @@ func logKubernetesPods(executionId string, wfName string, podName string, logger
 		jsonified, _ := json.Marshal(podLog)
 		logger.Info().Msg(string(jsonified))
 	}
-
 }