Prepull for pod & Asym Jobs
This commit is contained in:
@@ -11,8 +11,8 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
octools "cloud.o-forge.io/core/oc-lib/tools"
|
||||
"github.com/rs/zerolog"
|
||||
"k8s.io/apimachinery/pkg/watch"
|
||||
|
||||
@@ -48,7 +48,7 @@ func NewArgoLogs(name string, namespace string, stepMax int) *ArgoLogs {
|
||||
return &ArgoLogs{
|
||||
Name: "oc-monitor-" + name,
|
||||
Namespace: namespace,
|
||||
CreatedDate: time.Now().Format("2006-01-02 15:04:05"),
|
||||
CreatedDate: time.Now().UTC().Format("2006-01-02 15:04:05"),
|
||||
StepCount: 0,
|
||||
StepMax: stepMax,
|
||||
stop: false,
|
||||
@@ -109,7 +109,17 @@ func NewArgoPodLog(name string, step string, msg string) ArgoPodLog {
|
||||
}
|
||||
}
|
||||
|
||||
func LogKubernetesArgo(wfName string, execID string, namespace string, watcher watch.Interface) {
|
||||
// LogKubernetesArgo watches an Argo workflow and emits NATS lifecycle events.
|
||||
// It no longer writes directly to the database — all state transitions are
|
||||
// delegated to oc-scheduler (WorkflowExecution) and oc-datacenter (Bookings)
|
||||
// via the dedicated NATS channels.
|
||||
//
|
||||
// - wfName : Argo workflow name (also the name of the root DAG node)
|
||||
// - execID : WorkflowExecution UUID (for oc-scheduler to update state)
|
||||
// - executionsID: run-group ID shared by all bookings of this run
|
||||
// - namespace : Kubernetes namespace
|
||||
// - watcher : Argo watch stream
|
||||
func LogKubernetesArgo(wfName string, execID string, executionsID string, namespace string, watcher watch.Interface) {
|
||||
var argoWatcher *ArgoWatch
|
||||
var pods []string
|
||||
var node wfv1.NodeStatus
|
||||
@@ -119,6 +129,17 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// nodePhases tracks the last known phase of each step node so we can detect
|
||||
// phase transitions and emit WORKFLOW_STEP_DONE_EVENT exactly once per step.
|
||||
nodePhases := map[string]wfv1.NodePhase{}
|
||||
|
||||
// stepResults captures the final NodeStatus of every completed step so the
|
||||
// WORKFLOW_DONE_EVENT can include a full recap (Steps slice) for oc-scheduler
|
||||
// and oc-catalog to catch up if they missed individual STEP_DONE events.
|
||||
stepResults := map[string]wfv1.NodeStatus{}
|
||||
|
||||
workflowStartedEmitted := false
|
||||
|
||||
for event := range watcher.ResultChan() {
|
||||
wf, ok := event.Object.(*wfv1.Workflow)
|
||||
if !ok {
|
||||
@@ -126,10 +147,22 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w
|
||||
continue
|
||||
}
|
||||
if len(wf.Status.Nodes) == 0 {
|
||||
wfl.Info().Msg("No node status yet") // The first output of the channel doesn't contain Nodes so we skip it
|
||||
wfl.Info().Msg("No node status yet")
|
||||
continue
|
||||
}
|
||||
|
||||
// ── Emit WORKFLOW_STARTED_EVENT once ────────────────────────────────
|
||||
if !workflowStartedEmitted {
|
||||
realStart := wf.Status.StartedAt.Time
|
||||
emitLifecycleEvent(octools.WORKFLOW_STARTED_EVENT, octools.WorkflowLifecycleEvent{
|
||||
ExecutionID: execID,
|
||||
ExecutionsID: executionsID,
|
||||
State: enum.STARTED.EnumIndex(),
|
||||
RealStart: &realStart,
|
||||
})
|
||||
workflowStartedEmitted = true
|
||||
}
|
||||
|
||||
conditions := retrieveCondition(wf)
|
||||
|
||||
// Retrieving the Status for the main node, which is named after the workflow
|
||||
@@ -138,9 +171,9 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w
|
||||
wfl.Fatal().Msg("Could not find the " + wfName + " node in \n" + string(bytified))
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
now := time.Now().UTC()
|
||||
start, _ := time.Parse(time.RFC3339, node.StartedAt.String())
|
||||
duration := now.Sub(start)
|
||||
duration := now.Sub(start.UTC())
|
||||
|
||||
newWatcher := ArgoWatch{
|
||||
Name: node.Name,
|
||||
@@ -163,15 +196,57 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w
|
||||
argoWatcher = &newWatcher
|
||||
}
|
||||
|
||||
// I don't think we need to use WaitGroup here, because the loop itself
|
||||
// acts as blocking process for the main thread, because Argo watch never closes the channel
|
||||
// ── Per-step completion detection ────────────────────────────────────
|
||||
for _, stepNode := range wf.Status.Nodes {
|
||||
if stepNode.Name == wfName {
|
||||
continue // skip the main DAG node
|
||||
}
|
||||
prev := nodePhases[stepNode.Name]
|
||||
nodePhases[stepNode.Name] = stepNode.Phase
|
||||
|
||||
if prev == stepNode.Phase {
|
||||
continue // no change
|
||||
}
|
||||
if !stepNode.Phase.Completed() && !stepNode.Phase.FailedOrError() {
|
||||
continue // not terminal yet
|
||||
}
|
||||
if prev.Completed() || prev.FailedOrError() {
|
||||
continue // already processed
|
||||
}
|
||||
|
||||
bookingID := extractBookingID(stepNode.Name)
|
||||
if bookingID == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
stepState := enum.SUCCESS
|
||||
if stepNode.Phase.FailedOrError() {
|
||||
stepState = enum.FAILURE
|
||||
}
|
||||
realStart := stepNode.StartedAt.Time
|
||||
realEnd := stepNode.FinishedAt.Time
|
||||
if realEnd.IsZero() {
|
||||
realEnd = time.Now().UTC()
|
||||
}
|
||||
emitLifecycleEvent(octools.WORKFLOW_STEP_DONE_EVENT, octools.WorkflowLifecycleEvent{
|
||||
ExecutionID: execID,
|
||||
ExecutionsID: executionsID,
|
||||
BookingID: bookingID,
|
||||
State: stepState.EnumIndex(),
|
||||
RealStart: &realStart,
|
||||
RealEnd: &realEnd,
|
||||
})
|
||||
// Store for the final recap emitted with WORKFLOW_DONE_EVENT.
|
||||
stepResults[bookingID] = stepNode
|
||||
}
|
||||
|
||||
// ── Pod log streaming ────────────────────────────────────────────────
|
||||
for _, pod := range wf.Status.Nodes {
|
||||
if pod.Type != wfv1.NodeTypePod {
|
||||
continue
|
||||
}
|
||||
if !slices.Contains(pods, pod.Name) {
|
||||
pl := wfl.With().Str("pod", pod.Name).Logger()
|
||||
if wfName == pod.Name {
|
||||
pods = append(pods, pod.Name)
|
||||
continue
|
||||
} // One of the node is the Workflow, the others are the pods so don't try to log on the wf name
|
||||
pl.Info().Msg("Found a new pod to log : " + pod.Name)
|
||||
wg.Add(1)
|
||||
go logKubernetesPods(namespace, wfName, pod.Name, pl, &wg)
|
||||
@@ -179,27 +254,93 @@ func LogKubernetesArgo(wfName string, execID string, namespace string, watcher w
|
||||
}
|
||||
}
|
||||
|
||||
// Stop listening to the chan when the Workflow is completed or something bad happened
|
||||
if node.Phase.Completed() {
|
||||
wfl.Info().Msg(wfName + " worflow completed")
|
||||
// ── Workflow terminal phase ──────────────────────────────────────────
|
||||
if node.Phase.Completed() || node.Phase.FailedOrError() {
|
||||
if node.Phase.Completed() {
|
||||
wfl.Info().Msg(wfName + " workflow completed")
|
||||
} else {
|
||||
wfl.Error().Msg(wfName + " has failed, please refer to the logs")
|
||||
wfl.Error().Msg(node.Message)
|
||||
}
|
||||
wg.Wait()
|
||||
wfl.Info().Msg(wfName + " exiting")
|
||||
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
|
||||
"state": enum.SUCCESS.EnumIndex(),
|
||||
}, execID)
|
||||
break
|
||||
}
|
||||
if node.Phase.FailedOrError() {
|
||||
wfl.Error().Msg(wfName + "has failed, please refer to the logs")
|
||||
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
|
||||
"state": enum.FAILURE.EnumIndex(),
|
||||
}, execID)
|
||||
wfl.Error().Msg(node.Message)
|
||||
|
||||
finalState := enum.SUCCESS
|
||||
if node.Phase.FailedOrError() {
|
||||
finalState = enum.FAILURE
|
||||
}
|
||||
realStart := node.StartedAt.Time
|
||||
realEnd := node.FinishedAt.Time
|
||||
if realEnd.IsZero() {
|
||||
realEnd = time.Now().UTC()
|
||||
}
|
||||
|
||||
// Build recap from all observed step results.
|
||||
steps := make([]octools.StepMetric, 0, len(stepResults))
|
||||
for bookingID, s := range stepResults {
|
||||
stepState := enum.SUCCESS
|
||||
if s.Phase.FailedOrError() {
|
||||
stepState = enum.FAILURE
|
||||
}
|
||||
start := s.StartedAt.Time
|
||||
end := s.FinishedAt.Time
|
||||
if end.IsZero() {
|
||||
end = realEnd
|
||||
}
|
||||
steps = append(steps, octools.StepMetric{
|
||||
BookingID: bookingID,
|
||||
State: stepState.EnumIndex(),
|
||||
RealStart: &start,
|
||||
RealEnd: &end,
|
||||
})
|
||||
}
|
||||
|
||||
emitLifecycleEvent(octools.WORKFLOW_DONE_EVENT, octools.WorkflowLifecycleEvent{
|
||||
ExecutionID: execID,
|
||||
ExecutionsID: executionsID,
|
||||
State: finalState.EnumIndex(),
|
||||
RealStart: &realStart,
|
||||
RealEnd: &realEnd,
|
||||
Steps: steps,
|
||||
})
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// emitLifecycleEvent publishes a WorkflowLifecycleEvent on the given NATS channel.
|
||||
func emitLifecycleEvent(method octools.NATSMethod, evt octools.WorkflowLifecycleEvent) {
|
||||
payload, err := json.Marshal(evt)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
octools.NewNATSCaller().SetNATSPub(method, octools.NATSResponse{
|
||||
FromApp: "oc-monitord",
|
||||
Method: int(method),
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
|
||||
// extractBookingID extracts the bookingID (UUID, 36 chars) from an Argo node
|
||||
// display name. Argo step nodes are named "{wfName}.{taskName}" where taskName
|
||||
// is "{resource-name}-{bookingID}" as generated by getArgoName in argo_builder.
|
||||
func extractBookingID(nodeName string) string {
|
||||
parts := strings.SplitN(nodeName, ".", 2)
|
||||
if len(parts) < 2 {
|
||||
return ""
|
||||
}
|
||||
taskName := parts[1]
|
||||
if len(taskName) < 36 {
|
||||
return ""
|
||||
}
|
||||
candidate := taskName[len(taskName)-36:]
|
||||
// Validate UUID shape: 8-4-4-4-12 with dashes at positions 8,13,18,23.
|
||||
if candidate[8] == '-' && candidate[13] == '-' && candidate[18] == '-' && candidate[23] == '-' {
|
||||
return candidate
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func retrieveCondition(wf *wfv1.Workflow) (c Conditions) {
|
||||
for _, cond := range wf.Status.Conditions {
|
||||
if cond.Type == "PodRunning" {
|
||||
@@ -209,12 +350,10 @@ func retrieveCondition(wf *wfv1.Workflow) (c Conditions) {
|
||||
c.Completed = cond.Status == "True"
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
|
||||
}
|
||||
|
||||
// Function needed to be executed as a go thread
|
||||
// logKubernetesPods streams pod logs to the structured logger.
|
||||
func logKubernetesPods(executionId string, wfName string, podName string, logger zerolog.Logger, wg *sync.WaitGroup) {
|
||||
defer wg.Done()
|
||||
|
||||
@@ -241,5 +380,4 @@ func logKubernetesPods(executionId string, wfName string, podName string, logger
|
||||
jsonified, _ := json.Marshal(podLog)
|
||||
logger.Info().Msg(string(jsonified))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user