oc-schedulerd/daemons/execution_manager.go

package daemons

import (
	"fmt"
	"strings"
	"time"

	"oc-schedulerd/conf"

	oclib "cloud.o-forge.io/core/oc-lib"
	"cloud.o-forge.io/core/oc-lib/models/common/enum"
	"cloud.o-forge.io/core/oc-lib/models/resources"
	wf "cloud.o-forge.io/core/oc-lib/models/workflow"
	workflow_execution "cloud.o-forge.io/core/oc-lib/models/workflow_execution"
	"cloud.o-forge.io/core/oc-lib/tools"
	"github.com/rs/zerolog"
)

// Executions is the package-level collection of workflow executions waiting to
// be launched. Execs maps a string key (treated as the execution ID by the
// polling loop) to its WorkflowExecution; RetrieveNextExecutions holds
// Executions.Mu while it scans the map and removes the entries it launches.
var Executions = ScheduledExecution{Execs: map[string]workflow_execution.WorkflowExecution{}}

// ExecutionManager groups the methods that poll the pending executions,
// validate their workflow and launch them. It carries no state of its own.
type ExecutionManager struct{}

// RetrieveNextExecutions polls the shared Executions list once per second and
// launches every execution whose ExecDate falls within the preparation lead
// window (conf PrepLeadSeconds before the scheduled start).
//
// On each tick, while holding Executions.Mu, pending executions are grouped by
// priority and visited from the highest priority (7) down to the lowest (0).
// Each due execution is marked STARTED, handed to executeExecution in its own
// goroutine and removed from the pending list.
//
// The loop never terminates, so this method never returns.
func (em *ExecutionManager) RetrieveNextExecutions() {
	logger := oclib.GetLogger()
	// Highest priority first. The values (not the indexes) of this slice are
	// the priorities to visit.
	priorities := []int{7, 6, 5, 4, 3, 2, 1, 0}
	for {
		Executions.Mu.Lock()
		if len(Executions.Execs) > 0 {
			executions := Executions.Execs
			orderedExec := map[int]map[string]workflow_execution.WorkflowExecution{}
			for execID, exec := range executions {
				if orderedExec[exec.Priority] == nil {
					orderedExec[exec.Priority] = map[string]workflow_execution.WorkflowExecution{}
				}
				orderedExec[exec.Priority][execID] = exec
			}

			lead := time.Duration(conf.GetConfig().PrepLeadSeconds) * time.Second
			for _, priority := range priorities {
				bucket := orderedExec[priority]
				if bucket == nil {
					continue
				}
				fmt.Println("Next exec", priority)
				for execID, exec := range bucket {
					// Per-iteration copy: the goroutine below keeps a pointer
					// to it, and before Go 1.22 the range variable is shared
					// by every iteration of the loop.
					exec := exec
					// Fire PrepLeadSeconds before the scheduled start so oc-monitord
					// has time to pre-pull images and set up infra before ExecDate.
					due := exec.ExecDate.Before(time.Now().UTC().Add(lead))
					fmt.Println("ExecDate Before", due)
					if !due {
						continue
					}
					logger.Info().Msg(fmt.Sprintf("Launching prep for %s (scheduled %s, lead %s)",
						execID, exec.ExecDate.Format(time.RFC3339), lead))
					// Mark as STARTED immediately (before goroutine) so the next
					// SchedulePolling cycle doesn't re-pick this execution from DB.
					emitExecStateUpdate(exec.GetID(), enum.STARTED)
					go em.executeExecution(&exec)
					// bucket is a separate map, so deleting from the pending
					// list while ranging over bucket is safe.
					delete(executions, execID)
				}
			}
		}
		Executions.Mu.Unlock()
		time.Sleep(time.Second)
	}
}

// validateWorkflowIntegrity loads the workflow referenced by the execution and
// runs its integrity checks before any resource is booked or any pod is
// started. This is the sovereign enforcement layer: oc-front may be bypassed
// via direct API calls, so oc-schedulerd re-validates independently.
//
// Two layers of validation are applied, in this order:
//  1. Structural integrity, as reported by Workflow.ValidateIntegrity; only
//     violations flagged as errors are blocking.
//  2. Exploitation Authorization (AE): coupling and peer-usage constraints
//     attached to the resources the workflow references. Violations are
//     reported through resources.EmitAEBehaviorReport against the consumer
//     peer.
//
// It returns true when the execution may proceed. When the workflow cannot be
// loaded (or is not a *wf.Workflow) it also returns true and leaves the error
// to the downstream code path. On a validation failure it logs every
// violation, emits the FAILURE state for the execution and returns false.
func (em *ExecutionManager) validateWorkflowIntegrity(execution *workflow_execution.WorkflowExecution, logger zerolog.Logger) bool {
	loaded := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW), nil).LoadOne(execution.WorkflowID)
	if loaded.Err != "" || loaded.Data == nil {
		// Can't load the workflow: let the existing error path handle it downstream.
		return true
	}
	workflow, isWorkflow := loaded.Data.(*wf.Workflow)
	if !isWorkflow {
		return true
	}

	// ── 1. Structural integrity ───────────────────────────────────────────────
	// Keep only the violations that are errors, already rendered for the log.
	var blocking []string
	for _, violation := range workflow.ValidateIntegrity() {
		if !violation.IsError() {
			continue
		}
		blocking = append(blocking, fmt.Sprintf("[%s] %s", violation.Type, violation.Message))
	}
	if len(blocking) > 0 {
		logger.Error().Msgf(
			"workflow '%s' (exec %s) rejected — %d integrity violation(s):\n  %s",
			execution.WorkflowID, execution.GetID(), len(blocking), strings.Join(blocking, "\n  "),
		)
		emitExecStateUpdate(execution.GetID(), enum.FAILURE)
		return false
	}

	// ── 2. Exploitation Authorization (AE) ───────────────────────────────────
	// Resource IDs referenced by the workflow, grouped by resource type.
	resourcesByType := map[tools.DataType][]string{
		tools.DATA_RESOURCE:       workflow.Datas,
		tools.PROCESSING_RESOURCE: workflow.Processings,
		tools.STORAGE_RESOURCE:    workflow.Storages,
		tools.COMPUTE_RESOURCE:    workflow.Computes,
		tools.WORKFLOW_RESOURCE:   workflow.Workflows,
		tools.SERVICE_RESOURCE:    workflow.Services,
	}

	// Flat set of every referenced ID, used for coupling membership checks.
	idSet := map[string]struct{}{}
	for _, typedIDs := range resourcesByType {
		for _, resourceID := range typedIDs {
			idSet[resourceID] = struct{}{}
		}
	}

	// The consumer is the local peer, since it is the one executing the
	// workflow. The ID stays empty when the local peer cannot be resolved.
	var consumerPeerID string
	me, err := oclib.GetMySelf()
	if err == nil && me != nil {
		consumerPeerID = me.GetID()
	}

	aeViolations := checkWorkflowAE(execution.WorkflowID, consumerPeerID, resourcesByType, idSet)
	if len(aeViolations) == 0 {
		return true
	}

	details := make([]string, len(aeViolations))
	for i, violation := range aeViolations {
		details[i] = fmt.Sprintf("[%s] %s", violation.Type, violation.Message)
	}
	logger.Error().Msgf(
		"workflow '%s' (exec %s) rejected — %d AE violation(s):\n  %s",
		execution.WorkflowID, execution.GetID(), len(aeViolations), strings.Join(details, "\n  "),
	)
	resources.EmitAEBehaviorReport(consumerPeerID, aeViolations)
	emitExecStateUpdate(execution.GetID(), enum.FAILURE)
	return false
}

// checkWorkflowAE loads every resource listed in resourcesByType and evaluates
// the exploitation authorizations it exposes against the execution context
// (workflow ID, consumer peer, set of all referenced resource IDs, current UTC
// time). It returns the concatenation of all violations reported by CheckAE.
//
// Resources that cannot be loaded, or whose loaded value does not expose
// GetExploitationAuthorizations, are skipped without reporting a violation.
//
// Kept in oc-schedulerd (not oc-lib/models/resources) to avoid a circular
// import: resources → oclib → models → resources.
func checkWorkflowAE(
	workflowID string,
	consumerPeerID string,
	resourcesByType map[tools.DataType][]string,
	idSet map[string]struct{},
) []resources.AEViolation {
	// aeHolder is satisfied by any loaded resource that carries authorizations.
	type aeHolder interface {
		GetExploitationAuthorizations() []resources.ExploitationAuthorization
	}

	// A single timestamp is shared by every check of this run.
	checkedAt := time.Now().UTC()
	var found []resources.AEViolation

	for dataType, resourceIDs := range resourcesByType {
		for _, resourceID := range resourceIDs {
			loaded := oclib.NewRequestAdmin(oclib.LibDataEnum(dataType), nil).LoadOne(resourceID)
			if !(loaded.Err == "" && loaded.Data != nil) {
				continue
			}
			if holder, ok := loaded.Data.(aeHolder); ok {
				for _, authorization := range holder.GetExploitationAuthorizations() {
					found = append(found, authorization.CheckAE(resourceID, workflowID, consumerPeerID, idSet, checkedAt)...)
				}
			}
		}
	}
	return found
}

// executeExecution validates the workflow behind execution and, when it is
// accepted, creates the monitor executor matching the configured mode and
// launches it.
//
// Steps:
//  1. validateWorkflowIntegrity is run first; a rejected execution stops here
//     (that function already emitted the FAILURE state).
//  2. The duration passed to the executor is EndDate - ExecDate in whole
//     seconds, or 0 when the execution has no EndDate.
//  3. A container monitor is used when conf Mode is "kubernetes", a local
//     monitor otherwise.
//  4. If no executor could be created the execution is marked FAILURE and the
//     method returns; otherwise the monitor is prepared and launched in the
//     configured Kubernetes namespace.
func (em *ExecutionManager) executeExecution(execution *workflow_execution.WorkflowExecution) {
	logger := oclib.GetLogger()

	// Sovereign integrity check — reject before touching any resource.
	if !em.validateWorkflowIntegrity(execution, logger) {
		return
	}

	duration := 0
	if execution.EndDate != nil {
		duration = int(execution.EndDate.Sub(execution.ExecDate).Seconds())
	}

	var executor Executor
	if conf.GetConfig().Mode == "kubernetes" {
		executor = NewContainerMonitor(execution.UUID, execution.CreatorID, duration, execution.ExecDate)
	} else {
		executor = NewLocalMonitor(execution.UUID, execution.CreatorID, duration, execution.ExecDate)
	}

	if executor == nil {
		// Error, not Fatal: zerolog's Fatal exits the process from Msg, which
		// would kill the whole scheduler and make the FAILURE update below
		// unreachable.
		logger.Error().Msg("Could not create executor")
		emitExecStateUpdate(execution.GetID(), enum.FAILURE)
		return
	}

	args := executor.PrepareMonitorExec()
	executor.LaunchMonitor(args, execution.GetID(), conf.GetConfig().KubeNamespace, logger)
}