This commit is contained in:
mr
2026-03-26 11:14:29 +01:00
parent c1609ea9d9
commit a8fa18520c
16 changed files with 730 additions and 261 deletions

View File

@@ -5,6 +5,7 @@ import (
"encoding/base64"
"fmt"
"oc-schedulerd/conf"
"time" // already used for ContainerMonitor.watchJob
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
@@ -27,15 +28,16 @@ type ContainerMonitor struct {
KubeImage string
}
func NewContainerMonitor(UUID string, peerId string, duration int) Executor {
func NewContainerMonitor(UUID string, peerId string, duration int, scheduledTime time.Time) Executor {
return &ContainerMonitor{
Monitor: LocalMonitor{
ExecutionID: UUID,
PeerID: peerId,
Duration: duration,
LokiUrl: oclib.GetConfig().LokiUrl,
MongoUrl: oclib.GetConfig().MongoUrl,
DBName: oclib.GetConfig().MongoDatabase,
ExecutionID: UUID,
PeerID: peerId,
Duration: duration,
LokiUrl: oclib.GetConfig().LokiUrl,
MongoUrl: oclib.GetConfig().MongoUrl,
DBName: oclib.GetConfig().MongoDatabase,
ScheduledTime: scheduledTime,
},
KubeCA: conf.GetConfig().KubeCA,
KubeCert: conf.GetConfig().KubeCert,
@@ -48,13 +50,9 @@ func NewContainerMonitor(UUID string, peerId string, duration int) Executor {
}
func (cm *ContainerMonitor) PrepareMonitorExec() []string {
args := []string{
"-e", cm.Monitor.ExecutionID,
"-p", cm.Monitor.PeerID,
"-u", cm.Monitor.LokiUrl,
"-m", cm.Monitor.MongoUrl,
"-d", cm.Monitor.DBName,
"-M", "kubernetes",
"-H", cm.KubeHost,
"-P", cm.KubePort,
@@ -77,7 +75,7 @@ func (cm *ContainerMonitor) failExec(execID string, l zerolog.Logger, msg string
}, execID)
}
func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolog.Logger) {
func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, ns string, l zerolog.Logger) {
ca, err := base64.StdEncoding.DecodeString(cm.KubeCA)
if err != nil {
@@ -104,6 +102,8 @@ func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolo
},
}
fmt.Println(ca, cert, key)
clientset, err := kubernetes.NewForConfig(cfg)
if err != nil {
cm.failExec(execID, l, "Failed to build Kubernetes client: "+err.Error())
@@ -111,21 +111,33 @@ func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolo
}
backoffLimit := int32(0)
l.Info().Str("mongo_url", oclib.GetConfig().MongoUrl).Msg("Env vars for job")
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: "oc-monitord-" + execID,
Namespace: cm.KubeNamespace,
Namespace: ns,
},
Spec: batchv1.JobSpec{
BackoffLimit: &backoffLimit,
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
"k8s.v1.cni.cncf.io/networks": "docker-oc-network",
},
},
Spec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyNever,
Containers: []corev1.Container{
{
Name: "oc-monitord",
Image: cm.KubeImage,
Args: args,
Name: "oc-monitord",
Image: cm.KubeImage,
Args: args,
Env: []corev1.EnvVar{
{Name: "OC_MONGO_URL", Value: oclib.GetConfig().MongoUrl},
{Name: "OC_MONGO_DATABASE", Value: oclib.GetConfig().MongoDatabase},
{Name: "OC_LOKI_URL", Value: oclib.GetConfig().LokiUrl},
{Name: "OC_NATS_URL", Value: oclib.GetConfig().NATSUrl},
},
},
},
},
@@ -133,11 +145,81 @@ func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolo
},
}
_, err = clientset.BatchV1().Jobs(cm.KubeNamespace).Create(context.Background(), job, metav1.CreateOptions{})
_, err = clientset.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
if err != nil {
fmt.Println("Failed to create Kubernetes Job: ", err)
cm.failExec(execID, l, "Failed to create Kubernetes Job: "+err.Error())
return
}
l.Info().Msg("Started Kubernetes Job oc-monitord-" + execID)
l.Info().Str("job", "oc-monitord-"+execID).Msg("Kubernetes Job created")
go cm.watchJob(clientset, execID, ns, l)
}
// watchJob follows the lifecycle of the Kubernetes Job created by
// LaunchMonitor: it locates the pod spawned by the job, waits for the pod
// to start, streams its logs through logExecution, and finally records the
// job outcome (marking the execution failed via failExec when the job did
// not succeed).
// NOTE(review): this runs as a goroutine with no context/cancellation; with
// Follow set on the log request it can block for as long as the pod runs —
// confirm that is acceptable for the daemon's lifetime.
func (cm *ContainerMonitor) watchJob(clientset *kubernetes.Clientset, execID string, ns string, l zerolog.Logger) {
jobName := "oc-monitord-" + execID
// Attach the job name to every subsequent log line from this watcher.
l = l.With().Str("job", jobName).Logger()
// Poll until the pod spawned by the job appears (up to 60s)
podName := ""
for i := 0; i < 60; i++ {
// Pods created by a Job carry the "job-name" label; select on it.
pods, err := clientset.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{
LabelSelector: "job-name=" + jobName,
})
if err != nil {
l.Error().Err(err).Msg("Failed to list pods for job")
return
}
if len(pods.Items) > 0 {
// Take the first matching pod; with BackoffLimit=0 there is at most one.
podName = pods.Items[0].Name
break
}
time.Sleep(time.Second)
}
if podName == "" {
l.Error().Msg("No pod found for job after 60s")
return
}
l.Info().Str("pod", podName).Msg("Pod found for job")
// Wait for the pod to be Running or terminal (up to 120s)
for i := 0; i < 120; i++ {
pod, err := clientset.CoreV1().Pods(ns).Get(context.Background(), podName, metav1.GetOptions{})
if err != nil {
l.Error().Err(err).Str("pod", podName).Msg("Failed to get pod status")
return
}
phase := pod.Status.Phase
if phase == corev1.PodRunning || phase == corev1.PodSucceeded || phase == corev1.PodFailed {
l.Info().Str("pod", podName).Str("phase", string(phase)).Msg("Pod phase")
break
}
time.Sleep(time.Second)
}
// NOTE(review): if the pod is still Pending after 120s we fall through and
// attempt to stream logs anyway; the GetLogs call below will then fail and
// only be logged — confirm this best-effort behavior is intended.
// Stream pod logs
req := clientset.CoreV1().Pods(ns).GetLogs(podName, &corev1.PodLogOptions{Follow: true})
stream, err := req.Stream(context.Background())
if err != nil {
l.Error().Err(err).Str("pod", podName).Msg("Failed to stream pod logs")
} else {
defer stream.Close()
l.Info().Str("pod", podName).Msg("Streaming pod logs")
// Follow is set, so this blocks until the pod's log stream ends.
logExecution(stream, l)
}
// Log final job status
job, err := clientset.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
if err != nil {
l.Error().Err(err).Msg("Failed to get final job status")
return
}
if job.Status.Succeeded > 0 {
l.Info().Msg("Job succeeded")
} else {
// NOTE(review): a job that is neither succeeded nor failed yet
// (Succeeded==0, Failed==0) is also reported as a failure here and the
// execution is marked FAILURE — confirm this is the intended semantics.
msg := fmt.Sprintf("Job failed with %d failed pod(s)", job.Status.Failed)
cm.failExec(execID, l, msg)
}
}

View File

@@ -3,7 +3,9 @@ package daemons
import (
"fmt"
"oc-schedulerd/conf"
"os"
"os/exec"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
@@ -11,22 +13,21 @@ import (
)
type LocalMonitor struct {
ExecutionID string
PeerID string
Duration int
LokiUrl string
MongoUrl string
DBName string
ExecutionID string
PeerID string
Duration int
LokiUrl string
MongoUrl string
DBName string
ScheduledTime time.Time
}
func NewLocalMonitor(UUID string, peerId string, duration int) Executor {
func NewLocalMonitor(UUID string, peerId string, duration int, scheduledTime time.Time) Executor {
return &LocalMonitor{
ExecutionID: UUID,
PeerID: peerId,
Duration: duration,
LokiUrl: oclib.GetConfig().LokiUrl,
MongoUrl: oclib.GetConfig().MongoUrl,
DBName: oclib.GetConfig().MongoDatabase,
ExecutionID: UUID,
PeerID: peerId,
Duration: duration,
ScheduledTime: scheduledTime,
}
}
@@ -42,11 +43,12 @@ func (lm *LocalMonitor) PrepareMonitorExec() []string {
args := []string{
"-e", lm.ExecutionID,
"-p", lm.PeerID,
"-u", lm.LokiUrl,
"-m", lm.MongoUrl,
"-d", lm.DBName,
"-H", conf.GetConfig().KubeHost,
"-c", conf.GetConfig().KubeCA,
"-C", conf.GetConfig().KubeCert,
"-D", conf.GetConfig().KubeData,
"-s", fmt.Sprintf("%d", lm.ScheduledTime.Unix()),
}
if lm.Duration > 0 {
args = append(args, "-t", fmt.Sprintf("%d", lm.Duration))
}
@@ -54,13 +56,29 @@ func (lm *LocalMonitor) PrepareMonitorExec() []string {
return args
}
func (lm *LocalMonitor) LaunchMonitor(args []string, execID string, l zerolog.Logger) {
func (lm *LocalMonitor) LaunchMonitor(args []string, execID string, ns string, l zerolog.Logger) {
cmd := exec.Command(conf.GetConfig().MonitorPath, args...)
fmt.Printf("Command : %v\n", cmd)
cmd.Env = append(os.Environ(),
"OC_MONGO_URL="+oclib.GetConfig().MongoUrl,
"OC_MONGO_DATABASE="+oclib.GetConfig().MongoDatabase,
"OC_LOKI_URL="+oclib.GetConfig().LokiUrl,
"OC_NATS_URL="+oclib.GetConfig().NATSUrl,
)
fmt.Println("LaunchMonitor LOCAL")
l.Info().Str("binary", conf.GetConfig().MonitorPath).Strs("args", args).Msg("Starting oc-monitord")
stdoutMonitord, err := cmd.StdoutPipe()
if err != nil {
l.Error().Msg("Could not retrieve stdoutpipe for execution of oc-monitord" + err.Error())
l.Error().Err(err).Msg("Could not retrieve stdout pipe for oc-monitord")
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
"state": enum.FAILURE.EnumIndex(),
}, execID)
return
}
stderrMonitord, err := cmd.StderrPipe()
if err != nil {
l.Error().Err(err).Msg("Could not retrieve stderr pipe for oc-monitord")
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
"state": enum.FAILURE.EnumIndex(),
}, execID)
@@ -69,11 +87,14 @@ func (lm *LocalMonitor) LaunchMonitor(args []string, execID string, l zerolog.Lo
err = cmd.Start()
if err != nil {
l.Error().Msg("Could not start oc-monitor for " + lm.ExecutionID + " : " + err.Error())
l.Error().Err(err).Str("execution", lm.ExecutionID).Msg("Could not start oc-monitord")
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
"state": enum.FAILURE.EnumIndex(),
}, execID)
return
}
l.Info().Int("pid", cmd.Process.Pid).Msg("oc-monitord started")
go logExecution(stderrMonitord, l)
logExecution(stdoutMonitord, l)
}

View File

@@ -1,8 +1,11 @@
package daemons
import (
"fmt"
"time"
"oc-schedulerd/conf"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
@@ -34,12 +37,21 @@ func (em *ExecutionManager) RetrieveNextExecutions() {
continue
}
lead := time.Duration(conf.GetConfig().PrepLeadSeconds) * time.Second
for execId, exec := range orderedExec[i] {
if i == 0 && em.isAStartingExecutionBeforeEnd(&exec) { // BEST EFFORT exception
continue
}
if exec.ExecDate.Before(time.Now().UTC()) {
logger.Info().Msg("Will execute " + execId + " soon")
// Fire PrepLeadSeconds before the scheduled start so oc-monitord
// has time to pre-pull images and set up infra before ExecDate.
if exec.ExecDate.Before(time.Now().UTC().Add(lead)) {
logger.Info().Msg(fmt.Sprintf("Launching prep for %s (scheduled %s, lead %s)",
execId, exec.ExecDate.Format(time.RFC3339), lead))
// Mark as STARTED immediately (before goroutine) so the next
// SchedulePolling cycle doesn't re-pick this execution from DB.
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
"state": enum.STARTED.EnumIndex(),
}, exec.GetID())
go em.executeExecution(&exec)
delete(executions, execId)
}
@@ -78,7 +90,11 @@ func (em *ExecutionManager) executeExecution(execution *workflow_execution.Workf
duration = int(execution.EndDate.Sub(execution.ExecDate).Seconds())
}
executor = NewContainerMonitor(execution.UUID, execution.CreatorID, duration)
if conf.GetConfig().Mode == "kubernetes" {
executor = NewContainerMonitor(execution.UUID, execution.CreatorID, duration, execution.ExecDate)
} else {
executor = NewLocalMonitor(execution.UUID, execution.CreatorID, duration, execution.ExecDate)
}
if executor == nil {
logger.Fatal().Msg("Could not create executor")
@@ -89,5 +105,5 @@ func (em *ExecutionManager) executeExecution(execution *workflow_execution.Workf
}
args := executor.PrepareMonitorExec()
executor.LaunchMonitor(args, execution.GetID(), logger)
executor.LaunchMonitor(args, execution.GetID(), conf.GetConfig().KubeNamespace, logger)
}

View File

@@ -2,6 +2,7 @@ package daemons
import (
"bufio"
"fmt"
"io"
"github.com/rs/zerolog"
@@ -9,13 +10,14 @@ import (
type Executor interface {
PrepareMonitorExec() []string
LaunchMonitor(args []string, execID string, l zerolog.Logger)
LaunchMonitor(args []string, execID string, ns string, l zerolog.Logger)
}
// logExecution drains reader line by line, echoing each line to stdout and
// to the supplied logger at debug level. It returns when the stream is
// exhausted or closed. The caller remains responsible for closing reader.
func logExecution(reader io.ReadCloser, l zerolog.Logger) {
	scanner := bufio.NewScanner(reader)
	for scanner.Scan() {
		output := scanner.Text()
		fmt.Println(output)
		l.Debug().Msg(output)
	}
	// Scan returns false on read errors as well as on EOF; surface the
	// former so a broken stream is not silently mistaken for a clean end.
	if err := scanner.Err(); err != nil {
		l.Error().Err(err).Msg("Error while reading execution output")
	}
}

View File

@@ -6,6 +6,8 @@ import (
"sync"
"time"
"oc-schedulerd/conf"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
@@ -122,14 +124,14 @@ func (s *ScheduleManager) ExecuteWorkflow(resp tools.NATSResponse) {
func (s *ScheduleManager) GetNextScheduledWorkflows(_ tools.NATSResponse) {
start := time.Now().UTC()
fmt.Println(s.getExecution(
start.Add(time.Second*time.Duration(-1)).UTC(),
start.Add(time.Minute*time.Duration(1)).UTC(),
))
if next_wf_exec, err := s.getExecution(
start.Add(time.Second*time.Duration(-1)).UTC(),
start.Add(time.Minute*time.Duration(1)).UTC(),
); err != nil {
// Fetch executions whose scheduled start falls within the next
// (PrepLeadSeconds + 60s) window, so they are loaded in time to
// trigger oc-monitord PrepLeadSeconds before the actual start.
horizon := time.Duration(conf.GetConfig().PrepLeadSeconds+60) * time.Second
from := start.Add(-time.Second)
to := start.Add(horizon)
fmt.Println(s.getExecution(from, to))
if next_wf_exec, err := s.getExecution(from, to); err != nil {
s.Logger.Error().Msg("Could not retrieve next schedules")
} else {
Executions.AddSchedules(next_wf_exec, s.Logger)