mr
2026-03-26 11:14:29 +01:00
parent c1609ea9d9
commit a8fa18520c
16 changed files with 730 additions and 261 deletions


@@ -5,6 +5,7 @@ import (
"encoding/base64"
"fmt"
"oc-schedulerd/conf"
"time" // already used for ContainerMonitor.watchJob
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
@@ -27,15 +28,16 @@ type ContainerMonitor struct {
KubeImage string
}
func NewContainerMonitor(UUID string, peerId string, duration int) Executor {
func NewContainerMonitor(UUID string, peerId string, duration int, scheduledTime time.Time) Executor {
return &ContainerMonitor{
Monitor: LocalMonitor{
ExecutionID: UUID,
PeerID: peerId,
Duration: duration,
LokiUrl: oclib.GetConfig().LokiUrl,
MongoUrl: oclib.GetConfig().MongoUrl,
DBName: oclib.GetConfig().MongoDatabase,
ExecutionID: UUID,
PeerID: peerId,
Duration: duration,
LokiUrl: oclib.GetConfig().LokiUrl,
MongoUrl: oclib.GetConfig().MongoUrl,
DBName: oclib.GetConfig().MongoDatabase,
ScheduledTime: scheduledTime,
},
KubeCA: conf.GetConfig().KubeCA,
KubeCert: conf.GetConfig().KubeCert,
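A hypothetical call site for the widened constructor; apart from NewContainerMonitor, PrepareMonitorExec and LaunchMonitor themselves, every identifier below is an assumption made for illustration, not part of this commit:

// Assumed: the Executor interface exposes PrepareMonitorExec and LaunchMonitor
// with the signatures introduced in this commit, and executionID, peerID,
// durationSeconds and logger exist in the calling code.
start := time.Now().Add(5 * time.Minute) // planned start of the booked execution (assumption)
exec := NewContainerMonitor(executionID, peerID, durationSeconds, start)
exec.LaunchMonitor(exec.PrepareMonitorExec(), executionID, "oc-executions", logger)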
@@ -48,13 +50,9 @@ func NewContainerMonitor(UUID string, peerId string, duration int) Executor {
}
func (cm *ContainerMonitor) PrepareMonitorExec() []string {
args := []string{
"-e", cm.Monitor.ExecutionID,
"-p", cm.Monitor.PeerID,
"-u", cm.Monitor.LokiUrl,
"-m", cm.Monitor.MongoUrl,
"-d", cm.Monitor.DBName,
"-M", "kubernetes",
"-H", cm.KubeHost,
"-P", cm.KubePort,
@@ -77,7 +75,7 @@ func (cm *ContainerMonitor) failExec(execID string, l zerolog.Logger, msg string
}, execID)
}
func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolog.Logger) {
func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, ns string, l zerolog.Logger) {
ca, err := base64.StdEncoding.DecodeString(cm.KubeCA)
if err != nil {
@@ -104,6 +102,8 @@ func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolo
},
}
fmt.Println(ca, cert, key)
clientset, err := kubernetes.NewForConfig(cfg)
if err != nil {
cm.failExec(execID, l, "Failed to build Kubernetes client: "+err.Error())
@@ -111,21 +111,33 @@ func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolo
}
backoffLimit := int32(0)
l.Info().Str("mongo_url", oclib.GetConfig().MongoUrl).Msg("Env vars for job")
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: "oc-monitord-" + execID,
Namespace: cm.KubeNamespace,
Namespace: ns,
},
Spec: batchv1.JobSpec{
BackoffLimit: &backoffLimit,
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
"k8s.v1.cni.cncf.io/networks": "docker-oc-network",
},
},
Spec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyNever,
Containers: []corev1.Container{
{
Name: "oc-monitord",
Image: cm.KubeImage,
Args: args,
Name: "oc-monitord",
Image: cm.KubeImage,
Args: args,
Env: []corev1.EnvVar{
{Name: "OC_MONGO_URL", Value: oclib.GetConfig().MongoUrl},
{Name: "OC_MONGO_DATABASE", Value: oclib.GetConfig().MongoDatabase},
{Name: "OC_LOKI_URL", Value: oclib.GetConfig().LokiUrl},
{Name: "OC_NATS_URL", Value: oclib.GetConfig().NATSUrl},
},
},
},
},
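On the consumer side, oc-monitord is assumed to read these variables in place of the flags removed from PrepareMonitorExec above; a minimal, illustrative reader (nothing beyond the OC_* keys is taken from this commit):

// Illustrative only: how the monitor container could pick up its settings
// from the environment (requires the "os" import).
for _, key := range []string{"OC_MONGO_URL", "OC_MONGO_DATABASE", "OC_LOKI_URL", "OC_NATS_URL"} {
	if value, ok := os.LookupEnv(key); ok {
		fmt.Println(key, "=", value)
	}
}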
@@ -133,11 +145,81 @@ func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolo
},
}
_, err = clientset.BatchV1().Jobs(cm.KubeNamespace).Create(context.Background(), job, metav1.CreateOptions{})
_, err = clientset.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
if err != nil {
fmt.Println("Failed to create Kubernetes Job: ", err)
cm.failExec(execID, l, "Failed to create Kubernetes Job: "+err.Error())
return
}
l.Info().Msg("Started Kubernetes Job oc-monitord-" + execID)
l.Info().Str("job", "oc-monitord-"+execID).Msg("Kubernetes Job created")
go cm.watchJob(clientset, execID, ns, l)
}
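The target namespace is now supplied by the caller instead of being read from cm.KubeNamespace inside LaunchMonitor; a hypothetical call-site resolution, where the per-execution namespace and the fallback are assumptions rather than part of this commit:

// Hypothetical: resolve the namespace per execution, falling back to the
// statically configured one.
ns := executionNamespace // assumed: derived from the execution or peer
if ns == "" {
	ns = cm.KubeNamespace
}
cm.LaunchMonitor(args, execID, ns, l)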
func (cm *ContainerMonitor) watchJob(clientset *kubernetes.Clientset, execID string, ns string, l zerolog.Logger) {
jobName := "oc-monitord-" + execID
l = l.With().Str("job", jobName).Logger()
// Poll until the pod spawned by the job appears (up to 60s)
podName := ""
for i := 0; i < 60; i++ {
pods, err := clientset.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{
LabelSelector: "job-name=" + jobName,
})
if err != nil {
l.Error().Err(err).Msg("Failed to list pods for job")
return
}
if len(pods.Items) > 0 {
podName = pods.Items[0].Name
break
}
time.Sleep(time.Second)
}
if podName == "" {
l.Error().Msg("No pod found for job after 60s")
return
}
l.Info().Str("pod", podName).Msg("Pod found for job")
// Wait for the pod to be Running or terminal (up to 120s)
for i := 0; i < 120; i++ {
pod, err := clientset.CoreV1().Pods(ns).Get(context.Background(), podName, metav1.GetOptions{})
if err != nil {
l.Error().Err(err).Str("pod", podName).Msg("Failed to get pod status")
return
}
phase := pod.Status.Phase
if phase == corev1.PodRunning || phase == corev1.PodSucceeded || phase == corev1.PodFailed {
l.Info().Str("pod", podName).Str("phase", string(phase)).Msg("Pod phase")
break
}
time.Sleep(time.Second)
}
// Stream pod logs
req := clientset.CoreV1().Pods(ns).GetLogs(podName, &corev1.PodLogOptions{Follow: true})
stream, err := req.Stream(context.Background())
if err != nil {
l.Error().Err(err).Str("pod", podName).Msg("Failed to stream pod logs")
} else {
defer stream.Close()
l.Info().Str("pod", podName).Msg("Streaming pod logs")
logExecution(stream, l)
}
// Log final job status
job, err := clientset.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
if err != nil {
l.Error().Err(err).Msg("Failed to get final job status")
return
}
if job.Status.Succeeded > 0 {
l.Info().Msg("Job succeeded")
} else {
msg := fmt.Sprintf("Job failed with %d failed pod(s)", job.Status.Failed)
cm.failExec(execID, l, msg)
}
}
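logExecution lives elsewhere in this package; a minimal sketch of what it plausibly does, assuming it forwards each line of the log stream to the zerolog logger (requires the "bufio" and "io" imports):

// Sketch only: drain the pod log stream line by line into the logger.
func logExecution(stream io.ReadCloser, l zerolog.Logger) {
	scanner := bufio.NewScanner(stream)
	for scanner.Scan() {
		l.Info().Str("source", "oc-monitord").Msg(scanner.Text())
	}
	if err := scanner.Err(); err != nil {
		l.Error().Err(err).Msg("Error reading pod log stream")
	}
}

As a design note, watchJob polls with fixed caps (60s for the pod to appear, 120s for it to start); a watch-based approach through the client-go Watch API would avoid the sleeps, at the cost of more setup code.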