prep
This commit is contained in:
@@ -5,6 +5,7 @@ import (
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"oc-schedulerd/conf"
|
||||
"time" // already used for ContainerMonitor.watchJob
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
@@ -27,15 +28,16 @@ type ContainerMonitor struct {
|
||||
KubeImage string
|
||||
}
|
||||
|
||||
func NewContainerMonitor(UUID string, peerId string, duration int) Executor {
|
||||
func NewContainerMonitor(UUID string, peerId string, duration int, scheduledTime time.Time) Executor {
|
||||
return &ContainerMonitor{
|
||||
Monitor: LocalMonitor{
|
||||
ExecutionID: UUID,
|
||||
PeerID: peerId,
|
||||
Duration: duration,
|
||||
LokiUrl: oclib.GetConfig().LokiUrl,
|
||||
MongoUrl: oclib.GetConfig().MongoUrl,
|
||||
DBName: oclib.GetConfig().MongoDatabase,
|
||||
ExecutionID: UUID,
|
||||
PeerID: peerId,
|
||||
Duration: duration,
|
||||
LokiUrl: oclib.GetConfig().LokiUrl,
|
||||
MongoUrl: oclib.GetConfig().MongoUrl,
|
||||
DBName: oclib.GetConfig().MongoDatabase,
|
||||
ScheduledTime: scheduledTime,
|
||||
},
|
||||
KubeCA: conf.GetConfig().KubeCA,
|
||||
KubeCert: conf.GetConfig().KubeCert,
|
||||
@@ -48,13 +50,9 @@ func NewContainerMonitor(UUID string, peerId string, duration int) Executor {
|
||||
}
|
||||
|
||||
func (cm *ContainerMonitor) PrepareMonitorExec() []string {
|
||||
|
||||
args := []string{
|
||||
"-e", cm.Monitor.ExecutionID,
|
||||
"-p", cm.Monitor.PeerID,
|
||||
"-u", cm.Monitor.LokiUrl,
|
||||
"-m", cm.Monitor.MongoUrl,
|
||||
"-d", cm.Monitor.DBName,
|
||||
"-M", "kubernetes",
|
||||
"-H", cm.KubeHost,
|
||||
"-P", cm.KubePort,
|
||||
@@ -77,7 +75,7 @@ func (cm *ContainerMonitor) failExec(execID string, l zerolog.Logger, msg string
|
||||
}, execID)
|
||||
}
|
||||
|
||||
func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolog.Logger) {
|
||||
func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, ns string, l zerolog.Logger) {
|
||||
|
||||
ca, err := base64.StdEncoding.DecodeString(cm.KubeCA)
|
||||
if err != nil {
|
||||
@@ -104,6 +102,8 @@ func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolo
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Println(ca, cert, key)
|
||||
|
||||
clientset, err := kubernetes.NewForConfig(cfg)
|
||||
if err != nil {
|
||||
cm.failExec(execID, l, "Failed to build Kubernetes client: "+err.Error())
|
||||
@@ -111,21 +111,33 @@ func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolo
|
||||
}
|
||||
|
||||
backoffLimit := int32(0)
|
||||
l.Info().Str("mongo_url", oclib.GetConfig().MongoUrl).Msg("Env vars for job")
|
||||
job := &batchv1.Job{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "oc-monitord-" + execID,
|
||||
Namespace: cm.KubeNamespace,
|
||||
Namespace: ns,
|
||||
},
|
||||
Spec: batchv1.JobSpec{
|
||||
BackoffLimit: &backoffLimit,
|
||||
Template: corev1.PodTemplateSpec{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Annotations: map[string]string{
|
||||
"k8s.v1.cni.cncf.io/networks": "docker-oc-network",
|
||||
},
|
||||
},
|
||||
Spec: corev1.PodSpec{
|
||||
RestartPolicy: corev1.RestartPolicyNever,
|
||||
Containers: []corev1.Container{
|
||||
{
|
||||
Name: "oc-monitord",
|
||||
Image: cm.KubeImage,
|
||||
Args: args,
|
||||
Name: "oc-monitord",
|
||||
Image: cm.KubeImage,
|
||||
Args: args,
|
||||
Env: []corev1.EnvVar{
|
||||
{Name: "OC_MONGO_URL", Value: oclib.GetConfig().MongoUrl},
|
||||
{Name: "OC_MONGO_DATABASE", Value: oclib.GetConfig().MongoDatabase},
|
||||
{Name: "OC_LOKI_URL", Value: oclib.GetConfig().LokiUrl},
|
||||
{Name: "OC_NATS_URL", Value: oclib.GetConfig().NATSUrl},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -133,11 +145,81 @@ func (cm *ContainerMonitor) LaunchMonitor(args []string, execID string, l zerolo
|
||||
},
|
||||
}
|
||||
|
||||
_, err = clientset.BatchV1().Jobs(cm.KubeNamespace).Create(context.Background(), job, metav1.CreateOptions{})
|
||||
_, err = clientset.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
|
||||
if err != nil {
|
||||
fmt.Println("Failed to create Kubernetes Job: ", err)
|
||||
cm.failExec(execID, l, "Failed to create Kubernetes Job: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
l.Info().Msg("Started Kubernetes Job oc-monitord-" + execID)
|
||||
l.Info().Str("job", "oc-monitord-"+execID).Msg("Kubernetes Job created")
|
||||
go cm.watchJob(clientset, execID, ns, l)
|
||||
}
|
||||
|
||||
func (cm *ContainerMonitor) watchJob(clientset *kubernetes.Clientset, execID string, ns string, l zerolog.Logger) {
|
||||
jobName := "oc-monitord-" + execID
|
||||
l = l.With().Str("job", jobName).Logger()
|
||||
|
||||
// Poll until the pod spawned by the job appears (up to 60s)
|
||||
podName := ""
|
||||
for i := 0; i < 60; i++ {
|
||||
pods, err := clientset.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{
|
||||
LabelSelector: "job-name=" + jobName,
|
||||
})
|
||||
if err != nil {
|
||||
l.Error().Err(err).Msg("Failed to list pods for job")
|
||||
return
|
||||
}
|
||||
if len(pods.Items) > 0 {
|
||||
podName = pods.Items[0].Name
|
||||
break
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
|
||||
if podName == "" {
|
||||
l.Error().Msg("No pod found for job after 60s")
|
||||
return
|
||||
}
|
||||
|
||||
l.Info().Str("pod", podName).Msg("Pod found for job")
|
||||
|
||||
// Wait for the pod to be Running or terminal (up to 120s)
|
||||
for i := 0; i < 120; i++ {
|
||||
pod, err := clientset.CoreV1().Pods(ns).Get(context.Background(), podName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
l.Error().Err(err).Str("pod", podName).Msg("Failed to get pod status")
|
||||
return
|
||||
}
|
||||
phase := pod.Status.Phase
|
||||
if phase == corev1.PodRunning || phase == corev1.PodSucceeded || phase == corev1.PodFailed {
|
||||
l.Info().Str("pod", podName).Str("phase", string(phase)).Msg("Pod phase")
|
||||
break
|
||||
}
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
|
||||
// Stream pod logs
|
||||
req := clientset.CoreV1().Pods(ns).GetLogs(podName, &corev1.PodLogOptions{Follow: true})
|
||||
stream, err := req.Stream(context.Background())
|
||||
if err != nil {
|
||||
l.Error().Err(err).Str("pod", podName).Msg("Failed to stream pod logs")
|
||||
} else {
|
||||
defer stream.Close()
|
||||
l.Info().Str("pod", podName).Msg("Streaming pod logs")
|
||||
logExecution(stream, l)
|
||||
}
|
||||
|
||||
// Log final job status
|
||||
job, err := clientset.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
l.Error().Err(err).Msg("Failed to get final job status")
|
||||
return
|
||||
}
|
||||
if job.Status.Succeeded > 0 {
|
||||
l.Info().Msg("Job succeeded")
|
||||
} else {
|
||||
msg := fmt.Sprintf("Job failed with %d failed pod(s)", job.Status.Failed)
|
||||
cm.failExec(execID, l, msg)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,9 @@ package daemons
|
||||
import (
|
||||
"fmt"
|
||||
"oc-schedulerd/conf"
|
||||
"os"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
@@ -11,22 +13,21 @@ import (
|
||||
)
|
||||
|
||||
type LocalMonitor struct {
|
||||
ExecutionID string
|
||||
PeerID string
|
||||
Duration int
|
||||
LokiUrl string
|
||||
MongoUrl string
|
||||
DBName string
|
||||
ExecutionID string
|
||||
PeerID string
|
||||
Duration int
|
||||
LokiUrl string
|
||||
MongoUrl string
|
||||
DBName string
|
||||
ScheduledTime time.Time
|
||||
}
|
||||
|
||||
func NewLocalMonitor(UUID string, peerId string, duration int) Executor {
|
||||
func NewLocalMonitor(UUID string, peerId string, duration int, scheduledTime time.Time) Executor {
|
||||
return &LocalMonitor{
|
||||
ExecutionID: UUID,
|
||||
PeerID: peerId,
|
||||
Duration: duration,
|
||||
LokiUrl: oclib.GetConfig().LokiUrl,
|
||||
MongoUrl: oclib.GetConfig().MongoUrl,
|
||||
DBName: oclib.GetConfig().MongoDatabase,
|
||||
ExecutionID: UUID,
|
||||
PeerID: peerId,
|
||||
Duration: duration,
|
||||
ScheduledTime: scheduledTime,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,11 +43,12 @@ func (lm *LocalMonitor) PrepareMonitorExec() []string {
|
||||
args := []string{
|
||||
"-e", lm.ExecutionID,
|
||||
"-p", lm.PeerID,
|
||||
"-u", lm.LokiUrl,
|
||||
"-m", lm.MongoUrl,
|
||||
"-d", lm.DBName,
|
||||
"-H", conf.GetConfig().KubeHost,
|
||||
"-c", conf.GetConfig().KubeCA,
|
||||
"-C", conf.GetConfig().KubeCert,
|
||||
"-D", conf.GetConfig().KubeData,
|
||||
"-s", fmt.Sprintf("%d", lm.ScheduledTime.Unix()),
|
||||
}
|
||||
|
||||
if lm.Duration > 0 {
|
||||
args = append(args, "-t", fmt.Sprintf("%d", lm.Duration))
|
||||
}
|
||||
@@ -54,13 +56,29 @@ func (lm *LocalMonitor) PrepareMonitorExec() []string {
|
||||
return args
|
||||
}
|
||||
|
||||
func (lm *LocalMonitor) LaunchMonitor(args []string, execID string, l zerolog.Logger) {
|
||||
func (lm *LocalMonitor) LaunchMonitor(args []string, execID string, ns string, l zerolog.Logger) {
|
||||
cmd := exec.Command(conf.GetConfig().MonitorPath, args...)
|
||||
fmt.Printf("Command : %v\n", cmd)
|
||||
cmd.Env = append(os.Environ(),
|
||||
"OC_MONGO_URL="+oclib.GetConfig().MongoUrl,
|
||||
"OC_MONGO_DATABASE="+oclib.GetConfig().MongoDatabase,
|
||||
"OC_LOKI_URL="+oclib.GetConfig().LokiUrl,
|
||||
"OC_NATS_URL="+oclib.GetConfig().NATSUrl,
|
||||
)
|
||||
fmt.Println("LaunchMonitor LOCAL")
|
||||
l.Info().Str("binary", conf.GetConfig().MonitorPath).Strs("args", args).Msg("Starting oc-monitord")
|
||||
|
||||
stdoutMonitord, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
l.Error().Msg("Could not retrieve stdoutpipe for execution of oc-monitord" + err.Error())
|
||||
l.Error().Err(err).Msg("Could not retrieve stdout pipe for oc-monitord")
|
||||
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
|
||||
"state": enum.FAILURE.EnumIndex(),
|
||||
}, execID)
|
||||
return
|
||||
}
|
||||
|
||||
stderrMonitord, err := cmd.StderrPipe()
|
||||
if err != nil {
|
||||
l.Error().Err(err).Msg("Could not retrieve stderr pipe for oc-monitord")
|
||||
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
|
||||
"state": enum.FAILURE.EnumIndex(),
|
||||
}, execID)
|
||||
@@ -69,11 +87,14 @@ func (lm *LocalMonitor) LaunchMonitor(args []string, execID string, l zerolog.Lo
|
||||
|
||||
err = cmd.Start()
|
||||
if err != nil {
|
||||
l.Error().Msg("Could not start oc-monitor for " + lm.ExecutionID + " : " + err.Error())
|
||||
l.Error().Err(err).Str("execution", lm.ExecutionID).Msg("Could not start oc-monitord")
|
||||
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
|
||||
"state": enum.FAILURE.EnumIndex(),
|
||||
}, execID)
|
||||
return
|
||||
}
|
||||
|
||||
l.Info().Int("pid", cmd.Process.Pid).Msg("oc-monitord started")
|
||||
go logExecution(stderrMonitord, l)
|
||||
logExecution(stdoutMonitord, l)
|
||||
}
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
package daemons
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"oc-schedulerd/conf"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
@@ -34,12 +37,21 @@ func (em *ExecutionManager) RetrieveNextExecutions() {
|
||||
continue
|
||||
}
|
||||
|
||||
lead := time.Duration(conf.GetConfig().PrepLeadSeconds) * time.Second
|
||||
for execId, exec := range orderedExec[i] {
|
||||
if i == 0 && em.isAStartingExecutionBeforeEnd(&exec) { // BEST EFFORT exception
|
||||
continue
|
||||
}
|
||||
if exec.ExecDate.Before(time.Now().UTC()) {
|
||||
logger.Info().Msg("Will execute " + execId + " soon")
|
||||
// Fire PrepLeadSeconds before the scheduled start so oc-monitord
|
||||
// has time to pre-pull images and set up infra before ExecDate.
|
||||
if exec.ExecDate.Before(time.Now().UTC().Add(lead)) {
|
||||
logger.Info().Msg(fmt.Sprintf("Launching prep for %s (scheduled %s, lead %s)",
|
||||
execId, exec.ExecDate.Format(time.RFC3339), lead))
|
||||
// Mark as STARTED immediately (before goroutine) so the next
|
||||
// SchedulePolling cycle doesn't re-pick this execution from DB.
|
||||
oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), nil).UpdateOne(map[string]interface{}{
|
||||
"state": enum.STARTED.EnumIndex(),
|
||||
}, exec.GetID())
|
||||
go em.executeExecution(&exec)
|
||||
delete(executions, execId)
|
||||
}
|
||||
@@ -78,7 +90,11 @@ func (em *ExecutionManager) executeExecution(execution *workflow_execution.Workf
|
||||
duration = int(execution.EndDate.Sub(execution.ExecDate).Seconds())
|
||||
}
|
||||
|
||||
executor = NewContainerMonitor(execution.UUID, execution.CreatorID, duration)
|
||||
if conf.GetConfig().Mode == "kubernetes" {
|
||||
executor = NewContainerMonitor(execution.UUID, execution.CreatorID, duration, execution.ExecDate)
|
||||
} else {
|
||||
executor = NewLocalMonitor(execution.UUID, execution.CreatorID, duration, execution.ExecDate)
|
||||
}
|
||||
|
||||
if executor == nil {
|
||||
logger.Fatal().Msg("Could not create executor")
|
||||
@@ -89,5 +105,5 @@ func (em *ExecutionManager) executeExecution(execution *workflow_execution.Workf
|
||||
}
|
||||
|
||||
args := executor.PrepareMonitorExec()
|
||||
executor.LaunchMonitor(args, execution.GetID(), logger)
|
||||
executor.LaunchMonitor(args, execution.GetID(), conf.GetConfig().KubeNamespace, logger)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package daemons
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"github.com/rs/zerolog"
|
||||
@@ -9,13 +10,14 @@ import (
|
||||
|
||||
type Executor interface {
|
||||
PrepareMonitorExec() []string
|
||||
LaunchMonitor(args []string, execID string, l zerolog.Logger)
|
||||
LaunchMonitor(args []string, execID string, ns string, l zerolog.Logger)
|
||||
}
|
||||
|
||||
func logExecution(reader io.ReadCloser, l zerolog.Logger) {
|
||||
scanner := bufio.NewScanner(reader)
|
||||
for scanner.Scan() {
|
||||
output := scanner.Text()
|
||||
fmt.Println(output)
|
||||
l.Debug().Msg(output)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,8 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"oc-schedulerd/conf"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
@@ -122,14 +124,14 @@ func (s *ScheduleManager) ExecuteWorkflow(resp tools.NATSResponse) {
|
||||
|
||||
func (s *ScheduleManager) GetNextScheduledWorkflows(_ tools.NATSResponse) {
|
||||
start := time.Now().UTC()
|
||||
fmt.Println(s.getExecution(
|
||||
start.Add(time.Second*time.Duration(-1)).UTC(),
|
||||
start.Add(time.Minute*time.Duration(1)).UTC(),
|
||||
))
|
||||
if next_wf_exec, err := s.getExecution(
|
||||
start.Add(time.Second*time.Duration(-1)).UTC(),
|
||||
start.Add(time.Minute*time.Duration(1)).UTC(),
|
||||
); err != nil {
|
||||
// Fetch executions whose scheduled start falls within the next
|
||||
// (PrepLeadSeconds + 60s) window, so they are loaded in time to
|
||||
// trigger oc-monitord PrepLeadSeconds before the actual start.
|
||||
horizon := time.Duration(conf.GetConfig().PrepLeadSeconds+60) * time.Second
|
||||
from := start.Add(-time.Second)
|
||||
to := start.Add(horizon)
|
||||
fmt.Println(s.getExecution(from, to))
|
||||
if next_wf_exec, err := s.getExecution(from, to); err != nil {
|
||||
s.Logger.Error().Msg("Could not retrieve next schedules")
|
||||
} else {
|
||||
Executions.AddSchedules(next_wf_exec, s.Logger)
|
||||
|
||||
Reference in New Issue
Block a user