Prepull for pod & Asym Jobs

This commit is contained in:
mr
2026-03-25 11:13:12 +01:00
parent 56bc342d24
commit a9284314ef
17 changed files with 754 additions and 512 deletions

View File

@@ -21,6 +21,7 @@ var _service = map[string]func() (Tool, error){
}
func NewService(name string) (Tool, error) {
return NewKubernetesTool()
service, ok := _service[name]
if !ok {
return nil, errors.New("service not found")

View File

@@ -8,7 +8,6 @@ import (
"oc-monitord/conf"
"oc-monitord/utils"
"os"
"time"
wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
"github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned"
@@ -76,7 +75,7 @@ func (k *KubernetesTools) CreateArgoWorkflow(path string, ns string) (string, er
if !ok {
return "", errors.New("decoded object is not a Workflow")
}
fmt.Println("NAMESPACE", ns)
// Create the workflow in the namespace given by ns
createdWf, err := k.VersionedSet.ArgoprojV1alpha1().Workflows(ns).Create(context.TODO(), workflow, metav1.CreateOptions{})
if err != nil {
@@ -96,7 +95,7 @@ func (k *KubernetesTools) CreateAccessSecret(access string, password string, sto
}
// Define the Secret object
name := storageId+"-secret-s3"
name := storageId + "-secret-s3"
secret := &v1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: name,
@@ -116,9 +115,9 @@ func (k *KubernetesTools) CreateAccessSecret(access string, password string, sto
func (k *KubernetesTools) GetS3Secret(storageId string, namespace string) *v1.Secret {
secret, err := k.Set.CoreV1().Secrets(namespace).Get(context.TODO(), storageId + "-secret-s3", metav1.GetOptions{})
secret, err := k.Set.CoreV1().Secrets(namespace).Get(context.TODO(), storageId+"-secret-s3", metav1.GetOptions{})
// Get(context.TODO(),storageId + "-artifact-server", metav1.GetOptions{})
if err != nil && !k8serrors.IsNotFound(err) {
l := utils.GetLogger()
l.Fatal().Msg("An error happened when retrieving secret in " + namespace + " : " + err.Error())
@@ -128,77 +127,83 @@ func (k *KubernetesTools) GetS3Secret(storageId string, namespace string) *v1.Se
}
return secret
// return secret
// return secret
}
// GetArgoWatch opens a watch on the Argo Workflow named "oc-monitor-<wfName>"
// and returns the resulting watch.Interface.
//
// executionId is passed to Workflows(...) and therefore acts as the namespace
// in which the workflow is looked up. When the Watch call fails, a new error
// describing the attempted operation is returned; the underlying client error
// is not included in that message.
//
// NOTE(review): this span is a rendered diff, so the pre- and post-gofmt
// versions of the signature, the options line and the final return each
// appear twice.
func (k *KubernetesTools) GetArgoWatch(executionId string, wfName string) (watch.Interface, error){
options := metav1.ListOptions{FieldSelector: "metadata.name=oc-monitor-"+wfName}
func (k *KubernetesTools) GetArgoWatch(executionId string, wfName string) (watch.Interface, error) {
// The field selector restricts the watch to the single workflow object with this name.
options := metav1.ListOptions{FieldSelector: "metadata.name=oc-monitor-" + wfName}
watcher, err := k.VersionedSet.ArgoprojV1alpha1().Workflows(executionId).Watch(context.Background(), options)
if err != nil {
// NOTE(review): err itself is dropped here, and the quote opened before "argo watch" is never closed.
return nil, errors.New("Error executing 'argo watch " + wfName + " -n " + executionId + " with ArgoprojV1alpha1 client")
}
return watcher, nil
return watcher, nil
}
// GetPodLogger returns a streaming reader over the logs of the "main"
// container of the pod that runs workflow node nodeName.
//
// It lists the pods in namespace ns carrying the label
// workflows.argoproj.io/workflow=<wfName>, keeps the pod whose
// workflows.argoproj.io/node-name annotation equals nodeName, waits for that
// pod through testPodReady, and then opens a log stream with Follow enabled.
//
// An error is returned when the pod list call fails, when no pod carries the
// workflow label, when no listed pod matches nodeName, or when the log stream
// cannot be opened. The function does not close the returned io.ReadCloser.
//
// NOTE(review): this span is a rendered diff, so several statements appear
// twice (pre- and post-change versions interleaved).
func (k *KubernetesTools) GetPodLogger(ns string, wfName string, nodeName string) (io.ReadCloser, error) {
var targetPod v1.Pod
pods, err := k.Set.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{
LabelSelector: "workflows.argoproj.io/workflow="+wfName,
})
if err != nil {
return nil, fmt.Errorf("failed to list pods: " + err.Error())
}
if len(pods.Items) == 0 {
return nil, fmt.Errorf("no pods found with label workflows.argoproj.io/workflow="+ wfName + " no pods found with label workflows.argoproj.io/node-name=" + nodeName + " in namespace " + ns)
}
for _, pod := range pods.Items {
LabelSelector: "workflows.argoproj.io/workflow=" + wfName,
})
if err != nil {
return nil, fmt.Errorf("%s", "failed to list pods: "+err.Error())
}
if len(pods.Items) == 0 {
// NOTE(review): the message speaks of a node-name *label*, but the loop below matches on an annotation.
return nil, fmt.Errorf("%s", "no pods found with label workflows.argoproj.io/workflow="+wfName+" no pods found with label workflows.argoproj.io/node-name="+nodeName+" in namespace "+ns)
}
// Pick the pod annotated with the requested node name. There is no break,
// so if several pods match, the last one in the list is kept.
for _, pod := range pods.Items {
if pod.Annotations["workflows.argoproj.io/node-name"] == nodeName {
targetPod = pod
}
}
// targetPod is still the zero value when no annotation matched.
if targetPod.Name == "" {
return nil, fmt.Errorf("no pod found matching node-name %s in namespace %s", nodeName, ns)
}
// The k8s API returns an error if logs are requested while the containers are not yet initialized, so we wait for the pod to be ready first
k.testPodReady(targetPod, ns)
// `kubectl logs` for a pod calls /api/v1/namespaces/NAMESPACE/pods/oc-monitor-PODNAME/log?container=main, so we pass Container: "main" to the call as well
req, err := k.Set.CoreV1().Pods(ns).GetLogs(targetPod.Name, &v1.PodLogOptions{Follow: true, Container: "main"}). Stream(context.Background())
req, err := k.Set.CoreV1().Pods(ns).GetLogs(targetPod.Name, &v1.PodLogOptions{Follow: true, Container: "main"}).Stream(context.Background())
if err != nil {
return nil, fmt.Errorf(" Error when trying to get logs for " + targetPod.Name + " : " + err.Error())
return nil, fmt.Errorf("%s", " Error when trying to get logs for "+targetPod.Name+" : "+err.Error())
}
return req, nil
}
// testPodReady blocks until the given pod in namespace ns either reports the
// PodReady condition as True or reaches the Succeeded phase. It returns
// nothing: API errors are logged through the workflow logger and end the wait.
//
// NOTE(review): this span is a rendered diff. The removed implementation
// (a Get call polled every 2 seconds inside `for {`) and the added one
// (a Watch on the pod, consumed via ResultChan) are interleaved below.
func (k *KubernetesTools) testPodReady(pod v1.Pod, ns string) {
for {
pod, err := k.Set.CoreV1().Pods(ns).Get(context.Background(), pod.Name, metav1.GetOptions{})
if err != nil {
wfl := utils.GetWFLogger("")
wfl.Error().Msg("Error fetching pod: " + err.Error() + "\n")
break
wfl := utils.GetWFLogger("")
// Watch only this pod, starting from the resource version carried by the pod object passed in.
// NOTE(review): if the pod is already Ready at that version and never changes again,
// presumably no event arrives and this waits until the server closes the watch — confirm.
watcher, err := k.Set.CoreV1().Pods(ns).Watch(context.Background(), metav1.ListOptions{
FieldSelector: "metadata.name=" + pod.Name,
ResourceVersion: pod.ResourceVersion,
})
if err != nil {
wfl.Error().Msg("Error watching pod: " + err.Error() + "\n")
return
}
defer watcher.Stop()
// NOTE(review): when the result channel closes without a match, the loop ends
// and the function returns exactly as if the pod were ready.
for event := range watcher.ResultChan() {
// Events whose object is not a *v1.Pod are skipped.
p, ok := event.Object.(*v1.Pod)
if !ok {
continue
}
var initialized bool
for _, cond := range pod.Status.Conditions {
// For remote pods, the pod seems to reach the Succeeded phase before .status.conditions shows it as ready, so Succeeded is accepted as well (the OR condition)
if (cond.Type == v1.PodReady && cond.Status == v1.ConditionTrue) || pod.Status.Phase == v1.PodSucceeded {
initialized = true
// For remote pods, the pod seems to reach the Succeeded phase before .status.conditions shows it as ready, so Succeeded is accepted as well
if p.Status.Phase == v1.PodSucceeded {
return
}
for _, cond := range p.Status.Conditions {
if cond.Type == v1.PodReady && cond.Status == v1.ConditionTrue {
return
}
}
if initialized {
return
}
time.Sleep(2 * time.Second) // avoid hammering the API
}
}
}