draft... metrics by booking...

This commit is contained in:
mr
2025-06-18 08:26:10 +02:00
parent 001539fb36
commit c7290b0ead
8 changed files with 489 additions and 233 deletions

View File

@@ -15,18 +15,27 @@ type Infrastructure interface {
CreateServiceAccount(ctx context.Context, ns string) error
CreateRoleBinding(ctx context.Context, ns string, roleBinding string, role string) error
CreateRole(ctx context.Context, ns string, role string, groups [][]string, resources [][]string, verbs [][]string) error
GetTargets(ctx context.Context) ([]string,error)
GetTargets(ctx context.Context) ([]string, error)
CreateAdmiraltySource(context context.Context, executionId string) ([]byte, error)
CreateKubeconfigSecret(context context.Context, kubeconfig string, executionId string, peerId string) ([]byte, error)
CreateKubeconfigSecret(context context.Context, kubeconfig string, executionId string, peerId string) ([]byte, error)
GetKubeconfigSecret(context context.Context, executionId string, peerId string) ([]byte, error)
CreateAdmiraltyTarget(context context.Context, executionId string, peerId string)([]byte,error)
CreateAdmiraltyTarget(context context.Context, executionId string, peerId string) ([]byte, error)
GetOneNode(context context.Context, executionID string, peerId string) (*v1.Node, error)
CheckHealth() error
}
var _service = map[string]func() (Infrastructure, error){
"kubernetes": NewKubernetesService,
}
func NewServiceByType(t string) (Infrastructure, error) {
service, ok := _service[t]
if !ok {
return nil, errors.New("service not found")
}
return service()
}
func NewService() (Infrastructure, error) {
service, ok := _service[conf.GetConfig().Mode]
if !ok {
@@ -34,4 +43,3 @@ func NewService() (Infrastructure, error) {
}
return service()
}

View File

@@ -8,6 +8,7 @@ import (
"fmt"
"oc-datacenter/conf"
"strings"
"time"
authv1 "k8s.io/api/authentication/v1"
v1 "k8s.io/api/core/v1"
@@ -26,7 +27,7 @@ var gvrSources = schema.GroupVersionResource{Group: "multicluster.admiralty.io",
var gvrTargets = schema.GroupVersionResource{Group: "multicluster.admiralty.io", Version: "v1alpha1", Resource: "targets"}
type KubernetesService struct {
Set *kubernetes.Clientset
Set *kubernetes.Clientset
}
func NewDynamicClient() (*dynamic.DynamicClient, error) {
@@ -59,7 +60,7 @@ func NewKubernetesService() (Infrastructure, error) {
KeyData: []byte(conf.GetConfig().KubeData),
},
}
// Create clientset
clientset, err := kubernetes.NewForConfig(config)
fmt.Println("NewForConfig", clientset, err)
@@ -70,7 +71,6 @@ func NewKubernetesService() (Infrastructure, error) {
return nil, errors.New("Error creating Kubernetes client: clientset is nil")
}
return &KubernetesService{
Set: clientset,
}, nil
@@ -111,7 +111,7 @@ func (k *KubernetesService) CreateNamespace(ctx context.Context, ns string) erro
ObjectMeta: metav1.ObjectMeta{
Name: ns,
Labels: map[string]string{
"multicluster-scheduler":"enabled",
"multicluster-scheduler": "enabled",
},
},
}
@@ -291,24 +291,24 @@ func (k *KubernetesService) CreateAdmiraltyTarget(context context.Context, execu
fmt.Println("Target needs to be binded to a secret in namespace ", executionId)
return nil, nil // Maybe we could create a wrapper for errors and add more info to have
}
targetName := "target-" + getConcatenatedName(peerId,executionId)
targetName := "target-" + getConcatenatedName(peerId, executionId)
target := map[string]interface{}{
"apiVersion": "multicluster.admiralty.io/v1alpha1",
"kind": "Target",
"metadata": map[string]interface{}{
"name": targetName,
"namespace": executionId,
"apiVersion": "multicluster.admiralty.io/v1alpha1",
"kind": "Target",
"metadata": map[string]interface{}{
"name": targetName,
"namespace": executionId,
"labels": map[string]interface{}{
"peer": peerId,
},
},
"spec": map[string]interface{}{
"kubeconfigSecret": map[string]string{
"name" : "kube-secret-"+ getConcatenatedName(peerId, executionId),
},
"spec": map[string]interface{}{
"kubeconfigSecret": map[string]string{
"name": "kube-secret-" + getConcatenatedName(peerId, executionId),
},
},
}
},
}
res, err := dynamicClientApply(executionId, targetName, gvrTargets, context, target)
if err != nil {
@@ -327,22 +327,21 @@ func (k *KubernetesService) CreateAdmiraltyTarget(context context.Context, execu
// This method is temporary to implement the use of Admiralty, but must be edited
// to rather contact the oc-datacenter from the remote cluster to create the source
// locally and retrieve the token for the serviceAccount
func (k *KubernetesService) CreateAdmiraltySource(context context.Context,executionId string) ([]byte, error) {
func (k *KubernetesService) CreateAdmiraltySource(context context.Context, executionId string) ([]byte, error) {
source := map[string]interface{}{
"apiVersion": "multicluster.admiralty.io/v1alpha1",
"kind": "Source",
"metadata": map[string]interface{}{
"name": "source-"+executionId,
"namespace": executionId,
},
"spec": map[string]interface{}{
"serviceAccountName": "sa-"+executionId,
},
}
"apiVersion": "multicluster.admiralty.io/v1alpha1",
"kind": "Source",
"metadata": map[string]interface{}{
"name": "source-" + executionId,
"namespace": executionId,
},
"spec": map[string]interface{}{
"serviceAccountName": "sa-" + executionId,
},
}
res, err := dynamicClientApply(executionId, "source-" + executionId,gvrSources, context, source)
res, err := dynamicClientApply(executionId, "source-"+executionId, gvrSources, context, source)
if err != nil {
return nil, errors.New("Error when trying to apply Source definition :" + err.Error())
}
@@ -361,14 +360,12 @@ func (k *KubernetesService) CreateKubeconfigSecret(context context.Context, kube
return nil, err
}
secretApplyConfig := apply.Secret("kube-secret-" + getConcatenatedName(peerId, executionId),
executionId).
WithData(map[string][]byte{
"config": config,
},
)
secretApplyConfig := apply.Secret("kube-secret-"+getConcatenatedName(peerId, executionId),
executionId).
WithData(map[string][]byte{
"config": config,
},
)
// exists, err := k.GetKubeconfigSecret(context,executionId)
// if err != nil {
@@ -384,15 +381,14 @@ func (k *KubernetesService) CreateKubeconfigSecret(context context.Context, kube
// _ = err
// }
resp, err := k.Set.CoreV1().
Secrets(executionId).
Apply(context,
secretApplyConfig,
metav1.ApplyOptions{
FieldManager: "admiralty-manager",
})
Secrets(executionId).
Apply(context,
secretApplyConfig,
metav1.ApplyOptions{
FieldManager: "admiralty-manager",
})
if err != nil {
fmt.Println("Error while trying to contact API to get secret kube-secret-" + executionId)
fmt.Println(err)
@@ -411,7 +407,7 @@ func (k *KubernetesService) CreateKubeconfigSecret(context context.Context, kube
func (k *KubernetesService) GetKubeconfigSecret(context context.Context, executionId string, peerId string) ([]byte, error) {
resp, err := k.Set.CoreV1().
Secrets(executionId).
Get(context, "kube-secret-"+ getConcatenatedName(peerId, executionId), metav1.GetOptions{})
Get(context, "kube-secret-"+getConcatenatedName(peerId, executionId), metav1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
@@ -459,23 +455,22 @@ func dynamicClientApply(executionId string, resourceName string, resourceDefinit
}
res, err := cli.Resource(resourceDefinition).
Namespace(executionId).
Apply(ctx,
resourceName,
&unstructured.Unstructured{Object: object},
metav1.ApplyOptions{
FieldManager: "kubectl-client-side-apply",
},
)
Namespace(executionId).
Apply(ctx,
resourceName,
&unstructured.Unstructured{Object: object},
metav1.ApplyOptions{
FieldManager: "kubectl-client-side-apply",
},
)
if err != nil {
o, err := json.Marshal(object)
fmt.Println("Error from k8s API when applying " + fmt.Sprint(string(o)) + " to " + gvrSources.String() + " : " , err)
return nil,err
fmt.Println("Error from k8s API when applying "+fmt.Sprint(string(o))+" to "+gvrSources.String()+" : ", err)
return nil, err
}
// We can add more info to the log with the content of resp if not nil
resByte, err := json.Marshal(res)
resByte, err := json.Marshal(res)
if err != nil {
// fmt.Println("Error trying to create a Source on remote cluster : ", err , " : ", res)
return nil, err
@@ -485,26 +480,40 @@ func dynamicClientApply(executionId string, resourceName string, resourceDefinit
}
func putCDRapiKube(client kubernetes.Clientset, ctx context.Context, path string, body []byte, params ...map[string]string) ([]byte, error){
req := client.RESTClient().
Post().
AbsPath(path).
Body(body)
func (k *KubernetesService) CheckHealth() error {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
for _, param := range params {
for k, v := range param {
req = req.Param(k, v)
// Check API server connectivity
_, err := k.Set.ServerVersion()
if err != nil {
return fmt.Errorf("API server unreachable: %v", err)
}
// Check nodes status
nodes, err := k.Set.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
if err != nil {
return fmt.Errorf("failed to list nodes: %v", err)
}
for _, node := range nodes.Items {
for _, condition := range node.Status.Conditions {
if condition.Type == "Ready" && condition.Status != "True" {
return fmt.Errorf("node %s not ready", node.Name)
}
}
}
resp, err := req.DoRaw(ctx)
// Optional: Check if all pods in kube-system are running
pods, err := k.Set.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{})
if err != nil {
fmt.Println("Error from k8s API when posting "+string(body)+" to "+path+" : ", err)
return nil, err
return fmt.Errorf("failed to list pods: %v", err)
}
return resp, nil
for _, pod := range pods.Items {
if pod.Status.Phase != "Running" && pod.Status.Phase != "Succeeded" {
return fmt.Errorf("pod %s in namespace kube-system is %s", pod.Name, pod.Status.Phase)
}
}
return nil
}
// Returns the Kubernetes' Node object corresponding to the executionID if it exists on this host
@@ -526,7 +535,7 @@ func (k *KubernetesService) GetOneNode(context context.Context, executionID stri
}
for _, node := range res.Items {
if isNode := strings.Contains(node.Name, "admiralty-"+ executionID +"-target-"+ concatenatedName + "-"); isNode {
if isNode := strings.Contains(node.Name, "admiralty-"+executionID+"-target-"+concatenatedName+"-"); isNode {
return &node, nil
}
}
@@ -534,7 +543,6 @@ func (k *KubernetesService) GetOneNode(context context.Context, executionID stri
return nil, nil
}
// Returns a concatenation of the peerId and namespace in order for
// kubernetes ressources to have a unique name, under 63 characters
// and yet identify which peer they are created for

View File

@@ -0,0 +1,176 @@
package infrastructure
import (
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/booking"
"cloud.o-forge.io/core/oc-lib/models/common/models"
"cloud.o-forge.io/core/oc-lib/models/compute_units"
)
type MetricsSnapshot struct {
From string `json:"origin"`
Metrics []Metric `json:"metrics"`
}
type Metric struct {
Name string `json:"name"`
Value float64 `json:"value"`
Error error `json:"error"`
}
type PrometheusResponse struct {
Status string `json:"status"`
Data struct {
ResultType string `json:"resultType"`
Result []struct {
Metric map[string]string `json:"metric"`
Value []interface{} `json:"value"` // [timestamp, value]
} `json:"result"`
} `json:"data"`
}
var queriesMetrics = []string{
"rate(container_cpu_usage_seconds_total{namespace=\"%s\"}[1m]) * 100",
"container_memory_usage_bytes{namespace=\"%s\"}",
"(container_fs_usage_bytes{namespace=\"%s\"}) / (container_fs_limit_bytes{namespace=\"%s\"}) * 100",
"DCGM_FI_DEV_GPU_UTIL{namespace=\"%s\"}",
// "system_load_average",
"rate(container_fs_reads_bytes_total{namespace=\"%s\"}[1m])",
"rate(container_fs_writes_bytes_total{namespace=\"%s\"}[1m])",
"rate(container_network_receive_bytes_total{namespace=\"%s\"}[1m])",
"rate(container_network_transmit_bytes_total{namespace=\"%s\"}[1m])",
// "system_network_latency_ms",
"rate(http_requests_total{namespace=\"%s\"}[1m])",
"(rate(http_requests_total{status=~\"5..\", namespace=\"%s\"}[1m]) / rate(http_requests_total{namespace=\"%s\"}[1m])) * 100",
// "app_mean_time_to_repair_seconds",
// "app_mean_time_between_failure_seconds",
}
type PrometheusService struct {
}
func NewPrometheusService() *PrometheusService {
return &PrometheusService{}
}
func (p *PrometheusService) queryPrometheus(promURL string, expr string, namespace string) models.Metric {
metric := models.Metric{Name: expr, Value: -1}
resp, err := http.Get(promURL + "/api/v1/query?query=" + url.QueryEscape(fmt.Sprintf(expr, namespace)))
if err != nil {
metric.Error = err
} else {
defer resp.Body.Close()
if body, err := io.ReadAll(resp.Body); err == nil {
var result PrometheusResponse
if err = json.Unmarshal(body, &result); err == nil && len(result.Data.Result) > 0 && len(result.Data.Result[0].Value) == 2 {
metric.Value, metric.Error = strconv.ParseFloat(fmt.Sprintf("%s", result.Data.Result[0].Value[1]), 64)
}
}
}
return metric
}
func (p *PrometheusService) Call(bookingID string) (*booking.Booking, map[string]models.MetricsSnapshot) {
var wg sync.WaitGroup
metrics := map[string]models.MetricsSnapshot{}
// get all booking... from executions_id == namespace typed datacenter.
bAccess := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", "", []string{}, nil)
book := bAccess.LoadOne(bookingID)
if book.Err != "" {
fmt.Errorf("stop because of empty : %s", book.Err)
return nil, metrics
}
cUAccess := oclib.NewRequest(oclib.LibDataEnum(oclib.COMPUTE_UNITS), "", "", []string{}, nil)
cRAccess := oclib.NewRequest(oclib.LibDataEnum(oclib.COMPUTE_RESOURCE), "", "", []string{}, nil)
rr := cRAccess.LoadOne(book.Data.(*booking.Booking).ResourceID)
if rr.Err != "" {
fmt.Errorf("can't proceed because of unfound resource %s : %s", book.Data.(*booking.Booking).ResourceID, rr.Err)
return book.Data.(*booking.Booking), metrics
}
computeRes := rr.ToComputeResource()
for _, instance := range computeRes.Instances {
res := cUAccess.Search(&dbs.Filters{
And: map[string][]dbs.Filter{
"source": {{Operator: dbs.EQUAL.String(), Value: instance.Source}},
},
}, "", false)
if res.Err != "" {
continue
}
for _, r := range res.Data {
// TODO watch out ... to not exec on an absent datacenter...
if r.(*compute_units.ComputeUnits).MonitorPath == "" {
continue
}
wg.Add(1)
snapshot := models.MetricsSnapshot{From: instance.Source, Metrics: []models.Metric{}}
go func() {
defer wg.Done()
for _, expr := range queriesMetrics {
snapshot.Metrics = append(snapshot.Metrics,
p.queryPrometheus(r.(*compute_units.ComputeUnits).MonitorPath, expr, book.Data.(*booking.Booking).ExecutionsID))
}
metrics[instance.Name] = snapshot
}()
}
}
wg.Wait()
return book.Data.(*booking.Booking), metrics
}
func (p *PrometheusService) Stream(bookingID string, end *time.Time, interval time.Duration, flusher http.Flusher, encoder *json.Encoder) {
if end != nil {
max := 100
count := 0
mets := map[string][]models.MetricsSnapshot{}
for time.Now().Before(*end) {
go func() {
count++
book, metrics := p.Call(bookingID)
for k, v := range metrics {
if me, ok := mets[k]; !ok {
mets[k] = []models.MetricsSnapshot{v}
} else {
me = append(me, v)
mets[k] = me
}
}
encoder.Encode(metrics)
flusher.Flush()
if count == max {
if book.ExecutionMetrics == nil {
book.ExecutionMetrics = mets
} else {
for kk, vv := range mets {
if em, ok := book.ExecutionMetrics[kk]; !ok {
book.ExecutionMetrics[kk] = vv
} else {
em = append(em, vv...)
book.ExecutionMetrics[kk] = em
}
}
}
book.GetAccessor(nil).UpdateOne(book, bookingID)
}
}()
time.Sleep(interval)
}
} else {
// todo an anchor... detecting the end of a task.
}
}
// should add a datacenter... under juridiction... of opencloud...