draft... metrics by booking...
This commit is contained in:
@@ -15,18 +15,27 @@ type Infrastructure interface {
|
||||
CreateServiceAccount(ctx context.Context, ns string) error
|
||||
CreateRoleBinding(ctx context.Context, ns string, roleBinding string, role string) error
|
||||
CreateRole(ctx context.Context, ns string, role string, groups [][]string, resources [][]string, verbs [][]string) error
|
||||
GetTargets(ctx context.Context) ([]string,error)
|
||||
GetTargets(ctx context.Context) ([]string, error)
|
||||
CreateAdmiraltySource(context context.Context, executionId string) ([]byte, error)
|
||||
CreateKubeconfigSecret(context context.Context, kubeconfig string, executionId string, peerId string) ([]byte, error)
|
||||
CreateKubeconfigSecret(context context.Context, kubeconfig string, executionId string, peerId string) ([]byte, error)
|
||||
GetKubeconfigSecret(context context.Context, executionId string, peerId string) ([]byte, error)
|
||||
CreateAdmiraltyTarget(context context.Context, executionId string, peerId string)([]byte,error)
|
||||
CreateAdmiraltyTarget(context context.Context, executionId string, peerId string) ([]byte, error)
|
||||
GetOneNode(context context.Context, executionID string, peerId string) (*v1.Node, error)
|
||||
CheckHealth() error
|
||||
}
|
||||
|
||||
var _service = map[string]func() (Infrastructure, error){
|
||||
"kubernetes": NewKubernetesService,
|
||||
}
|
||||
|
||||
func NewServiceByType(t string) (Infrastructure, error) {
|
||||
service, ok := _service[t]
|
||||
if !ok {
|
||||
return nil, errors.New("service not found")
|
||||
}
|
||||
return service()
|
||||
}
|
||||
|
||||
func NewService() (Infrastructure, error) {
|
||||
service, ok := _service[conf.GetConfig().Mode]
|
||||
if !ok {
|
||||
@@ -34,4 +43,3 @@ func NewService() (Infrastructure, error) {
|
||||
}
|
||||
return service()
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"fmt"
|
||||
"oc-datacenter/conf"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
authv1 "k8s.io/api/authentication/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
@@ -26,7 +27,7 @@ var gvrSources = schema.GroupVersionResource{Group: "multicluster.admiralty.io",
|
||||
var gvrTargets = schema.GroupVersionResource{Group: "multicluster.admiralty.io", Version: "v1alpha1", Resource: "targets"}
|
||||
|
||||
type KubernetesService struct {
|
||||
Set *kubernetes.Clientset
|
||||
Set *kubernetes.Clientset
|
||||
}
|
||||
|
||||
func NewDynamicClient() (*dynamic.DynamicClient, error) {
|
||||
@@ -59,7 +60,7 @@ func NewKubernetesService() (Infrastructure, error) {
|
||||
KeyData: []byte(conf.GetConfig().KubeData),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
// Create clientset
|
||||
clientset, err := kubernetes.NewForConfig(config)
|
||||
fmt.Println("NewForConfig", clientset, err)
|
||||
@@ -70,7 +71,6 @@ func NewKubernetesService() (Infrastructure, error) {
|
||||
return nil, errors.New("Error creating Kubernetes client: clientset is nil")
|
||||
}
|
||||
|
||||
|
||||
return &KubernetesService{
|
||||
Set: clientset,
|
||||
}, nil
|
||||
@@ -111,7 +111,7 @@ func (k *KubernetesService) CreateNamespace(ctx context.Context, ns string) erro
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: ns,
|
||||
Labels: map[string]string{
|
||||
"multicluster-scheduler":"enabled",
|
||||
"multicluster-scheduler": "enabled",
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -291,24 +291,24 @@ func (k *KubernetesService) CreateAdmiraltyTarget(context context.Context, execu
|
||||
fmt.Println("Target needs to be binded to a secret in namespace ", executionId)
|
||||
return nil, nil // Maybe we could create a wrapper for errors and add more info to have
|
||||
}
|
||||
|
||||
targetName := "target-" + getConcatenatedName(peerId,executionId)
|
||||
|
||||
targetName := "target-" + getConcatenatedName(peerId, executionId)
|
||||
target := map[string]interface{}{
|
||||
"apiVersion": "multicluster.admiralty.io/v1alpha1",
|
||||
"kind": "Target",
|
||||
"metadata": map[string]interface{}{
|
||||
"name": targetName,
|
||||
"namespace": executionId,
|
||||
"apiVersion": "multicluster.admiralty.io/v1alpha1",
|
||||
"kind": "Target",
|
||||
"metadata": map[string]interface{}{
|
||||
"name": targetName,
|
||||
"namespace": executionId,
|
||||
"labels": map[string]interface{}{
|
||||
"peer": peerId,
|
||||
},
|
||||
},
|
||||
"spec": map[string]interface{}{
|
||||
"kubeconfigSecret": map[string]string{
|
||||
"name" : "kube-secret-"+ getConcatenatedName(peerId, executionId),
|
||||
},
|
||||
"spec": map[string]interface{}{
|
||||
"kubeconfigSecret": map[string]string{
|
||||
"name": "kube-secret-" + getConcatenatedName(peerId, executionId),
|
||||
},
|
||||
},
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
res, err := dynamicClientApply(executionId, targetName, gvrTargets, context, target)
|
||||
if err != nil {
|
||||
@@ -327,22 +327,21 @@ func (k *KubernetesService) CreateAdmiraltyTarget(context context.Context, execu
|
||||
// This method is temporary to implement the use of Admiralty, but must be edited
|
||||
// to rather contact the oc-datacenter from the remote cluster to create the source
|
||||
// locally and retrieve the token for the serviceAccount
|
||||
func (k *KubernetesService) CreateAdmiraltySource(context context.Context,executionId string) ([]byte, error) {
|
||||
func (k *KubernetesService) CreateAdmiraltySource(context context.Context, executionId string) ([]byte, error) {
|
||||
|
||||
source := map[string]interface{}{
|
||||
"apiVersion": "multicluster.admiralty.io/v1alpha1",
|
||||
"kind": "Source",
|
||||
"metadata": map[string]interface{}{
|
||||
"name": "source-"+executionId,
|
||||
"namespace": executionId,
|
||||
},
|
||||
"spec": map[string]interface{}{
|
||||
"serviceAccountName": "sa-"+executionId,
|
||||
},
|
||||
}
|
||||
"apiVersion": "multicluster.admiralty.io/v1alpha1",
|
||||
"kind": "Source",
|
||||
"metadata": map[string]interface{}{
|
||||
"name": "source-" + executionId,
|
||||
"namespace": executionId,
|
||||
},
|
||||
"spec": map[string]interface{}{
|
||||
"serviceAccountName": "sa-" + executionId,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
res, err := dynamicClientApply(executionId, "source-" + executionId,gvrSources, context, source)
|
||||
res, err := dynamicClientApply(executionId, "source-"+executionId, gvrSources, context, source)
|
||||
if err != nil {
|
||||
return nil, errors.New("Error when trying to apply Source definition :" + err.Error())
|
||||
}
|
||||
@@ -361,14 +360,12 @@ func (k *KubernetesService) CreateKubeconfigSecret(context context.Context, kube
|
||||
return nil, err
|
||||
}
|
||||
|
||||
secretApplyConfig := apply.Secret("kube-secret-" + getConcatenatedName(peerId, executionId),
|
||||
executionId).
|
||||
WithData(map[string][]byte{
|
||||
"config": config,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
secretApplyConfig := apply.Secret("kube-secret-"+getConcatenatedName(peerId, executionId),
|
||||
executionId).
|
||||
WithData(map[string][]byte{
|
||||
"config": config,
|
||||
},
|
||||
)
|
||||
|
||||
// exists, err := k.GetKubeconfigSecret(context,executionId)
|
||||
// if err != nil {
|
||||
@@ -384,15 +381,14 @@ func (k *KubernetesService) CreateKubeconfigSecret(context context.Context, kube
|
||||
// _ = err
|
||||
// }
|
||||
|
||||
|
||||
resp, err := k.Set.CoreV1().
|
||||
Secrets(executionId).
|
||||
Apply(context,
|
||||
secretApplyConfig,
|
||||
metav1.ApplyOptions{
|
||||
FieldManager: "admiralty-manager",
|
||||
})
|
||||
|
||||
Secrets(executionId).
|
||||
Apply(context,
|
||||
secretApplyConfig,
|
||||
metav1.ApplyOptions{
|
||||
FieldManager: "admiralty-manager",
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
fmt.Println("Error while trying to contact API to get secret kube-secret-" + executionId)
|
||||
fmt.Println(err)
|
||||
@@ -411,7 +407,7 @@ func (k *KubernetesService) CreateKubeconfigSecret(context context.Context, kube
|
||||
func (k *KubernetesService) GetKubeconfigSecret(context context.Context, executionId string, peerId string) ([]byte, error) {
|
||||
resp, err := k.Set.CoreV1().
|
||||
Secrets(executionId).
|
||||
Get(context, "kube-secret-"+ getConcatenatedName(peerId, executionId), metav1.GetOptions{})
|
||||
Get(context, "kube-secret-"+getConcatenatedName(peerId, executionId), metav1.GetOptions{})
|
||||
|
||||
if err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
@@ -459,23 +455,22 @@ func dynamicClientApply(executionId string, resourceName string, resourceDefinit
|
||||
}
|
||||
|
||||
res, err := cli.Resource(resourceDefinition).
|
||||
Namespace(executionId).
|
||||
Apply(ctx,
|
||||
resourceName,
|
||||
&unstructured.Unstructured{Object: object},
|
||||
metav1.ApplyOptions{
|
||||
FieldManager: "kubectl-client-side-apply",
|
||||
},
|
||||
)
|
||||
Namespace(executionId).
|
||||
Apply(ctx,
|
||||
resourceName,
|
||||
&unstructured.Unstructured{Object: object},
|
||||
metav1.ApplyOptions{
|
||||
FieldManager: "kubectl-client-side-apply",
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
o, err := json.Marshal(object)
|
||||
fmt.Println("Error from k8s API when applying " + fmt.Sprint(string(o)) + " to " + gvrSources.String() + " : " , err)
|
||||
return nil,err
|
||||
fmt.Println("Error from k8s API when applying "+fmt.Sprint(string(o))+" to "+gvrSources.String()+" : ", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
||||
// We can add more info to the log with the content of resp if not nil
|
||||
resByte, err := json.Marshal(res)
|
||||
resByte, err := json.Marshal(res)
|
||||
if err != nil {
|
||||
// fmt.Println("Error trying to create a Source on remote cluster : ", err , " : ", res)
|
||||
return nil, err
|
||||
@@ -485,26 +480,40 @@ func dynamicClientApply(executionId string, resourceName string, resourceDefinit
|
||||
|
||||
}
|
||||
|
||||
func putCDRapiKube(client kubernetes.Clientset, ctx context.Context, path string, body []byte, params ...map[string]string) ([]byte, error){
|
||||
req := client.RESTClient().
|
||||
Post().
|
||||
AbsPath(path).
|
||||
Body(body)
|
||||
func (k *KubernetesService) CheckHealth() error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
for _, param := range params {
|
||||
for k, v := range param {
|
||||
req = req.Param(k, v)
|
||||
// Check API server connectivity
|
||||
_, err := k.Set.ServerVersion()
|
||||
if err != nil {
|
||||
return fmt.Errorf("API server unreachable: %v", err)
|
||||
}
|
||||
|
||||
// Check nodes status
|
||||
nodes, err := k.Set.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to list nodes: %v", err)
|
||||
}
|
||||
for _, node := range nodes.Items {
|
||||
for _, condition := range node.Status.Conditions {
|
||||
if condition.Type == "Ready" && condition.Status != "True" {
|
||||
return fmt.Errorf("node %s not ready", node.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resp, err := req.DoRaw(ctx)
|
||||
|
||||
// Optional: Check if all pods in kube-system are running
|
||||
pods, err := k.Set.CoreV1().Pods("kube-system").List(ctx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
fmt.Println("Error from k8s API when posting "+string(body)+" to "+path+" : ", err)
|
||||
return nil, err
|
||||
return fmt.Errorf("failed to list pods: %v", err)
|
||||
}
|
||||
|
||||
return resp, nil
|
||||
for _, pod := range pods.Items {
|
||||
if pod.Status.Phase != "Running" && pod.Status.Phase != "Succeeded" {
|
||||
return fmt.Errorf("pod %s in namespace kube-system is %s", pod.Name, pod.Status.Phase)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns the Kubernetes' Node object corresponding to the executionID if it exists on this host
|
||||
@@ -526,7 +535,7 @@ func (k *KubernetesService) GetOneNode(context context.Context, executionID stri
|
||||
}
|
||||
|
||||
for _, node := range res.Items {
|
||||
if isNode := strings.Contains(node.Name, "admiralty-"+ executionID +"-target-"+ concatenatedName + "-"); isNode {
|
||||
if isNode := strings.Contains(node.Name, "admiralty-"+executionID+"-target-"+concatenatedName+"-"); isNode {
|
||||
return &node, nil
|
||||
}
|
||||
}
|
||||
@@ -534,7 +543,6 @@ func (k *KubernetesService) GetOneNode(context context.Context, executionID stri
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
|
||||
// Returns a concatenation of the peerId and namespace in order for
|
||||
// kubernetes ressources to have a unique name, under 63 characters
|
||||
// and yet identify which peer they are created for
|
||||
|
||||
176
infrastructure/prometheus.go
Normal file
176
infrastructure/prometheus.go
Normal file
@@ -0,0 +1,176 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/models"
|
||||
"cloud.o-forge.io/core/oc-lib/models/compute_units"
|
||||
)
|
||||
|
||||
type MetricsSnapshot struct {
|
||||
From string `json:"origin"`
|
||||
Metrics []Metric `json:"metrics"`
|
||||
}
|
||||
|
||||
type Metric struct {
|
||||
Name string `json:"name"`
|
||||
Value float64 `json:"value"`
|
||||
Error error `json:"error"`
|
||||
}
|
||||
|
||||
type PrometheusResponse struct {
|
||||
Status string `json:"status"`
|
||||
Data struct {
|
||||
ResultType string `json:"resultType"`
|
||||
Result []struct {
|
||||
Metric map[string]string `json:"metric"`
|
||||
Value []interface{} `json:"value"` // [timestamp, value]
|
||||
} `json:"result"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
var queriesMetrics = []string{
|
||||
"rate(container_cpu_usage_seconds_total{namespace=\"%s\"}[1m]) * 100",
|
||||
"container_memory_usage_bytes{namespace=\"%s\"}",
|
||||
"(container_fs_usage_bytes{namespace=\"%s\"}) / (container_fs_limit_bytes{namespace=\"%s\"}) * 100",
|
||||
"DCGM_FI_DEV_GPU_UTIL{namespace=\"%s\"}",
|
||||
// "system_load_average",
|
||||
"rate(container_fs_reads_bytes_total{namespace=\"%s\"}[1m])",
|
||||
"rate(container_fs_writes_bytes_total{namespace=\"%s\"}[1m])",
|
||||
"rate(container_network_receive_bytes_total{namespace=\"%s\"}[1m])",
|
||||
"rate(container_network_transmit_bytes_total{namespace=\"%s\"}[1m])",
|
||||
// "system_network_latency_ms",
|
||||
"rate(http_requests_total{namespace=\"%s\"}[1m])",
|
||||
"(rate(http_requests_total{status=~\"5..\", namespace=\"%s\"}[1m]) / rate(http_requests_total{namespace=\"%s\"}[1m])) * 100",
|
||||
// "app_mean_time_to_repair_seconds",
|
||||
// "app_mean_time_between_failure_seconds",
|
||||
}
|
||||
|
||||
type PrometheusService struct {
|
||||
}
|
||||
|
||||
func NewPrometheusService() *PrometheusService {
|
||||
return &PrometheusService{}
|
||||
}
|
||||
|
||||
func (p *PrometheusService) queryPrometheus(promURL string, expr string, namespace string) models.Metric {
|
||||
metric := models.Metric{Name: expr, Value: -1}
|
||||
resp, err := http.Get(promURL + "/api/v1/query?query=" + url.QueryEscape(fmt.Sprintf(expr, namespace)))
|
||||
if err != nil {
|
||||
metric.Error = err
|
||||
} else {
|
||||
defer resp.Body.Close()
|
||||
if body, err := io.ReadAll(resp.Body); err == nil {
|
||||
var result PrometheusResponse
|
||||
if err = json.Unmarshal(body, &result); err == nil && len(result.Data.Result) > 0 && len(result.Data.Result[0].Value) == 2 {
|
||||
metric.Value, metric.Error = strconv.ParseFloat(fmt.Sprintf("%s", result.Data.Result[0].Value[1]), 64)
|
||||
}
|
||||
}
|
||||
}
|
||||
return metric
|
||||
}
|
||||
|
||||
func (p *PrometheusService) Call(bookingID string) (*booking.Booking, map[string]models.MetricsSnapshot) {
|
||||
var wg sync.WaitGroup
|
||||
|
||||
metrics := map[string]models.MetricsSnapshot{}
|
||||
// get all booking... from executions_id == namespace typed datacenter.
|
||||
bAccess := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", "", []string{}, nil)
|
||||
book := bAccess.LoadOne(bookingID)
|
||||
if book.Err != "" {
|
||||
fmt.Errorf("stop because of empty : %s", book.Err)
|
||||
return nil, metrics
|
||||
}
|
||||
cUAccess := oclib.NewRequest(oclib.LibDataEnum(oclib.COMPUTE_UNITS), "", "", []string{}, nil)
|
||||
cRAccess := oclib.NewRequest(oclib.LibDataEnum(oclib.COMPUTE_RESOURCE), "", "", []string{}, nil)
|
||||
|
||||
rr := cRAccess.LoadOne(book.Data.(*booking.Booking).ResourceID)
|
||||
if rr.Err != "" {
|
||||
fmt.Errorf("can't proceed because of unfound resource %s : %s", book.Data.(*booking.Booking).ResourceID, rr.Err)
|
||||
return book.Data.(*booking.Booking), metrics
|
||||
}
|
||||
computeRes := rr.ToComputeResource()
|
||||
for _, instance := range computeRes.Instances {
|
||||
res := cUAccess.Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"source": {{Operator: dbs.EQUAL.String(), Value: instance.Source}},
|
||||
},
|
||||
}, "", false)
|
||||
if res.Err != "" {
|
||||
continue
|
||||
}
|
||||
for _, r := range res.Data {
|
||||
// TODO watch out ... to not exec on an absent datacenter...
|
||||
if r.(*compute_units.ComputeUnits).MonitorPath == "" {
|
||||
continue
|
||||
}
|
||||
wg.Add(1)
|
||||
snapshot := models.MetricsSnapshot{From: instance.Source, Metrics: []models.Metric{}}
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for _, expr := range queriesMetrics {
|
||||
snapshot.Metrics = append(snapshot.Metrics,
|
||||
p.queryPrometheus(r.(*compute_units.ComputeUnits).MonitorPath, expr, book.Data.(*booking.Booking).ExecutionsID))
|
||||
}
|
||||
metrics[instance.Name] = snapshot
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
return book.Data.(*booking.Booking), metrics
|
||||
}
|
||||
|
||||
func (p *PrometheusService) Stream(bookingID string, end *time.Time, interval time.Duration, flusher http.Flusher, encoder *json.Encoder) {
|
||||
if end != nil {
|
||||
max := 100
|
||||
count := 0
|
||||
mets := map[string][]models.MetricsSnapshot{}
|
||||
for time.Now().Before(*end) {
|
||||
go func() {
|
||||
count++
|
||||
book, metrics := p.Call(bookingID)
|
||||
for k, v := range metrics {
|
||||
if me, ok := mets[k]; !ok {
|
||||
mets[k] = []models.MetricsSnapshot{v}
|
||||
} else {
|
||||
me = append(me, v)
|
||||
mets[k] = me
|
||||
}
|
||||
}
|
||||
encoder.Encode(metrics)
|
||||
flusher.Flush()
|
||||
if count == max {
|
||||
if book.ExecutionMetrics == nil {
|
||||
book.ExecutionMetrics = mets
|
||||
} else {
|
||||
for kk, vv := range mets {
|
||||
if em, ok := book.ExecutionMetrics[kk]; !ok {
|
||||
book.ExecutionMetrics[kk] = vv
|
||||
} else {
|
||||
em = append(em, vv...)
|
||||
book.ExecutionMetrics[kk] = em
|
||||
}
|
||||
}
|
||||
}
|
||||
book.GetAccessor(nil).UpdateOne(book, bookingID)
|
||||
}
|
||||
}()
|
||||
time.Sleep(interval)
|
||||
}
|
||||
} else {
|
||||
// todo an anchor... detecting the end of a task.
|
||||
}
|
||||
}
|
||||
|
||||
// should add a datacenter... under juridiction... of opencloud...
|
||||
Reference in New Issue
Block a user