Prometheus X Vector

2026-02-17 17:23:54 +01:00
parent c99f161a51
commit 3d27da3e7c
11 changed files with 480 additions and 234 deletions
@@ -0,0 +1,75 @@
+package monitor
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"oc-datacenter/conf"
+	"sync"
+	"time"
+
+	oclib "cloud.o-forge.io/core/oc-lib"
+	"cloud.o-forge.io/core/oc-lib/dbs"
+	"cloud.o-forge.io/core/oc-lib/models/booking"
+	"cloud.o-forge.io/core/oc-lib/models/common/models"
+	"cloud.o-forge.io/core/oc-lib/models/live"
+	"cloud.o-forge.io/core/oc-lib/models/resources"
+	"github.com/gorilla/websocket"
+)
+
+type MonitorInterface interface {
+	Stream(ctx context.Context, bookingID string, interval time.Duration, ws *websocket.Conn)
+}
+
+var _monitorService = map[string]func() MonitorInterface{
+	"prometheus": func() MonitorInterface { return NewPrometheusService() },
+	"vector":     func() MonitorInterface { return NewVectorService() },
+}
+
+func NewMonitorService() (MonitorInterface, error) {
+	service, ok := _monitorService[conf.GetConfig().MonitorMode]
+	if !ok {
+		return nil, errors.New("monitor service not found")
+	}
+	return service(), nil
+}
+
+func Call(book *booking.Booking,
+	f func(*live.LiveDatacenter, *resources.ComputeResourceInstance,
+		map[string]models.MetricsSnapshot, *sync.WaitGroup, *sync.Mutex)) (*booking.Booking, map[string]models.MetricsSnapshot) {
+	logger := oclib.GetLogger()
+	metrics := map[string]models.MetricsSnapshot{}
+	var wg sync.WaitGroup
+	var mu sync.Mutex
+
+	cUAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.LIVE_DATACENTER), nil)
+	cRAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.COMPUTE_RESOURCE), nil)
+
+	rr := cRAccess.LoadOne(book.ResourceID)
+	if rr.Err != "" {
+		logger.Err(fmt.Errorf("can't proceed because of unfound resource %s : %s", book.ResourceID, rr.Err))
+		return book, metrics
+	}
+	computeRes := rr.ToComputeResource()
+	for _, instance := range computeRes.Instances {
+		res := cUAccess.Search(&dbs.Filters{
+			And: map[string][]dbs.Filter{
+				"source":                    {{Operator: dbs.EQUAL.String(), Value: instance.Source}},
+				"abstractlive.resources_id": {{Operator: dbs.EQUAL.String(), Value: computeRes.GetID()}},
+			},
+		}, "", false)
+		if res.Err != "" {
+			continue
+		}
+		for _, r := range res.Data {
+			dc := r.(*live.LiveDatacenter)
+			if dc.MonitorPath == "" {
+				continue
+			}
+			wg.Add(1)
+			go f(dc, instance, metrics, &wg, &mu)
+		}
+	}
+	wg.Wait()
+	return book, metrics
+}
@@ -0,0 +1,194 @@
+package monitor
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	oclib "cloud.o-forge.io/core/oc-lib"
+	"cloud.o-forge.io/core/oc-lib/models/booking"
+	"cloud.o-forge.io/core/oc-lib/models/common/models"
+	"cloud.o-forge.io/core/oc-lib/models/live"
+	"cloud.o-forge.io/core/oc-lib/models/resources"
+	"github.com/gorilla/websocket"
+)
+
+type PrometheusResponse struct {
+	Status string `json:"status"`
+	Data   struct {
+		ResultType string `json:"resultType"`
+		Result     []struct {
+			Metric map[string]string `json:"metric"`
+			Value  []interface{}     `json:"value"` // [timestamp, value]
+		} `json:"result"`
+	} `json:"data"`
+}
+
+var queriesMetrics = []string{
+	"rate(container_cpu_usage_seconds_total{namespace=\"%s\"}[1m]) * 100",
+	"container_memory_usage_bytes{namespace=\"%s\"}",
+	"(container_fs_usage_bytes{namespace=\"%s\"}) / (container_fs_limit_bytes{namespace=\"%s\"}) * 100",
+	"DCGM_FI_DEV_GPU_UTIL{namespace=\"%s\"}",
+	"rate(container_fs_reads_bytes_total{namespace=\"%s\"}[1m])",
+	"rate(container_fs_writes_bytes_total{namespace=\"%s\"}[1m])",
+	"rate(container_network_receive_bytes_total{namespace=\"%s\"}[1m])",
+	"rate(container_network_transmit_bytes_total{namespace=\"%s\"}[1m])",
+	"rate(http_requests_total{namespace=\"%s\"}[1m])",
+	"(rate(http_requests_total{status=~\"5..\", namespace=\"%s\"}[1m]) / rate(http_requests_total{namespace=\"%s\"}[1m])) * 100",
+}
+
+var httpClient = &http.Client{
+	Timeout: 10 * time.Second,
+}
+
+// StreamRegistry manages cancellation of active monitoring streams by namespace.
+var StreamRegistry = &streamRegistry{
+	streams: map[string]context.CancelFunc{},
+}
+
+type streamRegistry struct {
+	mu      sync.Mutex
+	streams map[string]context.CancelFunc
+}
+
+func (r *streamRegistry) Register(namespace string) context.Context {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if cancel, ok := r.streams[namespace]; ok {
+		cancel()
+	}
+	ctx, cancel := context.WithCancel(context.Background())
+	r.streams[namespace] = cancel
+	return ctx
+}
+
+func (r *streamRegistry) Cancel(namespace string) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if cancel, ok := r.streams[namespace]; ok {
+		cancel()
+		delete(r.streams, namespace)
+	}
+}
+
+type PrometheusService struct {
+}
+
+func NewPrometheusService() *PrometheusService {
+	return &PrometheusService{}
+}
+
+func (p *PrometheusService) queryPrometheus(ctx context.Context, promURL string, expr string, namespace string) models.Metric {
+	metric := models.Metric{Name: expr, Value: -1}
+	query := strings.ReplaceAll(expr, "%s", namespace)
+	reqURL := promURL + "/api/v1/query?query=" + url.QueryEscape(query)
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil)
+	if err != nil {
+		metric.Error = err
+		return metric
+	}
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		metric.Error = err
+		return metric
+	}
+	defer resp.Body.Close()
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		metric.Error = err
+		return metric
+	}
+	var result PrometheusResponse
+	if err = json.Unmarshal(body, &result); err != nil {
+		metric.Error = err
+		return metric
+	}
+	if len(result.Data.Result) > 0 && len(result.Data.Result[0].Value) == 2 {
+		metric.Value, metric.Error = strconv.ParseFloat(fmt.Sprintf("%s", result.Data.Result[0].Value[1]), 64)
+	}
+	return metric
+}
+
+func (p *PrometheusService) Stream(ctx context.Context, bookingID string, interval time.Duration, ws *websocket.Conn) {
+	logger := oclib.GetLogger()
+	max := 100
+	count := 0
+	mets := map[string][]models.MetricsSnapshot{}
+	bAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.BOOKING), nil)
+	book := bAccess.LoadOne(bookingID)
+	if book.Err != "" {
+		logger.Err(fmt.Errorf("stop because of empty : %s", book.Err))
+		return
+	}
+
+	isActive := func(e *booking.Booking) bool {
+		if e.ExpectedEndDate == nil {
+			return true
+		}
+		return time.Now().Before(*e.ExpectedEndDate)
+	}
+
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	for isActive(book.Data.(*booking.Booking)) {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+		}
+
+		b, metrics := Call(book.Data.(*booking.Booking),
+			func(dc *live.LiveDatacenter, instance *resources.ComputeResourceInstance,
+				metrics map[string]models.MetricsSnapshot,
+				wg *sync.WaitGroup, mu *sync.Mutex) {
+				defer wg.Done()
+				for _, expr := range queriesMetrics {
+					if mm, ok := metrics[instance.Name]; !ok {
+						mu.Lock()
+						metrics[instance.Name] = models.MetricsSnapshot{
+							From:    instance.Source,
+							Metrics: []models.Metric{p.queryPrometheus(ctx, dc.MonitorPath, expr, book.Data.(*booking.Booking).ExecutionsID)},
+						}
+						mu.Unlock()
+					} else {
+						mu.Lock()
+						mm.Metrics = append(mm.Metrics, p.queryPrometheus(ctx, dc.MonitorPath, expr, book.Data.(*booking.Booking).ExecutionsID))
+						mu.Unlock()
+					}
+				}
+			})
+		_ = b
+		count++
+
+		if ws != nil {
+			if err := ws.WriteJSON(metrics); err != nil {
+				logger.Err(fmt.Errorf("websocket write error: %w", err))
+				return
+			}
+		}
+
+		if count < max {
+			continue
+		}
+
+		bk := book.Data.(*booking.Booking)
+		if bk.ExecutionMetrics == nil {
+			bk.ExecutionMetrics = mets
+		} else {
+			for kk, vv := range mets {
+				bk.ExecutionMetrics[kk] = append(bk.ExecutionMetrics[kk], vv...)
+			}
+		}
+		bk.GetAccessor(nil).UpdateOne(bk, bookingID)
+		mets = map[string][]models.MetricsSnapshot{}
+		count = 0
+	}
+}
@@ -0,0 +1,153 @@
+package monitor
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"sync"
+	"time"
+
+	oclib "cloud.o-forge.io/core/oc-lib"
+	"cloud.o-forge.io/core/oc-lib/models/booking"
+	"cloud.o-forge.io/core/oc-lib/models/common/models"
+	"cloud.o-forge.io/core/oc-lib/models/live"
+	"cloud.o-forge.io/core/oc-lib/models/resources"
+	"github.com/gorilla/websocket"
+)
+
+// --- Structure métrique Vector ---
+type VectorMetric struct {
+	Name      string            `json:"name"`
+	Value     float64           `json:"value"`
+	Labels    map[string]string `json:"labels"`
+	Timestamp int64             `json:"timestamp"`
+}
+
+// --- Service Vector ---
+type VectorService struct {
+	mu               sync.Mutex
+	ExecutionMetrics map[string][]models.MetricsSnapshot // bookingID -> snapshots
+	sessions         map[string]context.CancelFunc       // optional: WS clients
+}
+
+func NewVectorService() *VectorService {
+	return &VectorService{
+		ExecutionMetrics: make(map[string][]models.MetricsSnapshot),
+		sessions:         make(map[string]context.CancelFunc),
+	}
+}
+
+// --- Connexion à un flux Vector en WS ---
+func (v *VectorService) ListenVector(ctx context.Context, b *booking.Booking, interval time.Duration, ws *websocket.Conn) error {
+	max := 100
+	count := 0
+	mets := map[string][]models.MetricsSnapshot{}
+	isActive := func() bool {
+		if b.ExpectedEndDate == nil {
+			return true
+		}
+		return time.Now().Before(*b.ExpectedEndDate)
+	}
+
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	for isActive() {
+		select {
+		case <-ctx.Done():
+			return nil
+		case <-ticker.C:
+		}
+
+		bb, metrics := Call(b,
+			func(dc *live.LiveDatacenter, instance *resources.ComputeResourceInstance, metrics map[string]models.MetricsSnapshot,
+				wg *sync.WaitGroup, mu *sync.Mutex) {
+				c, _, err := websocket.DefaultDialer.Dial(dc.MonitorPath, nil)
+				if err != nil {
+					return
+				}
+				defer c.Close()
+				_, msg, err := c.ReadMessage()
+				if err != nil {
+					log.Println("vector ws read error:", err)
+					return
+				}
+
+				var m models.Metric
+				if err := json.Unmarshal(msg, &m); err != nil {
+					log.Println("json unmarshal error:", err)
+					return
+				}
+				mu.Lock()
+				if mm, ok := metrics[instance.Name]; !ok {
+					metrics[instance.Name] = models.MetricsSnapshot{
+						From:    instance.Source,
+						Metrics: []models.Metric{m},
+					}
+				} else {
+					mm.Metrics = append(mm.Metrics, m)
+				}
+
+				mu.Unlock()
+			})
+		_ = bb
+		for k, v := range metrics {
+			mets[k] = append(mets[k], v)
+		}
+		count++
+
+		if ws != nil {
+			if err := ws.WriteJSON(metrics); err != nil {
+				return fmt.Errorf("websocket write error: %w", err)
+			}
+		}
+
+		if count < max {
+			continue
+		}
+
+		if b.ExecutionMetrics == nil {
+			b.ExecutionMetrics = mets
+		} else {
+			for kk, vv := range mets {
+				b.ExecutionMetrics[kk] = append(b.ExecutionMetrics[kk], vv...)
+			}
+		}
+		b.GetAccessor(nil).UpdateOne(b, b.GetID())
+		mets = map[string][]models.MetricsSnapshot{}
+		count = 0
+	}
+
+	return nil
+}
+
+// --- Permet d'ajouter un front WebSocket pour recevoir les metrics live ---
+// --- Permet de récupérer le cache historique pour un booking ---
+func (v *VectorService) GetCache(bookingID string) []models.MetricsSnapshot {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	snapshots := v.ExecutionMetrics[bookingID]
+	return snapshots
+}
+
+// --- Exemple d'intégration avec un booking ---
+func (v *VectorService) Stream(ctx context.Context, bookingID string, interval time.Duration, ws *websocket.Conn) {
+	logger := oclib.GetLogger()
+	go func() {
+		bAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.BOOKING), nil)
+		book := bAccess.LoadOne(bookingID)
+		if book.Err != "" {
+			logger.Err(fmt.Errorf("stop because of empty : %s", book.Err))
+			return
+		}
+		b := book.ToBookings()
+		if b == nil {
+			logger.Err(fmt.Errorf("stop because of empty is not a booking"))
+			return
+		}
+		if err := v.ListenVector(ctx, b, interval, ws); err != nil {
+			log.Printf("Vector listen error for booking %s: %v\n", b.GetID(), err)
+		}
+	}()
+}