Prometheus X Vector
This commit is contained in:
75
infrastructure/monitor/monitor.go
Normal file
75
infrastructure/monitor/monitor.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"oc-datacenter/conf"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/models"
|
||||
"cloud.o-forge.io/core/oc-lib/models/live"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources"
|
||||
"github.com/gorilla/websocket"
|
||||
)
|
||||
|
||||
// MonitorInterface is the contract every monitoring backend (Prometheus,
// Vector, ...) must satisfy: stream metric snapshots for the given booking
// over the websocket at the given interval, until ctx is cancelled or the
// booking expires. Implementations are selected via NewMonitorService.
type MonitorInterface interface {
	Stream(ctx context.Context, bookingID string, interval time.Duration, ws *websocket.Conn)
}
|
||||
|
||||
// _monitorService maps the configured monitor mode (conf.GetConfig().MonitorMode)
// to a factory producing the matching MonitorInterface implementation.
var _monitorService = map[string]func() MonitorInterface{
	"prometheus": func() MonitorInterface { return NewPrometheusService() },
	"vector":     func() MonitorInterface { return NewVectorService() },
}
|
||||
|
||||
func NewMonitorService() (MonitorInterface, error) {
|
||||
service, ok := _monitorService[conf.GetConfig().MonitorMode]
|
||||
if !ok {
|
||||
return nil, errors.New("monitor service not found")
|
||||
}
|
||||
return service(), nil
|
||||
}
|
||||
|
||||
func Call(book *booking.Booking,
|
||||
f func(*live.LiveDatacenter, *resources.ComputeResourceInstance,
|
||||
map[string]models.MetricsSnapshot, *sync.WaitGroup, *sync.Mutex)) (*booking.Booking, map[string]models.MetricsSnapshot) {
|
||||
logger := oclib.GetLogger()
|
||||
metrics := map[string]models.MetricsSnapshot{}
|
||||
var wg sync.WaitGroup
|
||||
var mu sync.Mutex
|
||||
|
||||
cUAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.LIVE_DATACENTER), nil)
|
||||
cRAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.COMPUTE_RESOURCE), nil)
|
||||
|
||||
rr := cRAccess.LoadOne(book.ResourceID)
|
||||
if rr.Err != "" {
|
||||
logger.Err(fmt.Errorf("can't proceed because of unfound resource %s : %s", book.ResourceID, rr.Err))
|
||||
return book, metrics
|
||||
}
|
||||
computeRes := rr.ToComputeResource()
|
||||
for _, instance := range computeRes.Instances {
|
||||
res := cUAccess.Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"source": {{Operator: dbs.EQUAL.String(), Value: instance.Source}},
|
||||
"abstractlive.resources_id": {{Operator: dbs.EQUAL.String(), Value: computeRes.GetID()}},
|
||||
},
|
||||
}, "", false)
|
||||
if res.Err != "" {
|
||||
continue
|
||||
}
|
||||
for _, r := range res.Data {
|
||||
dc := r.(*live.LiveDatacenter)
|
||||
if dc.MonitorPath == "" {
|
||||
continue
|
||||
}
|
||||
wg.Add(1)
|
||||
go f(dc, instance, metrics, &wg, &mu)
|
||||
}
|
||||
}
|
||||
wg.Wait()
|
||||
return book, metrics
|
||||
}
|
||||
194
infrastructure/monitor/prometheus.go
Normal file
194
infrastructure/monitor/prometheus.go
Normal file
@@ -0,0 +1,194 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/models"
|
||||
"cloud.o-forge.io/core/oc-lib/models/live"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources"
|
||||
"github.com/gorilla/websocket"
|
||||
)
|
||||
|
||||
// PrometheusResponse mirrors the JSON envelope returned by the Prometheus
// HTTP API for an instant query (/api/v1/query). Only the fields this
// package reads are declared.
type PrometheusResponse struct {
	Status string `json:"status"`
	Data   struct {
		ResultType string `json:"resultType"`
		Result     []struct {
			Metric map[string]string `json:"metric"`
			Value  []interface{}     `json:"value"` // [timestamp, value]
		} `json:"result"`
	} `json:"data"`
}
|
||||
|
||||
// queriesMetrics lists the PromQL expressions polled on every tick for a
// booking: CPU, memory, filesystem usage, GPU utilisation, disk and network
// I/O rates, HTTP request rate and 5xx error ratio. Every "%s" placeholder
// is replaced with the execution namespace before the query is sent (note
// that some expressions contain the placeholder more than once).
var queriesMetrics = []string{
	"rate(container_cpu_usage_seconds_total{namespace=\"%s\"}[1m]) * 100",
	"container_memory_usage_bytes{namespace=\"%s\"}",
	"(container_fs_usage_bytes{namespace=\"%s\"}) / (container_fs_limit_bytes{namespace=\"%s\"}) * 100",
	"DCGM_FI_DEV_GPU_UTIL{namespace=\"%s\"}",
	"rate(container_fs_reads_bytes_total{namespace=\"%s\"}[1m])",
	"rate(container_fs_writes_bytes_total{namespace=\"%s\"}[1m])",
	"rate(container_network_receive_bytes_total{namespace=\"%s\"}[1m])",
	"rate(container_network_transmit_bytes_total{namespace=\"%s\"}[1m])",
	"rate(http_requests_total{namespace=\"%s\"}[1m])",
	"(rate(http_requests_total{status=~\"5..\", namespace=\"%s\"}[1m]) / rate(http_requests_total{namespace=\"%s\"}[1m])) * 100",
}
|
||||
|
||||
// httpClient is the shared client for all Prometheus queries; the timeout
// bounds each query so a stuck server cannot block a monitoring tick forever.
var httpClient = &http.Client{
	Timeout: 10 * time.Second,
}
|
||||
|
||||
// StreamRegistry manages cancellation of active monitoring streams by namespace.
var StreamRegistry = &streamRegistry{
	streams: map[string]context.CancelFunc{},
}

// streamRegistry is a mutex-guarded map from namespace to the cancel
// function of that namespace's currently active stream.
type streamRegistry struct {
	mu      sync.Mutex
	streams map[string]context.CancelFunc
}

// Register cancels any stream already running for namespace, then records
// and returns a fresh cancellable context for the new stream.
func (r *streamRegistry) Register(namespace string) context.Context {
	r.mu.Lock()
	defer r.mu.Unlock()
	if previous, running := r.streams[namespace]; running {
		previous()
	}
	ctx, stop := context.WithCancel(context.Background())
	r.streams[namespace] = stop
	return ctx
}

// Cancel stops the stream registered for namespace, if any, and forgets it.
// Cancelling an unknown namespace is a no-op.
func (r *streamRegistry) Cancel(namespace string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	stop, running := r.streams[namespace]
	if !running {
		return
	}
	stop()
	delete(r.streams, namespace)
}
|
||||
|
||||
// PrometheusService implements MonitorInterface by polling a Prometheus
// HTTP API. It is stateless; all per-stream state lives in Stream.
type PrometheusService struct{}

// NewPrometheusService returns a ready-to-use PrometheusService.
func NewPrometheusService() *PrometheusService {
	return new(PrometheusService)
}
|
||||
|
||||
func (p *PrometheusService) queryPrometheus(ctx context.Context, promURL string, expr string, namespace string) models.Metric {
|
||||
metric := models.Metric{Name: expr, Value: -1}
|
||||
query := strings.ReplaceAll(expr, "%s", namespace)
|
||||
reqURL := promURL + "/api/v1/query?query=" + url.QueryEscape(query)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil)
|
||||
if err != nil {
|
||||
metric.Error = err
|
||||
return metric
|
||||
}
|
||||
resp, err := httpClient.Do(req)
|
||||
if err != nil {
|
||||
metric.Error = err
|
||||
return metric
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
metric.Error = err
|
||||
return metric
|
||||
}
|
||||
var result PrometheusResponse
|
||||
if err = json.Unmarshal(body, &result); err != nil {
|
||||
metric.Error = err
|
||||
return metric
|
||||
}
|
||||
if len(result.Data.Result) > 0 && len(result.Data.Result[0].Value) == 2 {
|
||||
metric.Value, metric.Error = strconv.ParseFloat(fmt.Sprintf("%s", result.Data.Result[0].Value[1]), 64)
|
||||
}
|
||||
return metric
|
||||
}
|
||||
|
||||
func (p *PrometheusService) Stream(ctx context.Context, bookingID string, interval time.Duration, ws *websocket.Conn) {
|
||||
logger := oclib.GetLogger()
|
||||
max := 100
|
||||
count := 0
|
||||
mets := map[string][]models.MetricsSnapshot{}
|
||||
bAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.BOOKING), nil)
|
||||
book := bAccess.LoadOne(bookingID)
|
||||
if book.Err != "" {
|
||||
logger.Err(fmt.Errorf("stop because of empty : %s", book.Err))
|
||||
return
|
||||
}
|
||||
|
||||
isActive := func(e *booking.Booking) bool {
|
||||
if e.ExpectedEndDate == nil {
|
||||
return true
|
||||
}
|
||||
return time.Now().Before(*e.ExpectedEndDate)
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for isActive(book.Data.(*booking.Booking)) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
|
||||
b, metrics := Call(book.Data.(*booking.Booking),
|
||||
func(dc *live.LiveDatacenter, instance *resources.ComputeResourceInstance,
|
||||
metrics map[string]models.MetricsSnapshot,
|
||||
wg *sync.WaitGroup, mu *sync.Mutex) {
|
||||
defer wg.Done()
|
||||
for _, expr := range queriesMetrics {
|
||||
if mm, ok := metrics[instance.Name]; !ok {
|
||||
mu.Lock()
|
||||
metrics[instance.Name] = models.MetricsSnapshot{
|
||||
From: instance.Source,
|
||||
Metrics: []models.Metric{p.queryPrometheus(ctx, dc.MonitorPath, expr, book.Data.(*booking.Booking).ExecutionsID)},
|
||||
}
|
||||
mu.Unlock()
|
||||
} else {
|
||||
mu.Lock()
|
||||
mm.Metrics = append(mm.Metrics, p.queryPrometheus(ctx, dc.MonitorPath, expr, book.Data.(*booking.Booking).ExecutionsID))
|
||||
mu.Unlock()
|
||||
}
|
||||
}
|
||||
})
|
||||
_ = b
|
||||
count++
|
||||
|
||||
if ws != nil {
|
||||
if err := ws.WriteJSON(metrics); err != nil {
|
||||
logger.Err(fmt.Errorf("websocket write error: %w", err))
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if count < max {
|
||||
continue
|
||||
}
|
||||
|
||||
bk := book.Data.(*booking.Booking)
|
||||
if bk.ExecutionMetrics == nil {
|
||||
bk.ExecutionMetrics = mets
|
||||
} else {
|
||||
for kk, vv := range mets {
|
||||
bk.ExecutionMetrics[kk] = append(bk.ExecutionMetrics[kk], vv...)
|
||||
}
|
||||
}
|
||||
bk.GetAccessor(nil).UpdateOne(bk, bookingID)
|
||||
mets = map[string][]models.MetricsSnapshot{}
|
||||
count = 0
|
||||
}
|
||||
}
|
||||
153
infrastructure/monitor/vector.go
Normal file
153
infrastructure/monitor/vector.go
Normal file
@@ -0,0 +1,153 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/models"
|
||||
"cloud.o-forge.io/core/oc-lib/models/live"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources"
|
||||
"github.com/gorilla/websocket"
|
||||
)
|
||||
|
||||
// VectorMetric is the JSON shape of a single metric event emitted by a
// Vector pipeline. NOTE(review): not referenced elsewhere in this file —
// presumably documents the wire contract or is consumed by another package;
// confirm before removing.
type VectorMetric struct {
	Name      string            `json:"name"`
	Value     float64           `json:"value"`
	Labels    map[string]string `json:"labels"`
	Timestamp int64             `json:"timestamp"`
}
|
||||
|
||||
// VectorService implements MonitorInterface by reading metric events from
// Vector websocket endpoints and caching them per booking.
type VectorService struct {
	mu               sync.Mutex                          // guards ExecutionMetrics and sessions
	ExecutionMetrics map[string][]models.MetricsSnapshot // bookingID -> snapshots
	sessions         map[string]context.CancelFunc       // optional: WS clients
}

// NewVectorService returns a VectorService with its maps initialized
// (the zero value would panic on first map write).
func NewVectorService() *VectorService {
	return &VectorService{
		ExecutionMetrics: make(map[string][]models.MetricsSnapshot),
		sessions:         make(map[string]context.CancelFunc),
	}
}
|
||||
|
||||
// --- Connexion à un flux Vector en WS ---
|
||||
func (v *VectorService) ListenVector(ctx context.Context, b *booking.Booking, interval time.Duration, ws *websocket.Conn) error {
|
||||
max := 100
|
||||
count := 0
|
||||
mets := map[string][]models.MetricsSnapshot{}
|
||||
isActive := func() bool {
|
||||
if b.ExpectedEndDate == nil {
|
||||
return true
|
||||
}
|
||||
return time.Now().Before(*b.ExpectedEndDate)
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for isActive() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
}
|
||||
|
||||
bb, metrics := Call(b,
|
||||
func(dc *live.LiveDatacenter, instance *resources.ComputeResourceInstance, metrics map[string]models.MetricsSnapshot,
|
||||
wg *sync.WaitGroup, mu *sync.Mutex) {
|
||||
c, _, err := websocket.DefaultDialer.Dial(dc.MonitorPath, nil)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer c.Close()
|
||||
_, msg, err := c.ReadMessage()
|
||||
if err != nil {
|
||||
log.Println("vector ws read error:", err)
|
||||
return
|
||||
}
|
||||
|
||||
var m models.Metric
|
||||
if err := json.Unmarshal(msg, &m); err != nil {
|
||||
log.Println("json unmarshal error:", err)
|
||||
return
|
||||
}
|
||||
mu.Lock()
|
||||
if mm, ok := metrics[instance.Name]; !ok {
|
||||
metrics[instance.Name] = models.MetricsSnapshot{
|
||||
From: instance.Source,
|
||||
Metrics: []models.Metric{m},
|
||||
}
|
||||
} else {
|
||||
mm.Metrics = append(mm.Metrics, m)
|
||||
}
|
||||
|
||||
mu.Unlock()
|
||||
})
|
||||
_ = bb
|
||||
for k, v := range metrics {
|
||||
mets[k] = append(mets[k], v)
|
||||
}
|
||||
count++
|
||||
|
||||
if ws != nil {
|
||||
if err := ws.WriteJSON(metrics); err != nil {
|
||||
return fmt.Errorf("websocket write error: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
if count < max {
|
||||
continue
|
||||
}
|
||||
|
||||
if b.ExecutionMetrics == nil {
|
||||
b.ExecutionMetrics = mets
|
||||
} else {
|
||||
for kk, vv := range mets {
|
||||
b.ExecutionMetrics[kk] = append(b.ExecutionMetrics[kk], vv...)
|
||||
}
|
||||
}
|
||||
b.GetAccessor(nil).UpdateOne(b, b.GetID())
|
||||
mets = map[string][]models.MetricsSnapshot{}
|
||||
count = 0
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// --- Permet d'ajouter un front WebSocket pour recevoir les metrics live ---
|
||||
// --- Permet de récupérer le cache historique pour un booking ---
|
||||
func (v *VectorService) GetCache(bookingID string) []models.MetricsSnapshot {
|
||||
v.mu.Lock()
|
||||
defer v.mu.Unlock()
|
||||
snapshots := v.ExecutionMetrics[bookingID]
|
||||
return snapshots
|
||||
}
|
||||
|
||||
// --- Exemple d'intégration avec un booking ---
|
||||
func (v *VectorService) Stream(ctx context.Context, bookingID string, interval time.Duration, ws *websocket.Conn) {
|
||||
logger := oclib.GetLogger()
|
||||
go func() {
|
||||
bAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.BOOKING), nil)
|
||||
book := bAccess.LoadOne(bookingID)
|
||||
if book.Err != "" {
|
||||
logger.Err(fmt.Errorf("stop because of empty : %s", book.Err))
|
||||
return
|
||||
}
|
||||
b := book.ToBookings()
|
||||
if b == nil {
|
||||
logger.Err(fmt.Errorf("stop because of empty is not a booking"))
|
||||
return
|
||||
}
|
||||
if err := v.ListenVector(ctx, b, interval, ws); err != nil {
|
||||
log.Printf("Vector listen error for booking %s: %v\n", b.GetID(), err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
Reference in New Issue
Block a user