Refactor Oc-Scheduler
This commit is contained in:
137
infrastructure/api.go
Normal file
137
infrastructure/api.go
Normal file
@@ -0,0 +1,137 @@
|
||||
// Package infrastructure is the public façade for all scheduling sub-services.
|
||||
// Controllers and main.go import only this package; the sub-packages are
|
||||
// internal implementation details.
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"oc-scheduler/infrastructure/execution"
|
||||
"oc-scheduler/infrastructure/nats"
|
||||
"oc-scheduler/infrastructure/planner"
|
||||
"oc-scheduler/infrastructure/scheduler"
|
||||
"oc-scheduler/infrastructure/scheduling_resources"
|
||||
"oc-scheduler/infrastructure/session"
|
||||
"oc-scheduler/infrastructure/utils"
|
||||
"time"
|
||||
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Type re-exports
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
type WorkflowSchedule = scheduler.WorkflowSchedule
|
||||
type CheckResult = scheduler.CheckResult
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Bootstrap — called from main.go
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// ListenNATS starts NATS message handling (delegates to the nats sub-package).
func ListenNATS() { nats.ListenNATS() }

// InitSelfPlanner initialises this peer's own planner (delegates to planner).
func InitSelfPlanner() { planner.InitPlanner() }

// RecoverDraftExecutions re-processes executions left in draft state,
// e.g. after a restart (delegates to the execution sub-package).
func RecoverDraftExecutions() { execution.RecoverDraft() }

// WatchExecutions starts the background execution watcher (delegates to execution).
func WatchExecutions() { execution.WatchExecutions() }

// EmitNATS broadcasts a propagation message for peerID via NATS.
// (The "Propalgation"/"Propalgate" spellings come from the oc-lib API.)
func EmitNATS(peerID string, message tools.PropalgationMessage) {
	utils.Propalgate(peerID, message)
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Utilities
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// GetWorkflowPeerIDs returns the deduplicated peer IDs hosting the storage and
// compute resources of workflow wfID (delegates to infrastructure/utils).
func GetWorkflowPeerIDs(wfID string, req *tools.APIRequest) ([]string, error) {
	return utils.GetWorkflowPeerIDs(wfID, req)
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Planner subscriptions
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// SubscribePlannerUpdates subscribes to planner updates for the given peers.
// It returns the update channel and a cancel function the caller must invoke
// to release the subscription.
func SubscribePlannerUpdates(peerIDs []string) (<-chan string, func()) {
	return planner.GetPlannerService().SubscribePlannerUpdates(peerIDs...)
}

// SubscribeWorkflowUpdates subscribes to update notifications for a single
// workflow; the returned function releases the subscription.
func SubscribeWorkflowUpdates(wfID string) (<-chan struct{}, func()) {
	return planner.GetPlannerService().SubscribeWorkflowUpdates(wfID)
}

// RequestPlannerRefresh asks the planner service to refresh the given peers
// for session executionsID and returns a subset of peer IDs — presumably the
// ones this caller took refresh ownership of; confirm against the planner
// service implementation.
func RequestPlannerRefresh(peerIDs []string, executionsID string) []string {
	return planner.GetPlannerService().Refresh(peerIDs, executionsID)
}

// ReleaseRefreshOwnership releases refresh ownership previously acquired via
// RequestPlannerRefresh for the same peers/session.
func ReleaseRefreshOwnership(peerIDs []string, executionsID string) {
	planner.GetPlannerService().ReleaseRefreshOwnership(peerIDs, executionsID)
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Session management
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// UpsertSessionDrafts persists or updates the draft purchases, bookings and
// executions belonging to the scheduling session executionsID.
func UpsertSessionDrafts(
	executionsID string,
	execs []*workflow_execution.WorkflowExecution,
	purchases, bookings []scheduling_resources.SchedulerObject,
	req *tools.APIRequest,
) {
	svc := session.NewSessionExecutionsService(executionsID)
	svc.UpsertSessionDrafts(purchases, bookings, execs, req)
}

// CleanupSession discards the drafts of the scheduling session executionsID.
func CleanupSession(executionsID string, req *tools.APIRequest) {
	svc := session.NewSessionExecutionsService(executionsID)
	svc.CleanupSession(req)
}

// UnscheduleExecution cancels a previously scheduled execution
// (delegates to the execution sub-package).
func UnscheduleExecution(executionID string, req *tools.APIRequest) error {
	return execution.Unschedule(executionID, req)
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Schedule confirmation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Schedule confirms the draft bookings of an existing scheduling session
// (created beforehand by the Check stream) identified by ws.UUID.
//
// It validates that no session execution has already slipped into the past,
// confirms the session, arms a deadline watcher per execution, then attaches
// the workflow and its executions to the returned schedule.
//
// Returns the (possibly enriched) schedule, the workflow (nil when it could
// not be reloaded), the session executions, and an error.
func Schedule(
	ws *WorkflowSchedule,
	wfID string,
	req *tools.APIRequest,
) (*WorkflowSchedule, *workflow.Workflow, []*workflow_execution.WorkflowExecution, error) {
	if req == nil {
		return ws, nil, nil, fmt.Errorf("no request provided")
	}
	// The session UUID identifies the drafts created by Check; without it
	// there is nothing to confirm.
	if ws.UUID == "" {
		return ws, nil, nil, fmt.Errorf("no scheduling session: use the Check stream first")
	}

	svc := session.NewSessionExecutionsService(ws.UUID)

	// Abort the whole confirmation if any execution start date is already past.
	executions := svc.LoadSessionExecs()
	for _, exec := range executions {
		if !exec.ExecDate.IsZero() && exec.ExecDate.Before(time.Now().UTC()) {
			return ws, nil, nil, fmt.Errorf("execution %s is obsolete (start date in the past)", exec.GetID())
		}
	}

	if err := svc.ConfirmSession(req); err != nil {
		return ws, nil, nil, fmt.Errorf("confirm session failed: %w", err)
	}

	// Arm one deadline watcher per execution (fire-and-forget goroutines).
	for _, exec := range executions {
		go execution.WatchDeadline(exec.GetID(), exec.ExecutionsID, exec.ExecDate, req)
	}

	adminReq := &tools.APIRequest{Admin: true}
	// NOTE(review): the workflow is loaded with the caller's request but
	// updated with admin rights below — confirm this asymmetry is intended.
	obj, _, _ := workflow.NewAccessor(req).LoadOne(wfID)
	if obj == nil {
		// Session is confirmed even if the workflow cannot be reloaded;
		// report success without the workflow.
		return ws, nil, executions, nil
	}
	wf := obj.(*workflow.Workflow)
	ws.Workflow = wf
	ws.WorkflowExecution = executions
	// NOTE(review): UpdateOne result and error are discarded — confirm this
	// write is meant to be best-effort.
	wf.GetAccessor(adminReq).UpdateOne(wf.Serialize(wf), wf.GetID())
	return ws, wf, executions, nil
}
|
||||
@@ -1,343 +0,0 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking/planner"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Slot availability check
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const (
|
||||
checkWindowHours = 5 // how far ahead to scan for a free slot (hours)
|
||||
checkStepMin = 15 // time increment per scan step (minutes)
|
||||
// asapBuffer is the minimum lead time added to time.Now() for as_possible
|
||||
// and WHEN_POSSIBLE bookings. It absorbs NATS propagation + p2p stream
|
||||
// latency so the ExpectedStartDate never arrives already in the past at
|
||||
// the destination peer.
|
||||
asapBuffer = 2 * time.Minute
|
||||
)
|
||||
|
||||
// CheckResult holds the outcome of a slot availability check.
|
||||
type CheckResult struct {
|
||||
Available bool `json:"available"`
|
||||
Start time.Time `json:"start"`
|
||||
End *time.Time `json:"end,omitempty"`
|
||||
// NextSlot is the nearest free slot found within checkWindowHours when
|
||||
// the requested slot is unavailable, or the preferred (conflict-free) slot
|
||||
// when running in preemption mode.
|
||||
NextSlot *time.Time `json:"next_slot,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
// Preemptible is true when the check was run in preemption mode.
|
||||
Preemptible bool `json:"preemptible,omitempty"`
|
||||
// SchedulingID is the session identifier the client must supply to Schedule
|
||||
// in order to confirm the draft bookings created during this Check session.
|
||||
SchedulingID string `json:"scheduling_id,omitempty"`
|
||||
}
|
||||
|
||||
// bookingResource is the minimum info needed to verify a resource against the
|
||||
// planner cache.
|
||||
type bookingResource struct {
|
||||
id string // resource MongoDB _id
|
||||
peerPID string // peer public PeerID (PID) — PlannerCache key
|
||||
instanceID string // resolved from WorkflowSchedule.SelectedInstances
|
||||
}
|
||||
|
||||
// Check verifies that all booking-relevant resources (storage and compute) of
// the given workflow have capacity for the requested time slot.
//
//   - asap=true → ignore ws.Start, begin searching from time.Now() (plus a
//     small propagation buffer, see asapBuffer)
//   - preemption → always return Available=true but populate Warnings with
//     conflicts and NextSlot with the nearest conflict-free alternative
func (ws *WorkflowSchedule) Check(wfID string, asap bool, preemption bool, request *tools.APIRequest) (*CheckResult, error) {
	// 1. Load workflow
	obj, code, err := workflow.NewAccessor(request).LoadOne(wfID)
	if code != 200 || err != nil {
		msg := "could not load workflow " + wfID
		if err != nil {
			msg += ": " + err.Error()
		}
		return nil, errors.New(msg)
	}
	wf := obj.(*workflow.Workflow)

	// 2. Resolve start: ASAP (or an unset start) begins a little in the
	// future so the start date never arrives already in the past elsewhere.
	start := ws.Start
	if asap || start.IsZero() {
		start = time.Now().UTC().Add(asapBuffer)
	}

	// 3. Resolve end – use explicit end/duration or estimate via Planify.
	// end aliases ws.End; it is only read below, never written through.
	end := ws.End
	if end == nil {
		if ws.DurationS > 0 {
			e := start.Add(time.Duration(ws.DurationS * float64(time.Second)))
			end = &e
		} else {
			_, longest, _, _, planErr := wf.Planify(
				start, nil,
				ws.SelectedInstances, ws.SelectedPartnerships,
				ws.SelectedBuyings, ws.SelectedStrategies,
				int(ws.BookingMode), request,
			)
			// NOTE(review): Planify errors are swallowed; the check then runs
			// open-ended (end == nil) — confirm this is intended.
			if planErr == nil && longest > 0 {
				e := start.Add(time.Duration(longest) * time.Second)
				end = &e
			}
		}
	}

	// 4. Extract booking-relevant (storage + compute) resources from the graph,
	// resolving the selected instance for each resource.
	checkables := collectBookingResources(wf, ws.SelectedInstances)
	// 5. Check every resource against its peer's cached planner.
	unavailable, warnings := checkResourceAvailability(checkables, start, end)
	result := &CheckResult{
		Start:    start,
		End:      end,
		Warnings: warnings,
	}

	// 6. Preemption mode: mark as schedulable regardless of conflicts, but
	// surface warnings and the nearest conflict-free alternative.
	if preemption {
		result.Available = true
		result.Preemptible = true
		if len(unavailable) > 0 {
			result.NextSlot = findNextSlot(checkables, start, end, checkWindowHours)
		}
		return result, nil
	}

	// 7. All resources are free.
	if len(unavailable) == 0 {
		result.Available = true
		return result, nil
	}

	// 8. Slot unavailable – locate the nearest free slot within the window.
	result.Available = false
	result.NextSlot = findNextSlot(checkables, start, end, checkWindowHours)
	return result, nil
}
|
||||
|
||||
// collectBookingResources returns unique storage and compute resources from the
|
||||
// workflow graph. For each resource the selected instance ID is resolved from
|
||||
// selectedInstances (the scheduler's SelectedInstances ConfigItem) so the planner
|
||||
// check targets the exact instance chosen by the user.
|
||||
func collectBookingResources(wf *workflow.Workflow, selectedInstances workflow.ConfigItem) map[string]bookingResource {
|
||||
if wf.Graph == nil {
|
||||
return nil
|
||||
}
|
||||
seen := map[string]bool{}
|
||||
result := map[string]bookingResource{}
|
||||
|
||||
// Resolve MongoDB peer _id (DID) → public PeerID (PID) used as PlannerCache key.
|
||||
peerAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
|
||||
didToPID := map[string]string{}
|
||||
resolvePID := func(did string) string {
|
||||
if pid, ok := didToPID[did]; ok {
|
||||
return pid
|
||||
}
|
||||
if data := peerAccess.LoadOne(did); data.Data != nil {
|
||||
if p := data.ToPeer(); p != nil {
|
||||
didToPID[did] = p.PeerID
|
||||
return p.PeerID
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
resolveInstanceID := func(res interface {
|
||||
GetID() string
|
||||
GetCreatorID() string
|
||||
}) string {
|
||||
idx := selectedInstances.Get(res.GetID())
|
||||
switch r := res.(type) {
|
||||
case *resources.StorageResource:
|
||||
if inst := r.GetSelectedInstance(idx); inst != nil {
|
||||
return inst.GetID()
|
||||
}
|
||||
case *resources.ComputeResource:
|
||||
if inst := r.GetSelectedInstance(idx); inst != nil {
|
||||
return inst.GetID()
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
for _, item := range wf.GetGraphItems(wf.Graph.IsStorage) {
|
||||
i := item
|
||||
_, res := i.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
id := res.GetID()
|
||||
if seen[id] {
|
||||
continue
|
||||
}
|
||||
pid := resolvePID(res.GetCreatorID())
|
||||
if pid == "" {
|
||||
continue
|
||||
}
|
||||
seen[id] = true
|
||||
result[pid] = bookingResource{
|
||||
id: id,
|
||||
peerPID: pid,
|
||||
instanceID: resolveInstanceID(res),
|
||||
}
|
||||
}
|
||||
|
||||
for _, item := range wf.GetGraphItems(wf.Graph.IsCompute) {
|
||||
i := item
|
||||
_, res := i.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
id := res.GetID()
|
||||
if seen[id] {
|
||||
continue
|
||||
}
|
||||
pid := resolvePID(res.GetCreatorID())
|
||||
if pid == "" {
|
||||
continue
|
||||
}
|
||||
seen[id] = true
|
||||
result[pid] = bookingResource{
|
||||
id: id,
|
||||
peerPID: pid,
|
||||
instanceID: resolveInstanceID(res),
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// checkResourceAvailability returns the IDs of unavailable resources and
|
||||
// human-readable warning messages.
|
||||
// checkResourceAvailability checks every resource against its peer's cached
// planner for the [start, end] slot. It returns the IDs of unavailable
// resources and human-readable warning messages. A resource whose peer
// planner is not present in the cache is optimistically treated as available
// (a warning is recorded instead).
func checkResourceAvailability(res map[string]bookingResource, start time.Time, end *time.Time) (unavailable []string, warnings []string) {
	for _, r := range res {
		// Snapshot the cached planner entry under the read lock.
		plannerMu.RLock()
		entry := PlannerCache[r.peerPID]
		plannerMu.RUnlock()
		if entry == nil || entry.Planner == nil {
			warnings = append(warnings, fmt.Sprintf(
				"peer %s planner not in cache for resource %s – assuming available", r.peerPID, r.id))
			continue
		}
		if !checkInstance(entry.Planner, r.id, r.instanceID, start, end) {
			unavailable = append(unavailable, r.id)
			warnings = append(warnings, fmt.Sprintf(
				"resource %s is not available in [%s – %s]",
				r.id, start.Format(time.RFC3339), formatOptTime(end)))
		}
	}
	return
}
|
||||
|
||||
// checkInstance checks availability for the specific instance resolved by the
|
||||
// scheduler. When instanceID is empty (no instance selected / none resolvable),
|
||||
// it falls back to checking all instances known in the planner and returns true
|
||||
// if any one has remaining capacity. Returns true when no capacity is recorded.
|
||||
func checkInstance(p *planner.Planner, resourceID string, instanceID string, start time.Time, end *time.Time) bool {
|
||||
if instanceID != "" {
|
||||
return p.Check(resourceID, instanceID, nil, start, end)
|
||||
}
|
||||
// Fallback: accept if any known instance has free capacity
|
||||
caps, ok := p.Capacities[resourceID]
|
||||
if !ok || len(caps) == 0 {
|
||||
return true // no recorded usage → assume free
|
||||
}
|
||||
for id := range caps {
|
||||
if p.Check(resourceID, id, nil, start, end) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// findNextSlot scans forward from 'from' in checkStepMin increments for up to
|
||||
// windowH hours and returns the first candidate start time at which all
|
||||
// resources are simultaneously free.
|
||||
func findNextSlot(resources map[string]bookingResource, from time.Time, originalEnd *time.Time, windowH int) *time.Time {
|
||||
duration := time.Hour
|
||||
if originalEnd != nil {
|
||||
if d := originalEnd.Sub(from); d > 0 {
|
||||
duration = d
|
||||
}
|
||||
}
|
||||
step := time.Duration(checkStepMin) * time.Minute
|
||||
limit := from.Add(time.Duration(windowH) * time.Hour)
|
||||
for t := from.Add(step); t.Before(limit); t = t.Add(step) {
|
||||
e := t.Add(duration)
|
||||
if unavail, _ := checkResourceAvailability(resources, t, &e); len(unavail) == 0 {
|
||||
return &t
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func formatOptTime(t *time.Time) string {
|
||||
if t == nil {
|
||||
return "open"
|
||||
}
|
||||
return t.Format(time.RFC3339)
|
||||
}
|
||||
|
||||
// GetWorkflowPeerIDs loads the workflow and returns the deduplicated list of
|
||||
// creator peer IDs for all its storage and compute resources.
|
||||
// These are the peers whose planners must be watched by a check stream.
|
||||
func GetWorkflowPeerIDs(wfID string, request *tools.APIRequest) ([]string, error) {
|
||||
obj, code, err := workflow.NewAccessor(request).LoadOne(wfID)
|
||||
if code != 200 || err != nil {
|
||||
msg := "could not load workflow " + wfID
|
||||
if err != nil {
|
||||
msg += ": " + err.Error()
|
||||
}
|
||||
return nil, errors.New(msg)
|
||||
}
|
||||
wf := obj.(*workflow.Workflow)
|
||||
if wf.Graph == nil {
|
||||
return nil, nil
|
||||
}
|
||||
seen := map[string]bool{}
|
||||
var peerIDs []string
|
||||
for _, item := range wf.GetGraphItems(wf.Graph.IsStorage) {
|
||||
i := item
|
||||
_, res := i.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
if id := res.GetCreatorID(); id != "" && !seen[id] {
|
||||
seen[id] = true
|
||||
peerIDs = append(peerIDs, id)
|
||||
}
|
||||
}
|
||||
for _, item := range wf.GetGraphItems(wf.Graph.IsCompute) {
|
||||
i := item
|
||||
_, res := i.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
if id := res.GetCreatorID(); id != "" && !seen[id] {
|
||||
seen[id] = true
|
||||
peerIDs = append(peerIDs, id)
|
||||
}
|
||||
}
|
||||
realPeersID := []string{}
|
||||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(tools.PEER), nil)
|
||||
for _, id := range peerIDs {
|
||||
if data := access.LoadOne(id); data.Data != nil {
|
||||
realPeersID = append(realPeersID, data.ToPeer().PeerID)
|
||||
}
|
||||
}
|
||||
return realPeersID, nil
|
||||
}
|
||||
@@ -1,197 +0,0 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"oc-scheduler/infrastructure/scheduling"
|
||||
)
|
||||
|
||||
type executionConsidersPayload struct {
|
||||
ID string `json:"id"`
|
||||
ExecutionsID string `json:"executions_id"`
|
||||
ExecutionID string `json:"execution_id"`
|
||||
PeerIDs []string `json:"peer_ids"`
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-execution mutex map (replaces the global stateMu)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
var execLocksMu sync.RWMutex
|
||||
var execLocks = map[string]*sync.Mutex{} // executionID → per-execution mutex
|
||||
|
||||
// RegisterExecLock creates a mutex entry for the execution. Called when a new execution draft is persisted.
|
||||
func RegisterExecLock(executionID string) {
|
||||
execLocksMu.Lock()
|
||||
execLocks[executionID] = &sync.Mutex{}
|
||||
execLocksMu.Unlock()
|
||||
}
|
||||
|
||||
// UnregisterExecLock removes the mutex entry. Called on unschedule and execution deletion.
|
||||
func UnregisterExecLock(executionID string) {
|
||||
execLocksMu.Lock()
|
||||
delete(execLocks, executionID)
|
||||
execLocksMu.Unlock()
|
||||
}
|
||||
|
||||
// applyConsidersLocal applies the considers update directly for a confirmed
|
||||
// booking or purchase (bypasses NATS since updateExecutionState resolves the
|
||||
// execution from the resource itself).
|
||||
func applyConsidersLocal(id string, dt tools.DataType) {
|
||||
payload, err := json.Marshal(&executionConsidersPayload{ID: id})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
updateExecutionState(payload, dt)
|
||||
}
|
||||
|
||||
// EmitConsidersExecution broadcasts a Considers / WORKFLOW_EXECUTION message to all
|
||||
// storage and compute peers of wf once the execution has transitioned to SCHEDULED.
|
||||
// Each receiving peer will use it to confirm (IsDraft=false) their local drafts.
|
||||
func EmitConsidersExecution(exec *workflow_execution.WorkflowExecution, wf *workflow.Workflow) {
|
||||
if wf == nil || wf.Graph == nil {
|
||||
return
|
||||
}
|
||||
peerIDs, err := GetWorkflowPeerIDs(wf.GetID(), &tools.APIRequest{Admin: true})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if len(peerIDs) == 0 {
|
||||
return
|
||||
}
|
||||
payload, err := json.Marshal(executionConsidersPayload{
|
||||
ID: exec.GetID(),
|
||||
ExecutionID: exec.GetID(),
|
||||
ExecutionsID: exec.ExecutionsID,
|
||||
PeerIDs: peerIDs})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
b, err := json.Marshal(tools.PropalgationMessage{
|
||||
DataType: int(tools.WORKFLOW_EXECUTION),
|
||||
Action: tools.PB_CONSIDERS,
|
||||
Payload: payload,
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-scheduler",
|
||||
Datatype: tools.WORKFLOW_EXECUTION,
|
||||
Method: int(tools.PROPALGATION_EVENT),
|
||||
Payload: b,
|
||||
})
|
||||
}
|
||||
|
||||
// updateExecutionState sets BookingsState[id]=true (dt==BOOKING) or
|
||||
// PurchasesState[id]=true (dt==PURCHASE_RESOURCE) on the target execution.
|
||||
// payload must be JSON-encoded {"id":"...", "execution_id":"..."}.
|
||||
func updateExecutionState(payload []byte, dt tools.DataType) {
|
||||
var data executionConsidersPayload
|
||||
if err := json.Unmarshal(payload, &data); err != nil || data.ID == "" {
|
||||
return
|
||||
}
|
||||
schdata := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).LoadOne(data.ID)
|
||||
if schdata.Data == nil {
|
||||
return
|
||||
}
|
||||
sch := scheduling.ToSchedulerObject(dt, schdata.Data)
|
||||
if sch == nil {
|
||||
return
|
||||
}
|
||||
execID := sch.GetExecutionId()
|
||||
|
||||
execLocksMu.RLock()
|
||||
mu := execLocks[execID]
|
||||
execLocksMu.RUnlock()
|
||||
if mu == nil {
|
||||
fmt.Printf("updateExecutionState: no lock for execution %s, skipping\n", execID)
|
||||
return
|
||||
}
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(execID)
|
||||
if err != nil || res == nil {
|
||||
fmt.Printf("updateExecutionState: could not load execution %s: %v\n", data.ExecutionID, err)
|
||||
return
|
||||
}
|
||||
|
||||
exec := res.(*workflow_execution.WorkflowExecution)
|
||||
fmt.Println("sch.GetExecutionId()", data.ID, exec.BookingsState)
|
||||
|
||||
switch dt {
|
||||
case tools.BOOKING:
|
||||
if exec.BookingsState == nil {
|
||||
exec.BookingsState = map[string]bool{}
|
||||
}
|
||||
exec.BookingsState[data.ID] = true
|
||||
fmt.Println("sch.GetExecutionId()", data.ID)
|
||||
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
if exec.PurchasesState == nil {
|
||||
exec.PurchasesState = map[string]bool{}
|
||||
}
|
||||
exec.PurchasesState[data.ID] = true
|
||||
}
|
||||
allConfirmed := true
|
||||
for _, st := range exec.BookingsState {
|
||||
if !st {
|
||||
allConfirmed = false
|
||||
break
|
||||
}
|
||||
}
|
||||
for _, st := range exec.PurchasesState {
|
||||
if !st {
|
||||
allConfirmed = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if allConfirmed {
|
||||
exec.State = enum.SCHEDULED
|
||||
exec.IsDraft = false
|
||||
}
|
||||
if _, _, err := utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq)); err != nil {
|
||||
fmt.Printf("updateExecutionState: could not update execution %s: %v\n", sch.GetExecutionId(), err)
|
||||
return
|
||||
}
|
||||
if allConfirmed {
|
||||
// Confirm the order and notify all peers that execution is scheduled.
|
||||
go confirmSessionOrder(exec.ExecutionsID, adminReq)
|
||||
obj, _, err := workflow.NewAccessor(adminReq).LoadOne(exec.WorkflowID)
|
||||
if err == nil && obj != nil {
|
||||
go EmitConsidersExecution(exec, obj.(*workflow.Workflow))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// confirmExecutionDrafts is called when a Considers/WORKFLOW_EXECUTION message
|
||||
// is received from oc-discovery, meaning the originating peer has confirmed the
|
||||
// execution as SCHEDULED. For every booking and purchase ID listed in the
|
||||
// execution's states, we confirm the local draft (IsDraft=false).
|
||||
func confirmExecutionDrafts(payload []byte) {
|
||||
var data executionConsidersPayload
|
||||
if err := json.Unmarshal(payload, &data); err != nil {
|
||||
fmt.Printf("confirmExecutionDrafts: could not parse payload: %v\n", err)
|
||||
return
|
||||
}
|
||||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(tools.WORKFLOW_EXECUTION), nil)
|
||||
d := access.LoadOne(data.ExecutionID)
|
||||
if exec := d.ToWorkflowExecution(); exec != nil {
|
||||
for id := range exec.BookingsState {
|
||||
go confirmResource(id, tools.BOOKING)
|
||||
}
|
||||
for id := range exec.PurchasesState {
|
||||
go confirmResource(id, tools.PURCHASE_RESOURCE)
|
||||
}
|
||||
}
|
||||
}
|
||||
508
infrastructure/execution/execution.go
Normal file
508
infrastructure/execution/execution.go
Normal file
@@ -0,0 +1,508 @@
|
||||
package execution
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"oc-scheduler/conf"
|
||||
"oc-scheduler/infrastructure/planner"
|
||||
"oc-scheduler/infrastructure/scheduling_resources"
|
||||
infUtils "oc-scheduler/infrastructure/utils"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
"cloud.o-forge.io/core/oc-lib/models/order"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"go.mongodb.org/mongo-driver/bson/primitive"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Global execution lock registry
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
var execLocksMu sync.RWMutex
|
||||
var execLocks = map[string]*sync.Mutex{}
|
||||
|
||||
func RegisterExecLock(executionID string) {
|
||||
execLocksMu.Lock()
|
||||
execLocks[executionID] = &sync.Mutex{}
|
||||
execLocksMu.Unlock()
|
||||
}
|
||||
|
||||
func UnregisterExecLock(executionID string) {
|
||||
execLocksMu.Lock()
|
||||
delete(execLocks, executionID)
|
||||
execLocksMu.Unlock()
|
||||
}
|
||||
|
||||
func GetExecLock(executionID string) *sync.Mutex {
|
||||
execLocksMu.RLock()
|
||||
mu := execLocks[executionID]
|
||||
execLocksMu.RUnlock()
|
||||
return mu
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Considers payload
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
type ConsidersPayload struct {
|
||||
ID string `json:"id"`
|
||||
ExecutionsID string `json:"executions_id"`
|
||||
ExecutionID string `json:"execution_id"`
|
||||
PeerIDs []string `json:"peer_ids"`
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Execution state machine — considers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func UpdateExecutionState(payload []byte, dt tools.DataType) {
|
||||
var data ConsidersPayload
|
||||
if err := json.Unmarshal(payload, &data); err != nil || data.ID == "" {
|
||||
return
|
||||
}
|
||||
schdata := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).LoadOne(data.ID)
|
||||
if schdata.Data == nil {
|
||||
return
|
||||
}
|
||||
sch := scheduling_resources.ToSchedulerObject(dt, schdata.Data)
|
||||
if sch == nil {
|
||||
return
|
||||
}
|
||||
execID := sch.GetExecutionId()
|
||||
|
||||
mu := GetExecLock(execID)
|
||||
if mu == nil {
|
||||
fmt.Printf("UpdateExecutionState: no lock for execution %s, skipping\n", execID)
|
||||
return
|
||||
}
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(execID)
|
||||
if err != nil || res == nil {
|
||||
fmt.Printf("UpdateExecutionState: could not load execution %s: %v\n", execID, err)
|
||||
return
|
||||
}
|
||||
exec := res.(*workflow_execution.WorkflowExecution)
|
||||
|
||||
switch dt {
|
||||
case tools.BOOKING:
|
||||
if exec.BookingsState == nil {
|
||||
exec.BookingsState = map[string]bool{}
|
||||
}
|
||||
exec.BookingsState[data.ID] = true
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
if exec.PurchasesState == nil {
|
||||
exec.PurchasesState = map[string]bool{}
|
||||
}
|
||||
exec.PurchasesState[data.ID] = true
|
||||
}
|
||||
|
||||
allConfirmed := true
|
||||
for _, st := range exec.BookingsState {
|
||||
if !st {
|
||||
allConfirmed = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if allConfirmed {
|
||||
for _, st := range exec.PurchasesState {
|
||||
if !st {
|
||||
allConfirmed = false
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if allConfirmed {
|
||||
exec.State = enum.SCHEDULED
|
||||
exec.IsDraft = false
|
||||
}
|
||||
if _, _, err := utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq)); err != nil {
|
||||
fmt.Printf("UpdateExecutionState: could not update execution %s: %v\n", execID, err)
|
||||
return
|
||||
}
|
||||
if allConfirmed {
|
||||
go confirmSessionOrder(exec.ExecutionsID, adminReq)
|
||||
obj, _, err := workflow.NewAccessor(adminReq).LoadOne(exec.WorkflowID)
|
||||
if err == nil && obj != nil {
|
||||
go EmitConsidersExecution(exec, obj.(*workflow.Workflow))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// confirmSessionOrder marks every order attached to the executions session as
// confirmed (IsDraft=false). Called once all bookings and purchases of an
// execution have been confirmed.
func confirmSessionOrder(executionsID string, adminReq *tools.APIRequest) {
	// Search errors are ignored: a failed search yields no results and the
	// function is a no-op.
	results, _, _ := order.NewAccessor(adminReq).Search(
		&dbs.Filters{And: map[string][]dbs.Filter{
			"executions_id": {{Operator: dbs.EQUAL.String(), Value: executionsID}},
		}}, "", true)
	for _, obj := range results {
		if o, ok := obj.(*order.Order); ok {
			o.IsDraft = false
			// NOTE(review): the update error is discarded — confirm this
			// confirmation is meant to be best-effort.
			utils.GenericRawUpdateOne(o, o.GetID(), order.NewAccessor(adminReq))
		}
	}
}
|
||||
|
||||
func ConfirmExecutionDrafts(payload []byte) {
|
||||
var data ConsidersPayload
|
||||
if err := json.Unmarshal(payload, &data); err != nil {
|
||||
fmt.Printf("ConfirmExecutionDrafts: could not parse payload: %v\n", err)
|
||||
return
|
||||
}
|
||||
d := oclib.NewRequestAdmin(oclib.LibDataEnum(tools.WORKFLOW_EXECUTION), nil).LoadOne(data.ExecutionID)
|
||||
if exec := d.ToWorkflowExecution(); exec != nil {
|
||||
for id := range exec.BookingsState {
|
||||
go scheduling_resources.Confirm(id, tools.BOOKING)
|
||||
}
|
||||
for id := range exec.PurchasesState {
|
||||
go scheduling_resources.Confirm(id, tools.PURCHASE_RESOURCE)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func EmitConsidersExecution(exec *workflow_execution.WorkflowExecution, wf *workflow.Workflow) {
|
||||
if wf == nil || wf.Graph == nil {
|
||||
return
|
||||
}
|
||||
peerIDs, err := infUtils.GetWorkflowPeerIDs(wf.GetID(), &tools.APIRequest{Admin: true})
|
||||
if err != nil || len(peerIDs) == 0 {
|
||||
return
|
||||
}
|
||||
payload, err := json.Marshal(ConsidersPayload{
|
||||
ID: exec.GetID(),
|
||||
ExecutionID: exec.GetID(),
|
||||
ExecutionsID: exec.ExecutionsID,
|
||||
PeerIDs: peerIDs,
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
b, err := json.Marshal(tools.PropalgationMessage{
|
||||
DataType: int(tools.WORKFLOW_EXECUTION),
|
||||
Action: tools.PB_CONSIDERS,
|
||||
Payload: payload,
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-scheduler",
|
||||
Datatype: tools.WORKFLOW_EXECUTION,
|
||||
Method: int(tools.PROPALGATION_EVENT),
|
||||
Payload: b,
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Deadline watchers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func WatchDeadline(executionID string, ns string, execDate time.Time, request *tools.APIRequest) {
|
||||
delay := time.Until(execDate.UTC().Add(-1 * time.Minute))
|
||||
if delay <= 0 {
|
||||
go handleDeadline(executionID, ns, request)
|
||||
return
|
||||
}
|
||||
time.AfterFunc(delay, func() { handleDeadline(executionID, ns, request) })
|
||||
}
|
||||
|
||||
// handleDeadline runs one minute before an execution's start date. Draft
// executions (never fully confirmed) are unscheduled and purged; confirmed
// ones get their Kubernetes namespace provisioned and an end-of-execution
// watcher armed.
func handleDeadline(executionID string, ns string, request *tools.APIRequest) {
	res, _, err := workflow_execution.NewAccessor(&tools.APIRequest{Admin: true}).LoadOne(executionID)
	if err != nil || res == nil {
		fmt.Printf("handleDeadline: execution %s not found\n", executionID)
		return
	}
	adminReq := &tools.APIRequest{Admin: true}
	exec := res.(*workflow_execution.WorkflowExecution)
	if exec.IsDraft {
		// Still a draft at the deadline: nothing will run — free the
		// bookings and delete the execution record.
		Unschedule(executionID, request)
		workflow_execution.NewAccessor(adminReq).DeleteOne(executionID)
		fmt.Printf("handleDeadline: purged draft execution %s\n", executionID)
		return
	}
	// Provision the execution namespace. "already exists" is fine — a prior
	// run (or retry) may have created it; any other error is only logged and
	// does not block arming the end watcher.
	if serv, err := tools.NewKubernetesService(
		conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort,
		conf.GetConfig().KubeCA, conf.GetConfig().KubeCert, conf.GetConfig().KubeData); err != nil {
		fmt.Printf("handleDeadline: k8s init failed for %s: %v\n", executionID, err)
	} else if err := serv.ProvisionExecutionNamespace(context.Background(), ns); err != nil &&
		!strings.Contains(err.Error(), "already exists") {
		fmt.Printf("handleDeadline: failed to provision namespace %s: %v\n", ns, err)
	}
	// Schedule namespace teardown at the execution's end date.
	go watchEnd(executionID, ns, exec.EndDate, exec.ExecDate)
}
|
||||
|
||||
func watchEnd(executionID string, ns string, endDate *time.Time, execDate time.Time) {
|
||||
var end time.Time
|
||||
if endDate != nil {
|
||||
end = *endDate
|
||||
} else {
|
||||
end = execDate.UTC().Add(5 * time.Minute)
|
||||
}
|
||||
fire := func() {
|
||||
serv, err := tools.NewKubernetesService(
|
||||
conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort,
|
||||
conf.GetConfig().KubeCA, conf.GetConfig().KubeCert, conf.GetConfig().KubeData)
|
||||
if err != nil {
|
||||
fmt.Printf("watchEnd: k8s init failed for %s: %v\n", executionID, err)
|
||||
return
|
||||
}
|
||||
if err := serv.TeardownExecutionNamespace(context.Background(), ns); err != nil {
|
||||
fmt.Printf("watchEnd: failed to teardown namespace %s: %v\n", ns, err)
|
||||
}
|
||||
}
|
||||
if delay := time.Until(end.UTC()); delay <= 0 {
|
||||
go fire()
|
||||
} else {
|
||||
time.AfterFunc(delay, fire)
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Unschedule / Recovery
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Unschedule tears down an execution: every booking referenced by its
// PeerBookByGraph map is deleted through the scheduling-resources service,
// then the execution record itself is removed and its lock released.
// Returns an error only when the execution cannot be loaded.
func Unschedule(executionID string, request *tools.APIRequest) error {
	adminReq := &tools.APIRequest{Admin: true}
	res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(executionID)
	if err != nil || res == nil {
		return fmt.Errorf("execution %s not found: %w", executionID, err)
	}
	exec := res.(*workflow_execution.WorkflowExecution)
	// PeerBookByGraph: graph element → resource → booking IDs.
	for _, byResource := range exec.PeerBookByGraph {
		for _, bookingIDs := range byResource {
			for _, bkID := range bookingIDs {
				bkRes, _, loadErr := booking.NewAccessor(adminReq).LoadOne(bkID)
				if loadErr != nil || bkRes == nil {
					// Booking already gone — nothing to release.
					continue
				}
				// Delete runs with the caller's request (not adminReq) so the
				// service can apply the caller's authorization context.
				scheduling_resources.GetService().Delete(
					tools.BOOKING,
					scheduling_resources.ToSchedulerObject(tools.BOOKING, bkRes),
					request,
				)
			}
		}
	}
	workflow_execution.NewAccessor(adminReq).DeleteOne(executionID)
	UnregisterExecLock(executionID)
	return nil
}
|
||||
|
||||
func RecoverDraft() {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
results, _, _ := workflow_execution.NewAccessor(adminReq).Search(nil, "*", true)
|
||||
for _, obj := range results {
|
||||
exec, ok := obj.(*workflow_execution.WorkflowExecution)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
RegisterExecLock(exec.GetID())
|
||||
go WatchDeadline(exec.GetID(), exec.ExecutionsID, exec.ExecDate, adminReq)
|
||||
}
|
||||
fmt.Printf("RecoverDraft: recovered %d executions\n", len(results))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// NATS workflow lifecycle handlers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func HandleWorkflowStarted(resp tools.NATSResponse) {
|
||||
var evt tools.WorkflowLifecycleEvent
|
||||
if err := json.Unmarshal(resp.Payload, &evt); err != nil {
|
||||
return
|
||||
}
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(evt.ExecutionID)
|
||||
if err != nil || res == nil {
|
||||
return
|
||||
}
|
||||
exec := res.(*workflow_execution.WorkflowExecution)
|
||||
exec.State = enum.STARTED
|
||||
if evt.RealStart != nil {
|
||||
exec.ExecDate = *evt.RealStart
|
||||
}
|
||||
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
|
||||
}
|
||||
|
||||
// HandleWorkflowDone reacts to a WORKFLOW_DONE_EVENT: it records the final
// state and real end date on the execution, applies each step metric to its
// booking, and asynchronously refreshes the local planner snapshot.
func HandleWorkflowDone(resp tools.NATSResponse) {
	var evt tools.WorkflowLifecycleEvent
	if err := json.Unmarshal(resp.Payload, &evt); err != nil {
		return
	}
	adminReq := &tools.APIRequest{Admin: true}
	res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(evt.ExecutionID)
	if err != nil || res == nil {
		return
	}
	exec := res.(*workflow_execution.WorkflowExecution)
	// Event state is an int on the wire; convert to the shared status enum.
	exec.State = enum.BookingStatus(evt.State)
	if evt.RealEnd != nil {
		exec.EndDate = evt.RealEnd
	}
	utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
	// Propagate per-step results (state, real start/end) to each booking.
	for _, step := range evt.Steps {
		applyStepToBooking(step, adminReq)
	}
	self, err := oclib.GetMySelf()
	if err == nil && self != nil {
		// The workflow finished — resources freed up, so refresh our planner.
		go planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
	}
}
|
||||
|
||||
// HandleWorkflowStepDone reacts to a WORKFLOW_STEP_DONE_EVENT: it updates the
// referenced booking's state and real start/end dates, and refreshes the
// local planner when the booking reached a terminal state.
func HandleWorkflowStepDone(resp tools.NATSResponse) {
	var evt tools.WorkflowLifecycleEvent
	// Events without a booking ID cannot be applied — drop them.
	if err := json.Unmarshal(resp.Payload, &evt); err != nil || evt.BookingID == "" {
		return
	}
	adminReq := &tools.APIRequest{Admin: true}
	res, _, err := booking.NewAccessor(adminReq).LoadOne(evt.BookingID)
	if err != nil || res == nil {
		return
	}
	bk := res.(*booking.Booking)
	bk.State = enum.BookingStatus(evt.State)
	if evt.RealStart != nil {
		bk.RealStartDate = evt.RealStart
	}
	if evt.RealEnd != nil {
		bk.RealEndDate = evt.RealEnd
	}
	utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
	// Terminal booking state → the resource slot is free again; refresh the
	// local planner so new scheduling sees the freed capacity.
	switch bk.State {
	case enum.SUCCESS, enum.FAILURE, enum.FORGOTTEN, enum.CANCELLED:
		self, err := oclib.GetMySelf()
		if err == nil && self != nil {
			go planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
		}
	}
}
|
||||
|
||||
func applyStepToBooking(step tools.StepMetric, adminReq *tools.APIRequest) {
|
||||
res, _, err := booking.NewAccessor(adminReq).LoadOne(step.BookingID)
|
||||
if err != nil || res == nil {
|
||||
return
|
||||
}
|
||||
bk := res.(*booking.Booking)
|
||||
switch bk.State {
|
||||
case enum.SUCCESS, enum.FAILURE, enum.FORGOTTEN, enum.CANCELLED:
|
||||
return
|
||||
}
|
||||
bk.State = enum.BookingStatus(step.State)
|
||||
if step.RealStart != nil {
|
||||
bk.RealStartDate = step.RealStart
|
||||
}
|
||||
if step.RealEnd != nil {
|
||||
bk.RealEndDate = step.RealEnd
|
||||
}
|
||||
utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Watchdog — stale execution safety net
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// processedExecutions de-duplicates watchdog failure emissions: once an
// execution ID is stored here, emitExecutionFailure becomes a no-op for it.
var processedExecutions sync.Map

// terminalExecStates lists the execution states the watchdog considers final;
// stale executions already in one of these states are never re-failed.
var terminalExecStates = map[enum.BookingStatus]bool{
	enum.SUCCESS: true, enum.FAILURE: true, enum.FORGOTTEN: true, enum.CANCELLED: true,
}
|
||||
|
||||
func WatchExecutions() {
|
||||
logger := oclib.GetLogger()
|
||||
logger.Info().Msg("ExecutionWatchdog: started")
|
||||
ticker := time.NewTicker(time.Minute)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
if err := scanStaleExecutions(); err != nil {
|
||||
logger.Error().Msg("ExecutionWatchdog: " + err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// scanStaleExecutions finds executions whose execution date passed more than
// a minute ago and hands each one to emitExecutionFailure in its own
// goroutine. Returns an error when the local peer or the search fails.
func scanStaleExecutions() error {
	myself, err := oclib.GetMySelf()
	if err != nil {
		return fmt.Errorf("could not resolve local peer: %w", err)
	}
	// One-minute grace period before an execution counts as stale.
	deadline := time.Now().UTC().Add(-time.Minute)
	res := oclib.NewRequest(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), "", myself.GetID(), []string{}, nil).
		Search(&dbs.Filters{And: map[string][]dbs.Filter{
			"execution_date": {{Operator: dbs.LTE.String(), Value: primitive.NewDateTimeFromTime(deadline)}},
		}}, "", false)
	if res.Err != "" {
		return fmt.Errorf("stale execution search failed: %s", res.Err)
	}
	for _, dbo := range res.Data {
		if exec, ok := dbo.(*workflow_execution.WorkflowExecution); ok {
			// emitExecutionFailure itself skips already-processed and
			// terminal executions, so re-scanning the same rows is safe.
			go emitExecutionFailure(exec)
		}
	}
	return nil
}
|
||||
|
||||
// emitExecutionFailure marks a stale execution as failed over NATS: it emits
// one WORKFLOW_STEP_DONE_EVENT (state FAILURE) per booking referenced by the
// execution, then a single WORKFLOW_DONE_EVENT carrying all failed steps.
// Idempotent via processedExecutions; executions already in a terminal state
// are recorded as processed without emitting anything.
func emitExecutionFailure(exec *workflow_execution.WorkflowExecution) {
	logger := oclib.GetLogger()
	// Already handled on a previous watchdog tick — do nothing.
	if _, done := processedExecutions.Load(exec.GetID()); done {
		return
	}
	// Terminal executions need no failure events, only dedup bookkeeping.
	if terminalExecStates[exec.State] {
		processedExecutions.Store(exec.GetID(), struct{}{})
		return
	}
	now := time.Now().UTC()
	steps := make([]tools.StepMetric, 0)
	// Walk every booking referenced by the execution:
	// PeerBookByGraph is graph element → resource → booking IDs.
	for _, byGraph := range exec.PeerBookByGraph {
		for _, bookingIDs := range byGraph {
			for _, bookingID := range bookingIDs {
				payload, err := json.Marshal(tools.WorkflowLifecycleEvent{
					ExecutionID:  exec.GetID(),
					ExecutionsID: exec.ExecutionsID,
					BookingID:    bookingID,
					State:        enum.FAILURE.EnumIndex(),
					RealEnd:      &now,
				})
				if err != nil {
					// Skip this booking; the others are still emitted.
					continue
				}
				tools.NewNATSCaller().SetNATSPub(tools.WORKFLOW_STEP_DONE_EVENT, tools.NATSResponse{
					FromApp: "oc-scheduler-watchdog",
					Method:  int(tools.WORKFLOW_STEP_DONE_EVENT),
					Payload: payload,
				})
				// Collect the step so the final DONE event lists it too.
				steps = append(steps, tools.StepMetric{
					BookingID: bookingID,
					State:     enum.FAILURE.EnumIndex(),
					RealEnd:   &now,
				})
			}
		}
	}
	// Summary event: the whole execution failed, with all failed steps.
	donePayload, err := json.Marshal(tools.WorkflowLifecycleEvent{
		ExecutionID:  exec.GetID(),
		ExecutionsID: exec.ExecutionsID,
		State:        enum.FAILURE.EnumIndex(),
		RealEnd:      &now,
		Steps:        steps,
	})
	if err == nil {
		tools.NewNATSCaller().SetNATSPub(tools.WORKFLOW_DONE_EVENT, tools.NATSResponse{
			FromApp: "oc-scheduler-watchdog",
			Method:  int(tools.WORKFLOW_DONE_EVENT),
			Payload: donePayload,
		})
	}
	logger.Info().Msgf("ExecutionWatchdog: execution %s stale → emitting FAILURE (%d bookings)",
		exec.GetID(), len(steps))
	// Mark processed last so a crash mid-emission retries on the next tick.
	processedExecutions.Store(exec.GetID(), struct{}{})
}
|
||||
@@ -1,75 +0,0 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// NATS emission
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func EmitNATS(peerID string, message tools.PropalgationMessage) {
|
||||
// PB_CLOSE_PLANNER: notify local watchers so streams re-evaluate.
|
||||
// Cache mutations (eviction or ownership reset) are the caller's
|
||||
// responsibility — see evictAfter and ReleaseRefreshOwnership.
|
||||
if message.Action == tools.PB_CLOSE_PLANNER {
|
||||
notifyPlannerWatchers(peerID)
|
||||
}
|
||||
b, _ := json.Marshal(message)
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-scheduler",
|
||||
Datatype: -1,
|
||||
Method: int(tools.PROPALGATION_EVENT),
|
||||
Payload: b,
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// NATS listeners
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func ListenNATS() {
|
||||
tools.NewNATSCaller().ListenNats(map[tools.NATSMethod]func(tools.NATSResponse){
|
||||
tools.PLANNER_EXECUTION: handlePlannerExecution,
|
||||
tools.CONSIDERS_EVENT: handleConsidersEvent,
|
||||
tools.REMOVE_RESOURCE: handleRemoveResource,
|
||||
tools.CREATE_RESOURCE: handleCreateResource,
|
||||
tools.CONFIRM_EVENT: handleConfirm,
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Draft timeout
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// draftTimeout deletes a booking or purchase resource if it is still a draft
|
||||
// after the 10-minute confirmation window has elapsed.
|
||||
func draftTimeout(id string, dt tools.DataType) {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
var res utils.DBObject
|
||||
var loadErr error
|
||||
switch dt {
|
||||
case tools.BOOKING:
|
||||
res, _, loadErr = booking.NewAccessor(adminReq).LoadOne(id)
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
res, _, loadErr = purchase_resource.NewAccessor(adminReq).LoadOne(id)
|
||||
default:
|
||||
return
|
||||
}
|
||||
if loadErr != nil || res == nil || !res.IsDrafted() {
|
||||
return
|
||||
}
|
||||
switch dt {
|
||||
case tools.BOOKING:
|
||||
booking.NewAccessor(adminReq).DeleteOne(id)
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
purchase_resource.NewAccessor(adminReq).DeleteOne(id)
|
||||
}
|
||||
fmt.Printf("draftTimeout: %s %s deleted (still draft after 10 min)\n", dt.String(), id)
|
||||
}
|
||||
23
infrastructure/nats/nats.go
Normal file
23
infrastructure/nats/nats.go
Normal file
@@ -0,0 +1,23 @@
|
||||
package nats
|
||||
|
||||
import (
|
||||
"oc-scheduler/infrastructure/execution"
|
||||
"oc-scheduler/infrastructure/planner"
|
||||
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
// ListenNATS registers all NATS event handlers and starts listening.
// Each handler is a thin router that delegates to the appropriate service.
func ListenNATS() {
	tools.NewNATSCaller().ListenNats(map[tools.NATSMethod]func(tools.NATSResponse){
		// Planner snapshots pushed by remote peers.
		tools.PLANNER_EXECUTION: planner.GetPlannerService().HandleStore,
		// Resource/execution confirmation flow.
		tools.CONSIDERS_EVENT: handleConsidersEvent,
		tools.CONFIRM_EVENT:   handleConfirm,
		// Resource lifecycle (bookings, purchases, workflows).
		tools.REMOVE_RESOURCE: handleRemoveResource,
		tools.CREATE_RESOURCE: handleCreateResource,
		// Workflow execution lifecycle reported by the runtime.
		tools.WORKFLOW_STARTED_EVENT:   execution.HandleWorkflowStarted,
		tools.WORKFLOW_STEP_DONE_EVENT: execution.HandleWorkflowStepDone,
		tools.WORKFLOW_DONE_EVENT:      execution.HandleWorkflowDone,
	})
}
|
||||
87
infrastructure/nats/nats_handlers.go
Normal file
87
infrastructure/nats/nats_handlers.go
Normal file
@@ -0,0 +1,87 @@
|
||||
package nats
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"oc-scheduler/infrastructure/execution"
|
||||
"oc-scheduler/infrastructure/planner"
|
||||
"oc-scheduler/infrastructure/scheduling_resources"
|
||||
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
// handleConfirm processes a CONFIRM_EVENT: sets IsDraft=false on the resource.
|
||||
func handleConfirm(resp tools.NATSResponse) {
|
||||
scheduling_resources.Confirm(string(resp.Payload), resp.Datatype)
|
||||
}
|
||||
|
||||
// handleConsidersEvent routes CONSIDERS_EVENT to the execution service.
|
||||
func handleConsidersEvent(resp tools.NATSResponse) {
|
||||
switch resp.Datatype {
|
||||
case tools.BOOKING, tools.PURCHASE_RESOURCE:
|
||||
execution.UpdateExecutionState(resp.Payload, resp.Datatype)
|
||||
case tools.WORKFLOW_EXECUTION:
|
||||
execution.ConfirmExecutionDrafts(resp.Payload)
|
||||
}
|
||||
}
|
||||
|
||||
// handleRemoveResource routes REMOVE_RESOURCE to the appropriate service.
|
||||
func handleRemoveResource(resp tools.NATSResponse) {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
switch resp.Datatype {
|
||||
case tools.WORKFLOW:
|
||||
var wf workflow.Workflow
|
||||
if err := json.Unmarshal(resp.Payload, &wf); err != nil {
|
||||
return
|
||||
}
|
||||
planner.GetPlannerService().NotifyWorkflow(wf.GetID())
|
||||
case tools.BOOKING:
|
||||
var p scheduling_resources.RemoveResourcePayload
|
||||
if err := json.Unmarshal(resp.Payload, &p); err != nil {
|
||||
return
|
||||
}
|
||||
scheduling_resources.GetService().HandleRemoveBooking(p, adminReq)
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
var p scheduling_resources.RemoveResourcePayload
|
||||
if err := json.Unmarshal(resp.Payload, &p); err != nil {
|
||||
return
|
||||
}
|
||||
scheduling_resources.GetService().HandleRemovePurchase(p, adminReq)
|
||||
}
|
||||
}
|
||||
|
||||
// handleCreateResource routes a CREATE_RESOURCE event to the appropriate
// service. Workflow creations are broadcast to the planner; booking and
// purchase creations are upserted and, when the service signals it, feed a
// considers update back into the owning execution's state.
func handleCreateResource(resp tools.NATSResponse) {
	adminReq := &tools.APIRequest{Admin: true}
	switch resp.Datatype {
	case tools.WORKFLOW:
		var wf workflow.Workflow
		if err := json.Unmarshal(resp.Payload, &wf); err != nil {
			return
		}
		planner.GetPlannerService().Broadcast(&wf)
		planner.GetPlannerService().NotifyWorkflow(wf.GetID())
	case tools.BOOKING:
		var bk booking.Booking
		if err := json.Unmarshal(resp.Payload, &bk); err != nil {
			return
		}
		// needsConsiders is true when the booking is confirmed and the
		// execution's state must be re-evaluated.
		needsConsiders := scheduling_resources.GetService().HandleCreateBooking(&bk, adminReq)
		if needsConsiders {
			payload, _ := json.Marshal(execution.ConsidersPayload{ID: bk.GetID()})
			execution.UpdateExecutionState(payload, tools.BOOKING)
		}
	case tools.PURCHASE_RESOURCE:
		var pr purchase_resource.PurchaseResource
		if err := json.Unmarshal(resp.Payload, &pr); err != nil {
			return
		}
		// Same considers hand-off as for bookings, keyed on the purchase ID.
		needsConsiders := scheduling_resources.GetService().HandleCreatePurchase(&pr, adminReq)
		if needsConsiders {
			payload, _ := json.Marshal(execution.ConsidersPayload{ID: pr.GetID()})
			execution.UpdateExecutionState(payload, tools.PURCHASE_RESOURCE)
		}
	}
}
|
||||
@@ -1,248 +0,0 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking/planner"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
"cloud.o-forge.io/core/oc-lib/models/peer"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
func handleConfirm(resp tools.NATSResponse) {
|
||||
confirmResource(string(resp.Payload), resp.Datatype)
|
||||
}
|
||||
|
||||
func handlePlannerExecution(resp tools.NATSResponse) {
|
||||
m := map[string]interface{}{}
|
||||
p := planner.Planner{}
|
||||
if err := json.Unmarshal(resp.Payload, &m); err != nil {
|
||||
return
|
||||
}
|
||||
if err := json.Unmarshal(resp.Payload, &p); err != nil {
|
||||
return
|
||||
}
|
||||
storePlanner(fmt.Sprintf("%v", m["peer_id"]), &p)
|
||||
}
|
||||
|
||||
func handleConsidersEvent(resp tools.NATSResponse) {
|
||||
fmt.Println("CONSIDERS_EVENT", resp.Datatype)
|
||||
switch resp.Datatype {
|
||||
case tools.BOOKING, tools.PURCHASE_RESOURCE:
|
||||
fmt.Println("updateExecutionState", resp.Datatype)
|
||||
updateExecutionState(resp.Payload, resp.Datatype)
|
||||
case tools.WORKFLOW_EXECUTION:
|
||||
confirmExecutionDrafts(resp.Payload)
|
||||
}
|
||||
}
|
||||
|
||||
func handleRemoveResource(resp tools.NATSResponse) {
|
||||
switch resp.Datatype {
|
||||
case tools.WORKFLOW:
|
||||
wf := workflow.Workflow{}
|
||||
if err := json.Unmarshal(resp.Payload, &wf); err != nil {
|
||||
return
|
||||
}
|
||||
notifyWorkflowWatchers(wf.GetID())
|
||||
case tools.BOOKING:
|
||||
var p removeResourcePayload
|
||||
if err := json.Unmarshal(resp.Payload, &p); err != nil {
|
||||
return
|
||||
}
|
||||
self, err := oclib.GetMySelf()
|
||||
if err != nil || self == nil {
|
||||
return
|
||||
}
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
res, _, loadErr := booking.NewAccessor(adminReq).LoadOne(p.ID)
|
||||
if loadErr != nil || res == nil {
|
||||
return
|
||||
}
|
||||
existing := res.(*booking.Booking)
|
||||
if existing.SchedulerPeerID != p.SchedulerPeerID || existing.ExecutionsID != p.ExecutionsID {
|
||||
fmt.Println("ListenNATS REMOVE_RESOURCE booking: auth mismatch, ignoring", p.ID)
|
||||
return
|
||||
}
|
||||
booking.NewAccessor(adminReq).DeleteOne(p.ID)
|
||||
go refreshSelfPlanner(self.PeerID, adminReq)
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
var p removeResourcePayload
|
||||
if err := json.Unmarshal(resp.Payload, &p); err != nil {
|
||||
return
|
||||
}
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
res, _, loadErr := purchase_resource.NewAccessor(adminReq).LoadOne(p.ID)
|
||||
if loadErr != nil || res == nil {
|
||||
return
|
||||
}
|
||||
existing := res.(*purchase_resource.PurchaseResource)
|
||||
if existing.SchedulerPeerID != p.SchedulerPeerID || existing.ExecutionsID != p.ExecutionsID {
|
||||
fmt.Println("ListenNATS REMOVE_RESOURCE purchase: auth mismatch, ignoring", p.ID)
|
||||
return
|
||||
}
|
||||
purchase_resource.NewAccessor(adminReq).DeleteOne(p.ID)
|
||||
}
|
||||
}
|
||||
|
||||
func handleCreateBooking(bk *booking.Booking, self *peer.Peer, adminReq *tools.APIRequest) {
|
||||
// Upsert: if a booking with this ID already exists, verify auth and update.
|
||||
if existing, _, loadErr := booking.NewAccessor(adminReq).LoadOne(bk.GetID()); loadErr == nil && existing != nil {
|
||||
prev := existing.(*booking.Booking)
|
||||
if prev.SchedulerPeerID != bk.SchedulerPeerID || prev.ExecutionsID != bk.ExecutionsID {
|
||||
fmt.Println("ListenNATS CREATE_RESOURCE booking upsert: auth mismatch, ignoring", bk.GetID())
|
||||
return
|
||||
}
|
||||
if !prev.IsDrafted() && bk.IsDraft {
|
||||
// Already confirmed, refuse downgrade.
|
||||
return
|
||||
}
|
||||
// Expired check only on confirmation (IsDraft→false).
|
||||
if !bk.IsDraft && !prev.ExpectedStartDate.IsZero() && prev.ExpectedStartDate.Before(time.Now().UTC()) {
|
||||
fmt.Println("ListenNATS CREATE_RESOURCE booking: expired, deleting", bk.GetID())
|
||||
booking.NewAccessor(adminReq).DeleteOne(bk.GetID())
|
||||
return
|
||||
}
|
||||
if _, _, err := utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq)); err != nil {
|
||||
fmt.Println("ListenNATS CREATE_RESOURCE booking update failed:", err)
|
||||
return
|
||||
}
|
||||
go refreshSelfPlanner(self.PeerID, adminReq)
|
||||
if !bk.IsDraft {
|
||||
go applyConsidersLocal(bk.GetID(), tools.BOOKING)
|
||||
}
|
||||
return
|
||||
}
|
||||
// New booking: standard create flow.
|
||||
if !bk.ExpectedStartDate.IsZero() && bk.ExpectedStartDate.Before(time.Now().UTC()) {
|
||||
fmt.Println("ListenNATS: booking start date is in the past, discarding")
|
||||
return
|
||||
}
|
||||
plannerMu.RLock()
|
||||
selfEntry := PlannerCache[self.PeerID]
|
||||
plannerMu.RUnlock()
|
||||
if selfEntry != nil && selfEntry.Planner != nil && !checkInstance(selfEntry.Planner, bk.ResourceID, bk.InstanceID, bk.ExpectedStartDate, bk.ExpectedEndDate) {
|
||||
fmt.Println("ListenNATS: booking conflicts with local planner, discarding")
|
||||
return
|
||||
}
|
||||
bk.IsDraft = true
|
||||
stored, _, err := booking.NewAccessor(adminReq).StoreOne(bk)
|
||||
if err != nil {
|
||||
fmt.Println("ListenNATS: could not store booking:", err)
|
||||
return
|
||||
}
|
||||
storedID := stored.GetID()
|
||||
go refreshSelfPlanner(self.PeerID, adminReq)
|
||||
time.AfterFunc(10*time.Minute, func() { draftTimeout(storedID, tools.BOOKING) })
|
||||
}
|
||||
|
||||
func handleCreatePurchase(pr *purchase_resource.PurchaseResource, self *peer.Peer, adminReq *tools.APIRequest) {
|
||||
if pr.DestPeerID != self.GetID() {
|
||||
return
|
||||
}
|
||||
// Upsert: if a purchase with this ID already exists, verify auth and update.
|
||||
if existing, _, loadErr := purchase_resource.NewAccessor(adminReq).LoadOne(pr.GetID()); loadErr == nil && existing != nil {
|
||||
prev := existing.(*purchase_resource.PurchaseResource)
|
||||
if prev.SchedulerPeerID != pr.SchedulerPeerID || prev.ExecutionsID != pr.ExecutionsID {
|
||||
fmt.Println("ListenNATS CREATE_RESOURCE purchase upsert: auth mismatch, ignoring", pr.GetID())
|
||||
return
|
||||
}
|
||||
if !prev.IsDrafted() && pr.IsDraft {
|
||||
return
|
||||
}
|
||||
if _, _, err := utils.GenericRawUpdateOne(pr, pr.GetID(), purchase_resource.NewAccessor(adminReq)); err != nil {
|
||||
fmt.Println("ListenNATS CREATE_RESOURCE purchase update failed:", err)
|
||||
return
|
||||
}
|
||||
if !pr.IsDraft {
|
||||
go applyConsidersLocal(pr.GetID(), tools.PURCHASE_RESOURCE)
|
||||
}
|
||||
return
|
||||
}
|
||||
// New purchase: standard create flow.
|
||||
pr.IsDraft = true
|
||||
stored, _, err := purchase_resource.NewAccessor(adminReq).StoreOne(pr)
|
||||
if err != nil {
|
||||
fmt.Println("ListenNATS: could not store purchase:", err)
|
||||
return
|
||||
}
|
||||
storedID := stored.GetID()
|
||||
time.AfterFunc(10*time.Minute, func() { draftTimeout(storedID, tools.PURCHASE_RESOURCE) })
|
||||
}
|
||||
|
||||
func handleCreateResource(resp tools.NATSResponse) {
|
||||
switch resp.Datatype {
|
||||
case tools.WORKFLOW:
|
||||
wf := workflow.Workflow{}
|
||||
if err := json.Unmarshal(resp.Payload, &wf); err != nil {
|
||||
return
|
||||
}
|
||||
broadcastPlanner(&wf)
|
||||
notifyWorkflowWatchers(wf.GetID())
|
||||
case tools.BOOKING:
|
||||
var bk booking.Booking
|
||||
if err := json.Unmarshal(resp.Payload, &bk); err != nil {
|
||||
return
|
||||
}
|
||||
self, err := oclib.GetMySelf()
|
||||
/*if err != nil || self == nil || bk.DestPeerID != self.GetID() {
|
||||
return
|
||||
}*/
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
_ = err
|
||||
handleCreateBooking(&bk, self, adminReq)
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
var pr purchase_resource.PurchaseResource
|
||||
if err := json.Unmarshal(resp.Payload, &pr); err != nil {
|
||||
return
|
||||
}
|
||||
self, err := oclib.GetMySelf()
|
||||
if err != nil || self == nil {
|
||||
return
|
||||
}
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
handleCreatePurchase(&pr, self, adminReq)
|
||||
}
|
||||
}
|
||||
|
||||
// confirmResource sets IsDraft=false for a booking or purchase resource.
|
||||
// For bookings it also advances State to SCHEDULED and refreshes the local planner.
|
||||
func confirmResource(id string, dt tools.DataType) {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
switch dt {
|
||||
case tools.BOOKING:
|
||||
res, _, err := booking.NewAccessor(adminReq).LoadOne(id)
|
||||
if err != nil || res == nil {
|
||||
fmt.Printf("confirmResource: could not load booking %s: %v\n", id, err)
|
||||
return
|
||||
}
|
||||
bk := res.(*booking.Booking)
|
||||
bk.IsDraft = false
|
||||
bk.State = enum.SCHEDULED
|
||||
if _, _, err := utils.GenericRawUpdateOne(bk, id, booking.NewAccessor(adminReq)); err != nil {
|
||||
fmt.Printf("confirmResource: could not confirm booking %s: %v\n", id, err)
|
||||
return
|
||||
}
|
||||
self, err := oclib.GetMySelf()
|
||||
if err == nil && self != nil {
|
||||
go refreshSelfPlanner(self.PeerID, adminReq)
|
||||
}
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
res, _, err := purchase_resource.NewAccessor(adminReq).LoadOne(id)
|
||||
if err != nil || res == nil {
|
||||
fmt.Printf("confirmResource: could not load purchase %s: %v\n", id, err)
|
||||
return
|
||||
}
|
||||
pr := res.(*purchase_resource.PurchaseResource)
|
||||
pr.IsDraft = false
|
||||
if _, _, err := utils.GenericRawUpdateOne(pr, id, purchase_resource.NewAccessor(adminReq)); err != nil {
|
||||
fmt.Printf("confirmResource: could not confirm purchase %s: %v\n", id, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,353 +0,0 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"slices"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking/planner"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow/graph"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
const plannerTTL = 24 * time.Hour
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Planner cache — protected by plannerMu
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// plannerEntry wraps a planner snapshot with refresh-ownership tracking.
|
||||
// At most one check session may be the "refresh owner" of a given peer's
|
||||
// planner at a time: it emits PB_PLANNER to request a fresh snapshot from
|
||||
// oc-discovery and, on close (clean or forced), emits PB_CLOSE_PLANNER to
|
||||
// release the stream. Any subsequent session that needs the same peer's
|
||||
// planner will see Refreshing=true and skip the duplicate request.
|
||||
type plannerEntry struct {
|
||||
Planner *planner.Planner
|
||||
Refreshing bool // true while a PB_PLANNER request is in flight
|
||||
RefreshOwner string // session UUID that initiated the current refresh
|
||||
}
|
||||
|
||||
var plannerMu sync.RWMutex
|
||||
var PlannerCache = map[string]*plannerEntry{}
|
||||
var plannerAddedAt = map[string]time.Time{} // peerID → first-seen timestamp
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Subscriber registries — one keyed by peerID, one by workflowID
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Subscriber registries — one keyed by peerID, one by workflowID; both
// guarded by subsMu.
var (
	subsMu       sync.RWMutex
	plannerSubs  = map[string][]chan string{}   // peerID → channels (deliver peerID)
	workflowSubs = map[string][]chan struct{}{} // workflowID → notification channels
)
|
||||
|
||||
// subscribePlanners registers interest in planner changes for the given peer IDs.
|
||||
// The returned channel receives the peerID string (non-blocking) each time any
|
||||
// of those planners is updated. Call cancel to unregister.
|
||||
func subscribePlanners(peerIDs []string) (<-chan string, func()) {
|
||||
ch := make(chan string, 1)
|
||||
subsMu.Lock()
|
||||
for _, k := range peerIDs {
|
||||
plannerSubs[k] = append(plannerSubs[k], ch)
|
||||
}
|
||||
subsMu.Unlock()
|
||||
cancel := func() {
|
||||
subsMu.Lock()
|
||||
for _, k := range peerIDs {
|
||||
subs := plannerSubs[k]
|
||||
for i, s := range subs {
|
||||
if s == ch {
|
||||
plannerSubs[k] = append(subs[:i], subs[i+1:]...)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
subsMu.Unlock()
|
||||
}
|
||||
return ch, cancel
|
||||
}
|
||||
|
||||
// SubscribePlannerUpdates registers interest in planner changes for the given
|
||||
// peer IDs. The returned channel receives the peerID string (non-blocking) each
|
||||
// time any of those planners is updated. Call cancel to unregister.
|
||||
func SubscribePlannerUpdates(peerIDs []string) (<-chan string, func()) {
|
||||
return subscribePlanners(peerIDs)
|
||||
}
|
||||
|
||||
// SubscribeWorkflowUpdates registers interest in workflow modifications for the
|
||||
// given workflow ID. The returned channel is signalled when the workflow changes
|
||||
// (peer list may have grown or shrunk). Call cancel to unregister.
|
||||
func SubscribeWorkflowUpdates(wfID string) (<-chan struct{}, func()) {
|
||||
ch, cancel := subscribe(&subsMu, workflowSubs, []string{wfID})
|
||||
return ch, cancel
|
||||
}
|
||||
|
||||
// subscribe is the generic helper used by the workflow registry.
|
||||
func subscribe(mu *sync.RWMutex, registry map[string][]chan struct{}, keys []string) (<-chan struct{}, func()) {
|
||||
ch := make(chan struct{}, 1)
|
||||
mu.Lock()
|
||||
for _, k := range keys {
|
||||
registry[k] = append(registry[k], ch)
|
||||
}
|
||||
mu.Unlock()
|
||||
cancel := func() {
|
||||
mu.Lock()
|
||||
for _, k := range keys {
|
||||
subs := registry[k]
|
||||
for i, s := range subs {
|
||||
if s == ch {
|
||||
registry[k] = append(subs[:i], subs[i+1:]...)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
return ch, cancel
|
||||
}
|
||||
|
||||
func notifyPlannerWatchers(peerID string) {
|
||||
subsMu.RLock()
|
||||
subs := plannerSubs[peerID]
|
||||
subsMu.RUnlock()
|
||||
for _, ch := range subs {
|
||||
select {
|
||||
case ch <- peerID:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func notifyWorkflowWatchers(wfID string) {
|
||||
notify(&subsMu, workflowSubs, wfID)
|
||||
}
|
||||
|
||||
func notify(mu *sync.RWMutex, registry map[string][]chan struct{}, key string) {
|
||||
mu.RLock()
|
||||
subs := registry[key]
|
||||
mu.RUnlock()
|
||||
for _, ch := range subs {
|
||||
select {
|
||||
case ch <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cache helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// storePlanner inserts or updates the planner snapshot for peerID.
|
||||
// On first insertion it schedules an automatic eviction after plannerTTL.
|
||||
// Existing refresh-ownership state (Refreshing / RefreshOwner) is preserved
|
||||
// so that an in-flight request is not inadvertently reset.
|
||||
// All subscribers interested in this peer are notified.
|
||||
func storePlanner(peerID string, p *planner.Planner) {
|
||||
plannerMu.Lock()
|
||||
entry := PlannerCache[peerID]
|
||||
isNew := entry == nil
|
||||
if isNew {
|
||||
entry = &plannerEntry{}
|
||||
PlannerCache[peerID] = entry
|
||||
plannerAddedAt[peerID] = time.Now().UTC()
|
||||
go evictAfter(peerID, plannerTTL)
|
||||
}
|
||||
entry.Planner = p
|
||||
plannerMu.Unlock()
|
||||
notifyPlannerWatchers(peerID)
|
||||
}
|
||||
|
||||
// evictAfter waits ttl from first insertion then deletes the cache entry and
|
||||
// emits PB_CLOSE_PLANNER so oc-discovery stops streaming for this peer.
|
||||
// This is the only path that actually removes an entry from PlannerCache;
|
||||
// session close (ReleaseRefreshOwnership) only resets ownership state.
|
||||
func evictAfter(peerID string, ttl time.Duration) {
|
||||
time.Sleep(ttl)
|
||||
plannerMu.Lock()
|
||||
_, exists := PlannerCache[peerID]
|
||||
if exists {
|
||||
delete(PlannerCache, peerID)
|
||||
delete(plannerAddedAt, peerID)
|
||||
}
|
||||
plannerMu.Unlock()
|
||||
if exists {
|
||||
EmitNATS(peerID, tools.PropalgationMessage{Action: tools.PB_CLOSE_PLANNER})
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Planner refresh / broadcast
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// RequestPlannerRefresh asks oc-discovery for a fresh planner snapshot for
|
||||
// each peer in peerIDs. Only the first session to request a given peer becomes
|
||||
// its "refresh owner": subsequent sessions see Refreshing=true and skip the
|
||||
// duplicate PB_PLANNER emission. Returns the subset of peerIDs for which this
|
||||
// session claimed ownership (needed to release on close).
|
||||
func RequestPlannerRefresh(peerIDs []string, executionsID string) []string {
|
||||
var owned []string
|
||||
for _, peerID := range peerIDs {
|
||||
plannerMu.Lock()
|
||||
entry := PlannerCache[peerID]
|
||||
if entry == nil {
|
||||
entry = &plannerEntry{}
|
||||
PlannerCache[peerID] = entry
|
||||
plannerAddedAt[peerID] = time.Now().UTC()
|
||||
go evictAfter(peerID, plannerTTL)
|
||||
}
|
||||
shouldRequest := !entry.Refreshing
|
||||
if shouldRequest {
|
||||
entry.Refreshing = true
|
||||
entry.RefreshOwner = executionsID
|
||||
}
|
||||
plannerMu.Unlock()
|
||||
if shouldRequest {
|
||||
owned = append(owned, peerID)
|
||||
if p, err := oclib.GetMySelf(); err == nil && p != nil && p.PeerID == peerID {
|
||||
// Self peer: generate and cache the planner directly without
|
||||
// going through NATS / oc-discovery.
|
||||
go refreshSelfPlanner(peerID, &tools.APIRequest{Admin: true})
|
||||
} else {
|
||||
payload, _ := json.Marshal(map[string]any{"peer_id": peerID})
|
||||
fmt.Println("PB_PLANNER", peerID)
|
||||
EmitNATS(peerID, tools.PropalgationMessage{
|
||||
Action: tools.PB_PLANNER,
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
return owned
|
||||
}
|
||||
|
||||
// ReleaseRefreshOwnership is called when a check session closes (clean or
|
||||
// forced). For each peer this session owns, it resets the refresh state and
|
||||
// emits PB_CLOSE_PLANNER so oc-discovery stops the planner stream.
|
||||
// The planner data itself stays in the cache until TTL eviction.
|
||||
func ReleaseRefreshOwnership(peerIDs []string, executionsID string) {
|
||||
for _, peerID := range peerIDs {
|
||||
plannerMu.Lock()
|
||||
if entry := PlannerCache[peerID]; entry != nil && entry.RefreshOwner == executionsID {
|
||||
entry.Refreshing = false
|
||||
entry.RefreshOwner = ""
|
||||
}
|
||||
plannerMu.Unlock()
|
||||
payload, _ := json.Marshal(map[string]any{"peer_id": peerID})
|
||||
EmitNATS(peerID, tools.PropalgationMessage{
|
||||
Action: tools.PB_CLOSE_PLANNER,
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// broadcastPlanner iterates the storage and compute peers of the given workflow
|
||||
// and, for each peer not yet in the cache, emits a PB_PLANNER propagation so
|
||||
// downstream consumers (oc-discovery, other schedulers) refresh their state.
|
||||
func broadcastPlanner(wf *workflow.Workflow) {
|
||||
if wf.Graph == nil {
|
||||
return
|
||||
}
|
||||
items := []graph.GraphItem{}
|
||||
items = append(items, wf.GetGraphItems(wf.Graph.IsStorage)...)
|
||||
items = append(items, wf.GetGraphItems(wf.Graph.IsCompute)...)
|
||||
|
||||
seen := []string{}
|
||||
for _, item := range items {
|
||||
i := item
|
||||
_, res := i.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
creatorID := res.GetCreatorID()
|
||||
if slices.Contains(seen, creatorID) {
|
||||
continue
|
||||
}
|
||||
|
||||
data := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).LoadOne(creatorID)
|
||||
p := data.ToPeer()
|
||||
if p == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
plannerMu.RLock()
|
||||
cached := PlannerCache[p.PeerID]
|
||||
plannerMu.RUnlock()
|
||||
|
||||
// Only request if no snapshot and no refresh already in flight.
|
||||
if cached == nil || (cached.Planner == nil && !cached.Refreshing) {
|
||||
payload, err := json.Marshal(map[string]interface{}{"peer_id": p.PeerID})
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
seen = append(seen, creatorID)
|
||||
EmitNATS(p.PeerID, tools.PropalgationMessage{
|
||||
Action: tools.PB_PLANNER,
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Self-planner initialisation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// InitSelfPlanner bootstraps our own planner entry at startup.
|
||||
// It waits (with 15-second retries) for our peer record to be present in the
|
||||
// database before generating the first planner snapshot and broadcasting it
|
||||
// on PB_PLANNER. This handles the race between oc-scheduler starting before
|
||||
// oc-peer has fully registered our node.
|
||||
func InitSelfPlanner() {
|
||||
for {
|
||||
self, err := oclib.GetMySelf()
|
||||
if err != nil || self == nil {
|
||||
fmt.Println("InitSelfPlanner: self peer not found yet, retrying in 15s...")
|
||||
time.Sleep(15 * time.Second)
|
||||
continue
|
||||
}
|
||||
refreshSelfPlanner(self.PeerID, &tools.APIRequest{Admin: true})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Self-planner refresh
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// refreshSelfPlanner regenerates the local planner from the current state of
|
||||
// the booking DB, stores it in PlannerCache under our own node UUID, and
|
||||
// broadcasts it on PROPALGATION_EVENT / PB_PLANNER so all listeners (including
|
||||
// oc-discovery) are kept in sync.
|
||||
//
|
||||
// It should be called whenever a booking for our own peer is created, whether
|
||||
// by direct DB insertion (self-peer routing) or upon receiving a CREATE_RESOURCE
|
||||
// BOOKING message from oc-discovery.
|
||||
func refreshSelfPlanner(peerID string, request *tools.APIRequest) {
|
||||
p, err := planner.GenerateShallow(request)
|
||||
if err != nil {
|
||||
fmt.Println("refreshSelfPlanner: could not generate planner:", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Update the local cache and notify any waiting CheckStream goroutines.
|
||||
storePlanner(peerID, p)
|
||||
|
||||
// Broadcast the updated planner so remote peers (and oc-discovery) can
|
||||
// refresh their view of our availability.
|
||||
type plannerWithPeer struct {
|
||||
PeerID string `json:"peer_id"`
|
||||
*planner.Planner
|
||||
}
|
||||
plannerPayload, err := json.Marshal(plannerWithPeer{PeerID: peerID, Planner: p})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
EmitNATS(peerID, tools.PropalgationMessage{
|
||||
Action: tools.PB_PLANNER,
|
||||
Payload: plannerPayload,
|
||||
})
|
||||
}
|
||||
453
infrastructure/planner/planner.go
Normal file
453
infrastructure/planner/planner.go
Normal file
@@ -0,0 +1,453 @@
|
||||
package planner
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"oc-scheduler/infrastructure/utils"
|
||||
"slices"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking/planner"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow/graph"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
// Scan window / granularity used by findNextSlot, plus the cache lifetime.
const (
	checkWindowHours = 5              // hours scanned forward for a free slot
	checkStepMin     = 15             // time increment per scan step (minutes)
	plannerTTL       = 24 * time.Hour // cached snapshot lifetime before eviction
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Planner cache — protected by plannerMu
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// plannerEntry wraps a planner snapshot with refresh-ownership tracking.
|
||||
// At most one check session may be the "refresh owner" of a given peer's
|
||||
// planner at a time: it emits PB_PLANNER to request a fresh snapshot from
|
||||
// oc-discovery and, on close (clean or forced), emits PB_CLOSE_PLANNER to
|
||||
// release the stream. Any subsequent session that needs the same peer's
|
||||
// planner will see Refreshing=true and skip the duplicate request.
|
||||
type plannerEntry struct {
|
||||
Planner *planner.Planner
|
||||
Refreshing bool // true while a PB_PLANNER request is in flight
|
||||
RefreshOwner string // session UUID that initiated the current refresh
|
||||
}
|
||||
|
||||
type PlannerService struct {
|
||||
Mu sync.RWMutex
|
||||
Cache map[string]*plannerEntry
|
||||
SubMu sync.RWMutex
|
||||
Subs map[string][]chan string
|
||||
AddedAt map[string]time.Time
|
||||
WorkflowSubMu sync.RWMutex
|
||||
WorkflowSubs map[string][]chan struct{}
|
||||
}
|
||||
|
||||
// singleton is the process-wide PlannerService, created by InitPlanner.
var singleton *PlannerService
|
||||
|
||||
// InitSelfPlanner bootstraps our own planner entry at startup.
|
||||
// It waits (with 15-second retries) for our peer record to be present in the
|
||||
// database before generating the first planner snapshot and broadcasting it
|
||||
// on PB_PLANNER. This handles the race between oc-scheduler starting before
|
||||
// oc-peer has fully registered our node.
|
||||
func InitPlanner() {
|
||||
singleton = &PlannerService{
|
||||
AddedAt: map[string]time.Time{},
|
||||
Subs: map[string][]chan string{},
|
||||
Cache: map[string]*plannerEntry{},
|
||||
WorkflowSubs: map[string][]chan struct{}{},
|
||||
}
|
||||
for {
|
||||
self, err := oclib.GetMySelf()
|
||||
if err != nil || self == nil {
|
||||
fmt.Println("InitPlanner: self peer not found yet, retrying in 15s...")
|
||||
time.Sleep(15 * time.Second)
|
||||
continue
|
||||
}
|
||||
singleton.RefreshSelf(self.PeerID, &tools.APIRequest{Admin: true})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func GetPlannerService() *PlannerService {
|
||||
return singleton
|
||||
}
|
||||
|
||||
func (s *PlannerService) HandleStore(resp tools.NATSResponse) {
|
||||
m := map[string]interface{}{}
|
||||
p := planner.Planner{}
|
||||
if err := json.Unmarshal(resp.Payload, &m); err != nil {
|
||||
return
|
||||
}
|
||||
if err := json.Unmarshal(resp.Payload, &p); err != nil {
|
||||
return
|
||||
}
|
||||
s.Store(fmt.Sprintf("%v", m["peer_id"]), &p)
|
||||
}
|
||||
|
||||
// missingPlannerPeers returns the peer IDs from res whose planner is absent
|
||||
// or not yet populated in PlannerCache.
|
||||
// func missingPlannerPeers(res map[string]bookingResource) []string {
|
||||
func (s *PlannerService) MissingPeers(res map[string]utils.BookingResource) []string {
|
||||
var out []string
|
||||
for _, r := range res {
|
||||
s.Mu.RLock()
|
||||
entry := s.Cache[r.PeerPID]
|
||||
s.Mu.RUnlock()
|
||||
if entry == nil || entry.Planner == nil {
|
||||
out = append(out, r.PeerPID)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *PlannerService) FindDate(wfID string, checkables map[string]utils.BookingResource, start time.Time, end *time.Time, preemption bool, asap bool) (time.Time, *time.Time, bool, bool, []string) {
|
||||
var unavailable, warnings []string
|
||||
// 4. Preemption: Planify ran (end is resolved), skip availability check.
|
||||
if preemption {
|
||||
return start, end, true, true, warnings
|
||||
}
|
||||
// 5b. For any peer whose planner is not yet cached, request it and wait
|
||||
// briefly so the decision is based on real data rather than a blind
|
||||
// "assume available". The wait is capped to avoid blocking the caller
|
||||
// when oc-discovery is unreachable.
|
||||
s.Fill(checkables, wfID)
|
||||
|
||||
unavailable, warnings = s.checkResourceAvailability(checkables, start, end)
|
||||
|
||||
if len(unavailable) == 0 {
|
||||
//result.Available = true
|
||||
return start, end, true, false, warnings
|
||||
}
|
||||
|
||||
// 6. as_possible: find and commit to the next free slot.
|
||||
if asap {
|
||||
next := s.findNextSlot(checkables, start, end, checkWindowHours)
|
||||
if next != nil {
|
||||
start = *next
|
||||
if end != nil {
|
||||
shifted := next.Add(end.Sub(start))
|
||||
end = &shifted
|
||||
}
|
||||
return start, end, true, false, warnings
|
||||
} else {
|
||||
return start, end, false, false, warnings
|
||||
}
|
||||
}
|
||||
return start, end, false, false, warnings
|
||||
}
|
||||
|
||||
func (s *PlannerService) Fill(checkables map[string]utils.BookingResource, wfID string) {
|
||||
if missing := s.MissingPeers(checkables); len(missing) > 0 {
|
||||
const plannerFetchTimeout = 2 * time.Second
|
||||
tmpSession := "check-oneshot-" + wfID
|
||||
ch, cancelSub := SubscribeUpdates(s.Subs, &s.SubMu, missing...)
|
||||
owned := s.Refresh(missing, tmpSession)
|
||||
select {
|
||||
case <-ch:
|
||||
case <-time.After(plannerFetchTimeout):
|
||||
}
|
||||
cancelSub()
|
||||
s.ReleaseRefreshOwnership(owned, tmpSession)
|
||||
}
|
||||
}
|
||||
|
||||
// evictAfter waits ttl from first insertion then deletes the cache entry and
|
||||
// emits PB_CLOSE_PLANNER so oc-discovery stops streaming for this peer.
|
||||
// This is the only path that actually removes an entry from PlannerCache;
|
||||
// session close (ReleaseRefreshOwnership) only resets ownership state.
|
||||
func (s *PlannerService) EvictAfter(peerID string, ttl time.Duration) {
|
||||
time.Sleep(ttl)
|
||||
s.Mu.Lock()
|
||||
_, exists := s.Cache[peerID]
|
||||
if exists {
|
||||
delete(s.Cache, peerID)
|
||||
delete(s.AddedAt, peerID)
|
||||
}
|
||||
s.Mu.Unlock()
|
||||
if exists {
|
||||
utils.Notify(&s.SubMu, s.Subs, peerID, peerID)
|
||||
utils.Propalgate(peerID, tools.PropalgationMessage{Action: tools.PB_CLOSE_PLANNER})
|
||||
}
|
||||
}
|
||||
|
||||
// SubscribePlannerUpdates registers interest in planner changes for the given
|
||||
// peer IDs. The returned channel receives the peerID string (non-blocking) each
|
||||
// time any of those planners is updated. Call cancel to unregister.
|
||||
func SubscribeUpdates[T interface{}](subs map[string][]chan T, mu *sync.RWMutex, updates ...string) (<-chan T, func()) {
|
||||
ch := make(chan T, 1)
|
||||
mu.Lock()
|
||||
for _, k := range updates {
|
||||
subs[k] = append(subs[k], ch)
|
||||
}
|
||||
mu.Unlock()
|
||||
cancel := func() {
|
||||
mu.Lock()
|
||||
for _, k := range updates {
|
||||
subsk := subs[k]
|
||||
for i, s := range subsk {
|
||||
if s == ch {
|
||||
subs[k] = append(subsk[:i], subsk[i+1:]...)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
return ch, cancel
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Cache helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func (s *PlannerService) Store(peerID string, p *planner.Planner) {
|
||||
s.Mu.Lock()
|
||||
entry := s.Cache[peerID]
|
||||
isNew := entry == nil
|
||||
if isNew {
|
||||
entry = &plannerEntry{}
|
||||
s.Cache[peerID] = entry
|
||||
s.AddedAt[peerID] = time.Now().UTC()
|
||||
go s.EvictAfter(peerID, plannerTTL)
|
||||
}
|
||||
entry.Planner = p
|
||||
s.Mu.Unlock()
|
||||
utils.Notify[string](&s.SubMu, s.Subs, peerID, peerID)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Planner refresh / broadcast
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// RequestPlannerRefresh asks oc-discovery for a fresh planner snapshot for
|
||||
// each peer in peerIDs. Only the first session to request a given peer becomes
|
||||
// its "refresh owner": subsequent sessions see Refreshing=true and skip the
|
||||
// duplicate PB_PLANNER emission. Returns the subset of peerIDs for which this
|
||||
// session claimed ownership (needed to release on close).
|
||||
|
||||
// RequestPlannerRefresh
|
||||
func (s *PlannerService) Refresh(peerIDs []string, executionsID string) []string {
|
||||
var owned []string
|
||||
for _, peerID := range peerIDs {
|
||||
s.Mu.Lock()
|
||||
entry := s.Cache[peerID]
|
||||
if entry == nil {
|
||||
entry = &plannerEntry{}
|
||||
s.Cache[peerID] = entry
|
||||
s.AddedAt[peerID] = time.Now().UTC()
|
||||
go s.EvictAfter(peerID, plannerTTL)
|
||||
}
|
||||
shouldRequest := !entry.Refreshing
|
||||
if shouldRequest {
|
||||
entry.Refreshing = true
|
||||
entry.RefreshOwner = executionsID
|
||||
}
|
||||
s.Mu.Unlock()
|
||||
if shouldRequest {
|
||||
owned = append(owned, peerID)
|
||||
if p, err := oclib.GetMySelf(); err == nil && p != nil && p.PeerID == peerID {
|
||||
go s.RefreshSelf(peerID, &tools.APIRequest{Admin: true})
|
||||
} else {
|
||||
payload, _ := json.Marshal(map[string]any{"peer_id": peerID})
|
||||
utils.Propalgate(peerID, tools.PropalgationMessage{
|
||||
Action: tools.PB_PLANNER,
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
return owned
|
||||
}
|
||||
|
||||
// ReleaseRefreshOwnership is called when a check session closes (clean or
|
||||
// forced). For each peer this session owns, it resets the refresh state and
|
||||
// emits PB_CLOSE_PLANNER so oc-discovery stops the planner stream.
|
||||
// The planner data itself stays in the cache until TTL eviction.
|
||||
func (s *PlannerService) ReleaseRefreshOwnership(peerIDs []string, executionsID string) {
|
||||
for _, peerID := range peerIDs {
|
||||
s.Mu.Lock()
|
||||
if entry := s.Cache[peerID]; entry != nil && entry.RefreshOwner == executionsID {
|
||||
entry.Refreshing = false
|
||||
entry.RefreshOwner = ""
|
||||
}
|
||||
s.Mu.Unlock()
|
||||
utils.Notify(&s.SubMu, s.Subs, peerID, peerID)
|
||||
payload, _ := json.Marshal(map[string]any{"peer_id": peerID})
|
||||
utils.Propalgate(peerID, tools.PropalgationMessage{
|
||||
Action: tools.PB_CLOSE_PLANNER,
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// broadcastPlanner iterates the storage and compute peers of the given workflow
|
||||
// and, for each peer not yet in the cache, emits a PB_PLANNER propagation so
|
||||
// downstream consumers (oc-discovery, other schedulers) refresh their state.
|
||||
func (s *PlannerService) Broadcast(wf *workflow.Workflow) {
|
||||
if wf.Graph == nil {
|
||||
return
|
||||
}
|
||||
items := []graph.GraphItem{}
|
||||
items = append(items, wf.GetGraphItems(wf.Graph.IsStorage)...)
|
||||
items = append(items, wf.GetGraphItems(wf.Graph.IsCompute)...)
|
||||
|
||||
seen := []string{}
|
||||
for _, item := range items {
|
||||
_, res := item.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
creatorID := res.GetCreatorID()
|
||||
if slices.Contains(seen, creatorID) {
|
||||
continue
|
||||
}
|
||||
|
||||
data := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).LoadOne(creatorID)
|
||||
p := data.ToPeer()
|
||||
if p == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
s.Mu.RLock()
|
||||
cached := s.Cache[p.PeerID]
|
||||
s.Mu.RUnlock()
|
||||
|
||||
// Only request if no snapshot and no refresh already in flight.
|
||||
if cached == nil || (cached.Planner == nil && !cached.Refreshing) {
|
||||
payload, err := json.Marshal(map[string]interface{}{"peer_id": p.PeerID})
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
seen = append(seen, creatorID)
|
||||
utils.Propalgate(p.PeerID, tools.PropalgationMessage{
|
||||
Action: tools.PB_PLANNER,
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Self-planner refresh
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func (s *PlannerService) RefreshSelf(peerID string, request *tools.APIRequest) {
|
||||
p, err := planner.GenerateShallow(request)
|
||||
if err != nil {
|
||||
fmt.Println("refreshSelfPlanner: could not generate planner:", err)
|
||||
return
|
||||
}
|
||||
// Update the local cache and notify any waiting CheckStream goroutines.
|
||||
s.Store(peerID, p)
|
||||
// Broadcast the updated planner so remote peers (and oc-discovery) can
|
||||
// refresh their view of our availability.
|
||||
type plannerWithPeer struct {
|
||||
PeerID string `json:"peer_id"`
|
||||
*planner.Planner
|
||||
}
|
||||
plannerPayload, err := json.Marshal(plannerWithPeer{PeerID: peerID, Planner: p})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
utils.Propalgate(peerID, tools.PropalgationMessage{
|
||||
Action: tools.PB_PLANNER,
|
||||
Payload: plannerPayload,
|
||||
})
|
||||
}
|
||||
|
||||
// findNextSlot scans forward from 'from' in checkStepMin increments for up to
|
||||
// windowH hours and returns the first candidate start time at which all
|
||||
// resources are simultaneously free.
|
||||
func (s *PlannerService) findNextSlot(resources map[string]utils.BookingResource, from time.Time, originalEnd *time.Time, windowH int) *time.Time {
|
||||
duration := 5 * time.Minute
|
||||
if originalEnd != nil {
|
||||
if d := originalEnd.Sub(from); d > 0 {
|
||||
duration = d
|
||||
}
|
||||
}
|
||||
step := time.Duration(checkStepMin) * time.Minute
|
||||
limit := from.Add(time.Duration(windowH) * time.Hour)
|
||||
for t := from.Add(step); t.Before(limit); t = t.Add(step) {
|
||||
e := t.Add(duration)
|
||||
if unavail, _ := s.checkResourceAvailability(resources, t, &e); len(unavail) == 0 {
|
||||
return &t
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// checkResourceAvailability returns the IDs of unavailable resources and
|
||||
// human-readable warning messages.
|
||||
func (s *PlannerService) checkResourceAvailability(res map[string]utils.BookingResource, start time.Time, end *time.Time) (unavailable []string, warnings []string) {
|
||||
for _, r := range res {
|
||||
s.Mu.RLock()
|
||||
entry := s.Cache[r.PeerPID]
|
||||
s.Mu.RUnlock()
|
||||
if entry == nil || entry.Planner == nil {
|
||||
warnings = append(warnings, fmt.Sprintf(
|
||||
"peer %s planner not in cache for resource %s – assuming available", r.PeerPID, r.ID))
|
||||
continue
|
||||
}
|
||||
if !s.checkInstance(entry.Planner, r.ID, r.InstanceID, start, end) {
|
||||
unavailable = append(unavailable, r.ID)
|
||||
warnings = append(warnings, fmt.Sprintf(
|
||||
"resource %s is not available in [%s – %s]",
|
||||
r.ID, start.Format(time.RFC3339), utils.FormatOptTime(end)))
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// CheckResourceInstance checks whether a resource/instance is available on the
|
||||
// local planner cache for the given peer. Called by scheduling_resources when
|
||||
// validating an incoming booking creation.
|
||||
func (s *PlannerService) CheckResourceInstance(peerID, resourceID, instanceID string, start time.Time, end *time.Time) bool {
|
||||
s.Mu.RLock()
|
||||
entry := s.Cache[peerID]
|
||||
s.Mu.RUnlock()
|
||||
if entry == nil || entry.Planner == nil {
|
||||
return true // no planner cached → assume available
|
||||
}
|
||||
return s.checkInstance(entry.Planner, resourceID, instanceID, start, end)
|
||||
}
|
||||
|
||||
// SubscribePlannerUpdates returns a channel that receives a peerID each time
|
||||
// one of the given peers' planners is updated.
|
||||
func (s *PlannerService) SubscribePlannerUpdates(peerIDs ...string) (<-chan string, func()) {
|
||||
return SubscribeUpdates[string](s.Subs, &s.SubMu, peerIDs...)
|
||||
}
|
||||
|
||||
// SubscribeWorkflowUpdates returns a channel signalled when the workflow changes.
|
||||
func (s *PlannerService) SubscribeWorkflowUpdates(wfID string) (<-chan struct{}, func()) {
|
||||
return SubscribeUpdates[struct{}](s.WorkflowSubs, &s.WorkflowSubMu, wfID)
|
||||
}
|
||||
|
||||
// NotifyWorkflow signals all subscribers watching wfID.
|
||||
func (s *PlannerService) NotifyWorkflow(wfID string) {
|
||||
utils.Notify[struct{}](&s.WorkflowSubMu, s.WorkflowSubs, wfID, struct{}{})
|
||||
}
|
||||
|
||||
// checkInstance checks availability for the specific instance resolved by the
|
||||
// scheduler. When instanceID is empty (no instance selected / none resolvable),
|
||||
// it falls back to checking all instances known in the planner and returns true
|
||||
// if any one has remaining capacity. Returns true when no capacity is recorded.
|
||||
func (s *PlannerService) checkInstance(p *planner.Planner, resourceID string, instanceID string, start time.Time, end *time.Time) bool {
|
||||
if instanceID != "" {
|
||||
return p.Check(resourceID, instanceID, nil, start, end)
|
||||
}
|
||||
caps, ok := p.Capacities[resourceID]
|
||||
if !ok || len(caps) == 0 {
|
||||
return true
|
||||
}
|
||||
for id := range caps {
|
||||
if p.Check(resourceID, id, nil, start, end) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -1,320 +0,0 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"oc-scheduler/infrastructure/scheduling"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/bill"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/pricing"
|
||||
"cloud.o-forge.io/core/oc-lib/models/order"
|
||||
"cloud.o-forge.io/core/oc-lib/models/peer"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"github.com/google/uuid"
|
||||
"github.com/robfig/cron"
|
||||
)
|
||||
|
||||
/*
|
||||
* WorkflowSchedule is a struct that contains the scheduling information of a workflow
|
||||
* It contains the mode of the schedule (Task or Service), the name of the schedule, the start and end time of the schedule and the cron expression
|
||||
*/
|
||||
// it's a flying object only use in a session time. It's not stored in the database
|
||||
type WorkflowSchedule struct {
	UUID                    string                                  `json:"id" validate:"required"`                 // UUID identifies this scheduling session; executions created from it carry it as ExecutionsID
	Workflow                *workflow.Workflow                      `json:"workflow,omitempty"`                     // Workflow is the workflow this schedule depends on
	WorkflowExecution       []*workflow_execution.WorkflowExecution `json:"workflow_executions,omitempty"`          // WorkflowExecution is the list of executions generated for the workflow
	Message                 string                                  `json:"message,omitempty"`                      // Message is the informational message of the schedule
	Warning                 string                                  `json:"warning,omitempty"`                      // Warning is the warning message of the schedule
	Start                   time.Time                               `json:"start" validate:"required,ltfield=End"`  // Start is the start time of the schedule; required and must be less than End
	End                     *time.Time                              `json:"end,omitempty"`                          // End is the optional end time of the schedule; when set it must be greater than Start
	DurationS               float64                                 `json:"duration_s" default:"-1"`                // DurationS is the estimated duration of one execution, in seconds (-1 = unknown)
	Cron                    string                                  `json:"cron,omitempty"`                         // Cron is the recurrence expression, format: ss mm hh dd MM dw task

	BookingMode booking.BookingMode `json:"booking_mode,omitempty"` // BookingMode qualifies the preemption order of the scheduling; if no payment is allowed with preemption, set When_Possible

	SelectedInstances    workflow.ConfigItem `json:"selected_instances"`    // SelectedInstances maps tasks to the resource instances chosen for them
	SelectedPartnerships workflow.ConfigItem `json:"selected_partnerships"` // SelectedPartnerships maps tasks to the partnerships chosen for them
	SelectedBuyings      workflow.ConfigItem `json:"selected_buyings"`      // SelectedBuyings maps tasks to the purchase options chosen for them
	SelectedStrategies   workflow.ConfigItem `json:"selected_strategies"`   // SelectedStrategies maps tasks to the pricing strategies chosen for them

	SelectedBillingStrategy pricing.BillingStrategy `json:"selected_billing_strategy"` // SelectedBillingStrategy is the billing strategy applied to purchases

	// Confirm, when true, triggers Schedule() to confirm the drafts held by this session.
	Confirm bool `json:"confirm,omitempty"`
}
||||
|
||||
func NewScheduler(mode int, start string, end string, durationInS float64, cron string) *WorkflowSchedule {
|
||||
ws := &WorkflowSchedule{
|
||||
UUID: uuid.New().String(),
|
||||
Start: time.Now().UTC().Add(asapBuffer),
|
||||
BookingMode: booking.BookingMode(mode),
|
||||
DurationS: durationInS,
|
||||
Cron: cron,
|
||||
}
|
||||
s, err := time.ParseInLocation("2006-01-02T15:04:05", start, time.UTC)
|
||||
if err == nil && ws.BookingMode == booking.PLANNED {
|
||||
ws.Start = s // can apply a defined start other than now, if planned
|
||||
}
|
||||
|
||||
e, err := time.ParseInLocation("2006-01-02T15:04:05", end, time.UTC)
|
||||
if err == nil {
|
||||
ws.End = &e
|
||||
}
|
||||
return ws
|
||||
}
|
||||
|
||||
func (ws *WorkflowSchedule) GetBuyAndBook(wfID string, request *tools.APIRequest) (bool, *workflow.Workflow, []*workflow_execution.WorkflowExecution, []scheduling.SchedulerObject, []scheduling.SchedulerObject, error) {
|
||||
access := workflow.NewAccessor(request)
|
||||
res, code, err := access.LoadOne(wfID)
|
||||
if code != 200 {
|
||||
return false, nil, []*workflow_execution.WorkflowExecution{}, []scheduling.SchedulerObject{}, []scheduling.SchedulerObject{}, errors.New("could not load the workflow with id: " + err.Error())
|
||||
}
|
||||
wf := res.(*workflow.Workflow)
|
||||
isPreemptible, longest, priceds, wf, err := wf.Planify(ws.Start, ws.End,
|
||||
ws.SelectedInstances, ws.SelectedPartnerships, ws.SelectedBuyings, ws.SelectedStrategies,
|
||||
int(ws.BookingMode), request)
|
||||
if err != nil {
|
||||
return false, wf, []*workflow_execution.WorkflowExecution{}, []scheduling.SchedulerObject{}, []scheduling.SchedulerObject{}, err
|
||||
}
|
||||
ws.DurationS = longest
|
||||
ws.Message = "We estimate that the workflow will start at " + ws.Start.String() + " and last " + fmt.Sprintf("%v", ws.DurationS) + " seconds."
|
||||
if ws.End != nil && ws.Start.Add(time.Duration(longest)*time.Second).After(*ws.End) {
|
||||
ws.Warning = "The workflow may be too long to be executed in the given time frame, we will try to book it anyway\n"
|
||||
}
|
||||
execs, err := ws.GetExecutions(wf, isPreemptible)
|
||||
if err != nil {
|
||||
return false, wf, []*workflow_execution.WorkflowExecution{}, []scheduling.SchedulerObject{}, []scheduling.SchedulerObject{}, err
|
||||
}
|
||||
purchased := []scheduling.SchedulerObject{}
|
||||
bookings := []scheduling.SchedulerObject{}
|
||||
for _, exec := range execs {
|
||||
for _, obj := range exec.Buy(ws.SelectedBillingStrategy, ws.UUID, wfID, priceds) {
|
||||
purchased = append(purchased, scheduling.ToSchedulerObject(tools.PURCHASE_RESOURCE, obj))
|
||||
}
|
||||
for _, obj := range exec.Book(ws.UUID, wfID, priceds) {
|
||||
bookings = append(bookings, scheduling.ToSchedulerObject(tools.BOOKING, obj))
|
||||
}
|
||||
}
|
||||
return true, wf, execs, purchased, bookings, nil
|
||||
}
|
||||
|
||||
// GenerateOrder creates a draft order (+ draft bill) for the given purchases and bookings.
|
||||
// Returns the created order ID and any error.
|
||||
func (ws *WorkflowSchedule) GenerateOrder(purchases []scheduling.SchedulerObject, bookings []scheduling.SchedulerObject, executionsID string, request *tools.APIRequest) (string, error) {
|
||||
newOrder := &order.Order{
|
||||
AbstractObject: utils.AbstractObject{
|
||||
Name: "order_" + request.PeerID + "_" + time.Now().UTC().Format("2006-01-02T15:04:05"),
|
||||
IsDraft: true,
|
||||
},
|
||||
ExecutionsID: executionsID,
|
||||
Purchases: []*purchase_resource.PurchaseResource{},
|
||||
Bookings: []*booking.Booking{},
|
||||
Status: enum.PENDING,
|
||||
}
|
||||
for _, purch := range purchases {
|
||||
newOrder.Purchases = append(
|
||||
newOrder.Purchases, scheduling.FromSchedulerObject(tools.PURCHASE_RESOURCE, purch).(*purchase_resource.PurchaseResource))
|
||||
}
|
||||
for _, b := range bookings {
|
||||
newOrder.Bookings = append(
|
||||
newOrder.Bookings, scheduling.FromSchedulerObject(tools.BOOKING, b).(*booking.Booking))
|
||||
}
|
||||
res, _, err := order.NewAccessor(request).StoreOne(newOrder)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if _, err := bill.DraftFirstBill(res.(*order.Order), request); err != nil {
|
||||
return res.GetID(), err
|
||||
}
|
||||
return res.GetID(), nil
|
||||
}
|
||||
|
||||
// Schedules confirms a scheduling session previously opened by a Check stream.
// It verifies no session execution has become obsolete, confirms the drafts
// (bookings/purchases) held by the session, starts a deadline watcher per
// execution, and re-attaches the workflow to the session object.
//
// Returns the (mutated) session, the workflow (nil if it could not be
// reloaded), the session's executions, and an error. Direct scheduling
// without a prior Check session (empty ws.UUID) is rejected.
func (ws *WorkflowSchedule) Schedules(wfID string, request *tools.APIRequest) (*WorkflowSchedule, *workflow.Workflow, []*workflow_execution.WorkflowExecution, error) {
	if request == nil {
		return ws, nil, []*workflow_execution.WorkflowExecution{}, errors.New("no request found")
	}
	selfID, _ := oclib.GetMySelf()

	// If the client provides a scheduling_id from a Check session, confirm the
	// pre-created drafts (bookings/purchases). Executions already exist as drafts
	// and will be confirmed later by the considers mechanism.
	if ws.UUID != "" {
		adminReq := &tools.APIRequest{Admin: true}

		// Obsolescence check: abort if any session execution's start date has passed.
		executions := loadSessionExecs(ws.UUID)
		for _, exec := range executions {
			if !exec.ExecDate.IsZero() && exec.ExecDate.Before(time.Now().UTC()) {
				// NOTE(review): this path returns a nil executions slice while
				// the other error paths return an empty slice — confirm callers
				// treat both the same.
				return ws, nil, nil, fmt.Errorf("execution %s is obsolete (start date in the past)", exec.GetID())
			}
		}

		if err := ConfirmSession(ws.UUID, selfID, request); err != nil {
			return ws, nil, []*workflow_execution.WorkflowExecution{}, fmt.Errorf("confirm session failed: %w", err)
		}

		// One unbounded watcher goroutine per execution; each monitors its deadline.
		for _, exec := range executions {
			go WatchExecDeadline(exec.GetID(), exec.ExecutionsID, exec.ExecDate, selfID, request)
		}

		// Best-effort: re-attach the workflow; confirmation already succeeded,
		// so a load failure is not treated as an error.
		obj, _, _ := workflow.NewAccessor(request).LoadOne(wfID)
		if obj == nil {
			return ws, nil, executions, nil
		}
		wf := obj.(*workflow.Workflow)
		ws.Workflow = wf
		ws.WorkflowExecution = executions
		// NOTE(review): UpdateOne's result/error is discarded — confirm a failed
		// workflow update is acceptable here.
		wf.GetAccessor(adminReq).UpdateOne(wf.Serialize(wf), wf.GetID())
		return ws, wf, executions, nil
	}

	// Schedule must be called from a Check session (ws.UUID set above).
	// Direct scheduling without a prior Check session is not supported.
	return ws, nil, []*workflow_execution.WorkflowExecution{}, errors.New("no scheduling session: use the Check stream first")
}
|
||||
|
||||
// propagateResource routes a purchase or booking to its destination:
|
||||
// - If destPeerID matches our own peer (selfMongoID), the object is stored
|
||||
// directly in the local DB as draft and the local planner is refreshed.
|
||||
// - Otherwise a NATS CREATE_RESOURCE message is emitted so the destination
|
||||
// peer can process it asynchronously.
|
||||
//
|
||||
// The caller is responsible for setting obj.IsDraft before calling.
|
||||
func propagateResource(obj utils.DBObject, destPeerID string, dt tools.DataType, selfMongoID *peer.Peer, request *tools.APIRequest, errCh chan error) {
|
||||
if destPeerID == selfMongoID.GetID() {
|
||||
stored := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).StoreOne(obj.Serialize(obj))
|
||||
if stored.Err != "" || stored.Data == nil {
|
||||
errCh <- fmt.Errorf("could not store %s locally: %s", dt.String(), stored.Err)
|
||||
return
|
||||
}
|
||||
// The planner tracks booking time-slots only; purchases do not affect it.
|
||||
if dt == tools.BOOKING {
|
||||
go refreshSelfPlanner(selfMongoID.PeerID, request)
|
||||
}
|
||||
errCh <- nil
|
||||
return
|
||||
}
|
||||
m := obj.Serialize(obj)
|
||||
if m["dest_peer_id"] != nil {
|
||||
if data := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).LoadOne(fmt.Sprintf("%v", m["dest_peer_id"])); data.Data != nil {
|
||||
m["peer_id"] = data.Data.(*peer.Peer).PeerID
|
||||
}
|
||||
} else {
|
||||
fmt.Println("NO DEST ID")
|
||||
return
|
||||
}
|
||||
payload, err := json.Marshal(m)
|
||||
if err != nil {
|
||||
errCh <- fmt.Errorf("could not serialize %s: %w", dt.String(), err)
|
||||
return
|
||||
}
|
||||
if b, err := json.Marshal(&tools.PropalgationMessage{
|
||||
DataType: dt.EnumIndex(),
|
||||
Action: tools.PB_CREATE,
|
||||
Payload: payload,
|
||||
}); err == nil {
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-scheduler",
|
||||
Datatype: dt,
|
||||
Method: int(tools.PROPALGATION_EVENT),
|
||||
Payload: b,
|
||||
})
|
||||
}
|
||||
errCh <- nil
|
||||
}
|
||||
|
||||
/*
 * GetExecutions expands the schedule's date slots into draft
 * workflow_execution.WorkflowExecution instances, one per slot.
 */
func (ws *WorkflowSchedule) GetExecutions(workflow *workflow.Workflow, isPreemptible bool) ([]*workflow_execution.WorkflowExecution, error) {
	workflows_executions := []*workflow_execution.WorkflowExecution{}
	dates, err := ws.GetDates()
	if err != nil {
		return workflows_executions, err
	}
	for _, date := range dates {
		obj := &workflow_execution.WorkflowExecution{
			AbstractObject: utils.AbstractObject{
				UUID: uuid.New().String(),                            // set the uuid of the execution
				Name: workflow.Name + "_execution_" + date.Start.String(), // set the name of the execution
			},
			Priority:     1,
			ExecutionsID: ws.UUID,          // link back to this scheduling session
			ExecDate:     date.Start,       // set the execution date
			EndDate:      date.End,         // set the end date
			State:        enum.DRAFT,       // executions start as drafts; confirmed later
			WorkflowID:   workflow.GetID(), // set the workflow id dependancy of the execution
		}
		// Priority: PLANNED keeps 1, any other mode drops to 0,
		// preemptible PREEMPTED sessions are bumped to 7.
		if ws.BookingMode != booking.PLANNED {
			obj.Priority = 0
		}
		if ws.BookingMode == booking.PREEMPTED && isPreemptible {
			obj.Priority = 7
		}

		// NOTE(review): these assignments copy FROM the freshly built execution
		// back ONTO ws. obj's Selected* fields are never populated above, so
		// this overwrites the session's selections with zero values on every
		// iteration. The intended direction may be obj.SelectedX = ws.SelectedX
		// (the refactored GenerateExecutions drops these lines entirely) — confirm.
		ws.SelectedStrategies = obj.SelectedStrategies
		ws.SelectedPartnerships = obj.SelectedPartnerships
		ws.SelectedBuyings = obj.SelectedBuyings
		ws.SelectedInstances = obj.SelectedInstances

		workflows_executions = append(workflows_executions, obj)
	}
	return workflows_executions, nil
}
|
||||
|
||||
func (ws *WorkflowSchedule) GetDates() ([]Schedule, error) {
|
||||
schedule := []Schedule{}
|
||||
if len(ws.Cron) > 0 { // if cron is set then end date should be set
|
||||
if ws.End == nil {
|
||||
return schedule, errors.New("a cron task should have an end date")
|
||||
}
|
||||
if ws.DurationS <= 0 {
|
||||
ws.DurationS = ws.End.Sub(ws.Start).Seconds()
|
||||
}
|
||||
cronStr := strings.Split(ws.Cron, " ") // split the cron string to treat it
|
||||
if len(cronStr) < 6 { // if the cron string is less than 6 fields, return an error because format is : ss mm hh dd MM dw (6 fields)
|
||||
return schedule, errors.New("Bad cron message: (" + ws.Cron + "). Should be at least ss mm hh dd MM dw")
|
||||
}
|
||||
subCron := strings.Join(cronStr[:6], " ")
|
||||
// cron should be parsed as ss mm hh dd MM dw t (min 6 fields)
|
||||
specParser := cron.NewParser(cron.Second | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow) // create a new cron parser
|
||||
sched, err := specParser.Parse(subCron) // parse the cron string
|
||||
if err != nil {
|
||||
return schedule, errors.New("Bad cron message: " + err.Error())
|
||||
}
|
||||
// loop through the cron schedule to set the executions
|
||||
for s := sched.Next(ws.Start); !s.IsZero() && s.Before(*ws.End); s = sched.Next(s) {
|
||||
e := s.Add(time.Duration(ws.DurationS) * time.Second)
|
||||
schedule = append(schedule, Schedule{
|
||||
Start: s,
|
||||
End: &e,
|
||||
})
|
||||
}
|
||||
} else { // if no cron, set the execution to the start date
|
||||
schedule = append(schedule, Schedule{
|
||||
Start: ws.Start,
|
||||
End: ws.End,
|
||||
})
|
||||
}
|
||||
return schedule, nil
|
||||
}
|
||||
|
||||
// Schedule is a resolved start/end pair for a single execution slot.
type Schedule struct {
	Start time.Time  // Start is the slot's begin time
	End   *time.Time // End is the slot's optional end time (nil = open-ended)
}
|
||||
|
||||
/*
|
||||
* TODO : LARGEST GRAIN PLANIFYING THE WORKFLOW WHEN OPTION IS SET
|
||||
* SET PROTECTION BORDER TIME
|
||||
*/
|
||||
235
infrastructure/scheduler/scheduler.go
Normal file
235
infrastructure/scheduler/scheduler.go
Normal file
@@ -0,0 +1,235 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"oc-scheduler/infrastructure/planner"
|
||||
"oc-scheduler/infrastructure/scheduling_resources"
|
||||
infUtils "oc-scheduler/infrastructure/utils"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/pricing"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"github.com/google/uuid"
|
||||
"github.com/robfig/cron"
|
||||
)
|
||||
|
||||
const asapBuffer = 2 * time.Minute
|
||||
|
||||
// Schedule holds a resolved start/end pair for a single execution slot.
type Schedule struct {
	Start time.Time  // Start is the slot's begin time
	End   *time.Time // End is the slot's optional end time (nil = open-ended)
}
|
||||
|
||||
// WorkflowSchedule is the flying session object for a scheduling interaction.
// It is never persisted; it lives only for the duration of a WebSocket check session.
type WorkflowSchedule struct {
	UUID              string                                  `json:"id" validate:"required"`        // session id; propagated to executions as ExecutionsID
	Workflow          *workflow.Workflow                      `json:"workflow,omitempty"`            // workflow this schedule depends on
	WorkflowExecution []*workflow_execution.WorkflowExecution `json:"workflow_executions,omitempty"` // executions generated for the workflow
	Message           string                                  `json:"message,omitempty"`             // informational message for the client
	Warning           string                                  `json:"warning,omitempty"`             // non-fatal warning for the client
	Start             time.Time                               `json:"start" validate:"required,ltfield=End"` // required start; must precede End
	End               *time.Time                              `json:"end,omitempty"`                 // optional end; must follow Start when set
	DurationS         float64                                 `json:"duration_s" default:"-1"`       // estimated duration of one execution, in seconds (-1 = unknown)
	Cron              string                                  `json:"cron,omitempty"`                // recurrence, format: ss mm hh dd MM dw task

	BookingMode             booking.BookingMode     `json:"booking_mode,omitempty"`      // preemption/booking policy of the session
	SelectedInstances       workflow.ConfigItem     `json:"selected_instances"`          // per-task chosen resource instances
	SelectedPartnerships    workflow.ConfigItem     `json:"selected_partnerships"`       // per-task chosen partnerships
	SelectedBuyings         workflow.ConfigItem     `json:"selected_buyings"`            // per-task chosen purchase options
	SelectedStrategies      workflow.ConfigItem     `json:"selected_strategies"`         // per-task chosen pricing strategies
	SelectedBillingStrategy pricing.BillingStrategy `json:"selected_billing_strategy"`   // billing strategy applied to purchases

	// Confirm, when true, triggers Schedule() to confirm the drafts held by this session.
	Confirm bool `json:"confirm,omitempty"`
}
|
||||
|
||||
// CheckResult is the response payload for an availability check.
type CheckResult struct {
	Available   bool       `json:"available"`             // true when the requested slot can be booked on all peers
	Start       time.Time  `json:"start"`                 // resolved start proposed by the planner
	End         *time.Time `json:"end,omitempty"`         // resolved end proposed by the planner (nil = open-ended)
	Warnings    []string   `json:"warnings,omitempty"`    // non-fatal remarks collected during the check
	Preemptible bool       `json:"preemptible,omitempty"` // true when the slot could be preempted
	// SchedulingID is the session UUID the client must supply when confirming.
	SchedulingID string `json:"scheduling_id,omitempty"`
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Check — availability
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Check verifies whether the requested slot is available across all resource peers.
|
||||
func (ws *WorkflowSchedule) Check(wfID string, asap bool, preemption bool, request *tools.APIRequest) (*CheckResult, error) {
|
||||
fmt.Println("CHECK", asap, "/", preemption)
|
||||
obj, code, err := workflow.NewAccessor(request).LoadOne(wfID)
|
||||
if code != 200 || err != nil {
|
||||
msg := "could not load workflow " + wfID
|
||||
if err != nil {
|
||||
msg += ": " + err.Error()
|
||||
}
|
||||
return nil, errors.New(msg)
|
||||
}
|
||||
wf := obj.(*workflow.Workflow)
|
||||
|
||||
start := ws.Start
|
||||
if asap || start.IsZero() {
|
||||
start = time.Now().UTC().Add(asapBuffer)
|
||||
}
|
||||
|
||||
end := ws.End
|
||||
if end == nil {
|
||||
if ws.DurationS > 0 {
|
||||
e := start.Add(time.Duration(ws.DurationS * float64(time.Second)))
|
||||
end = &e
|
||||
} else {
|
||||
_, longest, _, _, planErr := wf.Planify(
|
||||
start, nil,
|
||||
ws.SelectedInstances, ws.SelectedPartnerships,
|
||||
ws.SelectedBuyings, ws.SelectedStrategies,
|
||||
int(ws.BookingMode), nil, request,
|
||||
)
|
||||
if planErr == nil && longest > 0 {
|
||||
e := start.Add(time.Duration(longest) * time.Second)
|
||||
end = &e
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
checkables := infUtils.CollectBookingResources(wf, ws.SelectedInstances)
|
||||
start, end, available, preemptible, warnings := planner.GetPlannerService().FindDate(wfID, checkables, start, end, preemption, asap)
|
||||
|
||||
return &CheckResult{
|
||||
Start: start,
|
||||
End: end,
|
||||
Available: available,
|
||||
Preemptible: preemptible,
|
||||
Warnings: warnings,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// GetBuyAndBook — generate scheduling resources
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// GetBuyAndBook runs Planify to generate the purchases and bookings for this session.
|
||||
func (ws *WorkflowSchedule) GetBuyAndBook(wfID string, request *tools.APIRequest) (
|
||||
bool,
|
||||
*workflow.Workflow,
|
||||
[]*workflow_execution.WorkflowExecution,
|
||||
[]scheduling_resources.SchedulerObject,
|
||||
[]scheduling_resources.SchedulerObject,
|
||||
error,
|
||||
) {
|
||||
res, code, err := workflow.NewAccessor(request).LoadOne(wfID)
|
||||
if code != 200 {
|
||||
return false, nil, nil, nil, nil,
|
||||
errors.New("could not load the workflow: " + err.Error())
|
||||
}
|
||||
wf := res.(*workflow.Workflow)
|
||||
isPreemptible, longest, priceds, wf, err := wf.Planify(
|
||||
ws.Start, ws.End,
|
||||
ws.SelectedInstances, ws.SelectedPartnerships,
|
||||
ws.SelectedBuyings, ws.SelectedStrategies,
|
||||
int(ws.BookingMode), nil, request,
|
||||
)
|
||||
if err != nil {
|
||||
return false, wf, nil, nil, nil, err
|
||||
}
|
||||
ws.DurationS = longest
|
||||
ws.Message = "We estimate that the workflow will start at " + ws.Start.String() +
|
||||
" and last " + fmt.Sprintf("%v", ws.DurationS) + " seconds."
|
||||
if ws.End != nil && ws.Start.Add(time.Duration(longest)*time.Second).After(*ws.End) {
|
||||
ws.Warning = "The workflow may be too long to be executed in the given time frame, we will try to book it anyway\n"
|
||||
}
|
||||
|
||||
execs, err := ws.GenerateExecutions(wf, isPreemptible)
|
||||
if err != nil {
|
||||
return false, wf, nil, nil, nil, err
|
||||
}
|
||||
|
||||
var purchased, bookings []scheduling_resources.SchedulerObject
|
||||
for _, exec := range execs {
|
||||
for _, obj := range exec.Buy(ws.SelectedBillingStrategy, ws.UUID, wfID, priceds) {
|
||||
purchased = append(purchased, scheduling_resources.ToSchedulerObject(tools.PURCHASE_RESOURCE, obj))
|
||||
}
|
||||
for _, obj := range exec.Book(ws.UUID, wfID, priceds) {
|
||||
bookings = append(bookings, scheduling_resources.ToSchedulerObject(tools.BOOKING, obj))
|
||||
}
|
||||
}
|
||||
return true, wf, execs, purchased, bookings, nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// GenerateExecutions / GetDates
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// GenerateExecutions expands the cron schedule into WorkflowExecution instances.
|
||||
func (ws *WorkflowSchedule) GenerateExecutions(wf *workflow.Workflow, isPreemptible bool) ([]*workflow_execution.WorkflowExecution, error) {
|
||||
dates, err := ws.GetDates()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var executions []*workflow_execution.WorkflowExecution
|
||||
for _, date := range dates {
|
||||
obj := &workflow_execution.WorkflowExecution{
|
||||
AbstractObject: utils.AbstractObject{
|
||||
UUID: uuid.New().String(),
|
||||
Name: wf.Name + "_execution_" + date.Start.String(),
|
||||
},
|
||||
Priority: 1,
|
||||
ExecutionsID: ws.UUID,
|
||||
ExecDate: date.Start,
|
||||
EndDate: date.End,
|
||||
State: enum.DRAFT,
|
||||
WorkflowID: wf.GetID(),
|
||||
}
|
||||
if ws.BookingMode != booking.PLANNED {
|
||||
obj.Priority = 0
|
||||
}
|
||||
if ws.BookingMode == booking.PREEMPTED && isPreemptible {
|
||||
obj.Priority = 7
|
||||
}
|
||||
executions = append(executions, obj)
|
||||
}
|
||||
return executions, nil
|
||||
}
|
||||
|
||||
// GetDates parses the cron expression and returns execution date slots.
|
||||
func (ws *WorkflowSchedule) GetDates() ([]Schedule, error) {
|
||||
var schedule []Schedule
|
||||
if len(ws.Cron) > 0 {
|
||||
if ws.End == nil {
|
||||
return schedule, errors.New("a cron task should have an end date")
|
||||
}
|
||||
if ws.DurationS <= 0 {
|
||||
ws.DurationS = ws.End.Sub(ws.Start).Seconds()
|
||||
}
|
||||
cronStr := strings.Split(ws.Cron, " ")
|
||||
if len(cronStr) < 6 {
|
||||
return schedule, errors.New("Bad cron message: (" + ws.Cron + "). Should be at least ss mm hh dd MM dw")
|
||||
}
|
||||
subCron := strings.Join(cronStr[:6], " ")
|
||||
specParser := cron.NewParser(cron.Second | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
|
||||
sched, err := specParser.Parse(subCron)
|
||||
if err != nil {
|
||||
return schedule, errors.New("Bad cron message: " + err.Error())
|
||||
}
|
||||
for s := sched.Next(ws.Start); !s.IsZero() && s.Before(*ws.End); s = sched.Next(s) {
|
||||
e := s.Add(time.Duration(ws.DurationS) * time.Second)
|
||||
schedule = append(schedule, Schedule{Start: s, End: &e})
|
||||
}
|
||||
} else {
|
||||
schedule = append(schedule, Schedule{Start: ws.Start, End: ws.End})
|
||||
}
|
||||
return schedule, nil
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package scheduling
|
||||
package scheduling_resources
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
@@ -77,66 +77,25 @@ func ToSchedulerObject(dt tools.DataType, obj utils.ShallowDBObject) SchedulerOb
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *ScheduledBooking) GetExecutionId() string {
|
||||
return b.ExecutionID
|
||||
}
|
||||
|
||||
func (b *ScheduledPurchase) GetExecutionId() string {
|
||||
return b.ExecutionID
|
||||
}
|
||||
|
||||
func (b *ScheduledBooking) GetExecutionsId() string {
|
||||
return b.ExecutionsID
|
||||
}
|
||||
|
||||
func (b *ScheduledPurchase) GetExecutionsId() string {
|
||||
return b.ExecutionsID
|
||||
}
|
||||
|
||||
func (b *ScheduledBooking) GetPeerSession() string {
|
||||
return b.SchedulerPeerID
|
||||
}
|
||||
|
||||
func (b *ScheduledPurchase) GetPeerSession() string {
|
||||
return b.SchedulerPeerID
|
||||
}
|
||||
|
||||
func (b *ScheduledBooking) GetDestPeer() string {
|
||||
return b.DestPeerID
|
||||
}
|
||||
|
||||
func (b *ScheduledPurchase) GetDestPeer() string {
|
||||
return b.DestPeerID
|
||||
}
|
||||
func (b *ScheduledBooking) GetExecutionId() string { return b.ExecutionID }
|
||||
func (b *ScheduledPurchase) GetExecutionId() string { return b.ExecutionID }
|
||||
func (b *ScheduledBooking) GetExecutionsId() string { return b.ExecutionsID }
|
||||
func (b *ScheduledPurchase) GetExecutionsId() string { return b.ExecutionsID }
|
||||
func (b *ScheduledBooking) GetPeerSession() string { return b.SchedulerPeerID }
|
||||
func (b *ScheduledPurchase) GetPeerSession() string { return b.SchedulerPeerID }
|
||||
func (b *ScheduledBooking) GetDestPeer() string { return b.DestPeerID }
|
||||
func (b *ScheduledPurchase) GetDestPeer() string { return b.DestPeerID }
|
||||
|
||||
func (b *ScheduledBooking) GetKey() string {
|
||||
return b.ResourceID + "/" + b.InstanceID + "/" + tools.BOOKING.String()
|
||||
}
|
||||
|
||||
func (b *ScheduledPurchase) GetKey() string {
|
||||
return b.ResourceID + "/" + b.InstanceID + "/" + tools.PURCHASE_RESOURCE.String()
|
||||
}
|
||||
|
||||
func (b *ScheduledBooking) SetIsDraft(ok bool) {
|
||||
b.IsDraft = ok
|
||||
}
|
||||
|
||||
func (b *ScheduledPurchase) SetIsDraft(ok bool) {
|
||||
b.IsDraft = ok
|
||||
}
|
||||
|
||||
func (b *ScheduledBooking) SetSchedulerPeerID(peerID string) {
|
||||
b.SchedulerPeerID = peerID
|
||||
}
|
||||
|
||||
func (b *ScheduledPurchase) SetSchedulerPeerID(peerID string) {
|
||||
b.SchedulerPeerID = peerID
|
||||
}
|
||||
|
||||
func (b *ScheduledBooking) SetExecutionsID(ei string) {
|
||||
b.ExecutionsID = ei
|
||||
}
|
||||
|
||||
func (b *ScheduledPurchase) SetExecutionsID(ei string) {
|
||||
b.ExecutionsID = ei
|
||||
}
|
||||
func (b *ScheduledBooking) SetIsDraft(ok bool) { b.IsDraft = ok }
|
||||
func (b *ScheduledPurchase) SetIsDraft(ok bool) { b.IsDraft = ok }
|
||||
func (b *ScheduledBooking) SetSchedulerPeerID(p string) { b.SchedulerPeerID = p }
|
||||
func (b *ScheduledPurchase) SetSchedulerPeerID(p string) { b.SchedulerPeerID = p }
|
||||
func (b *ScheduledBooking) SetExecutionsID(ei string) { b.ExecutionsID = ei }
|
||||
func (b *ScheduledPurchase) SetExecutionsID(ei string) { b.ExecutionsID = ei }
|
||||
474
infrastructure/scheduling_resources/service.go
Normal file
474
infrastructure/scheduling_resources/service.go
Normal file
@@ -0,0 +1,474 @@
|
||||
package scheduling_resources
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
"cloud.o-forge.io/core/oc-lib/models/peer"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"oc-scheduler/infrastructure/planner"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Service
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// SchedulingResourcesService manages the lifecycle of Booking and PurchaseResource
// as SchedulerObjects. It caches the local peer identity so every operation can
// route correctly without calling oclib.GetMySelf() on each request.
type SchedulingResourcesService struct {
	mu       sync.RWMutex // guards selfPeer (the struct must not be copied)
	selfPeer *peer.Peer   // cached local peer identity; nil until first Self() call
}
|
||||
|
||||
var singleton *SchedulingResourcesService
|
||||
|
||||
func init() {
|
||||
singleton = &SchedulingResourcesService{}
|
||||
}
|
||||
|
||||
// GetService returns the singleton SchedulingResourcesService.
|
||||
func GetService() *SchedulingResourcesService {
|
||||
return singleton
|
||||
}
|
||||
|
||||
// Self returns the cached local peer, lazily resolving it on first call.
|
||||
func (s *SchedulingResourcesService) Self() *peer.Peer {
|
||||
s.mu.RLock()
|
||||
p := s.selfPeer
|
||||
s.mu.RUnlock()
|
||||
if p != nil {
|
||||
return p
|
||||
}
|
||||
p, _ = oclib.GetMySelf()
|
||||
if p != nil {
|
||||
s.mu.Lock()
|
||||
s.selfPeer = p
|
||||
s.mu.Unlock()
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
// InvalidateSelf clears the cached self peer (e.g. after a peer re-registration).
|
||||
func (s *SchedulingResourcesService) InvalidateSelf() {
|
||||
s.mu.Lock()
|
||||
s.selfPeer = nil
|
||||
s.mu.Unlock()
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// RemoveResourcePayload
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// RemoveResourcePayload is sent via NATS REMOVE_RESOURCE so the receiver can
// verify the delete order comes from the original scheduler session.
type RemoveResourcePayload struct {
	ID              string `json:"id"`                // id of the booking/purchase to remove
	SchedulerPeerID string `json:"scheduler_peer_id"` // peer that owns the originating scheduling session
	ExecutionsID    string `json:"executions_id"`     // scheduling-session id the resource belongs to
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Propagation — creation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// PropagateCreate routes a new booking/purchase draft to its destination:
|
||||
// - local peer → store in DB + refresh planner
|
||||
// - remote peer → emit NATS PROPALGATION_EVENT/PB_CREATE
|
||||
func (s *SchedulingResourcesService) PropagateCreate(
|
||||
obj utils.DBObject,
|
||||
destPeerID string,
|
||||
dt tools.DataType,
|
||||
request *tools.APIRequest,
|
||||
errCh chan error,
|
||||
) {
|
||||
selfID := s.Self()
|
||||
if selfID == nil {
|
||||
errCh <- fmt.Errorf("PropagateCreate: local peer not available")
|
||||
return
|
||||
}
|
||||
|
||||
if destPeerID == selfID.GetID() {
|
||||
stored := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).StoreOne(obj.Serialize(obj))
|
||||
if stored.Err != "" || stored.Data == nil {
|
||||
errCh <- fmt.Errorf("could not store %s locally: %s", dt.String(), stored.Err)
|
||||
return
|
||||
}
|
||||
if dt == tools.BOOKING {
|
||||
planner.GetPlannerService().RefreshSelf(selfID.PeerID, request)
|
||||
}
|
||||
errCh <- nil
|
||||
return
|
||||
}
|
||||
|
||||
m := obj.Serialize(obj)
|
||||
if m["dest_peer_id"] != nil {
|
||||
if data := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).LoadOne(fmt.Sprintf("%v", m["dest_peer_id"])); data.Data != nil {
|
||||
m["peer_id"] = data.Data.(*peer.Peer).PeerID
|
||||
}
|
||||
} else if m["peerless"] == true {
|
||||
originRef := fmt.Sprintf("%v", m["origin_ref"])
|
||||
if !isValidPeerlessRef(originRef) {
|
||||
emitPeerBehaviorReport(request.PeerID, tools.BehaviorFraud,
|
||||
"peerless booking with invalid or unrecognised Origin.Ref", originRef)
|
||||
errCh <- fmt.Errorf("peerless booking rejected: invalid Origin.Ref %q", originRef)
|
||||
return
|
||||
}
|
||||
stored := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).StoreOne(m)
|
||||
if stored.Err != "" || stored.Data == nil {
|
||||
errCh <- fmt.Errorf("could not store peerless %s locally: %s", dt.String(), stored.Err)
|
||||
return
|
||||
}
|
||||
if dt == tools.BOOKING {
|
||||
planner.GetPlannerService().RefreshSelf(selfID.PeerID, request)
|
||||
}
|
||||
errCh <- nil
|
||||
return
|
||||
} else {
|
||||
fmt.Println("PropagateCreate: no dest_peer_id and not peerless, skipping")
|
||||
errCh <- nil
|
||||
return
|
||||
}
|
||||
|
||||
payload, err := json.Marshal(m)
|
||||
if err != nil {
|
||||
errCh <- fmt.Errorf("could not serialize %s: %w", dt.String(), err)
|
||||
return
|
||||
}
|
||||
b, err := json.Marshal(&tools.PropalgationMessage{
|
||||
DataType: dt.EnumIndex(),
|
||||
Action: tools.PB_CREATE,
|
||||
Payload: payload,
|
||||
})
|
||||
if err == nil {
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-scheduler",
|
||||
Datatype: dt,
|
||||
Method: int(tools.PROPALGATION_EVENT),
|
||||
Payload: b,
|
||||
})
|
||||
}
|
||||
errCh <- nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Propagation — update / confirmation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// PropagateWrite routes a booking/purchase update to its destination.
// Returns true when the resource was confirmed locally (IsDraft=false on self peer)
// and the caller must trigger considers via execution.UpdateExecutionState.
// For remote destinations the update is published over NATS and false is
// returned (confirmation happens asynchronously on the remote peer).
func (s *SchedulingResourcesService) PropagateWrite(
	obj utils.DBObject,
	destPeerID string,
	dt tools.DataType,
	request *tools.APIRequest,
) bool {
	selfID := s.Self()
	if selfID == nil {
		// No local identity: cannot route; treated as "not confirmed locally".
		fmt.Println("PropagateWrite: local peer not available")
		return false
	}

	// Local destination: raw-update in place.
	if destPeerID == selfID.GetID() {
		if _, _, err := utils.GenericRawUpdateOne(obj, obj.GetID(), obj.GetAccessor(request)); err != nil {
			fmt.Printf("PropagateWrite: local update failed for %s %s: %v\n", dt, obj.GetID(), err)
			return false
		}
		// The planner tracks booking time-slots only; purchases do not affect it.
		if dt == tools.BOOKING {
			planner.GetPlannerService().RefreshSelf(selfID.PeerID, request)
		}
		// Confirmed locally iff the object is no longer a draft.
		return !obj.IsDrafted()
	}

	// Remote destination: publish the update over NATS; a marshal failure is
	// swallowed here and simply reported as "not confirmed".
	payload, err := json.Marshal(obj)
	if err != nil {
		return false
	}
	tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
		FromApp:  "oc-scheduler",
		Datatype: dt,
		Method:   int(tools.CREATE_RESOURCE),
		Payload:  payload,
	})
	return false
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Deletion
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Delete removes a booking/purchase from its destination peer (local or NATS).
|
||||
func (s *SchedulingResourcesService) Delete(dt tools.DataType, bk SchedulerObject, request *tools.APIRequest) {
|
||||
selfID := s.Self()
|
||||
if selfID == nil {
|
||||
fmt.Println("Delete: local peer not available")
|
||||
return
|
||||
}
|
||||
|
||||
if bk.GetDestPeer() == selfID.GetID() {
|
||||
data := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).DeleteOne(bk.GetID())
|
||||
fmt.Println("Delete scheduling resource", bk.GetID(), data.Err)
|
||||
if dt == tools.BOOKING {
|
||||
planner.GetPlannerService().RefreshSelf(selfID.PeerID, request)
|
||||
}
|
||||
return
|
||||
}
|
||||
EmitNATSRemove(bk.GetID(), bk.GetPeerSession(), bk.GetExecutionsId(), dt)
|
||||
}
|
||||
|
||||
// EmitNATSRemove sends a REMOVE_RESOURCE NATS event with auth fields.
|
||||
func EmitNATSRemove(id, schedulerPeerID, executionsID string, dt tools.DataType) {
|
||||
payload, _ := json.Marshal(RemoveResourcePayload{
|
||||
ID: id,
|
||||
SchedulerPeerID: schedulerPeerID,
|
||||
ExecutionsID: executionsID,
|
||||
})
|
||||
tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
|
||||
FromApp: "oc-scheduler",
|
||||
Datatype: dt,
|
||||
Method: int(tools.REMOVE_RESOURCE),
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Confirmation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Confirm sets IsDraft=false on a booking or purchase.
// For bookings, also advances State to SCHEDULED and refreshes the self planner.
// Data types other than BOOKING / PURCHASE_RESOURCE are silently ignored.
func Confirm(id string, dt tools.DataType) {
	// Admin request: confirmation is a system action, not a user action.
	adminReq := &tools.APIRequest{Admin: true}
	switch dt {
	case tools.BOOKING:
		res, _, err := booking.NewAccessor(adminReq).LoadOne(id)
		if err != nil || res == nil {
			fmt.Printf("Confirm: could not load booking %s: %v\n", id, err)
			return
		}
		bk := res.(*booking.Booking)
		bk.IsDraft = false
		bk.State = enum.SCHEDULED
		if _, _, err := utils.GenericRawUpdateOne(bk, id, booking.NewAccessor(adminReq)); err != nil {
			fmt.Printf("Confirm: could not confirm booking %s: %v\n", id, err)
			return
		}
		// A confirmed booking now occupies a slot in the local planner.
		if self := GetService().Self(); self != nil {
			planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
		}
	case tools.PURCHASE_RESOURCE:
		res, _, err := purchase_resource.NewAccessor(adminReq).LoadOne(id)
		if err != nil || res == nil {
			fmt.Printf("Confirm: could not load purchase %s: %v\n", id, err)
			return
		}
		pr := res.(*purchase_resource.PurchaseResource)
		pr.IsDraft = false
		// No planner refresh here — presumably purchases do not occupy
		// planner slots; TODO confirm.
		if _, _, err := utils.GenericRawUpdateOne(pr, id, purchase_resource.NewAccessor(adminReq)); err != nil {
			fmt.Printf("Confirm: could not confirm purchase %s: %v\n", id, err)
		}
	}
}
|
||||
|
||||
// DraftTimeout deletes a booking/purchase if it is still a draft after 10 minutes.
|
||||
func DraftTimeout(id string, dt tools.DataType) {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
var res utils.DBObject
|
||||
var loadErr error
|
||||
switch dt {
|
||||
case tools.BOOKING:
|
||||
res, _, loadErr = booking.NewAccessor(adminReq).LoadOne(id)
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
res, _, loadErr = purchase_resource.NewAccessor(adminReq).LoadOne(id)
|
||||
default:
|
||||
return
|
||||
}
|
||||
if loadErr != nil || res == nil || !res.IsDrafted() {
|
||||
return
|
||||
}
|
||||
switch dt {
|
||||
case tools.BOOKING:
|
||||
booking.NewAccessor(adminReq).DeleteOne(id)
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
purchase_resource.NewAccessor(adminReq).DeleteOne(id)
|
||||
}
|
||||
fmt.Printf("DraftTimeout: %s %s deleted (still draft after 10 min)\n", dt.String(), id)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// NATS handlers — incoming booking/purchase
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// HandleCreateBooking processes an incoming booking from NATS.
// Returns true if the booking was confirmed (IsDraft→false) and considers must be triggered.
//
// NOTE(review): unlike HandleCreatePurchase, there is no DestPeerID == self
// guard here — confirm every NATS booking is really addressed to this peer.
func (s *SchedulingResourcesService) HandleCreateBooking(bk *booking.Booking, adminReq *tools.APIRequest) bool {
	self := s.Self()
	if self == nil {
		// Cannot validate or plan without a resolved local peer identity.
		return false
	}

	// --- Update path: the booking already exists locally ---
	if existing, _, loadErr := booking.NewAccessor(adminReq).LoadOne(bk.GetID()); loadErr == nil && existing != nil {
		prev := existing.(*booking.Booking)
		// Auth: updates must come from the original scheduler session.
		if prev.SchedulerPeerID != bk.SchedulerPeerID || prev.ExecutionsID != bk.ExecutionsID {
			fmt.Println("HandleCreateBooking: auth mismatch, ignoring", bk.GetID())
			return false
		}
		// Never demote an already-confirmed booking back to draft.
		if !prev.IsDrafted() && bk.IsDraft {
			return false
		}
		// Confirming a booking whose start date already passed is pointless:
		// delete the stale local copy instead.
		if !bk.IsDraft && !prev.ExpectedStartDate.IsZero() && prev.ExpectedStartDate.Before(time.Now().UTC()) {
			fmt.Println("HandleCreateBooking: expired, deleting", bk.GetID())
			booking.NewAccessor(adminReq).DeleteOne(bk.GetID())
			return false
		}
		if _, _, err := utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq)); err != nil {
			fmt.Println("HandleCreateBooking: update failed:", err)
			return false
		}
		planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
		// True only on confirmation: the caller then triggers considers.
		return !bk.IsDraft
	}

	// --- Create path: new booking ---
	if !bk.ExpectedStartDate.IsZero() && bk.ExpectedStartDate.Before(time.Now().UTC()) {
		fmt.Println("HandleCreateBooking: start date in the past, discarding")
		return false
	}
	// Reject slots that overlap something already planned locally.
	if !planner.GetPlannerService().CheckResourceInstance(self.PeerID, bk.ResourceID, bk.InstanceID, bk.ExpectedStartDate, bk.ExpectedEndDate) {
		fmt.Println("HandleCreateBooking: conflicts with local planner, discarding")
		return false
	}
	// New bookings always start as drafts; confirmation arrives as a later update.
	bk.IsDraft = true
	stored, _, err := booking.NewAccessor(adminReq).StoreOne(bk)
	if err != nil {
		fmt.Println("HandleCreateBooking: could not store:", err)
		return false
	}
	// Capture the ID in a local so the timer closure does not hold `stored`.
	storedID := stored.GetID()
	planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
	// Auto-expire the draft if no confirmation arrives within 10 minutes.
	time.AfterFunc(10*time.Minute, func() { DraftTimeout(storedID, tools.BOOKING) })
	return false
}
|
||||
|
||||
// HandleCreatePurchase processes an incoming purchase from NATS.
// Returns true if considers must be triggered (the purchase was confirmed).
func (s *SchedulingResourcesService) HandleCreatePurchase(pr *purchase_resource.PurchaseResource, adminReq *tools.APIRequest) bool {
	self := s.Self()
	if self == nil {
		return false
	}
	// Only process purchases addressed to this peer.
	if pr.DestPeerID != self.GetID() {
		return false
	}

	// --- Update path: the purchase already exists locally ---
	if existing, _, loadErr := purchase_resource.NewAccessor(adminReq).LoadOne(pr.GetID()); loadErr == nil && existing != nil {
		prev := existing.(*purchase_resource.PurchaseResource)
		// Auth: updates must come from the original scheduler session.
		if prev.SchedulerPeerID != pr.SchedulerPeerID || prev.ExecutionsID != pr.ExecutionsID {
			fmt.Println("HandleCreatePurchase: auth mismatch, ignoring", pr.GetID())
			return false
		}
		// Never demote an already-confirmed purchase back to draft.
		if !prev.IsDrafted() && pr.IsDraft {
			return false
		}
		if _, _, err := utils.GenericRawUpdateOne(pr, pr.GetID(), purchase_resource.NewAccessor(adminReq)); err != nil {
			fmt.Println("HandleCreatePurchase: update failed:", err)
			return false
		}
		// True only on confirmation: the caller then triggers considers.
		return !pr.IsDraft
	}

	// --- Create path: store as draft and arm the 10-minute expiry timer ---
	pr.IsDraft = true
	stored, _, err := purchase_resource.NewAccessor(adminReq).StoreOne(pr)
	if err != nil {
		fmt.Println("HandleCreatePurchase: could not store:", err)
		return false
	}
	storedID := stored.GetID()
	time.AfterFunc(10*time.Minute, func() { DraftTimeout(storedID, tools.PURCHASE_RESOURCE) })
	return false
}
|
||||
|
||||
// HandleRemoveBooking verifies auth and deletes the booking.
|
||||
func (s *SchedulingResourcesService) HandleRemoveBooking(p RemoveResourcePayload, adminReq *tools.APIRequest) {
|
||||
res, _, loadErr := booking.NewAccessor(adminReq).LoadOne(p.ID)
|
||||
if loadErr != nil || res == nil {
|
||||
return
|
||||
}
|
||||
existing := res.(*booking.Booking)
|
||||
if existing.SchedulerPeerID != p.SchedulerPeerID || existing.ExecutionsID != p.ExecutionsID {
|
||||
fmt.Println("HandleRemoveBooking: auth mismatch, ignoring", p.ID)
|
||||
return
|
||||
}
|
||||
booking.NewAccessor(adminReq).DeleteOne(p.ID)
|
||||
if self := s.Self(); self != nil {
|
||||
planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
|
||||
}
|
||||
}
|
||||
|
||||
// HandleRemovePurchase verifies auth and deletes the purchase.
|
||||
func (s *SchedulingResourcesService) HandleRemovePurchase(p RemoveResourcePayload, adminReq *tools.APIRequest) {
|
||||
res, _, loadErr := purchase_resource.NewAccessor(adminReq).LoadOne(p.ID)
|
||||
if loadErr != nil || res == nil {
|
||||
return
|
||||
}
|
||||
existing := res.(*purchase_resource.PurchaseResource)
|
||||
if existing.SchedulerPeerID != p.SchedulerPeerID || existing.ExecutionsID != p.ExecutionsID {
|
||||
fmt.Println("HandleRemovePurchase: auth mismatch, ignoring", p.ID)
|
||||
return
|
||||
}
|
||||
purchase_resource.NewAccessor(adminReq).DeleteOne(p.ID)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internal helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// knownRegistryPrefixes lists the container registries accepted for
// peerless bookings; a valid Origin.Ref must start with one of them.
var knownRegistryPrefixes = []string{
	"docker.io/", "index.docker.io/", "ghcr.io/", "quay.io/",
	"registry.hub.docker.com/", "gcr.io/", "public.ecr.aws/",
}

// isValidPeerlessRef reports whether ref points into a known public
// registry and carries a non-empty repository path after the prefix.
// "<nil>" is rejected explicitly: it is the fmt rendering of a nil value.
func isValidPeerlessRef(ref string) bool {
	switch ref {
	case "", "<nil>":
		return false
	}
	for _, p := range knownRegistryPrefixes {
		if len(ref) > len(p) && strings.HasPrefix(ref, p) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func emitPeerBehaviorReport(targetPeerDID string, severity tools.BehaviorSeverity, reason, evidence string) {
|
||||
if targetPeerDID == "" {
|
||||
return
|
||||
}
|
||||
report := tools.PeerBehaviorReport{
|
||||
ReporterApp: "oc-scheduler",
|
||||
TargetPeerID: targetPeerDID,
|
||||
Severity: severity,
|
||||
Reason: reason,
|
||||
Evidence: evidence,
|
||||
At: time.Now().UTC(),
|
||||
}
|
||||
payload, err := json.Marshal(report)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PEER_BEHAVIOR_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-scheduler",
|
||||
Datatype: tools.PEER,
|
||||
Method: int(tools.PEER_BEHAVIOR_EVENT),
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
@@ -1,395 +0,0 @@
|
||||
package infrastructure
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"oc-scheduler/conf"
|
||||
"oc-scheduler/infrastructure/scheduling"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/order"
|
||||
"cloud.o-forge.io/core/oc-lib/models/peer"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
// removeResourcePayload is sent via NATS REMOVE_RESOURCE so the receiver can
// verify the delete order comes from the original scheduler session.
type removeResourcePayload struct {
	ID              string `json:"id"`                // ID of the booking/purchase to delete
	SchedulerPeerID string `json:"scheduler_peer_id"` // peer that originally scheduled the object
	ExecutionsID    string `json:"executions_id"`     // session the object belongs to
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// DB helpers — objects are found via executions_id
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func sessionIDFilter(field, id string) *dbs.Filters {
|
||||
return &dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
field: {{Operator: dbs.EQUAL.String(), Value: id}},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func loadSession(executionsID string, dt tools.DataType) []scheduling.SchedulerObject {
|
||||
results := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).Search(
|
||||
sessionIDFilter("executions_id", executionsID), "", true)
|
||||
out := make([]scheduling.SchedulerObject, 0, len(results.Data))
|
||||
for _, obj := range results.Data {
|
||||
out = append(out, scheduling.ToSchedulerObject(dt, obj))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func loadSessionExecs(executionsID string) []*workflow_execution.WorkflowExecution {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
results, _, _ := workflow_execution.NewAccessor(adminReq).Search(
|
||||
sessionIDFilter("executions_id", executionsID), "", true)
|
||||
out := make([]*workflow_execution.WorkflowExecution, 0, len(results))
|
||||
for _, obj := range results {
|
||||
if exec, ok := obj.(*workflow_execution.WorkflowExecution); ok {
|
||||
out = append(out, exec)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func loadSessionOrder(executionsID string) *order.Order {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
results, _, _ := order.NewAccessor(adminReq).Search(
|
||||
sessionIDFilter("executions_id", executionsID), "", true)
|
||||
for _, obj := range results {
|
||||
if o, ok := obj.(*order.Order); ok {
|
||||
return o
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Session upsert
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// UpsertSessionDrafts creates or updates draft bookings/purchases/executions for a
|
||||
// Check session. Existing objects are found via the DB (executions_id).
|
||||
// Called on first successful check and on user date changes.
|
||||
//
|
||||
// - bookings/purchases: upserted by (resourceID, instanceID); stale ones deleted
|
||||
// - executions: replaced on every call (dates may have changed)
|
||||
// - order: created once, updated on subsequent calls
|
||||
func (ws *WorkflowSchedule) UpsertSessionDrafts(wfID, executionsID string, selfID *peer.Peer, request *tools.APIRequest) {
|
||||
_, _, execs, purchases, bookings, err := ws.GetBuyAndBook(wfID, request)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
|
||||
// --- bookings ---
|
||||
existing := map[string]scheduling.SchedulerObject{}
|
||||
seen := map[string]bool{}
|
||||
for dt, datas := range map[tools.DataType][]scheduling.SchedulerObject{
|
||||
tools.BOOKING: bookings, tools.PURCHASE_RESOURCE: purchases,
|
||||
} {
|
||||
for _, bk := range loadSession(executionsID, dt) {
|
||||
existing[bk.GetKey()] = bk
|
||||
}
|
||||
upsertSessionDrafts(dt, datas, existing, seen, selfID, executionsID, request)
|
||||
for key, prev := range existing {
|
||||
if !seen[key] {
|
||||
deleteScheduling(dt, prev, selfID, request)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
// --- executions: replace on every call (dates may have changed) ---
|
||||
for _, old := range loadSessionExecs(executionsID) {
|
||||
UnregisterExecLock(old.GetID())
|
||||
workflow_execution.NewAccessor(adminReq).DeleteOne(old.GetID())
|
||||
}
|
||||
for _, exec := range execs {
|
||||
exec.ExecutionsID = executionsID
|
||||
exec.IsDraft = true
|
||||
ex, _, err := utils.GenericStoreOne(exec, workflow_execution.NewAccessor(adminReq))
|
||||
if err == nil {
|
||||
RegisterExecLock(ex.GetID())
|
||||
go WatchExecDeadline(
|
||||
ex.GetID(), executionsID, exec.ExecDate, selfID, request)
|
||||
}
|
||||
}
|
||||
|
||||
// --- order: create once, update on subsequent calls ---
|
||||
if existing := loadSessionOrder(executionsID); existing == nil {
|
||||
ws.GenerateOrder(purchases, bookings, executionsID, request)
|
||||
} else {
|
||||
for _, purch := range purchases {
|
||||
existing.Purchases = append(
|
||||
existing.Purchases, scheduling.FromSchedulerObject(tools.PURCHASE_RESOURCE, purch).(*purchase_resource.PurchaseResource))
|
||||
}
|
||||
for _, b := range bookings {
|
||||
existing.Bookings = append(
|
||||
existing.Bookings, scheduling.FromSchedulerObject(tools.BOOKING, b).(*booking.Booking))
|
||||
}
|
||||
utils.GenericRawUpdateOne(existing, existing.GetID(), order.NewAccessor(adminReq))
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Session lifecycle
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// upsertSessionDrafts writes each object in datas to its destination peer,
// reusing the DB identity of a matching existing object when one is found.
// It marks every processed key in seen so the caller can delete stale entries.
func upsertSessionDrafts(dt tools.DataType, datas []scheduling.SchedulerObject, existing map[string]scheduling.SchedulerObject,
	seen map[string]bool, selfID *peer.Peer,
	executionsID string, request *tools.APIRequest) {
	fmt.Println("UpsertSessionDrafts", len(datas), len(existing))
	for _, bk := range datas {
		// Stamp auth fields so the remote peer can verify future updates/deletes.
		bk.SetSchedulerPeerID(selfID.PeerID)
		bk.SetExecutionsID(executionsID)
		seen[bk.GetKey()] = true
		if prev, ok := existing[bk.GetKey()]; ok {
			// Update path: keep the existing DB identity.
			bk.SetID(prev.GetID())
			// NOTE(review): SetIsDraft(false) on a mere re-check looks like it
			// confirms the object before the user confirmed the session —
			// verify this is intended and not a premature confirmation.
			bk.SetIsDraft(false)
			// Convert to concrete type (Booking/PurchaseResource) so that
			// GenericRawUpdateOne serializes the real struct, not the wrapper.
			propagateWriteResource(
				scheduling.FromSchedulerDBObject(dt, bk), bk.GetDestPeer(), dt, selfID, request)
		} else {
			// Create path: propagate and block until the result arrives.
			errCh := make(chan error, 1)
			propagateResource(scheduling.FromSchedulerDBObject(dt, bk), bk.GetDestPeer(), dt, selfID, request, errCh)
			<-errCh
		}
	}
}
|
||||
|
||||
// CleanupSession deletes all draft bookings/purchases/executions/order for a
// session (called when the WebSocket closes without a confirm).
// Bookings/purchases are not deleted directly here — presumably
// UnscheduleExecution removes them per execution; verify against that function.
//
// NOTE(review): the first parameter `self` is never used in this body;
// consider whether it can be dropped at the call sites.
func CleanupSession(self *peer.Peer, executionsID string, selfID *peer.Peer, request *tools.APIRequest) {
	adminReq := &tools.APIRequest{Admin: true}
	for _, exec := range loadSessionExecs(executionsID) {
		UnscheduleExecution(exec.GetID(), selfID, request)
		workflow_execution.NewAccessor(adminReq).DeleteOne(exec.GetID())
	}
	if o := loadSessionOrder(executionsID); o != nil {
		order.NewAccessor(adminReq).DeleteOne(o.GetID())
	}
}
|
||||
|
||||
// ConfirmSession flips all session drafts to IsDraft=false and propagates them.
|
||||
// The considers mechanism then transitions executions to IsDraft=false once
|
||||
// all remote peers acknowledge.
|
||||
func ConfirmSession(executionsID string, selfID *peer.Peer, request *tools.APIRequest) error {
|
||||
for _, dt := range []tools.DataType{tools.BOOKING, tools.PURCHASE_RESOURCE} {
|
||||
for _, bk := range loadSession(executionsID, dt) {
|
||||
bk.SetIsDraft(false)
|
||||
propagateWriteResource(
|
||||
scheduling.FromSchedulerDBObject(dt, bk), bk.GetDestPeer(), dt, selfID, request)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// confirmSessionOrder sets the order IsDraft=false once all considers are received.
|
||||
func confirmSessionOrder(executionsID string, adminReq *tools.APIRequest) {
|
||||
if o := loadSessionOrder(executionsID); o != nil {
|
||||
o.IsDraft = false
|
||||
utils.GenericRawUpdateOne(o, o.GetID(), order.NewAccessor(adminReq))
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Propagation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// propagateWriteResource routes a booking/purchase write to its destination:
//   - local peer → DB upsert; emits considers on confirm (IsDraft=false)
//   - remote peer → NATS CREATE_RESOURCE (receiver upserts)
func propagateWriteResource(obj utils.DBObject, destPeerID string, dt tools.DataType, selfID *peer.Peer, request *tools.APIRequest) {
	if destPeerID == selfID.GetID() {
		if _, _, err := utils.GenericRawUpdateOne(obj, obj.GetID(), obj.GetAccessor(request)); err != nil {
			fmt.Printf("propagateWriteResource: local update failed for %s %s: %v\n", dt, obj.GetID(), err)
			return
		}
		// Bookings change the local planner view; refresh asynchronously.
		if dt == tools.BOOKING {
			go refreshSelfPlanner(selfID.PeerID, request)
		}
		// NOTE(review): leftover debug print — remove once the draft flow is stable.
		fmt.Println("IS DRAFTED", obj.IsDrafted())
		// A confirmed (non-draft) local write triggers the considers mechanism
		// so dependent executions can advance.
		if !obj.IsDrafted() {
			if payload, err := json.Marshal(&executionConsidersPayload{
				ID: obj.GetID(),
			}); err == nil {
				go updateExecutionState(payload, dt)
			}
		}
		return
	}
	// Remote destination: forward the full object over NATS.
	payload, err := json.Marshal(obj)
	if err != nil {
		return
	}
	tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
		FromApp:  "oc-scheduler",
		Datatype: dt,
		Method:   int(tools.CREATE_RESOURCE),
		Payload:  payload,
	})
}
|
||||
|
||||
// deleteScheduling deletes a booking/purchase from its destination peer
// (local DB, or a remote peer via NATS REMOVE_RESOURCE).
func deleteScheduling(dt tools.DataType, bk scheduling.SchedulerObject, selfID *peer.Peer, request *tools.APIRequest) {
	if bk.GetDestPeer() == selfID.GetID() {
		oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).DeleteOne(bk.GetID())
		// The planner view changes whenever a local scheduling object disappears.
		go refreshSelfPlanner(selfID.PeerID, request)
		return
	}
	emitNATSRemove(bk.GetID(), bk.GetPeerSession(), bk.GetExecutionsId(), dt)
}
|
||||
|
||||
// emitNATSRemove sends a REMOVE_RESOURCE event to the remote peer carrying
|
||||
// auth fields so the receiver can verify the delete is legitimate.
|
||||
func emitNATSRemove(id, schedulerPeerID, executionsID string, dt tools.DataType) {
|
||||
payload, _ := json.Marshal(removeResourcePayload{
|
||||
ID: id,
|
||||
SchedulerPeerID: schedulerPeerID,
|
||||
ExecutionsID: executionsID,
|
||||
})
|
||||
tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
|
||||
FromApp: "oc-scheduler",
|
||||
Datatype: dt,
|
||||
Method: int(tools.REMOVE_RESOURCE),
|
||||
Payload: payload,
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Deadline watchers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// WatchExecDeadline fires one minute before the execution start date.
|
||||
// If the execution is still a draft it is purged; otherwise the namespace
|
||||
// is created and a WatchExecEnd watcher is armed.
|
||||
// If the deadline has already passed (e.g. after a process restart), it fires immediately.
|
||||
func WatchExecDeadline(executionID string, ns string, execDate time.Time, selfID *peer.Peer, request *tools.APIRequest) {
|
||||
fmt.Println("WatchExecDeadline")
|
||||
delay := time.Until(execDate.UTC().Add(-1 * time.Minute))
|
||||
if delay <= 0 {
|
||||
go handleExecDeadline(executionID, ns, selfID, request)
|
||||
return
|
||||
}
|
||||
time.AfterFunc(delay, func() { handleExecDeadline(executionID, ns, selfID, request) })
|
||||
}
|
||||
|
||||
// handleExecDeadline runs at (ExecDate - 1min): purges the execution if it is
// still a draft, otherwise provisions the Kubernetes namespace ns and arms
// the end-of-execution teardown watcher.
func handleExecDeadline(executionID string, ns string, selfID *peer.Peer, request *tools.APIRequest) {
	adminReq := &tools.APIRequest{Admin: true}
	res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(executionID)
	if err != nil || res == nil {
		// Already removed (e.g. session cleanup) — nothing to do.
		fmt.Printf("handleExecDeadline: execution %s not found\n", executionID)
		return
	}
	exec := res.(*workflow_execution.WorkflowExecution)
	// Still a draft one minute before start: nobody confirmed — purge it.
	if exec.IsDraft {
		UnscheduleExecution(executionID, selfID, request)
		workflow_execution.NewAccessor(adminReq).DeleteOne(executionID)
		fmt.Printf("handleExecDeadline: purged draft execution %s\n", executionID)
		return
	}
	// Confirmed: provision the namespace. Failures are logged but the end
	// watcher is still armed so the namespace is torn down if it exists.
	if serv, err := tools.NewKubernetesService(
		conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort,
		conf.GetConfig().KubeCA, conf.GetConfig().KubeCert, conf.GetConfig().KubeData); err != nil {
		fmt.Printf("handleExecDeadline: k8s init failed for %s: %v\n", executionID, err)
	} else if err := serv.ProvisionExecutionNamespace(context.Background(), ns); err != nil {
		fmt.Printf("handleExecDeadline: failed to provision namespace for %s: %v\n", ns, err)
	}
	go WatchExecEnd(executionID, ns, exec.EndDate, exec.ExecDate)
}
|
||||
|
||||
// WatchExecEnd fires at the execution end date (ExecDate+1h when EndDate is nil)
|
||||
// and deletes the Kubernetes namespace associated with the execution.
|
||||
func WatchExecEnd(executionID string, ns string, endDate *time.Time, execDate time.Time) {
|
||||
var end time.Time
|
||||
if endDate != nil {
|
||||
end = *endDate
|
||||
} else {
|
||||
end = execDate.UTC().Add(time.Hour)
|
||||
}
|
||||
delay := time.Until(end.UTC())
|
||||
fire := func() {
|
||||
serv, err := tools.NewKubernetesService(
|
||||
conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort,
|
||||
conf.GetConfig().KubeCA, conf.GetConfig().KubeCert, conf.GetConfig().KubeData)
|
||||
if err != nil {
|
||||
fmt.Printf("WatchExecEnd: k8s init failed for %s: %v\n", executionID, err)
|
||||
return
|
||||
}
|
||||
if err := serv.TeardownExecutionNamespace(context.Background(), ns); err != nil {
|
||||
fmt.Printf("WatchExecEnd: failed to teardown namespace %s: %v\n", ns, err)
|
||||
}
|
||||
}
|
||||
if delay <= 0 {
|
||||
go fire()
|
||||
return
|
||||
}
|
||||
time.AfterFunc(delay, fire)
|
||||
}
|
||||
|
||||
// RecoverDraftExecutions is called at startup to restore deadline watchers for
|
||||
// draft executions that survived a process restart. Executions already past
|
||||
// their deadline are purged immediately.
|
||||
func RecoverDraftExecutions() {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
var selfID *peer.Peer
|
||||
for selfID == nil {
|
||||
selfID, _ = oclib.GetMySelf()
|
||||
if selfID == nil {
|
||||
time.Sleep(5 * time.Second)
|
||||
}
|
||||
}
|
||||
results, _, _ := workflow_execution.NewAccessor(adminReq).Search(nil, "*", true)
|
||||
for _, obj := range results {
|
||||
exec, ok := obj.(*workflow_execution.WorkflowExecution)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
RegisterExecLock(exec.GetID())
|
||||
go WatchExecDeadline(exec.GetID(), exec.ExecutionsID, exec.ExecDate, selfID, adminReq)
|
||||
}
|
||||
fmt.Printf("RecoverDraftExecutions: recovered %d draft executions\n", len(results))
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Unschedule
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// UnscheduleExecution deletes all bookings for an execution (via PeerBookByGraph)
// then deletes the execution itself and releases its exec lock.
// Returns an error only when the execution cannot be loaded.
func UnscheduleExecution(executionID string, selfID *peer.Peer, request *tools.APIRequest) error {
	// NOTE(review): leftover debug print — remove once the flow is stable.
	fmt.Println("UnscheduleExecution")
	adminReq := &tools.APIRequest{Admin: true}
	res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(executionID)
	if err != nil || res == nil {
		return fmt.Errorf("execution %s not found: %w", executionID, err)
	}
	exec := res.(*workflow_execution.WorkflowExecution)
	// PeerBookByGraph: per-graph-node, per-resource lists of booking IDs.
	for _, byResource := range exec.PeerBookByGraph {
		for _, bookingIDs := range byResource {
			for _, bkID := range bookingIDs {
				bkRes, _, loadErr := booking.NewAccessor(adminReq).LoadOne(bkID)
				fmt.Println("UnscheduleExecution", bkID, loadErr)
				// Booking already gone: best-effort, keep going.
				if loadErr != nil || bkRes == nil {
					continue
				}
				deleteScheduling(tools.BOOKING, scheduling.ToSchedulerObject(tools.BOOKING, bkRes), selfID, request)
			}
		}
	}
	workflow_execution.NewAccessor(adminReq).DeleteOne(executionID)
	UnregisterExecLock(executionID)
	return nil
}
|
||||
233
infrastructure/session/session.go
Normal file
233
infrastructure/session/session.go
Normal file
@@ -0,0 +1,233 @@
|
||||
package session
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"oc-scheduler/infrastructure/execution"
|
||||
"oc-scheduler/infrastructure/scheduling_resources"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
"cloud.o-forge.io/core/oc-lib/models/bill"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
"cloud.o-forge.io/core/oc-lib/models/order"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
)
|
||||
|
||||
// SessionExecutionsService groups the DB operations scoped to one Check
// session, identified by its executions session ID.
type SessionExecutionsService struct {
	Mu                  sync.RWMutex // guards concurrent access to the service
	ExecutionsSessionID string       // session key used in every executions_id filter
}

// NewSessionExecutionsService builds a service bound to the given session ID.
func NewSessionExecutionsService(sessionID string) *SessionExecutionsService {
	return &SessionExecutionsService{ExecutionsSessionID: sessionID}
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// DB helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func (s *SessionExecutionsService) sessionIDFilter(field, id string) *dbs.Filters {
|
||||
return &dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
field: {{Operator: dbs.EQUAL.String(), Value: id}},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SessionExecutionsService) loadSession(dt tools.DataType) []scheduling_resources.SchedulerObject {
|
||||
results := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).Search(
|
||||
s.sessionIDFilter("executions_id", s.ExecutionsSessionID), "", true)
|
||||
out := make([]scheduling_resources.SchedulerObject, 0, len(results.Data))
|
||||
for _, obj := range results.Data {
|
||||
out = append(out, scheduling_resources.ToSchedulerObject(dt, obj))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *SessionExecutionsService) LoadSessionExecs() []*workflow_execution.WorkflowExecution {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
results, _, _ := workflow_execution.NewAccessor(adminReq).Search(
|
||||
s.sessionIDFilter("executions_id", s.ExecutionsSessionID), "", true)
|
||||
out := make([]*workflow_execution.WorkflowExecution, 0)
|
||||
for _, obj := range results {
|
||||
if exec, ok := obj.(*workflow_execution.WorkflowExecution); ok {
|
||||
out = append(out, exec)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *SessionExecutionsService) loadSessionOrder() *order.Order {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
results, _, _ := order.NewAccessor(adminReq).Search(
|
||||
s.sessionIDFilter("executions_id", s.ExecutionsSessionID), "", true)
|
||||
for _, obj := range results {
|
||||
if o, ok := obj.(*order.Order); ok {
|
||||
return o
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Session upsert
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// UpsertSessionDrafts reconciles the session's stored state with the given
// desired state: bookings/purchases are upserted (and stale ones deleted),
// all previous executions are replaced by execs, and the session order is
// created or extended.
//
// NOTE(review): errors from DeleteOne, GenerateOrder and GenericRawUpdateOne
// are silently dropped — confirm this best-effort behavior is intended.
func (s *SessionExecutionsService) UpsertSessionDrafts(
	purchases, bookings []scheduling_resources.SchedulerObject,
	execs []*workflow_execution.WorkflowExecution,
	request *tools.APIRequest,
) {
	adminReq := &tools.APIRequest{Admin: true}

	// Phase 1: upsert bookings and purchases, then delete any stored object of
	// that type that was not re-submitted (no longer part of the session).
	// Map iteration order is random, but the two types are independent.
	for dt, datas := range map[tools.DataType][]scheduling_resources.SchedulerObject{
		tools.BOOKING:           bookings,
		tools.PURCHASE_RESOURCE: purchases,
	} {
		existing := map[string]scheduling_resources.SchedulerObject{}
		seen := map[string]bool{}
		for _, bk := range s.loadSession(dt) {
			existing[bk.GetKey()] = bk
		}
		// upsertDrafts marks every submitted key in seen.
		s.upsertDrafts(dt, datas, existing, seen, request)
		for key, prev := range existing {
			if !seen[key] {
				scheduling_resources.GetService().Delete(dt, prev, request)
			}
		}
	}

	// Phase 2: drop every previously stored execution (and its lock) before
	// re-storing the new set — a full replace, not a merge.
	for _, old := range s.LoadSessionExecs() {
		execution.UnregisterExecLock(old.GetID())
		workflow_execution.NewAccessor(adminReq).DeleteOne(old.GetID())
	}
	for _, exec := range execs {
		exec.ExecutionsID = s.ExecutionsSessionID
		exec.IsDraft = true
		ex, _, err := utils.GenericStoreOne(exec, workflow_execution.NewAccessor(adminReq))
		if err == nil {
			// Lock first, then watch the deadline asynchronously; executions
			// that failed to store are skipped without surfacing the error.
			execution.RegisterExecLock(ex.GetID())
			go execution.WatchDeadline(ex.GetID(), s.ExecutionsSessionID, exec.ExecDate, request)
		}
	}

	// Phase 3: create the session order on first upsert, otherwise append the
	// new purchases/bookings to the existing one.
	if existing := s.loadSessionOrder(); existing == nil {
		GenerateOrder(purchases, bookings, s.ExecutionsSessionID, request)
	} else {
		for _, purch := range purchases {
			existing.Purchases = append(existing.Purchases,
				scheduling_resources.FromSchedulerObject(tools.PURCHASE_RESOURCE, purch).(*purchase_resource.PurchaseResource))
		}
		for _, b := range bookings {
			existing.Bookings = append(existing.Bookings,
				scheduling_resources.FromSchedulerObject(tools.BOOKING, b).(*booking.Booking))
		}
		utils.GenericRawUpdateOne(existing, existing.GetID(), order.NewAccessor(adminReq))
	}
}
|
||||
|
||||
func (s *SessionExecutionsService) upsertDrafts(
|
||||
dt tools.DataType,
|
||||
datas []scheduling_resources.SchedulerObject,
|
||||
existing map[string]scheduling_resources.SchedulerObject,
|
||||
seen map[string]bool,
|
||||
request *tools.APIRequest,
|
||||
) {
|
||||
self := scheduling_resources.GetService().Self()
|
||||
fmt.Println("upsertDrafts", len(datas), len(existing))
|
||||
for _, bk := range datas {
|
||||
if self != nil {
|
||||
bk.SetSchedulerPeerID(self.PeerID)
|
||||
}
|
||||
bk.SetExecutionsID(s.ExecutionsSessionID)
|
||||
seen[bk.GetKey()] = true
|
||||
if prev, ok := existing[bk.GetKey()]; ok {
|
||||
bk.SetID(prev.GetID())
|
||||
bk.SetIsDraft(false)
|
||||
needsConsiders := scheduling_resources.GetService().PropagateWrite(
|
||||
scheduling_resources.FromSchedulerDBObject(dt, bk), bk.GetDestPeer(), dt, request)
|
||||
if needsConsiders {
|
||||
if payload, err := json.Marshal(execution.ConsidersPayload{ID: bk.GetID()}); err == nil {
|
||||
go execution.UpdateExecutionState(payload, dt)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
errCh := make(chan error, 1)
|
||||
scheduling_resources.GetService().PropagateCreate(
|
||||
scheduling_resources.FromSchedulerDBObject(dt, bk), bk.GetDestPeer(), dt, request, errCh)
|
||||
<-errCh
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Session lifecycle
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func (s *SessionExecutionsService) CleanupSession(request *tools.APIRequest) {
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
for _, exec := range s.LoadSessionExecs() {
|
||||
execution.Unschedule(exec.GetID(), request)
|
||||
workflow_execution.NewAccessor(adminReq).DeleteOne(exec.GetID())
|
||||
}
|
||||
if o := s.loadSessionOrder(); o != nil {
|
||||
order.NewAccessor(adminReq).DeleteOne(o.GetID())
|
||||
}
|
||||
}
|
||||
|
||||
func GenerateOrder(
|
||||
purchases, bookings []scheduling_resources.SchedulerObject,
|
||||
executionsID string,
|
||||
request *tools.APIRequest,
|
||||
) (string, error) {
|
||||
newOrder := &order.Order{
|
||||
AbstractObject: utils.AbstractObject{
|
||||
Name: "order_" + request.PeerID + "_" + time.Now().UTC().Format("2006-01-02T15:04:05"),
|
||||
IsDraft: true,
|
||||
},
|
||||
ExecutionsID: executionsID,
|
||||
Purchases: []*purchase_resource.PurchaseResource{},
|
||||
Bookings: []*booking.Booking{},
|
||||
Status: enum.PENDING,
|
||||
}
|
||||
for _, purch := range purchases {
|
||||
newOrder.Purchases = append(newOrder.Purchases,
|
||||
scheduling_resources.FromSchedulerObject(tools.PURCHASE_RESOURCE, purch).(*purchase_resource.PurchaseResource))
|
||||
}
|
||||
for _, b := range bookings {
|
||||
newOrder.Bookings = append(newOrder.Bookings,
|
||||
scheduling_resources.FromSchedulerObject(tools.BOOKING, b).(*booking.Booking))
|
||||
}
|
||||
res, _, err := order.NewAccessor(request).StoreOne(newOrder)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if _, err := bill.DraftFirstBill(res.(*order.Order), request); err != nil {
|
||||
return res.GetID(), err
|
||||
}
|
||||
return res.GetID(), nil
|
||||
}
|
||||
|
||||
func (s *SessionExecutionsService) ConfirmSession(request *tools.APIRequest) error {
|
||||
for _, dt := range []tools.DataType{tools.BOOKING, tools.PURCHASE_RESOURCE} {
|
||||
for _, bk := range s.loadSession(dt) {
|
||||
bk.SetIsDraft(false)
|
||||
needsConsiders := scheduling_resources.GetService().PropagateWrite(
|
||||
scheduling_resources.FromSchedulerDBObject(dt, bk), bk.GetDestPeer(), dt, request)
|
||||
if needsConsiders {
|
||||
if payload, err := json.Marshal(execution.ConsidersPayload{ID: bk.GetID()}); err == nil {
|
||||
go execution.UpdateExecutionState(payload, dt)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
186
infrastructure/utils/utils.go
Normal file
186
infrastructure/utils/utils.go
Normal file
@@ -0,0 +1,186 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
)
|
||||
|
||||
// BookingResource identifies one schedulable resource instance so a planner
// check can target the exact instance on the exact peer.
type BookingResource struct {
	ID         string // resource MongoDB _id
	PeerPID    string // peer public PeerID (PID) — PlannerCache key
	InstanceID string // resolved from WorkflowSchedule.SelectedInstances
}
|
||||
|
||||
// collectBookingResources returns unique storage and compute resources from the
|
||||
// workflow graph. For each resource the selected instance ID is resolved from
|
||||
// selectedInstances (the scheduler's SelectedInstances ConfigItem) so the planner
|
||||
// check targets the exact instance chosen by the user.
|
||||
func CollectBookingResources(wf *workflow.Workflow, selectedInstances workflow.ConfigItem) map[string]BookingResource {
|
||||
if wf.Graph == nil {
|
||||
return nil
|
||||
}
|
||||
seen := map[string]bool{}
|
||||
result := map[string]BookingResource{}
|
||||
|
||||
// Resolve MongoDB peer _id (DID) → public PeerID (PID) used as PlannerCache key.
|
||||
peerAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
|
||||
didToPID := map[string]string{}
|
||||
resolvePID := func(did string) string {
|
||||
if pid, ok := didToPID[did]; ok {
|
||||
return pid
|
||||
}
|
||||
if data := peerAccess.LoadOne(did); data.Data != nil {
|
||||
if p := data.ToPeer(); p != nil {
|
||||
didToPID[did] = p.PeerID
|
||||
return p.PeerID
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
resolveInstanceID := func(res interface {
|
||||
GetID() string
|
||||
GetCreatorID() string
|
||||
}) string {
|
||||
idx := selectedInstances.Get(res.GetID())
|
||||
switch r := res.(type) {
|
||||
case *resources.StorageResource:
|
||||
if inst := r.GetSelectedInstance(idx); inst != nil {
|
||||
return inst.GetID()
|
||||
}
|
||||
case *resources.ComputeResource:
|
||||
if inst := r.GetSelectedInstance(idx); inst != nil {
|
||||
return inst.GetID()
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
for _, item := range wf.GetGraphItems(wf.Graph.IsStorage) {
|
||||
_, res := item.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
id := res.GetID()
|
||||
if seen[id] {
|
||||
continue
|
||||
}
|
||||
pid := resolvePID(res.GetCreatorID())
|
||||
if pid == "" {
|
||||
continue
|
||||
}
|
||||
seen[id] = true
|
||||
result[pid] = BookingResource{
|
||||
ID: id,
|
||||
PeerPID: pid,
|
||||
InstanceID: resolveInstanceID(res),
|
||||
}
|
||||
}
|
||||
|
||||
for _, item := range wf.GetGraphItems(wf.Graph.IsCompute) {
|
||||
_, res := item.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
id := res.GetID()
|
||||
if seen[id] {
|
||||
continue
|
||||
}
|
||||
pid := resolvePID(res.GetCreatorID())
|
||||
if pid == "" {
|
||||
continue
|
||||
}
|
||||
seen[id] = true
|
||||
result[pid] = BookingResource{
|
||||
ID: id,
|
||||
PeerPID: pid,
|
||||
InstanceID: resolveInstanceID(res),
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// GetWorkflowPeerIDs loads the workflow and returns the deduplicated list of
|
||||
// creator peer IDs for all its storage and compute resources.
|
||||
// These are the peers whose planners must be watched by a check stream.
|
||||
func GetWorkflowPeerIDs(wfID string, request *tools.APIRequest) ([]string, error) {
|
||||
obj, code, err := workflow.NewAccessor(request).LoadOne(wfID)
|
||||
if code != 200 || err != nil {
|
||||
msg := "could not load workflow " + wfID
|
||||
if err != nil {
|
||||
msg += ": " + err.Error()
|
||||
}
|
||||
return nil, errors.New(msg)
|
||||
}
|
||||
wf := obj.(*workflow.Workflow)
|
||||
if wf.Graph == nil {
|
||||
return nil, nil
|
||||
}
|
||||
seen := map[string]bool{}
|
||||
var peerIDs []string
|
||||
for _, item := range wf.GetGraphItems(wf.Graph.IsStorage) {
|
||||
_, res := item.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
if id := res.GetCreatorID(); id != "" && !seen[id] {
|
||||
seen[id] = true
|
||||
peerIDs = append(peerIDs, id)
|
||||
}
|
||||
}
|
||||
for _, item := range wf.GetGraphItems(wf.Graph.IsCompute) {
|
||||
_, res := item.GetResource()
|
||||
if res == nil {
|
||||
continue
|
||||
}
|
||||
if id := res.GetCreatorID(); id != "" && !seen[id] {
|
||||
seen[id] = true
|
||||
peerIDs = append(peerIDs, id)
|
||||
}
|
||||
}
|
||||
realPeersID := []string{}
|
||||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(tools.PEER), nil)
|
||||
for _, id := range peerIDs {
|
||||
if data := access.LoadOne(id); data.Data != nil {
|
||||
realPeersID = append(realPeersID, data.ToPeer().PeerID)
|
||||
}
|
||||
}
|
||||
return realPeersID, nil
|
||||
}
|
||||
|
||||
func FormatOptTime(t *time.Time) string {
|
||||
if t == nil {
|
||||
return "open"
|
||||
}
|
||||
return t.Format(time.RFC3339)
|
||||
}
|
||||
|
||||
func Notify[T interface{}](mu *sync.RWMutex, registry map[string][]chan T, key string, toAdd T) {
|
||||
mu.RLock()
|
||||
subs := registry[key]
|
||||
mu.RUnlock()
|
||||
for _, ch := range subs {
|
||||
select {
|
||||
case ch <- toAdd:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func Propalgate(peerID string, message tools.PropalgationMessage) {
|
||||
b, _ := json.Marshal(message)
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-scheduler",
|
||||
Datatype: -1,
|
||||
Method: int(tools.PROPALGATION_EVENT),
|
||||
Payload: b,
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user