Scheduler + Observe

This commit is contained in:
mr
2026-04-29 07:45:41 +02:00
parent 4b9b1b8b91
commit 3be023b9af
20 changed files with 1006 additions and 87 deletions

View File

@@ -13,10 +13,12 @@ import (
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/config"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/booking"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
"cloud.o-forge.io/core/oc-lib/models/order"
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
"cloud.o-forge.io/core/oc-lib/models/utils"
"cloud.o-forge.io/core/oc-lib/models/workflow"
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
@@ -144,13 +146,19 @@ func UpdateExecutionState(payload []byte, dt tools.DataType) {
st := exec.BookingsState[data.ID]
st.IsBooked = true
exec.BookingsState[data.ID] = st
if config.GetConfig().IsNano {
scheduling_resources.SendBookingToMaster(schdata.Data.(*booking.Booking)) // TODO : ASK FOR RESPONSE...
}
case tools.PURCHASE_RESOURCE:
if exec.PurchasesState == nil {
exec.PurchasesState = map[string]bool{}
}
exec.PurchasesState[data.ID] = true
if config.GetConfig().IsNano {
scheduling_resources.SendPurchaseToMaster(schdata.Data.(*purchase_resource.PurchaseResource)) // TODO : ASK FOR RESPONSE...
}
}
// TODO REMOVE
allConfirmed := true
for _, st := range exec.BookingsState {
if !st.IsBooked {
@@ -366,6 +374,13 @@ func HandleWorkflowStarted(resp tools.NATSResponse) {
return
}
adminReq := &tools.APIRequest{Admin: true}
mu := GetExecLock(evt.ExecutionID)
if mu != nil {
mu.Lock()
defer mu.Unlock()
}
res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(evt.ExecutionID)
if err != nil || res == nil {
return
@@ -375,6 +390,22 @@ func HandleWorkflowStarted(resp tools.NATSResponse) {
if evt.RealStart != nil {
exec.ExecDate = *evt.RealStart
}
// Build the execution graph summary from the workflow graph on first start.
if len(exec.Graph) == 0 {
wfRes, _, wfErr := workflow.NewAccessor(adminReq).LoadOne(exec.WorkflowID)
if wfErr == nil && wfRes != nil {
exec.Graph = workflow_execution.BuildExecutionGraph(wfRes.(*workflow.Workflow).Graph)
}
}
// Advance steps whose deps are already satisfied (typically the entry nodes).
if len(exec.Graph) > 0 {
now := time.Now().UTC()
for _, id := range exec.Graph.ReadyToRun() {
exec.Graph.MarkRunning(id, now)
}
}
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
}
@@ -384,6 +415,13 @@ func HandleWorkflowDone(resp tools.NATSResponse) {
return
}
adminReq := &tools.APIRequest{Admin: true}
mu := GetExecLock(evt.ExecutionID)
if mu != nil {
mu.Lock()
defer mu.Unlock()
}
res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(evt.ExecutionID)
if err != nil || res == nil {
return
@@ -393,17 +431,58 @@ func HandleWorkflowDone(resp tools.NATSResponse) {
if evt.RealEnd != nil {
exec.EndDate = evt.RealEnd
}
// All bookings are no longer reserved and are done
// Release all booking reservations (workflow is over) without overwriting
// IsDone: individual step events already set the authoritative done state
// for each booking. Resetting everything here would lose that granularity.
if exec.BookingsState == nil {
exec.BookingsState = map[string]workflow_execution.BookingState{}
}
for id := range exec.BookingsState {
exec.BookingsState[id] = workflow_execution.BookingState{IsBooked: false, IsDone: true}
for id, st := range exec.BookingsState {
st.IsBooked = false
exec.BookingsState[id] = st
}
// Graph items that already reached success/failure keep their state.
// Items still in running when the execution terminates receive the terminal
// state (the step was active but no step_done event arrived before the
// workflow finished — treat it as the execution outcome).
terminalSuccess := enum.BookingStatus(evt.State) == enum.SUCCESS
nowGraph := time.Now().UTC()
for itemID, item := range exec.Graph {
if item.State == workflow_execution.StepRunning {
exec.Graph.MarkDone(itemID, terminalSuccess, nowGraph)
}
}
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
// Build a set of booking IDs already covered by per-step events so we only
// fall back for bookings the orchestrator never emitted a step for (e.g. storage).
coveredByStep := map[string]bool{}
for _, step := range evt.Steps {
applyStepToBooking(step, adminReq)
coveredByStep[step.BookingID] = true
}
// Propagate the execution's terminal state to any booking that was not
// updated by a step event and is not already in a terminal state.
terminalState := enum.BookingStatus(evt.State)
now := time.Now().UTC()
for id := range exec.BookingsState {
if coveredByStep[id] {
continue
}
res, _, err := booking.NewAccessor(adminReq).LoadOne(id)
if err != nil || res == nil {
continue
}
bk := res.(*booking.Booking)
if terminalExecStates[bk.State] {
continue
}
bk.State = terminalState
bk.RealEndDate = &now
utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
}
self, err := oclib.GetMySelf()
if err == nil && self != nil {
go planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
@@ -416,6 +495,8 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
return
}
adminReq := &tools.APIRequest{Admin: true}
// Update the booking itself first (no exec lock needed for the booking doc).
res, _, err := booking.NewAccessor(adminReq).LoadOne(evt.BookingID)
if err != nil || res == nil {
return
@@ -430,20 +511,56 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
}
utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
// Update BookingsState in the parent WorkflowExecution: resource released, step done
execRes, _, execErr := workflow_execution.NewAccessor(adminReq).LoadOne(bk.ExecutionID)
if execErr == nil && execRes != nil {
exec := execRes.(*workflow_execution.WorkflowExecution)
if exec.BookingsState == nil {
exec.BookingsState = map[string]workflow_execution.BookingState{}
}
st := exec.BookingsState[evt.BookingID]
st.IsBooked = false
st.IsDone = true
exec.BookingsState[evt.BookingID] = st
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
// Update the parent WorkflowExecution under its exec lock to avoid races
// between concurrent WORKFLOW_STEP_DONE_EVENT deliveries.
execID := bk.ExecutionID
mu := GetExecLock(execID)
if mu != nil {
mu.Lock()
defer mu.Unlock()
}
execRes, _, execErr := workflow_execution.NewAccessor(adminReq).LoadOne(execID)
if execErr != nil || execRes == nil {
return
}
exec := execRes.(*workflow_execution.WorkflowExecution)
// BookingsState: resource released, step done.
if exec.BookingsState == nil {
exec.BookingsState = map[string]workflow_execution.BookingState{}
}
st := exec.BookingsState[evt.BookingID]
st.IsBooked = false
st.IsDone = true
exec.BookingsState[evt.BookingID] = st
// Advance the execution graph.
if len(exec.Graph) > 0 {
itemID := findItemIDByBookingID(exec, evt.BookingID)
if itemID != "" {
success := enum.BookingStatus(evt.State) == enum.SUCCESS
end := time.Now().UTC()
if evt.RealEnd != nil {
end = *evt.RealEnd
}
exec.Graph.MarkDone(itemID, success, end)
// Only advance when the step succeeded; a failure leaves dependents waiting.
if success {
start := end
if evt.RealStart != nil {
start = *evt.RealStart
}
for _, nextID := range exec.Graph.ReadyToRun() {
exec.Graph.MarkRunning(nextID, start)
}
}
}
}
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
switch bk.State {
case enum.SUCCESS, enum.FAILURE, enum.FORGOTTEN, enum.CANCELLED:
self, err := oclib.GetMySelf()
@@ -453,6 +570,21 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
}
}
// findItemIDByBookingID returns the graph item ID that owns the given booking
// ID by scanning the execution's PeerBookByGraph index
// (layout: map[peerID]map[itemID][]bookingID). Returns "" when no item
// references the booking.
func findItemIDByBookingID(exec *workflow_execution.WorkflowExecution, bookingID string) string {
	contains := func(ids []string, want string) bool {
		for _, id := range ids {
			if id == want {
				return true
			}
		}
		return false
	}
	for _, byItem := range exec.PeerBookByGraph {
		for itemID, bookingIDs := range byItem {
			if contains(bookingIDs, bookingID) {
				return itemID
			}
		}
	}
	return ""
}
func applyStepToBooking(step tools.StepMetric, adminReq *tools.APIRequest) {
res, _, err := booking.NewAccessor(adminReq).LoadOne(step.BookingID)
if err != nil || res == nil {

View File

@@ -6,9 +6,13 @@ import (
"oc-scheduler/infrastructure/planner"
"oc-scheduler/infrastructure/scheduling_resources"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/models/booking"
"cloud.o-forge.io/core/oc-lib/models/peer"
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
libutils "cloud.o-forge.io/core/oc-lib/models/utils"
"cloud.o-forge.io/core/oc-lib/models/workflow"
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
"cloud.o-forge.io/core/oc-lib/tools"
)
@@ -49,6 +53,13 @@ func handleRemoveResource(resp tools.NATSResponse) {
return
}
scheduling_resources.GetService().HandleRemovePurchase(p, adminReq)
case tools.WORKFLOW_EXECUTION:
var p scheduling_resources.RemoveResourcePayload
if err := json.Unmarshal(resp.Payload, &p); err != nil || p.ID == "" {
return
}
// DeleteOne calls GenericDeleteOne internally which fires NotifyChange.
workflow_execution.NewAccessor(adminReq).DeleteOne(p.ID)
}
}
@@ -68,6 +79,21 @@ func handleCreateResource(resp tools.NATSResponse) {
if err := json.Unmarshal(resp.Payload, &bk); err != nil {
return
}
if bk.FromNano != "" {
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
pp := access.LoadOne(bk.FromNano)
if p := pp.ToPeer(); p == nil || p.Relation == peer.NANO {
return
}
access = oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.BOOKING), nil)
d := access.LoadOne(bk.GetID())
if d.Data == nil {
access.StoreOne(bk.Serialize(&bk))
} else {
access.UpdateOne(bk.Serialize(&bk), bk.GetID())
}
return
}
needsConsiders := scheduling_resources.GetService().HandleCreateBooking(&bk, adminReq)
if needsConsiders {
payload, _ := json.Marshal(execution.ConsidersPayload{ID: bk.GetID()})
@@ -78,10 +104,43 @@ func handleCreateResource(resp tools.NATSResponse) {
if err := json.Unmarshal(resp.Payload, &pr); err != nil {
return
}
if pr.FromNano != "" {
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
pp := access.LoadOne(pr.FromNano)
if p := pp.ToPeer(); p == nil || p.Relation == peer.NANO {
return
}
access = oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PURCHASE_RESOURCE), nil)
d := access.LoadOne(pr.GetID())
if d.Data == nil {
access.StoreOne(pr.Serialize(&pr))
} else {
access.UpdateOne(pr.Serialize(&pr), pr.GetID())
}
return
}
needsConsiders := scheduling_resources.GetService().HandleCreatePurchase(&pr, adminReq)
if needsConsiders {
payload, _ := json.Marshal(execution.ConsidersPayload{ID: pr.GetID()})
execution.UpdateExecutionState(payload, tools.PURCHASE_RESOURCE)
}
case tools.WORKFLOW_EXECUTION:
// Only propagate the state change onto an execution that oc-scheduler
// already owns. Never create executions from an external NATS event:
// creation is strictly oc-scheduler's responsibility (via the session
// flow), and blindly calling StoreOne here would trigger
// StoreDraftDefault (IsDraft=true, State=DRAFT), polluting the name-
// uniqueness index and breaking the check stream's first draft creation.
var update workflow_execution.WorkflowExecution
if err := json.Unmarshal(resp.Payload, &update); err != nil || update.GetID() == "" {
return
}
res, _, loadErr := workflow_execution.NewAccessor(adminReq).LoadOne(update.GetID())
if loadErr != nil || res == nil {
return
}
exec := res.(*workflow_execution.WorkflowExecution)
exec.State = update.State
libutils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
}
}

View File

@@ -10,6 +10,7 @@ import (
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/models/booking/planner"
"cloud.o-forge.io/core/oc-lib/models/resources"
"cloud.o-forge.io/core/oc-lib/models/workflow"
"cloud.o-forge.io/core/oc-lib/models/workflow/graph"
"cloud.o-forge.io/core/oc-lib/tools"
@@ -509,6 +510,126 @@ func (s *PlannerService) NotifyWorkflow(wfID string) {
utils.Notify(&s.WorkflowSubMu, s.WorkflowSubs, wfID, struct{}{})
}
// FillForPeers fetches and waits for planners for an explicit list of peer PIDs.
// Same mechanic as Fill but decoupled from the BookingResource map — used for
// dynamic resource resolution where the peer set is not part of checkables.
// The call blocks until every requested peer has a planner in cache or the
// 5-second fetch timeout elapses, whichever comes first.
func (s *PlannerService) FillForPeers(peerPIDs []string, wfID string) {
	if len(peerPIDs) == 0 {
		return
	}
	const plannerFetchTimeout = 5 * time.Second
	// Synthetic session tag under which this round claims refresh ownership.
	tmpSession := "check-dynamic-" + wfID
	s.Mu.Lock()
	myself, _ := oclib.GetMySelf()
	for _, peerID := range peerPIDs {
		entry := s.Cache[peerID]
		if entry == nil {
			// First sighting of this peer: create a cache slot and schedule
			// its eviction after the planner TTL.
			entry = &plannerEntry{}
			s.Cache[peerID] = entry
			s.AddedAt[peerID] = time.Now().UTC()
			go s.EvictAfter(peerID, plannerTTL)
		}
		// Drop any stale planner so the wait loop below only sees fresh data.
		entry.Planner = nil
		if !entry.Refreshing {
			// Claim the refresh for this session; an already in-flight
			// refresh keeps its original owner untouched.
			entry.Refreshing = true
			entry.RefreshOwner = tmpSession
		}
	}
	s.Mu.Unlock()
	// Release the ownerships taken under tmpSession when this call returns.
	defer s.ReleaseRefreshOwnership(peerPIDs, tmpSession)
	for _, peerID := range peerPIDs {
		if myself != nil && myself.PeerID == peerID {
			// Our own planner is rebuilt locally, asynchronously.
			go s.RefreshSelf(peerID, &tools.APIRequest{Admin: true})
		} else {
			// Remote peers are asked over the propagation channel to push
			// their planner back.
			payload, _ := json.Marshal(map[string]any{"peer_id": peerID})
			utils.Propalgate(peerID, tools.PropalgationMessage{
				Action: tools.PB_PLANNER,
				Payload: payload,
			})
		}
	}
	// Bounded wait: each round subscribes to updates for the still-missing
	// peers, wakes on any update (or the remaining timeout), then recomputes
	// which peers still lack a planner.
	deadline := time.Now().Add(plannerFetchTimeout)
	remaining := slices.Clone(peerPIDs)
	for len(remaining) > 0 {
		wait := time.Until(deadline)
		if wait <= 0 {
			return
		}
		ch, cancelSub := SubscribeUpdates(s.Subs, &s.SubMu, remaining...)
		select {
		case <-ch:
			// At least one watched peer changed; re-check the cache below.
		case <-time.After(wait):
			cancelSub()
			return
		}
		cancelSub()
		// Rebuild the missing set in place (reuses the backing array).
		remaining = remaining[:0]
		s.Mu.RLock()
		for _, pid := range peerPIDs {
			if entry := s.Cache[pid]; entry == nil || entry.Planner == nil {
				remaining = append(remaining, pid)
			}
		}
		s.Mu.RUnlock()
	}
}
// FillDynamic resolves every peer DID referenced by the given dynamic
// resources into a PID, triggers one batched planner fetch for all of them via
// FillForPeers, and returns the DID→PID mapping consumed by ResolveDynamic.
func (s *PlannerService) FillDynamic(dynamics []*resources.DynamicResource, wfID string) map[string]string {
	mapping := map[string]string{}
	var pids []string
	peerAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(tools.PEER), nil)
	for _, dyn := range dynamics {
		for _, did := range dyn.PeerIds {
			// Skip blanks and DIDs resolved on a previous iteration.
			if did == "" || mapping[did] != "" {
				continue
			}
			data := peerAccess.LoadOne(did)
			if data.Data == nil {
				continue
			}
			p := data.ToPeer()
			if p == nil {
				continue
			}
			mapping[did] = p.PeerID
			pids = append(pids, p.PeerID)
		}
	}
	s.FillForPeers(pids, wfID)
	return mapping
}
// ResolveDynamic walks the sorted instance list of a DynamicResource via
// GetSelectedInstance and returns true as soon as it finds an instance whose
// peer's planner confirms availability for [start, end].
// d.SelectedIndex is updated to the elected instance on success.
// Peers that did not respond (no planner in cache) are skipped.
func (s *PlannerService) ResolveDynamic(d *resources.DynamicResource, didToPID map[string]string, start time.Time, end *time.Time) bool {
	for {
		// NOTE(review): termination relies on GetSelectedInstance(nil)
		// advancing past rejected candidates on each call and eventually
		// returning nil — confirm, otherwise a `continue` below on a
		// non-advancing resource would spin this loop forever.
		inst := d.GetSelectedInstance(nil)
		if inst == nil {
			return false // exhausted all candidates
		}
		// assumes PeerIds and ResourceIds are parallel slices aligned with
		// SelectedIndex — TODO confirm DynamicResource invariants (an index
		// out of range here would panic).
		did := d.PeerIds[d.SelectedIndex]
		resourceID := d.ResourceIds[d.SelectedIndex]
		pid, ok := didToPID[did]
		if !ok {
			continue // peer DID could not be resolved
		}
		// Snapshot the cache entry under the read lock; the planner pointer
		// is then used outside the lock.
		s.Mu.RLock()
		entry := s.Cache[pid]
		s.Mu.RUnlock()
		if entry == nil || entry.Planner == nil {
			continue // peer did not respond in time
		}
		if s.checkInstance(entry.Planner, resourceID, inst.GetID(), start, end) {
			return true // d.SelectedIndex points to the elected instance
		}
	}
}
// checkInstance checks availability for the specific instance resolved by the
// scheduler. When instanceID is empty (no instance selected / none resolvable),
// it falls back to checking all instances known in the planner and returns true

View File

@@ -13,6 +13,7 @@ import (
"cloud.o-forge.io/core/oc-lib/models/booking"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
"cloud.o-forge.io/core/oc-lib/models/common/pricing"
"cloud.o-forge.io/core/oc-lib/models/resources"
"cloud.o-forge.io/core/oc-lib/models/utils"
"cloud.o-forge.io/core/oc-lib/models/workflow"
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
@@ -40,12 +41,14 @@ type WorkflowSchedule struct {
DurationS float64 `json:"duration_s" default:"-1"`
Cron string `json:"cron,omitempty"`
BookingMode booking.BookingMode `json:"booking_mode,omitempty"`
SelectedInstances workflow.ConfigItem `json:"selected_instances"`
SelectedPartnerships workflow.ConfigItem `json:"selected_partnerships"`
SelectedBuyings workflow.ConfigItem `json:"selected_buyings"`
SelectedStrategies workflow.ConfigItem `json:"selected_strategies"`
SelectedBillingStrategy pricing.BillingStrategy `json:"selected_billing_strategy"`
BookingMode booking.BookingMode `json:"booking_mode,omitempty"`
SelectedInstances workflow.ConfigItem `json:"selected_instances"`
SelectedPartnerships workflow.ConfigItem `json:"selected_partnerships"`
SelectedBuyings workflow.ConfigItem `json:"selected_buyings"`
SelectedStrategies workflow.ConfigItem `json:"selected_strategies"`
SelectedPaymentType workflow.ConfigItem `json:"selected_payment_type"`
SelectedBillingStrategy pricing.BillingStrategy `json:"selected_billing_strategy"`
SelectedEmbeddedStorages map[string]*resources.EmbeddedStorageSelection `json:"selected_embedded_storages,omitempty"`
// Confirm, when true, triggers Schedule() to confirm the drafts held by this session.
Confirm bool `json:"confirm,omitempty"`
@@ -119,6 +122,28 @@ func (ws *WorkflowSchedule) Check(wfID string, asap bool, preemption bool, reque
checkables := infUtils.CollectBookingResources(wf, ws.SelectedInstances)
start, end, available, preemptible, warnings := planner.GetPlannerService().FindDate(wfID, checkables, start, end, preemption, asap)
// Dynamic resources are resolved separately: their peer planners are fetched
// and the sorted instance list is walked until an available one is found.
var dynamics []*resources.DynamicResource
for _, item := range wf.GetGraphItems(wf.Graph.IsDynamic) {
_, res := item.GetResource()
if res == nil {
continue
}
d := res.(*resources.DynamicResource)
d.SetAllowedInstances(request)
dynamics = append(dynamics, d)
}
if len(dynamics) > 0 {
didToPID := planner.GetPlannerService().FillDynamic(dynamics, wfID)
for _, d := range dynamics {
if !planner.GetPlannerService().ResolveDynamic(d, didToPID, start, end) {
available = false
warnings = append(warnings, "no available instance for dynamic resource "+d.GetName())
}
}
}
return &CheckResult{
Start: start,
End: end,
@@ -197,12 +222,17 @@ func (ws *WorkflowSchedule) GenerateExecutions(wf *workflow.Workflow, isPreempti
UUID: uuid.New().String(),
Name: wf.Name + " execution " + date.Start.Format("2006-01-02 15:04"),
},
Priority: 1,
ExecutionsID: ws.UUID,
ExecDate: date.Start,
EndDate: date.End,
State: enum.DRAFT,
WorkflowID: wf.GetID(),
SelectedInstances: ws.SelectedInstances,
SelectedPartnerships: ws.SelectedPartnerships,
SelectedBuyings: ws.SelectedBuyings,
SelectedStrategies: ws.SelectedStrategies,
SelectedEmbeddedStorages: ws.SelectedEmbeddedStorages,
Priority: 1,
ExecutionsID: ws.UUID,
ExecDate: date.Start,
EndDate: date.End,
State: enum.DRAFT,
WorkflowID: wf.GetID(),
}
if ws.BookingMode != booking.PLANNED {
obj.Priority = 0

View File

@@ -10,6 +10,8 @@ import (
"oc-scheduler/infrastructure/planner"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/config"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/booking"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
"cloud.o-forge.io/core/oc-lib/models/peer"
@@ -221,6 +223,9 @@ func (s *SchedulingResourcesService) Delete(dt tools.DataType, bk SchedulerObjec
if dt == tools.BOOKING {
planner.GetPlannerService().RefreshSelf(selfID.PeerID, request)
}
if (dt == tools.BOOKING || dt == tools.PURCHASE_RESOURCE) && config.GetConfig().IsNano {
SendRemoveToMaster(bk, dt)
}
return
}
EmitNATSRemove(bk.GetID(), bk.GetPeerSession(), bk.GetExecutionsId(), dt)
@@ -299,8 +304,14 @@ func DraftTimeout(id string, dt tools.DataType) {
switch dt {
case tools.BOOKING:
booking.NewAccessor(adminReq).DeleteOne(id)
if config.GetConfig().IsNano {
SendRemoveToMaster(res, dt)
}
case tools.PURCHASE_RESOURCE:
purchase_resource.NewAccessor(adminReq).DeleteOne(id)
if config.GetConfig().IsNano {
SendRemoveToMaster(res, dt)
}
}
fmt.Printf("DraftTimeout: %s %s deleted (still draft after 10 min)\n", dt.String(), id)
}
@@ -316,7 +327,6 @@ func (s *SchedulingResourcesService) HandleCreateBooking(bk *booking.Booking, ad
if self == nil {
return false
}
if existing, _, loadErr := booking.NewAccessor(adminReq).LoadOne(bk.GetID()); loadErr == nil && existing != nil {
prev := existing.(*booking.Booking)
if prev.SchedulerPeerID != bk.SchedulerPeerID || prev.ExecutionsID != bk.ExecutionsID {
@@ -329,13 +339,20 @@ func (s *SchedulingResourcesService) HandleCreateBooking(bk *booking.Booking, ad
if !bk.IsDraft && !prev.ExpectedStartDate.IsZero() && prev.ExpectedStartDate.Before(time.Now().UTC()) {
fmt.Println("HandleCreateBooking: expired, deleting", bk.GetID())
booking.NewAccessor(adminReq).DeleteOne(bk.GetID())
if config.GetConfig().IsNano {
SendRemoveToMaster(bk, tools.BOOKING)
}
return false
}
if _, _, err := utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq)); err != nil {
fmt.Println("HandleCreateBooking: update failed:", err)
return false
}
if config.GetConfig().IsNano {
SendBookingToMaster(bk)
}
planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
return !bk.IsDraft
}
@@ -348,6 +365,7 @@ func (s *SchedulingResourcesService) HandleCreateBooking(bk *booking.Booking, ad
fmt.Println("HandleCreateBooking: conflicts with local planner, discarding")
return false
}
bk.IsDraft = true
stored, _, err := booking.NewAccessor(adminReq).StoreOne(bk)
if err != nil {
@@ -355,11 +373,126 @@ func (s *SchedulingResourcesService) HandleCreateBooking(bk *booking.Booking, ad
return false
}
storedID := stored.GetID()
planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
time.AfterFunc(10*time.Minute, func() { DraftTimeout(storedID, tools.BOOKING) })
if config.GetConfig().IsNano {
SendBookingToMaster(bk) // TODO : ASK FOR RESPONSE...
}
return false
}
// SendBookingToMaster mirrors a locally-created booking to the MASTER peer via
// a NATS propagation event. Only the booking's creator emits the message so
// exactly one node is responsible for propagation.
//
// Fixes over the previous version: the parameter no longer shadows the
// imported `booking` package; `self` is nil-checked before use (GetMySelf can
// fail — see HandleWorkflowDone's guard); and the pointless
// marshal/unmarshal round-trip of an empty map has been removed.
func SendBookingToMaster(bk *booking.Booking) {
	self, _ := oclib.GetMySelf()
	if self == nil || bk.GetCreatorID() != self.GetID() {
		return
	}
	// Locate the (at most one) MASTER peer this nano node reports to.
	d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
		And: map[string][]dbs.Filter{
			"relation": {{Operator: dbs.EQUAL.String(), Value: peer.MASTER}},
		},
	}, "", false, 0, 1)
	if len(d.Data) == 0 {
		return // no master known: nothing to propagate to
	}
	// The master stores the booking as a confirmed, nano-originated copy.
	bk.IsDraft = false
	bk.FromNano = self.GetID()
	for _, dd := range d.Data {
		// NOTE(review): the payload carries only the target peer_id — the
		// booking document itself is not embedded; confirm the receiver can
		// obtain the object through another channel.
		payload, err := json.Marshal(map[string]interface{}{"peer_id": dd.(*peer.Peer).PeerID})
		if err != nil {
			continue
		}
		msg, err := json.Marshal(&tools.PropalgationMessage{
			DataType: tools.BOOKING.EnumIndex(),
			Action:   tools.PB_CREATE,
			Payload:  payload,
		})
		if err != nil {
			continue
		}
		tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
			FromApp:  "oc-scheduler",
			Datatype: tools.BOOKING,
			Method:   int(tools.PROPALGATION_EVENT),
			Payload:  msg,
		})
	}
}
// SendRemoveToMaster tells the MASTER peer to delete its mirrored copy of a
// locally-removed object (booking or purchase). Only the object's creator
// emits the event so exactly one node is responsible for propagation.
//
// Fixes over the previous version: `self` is nil-checked before use
// (GetMySelf can fail), and the pointless marshal/unmarshal round-trip of an
// empty map has been removed.
func SendRemoveToMaster(obj utils.DBObject, dt tools.DataType) {
	self, _ := oclib.GetMySelf()
	if self == nil || obj.GetCreatorID() != self.GetID() {
		return
	}
	// Locate the (at most one) MASTER peer this nano node reports to.
	d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
		And: map[string][]dbs.Filter{
			"relation": {{Operator: dbs.EQUAL.String(), Value: peer.MASTER}},
		},
	}, "", false, 0, 1)
	for _, dd := range d.Data {
		// NOTE(review): the payload carries only the target peer_id — the ID
		// of the object to delete is not included; confirm the master can
		// resolve which document to remove.
		payload, err := json.Marshal(map[string]interface{}{"peer_id": dd.(*peer.Peer).PeerID})
		if err != nil {
			continue
		}
		msg, err := json.Marshal(&tools.PropalgationMessage{
			DataType: dt.EnumIndex(),
			Action:   tools.PB_DELETE,
			Payload:  payload,
		})
		if err != nil {
			continue
		}
		tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
			FromApp:  "oc-scheduler",
			Datatype: dt,
			Method:   int(tools.PROPALGATION_EVENT),
			Payload:  msg,
		})
	}
}
// SendPurchaseToMaster mirrors a locally-created purchase to the MASTER peer
// via a NATS propagation event. Only the purchase's creator emits the message
// so exactly one node is responsible for propagation.
//
// Fixes over the previous version: `self` is nil-checked before use
// (GetMySelf can fail), and the pointless marshal/unmarshal round-trip of an
// empty map has been removed.
func SendPurchaseToMaster(purchase *purchase_resource.PurchaseResource) {
	self, _ := oclib.GetMySelf()
	if self == nil || purchase.GetCreatorID() != self.GetID() {
		return
	}
	// Locate the (at most one) MASTER peer this nano node reports to.
	d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
		And: map[string][]dbs.Filter{
			"relation": {{Operator: dbs.EQUAL.String(), Value: peer.MASTER}},
		},
	}, "", false, 0, 1)
	if len(d.Data) == 0 {
		return // no master known: nothing to propagate to
	}
	// The master stores the purchase as a confirmed, nano-originated copy.
	purchase.IsDraft = false
	purchase.FromNano = self.GetID()
	for _, dd := range d.Data {
		// NOTE(review): the payload carries only the target peer_id — the
		// purchase document itself is not embedded; confirm the receiver can
		// obtain the object through another channel.
		payload, err := json.Marshal(map[string]interface{}{"peer_id": dd.(*peer.Peer).PeerID})
		if err != nil {
			continue
		}
		msg, err := json.Marshal(&tools.PropalgationMessage{
			DataType: tools.PURCHASE_RESOURCE.EnumIndex(),
			Action:   tools.PB_CREATE,
			Payload:  payload,
		})
		if err != nil {
			continue
		}
		tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
			FromApp:  "oc-scheduler",
			Datatype: tools.PURCHASE_RESOURCE,
			Method:   int(tools.PROPALGATION_EVENT),
			Payload:  msg,
		})
	}
}
// HandleCreatePurchase processes an incoming purchase from NATS.
// Returns true if considers must be triggered.
func (s *SchedulingResourcesService) HandleCreatePurchase(pr *purchase_resource.PurchaseResource, adminReq *tools.APIRequest) bool {
@@ -393,6 +526,9 @@ func (s *SchedulingResourcesService) HandleCreatePurchase(pr *purchase_resource.
fmt.Println("HandleCreatePurchase: could not store:", err)
return false
}
if config.GetConfig().IsNano {
SendPurchaseToMaster(pr) // TODO : ASK FOR RESPONSE...
}
storedID := stored.GetID()
time.AfterFunc(10*time.Minute, func() { DraftTimeout(storedID, tools.PURCHASE_RESOURCE) })
return false
@@ -405,14 +541,17 @@ func (s *SchedulingResourcesService) HandleRemoveBooking(p RemoveResourcePayload
return
}
existing := res.(*booking.Booking)
if existing.SchedulerPeerID != p.SchedulerPeerID || existing.ExecutionsID != p.ExecutionsID {
if existing.SchedulerPeerID != p.SchedulerPeerID || existing.ExecutionsID != p.ExecutionsID || existing.IsDraft {
fmt.Println("HandleRemoveBooking: auth mismatch, ignoring", p.ID)
return
}
booking.NewAccessor(adminReq).DeleteOne(p.ID)
d, _, _ := booking.NewAccessor(adminReq).DeleteOne(p.ID)
if self := s.Self(); self != nil {
planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
}
if config.GetConfig().IsNano && d != nil {
SendRemoveToMaster(d, tools.BOOKING) // TODO : ASK FOR RESPONSE...
}
}
// HandleRemovePurchase verifies auth and deletes the purchase.
@@ -422,11 +561,14 @@ func (s *SchedulingResourcesService) HandleRemovePurchase(p RemoveResourcePayloa
return
}
existing := res.(*purchase_resource.PurchaseResource)
if existing.SchedulerPeerID != p.SchedulerPeerID || existing.ExecutionsID != p.ExecutionsID {
if existing.SchedulerPeerID != p.SchedulerPeerID || existing.ExecutionsID != p.ExecutionsID || existing.IsDraft {
fmt.Println("HandleRemovePurchase: auth mismatch, ignoring", p.ID)
return
}
purchase_resource.NewAccessor(adminReq).DeleteOne(p.ID)
d, _, _ := purchase_resource.NewAccessor(adminReq).DeleteOne(p.ID)
if config.GetConfig().IsNano && d != nil {
SendRemoveToMaster(d, tools.PURCHASE_RESOURCE) // TODO : ASK FOR RESPONSE...
}
}
// ---------------------------------------------------------------------------

View File

@@ -30,6 +30,48 @@ func NewSessionExecutionsService(sessionID string) *SessionExecutionsService {
return &SessionExecutionsService{ExecutionsSessionID: sessionID}
}
// ---------------------------------------------------------------------------
// Remote resource registry
//
// Bookings and purchases for remote peers are sent via NATS and stored only on
// the remote peer — they never appear in local MongoDB. CleanupSession would
// therefore miss them entirely. We keep a package-level in-memory registry
// (executionsID → list) that is populated when PropagateCreate routes to a
// remote peer, and consumed (cleared) by CleanupSession so it can emit the
// corresponding REMOVE_RESOURCE NATS messages.
// ---------------------------------------------------------------------------
// remoteResourceEntry captures everything CleanupSession needs to emit a
// REMOVE_RESOURCE message later for a resource that only exists on a remote
// peer (never stored in local MongoDB).
type remoteResourceEntry struct {
	ID              string         // resource (booking/purchase) ID on the remote peer
	SchedulerPeerID string         // peer that scheduled the resource
	ExecutionsID    string         // session the resource belongs to
	DT              tools.DataType // BOOKING or PURCHASE_RESOURCE
}

// Package-level registry of remote-only resources, keyed by executions
// (session) ID and guarded by remoteRegistryMu.
var (
	remoteRegistryMu sync.Mutex
	remoteRegistry   = map[string][]remoteResourceEntry{}
)
// trackRemoteResource records a resource that was routed to a remote peer so
// CleanupSession can later emit the matching REMOVE_RESOURCE message. Entries
// with an empty ID are ignored. Safe for concurrent use.
func trackRemoteResource(executionsID, id, schedulerPeerID string, dt tools.DataType) {
	if id == "" {
		return // nothing to track without a resource ID
	}
	entry := remoteResourceEntry{
		ID:              id,
		SchedulerPeerID: schedulerPeerID,
		ExecutionsID:    executionsID,
		DT:              dt,
	}
	remoteRegistryMu.Lock()
	defer remoteRegistryMu.Unlock()
	remoteRegistry[executionsID] = append(remoteRegistry[executionsID], entry)
}
// consumeTrackedRemotes atomically returns and removes all remote resources
// tracked for the given session; a second call for the same session returns
// nil. Safe for concurrent use.
func consumeTrackedRemotes(executionsID string) []remoteResourceEntry {
	remoteRegistryMu.Lock()
	tracked := remoteRegistry[executionsID]
	delete(remoteRegistry, executionsID)
	remoteRegistryMu.Unlock()
	return tracked
}
// ---------------------------------------------------------------------------
// DB helpers
// ---------------------------------------------------------------------------
@@ -164,6 +206,11 @@ func (s *SessionExecutionsService) upsertDrafts(
scheduling_resources.GetService().PropagateCreate(
scheduling_resources.FromSchedulerDBObject(dt, bk), bk.GetDestPeer(), dt, request, errCh)
<-errCh
// If this booking/purchase was routed to a remote peer (not stored in
// local DB), register it so CleanupSession can emit REMOVE_RESOURCE later.
if self != nil && bk.GetDestPeer() != self.GetID() {
trackRemoteResource(s.ExecutionsSessionID, bk.GetID(), bk.GetPeerSession(), dt)
}
}
}
}
@@ -184,6 +231,14 @@ func (s *SessionExecutionsService) CleanupSession(request *tools.APIRequest) {
}
}
// Emit NATS REMOVE_RESOURCE for bookings/purchases that were routed to
// remote peers and therefore never stored in local DB. loadSession above
// cannot find them, so we rely on the in-memory registry populated by
// upsertDrafts when PropagateCreate routes to a non-self peer.
for _, entry := range consumeTrackedRemotes(s.ExecutionsSessionID) {
scheduling_resources.EmitNATSRemove(entry.ID, entry.SchedulerPeerID, entry.ExecutionsID, entry.DT)
}
for _, exec := range s.LoadSessionExecs() {
execution.UnregisterExecLock(exec.GetID())
workflow_execution.NewAccessor(adminReq).DeleteOne(exec.GetID())
@@ -227,6 +282,7 @@ func GenerateOrder(
}
func (s *SessionExecutionsService) ConfirmSession(request *tools.APIRequest) error {
adminReq := &tools.APIRequest{Admin: true}
for _, dt := range []tools.DataType{tools.BOOKING, tools.PURCHASE_RESOURCE} {
for _, bk := range s.loadSession(dt) {
bk.SetIsDraft(false)
@@ -239,5 +295,9 @@ func (s *SessionExecutionsService) ConfirmSession(request *tools.APIRequest) err
}
}
}
for _, exec := range s.LoadSessionExecs() {
exec.State = enum.SCHEDULED
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
}
return nil
}

View File

@@ -60,6 +60,10 @@ func CollectBookingResources(wf *workflow.Workflow, selectedInstances workflow.C
if inst := r.GetSelectedInstance(idx); inst != nil {
return inst.GetID()
}
case *resources.ServiceResource:
if inst := r.GetSelectedInstance(idx); inst != nil {
return inst.GetID()
}
}
return ""
}
@@ -106,6 +110,39 @@ func CollectBookingResources(wf *workflow.Workflow, selectedInstances workflow.C
}
}
// HOSTED services: capacity is capped by MaxConcurrent on the LiveService.
// The peer to watch is the creator (who operates the service).
// DEPLOYMENT services are covered through their linked compute unit.
for _, item := range wf.GetGraphItems(wf.Graph.IsService) {
_, res := item.GetResource()
if res == nil {
continue
}
svc := res.(*resources.ServiceResource)
idx := selectedInstances.Get(svc.GetID())
inst := svc.GetSelectedInstance(idx)
if inst == nil {
continue
}
if inst.(*resources.ServiceInstance).Mode != resources.HOSTED {
continue
}
id := svc.GetID()
if seen[id] {
continue
}
pid := resolvePID(svc.GetCreatorID())
if pid == "" {
continue
}
seen[id] = true
result[pid] = BookingResource{
ID: id,
PeerPID: pid,
InstanceID: resolveInstanceID(res),
}
}
return result
}
@@ -147,6 +184,35 @@ func GetWorkflowPeerIDs(wfID string, request *tools.APIRequest) ([]string, error
peerIDs = append(peerIDs, id)
}
}
for _, item := range wf.GetGraphItems(wf.Graph.IsService) {
_, res := item.GetResource()
if res == nil {
continue
}
svc := res.(*resources.ServiceResource)
if len(svc.Instances) == 0 || svc.Instances[0].Mode != resources.HOSTED {
continue
}
if id := svc.GetCreatorID(); id != "" && !seen[id] {
seen[id] = true
peerIDs = append(peerIDs, id)
}
}
for _, item := range wf.GetGraphItems(wf.Graph.IsDynamic) {
_, res := item.GetResource()
if res == nil {
continue
}
d := res.(*resources.DynamicResource)
d.SetAllowedInstances(request)
for _, creatorID := range d.PeerIds {
if creatorID != "" && !seen[creatorID] {
seen[creatorID] = true
peerIDs = append(peerIDs, creatorID)
}
}
}
realPeersID := []string{}
access := oclib.NewRequestAdmin(oclib.LibDataEnum(tools.PEER), nil)
for _, id := range peerIDs {