Scheduler + Observe

This commit is contained in:
mr
2026-04-29 07:45:41 +02:00
parent 4b9b1b8b91
commit 3be023b9af
20 changed files with 1006 additions and 87 deletions

View File

@@ -13,10 +13,12 @@ import (
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/config"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/booking"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
"cloud.o-forge.io/core/oc-lib/models/order"
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
"cloud.o-forge.io/core/oc-lib/models/utils"
"cloud.o-forge.io/core/oc-lib/models/workflow"
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
@@ -144,13 +146,19 @@ func UpdateExecutionState(payload []byte, dt tools.DataType) {
st := exec.BookingsState[data.ID]
st.IsBooked = true
exec.BookingsState[data.ID] = st
if config.GetConfig().IsNano {
scheduling_resources.SendBookingToMaster(schdata.Data.(*booking.Booking)) // TODO : ASK FOR RESPONSE...
}
case tools.PURCHASE_RESOURCE:
if exec.PurchasesState == nil {
exec.PurchasesState = map[string]bool{}
}
exec.PurchasesState[data.ID] = true
if config.GetConfig().IsNano {
scheduling_resources.SendPurchaseToMaster(schdata.Data.(*purchase_resource.PurchaseResource)) // TODO : ASK FOR RESPONSE...
}
}
// TODO REMOVE
allConfirmed := true
for _, st := range exec.BookingsState {
if !st.IsBooked {
@@ -366,6 +374,13 @@ func HandleWorkflowStarted(resp tools.NATSResponse) {
return
}
adminReq := &tools.APIRequest{Admin: true}
mu := GetExecLock(evt.ExecutionID)
if mu != nil {
mu.Lock()
defer mu.Unlock()
}
res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(evt.ExecutionID)
if err != nil || res == nil {
return
@@ -375,6 +390,22 @@ func HandleWorkflowStarted(resp tools.NATSResponse) {
if evt.RealStart != nil {
exec.ExecDate = *evt.RealStart
}
// Build the execution graph summary from the workflow graph on first start.
if len(exec.Graph) == 0 {
wfRes, _, wfErr := workflow.NewAccessor(adminReq).LoadOne(exec.WorkflowID)
if wfErr == nil && wfRes != nil {
exec.Graph = workflow_execution.BuildExecutionGraph(wfRes.(*workflow.Workflow).Graph)
}
}
// Advance steps whose deps are already satisfied (typically the entry nodes).
if len(exec.Graph) > 0 {
now := time.Now().UTC()
for _, id := range exec.Graph.ReadyToRun() {
exec.Graph.MarkRunning(id, now)
}
}
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
}
@@ -384,6 +415,13 @@ func HandleWorkflowDone(resp tools.NATSResponse) {
return
}
adminReq := &tools.APIRequest{Admin: true}
mu := GetExecLock(evt.ExecutionID)
if mu != nil {
mu.Lock()
defer mu.Unlock()
}
res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(evt.ExecutionID)
if err != nil || res == nil {
return
@@ -393,17 +431,58 @@ func HandleWorkflowDone(resp tools.NATSResponse) {
if evt.RealEnd != nil {
exec.EndDate = evt.RealEnd
}
// All bookings are no longer reserved and are done
// Release all booking reservations (workflow is over) without overwriting
// IsDone: individual step events already set the authoritative done state
// for each booking. Resetting everything here would lose that granularity.
if exec.BookingsState == nil {
exec.BookingsState = map[string]workflow_execution.BookingState{}
}
for id := range exec.BookingsState {
exec.BookingsState[id] = workflow_execution.BookingState{IsBooked: false, IsDone: true}
for id, st := range exec.BookingsState {
st.IsBooked = false
exec.BookingsState[id] = st
}
// Graph items that already reached success/failure keep their state.
// Items still in running when the execution terminates receive the terminal
// state (the step was active but no step_done event arrived before the
// workflow finished — treat it as the execution outcome).
terminalSuccess := enum.BookingStatus(evt.State) == enum.SUCCESS
nowGraph := time.Now().UTC()
for itemID, item := range exec.Graph {
if item.State == workflow_execution.StepRunning {
exec.Graph.MarkDone(itemID, terminalSuccess, nowGraph)
}
}
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
// Build a set of booking IDs already covered by per-step events so we only
// fall back for bookings the orchestrator never emitted a step for (e.g. storage).
coveredByStep := map[string]bool{}
for _, step := range evt.Steps {
applyStepToBooking(step, adminReq)
coveredByStep[step.BookingID] = true
}
// Propagate the execution's terminal state to any booking that was not
// updated by a step event and is not already in a terminal state.
terminalState := enum.BookingStatus(evt.State)
now := time.Now().UTC()
for id := range exec.BookingsState {
if coveredByStep[id] {
continue
}
res, _, err := booking.NewAccessor(adminReq).LoadOne(id)
if err != nil || res == nil {
continue
}
bk := res.(*booking.Booking)
if terminalExecStates[bk.State] {
continue
}
bk.State = terminalState
bk.RealEndDate = &now
utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
}
self, err := oclib.GetMySelf()
if err == nil && self != nil {
go planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
@@ -416,6 +495,8 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
return
}
adminReq := &tools.APIRequest{Admin: true}
// Update the booking itself first (no exec lock needed for the booking doc).
res, _, err := booking.NewAccessor(adminReq).LoadOne(evt.BookingID)
if err != nil || res == nil {
return
@@ -430,20 +511,56 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
}
utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
// Update BookingsState in the parent WorkflowExecution: resource released, step done
execRes, _, execErr := workflow_execution.NewAccessor(adminReq).LoadOne(bk.ExecutionID)
if execErr == nil && execRes != nil {
exec := execRes.(*workflow_execution.WorkflowExecution)
if exec.BookingsState == nil {
exec.BookingsState = map[string]workflow_execution.BookingState{}
}
st := exec.BookingsState[evt.BookingID]
st.IsBooked = false
st.IsDone = true
exec.BookingsState[evt.BookingID] = st
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
// Update the parent WorkflowExecution under its exec lock to avoid races
// between concurrent WORKFLOW_STEP_DONE_EVENT deliveries.
execID := bk.ExecutionID
mu := GetExecLock(execID)
if mu != nil {
mu.Lock()
defer mu.Unlock()
}
execRes, _, execErr := workflow_execution.NewAccessor(adminReq).LoadOne(execID)
if execErr != nil || execRes == nil {
return
}
exec := execRes.(*workflow_execution.WorkflowExecution)
// BookingsState: resource released, step done.
if exec.BookingsState == nil {
exec.BookingsState = map[string]workflow_execution.BookingState{}
}
st := exec.BookingsState[evt.BookingID]
st.IsBooked = false
st.IsDone = true
exec.BookingsState[evt.BookingID] = st
// Advance the execution graph.
if len(exec.Graph) > 0 {
itemID := findItemIDByBookingID(exec, evt.BookingID)
if itemID != "" {
success := enum.BookingStatus(evt.State) == enum.SUCCESS
end := time.Now().UTC()
if evt.RealEnd != nil {
end = *evt.RealEnd
}
exec.Graph.MarkDone(itemID, success, end)
// Only advance when the step succeeded; a failure leaves dependents waiting.
if success {
start := end
if evt.RealStart != nil {
start = *evt.RealStart
}
for _, nextID := range exec.Graph.ReadyToRun() {
exec.Graph.MarkRunning(nextID, start)
}
}
}
}
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
switch bk.State {
case enum.SUCCESS, enum.FAILURE, enum.FORGOTTEN, enum.CANCELLED:
self, err := oclib.GetMySelf()
@@ -453,6 +570,21 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
}
}
// findItemIDByBookingID scans exec.PeerBookByGraph for the graph item whose
// booking list contains bookingID and returns that item's ID. It returns ""
// when no item references the booking.
// PeerBookByGraph layout: map[peerID]map[itemID][]bookingID
func findItemIDByBookingID(exec *workflow_execution.WorkflowExecution, bookingID string) string {
	// The peer key is irrelevant for the reverse lookup; only the per-item
	// booking lists are inspected.
	for _, itemBookings := range exec.PeerBookByGraph {
		for candidateID, ids := range itemBookings {
			for _, current := range ids {
				if current != bookingID {
					continue
				}
				return candidateID
			}
		}
	}
	// No item owns this booking.
	return ""
}
func applyStepToBooking(step tools.StepMetric, adminReq *tools.APIRequest) {
res, _, err := booking.NewAccessor(adminReq).LoadOne(step.BookingID)
if err != nil || res == nil {