Scheduler + Observe
This commit is contained in:
@@ -13,10 +13,12 @@ import (
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/config"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
"cloud.o-forge.io/core/oc-lib/models/booking"
|
||||
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||||
"cloud.o-forge.io/core/oc-lib/models/order"
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
|
||||
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||||
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
|
||||
@@ -144,13 +146,19 @@ func UpdateExecutionState(payload []byte, dt tools.DataType) {
|
||||
st := exec.BookingsState[data.ID]
|
||||
st.IsBooked = true
|
||||
exec.BookingsState[data.ID] = st
|
||||
if config.GetConfig().IsNano {
|
||||
scheduling_resources.SendBookingToMaster(schdata.Data.(*booking.Booking)) // TODO : ASK FOR RESPONSE...
|
||||
}
|
||||
case tools.PURCHASE_RESOURCE:
|
||||
if exec.PurchasesState == nil {
|
||||
exec.PurchasesState = map[string]bool{}
|
||||
}
|
||||
exec.PurchasesState[data.ID] = true
|
||||
if config.GetConfig().IsNano {
|
||||
scheduling_resources.SendPurchaseToMaster(schdata.Data.(*purchase_resource.PurchaseResource)) // TODO : ASK FOR RESPONSE...
|
||||
}
|
||||
}
|
||||
|
||||
// TODO REMOVE
|
||||
allConfirmed := true
|
||||
for _, st := range exec.BookingsState {
|
||||
if !st.IsBooked {
|
||||
@@ -366,6 +374,13 @@ func HandleWorkflowStarted(resp tools.NATSResponse) {
|
||||
return
|
||||
}
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
|
||||
mu := GetExecLock(evt.ExecutionID)
|
||||
if mu != nil {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
}
|
||||
|
||||
res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(evt.ExecutionID)
|
||||
if err != nil || res == nil {
|
||||
return
|
||||
@@ -375,6 +390,22 @@ func HandleWorkflowStarted(resp tools.NATSResponse) {
|
||||
if evt.RealStart != nil {
|
||||
exec.ExecDate = *evt.RealStart
|
||||
}
|
||||
|
||||
// Build the execution graph summary from the workflow graph on first start.
|
||||
if len(exec.Graph) == 0 {
|
||||
wfRes, _, wfErr := workflow.NewAccessor(adminReq).LoadOne(exec.WorkflowID)
|
||||
if wfErr == nil && wfRes != nil {
|
||||
exec.Graph = workflow_execution.BuildExecutionGraph(wfRes.(*workflow.Workflow).Graph)
|
||||
}
|
||||
}
|
||||
// Advance steps whose deps are already satisfied (typically the entry nodes).
|
||||
if len(exec.Graph) > 0 {
|
||||
now := time.Now().UTC()
|
||||
for _, id := range exec.Graph.ReadyToRun() {
|
||||
exec.Graph.MarkRunning(id, now)
|
||||
}
|
||||
}
|
||||
|
||||
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
|
||||
}
|
||||
|
||||
@@ -384,6 +415,13 @@ func HandleWorkflowDone(resp tools.NATSResponse) {
|
||||
return
|
||||
}
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
|
||||
mu := GetExecLock(evt.ExecutionID)
|
||||
if mu != nil {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
}
|
||||
|
||||
res, _, err := workflow_execution.NewAccessor(adminReq).LoadOne(evt.ExecutionID)
|
||||
if err != nil || res == nil {
|
||||
return
|
||||
@@ -393,17 +431,58 @@ func HandleWorkflowDone(resp tools.NATSResponse) {
|
||||
if evt.RealEnd != nil {
|
||||
exec.EndDate = evt.RealEnd
|
||||
}
|
||||
// All bookings are no longer reserved and are done
|
||||
// Release all booking reservations (workflow is over) without overwriting
|
||||
// IsDone: individual step events already set the authoritative done state
|
||||
// for each booking. Resetting everything here would lose that granularity.
|
||||
if exec.BookingsState == nil {
|
||||
exec.BookingsState = map[string]workflow_execution.BookingState{}
|
||||
}
|
||||
for id := range exec.BookingsState {
|
||||
exec.BookingsState[id] = workflow_execution.BookingState{IsBooked: false, IsDone: true}
|
||||
for id, st := range exec.BookingsState {
|
||||
st.IsBooked = false
|
||||
exec.BookingsState[id] = st
|
||||
}
|
||||
// Graph items that already reached success/failure keep their state.
|
||||
// Items still in running when the execution terminates receive the terminal
|
||||
// state (the step was active but no step_done event arrived before the
|
||||
// workflow finished — treat it as the execution outcome).
|
||||
terminalSuccess := enum.BookingStatus(evt.State) == enum.SUCCESS
|
||||
nowGraph := time.Now().UTC()
|
||||
for itemID, item := range exec.Graph {
|
||||
if item.State == workflow_execution.StepRunning {
|
||||
exec.Graph.MarkDone(itemID, terminalSuccess, nowGraph)
|
||||
}
|
||||
}
|
||||
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
|
||||
|
||||
// Build a set of booking IDs already covered by per-step events so we only
|
||||
// fall back for bookings the orchestrator never emitted a step for (e.g. storage).
|
||||
coveredByStep := map[string]bool{}
|
||||
for _, step := range evt.Steps {
|
||||
applyStepToBooking(step, adminReq)
|
||||
coveredByStep[step.BookingID] = true
|
||||
}
|
||||
|
||||
// Propagate the execution's terminal state to any booking that was not
|
||||
// updated by a step event and is not already in a terminal state.
|
||||
terminalState := enum.BookingStatus(evt.State)
|
||||
now := time.Now().UTC()
|
||||
for id := range exec.BookingsState {
|
||||
if coveredByStep[id] {
|
||||
continue
|
||||
}
|
||||
res, _, err := booking.NewAccessor(adminReq).LoadOne(id)
|
||||
if err != nil || res == nil {
|
||||
continue
|
||||
}
|
||||
bk := res.(*booking.Booking)
|
||||
if terminalExecStates[bk.State] {
|
||||
continue
|
||||
}
|
||||
bk.State = terminalState
|
||||
bk.RealEndDate = &now
|
||||
utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
|
||||
}
|
||||
|
||||
self, err := oclib.GetMySelf()
|
||||
if err == nil && self != nil {
|
||||
go planner.GetPlannerService().RefreshSelf(self.PeerID, adminReq)
|
||||
@@ -416,6 +495,8 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
|
||||
return
|
||||
}
|
||||
adminReq := &tools.APIRequest{Admin: true}
|
||||
|
||||
// Update the booking itself first (no exec lock needed for the booking doc).
|
||||
res, _, err := booking.NewAccessor(adminReq).LoadOne(evt.BookingID)
|
||||
if err != nil || res == nil {
|
||||
return
|
||||
@@ -430,20 +511,56 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
|
||||
}
|
||||
utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
|
||||
|
||||
// Update BookingsState in the parent WorkflowExecution: resource released, step done
|
||||
execRes, _, execErr := workflow_execution.NewAccessor(adminReq).LoadOne(bk.ExecutionID)
|
||||
if execErr == nil && execRes != nil {
|
||||
exec := execRes.(*workflow_execution.WorkflowExecution)
|
||||
if exec.BookingsState == nil {
|
||||
exec.BookingsState = map[string]workflow_execution.BookingState{}
|
||||
}
|
||||
st := exec.BookingsState[evt.BookingID]
|
||||
st.IsBooked = false
|
||||
st.IsDone = true
|
||||
exec.BookingsState[evt.BookingID] = st
|
||||
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
|
||||
// Update the parent WorkflowExecution under its exec lock to avoid races
|
||||
// between concurrent WORKFLOW_STEP_DONE_EVENT deliveries.
|
||||
execID := bk.ExecutionID
|
||||
mu := GetExecLock(execID)
|
||||
if mu != nil {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
}
|
||||
|
||||
execRes, _, execErr := workflow_execution.NewAccessor(adminReq).LoadOne(execID)
|
||||
if execErr != nil || execRes == nil {
|
||||
return
|
||||
}
|
||||
exec := execRes.(*workflow_execution.WorkflowExecution)
|
||||
|
||||
// BookingsState: resource released, step done.
|
||||
if exec.BookingsState == nil {
|
||||
exec.BookingsState = map[string]workflow_execution.BookingState{}
|
||||
}
|
||||
st := exec.BookingsState[evt.BookingID]
|
||||
st.IsBooked = false
|
||||
st.IsDone = true
|
||||
exec.BookingsState[evt.BookingID] = st
|
||||
|
||||
// Advance the execution graph.
|
||||
if len(exec.Graph) > 0 {
|
||||
itemID := findItemIDByBookingID(exec, evt.BookingID)
|
||||
if itemID != "" {
|
||||
success := enum.BookingStatus(evt.State) == enum.SUCCESS
|
||||
end := time.Now().UTC()
|
||||
if evt.RealEnd != nil {
|
||||
end = *evt.RealEnd
|
||||
}
|
||||
exec.Graph.MarkDone(itemID, success, end)
|
||||
|
||||
// Only advance when the step succeeded; a failure leaves dependents waiting.
|
||||
if success {
|
||||
start := end
|
||||
if evt.RealStart != nil {
|
||||
start = *evt.RealStart
|
||||
}
|
||||
for _, nextID := range exec.Graph.ReadyToRun() {
|
||||
exec.Graph.MarkRunning(nextID, start)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
|
||||
|
||||
switch bk.State {
|
||||
case enum.SUCCESS, enum.FAILURE, enum.FORGOTTEN, enum.CANCELLED:
|
||||
self, err := oclib.GetMySelf()
|
||||
@@ -453,6 +570,21 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
|
||||
}
|
||||
}
|
||||
|
||||
// findItemIDByBookingID reverse-looks up a booking ID in PeerBookByGraph.
|
||||
// PeerBookByGraph layout: map[peerID]map[itemID][]bookingID
|
||||
func findItemIDByBookingID(exec *workflow_execution.WorkflowExecution, bookingID string) string {
|
||||
for _, byItem := range exec.PeerBookByGraph {
|
||||
for itemID, bookingIDs := range byItem {
|
||||
for _, bkID := range bookingIDs {
|
||||
if bkID == bookingID {
|
||||
return itemID
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func applyStepToBooking(step tools.StepMetric, adminReq *tools.APIRequest) {
|
||||
res, _, err := booking.NewAccessor(adminReq).LoadOne(step.BookingID)
|
||||
if err != nil || res == nil {
|
||||
|
||||
Reference in New Issue
Block a user