oc-scheduler -> scheduling + logs

This commit is contained in:
mr
2026-04-08 10:05:27 +02:00
parent f8a6e69ef3
commit 1d63d31442
21 changed files with 4605 additions and 139 deletions

View File

@@ -99,9 +99,11 @@ func UpdateExecutionState(payload []byte, dt tools.DataType) {
switch dt {
case tools.BOOKING:
if exec.BookingsState == nil {
exec.BookingsState = map[string]bool{}
exec.BookingsState = map[string]workflow_execution.BookingState{}
}
exec.BookingsState[data.ID] = true
st := exec.BookingsState[data.ID]
st.IsBooked = true
exec.BookingsState[data.ID] = st
case tools.PURCHASE_RESOURCE:
if exec.PurchasesState == nil {
exec.PurchasesState = map[string]bool{}
@@ -111,7 +113,7 @@ func UpdateExecutionState(payload []byte, dt tools.DataType) {
allConfirmed := true
for _, st := range exec.BookingsState {
if !st {
if !st.IsBooked {
allConfirmed = false
break
}
@@ -145,7 +147,7 @@ func confirmSessionOrder(executionsID string, adminReq *tools.APIRequest) {
results, _, _ := order.NewAccessor(adminReq).Search(
&dbs.Filters{And: map[string][]dbs.Filter{
"executions_id": {{Operator: dbs.EQUAL.String(), Value: executionsID}},
}}, "", true)
}}, "", true, 0, 10000)
for _, obj := range results {
if o, ok := obj.(*order.Order); ok {
o.IsDraft = false
@@ -301,7 +303,7 @@ func Unschedule(executionID string, request *tools.APIRequest) error {
func RecoverDraft() {
adminReq := &tools.APIRequest{Admin: true}
results, _, _ := workflow_execution.NewAccessor(adminReq).Search(nil, "*", true)
results, _, _ := workflow_execution.NewAccessor(adminReq).Search(nil, "*", true, 0, 10000)
for _, obj := range results {
exec, ok := obj.(*workflow_execution.WorkflowExecution)
if !ok {
@@ -350,6 +352,13 @@ func HandleWorkflowDone(resp tools.NATSResponse) {
if evt.RealEnd != nil {
exec.EndDate = evt.RealEnd
}
// All bookings are no longer reserved and are done
if exec.BookingsState == nil {
exec.BookingsState = map[string]workflow_execution.BookingState{}
}
for id := range exec.BookingsState {
exec.BookingsState[id] = workflow_execution.BookingState{IsBooked: false, IsDone: true}
}
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
for _, step := range evt.Steps {
applyStepToBooking(step, adminReq)
@@ -379,6 +388,21 @@ func HandleWorkflowStepDone(resp tools.NATSResponse) {
bk.RealEndDate = evt.RealEnd
}
utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
// Update BookingsState in the parent WorkflowExecution: resource released, step done
execRes, _, execErr := workflow_execution.NewAccessor(adminReq).LoadOne(bk.ExecutionID)
if execErr == nil && execRes != nil {
exec := execRes.(*workflow_execution.WorkflowExecution)
if exec.BookingsState == nil {
exec.BookingsState = map[string]workflow_execution.BookingState{}
}
st := exec.BookingsState[evt.BookingID]
st.IsBooked = false
st.IsDone = true
exec.BookingsState[evt.BookingID] = st
utils.GenericRawUpdateOne(exec, exec.GetID(), workflow_execution.NewAccessor(adminReq))
}
switch bk.State {
case enum.SUCCESS, enum.FAILURE, enum.FORGOTTEN, enum.CANCELLED:
self, err := oclib.GetMySelf()
@@ -439,7 +463,7 @@ func scanStaleExecutions() error {
res := oclib.NewRequest(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), "", myself.GetID(), []string{}, nil).
Search(&dbs.Filters{And: map[string][]dbs.Filter{
"execution_date": {{Operator: dbs.LTE.String(), Value: primitive.NewDateTimeFromTime(deadline)}},
}}, "", false)
}}, "", false, 0, 10000)
if res.Err != "" {
return fmt.Errorf("stale execution search failed: %s", res.Err)
}

View File

@@ -84,6 +84,7 @@ func (s *PlannerService) HandleStore(resp tools.NATSResponse) {
return
}
if err := json.Unmarshal(resp.Payload, &p); err != nil {
fmt.Println("RETRIEVE PLANNER ERR", err)
return
}
s.Store(fmt.Sprintf("%v", m["peer_id"]), &p)
@@ -128,11 +129,12 @@ func (s *PlannerService) FindDate(wfID string, checkables map[string]utils.Booki
if asap {
next := s.findNextSlot(checkables, start, end, checkWindowHours)
if next != nil {
start = *next
if end != nil {
shifted := next.Add(end.Sub(start))
end = &shifted
duration := end.Sub(start) // capture before overwriting start
e := next.Add(duration)
end = &e
}
start = *next
return start, end, true, false, warnings
} else {
return start, end, false, false, warnings
@@ -142,20 +144,84 @@ func (s *PlannerService) FindDate(wfID string, checkables map[string]utils.Booki
}
func (s *PlannerService) Fill(checkables map[string]utils.BookingResource, wfID string) {
if missing := s.MissingPeers(checkables); len(missing) > 0 {
const plannerFetchTimeout = 2 * time.Second
tmpSession := "check-oneshot-" + wfID
ch, cancelSub := SubscribeUpdates(s.Subs, &s.SubMu, missing...)
owned := s.Refresh(missing, tmpSession)
// Collect all peers involved in this check (not just missing ones).
// We always re-request every peer because PB_CLOSE_PLANNER is emitted
// after each check session, which stops the remote stream. The cached
// snapshot may therefore be stale: re-fetching ensures the check is made
// against up-to-date availability data.
all := s.allPeers(checkables)
if len(all) == 0 {
return
}
const plannerFetchTimeout = 5 * time.Second
tmpSession := "check-oneshot-" + wfID
// Mark pending entries and clear any stale planner so the wait loop below
// will not return early with an old snapshot.
s.Mu.Lock()
myself, _ := oclib.GetMySelf()
for _, peerID := range all {
entry := s.Cache[peerID]
if entry == nil {
entry = &plannerEntry{}
s.Cache[peerID] = entry
s.AddedAt[peerID] = time.Now().UTC()
go s.EvictAfter(peerID, plannerTTL)
}
// Reset so MissingPeers sees it as absent until the fresh snapshot arrives.
entry.Planner = nil
if !entry.Refreshing {
entry.Refreshing = true
entry.RefreshOwner = tmpSession
}
}
s.Mu.Unlock()
defer s.ReleaseRefreshOwnership(all, tmpSession)
for _, peerID := range all {
if myself != nil && myself.PeerID == peerID {
go s.RefreshSelf(peerID, &tools.APIRequest{Admin: true})
} else {
payload, _ := json.Marshal(map[string]any{"peer_id": peerID})
utils.Propalgate(peerID, tools.PropalgationMessage{
Action: tools.PB_PLANNER,
Payload: payload,
})
}
}
deadline := time.Now().Add(plannerFetchTimeout)
for {
remaining := s.MissingPeers(checkables)
if len(remaining) == 0 {
return
}
wait := time.Until(deadline)
if wait <= 0 {
return
}
ch, cancelSub := SubscribeUpdates(s.Subs, &s.SubMu, remaining...)
select {
case <-ch:
case <-time.After(plannerFetchTimeout):
case <-time.After(wait):
}
cancelSub()
s.ReleaseRefreshOwnership(owned, tmpSession)
}
}
// allPeers returns the deduplicated list of peer IDs for all checkable resources.
func (s *PlannerService) allPeers(res map[string]utils.BookingResource) []string {
seen := map[string]struct{}{}
var out []string
for _, r := range res {
if _, ok := seen[r.PeerPID]; !ok {
seen[r.PeerPID] = struct{}{}
out = append(out, r.PeerPID)
}
}
return out
}
// evictAfter waits ttl from first insertion then deletes the cache entry and
// emits PB_CLOSE_PLANNER so oc-discovery stops streaming for this peer.
// This is the only path that actually removes an entry from PlannerCache;
@@ -206,6 +272,10 @@ func SubscribeUpdates[T interface{}](subs map[string][]chan T, mu *sync.RWMutex,
// ---------------------------------------------------------------------------
func (s *PlannerService) Store(peerID string, p *planner.Planner) {
if s == nil {
fmt.Println("PLANNER IS NULL")
return
}
s.Mu.Lock()
entry := s.Cache[peerID]
isNew := entry == nil
@@ -216,8 +286,9 @@ func (s *PlannerService) Store(peerID string, p *planner.Planner) {
go s.EvictAfter(peerID, plannerTTL)
}
entry.Planner = p
s.Cache[peerID] = entry
s.Mu.Unlock()
utils.Notify[string](&s.SubMu, s.Subs, peerID, peerID)
utils.Notify(&s.SubMu, s.Subs, peerID, peerID)
}
// ---------------------------------------------------------------------------
@@ -388,9 +459,15 @@ func (s *PlannerService) checkResourceAvailability(res map[string]utils.BookingR
s.Mu.RLock()
entry := s.Cache[r.PeerPID]
s.Mu.RUnlock()
if entry == nil || entry.Planner == nil {
fmt.Println("Retrieve", r.PeerPID, s.Cache, entry.Planner)
if entry == nil {
unavailable = append(unavailable, r.ID)
warnings = append(warnings, fmt.Sprintf(
"peer %s planner not in cache for resource %s assuming available", r.PeerPID, r.ID))
"resource %s is not available in [%s %s] : Missing Planner",
r.ID, start.Format(time.RFC3339), utils.FormatOptTime(end)))
continue
}
if entry.Planner == nil {
continue
}
if !s.checkInstance(entry.Planner, r.ID, r.InstanceID, start, end) {
@@ -419,17 +496,17 @@ func (s *PlannerService) CheckResourceInstance(peerID, resourceID, instanceID st
// SubscribePlannerUpdates returns a channel that receives a peerID each time
// one of the given peers' planners is updated.
func (s *PlannerService) SubscribePlannerUpdates(peerIDs ...string) (<-chan string, func()) {
return SubscribeUpdates[string](s.Subs, &s.SubMu, peerIDs...)
return SubscribeUpdates(s.Subs, &s.SubMu, peerIDs...)
}
// SubscribeWorkflowUpdates returns a channel signalled when the workflow changes.
func (s *PlannerService) SubscribeWorkflowUpdates(wfID string) (<-chan struct{}, func()) {
return SubscribeUpdates[struct{}](s.WorkflowSubs, &s.WorkflowSubMu, wfID)
return SubscribeUpdates(s.WorkflowSubs, &s.WorkflowSubMu, wfID)
}
// NotifyWorkflow signals all subscribers watching wfID.
func (s *PlannerService) NotifyWorkflow(wfID string) {
utils.Notify[struct{}](&s.WorkflowSubMu, s.WorkflowSubs, wfID, struct{}{})
utils.Notify(&s.WorkflowSubMu, s.WorkflowSubs, wfID, struct{}{})
}
// checkInstance checks availability for the specific instance resolved by the

View File

@@ -40,15 +40,20 @@ type WorkflowSchedule struct {
DurationS float64 `json:"duration_s" default:"-1"`
Cron string `json:"cron,omitempty"`
BookingMode booking.BookingMode `json:"booking_mode,omitempty"`
SelectedInstances workflow.ConfigItem `json:"selected_instances"`
SelectedPartnerships workflow.ConfigItem `json:"selected_partnerships"`
SelectedBuyings workflow.ConfigItem `json:"selected_buyings"`
SelectedStrategies workflow.ConfigItem `json:"selected_strategies"`
BookingMode booking.BookingMode `json:"booking_mode,omitempty"`
SelectedInstances workflow.ConfigItem `json:"selected_instances"`
SelectedPartnerships workflow.ConfigItem `json:"selected_partnerships"`
SelectedBuyings workflow.ConfigItem `json:"selected_buyings"`
SelectedStrategies workflow.ConfigItem `json:"selected_strategies"`
SelectedBillingStrategy pricing.BillingStrategy `json:"selected_billing_strategy"`
// Confirm, when true, triggers Schedule() to confirm the drafts held by this session.
Confirm bool `json:"confirm,omitempty"`
// Asap and Preemption override the query-param mode on a per-message basis.
// nil means "not set" (keep previous value).
Asap *bool `json:"asap,omitempty"`
Preemption *bool `json:"preemption,omitempty"`
}
// CheckResult is the response payload for an availability check.
@@ -68,7 +73,6 @@ type CheckResult struct {
// Check verifies whether the requested slot is available across all resource peers.
func (ws *WorkflowSchedule) Check(wfID string, asap bool, preemption bool, request *tools.APIRequest) (*CheckResult, error) {
fmt.Println("CHECK", asap, "/", preemption)
obj, code, err := workflow.NewAccessor(request).LoadOne(wfID)
if code != 200 || err != nil {
msg := "could not load workflow " + wfID
@@ -81,7 +85,7 @@ func (ws *WorkflowSchedule) Check(wfID string, asap bool, preemption bool, reque
prepLead := conf.GetConfig().PrepLead()
start := ws.Start
if asap || start.IsZero() {
if asap || preemption || start.IsZero() {
start = time.Now().UTC().Add(prepLead)
} else if start.Before(time.Now().UTC().Add(prepLead)) {
// Explicit date is within the prep window — impossible to guarantee on time.
@@ -240,4 +244,3 @@ func (ws *WorkflowSchedule) GetDates() ([]Schedule, error) {
}
return schedule, nil
}

View File

@@ -7,6 +7,8 @@ import (
"sync"
"time"
"oc-scheduler/infrastructure/planner"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/models/booking"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
@@ -14,7 +16,6 @@ import (
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
"cloud.o-forge.io/core/oc-lib/models/utils"
"cloud.o-forge.io/core/oc-lib/tools"
"oc-scheduler/infrastructure/planner"
)
// ---------------------------------------------------------------------------
@@ -31,7 +32,7 @@ type SchedulingResourcesService struct {
var singleton *SchedulingResourcesService
func init() {
func InitSchedulingResource() {
singleton = &SchedulingResourcesService{}
}

View File

@@ -44,7 +44,7 @@ func (s *SessionExecutionsService) sessionIDFilter(field, id string) *dbs.Filter
func (s *SessionExecutionsService) loadSession(dt tools.DataType) []scheduling_resources.SchedulerObject {
results := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).Search(
s.sessionIDFilter("executions_id", s.ExecutionsSessionID), "", true)
s.sessionIDFilter("executions_id", s.ExecutionsSessionID), "", true, 0, 10000)
out := make([]scheduling_resources.SchedulerObject, 0, len(results.Data))
for _, obj := range results.Data {
out = append(out, scheduling_resources.ToSchedulerObject(dt, obj))
@@ -55,7 +55,7 @@ func (s *SessionExecutionsService) loadSession(dt tools.DataType) []scheduling_r
func (s *SessionExecutionsService) LoadSessionExecs() []*workflow_execution.WorkflowExecution {
adminReq := &tools.APIRequest{Admin: true}
results, _, _ := workflow_execution.NewAccessor(adminReq).Search(
s.sessionIDFilter("executions_id", s.ExecutionsSessionID), "", true)
s.sessionIDFilter("executions_id", s.ExecutionsSessionID), "", true, 0, 10000)
out := make([]*workflow_execution.WorkflowExecution, 0)
for _, obj := range results {
if exec, ok := obj.(*workflow_execution.WorkflowExecution); ok {
@@ -68,7 +68,7 @@ func (s *SessionExecutionsService) LoadSessionExecs() []*workflow_execution.Work
func (s *SessionExecutionsService) loadSessionOrder() *order.Order {
adminReq := &tools.APIRequest{Admin: true}
results, _, _ := order.NewAccessor(adminReq).Search(
s.sessionIDFilter("executions_id", s.ExecutionsSessionID), "", true)
s.sessionIDFilter("executions_id", s.ExecutionsSessionID), "", true, 0, 10000)
for _, obj := range results {
if o, ok := obj.(*order.Order); ok {
return o
@@ -174,8 +174,18 @@ func (s *SessionExecutionsService) upsertDrafts(
func (s *SessionExecutionsService) CleanupSession(request *tools.APIRequest) {
adminReq := &tools.APIRequest{Admin: true}
// Delete bookings and purchases directly by executions_id.
// We cannot rely on execution.Unschedule here because it uses
// exec.PeerBookByGraph which is empty during the draft/check phase.
for _, dt := range []tools.DataType{tools.BOOKING, tools.PURCHASE_RESOURCE} {
for _, obj := range s.loadSession(dt) {
scheduling_resources.GetService().Delete(dt, obj, request)
}
}
for _, exec := range s.LoadSessionExecs() {
execution.Unschedule(exec.GetID(), request)
execution.UnregisterExecLock(exec.GetID())
workflow_execution.NewAccessor(adminReq).DeleteOne(exec.GetID())
}
if o := s.loadSessionOrder(); o != nil {

View File

@@ -3,6 +3,7 @@ package utils
import (
"encoding/json"
"errors"
"fmt"
"sync"
"time"
@@ -177,6 +178,7 @@ func Notify[T interface{}](mu *sync.RWMutex, registry map[string][]chan T, key s
func Propalgate(peerID string, message tools.PropalgationMessage) {
b, _ := json.Marshal(message)
fmt.Println("Propalgate")
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
FromApp: "oc-scheduler",
Datatype: -1,