// Package planner caches peer planner snapshots and decides when a workflow's
// resources can be booked.
package planner

import (
	"encoding/json"
	"fmt"
	"oc-scheduler/infrastructure/utils"
	"slices"
	"sync"
	"time"

	oclib "cloud.o-forge.io/core/oc-lib"
	"cloud.o-forge.io/core/oc-lib/models/booking/planner"
	"cloud.o-forge.io/core/oc-lib/models/resources"
	"cloud.o-forge.io/core/oc-lib/models/workflow"
	"cloud.o-forge.io/core/oc-lib/models/workflow/graph"
	"cloud.o-forge.io/core/oc-lib/tools"
)

const (
	checkWindowHours = 5               // forward-scan window used by findNextSlot (hours)
	checkStepMin     = 15              // time increment per scan step (minutes)
	plannerTTL       = 24 * time.Hour  // lifetime of a cached planner snapshot before eviction
)

// ---------------------------------------------------------------------------
// Planner cache — protected by PlannerService.Mu
// ---------------------------------------------------------------------------

// plannerEntry wraps a planner snapshot with refresh-ownership tracking.
// At most one check session may be the "refresh owner" of a given peer's
// planner at a time: it emits PB_PLANNER to request a fresh snapshot from
// oc-discovery and, on close (clean or forced), emits PB_CLOSE_PLANNER to
// release the stream. Any subsequent session that needs the same peer's
// planner will see Refreshing=true and skip the duplicate request.
type plannerEntry struct {
	Planner      *planner.Planner
	Refreshing   bool   // true while a PB_PLANNER request is in flight
	RefreshOwner string // session UUID that initiated the current refresh
}

// PlannerService is the process-wide planner cache plus its subscription
// machinery. Mu guards Cache and AddedAt; SubMu guards Subs; WorkflowSubMu
// guards WorkflowSubs.
type PlannerService struct {
	Mu            sync.RWMutex
	Cache         map[string]*plannerEntry   // peerID -> latest snapshot + refresh state
	SubMu         sync.RWMutex
	Subs          map[string][]chan string   // peerID -> planner-update subscribers
	AddedAt       map[string]time.Time       // peerID -> first insertion time (TTL base)
	WorkflowSubMu sync.RWMutex
	WorkflowSubs  map[string][]chan struct{} // workflowID -> workflow-update subscribers
}

// singleton is set once by InitPlanner and read via GetPlannerService.
var singleton *PlannerService

// InitPlanner bootstraps our own planner entry at startup.
// It waits (with 15-second retries) for our peer record to be present in the
// database before generating the first planner snapshot and broadcasting it
// on PB_PLANNER. This handles the race between oc-scheduler starting before
// oc-peer has fully registered our node.
func InitPlanner() { singleton = &PlannerService{ AddedAt: map[string]time.Time{}, Subs: map[string][]chan string{}, Cache: map[string]*plannerEntry{}, WorkflowSubs: map[string][]chan struct{}{}, } for { self, err := oclib.GetMySelf() if err != nil || self == nil { fmt.Println("InitPlanner: self peer not found yet, retrying in 15s...") time.Sleep(15 * time.Second) continue } singleton.RefreshSelf(self.PeerID, &tools.APIRequest{Admin: true}) return } } func GetPlannerService() *PlannerService { return singleton } func (s *PlannerService) HandleStore(resp tools.NATSResponse) { m := map[string]interface{}{} p := planner.Planner{} if err := json.Unmarshal(resp.Payload, &m); err != nil { return } if err := json.Unmarshal(resp.Payload, &p); err != nil { fmt.Println("RETRIEVE PLANNER ERR", err) return } s.Store(fmt.Sprintf("%v", m["peer_id"]), &p) } // missingPlannerPeers returns the peer IDs from res whose planner is absent // or not yet populated in PlannerCache. // func missingPlannerPeers(res map[string]bookingResource) []string { func (s *PlannerService) MissingPeers(res map[string]utils.BookingResource) []string { var out []string for _, r := range res { s.Mu.RLock() entry := s.Cache[r.PeerPID] s.Mu.RUnlock() if entry == nil || entry.Planner == nil { out = append(out, r.PeerPID) } } return out } func (s *PlannerService) FindDate(wfID string, checkables map[string]utils.BookingResource, start time.Time, end *time.Time, preemption bool, asap bool) (time.Time, *time.Time, bool, bool, []string) { var unavailable, warnings []string // 4. Preemption: Planify ran (end is resolved), skip availability check. if preemption { return start, end, true, true, warnings } // 5b. For any peer whose planner is not yet cached, request it and wait // briefly so the decision is based on real data rather than a blind // "assume available". The wait is capped to avoid blocking the caller // when oc-discovery is unreachable. 
s.Fill(checkables, wfID) unavailable, warnings = s.checkResourceAvailability(checkables, start, end) if len(unavailable) == 0 { //result.Available = true return start, end, true, false, warnings } // 6. as_possible: find and commit to the next free slot. if asap { next := s.findNextSlot(checkables, start, end, checkWindowHours) if next != nil { if end != nil { duration := end.Sub(start) // capture before overwriting start e := next.Add(duration) end = &e } start = *next return start, end, true, false, warnings } else { return start, end, false, false, warnings } } return start, end, false, false, warnings } func (s *PlannerService) Fill(checkables map[string]utils.BookingResource, wfID string) { // Collect all peers involved in this check (not just missing ones). // We always re-request every peer because PB_CLOSE_PLANNER is emitted // after each check session, which stops the remote stream. The cached // snapshot may therefore be stale: re-fetching ensures the check is made // against up-to-date availability data. all := s.allPeers(checkables) if len(all) == 0 { return } const plannerFetchTimeout = 5 * time.Second tmpSession := "check-oneshot-" + wfID // Mark pending entries and clear any stale planner so the wait loop below // will not return early with an old snapshot. s.Mu.Lock() myself, _ := oclib.GetMySelf() for _, peerID := range all { entry := s.Cache[peerID] if entry == nil { entry = &plannerEntry{} s.Cache[peerID] = entry s.AddedAt[peerID] = time.Now().UTC() go s.EvictAfter(peerID, plannerTTL) } // Reset so MissingPeers sees it as absent until the fresh snapshot arrives. 
entry.Planner = nil if !entry.Refreshing { entry.Refreshing = true entry.RefreshOwner = tmpSession } } s.Mu.Unlock() defer s.ReleaseRefreshOwnership(all, tmpSession) for _, peerID := range all { if myself != nil && myself.PeerID == peerID { go s.RefreshSelf(peerID, &tools.APIRequest{Admin: true}) } else { payload, _ := json.Marshal(map[string]any{"peer_id": peerID}) utils.Propalgate(peerID, tools.PropalgationMessage{ Action: tools.PB_PLANNER, Payload: payload, }) } } deadline := time.Now().Add(plannerFetchTimeout) for { remaining := s.MissingPeers(checkables) if len(remaining) == 0 { return } wait := time.Until(deadline) if wait <= 0 { return } ch, cancelSub := SubscribeUpdates(s.Subs, &s.SubMu, remaining...) select { case <-ch: case <-time.After(wait): } cancelSub() } } // allPeers returns the deduplicated list of peer IDs for all checkable resources. func (s *PlannerService) allPeers(res map[string]utils.BookingResource) []string { seen := map[string]struct{}{} var out []string for _, r := range res { if _, ok := seen[r.PeerPID]; !ok { seen[r.PeerPID] = struct{}{} out = append(out, r.PeerPID) } } return out } // evictAfter waits ttl from first insertion then deletes the cache entry and // emits PB_CLOSE_PLANNER so oc-discovery stops streaming for this peer. // This is the only path that actually removes an entry from PlannerCache; // session close (ReleaseRefreshOwnership) only resets ownership state. func (s *PlannerService) EvictAfter(peerID string, ttl time.Duration) { time.Sleep(ttl) s.Mu.Lock() _, exists := s.Cache[peerID] if exists { delete(s.Cache, peerID) delete(s.AddedAt, peerID) } s.Mu.Unlock() if exists { utils.Notify(&s.SubMu, s.Subs, peerID, peerID) utils.Propalgate(peerID, tools.PropalgationMessage{Action: tools.PB_CLOSE_PLANNER}) } } // SubscribePlannerUpdates registers interest in planner changes for the given // peer IDs. The returned channel receives the peerID string (non-blocking) each // time any of those planners is updated. 
Call cancel to unregister. func SubscribeUpdates[T interface{}](subs map[string][]chan T, mu *sync.RWMutex, updates ...string) (<-chan T, func()) { ch := make(chan T, 1) mu.Lock() for _, k := range updates { subs[k] = append(subs[k], ch) } mu.Unlock() cancel := func() { mu.Lock() for _, k := range updates { subsk := subs[k] for i, s := range subsk { if s == ch { subs[k] = append(subsk[:i], subsk[i+1:]...) break } } } mu.Unlock() } return ch, cancel } // --------------------------------------------------------------------------- // Cache helpers // --------------------------------------------------------------------------- func (s *PlannerService) Store(peerID string, p *planner.Planner) { if s == nil { fmt.Println("PLANNER IS NULL") return } s.Mu.Lock() entry := s.Cache[peerID] isNew := entry == nil if isNew { entry = &plannerEntry{} s.Cache[peerID] = entry s.AddedAt[peerID] = time.Now().UTC() go s.EvictAfter(peerID, plannerTTL) } entry.Planner = p s.Cache[peerID] = entry s.Mu.Unlock() utils.Notify(&s.SubMu, s.Subs, peerID, peerID) } // --------------------------------------------------------------------------- // Planner refresh / broadcast // --------------------------------------------------------------------------- // RequestPlannerRefresh asks oc-discovery for a fresh planner snapshot for // each peer in peerIDs. Only the first session to request a given peer becomes // its "refresh owner": subsequent sessions see Refreshing=true and skip the // duplicate PB_PLANNER emission. Returns the subset of peerIDs for which this // session claimed ownership (needed to release on close). 
// RequestPlannerRefresh func (s *PlannerService) Refresh(peerIDs []string, executionsID string) []string { var owned []string for _, peerID := range peerIDs { s.Mu.Lock() entry := s.Cache[peerID] if entry == nil { entry = &plannerEntry{} s.Cache[peerID] = entry s.AddedAt[peerID] = time.Now().UTC() go s.EvictAfter(peerID, plannerTTL) } shouldRequest := !entry.Refreshing if shouldRequest { entry.Refreshing = true entry.RefreshOwner = executionsID } s.Mu.Unlock() if shouldRequest { owned = append(owned, peerID) if p, err := oclib.GetMySelf(); err == nil && p != nil && p.PeerID == peerID { go s.RefreshSelf(peerID, &tools.APIRequest{Admin: true}) } else { payload, _ := json.Marshal(map[string]any{"peer_id": peerID}) utils.Propalgate(peerID, tools.PropalgationMessage{ Action: tools.PB_PLANNER, Payload: payload, }) } } } return owned } // ReleaseRefreshOwnership is called when a check session closes (clean or // forced). For each peer this session owns, it resets the refresh state and // emits PB_CLOSE_PLANNER so oc-discovery stops the planner stream. // The planner data itself stays in the cache until TTL eviction. func (s *PlannerService) ReleaseRefreshOwnership(peerIDs []string, executionsID string) { for _, peerID := range peerIDs { s.Mu.Lock() if entry := s.Cache[peerID]; entry != nil && entry.RefreshOwner == executionsID { entry.Refreshing = false entry.RefreshOwner = "" } s.Mu.Unlock() utils.Notify(&s.SubMu, s.Subs, peerID, peerID) payload, _ := json.Marshal(map[string]any{"peer_id": peerID}) utils.Propalgate(peerID, tools.PropalgationMessage{ Action: tools.PB_CLOSE_PLANNER, Payload: payload, }) } } // broadcastPlanner iterates the storage and compute peers of the given workflow // and, for each peer not yet in the cache, emits a PB_PLANNER propagation so // downstream consumers (oc-discovery, other schedulers) refresh their state. 
func (s *PlannerService) Broadcast(wf *workflow.Workflow) { if wf.Graph == nil { return } items := []graph.GraphItem{} items = append(items, wf.GetGraphItems(wf.Graph.IsStorage)...) items = append(items, wf.GetGraphItems(wf.Graph.IsCompute)...) seen := []string{} for _, item := range items { _, res := item.GetResource() if res == nil { continue } creatorID := res.GetCreatorID() if slices.Contains(seen, creatorID) { continue } data := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).LoadOne(creatorID) p := data.ToPeer() if p == nil { continue } s.Mu.RLock() cached := s.Cache[p.PeerID] s.Mu.RUnlock() // Only request if no snapshot and no refresh already in flight. if cached == nil || (cached.Planner == nil && !cached.Refreshing) { payload, err := json.Marshal(map[string]interface{}{"peer_id": p.PeerID}) if err != nil { continue } seen = append(seen, creatorID) utils.Propalgate(p.PeerID, tools.PropalgationMessage{ Action: tools.PB_PLANNER, Payload: payload, }) } } } // --------------------------------------------------------------------------- // Self-planner refresh // --------------------------------------------------------------------------- func (s *PlannerService) RefreshSelf(peerID string, request *tools.APIRequest) { p, err := planner.GenerateShallow(request) if err != nil { fmt.Println("refreshSelfPlanner: could not generate planner:", err) return } // Update the local cache and notify any waiting CheckStream goroutines. s.Store(peerID, p) // Broadcast the updated planner so remote peers (and oc-discovery) can // refresh their view of our availability. 
type plannerWithPeer struct { PeerID string `json:"peer_id"` *planner.Planner } plannerPayload, err := json.Marshal(plannerWithPeer{PeerID: peerID, Planner: p}) if err != nil { return } utils.Propalgate(peerID, tools.PropalgationMessage{ Action: tools.PB_PLANNER, Payload: plannerPayload, }) } // findNextSlot scans forward from 'from' in checkStepMin increments for up to // windowH hours and returns the first candidate start time at which all // resources are simultaneously free. func (s *PlannerService) findNextSlot(resources map[string]utils.BookingResource, from time.Time, originalEnd *time.Time, windowH int) *time.Time { duration := 5 * time.Minute if originalEnd != nil { if d := originalEnd.Sub(from); d > 0 { duration = d } } step := time.Duration(checkStepMin) * time.Minute limit := from.Add(time.Duration(windowH) * time.Hour) for t := from.Add(step); t.Before(limit); t = t.Add(step) { e := t.Add(duration) if unavail, _ := s.checkResourceAvailability(resources, t, &e); len(unavail) == 0 { return &t } } return nil } // checkResourceAvailability returns the IDs of unavailable resources and // human-readable warning messages. 
func (s *PlannerService) checkResourceAvailability(res map[string]utils.BookingResource, start time.Time, end *time.Time) (unavailable []string, warnings []string) { for _, r := range res { s.Mu.RLock() entry := s.Cache[r.PeerPID] s.Mu.RUnlock() fmt.Println("Retrieve", r.PeerPID, s.Cache, entry.Planner) if entry == nil { unavailable = append(unavailable, r.ID) warnings = append(warnings, fmt.Sprintf( "resource %s is not available in [%s – %s] : Missing Planner", r.ID, start.Format(time.RFC3339), utils.FormatOptTime(end))) continue } if entry.Planner == nil { continue } if !s.checkInstance(entry.Planner, r.ID, r.InstanceID, start, end) { unavailable = append(unavailable, r.ID) warnings = append(warnings, fmt.Sprintf( "resource %s is not available in [%s – %s]", r.ID, start.Format(time.RFC3339), utils.FormatOptTime(end))) } } return } // CheckResourceInstance checks whether a resource/instance is available on the // local planner cache for the given peer. Called by scheduling_resources when // validating an incoming booking creation. func (s *PlannerService) CheckResourceInstance(peerID, resourceID, instanceID string, start time.Time, end *time.Time) bool { s.Mu.RLock() entry := s.Cache[peerID] s.Mu.RUnlock() if entry == nil || entry.Planner == nil { return true // no planner cached → assume available } return s.checkInstance(entry.Planner, resourceID, instanceID, start, end) } // SubscribePlannerUpdates returns a channel that receives a peerID each time // one of the given peers' planners is updated. func (s *PlannerService) SubscribePlannerUpdates(peerIDs ...string) (<-chan string, func()) { return SubscribeUpdates(s.Subs, &s.SubMu, peerIDs...) } // SubscribeWorkflowUpdates returns a channel signalled when the workflow changes. func (s *PlannerService) SubscribeWorkflowUpdates(wfID string) (<-chan struct{}, func()) { return SubscribeUpdates(s.WorkflowSubs, &s.WorkflowSubMu, wfID) } // NotifyWorkflow signals all subscribers watching wfID. 
func (s *PlannerService) NotifyWorkflow(wfID string) {
	utils.Notify(&s.WorkflowSubMu, s.WorkflowSubs, wfID, struct{}{})
}

// FillForPeers fetches and waits for planners for an explicit list of peer PIDs.
// Same mechanic as Fill but decoupled from the BookingResource map — used for
// dynamic resource resolution where the peer set is not part of checkables.
func (s *PlannerService) FillForPeers(peerPIDs []string, wfID string) {
	if len(peerPIDs) == 0 {
		return
	}
	const plannerFetchTimeout = 5 * time.Second
	tmpSession := "check-dynamic-" + wfID

	// Resolve our own identity BEFORE taking the lock: GetMySelf hits the
	// database and must not extend the critical section (previously it ran
	// while holding s.Mu).
	myself, _ := oclib.GetMySelf()

	// Mark pending entries and clear stale snapshots so the wait loop below
	// only completes on freshly delivered planners.
	s.Mu.Lock()
	for _, peerID := range peerPIDs {
		entry := s.Cache[peerID]
		if entry == nil {
			entry = &plannerEntry{}
			s.Cache[peerID] = entry
			s.AddedAt[peerID] = time.Now().UTC()
			go s.EvictAfter(peerID, plannerTTL)
		}
		entry.Planner = nil
		if !entry.Refreshing {
			entry.Refreshing = true
			entry.RefreshOwner = tmpSession
		}
	}
	s.Mu.Unlock()
	defer s.ReleaseRefreshOwnership(peerPIDs, tmpSession)

	// Emit the requests: self-planners regenerate locally, remote peers get
	// a PB_PLANNER propagation.
	for _, peerID := range peerPIDs {
		if myself != nil && myself.PeerID == peerID {
			go s.RefreshSelf(peerID, &tools.APIRequest{Admin: true})
		} else {
			payload, _ := json.Marshal(map[string]any{"peer_id": peerID})
			utils.Propalgate(peerID, tools.PropalgationMessage{
				Action:  tools.PB_PLANNER,
				Payload: payload,
			})
		}
	}

	// Wait (bounded) until every requested planner arrived.
	deadline := time.Now().Add(plannerFetchTimeout)
	remaining := slices.Clone(peerPIDs)
	for len(remaining) > 0 {
		wait := time.Until(deadline)
		if wait <= 0 {
			return
		}
		ch, cancelSub := SubscribeUpdates(s.Subs, &s.SubMu, remaining...)
		select {
		case <-ch:
		case <-time.After(wait):
			cancelSub()
			return
		}
		cancelSub()
		// Recompute the still-missing subset after each wake-up.
		remaining = remaining[:0]
		s.Mu.RLock()
		for _, pid := range peerPIDs {
			if entry := s.Cache[pid]; entry == nil || entry.Planner == nil {
				remaining = append(remaining, pid)
			}
		}
		s.Mu.RUnlock()
	}
}

// FillDynamic resolves all peer DIDs across the given dynamic resources to PIDs,
// fetches their planners via FillForPeers, and returns the DID→PID mapping for use
// in ResolveDynamic.
// All dynamics are batched into a single planner fetch round.
func (s *PlannerService) FillDynamic(dynamics []*resources.DynamicResource, wfID string) map[string]string {
	didToPID := map[string]string{}
	peerPIDs := []string{}
	// NOTE(review): sibling code builds this request with
	// oclib.LibDataEnum(oclib.PEER); here the enum comes from tools.PEER —
	// confirm both constants resolve to the same value.
	access := oclib.NewRequestAdmin(oclib.LibDataEnum(tools.PEER), nil)
	for _, d := range dynamics {
		for _, did := range d.PeerIds {
			// Skip empty DIDs and DIDs already resolved in a previous pass.
			if did == "" || didToPID[did] != "" {
				continue
			}
			if data := access.LoadOne(did); data.Data != nil {
				if p := data.ToPeer(); p != nil {
					didToPID[did] = p.PeerID
					peerPIDs = append(peerPIDs, p.PeerID)
				}
			}
		}
	}
	// One batched planner fetch for every resolved peer.
	s.FillForPeers(peerPIDs, wfID)
	return didToPID
}

// ResolveDynamic walks the sorted instance list of a DynamicResource via
// GetSelectedInstance and returns true as soon as it finds an instance whose
// peer's planner confirms availability for [start, end].
// d.SelectedIndex is updated to the elected instance on success.
// Peers that did not respond (no planner in cache) are skipped.
//
// NOTE(review): this loop relies on GetSelectedInstance advancing
// d.SelectedIndex on every call; if it does not, the `continue` branches
// below would spin forever on an unresolved DID — TODO confirm against the
// DynamicResource implementation.
func (s *PlannerService) ResolveDynamic(d *resources.DynamicResource, didToPID map[string]string, start time.Time, end *time.Time) bool {
	for {
		inst := d.GetSelectedInstance(nil)
		if inst == nil {
			return false // exhausted all candidates
		}
		did := d.PeerIds[d.SelectedIndex]
		resourceID := d.ResourceIds[d.SelectedIndex]
		pid, ok := didToPID[did]
		if !ok {
			continue // peer DID could not be resolved
		}
		s.Mu.RLock()
		entry := s.Cache[pid]
		s.Mu.RUnlock()
		if entry == nil || entry.Planner == nil {
			continue // peer did not respond in time
		}
		if s.checkInstance(entry.Planner, resourceID, inst.GetID(), start, end) {
			return true // d.SelectedIndex points to the elected instance
		}
	}
}

// checkInstance checks availability for the specific instance resolved by the
// scheduler. When instanceID is empty (no instance selected / none resolvable),
// it falls back to checking all instances known in the planner and returns true
// if any one has remaining capacity. Returns true when no capacity is recorded.
func (s *PlannerService) checkInstance(p *planner.Planner, resourceID string, instanceID string, start time.Time, end *time.Time) bool { if instanceID != "" { return p.Check(resourceID, instanceID, nil, start, end) } caps, ok := p.Capacities[resourceID] if !ok || len(caps) == 0 { return true } for id := range caps { if p.Check(resourceID, id, nil, start, end) { return true } } return false }