oc-scheduler -> scheduling + logs

This commit is contained in:
mr
2026-04-08 10:05:27 +02:00
parent f8a6e69ef3
commit 1d63d31442
21 changed files with 4605 additions and 139 deletions

View File

@@ -84,6 +84,7 @@ func (s *PlannerService) HandleStore(resp tools.NATSResponse) {
return
}
if err := json.Unmarshal(resp.Payload, &p); err != nil {
fmt.Println("RETRIEVE PLANNER ERR", err)
return
}
s.Store(fmt.Sprintf("%v", m["peer_id"]), &p)
@@ -128,11 +129,12 @@ func (s *PlannerService) FindDate(wfID string, checkables map[string]utils.Booki
if asap {
next := s.findNextSlot(checkables, start, end, checkWindowHours)
if next != nil {
start = *next
if end != nil {
shifted := next.Add(end.Sub(start))
end = &shifted
duration := end.Sub(start) // capture before overwriting start
e := next.Add(duration)
end = &e
}
start = *next
return start, end, true, false, warnings
} else {
return start, end, false, false, warnings
@@ -142,20 +144,84 @@ func (s *PlannerService) FindDate(wfID string, checkables map[string]utils.Booki
}
func (s *PlannerService) Fill(checkables map[string]utils.BookingResource, wfID string) {
if missing := s.MissingPeers(checkables); len(missing) > 0 {
const plannerFetchTimeout = 2 * time.Second
tmpSession := "check-oneshot-" + wfID
ch, cancelSub := SubscribeUpdates(s.Subs, &s.SubMu, missing...)
owned := s.Refresh(missing, tmpSession)
// Collect all peers involved in this check (not just missing ones).
// We always re-request every peer because PB_CLOSE_PLANNER is emitted
// after each check session, which stops the remote stream. The cached
// snapshot may therefore be stale: re-fetching ensures the check is made
// against up-to-date availability data.
all := s.allPeers(checkables)
if len(all) == 0 {
return
}
const plannerFetchTimeout = 5 * time.Second
tmpSession := "check-oneshot-" + wfID
// Mark pending entries and clear any stale planner so the wait loop below
// will not return early with an old snapshot.
s.Mu.Lock()
myself, _ := oclib.GetMySelf()
for _, peerID := range all {
entry := s.Cache[peerID]
if entry == nil {
entry = &plannerEntry{}
s.Cache[peerID] = entry
s.AddedAt[peerID] = time.Now().UTC()
go s.EvictAfter(peerID, plannerTTL)
}
// Reset so MissingPeers sees it as absent until the fresh snapshot arrives.
entry.Planner = nil
if !entry.Refreshing {
entry.Refreshing = true
entry.RefreshOwner = tmpSession
}
}
s.Mu.Unlock()
defer s.ReleaseRefreshOwnership(all, tmpSession)
for _, peerID := range all {
if myself != nil && myself.PeerID == peerID {
go s.RefreshSelf(peerID, &tools.APIRequest{Admin: true})
} else {
payload, _ := json.Marshal(map[string]any{"peer_id": peerID})
utils.Propalgate(peerID, tools.PropalgationMessage{
Action: tools.PB_PLANNER,
Payload: payload,
})
}
}
deadline := time.Now().Add(plannerFetchTimeout)
for {
remaining := s.MissingPeers(checkables)
if len(remaining) == 0 {
return
}
wait := time.Until(deadline)
if wait <= 0 {
return
}
ch, cancelSub := SubscribeUpdates(s.Subs, &s.SubMu, remaining...)
select {
case <-ch:
case <-time.After(plannerFetchTimeout):
case <-time.After(wait):
}
cancelSub()
s.ReleaseRefreshOwnership(owned, tmpSession)
}
}
// allPeers collects the peer ID of every checkable resource, with
// duplicates removed. Order follows map iteration and is therefore
// unspecified.
func (s *PlannerService) allPeers(res map[string]utils.BookingResource) []string {
	known := make(map[string]struct{}, len(res))
	var peerIDs []string
	for _, resource := range res {
		id := resource.PeerPID
		if _, dup := known[id]; dup {
			continue
		}
		known[id] = struct{}{}
		peerIDs = append(peerIDs, id)
	}
	return peerIDs
}
// EvictAfter waits ttl from first insertion then deletes the cache entry and
// emits PB_CLOSE_PLANNER so oc-discovery stops streaming for this peer.
// This is the only path that actually removes an entry from PlannerCache;
@@ -206,6 +272,10 @@ func SubscribeUpdates[T interface{}](subs map[string][]chan T, mu *sync.RWMutex,
// ---------------------------------------------------------------------------
func (s *PlannerService) Store(peerID string, p *planner.Planner) {
if s == nil {
fmt.Println("PLANNER IS NULL")
return
}
s.Mu.Lock()
entry := s.Cache[peerID]
isNew := entry == nil
@@ -216,8 +286,9 @@ func (s *PlannerService) Store(peerID string, p *planner.Planner) {
go s.EvictAfter(peerID, plannerTTL)
}
entry.Planner = p
s.Cache[peerID] = entry
s.Mu.Unlock()
utils.Notify[string](&s.SubMu, s.Subs, peerID, peerID)
utils.Notify(&s.SubMu, s.Subs, peerID, peerID)
}
// ---------------------------------------------------------------------------
@@ -388,9 +459,15 @@ func (s *PlannerService) checkResourceAvailability(res map[string]utils.BookingR
s.Mu.RLock()
entry := s.Cache[r.PeerPID]
s.Mu.RUnlock()
if entry == nil || entry.Planner == nil {
fmt.Println("Retrieve", r.PeerPID, s.Cache, entry.Planner)
if entry == nil {
unavailable = append(unavailable, r.ID)
warnings = append(warnings, fmt.Sprintf(
"peer %s planner not in cache for resource %s assuming available", r.PeerPID, r.ID))
"resource %s is not available in [%s %s] : Missing Planner",
r.ID, start.Format(time.RFC3339), utils.FormatOptTime(end)))
continue
}
if entry.Planner == nil {
continue
}
if !s.checkInstance(entry.Planner, r.ID, r.InstanceID, start, end) {
@@ -419,17 +496,17 @@ func (s *PlannerService) CheckResourceInstance(peerID, resourceID, instanceID st
// SubscribePlannerUpdates returns a channel that receives a peerID each time
// one of the given peers' planners is updated.
func (s *PlannerService) SubscribePlannerUpdates(peerIDs ...string) (<-chan string, func()) {
return SubscribeUpdates[string](s.Subs, &s.SubMu, peerIDs...)
return SubscribeUpdates(s.Subs, &s.SubMu, peerIDs...)
}
// SubscribeWorkflowUpdates returns a channel signalled when the workflow changes.
func (s *PlannerService) SubscribeWorkflowUpdates(wfID string) (<-chan struct{}, func()) {
return SubscribeUpdates[struct{}](s.WorkflowSubs, &s.WorkflowSubMu, wfID)
return SubscribeUpdates(s.WorkflowSubs, &s.WorkflowSubMu, wfID)
}
// NotifyWorkflow signals all subscribers watching wfID.
func (s *PlannerService) NotifyWorkflow(wfID string) {
utils.Notify[struct{}](&s.WorkflowSubMu, s.WorkflowSubs, wfID, struct{}{})
utils.Notify(&s.WorkflowSubMu, s.WorkflowSubs, wfID, struct{}{})
}
// checkInstance checks availability for the specific instance resolved by the