Refactor Oc-Scheduler

This commit is contained in:
mr
2026-03-25 11:11:37 +01:00
parent 7cbe08f4ea
commit 12eba65a01
24 changed files with 3498 additions and 2047 deletions

View File

@@ -0,0 +1,474 @@
package scheduling_resources
import (
"encoding/json"
"fmt"
"strings"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/models/booking"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
"cloud.o-forge.io/core/oc-lib/models/peer"
"cloud.o-forge.io/core/oc-lib/models/resources/purchase_resource"
"cloud.o-forge.io/core/oc-lib/models/utils"
"cloud.o-forge.io/core/oc-lib/tools"
"oc-scheduler/infrastructure/planner"
)
// ---------------------------------------------------------------------------
// Service
// ---------------------------------------------------------------------------
// SchedulingResourcesService manages the lifecycle of Booking and PurchaseResource
// as SchedulerObjects. It caches the local peer identity so every operation can
// route correctly without calling oclib.GetMySelf() on each request.
//
// The struct embeds a mutex by value, so it must be used through a pointer.
type SchedulingResourcesService struct {
// mu guards selfPeer.
mu sync.RWMutex
// selfPeer is the cached local peer; it is nil until Self resolves it and
// is reset to nil by InvalidateSelf.
selfPeer *peer.Peer
}
// singleton is the package-wide service instance handed out by GetService.
// It is built by its own initialiser, so no init() function is required.
var singleton = &SchedulingResourcesService{}

// GetService returns the singleton SchedulingResourcesService.
func GetService() *SchedulingResourcesService {
	return singleton
}
// Self returns the cached local peer, resolving it through oclib.GetMySelf
// when nothing is cached yet.
//
// It returns nil when the local peer cannot be resolved; in that case nothing
// is cached and the next call tries again. The lock is not held while
// resolving, so the cache is only read and written under the mutex.
func (s *SchedulingResourcesService) Self() *peer.Peer {
	s.mu.RLock()
	cached := s.selfPeer
	s.mu.RUnlock()
	if cached != nil {
		return cached
	}

	// The error from GetMySelf is deliberately discarded: a nil peer is the
	// only failure signal this method exposes to its callers.
	resolved, _ := oclib.GetMySelf()
	if resolved == nil {
		return nil
	}

	s.mu.Lock()
	s.selfPeer = resolved
	s.mu.Unlock()
	return resolved
}
// InvalidateSelf drops the cached self peer (e.g. after a peer
// re-registration) so that the next call to Self resolves it again.
func (s *SchedulingResourcesService) InvalidateSelf() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.selfPeer = nil
}
// ---------------------------------------------------------------------------
// RemoveResourcePayload
// ---------------------------------------------------------------------------
// RemoveResourcePayload is sent via NATS REMOVE_RESOURCE so the receiver can
// verify the delete order comes from the original scheduler session.
type RemoveResourcePayload struct {
// ID identifies the booking or purchase to delete.
ID string `json:"id"`
// SchedulerPeerID must equal the SchedulerPeerID stored on the target
// record; the removal handlers ignore the request otherwise.
SchedulerPeerID string `json:"scheduler_peer_id"`
// ExecutionsID must equal the ExecutionsID stored on the target record;
// the removal handlers ignore the request otherwise.
ExecutionsID string `json:"executions_id"`
}
// ---------------------------------------------------------------------------
// Propagation — creation
// ---------------------------------------------------------------------------
// PropagateCreate routes a new booking/purchase draft to its destination:
//   - local peer  → store in DB + refresh planner (bookings only)
//   - remote peer → emit NATS PROPALGATION_EVENT/PB_CREATE
//   - peerless    → validate origin_ref, then store in DB + refresh planner
//
// The remote and peerless cases are decided from the serialized object
// ("dest_peer_id" and "peerless" keys); an object with neither is skipped.
//
// Exactly one value is sent on errCh before the function returns: nil on
// success or when the object is skipped, a non-nil error otherwise. The send
// blocks until received unless the caller supplied a buffered channel.
//
// request may be nil; it is only used to refresh the planner and to identify
// the peer to report when a peerless reference is rejected.
func (s *SchedulingResourcesService) PropagateCreate(
	obj utils.DBObject,
	destPeerID string,
	dt tools.DataType,
	request *tools.APIRequest,
	errCh chan error,
) {
	selfID := s.Self()
	if selfID == nil {
		errCh <- fmt.Errorf("PropagateCreate: local peer not available")
		return
	}

	// Destination is this peer: persist directly.
	if destPeerID == selfID.GetID() {
		stored := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).StoreOne(obj.Serialize(obj))
		if stored.Err != "" || stored.Data == nil {
			errCh <- fmt.Errorf("could not store %s locally: %s", dt.String(), stored.Err)
			return
		}
		if dt == tools.BOOKING {
			planner.GetPlannerService().RefreshSelf(selfID.PeerID, request)
		}
		errCh <- nil
		return
	}

	m := obj.Serialize(obj)
	switch {
	case m["dest_peer_id"] != nil:
		// Attach the destination's PeerID when the peer record can be loaded.
		// The comma-ok assertion keeps an unexpected or typed-nil value from
		// panicking; the message is then sent without "peer_id", exactly as
		// when the lookup returns no data.
		data := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).LoadOne(fmt.Sprintf("%v", m["dest_peer_id"]))
		if dest, ok := data.Data.(*peer.Peer); ok && dest != nil {
			m["peer_id"] = dest.PeerID
		}

	case m["peerless"] == true:
		originRef := fmt.Sprintf("%v", m["origin_ref"])
		if !isValidPeerlessRef(originRef) {
			// Guard against a nil request: an empty target makes
			// emitPeerBehaviorReport a no-op instead of dereferencing nil.
			offender := ""
			if request != nil {
				offender = request.PeerID
			}
			emitPeerBehaviorReport(offender, tools.BehaviorFraud,
				"peerless booking with invalid or unrecognised Origin.Ref", originRef)
			errCh <- fmt.Errorf("peerless booking rejected: invalid Origin.Ref %q", originRef)
			return
		}
		stored := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).StoreOne(m)
		if stored.Err != "" || stored.Data == nil {
			errCh <- fmt.Errorf("could not store peerless %s locally: %s", dt.String(), stored.Err)
			return
		}
		if dt == tools.BOOKING {
			planner.GetPlannerService().RefreshSelf(selfID.PeerID, request)
		}
		errCh <- nil
		return

	default:
		fmt.Println("PropagateCreate: no dest_peer_id and not peerless, skipping")
		errCh <- nil
		return
	}

	// Remote destination: wrap the serialized object in a propagation message.
	payload, err := json.Marshal(m)
	if err != nil {
		errCh <- fmt.Errorf("could not serialize %s: %w", dt.String(), err)
		return
	}
	b, err := json.Marshal(&tools.PropalgationMessage{
		DataType: dt.EnumIndex(),
		Action:   tools.PB_CREATE,
		Payload:  payload,
	})
	if err != nil {
		// Previously this failure was swallowed and nil was reported even
		// though nothing had been published.
		errCh <- fmt.Errorf("could not serialize propagation message for %s: %w", dt.String(), err)
		return
	}
	tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
		FromApp:  "oc-scheduler",
		Datatype: dt,
		Method:   int(tools.PROPALGATION_EVENT),
		Payload:  b,
	})
	errCh <- nil
}
// ---------------------------------------------------------------------------
// Propagation — update / confirmation
// ---------------------------------------------------------------------------
// PropagateWrite routes a booking/purchase update to its destination.
//
// When destPeerID is the local peer, the object is updated in the local DB
// and, for bookings, the self planner is refreshed. Otherwise the object is
// JSON-encoded and published on NATS CREATE_RESOURCE.
//
// Returns true when the resource was confirmed locally (IsDraft=false on self peer)
// and the caller must trigger considers via execution.UpdateExecutionState.
// It returns false for every remote write and for every failure; failures are
// logged to stdout.
func (s *SchedulingResourcesService) PropagateWrite(
	obj utils.DBObject,
	destPeerID string,
	dt tools.DataType,
	request *tools.APIRequest,
) bool {
	selfID := s.Self()
	if selfID == nil {
		fmt.Println("PropagateWrite: local peer not available")
		return false
	}
	if destPeerID == selfID.GetID() {
		if _, _, err := utils.GenericRawUpdateOne(obj, obj.GetID(), obj.GetAccessor(request)); err != nil {
			fmt.Printf("PropagateWrite: local update failed for %s %s: %v\n", dt, obj.GetID(), err)
			return false
		}
		if dt == tools.BOOKING {
			planner.GetPlannerService().RefreshSelf(selfID.PeerID, request)
		}
		return !obj.IsDrafted()
	}
	payload, err := json.Marshal(obj)
	if err != nil {
		// Log like every other failure path in this function instead of
		// dropping the update without a trace.
		fmt.Printf("PropagateWrite: could not serialize %s %s: %v\n", dt, obj.GetID(), err)
		return false
	}
	tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
		FromApp:  "oc-scheduler",
		Datatype: dt,
		Method:   int(tools.CREATE_RESOURCE),
		Payload:  payload,
	})
	return false
}
// ---------------------------------------------------------------------------
// Deletion
// ---------------------------------------------------------------------------
// Delete removes a booking/purchase from its destination peer.
//
// If the object's destination peer is the local peer, it is deleted from the
// local DB (the outcome is printed) and, for bookings, the self planner is
// refreshed. Otherwise a NATS removal is emitted via EmitNATSRemove. Nothing
// happens when the local peer cannot be resolved.
func (s *SchedulingResourcesService) Delete(dt tools.DataType, bk SchedulerObject, request *tools.APIRequest) {
	local := s.Self()
	if local == nil {
		fmt.Println("Delete: local peer not available")
		return
	}

	// Remote destination: delegate the deletion over NATS.
	if bk.GetDestPeer() != local.GetID() {
		EmitNATSRemove(bk.GetID(), bk.GetPeerSession(), bk.GetExecutionsId(), dt)
		return
	}

	result := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil).DeleteOne(bk.GetID())
	fmt.Println("Delete scheduling resource", bk.GetID(), result.Err)
	if dt == tools.BOOKING {
		planner.GetPlannerService().RefreshSelf(local.PeerID, request)
	}
}
// EmitNATSRemove publishes a REMOVE_RESOURCE NATS event for the given data
// type. The payload is a JSON-encoded RemoveResourcePayload carrying the
// resource id together with the scheduler peer id and executions id.
func EmitNATSRemove(id, schedulerPeerID, executionsID string, dt tools.DataType) {
	auth := RemoveResourcePayload{
		ID:              id,
		SchedulerPeerID: schedulerPeerID,
		ExecutionsID:    executionsID,
	}
	// The marshal error is discarded; the event is published regardless.
	raw, _ := json.Marshal(auth)

	event := tools.NATSResponse{
		FromApp:  "oc-scheduler",
		Datatype: dt,
		Method:   int(tools.REMOVE_RESOURCE),
		Payload:  raw,
	}
	tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, event)
}
// ---------------------------------------------------------------------------
// Confirmation
// ---------------------------------------------------------------------------
// Confirm clears the draft flag (IsDraft=false) on the booking or purchase
// identified by id, using an admin request.
//
// For bookings it additionally sets State to SCHEDULED and, once the update
// succeeded, refreshes the self planner. Any other data type is ignored.
// Load and update failures are printed and abort the confirmation.
func Confirm(id string, dt tools.DataType) {
	admin := &tools.APIRequest{Admin: true}

	if dt == tools.BOOKING {
		loaded, _, err := booking.NewAccessor(admin).LoadOne(id)
		if err != nil || loaded == nil {
			fmt.Printf("Confirm: could not load booking %s: %v\n", id, err)
			return
		}
		bk := loaded.(*booking.Booking)
		bk.IsDraft = false
		bk.State = enum.SCHEDULED
		_, _, updErr := utils.GenericRawUpdateOne(bk, id, booking.NewAccessor(admin))
		if updErr != nil {
			fmt.Printf("Confirm: could not confirm booking %s: %v\n", id, updErr)
			return
		}
		local := GetService().Self()
		if local == nil {
			return
		}
		planner.GetPlannerService().RefreshSelf(local.PeerID, admin)
		return
	}

	if dt == tools.PURCHASE_RESOURCE {
		loaded, _, err := purchase_resource.NewAccessor(admin).LoadOne(id)
		if err != nil || loaded == nil {
			fmt.Printf("Confirm: could not load purchase %s: %v\n", id, err)
			return
		}
		purchase := loaded.(*purchase_resource.PurchaseResource)
		purchase.IsDraft = false
		_, _, updErr := utils.GenericRawUpdateOne(purchase, id, purchase_resource.NewAccessor(admin))
		if updErr != nil {
			fmt.Printf("Confirm: could not confirm purchase %s: %v\n", id, updErr)
		}
	}
}
// DraftTimeout deletes the booking/purchase identified by id if it is still a
// draft when the function runs. It does not wait itself: the 10-minute delay
// mentioned in its log line is applied by callers that schedule it with
// time.AfterFunc.
//
// Nothing happens for other data types, when the object cannot be loaded, or
// when it is no longer a draft.
func DraftTimeout(id string, dt tools.DataType) {
	isBooking := dt == tools.BOOKING
	if !isBooking && dt != tools.PURCHASE_RESOURCE {
		return
	}
	admin := &tools.APIRequest{Admin: true}

	var (
		obj utils.DBObject
		err error
	)
	if isBooking {
		obj, _, err = booking.NewAccessor(admin).LoadOne(id)
	} else {
		obj, _, err = purchase_resource.NewAccessor(admin).LoadOne(id)
	}
	if err != nil || obj == nil || !obj.IsDrafted() {
		return
	}

	if isBooking {
		booking.NewAccessor(admin).DeleteOne(id)
	} else {
		purchase_resource.NewAccessor(admin).DeleteOne(id)
	}
	fmt.Printf("DraftTimeout: %s %s deleted (still draft after 10 min)\n", dt.String(), id)
}
// ---------------------------------------------------------------------------
// NATS handlers — incoming booking/purchase
// ---------------------------------------------------------------------------
// HandleCreateBooking processes an incoming booking from NATS.
//
// Known booking (same ID already stored):
//   - ignored when SchedulerPeerID or ExecutionsID differ from the stored ones;
//   - ignored when the stored booking is confirmed and the incoming one is a draft;
//   - deleted when the incoming one is a confirmation but the stored start
//     date is already in the past;
//   - otherwise updated, followed by a self planner refresh.
//
// Unknown booking: discarded when its start date is in the past or when the
// planner's CheckResourceInstance rejects it; otherwise stored as a draft,
// the planner is refreshed and DraftTimeout is scheduled 10 minutes later.
//
// Returns true only when a known booking was updated with IsDraft=false,
// meaning considers must be triggered by the caller.
func (s *SchedulingResourcesService) HandleCreateBooking(bk *booking.Booking, adminReq *tools.APIRequest) bool {
	local := s.Self()
	if local == nil {
		return false
	}

	found, _, loadErr := booking.NewAccessor(adminReq).LoadOne(bk.GetID())
	if loadErr == nil && found != nil {
		known := found.(*booking.Booking)
		switch {
		case known.SchedulerPeerID != bk.SchedulerPeerID || known.ExecutionsID != bk.ExecutionsID:
			fmt.Println("HandleCreateBooking: auth mismatch, ignoring", bk.GetID())
			return false
		case !known.IsDrafted() && bk.IsDraft:
			// Never downgrade a confirmed booking back to a draft.
			return false
		case !bk.IsDraft && !known.ExpectedStartDate.IsZero() && known.ExpectedStartDate.Before(time.Now().UTC()):
			fmt.Println("HandleCreateBooking: expired, deleting", bk.GetID())
			booking.NewAccessor(adminReq).DeleteOne(bk.GetID())
			return false
		}
		_, _, updErr := utils.GenericRawUpdateOne(bk, bk.GetID(), booking.NewAccessor(adminReq))
		if updErr != nil {
			fmt.Println("HandleCreateBooking: update failed:", updErr)
			return false
		}
		planner.GetPlannerService().RefreshSelf(local.PeerID, adminReq)
		return !bk.IsDraft
	}

	// Unknown booking: validate, then keep it as a draft until confirmed.
	startsInPast := !bk.ExpectedStartDate.IsZero() && bk.ExpectedStartDate.Before(time.Now().UTC())
	if startsInPast {
		fmt.Println("HandleCreateBooking: start date in the past, discarding")
		return false
	}
	available := planner.GetPlannerService().CheckResourceInstance(local.PeerID, bk.ResourceID, bk.InstanceID, bk.ExpectedStartDate, bk.ExpectedEndDate)
	if !available {
		fmt.Println("HandleCreateBooking: conflicts with local planner, discarding")
		return false
	}

	bk.IsDraft = true
	saved, _, err := booking.NewAccessor(adminReq).StoreOne(bk)
	if err != nil {
		fmt.Println("HandleCreateBooking: could not store:", err)
		return false
	}
	draftID := saved.GetID()
	planner.GetPlannerService().RefreshSelf(local.PeerID, adminReq)
	time.AfterFunc(10*time.Minute, func() { DraftTimeout(draftID, tools.BOOKING) })
	return false
}
// HandleCreatePurchase processes an incoming purchase from NATS.
//
// Purchases whose DestPeerID is not the local peer are ignored. An unknown
// purchase is stored as a draft and DraftTimeout is scheduled 10 minutes
// later. A known purchase is updated unless SchedulerPeerID/ExecutionsID
// differ from the stored ones or the update would turn a confirmed purchase
// back into a draft.
//
// Returns true only when a known purchase was updated with IsDraft=false,
// meaning considers must be triggered by the caller.
func (s *SchedulingResourcesService) HandleCreatePurchase(pr *purchase_resource.PurchaseResource, adminReq *tools.APIRequest) bool {
	local := s.Self()
	if local == nil || pr.DestPeerID != local.GetID() {
		return false
	}

	found, _, loadErr := purchase_resource.NewAccessor(adminReq).LoadOne(pr.GetID())
	if loadErr != nil || found == nil {
		// Unknown purchase: keep it as a draft until it is confirmed.
		pr.IsDraft = true
		saved, _, err := purchase_resource.NewAccessor(adminReq).StoreOne(pr)
		if err != nil {
			fmt.Println("HandleCreatePurchase: could not store:", err)
			return false
		}
		draftID := saved.GetID()
		time.AfterFunc(10*time.Minute, func() { DraftTimeout(draftID, tools.PURCHASE_RESOURCE) })
		return false
	}

	known := found.(*purchase_resource.PurchaseResource)
	sameSession := known.SchedulerPeerID == pr.SchedulerPeerID && known.ExecutionsID == pr.ExecutionsID
	if !sameSession {
		fmt.Println("HandleCreatePurchase: auth mismatch, ignoring", pr.GetID())
		return false
	}
	if pr.IsDraft && !known.IsDrafted() {
		// Never downgrade a confirmed purchase back to a draft.
		return false
	}
	_, _, updErr := utils.GenericRawUpdateOne(pr, pr.GetID(), purchase_resource.NewAccessor(adminReq))
	if updErr != nil {
		fmt.Println("HandleCreatePurchase: update failed:", updErr)
		return false
	}
	return !pr.IsDraft
}
// HandleRemoveBooking deletes the booking named by p.ID, but only when the
// payload's SchedulerPeerID and ExecutionsID both match the stored booking.
// After the deletion the self planner is refreshed if the local peer can be
// resolved. Unknown bookings are ignored silently.
func (s *SchedulingResourcesService) HandleRemoveBooking(p RemoveResourcePayload, adminReq *tools.APIRequest) {
	loaded, _, err := booking.NewAccessor(adminReq).LoadOne(p.ID)
	if err != nil || loaded == nil {
		return
	}

	stored := loaded.(*booking.Booking)
	authorised := stored.SchedulerPeerID == p.SchedulerPeerID && stored.ExecutionsID == p.ExecutionsID
	if !authorised {
		fmt.Println("HandleRemoveBooking: auth mismatch, ignoring", p.ID)
		return
	}

	booking.NewAccessor(adminReq).DeleteOne(p.ID)
	local := s.Self()
	if local == nil {
		return
	}
	planner.GetPlannerService().RefreshSelf(local.PeerID, adminReq)
}
// HandleRemovePurchase deletes the purchase named by p.ID, but only when the
// payload's SchedulerPeerID and ExecutionsID both match the stored purchase.
// Unknown purchases are ignored silently.
func (s *SchedulingResourcesService) HandleRemovePurchase(p RemoveResourcePayload, adminReq *tools.APIRequest) {
	loaded, _, err := purchase_resource.NewAccessor(adminReq).LoadOne(p.ID)
	if err != nil || loaded == nil {
		return
	}

	stored := loaded.(*purchase_resource.PurchaseResource)
	authorised := stored.SchedulerPeerID == p.SchedulerPeerID && stored.ExecutionsID == p.ExecutionsID
	if !authorised {
		fmt.Println("HandleRemovePurchase: auth mismatch, ignoring", p.ID)
		return
	}

	purchase_resource.NewAccessor(adminReq).DeleteOne(p.ID)
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
// knownRegistryPrefixes lists the registry prefixes a peerless origin
// reference is allowed to start with.
var knownRegistryPrefixes = []string{
	"docker.io/",
	"index.docker.io/",
	"ghcr.io/",
	"quay.io/",
	"registry.hub.docker.com/",
	"gcr.io/",
	"public.ecr.aws/",
}

// isValidPeerlessRef reports whether ref starts with one of the
// knownRegistryPrefixes and has at least one more character after the prefix.
// The empty string and the literal "<nil>" (what fmt's %v verb prints for a
// nil value) are rejected up front.
func isValidPeerlessRef(ref string) bool {
	switch ref {
	case "", "<nil>":
		return false
	}
	for i := range knownRegistryPrefixes {
		prefix := knownRegistryPrefixes[i]
		if len(ref) <= len(prefix) {
			// A bare prefix (or anything shorter) names no image.
			continue
		}
		if strings.HasPrefix(ref, prefix) {
			return true
		}
	}
	return false
}
// emitPeerBehaviorReport publishes a PEER_BEHAVIOR_EVENT on NATS carrying a
// JSON-encoded tools.PeerBehaviorReport about targetPeerDID, timestamped with
// the current UTC time.
//
// It does nothing when targetPeerDID is empty or when the report cannot be
// marshalled.
func emitPeerBehaviorReport(targetPeerDID string, severity tools.BehaviorSeverity, reason, evidence string) {
	if targetPeerDID == "" {
		return
	}

	raw, err := json.Marshal(tools.PeerBehaviorReport{
		ReporterApp:  "oc-scheduler",
		TargetPeerID: targetPeerDID,
		Severity:     severity,
		Reason:       reason,
		Evidence:     evidence,
		At:           time.Now().UTC(),
	})
	if err != nil {
		return
	}

	event := tools.NATSResponse{
		FromApp:  "oc-scheduler",
		Datatype: tools.PEER,
		Method:   int(tools.PEER_BEHAVIOR_EVENT),
		Payload:  raw,
	}
	tools.NewNATSCaller().SetNATSPub(tools.PEER_BEHAVIOR_EVENT, event)
}