oc-discovery -> conf

2026-04-08 10:04:41 +02:00
parent 46dee0a6cb
commit 29b26d366e
21 changed files with 1934 additions and 119 deletions
@@ -13,6 +13,26 @@ import (
 	oclib "cloud.o-forge.io/core/oc-lib"
 )

+// MemberEventType classifies a SWIM membership event; its string value is what is carried in the JSON "type" field.
+type MemberEventType string
+
+const (
+	MemberAlive   MemberEventType = "alive"   // peer answered a direct probe while suspected, clearing the suspicion
+	MemberSuspect MemberEventType = "suspect" // peer missed a heartbeat and is awaiting refutation or timeout
+	MemberDead    MemberEventType = "dead"    // peer declared dead (failed indirect probe or suspect timeout)
+)
+
+// MemberEvent is a SWIM membership event piggybacked on heartbeats (infection-style).
+// HopsLeft starts at InitialEventHops and is decremented each time the event is drained for sending.
+// Receivers still apply an event whose HopsLeft is 0, but do not re-queue it for further forwarding.
+// The queue keeps one entry per PeerID: higher Incarnation wins, or higher-severity Type at equal Incarnation.
+type MemberEvent struct {
+	Type        MemberEventType `json:"type"`        // alive, suspect or dead
+	PeerID      string          `json:"peer_id"`     // string form of the subject peer's ID
+	Incarnation uint64          `json:"incarnation"` // subject's incarnation number as last known to the emitter
+	HopsLeft    int             `json:"hops_left"`   // remaining forwarding budget
+}
+
 type Heartbeat struct {
 	Name           string   `json:"name"`
 	Stream         *Stream  `json:"stream"`
@@ -39,6 +59,13 @@ type Heartbeat struct {
 	// Only one indexer per node receives Referent=true at a time (the best-scored one).
 	// The indexer stores the node in its referencedNodes for distributed search.
 	Referent bool `json:"referent,omitempty"`
+	// SuspectedIncarnation is set when this node currently suspects the target indexer.
+	// If the value matches the indexer's own incarnation, the indexer increments its
+	// incarnation and replies with the new value — this is the SWIM refutation signal.
+	SuspectedIncarnation *uint64 `json:"suspected_incarnation,omitempty"`
+	// MembershipEvents carries SWIM events piggybacked on this heartbeat.
+	// Events are forwarded infection-style until HopsLeft reaches 0.
+	MembershipEvents []MemberEvent `json:"membership_events,omitempty"`
 }

 // SearchPeerRequest is sent by a node to an indexer via ProtocolSearchPeer.
@@ -104,6 +131,13 @@ type HeartbeatResponse struct {
 	// Seeds: node de-stickies this indexer once it has MinIndexer non-seed alternatives.
 	// Non-seeds: node removes this indexer immediately if it has enough alternatives.
 	SuggestMigrate bool `json:"suggest_migrate,omitempty"`
+	// Incarnation is this indexer's current SWIM incarnation number.
+	// It is incremented whenever the indexer refutes a suspicion signal.
+	// The node tracks this to detect explicit refutations and to clear suspect state.
+	Incarnation uint64 `json:"incarnation,omitempty"`
+	// MembershipEvents carries SWIM events piggybacked on this response.
+	// The node should forward them to its other indexers (infection-style).
+	MembershipEvents []MemberEvent `json:"membership_events,omitempty"`
 }

 // ComputeIndexerScore computes a composite quality score [0, 100] for the connecting peer.
@@ -24,6 +24,11 @@ var TimeWatcher time.Time
 // retryRunning guards against launching multiple retryUntilSeedResponds goroutines.
 var retryRunning atomic.Bool

+// suspectTimeout is the maximum time a peer can stay in suspect state before
+// being declared dead and evicted. Aligned with 3 heartbeat intervals so the
+// peer has at least 3 chances to respond or refute the suspicion signal.
+const suspectTimeout = 3 * RecommendedHeartbeatInterval
+
 func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, recordFn ...func() json.RawMessage) error {
 	TimeWatcher = time.Now().UTC()
 	logger := oclib.GetLogger()
@@ -304,6 +309,11 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
 			if recFn != nil {
 				baseHB.Record = recFn()
 			}
+			// Piggyback SWIM membership events on every outgoing heartbeat batch.
+			// All peers in the pool receive the same events this tick.
+			if isIndexerHB {
+				baseHB.MembershipEvents = NodeEventQueue.Drain(5)
+			}
 			// Determine the referent indexer: highest-scored one receives Referent=true
 			// so it stores us in its referencedNodes for distributed search.
 			var referentAddr string
@@ -323,6 +333,13 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
 				if isIndexerHB && referentAddr != "" && ai.Addr == referentAddr {
 					hb.Referent = true
 				}
+				// SWIM: signal suspicion so the peer can refute by incrementing incarnation.
+				if isIndexerHB {
+					if score := directory.GetScore(ai.Addr); score != nil && !score.UptimeTracker.SuspectedAt.IsZero() {
+						inc := score.UptimeTracker.LastKnownIncarnation
+						hb.SuspectedIncarnation = &inc
+					}
+				}
 				// Ensure an IndexerScore entry exists for this peer.
 				var score *Score
 				if isIndexerHB {
@@ -378,6 +395,40 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
 					score.UptimeTracker.RecordHeartbeat()
 					score.UptimeTracker.ConsecutiveFails = 0 // reset on success

+					// SWIM: clear suspect state on any successful direct heartbeat.
+					// The peer proved it is reachable; if it also incremented its incarnation
+					// that is an explicit refutation — log it distinctly.
+					if !score.UptimeTracker.SuspectedAt.IsZero() {
+						wasExplicitRefutation := resp != nil &&
+							resp.Incarnation > 0 &&
+							resp.Incarnation > score.UptimeTracker.LastKnownIncarnation
+						if wasExplicitRefutation {
+							logger.Info().Str("peer", ai.Info.ID.String()).
+								Uint64("old_incarnation", score.UptimeTracker.LastKnownIncarnation).
+								Uint64("new_incarnation", resp.Incarnation).
+								Msg("[swim] explicit refutation: incarnation incremented, suspicion cleared")
+						} else {
+							logger.Info().Str("peer", ai.Info.ID.String()).
+								Msg("[swim] suspect cleared — peer responded to direct probe")
+						}
+						score.UptimeTracker.SuspectedAt = time.Time{}
+						// Propagate alive event so other nodes can clear their own suspect state.
+						inc := score.UptimeTracker.LastKnownIncarnation
+						if resp != nil && resp.Incarnation > 0 {
+							inc = resp.Incarnation
+						}
+						NodeEventQueue.Add(MemberEvent{
+							Type:        MemberAlive,
+							PeerID:      ai.Info.ID.String(),
+							Incarnation: inc,
+							HopsLeft:    InitialEventHops,
+						})
+					}
+					// Always update last known incarnation.
+					if resp != nil && resp.Incarnation > score.UptimeTracker.LastKnownIncarnation {
+						score.UptimeTracker.LastKnownIncarnation = resp.Incarnation
+					}
+
 					maxRTT := BaseRoundTrip * 10
 					latencyScore := 1.0 - float64(rtt)/float64(maxRTT)
 					if latencyScore < 0 {
@@ -458,6 +509,15 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
 								score.witnessConsistent++
 							}
 						}
+
+						// SWIM infection: process membership events piggybacked on this response.
+						// Events with HopsLeft > 0 are re-queued for forwarding to other indexers.
+						for _, ev := range resp.MembershipEvents {
+							if ev.HopsLeft > 0 {
+								NodeEventQueue.Add(ev)
+							}
+							applyMemberEvent(ev, directory)
+						}
 					}

 					score.Score = score.ComputeNodeSideScore(latencyScore)
@@ -530,6 +590,59 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
 	}()
 }

+// runIndirectProbe asks up to k live indexers (voters) to probe target over
+// ProtocolIndirectProbe and returns true if a strict majority of the voters
+// actually asked report it reachable.
+// This is the SWIM explicit indirect ping — called only on heartbeat failure.
+//
+// Voters without address info and the target itself are never asked. The
+// voter list may be the whole pool, which can still contain the suspected
+// peer; asking it (or an undialable entry) would count as an automatic
+// "unreachable" vote and bias the result toward eviction.
+// Returns false when k <= 0 or no usable voter remains.
+func runIndirectProbe(h host.Host, target pp.AddrInfo, voters []Entry, k int) bool {
+	// Build the usable voter set first so k is clamped against voters we can really ask.
+	candidates := make([]pp.AddrInfo, 0, len(voters))
+	for _, voter := range voters {
+		if voter.Info == nil || voter.Info.ID == target.ID {
+			continue
+		}
+		candidates = append(candidates, *voter.Info)
+	}
+	if k > len(candidates) {
+		k = len(candidates)
+	}
+	if k <= 0 {
+		return false
+	}
+	// Random sample of k voters so the same indexers are not always solicited.
+	rand.Shuffle(len(candidates), func(i, j int) {
+		candidates[i], candidates[j] = candidates[j], candidates[i]
+	})
+	candidates = candidates[:k]
+
+	// askVoter runs one request/response exchange with a single voter.
+	// Any transport or codec error counts as an "unreachable" vote.
+	askVoter := func(v pp.AddrInfo) bool {
+		ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second)
+		defer cancel()
+		s, err := h.NewStream(ctx, v.ID, ProtocolIndirectProbe)
+		if err != nil {
+			return false
+		}
+		defer s.Close()
+		// Best effort: if the deadline cannot be set, the encode/decode below
+		// still report any stream error.
+		_ = s.SetDeadline(time.Now().Add(8 * time.Second))
+		if err := json.NewEncoder(s).Encode(IndirectProbeRequest{Target: target}); err != nil {
+			return false
+		}
+		var resp IndirectProbeResponse
+		if err := json.NewDecoder(s).Decode(&resp); err != nil {
+			return false
+		}
+		return resp.Reachable
+	}
+
+	// Buffered to k so no voter goroutine ever blocks on its send.
+	votes := make(chan bool, k)
+	for _, voter := range candidates {
+		go func(v pp.AddrInfo) { votes <- askVoter(v) }(voter)
+	}
+	reachable := 0
+	for range k {
+		if <-votes {
+			reachable++
+		}
+	}
+	return reachable > k/2
+}
+
 func HeartbeatFailure(h host.Host, proto protocol.ID, directory *Directory,
 	addr string, info *pp.AddrInfo, isIndexerHB bool, maxPool int, err error) {
 	logger := oclib.GetLogger()
@@ -545,22 +658,96 @@ func HeartbeatFailure(h host.Host, proto protocol.ID, directory *Directory,
 					Msg("[pool] seed heartbeat failed — keeping in pool, ticker will retry " + err.Error())
 				return
 			}
-			// Indirect probing via other alive indexers:
-			// If other indexers in the pool are still responding, they act as implicit
-			// third-party witnesses confirming our connectivity is fine — the failed
-			// indexer is genuinely dead, evict immediately.
-			// If this is the last indexer, there is no third party. Retry up to 3 times
-			// (consecutive failures tracked in UptimeTracker) before declaring it dead.
-			if len(directory.GetAddrs()) <= 1 {
-				score.UptimeTracker.ConsecutiveFails++
-				if score.UptimeTracker.ConsecutiveFails < 3 {
+
+			voters := directory.GetAddrs()
+			if len(voters) <= 1 {
+				// Last indexer: no peer available to proxy a probe.
+				// Enter suspect state on first failure; evict only after suspectTimeout.
+				if score.UptimeTracker.SuspectedAt.IsZero() {
+					score.UptimeTracker.SuspectedAt = time.Now().UTC()
+					score.UptimeTracker.ConsecutiveFails++
+					NodeEventQueue.Add(MemberEvent{
+						Type:        MemberSuspect,
+						PeerID:      info.ID.String(),
+						Incarnation: score.UptimeTracker.LastKnownIncarnation,
+						HopsLeft:    InitialEventHops,
+					})
 					logger.Warn().Str("peer", info.ID.String()).
-						Int("attempt", score.UptimeTracker.ConsecutiveFails).
-						Msg("[indirect] last indexer failed, retrying before eviction")
+						Msg("[swim] last indexer suspect — waiting for refutation or timeout")
 					return
 				}
+				if time.Since(score.UptimeTracker.SuspectedAt) < suspectTimeout {
+					logger.Warn().Str("peer", info.ID.String()).
+						Dur("suspected_for", time.Since(score.UptimeTracker.SuspectedAt)).
+						Msg("[swim] last indexer still failing, holding in suspect state")
+					return
+				}
+				// suspectTimeout exceeded with no refutation — declare dead.
 				logger.Warn().Str("peer", info.ID.String()).
-					Msg("[indirect] last indexer failed 3 times consecutively, evicting")
+					Msg("[swim] last indexer suspect timeout exceeded, evicting")
+				NodeEventQueue.Add(MemberEvent{
+					Type:        MemberDead,
+					PeerID:      info.ID.String(),
+					Incarnation: score.UptimeTracker.LastKnownIncarnation,
+					HopsLeft:    InitialEventHops,
+				})
+			} else if score.UptimeTracker.SuspectedAt.IsZero() {
+				// First miss with other live indexers available:
+				// enter suspect state and run an indirect probe asynchronously.
+				score.UptimeTracker.SuspectedAt = time.Now().UTC()
+				score.UptimeTracker.ConsecutiveFails++
+				NodeEventQueue.Add(MemberEvent{
+					Type:        MemberSuspect,
+					PeerID:      info.ID.String(),
+					Incarnation: score.UptimeTracker.LastKnownIncarnation,
+					HopsLeft:    InitialEventHops,
+				})
+				probeTarget := *info
+				go func() {
+					alive := runIndirectProbe(h, probeTarget, voters, 2)
+					if alive {
+						// Other indexers confirm the target is reachable → our direct
+						// link may be temporarily broken. Keep suspected; the next
+						// heartbeat tick will retry the direct probe.
+						logger.Warn().Str("peer", probeTarget.ID.String()).
+							Msg("[swim] indirect probe: target reachable by peers, keeping (suspected)")
+					} else {
+						// Majority of probes also failed → the indexer is genuinely dead.
+						logger.Warn().Str("peer", probeTarget.ID.String()).
+							Msg("[swim] indirect probe: target unreachable, evicting")
+						NodeEventQueue.Add(MemberEvent{
+							Type:        MemberDead,
+							PeerID:      probeTarget.ID.String(),
+							Incarnation: score.UptimeTracker.LastKnownIncarnation,
+							HopsLeft:    InitialEventHops,
+						})
+						consensusVoters := evictPeer(directory, addr, probeTarget.ID, proto)
+						need := max(maxPool-len(consensusVoters), 1)
+						if len(consensusVoters) > 0 {
+							TriggerConsensus(h, consensusVoters, need)
+						} else {
+							replenishIndexersFromDHT(h, need)
+						}
+					}
+				}()
+				return // decision deferred to probe goroutine
+			} else if time.Since(score.UptimeTracker.SuspectedAt) < suspectTimeout {
+				// Still within suspect window — the next tick's SuspectedIncarnation
+				// in the heartbeat may trigger a refutation. Keep retrying.
+				logger.Warn().Str("peer", info.ID.String()).
+					Dur("suspected_for", time.Since(score.UptimeTracker.SuspectedAt)).
+					Msg("[swim] suspected peer still failing, waiting for refutation or timeout")
+				return
+			} else {
+				// suspectTimeout exceeded — declare dead and fall through to eviction.
+				logger.Warn().Str("peer", info.ID.String()).
+					Msg("[swim] suspect timeout exceeded, evicting")
+				NodeEventQueue.Add(MemberEvent{
+					Type:        MemberDead,
+					PeerID:      info.ID.String(),
+					Incarnation: score.UptimeTracker.LastKnownIncarnation,
+					HopsLeft:    InitialEventHops,
+				})
 			}
 		}
 	}
@@ -587,3 +774,34 @@ func HeartbeatFailure(h host.Host, proto protocol.ID, directory *Directory,
 		}
 	}
 }
+
+// applyMemberEvent folds a gossiped SWIM membership event into the local directory.
+// Only MemberAlive events are acted upon, and only when they carry an incarnation
+// strictly higher than the one recorded for the peer: the recorded incarnation is
+// raised and, if the peer was suspected, the suspicion and failure counter are reset.
+// MemberSuspect / MemberDead received via gossip are ignored here — the node relies
+// on its own direct-probe evidence for those transitions.
+func applyMemberEvent(ev MemberEvent, directory *Directory) {
+	if ev.Type != MemberAlive {
+		return
+	}
+	logger := oclib.GetLogger()
+	for _, entry := range directory.GetAddrs() {
+		if entry.Info == nil {
+			continue
+		}
+		if entry.Info.ID.String() != ev.PeerID {
+			continue
+		}
+		// First matching entry decides; nothing after it is inspected.
+		sc := directory.GetScore(entry.Addr)
+		if sc == nil || sc.UptimeTracker == nil {
+			return
+		}
+		if ev.Incarnation <= sc.UptimeTracker.LastKnownIncarnation {
+			return
+		}
+		sc.UptimeTracker.LastKnownIncarnation = ev.Incarnation
+		if sc.UptimeTracker.SuspectedAt.IsZero() {
+			return
+		}
+		sc.UptimeTracker.SuspectedAt = time.Time{}
+		sc.UptimeTracker.ConsecutiveFails = 0
+		logger.Info().Str("peer", ev.PeerID).
+			Uint64("incarnation", ev.Incarnation).
+			Msg("[swim] alive event via gossip cleared suspicion")
+		return
+	}
+}
@@ -146,6 +146,22 @@ func (s *LongLivedPubSubService) SubscribeToSearch(ps *pubsub.PubSub, f *func(co
 	if f != nil {
 		return SubscribeEvents(s, context.Background(), TopicPubSubSearch, -1, *f)
 	}
+	// Even when no handler is needed (e.g. strict indexers), we must call
+	// topic.Subscribe() so that this peer sends a SUBSCRIBE control message
+	// to connected peers and joins the GossipSub mesh as a forwarder.
+	// Without this, messages cannot be relayed through indexers between nodes.
+	topic := s.LongLivedPubSubs[TopicPubSubSearch]
+	sub, err := topic.Subscribe()
+	if err != nil {
+		return err
+	}
+	go func() {
+		for {
+			if _, err := sub.Next(context.Background()); err != nil {
+				return
+			}
+		}
+	}()
 	return nil
 }

@@ -161,26 +177,27 @@ func SubscribeEvents[T interface{}](s *LongLivedPubSubService,
 		return err
 	}
 	// launch loop waiting for results.
-	go waitResults(s, ctx, sub, proto, timeout, f)
+	go waitResults(topic, s, ctx, sub, proto, timeout, f)

 	return nil
 }

-func waitResults[T interface{}](s *LongLivedPubSubService, ctx context.Context, sub *pubsub.Subscription, proto string, timeout int, f func(context.Context, T, string)) {
+func waitResults[T interface{}](topic *pubsub.Topic, s *LongLivedPubSubService, ctx context.Context, sub *pubsub.Subscription, proto string, timeout int, f func(context.Context, T, string)) {
 	defer ctx.Done()
 	for {
 		s.PubsubMu.Lock()                     // check safely if cache is actually notified subscribed to topic
 		if s.LongLivedPubSubs[proto] == nil { // if not kill the loop.
-			s.PubsubMu.Unlock()
-			break
+			s.LongLivedPubSubs[proto] = topic
 		}
 		s.PubsubMu.Unlock()
+
 		// if still subscribed -> wait for new message
 		var cancel context.CancelFunc
 		if timeout != -1 {
 			ctx, cancel = context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
 			defer cancel()
 		}
+
 		msg, err := sub.Next(ctx)
 		if err != nil {
 			if errors.Is(err, context.DeadlineExceeded) {
@@ -197,5 +214,6 @@ func waitResults[T interface{}](s *LongLivedPubSubService, ctx context.Context,
 			continue
 		}
 		f(ctx, evt, fmt.Sprintf("%v", proto))
+		fmt.Println("DEADLOCK ?")
 	}
 }
@@ -21,7 +21,12 @@ type UptimeTracker struct {
 	FirstSeen        time.Time
 	LastSeen         time.Time
 	TotalOnline      time.Duration
-	ConsecutiveFails int // incremented on each heartbeat failure; reset to 0 on success
+	ConsecutiveFails int       // kept for compatibility / logging; primary eviction uses SuspectedAt
+	SuspectedAt      time.Time // SWIM: non-zero when this peer is in suspect state
+	// LastKnownIncarnation is the last incarnation number received from this peer.
+	// When a peer sees itself suspected (SuspectedIncarnation in heartbeat) it
+	// increments its incarnation and the node clears the suspect state on receipt.
+	LastKnownIncarnation uint64
 }

 // RecordHeartbeat accumulates online time gap-aware: only counts the interval if
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"math/rand"
+	"oc-discovery/conf"
 	"strings"
 	"sync"
 	"time"
@@ -22,6 +23,8 @@ type LongLivedStreamRecordedService[T interface{}] struct {
 	StreamRecords map[protocol.ID]map[pp.ID]*StreamRecord[T]
 	StreamMU      sync.RWMutex
 	maxNodesConn  int
+	ConnGuard     *ConnectionRateGuard
+
 	// AllowInbound, when set, is called once at stream open before any heartbeat
 	// is decoded.  remotePeer is the connecting peer; isNew is true when no
 	// StreamRecord exists yet (first-ever connection).  Return a non-nil error
@@ -39,13 +42,9 @@ type LongLivedStreamRecordedService[T interface{}] struct {
 	AfterDelete func(pid pp.ID, name string, did string)
 	// BuildHeartbeatResponse, when set, is called after each successfully decoded
 	// heartbeat to build the response sent back to the node.
-	// remotePeer is the peer that sent the heartbeat (used for offload routing).
-	// need is how many more indexers the node wants (from hb.Need).
-	// referent is true when the node designated this indexer as its search referent.
-	// rawRecord is the fresh signed PeerRecord embedded in the heartbeat (hb.Record),
-	// passed directly so the handler does not race with AfterHeartbeat goroutine
-	// updating StreamRecord.Record.
-	BuildHeartbeatResponse func(remotePeer pp.ID, need int, challenges []string, challengeDID string, referent bool, rawRecord json.RawMessage) *HeartbeatResponse
+	// remotePeer is the connecting peer. hb is the full decoded heartbeat, including
+	// SWIM fields (SuspectedIncarnation, MembershipEvents) and record/challenge data.
+	BuildHeartbeatResponse func(remotePeer pp.ID, hb *Heartbeat) *HeartbeatResponse
 }

 func (ix *LongLivedStreamRecordedService[T]) MaxNodesConn() int {
@@ -57,6 +56,7 @@ func NewStreamRecordedService[T interface{}](h host.Host, maxNodesConn int) *Lon
 		LongLivedPubSubService: NewLongLivedPubSubService(h),
 		StreamRecords:          map[protocol.ID]map[pp.ID]*StreamRecord[T]{},
 		maxNodesConn:           maxNodesConn,
+		ConnGuard:              newConnectionRateGuard(),
 	}
 	go service.StartGC(30 * time.Second)
 	// Garbage collection is needed on every Map of Long-Lived Stream... it may be a top level redesigned
@@ -247,7 +247,7 @@ func (ix *LongLivedStreamRecordedService[T]) HandleHeartbeat(s network.Stream) {
 		}
 		// Send response back to the node (bidirectional heartbeat).
 		if ix.BuildHeartbeatResponse != nil {
-			if resp := ix.BuildHeartbeatResponse(s.Conn().RemotePeer(), hb.Need, hb.Challenges, hb.ChallengeDID, hb.Referent, hb.Record); resp != nil {
+			if resp := ix.BuildHeartbeatResponse(s.Conn().RemotePeer(), hb); resp != nil {
 				s.SetWriteDeadline(time.Now().Add(3 * time.Second))
 				json.NewEncoder(s).Encode(resp)
 				s.SetWriteDeadline(time.Time{})
@@ -303,3 +303,52 @@ func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams ma
 		return &pid, &hb, err
 	}
 }
+
+// ── ConnectionRateGuard ───────────────────────────────────────────────────────
+
+// ConnectionRateGuard limits the number of NEW incoming connections accepted
+// within a sliding time window.  It protects public indexers against coordinated
+// registration floods (Sybil bursts).
+
+const defaultMaxConnPerWindow = 20 // fallback accept limit per window when cfg.MaxConnPerWindow is not positive
+const defaultConnWindowSecs = 30   // fallback window length in seconds when cfg.ConnWindowSecs is not positive
+
+type ConnectionRateGuard struct {
+	mu          sync.Mutex    // guards window
+	window      []time.Time   // timestamps of accepted connections still inside the window, oldest first
+	maxInWindow int           // maximum number of accepts allowed within windowDur
+	windowDur   time.Duration // length of the sliding window
+}
+
+// newConnectionRateGuard builds a guard from the process configuration, using
+// defaultMaxConnPerWindow / defaultConnWindowSecs for any value that is not positive.
+func newConnectionRateGuard() *ConnectionRateGuard {
+	cfg := conf.GetConfig()
+	limit := CfgOr(cfg.MaxConnPerWindow, defaultMaxConnPerWindow)
+	seconds := CfgOr(cfg.ConnWindowSecs, defaultConnWindowSecs)
+	guard := &ConnectionRateGuard{
+		maxInWindow: limit,
+		windowDur:   time.Duration(seconds) * time.Second,
+	}
+	return guard
+}
+
+// Allow reports whether one more new connection fits in the current window.
+// Timestamps older than windowDur are pruned on every call, which keeps the
+// slice bounded; an accepted call records the current time.
+func (g *ConnectionRateGuard) Allow() bool {
+	g.mu.Lock()
+	defer g.mu.Unlock()
+
+	now := time.Now()
+	oldest := now.Add(-g.windowDur)
+
+	// Entries are appended in time order, so expired ones form a prefix.
+	expired := 0
+	for _, ts := range g.window {
+		if !ts.Before(oldest) {
+			break
+		}
+		expired++
+	}
+	g.window = g.window[expired:]
+
+	if len(g.window) < g.maxInWindow {
+		g.window = append(g.window, now)
+		return true
+	}
+	return false
+}
+
+// CfgOr returns v when it is strictly positive and def otherwise, so that a
+// zero or negative configuration value falls back to its default.
+func CfgOr(v, def int) int {
+	if v <= 0 {
+		return def
+	}
+	return v
+}
@@ -14,11 +14,110 @@ import (
 	"github.com/libp2p/go-libp2p/core/protocol"
 )

+// InitialEventHops is the starting hop count for SWIM membership events.
+// floor(log2(typical max-pool)) + 1 gives O(log n) propagation rounds.
+const InitialEventHops = 4
+
+const maxMemberEventQueue = 50 // queue capacity; at capacity the oldest entry is dropped to admit a new PeerID
+
+// MembershipEventQueue holds SWIM membership events waiting to be piggybacked
+// on outgoing heartbeats (infection-style dissemination). It holds at most
+// maxMemberEventQueue entries and keeps a single entry per PeerID.
+type MembershipEventQueue struct {
+	mu     sync.Mutex    // guards events
+	events []MemberEvent // pending events in insertion order
+}
+
+// memberEventPriority ranks an event type by severity (dead > suspect > alive)
+// so that, for one PeerID at equal incarnation, the more severe event wins.
+// Unknown types rank lowest (0).
+func memberEventPriority(t MemberEventType) int {
+	rank := 0
+	switch t {
+	case MemberAlive:
+		rank = 1
+	case MemberSuspect:
+		rank = 2
+	case MemberDead:
+		rank = 3
+	}
+	return rank
+}
+
+// Add records a membership event, keeping at most one entry per PeerID.
+// When an entry for the same PeerID already exists it is overwritten only if
+// the new event:
+//   - carries a higher Incarnation, OR
+//   - carries the same Incarnation and a higher-severity type;
+// otherwise the new event is discarded. A new PeerID is appended, evicting
+// the oldest entry first when the queue is already at maxMemberEventQueue.
+func (q *MembershipEventQueue) Add(e MemberEvent) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	for idx := range q.events {
+		current := q.events[idx]
+		if current.PeerID != e.PeerID {
+			continue
+		}
+		newer := e.Incarnation > current.Incarnation
+		moreSevere := e.Incarnation == current.Incarnation &&
+			memberEventPriority(e.Type) > memberEventPriority(current.Type)
+		if newer || moreSevere {
+			q.events[idx] = e
+		}
+		return
+	}
+
+	if len(q.events) >= maxMemberEventQueue {
+		q.events = q.events[1:] // drop oldest
+	}
+	q.events = append(q.events, e)
+}
+
+// Drain returns up to max events ready for transmission, oldest first.
+// Each returned event has its HopsLeft decremented; an event whose HopsLeft
+// reaches 0 is still returned this one last time but is removed from the
+// queue (it has propagated enough rounds). Events beyond the first max are
+// left untouched for a later call.
+// A non-positive max returns nil and leaves the queue unchanged.
+func (q *MembershipEventQueue) Drain(max int) []MemberEvent {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	// Guard max <= 0: a negative capacity would make the allocation below panic.
+	if max <= 0 || len(q.events) == 0 {
+		return nil
+	}
+	// Never more than len(q.events) events can be returned, so cap the
+	// allocation by the queue length rather than by the caller's max.
+	out := make([]MemberEvent, 0, min(max, len(q.events)))
+	// In-place filter: kept reuses the backing array; its write index never
+	// overtakes the read index of the loop below.
+	kept := q.events[:0]
+	for _, e := range q.events {
+		if len(out) >= max {
+			kept = append(kept, e)
+			continue
+		}
+		e.HopsLeft--
+		out = append(out, e)
+		if e.HopsLeft > 0 {
+			kept = append(kept, e)
+		}
+	}
+	q.events = kept
+	return out
+}
+
+// NodeEventQueue is the global SWIM event queue for the node side.
+// Events are added on suspect/dead detection and drained into outgoing heartbeats.
+var NodeEventQueue = &MembershipEventQueue{}
+
 const (
 	ProtocolPublish = "/opencloud/record/publish/1.0"
 	ProtocolGet     = "/opencloud/record/get/1.0"
+	ProtocolDelete  = "/opencloud/record/delete/1.0"
+	// ProtocolIndirectProbe is opened by a node toward a live indexer to ask it
+	// to actively probe a suspected indexer on the node's behalf (SWIM indirect ping).
+	// It is the only inter-indexer protocol — indexers do not maintain persistent
+	// connections to each other; this stream is one-shot and short-lived.
+	ProtocolIndirectProbe = "/opencloud/indexer/probe/1.0"
 )

+// IndirectProbeRequest is the JSON request a node writes on a ProtocolIndirectProbe stream.
+// The receiving indexer must attempt to reach Target and report back.
+type IndirectProbeRequest struct {
+	Target pp.AddrInfo `json:"target"` // address info of the suspected indexer to probe
+}
+
+// IndirectProbeResponse is the JSON reply from the probing indexer; the node only reads Reachable when tallying votes.
+type IndirectProbeResponse struct {
+	Reachable bool  `json:"reachable"`            // true when the prober reports the target as reachable
+	LatencyMs int64 `json:"latency_ms,omitempty"` // presumably the probe round-trip in milliseconds — TODO confirm against the handler
+}
+
 const ProtocolHeartbeat = "/opencloud/heartbeat/1.0"

 // ProtocolWitnessQuery is opened by a node to ask a peer what it thinks of a given indexer.