saved
This commit is contained in:
@@ -166,8 +166,6 @@ func SubscribeEvents[T interface{}](s *LongLivedPubSubService,
|
||||
}
|
||||
|
||||
func waitResults[T interface{}](s *LongLivedPubSubService, ctx context.Context, sub *pubsub.Subscription, proto string, timeout int, f func(context.Context, T, string)) {
|
||||
fmt.Println("waitResults", proto)
|
||||
|
||||
defer ctx.Done()
|
||||
for {
|
||||
s.PubsubMu.Lock() // check safely if cache is actually notified subscribed to topic
|
||||
|
||||
@@ -35,6 +35,10 @@ type LongLivedStreamRecordedService[T interface{}] struct {
|
||||
AfterDelete func(pid pp.ID, name string, did string)
|
||||
}
|
||||
|
||||
func (ix *LongLivedStreamRecordedService[T]) MaxNodesConn() int {
|
||||
return ix.maxNodesConn
|
||||
}
|
||||
|
||||
func NewStreamRecordedService[T interface{}](h host.Host, maxNodesConn int) *LongLivedStreamRecordedService[T] {
|
||||
service := &LongLivedStreamRecordedService[T]{
|
||||
LongLivedPubSubService: NewLongLivedPubSubService(h),
|
||||
@@ -160,25 +164,26 @@ func (ix *LongLivedStreamRecordedService[T]) HandleHeartbeat(s network.Stream) {
|
||||
// if record already seen update last seen
|
||||
if rec, ok := streams[*pid]; ok {
|
||||
rec.DID = hb.DID
|
||||
if rec.HeartbeatStream == nil {
|
||||
rec.HeartbeatStream = hb.Stream
|
||||
}
|
||||
// Preserve the existing UptimeTracker so TotalOnline accumulates correctly.
|
||||
// hb.Stream is a fresh Stream with no UptimeTracker; carry the old one over.
|
||||
oldTracker := rec.GetUptimeTracker()
|
||||
rec.HeartbeatStream = hb.Stream
|
||||
if rec.HeartbeatStream.UptimeTracker == nil {
|
||||
rec.HeartbeatStream.UptimeTracker = &UptimeTracker{
|
||||
FirstSeen: time.Now().UTC(),
|
||||
LastSeen: time.Now().UTC(),
|
||||
}
|
||||
if oldTracker != nil {
|
||||
rec.HeartbeatStream.UptimeTracker = oldTracker
|
||||
} else {
|
||||
rec.HeartbeatStream.UptimeTracker = &UptimeTracker{FirstSeen: time.Now().UTC()}
|
||||
}
|
||||
rec.HeartbeatStream.UptimeTracker.RecordHeartbeat()
|
||||
rec.LastScore = hb.Score
|
||||
logger.Info().Msg("A new node is updated : " + pid.String())
|
||||
} else {
|
||||
hb.Stream.UptimeTracker = &UptimeTracker{
|
||||
FirstSeen: time.Now().UTC(),
|
||||
LastSeen: time.Now().UTC(),
|
||||
}
|
||||
tracker := &UptimeTracker{FirstSeen: time.Now().UTC()}
|
||||
tracker.RecordHeartbeat()
|
||||
hb.Stream.UptimeTracker = tracker
|
||||
streams[*pid] = &StreamRecord[T]{
|
||||
DID: hb.DID,
|
||||
HeartbeatStream: hb.Stream,
|
||||
LastScore: hb.Score,
|
||||
}
|
||||
logger.Info().Msg("A new node is subscribed : " + pid.String())
|
||||
}
|
||||
@@ -215,30 +220,33 @@ func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams ma
|
||||
if err := dec.Decode(&hb); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
_, bpms, _ := getBandwidthChallengeRate(h, s.Conn().RemotePeer(), MinPayloadChallenge+int(rand.Float64()*(MaxPayloadChallenge-MinPayloadChallenge)))
|
||||
_, bpms, latencyScore, _ := getBandwidthChallengeRate(h, s.Conn().RemotePeer(), MinPayloadChallenge+int(rand.Float64()*(MaxPayloadChallenge-MinPayloadChallenge)))
|
||||
{
|
||||
pid, err := pp.Decode(hb.PeerID)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
upTime := float64(0)
|
||||
isFirstHeartbeat := true
|
||||
uptimeRatio := float64(0)
|
||||
age := time.Duration(0)
|
||||
lock.Lock()
|
||||
if rec, ok := streams[pid]; ok && rec.GetUptimeTracker() != nil {
|
||||
upTime = rec.GetUptimeTracker().Uptime().Hours() / float64(time.Since(TimeWatcher).Hours())
|
||||
isFirstHeartbeat = false
|
||||
uptimeRatio = rec.GetUptimeTracker().UptimeRatio()
|
||||
age = rec.GetUptimeTracker().Uptime()
|
||||
}
|
||||
lock.Unlock()
|
||||
diversity := getDiversityRate(h, hb.IndexersBinded)
|
||||
hb.ComputeIndexerScore(upTime, bpms, diversity)
|
||||
// First heartbeat: uptime is always 0 so the score ceiling is 60, below the
|
||||
// steady-state threshold of 75. Use a lower admission threshold so new peers
|
||||
// can enter and start accumulating uptime. Subsequent heartbeats must meet
|
||||
// the full threshold once uptime is tracked.
|
||||
minScore := float64(40)
|
||||
if isFirstHeartbeat {
|
||||
minScore = 40
|
||||
// E: measure the indexer's own subnet diversity, not the node's view.
|
||||
diversity := getOwnDiversityRate(h)
|
||||
// fillRate: fraction of indexer capacity used — higher = more peers trust this indexer.
|
||||
fillRate := 0.0
|
||||
if maxNodes > 0 {
|
||||
fillRate = float64(len(h.Network().Peers())) / float64(maxNodes)
|
||||
if fillRate > 1 {
|
||||
fillRate = 1
|
||||
}
|
||||
}
|
||||
hb.ComputeIndexerScore(uptimeRatio, bpms, diversity, latencyScore, fillRate)
|
||||
// B: dynamic minScore — starts at 20% for brand-new peers, ramps to 80% at 24h.
|
||||
minScore := dynamicMinScore(age)
|
||||
if hb.Score < minScore {
|
||||
return nil, nil, errors.New("not enough trusting value")
|
||||
}
|
||||
@@ -247,7 +255,7 @@ func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams ma
|
||||
DID: hb.DID,
|
||||
Stream: s,
|
||||
Expiry: time.Now().UTC().Add(2 * time.Minute),
|
||||
} // here is the long-lived bidirectionnal heart bit.
|
||||
} // here is the long-lived bidirectional heartbeat.
|
||||
return &pid, &hb, err
|
||||
}
|
||||
}
|
||||
@@ -268,7 +276,40 @@ func getDiversityRate(h host.Host, peers []string) float64 {
|
||||
if len(diverse) == 0 || len(peers) == 0 {
|
||||
return 1
|
||||
}
|
||||
return float64(len(diverse) / len(peers))
|
||||
return float64(len(diverse)) / float64(len(peers))
|
||||
}
|
||||
|
||||
// getOwnDiversityRate measures subnet /24 diversity of the indexer's own connected peers.
|
||||
// This evaluates the indexer's network position rather than the connecting node's topology.
|
||||
func getOwnDiversityRate(h host.Host) float64 {
|
||||
diverse := map[string]struct{}{}
|
||||
total := 0
|
||||
for _, pid := range h.Network().Peers() {
|
||||
for _, maddr := range h.Peerstore().Addrs(pid) {
|
||||
total++
|
||||
ip, err := ExtractIP(maddr.String())
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
diverse[ip.Mask(net.CIDRMask(24, 32)).String()] = struct{}{}
|
||||
}
|
||||
}
|
||||
if total == 0 {
|
||||
return 1
|
||||
}
|
||||
return float64(len(diverse)) / float64(total)
|
||||
}
|
||||
|
||||
// dynamicMinScore returns the minimum acceptable score for a peer of the given
// age. The threshold starts at 20 for a brand-new peer and rises linearly to 80
// at 24 hours, where it stays.
//
// A zero or negative age is treated as brand-new. Negative ages are possible
// when the age is derived from wall-clock timestamps and the clock steps
// backwards; without the clamp the threshold would drop below the 20 floor.
func dynamicMinScore(age time.Duration) float64 {
	const (
		floor     = 20.0
		ceiling   = 80.0
		rampHours = 24.0
	)
	if age <= 0 {
		return floor
	}
	score := floor + (ceiling-floor)*(age.Hours()/rampHours)
	if score > ceiling {
		return ceiling
	}
	return score
}
|
||||
|
||||
func checkPeers(h host.Host, peers []string) ([]string, []string) {
|
||||
@@ -295,53 +336,95 @@ const MaxPayloadChallenge = 2048
|
||||
const BaseRoundTrip = 400 * time.Millisecond
|
||||
|
||||
// getBandwidthChallengeRate opens a dedicated ProtocolBandwidthProbe stream to
|
||||
// remotePeer, sends a random payload, reads the echo, and computes throughput.
|
||||
// remotePeer, sends a random payload, reads the echo, and computes throughput
|
||||
// and a latency score. Returns (ok, bpms, latencyScore, error).
|
||||
// latencyScore is 1.0 when RTT is very fast and 0.0 when at or beyond maxRoundTrip.
|
||||
// Using a separate stream avoids mixing binary data on the JSON heartbeat stream
|
||||
// and ensures the echo handler is actually running on the remote side.
|
||||
func getBandwidthChallengeRate(h host.Host, remotePeer pp.ID, payloadSize int) (bool, float64, error) {
|
||||
func getBandwidthChallengeRate(h host.Host, remotePeer pp.ID, payloadSize int) (bool, float64, float64, error) {
|
||||
payload := make([]byte, payloadSize)
|
||||
if _, err := cr.Read(payload); err != nil {
|
||||
return false, 0, err
|
||||
return false, 0, 0, err
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
s, err := h.NewStream(ctx, remotePeer, ProtocolBandwidthProbe)
|
||||
if err != nil {
|
||||
return false, 0, err
|
||||
return false, 0, 0, err
|
||||
}
|
||||
defer s.Reset()
|
||||
s.SetDeadline(time.Now().Add(10 * time.Second))
|
||||
start := time.Now()
|
||||
if _, err = s.Write(payload); err != nil {
|
||||
return false, 0, err
|
||||
return false, 0, 0, err
|
||||
}
|
||||
s.CloseWrite()
|
||||
// Half-close the write side so the handler's io.Copy sees EOF and stops.
|
||||
// Read the echo.
|
||||
response := make([]byte, payloadSize)
|
||||
if _, err = io.ReadFull(s, response); err != nil {
|
||||
return false, 0, err
|
||||
return false, 0, 0, err
|
||||
}
|
||||
|
||||
duration := time.Since(start)
|
||||
maxRoundTrip := BaseRoundTrip + (time.Duration(payloadSize) * (100 * time.Millisecond))
|
||||
mbps := float64(payloadSize*8) / duration.Seconds() / 1e6
|
||||
if duration > maxRoundTrip || mbps < 5.0 {
|
||||
return false, float64(mbps / MaxExpectedMbps), nil
|
||||
|
||||
// latencyScore: 1.0 = instant, 0.0 = at maxRoundTrip or beyond.
|
||||
latencyScore := 1.0 - float64(duration)/float64(maxRoundTrip)
|
||||
if latencyScore < 0 {
|
||||
latencyScore = 0
|
||||
}
|
||||
return true, float64(mbps / MaxExpectedMbps), nil
|
||||
if latencyScore > 1 {
|
||||
latencyScore = 1
|
||||
}
|
||||
|
||||
if duration > maxRoundTrip || mbps < 5.0 {
|
||||
return false, float64(mbps / MaxExpectedMbps), latencyScore, nil
|
||||
}
|
||||
return true, float64(mbps / MaxExpectedMbps), latencyScore, nil
|
||||
}
|
||||
|
||||
// UptimeTracker holds the timestamps and accumulated online duration used to
// judge how long and how continuously a peer has been seen.
type UptimeTracker struct {
	// FirstSeen is the time the peer was first observed.
	FirstSeen time.Time
	// LastSeen is the time of the most recent recorded heartbeat.
	LastSeen time.Time
	// TotalOnline is the accumulated online time.
	TotalOnline time.Duration
}
|
||||
|
||||
// RecordHeartbeat accumulates online time gap-aware: only counts the interval if
|
||||
// the gap since the last heartbeat is within 2× the recommended interval (i.e. no
|
||||
// extended outage). Call this each time a heartbeat is successfully processed.
|
||||
func (u *UptimeTracker) RecordHeartbeat() {
|
||||
now := time.Now().UTC()
|
||||
if !u.LastSeen.IsZero() {
|
||||
gap := now.Sub(u.LastSeen)
|
||||
if gap <= 2*RecommendedHeartbeatInterval {
|
||||
u.TotalOnline += gap
|
||||
}
|
||||
}
|
||||
u.LastSeen = now
|
||||
}
|
||||
|
||||
func (u *UptimeTracker) Uptime() time.Duration {
|
||||
return time.Since(u.FirstSeen)
|
||||
}
|
||||
|
||||
// UptimeRatio returns the fraction of tracked lifetime during which the peer was
|
||||
// continuously online (gap ≤ 2×RecommendedHeartbeatInterval). Returns 0 before
|
||||
// the first heartbeat interval has elapsed.
|
||||
func (u *UptimeTracker) UptimeRatio() float64 {
|
||||
total := time.Since(u.FirstSeen)
|
||||
if total <= 0 {
|
||||
return 0
|
||||
}
|
||||
ratio := float64(u.TotalOnline) / float64(total)
|
||||
if ratio > 1 {
|
||||
ratio = 1
|
||||
}
|
||||
return ratio
|
||||
}
|
||||
|
||||
func (u *UptimeTracker) IsEligible(min time.Duration) bool {
|
||||
return u.Uptime() >= min
|
||||
}
|
||||
@@ -350,6 +433,7 @@ type StreamRecord[T interface{}] struct {
|
||||
DID string
|
||||
HeartbeatStream *Stream
|
||||
Record T
|
||||
LastScore float64
|
||||
}
|
||||
|
||||
func (s *StreamRecord[T]) GetUptimeTracker() *UptimeTracker {
|
||||
@@ -426,7 +510,24 @@ const (
|
||||
|
||||
var TimeWatcher time.Time
|
||||
|
||||
// IndexerRecord holds admission metadata for an indexer in the pool.
type IndexerRecord struct {
	// AdmittedAt is zero for seed entries (IndexerAddresses) never validated by
	// a native. It is set to the admission time when a native confirms the
	// indexer via consensus.
	AdmittedAt time.Time
}
|
||||
|
||||
// IsStableVoter returns true when this indexer has been admitted by a native
|
||||
// long enough ago to participate as a voter in Phase 2 liveness voting.
|
||||
func (r *IndexerRecord) IsStableVoter() bool {
|
||||
return !r.AdmittedAt.IsZero() && time.Since(r.AdmittedAt) >= MinStableAge
|
||||
}
|
||||
|
||||
var StaticIndexers map[string]*pp.AddrInfo = map[string]*pp.AddrInfo{}
|
||||
|
||||
// StaticIndexerMeta mirrors StaticIndexers with admission metadata.
|
||||
// Both maps are always updated together under StreamMuIndexes.
|
||||
var StaticIndexerMeta map[string]*IndexerRecord = map[string]*IndexerRecord{}
|
||||
var StreamMuIndexes sync.RWMutex
|
||||
var StreamIndexers ProtocolStream = ProtocolStream{}
|
||||
|
||||
@@ -462,28 +563,64 @@ func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID,
|
||||
return nil
|
||||
}
|
||||
|
||||
// No native configured: bootstrap from IndexerAddresses seed set.
|
||||
addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
|
||||
|
||||
if len(addresses) > maxIndexer {
|
||||
addresses = addresses[0:maxIndexer]
|
||||
}
|
||||
|
||||
StreamMuIndexes.Lock()
|
||||
for _, indexerAddr := range addresses {
|
||||
indexerAddr = strings.TrimSpace(indexerAddr)
|
||||
if indexerAddr == "" {
|
||||
continue
|
||||
}
|
||||
ad, err := pp.AddrInfoFromString(indexerAddr)
|
||||
if err != nil {
|
||||
logger.Err(err)
|
||||
continue
|
||||
}
|
||||
// AdmittedAt zero = seed, not yet validated by a native.
|
||||
StaticIndexers[indexerAddr] = ad
|
||||
StaticIndexerMeta[indexerAddr] = &IndexerRecord{}
|
||||
}
|
||||
indexerCount := len(StaticIndexers)
|
||||
StreamMuIndexes.Unlock()
|
||||
|
||||
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name, h, StreamIndexers, StaticIndexers, &StreamMuIndexes, 20*time.Second, recordFn...) // your indexer is just like a node for the next indexer.
|
||||
if indexerCount < minIndexer {
|
||||
return errors.New("you run a node without indexers... your gonna be isolated.")
|
||||
}
|
||||
|
||||
// Start long-lived heartbeat to seed indexers. The single goroutine follows
|
||||
// all subsequent StaticIndexers changes (including after native discovery).
|
||||
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name,
|
||||
h, StreamIndexers, StaticIndexers, &StreamMuIndexes, 20*time.Second, recordFn...)
|
||||
|
||||
// Async: ask seed indexers whether they know a native — same logic as
|
||||
// replenishNativesFromPeers. Runs after a short delay to let h.Connect warm up.
|
||||
go func() {
|
||||
time.Sleep(2 * time.Second)
|
||||
logger.Info().Msg("[startup] no native configured — asking seed indexers for native addresses")
|
||||
newAddr := fetchNativeFromIndexers(h, nil)
|
||||
if newAddr == "" {
|
||||
logger.Info().Msg("[startup] no native found from seed indexers — pure indexer mode")
|
||||
return
|
||||
}
|
||||
ad, err := pp.AddrInfoFromString(newAddr)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
logger.Info().Str("addr", newAddr).Msg("[startup] native discovered via seed indexers — bootstrapping")
|
||||
StreamNativeMu.Lock()
|
||||
StaticNatives[newAddr] = ad
|
||||
StreamNativeMu.Unlock()
|
||||
// Full native bootstrap: fetch pool, run consensus, replace StaticIndexers
|
||||
// with properly admitted records (AdmittedAt set).
|
||||
if err := ConnectToNatives(h, minIndexer, maxIndexer, myPID); err != nil {
|
||||
logger.Warn().Err(err).Msg("[startup] native bootstrap failed after discovery")
|
||||
}
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -536,10 +673,19 @@ type Heartbeat struct {
|
||||
Record json.RawMessage `json:"record,omitempty"`
|
||||
}
|
||||
|
||||
func (hb *Heartbeat) ComputeIndexerScore(uptimeHours float64, bpms float64, diversity float64) {
|
||||
hb.Score = ((0.3 * uptimeHours) +
|
||||
(0.3 * bpms) +
|
||||
(0.4 * diversity)) * 100
|
||||
// ComputeIndexerScore computes a composite quality score [0, 100] for the connecting peer.
|
||||
// - uptimeRatio: fraction of tracked lifetime online (gap-aware) — peer reliability
|
||||
// - bpms: bandwidth normalized to MaxExpectedMbps — link capacity
|
||||
// - diversity: indexer's own /24 subnet diversity — network topology quality
|
||||
// - latencyScore: 1 - RTT/maxRoundTrip — link responsiveness
|
||||
// - fillRate: fraction of indexer slots used (0=empty, 1=full) — collective trust signal:
|
||||
// a fuller indexer has been chosen and retained by many peers, which is evidence of quality.
|
||||
func (hb *Heartbeat) ComputeIndexerScore(uptimeRatio float64, bpms float64, diversity float64, latencyScore float64, fillRate float64) {
|
||||
hb.Score = ((0.20 * uptimeRatio) +
|
||||
(0.20 * bpms) +
|
||||
(0.20 * diversity) +
|
||||
(0.15 * latencyScore) +
|
||||
(0.25 * fillRate)) * 100
|
||||
}
|
||||
|
||||
type HeartbeatInfo []struct {
|
||||
@@ -616,6 +762,9 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
|
||||
for _, ix := range snapshot {
|
||||
wasConnected := h.Network().Connectedness(ix.ID) == network.Connected
|
||||
StreamNativeMu.RLock()
|
||||
hasNative := len(StaticNatives) > 0
|
||||
StreamNativeMu.RUnlock()
|
||||
if err := sendHeartbeat(ctx, h, proto, ix, hb, ps, interval*time.Second); err != nil {
|
||||
// Step 3: heartbeat failed — remove from pool and trigger replenish.
|
||||
logger.Info().Str("peer", ix.ID.String()).Str("proto", string(proto)).Msg("[native] step 3 — heartbeat failed, removing peer from pool")
|
||||
@@ -639,6 +788,9 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
if ad.ID == ix.ID {
|
||||
lostAddr = addr
|
||||
delete(peers, addr)
|
||||
if isIndexerHB {
|
||||
delete(StaticIndexerMeta, addr)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
@@ -650,7 +802,8 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
logger.Info().Int("remaining", remaining).Int("min", conf.GetConfig().MinIndexer).Int("need", need).Msg("[native] step 3 — pool state after removal")
|
||||
|
||||
// Step 4: ask the native for the missing indexer count.
|
||||
if isIndexerHB && conf.GetConfig().NativeIndexerAddresses != "" {
|
||||
// hasNative computed above (used in both err and success branches).
|
||||
if isIndexerHB && hasNative {
|
||||
if need < 1 {
|
||||
need = 1
|
||||
}
|
||||
@@ -663,7 +816,7 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
// from StaticIndexers immediately without waiting for the indexer HB tick.
|
||||
if isNativeHB {
|
||||
logger.Info().Str("addr", lostAddr).Msg("[native] step 3 — native heartbeat failed, triggering native replenish")
|
||||
if lostAddr != "" && conf.GetConfig().NativeIndexerAddresses != "" {
|
||||
if lostAddr != "" && hasNative {
|
||||
StreamMuIndexes.Lock()
|
||||
if _, wasIndexer := StaticIndexers[lostAddr]; wasIndexer {
|
||||
delete(StaticIndexers, lostAddr)
|
||||
@@ -695,7 +848,7 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
// blank state (responsiblePeers empty). Evict it from StaticIndexers and
|
||||
// re-request an assignment so the native re-tracks us properly and
|
||||
// runOffloadLoop can eventually migrate us to real indexers.
|
||||
if !wasConnected && isIndexerHB && conf.GetConfig().NativeIndexerAddresses != "" {
|
||||
if !wasConnected && isIndexerHB && hasNative {
|
||||
StreamNativeMu.RLock()
|
||||
isNativeIndexer := false
|
||||
for _, ad := range StaticNatives {
|
||||
|
||||
@@ -35,8 +35,27 @@ const (
|
||||
consensusQueryTimeout = 3 * time.Second
|
||||
// consensusCollectTimeout is the total wait for all native responses.
|
||||
consensusCollectTimeout = 4 * time.Second
|
||||
|
||||
// ProtocolIndexerConsensus is the Phase 2 liveness-voting protocol.
|
||||
// Each stable indexer is asked which candidates it considers reachable.
|
||||
ProtocolIndexerConsensus = "/opencloud/indexer/consensus/1.0"
|
||||
|
||||
// MinStableAge is the minimum time since native admission before an indexer
|
||||
// may participate as a voter in Phase 2 liveness voting.
|
||||
MinStableAge = 2 * time.Minute
|
||||
)
|
||||
|
||||
// IndexerConsensusRequest is sent to stable indexers during Phase 2 liveness
// voting. Each voter replies with which candidates from the list it can
// currently reach.
type IndexerConsensusRequest struct {
	// Candidates lists the candidate indexer addresses to be checked.
	Candidates []string `json:"candidates"`
}
|
||||
|
||||
// IndexerConsensusResponse is the reply from a Phase 2 voter.
type IndexerConsensusResponse struct {
	// Alive lists the candidate addresses the voter reports as alive.
	Alive []string `json:"alive"`
}
|
||||
|
||||
// ConsensusRequest is sent by a node/indexer to a native to validate a candidate
|
||||
// indexer list. The native replies with what it trusts and what it suggests instead.
|
||||
type ConsensusRequest struct {
|
||||
@@ -56,11 +75,12 @@ type ConsensusResponse struct {
|
||||
// Timestamp + PubKey + Signature allow the native and DHT to verify that the
|
||||
// registration was produced by the peer that owns the declared PeerID.
|
||||
type IndexerRegistration struct {
	PeerID    string  `json:"peer_id,omitempty"`
	Addr      string  `json:"addr"`
	Timestamp int64   `json:"ts,omitempty"`        // Unix nanoseconds (anti-replay)
	PubKey    []byte  `json:"pub_key,omitempty"`   // marshaled libp2p public key
	Signature []byte  `json:"sig,omitempty"`       // Sign(signaturePayload())
	FillRate  float64 `json:"fill_rate,omitempty"` // connected_nodes / max_nodes (0=empty, 1=full)
}
|
||||
|
||||
// SignaturePayload returns the canonical byte slice that is signed/verified.
|
||||
@@ -106,9 +126,12 @@ type GetIndexersRequest struct {
|
||||
}
|
||||
|
||||
// GetIndexersResponse is returned by the native with live indexer multiaddrs.
// FillRates maps each indexer address to its last reported fill rate (0=empty, 1=full).
// Nodes use fill rates to prefer indexers with available capacity.
type GetIndexersResponse struct {
	Indexers       []string           `json:"indexers"`
	IsSelfFallback bool               `json:"is_self_fallback,omitempty"`
	FillRates      map[string]float64 `json:"fill_rates,omitempty"`
}
|
||||
|
||||
var StaticNatives = map[string]*pp.AddrInfo{}
|
||||
@@ -177,8 +200,8 @@ func ConnectToNatives(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID)
|
||||
logger.Info().Int("candidates", len(candidates)).Bool("fallback", isFallback).Msg("[native] step 1 — pool received")
|
||||
|
||||
// Step 2: populate StaticIndexers — consensus for real indexers, direct for fallback.
|
||||
pool := resolvePool(h, candidates, isFallback, maxIndexer)
|
||||
replaceStaticIndexers(pool)
|
||||
pool, admittedAt := resolvePool(h, candidates, isFallback, maxIndexer)
|
||||
replaceStaticIndexers(pool, admittedAt)
|
||||
|
||||
StreamMuIndexes.RLock()
|
||||
indexerCount := len(StaticIndexers)
|
||||
@@ -216,7 +239,7 @@ func replenishIndexersFromNative(h host.Host, need int) {
|
||||
}
|
||||
logger.Info().Int("candidates", len(candidates)).Bool("fallback", isFallback).Msg("[native] step 4 — candidates received")
|
||||
|
||||
pool := resolvePool(h, candidates, isFallback, need)
|
||||
pool, admittedAt := resolvePool(h, candidates, isFallback, need)
|
||||
if len(pool) == 0 {
|
||||
logger.Warn().Msg("[native] step 4 — consensus yielded no confirmed indexers")
|
||||
return
|
||||
@@ -226,9 +249,11 @@ func replenishIndexersFromNative(h host.Host, need int) {
|
||||
StreamMuIndexes.Lock()
|
||||
for addr, ad := range pool {
|
||||
StaticIndexers[addr] = ad
|
||||
if StaticIndexerMeta[addr] == nil {
|
||||
StaticIndexerMeta[addr] = &IndexerRecord{AdmittedAt: admittedAt}
|
||||
}
|
||||
}
|
||||
total := len(StaticIndexers)
|
||||
|
||||
StreamMuIndexes.Unlock()
|
||||
logger.Info().Int("added", len(pool)).Int("total", total).Msg("[native] step 4 — pool replenished")
|
||||
|
||||
@@ -335,9 +360,9 @@ collect:
|
||||
}
|
||||
|
||||
// resolvePool converts a candidate list to a validated addr→AddrInfo map.
|
||||
// When isFallback is true the native itself is the indexer — no consensus needed.
|
||||
// When isFallback is false, consensus is run before accepting the candidates.
|
||||
func resolvePool(h host.Host, candidates []string, isFallback bool, maxIndexer int) map[string]*pp.AddrInfo {
|
||||
// When isFallback is true the native itself is the indexer — no Phase 1 consensus needed.
|
||||
// Returns the pool and the admission timestamp (zero for fallback/seed entries).
|
||||
func resolvePool(h host.Host, candidates []string, isFallback bool, maxIndexer int) (map[string]*pp.AddrInfo, time.Time) {
|
||||
logger := oclib.GetLogger()
|
||||
if isFallback {
|
||||
logger.Info().Strs("addrs", candidates).Msg("[native] resolve — fallback mode, skipping consensus")
|
||||
@@ -349,9 +374,10 @@ func resolvePool(h host.Host, candidates []string, isFallback bool, maxIndexer i
|
||||
}
|
||||
pool[addr] = ad
|
||||
}
|
||||
return pool
|
||||
return pool, time.Time{}
|
||||
}
|
||||
|
||||
// Phase 1 — native admission.
|
||||
// Round 1.
|
||||
logger.Info().Int("candidates", len(candidates)).Msg("[native] resolve — consensus round 1")
|
||||
confirmed, suggestions := clientSideConsensus(h, candidates)
|
||||
@@ -372,6 +398,7 @@ func resolvePool(h host.Host, candidates []string, isFallback bool, maxIndexer i
|
||||
logger.Info().Int("confirmed", len(confirmed)).Msg("[native] resolve — consensus round 2 done")
|
||||
}
|
||||
|
||||
admittedAt := time.Now().UTC()
|
||||
pool := make(map[string]*pp.AddrInfo, len(confirmed))
|
||||
for _, addr := range confirmed {
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
@@ -380,18 +407,130 @@ func resolvePool(h host.Host, candidates []string, isFallback bool, maxIndexer i
|
||||
}
|
||||
pool[addr] = ad
|
||||
}
|
||||
logger.Info().Int("pool_size", len(pool)).Msg("[native] resolve — pool ready")
|
||||
return pool
|
||||
|
||||
// Phase 2 — indexer liveness vote.
|
||||
logger.Info().Int("pool_size", len(pool)).Msg("[native] resolve — Phase 1 done, running Phase 2 liveness vote")
|
||||
pool = indexerLivenessVote(h, pool)
|
||||
logger.Info().Int("pool_size", len(pool)).Msg("[native] resolve — Phase 2 done, pool ready")
|
||||
return pool, admittedAt
|
||||
}
|
||||
|
||||
// indexerLivenessVote runs Phase 2 of the hybrid consensus: it queries every
|
||||
// stable indexer in StaticIndexers (AdmittedAt non-zero, age >= MinStableAge)
|
||||
// for their view of the candidate list and returns only the candidates confirmed
|
||||
// by quorum. When no stable voter exists the full admitted set is returned
|
||||
// unchanged — this is correct on first boot before any indexer is old enough.
|
||||
func indexerLivenessVote(h host.Host, admitted map[string]*pp.AddrInfo) map[string]*pp.AddrInfo {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
StreamMuIndexes.RLock()
|
||||
voters := make([]*pp.AddrInfo, 0, len(StaticIndexers))
|
||||
for addr, ad := range StaticIndexers {
|
||||
if meta, ok := StaticIndexerMeta[addr]; ok && meta.IsStableVoter() {
|
||||
voters = append(voters, ad)
|
||||
}
|
||||
}
|
||||
StreamMuIndexes.RUnlock()
|
||||
|
||||
if len(voters) == 0 {
|
||||
logger.Info().Msg("[phase2] no stable voters yet — trusting Phase 1 result")
|
||||
return admitted
|
||||
}
|
||||
|
||||
candidates := make([]string, 0, len(admitted))
|
||||
for addr := range admitted {
|
||||
candidates = append(candidates, addr)
|
||||
}
|
||||
|
||||
type result struct {
|
||||
alive map[string]struct{}
|
||||
ok bool
|
||||
}
|
||||
ch := make(chan result, len(voters))
|
||||
|
||||
for _, voter := range voters {
|
||||
go func(v *pp.AddrInfo) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), consensusQueryTimeout)
|
||||
defer cancel()
|
||||
if err := h.Connect(ctx, *v); err != nil {
|
||||
ch <- result{}
|
||||
return
|
||||
}
|
||||
s, err := h.NewStream(ctx, v.ID, ProtocolIndexerConsensus)
|
||||
if err != nil {
|
||||
ch <- result{}
|
||||
return
|
||||
}
|
||||
s.SetDeadline(time.Now().Add(consensusQueryTimeout))
|
||||
defer s.Close()
|
||||
if err := json.NewEncoder(s).Encode(IndexerConsensusRequest{Candidates: candidates}); err != nil {
|
||||
ch <- result{}
|
||||
return
|
||||
}
|
||||
var resp IndexerConsensusResponse
|
||||
if err := json.NewDecoder(s).Decode(&resp); err != nil {
|
||||
ch <- result{}
|
||||
return
|
||||
}
|
||||
alive := make(map[string]struct{}, len(resp.Alive))
|
||||
for _, a := range resp.Alive {
|
||||
alive[a] = struct{}{}
|
||||
}
|
||||
ch <- result{alive: alive, ok: true}
|
||||
}(voter)
|
||||
}
|
||||
|
||||
timer := time.NewTimer(consensusCollectTimeout)
|
||||
defer timer.Stop()
|
||||
|
||||
aliveCounts := map[string]int{}
|
||||
total, collected := 0, 0
|
||||
collect:
|
||||
for collected < len(voters) {
|
||||
select {
|
||||
case r := <-ch:
|
||||
collected++
|
||||
if !r.ok {
|
||||
continue
|
||||
}
|
||||
total++
|
||||
for addr := range r.alive {
|
||||
aliveCounts[addr]++
|
||||
}
|
||||
case <-timer.C:
|
||||
break collect
|
||||
}
|
||||
}
|
||||
|
||||
if total == 0 {
|
||||
logger.Info().Msg("[phase2] no voter responded — trusting Phase 1 result")
|
||||
return admitted
|
||||
}
|
||||
|
||||
quorum := conf.GetConfig().ConsensusQuorum
|
||||
if quorum <= 0 {
|
||||
quorum = 0.5
|
||||
}
|
||||
confirmed := make(map[string]*pp.AddrInfo, len(admitted))
|
||||
for addr, ad := range admitted {
|
||||
if float64(aliveCounts[addr]) > float64(total)*quorum {
|
||||
confirmed[addr] = ad
|
||||
}
|
||||
}
|
||||
logger.Info().Int("admitted", len(admitted)).Int("confirmed", len(confirmed)).Int("voters", total).Msg("[phase2] liveness vote complete")
|
||||
return confirmed
|
||||
}
|
||||
|
||||
// replaceStaticIndexers atomically replaces the active indexer pool.
|
||||
// Peers no longer in next have their heartbeat streams closed so the SendHeartbeat
|
||||
// goroutine stops sending to them on the next tick.
|
||||
func replaceStaticIndexers(next map[string]*pp.AddrInfo) {
|
||||
// admittedAt is the time of native admission (zero for fallback/seed entries).
|
||||
func replaceStaticIndexers(next map[string]*pp.AddrInfo, admittedAt time.Time) {
|
||||
StreamMuIndexes.Lock()
|
||||
defer StreamMuIndexes.Unlock()
|
||||
for addr, ad := range next {
|
||||
StaticIndexers[addr] = ad
|
||||
if StaticIndexerMeta[addr] == nil {
|
||||
StaticIndexerMeta[addr] = &IndexerRecord{AdmittedAt: admittedAt}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -508,8 +647,10 @@ collect:
|
||||
}
|
||||
|
||||
// RegisterWithNative sends a one-shot registration to each configured native indexer.
|
||||
// fillRateFn, when non-nil, is called to obtain the current fill rate (0=empty, 1=full)
|
||||
// which the native uses to route new nodes toward less-loaded indexers.
|
||||
// Should be called periodically every RecommendedHeartbeatInterval.
|
||||
func RegisterWithNative(h host.Host, nativeAddressesStr string) {
|
||||
func RegisterWithNative(h host.Host, nativeAddressesStr string, fillRateFn func() float64) {
|
||||
logger := oclib.GetLogger()
|
||||
myAddr := ""
|
||||
if !strings.Contains(h.Addrs()[len(h.Addrs())-1].String(), "127.0.0.1") {
|
||||
@@ -524,6 +665,9 @@ func RegisterWithNative(h host.Host, nativeAddressesStr string) {
|
||||
Addr: myAddr,
|
||||
Timestamp: time.Now().UnixNano(),
|
||||
}
|
||||
if fillRateFn != nil {
|
||||
reg.FillRate = fillRateFn()
|
||||
}
|
||||
reg.Sign(h)
|
||||
for _, addr := range strings.Split(nativeAddressesStr, ",") {
|
||||
addr = strings.TrimSpace(addr)
|
||||
@@ -619,7 +763,10 @@ func EnsureNativePeers(h host.Host) {
|
||||
})
|
||||
}
|
||||
|
||||
func StartNativeRegistration(h host.Host, nativeAddressesStr string) {
|
||||
// StartNativeRegistration starts a goroutine that periodically registers this
|
||||
// indexer with all configured native indexers (every RecommendedHeartbeatInterval).
|
||||
// fillRateFn is called on each registration tick to report current capacity usage.
|
||||
func StartNativeRegistration(h host.Host, nativeAddressesStr string, fillRateFn func() float64) {
|
||||
go func() {
|
||||
// Poll until a routable (non-loopback) address is available before the first
|
||||
// registration attempt. libp2p may not have discovered external addresses yet
|
||||
@@ -636,11 +783,11 @@ func StartNativeRegistration(h host.Host, nativeAddressesStr string) {
|
||||
}
|
||||
time.Sleep(5 * time.Second)
|
||||
}
|
||||
RegisterWithNative(h, nativeAddressesStr)
|
||||
RegisterWithNative(h, nativeAddressesStr, fillRateFn)
|
||||
t := time.NewTicker(RecommendedHeartbeatInterval)
|
||||
defer t.Stop()
|
||||
for range t.C {
|
||||
RegisterWithNative(h, nativeAddressesStr)
|
||||
RegisterWithNative(h, nativeAddressesStr, fillRateFn)
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -917,7 +1064,7 @@ func retryLostNative(ctx context.Context, h host.Host, addr string, nativeProto
|
||||
NudgeNativeHeartbeat()
|
||||
replenishIndexersIfNeeded(h)
|
||||
if nativeProto == ProtocolNativeGetIndexers {
|
||||
StartNativeRegistration(h, addr) // register back
|
||||
StartNativeRegistration(h, addr, nil) // register back (fill rate unknown in this context)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
@@ -178,6 +178,43 @@ func (ix *IndexerService) initNodeHandler() {
|
||||
ix.Host.SetStreamHandler(common.ProtocolPublish, ix.handleNodePublish)
|
||||
ix.Host.SetStreamHandler(common.ProtocolGet, ix.handleNodeGet)
|
||||
ix.Host.SetStreamHandler(common.ProtocolIndexerGetNatives, ix.handleGetNatives)
|
||||
ix.Host.SetStreamHandler(common.ProtocolIndexerConsensus, ix.handleIndexerConsensus)
|
||||
}
|
||||
|
||||
// handleIndexerConsensus implements Phase 2 liveness voting (ProtocolIndexerConsensus).
|
||||
// The caller sends a list of candidate multiaddrs; this indexer replies with the
|
||||
// subset it considers currently alive (recent heartbeat in StreamRecords).
|
||||
func (ix *IndexerService) handleIndexerConsensus(stream network.Stream) {
|
||||
defer stream.Reset()
|
||||
|
||||
var req common.IndexerConsensusRequest
|
||||
if err := json.NewDecoder(stream).Decode(&req); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
ix.StreamMU.RLock()
|
||||
streams := ix.StreamRecords[common.ProtocolHeartbeat]
|
||||
ix.StreamMU.RUnlock()
|
||||
|
||||
alive := make([]string, 0, len(req.Candidates))
|
||||
for _, addr := range req.Candidates {
|
||||
ad, err := peer.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
ix.StreamMU.RLock()
|
||||
rec, ok := streams[ad.ID]
|
||||
ix.StreamMU.RUnlock()
|
||||
if !ok || rec.HeartbeatStream == nil || rec.HeartbeatStream.UptimeTracker == nil {
|
||||
continue
|
||||
}
|
||||
// D: consider alive only if recent heartbeat AND score above minimum quality bar.
|
||||
if time.Since(rec.HeartbeatStream.UptimeTracker.LastSeen) <= 2*common.RecommendedHeartbeatInterval &&
|
||||
rec.LastScore >= 30.0 {
|
||||
alive = append(alive, addr)
|
||||
}
|
||||
}
|
||||
json.NewEncoder(stream).Encode(common.IndexerConsensusResponse{Alive: alive})
|
||||
}
|
||||
|
||||
func (ix *IndexerService) handleNodePublish(s network.Stream) {
|
||||
|
||||
@@ -40,6 +40,7 @@ const (
|
||||
// liveIndexerEntry tracks a registered indexer in the native's in-memory cache and DHT.
|
||||
// PubKey and Signature are forwarded from the IndexerRegistration so the DHT validator
|
||||
// can verify that the entry was produced by the peer owning the declared PeerID.
|
||||
// FillRate is the fraction of capacity used (0=empty, 1=full) at last registration.
|
||||
type liveIndexerEntry struct {
|
||||
PeerID string `json:"peer_id"`
|
||||
Addr string `json:"addr"`
|
||||
@@ -47,6 +48,7 @@ type liveIndexerEntry struct {
|
||||
RegTimestamp int64 `json:"reg_ts,omitempty"` // Timestamp from the original IndexerRegistration
|
||||
PubKey []byte `json:"pub_key,omitempty"`
|
||||
Signature []byte `json:"sig,omitempty"`
|
||||
FillRate float64 `json:"fill_rate,omitempty"`
|
||||
}
|
||||
|
||||
// NativeState holds runtime state specific to native indexer operation.
|
||||
@@ -265,6 +267,7 @@ func (ix *IndexerService) handleNativeSubscription(s network.Stream) {
|
||||
RegTimestamp: reg.Timestamp,
|
||||
PubKey: reg.PubKey,
|
||||
Signature: reg.Signature,
|
||||
FillRate: reg.FillRate,
|
||||
}
|
||||
|
||||
// Verify that the declared address is actually reachable before admitting
|
||||
@@ -428,11 +431,40 @@ func (ix *IndexerService) handleNativeGetIndexers(s network.Stream) {
|
||||
"native: fallback pool saturated, refusing self-delegation")
|
||||
}
|
||||
} else {
|
||||
rand.Shuffle(len(reachable), func(i, j int) { reachable[i], reachable[j] = reachable[j], reachable[i] })
|
||||
// Collect the last reported fill rate of each reachable indexer (those without a live entry are left out of the map).
|
||||
ix.Native.liveIndexersMu.RLock()
|
||||
fillRates := make(map[string]float64, len(reachable))
|
||||
for _, addr := range reachable {
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, e := range ix.Native.liveIndexers {
|
||||
if e.PeerID == ad.ID.String() {
|
||||
fillRates[addr] = e.FillRate
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
ix.Native.liveIndexersMu.RUnlock()
|
||||
|
||||
// Sort by routing weight descending: weight = fillRate × (1 − fillRate).
|
||||
// This prefers indexers in the "trust sweet spot" — proven popular (fillRate > 0)
|
||||
// but not saturated (fillRate < 1). Peak at fillRate ≈ 0.5.
|
||||
routingWeight := func(addr string) float64 {
|
||||
f := fillRates[addr]
|
||||
return f * (1 - f)
|
||||
}
|
||||
for i := 1; i < len(reachable); i++ {
|
||||
for j := i; j > 0 && routingWeight(reachable[j]) > routingWeight(reachable[j-1]); j-- {
|
||||
reachable[j], reachable[j-1] = reachable[j-1], reachable[j]
|
||||
}
|
||||
}
|
||||
if req.Count > len(reachable) {
|
||||
req.Count = len(reachable)
|
||||
}
|
||||
resp.Indexers = reachable[:req.Count]
|
||||
resp.FillRates = fillRates
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(s).Encode(resp); err != nil {
|
||||
|
||||
@@ -96,9 +96,24 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
|
||||
ix.InitNative()
|
||||
} else {
|
||||
ix.initNodeHandler()
|
||||
// Register with configured natives so this indexer appears in their cache
|
||||
// Register with configured natives so this indexer appears in their cache.
|
||||
// Pass a fill rate provider so the native can route new nodes to less-loaded indexers.
|
||||
if nativeAddrs := conf.GetConfig().NativeIndexerAddresses; nativeAddrs != "" {
|
||||
common.StartNativeRegistration(ix.Host, nativeAddrs)
|
||||
fillRateFn := func() float64 {
|
||||
ix.StreamMU.RLock()
|
||||
n := len(ix.StreamRecords[common.ProtocolHeartbeat])
|
||||
ix.StreamMU.RUnlock()
|
||||
maxN := ix.MaxNodesConn()
|
||||
if maxN <= 0 {
|
||||
return 0
|
||||
}
|
||||
rate := float64(n) / float64(maxN)
|
||||
if rate > 1 {
|
||||
rate = 1
|
||||
}
|
||||
return rate
|
||||
}
|
||||
common.StartNativeRegistration(ix.Host, nativeAddrs, fillRateFn)
|
||||
}
|
||||
}
|
||||
return ix
|
||||
|
||||
@@ -123,7 +123,6 @@ func InitNode(isNode bool, isIndexer bool, isNativeIndexer bool) (*Node, error)
|
||||
m := map[string]interface{}{}
|
||||
err := json.Unmarshal(evt.Payload, &m)
|
||||
if err != nil || evt.From == node.PeerID.String() {
|
||||
fmt.Println(evt.From, node.PeerID.String(), err)
|
||||
return
|
||||
}
|
||||
if p, err := node.GetPeerRecord(ctx, evt.From, false); err == nil && len(p) > 0 && m["search"] != nil {
|
||||
|
||||
Reference in New Issue
Block a user