This commit is contained in:
mr
2026-03-09 14:57:41 +01:00
parent 3751ec554d
commit 83cef6e6f6
47 changed files with 2704 additions and 1034 deletions

View File

@@ -35,6 +35,10 @@ type LongLivedStreamRecordedService[T interface{}] struct {
AfterDelete func(pid pp.ID, name string, did string)
}
// MaxNodesConn reports the maximum number of node connections this
// service is configured to accept.
func (ix *LongLivedStreamRecordedService[T]) MaxNodesConn() int {
	limit := ix.maxNodesConn
	return limit
}
func NewStreamRecordedService[T interface{}](h host.Host, maxNodesConn int) *LongLivedStreamRecordedService[T] {
service := &LongLivedStreamRecordedService[T]{
LongLivedPubSubService: NewLongLivedPubSubService(h),
@@ -160,25 +164,26 @@ func (ix *LongLivedStreamRecordedService[T]) HandleHeartbeat(s network.Stream) {
// if record already seen update last seen
if rec, ok := streams[*pid]; ok {
rec.DID = hb.DID
if rec.HeartbeatStream == nil {
rec.HeartbeatStream = hb.Stream
}
// Preserve the existing UptimeTracker so TotalOnline accumulates correctly.
// hb.Stream is a fresh Stream with no UptimeTracker; carry the old one over.
oldTracker := rec.GetUptimeTracker()
rec.HeartbeatStream = hb.Stream
if rec.HeartbeatStream.UptimeTracker == nil {
rec.HeartbeatStream.UptimeTracker = &UptimeTracker{
FirstSeen: time.Now().UTC(),
LastSeen: time.Now().UTC(),
}
if oldTracker != nil {
rec.HeartbeatStream.UptimeTracker = oldTracker
} else {
rec.HeartbeatStream.UptimeTracker = &UptimeTracker{FirstSeen: time.Now().UTC()}
}
rec.HeartbeatStream.UptimeTracker.RecordHeartbeat()
rec.LastScore = hb.Score
logger.Info().Msg("A new node is updated : " + pid.String())
} else {
hb.Stream.UptimeTracker = &UptimeTracker{
FirstSeen: time.Now().UTC(),
LastSeen: time.Now().UTC(),
}
tracker := &UptimeTracker{FirstSeen: time.Now().UTC()}
tracker.RecordHeartbeat()
hb.Stream.UptimeTracker = tracker
streams[*pid] = &StreamRecord[T]{
DID: hb.DID,
HeartbeatStream: hb.Stream,
LastScore: hb.Score,
}
logger.Info().Msg("A new node is subscribed : " + pid.String())
}
@@ -215,30 +220,33 @@ func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams ma
if err := dec.Decode(&hb); err != nil {
return nil, nil, err
}
_, bpms, _ := getBandwidthChallengeRate(h, s.Conn().RemotePeer(), MinPayloadChallenge+int(rand.Float64()*(MaxPayloadChallenge-MinPayloadChallenge)))
_, bpms, latencyScore, _ := getBandwidthChallengeRate(h, s.Conn().RemotePeer(), MinPayloadChallenge+int(rand.Float64()*(MaxPayloadChallenge-MinPayloadChallenge)))
{
pid, err := pp.Decode(hb.PeerID)
if err != nil {
return nil, nil, err
}
upTime := float64(0)
isFirstHeartbeat := true
uptimeRatio := float64(0)
age := time.Duration(0)
lock.Lock()
if rec, ok := streams[pid]; ok && rec.GetUptimeTracker() != nil {
upTime = rec.GetUptimeTracker().Uptime().Hours() / float64(time.Since(TimeWatcher).Hours())
isFirstHeartbeat = false
uptimeRatio = rec.GetUptimeTracker().UptimeRatio()
age = rec.GetUptimeTracker().Uptime()
}
lock.Unlock()
diversity := getDiversityRate(h, hb.IndexersBinded)
hb.ComputeIndexerScore(upTime, bpms, diversity)
// First heartbeat: uptime is always 0 so the score ceiling is 60, below the
// steady-state threshold of 75. Use a lower admission threshold so new peers
// can enter and start accumulating uptime. Subsequent heartbeats must meet
// the full threshold once uptime is tracked.
minScore := float64(40)
if isFirstHeartbeat {
minScore = 40
// E: measure the indexer's own subnet diversity, not the node's view.
diversity := getOwnDiversityRate(h)
// fillRate: fraction of indexer capacity used — higher = more peers trust this indexer.
fillRate := 0.0
if maxNodes > 0 {
fillRate = float64(len(h.Network().Peers())) / float64(maxNodes)
if fillRate > 1 {
fillRate = 1
}
}
hb.ComputeIndexerScore(uptimeRatio, bpms, diversity, latencyScore, fillRate)
// B: dynamic minScore — starts at 20% for brand-new peers, ramps to 80% at 24h.
minScore := dynamicMinScore(age)
if hb.Score < minScore {
return nil, nil, errors.New("not enough trusting value")
}
@@ -247,7 +255,7 @@ func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams ma
DID: hb.DID,
Stream: s,
Expiry: time.Now().UTC().Add(2 * time.Minute),
} // here is the long-lived bidirectionnal heart bit.
} // here is the long-lived bidirectional heartbeat.
return &pid, &hb, err
}
}
@@ -268,7 +276,40 @@ func getDiversityRate(h host.Host, peers []string) float64 {
if len(diverse) == 0 || len(peers) == 0 {
return 1
}
return float64(len(diverse) / len(peers))
return float64(len(diverse)) / float64(len(peers))
}
// getOwnDiversityRate measures subnet /24 diversity of the indexer's own connected peers.
// This evaluates the indexer's network position rather than the connecting node's topology.
// getOwnDiversityRate measures /24 subnet diversity among the indexer's own
// connected peers' addresses. This evaluates the indexer's network position
// rather than the connecting node's topology.
//
// It returns the ratio of distinct /24 subnets to addresses that yielded a
// parseable IP, or 1 (neutral) when no address could be parsed.
//
// Fix: the previous version incremented total BEFORE the ExtractIP error
// check, so non-IP multiaddrs (dns, relay, …) inflated the denominator and
// unfairly deflated the diversity score. Only parseable addresses count now.
func getOwnDiversityRate(h host.Host) float64 {
	diverse := map[string]struct{}{}
	total := 0
	for _, pid := range h.Network().Peers() {
		for _, maddr := range h.Peerstore().Addrs(pid) {
			ip, err := ExtractIP(maddr.String())
			if err != nil {
				// Address carries no subnet information; skip it entirely
				// so it affects neither numerator nor denominator.
				continue
			}
			total++
			diverse[ip.Mask(net.CIDRMask(24, 32)).String()] = struct{}{}
		}
	}
	if total == 0 {
		return 1
	}
	return float64(len(diverse)) / float64(total)
}
// dynamicMinScore returns the minimum acceptable score for a peer, starting
// permissive (20%) for brand-new peers and hardening linearly to 80% over 24h.
// This prevents ejecting newcomers in fresh networks while filtering parasites.
// dynamicMinScore returns the minimum acceptable score for a peer of the
// given tracked age. Brand-new peers face a permissive 20% floor; the floor
// rises linearly with age and caps at 80% once the peer is 24 hours old.
// This keeps fresh networks from ejecting newcomers while still filtering
// parasites over time.
func dynamicMinScore(age time.Duration) float64 {
	const (
		floor     = 20.0
		ceiling   = 80.0
		rampHours = 24.0
	)
	s := floor + (ceiling-floor)*(age.Hours()/rampHours)
	if s >= ceiling {
		return ceiling
	}
	return s
}
func checkPeers(h host.Host, peers []string) ([]string, []string) {
@@ -295,53 +336,95 @@ const MaxPayloadChallenge = 2048
const BaseRoundTrip = 400 * time.Millisecond
// getBandwidthChallengeRate opens a dedicated ProtocolBandwidthProbe stream to
// remotePeer, sends a random payload, reads the echo, and computes throughput.
// remotePeer, sends a random payload, reads the echo, and computes throughput
// and a latency score. Returns (ok, bpms, latencyScore, error).
// latencyScore is 1.0 when RTT is very fast and 0.0 when at or beyond maxRoundTrip.
// Using a separate stream avoids mixing binary data on the JSON heartbeat stream
// and ensures the echo handler is actually running on the remote side.
func getBandwidthChallengeRate(h host.Host, remotePeer pp.ID, payloadSize int) (bool, float64, error) {
func getBandwidthChallengeRate(h host.Host, remotePeer pp.ID, payloadSize int) (bool, float64, float64, error) {
payload := make([]byte, payloadSize)
if _, err := cr.Read(payload); err != nil {
return false, 0, err
return false, 0, 0, err
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
s, err := h.NewStream(ctx, remotePeer, ProtocolBandwidthProbe)
if err != nil {
return false, 0, err
return false, 0, 0, err
}
defer s.Reset()
s.SetDeadline(time.Now().Add(10 * time.Second))
start := time.Now()
if _, err = s.Write(payload); err != nil {
return false, 0, err
return false, 0, 0, err
}
s.CloseWrite()
// Half-close the write side so the handler's io.Copy sees EOF and stops.
// Read the echo.
response := make([]byte, payloadSize)
if _, err = io.ReadFull(s, response); err != nil {
return false, 0, err
return false, 0, 0, err
}
duration := time.Since(start)
maxRoundTrip := BaseRoundTrip + (time.Duration(payloadSize) * (100 * time.Millisecond))
mbps := float64(payloadSize*8) / duration.Seconds() / 1e6
if duration > maxRoundTrip || mbps < 5.0 {
return false, float64(mbps / MaxExpectedMbps), nil
// latencyScore: 1.0 = instant, 0.0 = at maxRoundTrip or beyond.
latencyScore := 1.0 - float64(duration)/float64(maxRoundTrip)
if latencyScore < 0 {
latencyScore = 0
}
return true, float64(mbps / MaxExpectedMbps), nil
if latencyScore > 1 {
latencyScore = 1
}
if duration > maxRoundTrip || mbps < 5.0 {
return false, float64(mbps / MaxExpectedMbps), latencyScore, nil
}
return true, float64(mbps / MaxExpectedMbps), latencyScore, nil
}
type UptimeTracker struct {
FirstSeen time.Time
LastSeen time.Time
FirstSeen time.Time
LastSeen time.Time
TotalOnline time.Duration
}
// RecordHeartbeat accumulates online time gap-aware: only counts the interval if
// the gap since the last heartbeat is within 2× the recommended interval (i.e. no
// extended outage). Call this each time a heartbeat is successfully processed.
// RecordHeartbeat accumulates online time gap-aware: the interval since the
// previous heartbeat counts toward TotalOnline only when it is short enough
// (≤ 2× RecommendedHeartbeatInterval) to qualify as continuous presence.
// Call this each time a heartbeat is successfully processed.
func (u *UptimeTracker) RecordHeartbeat() {
	now := time.Now().UTC()
	if prev := u.LastSeen; !prev.IsZero() {
		if elapsed := now.Sub(prev); elapsed <= 2*RecommendedHeartbeatInterval {
			u.TotalOnline += elapsed
		}
	}
	u.LastSeen = now
}
// Uptime returns the total wall-clock time this peer has been tracked,
// i.e. the elapsed time since FirstSeen — not the gap-aware online time
// (see UptimeRatio / TotalOnline for that).
func (u *UptimeTracker) Uptime() time.Duration {
	return time.Since(u.FirstSeen)
}
// UptimeRatio returns the fraction of tracked lifetime during which the peer was
// continuously online (gap ≤ 2×RecommendedHeartbeatInterval). Returns 0 before
// the first heartbeat interval has elapsed.
// UptimeRatio reports the gap-aware fraction of this peer's tracked lifetime
// spent online (gap ≤ 2×RecommendedHeartbeatInterval), clamped to [0, 1].
// It returns 0 before any lifetime has elapsed.
func (u *UptimeTracker) UptimeRatio() float64 {
	lifetime := time.Since(u.FirstSeen)
	if lifetime <= 0 {
		return 0
	}
	if u.TotalOnline >= lifetime {
		return 1
	}
	return float64(u.TotalOnline) / float64(lifetime)
}
// IsEligible reports whether this peer has been tracked for at least min.
func (u *UptimeTracker) IsEligible(min time.Duration) bool {
	return min <= u.Uptime()
}
@@ -350,6 +433,7 @@ type StreamRecord[T interface{}] struct {
DID string
HeartbeatStream *Stream
Record T
LastScore float64
}
func (s *StreamRecord[T]) GetUptimeTracker() *UptimeTracker {
@@ -426,7 +510,24 @@ const (
var TimeWatcher time.Time
// IndexerRecord holds admission metadata for an indexer in the pool.
// AdmittedAt is zero for seed entries (IndexerAddresses) never validated by a native.
// It is set to the admission time when a native confirms the indexer via consensus.
type IndexerRecord struct {
AdmittedAt time.Time
}
// IsStableVoter reports whether this indexer was admitted by a native long
// enough ago (≥ MinStableAge) to participate as a voter in Phase 2 liveness
// voting. Seed entries (zero AdmittedAt) are never stable voters.
func (r *IndexerRecord) IsStableVoter() bool {
	if r.AdmittedAt.IsZero() {
		return false
	}
	return time.Since(r.AdmittedAt) >= MinStableAge
}
var StaticIndexers map[string]*pp.AddrInfo = map[string]*pp.AddrInfo{}
// StaticIndexerMeta mirrors StaticIndexers with admission metadata.
// Both maps are always updated together under StreamMuIndexes.
var StaticIndexerMeta map[string]*IndexerRecord = map[string]*IndexerRecord{}
var StreamMuIndexes sync.RWMutex
var StreamIndexers ProtocolStream = ProtocolStream{}
@@ -462,28 +563,64 @@ func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID,
return nil
}
// No native configured: bootstrap from IndexerAddresses seed set.
addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
if len(addresses) > maxIndexer {
addresses = addresses[0:maxIndexer]
}
StreamMuIndexes.Lock()
for _, indexerAddr := range addresses {
indexerAddr = strings.TrimSpace(indexerAddr)
if indexerAddr == "" {
continue
}
ad, err := pp.AddrInfoFromString(indexerAddr)
if err != nil {
logger.Err(err)
continue
}
// AdmittedAt zero = seed, not yet validated by a native.
StaticIndexers[indexerAddr] = ad
StaticIndexerMeta[indexerAddr] = &IndexerRecord{}
}
indexerCount := len(StaticIndexers)
StreamMuIndexes.Unlock()
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name, h, StreamIndexers, StaticIndexers, &StreamMuIndexes, 20*time.Second, recordFn...) // your indexer is just like a node for the next indexer.
if indexerCount < minIndexer {
return errors.New("you run a node without indexers... your gonna be isolated.")
}
// Start long-lived heartbeat to seed indexers. The single goroutine follows
// all subsequent StaticIndexers changes (including after native discovery).
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name,
h, StreamIndexers, StaticIndexers, &StreamMuIndexes, 20*time.Second, recordFn...)
// Async: ask seed indexers whether they know a native — same logic as
// replenishNativesFromPeers. Runs after a short delay to let h.Connect warm up.
go func() {
time.Sleep(2 * time.Second)
logger.Info().Msg("[startup] no native configured — asking seed indexers for native addresses")
newAddr := fetchNativeFromIndexers(h, nil)
if newAddr == "" {
logger.Info().Msg("[startup] no native found from seed indexers — pure indexer mode")
return
}
ad, err := pp.AddrInfoFromString(newAddr)
if err != nil {
return
}
logger.Info().Str("addr", newAddr).Msg("[startup] native discovered via seed indexers — bootstrapping")
StreamNativeMu.Lock()
StaticNatives[newAddr] = ad
StreamNativeMu.Unlock()
// Full native bootstrap: fetch pool, run consensus, replace StaticIndexers
// with properly admitted records (AdmittedAt set).
if err := ConnectToNatives(h, minIndexer, maxIndexer, myPID); err != nil {
logger.Warn().Err(err).Msg("[startup] native bootstrap failed after discovery")
}
}()
return nil
}
@@ -536,10 +673,19 @@ type Heartbeat struct {
Record json.RawMessage `json:"record,omitempty"`
}
func (hb *Heartbeat) ComputeIndexerScore(uptimeHours float64, bpms float64, diversity float64) {
hb.Score = ((0.3 * uptimeHours) +
(0.3 * bpms) +
(0.4 * diversity)) * 100
// ComputeIndexerScore sets hb.Score to a composite quality score in [0, 100]
// as a weighted blend of five normalized signals:
//   - uptimeRatio (0.20): gap-aware fraction of tracked lifetime online — reliability
//   - bpms (0.20): bandwidth normalized to MaxExpectedMbps — link capacity
//   - diversity (0.20): indexer's own /24 subnet diversity — topology quality
//   - latencyScore (0.15): 1 - RTT/maxRoundTrip — link responsiveness
//   - fillRate (0.25): fraction of indexer slots used — collective trust signal,
//     since a fuller indexer has been chosen and retained by many peers.
func (hb *Heartbeat) ComputeIndexerScore(uptimeRatio float64, bpms float64, diversity float64, latencyScore float64, fillRate float64) {
	weighted := 0.20*uptimeRatio +
		0.20*bpms +
		0.20*diversity +
		0.15*latencyScore +
		0.25*fillRate
	hb.Score = weighted * 100
}
type HeartbeatInfo []struct {
@@ -616,6 +762,9 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
for _, ix := range snapshot {
wasConnected := h.Network().Connectedness(ix.ID) == network.Connected
StreamNativeMu.RLock()
hasNative := len(StaticNatives) > 0
StreamNativeMu.RUnlock()
if err := sendHeartbeat(ctx, h, proto, ix, hb, ps, interval*time.Second); err != nil {
// Step 3: heartbeat failed — remove from pool and trigger replenish.
logger.Info().Str("peer", ix.ID.String()).Str("proto", string(proto)).Msg("[native] step 3 — heartbeat failed, removing peer from pool")
@@ -639,6 +788,9 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
if ad.ID == ix.ID {
lostAddr = addr
delete(peers, addr)
if isIndexerHB {
delete(StaticIndexerMeta, addr)
}
break
}
}
@@ -650,7 +802,8 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
logger.Info().Int("remaining", remaining).Int("min", conf.GetConfig().MinIndexer).Int("need", need).Msg("[native] step 3 — pool state after removal")
// Step 4: ask the native for the missing indexer count.
if isIndexerHB && conf.GetConfig().NativeIndexerAddresses != "" {
// hasNative computed above (used in both err and success branches).
if isIndexerHB && hasNative {
if need < 1 {
need = 1
}
@@ -663,7 +816,7 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
// from StaticIndexers immediately without waiting for the indexer HB tick.
if isNativeHB {
logger.Info().Str("addr", lostAddr).Msg("[native] step 3 — native heartbeat failed, triggering native replenish")
if lostAddr != "" && conf.GetConfig().NativeIndexerAddresses != "" {
if lostAddr != "" && hasNative {
StreamMuIndexes.Lock()
if _, wasIndexer := StaticIndexers[lostAddr]; wasIndexer {
delete(StaticIndexers, lostAddr)
@@ -695,7 +848,7 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
// blank state (responsiblePeers empty). Evict it from StaticIndexers and
// re-request an assignment so the native re-tracks us properly and
// runOffloadLoop can eventually migrate us to real indexers.
if !wasConnected && isIndexerHB && conf.GetConfig().NativeIndexerAddresses != "" {
if !wasConnected && isIndexerHB && hasNative {
StreamNativeMu.RLock()
isNativeIndexer := false
for _, ad := range StaticNatives {