Files
oc-discovery/daemons/node/common/common_stream.go

841 lines
26 KiB
Go
Raw Normal View History

2026-01-30 16:57:36 +01:00
package common
import (
"context"
2026-02-17 13:11:22 +01:00
cr "crypto/rand"
2026-01-30 16:57:36 +01:00
"encoding/json"
"errors"
"fmt"
2026-02-17 13:11:22 +01:00
"io"
"math/rand"
"net"
2026-01-30 16:57:36 +01:00
"oc-discovery/conf"
2026-02-17 13:11:22 +01:00
"slices"
2026-01-30 16:57:36 +01:00
"strings"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
// LongLivedStreamRecordedService keeps, per protocol and per peer, a long-lived
// stream record carrying a payload of type T on top of the base pub/sub
// service. Records are garbage-collected on expiry (StartGC/gc) and the known
// DIDs are periodically logged (Snapshot).
type LongLivedStreamRecordedService[T interface{}] struct {
	*LongLivedPubSubService
	// StreamRecords maps protocol -> peer -> recorded stream state.
	StreamRecords map[protocol.ID]map[pp.ID]*StreamRecord[T]
	// StreamMU guards StreamRecords.
	StreamMU sync.RWMutex
	// maxNodesConn caps the connected-peer count admitted by CheckHeartbeat.
	maxNodesConn int
	// AfterHeartbeat is an optional hook called after each successful heartbeat update.
	// The indexer sets it to republish the embedded signed record to the DHT.
	AfterHeartbeat func(pid pp.ID)
	// AfterDelete is called after gc() evicts an expired peer, outside the lock.
	// name and did may be empty if the HeartbeatStream had no metadata.
	AfterDelete func(pid pp.ID, name string, did string)
}
2026-02-20 12:42:18 +01:00
func NewStreamRecordedService[T interface{}](h host.Host, maxNodesConn int) *LongLivedStreamRecordedService[T] {
2026-01-30 16:57:36 +01:00
service := &LongLivedStreamRecordedService[T]{
LongLivedPubSubService: NewLongLivedPubSubService(h),
StreamRecords: map[protocol.ID]map[pp.ID]*StreamRecord[T]{},
maxNodesConn: maxNodesConn,
}
go service.StartGC(30 * time.Second)
// Garbage collection is needed on every Map of Long-Lived Stream... it may be a top level redesigned
go service.Snapshot(1 * time.Hour)
return service
}
// StartGC runs gc() every interval in a background goroutine.
// The goroutine has no stop mechanism; it lives for the process lifetime.
func (ix *LongLivedStreamRecordedService[T]) StartGC(interval time.Duration) {
	go func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			<-ticker.C
			ix.gc()
		}
	}()
}
func (ix *LongLivedStreamRecordedService[T]) gc() {
ix.StreamMU.Lock()
2026-02-03 15:25:15 +01:00
now := time.Now().UTC()
if ix.StreamRecords[ProtocolHeartbeat] == nil {
2026-01-30 16:57:36 +01:00
ix.StreamRecords[ProtocolHeartbeat] = map[pp.ID]*StreamRecord[T]{}
2026-03-03 16:38:24 +01:00
ix.StreamMU.Unlock()
2026-01-30 16:57:36 +01:00
return
}
2026-02-03 15:25:15 +01:00
streams := ix.StreamRecords[ProtocolHeartbeat]
2026-03-03 16:38:24 +01:00
fmt.Println(StaticNatives, StaticIndexers, streams)
2026-02-03 15:25:15 +01:00
2026-03-03 16:38:24 +01:00
type gcEntry struct {
pid pp.ID
name string
did string
}
var evicted []gcEntry
2026-01-30 16:57:36 +01:00
for pid, rec := range streams {
2026-02-17 13:11:22 +01:00
if now.After(rec.HeartbeatStream.Expiry) || now.Sub(rec.HeartbeatStream.UptimeTracker.LastSeen) > 2*rec.HeartbeatStream.Expiry.Sub(now) {
2026-03-03 16:38:24 +01:00
name, did := "", ""
if rec.HeartbeatStream != nil {
name = rec.HeartbeatStream.Name
did = rec.HeartbeatStream.DID
}
evicted = append(evicted, gcEntry{pid, name, did})
2026-01-30 16:57:36 +01:00
for _, sstreams := range ix.StreamRecords {
if sstreams[pid] != nil {
delete(sstreams, pid)
}
}
}
}
2026-03-03 16:38:24 +01:00
ix.StreamMU.Unlock()
if ix.AfterDelete != nil {
for _, e := range evicted {
ix.AfterDelete(e.pid, e.name, e.did)
}
}
2026-01-30 16:57:36 +01:00
}
// Snapshot periodically logs the DID of every known stream record.
func (ix *LongLivedStreamRecordedService[T]) Snapshot(interval time.Duration) {
	go func() {
		log := oclib.GetLogger()
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			<-ticker.C
			for _, rec := range ix.snapshot() {
				log.Info().Msg(" -> " + rec.DID)
			}
		}
	}()
}
// -------- Snapshot / Query --------

// snapshot returns a flat copy of every stream record across all protocols.
func (ix *LongLivedStreamRecordedService[T]) snapshot() []*StreamRecord[T] {
	// Read-only traversal: a read lock suffices. The previous version took the
	// write lock, needlessly serializing snapshots against all readers, and
	// sized the slice from the protocol count instead of the record count.
	ix.StreamMU.RLock()
	defer ix.StreamMU.RUnlock()
	var out []*StreamRecord[T]
	for _, streams := range ix.StreamRecords {
		for _, stream := range streams {
			out = append(out, stream)
		}
	}
	return out
}
2026-03-03 16:38:24 +01:00
func (ix *LongLivedStreamRecordedService[T]) HandleHeartbeat(s network.Stream) {
logger := oclib.GetLogger()
2026-02-03 15:25:15 +01:00
defer s.Close()
2026-03-03 16:38:24 +01:00
dec := json.NewDecoder(s)
2026-02-03 15:25:15 +01:00
for {
ix.StreamMU.Lock()
if ix.StreamRecords[ProtocolHeartbeat] == nil {
ix.StreamRecords[ProtocolHeartbeat] = map[pp.ID]*StreamRecord[T]{}
}
streams := ix.StreamRecords[ProtocolHeartbeat]
2026-02-17 13:11:22 +01:00
streamsAnonym := map[pp.ID]HeartBeatStreamed{}
for k, v := range streams {
streamsAnonym[k] = v
}
ix.StreamMU.Unlock()
2026-03-03 16:38:24 +01:00
pid, hb, err := CheckHeartbeat(ix.Host, s, dec, streamsAnonym, &ix.StreamMU, ix.maxNodesConn)
2026-02-17 13:11:22 +01:00
if err != nil {
2026-03-03 16:38:24 +01:00
// Stream-level errors (EOF, reset, closed) mean the connection is gone
// — exit so the goroutine doesn't spin forever on a dead stream.
// Metric/policy errors (score too low, too many connections) are transient
// — those are also stream-terminal since the stream carries one session.
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
strings.Contains(err.Error(), "reset") ||
strings.Contains(err.Error(), "closed") ||
strings.Contains(err.Error(), "too many connections") {
logger.Info().Err(err).Msg("heartbeat stream terminated, closing handler")
return
}
logger.Warn().Err(err).Msg("heartbeat check failed, retrying on same stream")
2026-02-17 13:11:22 +01:00
continue
}
ix.StreamMU.Lock()
2026-02-03 15:25:15 +01:00
// if record already seen update last seen
if rec, ok := streams[*pid]; ok {
rec.DID = hb.DID
2026-03-03 16:38:24 +01:00
if rec.HeartbeatStream == nil {
rec.HeartbeatStream = hb.Stream
}
2026-02-03 15:25:15 +01:00
rec.HeartbeatStream = hb.Stream
2026-03-03 16:38:24 +01:00
if rec.HeartbeatStream.UptimeTracker == nil {
rec.HeartbeatStream.UptimeTracker = &UptimeTracker{
FirstSeen: time.Now().UTC(),
LastSeen: time.Now().UTC(),
}
}
logger.Info().Msg("A new node is updated : " + pid.String())
2026-02-03 15:25:15 +01:00
} else {
2026-02-17 13:11:22 +01:00
hb.Stream.UptimeTracker = &UptimeTracker{
FirstSeen: time.Now().UTC(),
LastSeen: time.Now().UTC(),
}
2026-02-03 15:25:15 +01:00
streams[*pid] = &StreamRecord[T]{
DID: hb.DID,
HeartbeatStream: hb.Stream,
}
2026-03-03 16:38:24 +01:00
logger.Info().Msg("A new node is subscribed : " + pid.String())
2026-02-03 15:25:15 +01:00
}
ix.StreamMU.Unlock()
2026-03-03 16:38:24 +01:00
// Let the indexer republish the embedded signed record to the DHT.
if ix.AfterHeartbeat != nil {
ix.AfterHeartbeat(*pid)
}
2026-01-30 16:57:36 +01:00
}
}
2026-03-03 16:38:24 +01:00
func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams map[pp.ID]HeartBeatStreamed, lock *sync.RWMutex, maxNodes int) (*pp.ID, *Heartbeat, error) {
2026-01-30 16:57:36 +01:00
if len(h.Network().Peers()) >= maxNodes {
return nil, nil, fmt.Errorf("too many connections, try another indexer")
}
var hb Heartbeat
2026-03-03 16:38:24 +01:00
if err := dec.Decode(&hb); err != nil {
2026-01-30 16:57:36 +01:00
return nil, nil, err
}
2026-03-03 16:38:24 +01:00
_, bpms, _ := getBandwidthChallengeRate(h, s.Conn().RemotePeer(), MinPayloadChallenge+int(rand.Float64()*(MaxPayloadChallenge-MinPayloadChallenge)))
{
2026-02-17 13:11:22 +01:00
pid, err := pp.Decode(hb.PeerID)
if err != nil {
return nil, nil, err
}
upTime := float64(0)
2026-03-03 16:38:24 +01:00
isFirstHeartbeat := true
2026-02-17 13:11:22 +01:00
lock.Lock()
if rec, ok := streams[pid]; ok && rec.GetUptimeTracker() != nil {
upTime = rec.GetUptimeTracker().Uptime().Hours() / float64(time.Since(TimeWatcher).Hours())
2026-03-03 16:38:24 +01:00
isFirstHeartbeat = false
2026-02-17 13:11:22 +01:00
}
lock.Unlock()
diversity := getDiversityRate(h, hb.IndexersBinded)
2026-03-03 16:38:24 +01:00
fmt.Println(upTime, bpms, diversity)
2026-02-17 13:11:22 +01:00
hb.ComputeIndexerScore(upTime, bpms, diversity)
2026-03-03 16:38:24 +01:00
// First heartbeat: uptime is always 0 so the score ceiling is 60, below the
// steady-state threshold of 75. Use a lower admission threshold so new peers
// can enter and start accumulating uptime. Subsequent heartbeats must meet
// the full threshold once uptime is tracked.
minScore := float64(50)
if isFirstHeartbeat {
minScore = 40
}
fmt.Println(hb.Score, minScore)
if hb.Score < minScore {
2026-02-17 13:11:22 +01:00
return nil, nil, errors.New("not enough trusting value")
}
hb.Stream = &Stream{
Name: hb.Name,
DID: hb.DID,
Stream: s,
Expiry: time.Now().UTC().Add(2 * time.Minute),
} // here is the long-lived bidirectionnal heart bit.
return &pid, &hb, err
}
}
func getDiversityRate(h host.Host, peers []string) float64 {
2026-03-03 16:38:24 +01:00
2026-02-17 13:11:22 +01:00
peers, _ = checkPeers(h, peers)
diverse := []string{}
for _, p := range peers {
ip, err := ExtractIP(p)
if err != nil {
2026-03-03 16:38:24 +01:00
fmt.Println("NO IP", p, err)
2026-02-17 13:11:22 +01:00
continue
}
div := ip.Mask(net.CIDRMask(24, 32)).String()
if !slices.Contains(diverse, div) {
diverse = append(diverse, div)
}
}
2026-03-03 16:38:24 +01:00
if len(diverse) == 0 || len(peers) == 0 {
return 1
}
2026-02-17 13:11:22 +01:00
return float64(len(diverse) / len(peers))
}
// checkPeers filters peers down to those that parse as addr-infos and answer
// a liveness check, returning the alive multiaddrs plus their /24 subnets.
func checkPeers(h host.Host, peers []string) ([]string, []string) {
	alive := []string{}
	subnets := []string{}
	for _, addr := range peers {
		info, err := pp.AddrInfoFromString(addr)
		if err != nil {
			continue
		}
		if !PeerIsAlive(h, *info) {
			continue
		}
		alive = append(alive, addr)
		if ip, err := ExtractIP(addr); err == nil {
			subnets = append(subnets, ip.Mask(net.CIDRMask(24, 32)).String())
		}
	}
	return alive, subnets
}
2026-03-03 16:38:24 +01:00
// MaxExpectedMbps normalizes measured throughput into a [0..1] score.
const MaxExpectedMbps = 100.0

// MinPayloadChallenge / MaxPayloadChallenge bound the random probe payload size in bytes.
const MinPayloadChallenge = 512
const MaxPayloadChallenge = 2048

// BaseRoundTrip is the fixed latency budget granted before payload-size scaling.
const BaseRoundTrip = 400 * time.Millisecond
2026-03-03 16:38:24 +01:00
// getBandwidthChallengeRate opens a dedicated ProtocolBandwidthProbe stream to
// remotePeer, sends a random payload, reads the echo, and computes throughput.
// Using a separate stream avoids mixing binary data on the JSON heartbeat stream
// and ensures the echo handler is actually running on the remote side.
// Returns (withinBudget, throughput normalized by MaxExpectedMbps, error).
func getBandwidthChallengeRate(h host.Host, remotePeer pp.ID, payloadSize int) (bool, float64, error) {
	payload := make([]byte, payloadSize)
	// crypto/rand payload so the echo cannot be predicted/cached by the remote.
	if _, err := cr.Read(payload); err != nil {
		return false, 0, err
	}
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	s, err := h.NewStream(ctx, remotePeer, ProtocolBandwidthProbe)
	if err != nil {
		return false, 0, err
	}
	// Reset (not Close) tears the probe stream down immediately on return.
	defer s.Reset()
	s.SetDeadline(time.Now().Add(10 * time.Second))
	start := time.Now()
	if _, err = s.Write(payload); err != nil {
		return false, 0, err
	}
	s.CloseWrite()
	// Half-close the write side so the handler's io.Copy sees EOF and stops.
	// Read the echo.
	response := make([]byte, payloadSize)
	if _, err = io.ReadFull(s, response); err != nil {
		return false, 0, err
	}
	duration := time.Since(start)
	// NOTE(review): this grants 100ms of round-trip budget PER BYTE (≈51-205s
	// for the 512-2048B payload range), far beyond the 10s stream deadline, so
	// the duration check can effectively never fail — presumably a per-KB
	// budget was intended; confirm before relying on the bool result.
	maxRoundTrip := BaseRoundTrip + (time.Duration(payloadSize) * (100 * time.Millisecond))
	mbps := float64(payloadSize*8) / duration.Seconds() / 1e6
	if duration > maxRoundTrip || mbps < 5.0 {
		return false, float64(mbps / MaxExpectedMbps), nil
	}
	return true, float64(mbps / MaxExpectedMbps), nil
}
// UptimeTracker records when a peer was first and most recently observed.
type UptimeTracker struct {
	FirstSeen time.Time
	LastSeen  time.Time
}

// Uptime is the wall-clock duration elapsed since the peer was first seen.
func (u *UptimeTracker) Uptime() time.Duration {
	return time.Since(u.FirstSeen)
}

// IsEligible reports whether the peer has been up for at least min.
func (u *UptimeTracker) IsEligible(min time.Duration) bool {
	return u.Uptime() >= min
}
// StreamRecord pairs a peer's DID and heartbeat stream with an arbitrary
// recorded payload of type T.
type StreamRecord[T interface{}] struct {
	DID             string
	HeartbeatStream *Stream
	Record          T
}

// GetUptimeTracker returns the heartbeat stream's uptime tracker, or nil
// when no heartbeat stream has been attached yet.
func (s *StreamRecord[T]) GetUptimeTracker() *UptimeTracker {
	if s.HeartbeatStream == nil {
		return nil
	}
	return s.HeartbeatStream.UptimeTracker
}
// Stream wraps a live libp2p stream with identity metadata, an expiry used by
// gc/refresh logic, and the peer's uptime tracker.
type Stream struct {
	Name string `json:"name"`
	DID  string `json:"did"`
	// NOTE(review): untagged interface field — if a Heartbeat carrying a
	// non-nil *Stream is JSON-encoded this field is marshaled too; the visible
	// senders leave hb.Stream unset, but confirm before adding new callers.
	Stream        network.Stream
	Expiry        time.Time `json:"expiry"`
	UptimeTracker *UptimeTracker
}

// GetUptimeTracker returns the tracker; may be nil for freshly built streams.
func (s *Stream) GetUptimeTracker() *UptimeTracker {
	return s.UptimeTracker
}
// NewStream builds a Stream wrapper around s with a 2-minute expiry.
// NOTE(review): the record parameter is ignored and Name is never set —
// presumably callers only rely on DID/Stream/Expiry; confirm intent.
func NewStream[T interface{}](s network.Stream, did string, record T) *Stream {
	return &Stream{
		DID:    did,
		Stream: s,
		Expiry: time.Now().UTC().Add(2 * time.Minute),
	}
}
// ProtocolStream maps protocol -> peer -> cached long-lived stream.
type ProtocolStream map[protocol.ID]map[pp.ID]*Stream

// Get returns the stream map for proto, lazily creating an empty one.
func (ps ProtocolStream) Get(proto protocol.ID) map[pp.ID]*Stream {
	m, ok := ps[proto]
	if !ok {
		m = map[pp.ID]*Stream{}
		ps[proto] = m
	}
	return m
}
// Add registers stream s for (proto, peerID). A nil peerID is a silent no-op;
// a non-nil peerID with a nil stream is rejected with an error.
func (ps ProtocolStream) Add(proto protocol.ID, peerID *pp.ID, s *Stream) error {
	if ps[proto] == nil {
		ps[proto] = map[pp.ID]*Stream{}
	}
	if peerID == nil {
		return nil
	}
	if s == nil {
		return errors.New("unable to add stream : stream missing")
	}
	ps[proto][*peerID] = s
	return nil
}
// Delete closes and removes cached streams under protocol. With a concrete
// peerID only that peer's stream is removed; with a nil peerID every stream of
// that protocol is closed and the protocol entry is dropped entirely.
func (ps ProtocolStream) Delete(protocol protocol.ID, peerID *pp.ID) {
	streams, ok := ps[protocol]
	if !ok {
		return
	}
	if peerID != nil {
		if s := streams[*peerID]; s != nil {
			// Guard: sendHeartbeat nils out Stream after a failed write,
			// so the underlying stream may be absent.
			if s.Stream != nil {
				s.Stream.Close()
			}
			delete(streams, *peerID)
		}
		// BUG FIX: previously a known peerID whose stream was already gone
		// fell through to the wipe-everything branch. A targeted delete must
		// never drop the whole protocol.
		return
	}
	// BUG FIX: the old wipe branch iterated ps (ALL protocols) and closed
	// every stream in the service while deleting only this protocol's entry.
	// Close only this protocol's streams.
	for _, v := range streams {
		if v.Stream != nil {
			v.Stream.Close()
		}
	}
	delete(ps, protocol)
}
// Protocol IDs for record publication and retrieval between nodes and indexers.
const (
	ProtocolPublish = "/opencloud/record/publish/1.0"
	ProtocolGet     = "/opencloud/record/get/1.0"
)
2026-02-17 13:11:22 +01:00
// TimeWatcher marks when this process started heartbeating; CheckHeartbeat
// computes uptime ratios relative to it.
var TimeWatcher time.Time

// StaticIndexers is the current indexer pool, keyed by multiaddr string.
var StaticIndexers map[string]*pp.AddrInfo = map[string]*pp.AddrInfo{}

// StreamMuIndexes guards StaticIndexers and StreamIndexers.
var StreamMuIndexes sync.RWMutex

// StreamIndexers caches the long-lived streams opened toward the indexer pool.
var StreamIndexers ProtocolStream = ProtocolStream{}
2026-03-03 16:38:24 +01:00
// indexerHeartbeatNudge lets replenishIndexersFromNative trigger an immediate
// heartbeat tick after adding new entries to StaticIndexers, instead of waiting
// up to 20s for the regular ticker. Buffered(1) so the sender never blocks.
var indexerHeartbeatNudge = make(chan struct{}, 1)

// NudgeIndexerHeartbeat asks the indexer heartbeat goroutine to fire now.
// Duplicate nudges are coalesced: the non-blocking send drops them while one
// is already pending.
func NudgeIndexerHeartbeat() {
	select {
	case indexerHeartbeatNudge <- struct{}{}:
	default:
	}
}
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID, recordFn ...func() json.RawMessage) error {
2026-02-17 13:11:22 +01:00
TimeWatcher = time.Now().UTC()
2026-01-30 16:57:36 +01:00
logger := oclib.GetLogger()
2026-02-20 12:42:18 +01:00
2026-03-03 16:38:24 +01:00
// If native addresses are configured, get the indexer pool from the native mesh,
// then start the long-lived heartbeat goroutine toward those indexers.
2026-02-20 12:42:18 +01:00
if conf.GetConfig().NativeIndexerAddresses != "" {
2026-03-03 16:38:24 +01:00
if err := ConnectToNatives(h, minIndexer, maxIndexer, myPID); err != nil {
return err
}
// Step 2: start the long-lived heartbeat goroutine toward the indexer pool.
// replaceStaticIndexers/replenishIndexersFromNative update the map in-place
// so this single goroutine follows all pool changes automatically.
logger.Info().Msg("[native] step 2 — starting long-lived heartbeat to indexer pool")
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name,
h, StreamIndexers, StaticIndexers, &StreamMuIndexes, 20*time.Second, recordFn...)
return nil
2026-02-20 12:42:18 +01:00
}
2026-01-30 16:57:36 +01:00
addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
if len(addresses) > maxIndexer {
addresses = addresses[0:maxIndexer]
}
2026-03-03 16:38:24 +01:00
StreamMuIndexes.Lock()
2026-01-30 16:57:36 +01:00
for _, indexerAddr := range addresses {
ad, err := pp.AddrInfoFromString(indexerAddr)
if err != nil {
logger.Err(err)
continue
}
2026-02-17 13:11:22 +01:00
StaticIndexers[indexerAddr] = ad
2026-01-30 16:57:36 +01:00
}
2026-03-03 16:38:24 +01:00
indexerCount := len(StaticIndexers)
StreamMuIndexes.Unlock()
2026-01-30 16:57:36 +01:00
2026-03-03 16:38:24 +01:00
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name, h, StreamIndexers, StaticIndexers, &StreamMuIndexes, 20*time.Second, recordFn...) // your indexer is just like a node for the next indexer.
if indexerCount < minIndexer {
return errors.New("you run a node without indexers... your gonna be isolated.")
2026-01-30 16:57:36 +01:00
}
return nil
2026-01-30 16:57:36 +01:00
}
2026-02-03 15:25:15 +01:00
func AddStreamProtocol(ctx *context.Context, protoS ProtocolStream, h host.Host, proto protocol.ID, id pp.ID, mypid pp.ID, force bool, onStreamCreated *func(network.Stream)) ProtocolStream {
2026-03-03 16:38:24 +01:00
logger := oclib.GetLogger()
2026-01-30 16:57:36 +01:00
if onStreamCreated == nil {
f := func(s network.Stream) {
protoS[proto][id] = &Stream{
Stream: s,
2026-02-03 15:25:15 +01:00
Expiry: time.Now().UTC().Add(2 * time.Minute),
2026-01-30 16:57:36 +01:00
}
}
onStreamCreated = &f
}
f := *onStreamCreated
2026-02-03 15:25:15 +01:00
if mypid > id || force {
2026-01-30 16:57:36 +01:00
if ctx == nil {
c := context.Background()
ctx = &c
}
if protoS[proto] == nil {
protoS[proto] = map[pp.ID]*Stream{}
}
if protoS[proto][id] != nil {
protoS[proto][id].Expiry = time.Now().Add(2 * time.Minute)
} else {
2026-03-03 16:38:24 +01:00
logger.Info().Msg("NEW STREAM Generated" + fmt.Sprintf("%v", proto) + " " + id.String())
2026-01-30 16:57:36 +01:00
s, err := h.NewStream(*ctx, id, proto)
if err != nil {
panic(err.Error())
}
f(s)
}
}
return protoS
}
// Heartbeat is the JSON payload exchanged on the heartbeat protocol.
type Heartbeat struct {
	Name      string  `json:"name"`
	Stream    *Stream `json:"stream"`
	DID       string  `json:"did"`
	PeerID    string  `json:"peer_id"`
	Timestamp int64   `json:"timestamp"`
	// IndexersBinded lists the sender's bound indexer multiaddrs; the receiver
	// derives the subnet-diversity score from them.
	IndexersBinded []string `json:"indexers_binded"`
	// Score is filled in on the receiving side by ComputeIndexerScore.
	Score float64
	// Record carries a fresh signed PeerRecord (JSON) so the receiving indexer
	// can republish it to the DHT without an extra round-trip.
	// Only set by nodes (not indexers heartbeating other indexers).
	Record json.RawMessage `json:"record,omitempty"`
}
func (hb *Heartbeat) ComputeIndexerScore(uptimeHours float64, bpms float64, diversity float64) {
2026-03-03 16:38:24 +01:00
hb.Score = ((0.3 * uptimeHours) +
(0.3 * bpms) +
(0.4 * diversity)) * 100
2026-01-30 16:57:36 +01:00
}
// HeartbeatInfo is a batch of opaque per-peer info payloads.
type HeartbeatInfo []struct {
	Info []byte `json:"info"`
}

// ProtocolHeartbeat is the long-lived bidirectional heartbeat protocol ID.
const ProtocolHeartbeat = "/opencloud/heartbeat/1.0"

// ProtocolBandwidthProbe is a dedicated short-lived stream used exclusively
// for bandwidth/latency measurement. The handler echoes any bytes it receives.
// All nodes and indexers register this handler so peers can measure them.
const ProtocolBandwidthProbe = "/opencloud/probe/1.0"
// HandleBandwidthProbe echoes back everything written on the stream, then closes.
// It is registered by all participants so the measuring side (the heartbeat
// receiver) can open a dedicated probe stream and read round-trip latency
// plus throughput.
func HandleBandwidthProbe(s network.Stream) {
	defer s.Close()
	s.SetDeadline(time.Now().Add(10 * time.Second))
	// Best-effort mirror of every byte back to the sender; an error simply
	// ends the probe early.
	_, _ = io.Copy(s, s)
}
// SendHeartbeat starts a goroutine that sends periodic heartbeats to peers.
// recordFn, when provided, is called on each tick and its output is embedded in
// the heartbeat as a fresh signed PeerRecord so the receiving indexer can
// republish it to the DHT without an extra round-trip.
// Pass no recordFn (or nil) for indexer→indexer / native heartbeats.
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, ps ProtocolStream, peers map[string]*pp.AddrInfo, mu *sync.RWMutex, interval time.Duration, recordFn ...func() json.RawMessage) {
logger := oclib.GetLogger()
// isIndexerHB is true when this goroutine drives the indexer heartbeat.
// isNativeHB is true when it drives the native heartbeat.
isIndexerHB := mu == &StreamMuIndexes
isNativeHB := mu == &StreamNativeMu
var recFn func() json.RawMessage
if len(recordFn) > 0 {
recFn = recordFn[0]
2026-01-30 16:57:36 +01:00
}
go func() {
2026-03-03 16:38:24 +01:00
logger.Info().Str("proto", string(proto)).Int("peers", len(peers)).Msg("heartbeat started")
2026-01-30 16:57:36 +01:00
t := time.NewTicker(interval)
defer t.Stop()
2026-03-03 16:38:24 +01:00
// doTick sends one round of heartbeats to the current peer snapshot.
doTick := func() {
// Build the heartbeat payload — snapshot current indexer addresses.
StreamMuIndexes.RLock()
addrs := make([]string, 0, len(StaticIndexers))
for addr := range StaticIndexers {
addrs = append(addrs, addr)
}
StreamMuIndexes.RUnlock()
hb := Heartbeat{
Name: name,
PeerID: h.ID().String(),
Timestamp: time.Now().UTC().Unix(),
IndexersBinded: addrs,
}
if recFn != nil {
hb.Record = recFn()
}
// Snapshot the peer list under a read lock so we don't hold the
// write lock during network I/O.
if mu != nil {
mu.RLock()
}
snapshot := make([]*pp.AddrInfo, 0, len(peers))
for _, ix := range peers {
snapshot = append(snapshot, ix)
}
if mu != nil {
mu.RUnlock()
}
for _, ix := range snapshot {
wasConnected := h.Network().Connectedness(ix.ID) == network.Connected
if err := sendHeartbeat(ctx, h, proto, ix, hb, ps, interval*time.Second); err != nil {
// Step 3: heartbeat failed — remove from pool and trigger replenish.
logger.Info().Str("peer", ix.ID.String()).Str("proto", string(proto)).Msg("[native] step 3 — heartbeat failed, removing peer from pool")
// Remove the dead peer and clean up its stream.
// mu already covers ps when isIndexerHB (same mutex), so one
// lock acquisition is sufficient — no re-entrant double-lock.
if mu != nil {
mu.Lock()
}
if ps[proto] != nil {
if s, ok := ps[proto][ix.ID]; ok {
if s.Stream != nil {
s.Stream.Close()
}
delete(ps[proto], ix.ID)
}
}
lostAddr := ""
for addr, ad := range peers {
if ad.ID == ix.ID {
lostAddr = addr
delete(peers, addr)
break
}
}
need := conf.GetConfig().MinIndexer - len(peers)
remaining := len(peers)
if mu != nil {
mu.Unlock()
}
logger.Info().Int("remaining", remaining).Int("min", conf.GetConfig().MinIndexer).Int("need", need).Msg("[native] step 3 — pool state after removal")
// Step 4: ask the native for the missing indexer count.
if isIndexerHB && conf.GetConfig().NativeIndexerAddresses != "" {
if need < 1 {
need = 1
}
logger.Info().Int("need", need).Msg("[native] step 3→4 — triggering replenish")
go replenishIndexersFromNative(h, need)
}
// Native heartbeat failed — find a replacement native.
// Case 1: if the dead native was also serving as an indexer, evict it
// from StaticIndexers immediately without waiting for the indexer HB tick.
if isNativeHB {
logger.Info().Str("addr", lostAddr).Msg("[native] step 3 — native heartbeat failed, triggering native replenish")
if lostAddr != "" && conf.GetConfig().NativeIndexerAddresses != "" {
StreamMuIndexes.Lock()
if _, wasIndexer := StaticIndexers[lostAddr]; wasIndexer {
delete(StaticIndexers, lostAddr)
if s := StreamIndexers[ProtocolHeartbeat]; s != nil {
if stream, ok := s[ix.ID]; ok {
if stream.Stream != nil {
stream.Stream.Close()
}
delete(s, ix.ID)
}
}
idxNeed := conf.GetConfig().MinIndexer - len(StaticIndexers)
StreamMuIndexes.Unlock()
if idxNeed < 1 {
idxNeed = 1
}
logger.Info().Str("addr", lostAddr).Msg("[native] dead native evicted from indexer pool, triggering replenish")
go replenishIndexersFromNative(h, idxNeed)
} else {
StreamMuIndexes.Unlock()
}
}
go replenishNativesFromPeers(h, lostAddr, proto)
}
} else {
// Case 2: native-as-indexer reconnected after a restart.
// If the peer was disconnected before this tick and the heartbeat just
// succeeded (transparent reconnect), the native may have restarted with
// blank state (responsiblePeers empty). Evict it from StaticIndexers and
// re-request an assignment so the native re-tracks us properly and
// runOffloadLoop can eventually migrate us to real indexers.
if !wasConnected && isIndexerHB && conf.GetConfig().NativeIndexerAddresses != "" {
StreamNativeMu.RLock()
isNativeIndexer := false
for _, ad := range StaticNatives {
if ad.ID == ix.ID {
isNativeIndexer = true
break
}
}
StreamNativeMu.RUnlock()
if isNativeIndexer {
if mu != nil {
mu.Lock()
}
if ps[proto] != nil {
if s, ok := ps[proto][ix.ID]; ok {
if s.Stream != nil {
s.Stream.Close()
}
delete(ps[proto], ix.ID)
}
}
reconnectedAddr := ""
for addr, ad := range peers {
if ad.ID == ix.ID {
reconnectedAddr = addr
delete(peers, addr)
break
}
}
idxNeed := conf.GetConfig().MinIndexer - len(peers)
if mu != nil {
mu.Unlock()
}
if idxNeed < 1 {
idxNeed = 1
}
logger.Info().Str("addr", reconnectedAddr).Str("peer", ix.ID.String()).Msg(
"[native] native-as-indexer reconnected after restart — evicting and re-requesting assignment")
go replenishIndexersFromNative(h, idxNeed)
}
}
logger.Debug().Str("peer", ix.ID.String()).Str("proto", string(proto)).Msg("[native] step 2 — heartbeat sent ok")
}
}
}
2026-01-30 16:57:36 +01:00
for {
select {
case <-t.C:
2026-03-03 16:38:24 +01:00
doTick()
case <-indexerHeartbeatNudge:
if isIndexerHB {
logger.Info().Msg("[native] step 2 — nudge received, heartbeating new indexers immediately")
doTick()
2026-02-17 13:11:22 +01:00
}
2026-03-03 16:38:24 +01:00
case <-nativeHeartbeatNudge:
if isNativeHB {
logger.Info().Msg("[native] native nudge received, heartbeating replacement native immediately")
doTick()
2026-01-30 16:57:36 +01:00
}
case <-ctx.Done():
return
}
}
}()
}
2026-02-24 14:31:37 +01:00
// ProtocolInfo describes per-protocol stream policy consumed by TempStream.
type ProtocolInfo struct {
	// PersistantStream — presumably marks the stream to be kept open beyond a
	// single exchange; not used in this file, confirm against callers.
	PersistantStream bool
	// WaitResponse — presumably the caller expects a reply on the stream;
	// not used in this file, confirm against callers.
	WaitResponse bool
	// TTL bounds both the dial timeout and the cached stream's lifetime in TempStream.
	TTL time.Duration
}
func TempStream(h host.Host, ad pp.AddrInfo, proto protocol.ID, did string, streams ProtocolStream, pts map[protocol.ID]*ProtocolInfo, mu *sync.RWMutex) (ProtocolStream, error) {
expiry := 2 * time.Second
if pts[proto] != nil {
expiry = pts[proto].TTL
}
2026-03-03 16:38:24 +01:00
ctxTTL, _ := context.WithTimeout(context.Background(), expiry)
if h.Network().Connectedness(ad.ID) != network.Connected {
if err := h.Connect(ctxTTL, ad); err != nil {
return streams, err
}
}
if streams[proto] != nil && streams[proto][ad.ID] != nil {
return streams, nil
} else if s, err := h.NewStream(ctxTTL, ad.ID, proto); err == nil {
mu.Lock()
if streams[proto] == nil {
streams[proto] = map[pp.ID]*Stream{}
}
2026-03-03 16:38:24 +01:00
mu.Unlock()
time.AfterFunc(expiry, func() {
mu.Lock()
2026-03-03 16:38:24 +01:00
delete(streams[proto], ad.ID)
mu.Unlock()
2026-03-03 16:38:24 +01:00
})
mu.Lock()
streams[proto][ad.ID] = &Stream{
DID: did,
Stream: s,
Expiry: time.Now().UTC().Add(expiry),
}
2026-03-03 16:38:24 +01:00
mu.Unlock()
return streams, nil
} else {
return streams, err
}
}
2026-02-03 15:25:15 +01:00
func sendHeartbeat(ctx context.Context, h host.Host, proto protocol.ID, p *pp.AddrInfo,
hb Heartbeat, ps ProtocolStream, interval time.Duration) error {
2026-03-03 16:38:24 +01:00
logger := oclib.GetLogger()
if ps[proto] == nil {
ps[proto] = map[pp.ID]*Stream{}
2026-01-30 16:57:36 +01:00
}
2026-03-03 16:38:24 +01:00
streams := ps[proto]
2026-01-30 16:57:36 +01:00
pss, exists := streams[p.ID]
2026-03-03 16:38:24 +01:00
ctxTTL, cancel := context.WithTimeout(ctx, 3*interval)
defer cancel()
2026-01-30 16:57:36 +01:00
// Connect si nécessaire
if h.Network().Connectedness(p.ID) != network.Connected {
if err := h.Connect(ctxTTL, *p); err != nil {
2026-03-03 16:38:24 +01:00
logger.Err(err)
return err
}
2026-01-30 16:57:36 +01:00
exists = false // on devra recréer le stream
}
// Crée le stream si inexistant ou fermé
if !exists || pss.Stream == nil {
2026-03-03 16:38:24 +01:00
logger.Info().Msg("New Stream engaged as Heartbeat " + fmt.Sprintf("%v", proto) + " " + p.ID.String())
2026-01-30 16:57:36 +01:00
s, err := h.NewStream(ctx, p.ID, proto)
if err != nil {
2026-03-03 16:38:24 +01:00
logger.Err(err)
2026-01-30 16:57:36 +01:00
return err
}
pss = &Stream{
Stream: s,
Expiry: time.Now().UTC().Add(2 * time.Minute),
}
streams[p.ID] = pss
}
// Envoie le heartbeat
ss := json.NewEncoder(pss.Stream)
err := ss.Encode(&hb)
if err != nil {
pss.Stream.Close()
pss.Stream = nil // recréera au prochain tick
return err
}
pss.Expiry = time.Now().UTC().Add(2 * time.Minute)
return nil
}