2026-01-30 16:57:36 +01:00
package common
import (
"context"
2026-02-17 13:11:22 +01:00
cr "crypto/rand"
2026-01-30 16:57:36 +01:00
"encoding/json"
"errors"
"fmt"
2026-02-17 13:11:22 +01:00
"io"
"math/rand"
"net"
2026-01-30 16:57:36 +01:00
"oc-discovery/conf"
2026-02-17 13:11:22 +01:00
"slices"
2026-01-30 16:57:36 +01:00
"strings"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
type LongLivedStreamRecordedService [ T interface { } ] struct {
* LongLivedPubSubService
2026-02-20 12:42:18 +01:00
StreamRecords map [ protocol . ID ] map [ pp . ID ] * StreamRecord [ T ]
StreamMU sync . RWMutex
maxNodesConn int
2026-03-03 16:38:24 +01:00
// AfterHeartbeat is an optional hook called after each successful heartbeat update.
// The indexer sets it to republish the embedded signed record to the DHT.
AfterHeartbeat func ( pid pp . ID )
// AfterDelete is called after gc() evicts an expired peer, outside the lock.
// name and did may be empty if the HeartbeatStream had no metadata.
AfterDelete func ( pid pp . ID , name string , did string )
2026-01-30 16:57:36 +01:00
}
2026-02-20 12:42:18 +01:00
func NewStreamRecordedService [ T interface { } ] ( h host . Host , maxNodesConn int ) * LongLivedStreamRecordedService [ T ] {
2026-01-30 16:57:36 +01:00
service := & LongLivedStreamRecordedService [ T ] {
LongLivedPubSubService : NewLongLivedPubSubService ( h ) ,
StreamRecords : map [ protocol . ID ] map [ pp . ID ] * StreamRecord [ T ] { } ,
maxNodesConn : maxNodesConn ,
}
go service . StartGC ( 30 * time . Second )
// Garbage collection is needed on every Map of Long-Lived Stream... it may be a top level redesigned
go service . Snapshot ( 1 * time . Hour )
return service
}
// StartGC launches a goroutine that runs gc() once every interval, forever.
func (ix *LongLivedStreamRecordedService[T]) StartGC(interval time.Duration) {
	go func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for range ticker.C {
			ix.gc()
		}
	}()
}
func ( ix * LongLivedStreamRecordedService [ T ] ) gc ( ) {
ix . StreamMU . Lock ( )
2026-02-03 15:25:15 +01:00
now := time . Now ( ) . UTC ( )
if ix . StreamRecords [ ProtocolHeartbeat ] == nil {
2026-01-30 16:57:36 +01:00
ix . StreamRecords [ ProtocolHeartbeat ] = map [ pp . ID ] * StreamRecord [ T ] { }
2026-03-03 16:38:24 +01:00
ix . StreamMU . Unlock ( )
2026-01-30 16:57:36 +01:00
return
}
2026-02-03 15:25:15 +01:00
streams := ix . StreamRecords [ ProtocolHeartbeat ]
2026-03-03 16:38:24 +01:00
fmt . Println ( StaticNatives , StaticIndexers , streams )
2026-02-03 15:25:15 +01:00
2026-03-03 16:38:24 +01:00
type gcEntry struct {
pid pp . ID
name string
did string
}
var evicted [ ] gcEntry
2026-01-30 16:57:36 +01:00
for pid , rec := range streams {
2026-02-17 13:11:22 +01:00
if now . After ( rec . HeartbeatStream . Expiry ) || now . Sub ( rec . HeartbeatStream . UptimeTracker . LastSeen ) > 2 * rec . HeartbeatStream . Expiry . Sub ( now ) {
2026-03-03 16:38:24 +01:00
name , did := "" , ""
if rec . HeartbeatStream != nil {
name = rec . HeartbeatStream . Name
did = rec . HeartbeatStream . DID
}
evicted = append ( evicted , gcEntry { pid , name , did } )
2026-01-30 16:57:36 +01:00
for _ , sstreams := range ix . StreamRecords {
if sstreams [ pid ] != nil {
delete ( sstreams , pid )
}
}
}
}
2026-03-03 16:38:24 +01:00
ix . StreamMU . Unlock ( )
if ix . AfterDelete != nil {
for _ , e := range evicted {
ix . AfterDelete ( e . pid , e . name , e . did )
}
}
2026-01-30 16:57:36 +01:00
}
// Snapshot launches a goroutine that, at the given interval, logs the DID of
// every record currently held by the service.
func (ix *LongLivedStreamRecordedService[T]) Snapshot(interval time.Duration) {
	go func() {
		log := oclib.GetLogger()
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for range ticker.C {
			for _, rec := range ix.snapshot() {
				log.Info().Msg(" -> " + rec.DID)
			}
		}
	}()
}
// -------- Snapshot / Query --------

// snapshot returns a flat copy of all stream records across every protocol.
// The returned slice is safe to iterate without the lock; the pointed-to
// records are still shared.
func (ix *LongLivedStreamRecordedService[T]) snapshot() []*StreamRecord[T] {
	// Read-only traversal: a read lock suffices (previously this took the
	// write lock, blocking heartbeat handling during every snapshot).
	ix.StreamMU.RLock()
	defer ix.StreamMU.RUnlock()
	out := []*StreamRecord[T]{}
	for _, streams := range ix.StreamRecords {
		for _, stream := range streams {
			out = append(out, stream)
		}
	}
	return out
}
2026-03-03 16:38:24 +01:00
func ( ix * LongLivedStreamRecordedService [ T ] ) HandleHeartbeat ( s network . Stream ) {
logger := oclib . GetLogger ( )
2026-02-03 15:25:15 +01:00
defer s . Close ( )
2026-03-03 16:38:24 +01:00
dec := json . NewDecoder ( s )
2026-02-03 15:25:15 +01:00
for {
ix . StreamMU . Lock ( )
if ix . StreamRecords [ ProtocolHeartbeat ] == nil {
ix . StreamRecords [ ProtocolHeartbeat ] = map [ pp . ID ] * StreamRecord [ T ] { }
}
streams := ix . StreamRecords [ ProtocolHeartbeat ]
2026-02-17 13:11:22 +01:00
streamsAnonym := map [ pp . ID ] HeartBeatStreamed { }
for k , v := range streams {
streamsAnonym [ k ] = v
}
ix . StreamMU . Unlock ( )
2026-03-03 16:38:24 +01:00
pid , hb , err := CheckHeartbeat ( ix . Host , s , dec , streamsAnonym , & ix . StreamMU , ix . maxNodesConn )
2026-02-17 13:11:22 +01:00
if err != nil {
2026-03-03 16:38:24 +01:00
// Stream-level errors (EOF, reset, closed) mean the connection is gone
// — exit so the goroutine doesn't spin forever on a dead stream.
// Metric/policy errors (score too low, too many connections) are transient
// — those are also stream-terminal since the stream carries one session.
if errors . Is ( err , io . EOF ) || errors . Is ( err , io . ErrUnexpectedEOF ) ||
strings . Contains ( err . Error ( ) , "reset" ) ||
strings . Contains ( err . Error ( ) , "closed" ) ||
strings . Contains ( err . Error ( ) , "too many connections" ) {
logger . Info ( ) . Err ( err ) . Msg ( "heartbeat stream terminated, closing handler" )
return
}
logger . Warn ( ) . Err ( err ) . Msg ( "heartbeat check failed, retrying on same stream" )
2026-02-17 13:11:22 +01:00
continue
}
ix . StreamMU . Lock ( )
2026-02-03 15:25:15 +01:00
// if record already seen update last seen
if rec , ok := streams [ * pid ] ; ok {
rec . DID = hb . DID
2026-03-03 16:38:24 +01:00
if rec . HeartbeatStream == nil {
rec . HeartbeatStream = hb . Stream
}
2026-02-03 15:25:15 +01:00
rec . HeartbeatStream = hb . Stream
2026-03-03 16:38:24 +01:00
if rec . HeartbeatStream . UptimeTracker == nil {
rec . HeartbeatStream . UptimeTracker = & UptimeTracker {
FirstSeen : time . Now ( ) . UTC ( ) ,
LastSeen : time . Now ( ) . UTC ( ) ,
}
}
logger . Info ( ) . Msg ( "A new node is updated : " + pid . String ( ) )
2026-02-03 15:25:15 +01:00
} else {
2026-02-17 13:11:22 +01:00
hb . Stream . UptimeTracker = & UptimeTracker {
FirstSeen : time . Now ( ) . UTC ( ) ,
LastSeen : time . Now ( ) . UTC ( ) ,
}
2026-02-03 15:25:15 +01:00
streams [ * pid ] = & StreamRecord [ T ] {
DID : hb . DID ,
HeartbeatStream : hb . Stream ,
}
2026-03-03 16:38:24 +01:00
logger . Info ( ) . Msg ( "A new node is subscribed : " + pid . String ( ) )
2026-02-03 15:25:15 +01:00
}
ix . StreamMU . Unlock ( )
2026-03-03 16:38:24 +01:00
// Let the indexer republish the embedded signed record to the DHT.
if ix . AfterHeartbeat != nil {
ix . AfterHeartbeat ( * pid )
}
2026-01-30 16:57:36 +01:00
}
}
2026-03-03 16:38:24 +01:00
func CheckHeartbeat ( h host . Host , s network . Stream , dec * json . Decoder , streams map [ pp . ID ] HeartBeatStreamed , lock * sync . RWMutex , maxNodes int ) ( * pp . ID , * Heartbeat , error ) {
2026-01-30 16:57:36 +01:00
if len ( h . Network ( ) . Peers ( ) ) >= maxNodes {
return nil , nil , fmt . Errorf ( "too many connections, try another indexer" )
}
var hb Heartbeat
2026-03-03 16:38:24 +01:00
if err := dec . Decode ( & hb ) ; err != nil {
2026-01-30 16:57:36 +01:00
return nil , nil , err
}
2026-03-03 16:38:24 +01:00
_ , bpms , _ := getBandwidthChallengeRate ( h , s . Conn ( ) . RemotePeer ( ) , MinPayloadChallenge + int ( rand . Float64 ( ) * ( MaxPayloadChallenge - MinPayloadChallenge ) ) )
{
2026-02-17 13:11:22 +01:00
pid , err := pp . Decode ( hb . PeerID )
if err != nil {
return nil , nil , err
}
upTime := float64 ( 0 )
2026-03-03 16:38:24 +01:00
isFirstHeartbeat := true
2026-02-17 13:11:22 +01:00
lock . Lock ( )
if rec , ok := streams [ pid ] ; ok && rec . GetUptimeTracker ( ) != nil {
upTime = rec . GetUptimeTracker ( ) . Uptime ( ) . Hours ( ) / float64 ( time . Since ( TimeWatcher ) . Hours ( ) )
2026-03-03 16:38:24 +01:00
isFirstHeartbeat = false
2026-02-17 13:11:22 +01:00
}
lock . Unlock ( )
diversity := getDiversityRate ( h , hb . IndexersBinded )
2026-03-03 16:38:24 +01:00
fmt . Println ( upTime , bpms , diversity )
2026-02-17 13:11:22 +01:00
hb . ComputeIndexerScore ( upTime , bpms , diversity )
2026-03-03 16:38:24 +01:00
// First heartbeat: uptime is always 0 so the score ceiling is 60, below the
// steady-state threshold of 75. Use a lower admission threshold so new peers
// can enter and start accumulating uptime. Subsequent heartbeats must meet
// the full threshold once uptime is tracked.
minScore := float64 ( 50 )
if isFirstHeartbeat {
minScore = 40
}
fmt . Println ( hb . Score , minScore )
if hb . Score < minScore {
2026-02-17 13:11:22 +01:00
return nil , nil , errors . New ( "not enough trusting value" )
}
hb . Stream = & Stream {
Name : hb . Name ,
DID : hb . DID ,
Stream : s ,
Expiry : time . Now ( ) . UTC ( ) . Add ( 2 * time . Minute ) ,
} // here is the long-lived bidirectionnal heart bit.
return & pid , & hb , err
}
}
func getDiversityRate ( h host . Host , peers [ ] string ) float64 {
2026-03-03 16:38:24 +01:00
2026-02-17 13:11:22 +01:00
peers , _ = checkPeers ( h , peers )
diverse := [ ] string { }
for _ , p := range peers {
ip , err := ExtractIP ( p )
if err != nil {
2026-03-03 16:38:24 +01:00
fmt . Println ( "NO IP" , p , err )
2026-02-17 13:11:22 +01:00
continue
}
div := ip . Mask ( net . CIDRMask ( 24 , 32 ) ) . String ( )
if ! slices . Contains ( diverse , div ) {
diverse = append ( diverse , div )
}
}
2026-03-03 16:38:24 +01:00
if len ( diverse ) == 0 || len ( peers ) == 0 {
return 1
}
2026-02-17 13:11:22 +01:00
return float64 ( len ( diverse ) / len ( peers ) )
}
func checkPeers ( h host . Host , peers [ ] string ) ( [ ] string , [ ] string ) {
concretePeer := [ ] string { }
ips := [ ] string { }
for _ , p := range peers {
ad , err := pp . AddrInfoFromString ( p )
if err != nil {
continue
}
if PeerIsAlive ( h , * ad ) {
concretePeer = append ( concretePeer , p )
if ip , err := ExtractIP ( p ) ; err == nil {
ips = append ( ips , ip . Mask ( net . CIDRMask ( 24 , 32 ) ) . String ( ) )
}
}
}
return concretePeer , ips
}
2026-03-03 16:38:24 +01:00
const MaxExpectedMbps = 100.0
2026-02-17 13:11:22 +01:00
const MinPayloadChallenge = 512
const MaxPayloadChallenge = 2048
const BaseRoundTrip = 400 * time . Millisecond
2026-03-03 16:38:24 +01:00
// getBandwidthChallengeRate opens a dedicated ProtocolBandwidthProbe stream to
// remotePeer, sends a random payload, reads the echo, and computes throughput.
// Using a separate stream avoids mixing binary data on the JSON heartbeat stream
// and ensures the echo handler is actually running on the remote side.
func getBandwidthChallengeRate ( h host . Host , remotePeer pp . ID , payloadSize int ) ( bool , float64 , error ) {
2026-02-17 13:11:22 +01:00
payload := make ( [ ] byte , payloadSize )
2026-03-03 16:38:24 +01:00
if _ , err := cr . Read ( payload ) ; err != nil {
return false , 0 , err
}
ctx , cancel := context . WithTimeout ( context . Background ( ) , 10 * time . Second )
defer cancel ( )
s , err := h . NewStream ( ctx , remotePeer , ProtocolBandwidthProbe )
2026-02-17 13:11:22 +01:00
if err != nil {
return false , 0 , err
}
2026-03-03 16:38:24 +01:00
defer s . Reset ( )
s . SetDeadline ( time . Now ( ) . Add ( 10 * time . Second ) )
2026-02-17 13:11:22 +01:00
start := time . Now ( )
if _ , err = s . Write ( payload ) ; err != nil {
return false , 0 , err
}
2026-03-03 16:38:24 +01:00
s . CloseWrite ( )
// Half-close the write side so the handler's io.Copy sees EOF and stops.
// Read the echo.
2026-02-17 13:11:22 +01:00
response := make ( [ ] byte , payloadSize )
2026-03-03 16:38:24 +01:00
if _ , err = io . ReadFull ( s , response ) ; err != nil {
2026-02-17 13:11:22 +01:00
return false , 0 , err
}
duration := time . Since ( start )
maxRoundTrip := BaseRoundTrip + ( time . Duration ( payloadSize ) * ( 100 * time . Millisecond ) )
mbps := float64 ( payloadSize * 8 ) / duration . Seconds ( ) / 1e6
if duration > maxRoundTrip || mbps < 5.0 {
return false , float64 ( mbps / MaxExpectedMbps ) , nil
}
return true , float64 ( mbps / MaxExpectedMbps ) , nil
}
type UptimeTracker struct {
FirstSeen time . Time
LastSeen time . Time
}
func ( u * UptimeTracker ) Uptime ( ) time . Duration {
return time . Since ( u . FirstSeen )
}
func ( u * UptimeTracker ) IsEligible ( min time . Duration ) bool {
return u . Uptime ( ) >= min
2026-01-30 16:57:36 +01:00
}
type StreamRecord [ T interface { } ] struct {
DID string
HeartbeatStream * Stream
Record T
2026-02-17 13:11:22 +01:00
}
func ( s * StreamRecord [ T ] ) GetUptimeTracker ( ) * UptimeTracker {
if s . HeartbeatStream == nil {
return nil
}
return s . HeartbeatStream . UptimeTracker
2026-01-30 16:57:36 +01:00
}
type Stream struct {
2026-02-17 13:11:22 +01:00
Name string ` json:"name" `
DID string ` json:"did" `
Stream network . Stream
Expiry time . Time ` json:"expiry" `
UptimeTracker * UptimeTracker
}
func ( s * Stream ) GetUptimeTracker ( ) * UptimeTracker {
return s . UptimeTracker
2026-01-30 16:57:36 +01:00
}
func NewStream [ T interface { } ] ( s network . Stream , did string , record T ) * Stream {
return & Stream {
DID : did ,
Stream : s ,
Expiry : time . Now ( ) . UTC ( ) . Add ( 2 * time . Minute ) ,
}
}
type ProtocolStream map [ protocol . ID ] map [ pp . ID ] * Stream
func ( ps ProtocolStream ) Get ( protocol protocol . ID ) map [ pp . ID ] * Stream {
if ps [ protocol ] == nil {
ps [ protocol ] = map [ pp . ID ] * Stream { }
}
return ps [ protocol ]
}
func ( ps ProtocolStream ) Add ( protocol protocol . ID , peerID * pp . ID , s * Stream ) error {
if ps [ protocol ] == nil {
ps [ protocol ] = map [ pp . ID ] * Stream { }
}
if peerID != nil {
if s != nil {
ps [ protocol ] [ * peerID ] = s
} else {
return errors . New ( "unable to add stream : stream missing" )
}
}
return nil
}
func ( ps ProtocolStream ) Delete ( protocol protocol . ID , peerID * pp . ID ) {
if streams , ok := ps [ protocol ] ; ok {
if peerID != nil && streams [ * peerID ] != nil {
streams [ * peerID ] . Stream . Close ( )
delete ( streams , * peerID )
} else {
for _ , s := range ps {
for _ , v := range s {
v . Stream . Close ( )
}
}
delete ( ps , protocol )
}
}
}
// Record exchange protocol identifiers.
const (
	// ProtocolPublish carries record publications.
	ProtocolPublish = "/opencloud/record/publish/1.0"
	// ProtocolGet carries record retrieval requests.
	ProtocolGet = "/opencloud/record/get/1.0"
)
2026-02-17 13:11:22 +01:00
var TimeWatcher time . Time
var StaticIndexers map [ string ] * pp . AddrInfo = map [ string ] * pp . AddrInfo { }
2026-02-18 13:29:50 +01:00
var StreamMuIndexes sync . RWMutex
2026-01-30 16:57:36 +01:00
var StreamIndexers ProtocolStream = ProtocolStream { }
2026-03-03 16:38:24 +01:00
// indexerHeartbeatNudge allows replenishIndexersFromNative to trigger an immediate
// heartbeat tick after adding new entries to StaticIndexers, without waiting up
// to 20s for the regular ticker. Buffered(1) so the sender never blocks.
var indexerHeartbeatNudge = make(chan struct{}, 1)

// NudgeIndexerHeartbeat signals the indexer heartbeat goroutine to fire
// immediately. Nudges coalesce: if one is already pending this is a no-op.
func NudgeIndexerHeartbeat() {
	select {
	case indexerHeartbeatNudge <- struct{}{}:
	default:
		// A nudge is already queued; dropping this one is intentional.
	}
}
func ConnectToIndexers ( h host . Host , minIndexer int , maxIndexer int , myPID pp . ID , recordFn ... func ( ) json . RawMessage ) error {
2026-02-17 13:11:22 +01:00
TimeWatcher = time . Now ( ) . UTC ( )
2026-01-30 16:57:36 +01:00
logger := oclib . GetLogger ( )
2026-02-20 12:42:18 +01:00
2026-03-03 16:38:24 +01:00
// If native addresses are configured, get the indexer pool from the native mesh,
// then start the long-lived heartbeat goroutine toward those indexers.
2026-02-20 12:42:18 +01:00
if conf . GetConfig ( ) . NativeIndexerAddresses != "" {
2026-03-03 16:38:24 +01:00
if err := ConnectToNatives ( h , minIndexer , maxIndexer , myPID ) ; err != nil {
return err
}
// Step 2: start the long-lived heartbeat goroutine toward the indexer pool.
// replaceStaticIndexers/replenishIndexersFromNative update the map in-place
// so this single goroutine follows all pool changes automatically.
logger . Info ( ) . Msg ( "[native] step 2 — starting long-lived heartbeat to indexer pool" )
SendHeartbeat ( context . Background ( ) , ProtocolHeartbeat , conf . GetConfig ( ) . Name ,
h , StreamIndexers , StaticIndexers , & StreamMuIndexes , 20 * time . Second , recordFn ... )
return nil
2026-02-20 12:42:18 +01:00
}
2026-01-30 16:57:36 +01:00
addresses := strings . Split ( conf . GetConfig ( ) . IndexerAddresses , "," )
if len ( addresses ) > maxIndexer {
addresses = addresses [ 0 : maxIndexer ]
}
2026-03-03 16:38:24 +01:00
StreamMuIndexes . Lock ( )
2026-01-30 16:57:36 +01:00
for _ , indexerAddr := range addresses {
ad , err := pp . AddrInfoFromString ( indexerAddr )
if err != nil {
logger . Err ( err )
continue
}
2026-02-17 13:11:22 +01:00
StaticIndexers [ indexerAddr ] = ad
2026-01-30 16:57:36 +01:00
}
2026-03-03 16:38:24 +01:00
indexerCount := len ( StaticIndexers )
StreamMuIndexes . Unlock ( )
2026-01-30 16:57:36 +01:00
2026-03-03 16:38:24 +01:00
SendHeartbeat ( context . Background ( ) , ProtocolHeartbeat , conf . GetConfig ( ) . Name , h , StreamIndexers , StaticIndexers , & StreamMuIndexes , 20 * time . Second , recordFn ... ) // your indexer is just like a node for the next indexer.
if indexerCount < minIndexer {
2026-02-18 13:29:50 +01:00
return errors . New ( "you run a node without indexers... your gonna be isolated." )
2026-01-30 16:57:36 +01:00
}
2026-02-18 13:29:50 +01:00
return nil
2026-01-30 16:57:36 +01:00
}
2026-02-03 15:25:15 +01:00
func AddStreamProtocol ( ctx * context . Context , protoS ProtocolStream , h host . Host , proto protocol . ID , id pp . ID , mypid pp . ID , force bool , onStreamCreated * func ( network . Stream ) ) ProtocolStream {
2026-03-03 16:38:24 +01:00
logger := oclib . GetLogger ( )
2026-01-30 16:57:36 +01:00
if onStreamCreated == nil {
f := func ( s network . Stream ) {
protoS [ proto ] [ id ] = & Stream {
Stream : s ,
2026-02-03 15:25:15 +01:00
Expiry : time . Now ( ) . UTC ( ) . Add ( 2 * time . Minute ) ,
2026-01-30 16:57:36 +01:00
}
}
onStreamCreated = & f
}
f := * onStreamCreated
2026-02-03 15:25:15 +01:00
if mypid > id || force {
2026-01-30 16:57:36 +01:00
if ctx == nil {
c := context . Background ( )
ctx = & c
}
if protoS [ proto ] == nil {
protoS [ proto ] = map [ pp . ID ] * Stream { }
}
if protoS [ proto ] [ id ] != nil {
protoS [ proto ] [ id ] . Expiry = time . Now ( ) . Add ( 2 * time . Minute )
} else {
2026-03-03 16:38:24 +01:00
logger . Info ( ) . Msg ( "NEW STREAM Generated" + fmt . Sprintf ( "%v" , proto ) + " " + id . String ( ) )
2026-01-30 16:57:36 +01:00
s , err := h . NewStream ( * ctx , id , proto )
if err != nil {
panic ( err . Error ( ) )
}
f ( s )
}
}
return protoS
}
type Heartbeat struct {
2026-02-17 13:11:22 +01:00
Name string ` json:"name" `
Stream * Stream ` json:"stream" `
DID string ` json:"did" `
PeerID string ` json:"peer_id" `
Timestamp int64 ` json:"timestamp" `
IndexersBinded [ ] string ` json:"indexers_binded" `
Score float64
2026-03-03 16:38:24 +01:00
// Record carries a fresh signed PeerRecord (JSON) so the receiving indexer
// can republish it to the DHT without an extra round-trip.
// Only set by nodes (not indexers heartbeating other indexers).
Record json . RawMessage ` json:"record,omitempty" `
2026-02-17 13:11:22 +01:00
}
func ( hb * Heartbeat ) ComputeIndexerScore ( uptimeHours float64 , bpms float64 , diversity float64 ) {
2026-03-03 16:38:24 +01:00
hb . Score = ( ( 0.3 * uptimeHours ) +
( 0.3 * bpms ) +
( 0.4 * diversity ) ) * 100
2026-01-30 16:57:36 +01:00
}
// HeartbeatInfo is a list of opaque info payloads attached to a heartbeat.
type HeartbeatInfo []struct {
	Info []byte `json:"info"`
}
// ProtocolHeartbeat carries the long-lived JSON heartbeat session.
const ProtocolHeartbeat = "/opencloud/heartbeat/1.0"

// ProtocolBandwidthProbe is a dedicated short-lived stream used exclusively
// for bandwidth/latency measurement. The handler echoes any bytes it receives.
// All nodes and indexers register this handler so peers can measure them.
const ProtocolBandwidthProbe = "/opencloud/probe/1.0"
// HandleBandwidthProbe echoes back everything written on the stream, then closes.
// It is registered by all participants so the measuring side (the heartbeat receiver)
// can open a dedicated probe stream and read the round-trip latency + throughput.
func HandleBandwidthProbe ( s network . Stream ) {
defer s . Close ( )
s . SetDeadline ( time . Now ( ) . Add ( 10 * time . Second ) )
io . Copy ( s , s ) // echo every byte back to the sender
}
// SendHeartbeat starts a goroutine that sends periodic heartbeats to peers.
// recordFn, when provided, is called on each tick and its output is embedded in
// the heartbeat as a fresh signed PeerRecord so the receiving indexer can
// republish it to the DHT without an extra round-trip.
// Pass no recordFn (or nil) for indexer→indexer / native heartbeats.
func SendHeartbeat ( ctx context . Context , proto protocol . ID , name string , h host . Host , ps ProtocolStream , peers map [ string ] * pp . AddrInfo , mu * sync . RWMutex , interval time . Duration , recordFn ... func ( ) json . RawMessage ) {
logger := oclib . GetLogger ( )
// isIndexerHB is true when this goroutine drives the indexer heartbeat.
// isNativeHB is true when it drives the native heartbeat.
isIndexerHB := mu == & StreamMuIndexes
isNativeHB := mu == & StreamNativeMu
var recFn func ( ) json . RawMessage
if len ( recordFn ) > 0 {
recFn = recordFn [ 0 ]
2026-01-30 16:57:36 +01:00
}
go func ( ) {
2026-03-03 16:38:24 +01:00
logger . Info ( ) . Str ( "proto" , string ( proto ) ) . Int ( "peers" , len ( peers ) ) . Msg ( "heartbeat started" )
2026-01-30 16:57:36 +01:00
t := time . NewTicker ( interval )
defer t . Stop ( )
2026-03-03 16:38:24 +01:00
// doTick sends one round of heartbeats to the current peer snapshot.
doTick := func ( ) {
// Build the heartbeat payload — snapshot current indexer addresses.
StreamMuIndexes . RLock ( )
addrs := make ( [ ] string , 0 , len ( StaticIndexers ) )
for addr := range StaticIndexers {
addrs = append ( addrs , addr )
}
StreamMuIndexes . RUnlock ( )
hb := Heartbeat {
Name : name ,
PeerID : h . ID ( ) . String ( ) ,
Timestamp : time . Now ( ) . UTC ( ) . Unix ( ) ,
IndexersBinded : addrs ,
}
if recFn != nil {
hb . Record = recFn ( )
}
// Snapshot the peer list under a read lock so we don't hold the
// write lock during network I/O.
if mu != nil {
mu . RLock ( )
}
snapshot := make ( [ ] * pp . AddrInfo , 0 , len ( peers ) )
for _ , ix := range peers {
snapshot = append ( snapshot , ix )
}
if mu != nil {
mu . RUnlock ( )
}
for _ , ix := range snapshot {
wasConnected := h . Network ( ) . Connectedness ( ix . ID ) == network . Connected
if err := sendHeartbeat ( ctx , h , proto , ix , hb , ps , interval * time . Second ) ; err != nil {
// Step 3: heartbeat failed — remove from pool and trigger replenish.
logger . Info ( ) . Str ( "peer" , ix . ID . String ( ) ) . Str ( "proto" , string ( proto ) ) . Msg ( "[native] step 3 — heartbeat failed, removing peer from pool" )
// Remove the dead peer and clean up its stream.
// mu already covers ps when isIndexerHB (same mutex), so one
// lock acquisition is sufficient — no re-entrant double-lock.
if mu != nil {
mu . Lock ( )
}
if ps [ proto ] != nil {
if s , ok := ps [ proto ] [ ix . ID ] ; ok {
if s . Stream != nil {
s . Stream . Close ( )
}
delete ( ps [ proto ] , ix . ID )
}
}
lostAddr := ""
for addr , ad := range peers {
if ad . ID == ix . ID {
lostAddr = addr
delete ( peers , addr )
break
}
}
need := conf . GetConfig ( ) . MinIndexer - len ( peers )
remaining := len ( peers )
if mu != nil {
mu . Unlock ( )
}
logger . Info ( ) . Int ( "remaining" , remaining ) . Int ( "min" , conf . GetConfig ( ) . MinIndexer ) . Int ( "need" , need ) . Msg ( "[native] step 3 — pool state after removal" )
// Step 4: ask the native for the missing indexer count.
if isIndexerHB && conf . GetConfig ( ) . NativeIndexerAddresses != "" {
if need < 1 {
need = 1
}
logger . Info ( ) . Int ( "need" , need ) . Msg ( "[native] step 3→4 — triggering replenish" )
go replenishIndexersFromNative ( h , need )
}
// Native heartbeat failed — find a replacement native.
// Case 1: if the dead native was also serving as an indexer, evict it
// from StaticIndexers immediately without waiting for the indexer HB tick.
if isNativeHB {
logger . Info ( ) . Str ( "addr" , lostAddr ) . Msg ( "[native] step 3 — native heartbeat failed, triggering native replenish" )
if lostAddr != "" && conf . GetConfig ( ) . NativeIndexerAddresses != "" {
StreamMuIndexes . Lock ( )
if _ , wasIndexer := StaticIndexers [ lostAddr ] ; wasIndexer {
delete ( StaticIndexers , lostAddr )
if s := StreamIndexers [ ProtocolHeartbeat ] ; s != nil {
if stream , ok := s [ ix . ID ] ; ok {
if stream . Stream != nil {
stream . Stream . Close ( )
}
delete ( s , ix . ID )
}
}
idxNeed := conf . GetConfig ( ) . MinIndexer - len ( StaticIndexers )
StreamMuIndexes . Unlock ( )
if idxNeed < 1 {
idxNeed = 1
}
logger . Info ( ) . Str ( "addr" , lostAddr ) . Msg ( "[native] dead native evicted from indexer pool, triggering replenish" )
go replenishIndexersFromNative ( h , idxNeed )
} else {
StreamMuIndexes . Unlock ( )
}
}
go replenishNativesFromPeers ( h , lostAddr , proto )
}
} else {
// Case 2: native-as-indexer reconnected after a restart.
// If the peer was disconnected before this tick and the heartbeat just
// succeeded (transparent reconnect), the native may have restarted with
// blank state (responsiblePeers empty). Evict it from StaticIndexers and
// re-request an assignment so the native re-tracks us properly and
// runOffloadLoop can eventually migrate us to real indexers.
if ! wasConnected && isIndexerHB && conf . GetConfig ( ) . NativeIndexerAddresses != "" {
StreamNativeMu . RLock ( )
isNativeIndexer := false
for _ , ad := range StaticNatives {
if ad . ID == ix . ID {
isNativeIndexer = true
break
}
}
StreamNativeMu . RUnlock ( )
if isNativeIndexer {
if mu != nil {
mu . Lock ( )
}
if ps [ proto ] != nil {
if s , ok := ps [ proto ] [ ix . ID ] ; ok {
if s . Stream != nil {
s . Stream . Close ( )
}
delete ( ps [ proto ] , ix . ID )
}
}
reconnectedAddr := ""
for addr , ad := range peers {
if ad . ID == ix . ID {
reconnectedAddr = addr
delete ( peers , addr )
break
}
}
idxNeed := conf . GetConfig ( ) . MinIndexer - len ( peers )
if mu != nil {
mu . Unlock ( )
}
if idxNeed < 1 {
idxNeed = 1
}
logger . Info ( ) . Str ( "addr" , reconnectedAddr ) . Str ( "peer" , ix . ID . String ( ) ) . Msg (
"[native] native-as-indexer reconnected after restart — evicting and re-requesting assignment" )
go replenishIndexersFromNative ( h , idxNeed )
}
}
logger . Debug ( ) . Str ( "peer" , ix . ID . String ( ) ) . Str ( "proto" , string ( proto ) ) . Msg ( "[native] step 2 — heartbeat sent ok" )
}
}
}
2026-01-30 16:57:36 +01:00
for {
select {
case <- t . C :
2026-03-03 16:38:24 +01:00
doTick ( )
case <- indexerHeartbeatNudge :
if isIndexerHB {
logger . Info ( ) . Msg ( "[native] step 2 — nudge received, heartbeating new indexers immediately" )
doTick ( )
2026-02-17 13:11:22 +01:00
}
2026-03-03 16:38:24 +01:00
case <- nativeHeartbeatNudge :
if isNativeHB {
logger . Info ( ) . Msg ( "[native] native nudge received, heartbeating replacement native immediately" )
doTick ( )
2026-01-30 16:57:36 +01:00
}
case <- ctx . Done ( ) :
return
}
}
} ( )
}
2026-02-24 14:31:37 +01:00
type ProtocolInfo struct {
PersistantStream bool
WaitResponse bool
TTL time . Duration
}
func TempStream ( h host . Host , ad pp . AddrInfo , proto protocol . ID , did string , streams ProtocolStream , pts map [ protocol . ID ] * ProtocolInfo , mu * sync . RWMutex ) ( ProtocolStream , error ) {
expiry := 2 * time . Second
if pts [ proto ] != nil {
expiry = pts [ proto ] . TTL
}
2026-03-03 16:38:24 +01:00
ctxTTL , _ := context . WithTimeout ( context . Background ( ) , expiry )
if h . Network ( ) . Connectedness ( ad . ID ) != network . Connected {
if err := h . Connect ( ctxTTL , ad ) ; err != nil {
return streams , err
}
}
if streams [ proto ] != nil && streams [ proto ] [ ad . ID ] != nil {
return streams , nil
} else if s , err := h . NewStream ( ctxTTL , ad . ID , proto ) ; err == nil {
mu . Lock ( )
if streams [ proto ] == nil {
streams [ proto ] = map [ pp . ID ] * Stream { }
2026-02-18 13:29:50 +01:00
}
2026-03-03 16:38:24 +01:00
mu . Unlock ( )
time . AfterFunc ( expiry , func ( ) {
2026-02-18 13:29:50 +01:00
mu . Lock ( )
2026-03-03 16:38:24 +01:00
delete ( streams [ proto ] , ad . ID )
2026-02-18 13:29:50 +01:00
mu . Unlock ( )
2026-03-03 16:38:24 +01:00
} )
mu . Lock ( )
streams [ proto ] [ ad . ID ] = & Stream {
DID : did ,
Stream : s ,
Expiry : time . Now ( ) . UTC ( ) . Add ( expiry ) ,
2026-02-18 13:29:50 +01:00
}
2026-03-03 16:38:24 +01:00
mu . Unlock ( )
return streams , nil
} else {
return streams , err
2026-02-18 13:29:50 +01:00
}
}
2026-02-03 15:25:15 +01:00
func sendHeartbeat ( ctx context . Context , h host . Host , proto protocol . ID , p * pp . AddrInfo ,
hb Heartbeat , ps ProtocolStream , interval time . Duration ) error {
2026-03-03 16:38:24 +01:00
logger := oclib . GetLogger ( )
if ps [ proto ] == nil {
ps [ proto ] = map [ pp . ID ] * Stream { }
2026-01-30 16:57:36 +01:00
}
2026-03-03 16:38:24 +01:00
streams := ps [ proto ]
2026-01-30 16:57:36 +01:00
pss , exists := streams [ p . ID ]
2026-03-03 16:38:24 +01:00
ctxTTL , cancel := context . WithTimeout ( ctx , 3 * interval )
defer cancel ( )
2026-01-30 16:57:36 +01:00
// Connect si nécessaire
if h . Network ( ) . Connectedness ( p . ID ) != network . Connected {
2026-02-18 13:29:50 +01:00
if err := h . Connect ( ctxTTL , * p ) ; err != nil {
2026-03-03 16:38:24 +01:00
logger . Err ( err )
2026-02-18 13:29:50 +01:00
return err
}
2026-01-30 16:57:36 +01:00
exists = false // on devra recréer le stream
}
// Crée le stream si inexistant ou fermé
if ! exists || pss . Stream == nil {
2026-03-03 16:38:24 +01:00
logger . Info ( ) . Msg ( "New Stream engaged as Heartbeat " + fmt . Sprintf ( "%v" , proto ) + " " + p . ID . String ( ) )
2026-01-30 16:57:36 +01:00
s , err := h . NewStream ( ctx , p . ID , proto )
if err != nil {
2026-03-03 16:38:24 +01:00
logger . Err ( err )
2026-01-30 16:57:36 +01:00
return err
}
pss = & Stream {
Stream : s ,
Expiry : time . Now ( ) . UTC ( ) . Add ( 2 * time . Minute ) ,
}
streams [ p . ID ] = pss
}
// Envoie le heartbeat
ss := json . NewEncoder ( pss . Stream )
err := ss . Encode ( & hb )
if err != nil {
pss . Stream . Close ( )
pss . Stream = nil // recréera au prochain tick
return err
}
pss . Expiry = time . Now ( ) . UTC ( ) . Add ( 2 * time . Minute )
return nil
}