demo test + Peer
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
cr "crypto/rand"
|
||||
"encoding/json"
|
||||
@@ -28,6 +27,12 @@ type LongLivedStreamRecordedService[T interface{}] struct {
|
||||
StreamRecords map[protocol.ID]map[pp.ID]*StreamRecord[T]
|
||||
StreamMU sync.RWMutex
|
||||
maxNodesConn int
|
||||
// AfterHeartbeat is an optional hook called after each successful heartbeat update.
|
||||
// The indexer sets it to republish the embedded signed record to the DHT.
|
||||
AfterHeartbeat func(pid pp.ID)
|
||||
// AfterDelete is called after gc() evicts an expired peer, outside the lock.
|
||||
// name and did may be empty if the HeartbeatStream had no metadata.
|
||||
AfterDelete func(pid pp.ID, name string, did string)
|
||||
}
|
||||
|
||||
func NewStreamRecordedService[T interface{}](h host.Host, maxNodesConn int) *LongLivedStreamRecordedService[T] {
|
||||
@@ -54,16 +59,29 @@ func (ix *LongLivedStreamRecordedService[T]) StartGC(interval time.Duration) {
|
||||
|
||||
func (ix *LongLivedStreamRecordedService[T]) gc() {
|
||||
ix.StreamMU.Lock()
|
||||
defer ix.StreamMU.Unlock()
|
||||
now := time.Now().UTC()
|
||||
if ix.StreamRecords[ProtocolHeartbeat] == nil {
|
||||
ix.StreamRecords[ProtocolHeartbeat] = map[pp.ID]*StreamRecord[T]{}
|
||||
ix.StreamMU.Unlock()
|
||||
return
|
||||
}
|
||||
streams := ix.StreamRecords[ProtocolHeartbeat]
|
||||
fmt.Println(StaticNatives, StaticIndexers, streams)
|
||||
|
||||
type gcEntry struct {
|
||||
pid pp.ID
|
||||
name string
|
||||
did string
|
||||
}
|
||||
var evicted []gcEntry
|
||||
for pid, rec := range streams {
|
||||
if now.After(rec.HeartbeatStream.Expiry) || now.Sub(rec.HeartbeatStream.UptimeTracker.LastSeen) > 2*rec.HeartbeatStream.Expiry.Sub(now) {
|
||||
name, did := "", ""
|
||||
if rec.HeartbeatStream != nil {
|
||||
name = rec.HeartbeatStream.Name
|
||||
did = rec.HeartbeatStream.DID
|
||||
}
|
||||
evicted = append(evicted, gcEntry{pid, name, did})
|
||||
for _, sstreams := range ix.StreamRecords {
|
||||
if sstreams[pid] != nil {
|
||||
delete(sstreams, pid)
|
||||
@@ -71,6 +89,13 @@ func (ix *LongLivedStreamRecordedService[T]) gc() {
|
||||
}
|
||||
}
|
||||
}
|
||||
ix.StreamMU.Unlock()
|
||||
|
||||
if ix.AfterDelete != nil {
|
||||
for _, e := range evicted {
|
||||
ix.AfterDelete(e.pid, e.name, e.did)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ix *LongLivedStreamRecordedService[T]) Snapshot(interval time.Duration) {
|
||||
@@ -101,8 +126,10 @@ func (ix *LongLivedStreamRecordedService[T]) snapshot() []*StreamRecord[T] {
|
||||
return out
|
||||
}
|
||||
|
||||
func (ix *LongLivedStreamRecordedService[T]) HandleNodeHeartbeat(s network.Stream) {
|
||||
func (ix *LongLivedStreamRecordedService[T]) HandleHeartbeat(s network.Stream) {
|
||||
logger := oclib.GetLogger()
|
||||
defer s.Close()
|
||||
dec := json.NewDecoder(s)
|
||||
for {
|
||||
ix.StreamMU.Lock()
|
||||
if ix.StreamRecords[ProtocolHeartbeat] == nil {
|
||||
@@ -114,17 +141,37 @@ func (ix *LongLivedStreamRecordedService[T]) HandleNodeHeartbeat(s network.Strea
|
||||
streamsAnonym[k] = v
|
||||
}
|
||||
ix.StreamMU.Unlock()
|
||||
|
||||
pid, hb, err := CheckHeartbeat(ix.Host, s, streamsAnonym, &ix.StreamMU, ix.maxNodesConn)
|
||||
pid, hb, err := CheckHeartbeat(ix.Host, s, dec, streamsAnonym, &ix.StreamMU, ix.maxNodesConn)
|
||||
if err != nil {
|
||||
// Stream-level errors (EOF, reset, closed) mean the connection is gone
|
||||
// — exit so the goroutine doesn't spin forever on a dead stream.
|
||||
// Metric/policy errors (score too low, too many connections) are transient
|
||||
// — those are also stream-terminal since the stream carries one session.
|
||||
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
|
||||
strings.Contains(err.Error(), "reset") ||
|
||||
strings.Contains(err.Error(), "closed") ||
|
||||
strings.Contains(err.Error(), "too many connections") {
|
||||
logger.Info().Err(err).Msg("heartbeat stream terminated, closing handler")
|
||||
return
|
||||
}
|
||||
logger.Warn().Err(err).Msg("heartbeat check failed, retrying on same stream")
|
||||
continue
|
||||
}
|
||||
ix.StreamMU.Lock()
|
||||
// if record already seen update last seen
|
||||
if rec, ok := streams[*pid]; ok {
|
||||
rec.DID = hb.DID
|
||||
if rec.HeartbeatStream == nil {
|
||||
rec.HeartbeatStream = hb.Stream
|
||||
}
|
||||
rec.HeartbeatStream = hb.Stream
|
||||
rec.HeartbeatStream.UptimeTracker.LastSeen = time.Now().UTC()
|
||||
if rec.HeartbeatStream.UptimeTracker == nil {
|
||||
rec.HeartbeatStream.UptimeTracker = &UptimeTracker{
|
||||
FirstSeen: time.Now().UTC(),
|
||||
LastSeen: time.Now().UTC(),
|
||||
}
|
||||
}
|
||||
logger.Info().Msg("A new node is updated : " + pid.String())
|
||||
} else {
|
||||
hb.Stream.UptimeTracker = &UptimeTracker{
|
||||
FirstSeen: time.Now().UTC(),
|
||||
@@ -134,37 +181,51 @@ func (ix *LongLivedStreamRecordedService[T]) HandleNodeHeartbeat(s network.Strea
|
||||
DID: hb.DID,
|
||||
HeartbeatStream: hb.Stream,
|
||||
}
|
||||
logger.Info().Msg("A new node is subscribed : " + pid.String())
|
||||
}
|
||||
ix.StreamMU.Unlock()
|
||||
// Let the indexer republish the embedded signed record to the DHT.
|
||||
if ix.AfterHeartbeat != nil {
|
||||
ix.AfterHeartbeat(*pid)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func CheckHeartbeat(h host.Host, s network.Stream, streams map[pp.ID]HeartBeatStreamed, lock *sync.RWMutex, maxNodes int) (*pp.ID, *Heartbeat, error) {
|
||||
func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams map[pp.ID]HeartBeatStreamed, lock *sync.RWMutex, maxNodes int) (*pp.ID, *Heartbeat, error) {
|
||||
if len(h.Network().Peers()) >= maxNodes {
|
||||
return nil, nil, fmt.Errorf("too many connections, try another indexer")
|
||||
}
|
||||
var hb Heartbeat
|
||||
if err := json.NewDecoder(s).Decode(&hb); err != nil {
|
||||
if err := dec.Decode(&hb); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
if ok, bpms, err := getBandwidthChallengeRate(MinPayloadChallenge+int(rand.Float64()*(MaxPayloadChallenge-MinPayloadChallenge)), s); err != nil {
|
||||
return nil, nil, err
|
||||
} else if !ok {
|
||||
return nil, nil, fmt.Errorf("Not a proper peer")
|
||||
} else {
|
||||
_, bpms, _ := getBandwidthChallengeRate(h, s.Conn().RemotePeer(), MinPayloadChallenge+int(rand.Float64()*(MaxPayloadChallenge-MinPayloadChallenge)))
|
||||
{
|
||||
pid, err := pp.Decode(hb.PeerID)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
upTime := float64(0)
|
||||
isFirstHeartbeat := true
|
||||
lock.Lock()
|
||||
if rec, ok := streams[pid]; ok && rec.GetUptimeTracker() != nil {
|
||||
upTime = rec.GetUptimeTracker().Uptime().Hours() / float64(time.Since(TimeWatcher).Hours())
|
||||
isFirstHeartbeat = false
|
||||
}
|
||||
lock.Unlock()
|
||||
diversity := getDiversityRate(h, hb.IndexersBinded)
|
||||
fmt.Println(upTime, bpms, diversity)
|
||||
hb.ComputeIndexerScore(upTime, bpms, diversity)
|
||||
if hb.Score < 75 {
|
||||
// First heartbeat: uptime is always 0 so the score ceiling is 60, below the
|
||||
// steady-state threshold of 75. Use a lower admission threshold so new peers
|
||||
// can enter and start accumulating uptime. Subsequent heartbeats must meet
|
||||
// the full threshold once uptime is tracked.
|
||||
minScore := float64(50)
|
||||
if isFirstHeartbeat {
|
||||
minScore = 40
|
||||
}
|
||||
fmt.Println(hb.Score, minScore)
|
||||
if hb.Score < minScore {
|
||||
return nil, nil, errors.New("not enough trusting value")
|
||||
}
|
||||
hb.Stream = &Stream{
|
||||
@@ -178,11 +239,13 @@ func CheckHeartbeat(h host.Host, s network.Stream, streams map[pp.ID]HeartBeatSt
|
||||
}
|
||||
|
||||
func getDiversityRate(h host.Host, peers []string) float64 {
|
||||
|
||||
peers, _ = checkPeers(h, peers)
|
||||
diverse := []string{}
|
||||
for _, p := range peers {
|
||||
ip, err := ExtractIP(p)
|
||||
if err != nil {
|
||||
fmt.Println("NO IP", p, err)
|
||||
continue
|
||||
}
|
||||
div := ip.Mask(net.CIDRMask(24, 32)).String()
|
||||
@@ -190,6 +253,9 @@ func getDiversityRate(h host.Host, peers []string) float64 {
|
||||
diverse = append(diverse, div)
|
||||
}
|
||||
}
|
||||
if len(diverse) == 0 || len(peers) == 0 {
|
||||
return 1
|
||||
}
|
||||
return float64(len(diverse) / len(peers))
|
||||
}
|
||||
|
||||
@@ -211,35 +277,42 @@ func checkPeers(h host.Host, peers []string) ([]string, []string) {
|
||||
return concretePeer, ips
|
||||
}
|
||||
|
||||
// Tunables for the bandwidth challenge used when scoring a heartbeating peer.
const (
	// MaxExpectedMbps is the throughput ceiling, in megabits per second.
	// NOTE(review): presumably used to normalize the measured rate; confirm
	// at the use site, which is outside this view.
	MaxExpectedMbps = 100.0

	// MinPayloadChallenge and MaxPayloadChallenge bound the size, in bytes,
	// of the random payload sent to the peer; the actual size is drawn
	// between the two on every challenge.
	MinPayloadChallenge = 512
	MaxPayloadChallenge = 2048

	// BaseRoundTrip is the fixed part of the maximum accepted echo round-trip;
	// a per-byte allowance is added on top of it by the challenge itself.
	BaseRoundTrip = 400 * time.Millisecond
)
|
||||
|
||||
func getBandwidthChallengeRate(payloadSize int, s network.Stream) (bool, float64, error) {
|
||||
// Génération payload aléatoire
|
||||
// getBandwidthChallengeRate opens a dedicated ProtocolBandwidthProbe stream to
|
||||
// remotePeer, sends a random payload, reads the echo, and computes throughput.
|
||||
// Using a separate stream avoids mixing binary data on the JSON heartbeat stream
|
||||
// and ensures the echo handler is actually running on the remote side.
|
||||
func getBandwidthChallengeRate(h host.Host, remotePeer pp.ID, payloadSize int) (bool, float64, error) {
|
||||
payload := make([]byte, payloadSize)
|
||||
_, err := cr.Read(payload)
|
||||
if _, err := cr.Read(payload); err != nil {
|
||||
return false, 0, err
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
s, err := h.NewStream(ctx, remotePeer, ProtocolBandwidthProbe)
|
||||
if err != nil {
|
||||
return false, 0, err
|
||||
}
|
||||
defer s.Reset()
|
||||
s.SetDeadline(time.Now().Add(10 * time.Second))
|
||||
start := time.Now()
|
||||
// send on heartbeat stream the challenge
|
||||
if _, err = s.Write(payload); err != nil {
|
||||
return false, 0, err
|
||||
}
|
||||
// read back
|
||||
s.CloseWrite()
|
||||
// Half-close the write side so the handler's io.Copy sees EOF and stops.
|
||||
// Read the echo.
|
||||
response := make([]byte, payloadSize)
|
||||
_, err = io.ReadFull(s, response)
|
||||
if err != nil {
|
||||
if _, err = io.ReadFull(s, response); err != nil {
|
||||
return false, 0, err
|
||||
}
|
||||
|
||||
duration := time.Since(start)
|
||||
// Verify content
|
||||
if !bytes.Equal(payload, response) {
|
||||
return false, 0, nil // pb or a sadge peer.
|
||||
}
|
||||
maxRoundTrip := BaseRoundTrip + (time.Duration(payloadSize) * (100 * time.Millisecond))
|
||||
mbps := float64(payloadSize*8) / duration.Seconds() / 1e6
|
||||
if duration > maxRoundTrip || mbps < 5.0 {
|
||||
@@ -345,13 +418,36 @@ var StaticIndexers map[string]*pp.AddrInfo = map[string]*pp.AddrInfo{}
|
||||
var StreamMuIndexes sync.RWMutex
|
||||
var StreamIndexers ProtocolStream = ProtocolStream{}
|
||||
|
||||
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID) error {
|
||||
// indexerHeartbeatNudge lets replenishIndexersFromNative trigger an immediate
// heartbeat tick after adding new entries to StaticIndexers, without waiting up
// to 20s for the regular ticker. It is buffered with capacity 1 so that a
// sender never blocks.
var indexerHeartbeatNudge = make(chan struct{}, 1)

// NudgeIndexerHeartbeat signals the indexer heartbeat goroutine to fire
// immediately. The call never blocks: when a nudge is already queued, the new
// one is dropped, so several nudges collapse into a single pending signal.
func NudgeIndexerHeartbeat() {
	select {
	case indexerHeartbeatNudge <- struct{}{}:
		// Nudge queued for the heartbeat goroutine.
	default:
		// A nudge is already pending; nothing more to do.
	}
}
|
||||
|
||||
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID, recordFn ...func() json.RawMessage) error {
|
||||
TimeWatcher = time.Now().UTC()
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
// If native addresses are configured, bypass static indexer addresses
|
||||
// If native addresses are configured, get the indexer pool from the native mesh,
|
||||
// then start the long-lived heartbeat goroutine toward those indexers.
|
||||
if conf.GetConfig().NativeIndexerAddresses != "" {
|
||||
return ConnectToNatives(h, minIndexer, maxIndexer, myPID)
|
||||
if err := ConnectToNatives(h, minIndexer, maxIndexer, myPID); err != nil {
|
||||
return err
|
||||
}
|
||||
// Step 2: start the long-lived heartbeat goroutine toward the indexer pool.
|
||||
// replaceStaticIndexers/replenishIndexersFromNative update the map in-place
|
||||
// so this single goroutine follows all pool changes automatically.
|
||||
logger.Info().Msg("[native] step 2 — starting long-lived heartbeat to indexer pool")
|
||||
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name,
|
||||
h, StreamIndexers, StaticIndexers, &StreamMuIndexes, 20*time.Second, recordFn...)
|
||||
return nil
|
||||
}
|
||||
|
||||
addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
|
||||
@@ -360,8 +456,8 @@ func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID)
|
||||
addresses = addresses[0:maxIndexer]
|
||||
}
|
||||
|
||||
StreamMuIndexes.Lock()
|
||||
for _, indexerAddr := range addresses {
|
||||
fmt.Println("GENERATE ADDR", indexerAddr)
|
||||
ad, err := pp.AddrInfoFromString(indexerAddr)
|
||||
if err != nil {
|
||||
logger.Err(err)
|
||||
@@ -369,15 +465,18 @@ func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID)
|
||||
}
|
||||
StaticIndexers[indexerAddr] = ad
|
||||
}
|
||||
indexerCount := len(StaticIndexers)
|
||||
StreamMuIndexes.Unlock()
|
||||
|
||||
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name, h, StreamIndexers, StaticIndexers, 20*time.Second) // your indexer is just like a node for the next indexer.
|
||||
if len(StaticIndexers) < minIndexer {
|
||||
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name, h, StreamIndexers, StaticIndexers, &StreamMuIndexes, 20*time.Second, recordFn...) // your indexer is just like a node for the next indexer.
|
||||
if indexerCount < minIndexer {
|
||||
return errors.New("you run a node without indexers... your gonna be isolated.")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func AddStreamProtocol(ctx *context.Context, protoS ProtocolStream, h host.Host, proto protocol.ID, id pp.ID, mypid pp.ID, force bool, onStreamCreated *func(network.Stream)) ProtocolStream {
|
||||
logger := oclib.GetLogger()
|
||||
if onStreamCreated == nil {
|
||||
f := func(s network.Stream) {
|
||||
protoS[proto][id] = &Stream{
|
||||
@@ -400,7 +499,7 @@ func AddStreamProtocol(ctx *context.Context, protoS ProtocolStream, h host.Host,
|
||||
if protoS[proto][id] != nil {
|
||||
protoS[proto][id].Expiry = time.Now().Add(2 * time.Minute)
|
||||
} else {
|
||||
fmt.Println("NEW STREAM", proto, id)
|
||||
logger.Info().Msg("NEW STREAM Generated" + fmt.Sprintf("%v", proto) + " " + id.String())
|
||||
s, err := h.NewStream(*ctx, id, proto)
|
||||
if err != nil {
|
||||
panic(err.Error())
|
||||
@@ -419,12 +518,16 @@ type Heartbeat struct {
|
||||
Timestamp int64 `json:"timestamp"`
|
||||
IndexersBinded []string `json:"indexers_binded"`
|
||||
Score float64
|
||||
// Record carries a fresh signed PeerRecord (JSON) so the receiving indexer
|
||||
// can republish it to the DHT without an extra round-trip.
|
||||
// Only set by nodes (not indexers heartbeating other indexers).
|
||||
Record json.RawMessage `json:"record,omitempty"`
|
||||
}
|
||||
|
||||
func (hb *Heartbeat) ComputeIndexerScore(uptimeHours float64, bpms float64, diversity float64) {
|
||||
hb.Score = (0.4 * uptimeHours) +
|
||||
(0.4 * bpms) +
|
||||
(0.2 * diversity)
|
||||
hb.Score = ((0.3 * uptimeHours) +
|
||||
(0.3 * bpms) +
|
||||
(0.4 * diversity)) * 100
|
||||
}
|
||||
|
||||
type HeartbeatInfo []struct {
|
||||
@@ -433,34 +536,213 @@ type HeartbeatInfo []struct {
|
||||
|
||||
const ProtocolHeartbeat = "/opencloud/heartbeat/1.0"
|
||||
|
||||
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, ps ProtocolStream, peers map[string]*pp.AddrInfo, interval time.Duration) {
|
||||
peerID, err := oclib.GenerateNodeID()
|
||||
if err == nil {
|
||||
panic("can't heartbeat daemon failed to start")
|
||||
// ProtocolBandwidthProbe is a dedicated short-lived stream used exclusively
|
||||
// for bandwidth/latency measurement. The handler echoes any bytes it receives.
|
||||
// All nodes and indexers register this handler so peers can measure them.
|
||||
const ProtocolBandwidthProbe = "/opencloud/probe/1.0"
|
||||
|
||||
// HandleBandwidthProbe echoes back everything written on the stream, then closes.
|
||||
// It is registered by all participants so the measuring side (the heartbeat receiver)
|
||||
// can open a dedicated probe stream and read the round-trip latency + throughput.
|
||||
func HandleBandwidthProbe(s network.Stream) {
|
||||
defer s.Close()
|
||||
s.SetDeadline(time.Now().Add(10 * time.Second))
|
||||
io.Copy(s, s) // echo every byte back to the sender
|
||||
}
|
||||
|
||||
// SendHeartbeat starts a goroutine that sends periodic heartbeats to peers.
|
||||
// recordFn, when provided, is called on each tick and its output is embedded in
|
||||
// the heartbeat as a fresh signed PeerRecord so the receiving indexer can
|
||||
// republish it to the DHT without an extra round-trip.
|
||||
// Pass no recordFn (or nil) for indexer→indexer / native heartbeats.
|
||||
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, ps ProtocolStream, peers map[string]*pp.AddrInfo, mu *sync.RWMutex, interval time.Duration, recordFn ...func() json.RawMessage) {
|
||||
logger := oclib.GetLogger()
|
||||
// isIndexerHB is true when this goroutine drives the indexer heartbeat.
|
||||
// isNativeHB is true when it drives the native heartbeat.
|
||||
isIndexerHB := mu == &StreamMuIndexes
|
||||
isNativeHB := mu == &StreamNativeMu
|
||||
var recFn func() json.RawMessage
|
||||
if len(recordFn) > 0 {
|
||||
recFn = recordFn[0]
|
||||
}
|
||||
go func() {
|
||||
logger.Info().Str("proto", string(proto)).Int("peers", len(peers)).Msg("heartbeat started")
|
||||
t := time.NewTicker(interval)
|
||||
defer t.Stop()
|
||||
|
||||
// doTick sends one round of heartbeats to the current peer snapshot.
|
||||
doTick := func() {
|
||||
// Build the heartbeat payload — snapshot current indexer addresses.
|
||||
StreamMuIndexes.RLock()
|
||||
addrs := make([]string, 0, len(StaticIndexers))
|
||||
for addr := range StaticIndexers {
|
||||
addrs = append(addrs, addr)
|
||||
}
|
||||
StreamMuIndexes.RUnlock()
|
||||
hb := Heartbeat{
|
||||
Name: name,
|
||||
PeerID: h.ID().String(),
|
||||
Timestamp: time.Now().UTC().Unix(),
|
||||
IndexersBinded: addrs,
|
||||
}
|
||||
if recFn != nil {
|
||||
hb.Record = recFn()
|
||||
}
|
||||
|
||||
// Snapshot the peer list under a read lock so we don't hold the
|
||||
// write lock during network I/O.
|
||||
if mu != nil {
|
||||
mu.RLock()
|
||||
}
|
||||
snapshot := make([]*pp.AddrInfo, 0, len(peers))
|
||||
for _, ix := range peers {
|
||||
snapshot = append(snapshot, ix)
|
||||
}
|
||||
if mu != nil {
|
||||
mu.RUnlock()
|
||||
}
|
||||
|
||||
for _, ix := range snapshot {
|
||||
wasConnected := h.Network().Connectedness(ix.ID) == network.Connected
|
||||
if err := sendHeartbeat(ctx, h, proto, ix, hb, ps, interval*time.Second); err != nil {
|
||||
// Step 3: heartbeat failed — remove from pool and trigger replenish.
|
||||
logger.Info().Str("peer", ix.ID.String()).Str("proto", string(proto)).Msg("[native] step 3 — heartbeat failed, removing peer from pool")
|
||||
|
||||
// Remove the dead peer and clean up its stream.
|
||||
// mu already covers ps when isIndexerHB (same mutex), so one
|
||||
// lock acquisition is sufficient — no re-entrant double-lock.
|
||||
if mu != nil {
|
||||
mu.Lock()
|
||||
}
|
||||
if ps[proto] != nil {
|
||||
if s, ok := ps[proto][ix.ID]; ok {
|
||||
if s.Stream != nil {
|
||||
s.Stream.Close()
|
||||
}
|
||||
delete(ps[proto], ix.ID)
|
||||
}
|
||||
}
|
||||
lostAddr := ""
|
||||
for addr, ad := range peers {
|
||||
if ad.ID == ix.ID {
|
||||
lostAddr = addr
|
||||
delete(peers, addr)
|
||||
break
|
||||
}
|
||||
}
|
||||
need := conf.GetConfig().MinIndexer - len(peers)
|
||||
remaining := len(peers)
|
||||
if mu != nil {
|
||||
mu.Unlock()
|
||||
}
|
||||
logger.Info().Int("remaining", remaining).Int("min", conf.GetConfig().MinIndexer).Int("need", need).Msg("[native] step 3 — pool state after removal")
|
||||
|
||||
// Step 4: ask the native for the missing indexer count.
|
||||
if isIndexerHB && conf.GetConfig().NativeIndexerAddresses != "" {
|
||||
if need < 1 {
|
||||
need = 1
|
||||
}
|
||||
logger.Info().Int("need", need).Msg("[native] step 3→4 — triggering replenish")
|
||||
go replenishIndexersFromNative(h, need)
|
||||
}
|
||||
|
||||
// Native heartbeat failed — find a replacement native.
|
||||
// Case 1: if the dead native was also serving as an indexer, evict it
|
||||
// from StaticIndexers immediately without waiting for the indexer HB tick.
|
||||
if isNativeHB {
|
||||
logger.Info().Str("addr", lostAddr).Msg("[native] step 3 — native heartbeat failed, triggering native replenish")
|
||||
if lostAddr != "" && conf.GetConfig().NativeIndexerAddresses != "" {
|
||||
StreamMuIndexes.Lock()
|
||||
if _, wasIndexer := StaticIndexers[lostAddr]; wasIndexer {
|
||||
delete(StaticIndexers, lostAddr)
|
||||
if s := StreamIndexers[ProtocolHeartbeat]; s != nil {
|
||||
if stream, ok := s[ix.ID]; ok {
|
||||
if stream.Stream != nil {
|
||||
stream.Stream.Close()
|
||||
}
|
||||
delete(s, ix.ID)
|
||||
}
|
||||
}
|
||||
idxNeed := conf.GetConfig().MinIndexer - len(StaticIndexers)
|
||||
StreamMuIndexes.Unlock()
|
||||
if idxNeed < 1 {
|
||||
idxNeed = 1
|
||||
}
|
||||
logger.Info().Str("addr", lostAddr).Msg("[native] dead native evicted from indexer pool, triggering replenish")
|
||||
go replenishIndexersFromNative(h, idxNeed)
|
||||
} else {
|
||||
StreamMuIndexes.Unlock()
|
||||
}
|
||||
}
|
||||
go replenishNativesFromPeers(h, lostAddr, proto)
|
||||
}
|
||||
} else {
|
||||
// Case 2: native-as-indexer reconnected after a restart.
|
||||
// If the peer was disconnected before this tick and the heartbeat just
|
||||
// succeeded (transparent reconnect), the native may have restarted with
|
||||
// blank state (responsiblePeers empty). Evict it from StaticIndexers and
|
||||
// re-request an assignment so the native re-tracks us properly and
|
||||
// runOffloadLoop can eventually migrate us to real indexers.
|
||||
if !wasConnected && isIndexerHB && conf.GetConfig().NativeIndexerAddresses != "" {
|
||||
StreamNativeMu.RLock()
|
||||
isNativeIndexer := false
|
||||
for _, ad := range StaticNatives {
|
||||
if ad.ID == ix.ID {
|
||||
isNativeIndexer = true
|
||||
break
|
||||
}
|
||||
}
|
||||
StreamNativeMu.RUnlock()
|
||||
if isNativeIndexer {
|
||||
if mu != nil {
|
||||
mu.Lock()
|
||||
}
|
||||
if ps[proto] != nil {
|
||||
if s, ok := ps[proto][ix.ID]; ok {
|
||||
if s.Stream != nil {
|
||||
s.Stream.Close()
|
||||
}
|
||||
delete(ps[proto], ix.ID)
|
||||
}
|
||||
}
|
||||
reconnectedAddr := ""
|
||||
for addr, ad := range peers {
|
||||
if ad.ID == ix.ID {
|
||||
reconnectedAddr = addr
|
||||
delete(peers, addr)
|
||||
break
|
||||
}
|
||||
}
|
||||
idxNeed := conf.GetConfig().MinIndexer - len(peers)
|
||||
if mu != nil {
|
||||
mu.Unlock()
|
||||
}
|
||||
if idxNeed < 1 {
|
||||
idxNeed = 1
|
||||
}
|
||||
logger.Info().Str("addr", reconnectedAddr).Str("peer", ix.ID.String()).Msg(
|
||||
"[native] native-as-indexer reconnected after restart — evicting and re-requesting assignment")
|
||||
go replenishIndexersFromNative(h, idxNeed)
|
||||
}
|
||||
}
|
||||
logger.Debug().Str("peer", ix.ID.String()).Str("proto", string(proto)).Msg("[native] step 2 — heartbeat sent ok")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-t.C:
|
||||
addrs := []string{}
|
||||
for addr := range StaticIndexers {
|
||||
addrs = append(addrs, addr)
|
||||
doTick()
|
||||
case <-indexerHeartbeatNudge:
|
||||
if isIndexerHB {
|
||||
logger.Info().Msg("[native] step 2 — nudge received, heartbeating new indexers immediately")
|
||||
doTick()
|
||||
}
|
||||
hb := Heartbeat{
|
||||
Name: name,
|
||||
DID: peerID,
|
||||
PeerID: h.ID().String(),
|
||||
Timestamp: time.Now().UTC().Unix(),
|
||||
IndexersBinded: addrs,
|
||||
}
|
||||
for _, ix := range peers {
|
||||
if err = sendHeartbeat(ctx, h, proto, ix, hb, ps, interval*time.Second); err != nil {
|
||||
StreamMuIndexes.Lock()
|
||||
delete(StreamIndexers[proto], ix.ID)
|
||||
StreamMuIndexes.Unlock()
|
||||
}
|
||||
case <-nativeHeartbeatNudge:
|
||||
if isNativeHB {
|
||||
logger.Info().Msg("[native] native nudge received, heartbeating replacement native immediately")
|
||||
doTick()
|
||||
}
|
||||
case <-ctx.Done():
|
||||
return
|
||||
@@ -480,58 +762,62 @@ func TempStream(h host.Host, ad pp.AddrInfo, proto protocol.ID, did string, stre
|
||||
if pts[proto] != nil {
|
||||
expiry = pts[proto].TTL
|
||||
}
|
||||
if ctxTTL, err := context.WithTimeout(context.Background(), expiry); err == nil {
|
||||
if h.Network().Connectedness(ad.ID) != network.Connected {
|
||||
if err := h.Connect(ctxTTL, ad); err != nil {
|
||||
return streams, err
|
||||
}
|
||||
}
|
||||
if streams[proto] != nil && streams[proto][ad.ID] != nil {
|
||||
return streams, nil
|
||||
} else if s, err := h.NewStream(ctxTTL, ad.ID, proto); err == nil {
|
||||
mu.Lock()
|
||||
if streams[proto] == nil {
|
||||
streams[proto] = map[pp.ID]*Stream{}
|
||||
}
|
||||
mu.Unlock()
|
||||
time.AfterFunc(expiry, func() {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
delete(streams[proto], ad.ID)
|
||||
})
|
||||
streams[ProtocolPublish][ad.ID] = &Stream{
|
||||
DID: did,
|
||||
Stream: s,
|
||||
Expiry: time.Now().UTC().Add(expiry),
|
||||
}
|
||||
mu.Unlock()
|
||||
return streams, nil
|
||||
} else {
|
||||
ctxTTL, _ := context.WithTimeout(context.Background(), expiry)
|
||||
if h.Network().Connectedness(ad.ID) != network.Connected {
|
||||
if err := h.Connect(ctxTTL, ad); err != nil {
|
||||
return streams, err
|
||||
}
|
||||
}
|
||||
return streams, errors.New("can't create a context")
|
||||
if streams[proto] != nil && streams[proto][ad.ID] != nil {
|
||||
return streams, nil
|
||||
} else if s, err := h.NewStream(ctxTTL, ad.ID, proto); err == nil {
|
||||
mu.Lock()
|
||||
if streams[proto] == nil {
|
||||
streams[proto] = map[pp.ID]*Stream{}
|
||||
}
|
||||
mu.Unlock()
|
||||
time.AfterFunc(expiry, func() {
|
||||
mu.Lock()
|
||||
delete(streams[proto], ad.ID)
|
||||
mu.Unlock()
|
||||
})
|
||||
mu.Lock()
|
||||
streams[proto][ad.ID] = &Stream{
|
||||
DID: did,
|
||||
Stream: s,
|
||||
Expiry: time.Now().UTC().Add(expiry),
|
||||
}
|
||||
mu.Unlock()
|
||||
return streams, nil
|
||||
} else {
|
||||
return streams, err
|
||||
}
|
||||
}
|
||||
|
||||
func sendHeartbeat(ctx context.Context, h host.Host, proto protocol.ID, p *pp.AddrInfo,
|
||||
hb Heartbeat, ps ProtocolStream, interval time.Duration) error {
|
||||
streams := ps.Get(proto)
|
||||
if len(streams) == 0 {
|
||||
return errors.New("no stream for protocol heartbeat founded")
|
||||
logger := oclib.GetLogger()
|
||||
if ps[proto] == nil {
|
||||
ps[proto] = map[pp.ID]*Stream{}
|
||||
}
|
||||
streams := ps[proto]
|
||||
pss, exists := streams[p.ID]
|
||||
ctxTTL, _ := context.WithTimeout(ctx, 3*interval)
|
||||
ctxTTL, cancel := context.WithTimeout(ctx, 3*interval)
|
||||
defer cancel()
|
||||
// Connect if necessary
|
||||
if h.Network().Connectedness(p.ID) != network.Connected {
|
||||
if err := h.Connect(ctxTTL, *p); err != nil {
|
||||
logger.Err(err)
|
||||
return err
|
||||
}
|
||||
exists = false // on devra recréer le stream
|
||||
}
|
||||
// Create the stream if it does not exist or is closed
|
||||
if !exists || pss.Stream == nil {
|
||||
logger.Info().Msg("New Stream engaged as Heartbeat " + fmt.Sprintf("%v", proto) + " " + p.ID.String())
|
||||
s, err := h.NewStream(ctx, p.ID, proto)
|
||||
if err != nil {
|
||||
logger.Err(err)
|
||||
return err
|
||||
}
|
||||
pss = &Stream{
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"github.com/libp2p/go-libp2p/core/host"
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
"github.com/libp2p/go-libp2p/core/protocol"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -56,7 +57,8 @@ type IndexerRegistration struct {
|
||||
|
||||
// GetIndexersRequest asks a native for a pool of live indexers.
type GetIndexersRequest struct {
	// Count is the number of indexers the caller would like to receive.
	Count int `json:"count"`
	// From identifies the requester.
	// NOTE(review): the expected format (peer ID, multiaddr, ...) is not
	// visible here — confirm against the native-side handler.
	From string `json:"from"`
}
|
||||
|
||||
// GetIndexersResponse is returned by the native with live indexer multiaddrs.
|
||||
@@ -69,17 +71,26 @@ var StaticNatives = map[string]*pp.AddrInfo{}
|
||||
var StreamNativeMu sync.RWMutex
|
||||
var StreamNatives ProtocolStream = ProtocolStream{}
|
||||
|
||||
// ConnectToNatives is the client-side entry point for nodes/indexers that have
|
||||
// NativeIndexerAddresses configured. It:
|
||||
// 1. Connects (long-lived heartbeat) to all configured natives.
|
||||
// 2. Fetches an initial indexer pool from the FIRST responsive native.
|
||||
// 3. Challenges that pool to ALL natives (consensus round 1).
|
||||
// 4. If the confirmed list is short, samples native suggestions and re-challenges (round 2).
|
||||
// 5. Populates StaticIndexers with majority-confirmed indexers.
|
||||
// nativeHeartbeatOnce ensures we start exactly one long-lived heartbeat goroutine
|
||||
// toward the native mesh, even when ConnectToNatives is called from recovery paths.
|
||||
var nativeHeartbeatOnce sync.Once
|
||||
|
||||
// nativeMeshHeartbeatOnce guards the native-to-native heartbeat goroutine started
|
||||
// by EnsureNativePeers so only one goroutine covers the whole StaticNatives map.
|
||||
var nativeMeshHeartbeatOnce sync.Once
|
||||
|
||||
// ConnectToNatives is the initial setup for nodes/indexers in native mode:
|
||||
// 1. Parses native addresses → StaticNatives.
|
||||
// 2. Starts a single long-lived heartbeat goroutine toward the native mesh.
|
||||
// 3. Fetches an initial indexer pool from the first responsive native.
|
||||
// 4. Runs consensus when real (non-fallback) indexers are returned.
|
||||
// 5. Replaces StaticIndexers with the confirmed pool.
|
||||
func ConnectToNatives(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID) error {
|
||||
logger := oclib.GetLogger()
|
||||
logger.Info().Msg("[native] step 1 — parsing native addresses")
|
||||
|
||||
// Parse in config order: the first entry is the primary pool source.
|
||||
// Parse native addresses — safe to call multiple times.
|
||||
StreamNativeMu.Lock()
|
||||
orderedAddrs := []string{}
|
||||
for _, addr := range strings.Split(conf.GetConfig().NativeIndexerAddresses, ",") {
|
||||
addr = strings.TrimSpace(addr)
|
||||
@@ -88,106 +99,208 @@ func ConnectToNatives(h host.Host, minIndexer int, maxIndexer int, myPID pp.ID)
|
||||
}
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
logger.Err(err).Msg("ConnectToNatives: invalid addr")
|
||||
logger.Err(err).Msg("[native] step 1 — invalid native addr")
|
||||
continue
|
||||
}
|
||||
StaticNatives[addr] = ad
|
||||
orderedAddrs = append(orderedAddrs, addr)
|
||||
logger.Info().Str("addr", addr).Msg("[native] step 1 — native registered")
|
||||
}
|
||||
if len(StaticNatives) == 0 {
|
||||
StreamNativeMu.Unlock()
|
||||
return errors.New("no valid native addresses configured")
|
||||
}
|
||||
StreamNativeMu.Unlock()
|
||||
logger.Info().Int("count", len(orderedAddrs)).Msg("[native] step 1 — natives parsed")
|
||||
|
||||
// Long-lived heartbeat connections to keep the native mesh active.
|
||||
SendHeartbeat(context.Background(), ProtocolHeartbeat,
|
||||
conf.GetConfig().Name, h, StreamNatives, StaticNatives, 20*time.Second)
|
||||
|
||||
// Step 1: get an initial pool from the FIRST responsive native (in config order).
|
||||
var candidates []string
|
||||
var isFallback bool
|
||||
for _, addr := range orderedAddrs {
|
||||
ad := StaticNatives[addr]
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
if err := h.Connect(ctx, *ad); err != nil {
|
||||
cancel()
|
||||
continue
|
||||
}
|
||||
s, err := h.NewStream(ctx, ad.ID, ProtocolNativeGetIndexers)
|
||||
cancel()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
req := GetIndexersRequest{Count: maxIndexer}
|
||||
if encErr := json.NewEncoder(s).Encode(req); encErr != nil {
|
||||
s.Close()
|
||||
continue
|
||||
}
|
||||
var resp GetIndexersResponse
|
||||
if decErr := json.NewDecoder(s).Decode(&resp); decErr != nil {
|
||||
s.Close()
|
||||
continue
|
||||
}
|
||||
s.Close()
|
||||
candidates = resp.Indexers
|
||||
isFallback = resp.IsSelfFallback
|
||||
break // first responsive native only
|
||||
}
|
||||
// Step 1: one long-lived heartbeat to each native.
|
||||
nativeHeartbeatOnce.Do(func() {
|
||||
logger.Info().Msg("[native] step 1 — starting long-lived heartbeat to native mesh")
|
||||
SendHeartbeat(context.Background(), ProtocolHeartbeat,
|
||||
conf.GetConfig().Name, h, StreamNatives, StaticNatives, &StreamNativeMu, 20*time.Second)
|
||||
})
|
||||
|
||||
// Fetch initial pool from the first responsive native.
|
||||
logger.Info().Int("want", maxIndexer).Msg("[native] step 1 — fetching indexer pool from native")
|
||||
candidates, isFallback := fetchIndexersFromNative(h, orderedAddrs, maxIndexer)
|
||||
if len(candidates) == 0 {
|
||||
logger.Warn().Msg("[native] step 1 — no candidates returned by any native")
|
||||
if minIndexer > 0 {
|
||||
return errors.New("ConnectToNatives: no indexers available from any native")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
logger.Info().Int("candidates", len(candidates)).Bool("fallback", isFallback).Msg("[native] step 1 — pool received")
|
||||
|
||||
// If the native is already the fallback indexer, use it directly — no consensus needed.
|
||||
// Step 2: populate StaticIndexers — consensus for real indexers, direct for fallback.
|
||||
pool := resolvePool(h, candidates, isFallback, maxIndexer)
|
||||
replaceStaticIndexers(pool)
|
||||
|
||||
StreamMuIndexes.RLock()
|
||||
indexerCount := len(StaticIndexers)
|
||||
StreamMuIndexes.RUnlock()
|
||||
logger.Info().Int("pool_size", indexerCount).Msg("[native] step 2 — StaticIndexers replaced")
|
||||
|
||||
if minIndexer > 0 && indexerCount < minIndexer {
|
||||
return errors.New("not enough majority-confirmed indexers available")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// replenishIndexersFromNative is called when an indexer heartbeat fails (step 3→4).
|
||||
// It asks the native for exactly `need` replacement indexers, runs consensus when
|
||||
// real indexers are returned, and adds the results to StaticIndexers without
|
||||
// clearing the existing pool.
|
||||
func replenishIndexersFromNative(h host.Host, need int) {
|
||||
if need <= 0 {
|
||||
return
|
||||
}
|
||||
logger := oclib.GetLogger()
|
||||
logger.Info().Int("need", need).Msg("[native] step 4 — replenishing indexer pool from native")
|
||||
|
||||
StreamNativeMu.RLock()
|
||||
addrs := make([]string, 0, len(StaticNatives))
|
||||
for addr := range StaticNatives {
|
||||
addrs = append(addrs, addr)
|
||||
}
|
||||
StreamNativeMu.RUnlock()
|
||||
|
||||
candidates, isFallback := fetchIndexersFromNative(h, addrs, need)
|
||||
if len(candidates) == 0 {
|
||||
logger.Warn().Msg("[native] step 4 — no candidates returned by any native")
|
||||
return
|
||||
}
|
||||
logger.Info().Int("candidates", len(candidates)).Bool("fallback", isFallback).Msg("[native] step 4 — candidates received")
|
||||
|
||||
pool := resolvePool(h, candidates, isFallback, need)
|
||||
if len(pool) == 0 {
|
||||
logger.Warn().Msg("[native] step 4 — consensus yielded no confirmed indexers")
|
||||
return
|
||||
}
|
||||
|
||||
// Add new indexers to the pool — do NOT clear existing ones.
|
||||
StreamMuIndexes.Lock()
|
||||
for addr, ad := range pool {
|
||||
StaticIndexers[addr] = ad
|
||||
}
|
||||
total := len(StaticIndexers)
|
||||
|
||||
StreamMuIndexes.Unlock()
|
||||
logger.Info().Int("added", len(pool)).Int("total", total).Msg("[native] step 4 — pool replenished")
|
||||
|
||||
// Nudge the heartbeat goroutine to connect immediately instead of waiting
|
||||
// for the next 20s tick.
|
||||
NudgeIndexerHeartbeat()
|
||||
logger.Info().Msg("[native] step 4 — heartbeat goroutine nudged")
|
||||
}
|
||||
|
||||
// fetchIndexersFromNative opens a ProtocolNativeGetIndexers stream to the first
|
||||
// responsive native and returns the candidate list and fallback flag.
|
||||
func fetchIndexersFromNative(h host.Host, nativeAddrs []string, count int) (candidates []string, isFallback bool) {
|
||||
logger := oclib.GetLogger()
|
||||
for _, addr := range nativeAddrs {
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
logger.Warn().Str("addr", addr).Msg("[native] fetch — skipping invalid addr")
|
||||
continue
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
if err := h.Connect(ctx, *ad); err != nil {
|
||||
cancel()
|
||||
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — connect failed")
|
||||
continue
|
||||
}
|
||||
s, err := h.NewStream(ctx, ad.ID, ProtocolNativeGetIndexers)
|
||||
cancel()
|
||||
if err != nil {
|
||||
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — stream open failed")
|
||||
continue
|
||||
}
|
||||
req := GetIndexersRequest{Count: count, From: h.ID().String()}
|
||||
if encErr := json.NewEncoder(s).Encode(req); encErr != nil {
|
||||
s.Close()
|
||||
logger.Warn().Str("addr", addr).Err(encErr).Msg("[native] fetch — encode request failed")
|
||||
continue
|
||||
}
|
||||
var resp GetIndexersResponse
|
||||
if decErr := json.NewDecoder(s).Decode(&resp); decErr != nil {
|
||||
s.Close()
|
||||
logger.Warn().Str("addr", addr).Err(decErr).Msg("[native] fetch — decode response failed")
|
||||
continue
|
||||
}
|
||||
s.Close()
|
||||
logger.Info().Str("native", addr).Int("indexers", len(resp.Indexers)).Bool("fallback", resp.IsSelfFallback).Msg("[native] fetch — response received")
|
||||
return resp.Indexers, resp.IsSelfFallback
|
||||
}
|
||||
logger.Warn().Msg("[native] fetch — no native responded")
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// resolvePool converts a candidate list to a validated addr→AddrInfo map.
|
||||
// When isFallback is true the native itself is the indexer — no consensus needed.
|
||||
// When isFallback is false, consensus is run before accepting the candidates.
|
||||
func resolvePool(h host.Host, candidates []string, isFallback bool, maxIndexer int) map[string]*pp.AddrInfo {
|
||||
logger := oclib.GetLogger()
|
||||
if isFallback {
|
||||
logger.Info().Strs("addrs", candidates).Msg("[native] resolve — fallback mode, skipping consensus")
|
||||
pool := make(map[string]*pp.AddrInfo, len(candidates))
|
||||
for _, addr := range candidates {
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
StaticIndexers[addr] = ad
|
||||
pool[addr] = ad
|
||||
}
|
||||
return nil
|
||||
return pool
|
||||
}
|
||||
|
||||
// Step 2: challenge the pool to ALL configured natives and score by majority vote.
|
||||
// Round 1.
|
||||
logger.Info().Int("candidates", len(candidates)).Msg("[native] resolve — consensus round 1")
|
||||
confirmed, suggestions := clientSideConsensus(h, candidates)
|
||||
logger.Info().Int("confirmed", len(confirmed)).Int("suggestions", len(suggestions)).Msg("[native] resolve — consensus round 1 done")
|
||||
|
||||
// Step 3: if we still have gaps, sample from suggestions and re-challenge.
|
||||
// Round 2: fill gaps from suggestions if below target.
|
||||
if len(confirmed) < maxIndexer && len(suggestions) > 0 {
|
||||
rand.Shuffle(len(suggestions), func(i, j int) { suggestions[i], suggestions[j] = suggestions[j], suggestions[i] })
|
||||
gap := maxIndexer - len(confirmed)
|
||||
if gap > len(suggestions) {
|
||||
gap = len(suggestions)
|
||||
}
|
||||
logger.Info().Int("gap", gap).Msg("[native] resolve — consensus round 2 (filling gaps)")
|
||||
confirmed2, _ := clientSideConsensus(h, append(confirmed, suggestions[:gap]...))
|
||||
if len(confirmed2) > 0 {
|
||||
confirmed = confirmed2
|
||||
}
|
||||
logger.Info().Int("confirmed", len(confirmed)).Msg("[native] resolve — consensus round 2 done")
|
||||
}
|
||||
|
||||
// Step 4: populate StaticIndexers with confirmed addresses.
|
||||
pool := make(map[string]*pp.AddrInfo, len(confirmed))
|
||||
for _, addr := range confirmed {
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
pool[addr] = ad
|
||||
}
|
||||
logger.Info().Int("pool_size", len(pool)).Msg("[native] resolve — pool ready")
|
||||
return pool
|
||||
}
|
||||
|
||||
// replaceStaticIndexers atomically replaces the active indexer pool.
|
||||
// Peers no longer in next have their heartbeat streams closed so the SendHeartbeat
|
||||
// goroutine stops sending to them on the next tick.
|
||||
func replaceStaticIndexers(next map[string]*pp.AddrInfo) {
|
||||
StreamMuIndexes.Lock()
|
||||
defer StreamMuIndexes.Unlock()
|
||||
for addr, ad := range next {
|
||||
StaticIndexers[addr] = ad
|
||||
}
|
||||
|
||||
if minIndexer > 0 && len(StaticIndexers) < minIndexer {
|
||||
return errors.New("not enough majority-confirmed indexers available")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// clientSideConsensus challenges a candidate list to ALL configured native peers
|
||||
// in parallel. Each native replies with the candidates it trusts plus extras it
|
||||
// recommends. An indexer is confirmed when strictly more than 50% of responding
|
||||
// natives trust it. The remaining addresses from native suggestions are returned
|
||||
// as suggestions for a possible second round.
|
||||
// natives trust it.
|
||||
func clientSideConsensus(h host.Host, candidates []string) (confirmed []string, suggestions []string) {
|
||||
if len(candidates) == 0 {
|
||||
return nil, nil
|
||||
@@ -201,7 +314,6 @@ func clientSideConsensus(h host.Host, candidates []string) (confirmed []string,
|
||||
StreamNativeMu.RUnlock()
|
||||
|
||||
if len(peers) == 0 {
|
||||
// No natives to challenge: trust candidates as-is.
|
||||
return candidates, nil
|
||||
}
|
||||
|
||||
@@ -239,13 +351,12 @@ func clientSideConsensus(h host.Host, candidates []string) (confirmed []string,
|
||||
}(ad)
|
||||
}
|
||||
|
||||
// Collect responses up to consensusCollectTimeout.
|
||||
timer := time.NewTimer(consensusCollectTimeout)
|
||||
defer timer.Stop()
|
||||
|
||||
trustedCounts := map[string]int{}
|
||||
suggestionPool := map[string]struct{}{}
|
||||
total := 0 // counts only natives that actually responded
|
||||
total := 0
|
||||
collected := 0
|
||||
|
||||
collect:
|
||||
@@ -254,7 +365,7 @@ collect:
|
||||
case r := <-ch:
|
||||
collected++
|
||||
if !r.responded {
|
||||
continue // timeout / error: skip, do not count as vote
|
||||
continue
|
||||
}
|
||||
total++
|
||||
seen := map[string]struct{}{}
|
||||
@@ -273,13 +384,12 @@ collect:
|
||||
}
|
||||
|
||||
if total == 0 {
|
||||
// No native responded: fall back to trusting the candidates as-is.
|
||||
return candidates, nil
|
||||
}
|
||||
|
||||
confirmedSet := map[string]struct{}{}
|
||||
for addr, count := range trustedCounts {
|
||||
if count*2 > total { // strictly >50%
|
||||
if count*2 > total {
|
||||
confirmed = append(confirmed, addr)
|
||||
confirmedSet[addr] = struct{}{}
|
||||
}
|
||||
@@ -292,15 +402,17 @@ collect:
|
||||
return
|
||||
}
|
||||
|
||||
// ProtocolIndexerHeartbeat is the protocol ID string of the indexer heartbeat
// protocol, version 1.0.
const ProtocolIndexerHeartbeat = "/opencloud/heartbeat/indexer/1.0"
|
||||
|
||||
// RegisterWithNative sends a one-shot registration to each configured native indexer.
|
||||
// Should be called periodically every RecommendedHeartbeatInterval.
|
||||
func RegisterWithNative(h host.Host, nativeAddressesStr string) {
|
||||
logger := oclib.GetLogger()
|
||||
myAddr := ""
|
||||
if len(h.Addrs()) > 0 {
|
||||
myAddr = h.Addrs()[0].String() + "/p2p/" + h.ID().String()
|
||||
if !strings.Contains(h.Addrs()[len(h.Addrs())-1].String(), "127.0.0.1") {
|
||||
myAddr = h.Addrs()[len(h.Addrs())-1].String() + "/p2p/" + h.ID().String()
|
||||
}
|
||||
if myAddr == "" {
|
||||
logger.Warn().Msg("RegisterWithNative: no routable address yet, skipping")
|
||||
return
|
||||
}
|
||||
reg := IndexerRegistration{
|
||||
PeerID: h.ID().String(),
|
||||
@@ -334,16 +446,16 @@ func RegisterWithNative(h host.Host, nativeAddressesStr string) {
|
||||
}
|
||||
}
|
||||
|
||||
// EnsureNativePeers populates StaticNatives from config and starts heartbeat
|
||||
// connections to other natives. Safe to call multiple times; heartbeat is only
|
||||
// started once (when StaticNatives transitions from empty to non-empty).
|
||||
// EnsureNativePeers populates StaticNatives from config and starts a single
|
||||
// heartbeat goroutine toward the native mesh. Safe to call multiple times;
|
||||
// the heartbeat goroutine is started at most once (nativeMeshHeartbeatOnce).
|
||||
func EnsureNativePeers(h host.Host) {
|
||||
logger := oclib.GetLogger()
|
||||
nativeAddrs := conf.GetConfig().NativeIndexerAddresses
|
||||
if nativeAddrs == "" {
|
||||
return
|
||||
}
|
||||
StreamNativeMu.Lock()
|
||||
wasEmpty := len(StaticNatives) == 0
|
||||
for _, addr := range strings.Split(nativeAddrs, ",") {
|
||||
addr = strings.TrimSpace(addr)
|
||||
if addr == "" {
|
||||
@@ -354,11 +466,312 @@ func EnsureNativePeers(h host.Host) {
|
||||
continue
|
||||
}
|
||||
StaticNatives[addr] = ad
|
||||
logger.Info().Str("addr", addr).Msg("native: registered peer in native mesh")
|
||||
}
|
||||
StreamNativeMu.Unlock()
|
||||
// One heartbeat goroutine iterates over all of StaticNatives on each tick;
|
||||
// starting one per address would multiply heartbeats by the native count.
|
||||
nativeMeshHeartbeatOnce.Do(func() {
|
||||
logger.Info().Msg("native: starting mesh heartbeat goroutine")
|
||||
SendHeartbeat(context.Background(), ProtocolHeartbeat,
|
||||
conf.GetConfig().Name, h, StreamNatives, StaticNatives, &StreamNativeMu, 20*time.Second)
|
||||
})
|
||||
}
|
||||
|
||||
if wasEmpty && len(StaticNatives) > 0 {
|
||||
SendHeartbeat(context.Background(), ProtocolIndexerHeartbeat,
|
||||
conf.GetConfig().Name, h, StreamNatives, StaticNatives, 20*time.Second)
|
||||
func StartNativeRegistration(h host.Host, nativeAddressesStr string) {
|
||||
go func() {
|
||||
// Poll until a routable (non-loopback) address is available before the first
|
||||
// registration attempt. libp2p may not have discovered external addresses yet
|
||||
// at startup. Cap at 12 retries (~1 minute) so we don't spin indefinitely.
|
||||
for i := 0; i < 12; i++ {
|
||||
hasRoutable := false
|
||||
if !strings.Contains(h.Addrs()[len(h.Addrs())-1].String(), "127.0.0.1") {
|
||||
hasRoutable = true
|
||||
break
|
||||
}
|
||||
|
||||
if hasRoutable {
|
||||
break
|
||||
}
|
||||
time.Sleep(5 * time.Second)
|
||||
}
|
||||
RegisterWithNative(h, nativeAddressesStr)
|
||||
t := time.NewTicker(RecommendedHeartbeatInterval)
|
||||
defer t.Stop()
|
||||
for range t.C {
|
||||
RegisterWithNative(h, nativeAddressesStr)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// ── Lost-native replacement ───────────────────────────────────────────────────
|
||||
|
||||
// Protocol IDs and timing used by the lost-native replacement flow.
const (
	// ProtocolNativeGetPeers lets a node/indexer ask a native for a random
	// selection of that native's own native contacts (to replace a dead native).
	ProtocolNativeGetPeers = "/opencloud/native/peers/1.0"
	// ProtocolIndexerGetNatives lets nodes/indexers ask a connected indexer for
	// its configured native addresses (fallback when no alive native responds).
	ProtocolIndexerGetNatives = "/opencloud/indexer/natives/1.0"
	// retryNativeInterval is how often retryLostNative polls a dead native.
	retryNativeInterval = 30 * time.Second
)
|
||||
|
||||
// GetNativePeersRequest is sent to a native to ask for its known native contacts.
type GetNativePeersRequest struct {
	// Exclude lists addresses the requester already knows or has lost.
	// NOTE(review): presumably the responder omits these; the requester also
	// filters the reply against the same list.
	Exclude []string `json:"exclude"`
	// Count is the number of contacts requested.
	Count int `json:"count"`
}
|
||||
|
||||
// GetNativePeersResponse carries native addresses returned by a native's peer list.
type GetNativePeersResponse struct {
	// Peers holds the returned native addresses as strings; requesters parse
	// them with pp.AddrInfoFromString before use.
	Peers []string `json:"peers"`
}
|
||||
|
||||
// GetIndexerNativesRequest is sent to an indexer to ask for its configured native addresses.
type GetIndexerNativesRequest struct {
	// Exclude lists addresses the requester already knows or has lost.
	// NOTE(review): presumably the indexer omits these; the requester also
	// filters the reply against the same list.
	Exclude []string `json:"exclude"`
}
|
||||
|
||||
// GetIndexerNativesResponse carries native addresses returned by an indexer.
type GetIndexerNativesResponse struct {
	// Natives holds the returned native addresses as strings; requesters parse
	// them with pp.AddrInfoFromString before use.
	Natives []string `json:"natives"`
}
|
||||
|
||||
// nativeHeartbeatNudge is a one-slot signal channel. replenishNativesFromPeers
// uses it (through NudgeNativeHeartbeat) to request an immediate native
// heartbeat tick after a replacement native has been added to the pool.
var nativeHeartbeatNudge = make(chan struct{}, 1)

// NudgeNativeHeartbeat queues a wake-up for the native heartbeat goroutine.
// It never blocks: when a wake-up is already pending the call is a no-op, so
// repeated nudges collapse into a single signal.
func NudgeNativeHeartbeat() {
	var wakeUp struct{}
	select {
	default:
		// The single buffer slot is taken: a nudge is already pending.
	case nativeHeartbeatNudge <- wakeUp:
	}
}
|
||||
|
||||
// replenishIndexersIfNeeded checks if the indexer pool is below the configured
|
||||
// minimum (or empty) and, if so, asks the native mesh for replacements.
|
||||
// Called whenever a native is recovered so the indexer pool is restored.
|
||||
func replenishIndexersIfNeeded(h host.Host) {
|
||||
logger := oclib.GetLogger()
|
||||
minIdx := conf.GetConfig().MinIndexer
|
||||
if minIdx < 1 {
|
||||
minIdx = 1
|
||||
}
|
||||
StreamMuIndexes.RLock()
|
||||
indexerCount := len(StaticIndexers)
|
||||
StreamMuIndexes.RUnlock()
|
||||
if indexerCount < minIdx {
|
||||
need := minIdx - indexerCount
|
||||
logger.Info().Int("need", need).Int("current", indexerCount).Msg("[native] native recovered — replenishing indexer pool")
|
||||
go replenishIndexersFromNative(h, need)
|
||||
}
|
||||
}
|
||||
|
||||
// replenishNativesFromPeers is called when the heartbeat to a native fails.
|
||||
// Flow:
|
||||
// 1. Ask other alive natives for one of their native contacts (ProtocolNativeGetPeers).
|
||||
// 2. If none respond or return a new address, ask connected indexers (ProtocolIndexerGetNatives).
|
||||
// 3. If no replacement found:
|
||||
// - remaining > 1 → ignore (enough natives remain).
|
||||
// - remaining ≤ 1 → start periodic retry (retryLostNative).
|
||||
func replenishNativesFromPeers(h host.Host, lostAddr string, proto protocol.ID) {
|
||||
if lostAddr == "" {
|
||||
return
|
||||
}
|
||||
logger := oclib.GetLogger()
|
||||
logger.Info().Str("lost", lostAddr).Msg("[native] replenish natives — start")
|
||||
|
||||
// Build exclude list: the lost addr + all currently alive natives.
|
||||
// lostAddr has already been removed from StaticNatives by doTick.
|
||||
StreamNativeMu.RLock()
|
||||
remaining := len(StaticNatives)
|
||||
exclude := make([]string, 0, remaining+1)
|
||||
exclude = append(exclude, lostAddr)
|
||||
for addr := range StaticNatives {
|
||||
exclude = append(exclude, addr)
|
||||
}
|
||||
StreamNativeMu.RUnlock()
|
||||
|
||||
logger.Info().Int("remaining", remaining).Msg("[native] replenish natives — step 1: ask alive natives for a peer")
|
||||
|
||||
// Step 1: ask other alive natives for a replacement.
|
||||
newAddr := fetchNativeFromNatives(h, exclude)
|
||||
|
||||
// Step 2: fallback — ask connected indexers for their native addresses.
|
||||
if newAddr == "" {
|
||||
logger.Info().Msg("[native] replenish natives — step 2: ask indexers for their native addresses")
|
||||
newAddr = fetchNativeFromIndexers(h, exclude)
|
||||
}
|
||||
|
||||
if newAddr != "" {
|
||||
ad, err := pp.AddrInfoFromString(newAddr)
|
||||
if err == nil {
|
||||
StreamNativeMu.Lock()
|
||||
StaticNatives[newAddr] = ad
|
||||
StreamNativeMu.Unlock()
|
||||
logger.Info().Str("new", newAddr).Msg("[native] replenish natives — replacement added, nudging heartbeat")
|
||||
NudgeNativeHeartbeat()
|
||||
replenishIndexersIfNeeded(h)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3: no replacement found.
|
||||
logger.Warn().Int("remaining", remaining).Msg("[native] replenish natives — no replacement found")
|
||||
if remaining > 1 {
|
||||
logger.Info().Msg("[native] replenish natives — enough natives remain, ignoring loss")
|
||||
return
|
||||
}
|
||||
// Last (or only) native — retry periodically.
|
||||
logger.Info().Str("addr", lostAddr).Msg("[native] replenish natives — last native lost, starting periodic retry")
|
||||
go retryLostNative(h, lostAddr, proto)
|
||||
}
|
||||
|
||||
// fetchNativeFromNatives asks each alive native for one of its own native contacts
|
||||
// not in exclude. Returns the first new address found or "" if none.
|
||||
func fetchNativeFromNatives(h host.Host, exclude []string) string {
|
||||
logger := oclib.GetLogger()
|
||||
excludeSet := make(map[string]struct{}, len(exclude))
|
||||
for _, e := range exclude {
|
||||
excludeSet[e] = struct{}{}
|
||||
}
|
||||
|
||||
StreamNativeMu.RLock()
|
||||
natives := make([]*pp.AddrInfo, 0, len(StaticNatives))
|
||||
for _, ad := range StaticNatives {
|
||||
natives = append(natives, ad)
|
||||
}
|
||||
StreamNativeMu.RUnlock()
|
||||
|
||||
rand.Shuffle(len(natives), func(i, j int) { natives[i], natives[j] = natives[j], natives[i] })
|
||||
|
||||
for _, ad := range natives {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
if err := h.Connect(ctx, *ad); err != nil {
|
||||
cancel()
|
||||
logger.Warn().Str("native", ad.ID.String()).Err(err).Msg("[native] fetch native peers — connect failed")
|
||||
continue
|
||||
}
|
||||
s, err := h.NewStream(ctx, ad.ID, ProtocolNativeGetPeers)
|
||||
cancel()
|
||||
if err != nil {
|
||||
logger.Warn().Str("native", ad.ID.String()).Err(err).Msg("[native] fetch native peers — stream failed")
|
||||
continue
|
||||
}
|
||||
req := GetNativePeersRequest{Exclude: exclude, Count: 1}
|
||||
if encErr := json.NewEncoder(s).Encode(req); encErr != nil {
|
||||
s.Close()
|
||||
continue
|
||||
}
|
||||
var resp GetNativePeersResponse
|
||||
if decErr := json.NewDecoder(s).Decode(&resp); decErr != nil {
|
||||
s.Close()
|
||||
continue
|
||||
}
|
||||
s.Close()
|
||||
for _, peer := range resp.Peers {
|
||||
if _, excluded := excludeSet[peer]; !excluded && peer != "" {
|
||||
logger.Info().Str("from", ad.ID.String()).Str("new", peer).Msg("[native] fetch native peers — got replacement")
|
||||
return peer
|
||||
}
|
||||
}
|
||||
logger.Debug().Str("native", ad.ID.String()).Msg("[native] fetch native peers — no new native from this peer")
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// fetchNativeFromIndexers asks connected indexers for their configured native addresses,
|
||||
// returning the first one not in exclude.
|
||||
func fetchNativeFromIndexers(h host.Host, exclude []string) string {
|
||||
logger := oclib.GetLogger()
|
||||
excludeSet := make(map[string]struct{}, len(exclude))
|
||||
for _, e := range exclude {
|
||||
excludeSet[e] = struct{}{}
|
||||
}
|
||||
|
||||
StreamMuIndexes.RLock()
|
||||
indexers := make([]*pp.AddrInfo, 0, len(StaticIndexers))
|
||||
for _, ad := range StaticIndexers {
|
||||
indexers = append(indexers, ad)
|
||||
}
|
||||
StreamMuIndexes.RUnlock()
|
||||
|
||||
rand.Shuffle(len(indexers), func(i, j int) { indexers[i], indexers[j] = indexers[j], indexers[i] })
|
||||
|
||||
for _, ad := range indexers {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
if err := h.Connect(ctx, *ad); err != nil {
|
||||
cancel()
|
||||
continue
|
||||
}
|
||||
s, err := h.NewStream(ctx, ad.ID, ProtocolIndexerGetNatives)
|
||||
cancel()
|
||||
if err != nil {
|
||||
logger.Warn().Str("indexer", ad.ID.String()).Err(err).Msg("[native] fetch indexer natives — stream failed")
|
||||
continue
|
||||
}
|
||||
req := GetIndexerNativesRequest{Exclude: exclude}
|
||||
if encErr := json.NewEncoder(s).Encode(req); encErr != nil {
|
||||
s.Close()
|
||||
continue
|
||||
}
|
||||
var resp GetIndexerNativesResponse
|
||||
if decErr := json.NewDecoder(s).Decode(&resp); decErr != nil {
|
||||
s.Close()
|
||||
continue
|
||||
}
|
||||
s.Close()
|
||||
for _, nativeAddr := range resp.Natives {
|
||||
if _, excluded := excludeSet[nativeAddr]; !excluded && nativeAddr != "" {
|
||||
logger.Info().Str("indexer", ad.ID.String()).Str("native", nativeAddr).Msg("[native] fetch indexer natives — got native")
|
||||
return nativeAddr
|
||||
}
|
||||
}
|
||||
}
|
||||
logger.Warn().Msg("[native] fetch indexer natives — no native found from indexers")
|
||||
return ""
|
||||
}
|
||||
|
||||
// retryLostNative periodically retries connecting to a lost native address until
|
||||
// it becomes reachable again or was already restored by another path.
|
||||
func retryLostNative(h host.Host, addr string, nativeProto protocol.ID) {
|
||||
logger := oclib.GetLogger()
|
||||
logger.Info().Str("addr", addr).Msg("[native] retry — periodic retry for lost native started")
|
||||
t := time.NewTicker(retryNativeInterval)
|
||||
defer t.Stop()
|
||||
for range t.C {
|
||||
StreamNativeMu.RLock()
|
||||
_, alreadyRestored := StaticNatives[addr]
|
||||
StreamNativeMu.RUnlock()
|
||||
if alreadyRestored {
|
||||
logger.Info().Str("addr", addr).Msg("[native] retry — native already restored, stopping retry")
|
||||
return
|
||||
}
|
||||
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
logger.Warn().Str("addr", addr).Msg("[native] retry — invalid addr, stopping retry")
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
err = h.Connect(ctx, *ad)
|
||||
cancel()
|
||||
if err != nil {
|
||||
logger.Warn().Str("addr", addr).Msg("[native] retry — still unreachable")
|
||||
continue
|
||||
}
|
||||
// Reachable again — add back to pool.
|
||||
StreamNativeMu.Lock()
|
||||
StaticNatives[addr] = ad
|
||||
StreamNativeMu.Unlock()
|
||||
logger.Info().Str("addr", addr).Msg("[native] retry — native reconnected and added back to pool")
|
||||
NudgeNativeHeartbeat()
|
||||
replenishIndexersIfNeeded(h)
|
||||
if nativeProto == ProtocolNativeGetIndexers {
|
||||
StartNativeRegistration(h, addr) // register back
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,17 +24,16 @@ func ExtractIP(addr string) (net.IP, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ips, err := ma.ValueForProtocol(multiaddr.P_IP4) // or P_IP6
|
||||
ipStr, err := ma.ValueForProtocol(multiaddr.P_IP4)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
ipStr, err = ma.ValueForProtocol(multiaddr.P_IP6)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
host, _, err := net.SplitHostPort(ips)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ip := net.ParseIP(host)
|
||||
ip := net.ParseIP(ipStr)
|
||||
if ip == nil {
|
||||
return nil, fmt.Errorf("invalid IP: %s", host)
|
||||
return nil, fmt.Errorf("invalid IP: %s", ipStr)
|
||||
}
|
||||
return ip, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user