Full Flow: Catalog + Peer

This commit is contained in:
mr
2026-03-05 15:22:02 +01:00
parent ef3d998ead
commit 3751ec554d
20 changed files with 970 additions and 641 deletions

View File

@@ -112,6 +112,12 @@ func NewLongLivedPubSubService(h host.Host) *LongLivedPubSubService {
}
}
// GetPubSub returns the cached topic handle for topicName, or nil when the
// service has not (yet) joined that topic. Access to the cache is guarded by
// PubsubMu so it is safe to call concurrently with Subscribe*/waitResults.
func (s *LongLivedPubSubService) GetPubSub(topicName string) *pubsub.Topic {
	s.PubsubMu.Lock()
	topic := s.LongLivedPubSubs[topicName]
	s.PubsubMu.Unlock()
	return topic
}
func (s *LongLivedPubSubService) processEvent(
ctx context.Context,
p *peer.Peer,
@@ -123,26 +129,8 @@ func (s *LongLivedPubSubService) processEvent(
return handler(ctx, topicName, event)
}
const TopicPubSubNodeActivity = "oc-node-activity"
const TopicPubSubSearch = "oc-node-search"
// SubscribeToNodeActivity joins the node-activity topic, caches the topic
// handle, and — when f is non-nil — starts consuming events through
// SubscribeEvents with no timeout (-1).
//
// The registered topic validator accepts every message; filtering, if any,
// happens in the event handler itself.
func (s *LongLivedPubSubService) SubscribeToNodeActivity(ps *pubsub.PubSub, f *func(context.Context, TopicNodeActivityPub, string)) error {
	ps.RegisterTopicValidator(TopicPubSubNodeActivity, func(ctx context.Context, p pp.ID, m *pubsub.Message) bool {
		return true
	})
	if topic, err := ps.Join(TopicPubSubNodeActivity); err != nil {
		return err
	} else {
		// Unlock explicitly instead of defer: a deferred unlock would hold
		// PubsubMu across the SubscribeEvents call below, whose waitResults
		// loop also takes PubsubMu to check the cache. Mirrors the same fix
		// applied to SubscribeToSearch.
		s.PubsubMu.Lock()
		s.LongLivedPubSubs[TopicPubSubNodeActivity] = topic
		s.PubsubMu.Unlock()
	}
	if f != nil {
		return SubscribeEvents(s, context.Background(), TopicPubSubNodeActivity, -1, *f)
	}
	return nil
}
func (s *LongLivedPubSubService) SubscribeToSearch(ps *pubsub.PubSub, f *func(context.Context, Event, string)) error {
ps.RegisterTopicValidator(TopicPubSubSearch, func(ctx context.Context, p pp.ID, m *pubsub.Message) bool {
return true
@@ -151,8 +139,8 @@ func (s *LongLivedPubSubService) SubscribeToSearch(ps *pubsub.PubSub, f *func(co
return err
} else {
s.PubsubMu.Lock()
defer s.PubsubMu.Unlock()
s.LongLivedPubSubs[TopicPubSubSearch] = topic
s.PubsubMu.Unlock()
}
if f != nil {
return SubscribeEvents(s, context.Background(), TopicPubSubSearch, -1, *f)
@@ -178,10 +166,13 @@ func SubscribeEvents[T interface{}](s *LongLivedPubSubService,
}
func waitResults[T interface{}](s *LongLivedPubSubService, ctx context.Context, sub *pubsub.Subscription, proto string, timeout int, f func(context.Context, T, string)) {
fmt.Println("waitResults", proto)
defer ctx.Done()
for {
s.PubsubMu.Lock() // check safely if cache is actually notified subscribed to topic
if s.LongLivedPubSubs[proto] == nil { // if not kill the loop.
s.PubsubMu.Unlock()
break
}
s.PubsubMu.Unlock()

View File

@@ -27,9 +27,9 @@ type LongLivedStreamRecordedService[T interface{}] struct {
StreamRecords map[protocol.ID]map[pp.ID]*StreamRecord[T]
StreamMU sync.RWMutex
maxNodesConn int
// AfterHeartbeat is an optional hook called after each successful heartbeat update.
// The indexer sets it to republish the embedded signed record to the DHT.
AfterHeartbeat func(pid pp.ID)
// AfterHeartbeat is called after each successful heartbeat with the full
// decoded Heartbeat so the hook can use the fresh embedded PeerRecord.
AfterHeartbeat func(hb *Heartbeat)
// AfterDelete is called after gc() evicts an expired peer, outside the lock.
// name and did may be empty if the HeartbeatStream had no metadata.
AfterDelete func(pid pp.ID, name string, did string)
@@ -66,7 +66,6 @@ func (ix *LongLivedStreamRecordedService[T]) gc() {
return
}
streams := ix.StreamRecords[ProtocolHeartbeat]
fmt.Println(StaticNatives, StaticIndexers, streams)
type gcEntry struct {
pid pp.ID
@@ -184,9 +183,26 @@ func (ix *LongLivedStreamRecordedService[T]) HandleHeartbeat(s network.Stream) {
logger.Info().Msg("A new node is subscribed : " + pid.String())
}
ix.StreamMU.Unlock()
// Let the indexer republish the embedded signed record to the DHT.
if ix.AfterHeartbeat != nil {
ix.AfterHeartbeat(*pid)
// Enrich hb.DID before calling the hook: nodes never set hb.DID directly;
// extract it from the embedded signed PeerRecord if available, then fall
// back to the DID stored by handleNodePublish in the stream record.
if hb.DID == "" && len(hb.Record) > 0 {
var partial struct {
DID string `json:"did"`
}
if json.Unmarshal(hb.Record, &partial) == nil && partial.DID != "" {
hb.DID = partial.DID
}
}
if hb.DID == "" {
ix.StreamMU.RLock()
if rec, ok := streams[*pid]; ok {
hb.DID = rec.DID
}
ix.StreamMU.RUnlock()
}
if ix.AfterHeartbeat != nil && hb.DID != "" {
ix.AfterHeartbeat(hb)
}
}
}
@@ -214,17 +230,15 @@ func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams ma
}
lock.Unlock()
diversity := getDiversityRate(h, hb.IndexersBinded)
fmt.Println(upTime, bpms, diversity)
hb.ComputeIndexerScore(upTime, bpms, diversity)
// First heartbeat: uptime is always 0 so the score ceiling is 60, below the
// steady-state threshold of 75. Use a lower admission threshold so new peers
// can enter and start accumulating uptime. Subsequent heartbeats must meet
// the full threshold once uptime is tracked.
minScore := float64(50)
minScore := float64(40)
if isFirstHeartbeat {
minScore = 40
}
fmt.Println(hb.Score, minScore)
if hb.Score < minScore {
return nil, nil, errors.New("not enough trusting value")
}
@@ -239,13 +253,11 @@ func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams ma
}
func getDiversityRate(h host.Host, peers []string) float64 {
peers, _ = checkPeers(h, peers)
diverse := []string{}
for _, p := range peers {
ip, err := ExtractIP(p)
if err != nil {
fmt.Println("NO IP", p, err)
continue
}
div := ip.Mask(net.CIDRMask(24, 32)).String()
@@ -725,7 +737,7 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
go replenishIndexersFromNative(h, idxNeed)
}
}
logger.Debug().Str("peer", ix.ID.String()).Str("proto", string(proto)).Msg("[native] step 2 — heartbeat sent ok")
// logger.Debug().Str("peer", ix.ID.String()).Str("proto", string(proto)).Msg("[native] step 2 — heartbeat sent ok")
}
}
}
@@ -768,6 +780,7 @@ func TempStream(h host.Host, ad pp.AddrInfo, proto protocol.ID, did string, stre
return streams, err
}
}
if streams[proto] != nil && streams[proto][ad.ID] != nil {
return streams, nil
} else if s, err := h.NewStream(ctxTTL, ad.ID, proto); err == nil {

View File

@@ -4,6 +4,7 @@ import (
"context"
"cloud.o-forge.io/core/oc-lib/models/peer"
pubsub "github.com/libp2p/go-libp2p-pubsub"
)
type HeartBeatStreamed interface {
@@ -11,5 +12,6 @@ type HeartBeatStreamed interface {
}
type DiscoveryPeer interface {
GetPeerRecord(ctx context.Context, key string) ([]*peer.Peer, error)
GetPeerRecord(ctx context.Context, key string, search bool) ([]*peer.Peer, error)
GetPubSub(topicName string) *pubsub.Topic
}

View File

@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"math/rand"
"oc-discovery/conf"
"strings"
@@ -11,14 +12,16 @@ import (
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"github.com/libp2p/go-libp2p/core/crypto"
"github.com/libp2p/go-libp2p/core/host"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
const (
ProtocolNativeSubscription = "/opencloud/native/subscribe/1.0"
ProtocolNativeGetIndexers = "/opencloud/native/indexers/1.0"
ProtocolNativeSubscription = "/opencloud/native/subscribe/1.0"
ProtocolNativeUnsubscribe = "/opencloud/native/unsubscribe/1.0"
ProtocolNativeGetIndexers = "/opencloud/native/indexers/1.0"
// ProtocolNativeConsensus is used by nodes/indexers to cross-validate an indexer
// pool against all configured native peers.
ProtocolNativeConsensus = "/opencloud/native/consensus/1.0"
@@ -50,9 +53,50 @@ type ConsensusResponse struct {
// IndexerRegistration is sent by an indexer to a native to signal its alive state.
// Only Addr is required; PeerID is derived from it if omitted.
// Timestamp + PubKey + Signature allow the native and DHT to verify that the
// registration was produced by the peer that owns the declared PeerID.
type IndexerRegistration struct {
PeerID string `json:"peer_id,omitempty"`
Addr string `json:"addr"`
PeerID string `json:"peer_id,omitempty"`
Addr string `json:"addr"`
Timestamp int64 `json:"ts,omitempty"` // Unix nanoseconds (anti-replay)
PubKey []byte `json:"pub_key,omitempty"` // marshaled libp2p public key
Signature []byte `json:"sig,omitempty"` // Sign(signaturePayload())
}
// SignaturePayload returns the canonical byte slice that is signed/verified.
// Format: "<PeerID>|<Addr>|<Timestamp>"
func (r *IndexerRegistration) SignaturePayload() []byte {
	payload := fmt.Sprintf("%s|%s|%d", r.PeerID, r.Addr, r.Timestamp)
	return []byte(payload)
}
// Sign fills PubKey and Signature using the host's own private key.
// A host without an accessible private key leaves the registration unsigned,
// which Verify treats as "accepted but untrusted".
func (r *IndexerRegistration) Sign(h host.Host) {
	priv := h.Peerstore().PrivKey(h.ID())
	if priv == nil {
		return
	}
	pub, err := crypto.MarshalPublicKey(priv.GetPublic())
	if err == nil {
		r.PubKey = pub
	}
	// Best-effort: a failed Sign leaves Signature empty, so the registration
	// simply travels unsigned rather than aborting the flow.
	r.Signature, _ = priv.Sign(r.SignaturePayload())
}
// Verify returns true when the registration carries a valid self-signature.
// Returns true (not an error) when no signature is present, to remain backward-
// compatible with older nodes that do not sign their registrations.
func (r *IndexerRegistration) Verify() (bool, error) {
	// Unsigned registration — accepted but untrusted.
	if len(r.PubKey) == 0 || len(r.Signature) == 0 {
		return true, nil
	}
	pub, err := crypto.UnmarshalPublicKey(r.PubKey)
	if err != nil {
		return false, fmt.Errorf("unmarshal pub key: %w", err)
	}
	ok, verifyErr := pub.Verify(r.SignaturePayload(), r.Signature)
	if verifyErr != nil {
		return false, verifyErr
	}
	return ok, nil
}
// GetIndexersRequest asks a native for a pool of live indexers.
@@ -194,46 +238,100 @@ func replenishIndexersFromNative(h host.Host, need int) {
logger.Info().Msg("[native] step 4 — heartbeat goroutine nudged")
}
// fetchIndexersFromNative opens a ProtocolNativeGetIndexers stream to the first
// responsive native and returns the candidate list and fallback flag.
// fetchIndexersFromNative queries ALL configured natives in parallel and merges
// their indexer lists. Non-fallback responses are preferred; if only fallbacks
// respond the fallback list is returned. Results are deduplicated and capped at count.
func fetchIndexersFromNative(h host.Host, nativeAddrs []string, count int) (candidates []string, isFallback bool) {
logger := oclib.GetLogger()
for _, addr := range nativeAddrs {
ad, err := pp.AddrInfoFromString(addr)
if err != nil {
logger.Warn().Str("addr", addr).Msg("[native] fetch — skipping invalid addr")
continue
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
if err := h.Connect(ctx, *ad); err != nil {
cancel()
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — connect failed")
continue
}
s, err := h.NewStream(ctx, ad.ID, ProtocolNativeGetIndexers)
cancel()
if err != nil {
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — stream open failed")
continue
}
req := GetIndexersRequest{Count: count, From: h.ID().String()}
if encErr := json.NewEncoder(s).Encode(req); encErr != nil {
s.Close()
logger.Warn().Str("addr", addr).Err(encErr).Msg("[native] fetch — encode request failed")
continue
}
var resp GetIndexersResponse
if decErr := json.NewDecoder(s).Decode(&resp); decErr != nil {
s.Close()
logger.Warn().Str("addr", addr).Err(decErr).Msg("[native] fetch — decode response failed")
continue
}
s.Close()
logger.Info().Str("native", addr).Int("indexers", len(resp.Indexers)).Bool("fallback", resp.IsSelfFallback).Msg("[native] fetch — response received")
return resp.Indexers, resp.IsSelfFallback
if len(nativeAddrs) == 0 {
return nil, false
}
logger.Warn().Msg("[native] fetch — no native responded")
return nil, false
type fetchResult struct {
indexers []string
isFallback bool
}
ch := make(chan fetchResult, len(nativeAddrs))
for _, addr := range nativeAddrs {
go func(addr string) {
ad, err := pp.AddrInfoFromString(addr)
if err != nil {
ch <- fetchResult{}
return
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := h.Connect(ctx, *ad); err != nil {
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — connect failed")
ch <- fetchResult{}
return
}
s, err := h.NewStream(ctx, ad.ID, ProtocolNativeGetIndexers)
if err != nil {
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — stream open failed")
ch <- fetchResult{}
return
}
s.SetDeadline(time.Now().Add(4 * time.Second))
defer s.Close()
req := GetIndexersRequest{Count: count, From: h.ID().String()}
if encErr := json.NewEncoder(s).Encode(req); encErr != nil {
logger.Warn().Str("addr", addr).Err(encErr).Msg("[native] fetch — encode failed")
ch <- fetchResult{}
return
}
var resp GetIndexersResponse
if decErr := json.NewDecoder(s).Decode(&resp); decErr != nil {
logger.Warn().Str("addr", addr).Err(decErr).Msg("[native] fetch — decode failed")
ch <- fetchResult{}
return
}
logger.Info().Str("native", addr).Int("indexers", len(resp.Indexers)).Bool("fallback", resp.IsSelfFallback).Msg("[native] fetch — response received")
ch <- fetchResult{indexers: resp.Indexers, isFallback: resp.IsSelfFallback}
}(addr)
}
timer := time.NewTimer(6 * time.Second)
defer timer.Stop()
seen := map[string]struct{}{}
var realList, fallbackList []string
collected := 0
collect:
for collected < len(nativeAddrs) {
select {
case r := <-ch:
collected++
for _, ix := range r.indexers {
if _, ok := seen[ix]; ok {
continue
}
seen[ix] = struct{}{}
if r.isFallback {
fallbackList = append(fallbackList, ix)
} else {
realList = append(realList, ix)
}
}
case <-timer.C:
break collect
}
}
if len(realList) > 0 {
if len(realList) > count {
realList = realList[:count]
}
logger.Info().Int("count", len(realList)).Msg("[native] fetch — merged real indexers from all natives")
return realList, false
}
if len(fallbackList) > count {
fallbackList = fallbackList[:count]
}
logger.Info().Int("count", len(fallbackList)).Bool("fallback", true).Msg("[native] fetch — using fallback indexers")
return fallbackList, len(fallbackList) > 0
}
// resolvePool converts a candidate list to a validated addr→AddrInfo map.
@@ -337,6 +435,9 @@ func clientSideConsensus(h host.Host, candidates []string) (confirmed []string,
ch <- nativeResult{}
return
}
// Set an absolute deadline on the stream so Encode/Decode cannot
// block past the per-query budget, even if the remote native stalls.
s.SetDeadline(time.Now().Add(consensusQueryTimeout))
defer s.Close()
if err := json.NewEncoder(s).Encode(ConsensusRequest{Candidates: candidates}); err != nil {
ch <- nativeResult{}
@@ -387,9 +488,13 @@ collect:
return candidates, nil
}
quorum := conf.GetConfig().ConsensusQuorum
if quorum <= 0 {
quorum = 0.5
}
confirmedSet := map[string]struct{}{}
for addr, count := range trustedCounts {
if count*2 > total {
if float64(count) > float64(total)*quorum {
confirmed = append(confirmed, addr)
confirmedSet[addr] = struct{}{}
}
@@ -415,9 +520,11 @@ func RegisterWithNative(h host.Host, nativeAddressesStr string) {
return
}
reg := IndexerRegistration{
PeerID: h.ID().String(),
Addr: myAddr,
PeerID: h.ID().String(),
Addr: myAddr,
Timestamp: time.Now().UnixNano(),
}
reg.Sign(h)
for _, addr := range strings.Split(nativeAddressesStr, ",") {
addr = strings.TrimSpace(addr)
if addr == "" {
@@ -446,6 +553,40 @@ func RegisterWithNative(h host.Host, nativeAddressesStr string) {
}
}
// UnregisterFromNative sends an explicit deregistration message to each configured
// native so it can evict this indexer immediately without waiting for TTL expiry.
// Should be called during graceful shutdown. Failures are logged and skipped —
// the TTL mechanism remains the fallback for natives we could not reach.
func UnregisterFromNative(h host.Host, nativeAddressesStr string) {
	logger := oclib.GetLogger()
	reg := IndexerRegistration{PeerID: h.ID().String()}
	for _, addr := range strings.Split(nativeAddressesStr, ",") {
		addr = strings.TrimSpace(addr)
		if addr == "" {
			continue
		}
		ad, err := pp.AddrInfoFromString(addr)
		if err != nil {
			continue
		}
		ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
		if err := h.Connect(ctx, *ad); err != nil {
			cancel()
			logger.Warn().Str("addr", addr).Err(err).Msg("UnregisterFromNative: connect failed")
			continue
		}
		s, err := h.NewStream(ctx, ad.ID, ProtocolNativeUnsubscribe)
		cancel()
		if err != nil {
			logger.Warn().Str("addr", addr).Err(err).Msg("UnregisterFromNative: stream open failed")
			continue
		}
		// Bound the write so a stalled native cannot hang shutdown
		// (same pattern as the consensus/fetch streams).
		s.SetDeadline(time.Now().Add(3 * time.Second))
		if err := json.NewEncoder(s).Encode(reg); err != nil {
			logger.Warn().Str("addr", addr).Err(err).Msg("UnregisterFromNative: encode failed")
		}
		s.Close()
	}
}
// EnsureNativePeers populates StaticNatives from config and starts a single
// heartbeat goroutine toward the native mesh. Safe to call multiple times;
// the heartbeat goroutine is started at most once (nativeMeshHeartbeatOnce).
@@ -626,7 +767,7 @@ func replenishNativesFromPeers(h host.Host, lostAddr string, proto protocol.ID)
}
// Last (or only) native — retry periodically.
logger.Info().Str("addr", lostAddr).Msg("[native] replenish natives — last native lost, starting periodic retry")
go retryLostNative(h, lostAddr, proto)
go retryLostNative(context.Background(), h, lostAddr, proto)
}
// fetchNativeFromNatives asks each alive native for one of its own native contacts
@@ -677,7 +818,7 @@ func fetchNativeFromNatives(h host.Host, exclude []string) string {
return peer
}
}
logger.Debug().Str("native", ad.ID.String()).Msg("[native] fetch native peers — no new native from this peer")
// logger.Debug().Str("native", ad.ID.String()).Msg("[native] fetch native peers — no new native from this peer")
}
return ""
}
@@ -735,13 +876,19 @@ func fetchNativeFromIndexers(h host.Host, exclude []string) string {
}
// retryLostNative periodically retries connecting to a lost native address until
// it becomes reachable again or was already restored by another path.
func retryLostNative(h host.Host, addr string, nativeProto protocol.ID) {
// it becomes reachable again, was already restored by another path, or ctx is cancelled.
func retryLostNative(ctx context.Context, h host.Host, addr string, nativeProto protocol.ID) {
logger := oclib.GetLogger()
logger.Info().Str("addr", addr).Msg("[native] retry — periodic retry for lost native started")
t := time.NewTicker(retryNativeInterval)
defer t.Stop()
for range t.C {
for {
select {
case <-ctx.Done():
logger.Info().Str("addr", addr).Msg("[native] retry — context cancelled, stopping retry")
return
case <-t.C:
}
StreamNativeMu.RLock()
_, alreadyRestored := StaticNatives[addr]
StreamNativeMu.RUnlock()