Full Flow: Catalog + Peer
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"oc-discovery/conf"
|
||||
"strings"
|
||||
@@ -11,14 +12,16 @@ import (
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"github.com/libp2p/go-libp2p/core/crypto"
|
||||
"github.com/libp2p/go-libp2p/core/host"
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
"github.com/libp2p/go-libp2p/core/protocol"
|
||||
)
|
||||
|
||||
const (
|
||||
ProtocolNativeSubscription = "/opencloud/native/subscribe/1.0"
|
||||
ProtocolNativeGetIndexers = "/opencloud/native/indexers/1.0"
|
||||
ProtocolNativeSubscription = "/opencloud/native/subscribe/1.0"
|
||||
ProtocolNativeUnsubscribe = "/opencloud/native/unsubscribe/1.0"
|
||||
ProtocolNativeGetIndexers = "/opencloud/native/indexers/1.0"
|
||||
// ProtocolNativeConsensus is used by nodes/indexers to cross-validate an indexer
|
||||
// pool against all configured native peers.
|
||||
ProtocolNativeConsensus = "/opencloud/native/consensus/1.0"
|
||||
@@ -50,9 +53,50 @@ type ConsensusResponse struct {
|
||||
|
||||
// IndexerRegistration is sent by an indexer to a native to signal its alive state.
|
||||
// Only Addr is required; PeerID is derived from it if omitted.
|
||||
// Timestamp + PubKey + Signature allow the native and DHT to verify that the
|
||||
// registration was produced by the peer that owns the declared PeerID.
|
||||
type IndexerRegistration struct {
|
||||
PeerID string `json:"peer_id,omitempty"`
|
||||
Addr string `json:"addr"`
|
||||
PeerID string `json:"peer_id,omitempty"`
|
||||
Addr string `json:"addr"`
|
||||
Timestamp int64 `json:"ts,omitempty"` // Unix nanoseconds (anti-replay)
|
||||
PubKey []byte `json:"pub_key,omitempty"` // marshaled libp2p public key
|
||||
Signature []byte `json:"sig,omitempty"` // Sign(signaturePayload())
|
||||
}
|
||||
|
||||
// SignaturePayload returns the canonical byte slice that is signed/verified.
|
||||
// Format: "<PeerID>|<Addr>|<Timestamp>"
|
||||
func (r *IndexerRegistration) SignaturePayload() []byte {
|
||||
return []byte(fmt.Sprintf("%s|%s|%d", r.PeerID, r.Addr, r.Timestamp))
|
||||
}
|
||||
|
||||
// Sign fills PubKey and Signature using the host's own private key.
|
||||
func (r *IndexerRegistration) Sign(h host.Host) {
|
||||
priv := h.Peerstore().PrivKey(h.ID())
|
||||
if priv == nil {
|
||||
return
|
||||
}
|
||||
if pub, err := crypto.MarshalPublicKey(priv.GetPublic()); err == nil {
|
||||
r.PubKey = pub
|
||||
}
|
||||
r.Signature, _ = priv.Sign(r.SignaturePayload())
|
||||
}
|
||||
|
||||
// Verify returns true when the registration carries a valid self-signature.
|
||||
// Returns true (not an error) when no signature is present, to remain backward-
|
||||
// compatible with older nodes that do not sign their registrations.
|
||||
func (r *IndexerRegistration) Verify() (bool, error) {
|
||||
if len(r.Signature) == 0 || len(r.PubKey) == 0 {
|
||||
return true, nil // unsigned — accepted but untrusted
|
||||
}
|
||||
pub, err := crypto.UnmarshalPublicKey(r.PubKey)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("unmarshal pub key: %w", err)
|
||||
}
|
||||
ok, err := pub.Verify(r.SignaturePayload(), r.Signature)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return ok, nil
|
||||
}
|
||||
|
||||
// GetIndexersRequest asks a native for a pool of live indexers.
|
||||
@@ -194,46 +238,100 @@ func replenishIndexersFromNative(h host.Host, need int) {
|
||||
logger.Info().Msg("[native] step 4 — heartbeat goroutine nudged")
|
||||
}
|
||||
|
||||
// fetchIndexersFromNative opens a ProtocolNativeGetIndexers stream to the first
|
||||
// responsive native and returns the candidate list and fallback flag.
|
||||
// fetchIndexersFromNative queries ALL configured natives in parallel and merges
|
||||
// their indexer lists. Non-fallback responses are preferred; if only fallbacks
|
||||
// respond the fallback list is returned. Results are deduplicated and capped at count.
|
||||
func fetchIndexersFromNative(h host.Host, nativeAddrs []string, count int) (candidates []string, isFallback bool) {
|
||||
logger := oclib.GetLogger()
|
||||
for _, addr := range nativeAddrs {
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
logger.Warn().Str("addr", addr).Msg("[native] fetch — skipping invalid addr")
|
||||
continue
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
if err := h.Connect(ctx, *ad); err != nil {
|
||||
cancel()
|
||||
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — connect failed")
|
||||
continue
|
||||
}
|
||||
s, err := h.NewStream(ctx, ad.ID, ProtocolNativeGetIndexers)
|
||||
cancel()
|
||||
if err != nil {
|
||||
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — stream open failed")
|
||||
continue
|
||||
}
|
||||
req := GetIndexersRequest{Count: count, From: h.ID().String()}
|
||||
if encErr := json.NewEncoder(s).Encode(req); encErr != nil {
|
||||
s.Close()
|
||||
logger.Warn().Str("addr", addr).Err(encErr).Msg("[native] fetch — encode request failed")
|
||||
continue
|
||||
}
|
||||
var resp GetIndexersResponse
|
||||
if decErr := json.NewDecoder(s).Decode(&resp); decErr != nil {
|
||||
s.Close()
|
||||
logger.Warn().Str("addr", addr).Err(decErr).Msg("[native] fetch — decode response failed")
|
||||
continue
|
||||
}
|
||||
s.Close()
|
||||
logger.Info().Str("native", addr).Int("indexers", len(resp.Indexers)).Bool("fallback", resp.IsSelfFallback).Msg("[native] fetch — response received")
|
||||
return resp.Indexers, resp.IsSelfFallback
|
||||
if len(nativeAddrs) == 0 {
|
||||
return nil, false
|
||||
}
|
||||
logger.Warn().Msg("[native] fetch — no native responded")
|
||||
return nil, false
|
||||
|
||||
type fetchResult struct {
|
||||
indexers []string
|
||||
isFallback bool
|
||||
}
|
||||
ch := make(chan fetchResult, len(nativeAddrs))
|
||||
|
||||
for _, addr := range nativeAddrs {
|
||||
go func(addr string) {
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
ch <- fetchResult{}
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := h.Connect(ctx, *ad); err != nil {
|
||||
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — connect failed")
|
||||
ch <- fetchResult{}
|
||||
return
|
||||
}
|
||||
s, err := h.NewStream(ctx, ad.ID, ProtocolNativeGetIndexers)
|
||||
if err != nil {
|
||||
logger.Warn().Str("addr", addr).Err(err).Msg("[native] fetch — stream open failed")
|
||||
ch <- fetchResult{}
|
||||
return
|
||||
}
|
||||
s.SetDeadline(time.Now().Add(4 * time.Second))
|
||||
defer s.Close()
|
||||
req := GetIndexersRequest{Count: count, From: h.ID().String()}
|
||||
if encErr := json.NewEncoder(s).Encode(req); encErr != nil {
|
||||
logger.Warn().Str("addr", addr).Err(encErr).Msg("[native] fetch — encode failed")
|
||||
ch <- fetchResult{}
|
||||
return
|
||||
}
|
||||
var resp GetIndexersResponse
|
||||
if decErr := json.NewDecoder(s).Decode(&resp); decErr != nil {
|
||||
logger.Warn().Str("addr", addr).Err(decErr).Msg("[native] fetch — decode failed")
|
||||
ch <- fetchResult{}
|
||||
return
|
||||
}
|
||||
logger.Info().Str("native", addr).Int("indexers", len(resp.Indexers)).Bool("fallback", resp.IsSelfFallback).Msg("[native] fetch — response received")
|
||||
ch <- fetchResult{indexers: resp.Indexers, isFallback: resp.IsSelfFallback}
|
||||
}(addr)
|
||||
}
|
||||
|
||||
timer := time.NewTimer(6 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
seen := map[string]struct{}{}
|
||||
var realList, fallbackList []string
|
||||
collected := 0
|
||||
|
||||
collect:
|
||||
for collected < len(nativeAddrs) {
|
||||
select {
|
||||
case r := <-ch:
|
||||
collected++
|
||||
for _, ix := range r.indexers {
|
||||
if _, ok := seen[ix]; ok {
|
||||
continue
|
||||
}
|
||||
seen[ix] = struct{}{}
|
||||
if r.isFallback {
|
||||
fallbackList = append(fallbackList, ix)
|
||||
} else {
|
||||
realList = append(realList, ix)
|
||||
}
|
||||
}
|
||||
case <-timer.C:
|
||||
break collect
|
||||
}
|
||||
}
|
||||
|
||||
if len(realList) > 0 {
|
||||
if len(realList) > count {
|
||||
realList = realList[:count]
|
||||
}
|
||||
logger.Info().Int("count", len(realList)).Msg("[native] fetch — merged real indexers from all natives")
|
||||
return realList, false
|
||||
}
|
||||
if len(fallbackList) > count {
|
||||
fallbackList = fallbackList[:count]
|
||||
}
|
||||
logger.Info().Int("count", len(fallbackList)).Bool("fallback", true).Msg("[native] fetch — using fallback indexers")
|
||||
return fallbackList, len(fallbackList) > 0
|
||||
}
|
||||
|
||||
// resolvePool converts a candidate list to a validated addr→AddrInfo map.
|
||||
@@ -337,6 +435,9 @@ func clientSideConsensus(h host.Host, candidates []string) (confirmed []string,
|
||||
ch <- nativeResult{}
|
||||
return
|
||||
}
|
||||
// Set an absolute deadline on the stream so Encode/Decode cannot
|
||||
// block past the per-query budget, even if the remote native stalls.
|
||||
s.SetDeadline(time.Now().Add(consensusQueryTimeout))
|
||||
defer s.Close()
|
||||
if err := json.NewEncoder(s).Encode(ConsensusRequest{Candidates: candidates}); err != nil {
|
||||
ch <- nativeResult{}
|
||||
@@ -387,9 +488,13 @@ collect:
|
||||
return candidates, nil
|
||||
}
|
||||
|
||||
quorum := conf.GetConfig().ConsensusQuorum
|
||||
if quorum <= 0 {
|
||||
quorum = 0.5
|
||||
}
|
||||
confirmedSet := map[string]struct{}{}
|
||||
for addr, count := range trustedCounts {
|
||||
if count*2 > total {
|
||||
if float64(count) > float64(total)*quorum {
|
||||
confirmed = append(confirmed, addr)
|
||||
confirmedSet[addr] = struct{}{}
|
||||
}
|
||||
@@ -415,9 +520,11 @@ func RegisterWithNative(h host.Host, nativeAddressesStr string) {
|
||||
return
|
||||
}
|
||||
reg := IndexerRegistration{
|
||||
PeerID: h.ID().String(),
|
||||
Addr: myAddr,
|
||||
PeerID: h.ID().String(),
|
||||
Addr: myAddr,
|
||||
Timestamp: time.Now().UnixNano(),
|
||||
}
|
||||
reg.Sign(h)
|
||||
for _, addr := range strings.Split(nativeAddressesStr, ",") {
|
||||
addr = strings.TrimSpace(addr)
|
||||
if addr == "" {
|
||||
@@ -446,6 +553,40 @@ func RegisterWithNative(h host.Host, nativeAddressesStr string) {
|
||||
}
|
||||
}
|
||||
|
||||
// UnregisterFromNative sends an explicit deregistration message to each configured
|
||||
// native so it can evict this indexer immediately without waiting for TTL expiry.
|
||||
// Should be called during graceful shutdown.
|
||||
func UnregisterFromNative(h host.Host, nativeAddressesStr string) {
|
||||
logger := oclib.GetLogger()
|
||||
reg := IndexerRegistration{PeerID: h.ID().String()}
|
||||
for _, addr := range strings.Split(nativeAddressesStr, ",") {
|
||||
addr = strings.TrimSpace(addr)
|
||||
if addr == "" {
|
||||
continue
|
||||
}
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
|
||||
if err := h.Connect(ctx, *ad); err != nil {
|
||||
cancel()
|
||||
logger.Warn().Str("addr", addr).Msg("UnregisterFromNative: connect failed")
|
||||
continue
|
||||
}
|
||||
s, err := h.NewStream(ctx, ad.ID, ProtocolNativeUnsubscribe)
|
||||
cancel()
|
||||
if err != nil {
|
||||
logger.Warn().Str("addr", addr).Msg("UnregisterFromNative: stream open failed")
|
||||
continue
|
||||
}
|
||||
if err := json.NewEncoder(s).Encode(reg); err != nil {
|
||||
logger.Warn().Str("addr", addr).Msg("UnregisterFromNative: encode failed")
|
||||
}
|
||||
s.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// EnsureNativePeers populates StaticNatives from config and starts a single
|
||||
// heartbeat goroutine toward the native mesh. Safe to call multiple times;
|
||||
// the heartbeat goroutine is started at most once (nativeMeshHeartbeatOnce).
|
||||
@@ -626,7 +767,7 @@ func replenishNativesFromPeers(h host.Host, lostAddr string, proto protocol.ID)
|
||||
}
|
||||
// Last (or only) native — retry periodically.
|
||||
logger.Info().Str("addr", lostAddr).Msg("[native] replenish natives — last native lost, starting periodic retry")
|
||||
go retryLostNative(h, lostAddr, proto)
|
||||
go retryLostNative(context.Background(), h, lostAddr, proto)
|
||||
}
|
||||
|
||||
// fetchNativeFromNatives asks each alive native for one of its own native contacts
|
||||
@@ -677,7 +818,7 @@ func fetchNativeFromNatives(h host.Host, exclude []string) string {
|
||||
return peer
|
||||
}
|
||||
}
|
||||
logger.Debug().Str("native", ad.ID.String()).Msg("[native] fetch native peers — no new native from this peer")
|
||||
// logger.Debug().Str("native", ad.ID.String()).Msg("[native] fetch native peers — no new native from this peer")
|
||||
}
|
||||
return ""
|
||||
}
|
||||
@@ -735,13 +876,19 @@ func fetchNativeFromIndexers(h host.Host, exclude []string) string {
|
||||
}
|
||||
|
||||
// retryLostNative periodically retries connecting to a lost native address until
|
||||
// it becomes reachable again or was already restored by another path.
|
||||
func retryLostNative(h host.Host, addr string, nativeProto protocol.ID) {
|
||||
// it becomes reachable again, was already restored by another path, or ctx is cancelled.
|
||||
func retryLostNative(ctx context.Context, h host.Host, addr string, nativeProto protocol.ID) {
|
||||
logger := oclib.GetLogger()
|
||||
logger.Info().Str("addr", addr).Msg("[native] retry — periodic retry for lost native started")
|
||||
t := time.NewTicker(retryNativeInterval)
|
||||
defer t.Stop()
|
||||
for range t.C {
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
logger.Info().Str("addr", addr).Msg("[native] retry — context cancelled, stopping retry")
|
||||
return
|
||||
case <-t.C:
|
||||
}
|
||||
StreamNativeMu.RLock()
|
||||
_, alreadyRestored := StaticNatives[addr]
|
||||
StreamNativeMu.RUnlock()
|
||||
|
||||
Reference in New Issue
Block a user