Full Flow : Catalog + Peer

This commit is contained in:
mr
2026-03-05 15:22:02 +01:00
parent ef3d998ead
commit 3751ec554d
20 changed files with 970 additions and 641 deletions

View File

@@ -5,6 +5,8 @@ import (
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"io"
"oc-discovery/conf"
"oc-discovery/daemons/node/common"
"strings"
@@ -83,30 +85,17 @@ func (pr *PeerRecord) ExtractPeer(ourkey string, key string, pubKey crypto.PubKe
NATSAddress: pr.NATSAddress,
WalletAddress: pr.WalletAddress,
}
b, err := json.Marshal(p)
if err != nil {
return pp.SELF == p.Relation, nil, err
}
if time.Now().UTC().After(pr.ExpiryDate) {
return pp.SELF == p.Relation, nil, errors.New("peer " + key + " is offline")
}
go tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.PEER,
Method: int(tools.CREATE_RESOURCE),
SearchAttr: "peer_id",
Payload: b,
})
return pp.SELF == p.Relation, p, nil
}
type GetValue struct {
Key string `json:"key"`
PeerID peer.ID `json:"peer_id"`
Name string `json:"name,omitempty"`
Search bool `json:"search,omitempty"`
Key string `json:"key"`
PeerID string `json:"peer_id,omitempty"`
Name string `json:"name,omitempty"`
Search bool `json:"search,omitempty"`
}
type GetResponse struct {
@@ -132,29 +121,33 @@ func (ix *IndexerService) initNodeHandler() {
// Each heartbeat from a node carries a freshly signed PeerRecord.
// Republish it to the DHT so the record never expires as long as the node
// is alive — no separate publish stream needed from the node side.
ix.AfterHeartbeat = func(pid peer.ID) {
ctx1, cancel1 := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel1()
res, err := ix.DHT.GetValue(ctx1, ix.genPIDKey(pid.String()))
if err != nil {
logger.Warn().Err(err)
return
}
did := string(res)
ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel2()
res, err = ix.DHT.GetValue(ctx2, ix.genKey(did))
if err != nil {
logger.Warn().Err(err)
return
}
ix.AfterHeartbeat = func(hb *common.Heartbeat) {
// Priority 1: use the fresh signed PeerRecord embedded in the heartbeat.
// Each heartbeat tick, the node re-signs with ExpiryDate = now+2min, so
// this record is always fresh. Fetching from DHT would give a stale expiry.
var rec PeerRecord
if err := json.Unmarshal(res, &rec); err != nil {
logger.Warn().Err(err).Str("peer", pid.String()).Msg("indexer: heartbeat record unmarshal failed")
return
if len(hb.Record) > 0 {
if err := json.Unmarshal(hb.Record, &rec); err != nil {
logger.Warn().Err(err).Msg("indexer: heartbeat embedded record unmarshal failed")
return
}
} else {
// Fallback: node didn't embed a record yet (first heartbeat before claimInfo).
// Fetch from DHT using the DID resolved by HandleHeartbeat.
ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
res, err := ix.DHT.GetValue(ctx2, ix.genKey(hb.DID))
cancel2()
if err != nil {
logger.Warn().Err(err).Str("did", hb.DID).Msg("indexer: DHT fetch for refresh failed")
return
}
if err := json.Unmarshal(res, &rec); err != nil {
logger.Warn().Err(err).Str("did", hb.DID).Msg("indexer: heartbeat record unmarshal failed")
return
}
}
if _, err := rec.Verify(); err != nil {
logger.Warn().Err(err).Str("peer", pid.String()).Msg("indexer: heartbeat record signature invalid")
logger.Warn().Err(err).Str("did", rec.DID).Msg("indexer: heartbeat record signature invalid")
return
}
data, err := json.Marshal(rec)
@@ -162,12 +155,14 @@ func (ix *IndexerService) initNodeHandler() {
return
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
logger.Info().Msg("REFRESH PutValue " + ix.genKey(rec.DID))
if err := ix.DHT.PutValue(ctx, ix.genKey(rec.DID), data); err != nil {
logger.Warn().Err(err).Str("did", rec.DID).Msg("indexer: DHT refresh failed")
cancel()
return
}
cancel()
ix.publishNameEvent(NameIndexAdd, rec.Name, rec.PeerID, rec.DID)
if rec.Name != "" {
ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
ix.DHT.PutValue(ctx2, ix.genNameKey(rec.Name), []byte(rec.DID))
@@ -188,132 +183,151 @@ func (ix *IndexerService) initNodeHandler() {
func (ix *IndexerService) handleNodePublish(s network.Stream) {
defer s.Close()
logger := oclib.GetLogger()
for {
var rec PeerRecord
if err := json.NewDecoder(s).Decode(&rec); err != nil {
logger.Err(err)
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
strings.Contains(err.Error(), "reset") ||
strings.Contains(err.Error(), "closed") ||
strings.Contains(err.Error(), "too many connections") {
return
}
continue
}
if _, err := rec.Verify(); err != nil {
logger.Err(err)
return
}
if rec.PeerID == "" || rec.ExpiryDate.Before(time.Now().UTC()) {
logger.Err(errors.New(rec.PeerID + " is expired."))
return
}
pid, err := peer.Decode(rec.PeerID)
if err != nil {
return
}
var rec PeerRecord
if err := json.NewDecoder(s).Decode(&rec); err != nil {
logger.Err(err)
return
}
if _, err := rec.Verify(); err != nil {
logger.Err(err)
return
}
if rec.PeerID == "" || rec.ExpiryDate.Before(time.Now().UTC()) {
logger.Err(errors.New(rec.PeerID + " is expired."))
return
}
pid, err := peer.Decode(rec.PeerID)
if err != nil {
return
}
ix.StreamMU.Lock()
defer ix.StreamMU.Unlock()
if ix.StreamRecords[common.ProtocolHeartbeat] == nil {
ix.StreamRecords[common.ProtocolHeartbeat] = map[peer.ID]*common.StreamRecord[PeerRecord]{}
}
streams := ix.StreamRecords[common.ProtocolHeartbeat]
if srec, ok := streams[pid]; ok {
srec.DID = rec.DID
srec.Record = rec
srec.HeartbeatStream.UptimeTracker.LastSeen = time.Now().UTC()
}
ix.StreamMU.Lock()
defer ix.StreamMU.Unlock()
if ix.StreamRecords[common.ProtocolHeartbeat] == nil {
ix.StreamRecords[common.ProtocolHeartbeat] = map[peer.ID]*common.StreamRecord[PeerRecord]{}
}
streams := ix.StreamRecords[common.ProtocolHeartbeat]
if srec, ok := streams[pid]; ok {
srec.DID = rec.DID
srec.Record = rec
srec.HeartbeatStream.UptimeTracker.LastSeen = time.Now().UTC()
}
key := ix.genKey(rec.DID)
data, err := json.Marshal(rec)
if err != nil {
logger.Err(err)
return
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
if err := ix.DHT.PutValue(ctx, key, data); err != nil {
logger.Err(err)
key := ix.genKey(rec.DID)
data, err := json.Marshal(rec)
if err != nil {
logger.Err(err)
return
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
if err := ix.DHT.PutValue(ctx, key, data); err != nil {
logger.Err(err)
cancel()
return
}
cancel()
return
}
cancel()
// Secondary index: /name/<name> → DID, so peers can resolve by human-readable name.
if rec.Name != "" {
ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
if err := ix.DHT.PutValue(ctx2, ix.genNameKey(rec.Name), []byte(rec.DID)); err != nil {
logger.Err(err).Str("name", rec.Name).Msg("indexer: failed to write name index")
fmt.Println("publishNameEvent")
ix.publishNameEvent(NameIndexAdd, rec.Name, rec.PeerID, rec.DID)
// Secondary index: /name/<name> → DID, so peers can resolve by human-readable name.
if rec.Name != "" {
ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
if err := ix.DHT.PutValue(ctx2, ix.genNameKey(rec.Name), []byte(rec.DID)); err != nil {
logger.Err(err).Str("name", rec.Name).Msg("indexer: failed to write name index")
}
cancel2()
}
cancel2()
}
// Secondary index: /pid/<peerID> → DID, so peers can resolve by libp2p PeerID.
if rec.PeerID != "" {
ctx3, cancel3 := context.WithTimeout(context.Background(), 10*time.Second)
if err := ix.DHT.PutValue(ctx3, ix.genPIDKey(rec.PeerID), []byte(rec.DID)); err != nil {
logger.Err(err).Str("pid", rec.PeerID).Msg("indexer: failed to write pid index")
// Secondary index: /pid/<peerID> → DID, so peers can resolve by libp2p PeerID.
if rec.PeerID != "" {
ctx3, cancel3 := context.WithTimeout(context.Background(), 10*time.Second)
if err := ix.DHT.PutValue(ctx3, ix.genPIDKey(rec.PeerID), []byte(rec.DID)); err != nil {
logger.Err(err).Str("pid", rec.PeerID).Msg("indexer: failed to write pid index")
}
cancel3()
}
cancel3()
return
}
}
func (ix *IndexerService) handleNodeGet(s network.Stream) {
defer s.Close()
logger := oclib.GetLogger()
var req GetValue
if err := json.NewDecoder(s).Decode(&req); err != nil {
logger.Err(err)
return
}
resp := GetResponse{Found: false, Records: map[string]PeerRecord{}}
keys := []string{}
// Name substring search — scan in-memory connected nodes first, then DHT exact match.
if req.Name != "" {
if req.Search {
for _, did := range ix.LookupNameIndex(strings.ToLower(req.Name)) {
keys = append(keys, did)
for {
var req GetValue
if err := json.NewDecoder(s).Decode(&req); err != nil {
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
strings.Contains(err.Error(), "reset") ||
strings.Contains(err.Error(), "closed") ||
strings.Contains(err.Error(), "too many connections") {
return
}
} else {
// 2. DHT exact-name lookup: covers nodes that published but aren't currently connected.
nameCtx, nameCancel := context.WithTimeout(context.Background(), 5*time.Second)
if ch, err := ix.DHT.SearchValue(nameCtx, ix.genNameKey(req.Name)); err == nil {
for did := range ch {
keys = append(keys, string(did))
break
logger.Err(err)
continue
}
resp := GetResponse{Found: false, Records: map[string]PeerRecord{}}
fmt.Println("handleNodeGet", req.Search, req.Name)
keys := []string{}
// Name substring search — scan in-memory connected nodes first, then DHT exact match.
if req.Name != "" {
if req.Search {
for _, did := range ix.LookupNameIndex(strings.ToLower(req.Name)) {
keys = append(keys, did)
}
}
nameCancel()
}
} else if req.PeerID != "" {
pidCtx, pidCancel := context.WithTimeout(context.Background(), 5*time.Second)
if did, err := ix.DHT.GetValue(pidCtx, ix.genPIDKey(req.PeerID.String())); err == nil {
keys = append(keys, string(did))
}
pidCancel()
} else {
keys = append(keys, req.Key)
}
// DHT record fetch by DID key (covers exact-name and PeerID paths).
if len(keys) > 0 {
for _, k := range keys {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
c, err := ix.DHT.GetValue(ctx, ix.genKey(k))
cancel()
if err == nil {
var rec PeerRecord
if json.Unmarshal(c, &rec) == nil {
// Filter by PeerID only when one was explicitly specified.
if req.PeerID == "" || rec.PeerID == req.PeerID.String() {
resp.Records[rec.PeerID] = rec
} else {
// 2. DHT exact-name lookup: covers nodes that published but aren't currently connected.
nameCtx, nameCancel := context.WithTimeout(context.Background(), 5*time.Second)
if ch, err := ix.DHT.SearchValue(nameCtx, ix.genNameKey(req.Name)); err == nil {
for did := range ch {
keys = append(keys, string(did))
break
}
}
} else if req.Name == "" && req.PeerID == "" {
logger.Err(err).Msg("Failed to fetch PeerRecord from DHT " + req.Key)
nameCancel()
}
} else if req.PeerID != "" {
pidCtx, pidCancel := context.WithTimeout(context.Background(), 5*time.Second)
if did, err := ix.DHT.GetValue(pidCtx, ix.genPIDKey(req.PeerID)); err == nil {
keys = append(keys, string(did))
}
pidCancel()
} else {
keys = append(keys, req.Key)
}
// DHT record fetch by DID key (covers exact-name and PeerID paths).
if len(keys) > 0 {
for _, k := range keys {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
fmt.Println("TRY TO CATCH DID", ix.genKey(k))
c, err := ix.DHT.GetValue(ctx, ix.genKey(k))
cancel()
fmt.Println("TRY TO CATCH DID ERR", ix.genKey(k), c, err)
if err == nil {
var rec PeerRecord
if json.Unmarshal(c, &rec) == nil {
fmt.Println("CATCH DID ERR", ix.genKey(k), rec)
resp.Records[rec.PeerID] = rec
}
} else if req.Name == "" && req.PeerID == "" {
logger.Err(err).Msg("Failed to fetch PeerRecord from DHT " + req.Key)
}
}
}
}
resp.Found = len(resp.Records) > 0
_ = json.NewEncoder(s).Encode(resp)
resp.Found = len(resp.Records) > 0
_ = json.NewEncoder(s).Encode(resp)
break
}
}
// handleGetNatives returns this indexer's configured native addresses,
@@ -321,30 +335,38 @@ func (ix *IndexerService) handleNodeGet(s network.Stream) {
func (ix *IndexerService) handleGetNatives(s network.Stream) {
defer s.Close()
logger := oclib.GetLogger()
var req common.GetIndexerNativesRequest
if err := json.NewDecoder(s).Decode(&req); err != nil {
logger.Err(err).Msg("indexer get natives: decode")
return
}
excludeSet := make(map[string]struct{}, len(req.Exclude))
for _, e := range req.Exclude {
excludeSet[e] = struct{}{}
}
resp := common.GetIndexerNativesResponse{}
for _, addr := range strings.Split(conf.GetConfig().NativeIndexerAddresses, ",") {
addr = strings.TrimSpace(addr)
if addr == "" {
for {
var req common.GetIndexerNativesRequest
if err := json.NewDecoder(s).Decode(&req); err != nil {
logger.Err(err).Msg("indexer get natives: decode")
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
strings.Contains(err.Error(), "reset") ||
strings.Contains(err.Error(), "closed") ||
strings.Contains(err.Error(), "too many connections") {
return
}
continue
}
if _, excluded := excludeSet[addr]; !excluded {
resp.Natives = append(resp.Natives, addr)
}
}
if err := json.NewEncoder(s).Encode(resp); err != nil {
logger.Err(err).Msg("indexer get natives: encode response")
excludeSet := make(map[string]struct{}, len(req.Exclude))
for _, e := range req.Exclude {
excludeSet[e] = struct{}{}
}
resp := common.GetIndexerNativesResponse{}
for _, addr := range strings.Split(conf.GetConfig().NativeIndexerAddresses, ",") {
addr = strings.TrimSpace(addr)
if addr == "" {
continue
}
if _, excluded := excludeSet[addr]; !excluded {
resp.Natives = append(resp.Natives, addr)
}
}
if err := json.NewEncoder(s).Encode(resp); err != nil {
logger.Err(err).Msg("indexer get natives: encode response")
}
break
}
}

View File

@@ -3,6 +3,7 @@ package indexer
import (
"context"
"encoding/json"
"fmt"
"strings"
"sync"
"time"
@@ -158,11 +159,13 @@ func (ix *IndexerService) LookupNameIndex(needle string) map[string]string {
ix.nameIndex.indexMu.RLock()
defer ix.nameIndex.indexMu.RUnlock()
for name, peers := range ix.nameIndex.index {
fmt.Println(strings.Contains(strings.ToLower(name), needleLow), needleLow, strings.ToLower(name))
if strings.Contains(strings.ToLower(name), needleLow) {
for peerID, did := range peers {
result[peerID] = did
}
}
}
fmt.Println("RESULT", result)
return result
}

View File

@@ -5,6 +5,7 @@ import (
"encoding/json"
"errors"
"fmt"
"io"
"math/rand"
"slices"
"strings"
@@ -15,6 +16,7 @@ import (
oclib "cloud.o-forge.io/core/oc-lib"
pubsub "github.com/libp2p/go-libp2p-pubsub"
"github.com/libp2p/go-libp2p/core/crypto"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
)
@@ -36,10 +38,15 @@ const (
)
// liveIndexerEntry tracks a registered indexer in the native's in-memory cache and DHT.
// PubKey and Signature are forwarded from the IndexerRegistration so the DHT validator
// can verify that the entry was produced by the peer owning the declared PeerID.
type liveIndexerEntry struct {
PeerID string `json:"peer_id"`
Addr string `json:"addr"`
ExpiresAt time.Time `json:"expires_at"`
PeerID string `json:"peer_id"`
Addr string `json:"addr"`
ExpiresAt time.Time `json:"expires_at"`
RegTimestamp int64 `json:"reg_ts,omitempty"` // Timestamp from the original IndexerRegistration
PubKey []byte `json:"pub_key,omitempty"`
Signature []byte `json:"sig,omitempty"`
}
// NativeState holds runtime state specific to native indexer operation.
@@ -53,13 +60,18 @@ type NativeState struct {
// including entries written by other natives.
knownPeerIDs map[string]string
knownMu sync.RWMutex
// cancel stops background goroutines (runOffloadLoop, refreshIndexersFromDHT)
// when the native shuts down.
cancel context.CancelFunc
}
func newNativeState() *NativeState {
func newNativeState(cancel context.CancelFunc) *NativeState {
return &NativeState{
liveIndexers: map[string]*liveIndexerEntry{},
responsiblePeers: map[pp.ID]struct{}{},
knownPeerIDs: map[string]string{},
cancel: cancel,
}
}
@@ -77,6 +89,18 @@ func (v IndexerRecordValidator) Validate(_ string, value []byte) error {
if e.ExpiresAt.Before(time.Now().UTC()) {
return errors.New("expired indexer record")
}
// Verify self-signature when present — rejects entries forged by a
// compromised native that does not control the declared PeerID.
if len(e.Signature) > 0 && len(e.PubKey) > 0 {
pub, err := crypto.UnmarshalPublicKey(e.PubKey)
if err != nil {
return fmt.Errorf("indexer entry: invalid public key: %w", err)
}
payload := []byte(fmt.Sprintf("%s|%s|%d", e.PeerID, e.Addr, e.RegTimestamp))
if ok, err := pub.Verify(payload, e.Signature); err != nil || !ok {
return errors.New("indexer entry: invalid signature")
}
}
return nil
}
@@ -99,9 +123,11 @@ func (v IndexerRecordValidator) Select(_ string, values [][]byte) (int, error) {
// InitNative registers native-specific stream handlers and starts background loops.
// Must be called after DHT is initialized.
func (ix *IndexerService) InitNative() {
ix.Native = newNativeState()
ctx, cancel := context.WithCancel(context.Background())
ix.Native = newNativeState(cancel)
ix.Host.SetStreamHandler(common.ProtocolHeartbeat, ix.HandleHeartbeat) // specific heartbeat for Indexer.
ix.Host.SetStreamHandler(common.ProtocolNativeSubscription, ix.handleNativeSubscription)
ix.Host.SetStreamHandler(common.ProtocolNativeUnsubscribe, ix.handleNativeUnsubscribe)
ix.Host.SetStreamHandler(common.ProtocolNativeGetIndexers, ix.handleNativeGetIndexers)
ix.Host.SetStreamHandler(common.ProtocolNativeConsensus, ix.handleNativeConsensus)
ix.Host.SetStreamHandler(common.ProtocolNativeGetPeers, ix.handleNativeGetPeers)
@@ -109,8 +135,8 @@ func (ix *IndexerService) InitNative() {
ix.subscribeIndexerRegistry()
// Ensure long connections to other configured natives (native-to-native mesh).
common.EnsureNativePeers(ix.Host)
go ix.runOffloadLoop()
go ix.refreshIndexersFromDHT()
go ix.runOffloadLoop(ctx)
go ix.refreshIndexersFromDHT(ctx)
}
// subscribeIndexerRegistry joins the PubSub topic used by natives to gossip newly
@@ -118,14 +144,40 @@ func (ix *IndexerService) InitNative() {
func (ix *IndexerService) subscribeIndexerRegistry() {
logger := oclib.GetLogger()
ix.PS.RegisterTopicValidator(common.TopicIndexerRegistry, func(_ context.Context, _ pp.ID, msg *pubsub.Message) bool {
// Reject empty or syntactically invalid multiaddrs before they reach the
// message loop. A compromised native could otherwise gossip arbitrary data.
addr := string(msg.Data)
if addr == "" {
// Parse as a signed IndexerRegistration.
var reg common.IndexerRegistration
if err := json.Unmarshal(msg.Data, &reg); err != nil {
return false
}
_, err := pp.AddrInfoFromString(addr)
return err == nil
if reg.Addr == "" {
return false
}
if _, err := pp.AddrInfoFromString(reg.Addr); err != nil {
return false
}
// Verify the self-signature when present (rejects forged gossip from a
// compromised native that does not control the announced PeerID).
if ok, _ := reg.Verify(); !ok {
return false
}
// Accept only messages from known native peers or from this host itself.
// This prevents external PSK participants from injecting registry entries.
from := msg.GetFrom()
if from == ix.Host.ID() {
return true
}
common.StreamNativeMu.RLock()
_, knownNative := common.StaticNatives[from.String()]
if !knownNative {
for _, ad := range common.StaticNatives {
if ad.ID == from {
knownNative = true
break
}
}
}
common.StreamNativeMu.RUnlock()
return knownNative
})
topic, err := ix.PS.Join(common.TopicIndexerRegistry)
if err != nil {
@@ -147,18 +199,18 @@ func (ix *IndexerService) subscribeIndexerRegistry() {
if err != nil {
return
}
addr := string(msg.Data)
if addr == "" {
// The gossip payload is a JSON-encoded IndexerRegistration (signed).
var gossipReg common.IndexerRegistration
if jsonErr := json.Unmarshal(msg.Data, &gossipReg); jsonErr != nil {
continue
}
if peer, err := pp.AddrInfoFromString(addr); err == nil {
ix.Native.knownMu.Lock()
ix.Native.knownPeerIDs[peer.ID.String()] = addr
ix.Native.knownMu.Unlock()
if gossipReg.Addr == "" || gossipReg.PeerID == "" {
continue
}
// A neighbouring native registered this PeerID; add to known set for DHT refresh.
ix.Native.knownMu.Lock()
ix.Native.knownPeerIDs[gossipReg.PeerID] = gossipReg.Addr
ix.Native.knownMu.Unlock()
}
}()
}
@@ -171,86 +223,172 @@ func (ix *IndexerService) handleNativeSubscription(s network.Stream) {
logger := oclib.GetLogger()
logger.Info().Msg("Subscription")
for {
var reg common.IndexerRegistration
if err := json.NewDecoder(s).Decode(&reg); err != nil {
logger.Err(err).Msg("native subscription: decode")
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
strings.Contains(err.Error(), "reset") ||
strings.Contains(err.Error(), "closed") ||
strings.Contains(err.Error(), "too many connections") {
return
}
continue
}
logger.Info().Msg("Subscription " + reg.Addr)
if reg.Addr == "" {
logger.Error().Msg("native subscription: missing addr")
return
}
if reg.PeerID == "" {
ad, err := pp.AddrInfoFromString(reg.Addr)
if err != nil {
logger.Err(err).Msg("native subscription: invalid addr")
return
}
reg.PeerID = ad.ID.String()
}
// Reject registrations with an invalid self-signature.
if ok, err := reg.Verify(); !ok {
logger.Warn().Str("peer", reg.PeerID).Err(err).Msg("native subscription: invalid signature, rejecting")
return
}
// Build entry with a fresh TTL — must happen before the cache write so the
// TTL window is not consumed by DHT retries.
entry := &liveIndexerEntry{
PeerID: reg.PeerID,
Addr: reg.Addr,
ExpiresAt: time.Now().UTC().Add(IndexerTTL),
RegTimestamp: reg.Timestamp,
PubKey: reg.PubKey,
Signature: reg.Signature,
}
// Verify that the declared address is actually reachable before admitting
// the registration. This async dial runs in the background; the indexer is
// tentatively admitted immediately (so heartbeats don't get stuck) but is
// evicted from the cache if the dial fails within 5 s.
go func(e *liveIndexerEntry) {
ad, err := pp.AddrInfoFromString(e.Addr)
if err != nil {
logger.Warn().Str("addr", e.Addr).Msg("native subscription: invalid addr during validation, rejecting")
ix.Native.liveIndexersMu.Lock()
if cur := ix.Native.liveIndexers[e.PeerID]; cur == e {
delete(ix.Native.liveIndexers, e.PeerID)
}
ix.Native.liveIndexersMu.Unlock()
return
}
dialCtx, dialCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer dialCancel()
if err := ix.Host.Connect(dialCtx, *ad); err != nil {
logger.Warn().Str("addr", e.Addr).Err(err).Msg("native subscription: declared address unreachable, rejecting")
ix.Native.liveIndexersMu.Lock()
if cur := ix.Native.liveIndexers[e.PeerID]; cur == e {
delete(ix.Native.liveIndexers, e.PeerID)
}
ix.Native.liveIndexersMu.Unlock()
}
}(entry)
// Update local cache and known set immediately so concurrent GetIndexers calls
// can already see this indexer without waiting for the DHT write to complete.
ix.Native.liveIndexersMu.Lock()
_, isRenewal := ix.Native.liveIndexers[reg.PeerID]
ix.Native.liveIndexers[reg.PeerID] = entry
ix.Native.liveIndexersMu.Unlock()
ix.Native.knownMu.Lock()
ix.Native.knownPeerIDs[reg.PeerID] = reg.Addr
ix.Native.knownMu.Unlock()
// Gossip the signed registration to neighbouring natives.
// The payload is JSON-encoded so the receiver can verify the self-signature.
ix.PubsubMu.RLock()
topic := ix.LongLivedPubSubs[common.TopicIndexerRegistry]
ix.PubsubMu.RUnlock()
if topic != nil {
if gossipData, marshalErr := json.Marshal(reg); marshalErr == nil {
if err := topic.Publish(context.Background(), gossipData); err != nil {
logger.Err(err).Msg("native subscription: registry gossip publish")
}
}
}
if isRenewal {
// logger.Debug().Str("peer", reg.PeerID).Msg("native: indexer TTL renewed : " + fmt.Sprintf("%v", len(ix.Native.liveIndexers)))
} else {
logger.Info().Str("peer", reg.PeerID).Msg("native: indexer registered : " + fmt.Sprintf("%v", len(ix.Native.liveIndexers)))
}
// Persist in DHT asynchronously with bounded retry.
// Max retry window = IndexerTTL (90 s) — retrying past entry expiry is pointless.
// Backoff: 10 s → 20 s → 40 s, then repeats at 40 s until deadline.
key := ix.genIndexerKey(reg.PeerID)
data, err := json.Marshal(entry)
if err != nil {
logger.Err(err).Msg("native subscription: marshal entry")
return
}
go func() {
deadline := time.Now().Add(IndexerTTL)
backoff := 10 * time.Second
for {
if time.Now().After(deadline) {
logger.Warn().Str("key", key).Msg("native subscription: DHT put abandoned, entry TTL exceeded")
return
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
err := ix.DHT.PutValue(ctx, key, data)
cancel()
if err == nil {
return
}
logger.Err(err).Msg("native subscription: DHT put " + key)
if !strings.Contains(err.Error(), "failed to find any peer in table") {
return // non-retryable error
}
remaining := time.Until(deadline)
if backoff > remaining {
backoff = remaining
}
if backoff <= 0 {
return
}
time.Sleep(backoff)
if backoff < 40*time.Second {
backoff *= 2
}
}
}()
break
}
}
// handleNativeUnsubscribe removes a departing indexer from the local cache and
// known set immediately, without waiting for TTL expiry.
func (ix *IndexerService) handleNativeUnsubscribe(s network.Stream) {
defer s.Close()
logger := oclib.GetLogger()
var reg common.IndexerRegistration
if err := json.NewDecoder(s).Decode(&reg); err != nil {
logger.Err(err).Msg("native subscription: decode")
return
}
logger.Info().Msg("Subscription " + reg.Addr)
if reg.Addr == "" {
logger.Error().Msg("native subscription: missing addr")
logger.Err(err).Msg("native unsubscribe: decode")
return
}
if reg.PeerID == "" {
ad, err := pp.AddrInfoFromString(reg.Addr)
if err != nil {
logger.Err(err).Msg("native subscription: invalid addr")
return
}
reg.PeerID = ad.ID.String()
}
// Build entry with a fresh TTL — must happen before the cache write so the 66s
// window is not consumed by DHT retries.
entry := &liveIndexerEntry{
PeerID: reg.PeerID,
Addr: reg.Addr,
ExpiresAt: time.Now().UTC().Add(IndexerTTL),
}
// Update local cache and known set immediately so concurrent GetIndexers calls
// can already see this indexer without waiting for the DHT write to complete.
ix.Native.liveIndexersMu.Lock()
_, isRenewal := ix.Native.liveIndexers[reg.PeerID]
ix.Native.liveIndexers[reg.PeerID] = entry
ix.Native.liveIndexersMu.Unlock()
ix.Native.knownMu.Lock()
ix.Native.knownPeerIDs[reg.PeerID] = reg.Addr
ix.Native.knownMu.Unlock()
// Gossip PeerID to neighbouring natives so they discover it via DHT.
ix.PubsubMu.RLock()
topic := ix.LongLivedPubSubs[common.TopicIndexerRegistry]
ix.PubsubMu.RUnlock()
if topic != nil {
if err := topic.Publish(context.Background(), []byte(reg.Addr)); err != nil {
logger.Err(err).Msg("native subscription: registry gossip publish")
}
}
if isRenewal {
logger.Debug().Str("peer", reg.PeerID).Msg("native: indexer TTL renewed : " + fmt.Sprintf("%v", len(ix.Native.liveIndexers)))
} else {
logger.Info().Str("peer", reg.PeerID).Msg("native: indexer registered : " + fmt.Sprintf("%v", len(ix.Native.liveIndexers)))
}
// Persist in DHT asynchronously — retries must not block the handler or consume
// the local cache TTL.
key := ix.genIndexerKey(reg.PeerID)
data, err := json.Marshal(entry)
if err != nil {
logger.Err(err).Msg("native subscription: marshal entry")
logger.Warn().Msg("native unsubscribe: missing peer_id")
return
}
go func() {
for {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
if err := ix.DHT.PutValue(ctx, key, data); err != nil {
cancel()
logger.Err(err).Msg("native subscription: DHT put " + key)
if strings.Contains(err.Error(), "failed to find any peer in table") {
time.Sleep(10 * time.Second)
continue
}
return
}
cancel()
return
}
}()
ix.Native.liveIndexersMu.Lock()
delete(ix.Native.liveIndexers, reg.PeerID)
ix.Native.liveIndexersMu.Unlock()
ix.Native.knownMu.Lock()
delete(ix.Native.knownPeerIDs, reg.PeerID)
ix.Native.knownMu.Unlock()
logger.Info().Str("peer", reg.PeerID).Msg("native: indexer explicitly unregistered")
}
// handleNativeGetIndexers returns this native's own list of reachable indexers.
@@ -260,39 +398,47 @@ func (ix *IndexerService) handleNativeSubscription(s network.Stream) {
func (ix *IndexerService) handleNativeGetIndexers(s network.Stream) {
defer s.Close()
logger := oclib.GetLogger()
for {
var req common.GetIndexersRequest
if err := json.NewDecoder(s).Decode(&req); err != nil {
logger.Err(err).Msg("native get indexers: decode")
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
strings.Contains(err.Error(), "reset") ||
strings.Contains(err.Error(), "closed") ||
strings.Contains(err.Error(), "too many connections") {
return
}
continue
}
if req.Count <= 0 {
req.Count = 3
}
callerPeerID := s.Conn().RemotePeer().String()
reachable := ix.reachableLiveIndexers(req.Count, callerPeerID)
var resp common.GetIndexersResponse
var req common.GetIndexersRequest
if err := json.NewDecoder(s).Decode(&req); err != nil {
logger.Err(err).Msg("native get indexers: decode")
return
}
if req.Count <= 0 {
req.Count = 3
}
callerPeerID := s.Conn().RemotePeer().String()
reachable := ix.reachableLiveIndexers(req.Count, callerPeerID)
var resp common.GetIndexersResponse
if len(reachable) == 0 {
// No live indexers reachable — try to self-delegate.
if ix.selfDelegate(s.Conn().RemotePeer(), &resp) {
logger.Info().Str("peer", callerPeerID).Msg("native: no indexers, acting as fallback for node")
if len(reachable) == 0 {
// No live indexers reachable — try to self-delegate.
if ix.selfDelegate(s.Conn().RemotePeer(), &resp) {
logger.Info().Str("peer", callerPeerID).Msg("native: no indexers, acting as fallback for node")
} else {
// Fallback pool saturated: return empty so the caller retries another
// native instead of piling more load onto this one.
logger.Warn().Str("peer", callerPeerID).Int("pool", maxFallbackPeers).Msg(
"native: fallback pool saturated, refusing self-delegation")
}
} else {
// Fallback pool saturated: return empty so the caller retries another
// native instead of piling more load onto this one.
logger.Warn().Str("peer", callerPeerID).Int("pool", maxFallbackPeers).Msg(
"native: fallback pool saturated, refusing self-delegation")
rand.Shuffle(len(reachable), func(i, j int) { reachable[i], reachable[j] = reachable[j], reachable[i] })
if req.Count > len(reachable) {
req.Count = len(reachable)
}
resp.Indexers = reachable[:req.Count]
}
} else {
rand.Shuffle(len(reachable), func(i, j int) { reachable[i], reachable[j] = reachable[j], reachable[i] })
if req.Count > len(reachable) {
req.Count = len(reachable)
}
resp.Indexers = reachable[:req.Count]
}
if err := json.NewEncoder(s).Encode(resp); err != nil {
logger.Err(err).Msg("native get indexers: encode response")
if err := json.NewEncoder(s).Encode(resp); err != nil {
logger.Err(err).Msg("native get indexers: encode response")
}
break
}
}
@@ -303,39 +449,47 @@ func (ix *IndexerService) handleNativeGetIndexers(s network.Stream) {
func (ix *IndexerService) handleNativeConsensus(s network.Stream) {
defer s.Close()
logger := oclib.GetLogger()
var req common.ConsensusRequest
if err := json.NewDecoder(s).Decode(&req); err != nil {
logger.Err(err).Msg("native consensus: decode")
return
}
myList := ix.reachableLiveIndexers(-1, s.Conn().RemotePeer().String())
mySet := make(map[string]struct{}, len(myList))
for _, addr := range myList {
mySet[addr] = struct{}{}
}
trusted := []string{}
candidateSet := make(map[string]struct{}, len(req.Candidates))
for _, addr := range req.Candidates {
candidateSet[addr] = struct{}{}
if _, ok := mySet[addr]; ok {
trusted = append(trusted, addr) // candidate we also confirm as reachable
for {
var req common.ConsensusRequest
if err := json.NewDecoder(s).Decode(&req); err != nil {
logger.Err(err).Msg("native consensus: decode")
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
strings.Contains(err.Error(), "reset") ||
strings.Contains(err.Error(), "closed") ||
strings.Contains(err.Error(), "too many connections") {
return
}
continue
}
}
// Extras we trust but that the requester didn't include → suggestions.
suggestions := []string{}
for _, addr := range myList {
if _, inCandidates := candidateSet[addr]; !inCandidates {
suggestions = append(suggestions, addr)
myList := ix.reachableLiveIndexers(-1, s.Conn().RemotePeer().String())
mySet := make(map[string]struct{}, len(myList))
for _, addr := range myList {
mySet[addr] = struct{}{}
}
}
resp := common.ConsensusResponse{Trusted: trusted, Suggestions: suggestions}
if err := json.NewEncoder(s).Encode(resp); err != nil {
logger.Err(err).Msg("native consensus: encode response")
trusted := []string{}
candidateSet := make(map[string]struct{}, len(req.Candidates))
for _, addr := range req.Candidates {
candidateSet[addr] = struct{}{}
if _, ok := mySet[addr]; ok {
trusted = append(trusted, addr) // candidate we also confirm as reachable
}
}
// Extras we trust but that the requester didn't include → suggestions.
suggestions := []string{}
for _, addr := range myList {
if _, inCandidates := candidateSet[addr]; !inCandidates {
suggestions = append(suggestions, addr)
}
}
resp := common.ConsensusResponse{Trusted: trusted, Suggestions: suggestions}
if err := json.NewEncoder(s).Encode(resp); err != nil {
logger.Err(err).Msg("native consensus: encode response")
}
break
}
}
@@ -406,11 +560,16 @@ func (ix *IndexerService) reachableLiveIndexers(count int, from ...string) []str
// refreshIndexersFromDHT runs in background and queries the shared DHT for every known
// indexer PeerID whose local cache entry is missing or expired. This supplements the
// local cache with entries written by neighbouring natives.
func (ix *IndexerService) refreshIndexersFromDHT() {
func (ix *IndexerService) refreshIndexersFromDHT(ctx context.Context) {
t := time.NewTicker(dhtRefreshInterval)
defer t.Stop()
logger := oclib.GetLogger()
for range t.C {
for {
select {
case <-ctx.Done():
return
case <-t.C:
}
ix.Native.knownMu.RLock()
peerIDs := make([]string, 0, len(ix.Native.knownPeerIDs))
for pid := range ix.Native.knownPeerIDs {
@@ -427,10 +586,10 @@ func (ix *IndexerService) refreshIndexersFromDHT() {
continue // still fresh in local cache
}
key := ix.genIndexerKey(pid)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
ch, err := ix.DHT.SearchValue(ctx, key)
dhtCtx, dhtCancel := context.WithTimeout(context.Background(), 5*time.Second)
ch, err := ix.DHT.SearchValue(dhtCtx, key)
if err != nil {
cancel()
dhtCancel()
continue
}
var best *liveIndexerEntry
@@ -445,7 +604,7 @@ func (ix *IndexerService) refreshIndexersFromDHT() {
}
}
}
cancel()
dhtCancel()
if best != nil {
ix.Native.liveIndexersMu.Lock()
ix.Native.liveIndexers[best.PeerID] = best
@@ -468,11 +627,16 @@ func (ix *IndexerService) genIndexerKey(peerID string) string {
// runOffloadLoop periodically checks if real indexers are available and releases
// responsible peers so they can reconnect to actual indexers on their next attempt.
func (ix *IndexerService) runOffloadLoop() {
func (ix *IndexerService) runOffloadLoop(ctx context.Context) {
t := time.NewTicker(offloadInterval)
defer t.Stop()
logger := oclib.GetLogger()
for range t.C {
for {
select {
case <-ctx.Done():
return
case <-t.C:
}
fmt.Println("runOffloadLoop", ix.Native.responsiblePeers)
ix.Native.responsibleMu.RLock()
count := len(ix.Native.responsiblePeers)
@@ -540,38 +704,46 @@ func (ix *IndexerService) runOffloadLoop() {
func (ix *IndexerService) handleNativeGetPeers(s network.Stream) {
defer s.Close()
logger := oclib.GetLogger()
var req common.GetNativePeersRequest
if err := json.NewDecoder(s).Decode(&req); err != nil {
logger.Err(err).Msg("native get peers: decode")
return
}
if req.Count <= 0 {
req.Count = 1
}
excludeSet := make(map[string]struct{}, len(req.Exclude))
for _, e := range req.Exclude {
excludeSet[e] = struct{}{}
}
common.StreamNativeMu.RLock()
candidates := make([]string, 0, len(common.StaticNatives))
for addr := range common.StaticNatives {
if _, excluded := excludeSet[addr]; !excluded {
candidates = append(candidates, addr)
for {
var req common.GetNativePeersRequest
if err := json.NewDecoder(s).Decode(&req); err != nil {
logger.Err(err).Msg("native get peers: decode")
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
strings.Contains(err.Error(), "reset") ||
strings.Contains(err.Error(), "closed") ||
strings.Contains(err.Error(), "too many connections") {
return
}
continue
}
if req.Count <= 0 {
req.Count = 1
}
}
common.StreamNativeMu.RUnlock()
rand.Shuffle(len(candidates), func(i, j int) { candidates[i], candidates[j] = candidates[j], candidates[i] })
if req.Count > len(candidates) {
req.Count = len(candidates)
}
excludeSet := make(map[string]struct{}, len(req.Exclude))
for _, e := range req.Exclude {
excludeSet[e] = struct{}{}
}
resp := common.GetNativePeersResponse{Peers: candidates[:req.Count]}
if err := json.NewEncoder(s).Encode(resp); err != nil {
logger.Err(err).Msg("native get peers: encode response")
common.StreamNativeMu.RLock()
candidates := make([]string, 0, len(common.StaticNatives))
for addr := range common.StaticNatives {
if _, excluded := excludeSet[addr]; !excluded {
candidates = append(candidates, addr)
}
}
common.StreamNativeMu.RUnlock()
rand.Shuffle(len(candidates), func(i, j int) { candidates[i], candidates[j] = candidates[j], candidates[i] })
if req.Count > len(candidates) {
req.Count = len(candidates)
}
resp := common.GetNativePeersResponse{Peers: candidates[:req.Count]}
if err := json.NewEncoder(s).Encode(resp); err != nil {
logger.Err(err).Msg("native get peers: encode response")
}
break
}
}

View File

@@ -4,6 +4,7 @@ import (
"context"
"oc-discovery/conf"
"oc-discovery/daemons/node/common"
"strings"
"sync"
oclib "cloud.o-forge.io/core/oc-lib"
@@ -60,9 +61,19 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
}
}
if ix.DHT, err = dht.New(
context.Background(),
ix.Host,
// Parse bootstrap peers from configured native/indexer addresses so that the
// DHT can find its routing table entries even in a fresh deployment.
var bootstrapPeers []pp.AddrInfo
for _, addrStr := range strings.Split(conf.GetConfig().NativeIndexerAddresses+","+conf.GetConfig().IndexerAddresses, ",") {
addrStr = strings.TrimSpace(addrStr)
if addrStr == "" {
continue
}
if ad, err := pp.AddrInfoFromString(addrStr); err == nil {
bootstrapPeers = append(bootstrapPeers, *ad)
}
}
dhtOpts := []dht.Option{
dht.Mode(dht.ModeServer),
dht.ProtocolPrefix("oc"), // 🔥 réseau privé
dht.Validator(record.NamespacedValidator{
@@ -71,7 +82,11 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
"name": DefaultValidator{},
"pid": DefaultValidator{},
}),
); err != nil {
}
if len(bootstrapPeers) > 0 {
dhtOpts = append(dhtOpts, dht.BootstrapPeers(bootstrapPeers...))
}
if ix.DHT, err = dht.New(context.Background(), ix.Host, dhtOpts...); err != nil {
logger.Info().Msg(err.Error())
return nil
}
@@ -90,6 +105,16 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
}
func (ix *IndexerService) Close() {
if ix.Native != nil && ix.Native.cancel != nil {
ix.Native.cancel()
}
// Explicitly deregister from natives on clean shutdown so they evict this
// indexer immediately rather than waiting for TTL expiry (~90 s).
if !ix.IsNative {
if nativeAddrs := conf.GetConfig().NativeIndexerAddresses; nativeAddrs != "" {
common.UnregisterFromNative(ix.Host, nativeAddrs)
}
}
ix.DHT.Close()
ix.PS.UnregisterTopicValidator(common.TopicPubSubSearch)
if ix.nameIndex != nil {