Discovery Neo Oclib

This commit is contained in:
mr
2026-05-27 16:17:00 +02:00
parent 7f951afd41
commit 6ce6e6fe7d
20 changed files with 1436 additions and 1133 deletions
+8
View File
@@ -66,6 +66,10 @@ type Heartbeat struct {
// MembershipEvents carries SWIM events piggybacked on this heartbeat.
// Events are forwarded infection-style until HopsLeft reaches 0.
MembershipEvents []MemberEvent `json:"membership_events,omitempty"`
// PendingContact lists peer IDs for which this node has undelivered critical
// DTN entries. Indexers maintain an inverted index so those peers can
// discover who is waiting for them when they reconnect.
PendingContact []string `json:"pending_contact,omitempty"`
}
// SearchPeerRequest is sent by a node to an indexer via ProtocolSearchPeer.
@@ -134,6 +138,10 @@ type HeartbeatResponse struct {
// MembershipEvents carries SWIM events piggybacked on this response.
// The node should forward them to its other indexers (infection-style).
MembershipEvents []MemberEvent `json:"membership_events,omitempty"`
// PendingCallers lists peer IDs that have undelivered critical DTN messages
// for the receiving node, as recorded by this indexer. On receipt the node
// should initiate contact with each caller so it can flush its DTN cache.
PendingCallers []string `json:"pending_callers,omitempty"`
}
// ComputeIndexerScore computes a composite quality score [0, 100] for the connecting peer.
+33 -12
View File
@@ -29,7 +29,7 @@ var retryRunning atomic.Bool
// peer has at least 3 chances to respond or refute the suspicion signal.
const suspectTimeout = 3 * RecommendedHeartbeatInterval
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, recordFn ...func() json.RawMessage) error {
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, hooks ...HeartbeatHooks) error {
TimeWatcher = time.Now().UTC()
logger := oclib.GetLogger()
@@ -71,7 +71,7 @@ func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, recordFn ...
// Start long-lived heartbeat to seed indexers. The single goroutine follows
// all subsequent StaticIndexers changes.
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name,
h, Indexers, 20*time.Second, maxIndexer, recordFn...)
h, Indexers, 20*time.Second, maxIndexer, hooks...)
// Watch for inbound connections: if a peer connects to us and our pool has
// room, probe it first to confirm it supports ProtocolHeartbeat (i.e. it is
@@ -270,17 +270,29 @@ func handleSuggestions(d *Directory, from string, suggestions []pp.AddrInfo) {
}
}
// HeartbeatHooks carries optional callbacks injected into the heartbeat loop.
// Every field may be left nil: SendHeartbeat checks each callback for nil
// before calling it, so the zero value disables all hooks.
type HeartbeatHooks struct {
// RecordFn returns a fresh signed PeerRecord for embedding in each heartbeat.
// Its result is stored in the outgoing Heartbeat's Record field.
RecordFn func() json.RawMessage
// PendingContactFn returns the list of peer IDs for which the caller has
// undelivered critical DTN entries. Called on every tick; its result is
// stored in the outgoing Heartbeat's PendingContact field.
PendingContactFn func() []string
// OnPendingCallers is invoked when an indexer response contains peer IDs
// that have undelivered messages for us. The caller should initiate contact
// with each of them so they can flush their DTN cache.
// It is only called with a non-empty list.
OnPendingCallers func(callerPeerIDs []string)
}
// SendHeartbeat starts a goroutine that sends periodic heartbeats to peers.
// recordFn, when provided, is called on each tick and its output is embedded in
// the heartbeat as a fresh signed PeerRecord so the receiving indexer can
// republish it to the DHT without an extra round-trip.
// Pass no recordFn (or nil) for indexer→indexer / native heartbeats.
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, directory *Directory, interval time.Duration, maxPool int, recordFn ...func() json.RawMessage) {
// hooks.RecordFn, when set, is called on each tick and its output is embedded
// in the heartbeat as a fresh signed PeerRecord.
// Pass an empty HeartbeatHooks (or none) for indexer→indexer / native heartbeats.
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, directory *Directory, interval time.Duration, maxPool int, hooks ...HeartbeatHooks) {
logger := oclib.GetLogger()
isIndexerHB := directory == Indexers
var recFn func() json.RawMessage
if len(recordFn) > 0 {
recFn = recordFn[0]
var hk HeartbeatHooks
if len(hooks) > 0 {
hk = hooks[0]
}
go func() {
logger.Info().Str("proto", string(proto)).Int("peers", len(directory.Addrs)).Msg("heartbeat started")
@@ -306,8 +318,11 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
IndexersBinded: addrs,
Need: need,
}
if recFn != nil {
baseHB.Record = recFn()
if hk.RecordFn != nil {
baseHB.Record = hk.RecordFn()
}
if hk.PendingContactFn != nil {
baseHB.PendingContact = hk.PendingContactFn()
}
// Piggyback SWIM membership events on every outgoing heartbeat batch.
// All peers in the pool receive the same events this tick.
@@ -550,6 +565,12 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
handleSuggestions(directory, ai.Info.ID.String(), resp.Suggestions)
}
// PendingCallers: peers that have undelivered DTN messages for us.
// Signal the DTN layer so it can flush immediately when it reaches them.
if resp != nil && len(resp.PendingCallers) > 0 && hk.OnPendingCallers != nil {
hk.OnPendingCallers(resp.PendingCallers)
}
// Handle SuggestMigrate: indexer is overloaded and wants us to move.
if resp != nil && resp.SuggestMigrate && isIndexerHB {
nonSeedCount := 0
+7 -2
View File
@@ -261,7 +261,10 @@ func (ix *LongLivedStreamRecordedService[T]) HandleHeartbeat(s network.Stream) {
}
func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams map[pp.ID]HeartBeatStreamed, lock *sync.RWMutex, maxNodes int) (*pp.ID, *Heartbeat, error) {
if len(h.Network().Peers()) >= maxNodes {
// Use the heartbeat stream count, not h.Network().Peers(), which includes
// upstream indexer connections, short-lived protocol streams (publish/get/probe),
// and zombie libp2p connections whose heartbeat stream has already been GC'd.
if len(streams) >= maxNodes {
return nil, nil, fmt.Errorf("too many connections, try another indexer")
}
var hb Heartbeat
@@ -285,9 +288,11 @@ func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams ma
// E: measure the indexer's own subnet diversity, not the node's view.
diversity := getOwnDiversityRate(h)
// fillRate: fraction of indexer capacity used — higher = more peers trust this indexer.
// Use heartbeat stream count (same as fill rate reported to nodes), not
// h.Network().Peers() which inflates the count with upstream/probe connections.
fillRate := 0.0
if maxNodes > 0 {
fillRate = float64(len(h.Network().Peers())) / float64(maxNodes)
fillRate = float64(len(streams)) / float64(maxNodes)
if fillRate > 1 {
fillRate = 1
}
+4 -4
View File
@@ -184,11 +184,10 @@ func TempStream(h host.Host, ad pp.AddrInfo, proto protocol.ID, did string, stre
}
ctxTTL, cancelTTL := context.WithTimeout(context.Background(), expiry)
defer cancelTTL()
if h.Network().Connectedness(ad.ID) != network.Connected {
fmt.Println(ad.ID, len(h.Network().ConnsToPeer(ad.ID)))
if len(h.Network().ConnsToPeer(ad.ID)) == 0 {
if err := h.Connect(ctxTTL, ad); err != nil {
fmt.Println("Connectedness", ad.ID, err)
return streams, err
}
}
@@ -233,7 +232,8 @@ func sendHeartbeat(ctx context.Context, h host.Host, proto protocol.ID, p *pp.Ad
pss, exists := streams[p.ID]
ctxTTL, cancel := context.WithTimeout(ctx, 3*interval)
defer cancel()
if h.Network().Connectedness(p.ID) != network.Connected {
fmt.Println(p.ID, len(h.Network().ConnsToPeer(p.ID)))
if len(h.Network().ConnsToPeer(p.ID)) == 0 {
if err := h.Connect(ctxTTL, *p); err != nil {
logger.Err(err)
return nil, 0, err
+3 -2
View File
@@ -3,12 +3,12 @@ package common
import (
"context"
"encoding/json"
"fmt"
"sort"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
)
@@ -153,7 +153,8 @@ func TriggerConsensus(h host.Host, remaining []pp.AddrInfo, need int) {
func probeIndexer(h host.Host, ai pp.AddrInfo) (*HeartbeatResponse, time.Duration, error) {
ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second)
defer cancel()
if h.Network().Connectedness(ai.ID) != network.Connected {
fmt.Println(ai.ID, len(h.Network().ConnsToPeer(ai.ID)))
if len(h.Network().ConnsToPeer(ai.ID)) == 0 {
if err := h.Connect(ctx, ai); err != nil {
return nil, 0, err
}
+40 -2
View File
@@ -39,6 +39,8 @@ type PeerRecordPayload struct {
PubKey []byte `json:"public_key"`
ExpiryDate time.Time `json:"expiry_date"`
IsNano bool `json:"is_nano"`
// MasterID is the libp2p PeerID of this peer's MASTER, self-attested and signed.
MasterID string `json:"master_id,omitempty"`
// TTLSeconds is the publisher's declared lifetime for this record in seconds.
// 0 means "use the default (120 s)". Included in the signed payload so it
// cannot be altered by an intermediary.
@@ -105,6 +107,7 @@ func (pr *PeerRecord) ExtractPeer(ourkey string, key string, pubKey crypto.PubKe
NATSAddress: pr.NATSAddress,
WalletAddress: pr.WalletAddress,
Location: pr.Location,
MasterID: pr.MasterID,
}
if time.Now().UTC().After(pr.ExpiryDate) {
return pp.SELF == p.Relation, nil, errors.New("peer " + key + " is offline")
@@ -285,6 +288,20 @@ func (ix *IndexerService) initNodeHandler() {
}
cancel2()
}
// PendingContact: update inverted index — for each target peer in the list,
// record that hb.PeerID wants to contact it. Entries expire after 3 heartbeat
// intervals so stale callers are cleaned up automatically if they stop advertising.
if len(hb.PendingContact) > 0 {
expiry := time.Now().Add(3 * 20 * time.Second)
ix.pendingContactIndexMu.Lock()
for _, targetID := range hb.PendingContact {
if ix.pendingContactIndex[targetID] == nil {
ix.pendingContactIndex[targetID] = map[string]time.Time{}
}
ix.pendingContactIndex[targetID][hb.PeerID] = expiry
}
ix.pendingContactIndexMu.Unlock()
}
}
ix.Host.SetStreamHandler(common.ProtocolHeartbeat, ix.HandleHeartbeat)
ix.Host.SetStreamHandler(common.ProtocolPublish, ix.handleNodePublish)
@@ -351,7 +368,8 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
}
continue
}
if _, err := rec.Verify(); err != nil {
pubKey, err := rec.Verify()
if err != nil {
ix.behavior.RecordBadSignature(remotePeer)
logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("bad signature on publish")
return
@@ -369,6 +387,26 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
if err != nil {
return
}
// Chain of trust: PubKey → PeerID (libp2p invariant), then transport identity.
// This prevents a peer from publishing a record on behalf of someone else.
if derivedID, err := lpp.IDFromPublicKey(pubKey); err != nil || derivedID != pid {
ix.behavior.RecordBadSignature(remotePeer)
logger.Warn().Str("peer", remotePeer.String()).Msg("PubKey/PeerID mismatch on publish")
s.Reset()
return
}
if remotePeer != pid {
ix.behavior.RecordBadSignature(remotePeer)
logger.Warn().Str("remote", remotePeer.String()).Str("claimed", pid.String()).Msg("transport identity mismatch on publish")
s.Reset()
return
}
if rec.StreamAddress != "" && !strings.HasSuffix(rec.StreamAddress, "/p2p/"+rec.PeerID) {
ix.behavior.RecordBadSignature(remotePeer)
logger.Warn().Str("peer", remotePeer.String()).Msg("StreamAddress/PeerID mismatch on publish")
s.Reset()
return
}
ix.StreamMU.Lock()
defer ix.StreamMU.Unlock()
@@ -566,7 +604,7 @@ func (ix *IndexerService) handleIndirectProbe(s network.Stream) {
// Connect to target if not already connected.
ctx, cancel := context.WithTimeout(context.Background(), 6*time.Second)
defer cancel()
if ix.Host.Network().Connectedness(req.Target.ID) != network.Connected {
if len(ix.Host.Network().ConnsToPeer(req.Target.ID)) == 0 {
if err := ix.Host.Connect(ctx, req.Target); err != nil {
respond(false, 0)
return
+24
View File
@@ -79,6 +79,11 @@ type IndexerService struct {
// eventQueue holds SWIM membership events to be piggybacked on responses
// (infection-style dissemination toward connected nodes).
eventQueue *common.MembershipEventQueue
// pendingContactIndex is an inverted index built from Heartbeat.PendingContact.
// Maps target peer ID → { caller peer ID → expiry time }.
// Returned in HeartbeatResponse.PendingCallers when the target reconnects.
pendingContactIndex map[string]map[string]time.Time
pendingContactIndexMu sync.Mutex
}
// NewIndexerService creates an IndexerService.
@@ -95,6 +100,7 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
behavior: newNodeBehaviorTracker(),
deletedDIDs: make(map[string]time.Time),
eventQueue: &common.MembershipEventQueue{},
pendingContactIndex: map[string]map[string]time.Time{},
}
if ps == nil {
ps, err = pubsub.NewGossipSub(context.Background(), ix.Host)
@@ -408,6 +414,24 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
resp.Incarnation = ix.incarnation.Load()
resp.MembershipEvents = ix.eventQueue.Drain(5)
// PendingCallers: look up who has undelivered messages for this node.
// Clean up expired entries at the same time.
ix.pendingContactIndexMu.Lock()
if callers, ok := ix.pendingContactIndex[remotePeer.String()]; ok {
now := time.Now()
for callerID, exp := range callers {
if now.Before(exp) {
resp.PendingCallers = append(resp.PendingCallers, callerID)
} else {
delete(callers, callerID)
}
}
if len(callers) == 0 {
delete(ix.pendingContactIndex, remotePeer.String())
}
}
ix.pendingContactIndexMu.Unlock()
return resp
}
+32 -3
View File
@@ -10,6 +10,7 @@ import (
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/config"
"cloud.o-forge.io/core/oc-lib/dbs"
pp_model "cloud.o-forge.io/core/oc-lib/models/peer"
"cloud.o-forge.io/core/oc-lib/tools"
pp "github.com/libp2p/go-libp2p/core/peer"
@@ -51,11 +52,13 @@ func ListenNATS(n *Node) {
}
if err == nil {
switch propalgation.Action {
case tools.PB_ADMIRALTY_CONFIG, tools.PB_MINIO_CONFIG:
case tools.PB_ADMIRALTY_CONFIG, tools.PB_MINIO_CONFIG, tools.PB_SOURCE_PRESIGN:
var m configPayload
var proto protocol.ID = stream.ProtocolAdmiraltyConfigResource
if propalgation.Action == tools.PB_MINIO_CONFIG {
proto = stream.ProtocolMinioConfigResource
} else if propalgation.Action == tools.PB_SOURCE_PRESIGN {
proto = stream.ProtocolSourcePresignResource
}
if err := json.Unmarshal(propalgation.Payload, &m); err == nil {
peers, _ := n.GetPeerRecord(context.Background(), m.PeerID)
@@ -68,9 +71,33 @@ func ListenNATS(n *Node) {
if slices.Contains([]tools.DataType{tools.BOOKING, tools.PURCHASE_RESOURCE}, resp.Datatype) {
m := map[string]interface{}{}
if err := json.Unmarshal(propalgation.Payload, &m); err == nil {
delivered := false
if m["peer_id"] != nil {
n.StreamService.PublishCommon(&resp.Datatype, resp.User, resp.Groups,
_, err := n.StreamService.PublishCommon(&resp.Datatype, resp.User, resp.Groups,
fmt.Sprintf("%v", m["peer_id"]), stream.ProtocolCreateResource, propalgation.Payload)
delivered = err == nil
}
if !delivered {
// NANO unreachable — look up its MasterID from the DB record.
// The NANO self-attests its MASTER in its signed PeerRecord;
// if MasterID is set we forward there, otherwise we drop silently.
var destStruct struct {
DestPeerID string `json:"dest_peer_id"`
}
if json.Unmarshal(propalgation.Payload, &destStruct) == nil && destStruct.DestPeerID != "" {
d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
And: map[string][]dbs.Filter{
"id": {{Operator: dbs.EQUAL.String(), Value: destStruct.DestPeerID}},
},
}, "", false, 0, 1)
if len(d.Data) > 0 {
nano := d.Data[0].(*pp_model.Peer)
if nano.MasterID != "" {
n.StreamService.PublishCommon(&resp.Datatype, resp.User, resp.Groups,
nano.MasterID, stream.ProtocolCreateResource, propalgation.Payload)
}
}
}
}
}
} else {
@@ -150,6 +177,7 @@ func ListenNATS(n *Node) {
// Re-emit on PEER_OBSERVE_RESPONSE_EVENT so the local oc-peer sees it.
tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
FromApp: resp.FromApp,
User: resp.User,
Datatype: tools.PEER,
Method: int(tools.PEER_OBSERVE_RESPONSE_EVENT),
Payload: propalgation.Payload,
@@ -183,7 +211,7 @@ func ListenNATS(n *Node) {
} else {
m := map[string]interface{}{}
if err := json.Unmarshal(propalgation.Payload, &m); err == nil {
fmt.Println("PB_SEARCH CATA", m)
fmt.Println("PB_SEARCH CATA", m, resp.User)
n.PubSubService.SearchPublishEvent(
context.Background(),
@@ -256,6 +284,7 @@ func handlePeerBehaviorEvent(n *Node, resp tools.NATSResponse) {
if b, err := json.Marshal(p.Serialize(p)); err == nil {
tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
FromApp: "oc-discovery",
User: resp.User,
Datatype: tools.PEER,
Method: int(tools.CREATE_RESOURCE),
Payload: b,
+23 -2
View File
@@ -123,8 +123,25 @@ func InitNode(isNode bool, isIndexer bool) (*Node, error) {
b, _ := json.Marshal(fresh)
return json.RawMessage(b)
}
// streamSvcRef is set after InitStream below; the heartbeat goroutine
// first fires after 20 s so it is always non-nil by then.
var streamSvcRef *stream.StreamService
logger.Info().Msg("connect to indexers...")
common.ConnectToIndexers(node.Host, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer, buildRecord)
common.ConnectToIndexers(node.Host, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer,
common.HeartbeatHooks{
RecordFn: buildRecord,
PendingContactFn: func() []string {
if streamSvcRef != nil {
return streamSvcRef.PendingContacts()
}
return nil
},
OnPendingCallers: func(callers []string) {
if streamSvcRef != nil {
streamSvcRef.NudgeContacts(callers)
}
},
})
logger.Info().Msg("claims my node...")
if _, err := node.claimInfo(conf.GetConfig().Name, conf.GetConfig().Hostname); err != nil {
panic(err)
@@ -135,6 +152,7 @@ func InitNode(isNode bool, isIndexer bool) (*Node, error) {
if node.StreamService, err = stream.InitStream(context.Background(), node.Host, node.PeerID, 1000, node); err != nil {
panic(err)
}
streamSvcRef = node.StreamService
node.StreamService.IsPeerKnown = func(pid pp.ID) bool {
// 1. Local DB: known peer (handles blacklist).
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
@@ -178,7 +196,7 @@ func InitNode(isNode bool, isIndexer bool) (*Node, error) {
}
fmt.Println("PUBSUB SendResponse bef peerrece")
if p, err := node.GetPeerRecord(ctx, evt.From); err == nil && len(p) > 0 && m["search"] != nil {
fmt.Println("PUBSUB SendResponse af peerrece", m)
fmt.Println("PUBSUB SendResponse af peerrece", m, evt.User)
node.StreamService.SendResponse(p[0], &evt, fmt.Sprintf("%v", m["search"]))
}
}
@@ -411,8 +429,10 @@ func (d *Node) claimInfo(
"peer_id": {{Operator: dbs.EQUAL.String(), Value: d.Host.ID().String()}},
},
}, "", false, 0, 1)
var masterID string
if len(peers.Data) > 0 {
did = peers.Data[0].GetID() // if already existing set up did as made
masterID = peers.Data[0].(*peer.Peer).MasterID
}
priv, err := tools.LoadKeyFromFilePrivate()
if err != nil {
@@ -434,6 +454,7 @@ func (d *Node) claimInfo(
PubKey: pubBytes,
IsNano: oclib.GetConfig().IsNano,
MasterID: masterID,
TTLSeconds: indexer.DefaultTTLSeconds,
ExpiryDate: now.Add(indexer.DefaultTTLSeconds * time.Second),
}
+1 -1
View File
@@ -49,7 +49,7 @@ func (ps *PubSubService) SearchPublishEvent(
// remote peers echo it back unchanged, allowing IsActive to validate results.
searchKey := ps.StreamService.ResourceSearches.Register(user, cancel, idleTimeout)
fmt.Println("PUBLISH ON PUBSUB", common.TopicPubSubSearch, searchKey)
return ps.publishEvent(searchCtx, dt, tools.PB_SEARCH, common.TopicPubSubSearch, searchKey, b)
return ps.publishEvent(searchCtx, dt, tools.PB_SEARCH, common.TopicPubSubSearch, user, b)
default:
return errors.New("no type of research found")
}
-362
View File
@@ -1,362 +0,0 @@
package stream
// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
// - dntCritical : retry indefinitely (create / update / delete resource).
// - dntModerate : up to dntMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.
import (
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"crypto/sha256"
"encoding/json"
"io"
"os"
"path/filepath"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/tools"
"golang.org/x/crypto/hkdf"
"oc-discovery/conf"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
// dntLevel classifies how persistently a cached request is retried.
type dntLevel int

const (
	// dntCritical entries are retried until the message is delivered.
	// It is the zero value of dntLevel.
	dntCritical dntLevel = iota
	// dntModerate entries are retried up to dntMaxModerateRetries times.
	dntModerate
)

const (
	// dntMaxModerateRetries is the retry budget of a dntModerate entry.
	dntMaxModerateRetries = 3
	// dntRetryInterval is the period of the background retry ticker.
	dntRetryInterval = 15 * time.Second
)
// dntProtocols maps each stream protocol to its DNT level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
//
// NOTE(review): a single-value lookup such as dntProtocols[p] yields the zero
// value of dntLevel, which is dntCritical, for a protocol that is NOT in the
// map. Code that must tell "absent" from "critical" has to use the two-value
// form `level, ok := dntProtocols[p]`.
var dntProtocols = map[protocol.ID]dntLevel{
// Critical — data mutations that must eventually be delivered.
ProtocolCreateResource: dntCritical,
ProtocolUpdateResource: dntCritical,
ProtocolDeleteResource: dntCritical,
// Moderate — confirmations / config / planner: 3 retries before abandon.
ProtocolVerifyResource: dntModerate,
ProtocolSendPlanner: dntModerate,
ProtocolConsidersResource: dntModerate,
ProtocolMinioConfigResource: dntModerate,
ProtocolAdmiraltyConfigResource: dntModerate,
}
// dntEntryJSON is the on-disk representation of a dntEntry.
// It mirrors dntEntry field for field using exported names and JSON tags,
// because encoding/json ignores the unexported fields of dntEntry.
// Addr and Proto keep their own types (they are not flattened).
// NOTE(review): this relies on pp.AddrInfo round-tripping through
// encoding/json — confirm against the libp2p peer package.
type dntEntryJSON struct {
DID string `json:"did"`
Addr pp.AddrInfo `json:"addr"`
// DT is omitted from the JSON output when nil.
DT *tools.DataType `json:"dt,omitempty"`
User string `json:"user"`
// Payload is a byte slice, which encoding/json writes as a base64 string.
Payload []byte `json:"payload"`
Proto protocol.ID `json:"proto"`
Retries int `json:"retries"`
AddedAt time.Time `json:"added_at"`
}
// dntEntry is one cached outbound stream request awaiting (re)delivery.
// The retry loop replays did, addr, dt, user, payload and proto as the
// arguments of StreamService.write.
type dntEntry struct {
// did is the first argument given to write; the retry loop logs it under
// the "peer" key.
did string
// addr is handed to write by pointer.
addr pp.AddrInfo
dt *tools.DataType
user string
payload []byte
// proto is the stream protocol; it also selects the retry level through
// dntProtocols.
proto protocol.ID
// retries is incremented by the retry loop each time a moderate entry
// fails to be delivered.
retries int
// addedAt is carried through persistence unchanged.
// NOTE(review): presumably the enqueue time, set by the caller — confirm.
addedAt time.Time
}
// toJSON copies every field of the entry into its exported, JSON-tagged
// counterpart so that it can be marshalled for on-disk persistence.
func (e *dntEntry) toJSON() dntEntryJSON {
	var out dntEntryJSON
	out.DID = e.did
	out.Addr = e.addr
	out.DT = e.dt
	out.User = e.user
	out.Payload = e.payload
	out.Proto = e.proto
	out.Retries = e.retries
	out.AddedAt = e.addedAt
	return out
}
// entryFromJSON is the inverse of dntEntry.toJSON: it builds a fresh in-memory
// entry from its on-disk representation, copying every field.
func entryFromJSON(j dntEntryJSON) *dntEntry {
	entry := new(dntEntry)
	entry.did = j.DID
	entry.addr = j.Addr
	entry.dt = j.DT
	entry.user = j.User
	entry.payload = j.Payload
	entry.proto = j.Proto
	entry.retries = j.Retries
	entry.addedAt = j.AddedAt
	return entry
}
// dntCache holds the outbound requests waiting to be retried and the key used
// to persist the critical ones to disk.
type dntCache struct {
// mu guards entries; enqueue, drain, requeue and persistToDisk all take it.
mu sync.Mutex
// entries is the pending queue, oldest first.
entries []*dntEntry
// aesKey is the derived AES-256 key used for on-disk encryption.
// Nil when key derivation failed: persistence is disabled but the in-memory
// cache continues to function normally.
aesKey []byte
}
// newDNTCache builds an empty cache and tries to derive its encryption key.
// When derivation succeeds the key is stored and previously persisted entries
// are loaded from disk; when it fails a warning is logged and the cache is
// returned without a key, i.e. memory-only.
func newDNTCache() *dntCache {
	logger := oclib.GetLogger()
	cache := new(dntCache)

	key, err := deriveDNTKey()
	if err != nil {
		logger.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
		return cache
	}

	cache.aesKey = key
	cache.loadFromDisk()
	return cache
}
// enqueue appends e to the pending queue and, when e's protocol is registered
// as critical, triggers an asynchronous write of the cache to disk.
func (c *dntCache) enqueue(e *dntEntry) {
	c.mu.Lock()
	c.entries = append(c.entries, e)
	c.mu.Unlock()

	// Use the two-value lookup: a protocol missing from dntProtocols yields the
	// zero value, which equals dntCritical, and must not be mistaken for a
	// critical protocol.
	if level, known := dntProtocols[e.proto]; known && level == dntCritical {
		go c.persistToDisk()
	}
}
// drain empties the queue under the lock and hands the previous contents to
// the caller, who becomes their sole owner.
func (c *dntCache) drain() []*dntEntry {
	c.mu.Lock()
	pending := c.entries
	c.entries = nil
	c.mu.Unlock()
	return pending
}
// requeue re-inserts entries in front of whatever was enqueued in the
// meantime, so the older requests keep their position. An empty argument is a
// no-op and does not take the lock.
func (c *dntCache) requeue(entries []*dntEntry) {
	if len(entries) > 0 {
		c.mu.Lock()
		c.entries = append(entries, c.entries...)
		c.mu.Unlock()
	}
}
// ── Persistence ──────────────────────────────────────────────────────────────
// dntCachePath returns the location of the on-disk cache file: "dnt_cache.bin"
// in the directory that contains the configured private key, so both live on
// the same volume.
func dntCachePath() string {
	keyDir := filepath.Dir(conf.GetConfig().PrivateKeyPath)
	return filepath.Join(keyDir, "dnt_cache.bin")
}
// deriveDNTKey returns a 32-byte AES key computed from the node's private key
// with HKDF-SHA256 (no salt, fixed info string). The derivation is
// deterministic, so the same private key always yields the same AES key and no
// symmetric secret has to be stored. Any failure to load or serialise the
// private key, or to read from the HKDF stream, is returned as-is.
func deriveDNTKey() ([]byte, error) {
	priv, err := tools.LoadKeyFromFilePrivate()
	if err != nil {
		return nil, err
	}

	// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
	// (32-byte seed || 32-byte public key). All of it is used as input keying
	// material.
	ikm, err := priv.Raw()
	if err != nil {
		return nil, err
	}

	const info = "oc-discovery/dnt-cache/v1"
	kdf := hkdf.New(sha256.New, ikm, nil, []byte(info))

	derived := make([]byte, 32)
	if _, err = io.ReadFull(kdf, derived); err != nil {
		return nil, err
	}
	return derived, nil
}
// persistToDisk snapshots the critical entries currently in the queue,
// encrypts them with AES-256-GCM and atomically replaces the on-disk cache
// file. Moderate entries are deliberately excluded — they are not worth
// restoring after a restart given their limited retry budget.
//
// The file layout is nonce || ciphertext. It is a no-op when no key is
// available. Every failure is logged and leaves the previous file untouched.
//
// The method is started in its own goroutine by several callers, so each call
// writes to its own uniquely named temporary file: a shared ".tmp" path would
// let one call truncate or rename a file another call is still writing.
func (c *dntCache) persistToDisk() {
	if c.aesKey == nil {
		return
	}
	log := oclib.GetLogger()

	c.mu.Lock()
	var toSave []dntEntryJSON
	for _, e := range c.entries {
		// Two-value lookup: a protocol absent from dntProtocols yields the zero
		// value, which equals dntCritical, and must not be persisted.
		if level, known := dntProtocols[e.proto]; known && level == dntCritical {
			toSave = append(toSave, e.toJSON())
		}
	}
	c.mu.Unlock()

	plaintext, err := json.Marshal(toSave)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to encode cache entries")
		return
	}
	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to initialise cache cipher")
		return
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to initialise cache cipher")
		return
	}
	nonce := make([]byte, gcm.NonceSize())
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to generate cache nonce")
		return
	}
	// Seal appends to its first argument, so the nonce ends up as the prefix.
	ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)

	path := dntCachePath()
	// os.CreateTemp creates the file with mode 0600 in the target directory,
	// which keeps the final rename on a single filesystem.
	tmpFile, err := os.CreateTemp(filepath.Dir(path), "dnt_cache-*.tmp")
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		return
	}
	tmp := tmpFile.Name()
	if _, err := tmpFile.Write(ciphertext); err != nil {
		_ = tmpFile.Close() // best effort: the write error is the one reported
		_ = os.Remove(tmp)
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		return
	}
	if err := tmpFile.Close(); err != nil {
		_ = os.Remove(tmp)
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		return
	}
	if err := os.Rename(tmp, path); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
		_ = os.Remove(tmp)
	}
}
// loadFromDisk decrypts the on-disk cache and appends its critical entries to
// the in-memory queue. Errors (missing file, decryption failure, malformed
// content) are non-fatal: they are logged, except for a missing file, and the
// cache simply starts empty.
//
// It does not take the mutex; in the visible code it is only called from
// newDNTCache, before the cache is handed to anyone else.
func (c *dntCache) loadFromDisk() {
	if c.aesKey == nil {
		return
	}
	log := oclib.GetLogger()

	path := dntCachePath()
	data, err := os.ReadFile(path)
	if err != nil {
		if !os.IsNotExist(err) {
			log.Warn().Err(err).Msg("[dnt] failed to read cache file")
		}
		return
	}

	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to initialise cache cipher, ignoring cache file")
		return
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to initialise cache cipher, ignoring cache file")
		return
	}

	// The file layout is nonce || ciphertext.
	if len(data) < gcm.NonceSize() {
		log.Warn().Msg("[dnt] cache file too short, ignoring")
		return
	}
	nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
	plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
		return
	}

	var saved []dntEntryJSON
	if err := json.Unmarshal(plaintext, &saved); err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
		return
	}

	count := 0
	for _, j := range saved {
		// Only restore critical entries — moderate entries are intentionally
		// not persisted, but this guard defends against format changes. The
		// two-value lookup is required: a protocol that is no longer in
		// dntProtocols yields the zero value, which equals dntCritical.
		level, known := dntProtocols[j.Proto]
		if !known || level != dntCritical {
			continue
		}
		c.entries = append(c.entries, entryFromJSON(j))
		count++
	}
	if count > 0 {
		log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
	}
}
// ── Retry loop ────────────────────────────────────────────────────────────────

// startDNTLoop is the background retry loop; call it once after init. It
// blocks, so callers are expected to run it in its own goroutine.
//
// On every tick it drains the cache and replays each entry through s.write.
// Delivered entries are dropped. Critical entries that fail are kept, and
// moderate entries are kept until they have failed dntMaxModerateRetries
// times. The survivors are put back at the head of the queue and the cache is
// persisted asynchronously.
//
// NOTE(review): the level comes from a single-value map lookup, so an entry
// whose protocol is absent from dntProtocols is treated as critical.
func (s *StreamService) startDNTLoop() {
	logger := oclib.GetLogger()
	ticker := time.NewTicker(dntRetryInterval)
	defer ticker.Stop()

	for range ticker.C {
		batch := s.dnt.drain()
		if len(batch) == 0 {
			continue
		}

		var retained []*dntEntry
		for _, entry := range batch {
			_, err := s.write(entry.did, &entry.addr, entry.dt, entry.user, entry.payload, entry.proto)
			level := dntProtocols[entry.proto]

			switch {
			case err == nil && level == dntCritical:
				logger.Info().
					Str("proto", string(entry.proto)).
					Str("peer", entry.did).
					Msg("[dnt] critical message delivered after retry")
			case err == nil:
				logger.Info().
					Str("proto", string(entry.proto)).
					Str("peer", entry.did).
					Int("retries", entry.retries).
					Msg("[dnt] moderate message delivered after retry")
			case level == dntCritical:
				retained = append(retained, entry)
			case level == dntModerate:
				entry.retries++
				if entry.retries >= dntMaxModerateRetries {
					logger.Warn().
						Str("proto", string(entry.proto)).
						Str("peer", entry.did).
						Int("retries", entry.retries).
						Msg("[dnt] moderate message abandoned after max retries")
				} else {
					retained = append(retained, entry)
				}
			}
		}

		s.dnt.requeue(retained)
		// Persist after each tick so the on-disk file reflects the current
		// state (entries delivered are removed, new ones from concurrent
		// enqueues are included).
		go s.dnt.persistToDisk()
	}
}
+446
View File
@@ -0,0 +1,446 @@
package stream
// DTN_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
// - DTNCritical : retry indefinitely (create / update / delete resource).
// - DTNModerate : up to DTNMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.
import (
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"crypto/sha256"
"encoding/json"
"io"
"os"
"path/filepath"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/tools"
"golang.org/x/crypto/hkdf"
"oc-discovery/conf"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
// DTNLevel classifies how persistently a cached request is retried.
type DTNLevel int

const (
	// DTNCritical entries are retried until the message is delivered.
	// It is the zero value of DTNLevel.
	DTNCritical DTNLevel = iota
	// DTNModerate entries are retried up to DTNMaxModerateRetries times.
	DTNModerate
)

const (
	// DTNMaxModerateRetries is the retry budget of a DTNModerate entry.
	DTNMaxModerateRetries = 3
	// DTNRetryInterval is the delay between two retry passes.
	DTNRetryInterval = 15 * time.Second
)
// DTNProtocols maps each stream protocol to its DTN level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
//
// NOTE(review): a single-value lookup such as DTNProtocols[p] yields the zero
// value of DTNLevel, which is DTNCritical, for a protocol that is NOT in the
// map. Code that must tell "absent" from "critical" has to use the two-value
// form `level, ok := DTNProtocols[p]`.
var DTNProtocols = map[protocol.ID]DTNLevel{
// Critical — data mutations that must eventually be delivered.
ProtocolCreateResource: DTNCritical,
ProtocolUpdateResource: DTNCritical,
ProtocolDeleteResource: DTNCritical,
// Moderate — confirmations / config / planner: 3 retries before abandon.
ProtocolVerifyResource: DTNModerate,
ProtocolSendPlanner: DTNModerate,
ProtocolConsidersResource: DTNModerate,
ProtocolMinioConfigResource: DTNModerate,
ProtocolAdmiraltyConfigResource: DTNModerate,
ProtocolSourcePresignResource: DTNModerate,
}
// DTNEntryJSON is the on-disk representation of a DTNEntry.
// It mirrors DTNEntry field for field using exported names and JSON tags,
// because encoding/json ignores the unexported fields of DTNEntry.
// Addr and Proto keep their own types (they are not flattened).
// NOTE(review): this relies on pp.AddrInfo round-tripping through
// encoding/json — confirm against the libp2p peer package.
type DTNEntryJSON struct {
DID string `json:"did"`
// ResourceID and ForceCritical are omitted from the JSON output when they
// hold their zero value.
ResourceID string `json:"resource_id,omitempty"`
ForceCritical bool `json:"force_critical,omitempty"`
Addr pp.AddrInfo `json:"addr"`
// DT is omitted from the JSON output when nil.
DT *tools.DataType `json:"dt,omitempty"`
User string `json:"user"`
// Payload is a byte slice, which encoding/json writes as a base64 string.
Payload []byte `json:"payload"`
Proto protocol.ID `json:"proto"`
Retries int `json:"retries"`
AddedAt time.Time `json:"added_at"`
}
// DTNEntry is one undelivered outbound message held in the DTNCache until it
// is delivered or, for non-critical entries, abandoned.
type DTNEntry struct {
	did           string // destination peer identifier; also the first half of the dedup key
	resourceID    string // UUID of the resource; empty for non-resource payloads (planner, config)
	forceCritical bool   // true when destination is NANO: all protocols become critical
	addr          pp.AddrInfo
	dt            *tools.DataType
	user          string
	payload       []byte
	proto         protocol.ID
	retries       int       // incremented by the retry loop on each failed attempt of a non-critical entry
	addedAt       time.Time // set by the caller when the entry is created
}
// isEffectivelyCritical returns true when the entry must be retried indefinitely,
// either because its protocol is registered as DTNCritical in DTNProtocols or
// because the destination is a NANO peer (forceCritical).
//
// The protocol lookup uses the comma-ok form on purpose: DTNCritical is the
// zero value of DTNLevel, so a plain index would report every protocol that is
// absent from DTNProtocols as critical.
func (e *DTNEntry) isEffectivelyCritical() bool {
	if e.forceCritical {
		return true
	}
	level, registered := DTNProtocols[e.proto]
	return registered && level == DTNCritical
}
// toJSON copies the entry into its exported, JSON-serialisable twin.
// Slice and pointer fields (payload, dt) are shared, not deep-copied.
func (e *DTNEntry) toJSON() DTNEntryJSON {
	var out DTNEntryJSON
	out.DID = e.did
	out.ResourceID = e.resourceID
	out.ForceCritical = e.forceCritical
	out.Addr = e.addr
	out.DT = e.dt
	out.User = e.user
	out.Payload = e.payload
	out.Proto = e.proto
	out.Retries = e.retries
	out.AddedAt = e.addedAt
	return out
}
// entryFromJSON rebuilds an in-memory DTNEntry from its on-disk form.
// It is the inverse of (*DTNEntry).toJSON.
func entryFromJSON(j DTNEntryJSON) *DTNEntry {
	entry := new(DTNEntry)
	entry.did = j.DID
	entry.resourceID = j.ResourceID
	entry.forceCritical = j.ForceCritical
	entry.addr = j.Addr
	entry.dt = j.DT
	entry.user = j.User
	entry.payload = j.Payload
	entry.proto = j.Proto
	entry.retries = j.Retries
	entry.addedAt = j.AddedAt
	return entry
}
// DTNCache is the in-memory queue of undelivered outbound messages, with
// optional encrypted persistence of its critical entries.
type DTNCache struct {
	mu      sync.Mutex  // guards entries
	entries []*DTNEntry // pending messages, oldest first
	// aesKey is the derived AES-256 key used for on-disk encryption.
	// Nil when key derivation failed: persistence is disabled but the in-memory
	// cache continues to function normally.
	aesKey []byte
}
// newDNTCache builds an empty cache and tries to derive its encryption key.
// On success the key is stored and previously persisted entries are reloaded
// from disk; on failure a warning is logged and the cache runs memory-only.
func newDNTCache() *DTNCache {
	log := oclib.GetLogger()
	cache := new(DTNCache)

	derived, err := deriveDNTKey()
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
		return cache
	}

	cache.aesKey = derived
	cache.loadFromDisk()
	return cache
}
// extractResourceID pulls the top-level "id" string out of a JSON payload.
// It yields "" when the payload cannot be decoded into an object with a string
// "id" (invalid JSON, non-object, wrong type) or when the field is absent.
func extractResourceID(payload []byte) string {
	fields := struct {
		ID string `json:"id"`
	}{}
	if json.Unmarshal(payload, &fields) != nil {
		return ""
	}
	return fields.ID
}
// enqueue adds an entry to the cache, respecting the resource lifecycle.
// Deduplication key is (did, resourceID): same resource to the same peer keeps
// only the latest mutation. resourceID is empty for non-resource payloads
// (planner, config), in which case deduplication falls back to did alone.
//
//   - DELETE is terminal: any subsequent mutation on the same key is discarded.
//   - UPDATE cannot be followed by CREATE: the resource already exists remotely.
//   - All other cases replace the existing entry (newer mutation supersedes).
//
// When the cache changed, an asynchronous persist is triggered if the set of
// critical entries may have changed: either the new entry is critical, or it
// replaced an entry that was. The second condition keeps the on-disk snapshot
// from retaining a critical entry that no longer exists in memory, which would
// otherwise be resurrected on the next restart.
func (c *DTNCache) enqueue(e *DTNEntry) {
	c.mu.Lock()
	found, mutated, replacedCritical := false, false, false
	for i, existing := range c.entries {
		if existing.did != e.did || existing.resourceID != e.resourceID {
			continue
		}
		found = true
		if existing.proto == ProtocolDeleteResource ||
			(existing.proto == ProtocolUpdateResource && e.proto == ProtocolCreateResource) {
			break // discard new entry silently — existing state is authoritative
		}
		replacedCritical = existing.isEffectivelyCritical()
		c.entries[i] = e
		mutated = true
		break
	}
	if !found {
		c.entries = append(c.entries, e)
		mutated = true
	}
	c.mu.Unlock()

	if mutated && (replacedCritical || e.isEffectivelyCritical()) {
		go c.persistToDisk()
	}
}
// peersWithPending lists, in first-seen order and without duplicates, the
// destination peer IDs (did) that own at least one critical entry.
// Returns nil when there is none. Used to populate Heartbeat.PendingContact.
func (c *DTNCache) peersWithPending() []string {
	c.mu.Lock()
	defer c.mu.Unlock()

	var peers []string
	listed := map[string]struct{}{}
	for _, entry := range c.entries {
		if !entry.isEffectivelyCritical() {
			continue
		}
		if _, dup := listed[entry.did]; dup {
			continue
		}
		listed[entry.did] = struct{}{}
		peers = append(peers, entry.did)
	}
	return peers
}
// drain empties the cache under the lock and hands the previous contents to
// the caller, who becomes their sole owner.
func (c *DTNCache) drain() []*DTNEntry {
	c.mu.Lock()
	taken := c.entries
	c.entries = nil
	c.mu.Unlock()
	return taken
}
// requeue re-inserts entries in front of whatever is currently cached, so that
// entries enqueued while a retry round was in flight stay behind the older ones.
// A nil or empty slice is a no-op.
func (c *DTNCache) requeue(entries []*DTNEntry) {
	if len(entries) > 0 {
		c.mu.Lock()
		c.entries = append(entries, c.entries...)
		c.mu.Unlock()
	}
}
// ── Persistence ──────────────────────────────────────────────────────────────
// DTNCachePath gives the location of the encrypted cache file: "dnt_cache.bin"
// in the directory that holds the configured private key.
func DTNCachePath() string {
	keyDir := filepath.Dir(conf.GetConfig().PrivateKeyPath)
	return filepath.Join(keyDir, "dnt_cache.bin")
}
// deriveDNTKey produces the 32-byte AES key protecting the on-disk cache.
// The key is expanded with HKDF-SHA256 (no salt, fixed info label) from the raw
// bytes of the node's private key, so the same private key always yields the
// same AES key and nothing extra has to be stored.
func deriveDNTKey() ([]byte, error) {
	priv, err := tools.LoadKeyFromFilePrivate()
	if err != nil {
		return nil, err
	}

	// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
	// (32-byte seed || 32-byte public key); all of it is used as input keying
	// material.
	ikm, err := priv.Raw()
	if err != nil {
		return nil, err
	}

	const infoLabel = "oc-discovery/dnt-cache/v1"
	kdf := hkdf.New(sha256.New, ikm, nil, []byte(infoLabel))

	derived := make([]byte, 32)
	_, err = io.ReadFull(kdf, derived)
	if err != nil {
		return nil, err
	}
	return derived, nil
}
// dtnPersistMu serialises persistToDisk runs. The function is started with
// `go` from several call sites and every run targets the same cache file (and
// the same ".tmp" sibling), so without this lock two runs could interleave
// their writes or rename an older snapshot over a newer one. The lock is taken
// before the snapshot, so the last run to acquire it always writes the most
// recent state.
var dtnPersistMu sync.Mutex

// persistToDisk encrypts all current critical entries and writes them to disk.
// Non-critical entries are deliberately excluded — they are not worth restoring
// after a restart given their limited retry budget.
//
// The file layout is nonce || AES-GCM ciphertext of the JSON-encoded entries.
// The data is written to a temporary file first and then renamed over the
// final path. Every failure is logged and otherwise ignored: persistence is
// best-effort and never affects the in-memory cache. It is a no-op when no
// encryption key is available.
func (c *DTNCache) persistToDisk() {
	if c.aesKey == nil {
		return
	}
	log := oclib.GetLogger()

	dtnPersistMu.Lock()
	defer dtnPersistMu.Unlock()

	c.mu.Lock()
	var toSave []DTNEntryJSON
	for _, e := range c.entries {
		if e.isEffectivelyCritical() {
			toSave = append(toSave, e.toJSON())
		}
	}
	c.mu.Unlock()

	plaintext, err := json.Marshal(toSave)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to marshal cache entries")
		return
	}
	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to initialise cache cipher")
		return
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to initialise cache cipher")
		return
	}
	// A fresh random nonce per write; it is stored as the file prefix.
	nonce := make([]byte, gcm.NonceSize())
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to generate cache nonce")
		return
	}
	ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)

	path := DTNCachePath()
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		return
	}
	if err := os.Rename(tmp, path); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
		_ = os.Remove(tmp) // best-effort cleanup of the orphaned temp file
	}
}
// loadFromDisk reads the encrypted cache file, decrypts it and appends the
// critical entries it contains to the in-memory list. It does not take the
// cache mutex. Any failure (missing file, short file, wrong key, bad JSON)
// leaves the cache untouched; all but a missing file and cipher setup errors
// are logged as warnings.
func (c *DTNCache) loadFromDisk() {
	if c.aesKey == nil {
		return
	}
	log := oclib.GetLogger()

	raw, err := os.ReadFile(DTNCachePath())
	if err != nil {
		if os.IsNotExist(err) {
			return // nothing persisted yet
		}
		log.Warn().Err(err).Msg("[dnt] failed to read cache file")
		return
	}

	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		return
	}
	aead, err := cipher.NewGCM(block)
	if err != nil {
		return
	}

	// The file is laid out as nonce || ciphertext.
	nonceLen := aead.NonceSize()
	if len(raw) < nonceLen {
		log.Warn().Msg("[dnt] cache file too short, ignoring")
		return
	}
	plaintext, err := aead.Open(nil, raw[:nonceLen], raw[nonceLen:], nil)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
		return
	}

	var stored []DTNEntryJSON
	if err := json.Unmarshal(plaintext, &stored); err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
		return
	}

	restored := 0
	for _, rec := range stored {
		// Only critical entries are ever written, but filter again in case
		// the file was produced by a different version of the format.
		if entry := entryFromJSON(rec); entry.isEffectivelyCritical() {
			c.entries = append(c.entries, entry)
			restored++
		}
	}
	if restored == 0 {
		return
	}
	log.Info().Int("count", restored).Msg("[dnt] restored critical entries from disk")
}
// ── Retry loop ────────────────────────────────────────────────────────────────
// startDNTLoop is the body of the background retry loop; it blocks forever and
// is meant to be run in its own goroutine, once.
//
// Two events wake it up:
//   - the periodic ticker (every DTNRetryInterval): every cached entry is retried;
//   - a peer ID on s.dntNudge: only that peer's entries are retried, the rest
//     are put back untouched.
func (s *StreamService) startDNTLoop() {
	logger := oclib.GetLogger()
	ticker := time.NewTicker(DTNRetryInterval)
	defer ticker.Stop()

	// retryEntries tries to deliver each entry once and returns the ones that
	// must stay cached: failed critical entries always, failed non-critical
	// entries until they reach DTNMaxModerateRetries.
	retryEntries := func(entries []*DTNEntry) []*DTNEntry {
		var survivors []*DTNEntry
		for _, entry := range entries {
			_, err := s.write(entry.did, &entry.addr, entry.dt, entry.user, entry.payload, entry.proto)
			critical := entry.isEffectivelyCritical()
			switch {
			case err == nil && critical:
				logger.Info().Str("proto", string(entry.proto)).Str("peer", entry.did).
					Msg("[dnt] critical message delivered after retry")
			case err == nil:
				logger.Info().Str("proto", string(entry.proto)).Str("peer", entry.did).
					Int("retries", entry.retries).Msg("[dnt] moderate message delivered after retry")
			case critical:
				survivors = append(survivors, entry)
			default:
				entry.retries++
				if entry.retries >= DTNMaxModerateRetries {
					logger.Warn().Str("proto", string(entry.proto)).Str("peer", entry.did).
						Int("retries", entry.retries).Msg("[dnt] moderate message abandoned after max retries")
					continue
				}
				survivors = append(survivors, entry)
			}
		}
		return survivors
	}

	for {
		select {
		case <-ticker.C:
			if batch := s.dnt.drain(); len(batch) > 0 {
				s.dnt.requeue(retryEntries(batch))
				go s.dnt.persistToDisk()
			}
		case peerID := <-s.dntNudge:
			// A peer just signalled it is reachable — retry its entries immediately.
			var mine, rest []*DTNEntry
			for _, entry := range s.dnt.drain() {
				if entry.did == peerID {
					mine = append(mine, entry)
					continue
				}
				rest = append(rest, entry)
			}
			remaining := retryEntries(mine)
			s.dnt.requeue(append(remaining, rest...))
			// Persist only when something actually left the cache.
			if len(remaining) < len(mine) {
				go s.dnt.persistToDisk()
			}
		}
	}
}
+92 -7
View File
@@ -15,6 +15,7 @@ import (
"cloud.o-forge.io/core/oc-lib/models/resources"
"cloud.o-forge.io/core/oc-lib/tools"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
)
type Verify struct {
@@ -23,8 +24,18 @@ type Verify struct {
func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s network.Stream) error {
fmt.Println("handleEvent", protocol)
// Heartbeat received on an outgoing ProtocolObserve stream.
if protocol == ProtocolObserve {
// Distinguish between an open request and a close request by inspecting
// the ObserveRequest payload. The remote wraps both in a common.Event
// with Type=ProtocolObserve so the persistent readLoop can decode them.
var req ObserveRequest
if evt.Payload != nil {
json.Unmarshal(evt.Payload, &req) //nolint:errcheck — zero value means open
}
if req.Close {
ps.observeCache.cancel(s.Conn().RemotePeer().String())
return nil
}
return ps.handleIncomingObserve(s)
}
if protocol == observeHBEventType {
@@ -59,6 +70,11 @@ func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s netwo
return err
}
}
if protocol == ProtocolSourcePresignResource {
if err := ps.pass(evt, tools.SOURCE_PRESIGN_EVENT); err != nil {
return err
}
}
if protocol == ProtocolAdmiraltyConfigResource {
if err := ps.pass(evt, tools.ADMIRALTY_CONFIG_EVENT); err != nil {
return err
@@ -125,9 +141,9 @@ func (abs *StreamService) sendPlanner(event *common.Event) error { //
}
func (abs *StreamService) retrieveResponse(event *common.Event) error { //
if !abs.ResourceSearches.IsActive(event.User) {
/*if !abs.ResourceSearches.IsActive(event.User) {
return nil // search already closed or timed out
}
}*/
res, err := resources.ToResource(int(event.DataType), event.Payload)
if err != nil || res == nil {
return nil
@@ -137,6 +153,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
b, err := json.Marshal(res.Serialize(res))
go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
User: event.User,
Datatype: tools.DataType(event.DataType),
Method: int(tools.SEARCH_EVENT),
Payload: b,
@@ -147,6 +164,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) error { //
go tools.NewNATSCaller().SetNATSPub(method, tools.NATSResponse{
FromApp: "oc-discovery",
User: event.User,
Datatype: tools.DataType(event.DataType),
Method: int(method),
Payload: event.Payload,
@@ -154,6 +172,36 @@ func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) err
return nil
}
// resolveBookingNano decides whether a booking event must also be relayed to
// one of our NANO peers. It performs at most one DB lookup and returns:
//
//	(nil, true)  — not a booking, dest_peer_id absent/unreadable, or dest == self → process normally, no forward
//	(nano, true) — dest is one of our NANO peers → process + forward to nano
//	(nil, false) — dest is unknown → ignore
func (ps *StreamService) resolveBookingNano(evt *common.Event) (*peer.Peer, bool) {
	if tools.DataType(evt.DataType) != tools.BOOKING {
		return nil, true
	}

	var booking struct {
		DestPeerID string `json:"dest_peer_id"`
	}
	if json.Unmarshal(evt.Payload, &booking) != nil || booking.DestPeerID == "" {
		return nil, true
	}
	dest := booking.DestPeerID

	self, err := oclib.GetMySelf()
	if err == nil && self != nil && dest == self.GetID() {
		return nil, true
	}

	// Look for a peer with this id whose relation is NANO.
	found := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
		And: map[string][]dbs.Filter{
			"id":       {{Operator: dbs.EQUAL.String(), Value: dest}},
			"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
		},
	}, "", false, 0, 1)
	if len(found.Data) > 0 {
		return found.Data[0].(*peer.Peer), true
	}
	return nil, false
}
func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol string) error {
switch protocol {
case ProtocolSearchResource:
@@ -176,9 +224,10 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
ps.SendResponse(p[0], evt, fmt.Sprintf("%v", search))
}
} else {
fmt.Println("SEND SEARCH_EVENT SetNATSPub", m)
go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
fmt.Println("SEND SEARCH_EVENT SetNATSPub", m, evt.DataType, evt.User)
tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
User: evt.User,
Datatype: tools.DataType(evt.DataType),
Method: int(tools.SEARCH_EVENT),
Payload: evt.Payload,
@@ -186,19 +235,35 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
}
case ProtocolCreateResource, ProtocolUpdateResource:
fmt.Println("RECEIVED Protocol.Update", string(evt.Payload))
go tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
nano, ok := ps.resolveBookingNano(evt)
if !ok {
return nil
}
tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
FromApp: "oc-discovery",
User: evt.User,
Datatype: tools.DataType(evt.DataType),
Method: int(tools.CREATE_RESOURCE),
Payload: evt.Payload,
})
if nano != nil {
ps.forwardToNano(nano, evt, protocol)
}
case ProtocolDeleteResource:
go tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
nano, ok := ps.resolveBookingNano(evt)
if !ok {
return nil
}
tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
FromApp: "oc-discovery",
User: evt.User,
Datatype: tools.DataType(evt.DataType),
Method: int(tools.REMOVE_RESOURCE),
Payload: evt.Payload,
})
if nano != nil {
ps.forwardToNano(nano, evt, protocol)
}
default:
return errors.New("no action authorized available : " + protocol)
}
@@ -223,11 +288,31 @@ func (abs *StreamService) SendResponse(p *peer.Peer, event *common.Event, search
access := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil)
searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false, 0, 0)
for _, ss := range searched.Data {
// SendResponse uses an admin request so SetAllowedInstances
// never calls FilterExploitationAuthorizations. Apply it
// explicitly here so we never leak private AEs to a remote peer.
if r, ok := ss.(resources.ResourceInterface); ok {
r.SetAllowedInstances(&tools.APIRequest{PeerID: p.UUID, Groups: event.Groups, Username: event.User})
}
if j, err := json.Marshal(ss); err == nil {
abs.PublishCommon(&dt, event.User, event.Groups, p.PeerID, ProtocolSearchResource, j)
}
}
}
}
// Close the ProtocolSearchResource stream to the requester immediately after
// sending all results. This prevents TempStream from reusing a stale (already
// closed by the remote) stream entry for a subsequent search from the same peer,
// which would cause write failure and no results for the second search.
if decodedID, err := pp.Decode(p.PeerID); err == nil {
abs.Mu.Lock()
if abs.Streams[ProtocolSearchResource] != nil {
if s, ok := abs.Streams[ProtocolSearchResource][decodedID]; ok {
s.Stream.Reset()
delete(abs.Streams[ProtocolSearchResource], decodedID)
}
}
abs.Mu.Unlock()
}
return nil
}
+181 -61
View File
@@ -27,7 +27,7 @@ const ProtocolObserve = "/opencloud/peer/observe/1.0"
// observeHBEventType is used as the common.Event.Type for heartbeat responses.
const observeHBEventType = "/opencloud/peer/observe/heartbeat"
const observeHBInterval = 30 * time.Second
const observeHBInterval = 10 * time.Second
const observeDrainDuration = 30 * time.Second
// observeBatchWindow is the accumulation window before a heartbeat batch is
@@ -45,7 +45,95 @@ type ObserveRequest struct {
// ObserveHeartbeat is sent by the observed side every observeHBInterval.
type ObserveHeartbeat struct {
State string `json:"state"` // always "online" when actively emitted
State string `json:"state"` // always "online" when actively emitted
SentAt time.Time `json:"sent_at,omitempty"` // timestamp set by sender; lets receiver compute one-way latency
}
// Tuning constants for the connection-quality snapshot of an observed peer.
const (
	maxLatencyMs      = 2000.0 // ms above which latency score → 0
	latencySamples    = 5      // sliding window size for latency averaging
	fastThresholdMs   = 200.0  // below = "fast", above = "slow"
	reliableThreshold = 0.95   // miss_rate below 5% = "reliable"
)
// PeerObserveMetrics accumulates connection-quality data for one observed peer.
// Updated on every incoming heartbeat (observing side).
type PeerObserveMetrics struct {
	mu              sync.Mutex                    // guards every field below
	firstObservedAt time.Time                     // start of the observation window; set by whoever creates the value
	lastHeartbeatAt time.Time                     // UTC time of the most recent record call
	received        uint64                        // number of heartbeats recorded
	latencies       [latencySamples]time.Duration // ring buffer of the most recent latencies
	latIdx          int                           // total number of writes; next slot is latIdx % latencySamples
	latCount        int                           // number of valid slots in latencies, capped at latencySamples
}
// record registers one received heartbeat: it bumps the counter, stamps the
// reception time (UTC) and stores latency in the ring buffer, overwriting the
// oldest sample once the buffer is full.
func (m *PeerObserveMetrics) record(latency time.Duration) {
	m.mu.Lock()
	defer m.mu.Unlock()

	m.lastHeartbeatAt = time.Now().UTC()
	m.received++

	slot := m.latIdx % latencySamples
	m.latencies[slot] = latency
	m.latIdx++

	// The number of valid samples grows until the window is full.
	if m.latCount < latencySamples {
		m.latCount++
	}
}
// snapshot computes a point-in-time quality summary from the accumulated data.
//
//   - LatencyMs is the mean of the samples currently in the window.
//   - MissRate is 1 - received/expected, where expected is the number of
//     heartbeats that should have arrived between firstObservedAt and
//     lastHeartbeatAt at one per observeHBInterval. The interval count is
//     incremented by one because the heartbeat that opens the window is itself
//     counted in received; without it a single missed heartbeat would be masked.
//   - TrustScore is 35% latency score + 65% reliability score, scaled to [0, 100].
//
// The elapsed time is taken with Time.Sub: Time.Second() only yields the
// second-of-minute (0-59) and cannot be used to measure a span.
func (m *PeerObserveMetrics) snapshot() PeerObserveSnapshot {
	m.mu.Lock()
	defer m.mu.Unlock()

	// Until the ring buffer wraps, samples occupy slots [0, latCount); once it
	// is full latCount == latencySamples and every slot is valid.
	var total time.Duration
	for i := 0; i < m.latCount; i++ {
		total += m.latencies[i]
	}
	var avgMs float64
	if m.latCount > 0 {
		avgMs = float64(total.Milliseconds()) / float64(m.latCount)
	}

	var expected int64
	if m.received > 0 && !m.firstObservedAt.IsZero() {
		if elapsed := m.lastHeartbeatAt.Sub(m.firstObservedAt); elapsed > 0 {
			expected = int64(elapsed / observeHBInterval)
		}
		expected++ // the heartbeat that opened the window
	}

	var missRate float64
	if expected > 0 {
		// Jitter can make received exceed the theoretical count; never report
		// a negative miss rate.
		recv := int64(m.received)
		if recv > expected {
			recv = expected
		}
		missRate = 1.0 - float64(recv)/float64(expected)
	}

	// Clamp to [0, 1]: latency is measured across two clocks, so skew can make
	// it negative and would otherwise push the trust score above 100.
	latScore := 1.0 - avgMs/maxLatencyMs
	if latScore < 0 {
		latScore = 0
	} else if latScore > 1 {
		latScore = 1
	}
	relScore := 1.0 - missRate
	trust := (0.35*latScore + 0.65*relScore) * 100

	speed := "fast"
	if avgMs >= fastThresholdMs {
		speed = "slow"
	}
	reliability := "reliable"
	if relScore < reliableThreshold {
		reliability = "watch"
	}

	return PeerObserveSnapshot{
		LatencyMs:   avgMs,
		Speed:       speed,
		Reliability: reliability,
		TrustScore:  trust,
		LastSeenAt:  m.lastHeartbeatAt,
		MissRate:    missRate,
	}
}
// PeerObserveSnapshot is the point-in-time quality summary sent to oc-peer via NATS.
// It is produced by (*PeerObserveMetrics).snapshot.
type PeerObserveSnapshot struct {
	LatencyMs   float64   `json:"latency_ms"`   // mean latency over the sample window, in milliseconds
	Speed       string    `json:"speed"`        // "fast" | "slow"
	Reliability string    `json:"reliability"`  // "reliable" | "watch"
	TrustScore  float64   `json:"trust_score"`  // weighted latency/reliability score
	LastSeenAt  time.Time `json:"last_seen_at"` // time of the last recorded heartbeat
	MissRate    float64   `json:"miss_rate"`    // fraction of expected heartbeats not received
}
// ShallowPeer is the minimal peer representation sent by oc-peer in a
@@ -204,18 +292,13 @@ func flushObserveBatch(peerIDs []string) {
// ── incoming observe handler (observed side) ──────────────────────────────────
// handleIncomingObserve is registered as the ProtocolObserve stream handler.
// It is called when a remote peer opens an observe stream to us.
// The function reads the request, validates it, then starts (or stops) the
// heartbeat goroutine and returns immediately — the goroutine owns the stream.
// handleIncomingObserve is called when a remote peer opens an observe stream
// to us (observed side). It starts a heartbeat goroutine that writes back on
// the same bidirectional rawStream — no separate reverse stream is opened.
// The goroutine stops via context cancellation (triggered by a close event
// read from rawStream) or when rawStream becomes unwritable.
func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
remotePeerID := rawStream.Conn().RemotePeer().String()
addr := rawStream.Conn().RemoteMultiaddr().String()
ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
if err != nil {
fmt.Println("qndlqnl EERR", addr, err)
return err
}
log := oclib.GetLogger()
// Drain mode: reject any new observations for 30 s after a close-all.
@@ -223,13 +306,11 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
s.drainMu.RUnlock()
if draining {
rawStream.Close()
fmt.Println("Draining")
return errors.New("Draining")
return errors.New("draining")
}
// Read the observe request (with a generous deadline to avoid hangs).
// Guard: the requesting peer must not be blacklisted or be ourself.
did := ""
// Guard: the requesting peer must not be blacklisted.
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
res := access.Search(&dbs.Filters{
And: map[string][]dbs.Filter{
@@ -238,11 +319,9 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
}, "", false, 0, 1)
if len(res.Data) > 0 {
p := res.Data[0].(*peer.Peer)
did = p.GetID()
if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
rawStream.Close()
if p.Relation == peer.BLACKLIST {
fmt.Println("CLOSE blacklist or self")
return errors.New("can't exploit blacklist or self")
return errors.New("can't observe blacklisted peer")
}
}
@@ -251,52 +330,32 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
s.observeCache.set(remotePeerID, cancel)
fmt.Println("LOOP OBSERVE")
go func() {
defer rawStream.Close()
// Do NOT close rawStream here: the persistent readLoop (HandleResponse)
// owns rawStream's lifecycle. We only stop writing.
defer cancel()
defer s.observeCache.delete(remotePeerID)
ticker := time.NewTicker(observeHBInterval)
defer ticker.Stop()
hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
if evt == nil {
return
}
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
stream := s.Streams[ProtocolObserve][ad.ID]
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
// Moderate connectivity event: the observer is unreachable.
// The deferred calls above purge this observer from the cache.
fmt.Println("LOOP EVT ERR", err)
log.Info().
Str("observer", remotePeerID).
Err(err).
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
return
}
buildHBEvent := func() *common.Event {
p, _ := json.Marshal(ObserveHeartbeat{State: "online", SentAt: time.Now().UTC()})
return common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", p)
}
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
fmt.Println("LOOP EVT", evt)
var err error
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
stream := s.Streams[ProtocolObserve][ad.ID]
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
// Moderate connectivity event: the observer is unreachable.
// The deferred calls above purge this observer from the cache.
fmt.Println("LOOP EVT ERR", err)
log.Info().
Str("observer", remotePeerID).
Err(err).
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
return
}
evt := buildHBEvent()
if err := json.NewEncoder(rawStream).Encode(evt); err != nil {
log.Info().
Str("observer", remotePeerID).
Err(err).
Msg("[observe] heartbeat write failed — stream closed, stopping goroutine")
return
}
rawStream.SetWriteDeadline(time.Time{})
}
@@ -308,14 +367,65 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
// ── heartbeat receiver (observing side) ───────────────────────────────────────
// handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
// on an outgoing ProtocolObserve stream. It queues the peer_id in the batch
// accumulator; the batcher flushes to NATS after observeBatchWindow.
// on an outgoing ProtocolObserve stream. It updates per-peer metrics and flushes
// a quality snapshot to NATS.
func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
// ps.hbBatcher.add(evt.From)
flushObserveBatch([]string{evt.From})
var hb ObserveHeartbeat
if err := json.Unmarshal(evt.Payload, &hb); err == nil && !hb.SentAt.IsZero() {
latency := time.Since(hb.SentAt)
raw, _ := ps.observeMetrics.LoadOrStore(evt.From, &PeerObserveMetrics{
firstObservedAt: time.Now().UTC(),
})
raw.(*PeerObserveMetrics).record(latency)
fmt.Println("METRICS", raw)
ps.observeMetrics.Store(evt.From, raw)
}
ps.flushObserveForPeer(evt.From, evt.User)
return nil
}
// flushObserveForPeer publishes the current state of one observed peer to NATS.
// The payload carries the peer ID, the "online" state and, when metrics exist
// for that peer, its quality snapshot (null otherwise). It is published twice:
// once as PEER_OBSERVE_RESPONSE_EVENT and once wrapped in a propagation message
// (PB_PROPAGATE) as PROPALGATION_EVENT. A marshalling failure aborts silently.
func (ps *StreamService) flushObserveForPeer(peerID string, user string) {
	var snap *PeerObserveSnapshot
	if stored, found := ps.observeMetrics.Load(peerID); found {
		fmt.Println("RETRIEVED METRICS", stored)
		current := stored.(*PeerObserveMetrics).snapshot()
		snap = &current
	}
	fmt.Println("RETRIEVED METRICS 2", snap)

	body := map[string]interface{}{
		"peer_ids": []string{peerID},
		"state":    "online",
		"metrics":  map[string]*PeerObserveSnapshot{peerID: snap},
	}
	payload, err := json.Marshal(body)
	if err != nil {
		return
	}

	// Direct notification.
	direct := tools.NATSResponse{
		FromApp:  "oc-discovery",
		Datatype: tools.PEER,
		User:     user,
		Method:   int(tools.PEER_OBSERVE_RESPONSE_EVENT),
		Payload:  payload,
	}
	tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, direct)

	// Same payload, wrapped for propagation.
	wrapped := tools.PropalgationMessage{
		DataType: int(tools.PEER),
		Action:   tools.PB_PROPAGATE,
		Payload:  payload,
	}
	propPayload, err := json.Marshal(wrapped)
	if err != nil {
		return
	}
	propagated := tools.NATSResponse{
		FromApp:  "oc-discovery",
		Datatype: tools.PEER,
		User:     user,
		Method:   int(tools.PROPALGATION_EVENT),
		Payload:  propPayload,
	}
	tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, propagated)
}
// ── user→peer index (ref-counted observe management) ─────────────────────────
// userPeerIndex tracks which users are observing which peers.
@@ -514,7 +624,8 @@ func (ps *StreamService) openObserveStream(p ShallowPeer) error {
}
// closeObserveStream closes the ProtocolObserve stream to toPeerID and notifies
// the remote side.
// the remote side. The close event is wrapped in a common.Event so the remote's
// persistent readLoop can decode and handle it (cancel the heartbeat goroutine).
func (ps *StreamService) closeObserveStream(toPeerID string) error {
decodedID, err := pp.Decode(toPeerID)
if err != nil {
@@ -523,12 +634,15 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
ps.Mu.Lock()
if ps.Streams[ProtocolObserve] != nil {
if s, ok := ps.Streams[ProtocolObserve][decodedID]; ok {
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
s.Stream.Close()
delete(ps.Streams[ProtocolObserve], decodedID)
}
}
ps.Mu.Unlock()
ps.observeMetrics.Delete(toPeerID)
return nil
}
@@ -537,7 +651,9 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
func (ps *StreamService) CloseAllObserves() {
ps.Mu.Lock()
for _, s := range ps.Streams[ProtocolObserve] {
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
s.Stream.Close()
}
delete(ps.Streams, ProtocolObserve)
@@ -545,6 +661,10 @@ func (ps *StreamService) CloseAllObserves() {
// Reset user index so stale ref-counts don't block future opens.
ps.observeUsers = newUserPeerIndex()
ps.observeMetrics.Range(func(k, _ any) bool {
ps.observeMetrics.Delete(k)
return true
})
ps.drainMu.Lock()
ps.drainUntil = time.Now().Add(observeDrainDuration)
+40 -13
View File
@@ -61,15 +61,17 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
}
stream, err := ps.write(toPeerID, ad, dt, user, resource, proto)
if err != nil {
if _, ok := dntProtocols[proto]; ok {
ps.dnt.enqueue(&dntEntry{
did: toPeerID,
addr: *ad,
dt: dt,
user: user,
payload: resource,
proto: proto,
addedAt: time.Now().UTC(),
if _, ok := DTNProtocols[proto]; ok {
ps.dnt.enqueue(&DTNEntry{
did: toPeerID,
resourceID: extractResourceID(resource),
forceCritical: pe.Relation == peer.NANO,
addr: *ad,
dt: dt,
user: user,
payload: resource,
proto: proto,
addedAt: time.Now().UTC(),
})
}
return nil, err
@@ -125,20 +127,45 @@ func (ps *StreamService) ToPartnerPublishEvent(
return nil
}
ks := []protocol.ID{}
for k := range protocolsPartners {
ks = append(ks, k)
// Extract creator_id to route to the correct nano.
// A master must only forward a resource to the nano that owns it.
var creatorID string
var minPayload struct {
CreatorID string `json:"creator_id"`
}
for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER, peer.NANO} {
if json.Unmarshal(payload, &minPayload) == nil {
creatorID = minPayload.CreatorID
}
// PARTNER and MASTER receive every resource unconditionally.
for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER} {
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
And: map[string][]dbs.Filter{
"relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
},
}, payload, proto)
}
// NANO: only send to the nano whose UUID matches the resource creator.
if creatorID != "" {
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
And: map[string][]dbs.Filter{
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
"id": {{Operator: dbs.EQUAL.String(), Value: creatorID}},
},
}, payload, proto)
}
return nil
}
// forwardToNano relays an event to an already-resolved NANO peer by publishing
// its payload on the given protocol through PublishCommon. The result of
// PublishCommon is discarded.
func (abs *StreamService) forwardToNano(nano *peer.Peer, evt *common.Event, proto string) {
	kind := tools.DataType(evt.DataType)
	target := protocol.ID(proto)
	abs.PublishCommon(&kind, evt.User, evt.Groups, nano.PeerID, target, evt.Payload)
}
func (s *StreamService) write(
did string,
peerID *pp.AddrInfo,
+51 -3
View File
@@ -27,6 +27,10 @@ const ProtocolConsidersResource = "/opencloud/resource/considers/1.0"
const ProtocolMinioConfigResource = "/opencloud/minio/config/1.0"
const ProtocolAdmiraltyConfigResource = "/opencloud/admiralty/config/1.0"
// ProtocolSourcePresignResource routes PB_SOURCE_PRESIGN to the resource-owner peer.
// The owner generates a pre-signed Minio URL and responds via PB_CONSIDERS.
const ProtocolSourcePresignResource = "/opencloud/resource/source-presign/1.0"
const ProtocolSearchResource = "/opencloud/resource/search/1.0"
const ProtocolCreateResource = "/opencloud/resource/create/1.0"
const ProtocolUpdateResource = "/opencloud/resource/update/1.0"
@@ -43,6 +47,7 @@ var protocols = map[protocol.ID]*common.ProtocolInfo{
ProtocolVerifyResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolMinioConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolAdmiraltyConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolSourcePresignResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolObserve: {WaitResponse: true, TTL: 1 * time.Minute},
}
@@ -63,8 +68,8 @@ type StreamService struct {
// IsPeerKnown, when set, is called at stream open for every inbound protocol.
// Return false to reset the stream immediately. Left nil until wired by the node.
IsPeerKnown func(pid pp.ID) bool
// dnt is the Disconnection Network Tolerance cache for outbound streams.
dnt *dntCache
// dnt is the DTN (Disconnection Network Tolerance) cache for outbound streams.
dnt *DTNCache
// observeCache tracks running heartbeat goroutines on the OBSERVED side.
observeCache *observeCache
// hbBatcher accumulates incoming heartbeats (observing side) and flushes
@@ -78,6 +83,12 @@ type StreamService struct {
// observeUsers tracks which users are observing which peers so streams are
// closed only when the last observer for a peer disconnects.
observeUsers *userPeerIndex
// observeMetrics accumulates connection-quality data per observed peer (observing side).
// Keys are peer_id strings; values are *PeerObserveMetrics.
observeMetrics sync.Map
// dntNudge receives peer IDs for which an immediate DTN retry should be
// attempted (e.g. when the peer just reconnected via PendingCallers).
dntNudge chan string
}
func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node common.DiscoveryPeer) (*StreamService, error) {
@@ -92,6 +103,7 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
dnt: newDNTCache(),
observeCache: newObserveCache(),
observeUsers: newUserPeerIndex(),
dntNudge: make(chan string, 32),
}
service.hbBatcher = newHeartbeatBatcher(flushObserveBatch)
for proto := range protocols {
@@ -105,6 +117,23 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
return service, nil
}
// PendingContacts reports the peer IDs that still have at least one critical
// DTN entry awaiting delivery, as returned by the DTN cache's peersWithPending.
// It is intended to be called on every heartbeat tick to fill the heartbeat's
// PendingContact field.
func (s *StreamService) PendingContacts() []string {
	waiting := s.dnt.peersWithPending()
	return waiting
}
// NudgeContacts asks the DTN loop to retry right away for each of the given
// peer IDs (typically the IDs received in HeartbeatResponse.PendingCallers).
//
// Every send is non-blocking: if the nudge channel cannot accept an ID
// immediately, that ID is dropped instead of stalling the caller.
func (s *StreamService) NudgeContacts(peerIDs []string) {
	for i := 0; i < len(peerIDs); i++ {
		select {
		case s.dntNudge <- peerIDs[i]:
			// Queued for an immediate retry.
		default:
			// Channel not ready; this nudge is dropped (best effort).
		}
	}
}
// gate wraps a stream handler with IsPeerKnown validation.
// If the peer is unknown the entire connection is closed and the handler is not called.
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
@@ -117,6 +146,17 @@ func (s *StreamService) gatePrivilege(h func(network.Stream)) func(network.Strea
},
}, "", false, 0, 1)
if len(d.Data) == 0 {
stream.Reset()
return
}
master := d.Data[0].(*peer.Peer)
if stream.Conn().RemotePeer().String() != master.PeerID {
logger := oclib.GetLogger()
logger.Warn().
Str("remote", stream.Conn().RemotePeer().String()).
Str("master", master.PeerID).
Msg("[gate] nano rejected stream from non-master peer")
stream.Reset()
return
}
}
@@ -162,9 +202,17 @@ func (s *StreamService) HandleResponse(stream network.Stream) {
Stream: stream,
Expiry: time.Now().UTC().Add(expiry + 1*time.Minute),
}
// ProtocolObserve uses a bidirectional long-lived stream: the remote writes
// heartbeats back on the same stream, and may later send a close event.
// Use a persistent readLoop so we can receive both heartbeats and close events.
protoInfo := protocols[stream.Protocol()]
if stream.Protocol() == ProtocolObserve {
protoInfo = &common.ProtocolInfo{PersistantStream: true}
}
go s.readLoop(s.Streams[stream.Protocol()][stream.Conn().RemotePeer()],
stream.Conn().RemotePeer(),
stream.Protocol(), protocols[stream.Protocol()])
stream.Protocol(), protoInfo)
}
func (s *StreamService) connectToPartners() error {