Discovery Neo Oclib
This commit is contained in:
@@ -66,6 +66,10 @@ type Heartbeat struct {
|
||||
// MembershipEvents carries SWIM events piggybacked on this heartbeat.
|
||||
// Events are forwarded infection-style until HopsLeft reaches 0.
|
||||
MembershipEvents []MemberEvent `json:"membership_events,omitempty"`
|
||||
// PendingContact lists peer IDs for which this node has undelivered critical
|
||||
// DTN entries. Indexers maintain an inverted index so those peers can
|
||||
// discover who is waiting for them when they reconnect.
|
||||
PendingContact []string `json:"pending_contact,omitempty"`
|
||||
}
|
||||
|
||||
// SearchPeerRequest is sent by a node to an indexer via ProtocolSearchPeer.
|
||||
@@ -134,6 +138,10 @@ type HeartbeatResponse struct {
|
||||
// MembershipEvents carries SWIM events piggybacked on this response.
|
||||
// The node should forward them to its other indexers (infection-style).
|
||||
MembershipEvents []MemberEvent `json:"membership_events,omitempty"`
|
||||
// PendingCallers lists peer IDs that have undelivered critical DTN messages
|
||||
// for the receiving node, as recorded by this indexer. On receipt the node
|
||||
// should initiate contact with each caller so it can flush its DTN cache.
|
||||
PendingCallers []string `json:"pending_callers,omitempty"`
|
||||
}
|
||||
|
||||
// ComputeIndexerScore computes a composite quality score [0, 100] for the connecting peer.
|
||||
|
||||
@@ -29,7 +29,7 @@ var retryRunning atomic.Bool
|
||||
// peer has at least 3 chances to respond or refute the suspicion signal.
|
||||
const suspectTimeout = 3 * RecommendedHeartbeatInterval
|
||||
|
||||
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, recordFn ...func() json.RawMessage) error {
|
||||
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, hooks ...HeartbeatHooks) error {
|
||||
TimeWatcher = time.Now().UTC()
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
@@ -71,7 +71,7 @@ func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, recordFn ...
|
||||
// Start long-lived heartbeat to seed indexers. The single goroutine follows
|
||||
// all subsequent StaticIndexers changes.
|
||||
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name,
|
||||
h, Indexers, 20*time.Second, maxIndexer, recordFn...)
|
||||
h, Indexers, 20*time.Second, maxIndexer, hooks...)
|
||||
|
||||
// Watch for inbound connections: if a peer connects to us and our pool has
|
||||
// room, probe it first to confirm it supports ProtocolHeartbeat (i.e. it is
|
||||
@@ -270,17 +270,29 @@ func handleSuggestions(d *Directory, from string, suggestions []pp.AddrInfo) {
|
||||
}
|
||||
}
|
||||
|
||||
// HeartbeatHooks bundles the optional callbacks that SendHeartbeat consults
// while running its heartbeat loop. Every field may be left nil, in which case
// the corresponding step is skipped.
type HeartbeatHooks struct {
	// RecordFn, when non-nil, is called while building each outgoing heartbeat
	// and its result is stored in the heartbeat's Record field. Callers are
	// expected to return a freshly signed PeerRecord.
	RecordFn func() json.RawMessage

	// PendingContactFn, when non-nil, is called while building each outgoing
	// heartbeat and its result is stored in the heartbeat's PendingContact
	// field: the peer IDs for which the caller still holds undelivered
	// critical DTN entries.
	PendingContactFn func() []string

	// OnPendingCallers, when non-nil, is invoked with the PendingCallers list
	// of an indexer response whenever that list is non-empty. These are peers
	// holding undelivered messages for us; the callback should initiate
	// contact with each of them so they can flush their DTN cache.
	OnPendingCallers func(callerPeerIDs []string)
}
|
||||
|
||||
// SendHeartbeat starts a goroutine that sends periodic heartbeats to peers.
|
||||
// recordFn, when provided, is called on each tick and its output is embedded in
|
||||
// the heartbeat as a fresh signed PeerRecord so the receiving indexer can
|
||||
// republish it to the DHT without an extra round-trip.
|
||||
// Pass no recordFn (or nil) for indexer→indexer / native heartbeats.
|
||||
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, directory *Directory, interval time.Duration, maxPool int, recordFn ...func() json.RawMessage) {
|
||||
// hooks.RecordFn, when set, is called on each tick and its output is embedded
|
||||
// in the heartbeat as a fresh signed PeerRecord.
|
||||
// Pass an empty HeartbeatHooks (or none) for indexer→indexer / native heartbeats.
|
||||
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, directory *Directory, interval time.Duration, maxPool int, hooks ...HeartbeatHooks) {
|
||||
logger := oclib.GetLogger()
|
||||
isIndexerHB := directory == Indexers
|
||||
var recFn func() json.RawMessage
|
||||
if len(recordFn) > 0 {
|
||||
recFn = recordFn[0]
|
||||
var hk HeartbeatHooks
|
||||
if len(hooks) > 0 {
|
||||
hk = hooks[0]
|
||||
}
|
||||
go func() {
|
||||
logger.Info().Str("proto", string(proto)).Int("peers", len(directory.Addrs)).Msg("heartbeat started")
|
||||
@@ -306,8 +318,11 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
IndexersBinded: addrs,
|
||||
Need: need,
|
||||
}
|
||||
if recFn != nil {
|
||||
baseHB.Record = recFn()
|
||||
if hk.RecordFn != nil {
|
||||
baseHB.Record = hk.RecordFn()
|
||||
}
|
||||
if hk.PendingContactFn != nil {
|
||||
baseHB.PendingContact = hk.PendingContactFn()
|
||||
}
|
||||
// Piggyback SWIM membership events on every outgoing heartbeat batch.
|
||||
// All peers in the pool receive the same events this tick.
|
||||
@@ -550,6 +565,12 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
handleSuggestions(directory, ai.Info.ID.String(), resp.Suggestions)
|
||||
}
|
||||
|
||||
// PendingCallers: peers that have undelivered DTN messages for us.
|
||||
// Signal the DTN layer so it can flush immediately when it reaches them.
|
||||
if resp != nil && len(resp.PendingCallers) > 0 && hk.OnPendingCallers != nil {
|
||||
hk.OnPendingCallers(resp.PendingCallers)
|
||||
}
|
||||
|
||||
// Handle SuggestMigrate: indexer is overloaded and wants us to move.
|
||||
if resp != nil && resp.SuggestMigrate && isIndexerHB {
|
||||
nonSeedCount := 0
|
||||
|
||||
@@ -261,7 +261,10 @@ func (ix *LongLivedStreamRecordedService[T]) HandleHeartbeat(s network.Stream) {
|
||||
}
|
||||
|
||||
func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams map[pp.ID]HeartBeatStreamed, lock *sync.RWMutex, maxNodes int) (*pp.ID, *Heartbeat, error) {
|
||||
if len(h.Network().Peers()) >= maxNodes {
|
||||
// Use the heartbeat stream count, not h.Network().Peers(), which includes
|
||||
// upstream indexer connections, short-lived protocol streams (publish/get/probe),
|
||||
// and zombie libp2p connections whose heartbeat stream has already been GC'd.
|
||||
if len(streams) >= maxNodes {
|
||||
return nil, nil, fmt.Errorf("too many connections, try another indexer")
|
||||
}
|
||||
var hb Heartbeat
|
||||
@@ -285,9 +288,11 @@ func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams ma
|
||||
// E: measure the indexer's own subnet diversity, not the node's view.
|
||||
diversity := getOwnDiversityRate(h)
|
||||
// fillRate: fraction of indexer capacity used — higher = more peers trust this indexer.
|
||||
// Use heartbeat stream count (same as fill rate reported to nodes), not
|
||||
// h.Network().Peers() which inflates the count with upstream/probe connections.
|
||||
fillRate := 0.0
|
||||
if maxNodes > 0 {
|
||||
fillRate = float64(len(h.Network().Peers())) / float64(maxNodes)
|
||||
fillRate = float64(len(streams)) / float64(maxNodes)
|
||||
if fillRate > 1 {
|
||||
fillRate = 1
|
||||
}
|
||||
|
||||
@@ -184,11 +184,10 @@ func TempStream(h host.Host, ad pp.AddrInfo, proto protocol.ID, did string, stre
|
||||
}
|
||||
ctxTTL, cancelTTL := context.WithTimeout(context.Background(), expiry)
|
||||
defer cancelTTL()
|
||||
|
||||
if h.Network().Connectedness(ad.ID) != network.Connected {
|
||||
fmt.Println(ad.ID, len(h.Network().ConnsToPeer(ad.ID)))
|
||||
if len(h.Network().ConnsToPeer(ad.ID)) == 0 {
|
||||
if err := h.Connect(ctxTTL, ad); err != nil {
|
||||
fmt.Println("Connectedness", ad.ID, err)
|
||||
|
||||
return streams, err
|
||||
}
|
||||
}
|
||||
@@ -233,7 +232,8 @@ func sendHeartbeat(ctx context.Context, h host.Host, proto protocol.ID, p *pp.Ad
|
||||
pss, exists := streams[p.ID]
|
||||
ctxTTL, cancel := context.WithTimeout(ctx, 3*interval)
|
||||
defer cancel()
|
||||
if h.Network().Connectedness(p.ID) != network.Connected {
|
||||
fmt.Println(p.ID, len(h.Network().ConnsToPeer(p.ID)))
|
||||
if len(h.Network().ConnsToPeer(p.ID)) == 0 {
|
||||
if err := h.Connect(ctxTTL, *p); err != nil {
|
||||
logger.Err(err)
|
||||
return nil, 0, err
|
||||
|
||||
@@ -3,12 +3,12 @@ package common
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"github.com/libp2p/go-libp2p/core/host"
|
||||
"github.com/libp2p/go-libp2p/core/network"
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
)
|
||||
|
||||
@@ -153,7 +153,8 @@ func TriggerConsensus(h host.Host, remaining []pp.AddrInfo, need int) {
|
||||
func probeIndexer(h host.Host, ai pp.AddrInfo) (*HeartbeatResponse, time.Duration, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second)
|
||||
defer cancel()
|
||||
if h.Network().Connectedness(ai.ID) != network.Connected {
|
||||
fmt.Println(ai.ID, len(h.Network().ConnsToPeer(ai.ID)))
|
||||
if len(h.Network().ConnsToPeer(ai.ID)) == 0 {
|
||||
if err := h.Connect(ctx, ai); err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
@@ -39,6 +39,8 @@ type PeerRecordPayload struct {
|
||||
PubKey []byte `json:"public_key"`
|
||||
ExpiryDate time.Time `json:"expiry_date"`
|
||||
IsNano bool `json:"is_nano"`
|
||||
// MasterID is the libp2p PeerID of this peer's MASTER, self-attested and signed.
|
||||
MasterID string `json:"master_id,omitempty"`
|
||||
// TTLSeconds is the publisher's declared lifetime for this record in seconds.
|
||||
// 0 means "use the default (120 s)". Included in the signed payload so it
|
||||
// cannot be altered by an intermediary.
|
||||
@@ -105,6 +107,7 @@ func (pr *PeerRecord) ExtractPeer(ourkey string, key string, pubKey crypto.PubKe
|
||||
NATSAddress: pr.NATSAddress,
|
||||
WalletAddress: pr.WalletAddress,
|
||||
Location: pr.Location,
|
||||
MasterID: pr.MasterID,
|
||||
}
|
||||
if time.Now().UTC().After(pr.ExpiryDate) {
|
||||
return pp.SELF == p.Relation, nil, errors.New("peer " + key + " is offline")
|
||||
@@ -285,6 +288,20 @@ func (ix *IndexerService) initNodeHandler() {
|
||||
}
|
||||
cancel2()
|
||||
}
|
||||
// PendingContact: update inverted index — for each target peer in the list,
|
||||
// record that hb.PeerID wants to contact it. Entries expire after 3 heartbeat
|
||||
// intervals so stale callers are cleaned up automatically if they stop advertising.
|
||||
if len(hb.PendingContact) > 0 {
|
||||
expiry := time.Now().Add(3 * 20 * time.Second)
|
||||
ix.pendingContactIndexMu.Lock()
|
||||
for _, targetID := range hb.PendingContact {
|
||||
if ix.pendingContactIndex[targetID] == nil {
|
||||
ix.pendingContactIndex[targetID] = map[string]time.Time{}
|
||||
}
|
||||
ix.pendingContactIndex[targetID][hb.PeerID] = expiry
|
||||
}
|
||||
ix.pendingContactIndexMu.Unlock()
|
||||
}
|
||||
}
|
||||
ix.Host.SetStreamHandler(common.ProtocolHeartbeat, ix.HandleHeartbeat)
|
||||
ix.Host.SetStreamHandler(common.ProtocolPublish, ix.handleNodePublish)
|
||||
@@ -351,7 +368,8 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
|
||||
}
|
||||
continue
|
||||
}
|
||||
if _, err := rec.Verify(); err != nil {
|
||||
pubKey, err := rec.Verify()
|
||||
if err != nil {
|
||||
ix.behavior.RecordBadSignature(remotePeer)
|
||||
logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("bad signature on publish")
|
||||
return
|
||||
@@ -369,6 +387,26 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
// Chain of trust: PubKey → PeerID (libp2p invariant), then transport identity.
|
||||
// This prevents a peer from publishing a record on behalf of someone else.
|
||||
if derivedID, err := lpp.IDFromPublicKey(pubKey); err != nil || derivedID != pid {
|
||||
ix.behavior.RecordBadSignature(remotePeer)
|
||||
logger.Warn().Str("peer", remotePeer.String()).Msg("PubKey/PeerID mismatch on publish")
|
||||
s.Reset()
|
||||
return
|
||||
}
|
||||
if remotePeer != pid {
|
||||
ix.behavior.RecordBadSignature(remotePeer)
|
||||
logger.Warn().Str("remote", remotePeer.String()).Str("claimed", pid.String()).Msg("transport identity mismatch on publish")
|
||||
s.Reset()
|
||||
return
|
||||
}
|
||||
if rec.StreamAddress != "" && !strings.HasSuffix(rec.StreamAddress, "/p2p/"+rec.PeerID) {
|
||||
ix.behavior.RecordBadSignature(remotePeer)
|
||||
logger.Warn().Str("peer", remotePeer.String()).Msg("StreamAddress/PeerID mismatch on publish")
|
||||
s.Reset()
|
||||
return
|
||||
}
|
||||
|
||||
ix.StreamMU.Lock()
|
||||
defer ix.StreamMU.Unlock()
|
||||
@@ -566,7 +604,7 @@ func (ix *IndexerService) handleIndirectProbe(s network.Stream) {
|
||||
// Connect to target if not already connected.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 6*time.Second)
|
||||
defer cancel()
|
||||
if ix.Host.Network().Connectedness(req.Target.ID) != network.Connected {
|
||||
if len(ix.Host.Network().ConnsToPeer(req.Target.ID)) == 0 {
|
||||
if err := ix.Host.Connect(ctx, req.Target); err != nil {
|
||||
respond(false, 0)
|
||||
return
|
||||
|
||||
@@ -79,6 +79,11 @@ type IndexerService struct {
|
||||
// eventQueue holds SWIM membership events to be piggybacked on responses
|
||||
// (infection-style dissemination toward connected nodes).
|
||||
eventQueue *common.MembershipEventQueue
|
||||
// pendingContactIndex is an inverted index built from Heartbeat.PendingContact.
|
||||
// Maps target peer ID → { caller peer ID → expiry time }.
|
||||
// Returned in HeartbeatResponse.PendingCallers when the target reconnects.
|
||||
pendingContactIndex map[string]map[string]time.Time
|
||||
pendingContactIndexMu sync.Mutex
|
||||
}
|
||||
|
||||
// NewIndexerService creates an IndexerService.
|
||||
@@ -95,6 +100,7 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
|
||||
behavior: newNodeBehaviorTracker(),
|
||||
deletedDIDs: make(map[string]time.Time),
|
||||
eventQueue: &common.MembershipEventQueue{},
|
||||
pendingContactIndex: map[string]map[string]time.Time{},
|
||||
}
|
||||
if ps == nil {
|
||||
ps, err = pubsub.NewGossipSub(context.Background(), ix.Host)
|
||||
@@ -408,6 +414,24 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
|
||||
resp.Incarnation = ix.incarnation.Load()
|
||||
resp.MembershipEvents = ix.eventQueue.Drain(5)
|
||||
|
||||
// PendingCallers: look up who has undelivered messages for this node.
|
||||
// Clean up expired entries at the same time.
|
||||
ix.pendingContactIndexMu.Lock()
|
||||
if callers, ok := ix.pendingContactIndex[remotePeer.String()]; ok {
|
||||
now := time.Now()
|
||||
for callerID, exp := range callers {
|
||||
if now.Before(exp) {
|
||||
resp.PendingCallers = append(resp.PendingCallers, callerID)
|
||||
} else {
|
||||
delete(callers, callerID)
|
||||
}
|
||||
}
|
||||
if len(callers) == 0 {
|
||||
delete(ix.pendingContactIndex, remotePeer.String())
|
||||
}
|
||||
}
|
||||
ix.pendingContactIndexMu.Unlock()
|
||||
|
||||
return resp
|
||||
}
|
||||
|
||||
|
||||
+32
-3
@@ -10,6 +10,7 @@ import (
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/config"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
pp_model "cloud.o-forge.io/core/oc-lib/models/peer"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
@@ -51,11 +52,13 @@ func ListenNATS(n *Node) {
|
||||
}
|
||||
if err == nil {
|
||||
switch propalgation.Action {
|
||||
case tools.PB_ADMIRALTY_CONFIG, tools.PB_MINIO_CONFIG:
|
||||
case tools.PB_ADMIRALTY_CONFIG, tools.PB_MINIO_CONFIG, tools.PB_SOURCE_PRESIGN:
|
||||
var m configPayload
|
||||
var proto protocol.ID = stream.ProtocolAdmiraltyConfigResource
|
||||
if propalgation.Action == tools.PB_MINIO_CONFIG {
|
||||
proto = stream.ProtocolMinioConfigResource
|
||||
} else if propalgation.Action == tools.PB_SOURCE_PRESIGN {
|
||||
proto = stream.ProtocolSourcePresignResource
|
||||
}
|
||||
if err := json.Unmarshal(propalgation.Payload, &m); err == nil {
|
||||
peers, _ := n.GetPeerRecord(context.Background(), m.PeerID)
|
||||
@@ -68,9 +71,33 @@ func ListenNATS(n *Node) {
|
||||
if slices.Contains([]tools.DataType{tools.BOOKING, tools.PURCHASE_RESOURCE}, resp.Datatype) {
|
||||
m := map[string]interface{}{}
|
||||
if err := json.Unmarshal(propalgation.Payload, &m); err == nil {
|
||||
delivered := false
|
||||
if m["peer_id"] != nil {
|
||||
n.StreamService.PublishCommon(&resp.Datatype, resp.User, resp.Groups,
|
||||
_, err := n.StreamService.PublishCommon(&resp.Datatype, resp.User, resp.Groups,
|
||||
fmt.Sprintf("%v", m["peer_id"]), stream.ProtocolCreateResource, propalgation.Payload)
|
||||
delivered = err == nil
|
||||
}
|
||||
if !delivered {
|
||||
// NANO unreachable — look up its MasterID from the DB record.
|
||||
// The NANO self-attests its MASTER in its signed PeerRecord;
|
||||
// if MasterID is set we forward there, otherwise we drop silently.
|
||||
var destStruct struct {
|
||||
DestPeerID string `json:"dest_peer_id"`
|
||||
}
|
||||
if json.Unmarshal(propalgation.Payload, &destStruct) == nil && destStruct.DestPeerID != "" {
|
||||
d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"id": {{Operator: dbs.EQUAL.String(), Value: destStruct.DestPeerID}},
|
||||
},
|
||||
}, "", false, 0, 1)
|
||||
if len(d.Data) > 0 {
|
||||
nano := d.Data[0].(*pp_model.Peer)
|
||||
if nano.MasterID != "" {
|
||||
n.StreamService.PublishCommon(&resp.Datatype, resp.User, resp.Groups,
|
||||
nano.MasterID, stream.ProtocolCreateResource, propalgation.Payload)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -150,6 +177,7 @@ func ListenNATS(n *Node) {
|
||||
// Re-emit on PEER_OBSERVE_RESPONSE_EVENT so the local oc-peer sees it.
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
|
||||
FromApp: resp.FromApp,
|
||||
User: resp.User,
|
||||
Datatype: tools.PEER,
|
||||
Method: int(tools.PEER_OBSERVE_RESPONSE_EVENT),
|
||||
Payload: propalgation.Payload,
|
||||
@@ -183,7 +211,7 @@ func ListenNATS(n *Node) {
|
||||
} else {
|
||||
m := map[string]interface{}{}
|
||||
if err := json.Unmarshal(propalgation.Payload, &m); err == nil {
|
||||
fmt.Println("PB_SEARCH CATA", m)
|
||||
fmt.Println("PB_SEARCH CATA", m, resp.User)
|
||||
|
||||
n.PubSubService.SearchPublishEvent(
|
||||
context.Background(),
|
||||
@@ -256,6 +284,7 @@ func handlePeerBehaviorEvent(n *Node, resp tools.NATSResponse) {
|
||||
if b, err := json.Marshal(p.Serialize(p)); err == nil {
|
||||
tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: resp.User,
|
||||
Datatype: tools.PEER,
|
||||
Method: int(tools.CREATE_RESOURCE),
|
||||
Payload: b,
|
||||
|
||||
+23
-2
@@ -123,8 +123,25 @@ func InitNode(isNode bool, isIndexer bool) (*Node, error) {
|
||||
b, _ := json.Marshal(fresh)
|
||||
return json.RawMessage(b)
|
||||
}
|
||||
// streamSvcRef is set after InitStream below; the heartbeat goroutine
|
||||
// first fires after 20 s so it is always non-nil by then.
|
||||
var streamSvcRef *stream.StreamService
|
||||
logger.Info().Msg("connect to indexers...")
|
||||
common.ConnectToIndexers(node.Host, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer, buildRecord)
|
||||
common.ConnectToIndexers(node.Host, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer,
|
||||
common.HeartbeatHooks{
|
||||
RecordFn: buildRecord,
|
||||
PendingContactFn: func() []string {
|
||||
if streamSvcRef != nil {
|
||||
return streamSvcRef.PendingContacts()
|
||||
}
|
||||
return nil
|
||||
},
|
||||
OnPendingCallers: func(callers []string) {
|
||||
if streamSvcRef != nil {
|
||||
streamSvcRef.NudgeContacts(callers)
|
||||
}
|
||||
},
|
||||
})
|
||||
logger.Info().Msg("claims my node...")
|
||||
if _, err := node.claimInfo(conf.GetConfig().Name, conf.GetConfig().Hostname); err != nil {
|
||||
panic(err)
|
||||
@@ -135,6 +152,7 @@ func InitNode(isNode bool, isIndexer bool) (*Node, error) {
|
||||
if node.StreamService, err = stream.InitStream(context.Background(), node.Host, node.PeerID, 1000, node); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
streamSvcRef = node.StreamService
|
||||
node.StreamService.IsPeerKnown = func(pid pp.ID) bool {
|
||||
// 1. Local DB: known peer (handles blacklist).
|
||||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
|
||||
@@ -178,7 +196,7 @@ func InitNode(isNode bool, isIndexer bool) (*Node, error) {
|
||||
}
|
||||
fmt.Println("PUBSUB SendResponse bef peerrece")
|
||||
if p, err := node.GetPeerRecord(ctx, evt.From); err == nil && len(p) > 0 && m["search"] != nil {
|
||||
fmt.Println("PUBSUB SendResponse af peerrece", m)
|
||||
fmt.Println("PUBSUB SendResponse af peerrece", m, evt.User)
|
||||
node.StreamService.SendResponse(p[0], &evt, fmt.Sprintf("%v", m["search"]))
|
||||
}
|
||||
}
|
||||
@@ -411,8 +429,10 @@ func (d *Node) claimInfo(
|
||||
"peer_id": {{Operator: dbs.EQUAL.String(), Value: d.Host.ID().String()}},
|
||||
},
|
||||
}, "", false, 0, 1)
|
||||
var masterID string
|
||||
if len(peers.Data) > 0 {
|
||||
did = peers.Data[0].GetID() // if already existing set up did as made
|
||||
masterID = peers.Data[0].(*peer.Peer).MasterID
|
||||
}
|
||||
priv, err := tools.LoadKeyFromFilePrivate()
|
||||
if err != nil {
|
||||
@@ -434,6 +454,7 @@ func (d *Node) claimInfo(
|
||||
PubKey: pubBytes,
|
||||
|
||||
IsNano: oclib.GetConfig().IsNano,
|
||||
MasterID: masterID,
|
||||
TTLSeconds: indexer.DefaultTTLSeconds,
|
||||
ExpiryDate: now.Add(indexer.DefaultTTLSeconds * time.Second),
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ func (ps *PubSubService) SearchPublishEvent(
|
||||
// remote peers echo it back unchanged, allowing IsActive to validate results.
|
||||
searchKey := ps.StreamService.ResourceSearches.Register(user, cancel, idleTimeout)
|
||||
fmt.Println("PUBLISH ON PUBSUB", common.TopicPubSubSearch, searchKey)
|
||||
return ps.publishEvent(searchCtx, dt, tools.PB_SEARCH, common.TopicPubSubSearch, searchKey, b)
|
||||
return ps.publishEvent(searchCtx, dt, tools.PB_SEARCH, common.TopicPubSubSearch, user, b)
|
||||
default:
|
||||
return errors.New("no type of research found")
|
||||
}
|
||||
|
||||
@@ -1,362 +0,0 @@
|
||||
package stream
|
||||
|
||||
// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
|
||||
//
|
||||
// When a stream write fails because the remote peer is unreachable, the request
|
||||
// is saved here and retried on the next tick. Two levels are defined:
|
||||
//
|
||||
// - dntCritical : retry indefinitely (create / update / delete resource).
|
||||
// - dntModerate : up to dntMaxModerateRetries retries, then abandon.
|
||||
//
|
||||
// Pubsub messages and search streams are explicitly excluded.
|
||||
// Streams initiated from the indexer side are never enqueued here.
|
||||
//
|
||||
// # Crash-resilient persistence
|
||||
//
|
||||
// Critical entries are written to an encrypted file (AES-256-GCM) so they
|
||||
// survive a node crash/restart. The AES key is derived deterministically from
|
||||
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
|
||||
// Moderate entries are intentionally not persisted: their retry budget is small
|
||||
// enough that re-loading them after a restart would be misleading.
|
||||
|
||||
import (
|
||||
"crypto/aes"
|
||||
"crypto/cipher"
|
||||
"crypto/rand"
|
||||
"crypto/sha256"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"golang.org/x/crypto/hkdf"
|
||||
|
||||
"oc-discovery/conf"
|
||||
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
"github.com/libp2p/go-libp2p/core/protocol"
|
||||
)
|
||||
|
||||
// dntLevel classifies how persistently a failed outbound request is retried.
type dntLevel int

const (
	// dntCritical entries are retried until the message is delivered.
	dntCritical dntLevel = iota
	// dntModerate entries are retried up to dntMaxModerateRetries times.
	dntModerate
)

const (
	// dntMaxModerateRetries is the retry budget of a dntModerate entry.
	dntMaxModerateRetries = 3
	// dntRetryInterval is the period of the background retry ticker.
	dntRetryInterval = 15 * time.Second
)
|
||||
|
||||
// dntProtocols maps each stream protocol to its DNT level.
|
||||
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
|
||||
var dntProtocols = map[protocol.ID]dntLevel{
|
||||
// Critical — data mutations that must eventually be delivered.
|
||||
ProtocolCreateResource: dntCritical,
|
||||
ProtocolUpdateResource: dntCritical,
|
||||
ProtocolDeleteResource: dntCritical,
|
||||
// Moderate — confirmations / config / planner: 3 retries before abandon.
|
||||
ProtocolVerifyResource: dntModerate,
|
||||
ProtocolSendPlanner: dntModerate,
|
||||
ProtocolConsidersResource: dntModerate,
|
||||
ProtocolMinioConfigResource: dntModerate,
|
||||
ProtocolAdmiraltyConfigResource: dntModerate,
|
||||
}
|
||||
|
||||
// dntEntryJSON is the on-disk representation of a dntEntry.
|
||||
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
|
||||
type dntEntryJSON struct {
|
||||
DID string `json:"did"`
|
||||
Addr pp.AddrInfo `json:"addr"`
|
||||
DT *tools.DataType `json:"dt,omitempty"`
|
||||
User string `json:"user"`
|
||||
Payload []byte `json:"payload"`
|
||||
Proto protocol.ID `json:"proto"`
|
||||
Retries int `json:"retries"`
|
||||
AddedAt time.Time `json:"added_at"`
|
||||
}
|
||||
|
||||
type dntEntry struct {
|
||||
did string
|
||||
addr pp.AddrInfo
|
||||
dt *tools.DataType
|
||||
user string
|
||||
payload []byte
|
||||
proto protocol.ID
|
||||
retries int
|
||||
addedAt time.Time
|
||||
}
|
||||
|
||||
func (e *dntEntry) toJSON() dntEntryJSON {
|
||||
return dntEntryJSON{
|
||||
DID: e.did,
|
||||
Addr: e.addr,
|
||||
DT: e.dt,
|
||||
User: e.user,
|
||||
Payload: e.payload,
|
||||
Proto: e.proto,
|
||||
Retries: e.retries,
|
||||
AddedAt: e.addedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func entryFromJSON(j dntEntryJSON) *dntEntry {
|
||||
return &dntEntry{
|
||||
did: j.DID,
|
||||
addr: j.Addr,
|
||||
dt: j.DT,
|
||||
user: j.User,
|
||||
payload: j.Payload,
|
||||
proto: j.Proto,
|
||||
retries: j.Retries,
|
||||
addedAt: j.AddedAt,
|
||||
}
|
||||
}
|
||||
|
||||
type dntCache struct {
|
||||
mu sync.Mutex
|
||||
entries []*dntEntry
|
||||
// aesKey is the derived AES-256 key used for on-disk encryption.
|
||||
// Nil when key derivation failed: persistence is disabled but the in-memory
|
||||
// cache continues to function normally.
|
||||
aesKey []byte
|
||||
}
|
||||
|
||||
// newDNTCache initialises the cache, derives the encryption key, and restores
|
||||
// any critical entries that were persisted before the last crash.
|
||||
func newDNTCache() *dntCache {
|
||||
log := oclib.GetLogger()
|
||||
c := &dntCache{}
|
||||
key, err := deriveDNTKey()
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
|
||||
} else {
|
||||
c.aesKey = key
|
||||
c.loadFromDisk()
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// enqueue adds an entry to the cache and persists critical entries to disk.
|
||||
func (c *dntCache) enqueue(e *dntEntry) {
|
||||
c.mu.Lock()
|
||||
c.entries = append(c.entries, e)
|
||||
c.mu.Unlock()
|
||||
if dntProtocols[e.proto] == dntCritical {
|
||||
go c.persistToDisk()
|
||||
}
|
||||
}
|
||||
|
||||
// drain atomically removes and returns all current entries.
|
||||
func (c *dntCache) drain() []*dntEntry {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
out := c.entries
|
||||
c.entries = nil
|
||||
return out
|
||||
}
|
||||
|
||||
// requeue puts entries back at the head of the list, preserving any new
|
||||
// entries added while the retry loop was running.
|
||||
func (c *dntCache) requeue(entries []*dntEntry) {
|
||||
if len(entries) == 0 {
|
||||
return
|
||||
}
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
c.entries = append(entries, c.entries...)
|
||||
}
|
||||
|
||||
// ── Persistence ──────────────────────────────────────────────────────────────
|
||||
|
||||
// dntCachePath returns the path of the on-disk cache file, placed next to the
|
||||
// node's private key so it lives on the same persistent volume.
|
||||
func dntCachePath() string {
|
||||
return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
|
||||
}
|
||||
|
||||
// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
|
||||
// using HKDF-SHA256. The derivation is deterministic: the same key is always
|
||||
// produced from the same private key, so no symmetric secret needs storing.
|
||||
func deriveDNTKey() ([]byte, error) {
|
||||
priv, err := tools.LoadKeyFromFilePrivate()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
|
||||
// (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
|
||||
raw, err := priv.Raw()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
|
||||
key := make([]byte, 32)
|
||||
if _, err := io.ReadFull(reader, key); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return key, nil
|
||||
}
|
||||
|
||||
// persistToDisk encrypts all current critical entries and writes them to disk.
|
||||
// Non-critical entries are deliberately excluded — they are not worth restoring
|
||||
// after a restart given their limited retry budget.
|
||||
func (c *dntCache) persistToDisk() {
|
||||
if c.aesKey == nil {
|
||||
return
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
c.mu.Lock()
|
||||
var toSave []dntEntryJSON
|
||||
for _, e := range c.entries {
|
||||
if dntProtocols[e.proto] == dntCritical {
|
||||
toSave = append(toSave, e.toJSON())
|
||||
}
|
||||
}
|
||||
c.mu.Unlock()
|
||||
|
||||
plaintext, err := json.Marshal(toSave)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
block, err := aes.NewCipher(c.aesKey)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
gcm, err := cipher.NewGCM(block)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
nonce := make([]byte, gcm.NonceSize())
|
||||
if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
|
||||
return
|
||||
}
|
||||
ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)
|
||||
|
||||
path := dntCachePath()
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to write cache file")
|
||||
return
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
|
||||
_ = os.Remove(tmp)
|
||||
}
|
||||
}
|
||||
|
||||
// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
|
||||
// Errors (missing file, decryption failure) are non-fatal: the cache simply
|
||||
// starts empty, which is safe.
|
||||
func (c *dntCache) loadFromDisk() {
|
||||
if c.aesKey == nil {
|
||||
return
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
path := dntCachePath()
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to read cache file")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
block, err := aes.NewCipher(c.aesKey)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
gcm, err := cipher.NewGCM(block)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if len(data) < gcm.NonceSize() {
|
||||
log.Warn().Msg("[dnt] cache file too short, ignoring")
|
||||
return
|
||||
}
|
||||
nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
|
||||
plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
|
||||
return
|
||||
}
|
||||
|
||||
var saved []dntEntryJSON
|
||||
if err := json.Unmarshal(plaintext, &saved); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
|
||||
return
|
||||
}
|
||||
|
||||
count := 0
|
||||
for _, j := range saved {
|
||||
// Only restore critical entries — moderate entries are intentionally
|
||||
// not persisted, but this guard defends against format changes.
|
||||
if dntProtocols[j.Proto] != dntCritical {
|
||||
continue
|
||||
}
|
||||
c.entries = append(c.entries, entryFromJSON(j))
|
||||
count++
|
||||
}
|
||||
if count > 0 {
|
||||
log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Retry loop ────────────────────────────────────────────────────────────────
|
||||
|
||||
// startDNTLoop runs the background retry goroutine. Call once after init.
|
||||
func (s *StreamService) startDNTLoop() {
|
||||
logger := oclib.GetLogger()
|
||||
ticker := time.NewTicker(dntRetryInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
entries := s.dnt.drain()
|
||||
if len(entries) == 0 {
|
||||
continue
|
||||
}
|
||||
var keep []*dntEntry
|
||||
for _, e := range entries {
|
||||
_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
|
||||
if err == nil {
|
||||
level := dntProtocols[e.proto]
|
||||
if level == dntCritical {
|
||||
logger.Info().
|
||||
Str("proto", string(e.proto)).
|
||||
Str("peer", e.did).
|
||||
Msg("[dnt] critical message delivered after retry")
|
||||
} else {
|
||||
logger.Info().
|
||||
Str("proto", string(e.proto)).
|
||||
Str("peer", e.did).
|
||||
Int("retries", e.retries).
|
||||
Msg("[dnt] moderate message delivered after retry")
|
||||
}
|
||||
continue
|
||||
}
|
||||
level := dntProtocols[e.proto]
|
||||
switch level {
|
||||
case dntCritical:
|
||||
keep = append(keep, e)
|
||||
case dntModerate:
|
||||
e.retries++
|
||||
if e.retries < dntMaxModerateRetries {
|
||||
keep = append(keep, e)
|
||||
} else {
|
||||
logger.Warn().
|
||||
Str("proto", string(e.proto)).
|
||||
Str("peer", e.did).
|
||||
Int("retries", e.retries).
|
||||
Msg("[dnt] moderate message abandoned after max retries")
|
||||
}
|
||||
}
|
||||
}
|
||||
s.dnt.requeue(keep)
|
||||
// Persist after each tick so the on-disk file reflects the current
|
||||
// state (entries delivered are removed, new ones from concurrent
|
||||
// enqueues are included).
|
||||
go s.dnt.persistToDisk()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,446 @@
|
||||
package stream
|
||||
|
||||
// DTN_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
|
||||
//
|
||||
// When a stream write fails because the remote peer is unreachable, the request
|
||||
// is saved here and retried on the next tick. Two levels are defined:
|
||||
//
|
||||
// - DTNCritical : retry indefinitely (create / update / delete resource).
|
||||
// - DTNModerate : up to DTNMaxModerateRetries retries, then abandon.
|
||||
//
|
||||
// Pubsub messages and search streams are explicitly excluded.
|
||||
// Streams initiated from the indexer side are never enqueued here.
|
||||
//
|
||||
// # Crash-resilient persistence
|
||||
//
|
||||
// Critical entries are written to an encrypted file (AES-256-GCM) so they
|
||||
// survive a node crash/restart. The AES key is derived deterministically from
|
||||
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
|
||||
// Moderate entries are intentionally not persisted: their retry budget is small
|
||||
// enough that re-loading them after a restart would be misleading.
|
||||
|
||||
import (
|
||||
"crypto/aes"
|
||||
"crypto/cipher"
|
||||
"crypto/rand"
|
||||
"crypto/sha256"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"golang.org/x/crypto/hkdf"
|
||||
|
||||
"oc-discovery/conf"
|
||||
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
"github.com/libp2p/go-libp2p/core/protocol"
|
||||
)
|
||||
|
||||
// DTNLevel classifies how persistently a failed outbound request is retried.
type DTNLevel int

const (
	// DTNCritical entries are retried until the message is delivered.
	DTNCritical DTNLevel = iota
	// DTNModerate entries are retried up to DTNMaxModerateRetries times.
	DTNModerate
)

const (
	// DTNMaxModerateRetries is the retry budget of a DTNModerate entry.
	DTNMaxModerateRetries = 3
	// DTNRetryInterval is the period of the retry ticker.
	DTNRetryInterval = 15 * time.Second
)
|
||||
|
||||
// DTNProtocols maps each stream protocol to its DTN level.
|
||||
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
|
||||
var DTNProtocols = map[protocol.ID]DTNLevel{
|
||||
// Critical — data mutations that must eventually be delivered.
|
||||
ProtocolCreateResource: DTNCritical,
|
||||
ProtocolUpdateResource: DTNCritical,
|
||||
ProtocolDeleteResource: DTNCritical,
|
||||
// Moderate — confirmations / config / planner: 3 retries before abandon.
|
||||
ProtocolVerifyResource: DTNModerate,
|
||||
ProtocolSendPlanner: DTNModerate,
|
||||
ProtocolConsidersResource: DTNModerate,
|
||||
ProtocolMinioConfigResource: DTNModerate,
|
||||
ProtocolAdmiraltyConfigResource: DTNModerate,
|
||||
ProtocolSourcePresignResource: DTNModerate,
|
||||
}
|
||||
|
||||
// DTNEntryJSON is the on-disk representation of a DTNEntry.
|
||||
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
|
||||
type DTNEntryJSON struct {
|
||||
DID string `json:"did"`
|
||||
ResourceID string `json:"resource_id,omitempty"`
|
||||
ForceCritical bool `json:"force_critical,omitempty"`
|
||||
Addr pp.AddrInfo `json:"addr"`
|
||||
DT *tools.DataType `json:"dt,omitempty"`
|
||||
User string `json:"user"`
|
||||
Payload []byte `json:"payload"`
|
||||
Proto protocol.ID `json:"proto"`
|
||||
Retries int `json:"retries"`
|
||||
AddedAt time.Time `json:"added_at"`
|
||||
}
|
||||
|
||||
type DTNEntry struct {
|
||||
did string
|
||||
resourceID string // UUID of the resource; empty for non-resource payloads (planner, config)
|
||||
forceCritical bool // true when destination is NANO: all protocols become critical
|
||||
addr pp.AddrInfo
|
||||
dt *tools.DataType
|
||||
user string
|
||||
payload []byte
|
||||
proto protocol.ID
|
||||
retries int
|
||||
addedAt time.Time
|
||||
}
|
||||
|
||||
// isEffectivelyCritical returns true when the entry must be retried indefinitely,
|
||||
// either because its protocol is inherently critical or because the destination
|
||||
// is a NANO peer (forceCritical).
|
||||
func (e *DTNEntry) isEffectivelyCritical() bool {
|
||||
return DTNProtocols[e.proto] == DTNCritical || e.forceCritical
|
||||
}
|
||||
|
||||
func (e *DTNEntry) toJSON() DTNEntryJSON {
|
||||
return DTNEntryJSON{
|
||||
DID: e.did,
|
||||
ResourceID: e.resourceID,
|
||||
ForceCritical: e.forceCritical,
|
||||
Addr: e.addr,
|
||||
DT: e.dt,
|
||||
User: e.user,
|
||||
Payload: e.payload,
|
||||
Proto: e.proto,
|
||||
Retries: e.retries,
|
||||
AddedAt: e.addedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func entryFromJSON(j DTNEntryJSON) *DTNEntry {
|
||||
return &DTNEntry{
|
||||
did: j.DID,
|
||||
resourceID: j.ResourceID,
|
||||
forceCritical: j.ForceCritical,
|
||||
addr: j.Addr,
|
||||
dt: j.DT,
|
||||
user: j.User,
|
||||
payload: j.Payload,
|
||||
proto: j.Proto,
|
||||
retries: j.Retries,
|
||||
addedAt: j.AddedAt,
|
||||
}
|
||||
}
|
||||
|
||||
type DTNCache struct {
|
||||
mu sync.Mutex
|
||||
entries []*DTNEntry
|
||||
// aesKey is the derived AES-256 key used for on-disk encryption.
|
||||
// Nil when key derivation failed: persistence is disabled but the in-memory
|
||||
// cache continues to function normally.
|
||||
aesKey []byte
|
||||
}
|
||||
|
||||
// newDNTCache initialises the cache, derives the encryption key, and restores
|
||||
// any critical entries that were persisted before the last crash.
|
||||
func newDNTCache() *DTNCache {
|
||||
log := oclib.GetLogger()
|
||||
c := &DTNCache{}
|
||||
key, err := deriveDNTKey()
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
|
||||
} else {
|
||||
c.aesKey = key
|
||||
c.loadFromDisk()
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// extractResourceID decodes payload as JSON and returns its top-level string
// field "id". It returns "" when the payload cannot be decoded into that shape
// or when the field is absent (planner, config and other non-resource
// payloads).
func extractResourceID(payload []byte) string {
	var probe struct {
		ID string `json:"id"`
	}
	if json.Unmarshal(payload, &probe) != nil {
		return ""
	}
	return probe.ID
}
|
||||
|
||||
// enqueue adds an entry to the cache, respecting the resource lifecycle.
|
||||
// Deduplication key is (did, resourceID): same resource to the same peer keeps
|
||||
// only the latest mutation. resourceID is empty for non-resource payloads
|
||||
// (planner, config), in which case deduplication falls back to did alone.
|
||||
//
|
||||
// - DELETE is terminal: any subsequent mutation on the same key is discarded.
|
||||
// - UPDATE cannot be followed by CREATE: the resource already exists remotely.
|
||||
// - All other cases replace the existing entry (newer mutation supersedes).
|
||||
func (c *DTNCache) enqueue(e *DTNEntry) {
|
||||
c.mu.Lock()
|
||||
found, mutated := false, false
|
||||
for i, existing := range c.entries {
|
||||
if existing.did != e.did || existing.resourceID != e.resourceID {
|
||||
continue
|
||||
}
|
||||
found = true
|
||||
if existing.proto == ProtocolDeleteResource ||
|
||||
(existing.proto == ProtocolUpdateResource && e.proto == ProtocolCreateResource) {
|
||||
break // discard new entry silently — existing state is authoritative
|
||||
}
|
||||
c.entries[i] = e
|
||||
mutated = true
|
||||
break
|
||||
}
|
||||
if !found {
|
||||
c.entries = append(c.entries, e)
|
||||
mutated = true
|
||||
}
|
||||
c.mu.Unlock()
|
||||
if mutated && e.isEffectivelyCritical() {
|
||||
go c.persistToDisk()
|
||||
}
|
||||
}
|
||||
|
||||
// peersWithPending returns the distinct peer IDs (did) that have at least one
|
||||
// critical entry in the cache. Used to populate Heartbeat.PendingContact.
|
||||
func (c *DTNCache) peersWithPending() []string {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
seen := map[string]struct{}{}
|
||||
var out []string
|
||||
for _, e := range c.entries {
|
||||
if e.isEffectivelyCritical() {
|
||||
if _, ok := seen[e.did]; !ok {
|
||||
seen[e.did] = struct{}{}
|
||||
out = append(out, e.did)
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// drain atomically removes and returns all current entries.
|
||||
func (c *DTNCache) drain() []*DTNEntry {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
out := c.entries
|
||||
c.entries = nil
|
||||
return out
|
||||
}
|
||||
|
||||
// requeue puts entries back at the head of the list, preserving any new
|
||||
// entries added while the retry loop was running.
|
||||
func (c *DTNCache) requeue(entries []*DTNEntry) {
|
||||
if len(entries) == 0 {
|
||||
return
|
||||
}
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
c.entries = append(entries, c.entries...)
|
||||
}
|
||||
|
||||
// ── Persistence ──────────────────────────────────────────────────────────────
|
||||
|
||||
// DTNCachePath returns the path of the on-disk cache file, placed next to the
|
||||
// node's private key so it lives on the same persistent volume.
|
||||
func DTNCachePath() string {
|
||||
return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
|
||||
}
|
||||
|
||||
// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
|
||||
// using HKDF-SHA256. The derivation is deterministic: the same key is always
|
||||
// produced from the same private key, so no symmetric secret needs storing.
|
||||
func deriveDNTKey() ([]byte, error) {
|
||||
priv, err := tools.LoadKeyFromFilePrivate()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
|
||||
// (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
|
||||
raw, err := priv.Raw()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
|
||||
key := make([]byte, 32)
|
||||
if _, err := io.ReadFull(reader, key); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return key, nil
|
||||
}
|
||||
|
||||
// persistToDisk encrypts all current critical entries and writes them to disk.
|
||||
// Non-critical entries are deliberately excluded — they are not worth restoring
|
||||
// after a restart given their limited retry budget.
|
||||
func (c *DTNCache) persistToDisk() {
|
||||
if c.aesKey == nil {
|
||||
return
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
c.mu.Lock()
|
||||
var toSave []DTNEntryJSON
|
||||
for _, e := range c.entries {
|
||||
if e.isEffectivelyCritical() {
|
||||
toSave = append(toSave, e.toJSON())
|
||||
}
|
||||
}
|
||||
c.mu.Unlock()
|
||||
|
||||
plaintext, err := json.Marshal(toSave)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
block, err := aes.NewCipher(c.aesKey)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
gcm, err := cipher.NewGCM(block)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
nonce := make([]byte, gcm.NonceSize())
|
||||
if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
|
||||
return
|
||||
}
|
||||
ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)
|
||||
|
||||
path := DTNCachePath()
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to write cache file")
|
||||
return
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
|
||||
_ = os.Remove(tmp)
|
||||
}
|
||||
}
|
||||
|
||||
// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
|
||||
// Errors (missing file, decryption failure) are non-fatal: the cache simply
|
||||
// starts empty, which is safe.
|
||||
func (c *DTNCache) loadFromDisk() {
|
||||
if c.aesKey == nil {
|
||||
return
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
path := DTNCachePath()
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to read cache file")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
block, err := aes.NewCipher(c.aesKey)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
gcm, err := cipher.NewGCM(block)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if len(data) < gcm.NonceSize() {
|
||||
log.Warn().Msg("[dnt] cache file too short, ignoring")
|
||||
return
|
||||
}
|
||||
nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
|
||||
plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
|
||||
return
|
||||
}
|
||||
|
||||
var saved []DTNEntryJSON
|
||||
if err := json.Unmarshal(plaintext, &saved); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
|
||||
return
|
||||
}
|
||||
|
||||
count := 0
|
||||
for _, j := range saved {
|
||||
// Only restore critical entries — moderate entries are intentionally
|
||||
// not persisted, but this guard defends against format changes.
|
||||
e := entryFromJSON(j)
|
||||
if !e.isEffectivelyCritical() {
|
||||
continue
|
||||
}
|
||||
c.entries = append(c.entries, e)
|
||||
count++
|
||||
}
|
||||
if count > 0 {
|
||||
log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Retry loop ────────────────────────────────────────────────────────────────
|
||||
|
||||
// startDNTLoop runs the background retry goroutine. Call once after init.
|
||||
func (s *StreamService) startDNTLoop() {
|
||||
logger := oclib.GetLogger()
|
||||
ticker := time.NewTicker(DTNRetryInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
// retryEntries attempts delivery for the given entries and returns those
|
||||
// that must be kept for the next round.
|
||||
retryEntries := func(entries []*DTNEntry) []*DTNEntry {
|
||||
var keep []*DTNEntry
|
||||
for _, e := range entries {
|
||||
_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
|
||||
if err == nil {
|
||||
if e.isEffectivelyCritical() {
|
||||
logger.Info().Str("proto", string(e.proto)).Str("peer", e.did).
|
||||
Msg("[dnt] critical message delivered after retry")
|
||||
} else {
|
||||
logger.Info().Str("proto", string(e.proto)).Str("peer", e.did).
|
||||
Int("retries", e.retries).Msg("[dnt] moderate message delivered after retry")
|
||||
}
|
||||
continue
|
||||
}
|
||||
if e.isEffectivelyCritical() {
|
||||
keep = append(keep, e)
|
||||
} else {
|
||||
e.retries++
|
||||
if e.retries < DTNMaxModerateRetries {
|
||||
keep = append(keep, e)
|
||||
} else {
|
||||
logger.Warn().Str("proto", string(e.proto)).Str("peer", e.did).
|
||||
Int("retries", e.retries).Msg("[dnt] moderate message abandoned after max retries")
|
||||
}
|
||||
}
|
||||
}
|
||||
return keep
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
entries := s.dnt.drain()
|
||||
if len(entries) == 0 {
|
||||
continue
|
||||
}
|
||||
s.dnt.requeue(retryEntries(entries))
|
||||
go s.dnt.persistToDisk()
|
||||
|
||||
case peerID := <-s.dntNudge:
|
||||
// A peer just signalled it is reachable — retry its entries immediately.
|
||||
entries := s.dnt.drain()
|
||||
var forPeer, other []*DTNEntry
|
||||
for _, e := range entries {
|
||||
if e.did == peerID {
|
||||
forPeer = append(forPeer, e)
|
||||
} else {
|
||||
other = append(other, e)
|
||||
}
|
||||
}
|
||||
kept := retryEntries(forPeer)
|
||||
s.dnt.requeue(append(kept, other...))
|
||||
if len(kept) < len(forPeer) {
|
||||
go s.dnt.persistToDisk()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"github.com/libp2p/go-libp2p/core/network"
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
)
|
||||
|
||||
type Verify struct {
|
||||
@@ -23,8 +24,18 @@ type Verify struct {
|
||||
|
||||
func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s network.Stream) error {
|
||||
fmt.Println("handleEvent", protocol)
|
||||
// Heartbeat received on an outgoing ProtocolObserve stream.
|
||||
if protocol == ProtocolObserve {
|
||||
// Distinguish between an open request and a close request by inspecting
|
||||
// the ObserveRequest payload. The remote wraps both in a common.Event
|
||||
// with Type=ProtocolObserve so the persistent readLoop can decode them.
|
||||
var req ObserveRequest
|
||||
if evt.Payload != nil {
|
||||
json.Unmarshal(evt.Payload, &req) //nolint:errcheck — zero value means open
|
||||
}
|
||||
if req.Close {
|
||||
ps.observeCache.cancel(s.Conn().RemotePeer().String())
|
||||
return nil
|
||||
}
|
||||
return ps.handleIncomingObserve(s)
|
||||
}
|
||||
if protocol == observeHBEventType {
|
||||
@@ -59,6 +70,11 @@ func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s netwo
|
||||
return err
|
||||
}
|
||||
}
|
||||
if protocol == ProtocolSourcePresignResource {
|
||||
if err := ps.pass(evt, tools.SOURCE_PRESIGN_EVENT); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if protocol == ProtocolAdmiraltyConfigResource {
|
||||
if err := ps.pass(evt, tools.ADMIRALTY_CONFIG_EVENT); err != nil {
|
||||
return err
|
||||
@@ -125,9 +141,9 @@ func (abs *StreamService) sendPlanner(event *common.Event) error { //
|
||||
}
|
||||
|
||||
func (abs *StreamService) retrieveResponse(event *common.Event) error { //
|
||||
if !abs.ResourceSearches.IsActive(event.User) {
|
||||
/*if !abs.ResourceSearches.IsActive(event.User) {
|
||||
return nil // search already closed or timed out
|
||||
}
|
||||
}*/
|
||||
res, err := resources.ToResource(int(event.DataType), event.Payload)
|
||||
if err != nil || res == nil {
|
||||
return nil
|
||||
@@ -137,6 +153,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
|
||||
b, err := json.Marshal(res.Serialize(res))
|
||||
go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: event.User,
|
||||
Datatype: tools.DataType(event.DataType),
|
||||
Method: int(tools.SEARCH_EVENT),
|
||||
Payload: b,
|
||||
@@ -147,6 +164,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
|
||||
func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) error { //
|
||||
go tools.NewNATSCaller().SetNATSPub(method, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: event.User,
|
||||
Datatype: tools.DataType(event.DataType),
|
||||
Method: int(method),
|
||||
Payload: event.Payload,
|
||||
@@ -154,6 +172,36 @@ func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) err
|
||||
return nil
|
||||
}
|
||||
|
||||
// resolveBookingNano does a single DB lookup and returns:
|
||||
//
|
||||
// (nil, true) — not a booking, dest_peer_id absent, or dest == self → process normally, no forward
|
||||
// (nano, true) — dest is one of our NANO peers → process + forward to nano
|
||||
// (nil, false) — dest is unknown → ignore
|
||||
func (ps *StreamService) resolveBookingNano(evt *common.Event) (*peer.Peer, bool) {
|
||||
if tools.DataType(evt.DataType) != tools.BOOKING {
|
||||
return nil, true
|
||||
}
|
||||
var b struct {
|
||||
DestPeerID string `json:"dest_peer_id"`
|
||||
}
|
||||
if err := json.Unmarshal(evt.Payload, &b); err != nil || b.DestPeerID == "" {
|
||||
return nil, true
|
||||
}
|
||||
if self, err := oclib.GetMySelf(); err == nil && self != nil && b.DestPeerID == self.GetID() {
|
||||
return nil, true
|
||||
}
|
||||
d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"id": {{Operator: dbs.EQUAL.String(), Value: b.DestPeerID}},
|
||||
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
|
||||
},
|
||||
}, "", false, 0, 1)
|
||||
if len(d.Data) == 0 {
|
||||
return nil, false
|
||||
}
|
||||
return d.Data[0].(*peer.Peer), true
|
||||
}
|
||||
|
||||
func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol string) error {
|
||||
switch protocol {
|
||||
case ProtocolSearchResource:
|
||||
@@ -176,9 +224,10 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
|
||||
ps.SendResponse(p[0], evt, fmt.Sprintf("%v", search))
|
||||
}
|
||||
} else {
|
||||
fmt.Println("SEND SEARCH_EVENT SetNATSPub", m)
|
||||
go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
|
||||
fmt.Println("SEND SEARCH_EVENT SetNATSPub", m, evt.DataType, evt.User)
|
||||
tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: evt.User,
|
||||
Datatype: tools.DataType(evt.DataType),
|
||||
Method: int(tools.SEARCH_EVENT),
|
||||
Payload: evt.Payload,
|
||||
@@ -186,19 +235,35 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
|
||||
}
|
||||
case ProtocolCreateResource, ProtocolUpdateResource:
|
||||
fmt.Println("RECEIVED Protocol.Update", string(evt.Payload))
|
||||
go tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
|
||||
nano, ok := ps.resolveBookingNano(evt)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: evt.User,
|
||||
Datatype: tools.DataType(evt.DataType),
|
||||
Method: int(tools.CREATE_RESOURCE),
|
||||
Payload: evt.Payload,
|
||||
})
|
||||
if nano != nil {
|
||||
ps.forwardToNano(nano, evt, protocol)
|
||||
}
|
||||
case ProtocolDeleteResource:
|
||||
go tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
|
||||
nano, ok := ps.resolveBookingNano(evt)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: evt.User,
|
||||
Datatype: tools.DataType(evt.DataType),
|
||||
Method: int(tools.REMOVE_RESOURCE),
|
||||
Payload: evt.Payload,
|
||||
})
|
||||
if nano != nil {
|
||||
ps.forwardToNano(nano, evt, protocol)
|
||||
}
|
||||
default:
|
||||
return errors.New("no action authorized available : " + protocol)
|
||||
}
|
||||
@@ -223,11 +288,31 @@ func (abs *StreamService) SendResponse(p *peer.Peer, event *common.Event, search
|
||||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil)
|
||||
searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false, 0, 0)
|
||||
for _, ss := range searched.Data {
|
||||
// SendResponse uses an admin request so SetAllowedInstances
|
||||
// never calls FilterExploitationAuthorizations. Apply it
|
||||
// explicitly here so we never leak private AEs to a remote peer.
|
||||
if r, ok := ss.(resources.ResourceInterface); ok {
|
||||
r.SetAllowedInstances(&tools.APIRequest{PeerID: p.UUID, Groups: event.Groups, Username: event.User})
|
||||
}
|
||||
if j, err := json.Marshal(ss); err == nil {
|
||||
abs.PublishCommon(&dt, event.User, event.Groups, p.PeerID, ProtocolSearchResource, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Close the ProtocolSearchResource stream to the requester immediately after
|
||||
// sending all results. This prevents TempStream from reusing a stale (already
|
||||
// closed by the remote) stream entry for a subsequent search from the same peer,
|
||||
// which would cause write failure and no results for the second search.
|
||||
if decodedID, err := pp.Decode(p.PeerID); err == nil {
|
||||
abs.Mu.Lock()
|
||||
if abs.Streams[ProtocolSearchResource] != nil {
|
||||
if s, ok := abs.Streams[ProtocolSearchResource][decodedID]; ok {
|
||||
s.Stream.Reset()
|
||||
delete(abs.Streams[ProtocolSearchResource], decodedID)
|
||||
}
|
||||
}
|
||||
abs.Mu.Unlock()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
+181
-61
@@ -27,7 +27,7 @@ const ProtocolObserve = "/opencloud/peer/observe/1.0"
|
||||
// observeHBEventType is used as the common.Event.Type for heartbeat responses.
|
||||
const observeHBEventType = "/opencloud/peer/observe/heartbeat"
|
||||
|
||||
const observeHBInterval = 30 * time.Second
|
||||
const observeHBInterval = 10 * time.Second
|
||||
const observeDrainDuration = 30 * time.Second
|
||||
|
||||
// observeBatchWindow is the accumulation window before a heartbeat batch is
|
||||
@@ -45,7 +45,95 @@ type ObserveRequest struct {
|
||||
|
||||
// ObserveHeartbeat is sent by the observed side every observeHBInterval.
|
||||
type ObserveHeartbeat struct {
|
||||
State string `json:"state"` // always "online" when actively emitted
|
||||
State string `json:"state"` // always "online" when actively emitted
|
||||
SentAt time.Time `json:"sent_at,omitempty"` // timestamp set by sender; lets receiver compute one-way latency
|
||||
}
|
||||
|
||||
// Tuning knobs for the per-peer connection-quality metrics.
const (
	// maxLatencyMs is the average latency, in milliseconds, at and above
	// which the latency score is 0.
	maxLatencyMs = 2000.0
	// latencySamples is the size of the sliding window used to average
	// latency.
	latencySamples = 5
	// fastThresholdMs separates "fast" (below) from "slow" (at or above).
	fastThresholdMs = 200.0
	// reliableThreshold is the minimum reliability score (1 - miss rate) for
	// a peer to be labelled "reliable", i.e. a miss rate of at most 5%.
	reliableThreshold = 0.95
)
|
||||
|
||||
// PeerObserveMetrics accumulates connection-quality data for one observed peer.
|
||||
// Updated on every incoming heartbeat (observing side).
|
||||
type PeerObserveMetrics struct {
|
||||
mu sync.Mutex
|
||||
firstObservedAt time.Time
|
||||
lastHeartbeatAt time.Time
|
||||
received uint64
|
||||
latencies [latencySamples]time.Duration
|
||||
latIdx int
|
||||
latCount int
|
||||
}
|
||||
|
||||
func (m *PeerObserveMetrics) record(latency time.Duration) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.received++
|
||||
m.lastHeartbeatAt = time.Now().UTC()
|
||||
m.latencies[m.latIdx%latencySamples] = latency
|
||||
m.latIdx++
|
||||
if m.latCount < latencySamples {
|
||||
m.latCount++
|
||||
}
|
||||
}
|
||||
|
||||
func (m *PeerObserveMetrics) snapshot() PeerObserveSnapshot {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
var total time.Duration
|
||||
for i := 0; i < m.latCount; i++ {
|
||||
total += m.latencies[i]
|
||||
}
|
||||
var avgMs float64
|
||||
if m.latCount > 0 {
|
||||
avgMs = float64(total.Milliseconds()) / float64(m.latCount)
|
||||
}
|
||||
expected := int64(time.Duration(m.lastHeartbeatAt.Second()-m.firstObservedAt.Second()) / observeHBInterval)
|
||||
fmt.Println("EXPECTED", expected, m.received)
|
||||
var missRate float64
|
||||
if expected > 0 {
|
||||
recv := int64(m.received)
|
||||
if recv > expected {
|
||||
recv = expected
|
||||
}
|
||||
missRate = 1.0 - float64(recv)/float64(expected)
|
||||
}
|
||||
latScore := 1.0 - avgMs/maxLatencyMs
|
||||
if latScore < 0 {
|
||||
latScore = 0
|
||||
}
|
||||
relScore := 1.0 - missRate
|
||||
trust := (0.35*latScore + 0.65*relScore) * 100
|
||||
|
||||
speed := "fast"
|
||||
if avgMs >= fastThresholdMs {
|
||||
speed = "slow"
|
||||
}
|
||||
reliability := "reliable"
|
||||
if relScore < reliableThreshold {
|
||||
reliability = "watch"
|
||||
}
|
||||
return PeerObserveSnapshot{
|
||||
LatencyMs: avgMs,
|
||||
Speed: speed,
|
||||
Reliability: reliability,
|
||||
TrustScore: trust,
|
||||
LastSeenAt: m.lastHeartbeatAt,
|
||||
MissRate: missRate,
|
||||
}
|
||||
}
|
||||
|
||||
// PeerObserveSnapshot is the point-in-time quality summary produced by
// PeerObserveMetrics.snapshot and sent to oc-peer via NATS.
type PeerObserveSnapshot struct {
	// LatencyMs is the mean latency over the sample window, in milliseconds.
	LatencyMs float64 `json:"latency_ms"`
	// Speed is "fast" or "slow".
	Speed string `json:"speed"`
	// Reliability is "reliable" or "watch".
	Reliability string `json:"reliability"`
	// TrustScore is the weighted latency/reliability score, scaled by 100.
	TrustScore float64 `json:"trust_score"`
	// LastSeenAt is the time of the most recent heartbeat.
	LastSeenAt time.Time `json:"last_seen_at"`
	// MissRate is the fraction of expected heartbeats that were not received.
	MissRate float64 `json:"miss_rate"`
}
|
||||
|
||||
// ShallowPeer is the minimal peer representation sent by oc-peer in a
|
||||
@@ -204,18 +292,13 @@ func flushObserveBatch(peerIDs []string) {
|
||||
|
||||
// ── incoming observe handler (observed side) ──────────────────────────────────
|
||||
|
||||
// handleIncomingObserve is registered as the ProtocolObserve stream handler.
|
||||
// It is called when a remote peer opens an observe stream to us.
|
||||
// The function reads the request, validates it, then starts (or stops) the
|
||||
// heartbeat goroutine and returns immediately — the goroutine owns the stream.
|
||||
// handleIncomingObserve is called when a remote peer opens an observe stream
|
||||
// to us (observed side). It starts a heartbeat goroutine that writes back on
|
||||
// the same bidirectional rawStream — no separate reverse stream is opened.
|
||||
// The goroutine stops via context cancellation (triggered by a close event
|
||||
// read from rawStream) or when rawStream becomes unwritable.
|
||||
func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
remotePeerID := rawStream.Conn().RemotePeer().String()
|
||||
addr := rawStream.Conn().RemoteMultiaddr().String()
|
||||
ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
|
||||
if err != nil {
|
||||
fmt.Println("qndlqnl EERR", addr, err)
|
||||
return err
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
|
||||
// Drain mode: reject any new observations for 30 s after a close-all.
|
||||
@@ -223,13 +306,11 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
|
||||
s.drainMu.RUnlock()
|
||||
if draining {
|
||||
rawStream.Close()
|
||||
fmt.Println("Draining")
|
||||
return errors.New("Draining")
|
||||
return errors.New("draining")
|
||||
}
|
||||
// Read the observe request (with a generous deadline to avoid hangs).
|
||||
// Guard: the requesting peer must not be blacklisted or be ourself.
|
||||
did := ""
|
||||
|
||||
// Guard: the requesting peer must not be blacklisted.
|
||||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
|
||||
res := access.Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
@@ -238,11 +319,9 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
}, "", false, 0, 1)
|
||||
if len(res.Data) > 0 {
|
||||
p := res.Data[0].(*peer.Peer)
|
||||
did = p.GetID()
|
||||
if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
|
||||
rawStream.Close()
|
||||
if p.Relation == peer.BLACKLIST {
|
||||
fmt.Println("CLOSE blacklist or self")
|
||||
return errors.New("can't exploit blacklist or self")
|
||||
return errors.New("can't observe blacklisted peer")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,52 +330,32 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
s.observeCache.set(remotePeerID, cancel)
|
||||
fmt.Println("LOOP OBSERVE")
|
||||
go func() {
|
||||
defer rawStream.Close()
|
||||
// Do NOT close rawStream here: the persistent readLoop (HandleResponse)
|
||||
// owns rawStream's lifecycle. We only stop writing.
|
||||
defer cancel()
|
||||
defer s.observeCache.delete(remotePeerID)
|
||||
|
||||
ticker := time.NewTicker(observeHBInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
|
||||
evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
|
||||
if evt == nil {
|
||||
return
|
||||
}
|
||||
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
|
||||
stream := s.Streams[ProtocolObserve][ad.ID]
|
||||
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
|
||||
// Moderate connectivity event: the observer is unreachable.
|
||||
// The deferred calls above purge this observer from the cache.
|
||||
fmt.Println("LOOP EVT ERR", err)
|
||||
log.Info().
|
||||
Str("observer", remotePeerID).
|
||||
Err(err).
|
||||
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
|
||||
return
|
||||
}
|
||||
buildHBEvent := func() *common.Event {
|
||||
p, _ := json.Marshal(ObserveHeartbeat{State: "online", SentAt: time.Now().UTC()})
|
||||
return common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", p)
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
|
||||
rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
|
||||
fmt.Println("LOOP EVT", evt)
|
||||
var err error
|
||||
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
|
||||
stream := s.Streams[ProtocolObserve][ad.ID]
|
||||
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
|
||||
// Moderate connectivity event: the observer is unreachable.
|
||||
// The deferred calls above purge this observer from the cache.
|
||||
fmt.Println("LOOP EVT ERR", err)
|
||||
log.Info().
|
||||
Str("observer", remotePeerID).
|
||||
Err(err).
|
||||
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
|
||||
return
|
||||
}
|
||||
evt := buildHBEvent()
|
||||
if err := json.NewEncoder(rawStream).Encode(evt); err != nil {
|
||||
log.Info().
|
||||
Str("observer", remotePeerID).
|
||||
Err(err).
|
||||
Msg("[observe] heartbeat write failed — stream closed, stopping goroutine")
|
||||
return
|
||||
}
|
||||
rawStream.SetWriteDeadline(time.Time{})
|
||||
}
|
||||
@@ -308,14 +367,65 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
// ── heartbeat receiver (observing side) ───────────────────────────────────────
|
||||
|
||||
// handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
|
||||
// on an outgoing ProtocolObserve stream. It queues the peer_id in the batch
|
||||
// accumulator; the batcher flushes to NATS after observeBatchWindow.
|
||||
// on an outgoing ProtocolObserve stream. It updates per-peer metrics and flushes
|
||||
// a quality snapshot to NATS.
|
||||
func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
|
||||
// ps.hbBatcher.add(evt.From)
|
||||
flushObserveBatch([]string{evt.From})
|
||||
var hb ObserveHeartbeat
|
||||
if err := json.Unmarshal(evt.Payload, &hb); err == nil && !hb.SentAt.IsZero() {
|
||||
latency := time.Since(hb.SentAt)
|
||||
raw, _ := ps.observeMetrics.LoadOrStore(evt.From, &PeerObserveMetrics{
|
||||
firstObservedAt: time.Now().UTC(),
|
||||
})
|
||||
raw.(*PeerObserveMetrics).record(latency)
|
||||
fmt.Println("METRICS", raw)
|
||||
ps.observeMetrics.Store(evt.From, raw)
|
||||
}
|
||||
ps.flushObserveForPeer(evt.From, evt.User)
|
||||
return nil
|
||||
}
|
||||
|
||||
// flushObserveForPeer sends a PEER_OBSERVE_RESPONSE_EVENT to NATS with a quality
|
||||
// snapshot for peerID. Replaces the old flushObserveBatch (single-peer variant).
|
||||
func (ps *StreamService) flushObserveForPeer(peerID string, user string) {
|
||||
var snap *PeerObserveSnapshot
|
||||
if raw, ok := ps.observeMetrics.Load(peerID); ok {
|
||||
fmt.Println("RETRIEVED METRICS", raw)
|
||||
s := raw.(*PeerObserveMetrics).snapshot()
|
||||
snap = &s
|
||||
}
|
||||
fmt.Println("RETRIEVED METRICS 2", snap)
|
||||
payload, err := json.Marshal(map[string]interface{}{
|
||||
"peer_ids": []string{peerID},
|
||||
"state": "online",
|
||||
"metrics": map[string]*PeerObserveSnapshot{peerID: snap},
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
Datatype: tools.PEER,
|
||||
User: user,
|
||||
Method: int(tools.PEER_OBSERVE_RESPONSE_EVENT),
|
||||
Payload: payload,
|
||||
})
|
||||
propPayload, err := json.Marshal(tools.PropalgationMessage{
|
||||
DataType: int(tools.PEER),
|
||||
Action: tools.PB_PROPAGATE,
|
||||
Payload: payload,
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
Datatype: tools.PEER,
|
||||
User: user,
|
||||
Method: int(tools.PROPALGATION_EVENT),
|
||||
Payload: propPayload,
|
||||
})
|
||||
}
|
||||
|
||||
// ── user→peer index (ref-counted observe management) ─────────────────────────
|
||||
|
||||
// userPeerIndex tracks which users are observing which peers.
|
||||
@@ -514,7 +624,8 @@ func (ps *StreamService) openObserveStream(p ShallowPeer) error {
|
||||
}
|
||||
|
||||
// closeObserveStream closes the ProtocolObserve stream to toPeerID and notifies
|
||||
// the remote side.
|
||||
// the remote side. The close event is wrapped in a common.Event so the remote's
|
||||
// persistent readLoop can decode and handle it (cancel the heartbeat goroutine).
|
||||
func (ps *StreamService) closeObserveStream(toPeerID string) error {
|
||||
decodedID, err := pp.Decode(toPeerID)
|
||||
if err != nil {
|
||||
@@ -523,12 +634,15 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
|
||||
ps.Mu.Lock()
|
||||
if ps.Streams[ProtocolObserve] != nil {
|
||||
if s, ok := ps.Streams[ProtocolObserve][decodedID]; ok {
|
||||
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
|
||||
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
|
||||
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
|
||||
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
|
||||
s.Stream.Close()
|
||||
delete(ps.Streams[ProtocolObserve], decodedID)
|
||||
}
|
||||
}
|
||||
ps.Mu.Unlock()
|
||||
ps.observeMetrics.Delete(toPeerID)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -537,7 +651,9 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
|
||||
func (ps *StreamService) CloseAllObserves() {
|
||||
ps.Mu.Lock()
|
||||
for _, s := range ps.Streams[ProtocolObserve] {
|
||||
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
|
||||
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
|
||||
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
|
||||
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
|
||||
s.Stream.Close()
|
||||
}
|
||||
delete(ps.Streams, ProtocolObserve)
|
||||
@@ -545,6 +661,10 @@ func (ps *StreamService) CloseAllObserves() {
|
||||
|
||||
// Reset user index so stale ref-counts don't block future opens.
|
||||
ps.observeUsers = newUserPeerIndex()
|
||||
ps.observeMetrics.Range(func(k, _ any) bool {
|
||||
ps.observeMetrics.Delete(k)
|
||||
return true
|
||||
})
|
||||
|
||||
ps.drainMu.Lock()
|
||||
ps.drainUntil = time.Now().Add(observeDrainDuration)
|
||||
|
||||
@@ -61,15 +61,17 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
|
||||
}
|
||||
stream, err := ps.write(toPeerID, ad, dt, user, resource, proto)
|
||||
if err != nil {
|
||||
if _, ok := dntProtocols[proto]; ok {
|
||||
ps.dnt.enqueue(&dntEntry{
|
||||
did: toPeerID,
|
||||
addr: *ad,
|
||||
dt: dt,
|
||||
user: user,
|
||||
payload: resource,
|
||||
proto: proto,
|
||||
addedAt: time.Now().UTC(),
|
||||
if _, ok := DTNProtocols[proto]; ok {
|
||||
ps.dnt.enqueue(&DTNEntry{
|
||||
did: toPeerID,
|
||||
resourceID: extractResourceID(resource),
|
||||
forceCritical: pe.Relation == peer.NANO,
|
||||
addr: *ad,
|
||||
dt: dt,
|
||||
user: user,
|
||||
payload: resource,
|
||||
proto: proto,
|
||||
addedAt: time.Now().UTC(),
|
||||
})
|
||||
}
|
||||
return nil, err
|
||||
@@ -125,20 +127,45 @@ func (ps *StreamService) ToPartnerPublishEvent(
|
||||
|
||||
return nil
|
||||
}
|
||||
ks := []protocol.ID{}
|
||||
for k := range protocolsPartners {
|
||||
ks = append(ks, k)
|
||||
// Extract creator_id to route to the correct nano.
|
||||
// A master must only forward a resource to the nano that owns it.
|
||||
var creatorID string
|
||||
var minPayload struct {
|
||||
CreatorID string `json:"creator_id"`
|
||||
}
|
||||
for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER, peer.NANO} {
|
||||
if json.Unmarshal(payload, &minPayload) == nil {
|
||||
creatorID = minPayload.CreatorID
|
||||
}
|
||||
|
||||
// PARTNER and MASTER receive every resource unconditionally.
|
||||
for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER} {
|
||||
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
|
||||
},
|
||||
}, payload, proto)
|
||||
}
|
||||
|
||||
// NANO: only send to the nano whose UUID matches the resource creator.
|
||||
if creatorID != "" {
|
||||
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
|
||||
"id": {{Operator: dbs.EQUAL.String(), Value: creatorID}},
|
||||
},
|
||||
}, payload, proto)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// forwardToNano sends a booking mutation directly to a known NANO peer.
|
||||
// The NANO peer is already resolved by the caller (resolveBookingNano).
|
||||
// DTN critical is applied automatically by PublishCommon (Relation == NANO).
|
||||
func (abs *StreamService) forwardToNano(nano *peer.Peer, evt *common.Event, proto string) {
|
||||
dt := tools.DataType(evt.DataType)
|
||||
abs.PublishCommon(&dt, evt.User, evt.Groups, nano.PeerID, protocol.ID(proto), evt.Payload)
|
||||
}
|
||||
|
||||
func (s *StreamService) write(
|
||||
did string,
|
||||
peerID *pp.AddrInfo,
|
||||
|
||||
@@ -27,6 +27,10 @@ const ProtocolConsidersResource = "/opencloud/resource/considers/1.0"
|
||||
const ProtocolMinioConfigResource = "/opencloud/minio/config/1.0"
|
||||
const ProtocolAdmiraltyConfigResource = "/opencloud/admiralty/config/1.0"
|
||||
|
||||
// ProtocolSourcePresignResource routes PB_SOURCE_PRESIGN to the resource-owner peer.
|
||||
// The owner generates a pre-signed Minio URL and responds via PB_CONSIDERS.
|
||||
const ProtocolSourcePresignResource = "/opencloud/resource/source-presign/1.0"
|
||||
|
||||
const ProtocolSearchResource = "/opencloud/resource/search/1.0"
|
||||
const ProtocolCreateResource = "/opencloud/resource/create/1.0"
|
||||
const ProtocolUpdateResource = "/opencloud/resource/update/1.0"
|
||||
@@ -43,6 +47,7 @@ var protocols = map[protocol.ID]*common.ProtocolInfo{
|
||||
ProtocolVerifyResource: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
ProtocolMinioConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
ProtocolAdmiraltyConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
ProtocolSourcePresignResource: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
ProtocolObserve: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
}
|
||||
|
||||
@@ -63,8 +68,8 @@ type StreamService struct {
|
||||
// IsPeerKnown, when set, is called at stream open for every inbound protocol.
|
||||
// Return false to reset the stream immediately. Left nil until wired by the node.
|
||||
IsPeerKnown func(pid pp.ID) bool
|
||||
// dnt is the Disconnection Network Tolerance cache for outbound streams.
|
||||
dnt *dntCache
|
||||
// dnt is the Disconnection Network Tolerance (DTN) cache for outbound streams.
|
||||
dnt *DTNCache
|
||||
// observeCache tracks running heartbeat goroutines on the OBSERVED side.
|
||||
observeCache *observeCache
|
||||
// hbBatcher accumulates incoming heartbeats (observing side) and flushes
|
||||
@@ -78,6 +83,12 @@ type StreamService struct {
|
||||
// observeUsers tracks which users are observing which peers so streams are
|
||||
// closed only when the last observer for a peer disconnects.
|
||||
observeUsers *userPeerIndex
|
||||
// observeMetrics accumulates connection-quality data per observed peer (observing side).
|
||||
// Keys are peer_id strings; values are *PeerObserveMetrics.
|
||||
observeMetrics sync.Map
|
||||
// dntNudge receives peer IDs for which an immediate DTN retry should be
|
||||
// attempted (e.g. when the peer just reconnected via PendingCallers).
|
||||
dntNudge chan string
|
||||
}
|
||||
|
||||
func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node common.DiscoveryPeer) (*StreamService, error) {
|
||||
@@ -92,6 +103,7 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
|
||||
dnt: newDNTCache(),
|
||||
observeCache: newObserveCache(),
|
||||
observeUsers: newUserPeerIndex(),
|
||||
dntNudge: make(chan string, 32),
|
||||
}
|
||||
service.hbBatcher = newHeartbeatBatcher(flushObserveBatch)
|
||||
for proto := range protocols {
|
||||
@@ -105,6 +117,23 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
|
||||
return service, nil
|
||||
}
|
||||
|
||||
// PendingContacts returns the peer IDs that have at least one critical DTN
|
||||
// entry pending. Called on each heartbeat tick to populate PendingContact.
|
||||
func (s *StreamService) PendingContacts() []string {
|
||||
return s.dnt.peersWithPending()
|
||||
}
|
||||
|
||||
// NudgeContacts signals the DTN loop to retry immediately for the given peer
|
||||
// IDs (typically received via HeartbeatResponse.PendingCallers).
|
||||
func (s *StreamService) NudgeContacts(peerIDs []string) {
|
||||
for _, id := range peerIDs {
|
||||
select {
|
||||
case s.dntNudge <- id:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// gate wraps a stream handler with IsPeerKnown validation.
|
||||
// If the peer is unknown the entire connection is closed and the handler is not called.
|
||||
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
|
||||
@@ -117,6 +146,17 @@ func (s *StreamService) gatePrivilege(h func(network.Stream)) func(network.Strea
|
||||
},
|
||||
}, "", false, 0, 1)
|
||||
if len(d.Data) == 0 {
|
||||
stream.Reset()
|
||||
return
|
||||
}
|
||||
master := d.Data[0].(*peer.Peer)
|
||||
if stream.Conn().RemotePeer().String() != master.PeerID {
|
||||
logger := oclib.GetLogger()
|
||||
logger.Warn().
|
||||
Str("remote", stream.Conn().RemotePeer().String()).
|
||||
Str("master", master.PeerID).
|
||||
Msg("[gate] nano rejected stream from non-master peer")
|
||||
stream.Reset()
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -162,9 +202,17 @@ func (s *StreamService) HandleResponse(stream network.Stream) {
|
||||
Stream: stream,
|
||||
Expiry: time.Now().UTC().Add(expiry + 1*time.Minute),
|
||||
}
|
||||
|
||||
// ProtocolObserve uses a bidirectional long-lived stream: the remote writes
|
||||
// heartbeats back on the same stream, and may later send a close event.
|
||||
// Use a persistent readLoop so we can receive both heartbeats and close events.
|
||||
protoInfo := protocols[stream.Protocol()]
|
||||
if stream.Protocol() == ProtocolObserve {
|
||||
protoInfo = &common.ProtocolInfo{PersistantStream: true}
|
||||
}
|
||||
go s.readLoop(s.Streams[stream.Protocol()][stream.Conn().RemotePeer()],
|
||||
stream.Conn().RemotePeer(),
|
||||
stream.Protocol(), protocols[stream.Protocol()])
|
||||
stream.Protocol(), protoInfo)
|
||||
}
|
||||
|
||||
func (s *StreamService) connectToPartners() error {
|
||||
|
||||
Reference in New Issue
Block a user