Discovery Neo Oclib

This commit is contained in:
mr
2026-05-27 16:17:00 +02:00
parent 7f951afd41
commit 6ce6e6fe7d
20 changed files with 1436 additions and 1133 deletions
-362
View File
@@ -1,362 +0,0 @@
package stream
// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
// - dntCritical : retry indefinitely (create / update / delete resource).
// - dntModerate : up to dntMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.
import (
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"crypto/sha256"
"encoding/json"
"io"
"os"
"path/filepath"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/tools"
"golang.org/x/crypto/hkdf"
"oc-discovery/conf"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
type dntLevel int
const (
dntCritical dntLevel = iota // retry until the message is delivered
dntModerate // retry up to dntMaxModerateRetries times
)
const dntMaxModerateRetries = 3
const dntRetryInterval = 15 * time.Second
// dntProtocols maps each stream protocol to its DNT level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
var dntProtocols = map[protocol.ID]dntLevel{
// Critical — data mutations that must eventually be delivered.
ProtocolCreateResource: dntCritical,
ProtocolUpdateResource: dntCritical,
ProtocolDeleteResource: dntCritical,
// Moderate — confirmations / config / planner: 3 retries before abandon.
ProtocolVerifyResource: dntModerate,
ProtocolSendPlanner: dntModerate,
ProtocolConsidersResource: dntModerate,
ProtocolMinioConfigResource: dntModerate,
ProtocolAdmiraltyConfigResource: dntModerate,
}
// dntEntryJSON is the on-disk representation of a dntEntry.
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
type dntEntryJSON struct {
DID string `json:"did"`
Addr pp.AddrInfo `json:"addr"`
DT *tools.DataType `json:"dt,omitempty"`
User string `json:"user"`
Payload []byte `json:"payload"`
Proto protocol.ID `json:"proto"`
Retries int `json:"retries"`
AddedAt time.Time `json:"added_at"`
}
type dntEntry struct {
did string
addr pp.AddrInfo
dt *tools.DataType
user string
payload []byte
proto protocol.ID
retries int
addedAt time.Time
}
func (e *dntEntry) toJSON() dntEntryJSON {
return dntEntryJSON{
DID: e.did,
Addr: e.addr,
DT: e.dt,
User: e.user,
Payload: e.payload,
Proto: e.proto,
Retries: e.retries,
AddedAt: e.addedAt,
}
}
func entryFromJSON(j dntEntryJSON) *dntEntry {
return &dntEntry{
did: j.DID,
addr: j.Addr,
dt: j.DT,
user: j.User,
payload: j.Payload,
proto: j.Proto,
retries: j.Retries,
addedAt: j.AddedAt,
}
}
type dntCache struct {
mu sync.Mutex
entries []*dntEntry
// aesKey is the derived AES-256 key used for on-disk encryption.
// Nil when key derivation failed: persistence is disabled but the in-memory
// cache continues to function normally.
aesKey []byte
}
// newDNTCache initialises the cache, derives the encryption key, and restores
// any critical entries that were persisted before the last crash.
func newDNTCache() *dntCache {
log := oclib.GetLogger()
c := &dntCache{}
key, err := deriveDNTKey()
if err != nil {
log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
} else {
c.aesKey = key
c.loadFromDisk()
}
return c
}
// enqueue adds an entry to the cache and persists critical entries to disk.
func (c *dntCache) enqueue(e *dntEntry) {
c.mu.Lock()
c.entries = append(c.entries, e)
c.mu.Unlock()
if dntProtocols[e.proto] == dntCritical {
go c.persistToDisk()
}
}
// drain atomically removes and returns all current entries.
func (c *dntCache) drain() []*dntEntry {
c.mu.Lock()
defer c.mu.Unlock()
out := c.entries
c.entries = nil
return out
}
// requeue puts entries back at the head of the list, preserving any new
// entries added while the retry loop was running.
func (c *dntCache) requeue(entries []*dntEntry) {
if len(entries) == 0 {
return
}
c.mu.Lock()
defer c.mu.Unlock()
c.entries = append(entries, c.entries...)
}
// ── Persistence ──────────────────────────────────────────────────────────────
// dntCachePath returns the path of the on-disk cache file, placed next to the
// node's private key so it lives on the same persistent volume.
func dntCachePath() string {
return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
}
// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
// using HKDF-SHA256. The derivation is deterministic: the same key is always
// produced from the same private key, so no symmetric secret needs storing.
func deriveDNTKey() ([]byte, error) {
priv, err := tools.LoadKeyFromFilePrivate()
if err != nil {
return nil, err
}
// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
// (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
raw, err := priv.Raw()
if err != nil {
return nil, err
}
reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
key := make([]byte, 32)
if _, err := io.ReadFull(reader, key); err != nil {
return nil, err
}
return key, nil
}
// persistToDisk encrypts all current critical entries and writes them to disk.
// Non-critical entries are deliberately excluded — they are not worth restoring
// after a restart given their limited retry budget.
func (c *dntCache) persistToDisk() {
if c.aesKey == nil {
return
}
log := oclib.GetLogger()
c.mu.Lock()
var toSave []dntEntryJSON
for _, e := range c.entries {
if dntProtocols[e.proto] == dntCritical {
toSave = append(toSave, e.toJSON())
}
}
c.mu.Unlock()
plaintext, err := json.Marshal(toSave)
if err != nil {
return
}
block, err := aes.NewCipher(c.aesKey)
if err != nil {
return
}
gcm, err := cipher.NewGCM(block)
if err != nil {
return
}
nonce := make([]byte, gcm.NonceSize())
if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
return
}
ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)
path := dntCachePath()
tmp := path + ".tmp"
if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
log.Warn().Err(err).Msg("[dnt] failed to write cache file")
return
}
if err := os.Rename(tmp, path); err != nil {
log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
_ = os.Remove(tmp)
}
}
// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
// Errors (missing file, decryption failure) are non-fatal: the cache simply
// starts empty, which is safe.
func (c *dntCache) loadFromDisk() {
if c.aesKey == nil {
return
}
log := oclib.GetLogger()
path := dntCachePath()
data, err := os.ReadFile(path)
if err != nil {
if !os.IsNotExist(err) {
log.Warn().Err(err).Msg("[dnt] failed to read cache file")
}
return
}
block, err := aes.NewCipher(c.aesKey)
if err != nil {
return
}
gcm, err := cipher.NewGCM(block)
if err != nil {
return
}
if len(data) < gcm.NonceSize() {
log.Warn().Msg("[dnt] cache file too short, ignoring")
return
}
nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
if err != nil {
log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
return
}
var saved []dntEntryJSON
if err := json.Unmarshal(plaintext, &saved); err != nil {
log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
return
}
count := 0
for _, j := range saved {
// Only restore critical entries — moderate entries are intentionally
// not persisted, but this guard defends against format changes.
if dntProtocols[j.Proto] != dntCritical {
continue
}
c.entries = append(c.entries, entryFromJSON(j))
count++
}
if count > 0 {
log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
}
}
// ── Retry loop ────────────────────────────────────────────────────────────────
// startDNTLoop runs the background retry goroutine. Call once after init.
func (s *StreamService) startDNTLoop() {
logger := oclib.GetLogger()
ticker := time.NewTicker(dntRetryInterval)
defer ticker.Stop()
for range ticker.C {
entries := s.dnt.drain()
if len(entries) == 0 {
continue
}
var keep []*dntEntry
for _, e := range entries {
_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
if err == nil {
level := dntProtocols[e.proto]
if level == dntCritical {
logger.Info().
Str("proto", string(e.proto)).
Str("peer", e.did).
Msg("[dnt] critical message delivered after retry")
} else {
logger.Info().
Str("proto", string(e.proto)).
Str("peer", e.did).
Int("retries", e.retries).
Msg("[dnt] moderate message delivered after retry")
}
continue
}
level := dntProtocols[e.proto]
switch level {
case dntCritical:
keep = append(keep, e)
case dntModerate:
e.retries++
if e.retries < dntMaxModerateRetries {
keep = append(keep, e)
} else {
logger.Warn().
Str("proto", string(e.proto)).
Str("peer", e.did).
Int("retries", e.retries).
Msg("[dnt] moderate message abandoned after max retries")
}
}
}
s.dnt.requeue(keep)
// Persist after each tick so the on-disk file reflects the current
// state (entries delivered are removed, new ones from concurrent
// enqueues are included).
go s.dnt.persistToDisk()
}
}
+446
View File
@@ -0,0 +1,446 @@
package stream
// DTN_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
// - DTNCritical : retry indefinitely (create / update / delete resource).
// - DTNModerate : up to DTNMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.
import (
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"crypto/sha256"
"encoding/json"
"io"
"os"
"path/filepath"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/tools"
"golang.org/x/crypto/hkdf"
"oc-discovery/conf"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
type DTNLevel int
const (
DTNCritical DTNLevel = iota // retry until the message is delivered
DTNModerate // retry up to DTNMaxModerateRetries times
)
const DTNMaxModerateRetries = 3
const DTNRetryInterval = 15 * time.Second
// DTNProtocols maps each stream protocol to its DTN level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
var DTNProtocols = map[protocol.ID]DTNLevel{
// Critical — data mutations that must eventually be delivered.
ProtocolCreateResource: DTNCritical,
ProtocolUpdateResource: DTNCritical,
ProtocolDeleteResource: DTNCritical,
// Moderate — confirmations / config / planner: 3 retries before abandon.
ProtocolVerifyResource: DTNModerate,
ProtocolSendPlanner: DTNModerate,
ProtocolConsidersResource: DTNModerate,
ProtocolMinioConfigResource: DTNModerate,
ProtocolAdmiraltyConfigResource: DTNModerate,
ProtocolSourcePresignResource: DTNModerate,
}
// DTNEntryJSON is the on-disk representation of a DTNEntry.
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
type DTNEntryJSON struct {
DID string `json:"did"`
ResourceID string `json:"resource_id,omitempty"`
ForceCritical bool `json:"force_critical,omitempty"`
Addr pp.AddrInfo `json:"addr"`
DT *tools.DataType `json:"dt,omitempty"`
User string `json:"user"`
Payload []byte `json:"payload"`
Proto protocol.ID `json:"proto"`
Retries int `json:"retries"`
AddedAt time.Time `json:"added_at"`
}
type DTNEntry struct {
did string
resourceID string // UUID of the resource; empty for non-resource payloads (planner, config)
forceCritical bool // true when destination is NANO: all protocols become critical
addr pp.AddrInfo
dt *tools.DataType
user string
payload []byte
proto protocol.ID
retries int
addedAt time.Time
}
// isEffectivelyCritical returns true when the entry must be retried indefinitely,
// either because its protocol is inherently critical or because the destination
// is a NANO peer (forceCritical).
func (e *DTNEntry) isEffectivelyCritical() bool {
return DTNProtocols[e.proto] == DTNCritical || e.forceCritical
}
func (e *DTNEntry) toJSON() DTNEntryJSON {
return DTNEntryJSON{
DID: e.did,
ResourceID: e.resourceID,
ForceCritical: e.forceCritical,
Addr: e.addr,
DT: e.dt,
User: e.user,
Payload: e.payload,
Proto: e.proto,
Retries: e.retries,
AddedAt: e.addedAt,
}
}
func entryFromJSON(j DTNEntryJSON) *DTNEntry {
return &DTNEntry{
did: j.DID,
resourceID: j.ResourceID,
forceCritical: j.ForceCritical,
addr: j.Addr,
dt: j.DT,
user: j.User,
payload: j.Payload,
proto: j.Proto,
retries: j.Retries,
addedAt: j.AddedAt,
}
}
type DTNCache struct {
mu sync.Mutex
entries []*DTNEntry
// aesKey is the derived AES-256 key used for on-disk encryption.
// Nil when key derivation failed: persistence is disabled but the in-memory
// cache continues to function normally.
aesKey []byte
}
// newDNTCache initialises the cache, derives the encryption key, and restores
// any critical entries that were persisted before the last crash.
func newDNTCache() *DTNCache {
log := oclib.GetLogger()
c := &DTNCache{}
key, err := deriveDNTKey()
if err != nil {
log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
} else {
c.aesKey = key
c.loadFromDisk()
}
return c
}
// extractResourceID returns the "id" field from a JSON resource payload.
// Returns "" when the payload is not a resource object (planner, config, etc.).
func extractResourceID(payload []byte) string {
var obj struct {
ID string `json:"id"`
}
if err := json.Unmarshal(payload, &obj); err != nil {
return ""
}
return obj.ID
}
// enqueue adds an entry to the cache, respecting the resource lifecycle.
// Deduplication key is (did, resourceID): same resource to the same peer keeps
// only the latest mutation. resourceID is empty for non-resource payloads
// (planner, config), in which case deduplication falls back to did alone.
//
// - DELETE is terminal: any subsequent mutation on the same key is discarded.
// - UPDATE cannot be followed by CREATE: the resource already exists remotely.
// - All other cases replace the existing entry (newer mutation supersedes).
func (c *DTNCache) enqueue(e *DTNEntry) {
c.mu.Lock()
found, mutated := false, false
for i, existing := range c.entries {
if existing.did != e.did || existing.resourceID != e.resourceID {
continue
}
found = true
if existing.proto == ProtocolDeleteResource ||
(existing.proto == ProtocolUpdateResource && e.proto == ProtocolCreateResource) {
break // discard new entry silently — existing state is authoritative
}
c.entries[i] = e
mutated = true
break
}
if !found {
c.entries = append(c.entries, e)
mutated = true
}
c.mu.Unlock()
if mutated && e.isEffectivelyCritical() {
go c.persistToDisk()
}
}
// peersWithPending returns the distinct peer IDs (did) that have at least one
// critical entry in the cache. Used to populate Heartbeat.PendingContact.
func (c *DTNCache) peersWithPending() []string {
c.mu.Lock()
defer c.mu.Unlock()
seen := map[string]struct{}{}
var out []string
for _, e := range c.entries {
if e.isEffectivelyCritical() {
if _, ok := seen[e.did]; !ok {
seen[e.did] = struct{}{}
out = append(out, e.did)
}
}
}
return out
}
// drain atomically removes and returns all current entries.
func (c *DTNCache) drain() []*DTNEntry {
c.mu.Lock()
defer c.mu.Unlock()
out := c.entries
c.entries = nil
return out
}
// requeue puts entries back at the head of the list, preserving any new
// entries added while the retry loop was running.
func (c *DTNCache) requeue(entries []*DTNEntry) {
if len(entries) == 0 {
return
}
c.mu.Lock()
defer c.mu.Unlock()
c.entries = append(entries, c.entries...)
}
// ── Persistence ──────────────────────────────────────────────────────────────
// DTNCachePath returns the path of the on-disk cache file, placed next to the
// node's private key so it lives on the same persistent volume.
func DTNCachePath() string {
return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
}
// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
// using HKDF-SHA256. The derivation is deterministic: the same key is always
// produced from the same private key, so no symmetric secret needs storing.
func deriveDNTKey() ([]byte, error) {
priv, err := tools.LoadKeyFromFilePrivate()
if err != nil {
return nil, err
}
// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
// (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
raw, err := priv.Raw()
if err != nil {
return nil, err
}
reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
key := make([]byte, 32)
if _, err := io.ReadFull(reader, key); err != nil {
return nil, err
}
return key, nil
}
// persistToDisk encrypts all current critical entries and writes them to disk.
// Non-critical entries are deliberately excluded — they are not worth restoring
// after a restart given their limited retry budget.
func (c *DTNCache) persistToDisk() {
if c.aesKey == nil {
return
}
log := oclib.GetLogger()
c.mu.Lock()
var toSave []DTNEntryJSON
for _, e := range c.entries {
if e.isEffectivelyCritical() {
toSave = append(toSave, e.toJSON())
}
}
c.mu.Unlock()
plaintext, err := json.Marshal(toSave)
if err != nil {
return
}
block, err := aes.NewCipher(c.aesKey)
if err != nil {
return
}
gcm, err := cipher.NewGCM(block)
if err != nil {
return
}
nonce := make([]byte, gcm.NonceSize())
if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
return
}
ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)
path := DTNCachePath()
tmp := path + ".tmp"
if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
log.Warn().Err(err).Msg("[dnt] failed to write cache file")
return
}
if err := os.Rename(tmp, path); err != nil {
log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
_ = os.Remove(tmp)
}
}
// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
// Errors (missing file, decryption failure) are non-fatal: the cache simply
// starts empty, which is safe.
func (c *DTNCache) loadFromDisk() {
if c.aesKey == nil {
return
}
log := oclib.GetLogger()
path := DTNCachePath()
data, err := os.ReadFile(path)
if err != nil {
if !os.IsNotExist(err) {
log.Warn().Err(err).Msg("[dnt] failed to read cache file")
}
return
}
block, err := aes.NewCipher(c.aesKey)
if err != nil {
return
}
gcm, err := cipher.NewGCM(block)
if err != nil {
return
}
if len(data) < gcm.NonceSize() {
log.Warn().Msg("[dnt] cache file too short, ignoring")
return
}
nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
if err != nil {
log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
return
}
var saved []DTNEntryJSON
if err := json.Unmarshal(plaintext, &saved); err != nil {
log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
return
}
count := 0
for _, j := range saved {
// Only restore critical entries — moderate entries are intentionally
// not persisted, but this guard defends against format changes.
e := entryFromJSON(j)
if !e.isEffectivelyCritical() {
continue
}
c.entries = append(c.entries, e)
count++
}
if count > 0 {
log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
}
}
// ── Retry loop ────────────────────────────────────────────────────────────────
// startDNTLoop runs the background retry goroutine. Call once after init.
func (s *StreamService) startDNTLoop() {
logger := oclib.GetLogger()
ticker := time.NewTicker(DTNRetryInterval)
defer ticker.Stop()
// retryEntries attempts delivery for the given entries and returns those
// that must be kept for the next round.
retryEntries := func(entries []*DTNEntry) []*DTNEntry {
var keep []*DTNEntry
for _, e := range entries {
_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
if err == nil {
if e.isEffectivelyCritical() {
logger.Info().Str("proto", string(e.proto)).Str("peer", e.did).
Msg("[dnt] critical message delivered after retry")
} else {
logger.Info().Str("proto", string(e.proto)).Str("peer", e.did).
Int("retries", e.retries).Msg("[dnt] moderate message delivered after retry")
}
continue
}
if e.isEffectivelyCritical() {
keep = append(keep, e)
} else {
e.retries++
if e.retries < DTNMaxModerateRetries {
keep = append(keep, e)
} else {
logger.Warn().Str("proto", string(e.proto)).Str("peer", e.did).
Int("retries", e.retries).Msg("[dnt] moderate message abandoned after max retries")
}
}
}
return keep
}
for {
select {
case <-ticker.C:
entries := s.dnt.drain()
if len(entries) == 0 {
continue
}
s.dnt.requeue(retryEntries(entries))
go s.dnt.persistToDisk()
case peerID := <-s.dntNudge:
// A peer just signalled it is reachable — retry its entries immediately.
entries := s.dnt.drain()
var forPeer, other []*DTNEntry
for _, e := range entries {
if e.did == peerID {
forPeer = append(forPeer, e)
} else {
other = append(other, e)
}
}
kept := retryEntries(forPeer)
s.dnt.requeue(append(kept, other...))
if len(kept) < len(forPeer) {
go s.dnt.persistToDisk()
}
}
}
}
+92 -7
View File
@@ -15,6 +15,7 @@ import (
"cloud.o-forge.io/core/oc-lib/models/resources"
"cloud.o-forge.io/core/oc-lib/tools"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
)
type Verify struct {
@@ -23,8 +24,18 @@ type Verify struct {
func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s network.Stream) error {
fmt.Println("handleEvent", protocol)
// Heartbeat received on an outgoing ProtocolObserve stream.
if protocol == ProtocolObserve {
// Distinguish between an open request and a close request by inspecting
// the ObserveRequest payload. The remote wraps both in a common.Event
// with Type=ProtocolObserve so the persistent readLoop can decode them.
var req ObserveRequest
if evt.Payload != nil {
json.Unmarshal(evt.Payload, &req) //nolint:errcheck — zero value means open
}
if req.Close {
ps.observeCache.cancel(s.Conn().RemotePeer().String())
return nil
}
return ps.handleIncomingObserve(s)
}
if protocol == observeHBEventType {
@@ -59,6 +70,11 @@ func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s netwo
return err
}
}
if protocol == ProtocolSourcePresignResource {
if err := ps.pass(evt, tools.SOURCE_PRESIGN_EVENT); err != nil {
return err
}
}
if protocol == ProtocolAdmiraltyConfigResource {
if err := ps.pass(evt, tools.ADMIRALTY_CONFIG_EVENT); err != nil {
return err
@@ -125,9 +141,9 @@ func (abs *StreamService) sendPlanner(event *common.Event) error { //
}
func (abs *StreamService) retrieveResponse(event *common.Event) error { //
if !abs.ResourceSearches.IsActive(event.User) {
/*if !abs.ResourceSearches.IsActive(event.User) {
return nil // search already closed or timed out
}
}*/
res, err := resources.ToResource(int(event.DataType), event.Payload)
if err != nil || res == nil {
return nil
@@ -137,6 +153,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
b, err := json.Marshal(res.Serialize(res))
go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
User: event.User,
Datatype: tools.DataType(event.DataType),
Method: int(tools.SEARCH_EVENT),
Payload: b,
@@ -147,6 +164,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) error { //
go tools.NewNATSCaller().SetNATSPub(method, tools.NATSResponse{
FromApp: "oc-discovery",
User: event.User,
Datatype: tools.DataType(event.DataType),
Method: int(method),
Payload: event.Payload,
@@ -154,6 +172,36 @@ func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) err
return nil
}
// resolveBookingNano does a single DB lookup and returns:
//
// (nil, true) — not a booking, dest_peer_id absent, or dest == self → process normally, no forward
// (nano, true) — dest is one of our NANO peers → process + forward to nano
// (nil, false) — dest is unknown → ignore
func (ps *StreamService) resolveBookingNano(evt *common.Event) (*peer.Peer, bool) {
if tools.DataType(evt.DataType) != tools.BOOKING {
return nil, true
}
var b struct {
DestPeerID string `json:"dest_peer_id"`
}
if err := json.Unmarshal(evt.Payload, &b); err != nil || b.DestPeerID == "" {
return nil, true
}
if self, err := oclib.GetMySelf(); err == nil && self != nil && b.DestPeerID == self.GetID() {
return nil, true
}
d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
And: map[string][]dbs.Filter{
"id": {{Operator: dbs.EQUAL.String(), Value: b.DestPeerID}},
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
},
}, "", false, 0, 1)
if len(d.Data) == 0 {
return nil, false
}
return d.Data[0].(*peer.Peer), true
}
func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol string) error {
switch protocol {
case ProtocolSearchResource:
@@ -176,9 +224,10 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
ps.SendResponse(p[0], evt, fmt.Sprintf("%v", search))
}
} else {
fmt.Println("SEND SEARCH_EVENT SetNATSPub", m)
go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
fmt.Println("SEND SEARCH_EVENT SetNATSPub", m, evt.DataType, evt.User)
tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
User: evt.User,
Datatype: tools.DataType(evt.DataType),
Method: int(tools.SEARCH_EVENT),
Payload: evt.Payload,
@@ -186,19 +235,35 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
}
case ProtocolCreateResource, ProtocolUpdateResource:
fmt.Println("RECEIVED Protocol.Update", string(evt.Payload))
go tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
nano, ok := ps.resolveBookingNano(evt)
if !ok {
return nil
}
tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
FromApp: "oc-discovery",
User: evt.User,
Datatype: tools.DataType(evt.DataType),
Method: int(tools.CREATE_RESOURCE),
Payload: evt.Payload,
})
if nano != nil {
ps.forwardToNano(nano, evt, protocol)
}
case ProtocolDeleteResource:
go tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
nano, ok := ps.resolveBookingNano(evt)
if !ok {
return nil
}
tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
FromApp: "oc-discovery",
User: evt.User,
Datatype: tools.DataType(evt.DataType),
Method: int(tools.REMOVE_RESOURCE),
Payload: evt.Payload,
})
if nano != nil {
ps.forwardToNano(nano, evt, protocol)
}
default:
return errors.New("no action authorized available : " + protocol)
}
@@ -223,11 +288,31 @@ func (abs *StreamService) SendResponse(p *peer.Peer, event *common.Event, search
access := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil)
searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false, 0, 0)
for _, ss := range searched.Data {
// SendResponse uses an admin request so SetAllowedInstances
// never calls FilterExploitationAuthorizations. Apply it
// explicitly here so we never leak private AEs to a remote peer.
if r, ok := ss.(resources.ResourceInterface); ok {
r.SetAllowedInstances(&tools.APIRequest{PeerID: p.UUID, Groups: event.Groups, Username: event.User})
}
if j, err := json.Marshal(ss); err == nil {
abs.PublishCommon(&dt, event.User, event.Groups, p.PeerID, ProtocolSearchResource, j)
}
}
}
}
// Close the ProtocolSearchResource stream to the requester immediately after
// sending all results. This prevents TempStream from reusing a stale (already
// closed by the remote) stream entry for a subsequent search from the same peer,
// which would cause write failure and no results for the second search.
if decodedID, err := pp.Decode(p.PeerID); err == nil {
abs.Mu.Lock()
if abs.Streams[ProtocolSearchResource] != nil {
if s, ok := abs.Streams[ProtocolSearchResource][decodedID]; ok {
s.Stream.Reset()
delete(abs.Streams[ProtocolSearchResource], decodedID)
}
}
abs.Mu.Unlock()
}
return nil
}
+181 -61
View File
@@ -27,7 +27,7 @@ const ProtocolObserve = "/opencloud/peer/observe/1.0"
// observeHBEventType is used as the common.Event.Type for heartbeat responses.
const observeHBEventType = "/opencloud/peer/observe/heartbeat"
const observeHBInterval = 30 * time.Second
const observeHBInterval = 10 * time.Second
const observeDrainDuration = 30 * time.Second
// observeBatchWindow is the accumulation window before a heartbeat batch is
@@ -45,7 +45,95 @@ type ObserveRequest struct {
// ObserveHeartbeat is sent by the observed side every observeHBInterval.
type ObserveHeartbeat struct {
State string `json:"state"` // always "online" when actively emitted
State string `json:"state"` // always "online" when actively emitted
SentAt time.Time `json:"sent_at,omitempty"` // timestamp set by sender; lets receiver compute one-way latency
}
const (
maxLatencyMs = 2000.0 // ms above which latency score → 0
latencySamples = 5 // sliding window size for latency averaging
fastThresholdMs = 200.0 // below = "fast", above = "slow"
reliableThreshold = 0.95 // miss_rate below 5% = "reliable"
)
// PeerObserveMetrics accumulates connection-quality data for one observed peer.
// Updated on every incoming heartbeat (observing side).
type PeerObserveMetrics struct {
mu sync.Mutex
firstObservedAt time.Time
lastHeartbeatAt time.Time
received uint64
latencies [latencySamples]time.Duration
latIdx int
latCount int
}
func (m *PeerObserveMetrics) record(latency time.Duration) {
m.mu.Lock()
defer m.mu.Unlock()
m.received++
m.lastHeartbeatAt = time.Now().UTC()
m.latencies[m.latIdx%latencySamples] = latency
m.latIdx++
if m.latCount < latencySamples {
m.latCount++
}
}
func (m *PeerObserveMetrics) snapshot() PeerObserveSnapshot {
m.mu.Lock()
defer m.mu.Unlock()
var total time.Duration
for i := 0; i < m.latCount; i++ {
total += m.latencies[i]
}
var avgMs float64
if m.latCount > 0 {
avgMs = float64(total.Milliseconds()) / float64(m.latCount)
}
expected := int64(time.Duration(m.lastHeartbeatAt.Second()-m.firstObservedAt.Second()) / observeHBInterval)
fmt.Println("EXPECTED", expected, m.received)
var missRate float64
if expected > 0 {
recv := int64(m.received)
if recv > expected {
recv = expected
}
missRate = 1.0 - float64(recv)/float64(expected)
}
latScore := 1.0 - avgMs/maxLatencyMs
if latScore < 0 {
latScore = 0
}
relScore := 1.0 - missRate
trust := (0.35*latScore + 0.65*relScore) * 100
speed := "fast"
if avgMs >= fastThresholdMs {
speed = "slow"
}
reliability := "reliable"
if relScore < reliableThreshold {
reliability = "watch"
}
return PeerObserveSnapshot{
LatencyMs: avgMs,
Speed: speed,
Reliability: reliability,
TrustScore: trust,
LastSeenAt: m.lastHeartbeatAt,
MissRate: missRate,
}
}
// PeerObserveSnapshot is the point-in-time quality summary sent to oc-peer via NATS.
type PeerObserveSnapshot struct {
LatencyMs float64 `json:"latency_ms"`
Speed string `json:"speed"` // "fast" | "slow"
Reliability string `json:"reliability"` // "reliable" | "watch"
TrustScore float64 `json:"trust_score"`
LastSeenAt time.Time `json:"last_seen_at"`
MissRate float64 `json:"miss_rate"`
}
// ShallowPeer is the minimal peer representation sent by oc-peer in a
@@ -204,18 +292,13 @@ func flushObserveBatch(peerIDs []string) {
// ── incoming observe handler (observed side) ──────────────────────────────────
// handleIncomingObserve is registered as the ProtocolObserve stream handler.
// It is called when a remote peer opens an observe stream to us.
// The function reads the request, validates it, then starts (or stops) the
// heartbeat goroutine and returns immediately — the goroutine owns the stream.
// handleIncomingObserve is called when a remote peer opens an observe stream
// to us (observed side). It starts a heartbeat goroutine that writes back on
// the same bidirectional rawStream — no separate reverse stream is opened.
// The goroutine stops via context cancellation (triggered by a close event
// read from rawStream) or when rawStream becomes unwritable.
func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
remotePeerID := rawStream.Conn().RemotePeer().String()
addr := rawStream.Conn().RemoteMultiaddr().String()
ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
if err != nil {
fmt.Println("qndlqnl EERR", addr, err)
return err
}
log := oclib.GetLogger()
// Drain mode: reject any new observations for 30 s after a close-all.
@@ -223,13 +306,11 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
s.drainMu.RUnlock()
if draining {
rawStream.Close()
fmt.Println("Draining")
return errors.New("Draining")
return errors.New("draining")
}
// Read the observe request (with a generous deadline to avoid hangs).
// Guard: the requesting peer must not be blacklisted or be ourself.
did := ""
// Guard: the requesting peer must not be blacklisted.
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
res := access.Search(&dbs.Filters{
And: map[string][]dbs.Filter{
@@ -238,11 +319,9 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
}, "", false, 0, 1)
if len(res.Data) > 0 {
p := res.Data[0].(*peer.Peer)
did = p.GetID()
if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
rawStream.Close()
if p.Relation == peer.BLACKLIST {
fmt.Println("CLOSE blacklist or self")
return errors.New("can't exploit blacklist or self")
return errors.New("can't observe blacklisted peer")
}
}
@@ -251,52 +330,32 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
s.observeCache.set(remotePeerID, cancel)
fmt.Println("LOOP OBSERVE")
go func() {
defer rawStream.Close()
// Do NOT close rawStream here: the persistent readLoop (HandleResponse)
// owns rawStream's lifecycle. We only stop writing.
defer cancel()
defer s.observeCache.delete(remotePeerID)
ticker := time.NewTicker(observeHBInterval)
defer ticker.Stop()
hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
if evt == nil {
return
}
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
stream := s.Streams[ProtocolObserve][ad.ID]
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
// Moderate connectivity event: the observer is unreachable.
// The deferred calls above purge this observer from the cache.
fmt.Println("LOOP EVT ERR", err)
log.Info().
Str("observer", remotePeerID).
Err(err).
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
return
}
buildHBEvent := func() *common.Event {
p, _ := json.Marshal(ObserveHeartbeat{State: "online", SentAt: time.Now().UTC()})
return common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", p)
}
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
fmt.Println("LOOP EVT", evt)
var err error
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
stream := s.Streams[ProtocolObserve][ad.ID]
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
// Moderate connectivity event: the observer is unreachable.
// The deferred calls above purge this observer from the cache.
fmt.Println("LOOP EVT ERR", err)
log.Info().
Str("observer", remotePeerID).
Err(err).
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
return
}
evt := buildHBEvent()
if err := json.NewEncoder(rawStream).Encode(evt); err != nil {
log.Info().
Str("observer", remotePeerID).
Err(err).
Msg("[observe] heartbeat write failed — stream closed, stopping goroutine")
return
}
rawStream.SetWriteDeadline(time.Time{})
}
@@ -308,14 +367,65 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
// ── heartbeat receiver (observing side) ───────────────────────────────────────
// handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
// on an outgoing ProtocolObserve stream. It queues the peer_id in the batch
// accumulator; the batcher flushes to NATS after observeBatchWindow.
// on an outgoing ProtocolObserve stream. It updates per-peer metrics and flushes
// a quality snapshot to NATS.
func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
// ps.hbBatcher.add(evt.From)
flushObserveBatch([]string{evt.From})
var hb ObserveHeartbeat
if err := json.Unmarshal(evt.Payload, &hb); err == nil && !hb.SentAt.IsZero() {
latency := time.Since(hb.SentAt)
raw, _ := ps.observeMetrics.LoadOrStore(evt.From, &PeerObserveMetrics{
firstObservedAt: time.Now().UTC(),
})
raw.(*PeerObserveMetrics).record(latency)
fmt.Println("METRICS", raw)
ps.observeMetrics.Store(evt.From, raw)
}
ps.flushObserveForPeer(evt.From, evt.User)
return nil
}
// flushObserveForPeer sends a PEER_OBSERVE_RESPONSE_EVENT to NATS with a quality
// snapshot for peerID. Replaces the old flushObserveBatch (single-peer variant).
func (ps *StreamService) flushObserveForPeer(peerID string, user string) {
var snap *PeerObserveSnapshot
if raw, ok := ps.observeMetrics.Load(peerID); ok {
fmt.Println("RETRIEVED METRICS", raw)
s := raw.(*PeerObserveMetrics).snapshot()
snap = &s
}
fmt.Println("RETRIEVED METRICS 2", snap)
payload, err := json.Marshal(map[string]interface{}{
"peer_ids": []string{peerID},
"state": "online",
"metrics": map[string]*PeerObserveSnapshot{peerID: snap},
})
if err != nil {
return
}
tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.PEER,
User: user,
Method: int(tools.PEER_OBSERVE_RESPONSE_EVENT),
Payload: payload,
})
propPayload, err := json.Marshal(tools.PropalgationMessage{
DataType: int(tools.PEER),
Action: tools.PB_PROPAGATE,
Payload: payload,
})
if err != nil {
return
}
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.PEER,
User: user,
Method: int(tools.PROPALGATION_EVENT),
Payload: propPayload,
})
}
// ── user→peer index (ref-counted observe management) ─────────────────────────
// userPeerIndex tracks which users are observing which peers.
@@ -514,7 +624,8 @@ func (ps *StreamService) openObserveStream(p ShallowPeer) error {
}
// closeObserveStream closes the ProtocolObserve stream to toPeerID and notifies
// the remote side.
// the remote side. The close event is wrapped in a common.Event so the remote's
// persistent readLoop can decode and handle it (cancel the heartbeat goroutine).
func (ps *StreamService) closeObserveStream(toPeerID string) error {
decodedID, err := pp.Decode(toPeerID)
if err != nil {
@@ -523,12 +634,15 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
ps.Mu.Lock()
if ps.Streams[ProtocolObserve] != nil {
if s, ok := ps.Streams[ProtocolObserve][decodedID]; ok {
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
s.Stream.Close()
delete(ps.Streams[ProtocolObserve], decodedID)
}
}
ps.Mu.Unlock()
ps.observeMetrics.Delete(toPeerID)
return nil
}
@@ -537,7 +651,9 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
func (ps *StreamService) CloseAllObserves() {
ps.Mu.Lock()
for _, s := range ps.Streams[ProtocolObserve] {
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
s.Stream.Close()
}
delete(ps.Streams, ProtocolObserve)
@@ -545,6 +661,10 @@ func (ps *StreamService) CloseAllObserves() {
// Reset user index so stale ref-counts don't block future opens.
ps.observeUsers = newUserPeerIndex()
ps.observeMetrics.Range(func(k, _ any) bool {
ps.observeMetrics.Delete(k)
return true
})
ps.drainMu.Lock()
ps.drainUntil = time.Now().Add(observeDrainDuration)
+40 -13
View File
@@ -61,15 +61,17 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
}
stream, err := ps.write(toPeerID, ad, dt, user, resource, proto)
if err != nil {
if _, ok := dntProtocols[proto]; ok {
ps.dnt.enqueue(&dntEntry{
did: toPeerID,
addr: *ad,
dt: dt,
user: user,
payload: resource,
proto: proto,
addedAt: time.Now().UTC(),
if _, ok := DTNProtocols[proto]; ok {
ps.dnt.enqueue(&DTNEntry{
did: toPeerID,
resourceID: extractResourceID(resource),
forceCritical: pe.Relation == peer.NANO,
addr: *ad,
dt: dt,
user: user,
payload: resource,
proto: proto,
addedAt: time.Now().UTC(),
})
}
return nil, err
@@ -125,20 +127,45 @@ func (ps *StreamService) ToPartnerPublishEvent(
return nil
}
ks := []protocol.ID{}
for k := range protocolsPartners {
ks = append(ks, k)
// Extract creator_id to route to the correct nano.
// A master must only forward a resource to the nano that owns it.
var creatorID string
var minPayload struct {
CreatorID string `json:"creator_id"`
}
for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER, peer.NANO} {
if json.Unmarshal(payload, &minPayload) == nil {
creatorID = minPayload.CreatorID
}
// PARTNER and MASTER receive every resource unconditionally.
for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER} {
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
And: map[string][]dbs.Filter{
"relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
},
}, payload, proto)
}
// NANO: only send to the nano whose UUID matches the resource creator.
if creatorID != "" {
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
And: map[string][]dbs.Filter{
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
"id": {{Operator: dbs.EQUAL.String(), Value: creatorID}},
},
}, payload, proto)
}
return nil
}
// forwardToNano sends a booking mutation directly to a known NANO peer.
// The NANO peer is already resolved by the caller (resolveBookingNano).
// DTN critical is applied automatically by PublishCommon (Relation == NANO).
func (abs *StreamService) forwardToNano(nano *peer.Peer, evt *common.Event, proto string) {
dt := tools.DataType(evt.DataType)
abs.PublishCommon(&dt, evt.User, evt.Groups, nano.PeerID, protocol.ID(proto), evt.Payload)
}
func (s *StreamService) write(
did string,
peerID *pp.AddrInfo,
+51 -3
View File
@@ -27,6 +27,10 @@ const ProtocolConsidersResource = "/opencloud/resource/considers/1.0"
const ProtocolMinioConfigResource = "/opencloud/minio/config/1.0"
const ProtocolAdmiraltyConfigResource = "/opencloud/admiralty/config/1.0"
// ProtocolSourcePresignResource routes PB_SOURCE_PRESIGN to the resource-owner peer.
// The owner generates a pre-signed Minio URL and responds via PB_CONSIDERS.
const ProtocolSourcePresignResource = "/opencloud/resource/source-presign/1.0"
const ProtocolSearchResource = "/opencloud/resource/search/1.0"
const ProtocolCreateResource = "/opencloud/resource/create/1.0"
const ProtocolUpdateResource = "/opencloud/resource/update/1.0"
@@ -43,6 +47,7 @@ var protocols = map[protocol.ID]*common.ProtocolInfo{
ProtocolVerifyResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolMinioConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolAdmiraltyConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolSourcePresignResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolObserve: {WaitResponse: true, TTL: 1 * time.Minute},
}
@@ -63,8 +68,8 @@ type StreamService struct {
// IsPeerKnown, when set, is called at stream open for every inbound protocol.
// Return false to reset the stream immediately. Left nil until wired by the node.
IsPeerKnown func(pid pp.ID) bool
// dnt is the Disconnection Network Tolerance cache for outbound streams.
dnt *dntCache
// DTN is the Disconnection Network Tolerance cache for outbound streams.
dnt *DTNCache
// observeCache tracks running heartbeat goroutines on the OBSERVED side.
observeCache *observeCache
// hbBatcher accumulates incoming heartbeats (observing side) and flushes
@@ -78,6 +83,12 @@ type StreamService struct {
// observeUsers tracks which users are observing which peers so streams are
// closed only when the last observer for a peer disconnects.
observeUsers *userPeerIndex
// observeMetrics accumulates connection-quality data per observed peer (observing side).
// Keys are peer_id strings; values are *PeerObserveMetrics.
observeMetrics sync.Map
// DTNNudge receives peer IDs for which an immediate DTN retry should be
// attempted (e.g. when the peer just reconnected via PendingCallers).
dntNudge chan string
}
func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node common.DiscoveryPeer) (*StreamService, error) {
@@ -92,6 +103,7 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
dnt: newDNTCache(),
observeCache: newObserveCache(),
observeUsers: newUserPeerIndex(),
dntNudge: make(chan string, 32),
}
service.hbBatcher = newHeartbeatBatcher(flushObserveBatch)
for proto := range protocols {
@@ -105,6 +117,23 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
return service, nil
}
// PendingContacts returns the peer IDs that have at least one critical DTN
// entry pending. Called on each heartbeat tick to populate PendingContact.
func (s *StreamService) PendingContacts() []string {
return s.dnt.peersWithPending()
}
// NudgeContacts signals the DTN loop to retry immediately for the given peer
// IDs (typically received via HeartbeatResponse.PendingCallers).
func (s *StreamService) NudgeContacts(peerIDs []string) {
for _, id := range peerIDs {
select {
case s.dntNudge <- id:
default:
}
}
}
// gate wraps a stream handler with IsPeerKnown validation.
// If the peer is unknown the entire connection is closed and the handler is not called.
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
@@ -117,6 +146,17 @@ func (s *StreamService) gatePrivilege(h func(network.Stream)) func(network.Strea
},
}, "", false, 0, 1)
if len(d.Data) == 0 {
stream.Reset()
return
}
master := d.Data[0].(*peer.Peer)
if stream.Conn().RemotePeer().String() != master.PeerID {
logger := oclib.GetLogger()
logger.Warn().
Str("remote", stream.Conn().RemotePeer().String()).
Str("master", master.PeerID).
Msg("[gate] nano rejected stream from non-master peer")
stream.Reset()
return
}
}
@@ -162,9 +202,17 @@ func (s *StreamService) HandleResponse(stream network.Stream) {
Stream: stream,
Expiry: time.Now().UTC().Add(expiry + 1*time.Minute),
}
// ProtocolObserve uses a bidirectional long-lived stream: the remote writes
// heartbeats back on the same stream, and may later send a close event.
// Use a persistent readLoop so we can receive both heartbeats and close events.
protoInfo := protocols[stream.Protocol()]
if stream.Protocol() == ProtocolObserve {
protoInfo = &common.ProtocolInfo{PersistantStream: true}
}
go s.readLoop(s.Streams[stream.Protocol()][stream.Conn().RemotePeer()],
stream.Conn().RemotePeer(),
stream.Protocol(), protocols[stream.Protocol()])
stream.Protocol(), protoInfo)
}
func (s *StreamService) connectToPartners() error {